t* hacker news on gopher
URI git clone git://git.codevoid.de/hn-gopher
DIR Log
DIR Files
DIR Refs
---
DIR commit 74ea59bf91cf72e4d5e82ebd8bc852b7546c1a0a
DIR parent 6533286bc1276e7584436c98fa4f88251da82bf9
URI Author: Stefan Hagen <sh+git[at]codevoid[dot]de>
Date: Mon, 30 Jul 2018 21:43:49 +0200
big update
- pretty bars
- pretty dates
- parallel scraping
- proper front page top stories
- configurable front page story count
- link parser update (still a bit wonky)
- story file cache
Diffstat:
M hn-scraper.pl | 252 ++++++++++++++++++++++++++-----
1 file changed, 215 insertions(+), 37 deletions(-)
---
DIR diff --git a/hn-scraper.pl b/hn-scraper.pl
t@@ -2,6 +2,10 @@
use strict;
use warnings;
+use Parallel::ForkManager;
+use DateTime;
+use DateTime::Duration;
+use DateTime::Format::Duration;
use LWP::UserAgent;
use JSON;
use HTML::LinkExtractor;
t@@ -10,21 +14,48 @@ use HTML::Entities;
use Encode;
use Text::Wrap;
$Text::Wrap::columns=72;
-use Data::Dumper;
### CONFIGURATION
-my $protocol = "https";
-my $server = "hn.algolia.com";
-my $api_uri = "/api/v1";
-my $go_root = "/srv/codevoid-gopher";
-my $go_path = "/hn";
-
+my $protocol = "https";
+my $server = "hn.algolia.com";
+my $api_uri = "/api/v1";
+my $go_root = "/srv/codevoid-gopher";
+my $go_path = "/hn";
+my $index_count = 60;
+
+my $logo =" _______ __ _______\n";
+ $logo .="| | |.---.-..----.| |--..-----..----. | | |.-----..--.--.--..-----.\n";
+ $logo .="| || _ || __|| < | -__|| _| | || -__|| | | ||__ --|\n";
+ $logo .="|___|___||___._||____||__|__||_____||__| |__|____||_____||________||_____|\n";
 $logo .=" on Gopher (unofficial)\n";
+ $logo .= "[h|Visit Hacker News on the Internet|URL:https://news.ycombinator.com|server|port]\n\n";
### FUNCTIONS
+### SUB: $json = getTopStories();
+sub getTopStories {
+ # FIXME make this configurable, maybe.
+ #print "Debug: getTopStories($protocol://hacker-news.firebaseio.com/v0/topstories.json)\n";
+ my $REST= ({HOST => "hacker-news.firebaseio.com",
+ URL => "$protocol://hacker-news.firebaseio.com/v0/topstories.json" });
+ $REST->{UA} = LWP::UserAgent->new(keep_alive => 0, timeout => 30);
+ $REST->{UA}->agent("codevoid-hackernews-gopherproxy/0.1");
+ $REST->{resource} = $REST->{URL};
+ $REST->{request} = HTTP::Request->new( GET => $REST->{resource} );
+ $REST->{response} = $REST->{UA}->request( $REST->{request} );
+ if(not $REST->{response}->is_success()) {
+ my $delay = 0.5;
+ #print "Debug: Got \"", $REST->{response}->status_line, "\" trying again in $delay seconds...\n";
+ sleep $delay;
+ return getTopStories();
+ }
+ return decode_json($REST->{response}->content);
+}
+
+
### SUB: $json = getApiData("/api/...");
sub getApiData {
my ( $uri ) = @_;
- print "Debug: getApiData($protocol://$server$uri)\n";
+ #print "Debug: getApiData($protocol://$server$uri)\n";
my $REST= ({HOST => "$server",
URL => "$protocol://$server$uri" });
$REST->{UA} = LWP::UserAgent->new(keep_alive => 0, timeout => 30);
t@@ -33,13 +64,14 @@ sub getApiData {
$REST->{request} = HTTP::Request->new( GET => $REST->{resource} );
$REST->{response} = $REST->{UA}->request( $REST->{request} );
if(not $REST->{response}->is_success()) {
- print "Debug: Got \"", $REST->{response}->status_line, "\" trying again in 2 seconds...\n";
+ #print "Debug: Got \"", $REST->{response}->status_line, "\" trying again in 2 seconds...\n";
sleep 2;
return getApiData ( $uri );
}
return decode_json($REST->{response}->content);
}
+
### SUB: $gph = scrapeSubComments($payload, $parentID, $lvl)
sub scrapeSubComments {
my ( $payload, $parentID, $lvl ) = @_;
t@@ -50,7 +82,8 @@ sub scrapeSubComments {
my $text = encode("UTF-8", $comment->{'comment_text'});
my $author = encode("UTF-8", $comment->{'author'});
my $objectID = $comment->{'objectID'};
- $output .= formatContent("$author:", $lvl);
+ my $ago = parseDate($comment->{'created_at'});
+ $output .= formatContent("$author wrote $ago:", $lvl);
$output .= formatContent("$text", $lvl)."\n";
$output .= scrapeSubComments( $payload, $objectID, ++$lvl );
$lvl--;
t@@ -60,17 +93,96 @@ sub scrapeSubComments {
return $output;
}
-### SUB: scrapeComments($objectID, $number)
+### SUB: $datestr = parseDate($datestring)
+sub parseDate {
+ my ( $datestring ) = @_;
+
+ # set output (parse) pattern
+ my $p = DateTime::Format::Duration->new(
+ pattern => '%Y|%m|%e|%H|%M',
+ normalize => 1
+ );
+
+ # FIXME: DateTime::Duration can do the parsing
+ # parse string and create datetime object
+ $datestring =~ /(....)-(..)-(..)T(..):(..).*/;
+ my $dt = DateTime->new(
+ year => $1,
+ month => $2,
+ day => $3,
+ hour => $4,
+ minute => $5,
+ second => 0,
+ nanosecond => 0,
+ time_zone => 'UTC'
+ );
+
+ # calculate difference
+ my $dt_now = DateTime->now;
+ my $dt_diff = $dt_now - $dt;
+
+ # parse result
+ my $o = $p->format_duration($dt_diff);
+
+ # parse output (FIXME: this is *so* ugly)
+ my $dtstr = "";
+ $o =~ /(\d+)\|(\d+)\|(\d+)\|(\d+)\|(\d+)/;
+ my $Y = int($1);
+ my $m = int($2);
+ my $d = int($3);
+ my $H = int($4);
+ my $M = int($5);
+ if($M) {
+ $dtstr = "$M min ago";
+ }
+ if($H) {
+ if($H == 1) {
+ $dtstr = "$H hour $M min ago";
+ } else {
+ $dtstr = "$H hours $M min ago";
+ }
+ }
+ if($d) {
+ if($d == 1) {
+ $dtstr = "$d day ago";
+ } else {
+ $dtstr = "$d days ago";
+ }
+ }
+ if($m) {
+ if($m == 1) {
+ if($d == 1) {
+ $dtstr = "$m month $d day ago";
+ } else {
+ $dtstr = "$m month $d days ago";
+ }
+ } else {
+ if($d == 1) {
+ $dtstr = "$m months $d day ago";
+ } else {
+ $dtstr = "$m months $d days ago";
+ }
+ }
+ }
+ if($Y) {
+ $dtstr = "on $Y-$m-$d ($H:$M)";
+ }
+
+ return $dtstr;
+}
+
+### SUB: scrapeComments($objectID, $number, $title)
sub scrapeComments {
- my ( $objectID, $number ) = @_;
- my $content = "";
+ my ( $objectID, $number, $title ) = @_;
+ my $content = "$logo\nCOMMENT PAGE FOR:\n \"$title\"\n\n";
if($number) {
my $payload = getApiData("$api_uri/search?tags="."comment,story_$objectID&hitsPerPage=$number");
- $content = scrapeSubComments($payload, $objectID, 0);
+ $content .= scrapeSubComments($payload, $objectID, 0);
} else {
- $content = "No comments available\n";
+ $content .= "No comments available\n";
}
- saveFile($content, "story_$objectID.gph");
+ $content .= "\n[1|<- back to front page|$go_path|server|port]";
+ saveFile($content, "comments_$objectID.gph");
}
### SUB: formatContent($content, $lvl)
t@@ -90,8 +202,8 @@ sub formatContent {
# calculate padding
$Text::Wrap::columns=72-($lvl*2);
while($lvl > 0) {
- $pad=" ".$pad;
- $lvl--;
+ $pad=" ".$pad;
+ $lvl--;
}
# Search for links
t@@ -125,13 +237,21 @@ sub formatContent {
# make sure there are no newlines/extra spaces around [0]
$content_clean =~ s/[\s\n]+\[$c\][\s\n]+/ \[$c\] /g;
+ # fix the [1] [1] situation (FIXME: how to do this properly?)
+ $content_clean =~ s/\[1\][\.:\s\n]+\[1\]/\[1\]/g;
+ $content_clean =~ s/\[2\][\.:\s\n]+\[2\]/\[2\]/g;
+ $content_clean =~ s/\[3\][\.:\s\n]+\[3\]/\[3\]/g;
+ $content_clean =~ s/\[4\][\.:\s\n]+\[3\]/\[4\]/g;
+ $content_clean =~ s/\[5\][\.:\s\n]+\[3\]/\[5\]/g;
+ $content_clean =~ s/ \[\d\] $//g;
+
# shorten links
my $short = $linkitem->{href};
my $l = 62 - length($pad);
if(length($short) > $l) { $short = substr($short,0,$l)."..."; }
# add link to output scalar
- $links .= sprintf("[h|${pad}\\|[%i]: %s|URL:%s|codevoid.de|70]\n", $c, $short, $linkitem->{href});
+ $links .= sprintf("[h|${pad}║ [%i]: %s|URL:%s|codevoid.de|70]\n", $c, $short, $linkitem->{href});
}
}
t@@ -143,12 +263,12 @@ sub formatContent {
$content_clean =~ s/\n\n(\n)*/\n\n/g;
# Add padding to the left
- $content_clean =~ s/^/$pad\|/g;
- $content_clean =~ s/\n/\n$pad\|/g;
+ $content_clean =~ s/^/$pad║ /g;
+ $content_clean =~ s/\n/\n$pad║ /g;
# print links if there were any.
if($links) {
- $content_clean .= "\n$pad\|\n$links";
+ $content_clean .= "\n$pad║ \n$links";
} else {
$content_clean .= "\n";
}
t@@ -171,49 +291,107 @@ sub saveFile {
# rename to temporary file to real file (atomic)
rename("$path/.$filename", "$path/$filename") || die "Cannot rename temporary file: $filename\n";
- print "Debug: saveFile(\$content, $filename);\n\n";
+ #print "Debug: saveFile(\$content, $filename);\n\n";
return 0;
}
### MAIN PROGRAM
-
my ($selected_story) = @ARGV;
-my $json_fp = getApiData("$api_uri/search_by_date?tags=front_page&numericFilters=points>20,num_comments>5&hitsPerPage=100");
-#my $json_fp = getApiData("$api_uri/search?tags=story");
-
my $content = "";
+
+# fetch top story IDs
+my $json_top = getTopStories();
+
+# construct search query
+my $query = "search?hitsPerPage=500&tags=story,(";
+
+# add stories to search query
+my $count = 0;
+for my $id (@$json_top) {
+ $query .="story_$id,";
+ $count++;
+ if($count > $index_count) {
+ last;
+ }
+}
+
+# remove trailing comma and close query
+$query =~ s/,$/\)/g;
+
+# set up background tasks for parallel scraping
+my $pm = new Parallel::ForkManager(50);
+
+my $json_fp = getApiData("$api_uri/$query");
for my $hit ($json_fp->{"hits"}) {
foreach my $story (@$hit) {
+
+ # do everything from here in background
+ $pm->start and next;
+
+ # title is a link, escape "|"
my $title = encode("UTF-8", $story->{'title'});
$title =~ s/\|/\\|/g;
+
+ # URL is either a HTML link line or a gopher dir
my $url = "";
if($story->{'url'}) {
$url = encode("UTF-8", $story->{'url'});
$content .= "[h| $title|URL:$url|server|port]\n";
} else {
- $url = "$go_path/story_$story->{'objectID'}.gph";
+ $url = "$go_path/comments_$story->{'objectID'}.gph";
$content .= "[1| $title|$url|server|port]\n";
}
+
+ #
my $author = encode("UTF-8", $story->{'author'});
+ my $objectID = $story->{'objectID'};
+
+ # parse date
+ my $ago = parseDate($story->{'created_at'});
- $story->{'created_at'} =~ /(....-..-..)T(..:..).*/;
- my $date = $1;
- my $time = $2;
my $number = 0;
if($story->{'num_comments'}) {
$number = $story->{'num_comments'};
}
- $content .= " by $author ($story->{'points'} points) at $time ($date)\n";
- $content .= "[1| read $number comments|$go_path/story_$story->{'objectID'}.gph|server|port]\n";
+ # build content
+ $content .= " by $author ($story->{'points'} points) $ago\n";
+ $content .= "[1| read $number comments|$go_path/comments_$objectID.gph|server|port]\n";
$content .= "\n";
- print "Debug: scrapeComments($story->{'objectID'}, $number);\n";
- scrapeComments($story->{'objectID'}, $number);
+ # Save (if not already done - assuming the story doesn't change)
+ # FIXME: the title could be changed by the staff
+ if (not -e "$go_root$go_path/story_$objectID.gph") {
+ saveFile($content, "story_$objectID.gph");
+ }
+
+ # Fire up the comment scraper
+ #print "Debug: scrapeComments($objectID, $number, $title);\n";
+ scrapeComments($story->{'objectID'}, $number, $title);
+
+    # background task stops here
+ $pm->finish
+ }
+}
+
+# wait for all scraping to be done and all cache files to be present
+$pm->wait_all_children;
+
+# construct index from cached files
+$count = 0;
+my $index_out = "$logo";
+for my $id (@$json_top) {
+ if (-e "$go_root$go_path/story_$id.gph") {
+ open(my $fh, '<', "$go_root$go_path/story_$id.gph");
+ while (my $row = <$fh>) {
+ $index_out .= $row;
+ }
+ close($fh);
}
+ $count++;
+ if($count > $index_count) { last; }
}
-# saving index last to avoid broken links while scraper is running.
-saveFile($content, "index.gph");
+saveFile($index_out, "index.gph");
exit 0;