t* hacker news on gopher URI git clone git://git.codevoid.de/hn-gopher DIR Log DIR Files DIR Refs --- DIR commit 74ea59bf91cf72e4d5e82ebd8bc852b7546c1a0a DIR parent 6533286bc1276e7584436c98fa4f88251da82bf9 URI Author: Stefan Hagen <sh+git[at]codevoid[dot]de> Date: Mon, 30 Jul 2018 21:43:49 +0200 big update - pretty bars - pretty dates - parallel scraping - proper front page top stories - configurable front page story count - link parser update (still a bit wonky) - story file cache Diffstat: M hn-scraper.pl | 252 ++++++++++++++++++++++++++----- 1 file changed, 215 insertions(+), 37 deletions(-) --- DIR diff --git a/hn-scraper.pl b/hn-scraper.pl t@@ -2,6 +2,10 @@ use strict; use warnings; +use Parallel::ForkManager; +use DateTime; +use DateTime::Duration; +use DateTime::Format::Duration; use LWP::UserAgent; use JSON; use HTML::LinkExtractor; t@@ -10,21 +14,48 @@ use HTML::Entities; use Encode; use Text::Wrap; $Text::Wrap::columns=72; -use Data::Dumper; ### CONFIGURATION -my $protocol = "https"; -my $server = "hn.algolia.com"; -my $api_uri = "/api/v1"; -my $go_root = "/srv/codevoid-gopher"; -my $go_path = "/hn"; - +my $protocol = "https"; +my $server = "hn.algolia.com"; +my $api_uri = "/api/v1"; +my $go_root = "/srv/codevoid-gopher"; +my $go_path = "/hn"; +my $index_count = 60; + +my $logo =" _______ __ _______\n"; + $logo .="| | |.---.-..----.| |--..-----..----. | | |.-----..--.--.--..-----.\n"; + $logo .="| || _ || __|| < | -__|| _| | || -__|| | | ||__ --|\n"; + $logo .="|___|___||___._||____||__|__||_____||__| |__|____||_____||________||_____|\n"; + $logo .=" on Gopher (inofficial)\n"; + $logo .= "[h|Visit Hacker News on the Internet|URL:https://news.ycombinator.com|server|port]\n\n"; ### FUNCTIONS +### SUB: $json = getTopStories(); +sub getTopStories { + # FIXME make this configurable, maybe. + #print "Debug: getTopStories($protocol://hacker-news.firebaseio.com/v0/topstories.json)\n"; + my $REST= ({HOST => "hacker-news.firebaseio.com", + URL => "$protocol://hacker-news.firebaseio.com/v0/topstories.json" }); + $REST->{UA} = LWP::UserAgent->new(keep_alive => 0, timeout => 30); + $REST->{UA}->agent("codevoid-hackernews-gopherproxy/0.1"); + $REST->{resource} = $REST->{URL}; + $REST->{request} = HTTP::Request->new( GET => $REST->{resource} ); + $REST->{response} = $REST->{UA}->request( $REST->{request} ); + if(not $REST->{response}->is_success()) { + my $delay = 0.5; + #print "Debug: Got \"", $REST->{response}->status_line, "\" trying again in $delay seconds...\n"; + sleep $delay; + return getTopStories(); + } + return decode_json($REST->{response}->content); +} + + ### SUB: $json = getApiData("/api/..."); sub getApiData { my ( $uri ) = @_; - print "Debug: getApiData($protocol://$server$uri)\n"; + #print "Debug: getApiData($protocol://$server$uri)\n"; my $REST= ({HOST => "$server", URL => "$protocol://$server$uri" }); $REST->{UA} = LWP::UserAgent->new(keep_alive => 0, timeout => 30); t@@ -33,13 +64,14 @@ sub getApiData { $REST->{request} = HTTP::Request->new( GET => $REST->{resource} ); $REST->{response} = $REST->{UA}->request( $REST->{request} ); if(not $REST->{response}->is_success()) { - print "Debug: Got \"", $REST->{response}->status_line, "\" trying again in 2 seconds...\n"; + #print "Debug: Got \"", $REST->{response}->status_line, "\" trying again in 2 seconds...\n"; sleep 2; return getApiData ( $uri ); } return decode_json($REST->{response}->content); } + ### SUB: $gph = scrapeSubComments($payload, $parentID, $lvl) sub scrapeSubComments { my ( $payload, $parentID, $lvl ) = @_; t@@ -50,7 +82,8 @@ sub scrapeSubComments { my $text = encode("UTF-8", $comment->{'comment_text'}); my $author = encode("UTF-8", $comment->{'author'}); my $objectID = $comment->{'objectID'}; - $output .= formatContent("$author:", $lvl); + my $ago = parseDate($comment->{'created_at'}); + $output .= formatContent("$author wrote $ago:", $lvl); $output .= formatContent("$text", $lvl)."\n"; $output .= scrapeSubComments( $payload, $objectID, ++$lvl ); $lvl--; t@@ -60,17 +93,96 @@ sub scrapeSubComments { return $output; } -### SUB: scrapeComments($objectID, $number) +### SUB: $datestr = parseDate($datestring) +sub parseDate { + my ( $datestring ) = @_; + + # set output (parse) pattern + my $p = DateTime::Format::Duration->new( + pattern => '%Y|%m|%e|%H|%M', + normalize => 1 + ); + + # FIXME: DateTime::Duration can do the parsing + # parse string and create datetime object + $datestring =~ /(....)-(..)-(..)T(..):(..).*/; + my $dt = DateTime->new( + year => $1, + month => $2, + day => $3, + hour => $4, + minute => $5, + second => 0, + nanosecond => 0, + time_zone => 'UTC' + ); + + # calculate difference + my $dt_now = DateTime->now; + my $dt_diff = $dt_now - $dt; + + # parse result + my $o = $p->format_duration($dt_diff); + + # parse output (FIXME: this is *so* ugly) + my $dtstr = ""; + $o =~ /(\d+)\|(\d+)\|(\d+)\|(\d+)\|(\d+)/; + my $Y = int($1); + my $m = int($2); + my $d = int($3); + my $H = int($4); + my $M = int($5); + if($M) { + $dtstr = "$M min ago"; + } + if($H) { + if($H == 1) { + $dtstr = "$H hour $M min ago"; + } else { + $dtstr = "$H hours $M min ago"; + } + } + if($d) { + if($d == 1) { + $dtstr = "$d day ago"; + } else { + $dtstr = "$d days ago"; + } + } + if($m) { + if($m == 1) { + if($d == 1) { + $dtstr = "$m month $d day ago"; + } else { + $dtstr = "$m month $d days ago"; + } + } else { + if($d == 1) { + $dtstr = "$m months $d day ago"; + } else { + $dtstr = "$m months $d days ago"; + } + } + } + if($Y) { + $dtstr = "on $Y-$m-$d ($H:$M)"; + } + + return $dtstr; +} + +### SUB: scrapeComments($objectID, $number, $title) sub scrapeComments { - my ( $objectID, $number ) = @_; - my $content = ""; + my ( $objectID, $number, $title ) = @_; + my $content = "$logo\nCOMMENT PAGE FOR:\n \"$title\"\n\n"; if($number) { my $payload = getApiData("$api_uri/search?tags="."comment,story_$objectID&hitsPerPage=$number"); - $content = scrapeSubComments($payload, $objectID, 0); + $content .= scrapeSubComments($payload, $objectID, 0); } else { - $content = "No comments available\n"; + $content .= "No comments available\n"; } - saveFile($content, "story_$objectID.gph"); + $content .= "\n[1|<- back to front page|$go_path|server|port]"; + saveFile($content, "comments_$objectID.gph"); } ### SUB: formatContent($content, $lvl) t@@ -90,8 +202,8 @@ sub formatContent { # calculate padding $Text::Wrap::columns=72-($lvl*2); while($lvl > 0) { - $pad=" ".$pad; - $lvl--; + $pad=" ".$pad; + $lvl--; } # Search for links t@@ -125,13 +237,21 @@ sub formatContent { # make sure there are no newlines/extra spaces around [0] $content_clean =~ s/[\s\n]+\[$c\][\s\n]+/ \[$c\] /g; + # fix the [1] [1] situation (FIXME: how to do this properly?) + $content_clean =~ s/\[1\][\.:\s\n]+\[1\]/\[1\]/g; + $content_clean =~ s/\[2\][\.:\s\n]+\[2\]/\[2\]/g; + $content_clean =~ s/\[3\][\.:\s\n]+\[3\]/\[3\]/g; + $content_clean =~ s/\[4\][\.:\s\n]+\[3\]/\[4\]/g; + $content_clean =~ s/\[5\][\.:\s\n]+\[3\]/\[5\]/g; + $content_clean =~ s/ \[\d\] $//g; + # shorten links my $short = $linkitem->{href}; my $l = 62 - length($pad); if(length($short) > $l) { $short = substr($short,0,$l)."..."; } # add link to output scalar - $links .= sprintf("[h|${pad}\\|[%i]: %s|URL:%s|codevoid.de|70]\n", $c, $short, $linkitem->{href}); + $links .= sprintf("[h|${pad}║ [%i]: %s|URL:%s|codevoid.de|70]\n", $c, $short, $linkitem->{href}); } } t@@ -143,12 +263,12 @@ sub formatContent { $content_clean =~ s/\n\n(\n)*/\n\n/g; # Add padding to the left - $content_clean =~ s/^/$pad\|/g; - $content_clean =~ s/\n/\n$pad\|/g; + $content_clean =~ s/^/$pad║ /g; + $content_clean =~ s/\n/\n$pad║ /g; # print links if there were any. if($links) { - $content_clean .= "\n$pad\|\n$links"; + $content_clean .= "\n$pad║ \n$links"; } else { $content_clean .= "\n"; } t@@ -171,49 +291,107 @@ sub saveFile { # rename to temporary file to real file (atomic) rename("$path/.$filename", "$path/$filename") || die "Cannot rename temporary file: $filename\n"; - print "Debug: saveFile(\$content, $filename);\n\n"; + #print "Debug: saveFile(\$content, $filename);\n\n"; return 0; } ### MAIN PROGRAM - my ($selected_story) = @ARGV; -my $json_fp = getApiData("$api_uri/search_by_date?tags=front_page&numericFilters=points>20,num_comments>5&hitsPerPage=100"); -#my $json_fp = getApiData("$api_uri/search?tags=story"); - my $content = ""; + +# fetch top story IDs +my $json_top = getTopStories(); + +# construct search query +my $query = "search?hitsPerPage=500&tags=story,("; + +# add stories to search query +my $count = 0; +for my $id (@$json_top) { + $query .="story_$id,"; + $count++; + if($count > $index_count) { + last; + } +} + +# remove trailing comma and close query +$query =~ s/,$/\)/g; + +# set up background tasks for parallel scraping +my $pm = new Parallel::ForkManager(50); + +my $json_fp = getApiData("$api_uri/$query"); for my $hit ($json_fp->{"hits"}) { foreach my $story (@$hit) { + + # do everything from here in background + $pm->start and next; + + # title is a link, escape "|" my $title = encode("UTF-8", $story->{'title'}); $title =~ s/\|/\\|/g; + + # URL is either a HTML link line or a gopher dir my $url = ""; if($story->{'url'}) { $url = encode("UTF-8", $story->{'url'}); $content .= "[h| $title|URL:$url|server|port]\n"; } else { - $url = "$go_path/story_$story->{'objectID'}.gph"; + $url = "$go_path/comments_$story->{'objectID'}.gph"; $content .= "[1| $title|$url|server|port]\n"; } + + # my $author = encode("UTF-8", $story->{'author'}); + my $objectID = $story->{'objectID'}; + + # parse date + my $ago = parseDate($story->{'created_at'}); - $story->{'created_at'} =~ /(....-..-..)T(..:..).*/; - my $date = $1; - my $time = $2; my $number = 0; if($story->{'num_comments'}) { $number = $story->{'num_comments'}; } - $content .= " by $author ($story->{'points'} points) at $time ($date)\n"; - $content .= "[1| read $number comments|$go_path/story_$story->{'objectID'}.gph|server|port]\n"; + # build content + $content .= " by $author ($story->{'points'} points) $ago\n"; + $content .= "[1| read $number comments|$go_path/comments_$objectID.gph|server|port]\n"; $content .= "\n"; - print "Debug: scrapeComments($story->{'objectID'}, $number);\n"; - scrapeComments($story->{'objectID'}, $number); + # Save (if not already done - assuming the story doesn't change) + # FIXME: the title could be changed by the staff + if (not -e "$go_root$go_path/story_$objectID.gph") { + saveFile($content, "story_$objectID.gph"); + } + + # Fire up the comment scraper + #print "Debug: scrapeComments($objectID, $number, $title);\n"; + scrapeComments($story->{'objectID'}, $number, $title); + + # background task stopps here + $pm->finish + } +} + +# wait for all scraping be done and all cache files be present +$pm->wait_all_children; + +# construct index from cached files +$count = 0; +my $index_out = "$logo"; +for my $id (@$json_top) { + if (-e "$go_root$go_path/story_$id.gph") { + open(my $fh, '<', "$go_root$go_path/story_$id.gph"); + while (my $row = <$fh>) { + $index_out .= $row; + } + close($fh); } + $count++; + if($count > $index_count) { last; } } -# saving index last to avoid broken links while scraper is running. -saveFile($content, "index.gph"); +saveFile($index_out, "index.gph"); exit 0;