t* hacker news on gopher
URI git clone git://git.codevoid.de/hn-gopher
DIR Log
DIR Files
DIR Refs
---
DIR commit 58874a778c1585b20a7dbd4696706ad13f248bee
DIR parent 837b822cd29a7435100daf6426e86126dcb02dca
URI Author: Stefan Hagen <sh+git[at]codevoid[dot]de>
Date:   Tue, 31 Jul 2018 21:16:17 +0200

Add code comments, article scraper, pagination

Diffstat:
  M hn-scraper.pl | 363 +++++++++++++++++++++++++------

1 file changed, 292 insertions(+), 71 deletions(-)
---
DIR diff --git a/hn-scraper.pl b/hn-scraper.pl
t@@ -1,17 +1,29 @@
 #!/usr/bin/env perl
+# default
 use strict;
 use warnings;
+
+# parallel processing
 use Parallel::ForkManager;
+
+# date formatting
 use DateTime;
 use DateTime::Duration;
 use DateTime::Format::Duration;
+
+# network
 use LWP::UserAgent;
+
+# protocol transformation
 use JSON;
+use Encode;
+
+# text formatting
+use HTML::FormatText::WithLinks;
 use HTML::LinkExtractor;
 use HTML::Restrict;
 use HTML::Entities;
-use Encode;
 use Text::Wrap;
 $Text::Wrap::columns=72;

t@@ -21,8 +33,11 @@ my $server = "hn.algolia.com";
 my $api_uri = "/api/v1";
 my $go_root = "/srv/codevoid-gopher";
 my $go_path = "/hn";
-my $index_count = 100;
+my $index_count = 20;  # item count per page
+my $total_count = 400; # total item count (all pages)
+my $dumper = 0;        # 1 creates plain text versions

+### CAN HAZ LOGO? SURE!
 my $logo =" _______ __ _______\n";
 $logo .="| | |.---.-..----.| |--..-----..----. | | |.-----..--.--.--..-----.\n";
 $logo .="| || _ || __|| < | -__|| _| | || -__|| | | ||__ --|\n";

t@@ -31,61 +46,94 @@ my $logo =" _______ __ _______\n";
 $logo .= "[h|Visit Hacker News on the Internet|URL:https://news.ycombinator.com|server|port]\n\n";

 ### FUNCTIONS
-### SUB: $json = getTopStories();
+
+# SUB: $json = getTopStories();
+# fetch the top stories from the firebase API. This API only returns the
+# IDs of the stories that are currently on the front page, in rank order.
 sub getTopStories {
     # FIXME make this configurable, maybe.
-    #print "Debug: getTopStories($protocol://hacker-news.firebaseio.com/v0/topstories.json)\n";
+    # yes, this duplicates the code in getApiData()
     my $REST= ({HOST => "hacker-news.firebaseio.com",
-                URL => "$protocol://hacker-news.firebaseio.com/v0/topstories.json" });
+                URL => "https://hacker-news.firebaseio.com/v0/topstories.json" });
     $REST->{UA} = LWP::UserAgent->new(keep_alive => 0, timeout => 30);
     $REST->{UA}->agent("codevoid-hackernews-gopherproxy/0.1");
     $REST->{resource} = $REST->{URL};
     $REST->{request} = HTTP::Request->new( GET => $REST->{resource} );
     $REST->{response} = $REST->{UA}->request( $REST->{request} );
+
+    # we're not giving up
     if(not $REST->{response}->is_success()) {
-        my $delay = 5;
-        #print "Debug: Got \"", $REST->{response}->status_line, "\" trying again in $delay seconds...\n";
-        sleep $delay;
+        sleep 5;
        return getTopStories();
     }
+
     return decode_json($REST->{response}->content);
 }

-### SUB: $json = getApiData("/api/...");
+# SUB: $json = getApiData("/api/...");
+# this call returns stories and comments. The nice thing about it is that
+# it can provide all comments for a story in one call.
+# OPTIMIZE: right now, the story and comments are fetched separately. This
+# could be combined in one call.
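+#
+# A sketch of such a combined call. Hedged: the "(a,b)" OR syntax is from
+# the Algolia docs, but whether the story item itself carries its own
+# story_<ID> tag is an assumption that would need testing:
+#
+#   my $payload = getApiData("$api_uri/search?tags=(story,comment),story_$objectID"
+#                           ."&hitsPerPage=".($number + 1));
+#
+# One response would then hold the story hit plus all of its comment hits.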
 sub getApiData {
     my ( $uri ) = @_;
-    #print "Debug: getApiData($protocol://$server$uri)\n";
+
     my $REST= ({HOST => "$server", URL => "$protocol://$server$uri" });
+
     $REST->{UA} = LWP::UserAgent->new(keep_alive => 0, timeout => 30);
     $REST->{UA}->agent("codevoid-hackernews-gopherproxy/0.1");
     $REST->{resource} = $REST->{URL};
     $REST->{request} = HTTP::Request->new( GET => $REST->{resource} );
     $REST->{response} = $REST->{UA}->request( $REST->{request} );
+
+    # we're not giving up
     if(not $REST->{response}->is_success()) {
-        #print "Debug: Got \"", $REST->{response}->status_line, "\" trying again in 2 seconds...\n";
         sleep 2;
         return getApiData ( $uri );
-    }
+    }
+
     return decode_json($REST->{response}->content);
 }

-### SUB: $gph = scrapeSubComments($payload, $parentID, $lvl)
+# SUB: $gph = scrapeSubComments($payload, $parentID, $lvl)
+# recursive comment scraper
+# this sub searches the payload for a comment whose parent is the incoming
+# parentID and adds it to $output. It then calls itself again with the ID
+# of the found comment and an increased indent level, descending the
+# comment hierarchy.
+#
+# If no more comments are found for the supplied ID, it decreases the
+# indent level and returns to the previous invocation.
 sub scrapeSubComments {
     my ( $payload, $parentID, $lvl ) = @_;
+
+    # search for comment
     my $output = "";
     for my $hit ($payload->{"hits"}) {
         foreach my $comment (@$hit) {
+
+            # comment is found, add to output
             if ($comment->{'parent_id'} == $parentID) {
-                my $text = encode("UTF-8", $comment->{'comment_text'});
-                my $author = encode("UTF-8", $comment->{'author'});
-                my $objectID = $comment->{'objectID'};
-                my $ago = parseDate($comment->{'created_at'});
+
+                # format data
+                my $text   = encode("UTF-8", $comment->{'comment_text'});
+                my $author = encode("UTF-8", $comment->{'author'});
+                my $ago    = parseDate($comment->{'created_at'});
+
+                # add to output
                 $output .= formatContent("$author wrote $ago:", $lvl);
                 $output .= formatContent("$text", $lvl)."\n";
-                $output .= scrapeSubComments( $payload, $objectID, ++$lvl );
+
+                # invoke itself with the objectID and traverse down the hierarchy
+                $output .= scrapeSubComments( $payload, $comment->{'objectID'}, ++$lvl );
+
+                # decrease indentation level
                 $lvl--;
             }
         }

t@@ -93,7 +141,11 @@ sub scrapeSubComments {
     return $output;
 }

-### SUB: $datestr = parseDate($datestring)
+# SUB: $datestr = parseDate($datestring)
+# takes something like 2018-04-23T23:45Z002 and converts it to a relative,
+# human-readable notation like "4 days ago".
+# OPTIMIZE: DateTime::Format::Duration can parse with a pattern; that
+# would probably be simpler and faster.
 sub parseDate {
     my ( $datestring ) = @_;

t@@ -171,42 +223,147 @@ sub parseDate {
     return $dtstr;
 }

-### SUB: scrapeComments($objectID, $number, $title)
+# SUB: scrapeComments($objectID, $number, $link)
+# this sets up the comment page frame. The content is added by hierarchical
+# scrapeSubComments() calls.
 sub scrapeComments {
-    my ( $objectID, $number, $title ) = @_;
-    my $content = "$logo\nCOMMENT PAGE FOR:\n \"$title\"\n\n";
+    my ( $objectID, $number, $link ) = @_;
+
+    # set header
+    my $content = "$logo\nCOMMENT PAGE FOR:\n$link\n\n";
+
+    # $number is the comment count. If it is zero, the API call can be
+    # skipped.
     if($number) {
+        # call API to receive all comments. The previous story call already
+        # supplied the comment count, so hitsPerPage can fetch the whole
+        # thread in one request.
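+        # As an illustration (story ID and count made up): for a story
+        # 17654321 with 42 comments, the request built below becomes
+        #   /api/v1/search?tags=comment,story_17654321&hitsPerPage=42
+        # and a single response page carries the full thread.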
         my $payload = getApiData("$api_uri/search?tags="."comment,story_$objectID&hitsPerPage=$number");
+
+        # invoke the hierarchical scraper and hand over the payload
+        # (everything happens in memory from here on)
         $content .= scrapeSubComments($payload, $objectID, 0);
     } else {
+        # the previous call indicated 0 comments.
         $content .= "No comments available\n";
     }
+
+    # all comments have been added to the page. Add footer and save file.
     $content .= "\n[1|<- back to front page|$go_path|server|port]";
     saveFile($content, "comments_$objectID.gph");
 }

+# SUB: $url = isHtml($url)
+# this sub checks a given URL by performing a HEAD request. If the response
+# is of type text/html, it returns the final URL (after redirects);
+# otherwise 0.
+sub isHtml {
+    my ( $url ) = @_;
+
+    # perform HEAD request
+    my $ua = LWP::UserAgent->new(keep_alive => 0, timeout => 30);
+    $ua->agent("codevoid-hackernews-gopherproxy/0.1");
+    my $req = HTTP::Request->new(HEAD => $url);
+    $req->header('Accept' => 'text/html');
+    my $resp = $ua->request($req);
+
+    # check content type
+    if ($resp->is_success && ($resp->content_type =~ m/text\/html/)) {
+        return $resp->request()->uri();
+    }
+
+    return 0;
+}

+# SUB: dumpArticle($url, $objectID)
+# This sub downloads webpages and converts them into a plain text format
+# that can be served on gopher. Once an article has been converted, it is
+# not downloaded again.
+# OPTIMIZE: For some pages this works great, for others it does not. Custom
+# preprocessing steps could be added to strip out navigation, footers,
+# excessive ads and other non-relevant data, on a per-domain basis.
+# (this could be a separate program, reusable in other projects)
+sub dumpArticle {
+    my ( $url, $objectID ) = @_;
+
+    # is it cached? return.
+    if (-e "$go_root$go_path/article_$objectID.gph") {
+        return 0;
+    }
+
+    # content type check (isHtml returns the final URL for html, 0 otherwise)
+    $url = isHtml($url);
+    if(!$url) {
+        print "Skipping (not html)\n";
+
+        # the supplied URL is not html; no plaintext link will be added
+        # on the front page.
+        return 1;
+    }
+
+    # we got html, let's download it
+    my $ua = LWP::UserAgent->new;
+    my $req = HTTP::Request->new(GET => $url);
+    my $resp = $ua->request($req);
+
+    if ($resp->is_success) {
+
+        # OPTIMIZE: this would be the place to modify the HTML
+        # in $resp->decoded_content
+
+        # call successful - convert it to text
+        my $f = HTML::FormatText::WithLinks->new(anchor_links => 0, unique_links => 1, base => "$url");
+        my $message = $f->parse($resp->decoded_content);
+
+        # wrap it to 72 characters (will destroy link lists)
+        #$Text::Wrap::columns=72;
+        #$message = wrap("","",$message);

+        # shrink multiple newlines, escape characters that upset geomyidae
+        $message =~ s/\n\n(\n)*/\n\n/g;
+        $message =~ s/\t/ /g;
+        $message =~ s/\nt/\ntt/g;
+
+        # save to file
+        saveFile($message, "article_$objectID.gph");
+    } else {
+        # the call was unsuccessful and we're not trying again here; it
+        # will be repeated on the next scraper run. Returning 1 means no
+        # link to this file is added on the front page.
+        return 1;
+    }
+
+    # no complaints, add the link to this article.
+    return 0;
+}

 ### SUB: formatContent($content, $lvl)
+# This is the comment page formatter. It takes text and an indentation
+# level and puts it nicely on the page, with a level bar on the left.
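+#
+# A made-up illustration of the intended result (exact spacing assumed;
+# each reply level shifts the bar two columns to the right):
+#
+#   ║ user1 wrote 2 hours ago:
+#   ║ Parent comment text, wrapped at 72 columns, see [1]
+#   ║
+#   ║ [1]: https://example.com/
+#     ║ user2 wrote 1 hour ago:
+#     ║ Reply text...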
 sub formatContent {
     my ( $content, $lvl ) = @_;
+
+    # decode HTML entities
     $content = decode_entities($content);

     # remove trailing space before wrapping
     $content =~ s/ $/\n/g;

+    # handle crazy indent levels that would leave no
+    # room for text on the right side
     my $pad="";
     if($lvl > 20) { $pad = "$lvl> "; $lvl = 19; }

-    # calculate padding
+    # set up the text wrapper to wrap at 72 minus the indent level;
+    # each level adds or removes two columns
     $Text::Wrap::columns=72-($lvl*2);
+
+    # calculate the spaces to add on the left side,
+    # based on the reply/indent level.
     while($lvl > 0) { $pad="  ".$pad; $lvl--; }

-    # Search for links
+    # Search for links in comments
     my $LX = new HTML::LinkExtractor();
     $LX->strip(1);
     $LX->parse(\$content);

t@@ -215,8 +372,8 @@
     my $HR = HTML::Restrict->new();
     $content =~ s/<p>/\n\n/g;
     $content =~ s/<li>/\n\n\* /g;
-    $content =~ s/<blockquote>/\n\n--- QUOTE ---\n/g;
-    $content =~ s/<\/blockquote>/\n---- END ----\n\n/g;
+
+    # strip remaining HTML tags
     my $content_clean = $HR->process($content);

     # nobody needs more than one newline.

t@@ -230,8 +387,11 @@
         # skip empty links (image links for example)
         if(!$linkitem->{_TEXT}) { next; }
-
+
+        # link found, increase counter
         $c++;
+
+        # replace link text with [$counter]
         $content_clean =~ s/(\Q$linkitem->{_TEXT}\E)/ \[$c\] /g;

         # make sure there are no newlines/extra spaces around [0]

t@@ -245,7 +405,7 @@
         $content_clean =~ s/\[5\][\.:\s\n]+\[3\]/\[5\]/g;
         $content_clean =~ s/ \[\d\] $//g;

-        # shorten links
+        # shorten links that are too long for the indent level
         my $short = $linkitem->{href};
         my $l = 62 - length($pad);
         if(length($short) > $l) { $short = substr($short,0,$l)."..."; }

t@@ -255,7 +415,6 @@
         }
     }

-    # Wrap content 72 - padding
     $content_clean = wrap("","",$content_clean);

t@@ -266,13 +425,14 @@
     $content_clean =~ s/^/$pad║ /g;
     $content_clean =~ s/\n/\n$pad║ /g;

-    # print links if there were any.
+    # append the collected links, if any
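+    # ($links was filled by the link loop above; the lone "$pad║ " line
+    # below separates the comment text from its link list)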
     if($links) {
         $content_clean .= "\n$pad║ \n$links";
     } else {
         $content_clean .= "\n";
     }

+    # escape characters that confuse geomyidae (tabs and lines
+    # starting with "t")
     $content_clean =~ s/\t/ /g;
     $content_clean =~ s/\nt/\ntt/g;

t@@ -311,7 +471,7 @@ my $count = 0;
 for my $id (@$json_top) {
     $query .="story_$id,";
     $count++;
-    if($count > $index_count) {
+    if($count >= $total_count) {
         last;
     }
 }

t@@ -319,81 +479,142 @@
 # remove trailing comma and close query
 $query =~ s/,$/\)/g;

+# fetch the story data for the collected IDs from the Algolia API
+my $topStoryList = getApiData("$api_uri/$query");
+
 # set up background tasks for parallel scraping
-my $pm = new Parallel::ForkManager(10);
+my $pm = new Parallel::ForkManager(50);

-my $json_fp = getApiData("$api_uri/$query");
-for my $hit ($json_fp->{"hits"}) {
+# scrape story header and comments
+for my $hit ($topStoryList->{"hits"}) {
     foreach my $story (@$hit) {

-        # do everything from here in background
+        # do everything from here on in background
         $pm->start and next;

+        # convenient variables
+        my $objectID = $story->{'objectID'};
+        my $author   = encode("UTF-8", $story->{'author'});
+        my $title    = encode("UTF-8", $story->{'title'});
+        my $url      = encode("UTF-8", $story->{'url'});
+
+        # comment count (default to 0)
+        my $number = 0;
+        if($story->{'num_comments'}) {
+            $number = $story->{'num_comments'};
+        }
+
+        # parse date and convert it to a relative notation (5 min ago)
+        my $ago = parseDate($story->{'created_at'});
+
         # title is a link, escape "|"
-        my $title = encode("UTF-8", $story->{'title'});
         $title =~ s/\|/\\|/g;

         # URL is either an HTML link line or a gopher dir
-        my $url = "";
-        if($story->{'url'}) {
-            $url = encode("UTF-8", $story->{'url'});
-            $content .= "[h| $title|URL:$url|server|port]\n";
+        my $link;
+        if($url) {
+            # link goes HTTP
+            $link = "[h| $title|URL:$url|server|port]\n";
+
+            # is the article dumper active?
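+            # (dumpArticle returns 0 when a text dump exists or was just
+            # created, and 1 when the URL was skipped - so only dumped
+            # articles get a plaintext link)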
+            if($dumper == 1) {
+                if(dumpArticle($url, $objectID) == 0) {
+                    $link .= "[1| plaintext version|$go_path/article_$objectID.gph|server|port]\n";
+                }
+            }
+
         } else {
-            $url = "$go_path/comments_$story->{'objectID'}.gph";
-            $content .= "[1| $title|$url|server|port]\n";
+            # link goes GOPHER (redefine URL to the comment page [Ask HN])
+            $url = "$go_path/comments_$story->{'objectID'}.gph";
+            $link = "[1| $title|$url|server|port]\n";
         }

-        #
-        my $author = encode("UTF-8", $story->{'author'});
-        my $objectID = $story->{'objectID'};
-
-        # parse date
-        my $ago = parseDate($story->{'created_at'});
+        # add title link line
+        $content .= $link;

-        my $number = 0;
-        if($story->{'num_comments'}) {
-            $number = $story->{'num_comments'};
-        }
-
-        # build content
+        # add author line
         $content .= " by $author ($story->{'points'} points) $ago\n";
+
+        # add comment link line
         $content .= "[1| read $number comments|$go_path/comments_$objectID.gph|server|port]\n";
+
+        # aaaand one blank line
         $content .= "\n";

-        # Save (if not already done - assuming the story doesn't change)
-        # FIXME: the title could be changed by the staff
-        if (not -e "$go_root$go_path/story_$objectID.gph") {
-            saveFile($content, "story_$objectID.gph");
-        }
+        # Save story file
+        saveFile($content, "story_$objectID.gph");

         # Fire up the comment scraper
-        #print "Debug: scrapeComments($objectID, $number, $title);\n";
-        scrapeComments($story->{'objectID'}, $number, $title);
+        scrapeComments($story->{'objectID'}, $number, $link);

-        # background task stopps here
-        $pm->finish
+        # background task stops here
+        $pm->finish;
     }
 }

 # wait for all scraping to be done and all cache files to be present
 $pm->wait_all_children;

-# construct index from cached files
+# construct index
 $count = 0;
+
+# setup pagination variables
+my $page = 1;
+my $nextpage;
+my $prevpage;
+my $filename;
+
+# initialize output variable
 my $index_out = "$logo";
+
+# loop over all top stories (to keep the ranking order)
 for my $id (@$json_top) {
+
+    # append the story files
     if (-e "$go_root$go_path/story_$id.gph") {
         open(my $fh, '<', "$go_root$go_path/story_$id.gph");
-        while (my $row = <$fh>) {
-            $index_out .= $row;
-        }
+        while (my $row = <$fh>) { $index_out .= $row; }
         close($fh);
     }
+
+    # increase story counter
     $count++;
-
-    # OPTIMIZE: Add pagignation? (who goes to page 2 anyway...)
-    if($count > $index_count) { last; }
+
+    # Pagination
+    if(($count % $index_count) == 0) {
+
+        # setup defaults
+        $filename = "index-$page.gph";
+        $nextpage = $page + 1;
+        $prevpage = $page - 1;
+
+        # special handling for first page (different name)
+        if($page == 1) {
+            $filename = "index.gph";
+            $index_out .= "[1| Next Page ($nextpage) >>|$go_path/index-$nextpage.gph|server|port]\n\n";
+            $index_out .= "[1|<< Back Home|/|server|port]";
+        } else {
+            $index_out .= "[1| Next Page ($nextpage) >>|$go_path/index-$nextpage.gph|server|port]";
+        }
+
+        # increase page counter
+        $page++;
+
+        # done, save file, proceed with next page
+        saveFile($index_out, $filename);
+
+        # initialize $index_out for the next page
+        $index_out = "$logo";
+
+    } else {
+
+        # handle the last, partial page (only reached when the story limit
+        # falls between page boundaries)
+        if ( $count >= $total_count ) {
+            $filename = "index-$page.gph";
+            $prevpage = $page - 1;
+            $index_out .= "[1| << Prev Page ($prevpage) |$go_path/index-$prevpage.gph|server|port]";
+            saveFile($index_out, $filename);
+            last;
+        }
+    }
+}

-$index_out .= "\n[1|<- go back home|/|server|port]";
-saveFile($index_out, "index.gph");
 exit 0;
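
The link lines written throughout this script use the geomyidae .gph
syntax "[type|display text|selector|server|port]", where the literal
words "server" and "port" are replaced by geomyidae at serve time. Two
made-up examples matching the code above:

  [h| Story Title|URL:https://example.com/post|server|port]
  [1| read 42 comments|/hn/comments_17654321.gph|server|port]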