t* hacker news on gopher
URI git clone git://git.codevoid.de/hn-gopher
DIR Log
DIR Files
DIR Refs
---
DIR commit 58874a778c1585b20a7dbd4696706ad13f248bee
DIR parent 837b822cd29a7435100daf6426e86126dcb02dca
URI Author: Stefan Hagen <sh+git[at]codevoid[dot]de>
Date: Tue, 31 Jul 2018 21:16:17 +0200
Add code comments, article scraper, pagination
Diffstat:
M hn-scraper.pl | 363 +++++++++++++++++++++++++------
1 file changed, 292 insertions(+), 71 deletions(-)
---
DIR diff --git a/hn-scraper.pl b/hn-scraper.pl
t@@ -1,17 +1,29 @@
#!/usr/bin/env perl
+# pragmas
use strict;
use warnings;
+
+# parallel processing
use Parallel::ForkManager;
+
+# date formatting
use DateTime;
use DateTime::Duration;
use DateTime::Format::Duration;
+
+# network
use LWP::UserAgent;
+
+# protocol transformation
use JSON;
+use Encode;
+
+# text formatting
+use HTML::FormatText::WithLinks;
use HTML::LinkExtractor;
use HTML::Restrict;
use HTML::Entities;
-use Encode;
use Text::Wrap;
$Text::Wrap::columns=72;
t@@ -21,8 +33,11 @@ my $server = "hn.algolia.com";
my $api_uri = "/api/v1";
my $go_root = "/srv/codevoid-gopher";
my $go_path = "/hn";
-my $index_count = 100;
+my $index_count = 20; # item count per page
+my $total_count = 400; # total item count (all pages)
+my $dumper = 0; # 1 creates plain text versions
+### CAN HAZ LOGO? SURE!
my $logo =" _______ __ _______\n";
$logo .="| | |.---.-..----.| |--..-----..----. | | |.-----..--.--.--..-----.\n";
$logo .="| || _ || __|| < | -__|| _| | || -__|| | | ||__ --|\n";
t@@ -31,61 +46,94 @@ my $logo =" _______ __ _______\n";
$logo .= "[h|Visit Hacker News on the Internet|URL:https://news.ycombinator.com|server|port]\n\n";
### FUNCTIONS
-### SUB: $json = getTopStories();
+
+# SUB: $json = getTopStories();
+# read the top stories supplied by the firebase API. This endpoint only returns
+# the IDs of the stories that are currently on the front page, in ranking order.
sub getTopStories {
# FIXME make this configurable, maybe.
- #print "Debug: getTopStories($protocol://hacker-news.firebaseio.com/v0/topstories.json)\n";
+ # yes, this is duplicate code of getApiData()
my $REST= ({HOST => "hacker-news.firebaseio.com",
- URL => "$protocol://hacker-news.firebaseio.com/v0/topstories.json" });
+ URL => "https://hacker-news.firebaseio.com/v0/topstories.json" });
$REST->{UA} = LWP::UserAgent->new(keep_alive => 0, timeout => 30);
$REST->{UA}->agent("codevoid-hackernews-gopherproxy/0.1");
$REST->{resource} = $REST->{URL};
$REST->{request} = HTTP::Request->new( GET => $REST->{resource} );
$REST->{response} = $REST->{UA}->request( $REST->{request} );
+
+ # we're not giving up
if(not $REST->{response}->is_success()) {
- my $delay = 5;
- #print "Debug: Got \"", $REST->{response}->status_line, "\" trying again in $delay seconds...\n";
- sleep $delay;
+ sleep 5;
return getTopStories();
}
+
return decode_json($REST->{response}->content);
}
-### SUB: $json = getApiData("/api/...");
+# SUB: $json = getApiData("/api/...");
+# this call returns stories and comments. The nice thing about it is that it
+# can provide all comments for a story in one call.
+# OPTIMIZE: right now, the story and comments are fetched separately. This
+# could be combined in one call.
sub getApiData {
my ( $uri ) = @_;
- #print "Debug: getApiData($protocol://$server$uri)\n";
+
my $REST= ({HOST => "$server",
URL => "$protocol://$server$uri" });
+
$REST->{UA} = LWP::UserAgent->new(keep_alive => 0, timeout => 30);
$REST->{UA}->agent("codevoid-hackernews-gopherproxy/0.1");
$REST->{resource} = $REST->{URL};
$REST->{request} = HTTP::Request->new( GET => $REST->{resource} );
$REST->{response} = $REST->{UA}->request( $REST->{request} );
+
+ # we're not giving up
if(not $REST->{response}->is_success()) {
- #print "Debug: Got \"", $REST->{response}->status_line, "\" trying again in 2 seconds...\n";
sleep 2;
return getApiData ( $uri );
- }
+ }
+
return decode_json($REST->{response}->content);
}
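A combined call like the OPTIMIZE note above describes could use the API's items
endpoint, which returns a story together with its nested comments in a single
response (<storyID> is a placeholder, not a value from this commit):

    $protocol://$server$api_uri/items/<storyID>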
-### SUB: $gph = scrapeSubComments($payload, $parentID, $lvl)
+# SUB: $gph = scrapeSubComments($payload, $parentID, $lvl)
+# recursive comment scraper
+# This sub searches the payload for comments whose parent_id matches the
+# incoming parentID and appends each match to $output. For every match it
+# calls itself again with that comment's objectID and an increased indent
+# level, walking down the reply hierarchy.
+#
+# When no more comments are found for the supplied ID, it decreases the
+# indent level and returns to the previous invocation.
sub scrapeSubComments {
my ( $payload, $parentID, $lvl ) = @_;
+
+ # search for comment
my $output = "";
for my $hit ($payload->{"hits"}) {
foreach my $comment (@$hit) {
+
+ # comment is found, add to output
if ($comment->{'parent_id'} == $parentID) {
- my $text = encode("UTF-8", $comment->{'comment_text'});
- my $author = encode("UTF-8", $comment->{'author'});
- my $objectID = $comment->{'objectID'};
- my $ago = parseDate($comment->{'created_at'});
+
+ # format data
+ my $text = encode("UTF-8", $comment->{'comment_text'});
+ my $author = encode("UTF-8", $comment->{'author'});
+ my $ago = parseDate($comment->{'created_at'});
+
+ # add to output
$output .= formatContent("$author wrote $ago:", $lvl);
$output .= formatContent("$text", $lvl)."\n";
- $output .= scrapeSubComments( $payload, $objectID, ++$lvl );
+
+ # invoke itself with the objectID and traverse down the hierarchy
+ $output .= scrapeSubComments( $payload, $comment->{'objectID'}, ++$lvl );
+
+ # decrease indentation level
$lvl--;
}
}
t@@ -93,7 +141,11 @@ sub scrapeSubComments {
return $output;
}
-### SUB: $datestr = parseDate($datestring)
+# SUB: $datestr = parseDate($datestring)
+# takes something like 2018-04-23T23:45Z002 and converts it to a relative
+# and human-readable notation like "4 days ago".
+# OPTIMIZE: the DateTime::Format::Duration API could be used with a parse
+# pattern here. That's probably simpler and faster.
sub parseDate {
my ( $datestring ) = @_;
t@@ -171,42 +223,147 @@ sub parseDate {
return $dtstr;
}
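For illustration, here is a minimal relative-date sketch built only from the
DateTime modules already loaded at the top of the script; parseDateSketch, its
regex and the fixed "days ago" wording are assumptions for this example and not
part of the commit. The pattern-based DateTime::Format::Duration approach from
the OPTIMIZE note would slot in where the day count is computed.

    # illustrative sketch only -- not part of this commit
    sub parseDateSketch {
        my ( $datestring ) = @_;

        # assumes created_at is an ISO-8601 UTC timestamp (YYYY-MM-DDTHH:MM:SS...)
        my ($y, $m, $d, $H, $M, $S) =
            $datestring =~ /^(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})/;

        my $then = DateTime->new(year => $y, month => $m, day => $d,
                                 hour => $H, minute => $M, second => $S,
                                 time_zone => "UTC");

        # whole-day difference between now and the story/comment date
        my $days = DateTime->now(time_zone => "UTC")
                           ->delta_days($then)
                           ->in_units("days");

        return $days == 0 ? "today" : "$days days ago";
    }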
-### SUB: scrapeComments($objectID, $number, $title)
+# SUB: scrapeComments($objectID, $number, $link)
+# this sets up the comment page frame. The content is added by recursive
+# scrapeSubComments() calls.
sub scrapeComments {
- my ( $objectID, $number, $title ) = @_;
- my $content = "$logo\nCOMMENT PAGE FOR:\n \"$title\"\n\n";
+ my ( $objectID, $number, $link ) = @_;
+
+ # set header
+ my $content = "$logo\nCOMMENT PAGE FOR:\n$link\n\n";
+
+ # the comment count. If this is zero, this call can be skipped.
if($number) {
+ # call the API to receive all comments. The previous call already contains
+ # the comment count, which is passed in here as hitsPerPage.
my $payload = getApiData("$api_uri/search?tags="."comment,story_$objectID&hitsPerPage=$number");
+
+ # invoke the recursive scraper and hand over the payload
+ # (only working in memory from here)
$content .= scrapeSubComments($payload, $objectID, 0);
} else {
+ # previous call indicated 0 comments.
$content .= "No comments available\n";
}
+
+ # all comments have been added to the page. Add footer and save file.
$content .= "\n[1|<- back to front page|$go_path|server|port]";
saveFile($content, "comments_$objectID.gph");
}
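For reference, the search call assembled above expands to a URL of this form
(angle brackets mark placeholders):

    $protocol://$server$api_uri/search?tags=comment,story_<objectID>&hitsPerPage=<num_comments>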
+# SUB: $url = isHtml($url)
+# this sub checks a given URL by performing a HEAD request. If the response
+# is of type text/html, it returns the (possibly redirected) URL. Otherwise 0.
+sub isHtml {
+ my ( $url ) = @_;
+
+ # perform HEAD request
+ my $ua = LWP::UserAgent->new(keep_alive => 0, timeout => 30);
+ $ua->agent("codevoid-hackernews-gopherproxy/0.1");
+ my $req = HTTP::Request->new(HEAD => $url);
+ $req->header('Accept' => 'text/html');
+ my $resp = $ua->request($req);
+
+ # check content type
+ if ($resp->is_success && ($resp->content_type =~ m/text\/html/)) {
+ return $resp->request()->uri();
+ }
+
+ return 0;
+}
+
+# SUB: dumpArticle($url, $objectID)
+# This sub downloads webpages and converts them into a plain text format that
+# can be served on gopher. Once an article has been converted, it is not
+# downloaded again.
+# OPTIMIZE: For some pages this works great, for others it doesn't. Custom
+# preprocessing steps could be added to strip out navigation, footers,
+# excessive ads and other irrelevant data. This could be done on a per-domain
+# basis. (This could also be a separate program that could be reused in other
+# projects.)
+sub dumpArticle {
+ my ( $url, $objectID ) = @_;
+
+ # is it cached? return.
+ if (-e "$go_root$go_path/article_$objectID.gph") {
+ return 0;
+ }
+
+ # content type check
+ $url = isHtml($url);
+ if($url == 0) {
+ print "Skipping (not html)\n";
+
+ # the supplied URL is not html; don't add a plaintext link on the front page.
+ return 1;
+ }
+
+ # we got html, let's download it
+ my $ua = LWP::UserAgent->new;
+ my $req = HTTP::Request->new(GET => $url);
+ my $resp = $ua->request($req);
+
+ if ($resp->is_success) {
+
+ # OPTIMIZE: this would be the place to modify the HTML
+ # in $resp->decoded_content
+
+ # call successful - convert it to text
+ my $f = HTML::FormatText::WithLinks->new(anchor_links => 0, unique_links => 1, base => "$url");
+ my $message = $f->parse($resp->decoded_content);
+
+ # wrap it to 72 characters (will destroy link lists)
+ #$Text::Wrap::columns=72;
+ #$message = wrap("","",$message);
+
+ # shrink multiple newlines
+ $message =~ s/\n\n(\n)*/\n\n/g;
+ $message =~ s/\t/ /g;
+ $message =~ s/\nt/\ntt/g;
+
+ # save to file
+ saveFile($message, "article_$objectID.gph");
+ } else {
+ # the call was unsuccessful. We're not trying again here; the call will
+ # be repeated on the next scraper run. Returning 1 here means the link
+ # to this file will not be added to the front page.
+ return 1;
+ }
+
+ # no complaints, add the link to this article.
+ return 0;
+}
+
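As a sketch of the per-domain preprocessing the OPTIMIZE note above suggests, a
dispatch table keyed by domain could be applied to $resp->decoded_content before
the HTML::FormatText::WithLinks step. The %preprocess table, preprocessHtml, the
domain and the regexes below are made up for illustration and not part of this
commit.

    # hypothetical per-domain cleanup -- illustration only
    my %preprocess = (
        "example.com" => sub {
            my ( $html ) = @_;
            $html =~ s/<nav\b.*?<\/nav>//gsi;        # drop navigation
            $html =~ s/<footer\b.*?<\/footer>//gsi;  # drop footer
            return $html;
        },
    );

    sub preprocessHtml {
        my ( $url, $html ) = @_;
        my ($domain) = $url =~ m!^https?://(?:www\.)?([^/:]+)!;
        return $html unless $domain && $preprocess{$domain};
        return $preprocess{$domain}->($html);
    }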
### SUB: formatContent($content, $lvl)
+# This is the comment page formatter. It takes text and an indentation
+# level and puts it nicely on the page, with a level bar on the left.
sub formatContent {
my ( $content, $lvl ) = @_;
+
+ # decode html entities
$content = decode_entities($content);
# remove trailing space before wrapping
$content =~ s/ $/\n/g;
+ # handle crazy indent levels that would leave no
+ # room for text on the right side
my $pad="";
if($lvl > 20) {
$pad = "$lvl> ";
$lvl = 19;
}
- # calculate padding
+ # Set up the text wrapper to wrap at 72 minus the indentation
+ # (each indent level adds or removes two spaces)
$Text::Wrap::columns=72-($lvl*2);
+
+ # Calculate spaces to add on the left side
+ # based on the reply/indent level.
while($lvl > 0) {
$pad=" ".$pad;
$lvl--;
}
- # Search for links
+ # Search for links in comments
my $LX = new HTML::LinkExtractor();
$LX->strip(1);
$LX->parse(\$content);
t@@ -215,8 +372,8 @@ sub formatContent {
my $HR = HTML::Restrict->new();
$content =~ s/<p>/\n\n/g;
$content =~ s/<li>/\n\n\* /g;
- $content =~ s/<blockquote>/\n\n--- QUOTE ---\n/g;
- $content =~ s/<\/blockquote>/\n---- END ----\n\n/g;
+
+ # strip remaining HTML tags
my $content_clean = $HR->process($content);
# nobody needs more than one newline.
t@@ -230,8 +387,11 @@ sub formatContent {
# skip empty links (image links for example)
if(!$linkitem->{_TEXT}) { next; }
-
+
+ # link found, increase counter
$c++;
+
+ # replace link text with [$counter]
$content_clean =~ s/(\Q$linkitem->{_TEXT}\E)/ \[$c\] /g;
# make sure there are no newlines/extra spaces around [0]
t@@ -245,7 +405,7 @@ sub formatContent {
$content_clean =~ s/\[5\][\.:\s\n]+\[3\]/\[5\]/g;
$content_clean =~ s/ \[\d\] $//g;
- # shorten links
+ # shorten links that are too long for the indent level
my $short = $linkitem->{href};
my $l = 62 - length($pad);
if(length($short) > $l) { $short = substr($short,0,$l)."..."; }
t@@ -255,7 +415,6 @@ sub formatContent {
}
}
-
# Wrap content 72 - padding
$content_clean = wrap("","",$content_clean);
t@@ -266,13 +425,14 @@ sub formatContent {
$content_clean =~ s/^/$pad║ /g;
$content_clean =~ s/\n/\n$pad║ /g;
- # print links if there were any.
+ # print links if any...
if($links) {
$content_clean .= "\n$pad║ \n$links";
} else {
$content_clean .= "\n";
}
+ # fix gopher issues (geomyidae design)
$content_clean =~ s/\t/ /g;
$content_clean =~ s/\nt/\ntt/g;
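The two substitutions above (and the matching ones in dumpArticle) escape plain
text for geomyidae's .gph format: a tab would be read as a gopher field
separator, and a leading "t" marks a verbatim text line and is stripped on
output, so a literal leading "t" has to be doubled. A comment line starting with
"thanks ..." would, for example, be written to the .gph file as "tthanks ..."
and still render as "thanks ...".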
t@@ -311,7 +471,7 @@ my $count = 0;
for my $id (@$json_top) {
$query .="story_$id,";
$count++;
- if($count > $index_count) {
+ if($count >= $total_count) {
last;
}
}
t@@ -319,81 +479,142 @@ for my $id (@$json_top) {
# remove trailing comma and close query
$query =~ s/,$/\)/g;
+# fetch the story data for these IDs from the Algolia search API
+my $topStoryList = getApiData("$api_uri/$query");
+
# set up background tasks for parallel scraping
-my $pm = new Parallel::ForkManager(10);
+my $pm = new Parallel::ForkManager(50);
-my $json_fp = getApiData("$api_uri/$query");
-for my $hit ($json_fp->{"hits"}) {
+# scrape story header and comments
+for my $hit ($topStoryList->{"hits"}) {
foreach my $story (@$hit) {
- # do everything from here in background
+ # do everything from here on in background
$pm->start and next;
+ # convenience variables
+ my $objectID = $story->{'objectID'};
+ my $author = encode("UTF-8", $story->{'author'});
+ my $title = encode("UTF-8", $story->{'title'});
+ my $url = encode("UTF-8", $story->{'url'});
+
+ # comments (default to 0)
+ my $number = 0;
+ if($story->{'num_comments'}) {
+ $number = $story->{'num_comments'};
+ }
+
+ # parse date and convert to relative notation (5 min ago)
+ my $ago = parseDate($story->{'created_at'});
+
# title is a link, escape "|"
- my $title = encode("UTF-8", $story->{'title'});
$title =~ s/\|/\\|/g;
# URL is either an HTML link line or a gopher dir
- my $url = "";
- if($story->{'url'}) {
- $url = encode("UTF-8", $story->{'url'});
- $content .= "[h| $title|URL:$url|server|port]\n";
+ my $link;
+ if($url) {
+ # link goes HTTP
+ $link = "[h| $title|URL:$url|server|port]\n";
+
+ # is the article dumper active?
+ if($dumper == 1) {
+ if(dumpArticle($url, $objectID) eq 0) {
+ $link .= "[1| plaintext version|$go_path/article_$objectID.gph|server|port]\n";
+ }
+ }
+
} else {
- $url = "$go_path/comments_$story->{'objectID'}.gph";
- $content .= "[1| $title|$url|server|port]\n";
+ # link goes GOPHER (redefine URL to comments [Ask HN])
+ $url = "$go_path/comments_$story->{'objectID'}.gph";
+ $link = "[1| $title|$url|server|port]\n";
}
- #
- my $author = encode("UTF-8", $story->{'author'});
- my $objectID = $story->{'objectID'};
-
- # parse date
- my $ago = parseDate($story->{'created_at'});
+ # add title link line
+ $content .= $link;
- my $number = 0;
- if($story->{'num_comments'}) {
- $number = $story->{'num_comments'};
- }
-
- # build content
+ # add author line
$content .= " by $author ($story->{'points'} points) $ago\n";
+
+ # add comment link line
$content .= "[1| read $number comments|$go_path/comments_$objectID.gph|server|port]\n";
+
+ # aaaand one blank
$content .= "\n";
- # Save (if not already done - assuming the story doesn't change)
- # FIXME: the title could be changed by the staff
- if (not -e "$go_root$go_path/story_$objectID.gph") {
- saveFile($content, "story_$objectID.gph");
- }
+ # Save story file
+ saveFile($content, "story_$objectID.gph");
# Fire up the comment scraper
- #print "Debug: scrapeComments($objectID, $number, $title);\n";
- scrapeComments($story->{'objectID'}, $number, $title);
+ scrapeComments($story->{'objectID'}, $number, $link);
- # background task stopps here
- $pm->finish
+ # background task stops here
+ $pm->finish;
}
}
# wait for all scraping to be done and all cache files to be present
$pm->wait_all_children;
-# construct index from cached files
+# construct index
$count = 0;
+
+# setup pagination variables
+my $page = 1;
+my $nextpage;
+my $prevpage;
+my $filename;
+
+# initialize output variable
my $index_out = "$logo";
+
+# loop over all top stories (to keep the ranking order)
for my $id (@$json_top) {
+
+ # append the story files
if (-e "$go_root$go_path/story_$id.gph") {
open(my $fh, '<', "$go_root$go_path/story_$id.gph");
- while (my $row = <$fh>) {
- $index_out .= $row;
- }
+ while (my $row = <$fh>) { $index_out .= $row; }
close($fh);
}
+
+ # increase story counter
$count++;
- # OPTIMIZE: Add pagignation? (who goes to page 2 anyway...)
- if($count > $index_count) { last; }
+
+ # Pagination
+ if(($count % $index_count) eq 0) {
+
+ # setup defaults
+ $filename = "index-$page.gph";
+ $nextpage = $page + 1;
+ $prevpage = $page - 1;
+
+ # special handling for first page (different name)
+ if($page eq 1) {
+ $filename = "index.gph";
+ $index_out .= "[1| Next Page ($nextpage) >>|$go_path/index-$nextpage.gph|server|port]\n\n";
+ $index_out .= "[1|<< Back Home|/|server|port]";
+ } else {
+ $index_out .= "[1| Next Page ($nextpage) >>|$go_path/index-$nextpage.gph|server|port]";
+ }
+
+ # increase page counter
+ $page++;
+
+ # done, save file, proceed with next page
+ saveFile($index_out, $filename);
+
+ # initialize indexout for next run
+ $index_out = "$logo";
+
+ } else {
+
+ # handle last page
+ if ( $count >= $total_count ) {
+ $index_out .= "[1| << Prev Page ($prevpage) |$go_path/index-$prevpage.gph|server|port]";
+ saveFile($index_out, $filename);
+ last;
+ }
+ }
}
-$index_out .= "\n[1|<- go back home|/|server|port]";
-saveFile($index_out, "index.gph");
exit 0;