t* hacker news on gopher
   URI git clone git://git.codevoid.de/hn-gopher
   DIR Log
   DIR Files
   DIR Refs
   DIR commit 58874a778c1585b20a7dbd4696706ad13f248bee
   DIR parent 837b822cd29a7435100daf6426e86126dcb02dca
   URI Author: Stefan Hagen <sh+git[at]codevoid[dot]de>
       Date:   Tue, 31 Jul 2018 21:16:17 +0200
       Add code comments, article scraper, pagination
         M hn-scraper.pl                       |     363 +++++++++++++++++++++++++------
       1 file changed, 292 insertions(+), 71 deletions(-)
   DIR diff --git a/hn-scraper.pl b/hn-scraper.pl
       t@@ -1,17 +1,29 @@
        #!/usr/bin/env perl
       +# default
        use strict;
        use warnings;
       +# parallel processing
        use Parallel::ForkManager;
       +# date formatting
        use DateTime;
        use DateTime::Duration;
        use DateTime::Format::Duration;
       +# network
        use LWP::UserAgent;
       +# protocol transformation
        use JSON;
       +use Encode;
       +# text formatting
       +use HTML::FormatText::WithLinks;
        use HTML::LinkExtractor;
        use HTML::Restrict;
        use HTML::Entities;
       -use Encode;
        use Text::Wrap;
       t@@ -21,8 +33,11 @@ my $server      = "hn.algolia.com";
        my $api_uri     = "/api/v1";
        my $go_root     = "/srv/codevoid-gopher";
        my $go_path     = "/hn";
       -my $index_count = 100;
       +my $index_count = 20;   # item count per page
       +my $total_count = 400;  # total item count (all pages)
       +my $dumper      = 0;    # 1 creates plain text versions
       +### CAN HAZ LOGO? SURE!
        my $logo  =" _______               __                   _______\n";
           $logo .="|   |   |.---.-..----.|  |--..-----..----. |    |  |.-----..--.--.--..-----.\n";
           $logo .="|       ||  _  ||  __||    < |  -__||   _| |       ||  -__||  |  |  ||__ --|\n";
       t@@ -31,61 +46,94 @@ my $logo  =" _______               __                   _______\n";
           $logo .= "[h|Visit Hacker News on the Internet|URL:https://news.ycombinator.com|server|port]\n\n";
        ### FUNCTIONS
       -### SUB: $json = getTopStories();
       +# SUB: $json = getTopStories();
       +# read all top stories supplied by the firebase API. This API will only return
       +# the IDs of stories that are currently on the front page. In order.
        sub getTopStories {
            # FIXME make this configurable, maybe.
       -    #print "Debug: getTopStories($protocol://hacker-news.firebaseio.com/v0/topstories.json)\n";
       +    # yes, this duplicates code from getApiData()
            my $REST= ({HOST => "hacker-news.firebaseio.com",
       -            URL  => "$protocol://hacker-news.firebaseio.com/v0/topstories.json" });
       +                URL  => "https://hacker-news.firebaseio.com/v0/topstories.json" });
            $REST->{UA} = LWP::UserAgent->new(keep_alive => 0, timeout => 30);
            $REST->{resource} = $REST->{URL};
            $REST->{request}  = HTTP::Request->new( GET => $REST->{resource} );
            $REST->{response} = $REST->{UA}->request( $REST->{request} );
       +    # we're not giving up
            if(not $REST->{response}->is_success()) {
       -        my $delay = 5;
       -        #print "Debug: Got \"", $REST->{response}->status_line, "\" trying again in $delay seconds...\n";
       -        sleep $delay;
       +        sleep 5;
                return getTopStories();
            return decode_json($REST->{response}->content);
       -### SUB: $json = getApiData("/api/...");
       +# SUB: $json = getApiData("/api/...");
       +# this call returns stories and comments. The nice thing about this is, that it
       +# can provide all comments to a story in one call. 
       +# OPTIMIZE: right now, the story and comments are fetched separately. This
       +# could be combined in one call.
        sub getApiData {
            my ( $uri ) = @_;
       -    #print "Debug: getApiData($protocol://$server$uri)\n";
            my $REST= ({HOST => "$server",
                        URL  => "$protocol://$server$uri" });
            $REST->{UA} = LWP::UserAgent->new(keep_alive => 0, timeout => 30);
            $REST->{resource} = $REST->{URL};
            $REST->{request}  = HTTP::Request->new( GET => $REST->{resource} );
            $REST->{response} = $REST->{UA}->request( $REST->{request} );
       +    # we're not giving up
            if(not $REST->{response}->is_success()) {
       -        #print "Debug: Got \"", $REST->{response}->status_line, "\" trying again in 2 seconds...\n";
                sleep 2;
                return getApiData ( $uri );
       -    }    
       +    }
            return decode_json($REST->{response}->content);
       -### SUB: $gph = scrapeSubComments($payload, $parentID, $lvl)
       +# SUB: $gph = scrapeSubComments($payload, $parentID, $lvl)
       +# recursive comment scraper
       +# this sub searches for a comment with the incoming parentID, formats
       +# it and adds it to $output. Then it calls itself again with the ID of
       +# the found comment and an increased indent level.
       +# Then it searches for comments with the incoming ID as parent ID and
       +# adds the first hit to $output. Then it calls itself with the ID as
       +# parentID again...
       +# If no more comments are found with the supplied ID, it decreases
       +# the indent level and returns to the previous invocation.
        sub scrapeSubComments {
            my ( $payload, $parentID, $lvl ) = @_;
       +    # search for comment
            my $output = "";
            for my $hit ($payload->{"hits"}) {
                foreach my $comment (@$hit) {
       +            # comment is found, add to output
                    if ($comment->{'parent_id'} == $parentID) {
       -                my $text     = encode("UTF-8", $comment->{'comment_text'});
       -                my $author   = encode("UTF-8", $comment->{'author'});
       -                my $objectID = $comment->{'objectID'};
       -                my $ago = parseDate($comment->{'created_at'});
       +                # format data
       +                my $text   = encode("UTF-8", $comment->{'comment_text'});
       +                my $author = encode("UTF-8", $comment->{'author'});
       +                my $ago    = parseDate($comment->{'created_at'});
       +                # add to output
                        $output .= formatContent("$author wrote $ago:", $lvl);
                        $output .= formatContent("$text", $lvl)."\n";
       -                $output .= scrapeSubComments( $payload, $objectID, ++$lvl );
       +                # invoke itself with objectID and traverse down the hierarchy
       +                $output .= scrapeSubComments( $payload, $comment->{'objectID'}, ++$lvl );
       +                # decrease indentation level
       t@@ -93,7 +141,11 @@ sub scrapeSubComments {
            return $output;
       -### SUB: $datestr = parseDate($datestring)
       +# SUB: $datestr = parseDate($datestring)
       +# takes something like 2018-04-23T23:45Z002 and converts it to a relative
       +# and human readable notation like "4 days ago".
       +# OPTIMIZE: the Duration API can be used with a parse pattern; this should
       +# be used. It's probably simpler and faster.
        sub parseDate {
            my ( $datestring ) = @_;
       t@@ -171,42 +223,147 @@ sub parseDate {
            return $dtstr;
       -### SUB: scrapeComments($objectID, $number, $title)
       +# SUB: scrapeComments($objectID, $number, $link)
       +# this sets up the comment page frame. The content is added by hierarchical
       +# scrapeSubComments() calls.
        sub scrapeComments {
       -    my ( $objectID, $number, $title ) = @_;
       -    my $content = "$logo\nCOMMENT PAGE FOR:\n  \"$title\"\n\n";
       +    my ( $objectID, $number, $link ) = @_;
       +    # set header
       +    my $content = "$logo\nCOMMENT PAGE FOR:\n$link\n\n";
       +    # the comment count. If this is zero, this call can be skipped.
            if($number) {
       +        # call API to receive all comments. The previous call already contains
                my $payload = getApiData("$api_uri/search?tags="."comment,story_$objectID&hitsPerPage=$number");
       +        # invoke hierarchical scraper and hand over the payload
       +        # (only working in memory from here)
                $content .= scrapeSubComments($payload, $objectID, 0);
            } else {
       +        # previous call indicated 0 comments.
                $content .= "No comments available\n";
       +    # all comments have been added to the page. Add footer and save file.
            $content .= "\n[1|<- back to front page|$go_path|server|port]";
            saveFile($content, "comments_$objectID.gph");
       +# SUB: $url = isHtml($url)
       +# this sub checks a given URL by performing a HEAD request. In case the URL is
       +# of type text/html, it will return the URL. Otherwise 0.
       +sub isHtml {
       +    my ( $url ) = @_;
       +    # perform HEAD request
       +    my $ua = LWP::UserAgent->new(keep_alive => 0, timeout => 30);
       +       $ua->agent("codevoid-hackernews-gopherproxy/0.1"); 
       +    my $req = HTTP::Request->new(HEAD => $url);
       +       $req->header('Accept' => 'text/html');
       +    my $resp = $ua->request($req);
       +    # check content type
       +    if ($resp->is_success && ($resp->content_type =~ m/text\/html/)) {
       +       return $resp->request()->uri();
       +    }
       +    return 0;
       +# SUB: dumpArticle($url, $objectID)
       +# This sub downloads webpages and converts them into a plain text format that
       +# can be served on gopher. Once an article has been converted, it is not
       +# downloaded again.
       +# OPTIMIZE: For some pages, this works great. Not for others. Some custom made
       +# preprocessing steps could be added to strip out navigation, footer, excessive
       +# ads and other non-relevant data. This could be done on a per domain basis.
       +# (this could be a separate program which could be reused in other projects)
       +sub dumpArticle {
       +    my ( $url, $objectID ) = @_;
       +    # is it cached? return.
       +    if (-e "$go_root$go_path/article_$objectID.gph") {
       +        return 0;
       +    }
       +    # content type check
       +    $url = isHtml($url);
       +    if($url == 0) {
       +        print "Skipping (not html)\n";
       +        # the supplied URL is not html, don't add it to the front page.
       +        return 1;   
       +    }
       +    # we got html, let's download it
       +    my $ua  = LWP::UserAgent->new;
       +    my $req = HTTP::Request->new(GET => $url);
       +    my $resp = $ua->request($req);
       +    if ($resp->is_success) {
       +        # OPTIMIZE: this would be the place to modify the HTML
       +        # in $resp->decoded_content
       +        # call successful - convert it to text
       +        my $f = HTML::FormatText::WithLinks->new(anchor_links => 0, unique_links => 1, base => "$url");
       +        my $message = $f->parse($resp->decoded_content);
       +        # wrap it to 72 characters (will destroy link lists)
       +        #$Text::Wrap::columns=72;
       +        #$message = wrap("","",$message);
       +        # shrink multiple newlines
       +        $message =~ s/\n\n(\n)*/\n\n/g;
       +        $message =~ s/\t/    /g;
       +        $message =~ s/\nt/\ntt/g;
       +        # save to file
       +        saveFile($message, "article_$objectID.gph");
       +    } else {
       +        # the call was unsuccessful. We're not trying again here.
       +        # The call can be repeated on the next scraper run. Returning 1 here
       +        # means the link to this file will not be added on the front page.
       +        return 1;
       +    }
       +    # no complaints, add the link to this article.
       +    return 0;
        ### SUB: formatContent($content, $lvl)
       +# This is the comment page formatter. It takes text and an indentation
       +# level and puts it nicely on a page, with a level bar on the left.
        sub formatContent {
            my ( $content, $lvl ) = @_;
       +    # decode html notations
            $content = decode_entities($content);
            # remove trailing space before wrapping
            $content =~ s/ $/\n/g;
       +    # handle crazy indent levels that would leave no
       +    # room for text on the right side
            my $pad="";
            if($lvl > 20) {
                $pad = "$lvl> ";
                $lvl = 19;
       -    # calculate padding
       +    # Setup text wrapper to wrap at 72 - indent level
       +    # each level in/decreases two spaces
       +    # Calculate spaces to add on the left side
       +    # based on the reply/indent level.
            while($lvl > 0) {
                $pad="  ".$pad;
       -    # Search for links
       +    # Search for links in comments
            my $LX = new HTML::LinkExtractor();
       t@@ -215,8 +372,8 @@ sub formatContent {
            my $HR = HTML::Restrict->new();
            $content =~ s/<p>/\n\n/g;
            $content =~ s/<li>/\n\n\* /g;
       -    $content =~ s/<blockquote>/\n\n--- QUOTE ---\n/g;
       -    $content =~ s/<\/blockquote>/\n---- END ----\n\n/g;
       +    # strip remaining HTML tags
            my $content_clean = $HR->process($content);
            # nobody needs more than one newline.
       t@@ -230,8 +387,11 @@ sub formatContent {
                    # skip empty links (image links for example)
                    if(!$linkitem->{_TEXT}) { next; }
       +            # link found, increase counter
       +            # replace link text with [$counter]
                    $content_clean =~ s/(\Q$linkitem->{_TEXT}\E)/ \[$c\] /g;
                    # make sure there are no newlines/extra spaces around [0]
       t@@ -245,7 +405,7 @@ sub formatContent {
                    $content_clean =~ s/\[5\][\.:\s\n]+\[3\]/\[5\]/g;
                    $content_clean =~ s/ \[\d\] $//g;
       -            # shorten links
       +            # shorten links that are too long for the indent level
                    my $short = $linkitem->{href};
                    my $l = 62 - length($pad);
                    if(length($short) > $l) { $short = substr($short,0,$l)."..."; }
       t@@ -255,7 +415,6 @@ sub formatContent {
            # Wrap content 72 - padding
            $content_clean = wrap("","",$content_clean);
       t@@ -266,13 +425,14 @@ sub formatContent {
            $content_clean =~ s/^/$pad║ /g;
            $content_clean =~ s/\n/\n$pad║ /g;
       -    # print links if there were any.
       +    # print links if any...
            if($links) {
                $content_clean .= "\n$pad║ \n$links";
            } else {
                $content_clean .= "\n";    
       +    # fix gopher issues (geomyidae design)
            $content_clean =~ s/\t/    /g;
            $content_clean =~ s/\nt/\ntt/g;
       t@@ -311,7 +471,7 @@ my $count = 0;
        for my $id (@$json_top) {
            $query .="story_$id,";
       -    if($count > $index_count) {
       +    if($count >= $total_count) {
       t@@ -319,81 +479,142 @@ for my $id (@$json_top) {
        # remove trailing comma and close query
        $query =~ s/,$/\)/g;
       +# fetch the story details for the top story IDs from the Algolia API
       +my $topStoryList = getApiData("$api_uri/$query");
        # set up background tasks for parallel scraping
       -my $pm = new Parallel::ForkManager(10);
       +my $pm = new Parallel::ForkManager(50);
       -my $json_fp = getApiData("$api_uri/$query");
       -for my $hit ($json_fp->{"hits"}) {
       +# scrape story header and comments
       +for my $hit ($topStoryList->{"hits"}) {
            foreach my $story (@$hit) {
       -        # do everything from here in background
       +        # do everything from here on in background
                $pm->start and next;
       +        # convenient variables
       +        my $objectID = $story->{'objectID'};
       +        my $author   = encode("UTF-8", $story->{'author'});
       +        my $title    = encode("UTF-8", $story->{'title'});
       +        my $url      = encode("UTF-8", $story->{'url'});
       +        # comments (default to 0)
       +        my $number = 0;
       +        if($story->{'num_comments'}) {
       +            $number = $story->{'num_comments'};
       +        }
       +        # parse date and convert to relative notation (5 min ago)
       +        my $ago = parseDate($story->{'created_at'});
                # title is a link, escape "|"
       -        my $title  = encode("UTF-8", $story->{'title'});
                $title =~ s/\|/\\|/g;
                # URL is either a HTML link line or a gopher dir
       -        my $url = "";
       -        if($story->{'url'}) {
       -            $url    = encode("UTF-8", $story->{'url'});
       -            $content .= "[h|  $title|URL:$url|server|port]\n";
       +        my $link;
       +        if($url) {
       +            # link goes HTTP
       +            $link = "[h|  $title|URL:$url|server|port]\n";
       +            # is the article dumper active?
       +            if($dumper == 1) {
       +                if(dumpArticle($url, $objectID) eq 0) {
       +                    $link .= "[1|  plaintext version|$go_path/article_$objectID.gph|server|port]\n";
       +                }
       +            }
                } else {
       -            $url    = "$go_path/comments_$story->{'objectID'}.gph";
       -            $content .= "[1|  $title|$url|server|port]\n";
       +            # link goes GOPHER (redefine URL to comments [Ask HN])
       +            $url = "$go_path/comments_$story->{'objectID'}.gph";
       +            $link = "[1|  $title|$url|server|port]\n";
       -        # 
       -        my $author = encode("UTF-8", $story->{'author'});
       -        my $objectID = $story->{'objectID'};
       -        # parse date
       -        my $ago = parseDate($story->{'created_at'});
       +        # add title link line
       +        $content .= $link;
       -        my $number = 0;
       -        if($story->{'num_comments'}) {
       -            $number = $story->{'num_comments'};
       -        }
       -        # build content
       +        # add author line
                $content .= "  by $author ($story->{'points'} points) $ago\n";
       +        # add comment link line
                $content .= "[1|  read $number comments|$go_path/comments_$objectID.gph|server|port]\n";
       +        # aaaand one blank
                $content .= "\n";
       -        # Save (if not already done - assuming the story doesn't change)
       -        # FIXME: the title could be changed by the staff
       -        if (not -e "$go_root$go_path/story_$objectID.gph") {
       -            saveFile($content, "story_$objectID.gph");
       -        }
       +        # Save story file
       +        saveFile($content, "story_$objectID.gph");
                # Fire up the comment scraper
       -        #print "Debug: scrapeComments($objectID, $number, $title);\n";
       -        scrapeComments($story->{'objectID'}, $number, $title);
       +        scrapeComments($story->{'objectID'}, $number, $link);
       -        # background task stopps here
       -        $pm->finish
       +        # background task stops here
       +        $pm->finish;
        # wait for all scraping to be done and all cache files to be present
       -# construct index from cached files
       +# construct index
        $count = 0;
       +# setup pagination variables
       +my $page = 1;
       +my $nextpage;
       +my $prevpage;
       +my $filename;
       +# initialize output variable
        my $index_out = "$logo";
       +# loop over all top stories (to keep the sequence)
        for my $id (@$json_top) {
       +    # append the story files
            if (-e "$go_root$go_path/story_$id.gph") {
                open(my $fh, '<', "$go_root$go_path/story_$id.gph");
       -        while (my $row = <$fh>) {
       -            $index_out .= $row;
       -        }
       +        while (my $row = <$fh>) { $index_out .= $row; }
       +    # increase story counter
       -    # OPTIMIZE: Add pagignation? (who goes to page 2 anyway...)
       -    if($count > $index_count) { last; }
       +    # Pagination
       +    if(($count % $index_count) eq 0) {
       +        # setup defaults
       +        $filename = "index-$page.gph";
       +        $nextpage = $page + 1;
       +        $prevpage = $page - 1;
       +        # special handling for first page (different name)
       +        if($page eq 1) {
       +            $filename = "index.gph";
       +            $index_out .= "[1|   Next Page ($nextpage) >>|$go_path/index-$nextpage.gph|server|port]\n\n";
       +            $index_out .= "[1|<< Back Home|/|server|port]";
       +        } else {
       +            $index_out .= "[1|    Next Page ($nextpage) >>|$go_path/index-$nextpage.gph|server|port]";
       +        }
       +        # increase page counter
       +        $page++;
       +        # done, save file, proceed with next page
       +        saveFile($index_out, $filename);
       +        # initialize indexout for next run
       +        $index_out = "$logo";
       +    } else {
       +        # handle last page
       +        if ( $count >= $total_count ) {
       +            $index_out .= "[1| << Prev Page ($prevpage)   |$go_path/index-$prevpage.gph|server|port]";
       +            saveFile($index_out, $filename);
       +            last;
       +        }
       +    }
       -$index_out .= "\n[1|<- go back home|/|server|port]";
       -saveFile($index_out, "index.gph");
        exit 0;