gopher://codevoid.de/1/git/hn-gopher/commit/74ea59bf91cf72e4d5e82ebd8bc852b7546c1a0a.gph

       t* hacker news on gopher
       
   URI git clone git://git.codevoid.de/hn-gopher
   DIR Log
   DIR Files
   DIR Refs
       ---
   DIR commit 74ea59bf91cf72e4d5e82ebd8bc852b7546c1a0a
   DIR parent 6533286bc1276e7584436c98fa4f88251da82bf9
   URI Author: Stefan Hagen <sh+git[at]codevoid[dot]de>
       Date:   Mon, 30 Jul 2018 21:43:49 +0200
       
       big update
       
       - pretty bars
       - pretty dates
       - parallel scraping
       - proper front page top stories
       - configurable front page story count
       - link parser update (still a bit wonky)
       - story file cache
       
       Diffstat:
         M hn-scraper.pl                       |     252 ++++++++++++++++++++++++++-----
       
       1 file changed, 215 insertions(+), 37 deletions(-)
       ---
   DIR diff --git a/hn-scraper.pl b/hn-scraper.pl
       t@@ -2,6 +2,10 @@
        
        use strict;
        use warnings;
       +use Parallel::ForkManager;
       +use DateTime;
       +use DateTime::Duration;
       +use DateTime::Format::Duration;
        use LWP::UserAgent;
        use JSON;
        use HTML::LinkExtractor;
       t@@ -10,21 +14,48 @@ use HTML::Entities;
        use Encode;
        use Text::Wrap;
        $Text::Wrap::columns=72;
       -use Data::Dumper;
        
        ### CONFIGURATION
       -my $protocol   = "https";
       -my $server     = "hn.algolia.com";
       -my $api_uri    = "/api/v1";
       -my $go_root    = "/srv/codevoid-gopher";
       -my $go_path    = "/hn";
       -
       +my $protocol    = "https";
       +my $server      = "hn.algolia.com";
       +my $api_uri     = "/api/v1";
       +my $go_root     = "/srv/codevoid-gopher";
       +my $go_path     = "/hn";
       +my $index_count = 60;
       +
       +my $logo  =" _______               __                   _______\n";
       +   $logo .="|   |   |.---.-..----.|  |--..-----..----. |    |  |.-----..--.--.--..-----.\n";
       +   $logo .="|       ||  _  ||  __||    < |  -__||   _| |       ||  -__||  |  |  ||__ --|\n";
       +   $logo .="|___|___||___._||____||__|__||_____||__|   |__|____||_____||________||_____|\n";
       +   $logo .="                                                      on Gopher (inofficial)\n";
       +   $logo .= "[h|Visit Hacker News on the Internet|URL:https://news.ycombinator.com|server|port]\n\n";
        
        ### FUNCTIONS
       +### SUB: $json = getTopStories();
       +sub getTopStories {
       +    # FIXME make this configurable, maybe.
       +    #print "Debug: getTopStories($protocol://hacker-news.firebaseio.com/v0/topstories.json)\n";
       +    my $REST= ({HOST => "hacker-news.firebaseio.com",
       +            URL  => "$protocol://hacker-news.firebaseio.com/v0/topstories.json" });
       +    $REST->{UA} = LWP::UserAgent->new(keep_alive => 0, timeout => 30);
       +    $REST->{UA}->agent("codevoid-hackernews-gopherproxy/0.1"); 
       +    $REST->{resource} = $REST->{URL};
       +    $REST->{request}  = HTTP::Request->new( GET => $REST->{resource} );
       +    $REST->{response} = $REST->{UA}->request( $REST->{request} );
       +    if(not $REST->{response}->is_success()) {
       +        my $delay = 0.5;
       +        #print "Debug: Got \"", $REST->{response}->status_line, "\" trying again in $delay seconds...\n";
       +        sleep $delay;
       +        return getTopStories();
       +    }    
       +    return decode_json($REST->{response}->content);
       +}
       +
       +
        ### SUB: $json = getApiData("/api/...");
        sub getApiData {
            my ( $uri ) = @_;
       -    print "Debug: getApiData($protocol://$server$uri)\n";
       +    #print "Debug: getApiData($protocol://$server$uri)\n";
            my $REST= ({HOST => "$server",
                        URL  => "$protocol://$server$uri" });
            $REST->{UA} = LWP::UserAgent->new(keep_alive => 0, timeout => 30);
       t@@ -33,13 +64,14 @@ sub getApiData {
            $REST->{request}  = HTTP::Request->new( GET => $REST->{resource} );
            $REST->{response} = $REST->{UA}->request( $REST->{request} );
            if(not $REST->{response}->is_success()) {
       -        print "Debug: Got \"", $REST->{response}->status_line, "\" trying again in 2 seconds...\n";
       +        #print "Debug: Got \"", $REST->{response}->status_line, "\" trying again in 2 seconds...\n";
                sleep 2;
                return getApiData ( $uri );
            }    
            return decode_json($REST->{response}->content);
        }
        
       +
        ### SUB: $gph = scrapeSubComments($payload, $parentID, $lvl)
        sub scrapeSubComments {
            my ( $payload, $parentID, $lvl ) = @_;
       t@@ -50,7 +82,8 @@ sub scrapeSubComments {
                        my $text     = encode("UTF-8", $comment->{'comment_text'});
                        my $author   = encode("UTF-8", $comment->{'author'});
                        my $objectID = $comment->{'objectID'};
       -                $output .= formatContent("$author:", $lvl);
       +                my $ago = parseDate($comment->{'created_at'});
       +                $output .= formatContent("$author wrote $ago:", $lvl);
                        $output .= formatContent("$text", $lvl)."\n";
                        $output .= scrapeSubComments( $payload, $objectID, ++$lvl );
                        $lvl--;
       t@@ -60,17 +93,96 @@ sub scrapeSubComments {
            return $output;
        }
        
       -### SUB: scrapeComments($objectID, $number)
       +### SUB: $datestr = parseDate($datestring)
       +sub parseDate {
       +    my ( $datestring ) = @_;
       +
       +    # set output (parse) pattern
       +    my $p = DateTime::Format::Duration->new(
       +        pattern => '%Y|%m|%e|%H|%M',
       +        normalize => 1
       +    );
       +
       +    # FIXME: DateTime::Duration can do the parsing
       +    # parse string and create datetime object
       +    $datestring =~ /(....)-(..)-(..)T(..):(..).*/;
       +    my $dt = DateTime->new(
       +        year       => $1,
       +        month      => $2,
       +        day        => $3,
       +        hour       => $4,
       +        minute     => $5,
       +        second     => 0,
       +        nanosecond => 0,
       +        time_zone  => 'UTC'
       +    );
       +
       +    # calculate difference
       +    my $dt_now = DateTime->now;
       +    my $dt_diff =  $dt_now - $dt;
       +
       +    # parse result
       +    my $o = $p->format_duration($dt_diff);
       +
       +    # parse output (FIXME: this is *so* ugly)
       +    my $dtstr = "";
       +    $o =~ /(\d+)\|(\d+)\|(\d+)\|(\d+)\|(\d+)/;
       +    my $Y = int($1);
       +    my $m = int($2);
       +    my $d = int($3);
       +    my $H = int($4);
       +    my $M = int($5);
       +    if($M) {
       +        $dtstr = "$M min ago";
       +    }
       +    if($H) {
       +        if($H == 1) {
       +            $dtstr = "$H hour $M min ago";
       +        } else {
       +            $dtstr = "$H hours $M min ago";
       +        }
       +    }
       +    if($d) {
       +        if($d == 1) {
       +            $dtstr = "$d day ago"; 
       +        } else {
       +            $dtstr = "$d days ago"; 
       +        }
       +    }
       +    if($m) {
       +        if($m == 1) {
       +            if($d == 1) {
       +                $dtstr = "$m month $d day ago"; 
       +            } else {
       +                $dtstr = "$m month $d days ago"; 
       +            }
       +        } else {
       +            if($d == 1) {
       +                $dtstr = "$m months $d day ago"; 
       +            } else {
       +                $dtstr = "$m months $d days ago"; 
       +            }
       +        }
       +    }
       +    if($Y) {
       +        $dtstr = "on $Y-$m-$d ($H:$M)"; 
       +    }
       +
       +    return $dtstr;
       +}
       +
       +### SUB: scrapeComments($objectID, $number, $title)
        sub scrapeComments {
       -    my ( $objectID, $number ) = @_;
       -    my $content = "";
       +    my ( $objectID, $number, $title ) = @_;
       +    my $content = "$logo\nCOMMENT PAGE FOR:\n  \"$title\"\n\n";
            if($number) {
                my $payload = getApiData("$api_uri/search?tags="."comment,story_$objectID&hitsPerPage=$number");
       -        $content = scrapeSubComments($payload, $objectID, 0);
       +        $content .= scrapeSubComments($payload, $objectID, 0);
            } else {
       -        $content = "No comments available\n";
       +        $content .= "No comments available\n";
            }
       -    saveFile($content, "story_$objectID.gph");
       +    $content .= "\n[1|<- back to front page|$go_path|server|port]";
       +    saveFile($content, "comments_$objectID.gph");
        }
        
        ### SUB: formatContent($content, $lvl)
       t@@ -90,8 +202,8 @@ sub formatContent {
            # calculate padding
            $Text::Wrap::columns=72-($lvl*2);
            while($lvl > 0) {
       -      $pad="  ".$pad;
       -      $lvl--;
       +        $pad="  ".$pad;
       +        $lvl--;
            }
        
            # Search for links
       t@@ -125,13 +237,21 @@ sub formatContent {
                    # make sure there are no newlines/extra spaces around [0]
                    $content_clean =~ s/[\s\n]+\[$c\][\s\n]+/ \[$c\] /g;
        
       +            # fix the [1] [1] situation (FIXME: how to do this properly?)
       +            $content_clean =~ s/\[1\][\.:\s\n]+\[1\]/\[1\]/g;
       +            $content_clean =~ s/\[2\][\.:\s\n]+\[2\]/\[2\]/g;
       +            $content_clean =~ s/\[3\][\.:\s\n]+\[3\]/\[3\]/g;
       +            $content_clean =~ s/\[4\][\.:\s\n]+\[3\]/\[4\]/g;
       +            $content_clean =~ s/\[5\][\.:\s\n]+\[3\]/\[5\]/g;
       +            $content_clean =~ s/ \[\d\] $//g;
       +
                    # shorten links
                    my $short = $linkitem->{href};
                    my $l = 62 - length($pad);
                    if(length($short) > $l) { $short = substr($short,0,$l)."..."; }
        
                    # add link to output scalar
       -            $links .= sprintf("[h|${pad}\\|[%i]: %s|URL:%s|codevoid.de|70]\n", $c, $short, $linkitem->{href});
       +            $links .= sprintf("[h|${pad}║ [%i]: %s|URL:%s|codevoid.de|70]\n", $c, $short, $linkitem->{href});
                }   
            }
        
       t@@ -143,12 +263,12 @@ sub formatContent {
            $content_clean =~ s/\n\n(\n)*/\n\n/g;
        
            # Add padding to the left
       -    $content_clean =~ s/^/$pad\|/g;
       -    $content_clean =~ s/\n/\n$pad\|/g;
       +    $content_clean =~ s/^/$pad║ /g;
       +    $content_clean =~ s/\n/\n$pad║ /g;
        
            # print links if there were any.
            if($links) {
       -        $content_clean .= "\n$pad\|\n$links";
       +        $content_clean .= "\n$pad║ \n$links";
            } else {
                $content_clean .= "\n";    
            }
       t@@ -171,49 +291,107 @@ sub saveFile {
        
            # rename to temporary file to real file (atomic)
            rename("$path/.$filename", "$path/$filename") || die "Cannot rename temporary file: $filename\n";
       -    print "Debug: saveFile(\$content, $filename);\n\n";
       +    #print "Debug: saveFile(\$content, $filename);\n\n";
            return 0;
        }
        
        
        ### MAIN PROGRAM
       -
        my ($selected_story) = @ARGV;
       -my $json_fp = getApiData("$api_uri/search_by_date?tags=front_page&numericFilters=points>20,num_comments>5&hitsPerPage=100");
       -#my $json_fp = getApiData("$api_uri/search?tags=story");
       -
        my $content = "";
       +
       +# fetch top story IDs
       +my $json_top = getTopStories();
       +
       +# construct search query
       +my $query = "search?hitsPerPage=500&tags=story,(";
       +
       +# add stories to search query
       +my $count = 0;
       +for my $id (@$json_top) {
       +    $query .="story_$id,";
       +    $count++;
       +    if($count > $index_count) {
       +        last;
       +    }
       +}
       +
       +# remove trailing comma and close query
       +$query =~ s/,$/\)/g;
       +
       +# set up background tasks for parallel scraping
       +my $pm = new Parallel::ForkManager(50);
       +
       +my $json_fp = getApiData("$api_uri/$query");
        for my $hit ($json_fp->{"hits"}) {
            foreach my $story (@$hit) {
       +
       +        # do everything from here in background
       +        $pm->start and next;
       +
       +        # title is a link, escape "|"
                my $title  = encode("UTF-8", $story->{'title'});
                $title =~ s/\|/\\|/g;
       +
       +        # URL is either a HTML link line or a gopher dir
                my $url = "";
                if($story->{'url'}) {
                    $url    = encode("UTF-8", $story->{'url'});
                    $content .= "[h|  $title|URL:$url|server|port]\n";
                } else {
       -            $url    = "$go_path/story_$story->{'objectID'}.gph";
       +            $url    = "$go_path/comments_$story->{'objectID'}.gph";
                    $content .= "[1|  $title|$url|server|port]\n";
                }
       +
       +        # story author and object ID
                my $author = encode("UTF-8", $story->{'author'});
       +        my $objectID = $story->{'objectID'};
       +
       +        # parse date
       +        my $ago = parseDate($story->{'created_at'});
        
       -        $story->{'created_at'} =~ /(....-..-..)T(..:..).*/;
       -        my $date = $1;
       -        my $time = $2;
                my $number = 0;
                if($story->{'num_comments'}) {
                    $number = $story->{'num_comments'};
                }
        
       -        $content .= "  by $author ($story->{'points'} points) at $time ($date)\n";
       -        $content .= "[1|  read $number comments|$go_path/story_$story->{'objectID'}.gph|server|port]\n";
       +        # build content
       +        $content .= "  by $author ($story->{'points'} points) $ago\n";
       +        $content .= "[1|  read $number comments|$go_path/comments_$objectID.gph|server|port]\n";
                $content .= "\n";
       -        print "Debug: scrapeComments($story->{'objectID'}, $number);\n";
        
       -        scrapeComments($story->{'objectID'}, $number);
       +        # Save (if not already done - assuming the story doesn't change)
       +        # FIXME: the title could be changed by the staff
       +        if (not -e "$go_root$go_path/story_$objectID.gph") {
       +            saveFile($content, "story_$objectID.gph");
       +        }
       +
       +        # Fire up the comment scraper
       +        #print "Debug: scrapeComments($objectID, $number, $title);\n";
       +        scrapeComments($story->{'objectID'}, $number, $title);
       +
       +        # background task stops here
       +        $pm->finish
       +    }
       +}
       +
       +# wait for all scraping to be done and all cache files to be present
       +$pm->wait_all_children;
       +
       +# construct index from cached files
       +$count = 0;
       +my $index_out = "$logo";
       +for my $id (@$json_top) {
       +    if (-e "$go_root$go_path/story_$id.gph") {
       +        open(my $fh, '<', "$go_root$go_path/story_$id.gph");
       +        while (my $row = <$fh>) {
       +            $index_out .= $row;
       +        }
       +        close($fh);
            }
       +    $count++;
       +    if($count > $index_count) { last; }
        }
       -# saving index last to avoid broken links while scraper is running.
       -saveFile($content, "index.gph");
       +saveFile($index_out, "index.gph");
        
        exit 0;