* hacker news on gopher
       
   URI git clone git://git.codevoid.de/hn-gopher
   DIR Log
   DIR Files
   DIR Refs
       ---
   DIR commit 23a060ffd2c3ed13af0c3c49a8938c4802e3e5da
   URI Author: Stefan Hagen <sh+git[at]codevoid[dot]de>
       Date:   Sun, 29 Jul 2018 22:50:47 +0200
       
       Initial Commit
       
       Diffstat:
         A hn-scraper.pl                       |     214 +++++++++++++++++++++++++++++++
       
       1 file changed, 214 insertions(+), 0 deletions(-)
       ---
   DIR diff --git a/hn-scraper.pl b/hn-scraper.pl
        @@ -0,0 +1,214 @@
       +#!/usr/bin/env perl
       +
       +use strict;
       +use warnings;
       +use LWP::UserAgent;
       +use JSON;
       +use HTML::LinkExtractor;
       +use HTML::Restrict;
       +use HTML::Entities;
       +use Encode;
       +use Text::Wrap;
       +$Text::Wrap::columns=72;
       +use Data::Dumper;
       +
       +### CONFIGURATION
       +my $protocol   = "https";
       +my $server     = "hn.algolia.com";
       +my $api_taguri = "/api/v1/search?tags=";
       +my $go_root    = "/srv/codevoid-gopher";
       +my $go_path    = "/hn";
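        +
        +# These settings assemble into Algolia HN Search API requests like
        +# (illustrative):
        +#   https://hn.algolia.com/api/v1/search?tags=front_page
        +#   https://hn.algolia.com/api/v1/search?tags=comment,story_<objectID>&hitsPerPage=<n>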
       +
       +
       +### FUNCTIONS
       +### SUB: $json = getApiData("/api/...");
       +sub getApiData {
       +    my ( $uri ) = @_;
       +    print "Debug: getApiData($protocol://$server$uri)\n";
        +    my $REST = { HOST => $server,
        +                 URL  => "$protocol://$server$uri" };
       +    $REST->{UA} = LWP::UserAgent->new(keep_alive => 0, timeout => 30);
       +    $REST->{UA}->agent("codevoid-hackernews-gopherproxy/0.1"); 
       +    $REST->{resource} = $REST->{URL};
       +    $REST->{request}  = HTTP::Request->new( GET => $REST->{resource} );
       +    $REST->{response} = $REST->{UA}->request( $REST->{request} );
        +    if(not $REST->{response}->is_success()) {
        +        # note: retries indefinitely until the API responds
        +        print "Debug: Got \"", $REST->{response}->status_line, "\" trying again in 2 seconds...\n";
        +        sleep 2;
        +        return getApiData( $uri );
        +    }
       +    return decode_json($REST->{response}->content);
       +}
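        +
        +# Usage sketch (response shape as consumed below): decode_json() yields
        +# a hashref whose "hits" key holds an arrayref of story/comment records:
        +#   my $json = getApiData($api_taguri . "front_page");
        +#   foreach my $story (@{$json->{hits}}) { print $story->{title}, "\n"; }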
       +
       +### SUB: $gph = scrapeSubComments($payload, $parentID, $lvl)
       +sub scrapeSubComments {
       +    my ( $payload, $parentID, $lvl ) = @_;
       +    my $output = "";
        +    foreach my $comment (@{$payload->{hits}}) {
        +        if ($comment->{'parent_id'} == $parentID) {
        +            my $text     = encode("UTF-8", $comment->{'comment_text'});
        +            my $author   = encode("UTF-8", $comment->{'author'});
        +            my $objectID = $comment->{'objectID'};
        +            $output .= formatContent("$author:", $lvl);
        +            $output .= formatContent("$text", $lvl)."\n\n";
        +            # recurse one level deeper for replies to this comment;
        +            # $lvl itself must stay unchanged for later siblings
        +            $output .= scrapeSubComments( $payload, $objectID, $lvl + 1 );
        +        }
        +    }
       +    return $output;
       +}
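        +
        +# Illustration (hypothetical thread): a top-level comment by "alice"
        +# with a reply by "bob" nests two spaces per level:
        +#   alice:
        +#   comment text
        +#
        +#     bob:
        +#     reply text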
       +
       +### SUB: scrapeComments($objectID, $number)
       +sub scrapeComments {
       +    my ( $objectID, $number ) = @_;
       +    my $payload = getApiData("$api_taguri"."comment,story_$objectID&hitsPerPage=$number");
       +    my $content = scrapeSubComments($payload, $objectID, 0);
       +    saveFile($content, "story_$objectID.gph");
       +}
       +
       +### SUB: formatContent($content, $lvl)
       +sub formatContent {
       +    my ( $content, $lvl ) = @_;
       +    $content = decode_entities($content);
       +
        +    # remove trailing spaces on each line before wrapping
        +    $content =~ s/ +$//mg;
       +
        +    # calculate wrap width and left padding for this nesting level
        +    $Text::Wrap::columns = 72 - ($lvl * 2);
        +    my $pad = "  " x $lvl;
       +
       +    # Search for links
        +    my $LX = HTML::LinkExtractor->new();
       +    $LX->strip(1);
       +    $LX->parse(\$content);
       +
       +    # Replace some HTML elements
       +    my $HR = HTML::Restrict->new();
       +    $content =~ s/<p>/\n\n/g;
       +    $content =~ s/<li>/\n\n\* /g;
       +    $content =~ s/<blockquote>/\n\n--- QUOTE ---\n/g;
       +    $content =~ s/<\/blockquote>/\n---- END ----\n\n/g;
       +    my $content_clean = $HR->process($content);
       +
        +    # nobody needs more than one blank line.
       +    $content_clean =~ s/\n\n(\n)*/\n\n/g;
       +
        +    # Loop over links, match text, add [counter] and generate output.
       +    my $c = 0;
       +    my $links = "";
       +    foreach my $link ($LX->links) {
       +        foreach my $linkitem (@$link) {
       +
       +            # skip empty links (image links for example)
       +            if(!$linkitem->{_TEXT}) { next; }
       +
       +            $c++;
       +            $content_clean =~ s/(\Q$linkitem->{_TEXT}\E)/ \[$c\] /g;
       +
        +            # normalize whitespace (including newlines) around the
        +            # [n] marker, then collapse runs of spaces
        +            $content_clean =~ s/\s*\[$c\]\s*/ [$c] /g;
        +            $content_clean =~ s/ {2,}/ /g;
       +
       +            # shorten links
       +            my $short = $linkitem->{href};
       +            my $l = 63 - length($pad);
       +            if(length($short) > $l) { $short = substr($short,0,$l)."..."; }
       +
       +            # add link to output scalar
       +            $links .= sprintf("[h|${pad}[%i]: %s|URL:%s|codevoid.de|70]\n", $c, $short, $linkitem->{href});
       +        }   
       +    }
        +
        +    # Wrap content at the width set above (72 minus padding)
       +    $content_clean = wrap("","",$content_clean);
       +
       +    # shrink multiple newlines
       +    $content_clean =~ s/\n\n(\n)*/\n\n/g;
       +
       +    # Add padding to the left
       +    $content_clean =~ s/^/$pad/g;
       +    $content_clean =~ s/\n/\n$pad/g;
       +
        +    # append links if there were any.
       +    if($links) {
       +        $content_clean .= "\n\n$links";
       +    } else {
       +        $content_clean .= "\n";    
       +    }
       +
        +    # expand tabs; keeps column math predictable for gopher clients
        +    $content_clean =~ s/\t/    /g;
        +    # double a leading "t" so it displays literally (geomyidae strips
        +    # the first "t" of a plain-text line in its gph format)
        +    $content_clean =~ s/\nt/\ntt/g;
       +
       +    return $content_clean;
       +}
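        +
        +# Example (illustrative): a comment linking to https://example.com/
        +# comes out with the link text replaced by a marker and an [h] link
        +# line appended:
        +#   ... see the announcement [1] ...
        +#
        +#   [h|[1]: https://example.com/|URL:https://example.com/|codevoid.de|70]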
       +
       +### SUB: saveFile($content, $filename)
       +sub saveFile {
       +    my ( $content, $filename ) = @_;
       +    my $path = "$go_root$go_path";
       +
        +    # write temporary file (truncate any leftover from an aborted run)
        +    open(my $fh, '>', "$path/.$filename") || die "Cannot open temporary file: $filename\n";
        +      print $fh $content;
        +    close($fh);
       +
        +    # rename temporary file to real file (atomic)
       +    rename("$path/.$filename", "$path/$filename") || die "Cannot rename temporary file: $filename\n";
       +    print "Debug: saveFile(\$content, $filename);\n\n";
       +    return 0;
       +}
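        +
        +# With the configuration above, saveFile($gph, "index.gph") writes
        +# /srv/codevoid-gopher/hn/.index.gph first and then renames it into
        +# place, so clients never see a half-written file.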
       +
       +
       +### MAIN PROGRAM
       +
        +my ($selected_story) = @ARGV;    # note: accepted but currently unused
       +my $json_fp = getApiData("$api_taguri"."front_page");
       +
       +my $content = "";
        +foreach my $story (@{$json_fp->{hits}}) {
        +    my $title = encode("UTF-8", $story->{'title'});
        +    my $url   = "";
        +    if($story->{'url'}) {
        +        $url = encode("UTF-8", $story->{'url'});
        +    } else {
        +        # self posts (Ask HN etc.) point to the local comment page
        +        $url = "$go_path/story_$story->{'objectID'}.gph";
        +    }
        +    my $author = encode("UTF-8", $story->{'author'});
        +
        +    # "|" separates gph fields and must be escaped in titles
        +    $title =~ s/\|/\\|/g;
        +    my ($date, $time) = $story->{'created_at'} =~ /(....-..-..)T(..:..)/;
        +
        +    $content .= "[h|  $title|URL:$url|server|port]\n";
        +    $content .= "  by $author ($story->{'points'} points) at $time ($date)\n";
        +    $content .= "[1|  read $story->{'num_comments'} comments|$go_path/story_$story->{'objectID'}.gph|server|port]\n";
        +    $content .= "\n";
        +    print "Debug: scrapeComments($story->{'objectID'}, $story->{'num_comments'});\n";
        +    scrapeComments($story->{'objectID'}, $story->{'num_comments'});
        +}
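        +
        +# One index entry then renders as (illustrative values):
        +#   [h|  Example Story Title|URL:https://example.com/post|server|port]
        +#     by somebody (42 points) at 22:50 (2018-07-29)
        +#   [1|  read 17 comments|/hn/story_1234567.gph|server|port]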
       +# saving index last to avoid broken links while scraper is running.
       +saveFile($content, "index.gph");
       +
       +exit 0;