#!/usr/bin/env perl
#
# hn-scraper.pl - scrape the Hacker News front page via the Algolia
# search API and render it as gopher map (.gph) files below
# $go_root$go_path for serving with geomyidae.

use strict;
use warnings;
use LWP::UserAgent;
use HTTP::Request;
use JSON;
use HTML::LinkExtractor;
use HTML::Restrict;
use HTML::Entities;
use Encode;
use Text::Wrap;
$Text::Wrap::columns = 72;

### CONFIGURATION
my $protocol   = "https";
my $server     = "hn.algolia.com";
my $api_taguri = "/api/v1/search?tags=";
my $go_root    = "/srv/codevoid-gopher";
my $go_path    = "/hn";


### FUNCTIONS
### SUB: $json = getApiData("/api/...", [$retries_left]);
# Fetch a JSON document from the API and return it decoded into a
# Perl structure. Retries failed requests after a 2 second pause.
# BUGFIX: the original recursed forever on any HTTP error, which can
# hang the scraper (and grow the call stack) when the API is down.
# Retries are now capped; the optional second argument defaults to 5,
# so existing one-argument callers are unaffected.
sub getApiData {
    my ( $uri, $retries_left ) = @_;
    $retries_left = 5 unless defined $retries_left;
    print "Debug: getApiData($protocol://$server$uri)\n";

    my $ua = LWP::UserAgent->new( keep_alive => 0, timeout => 30 );
    $ua->agent("codevoid-hackernews-gopherproxy/0.1");

    my $response = $ua->request(
        HTTP::Request->new( GET => "$protocol://$server$uri" ) );

    if ( not $response->is_success() ) {
        die "API request for $uri failed permanently\n" if $retries_left <= 0;
        print "Debug: Got \"", $response->status_line,
              "\" trying again in 2 seconds...\n";
        sleep 2;
        return getApiData( $uri, $retries_left - 1 );
    }
    return decode_json( $response->content );
}

### SUB: $gph = scrapeSubComments($payload, $parentID, $lvl)
# Recursively collect all comments below $parentID from $payload
# (one Algolia result page), formatted and indented by $lvl levels.
sub scrapeSubComments {
    my ( $payload, $parentID, $lvl ) = @_;
    my $output = "";
    foreach my $comment ( @{ $payload->{hits} // [] } ) {
        next unless defined $comment->{parent_id}
                and $comment->{parent_id} eq $parentID;

        # Guard with // "": encode() warns on undef, and deleted
        # comments can come back without a comment_text field.
        my $text     = encode( "UTF-8", $comment->{comment_text} // "" );
        my $author   = encode( "UTF-8", $comment->{author}       // "" );
        my $objectID = $comment->{objectID};

        $output .= formatContent( "$author:", $lvl );
        $output .= formatContent( "$text",    $lvl ) . "\n\n";

        # BUGFIX: pass $lvl + 1 instead of ++$lvl. The original
        # incremented $lvl in place, so every later sibling at the
        # same depth was indented one extra level per preceding
        # subtree.
        $output .= scrapeSubComments( $payload, $objectID, $lvl + 1 );
    }
    return $output;
}

### SUB: scrapeComments($objectID, $number)
# Fetch up to $number comments of story $objectID and write the
# rendered comment tree to story_<id>.gph.
sub scrapeComments {
    my ( $objectID, $number ) = @_;
    my $payload = getApiData(
        $api_taguri . "comment,story_$objectID&hitsPerPage=$number" );
    my $content = scrapeSubComments( $payload, $objectID, 0 );
    saveFile( $content, "story_$objectID.gph" );
}

### SUB: formatContent($content, $lvl)
# Decode HTML entities, strip/replace markup, wrap the text to
# (72 - 2*$lvl) columns, indent it by $lvl levels and append any
# hyperlinks found as numbered gopher URL lines.
sub formatContent {
    my ( $content, $lvl ) = @_;
    $content = decode_entities($content);

    # remove trailing space before wrapping
    $content =~ s/ $/\n/g;

    # calculate wrap width and left padding: two spaces per level
    $Text::Wrap::columns = 72 - ( $lvl * 2 );
    my $pad = "  " x $lvl;

    # Extract links before the markup is stripped.
    my $LX = HTML::LinkExtractor->new();
    $LX->strip(1);
    $LX->parse( \$content );

    # Replace some HTML elements with plain-text equivalents,
    # then strip everything else.
    my $HR = HTML::Restrict->new();
    $content =~ s/<p>/\n\n/g;
    $content =~ s/<li>/\n\n\* /g;
    $content =~ s/<blockquote>/\n\n--- QUOTE ---\n/g;
    $content =~ s/<\/blockquote>/\n---- END ----\n\n/g;
    my $content_clean = $HR->process($content);

    # nobody needs more than one blank line.
    $content_clean =~ s/\n\n(\n)*/\n\n/g;

    # Loop over links, replace the anchor text with a [counter]
    # marker and collect one gopher URL line per link.
    my $c     = 0;
    my $links = "";
    foreach my $link ( $LX->links ) {
        foreach my $linkitem (@$link) {

            # skip empty links (image links for example)
            next unless $linkitem->{_TEXT};

            $c++;
            $content_clean =~ s/(\Q$linkitem->{_TEXT}\E)/ \[$c\] /g;

            # FIXME: brute-force cleanup of stray newlines and
            # blanks around the inserted [N] markers; each pattern
            # runs twice to catch overlapping occurrences. Works,
            # but deserves a proper rewrite.
            $content_clean =~ s/\n \[$c\] / \[$c\] /g;
            $content_clean =~ s/\n \[$c\] / \[$c\] /g;
            $content_clean =~ s/\[$c\] \n/ \[$c\] /g;
            $content_clean =~ s/\[$c\] \n/ \[$c\] /g;
            $content_clean =~ s/\n \[$c\] \n/ \[$c\] /g;
            $content_clean =~ s/\n \[$c\] \n/ \[$c\] /g;
            $content_clean =~ s/  / /g;
            $content_clean =~ s/  / /g;
            $content_clean =~ s/  / /g;

            # shorten links so the line fits a 70 column gopher menu
            my $short = $linkitem->{href};
            my $max   = 63 - length($pad);
            if ( length($short) > $max ) {
                $short = substr( $short, 0, $max ) . "...";
            }

            # add link to output scalar
            $links .= sprintf( "[h|${pad}[%i]: %s|URL:%s|codevoid.de|70]\n",
                $c, $short, $linkitem->{href} );
        }
    }

    # Wrap content at 72 columns minus padding.
    $content_clean = wrap( "", "", $content_clean );

    # shrink multiple newlines again after wrapping
    $content_clean =~ s/\n\n(\n)*/\n\n/g;

    # Add padding to the left of every line.
    $content_clean =~ s/^/$pad/g;
    $content_clean =~ s/\n/\n$pad/g;

    # append links if there were any.
    if ($links) {
        $content_clean .= "\n\n$links";
    }
    else {
        $content_clean .= "\n";
    }

    # Tabs break gph alignment; a leading 't' must be doubled to
    # 'tt' in the geomyidae gph format.
    $content_clean =~ s/\t/    /g;
    $content_clean =~ s/\nt/\ntt/g;

    return $content_clean;
}

### SUB: saveFile($content, $filename)
# Write $content to a hidden temp file and rename() it into place,
# so readers never see a half-written file (rename is atomic on the
# same filesystem).
sub saveFile {
    my ( $content, $filename ) = @_;
    my $path = "$go_root$go_path";

    # BUGFIX: open for truncate ('>') instead of append ('>>'); a
    # stale temp file left behind by a crashed run would otherwise
    # get its old content duplicated. Also: lexical filehandle,
    # 3-arg open, and close() checked so buffered write errors
    # (e.g. disk full) are not silently lost.
    open( my $fh, '>', "$path/.$filename" )
        or die "Cannot open temporary file: $filename\n";
    print {$fh} $content;
    close($fh)
        or die "Cannot close temporary file: $filename\n";

    # rename the temporary file to the real file (atomic)
    rename( "$path/.$filename", "$path/$filename" )
        or die "Cannot rename temporary file: $filename\n";
    print "Debug: saveFile(\$content, $filename);\n\n";
    return 0;
}


### MAIN PROGRAM

my ($selected_story) = @ARGV;    # currently unused; reserved for later
my $json_fp = getApiData( $api_taguri . "front_page" );

my $content = "";
foreach my $story ( @{ $json_fp->{hits} // [] } ) {
    my $title = encode( "UTF-8", $story->{title} // "" );
    my $url;
    if ( $story->{url} ) {
        $url = encode( "UTF-8", $story->{url} );
    }
    else {
        # Ask HN / text posts have no URL; link to the comment page.
        $url = "/hn/story_$story->{objectID}.gph";
    }
    my $author = encode( "UTF-8", $story->{author} // "" );

    # '|' is the gph field separator and must be escaped in titles.
    $title =~ s/\|/\\|/g;

    # BUGFIX: only use $1/$2 when the timestamp actually matched;
    # the original read the captures unconditionally, so a malformed
    # created_at would leak stale captures from a previous match.
    my ( $date, $time ) = ( "", "" );
    if ( $story->{created_at} =~ /(....-..-..)T(..:..)/ ) {
        ( $date, $time ) = ( $1, $2 );
    }

    $content .= "[h| $title|URL:$url|server|port]\n";
    $content .= " by $author ($story->{points} points) at $time ($date)\n";
    $content .= "[1| read $story->{num_comments} comments|$go_path/story_$story->{objectID}.gph|server|port]\n";
    $content .= "\n";
    print "Debug: scrapeComments($story->{objectID}, $story->{num_comments});\n";
    scrapeComments( $story->{objectID}, $story->{num_comments} );
}
# saving index last to avoid broken links while scraper is running.
saveFile( $content, "index.gph" );

exit 0;