#!/usr/bin/env perl
#
# hn-scraper.pl - hacker news on gopher
#
# Scrapes the Hacker News front page and the comment threads of every front
# page story through the Algolia HN search API and writes them as geomyidae
# ".gph" files below $go_root$go_path.
#
# Provenance (taken from the gopher rendering this file was recovered from):
#   git clone git://git.codevoid.de/hn-gopher
#   commit 23a060ffd2c3ed13af0c3c49a8938c4802e3e5da
#   Author: Stefan Hagen <sh+git[at]codevoid[dot]de>
#   Date:   Sun, 29 Jul 2018 22:50:47 +0200
#   Initial Commit
#   Diffstat: A hn-scraper.pl | 214 insertions(+), 0 deletions(-)
#
# NOTE(review): the recovered text had its HTML tags rendered away and runs of
# whitespace collapsed.  Places where a literal had to be reconstructed are
# marked "reconstructed" below - confirm them against the repository.

use strict;
use warnings;
use LWP::UserAgent;
use JSON;
use HTML::LinkExtractor;
use HTML::Restrict;
use HTML::Entities;
use Encode;
use Text::Wrap;
$Text::Wrap::columns=72;
use Data::Dumper;

### CONFIGURATION
my $protocol    = "https";
my $server      = "hn.algolia.com";
my $api_taguri  = "/api/v1/search?tags=";
my $go_root     = "/srv/codevoid-gopher";
my $go_path     = "/hn";

# Number of HTTP attempts before getApiData gives up, and the pause between
# two attempts in seconds.
my $max_attempts = 10;
my $retry_delay  = 2;

# Layout: total line width, indentation added per comment nesting level
# (reconstructed: two spaces, matching the "72-($lvl*2)" width calculation),
# and lower bounds so that very deep threads still produce sane output.
my $wrap_columns     = 72;
my $indent           = "  ";
my $min_wrap_columns = 20;
my $link_columns     = 63;
my $min_link_columns = 10;


### FUNCTIONS
### SUB: $escaped = escapeGph($string)
# Escapes the field separator "|" so the string can be placed inside a
# geomyidae "[type|text|selector|server|port]" link line.
sub escapeGph {
    my ( $string ) = @_;
    $string = '' unless defined $string;
    $string =~ s/\|/\\|/g;
    return $string;
}

### SUB: $json = getApiData("/api/...");
# Fetches $protocol://$server$uri and returns the decoded JSON document.
# A failed request is retried every $retry_delay seconds; after
# $max_attempts failures the function dies instead of retrying forever.
# Dies as well when the response body is not valid JSON (decode_json).
sub getApiData {
    my ( $uri ) = @_;
    my $url = "$protocol://$server$uri";
    print "Debug: getApiData($url)\n";

    my $ua = LWP::UserAgent->new(keep_alive => 0, timeout => 30);
    $ua->agent("codevoid-hackernews-gopherproxy/0.1");

    my $status = "no request sent";
    for my $attempt ( 1 .. $max_attempts ) {
        my $response = $ua->get($url);

        # decode_json expects UTF-8 encoded bytes, so use the raw content.
        return decode_json($response->content) if $response->is_success();

        $status = $response->status_line;
        last if $attempt == $max_attempts;
        print "Debug: Got \"", $status, "\" trying again in $retry_delay seconds...\n";
        sleep $retry_delay;
    }
    die "getApiData: giving up on $url after $max_attempts attempts ($status)\n";
}

### SUB: $gph = scrapeSubComments($payload, $parentID, $lvl)
# Renders every comment in $payload->{hits} whose parent_id equals $parentID,
# each followed (depth first) by its own replies one level deeper.
# Returns the formatted text; an empty string when there are no replies.
sub scrapeSubComments {
    my ( $payload, $parentID, $lvl ) = @_;
    my $output = "";

    for my $comment ( @{ $payload->{hits} || [] } ) {
        my $parent = $comment->{parent_id};
        next unless defined $parent && $parent == $parentID;

        my $author = defined $comment->{author}       ? $comment->{author}       : '';
        my $text   = defined $comment->{comment_text} ? $comment->{comment_text} : '';

        $output .= formatContent("$author:", $lvl);
        $output .= formatContent($text, $lvl)."\n\n";

        # Pass $lvl + 1 instead of incrementing $lvl: siblings handled later
        # in this loop must stay on the current level.
        $output .= scrapeSubComments( $payload, $comment->{objectID}, $lvl + 1 );
    }
    return $output;
}

### SUB: scrapeComments($objectID, $number)
# Downloads up to $number comments of story $objectID and saves the rendered
# thread as "story_$objectID.gph".
sub scrapeComments {
    my ( $objectID, $number ) = @_;
    my $payload = getApiData("$api_taguri"."comment,story_$objectID&hitsPerPage=$number");
    my $content = scrapeSubComments($payload, $objectID, 0);
    saveFile($content, "story_$objectID.gph");
    return;
}

### SUB: $gph = formatContent($content, $lvl)
# Converts an HTML fragment into wrapped plain text indented for nesting
# level $lvl.  Link texts are replaced by "[n]" markers and the links
# themselves are appended as gph link lines.  Works on character strings;
# encoding to UTF-8 happens in saveFile.
sub formatContent {
    my ( $content, $lvl ) = @_;
    $content = '' unless defined $content;
    $lvl     = 0  unless defined $lvl && $lvl > 0;

    $content = decode_entities($content);

    # replace a trailing space with a newline before wrapping
    $content =~ s/ $/\n/g;

    # calculate padding and the remaining text width
    my $pad   = $indent x $lvl;
    my $width = $wrap_columns - length($pad);
    $width = $min_wrap_columns if $width < $min_wrap_columns;

    # local: do not leak the per-call width into the global setting
    local $Text::Wrap::columns = $width;

    # Search for links
    my $LX = HTML::LinkExtractor->new();
    $LX->strip(1);
    $LX->parse(\$content);

    # Replace some HTML elements (opening tags reconstructed - the recovered
    # source only kept the closing </blockquote> literally)
    my $HR = HTML::Restrict->new();
    $content =~ s/<p>/\n\n/g;
    $content =~ s/<li>/\n\n\* /g;
    $content =~ s/<blockquote>/\n\n--- QUOTE ---\n/g;
    $content =~ s/<\/blockquote>/\n---- END ----\n\n/g;
    my $content_clean = $HR->process($content);
    $content_clean = '' unless defined $content_clean;

    # nobody needs more than one empty line.
    $content_clean =~ s/\n{3,}/\n\n/g;

    # Loop over links, match text, add [counter] and generate output.
    my $c = 0;
    my $links = "";
    for my $linkitem ( @{ $LX->links || [] } ) {

        # skip empty links (image links for example)
        next unless $linkitem->{_TEXT};
        next unless defined $linkitem->{href};

        $c++;
        $content_clean =~ s/\Q$linkitem->{_TEXT}\E/ [$c] /g;

        # Keep the marker inline: drop line breaks and surplus blanks that
        # ended up around it, then squeeze repeated spaces.
        $content_clean =~ s/[ \n]*\[$c\][ \n]*/ [$c] /g;
        $content_clean =~ s/ {2,}/ /g;

        # shorten links for display
        my $href    = $linkitem->{href};
        my $max_len = $link_columns - length($pad);
        $max_len = $min_link_columns if $max_len < $min_link_columns;
        my $short = length($href) > $max_len
                  ? substr($href, 0, $max_len)."..."
                  : $href;

        # add link to output scalar
        $links .= sprintf("[h|%s[%d]: %s|URL:%s|codevoid.de|70]\n",
                          $pad, $c, escapeGph($short), escapeGph($href));
    }

    # Wrap content to the calculated width
    $content_clean = wrap("","",$content_clean);

    # shrink multiple newlines
    $content_clean =~ s/\n{3,}/\n\n/g;

    # Add padding to the left
    $content_clean =~ s/^/$pad/;
    $content_clean =~ s/\n/\n$pad/g;

    # print links if there were any.
    if($links) {
        $content_clean .= "\n\n$links";
    } else {
        $content_clean .= "\n";
    }

    # Tabs separate fields in gopher menus; replace them
    # (reconstructed width - TODO confirm).
    $content_clean =~ s/\t/    /g;

    # geomyidae strips a leading "t" from a line, so double it - including
    # on the very first line, which a "\nt" pattern would miss.
    $content_clean =~ s/^t/tt/mg;

    return $content_clean;
}

### SUB: saveFile($content, $filename)
# Writes $content UTF-8 encoded to $go_root$go_path/$filename.  The data is
# written to a hidden temporary file first and then renamed, so readers never
# see a half written file.  Dies on any I/O error, returns 0 on success.
sub saveFile {
    my ( $content, $filename ) = @_;
    my $path  = "$go_root$go_path";
    my $tmp   = "$path/.$filename";
    my $final = "$path/$filename";

    # save temporary file; truncate it so leftovers of an aborted run are
    # not carried over into the new file
    open(my $fh, '>:encoding(UTF-8)', $tmp)
        or die "Cannot open temporary file: $tmp: $!\n";
    print {$fh} $content
        or die "Cannot write temporary file: $tmp: $!\n";
    close($fh)
        or die "Cannot close temporary file: $tmp: $!\n";

    # rename temporary file to real file (atomic)
    rename($tmp, $final)
        or die "Cannot rename temporary file: $filename: $!\n";
    print "Debug: saveFile(\$content, $filename);\n\n";
    return 0;
}


### MAIN PROGRAM
# Builds the front page index and one comment file per story.  Command line
# arguments are accepted but ignored, as before.
sub main {
    my $json_fp = getApiData("$api_taguri"."front_page");

    my $content = "";
    for my $story ( @{ $json_fp->{hits} || [] } ) {
        my $id = $story->{objectID};
        next unless defined $id;

        my $title    = escapeGph($story->{title});
        my $author   = defined $story->{author}       ? $story->{author}       : '';
        my $points   = defined $story->{points}       ? $story->{points}       : 0;
        my $comments = defined $story->{num_comments} ? $story->{num_comments} : 0;

        # stories without external URL (Ask HN, ...) point to their thread
        my $url = $story->{url} ? $story->{url} : "$go_path/story_$id.gph";
        $url = escapeGph($url);

        my ( $date, $time ) = ( '', '' );
        if ( defined $story->{created_at}
             && $story->{created_at} =~ /\A(\d{4}-\d{2}-\d{2})T(\d{2}:\d{2})/ ) {
            ( $date, $time ) = ( $1, $2 );
        }

        $content .= "[h| $title|URL:$url|server|port]\n";
        $content .= " by $author ($points points) at $time ($date)\n";
        $content .= "[1| read $comments comments|$go_path/story_$id.gph|server|port]\n";
        $content .= "\n";
        print "Debug: scrapeComments($id, $comments);\n";
        scrapeComments($id, $comments);
    }

    # saving index last to avoid broken links while scraper is running.
    saveFile($content, "index.gph");
    return 0;
}

exit main() unless caller;