it* hacker news on gopher Err codevoid.de 70 i Err codevoid.de 70 hgit clone git://git.codevoid.de/hn-gopher URL:git://git.codevoid.de/hn-gopher codevoid.de 70 1Log /git/hn-gopher/log.gph codevoid.de 70 1Files /git/hn-gopher/files.gph codevoid.de 70 1Refs /git/hn-gopher/refs.gph codevoid.de 70 i--- Err codevoid.de 70 1commit 58874a778c1585b20a7dbd4696706ad13f248bee /git/hn-gopher/commit/58874a778c1585b20a7dbd4696706ad13f248bee.gph codevoid.de 70 1parent 837b822cd29a7435100daf6426e86126dcb02dca /git/hn-gopher/commit/837b822cd29a7435100daf6426e86126dcb02dca.gph codevoid.de 70 hAuthor: Stefan Hagen URL:mailto:sh+git[at]codevoid[dot]de codevoid.de 70 iDate: Tue, 31 Jul 2018 21:16:17 +0200 Err codevoid.de 70 i Err codevoid.de 70 iAdd code comments, article scraper, pagination Err codevoid.de 70 i Err codevoid.de 70 iDiffstat: Err codevoid.de 70 i M hn-scraper.pl | 363 +++++++++++++++++++++++++------ Err codevoid.de 70 i Err codevoid.de 70 i1 file changed, 292 insertions(+), 71 deletions(-) Err codevoid.de 70 i--- Err codevoid.de 70 1diff --git a/hn-scraper.pl b/hn-scraper.pl /git/hn-gopher/file/hn-scraper.pl.gph codevoid.de 70 it@@ -1,17 +1,29 @@ Err codevoid.de 70 i #!/usr/bin/env perl Err codevoid.de 70 i Err codevoid.de 70 i+# default Err codevoid.de 70 i use strict; Err codevoid.de 70 i use warnings; Err codevoid.de 70 i+ Err codevoid.de 70 i+# parallel processing Err codevoid.de 70 i use Parallel::ForkManager; Err codevoid.de 70 i+ Err codevoid.de 70 i+# date formatting Err codevoid.de 70 i use DateTime; Err codevoid.de 70 i use DateTime::Duration; Err codevoid.de 70 i use DateTime::Format::Duration; Err codevoid.de 70 i+ Err codevoid.de 70 i+# network Err codevoid.de 70 i use LWP::UserAgent; Err codevoid.de 70 i+ Err codevoid.de 70 i+# protocol transformation Err codevoid.de 70 i use JSON; Err codevoid.de 70 i+use Encode; Err codevoid.de 70 i+ Err codevoid.de 70 i+# text formatting Err codevoid.de 70 i+use HTML::FormatText::WithLinks; Err codevoid.de 70 i use 
HTML::LinkExtractor; Err codevoid.de 70 i use HTML::Restrict; Err codevoid.de 70 i use HTML::Entities; Err codevoid.de 70 i-use Encode; Err codevoid.de 70 i use Text::Wrap; Err codevoid.de 70 i $Text::Wrap::columns=72; Err codevoid.de 70 i Err codevoid.de 70 it@@ -21,8 +33,11 @@ my $server = "hn.algolia.com"; Err codevoid.de 70 i my $api_uri = "/api/v1"; Err codevoid.de 70 i my $go_root = "/srv/codevoid-gopher"; Err codevoid.de 70 i my $go_path = "/hn"; Err codevoid.de 70 i-my $index_count = 100; Err codevoid.de 70 i+my $index_count = 20; # item count per page Err codevoid.de 70 i+my $total_count = 400; # total item count (all pages) Err codevoid.de 70 i+my $dumper = 0; # 1 creates plain text versions Err codevoid.de 70 i Err codevoid.de 70 i+### CAN HAZ LOGO? SURE! Err codevoid.de 70 i my $logo =" _______ __ _______\n"; Err codevoid.de 70 i $logo .="| | |.---.-..----.| |--..-----..----. | | |.-----..--.--.--..-----.\n"; Err codevoid.de 70 i $logo .="| || _ || __|| < | -__|| _| | || -__|| | | ||__ --|\n"; Err codevoid.de 70 it@@ -31,61 +46,94 @@ my $logo =" _______ __ _______\n"; Err codevoid.de 70 i $logo .= "[h|Visit Hacker News on the Internet|URL:https://news.ycombinator.com|server|port]\n\n"; Err codevoid.de 70 i Err codevoid.de 70 i ### FUNCTIONS Err codevoid.de 70 i-### SUB: $json = getTopStories(); Err codevoid.de 70 i+ Err codevoid.de 70 i+# SUB: $json = getTopStories(); Err codevoid.de 70 i+# read all top stories supplied by the firebase API. This API will only return Err codevoid.de 70 i+# the IDs of stories that are currently on the front page. In order. Err codevoid.de 70 i sub getTopStories { Err codevoid.de 70 i # FIXME make this configurable, maybe. 
Err codevoid.de 70 i- #print "Debug: getTopStories($protocol://hacker-news.firebaseio.com/v0/topstories.json)\n"; Err codevoid.de 70 i+ # yes, this is dupicate code to getApiData() Err codevoid.de 70 i my $REST= ({HOST => "hacker-news.firebaseio.com", Err codevoid.de 70 i- URL => "$protocol://hacker-news.firebaseio.com/v0/topstories.json" }); Err codevoid.de 70 i+ URL => "https://hacker-news.firebaseio.com/v0/topstories.json" }); Err codevoid.de 70 i $REST->{UA} = LWP::UserAgent->new(keep_alive => 0, timeout => 30); Err codevoid.de 70 i $REST->{UA}->agent("codevoid-hackernews-gopherproxy/0.1"); Err codevoid.de 70 i $REST->{resource} = $REST->{URL}; Err codevoid.de 70 i $REST->{request} = HTTP::Request->new( GET => $REST->{resource} ); Err codevoid.de 70 i $REST->{response} = $REST->{UA}->request( $REST->{request} ); Err codevoid.de 70 i+ Err codevoid.de 70 i+ # we're not giving up Err codevoid.de 70 i if(not $REST->{response}->is_success()) { Err codevoid.de 70 i- my $delay = 5; Err codevoid.de 70 i- #print "Debug: Got \"", $REST->{response}->status_line, "\" trying again in $delay seconds...\n"; Err codevoid.de 70 i- sleep $delay; Err codevoid.de 70 i+ sleep 5; Err codevoid.de 70 i return getTopStories(); Err codevoid.de 70 i } Err codevoid.de 70 i+ Err codevoid.de 70 i return decode_json($REST->{response}->content); Err codevoid.de 70 i } Err codevoid.de 70 i Err codevoid.de 70 i Err codevoid.de 70 i-### SUB: $json = getApiData("/api/..."); Err codevoid.de 70 i+# SUB: $json = getApiData("/api/..."); Err codevoid.de 70 i+# this call returns stories and comments. The nice thing about this is, that it Err codevoid.de 70 i+# can provide all comments to a story in one call. Err codevoid.de 70 i+# OPTIMIZE: right now, the story and comments are fetched separately. This Err codevoid.de 70 i+# could be combined in one call. 
Err codevoid.de 70 i sub getApiData { Err codevoid.de 70 i my ( $uri ) = @_; Err codevoid.de 70 i- #print "Debug: getApiData($protocol://$server$uri)\n"; Err codevoid.de 70 i+ Err codevoid.de 70 i my $REST= ({HOST => "$server", Err codevoid.de 70 i URL => "$protocol://$server$uri" }); Err codevoid.de 70 i+ Err codevoid.de 70 i $REST->{UA} = LWP::UserAgent->new(keep_alive => 0, timeout => 30); Err codevoid.de 70 i $REST->{UA}->agent("codevoid-hackernews-gopherproxy/0.1"); Err codevoid.de 70 i $REST->{resource} = $REST->{URL}; Err codevoid.de 70 i $REST->{request} = HTTP::Request->new( GET => $REST->{resource} ); Err codevoid.de 70 i $REST->{response} = $REST->{UA}->request( $REST->{request} ); Err codevoid.de 70 i+ Err codevoid.de 70 i+ # we're not giving up Err codevoid.de 70 i if(not $REST->{response}->is_success()) { Err codevoid.de 70 i- #print "Debug: Got \"", $REST->{response}->status_line, "\" trying again in 2 seconds...\n"; Err codevoid.de 70 i sleep 2; Err codevoid.de 70 i return getApiData ( $uri ); Err codevoid.de 70 i- } Err codevoid.de 70 i+ } Err codevoid.de 70 i+ Err codevoid.de 70 i return decode_json($REST->{response}->content); Err codevoid.de 70 i } Err codevoid.de 70 i Err codevoid.de 70 i Err codevoid.de 70 i-### SUB: $gph = scrapeSubComments($payload, $parentID, $lvl) Err codevoid.de 70 i+# SUB: $gph = scrapeSubComments($payload, $parentID, $lvl) Err codevoid.de 70 i+# recursive comment scraper Err codevoid.de 70 i+# this sub formats searches for a comment with the incoming parentID Err codevoid.de 70 i+# and adds it to $output. Then it calles itself again with the ID of Err codevoid.de 70 i+# the found comment and an increased indent level. Err codevoid.de 70 i+# Err codevoid.de 70 i+# Then searches for comments with the incoming ID as parent ID and Err codevoid.de 70 i+# adds the first hit to $output. Then it calls itself with the ID as Err codevoid.de 70 i+# parentID again... 
Err codevoid.de 70 i+# Err codevoid.de 70 i+# If no more comments are found with the supplied ID, it decreases Err codevoid.de 70 i+# the ident level and returns to the previous invocation. Err codevoid.de 70 i sub scrapeSubComments { Err codevoid.de 70 i my ( $payload, $parentID, $lvl ) = @_; Err codevoid.de 70 i+ Err codevoid.de 70 i+ # search for comment Err codevoid.de 70 i my $output = ""; Err codevoid.de 70 i for my $hit ($payload->{"hits"}) { Err codevoid.de 70 i foreach my $comment (@$hit) { Err codevoid.de 70 i+ Err codevoid.de 70 i+ # comment is found, add to output Err codevoid.de 70 i if ($comment->{'parent_id'} == $parentID) { Err codevoid.de 70 i- my $text = encode("UTF-8", $comment->{'comment_text'}); Err codevoid.de 70 i- my $author = encode("UTF-8", $comment->{'author'}); Err codevoid.de 70 i- my $objectID = $comment->{'objectID'}; Err codevoid.de 70 i- my $ago = parseDate($comment->{'created_at'}); Err codevoid.de 70 i+ Err codevoid.de 70 i+ # format data Err codevoid.de 70 i+ my $text = encode("UTF-8", $comment->{'comment_text'}); Err codevoid.de 70 i+ my $author = encode("UTF-8", $comment->{'author'}); Err codevoid.de 70 i+ my $ago = parseDate($comment->{'created_at'}); Err codevoid.de 70 i+ Err codevoid.de 70 i+ # add to output Err codevoid.de 70 i $output .= formatContent("$author wrote $ago:", $lvl); Err codevoid.de 70 i $output .= formatContent("$text", $lvl)."\n"; Err codevoid.de 70 i- $output .= scrapeSubComments( $payload, $objectID, ++$lvl ); Err codevoid.de 70 i+ Err codevoid.de 70 i+ # invoke itself with objectID and travers down the hierarchy Err codevoid.de 70 i+ $output .= scrapeSubComments( $payload, $comment->{'objectID'}, ++$lvl ); Err codevoid.de 70 i+ Err codevoid.de 70 i+ # decrease indentation level Err codevoid.de 70 i $lvl--; Err codevoid.de 70 i } Err codevoid.de 70 i } Err codevoid.de 70 it@@ -93,7 +141,11 @@ sub scrapeSubComments { Err codevoid.de 70 i return $output; Err codevoid.de 70 i } Err codevoid.de 70 i Err 
codevoid.de 70 i-### SUB: $datestr = parseDate($datestring) Err codevoid.de 70 i+# SUB: $datestr = parseDate($datestring) Err codevoid.de 70 i+# takes someting like 2018-04-23T23:45Z002 and converts it to a relative Err codevoid.de 70 i+# and humand readable notation like "4 days ago". Err codevoid.de 70 i+# OPTIMIZE: the Duration API can be used with parse pattern this should Err codevoid.de 70 i+# be used. It's probably simpler and faster. Err codevoid.de 70 i sub parseDate { Err codevoid.de 70 i my ( $datestring ) = @_; Err codevoid.de 70 i Err codevoid.de 70 it@@ -171,42 +223,147 @@ sub parseDate { Err codevoid.de 70 i return $dtstr; Err codevoid.de 70 i } Err codevoid.de 70 i Err codevoid.de 70 i-### SUB: scrapeComments($objectID, $number, $title) Err codevoid.de 70 i+# SUB: scrapeComments($objectID, $number, $link) Err codevoid.de 70 i+# this sets up the comment page frame. The content is added by hierarchial Err codevoid.de 70 i+# scrapeSubComments() calls. Err codevoid.de 70 i sub scrapeComments { Err codevoid.de 70 i- my ( $objectID, $number, $title ) = @_; Err codevoid.de 70 i- my $content = "$logo\nCOMMENT PAGE FOR:\n \"$title\"\n\n"; Err codevoid.de 70 i+ my ( $objectID, $number, $link ) = @_; Err codevoid.de 70 i+ Err codevoid.de 70 i+ # set header Err codevoid.de 70 i+ my $content = "$logo\nCOMMENT PAGE FOR:\n$link\n\n"; Err codevoid.de 70 i+ Err codevoid.de 70 i+ # the comment count. If this is zero, this call can be skipped. Err codevoid.de 70 i if($number) { Err codevoid.de 70 i+ # call API to receive all comments. 
The previews call already contains Err codevoid.de 70 i my $payload = getApiData("$api_uri/search?tags="."comment,story_$objectID&hitsPerPage=$number"); Err codevoid.de 70 i+ Err codevoid.de 70 i+ # invoke hiararchial scraper and hand over the payload Err codevoid.de 70 i+ # (only working in memory from here) Err codevoid.de 70 i $content .= scrapeSubComments($payload, $objectID, 0); Err codevoid.de 70 i } else { Err codevoid.de 70 i+ # previous call indicated 0 comments. Err codevoid.de 70 i $content .= "No comments available\n"; Err codevoid.de 70 i } Err codevoid.de 70 i+ Err codevoid.de 70 i+ # all comments have been added to the page. Add footer and save file. Err codevoid.de 70 i $content .= "\n[1|<- back to front page|$go_path|server|port]"; Err codevoid.de 70 i saveFile($content, "comments_$objectID.gph"); Err codevoid.de 70 i } Err codevoid.de 70 i Err codevoid.de 70 i+# SUB: $url = isHtml($url) Err codevoid.de 70 i+# this sub checks a given URL by performing a HEAD request. In case the URL is Err codevoid.de 70 i+# of type text/html, it will return the URL. Otherwise 0. 
Err codevoid.de 70 i+sub isHtml { Err codevoid.de 70 i+ my ( $url ) = @_; Err codevoid.de 70 i+ Err codevoid.de 70 i+ # perform HEAD request Err codevoid.de 70 i+ my $ua = LWP::UserAgent->new(keep_alive => 0, timeout => 30); Err codevoid.de 70 i+ $ua->agent("codevoid-hackernews-gopherproxy/0.1"); Err codevoid.de 70 i+ my $req = HTTP::Request->new(HEAD => $url); Err codevoid.de 70 i+ $req->header('Accept' => 'text/html'); Err codevoid.de 70 i+ my $resp = $ua->request($req); Err codevoid.de 70 i+ Err codevoid.de 70 i+ # check content type Err codevoid.de 70 i+ if ($resp->is_success && ($resp->content_type =~ m/text\/html/)) { Err codevoid.de 70 i+ return $resp->request()->uri(); Err codevoid.de 70 i+ } Err codevoid.de 70 i+ Err codevoid.de 70 i+ return 0; Err codevoid.de 70 i+} Err codevoid.de 70 i+ Err codevoid.de 70 i+# SUB: dumpArticle($url, $objectID) Err codevoid.de 70 i+# This sub downloads webpages and convert them into a plain text format than Err codevoid.de 70 i+# can be served on gopher. Once an article has been converted, it is not being Err codevoid.de 70 i+# downloaded again. Err codevoid.de 70 i+# OPTIMIZE: For some pages, this works great. Not for others. Some custom made Err codevoid.de 70 i+# preprocessing steps could be added to strip out navigation, footer, excessive Err codevoid.de 70 i+# ads and other non-relevant data. This could be done on a per domain basis. Err codevoid.de 70 i+# (this could be a separate program which could be reused in other projects) Err codevoid.de 70 i+sub dumpArticle { Err codevoid.de 70 i+ my ( $url, $objectID ) = @_; Err codevoid.de 70 i+ Err codevoid.de 70 i+ # is it cached? return. 
Err codevoid.de 70 i+ if (-e "$go_root$go_path/article_$objectID.gph") { Err codevoid.de 70 i+ return 0; Err codevoid.de 70 i+ } Err codevoid.de 70 i+ Err codevoid.de 70 i+ # content type check Err codevoid.de 70 i+ $url = isHtml($url); Err codevoid.de 70 i+ if($url == 0) { Err codevoid.de 70 i+ print "Skipping (not html)\n"; Err codevoid.de 70 i+ Err codevoid.de 70 i+ # the supplied URL is not html, don't add it to the front page. Err codevoid.de 70 i+ return 1; Err codevoid.de 70 i+ } Err codevoid.de 70 i+ Err codevoid.de 70 i+ # we got html, let's download it Err codevoid.de 70 i+ my $ua = LWP::UserAgent->new; Err codevoid.de 70 i+ my $req = HTTP::Request->new(GET => $url); Err codevoid.de 70 i+ my $resp = $ua->request($req); Err codevoid.de 70 i+ Err codevoid.de 70 i+ if ($resp->is_success) { Err codevoid.de 70 i+ Err codevoid.de 70 i+ # OPTIMIZE: this would be the place to modify the HTML Err codevoid.de 70 i+ # in $resp->decoded_content Err codevoid.de 70 i+ Err codevoid.de 70 i+ # call successful - convert it to text Err codevoid.de 70 i+ my $f = HTML::FormatText::WithLinks->new(anchor_links => 0, unique_links => 1, base => "$url"); Err codevoid.de 70 i+ my $message = $f->parse($resp->decoded_content); Err codevoid.de 70 i+ Err codevoid.de 70 i+ # wrap it to 72 characters (will destroy link lists) Err codevoid.de 70 i+ #$Text::Wrap::columns=72; Err codevoid.de 70 i+ #$message = wrap("","",$message); Err codevoid.de 70 i+ Err codevoid.de 70 i+ # shrink multiple newlines Err codevoid.de 70 i+ $message =~ s/\n\n(\n)*/\n\n/g; Err codevoid.de 70 i+ $message =~ s/\t/ /g; Err codevoid.de 70 i+ $message =~ s/\nt/\ntt/g; Err codevoid.de 70 i+ Err codevoid.de 70 i+ # save to file Err codevoid.de 70 i+ saveFile($message, "article_$objectID.gph"); Err codevoid.de 70 i+ } else { Err codevoid.de 70 i+ # the call was unsuccessful. We're not trying again here. Err codevoid.de 70 i+ # The call be repeated on the next scraper run. 
Returning 1 here Err codevoid.de 70 i+ # leads to the link to this file will not be added on the front page. Err codevoid.de 70 i+ return 1; Err codevoid.de 70 i+ } Err codevoid.de 70 i+ Err codevoid.de 70 i+ # no complaints, add the link to this article. Err codevoid.de 70 i+ return 0; Err codevoid.de 70 i+} Err codevoid.de 70 i+ Err codevoid.de 70 i ### SUB: formatContent($content, $lvl) Err codevoid.de 70 i+# This is the comment page formatter. It takes text and an indentation Err codevoid.de 70 i+# level und put this nicely on a page, with a level bar on the left. Err codevoid.de 70 i sub formatContent { Err codevoid.de 70 i my ( $content, $lvl ) = @_; Err codevoid.de 70 i+ Err codevoid.de 70 i+ # decode html notations Err codevoid.de 70 i $content = decode_entities($content); Err codevoid.de 70 i Err codevoid.de 70 i # remove trailing space before wrapping Err codevoid.de 70 i $content =~ s/ $/\n/g; Err codevoid.de 70 i Err codevoid.de 70 i+ # handle crazy indent levels that would leave no Err codevoid.de 70 i+ # room for text on the right side Err codevoid.de 70 i my $pad=""; Err codevoid.de 70 i if($lvl > 20) { Err codevoid.de 70 i $pad = "$lvl> "; Err codevoid.de 70 i $lvl = 19; Err codevoid.de 70 i } Err codevoid.de 70 i Err codevoid.de 70 i- # calculate padding Err codevoid.de 70 i+ # Setup text wrapper to wrap at 72 - indent level Err codevoid.de 70 i+ # each level in/decreases two spaces Err codevoid.de 70 i $Text::Wrap::columns=72-($lvl*2); Err codevoid.de 70 i+ Err codevoid.de 70 i+ # Calculate spaces to add on the left side Err codevoid.de 70 i+ # based on the reply/indent level. 
Err codevoid.de 70 i while($lvl > 0) { Err codevoid.de 70 i $pad=" ".$pad; Err codevoid.de 70 i $lvl--; Err codevoid.de 70 i } Err codevoid.de 70 i Err codevoid.de 70 i- # Search for links Err codevoid.de 70 i+ # Search for links in comments Err codevoid.de 70 i my $LX = new HTML::LinkExtractor(); Err codevoid.de 70 i $LX->strip(1); Err codevoid.de 70 i $LX->parse(\$content); Err codevoid.de 70 it@@ -215,8 +372,8 @@ sub formatContent { Err codevoid.de 70 i my $HR = HTML::Restrict->new(); Err codevoid.de 70 i $content =~ s/

<p>/\n\n/g; Err codevoid.de 70 i $content =~ s/<li>/\n\n\* /g; Err codevoid.de 70 i- $content =~ s/
<blockquote>/\n\n--- QUOTE ---\n/g; Err codevoid.de 70 i- $content =~ s/<\/blockquote>/\n---- END ----\n\n/g; Err codevoid.de 70 i+ Err codevoid.de 70 i+ # strip remaining HTML tags Err codevoid.de 70 i my $content_clean = $HR->process($content); Err codevoid.de 70 i Err codevoid.de 70 i # nobody needs more that one newline. Err codevoid.de 70 it@@ -230,8 +387,11 @@ sub formatContent { Err codevoid.de 70 i Err codevoid.de 70 i # skip empty links (image links for example) Err codevoid.de 70 i if(!$linkitem->{_TEXT}) { next; } Err codevoid.de 70 i- Err codevoid.de 70 i+ Err codevoid.de 70 i+ # link found, increase counter Err codevoid.de 70 i $c++; Err codevoid.de 70 i+ Err codevoid.de 70 i+ # replace link text with [$counter] Err codevoid.de 70 i $content_clean =~ s/(\Q$linkitem->{_TEXT}\E)/ \[$c\] /g; Err codevoid.de 70 i Err codevoid.de 70 i # make sure there are no newlines/extra spaces around [0] Err codevoid.de 70 it@@ -245,7 +405,7 @@ sub formatContent { Err codevoid.de 70 i $content_clean =~ s/\[5\][\.:\s\n]+\[3\]/\[5\]/g; Err codevoid.de 70 i $content_clean =~ s/ \[\d\] $//g; Err codevoid.de 70 i Err codevoid.de 70 i- # shorten links Err codevoid.de 70 i+ # shorten links that are too long for the indent level Err codevoid.de 70 i my $short = $linkitem->{href}; Err codevoid.de 70 i my $l = 62 - length($pad); Err codevoid.de 70 i if(length($short) > $l) { $short = substr($short,0,$l)."..."; } Err codevoid.de 70 it@@ -255,7 +415,6 @@ sub formatContent { Err codevoid.de 70 i } Err codevoid.de 70 i } Err codevoid.de 70 i Err codevoid.de 70 i- Err codevoid.de 70 i # Wrap content 72 - padding Err codevoid.de 70 i $content_clean = wrap("","",$content_clean); Err codevoid.de 70 i Err codevoid.de 70 it@@ -266,13 +425,14 @@ sub formatContent { Err codevoid.de 70 i $content_clean =~ s/^/$pad║ /g; Err codevoid.de 70 i $content_clean =~ s/\n/\n$pad║ /g; Err codevoid.de 70 i Err codevoid.de 70 i- # print links if there were any. Err codevoid.de 70 i+ # print links if any... 
Err codevoid.de 70 i if($links) { Err codevoid.de 70 i $content_clean .= "\n$pad║ \n$links"; Err codevoid.de 70 i } else { Err codevoid.de 70 i $content_clean .= "\n"; Err codevoid.de 70 i } Err codevoid.de 70 i Err codevoid.de 70 i+ # fix gopher issues (geomyidae design) Err codevoid.de 70 i $content_clean =~ s/\t/ /g; Err codevoid.de 70 i $content_clean =~ s/\nt/\ntt/g; Err codevoid.de 70 i Err codevoid.de 70 it@@ -311,7 +471,7 @@ my $count = 0; Err codevoid.de 70 i for my $id (@$json_top) { Err codevoid.de 70 i $query .="story_$id,"; Err codevoid.de 70 i $count++; Err codevoid.de 70 i- if($count > $index_count) { Err codevoid.de 70 i+ if($count >= $total_count) { Err codevoid.de 70 i last; Err codevoid.de 70 i } Err codevoid.de 70 i } Err codevoid.de 70 it@@ -319,81 +479,142 @@ for my $id (@$json_top) { Err codevoid.de 70 i # remove trailing comma and close query Err codevoid.de 70 i $query =~ s/,$/\)/g; Err codevoid.de 70 i Err codevoid.de 70 i+# fetch the top story IDs from firebase API Err codevoid.de 70 i+my $topStoryList = getApiData("$api_uri/$query"); Err codevoid.de 70 i+ Err codevoid.de 70 i # set up background tasks for parallel scraping Err codevoid.de 70 i-my $pm = new Parallel::ForkManager(10); Err codevoid.de 70 i+my $pm = new Parallel::ForkManager(50); Err codevoid.de 70 i Err codevoid.de 70 i-my $json_fp = getApiData("$api_uri/$query"); Err codevoid.de 70 i-for my $hit ($json_fp->{"hits"}) { Err codevoid.de 70 i+# scrape story header and comments Err codevoid.de 70 i+for my $hit ($topStoryList->{"hits"}) { Err codevoid.de 70 i foreach my $story (@$hit) { Err codevoid.de 70 i Err codevoid.de 70 i- # do everything from here in background Err codevoid.de 70 i+ # do everything from here on in background Err codevoid.de 70 i $pm->start and next; Err codevoid.de 70 i Err codevoid.de 70 i+ # convenient variables Err codevoid.de 70 i+ my $objectID = $story->{'objectID'}; Err codevoid.de 70 i+ my $author = encode("UTF-8", $story->{'author'}); Err 
codevoid.de 70 i+ my $title = encode("UTF-8", $story->{'title'}); Err codevoid.de 70 i+ my $url = encode("UTF-8", $story->{'url'}); Err codevoid.de 70 i+ Err codevoid.de 70 i+ # comments (default to 0) Err codevoid.de 70 i+ my $number = 0; Err codevoid.de 70 i+ if($story->{'num_comments'}) { Err codevoid.de 70 i+ $number = $story->{'num_comments'}; Err codevoid.de 70 i+ } Err codevoid.de 70 i+ Err codevoid.de 70 i+ # parse date and convert to relative notation (5 min ago) Err codevoid.de 70 i+ my $ago = parseDate($story->{'created_at'}); Err codevoid.de 70 i+ Err codevoid.de 70 i # title is a link, escape "|" Err codevoid.de 70 i- my $title = encode("UTF-8", $story->{'title'}); Err codevoid.de 70 i $title =~ s/\|/\\|/g; Err codevoid.de 70 i Err codevoid.de 70 i # URL is either a HTML link line or a gopher dir Err codevoid.de 70 i- my $url = ""; Err codevoid.de 70 i- if($story->{'url'}) { Err codevoid.de 70 i- $url = encode("UTF-8", $story->{'url'}); Err codevoid.de 70 i- $content .= "[h| $title|URL:$url|server|port]\n"; Err codevoid.de 70 i+ my $link; Err codevoid.de 70 i+ if($url) { Err codevoid.de 70 i+ # link goes HTTP Err codevoid.de 70 i+ $link = "[h| $title|URL:$url|server|port]\n"; Err codevoid.de 70 i+ Err codevoid.de 70 i+ # is the article dumper active? 
Err codevoid.de 70 i+ if($dumper == 1) { Err codevoid.de 70 i+ if(dumpArticle($url, $objectID) eq 0) { Err codevoid.de 70 i+ $link .= "[1| plaintext version|$go_path/article_$objectID.gph|server|port]\n"; Err codevoid.de 70 i+ } Err codevoid.de 70 i+ } Err codevoid.de 70 i+ Err codevoid.de 70 i } else { Err codevoid.de 70 i- $url = "$go_path/comments_$story->{'objectID'}.gph"; Err codevoid.de 70 i- $content .= "[1| $title|$url|server|port]\n"; Err codevoid.de 70 i+ # link goes GOPHER (redefine URL to comments [Ask HN]) Err codevoid.de 70 i+ $url = "$go_path/comments_$story->{'objectID'}.gph"; Err codevoid.de 70 i+ $link = "[1| $title|$url|server|port]\n"; Err codevoid.de 70 i } Err codevoid.de 70 i Err codevoid.de 70 i- # Err codevoid.de 70 i- my $author = encode("UTF-8", $story->{'author'}); Err codevoid.de 70 i- my $objectID = $story->{'objectID'}; Err codevoid.de 70 i- Err codevoid.de 70 i- # parse date Err codevoid.de 70 i- my $ago = parseDate($story->{'created_at'}); Err codevoid.de 70 i+ # add title link line Err codevoid.de 70 i+ $content .= $link; Err codevoid.de 70 i Err codevoid.de 70 i- my $number = 0; Err codevoid.de 70 i- if($story->{'num_comments'}) { Err codevoid.de 70 i- $number = $story->{'num_comments'}; Err codevoid.de 70 i- } Err codevoid.de 70 i- Err codevoid.de 70 i- # build content Err codevoid.de 70 i+ # add author line Err codevoid.de 70 i $content .= " by $author ($story->{'points'} points) $ago\n"; Err codevoid.de 70 i+ Err codevoid.de 70 i+ # add comment link line Err codevoid.de 70 i $content .= "[1| read $number comments|$go_path/comments_$objectID.gph|server|port]\n"; Err codevoid.de 70 i+ Err codevoid.de 70 i+ # aaaand one blank Err codevoid.de 70 i $content .= "\n"; Err codevoid.de 70 i Err codevoid.de 70 i- # Save (if not already done - assuming the story doesn't change) Err codevoid.de 70 i- # FIXME: the title could be changed by the staff Err codevoid.de 70 i- if (not -e "$go_root$go_path/story_$objectID.gph") { Err codevoid.de 
70 i- saveFile($content, "story_$objectID.gph"); Err codevoid.de 70 i- } Err codevoid.de 70 i+ # Save story file Err codevoid.de 70 i+ saveFile($content, "story_$objectID.gph"); Err codevoid.de 70 i Err codevoid.de 70 i # Fire up the comment scraper Err codevoid.de 70 i- #print "Debug: scrapeComments($objectID, $number, $title);\n"; Err codevoid.de 70 i- scrapeComments($story->{'objectID'}, $number, $title); Err codevoid.de 70 i+ scrapeComments($story->{'objectID'}, $number, $link); Err codevoid.de 70 i Err codevoid.de 70 i- # background task stopps here Err codevoid.de 70 i- $pm->finish Err codevoid.de 70 i+ # background task stops here Err codevoid.de 70 i+ $pm->finish; Err codevoid.de 70 i } Err codevoid.de 70 i } Err codevoid.de 70 i Err codevoid.de 70 i # wait for all scraping be done and all cache files be present Err codevoid.de 70 i $pm->wait_all_children; Err codevoid.de 70 i Err codevoid.de 70 i-# construct index from cached files Err codevoid.de 70 i+# construct index Err codevoid.de 70 i $count = 0; Err codevoid.de 70 i+ Err codevoid.de 70 i+# setup pagination variables Err codevoid.de 70 i+my $page = 1; Err codevoid.de 70 i+my $nextpage; Err codevoid.de 70 i+my $prevpage; Err codevoid.de 70 i+my $filename; Err codevoid.de 70 i+ Err codevoid.de 70 i+# initialize output variable Err codevoid.de 70 i my $index_out = "$logo"; Err codevoid.de 70 i+ Err codevoid.de 70 i+# loop at all top stories (to keep the sequence) Err codevoid.de 70 i for my $id (@$json_top) { Err codevoid.de 70 i+ Err codevoid.de 70 i+ # append the story files Err codevoid.de 70 i if (-e "$go_root$go_path/story_$id.gph") { Err codevoid.de 70 i open(my $fh, '<', "$go_root$go_path/story_$id.gph"); Err codevoid.de 70 i- while (my $row = <$fh>) { Err codevoid.de 70 i- $index_out .= $row; Err codevoid.de 70 i- } Err codevoid.de 70 i+ while (my $row = <$fh>) { $index_out .= $row; } Err codevoid.de 70 i close($fh); Err codevoid.de 70 i } Err codevoid.de 70 i+ Err codevoid.de 70 i+ # increase 
story counter Err codevoid.de 70 i $count++; Err codevoid.de 70 i- # OPTIMIZE: Add pagignation? (who goes to page 2 anyway...) Err codevoid.de 70 i- if($count > $index_count) { last; } Err codevoid.de 70 i+ Err codevoid.de 70 i+ # Pagination Err codevoid.de 70 i+ if(($count % $index_count) eq 0) { Err codevoid.de 70 i+ Err codevoid.de 70 i+ # setup defaults Err codevoid.de 70 i+ $filename = "index-$page.gph"; Err codevoid.de 70 i+ $nextpage = $page + 1; Err codevoid.de 70 i+ $prevpage = $page - 1; Err codevoid.de 70 i+ Err codevoid.de 70 i+ # special handling for first page (different name) Err codevoid.de 70 i+ if($page eq 1) { Err codevoid.de 70 i+ $filename = "index.gph"; Err codevoid.de 70 i+ $index_out .= "[1| Next Page ($nextpage) >>|$go_path/index-$nextpage.gph|server|port]\n\n"; Err codevoid.de 70 i+ $index_out .= "[1|<< Back Home|/|server|port]"; Err codevoid.de 70 i+ } else { Err codevoid.de 70 i+ $index_out .= "[1| Next Page ($nextpage) >>|$go_path/index-$nextpage.gph|server|port]"; Err codevoid.de 70 i+ } Err codevoid.de 70 i+ Err codevoid.de 70 i+ # incease page counter Err codevoid.de 70 i+ $page++; Err codevoid.de 70 i+ Err codevoid.de 70 i+ # done, save file, proceed with next page Err codevoid.de 70 i+ saveFile($index_out, $filename); Err codevoid.de 70 i+ Err codevoid.de 70 i+ # initialize indexout for next run Err codevoid.de 70 i+ $index_out = "$logo"; Err codevoid.de 70 i+ Err codevoid.de 70 i+ } else { Err codevoid.de 70 i+ Err codevoid.de 70 i+ # handle last page Err codevoid.de 70 i+ if ( $count >= $total_count ) { Err codevoid.de 70 i+ $index_out .= "[1| << Prev Page ($prevpage) |$go_path/index-$prevpage.gph|server|port]"; Err codevoid.de 70 i+ saveFile($index_out, $filename); Err codevoid.de 70 i+ last; Err codevoid.de 70 i+ } Err codevoid.de 70 i+ } Err codevoid.de 70 i } Err codevoid.de 70 i-$index_out .= "\n[1|<- go back home|/|server|port]"; Err codevoid.de 70 i-saveFile($index_out, "index.gph"); Err codevoid.de 70 i Err codevoid.de 70 
i exit 0; Err codevoid.de 70 .