it* hacker news on gopher Err codevoid.de 70 i Err codevoid.de 70 hgit clone git://git.codevoid.de/hn-gopher URL:git://git.codevoid.de/hn-gopher codevoid.de 70 1Log /git/hn-gopher/log.gph codevoid.de 70 1Files /git/hn-gopher/files.gph codevoid.de 70 1Refs /git/hn-gopher/refs.gph codevoid.de 70 i--- Err codevoid.de 70 1commit a0d7801aaae62f384ff7cbf5a628dfe12b211341 /git/hn-gopher/commit/a0d7801aaae62f384ff7cbf5a628dfe12b211341.gph codevoid.de 70 1parent 58874a778c1585b20a7dbd4696706ad13f248bee /git/hn-gopher/commit/58874a778c1585b20a7dbd4696706ad13f248bee.gph codevoid.de 70 hAuthor: Stefan Hagen URL:mailto:sh+git[at]codevoid[dot]de codevoid.de 70 iDate: Tue, 31 Jul 2018 23:13:12 +0200 Err codevoid.de 70 i Err codevoid.de 70 iAdd text variants using readability (ruby variant) Err codevoid.de 70 i Err codevoid.de 70 iDiffstat: Err codevoid.de 70 i M hn-scraper.pl | 98 +++++++++++++++++++------------ Err codevoid.de 70 i Err codevoid.de 70 i1 file changed, 62 insertions(+), 36 deletions(-) Err codevoid.de 70 i--- Err codevoid.de 70 1diff --git a/hn-scraper.pl b/hn-scraper.pl /git/hn-gopher/file/hn-scraper.pl.gph codevoid.de 70 it@@ -35,7 +35,7 @@ my $go_root = "/srv/codevoid-gopher"; Err codevoid.de 70 i my $go_path = "/hn"; Err codevoid.de 70 i my $index_count = 20; # item count per page Err codevoid.de 70 i my $total_count = 400; # total item count (all pages) Err codevoid.de 70 i-my $dumper = 0; # 1 creates plain text versions Err codevoid.de 70 i+my $dumper = 1; # 1 creates plain text versions Err codevoid.de 70 i Err codevoid.de 70 i ### CAN HAZ LOGO? SURE! Err codevoid.de 70 i my $logo =" _______ __ _______\n"; Err codevoid.de 70 it@@ -280,7 +280,7 @@ sub isHtml { Err codevoid.de 70 i # ads and other non-relevant data. This could be done on a per domain basis. Err codevoid.de 70 i # (this could be a separate program which could be reused in other projects) Err codevoid.de 70 i sub dumpArticle { Err codevoid.de 70 i- my ( $url, $objectID ) = @_; Err codevoid.de 70 i+ my ( $url, $objectID, $title ) = @_; Err codevoid.de 70 i Err codevoid.de 70 i # is it cached? return. Err codevoid.de 70 i if (-e "$go_root$go_path/article_$objectID.gph") { Err codevoid.de 70 it@@ -290,43 +290,70 @@ sub dumpArticle { Err codevoid.de 70 i # content type check Err codevoid.de 70 i $url = isHtml($url); Err codevoid.de 70 i if($url == 0) { Err codevoid.de 70 i- print "Skipping (not html)\n"; Err codevoid.de 70 i- Err codevoid.de 70 i # the supplied URL is not html, don't add it to the front page. Err codevoid.de 70 i return 1; Err codevoid.de 70 i } Err codevoid.de 70 i Err codevoid.de 70 i- # we got html, let's download it Err codevoid.de 70 i- my $ua = LWP::UserAgent->new; Err codevoid.de 70 i- my $req = HTTP::Request->new(GET => $url); Err codevoid.de 70 i- my $resp = $ua->request($req); Err codevoid.de 70 i- Err codevoid.de 70 i- if ($resp->is_success) { Err codevoid.de 70 i- Err codevoid.de 70 i- # OPTIMIZE: this would be the place to modify the HTML Err codevoid.de 70 i- # in $resp->decoded_content Err codevoid.de 70 i- Err codevoid.de 70 i- # call successful - convert it to text Err codevoid.de 70 i- my $f = HTML::FormatText::WithLinks->new(anchor_links => 0, unique_links => 1, base => "$url"); Err codevoid.de 70 i- my $message = $f->parse($resp->decoded_content); Err codevoid.de 70 i+ print "Scraping: $url\n"; Err codevoid.de 70 i+ my $message = decode("UTF-8", "$title\n"); Err codevoid.de 70 i+ $message .= "-------------------------------------------------------------------------\n"; Err codevoid.de 70 i+ $message .= decode("UTF-8", `/usr/local/bin/readability -i "$url"`); Err codevoid.de 70 i+ if($? ne 0) { Err codevoid.de 70 i+ print "Scraping failed: $url\n"; Err codevoid.de 70 i+ return 1; Err codevoid.de 70 i+ } Err codevoid.de 70 i Err codevoid.de 70 i- # wrap it to 72 characters (will destroy link lists) Err codevoid.de 70 i- #$Text::Wrap::columns=72; Err codevoid.de 70 i- #$message = wrap("","",$message); Err codevoid.de 70 i+ # call successful - convert it to text Err codevoid.de 70 i+ my $f = HTML::FormatText::WithLinks->new(anchor_links => 0, unique_links => 1, base => "$url"); Err codevoid.de 70 i+ $message = $f->parse($message); Err codevoid.de 70 i Err codevoid.de 70 i- # shrink multiple newlines Err codevoid.de 70 i- $message =~ s/\n\n(\n)*/\n\n/g; Err codevoid.de 70 i- $message =~ s/\t/ /g; Err codevoid.de 70 i- $message =~ s/\nt/\ntt/g; Err codevoid.de 70 i+ $message .= "\n\n\nSource:\n[h|$url|URL:$url|server|port]\n\n"; Err codevoid.de 70 i Err codevoid.de 70 i- # save to file Err codevoid.de 70 i- saveFile($message, "article_$objectID.gph"); Err codevoid.de 70 i- } else { Err codevoid.de 70 i- # the call was unsuccessful. We're not trying again here. Err codevoid.de 70 i- # The call be repeated on the next scraper run. Returning 1 here Err codevoid.de 70 i- # leads to the link to this file will not be added on the front page. Err codevoid.de 70 i- return 1; Err codevoid.de 70 i- } Err codevoid.de 70 i+ # shrink multiple newlines Err codevoid.de 70 i+ $message =~ s/\n\n(\n)*/\n\n/g; Err codevoid.de 70 i+ $message =~ s/\t/ /g; Err codevoid.de 70 i+ $message =~ s/\nt/\ntt/g; Err codevoid.de 70 i+ Err codevoid.de 70 i+ # save to file Err codevoid.de 70 i+ saveFile($message, "article_$objectID.gph"); Err codevoid.de 70 i+ Err codevoid.de 70 i+ # *** *** Err codevoid.de 70 i+ Err codevoid.de 70 i+ ## we got html, let's download it Err codevoid.de 70 i+ #my $ua = LWP::UserAgent->new; Err codevoid.de 70 i+ #my $req = HTTP::Request->new(GET => $url); Err codevoid.de 70 i+ #my $resp = $ua->request($req); Err codevoid.de 70 i+ Err codevoid.de 70 i+ #if ($resp->is_success) { Err codevoid.de 70 i+ # Err codevoid.de 70 i+ # # OPTIMIZE: this would be the place to modify the HTML Err codevoid.de 70 i+ # # in $resp->decoded_content Err codevoid.de 70 i+ # print "Scraping: $url\n"; Err codevoid.de 70 i+ # my $message = "Source: $url\n\n"; Err codevoid.de 70 i+ Err codevoid.de 70 i+ # # call successful - convert it to text Err codevoid.de 70 i+ # my $f = HTML::FormatText::WithLinks->new(anchor_links => 0, unique_links => 1, base => "$url"); Err codevoid.de 70 i+ # $message = $f->parse($resp->decoded_content); Err codevoid.de 70 i+ Err codevoid.de 70 i+ # # wrap it to 72 characters (will destroy link lists) Err codevoid.de 70 i+ # #$Text::Wrap::columns=72; Err codevoid.de 70 i+ # #$message = wrap("","",$message); Err codevoid.de 70 i+ Err codevoid.de 70 i+ # # shrink multiple newlines Err codevoid.de 70 i+ # $message =~ s/\n\n(\n)*/\n\n/g; Err codevoid.de 70 i+ # $message =~ s/\t/ /g; Err codevoid.de 70 i+ # $message =~ s/\nt/\ntt/g; Err codevoid.de 70 i+ Err codevoid.de 70 i+ # # save to file Err codevoid.de 70 i+ # saveFile($message, "article_$objectID.gph"); Err codevoid.de 70 i+ #} else { Err codevoid.de 70 i+ # # the call was unsuccessful. We're not trying again here. Err codevoid.de 70 i+ # # The call be repeated on the next scraper run. Returning 1 here Err codevoid.de 70 i+ # # leads to the link to this file will not be added on the front page. Err codevoid.de 70 i+ # return 1; Err codevoid.de 70 i+ #} Err codevoid.de 70 i+ # Err codevoid.de 70 i+ # *** *** Err codevoid.de 70 i Err codevoid.de 70 i # no complaints, add the link to this article. Err codevoid.de 70 i return 0; Err codevoid.de 70 it@@ -449,9 +476,8 @@ sub saveFile { Err codevoid.de 70 i print FH $content; Err codevoid.de 70 i close(FH); Err codevoid.de 70 i Err codevoid.de 70 i- # rename to temporary file to real file (atomic) Err codevoid.de 70 i+ # rename temporary file to real file (atomic) Err codevoid.de 70 i rename("$path/.$filename", "$path/$filename") || die "Cannot rename temporary file: $filename\n"; Err codevoid.de 70 i- #print "Debug: saveFile(\$content, $filename);\n\n"; Err codevoid.de 70 i return 0; Err codevoid.de 70 i } Err codevoid.de 70 i Err codevoid.de 70 it@@ -518,8 +544,8 @@ for my $hit ($topStoryList->{"hits"}) { Err codevoid.de 70 i Err codevoid.de 70 i # is the article dumper active? Err codevoid.de 70 i if($dumper == 1) { Err codevoid.de 70 i- if(dumpArticle($url, $objectID) eq 0) { Err codevoid.de 70 i- $link .= "[1| plaintext version|$go_path/article_$objectID.gph|server|port]\n"; Err codevoid.de 70 i+ if(dumpArticle($url, $objectID, $title) eq 0) { Err codevoid.de 70 i+ $link .= "[1| text version|$go_path/article_$objectID.gph|server|port]\n"; Err codevoid.de 70 i } Err codevoid.de 70 i } Err codevoid.de 70 i Err codevoid.de 70 .