t* hacker news on gopher URI git clone git://git.codevoid.de/hn-gopher DIR Log DIR Files DIR Refs --- DIR commit a0d7801aaae62f384ff7cbf5a628dfe12b211341 DIR parent 58874a778c1585b20a7dbd4696706ad13f248bee URI Author: Stefan Hagen <sh+git[at]codevoid[dot]de> Date: Tue, 31 Jul 2018 23:13:12 +0200 Add text variants using readability (ruby variant) Diffstat: M hn-scraper.pl | 98 +++++++++++++++++++------------ 1 file changed, 62 insertions(+), 36 deletions(-) --- DIR diff --git a/hn-scraper.pl b/hn-scraper.pl t@@ -35,7 +35,7 @@ my $go_root = "/srv/codevoid-gopher"; my $go_path = "/hn"; my $index_count = 20; # item count per page my $total_count = 400; # total item count (all pages) -my $dumper = 0; # 1 creates plain text versions +my $dumper = 1; # 1 creates plain text versions ### CAN HAZ LOGO? SURE! my $logo =" _______ __ _______\n"; t@@ -280,7 +280,7 @@ sub isHtml { # ads and other non-relevant data. This could be done on a per domain basis. # (this could be a separate program which could be reused in other projects) sub dumpArticle { - my ( $url, $objectID ) = @_; + my ( $url, $objectID, $title ) = @_; # is it cached? return. if (-e "$go_root$go_path/article_$objectID.gph") { t@@ -290,43 +290,70 @@ sub dumpArticle { # content type check $url = isHtml($url); if($url == 0) { - print "Skipping (not html)\n"; - # the supplied URL is not html, don't add it to the front page. 
return 1; } - # we got html, let's download it - my $ua = LWP::UserAgent->new; - my $req = HTTP::Request->new(GET => $url); - my $resp = $ua->request($req); - - if ($resp->is_success) { - - # OPTIMIZE: this would be the place to modify the HTML - # in $resp->decoded_content - - # call successful - convert it to text - my $f = HTML::FormatText::WithLinks->new(anchor_links => 0, unique_links => 1, base => "$url"); - my $message = $f->parse($resp->decoded_content); + print "Scraping: $url\n"; + my $message = decode("UTF-8", "$title\n"); + $message .= "-------------------------------------------------------------------------\n"; + $message .= decode("UTF-8", `/usr/local/bin/readability -i "$url"`); + if($? ne 0) { + print "Scraping failed: $url\n"; + return 1; + } - # wrap it to 72 characters (will destroy link lists) - #$Text::Wrap::columns=72; - #$message = wrap("","",$message); + # call successful - convert it to text + my $f = HTML::FormatText::WithLinks->new(anchor_links => 0, unique_links => 1, base => "$url"); + $message = $f->parse($message); - # shrink multiple newlines - $message =~ s/\n\n(\n)*/\n\n/g; - $message =~ s/\t/ /g; - $message =~ s/\nt/\ntt/g; + $message .= "\n\n\nSource:\n[h|$url|URL:$url|server|port]\n\n"; - # save to file - saveFile($message, "article_$objectID.gph"); - } else { - # the call was unsuccessful. We're not trying again here. - # The call be repeated on the next scraper run. Returning 1 here - # leads to the link to this file will not be added on the front page. 
- return 1; - } + # shrink multiple newlines + $message =~ s/\n\n(\n)*/\n\n/g; + $message =~ s/\t/ /g; + $message =~ s/\nt/\ntt/g; + + # save to file + saveFile($message, "article_$objectID.gph"); + + # *** <this part has been replaced with readability> *** + + ## we got html, let's download it + #my $ua = LWP::UserAgent->new; + #my $req = HTTP::Request->new(GET => $url); + #my $resp = $ua->request($req); + + #if ($resp->is_success) { + # + # # OPTIMIZE: this would be the place to modify the HTML + # # in $resp->decoded_content + # print "Scraping: $url\n"; + # my $message = "Source: $url\n\n"; + + # # call successful - convert it to text + # my $f = HTML::FormatText::WithLinks->new(anchor_links => 0, unique_links => 1, base => "$url"); + # $message = $f->parse($resp->decoded_content); + + # # wrap it to 72 characters (will destroy link lists) + # #$Text::Wrap::columns=72; + # #$message = wrap("","",$message); + + # # shrink multiple newlines + # $message =~ s/\n\n(\n)*/\n\n/g; + # $message =~ s/\t/ /g; + # $message =~ s/\nt/\ntt/g; + + # # save to file + # saveFile($message, "article_$objectID.gph"); + #} else { + # # the call was unsuccessful. We're not trying again here. + # # The call will be repeated on the next scraper run. Returning 1 here + # # means the link to this file will not be added to the front page. + # return 1; + #} + # + # *** </this part has been replaced with readability> *** # no complaints, add the link to this article. return 0; t@@ -449,9 +476,8 @@ sub saveFile { print FH $content; close(FH); - # rename to temporary file to real file (atomic) + # rename temporary file to real file (atomic) rename("$path/.$filename", "$path/$filename") || die "Cannot rename temporary file: $filename\n"; - #print "Debug: saveFile(\$content, $filename);\n\n"; return 0; } t@@ -518,8 +544,8 @@ for my $hit ($topStoryList->{"hits"}) { # is the article dumper active? 
if($dumper == 1) { - if(dumpArticle($url, $objectID) eq 0) { - $link .= "[1| plaintext version|$go_path/article_$objectID.gph|server|port]\n"; + if(dumpArticle($url, $objectID, $title) eq 0) { + $link .= "[1| text version|$go_path/article_$objectID.gph|server|port]\n"; } }