t* hacker news on gopher
       
   URI git clone git://git.codevoid.de/hn-gopher
   DIR Log
   DIR Files
   DIR Refs
       ---
   DIR commit a0d7801aaae62f384ff7cbf5a628dfe12b211341
   DIR parent 58874a778c1585b20a7dbd4696706ad13f248bee
   URI Author: Stefan Hagen <sh+git[at]codevoid[dot]de>
       Date:   Tue, 31 Jul 2018 23:13:12 +0200
       
       Add text variants using readability (ruby variant)
       
       Diffstat:
         M hn-scraper.pl                       |      98 +++++++++++++++++++------------
       
       1 file changed, 62 insertions(+), 36 deletions(-)
       ---
   DIR diff --git a/hn-scraper.pl b/hn-scraper.pl
       t@@ -35,7 +35,7 @@ my $go_root     = "/srv/codevoid-gopher";
        my $go_path     = "/hn";
        my $index_count = 20;   # item count per page
        my $total_count = 400;  # total item count (all pages)
       -my $dumper      = 0;    # 1 creates plain text versions
       +my $dumper      = 1;    # 1 creates plain text versions
        
        ### CAN HAZ LOGO? SURE!
        my $logo  =" _______               __                   _______\n";
       t@@ -280,7 +280,7 @@ sub isHtml {
        # ads and other non-relevant data. This could be done on a per domain basis.
        # (this could be a separate program which could be reused in other projects)
        sub dumpArticle {
       -    my ( $url, $objectID ) = @_;
       +    my ( $url, $objectID, $title ) = @_;
        
            # is it cached? return.
            if (-e "$go_root$go_path/article_$objectID.gph") {
       t@@ -290,43 +290,70 @@ sub dumpArticle {
            # content type check
            $url = isHtml($url);
            if($url == 0) {
       -        print "Skipping (not html)\n";
       -
                # the supplied URL is not html, don't add it to the front page.
                return 1;   
            }
        
       -    # we got html, let's download it
       -    my $ua  = LWP::UserAgent->new;
       -    my $req = HTTP::Request->new(GET => $url);
       -    my $resp = $ua->request($req);
       -
       -    if ($resp->is_success) {
       -        
       -        # OPTIMIZE: this would be the place to modify the HTML
       -        # in $resp->decoded_content
       -
       -        # call successful - convert it to text
       -        my $f = HTML::FormatText::WithLinks->new(anchor_links => 0, unique_links => 1, base => "$url");
       -        my $message = $f->parse($resp->decoded_content);
       +    print "Scraping: $url\n";
       +    my $message = decode("UTF-8", "$title\n");
       +    $message   .= "-------------------------------------------------------------------------\n";
       +    $message   .= decode("UTF-8", `/usr/local/bin/readability -i "$url"`);
       +    if($? ne 0) {
       +      print "Scraping failed: $url\n";
       +      return 1;    
       +    }
        
       -        # wrap it to 72 characters (will destroy link lists)
       -        #$Text::Wrap::columns=72;
       -        #$message = wrap("","",$message);
       +    # call successful - convert it to text
       +    my $f = HTML::FormatText::WithLinks->new(anchor_links => 0, unique_links => 1, base => "$url");
       +    $message = $f->parse($message);
        
       -        # shrink multiple newlines
       -        $message =~ s/\n\n(\n)*/\n\n/g;
       -        $message =~ s/\t/    /g;
       -        $message =~ s/\nt/\ntt/g;
       +    $message   .= "\n\n\nSource:\n[h|$url|URL:$url|server|port]\n\n";
        
       -        # save to file
       -        saveFile($message, "article_$objectID.gph");
       -    } else {
       -        # the call was unsuccessful. We're not trying again here.
       -        # The call be repeated on the next scraper run. Returning 1 here
       -        # leads to the link to this file will not be added on the front page.
       -        return 1;
       -    }
       +    # shrink multiple newlines
       +    $message =~ s/\n\n(\n)*/\n\n/g;
       +    $message =~ s/\t/    /g;
       +    $message =~ s/\nt/\ntt/g;
       +
       +    # save to file
       +    saveFile($message, "article_$objectID.gph");
       +
        +    # *** <this part has been replaced with readability> ***
       +
       +    ## we got html, let's download it
       +    #my $ua  = LWP::UserAgent->new;
       +    #my $req = HTTP::Request->new(GET => $url);
       +    #my $resp = $ua->request($req);
       +
       +    #if ($resp->is_success) {
       +    #    
       +    #    # OPTIMIZE: this would be the place to modify the HTML
       +    #    # in $resp->decoded_content
       +    #    print "Scraping: $url\n";
       +    #    my $message = "Source: $url\n\n";
       +
       +    #    # call successful - convert it to text
       +    #    my $f = HTML::FormatText::WithLinks->new(anchor_links => 0, unique_links => 1, base => "$url");
       +    #    $message = $f->parse($resp->decoded_content);
       +
       +    #    # wrap it to 72 characters (will destroy link lists)
       +    #    #$Text::Wrap::columns=72;
       +    #    #$message = wrap("","",$message);
       +
       +    #    # shrink multiple newlines
       +    #    $message =~ s/\n\n(\n)*/\n\n/g;
       +    #    $message =~ s/\t/    /g;
       +    #    $message =~ s/\nt/\ntt/g;
       +
       +    #    # save to file
       +    #    saveFile($message, "article_$objectID.gph");
       +    #} else {
       +    #    # the call was unsuccessful. We're not trying again here.
        +    #    # The call can be repeated on the next scraper run. Returning 1 here
        +    #    # means the link to this file will not be added on the front page.
       +    #    return 1;
       +    #}
       +    #
        +    # *** </this part has been replaced with readability> ***
        
            # no complaints, add the link to this article.
            return 0;
       t@@ -449,9 +476,8 @@ sub saveFile {
              print FH $content;
            close(FH);
        
       -    # rename to temporary file to real file (atomic)
       +    # rename temporary file to real file (atomic)
            rename("$path/.$filename", "$path/$filename") || die "Cannot rename temporary file: $filename\n";
       -    #print "Debug: saveFile(\$content, $filename);\n\n";
            return 0;
        }
        
       t@@ -518,8 +544,8 @@ for my $hit ($topStoryList->{"hits"}) {
        
                    # is the article dumper active?
                    if($dumper == 1) {
       -                if(dumpArticle($url, $objectID) eq 0) {
       -                    $link .= "[1|  plaintext version|$go_path/article_$objectID.gph|server|port]\n";
       +                if(dumpArticle($url, $objectID, $title) eq 0) {
       +                    $link .= "[1|  text version|$go_path/article_$objectID.gph|server|port]\n";
                        }
                    }