t* hacker news on gopher
       
   URI git clone git://git.codevoid.de/hn-gopher
   DIR Log
   DIR Files
   DIR Refs
       ---
   DIR commit 2ffdbea646f3d5028a12be096dc3e881a363a335
   DIR parent a0d7801aaae62f384ff7cbf5a628dfe12b211341
   URI Author: Stefan Hagen <sh+git[at]codevoid[dot]de>
       Date:   Tue, 31 Jul 2018 23:58:48 +0200
       
       Add blacklist, plausibility check, error handling
       
       Diffstat:
         M hn-scraper.pl                       |      58 +++++++++++++++++++++++++------
       
       1 file changed, 48 insertions(+), 10 deletions(-)
       ---
   DIR diff --git a/hn-scraper.pl b/hn-scraper.pl
       t@@ -270,6 +270,30 @@ sub isHtml {
        
            return 0;
        }
        +# SUB: checkBlacklist($url) - return 1 if $url matches the blacklist, 0 otherwise
       +sub checkBlacklist {
       +    my ( $url ) = @_;
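        +    # single-quoted patterns keep \. as a literal-dot escape in the regex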
       +    my @list = ( "youtube\.com",
       +                 "blog\.longnow",
       +                 "twitter\.com",
       +                 "phys\.org",
       +                 "vimeo\.com",
       +                 "github\.com",
       +                 "facebook\.com",
       +                 "laptopmag\.com",
       +                 "github\.com",
       +                 "apple\.com",
       +                 "mjg59\.dreamwidth\.org",
       +                 "scmp\.com"
       +             );
        +    foreach my $item (@list) {
        +        if ($url =~ m/$item/) {
        +            print "Blacklisted: $url\n";
        +            return 1;
        +        }
        +    }
        +    return 0;
        +}
        
        # SUB: dumpArticle($url, $objectID)
         # This sub downloads webpages and converts them into a plain text format that
       t@@ -282,6 +306,10 @@ sub isHtml {
        sub dumpArticle {
            my ( $url, $objectID, $title ) = @_;
        
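        +    # never waste time on blacklisted domains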
        +    if (checkBlacklist($url)) {
        +        return 1;
        +    }
            # is it cached? return.
            if (-e "$go_root$go_path/article_$objectID.gph") {
                return 0;
       t@@ -294,10 +322,13 @@ sub dumpArticle {
                return 1;   
            }
        
       -    print "Scraping: $url\n";
       -    my $message = decode("UTF-8", "$title\n");
       -    $message   .= "-------------------------------------------------------------------------\n";
       -    $message   .= decode("UTF-8", `/usr/local/bin/readability -i "$url"`);
       +    my $msg  = decode("UTF-8", "$title\n");
       +       $msg .= decode("UTF-8", "-------------------------------------------------------------------------\n\n");
       +
       +    # let readability do the work...
       +    $msg .= decode("UTF-8", `/usr/local/bin/readability -i "$url" 2>/dev/null`);
       +
        +    # error handling: $? holds the exit status of the readability call
            if($? ne 0) {
              print "Scraping failed: $url\n";
              return 1;    
       t@@ -305,17 +336,24 @@ sub dumpArticle {
        
            # call successful - convert it to text
            my $f = HTML::FormatText::WithLinks->new(anchor_links => 0, unique_links => 1, base => "$url");
       -    $message = $f->parse($message);
       +    $msg = $f->parse($msg);
       +
        +    # plausibility check - is the result too short to be a real article?
       +    if(length($msg) < 500) {
       +        print "Text < 500: $url\n";
       +        return 1;
       +    }
        
       -    $message   .= "\n\n\nSource:\n[h|$url|URL:$url|server|port]\n\n";
       +    $msg.= "\n\n\nSource:\n[h|$url|URL:$url|server|port]";
        
            # shrink multiple newlines
       -    $message =~ s/\n\n(\n)*/\n\n/g;
       -    $message =~ s/\t/    /g;
       -    $message =~ s/\nt/\ntt/g;
       +    $msg =~ s/\n\n(\n)*/\n\n/g;
       +    $msg =~ s/\t/    /g;
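        +    # gph escaping: a literal leading "t" must be doubled, else the server strips it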
       +    $msg =~ s/\nt/\ntt/g;
        
            # save to file
       -    saveFile($message, "article_$objectID.gph");
        +    $msg = encode("UTF-8", $msg);
       +    saveFile($msg, "article_$objectID.gph");
        
            # *** <this part has been replaced with readibility> ***