t* hacker news on gopher URI git clone git://git.codevoid.de/hn-gopher DIR Log DIR Files DIR Refs --- DIR commit 2ffdbea646f3d5028a12be096dc3e881a363a335 DIR parent a0d7801aaae62f384ff7cbf5a628dfe12b211341 URI Author: Stefan Hagen <sh+git[at]codevoid[dot]de> Date: Tue, 31 Jul 2018 23:58:48 +0200 Add blacklist, plausibility check, error handling Diffstat: M hn-scraper.pl | 58 +++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 10 deletions(-) --- DIR diff --git a/hn-scraper.pl b/hn-scraper.pl t@@ -270,6 +270,30 @@ sub isHtml { return 0; } +# SUB: checkBlacklist($url) +sub checkBlacklist { + my ( $url ) = @_; + my @list = ( "youtube\.com", + "blog\.longnow", + "twitter\.com", + "phys\.org", + "vimeo\.com", + "github\.com", + "facebook\.com", + "laptopmag\.com", + "github\.com", + "apple\.com", + "mjg59\.dreamwidth\.org", + "scmp\.com" + ); + foreach my $item (@list) { + if( $url =~ m/.*${item}/ ) { + print "Blacklisted: $url\n"; + return 1; + } + + } +} # SUB: dumpArticle($url, $objectID) # This sub downloads webpages and convert them into a plain text format than t@@ -282,6 +306,10 @@ sub isHtml { sub dumpArticle { my ( $url, $objectID, $title ) = @_; + if(checkBlacklist( $url ) eq 1) { + return 1; + }; + # is it cached? return. if (-e "$go_root$go_path/article_$objectID.gph") { return 0; t@@ -294,10 +322,13 @@ sub dumpArticle { return 1; } - print "Scraping: $url\n"; - my $message = decode("UTF-8", "$title\n"); - $message .= "-------------------------------------------------------------------------\n"; - $message .= decode("UTF-8", `/usr/local/bin/readability -i "$url"`); + my $msg = decode("UTF-8", "$title\n"); + $msg .= decode("UTF-8", "-------------------------------------------------------------------------\n\n"); + + # let readability do the work... + $msg .= decode("UTF-8", `/usr/local/bin/readability -i "$url" 2>/dev/null`); + + # error handling if($? ne 0) { print "Scraping failed: $url\n"; return 1; t@@ -305,17 +336,24 @@ sub dumpArticle { # call successful - convert it to text my $f = HTML::FormatText::WithLinks->new(anchor_links => 0, unique_links => 1, base => "$url"); - $message = $f->parse($message); + $msg = $f->parse($msg); + + # plausibility check. too small? + if(length($msg) < 500) { + print "Text < 500: $url\n"; + return 1; + } - $message .= "\n\n\nSource:\n[h|$url|URL:$url|server|port]\n\n"; + $msg.= "\n\n\nSource:\n[h|$url|URL:$url|server|port]"; # shrink multiple newlines - $message =~ s/\n\n(\n)*/\n\n/g; - $message =~ s/\t/ /g; - $message =~ s/\nt/\ntt/g; + $msg =~ s/\n\n(\n)*/\n\n/g; + $msg =~ s/\t/ /g; + $msg =~ s/\nt/\ntt/g; # save to file - saveFile($message, "article_$objectID.gph"); + $msg= encode("UTF-8", $msg); + saveFile($msg, "article_$objectID.gph"); # *** <this part has been replaced with readibility> ***