it* hacker news on gopher Err codevoid.de 70 i Err codevoid.de 70 hgit clone git://git.codevoid.de/hn-gopher URL:git://git.codevoid.de/hn-gopher codevoid.de 70 1Log /git/hn-gopher/log.gph codevoid.de 70 1Files /git/hn-gopher/files.gph codevoid.de 70 1Refs /git/hn-gopher/refs.gph codevoid.de 70 i--- Err codevoid.de 70 1commit 2ffdbea646f3d5028a12be096dc3e881a363a335 /git/hn-gopher/commit/2ffdbea646f3d5028a12be096dc3e881a363a335.gph codevoid.de 70 1parent a0d7801aaae62f384ff7cbf5a628dfe12b211341 /git/hn-gopher/commit/a0d7801aaae62f384ff7cbf5a628dfe12b211341.gph codevoid.de 70 hAuthor: Stefan Hagen URL:mailto:sh+git[at]codevoid[dot]de codevoid.de 70 iDate: Tue, 31 Jul 2018 23:58:48 +0200 Err codevoid.de 70 i Err codevoid.de 70 iAdd blacklist, plausibility check, error handling Err codevoid.de 70 i Err codevoid.de 70 iDiffstat: Err codevoid.de 70 i M hn-scraper.pl | 58 +++++++++++++++++++++++++------ Err codevoid.de 70 i Err codevoid.de 70 i1 file changed, 48 insertions(+), 10 deletions(-) Err codevoid.de 70 i--- Err codevoid.de 70 1diff --git a/hn-scraper.pl b/hn-scraper.pl /git/hn-gopher/file/hn-scraper.pl.gph codevoid.de 70 it@@ -270,6 +270,30 @@ sub isHtml { Err codevoid.de 70 i Err codevoid.de 70 i return 0; Err codevoid.de 70 i } Err codevoid.de 70 i+# SUB: checkBlacklist($url) Err codevoid.de 70 i+sub checkBlacklist { Err codevoid.de 70 i+ my ( $url ) = @_; Err codevoid.de 70 i+ my @list = ( "youtube\.com", Err codevoid.de 70 i+ "blog\.longnow", Err codevoid.de 70 i+ "twitter\.com", Err codevoid.de 70 i+ "phys\.org", Err codevoid.de 70 i+ "vimeo\.com", Err codevoid.de 70 i+ "github\.com", Err codevoid.de 70 i+ "facebook\.com", Err codevoid.de 70 i+ "laptopmag\.com", Err codevoid.de 70 i+ "github\.com", Err codevoid.de 70 i+ "apple\.com", Err codevoid.de 70 i+ "mjg59\.dreamwidth\.org", Err codevoid.de 70 i+ "scmp\.com" Err codevoid.de 70 i+ ); Err codevoid.de 70 i+ foreach my $item (@list) { Err codevoid.de 70 i+ if( $url =~ m/.*${item}/ ) { Err codevoid.de 70 i+ print "Blacklisted: $url\n"; Err codevoid.de 70 i+ return 1; Err codevoid.de 70 i+ } Err codevoid.de 70 i+ Err codevoid.de 70 i+ } Err codevoid.de 70 i+} Err codevoid.de 70 i Err codevoid.de 70 i # SUB: dumpArticle($url, $objectID) Err codevoid.de 70 i # This sub downloads webpages and convert them into a plain text format than Err codevoid.de 70 it@@ -282,6 +306,10 @@ sub isHtml { Err codevoid.de 70 i sub dumpArticle { Err codevoid.de 70 i my ( $url, $objectID, $title ) = @_; Err codevoid.de 70 i Err codevoid.de 70 i+ if(checkBlacklist( $url ) eq 1) { Err codevoid.de 70 i+ return 1; Err codevoid.de 70 i+ }; Err codevoid.de 70 i+ Err codevoid.de 70 i # is it cached? return. Err codevoid.de 70 i if (-e "$go_root$go_path/article_$objectID.gph") { Err codevoid.de 70 i return 0; Err codevoid.de 70 it@@ -294,10 +322,13 @@ sub dumpArticle { Err codevoid.de 70 i return 1; Err codevoid.de 70 i } Err codevoid.de 70 i Err codevoid.de 70 i- print "Scraping: $url\n"; Err codevoid.de 70 i- my $message = decode("UTF-8", "$title\n"); Err codevoid.de 70 i- $message .= "-------------------------------------------------------------------------\n"; Err codevoid.de 70 i- $message .= decode("UTF-8", `/usr/local/bin/readability -i "$url"`); Err codevoid.de 70 i+ my $msg = decode("UTF-8", "$title\n"); Err codevoid.de 70 i+ $msg .= decode("UTF-8", "-------------------------------------------------------------------------\n\n"); Err codevoid.de 70 i+ Err codevoid.de 70 i+ # let readability do the work... Err codevoid.de 70 i+ $msg .= decode("UTF-8", `/usr/local/bin/readability -i "$url" 2>/dev/null`); Err codevoid.de 70 i+ Err codevoid.de 70 i+ # error handling Err codevoid.de 70 i if($? ne 0) { Err codevoid.de 70 i print "Scraping failed: $url\n"; Err codevoid.de 70 i return 1; Err codevoid.de 70 it@@ -305,17 +336,24 @@ sub dumpArticle { Err codevoid.de 70 i Err codevoid.de 70 i # call successful - convert it to text Err codevoid.de 70 i my $f = HTML::FormatText::WithLinks->new(anchor_links => 0, unique_links => 1, base => "$url"); Err codevoid.de 70 i- $message = $f->parse($message); Err codevoid.de 70 i+ $msg = $f->parse($msg); Err codevoid.de 70 i+ Err codevoid.de 70 i+ # plausibility check. too small? Err codevoid.de 70 i+ if(length($msg) < 500) { Err codevoid.de 70 i+ print "Text < 500: $url\n"; Err codevoid.de 70 i+ return 1; Err codevoid.de 70 i+ } Err codevoid.de 70 i Err codevoid.de 70 i- $message .= "\n\n\nSource:\n[h|$url|URL:$url|server|port]\n\n"; Err codevoid.de 70 i+ $msg.= "\n\n\nSource:\n[h|$url|URL:$url|server|port]"; Err codevoid.de 70 i Err codevoid.de 70 i # shrink multiple newlines Err codevoid.de 70 i- $message =~ s/\n\n(\n)*/\n\n/g; Err codevoid.de 70 i- $message =~ s/\t/ /g; Err codevoid.de 70 i- $message =~ s/\nt/\ntt/g; Err codevoid.de 70 i+ $msg =~ s/\n\n(\n)*/\n\n/g; Err codevoid.de 70 i+ $msg =~ s/\t/ /g; Err codevoid.de 70 i+ $msg =~ s/\nt/\ntt/g; Err codevoid.de 70 i Err codevoid.de 70 i # save to file Err codevoid.de 70 i- saveFile($message, "article_$objectID.gph"); Err codevoid.de 70 i+ $msg= encode("UTF-8", $msg); Err codevoid.de 70 i+ saveFile($msg, "article_$objectID.gph"); Err codevoid.de 70 i Err codevoid.de 70 i # *** *** Err codevoid.de 70 i Err codevoid.de 70 .