hacker news on gopher
git clone git://git.codevoid.de/hn-gopher

--- thn-scraper.pl (20744B) ---

#!/usr/bin/env perl

# default
use strict;
use warnings;

# parallel processing
use Parallel::ForkManager;

# date formatting
use DateTime;
use DateTime::Format::Duration;

# network
use LWP::UserAgent;

# protocol transformation
use JSON;
use Encode;

# text formatting
use HTML::FormatText::WithLinks;
use HTML::LinkExtractor;
use HTML::Restrict;
use HTML::Entities;
use Text::Wrap;
$Text::Wrap::columns=72;

### CONFIGURATION
my $protocol    = "https";
my $server      = "hn.algolia.com";
my $api_uri     = "/api/v1";
my $go_root     = "/srv/codevoid-gopher";
my $go_path     = "/hn";
my $index_count = 20;   # item count per page
my $total_count = 400;  # total item count (all pages)
my $dumper      = 1;    # 1 creates plain text versions

### CAN HAZ LOGO? SURE!
my $logo =" _______               __                     _______\n";
$logo .="|   |   |.---.-..----.|  |--..-----..----.   |    |  |.-----..--.--.--..-----.\n";
$logo .="|       ||  _  ||  __||    < |  -__||   _|   |       ||  -__||  |  |  ||__ --|\n";
$logo .="|___|___||___._||____||__|__||_____||__|     |__|____||_____||________||_____|\n";
$logo .="                                             on Gopher (unofficial)\n";
$logo .= "[h|Visit Hacker News on the Web|URL:https://news.ycombinator.com|server|port]\n\n";

### FUNCTIONS

# SUB: $json = getTopStories();
# Read all top stories supplied by the Firebase API. This API only returns
# the IDs of the stories that are currently on the front page, in ranking
# order.
sub getTopStories {
    # FIXME make this configurable, maybe.
    # yes, this is duplicate code to getApiData()
    my $REST= ({HOST => "hacker-news.firebaseio.com",
                URL  => "https://hacker-news.firebaseio.com/v0/topstories.json" });
    $REST->{UA} = LWP::UserAgent->new(keep_alive => 0, timeout => 30);
    $REST->{UA}->agent("codevoid-hackernews-gopherproxy/0.1");
    $REST->{resource} = $REST->{URL};
    $REST->{request}  = HTTP::Request->new( GET => $REST->{resource} );
    $REST->{response} = $REST->{UA}->request( $REST->{request} );

    # we're not giving up
    if(not $REST->{response}->is_success()) {
        sleep 5;
        return getTopStories();
    }

    return decode_json($REST->{response}->content);
}


# SUB: $json = getApiData("/api/...");
# This call returns stories and comments. The nice thing about it is that
# it can return all comments of a story in a single call.
# OPTIMIZE: right now, the story and its comments are fetched separately.
# This could be combined into one call.
sub getApiData {
    my ( $uri ) = @_;

    my $REST= ({HOST => "$server",
                URL  => "$protocol://$server$uri" });

    $REST->{UA} = LWP::UserAgent->new(keep_alive => 0, timeout => 30);
    $REST->{UA}->agent("codevoid-hackernews-gopherproxy/0.1");
    $REST->{resource} = $REST->{URL};
    $REST->{request}  = HTTP::Request->new( GET => $REST->{resource} );
    $REST->{response} = $REST->{UA}->request( $REST->{request} );

    # we're not giving up
    if(not $REST->{response}->is_success()) {
        sleep 2;
        return getApiData ( $uri );
    }

    return decode_json($REST->{response}->content);
}
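
# Sketch: the rough shape of the data the two calls above return, as far as
# the field usage further down suggests. The IDs and values here are made up.
#
#   my $ids  = getTopStories();   # arrayref of story IDs: [ 16582136, 16581120, ... ]
#   my $data = getApiData("$api_uri/search?tags=story_" . $ids->[0]);
#   # $data->{hits} = [ {
#   #     objectID     => "16582136",
#   #     title        => "Example story title",
#   #     url          => "https://example.com/article",
#   #     author       => "someuser",
#   #     points       => 123,
#   #     num_comments => 45,
#   #     created_at   => "2018-04-23T23:45:00.000Z",
#   # }, ... ];
#   # comment hits additionally carry parent_id and comment_text.
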
# SUB: $gph = scrapeSubComments($payload, $parentID, $lvl)
# Recursive comment scraper.
# This sub searches the payload for a comment whose parent ID matches the
# incoming $parentID and adds it to $output. Then it calls itself again with
# the ID of the found comment and an increased indent level.
#
# That call in turn searches for comments with the incoming ID as parent ID
# and adds the first hit to $output. Then it calls itself with that ID as
# parent ID again...
#
# If no more comments are found for the supplied ID, it decreases the
# indent level and returns to the previous invocation.
sub scrapeSubComments {
    my ( $payload, $parentID, $lvl ) = @_;

    # search for comment
    my $output = "";
    for my $hit ($payload->{"hits"}) {
        foreach my $comment (@$hit) {

            # comment is found, add to output
            if ($comment->{'parent_id'} == $parentID) {

                # format data
                my $text   = encode("UTF-8", $comment->{'comment_text'});
                my $author = encode("UTF-8", $comment->{'author'});
                my $ago    = parseDate($comment->{'created_at'});

                # add to output
                $output .= formatContent("$author wrote $ago:", $lvl);
                $output .= formatContent("$text", $lvl)."\n";

                # invoke itself with the objectID and traverse down the hierarchy
                $output .= scrapeSubComments( $payload, $comment->{'objectID'}, ++$lvl );

                # decrease indentation level
                $lvl--;
            }
        }
    }
    return $output;
}

# SUB: $datestr = parseDate($datestring)
# Takes something like 2018-04-23T23:45:00.000Z and converts it to a relative,
# human-readable notation like "4 days ago".
# OPTIMIZE: the Duration API can be used with a parse pattern; this should
# be used. It's probably simpler and faster.
sub parseDate {
    my ( $datestring ) = @_;

    # set output (parse) pattern
    my $p = DateTime::Format::Duration->new(
        pattern   => '%Y|%m|%e|%H|%M',
        normalize => 1
    );

    # FIXME: DateTime::Duration can do the parsing
    # parse string and create datetime object
    $datestring =~ /(....)-(..)-(..)T(..):(..).*/;
    my $dt = DateTime->new(
        year       => $1,
        month      => $2,
        day        => $3,
        hour       => $4,
        minute     => $5,
        second     => 0,
        nanosecond => 0,
        time_zone  => 'UTC'
    );

    # calculate difference
    my $dt_now  = DateTime->now;
    my $dt_diff = $dt_now - $dt;

    # parse result
    my $o = $p->format_duration($dt_diff);

    # parse output (FIXME: this is *so* ugly)
    my $dtstr = "";
    $o =~ /(\d+)\|(\d+)\|(\d+)\|(\d+)\|(\d+)/;
    my $Y = int($1);
    my $m = int($2);
    my $d = int($3);
    my $H = int($4);
    my $M = int($5);
    if($M) {
        $dtstr = "$M min ago";
    }
    if($H) {
        if($H == 1) {
            $dtstr = "$H hour $M min ago";
        } else {
            $dtstr = "$H hours $M min ago";
        }
    }
    if($d) {
        if($d == 1) {
            $dtstr = "$d day ago";
        } else {
            $dtstr = "$d days ago";
        }
    }
    if($m) {
        if($m == 1) {
            if($d == 1) {
                $dtstr = "$m month $d day ago";
            } else {
                $dtstr = "$m month $d days ago";
            }
        } else {
            if($d == 1) {
                $dtstr = "$m months $d day ago";
            } else {
                $dtstr = "$m months $d days ago";
            }
        }
    }
    if($Y) {
        $dtstr = "on $Y-$m-$d ($H:$M)";
    }

    return $dtstr;
}
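
# A quick sketch of what the branching above produces (made-up inputs,
# assuming "now" is 2018-04-28 10:00 UTC):
#   parseDate("2018-04-28T09:12:00.000Z");  # -> "48 min ago"
#   parseDate("2018-04-28T03:00:00.000Z");  # -> "7 hours 0 min ago"
#   parseDate("2018-04-23T23:45:00.000Z");  # -> "4 days ago"
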
# SUB: scrapeComments($objectID, $number, $link)
# This sets up the comment page frame. The content is added by hierarchical
# scrapeSubComments() calls.
sub scrapeComments {
    my ( $objectID, $number, $link ) = @_;

    # set header
    my $content = "$logo\nCOMMENT PAGE FOR:\n$link\n\n";

    # the comment count. If this is zero, this call can be skipped.
    if($number) {
        # call the API to receive all comments. The previous call already
        # provided the comment count, so all of them fit on one result page.
        my $payload = getApiData("$api_uri/search?tags="."comment,story_$objectID&hitsPerPage=$number");

        # invoke the hierarchical scraper and hand over the payload
        # (only working in memory from here)
        $content .= scrapeSubComments($payload, $objectID, 0);
    } else {
        # previous call indicated 0 comments.
        $content .= "No comments available\n";
    }

    # all comments have been added to the page. Add footer and save file.
    $content .= "\n[1|<- back to front page|$go_path|server|port]";
    saveFile($content, "comments_$objectID.gph");
}

# SUB: $url = isHtml($url)
# This sub checks a given URL by performing a HEAD request. In case the URL
# points to a resource of type text/html, it returns the final URL (after
# redirects). Otherwise 0.
sub isHtml {
    my ( $url ) = @_;

    # perform HEAD request
    my $ua = LWP::UserAgent->new(keep_alive => 0, timeout => 30);
    $ua->agent("codevoid-hackernews-gopherproxy/0.1");
    my $req = HTTP::Request->new(HEAD => $url);
    $req->header('Accept' => 'text/html');
    my $resp = $ua->request($req);

    # check content type
    if ($resp->is_success && ($resp->content_type =~ m/text\/html/)) {
        return $resp->request()->uri();
    }

    return 0;
}

# SUB: checkBlacklist($url)
sub checkBlacklist {
    my ( $url ) = @_;
    my @list = ( "youtube\.com",
                 "blog\.longnow",
                 "twitter\.com",
                 "phys\.org",
                 "vimeo\.com",
                 "\\/github\.com",
                 "facebook\.com",
                 "laptopmag\.com",
                 "github\.com",
                 "bloomberg\.com",
                 "apple\.com",
                 "mjg59\.dreamwidth\.org",
                 "scmp\.com",
                 "slate\.com",
                 "nature\.com",
                 "forbes\.com",
                 "www\.nature\.com",
                 "tandfonline\.com",
                 "usebookman\.com",
                 "reddit\.com"
               );
    foreach my $item (@list) {
        if( $url =~ m/.*${item}/ ) {
            print "Blacklisted: $url\n";
            return 1;
        }
    }
    return 0;
}
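
# Side note on the Algolia tag filters used here (as far as the queries in
# this script suggest): comma-separated tags are ANDed, a parenthesised group
# is ORed. "comment,story_<objectID>" therefore selects the comments that
# belong to one story, while the front-page query further down uses
# "story,(story_A,story_B,...)" to fetch many stories in a single request.
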
# SUB: dumpArticle($url, $objectID, $title)
# This sub downloads web pages and converts them into a plain text format
# that can be served on gopher. Once an article has been converted, it is
# not downloaded again.
# OPTIMIZE: For some pages this works great, for others it doesn't. Some
# custom preprocessing steps could be added to strip out navigation, footer,
# excessive ads and other non-relevant data. This could be done on a per
# domain basis. (this could be a separate program which could be reused in
# other projects)
sub dumpArticle {
    my ( $url, $objectID, $title ) = @_;

    if(checkBlacklist( $url )) {
        return 1;
    };

    # is it cached? return.
    if (-e "$go_root$go_path/article_$objectID.gph") {
        return 0;
    }

    # content type check
    $url = isHtml($url);
    if(!$url) {
        # the supplied URL is not html, don't add it to the front page.
        return 1;
    }

    my $msg  = decode("UTF-8", "$title\n");
    $msg .= decode("UTF-8", "-------------------------------------------------------------------------\n\n");

    # let readability do the work...
    $msg .= decode("UTF-8", `/usr/local/bin/readability -i "$url" 2>/dev/null`);

    # error handling
    if($? != 0) {
        print "Scraping failed: $url\n";
        return 1;
    }

    # call successful - convert it to text
    my $f = HTML::FormatText::WithLinks->new(anchor_links => 0, unique_links => 1, base => "$url");
    $msg = $f->parse($msg);

    # plausibility check. too small?
    if(length($msg) <= 500) {
        print "Text <= 500: $url\n";
        return 1;
    }

    $msg .= "\n\n\nSource:\n[h|$url|URL:$url|server|port]";

    # shrink multiple newlines
    $msg =~ s/\n\n(\n)*/\n\n/g;
    $msg =~ s/\t/ /g;
    $msg =~ s/\nt/\ntt/g;

    # save to file
    $msg = encode("UTF-8", $msg);
    saveFile($msg, "article_$objectID.gph");

    # *** <this part has been replaced with readability> ***

    ## we got html, let's download it
    #my $ua = LWP::UserAgent->new;
    #my $req = HTTP::Request->new(GET => $url);
    #my $resp = $ua->request($req);

    #if ($resp->is_success) {
    #
    #    # OPTIMIZE: this would be the place to modify the HTML
    #    # in $resp->decoded_content
    #    print "Scraping: $url\n";
    #    my $message = "Source: $url\n\n";

    #    # call successful - convert it to text
    #    my $f = HTML::FormatText::WithLinks->new(anchor_links => 0, unique_links => 1, base => "$url");
    #    $message = $f->parse($resp->decoded_content);

    #    # wrap it to 72 characters (will destroy link lists)
    #    #$Text::Wrap::columns=72;
    #    #$message = wrap("","",$message);

    #    # shrink multiple newlines
    #    $message =~ s/\n\n(\n)*/\n\n/g;
    #    $message =~ s/\t/ /g;
    #    $message =~ s/\nt/\ntt/g;

    #    # save to file
    #    saveFile($message, "article_$objectID.gph");
    #} else {
    #    # the call was unsuccessful. We're not trying again here; the call
    #    # will be repeated on the next scraper run. Returning 1 here means
    #    # the link to this file will not be added on the front page.
    #    return 1;
    #}
    #
    # *** </this part has been replaced with readability> ***

    # no complaints, add the link to this article.
    return 0;
}
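
# For orientation, the files this script drops into $go_root$go_path (names
# taken from the saveFile() calls in this script; the ID is an example):
#   article_16582136.gph        - plain text dump of the linked article
#   comments_16582136.gph       - comment page for one story
#   story_16582136.gph          - front page snippet for one story
#   index.gph, index-2.gph, ... - the paginated front page
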
### SUB: formatContent($content, $lvl)
# This is the comment page formatter. It takes text and an indentation
# level and puts it nicely on a page, with a level bar on the left.
sub formatContent {
    my ( $content, $lvl ) = @_;

    # decode html notations
    $content = decode_entities($content);

    # remove trailing space before wrapping
    $content =~ s/ $/\n/g;

    # handle crazy indent levels that would leave no
    # room for text on the right side
    my $pad="";
    if($lvl > 20) {
        $pad = "$lvl> ";
        $lvl = 19;
    }

    # Set up the text wrapper to wrap at 72 minus the indent level;
    # each level adds/removes two spaces.
    $Text::Wrap::columns=72-($lvl*2);

    # Calculate spaces to add on the left side
    # based on the reply/indent level.
    while($lvl > 0) {
        $pad="  ".$pad;
        $lvl--;
    }

    # Search for links in comments
    my $LX = HTML::LinkExtractor->new();
    $LX->strip(1);
    $LX->parse(\$content);

    # Replace some HTML elements
    my $HR = HTML::Restrict->new();
    $content =~ s/<p>/\n\n/g;
    $content =~ s/<li>/\n\n\* /g;

    # strip remaining HTML tags
    my $content_clean = $HR->process($content);

    # nobody needs more than one blank line.
    $content_clean =~ s/\n\n(\n)*/\n\n/g;

    # Loop over links, match their text, add a [counter] and generate output.
    my $c = 0;
    my $links = "";
    foreach my $link ($LX->links) {
        foreach my $linkitem (@$link) {

            # skip empty links (image links for example)
            if(!$linkitem->{_TEXT}) { next; }

            # link found, increase counter
            $c++;

            # replace link text with [$counter]
            $content_clean =~ s/(\Q$linkitem->{_TEXT}\E)/ \[$c\] /g;

            # make sure there are no newlines/extra spaces around [$c]
            $content_clean =~ s/[\s\n]+\[$c\][\s\n]+/ \[$c\] /g;

            # fix the [1] [1] situation (FIXME: how to do this properly?)
            $content_clean =~ s/\[1\][\.:\s\n]+\[1\]/\[1\]/g;
            $content_clean =~ s/\[2\][\.:\s\n]+\[2\]/\[2\]/g;
            $content_clean =~ s/\[3\][\.:\s\n]+\[3\]/\[3\]/g;
            $content_clean =~ s/\[4\][\.:\s\n]+\[4\]/\[4\]/g;
            $content_clean =~ s/\[5\][\.:\s\n]+\[5\]/\[5\]/g;
            $content_clean =~ s/ \[\d\] $//g;

            # shorten links that are too long for the indent level
            my $short = $linkitem->{href};
            my $l = 62 - length($pad);
            if(length($short) > $l) { $short = substr($short,0,$l)."..."; }

            # add link to output scalar
            $links .= sprintf("[h|${pad}║ [%i]: %s|URL:%s|codevoid.de|70]\n", $c, $short, $linkitem->{href});
        }
    }

    # Wrap content at 72 minus padding
    $content_clean = wrap("","",$content_clean);

    # shrink multiple newlines
    $content_clean =~ s/\n\n(\n)*/\n\n/g;

    # Add padding to the left
    $content_clean =~ s/^/$pad║ /g;
    $content_clean =~ s/\n/\n$pad║ /g;

    # print links if any...
    if($links) {
        $content_clean .= "\n$pad║ \n$links";
    } else {
        $content_clean .= "\n";
    }

    # fix gopher issues (geomyidae design)
    $content_clean =~ s/\t/ /g;
    $content_clean =~ s/\nt/\ntt/g;

    return $content_clean;
}

### SUB: saveFile($content, $filename)
sub saveFile {
    my ( $content, $filename ) = @_;
    my $path = "$go_root$go_path";

    # save temporary file
    open (FH, ">> $path/.$filename") || die "Cannot open temporary file: $filename\n";
    print FH $content;
    close(FH);

    # rename temporary file to real file (atomic)
    rename("$path/.$filename", "$path/$filename") || die "Cannot rename temporary file: $filename\n";
    return 0;
}
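
# The bracketed lines built above and throughout this script are geomyidae's
# .gph link syntax (my reading of it, based on how it is used here):
#   [1|display text|/hn/comments_123.gph|server|port]      - gopher menu link
#   [h|display text|URL:https://example.com/|server|port]  - HTTP link
# "server" and "port" are placeholders the gopher daemon fills in. Tabs and a
# leading "t" are treated specially by it, which is presumably why tabs are
# replaced and a leading "t" is doubled before the pages are saved.
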
### MAIN PROGRAM
my ($selected_story) = @ARGV;
my $content = "";

# fetch top story IDs
my $json_top = getTopStories();

# construct search query
my $query = "search?hitsPerPage=500&tags=story,(";

# add stories to search query
my $count = 0;
for my $id (@$json_top) {
    $query .="story_$id,";
    $count++;
    if($count >= $total_count) {
        last;
    }
}

# remove trailing comma and close query
$query =~ s/,$/\)/g;

# fetch the story data for these IDs from the Algolia API
my $topStoryList = getApiData("$api_uri/$query");

# set up background tasks for parallel scraping
my $pm = Parallel::ForkManager->new(50);

# scrape story header and comments
for my $hit ($topStoryList->{"hits"}) {
    foreach my $story (@$hit) {

        # do everything from here on in background
        $pm->start and next;

        # convenience variables
        my $objectID = $story->{'objectID'};
        my $author   = encode("UTF-8", $story->{'author'});
        my $title    = encode("UTF-8", $story->{'title'});
        my $url      = encode("UTF-8", $story->{'url'});

        # comment count (defaults to 0)
        my $number = 0;
        if($story->{'num_comments'}) {
            $number = $story->{'num_comments'};
        }

        # parse date and convert to relative notation (5 min ago)
        my $ago = parseDate($story->{'created_at'});

        # title is a link, escape "|"
        $title =~ s/\|/\\|/g;

        # URL is either an HTML link line or a gopher dir
        my $link;
        if($url) {
            # link goes to HTTP
            $link = "[h| $title|URL:$url|server|port]\n";

            # is the article dumper active?
            if($dumper == 1) {
                if(dumpArticle($url, $objectID, $title) == 0) {
                    $link .= "[1| text version|$go_path/article_$objectID.gph|server|port]\n";
                }
            }

        } else {
            # link goes to GOPHER (redefine URL to the comment page, e.g. Ask HN)
            $url  = "$go_path/comments_$story->{'objectID'}.gph";
            $link = "[1| $title|$url|server|port]\n";
        }

        # add title link line
        $content .= $link;

        # add author line
        $content .= " by $author ($story->{'points'} points) $ago\n";

        # add comment link line
        $content .= "[1| read $number comments|$go_path/comments_$objectID.gph|server|port]\n";

        # aaaand one blank
        $content .= "\n";

        # Save story file
        saveFile($content, "story_$objectID.gph");

        # Fire up the comment scraper
        scrapeComments($story->{'objectID'}, $number, $link);

        # background task stops here
        $pm->finish;
    }
}

# wait for all scraping to be done and all cache files to be present
$pm->wait_all_children;

# construct index
$count = 0;

# set up pagination variables
my $page = 1;
my $nextpage;
my $prevpage;
my $filename;

# initialize output variable
my $index_out = "$logo";

# loop over all top stories (to keep the sequence)
for my $id (@$json_top) {

    # append the story files
    if (-e "$go_root$go_path/story_$id.gph") {
        open(my $fh, '<', "$go_root$go_path/story_$id.gph");
        while (my $row = <$fh>) { $index_out .= $row; }
        close($fh);
    }

    # increase story counter
    $count++;

    # Pagination
    if(($count % $index_count) == 0) {

        # setup defaults
        $filename = "index-$page.gph";
        $nextpage = $page + 1;
        $prevpage = $page - 1;

        # special handling for first page (different name)
        if($page == 1) {
            $filename = "index.gph";
            $index_out .= "[1| Next Page ($nextpage) >>|$go_path/index-$nextpage.gph|server|port]\n\n";
            $index_out .= "[1|<< Back Home|/|server|port]";
        } else {
            $index_out .= "[1| Next Page ($nextpage) >>|$go_path/index-$nextpage.gph|server|port]";
        }

        # increase page counter
        $page++;

        # done, save file, proceed with next page
        saveFile($index_out, $filename);

        # initialize $index_out for next run
        $index_out = "$logo";

    } else {

        # handle last page
        if ( $count >= $total_count ) {
            $index_out .= "[1| << Prev Page ($prevpage) |$go_path/index-$prevpage.gph|server|port]";
            saveFile($index_out, $filename);
            last;
        }
    }
}

exit 0;
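
# Deployment sketch (an assumption, not documented in this file): run the
# script periodically and let geomyidae serve $go_root, e.g. from cron:
#
#   */15 * * * *  perl /path/to/thn-scraper.pl
#
# Each run rebuilds index*.gph and the per-story files; articles that were
# already dumped are skipped thanks to the cache check in dumpArticle().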