1 #!/home/src/perl/repoperls/installed-perls/perl/pVNtS9N/perl-5.8.0@32642/bin/perl -d
9 cnntp-scrape - collect metadata of postings to nntp.perl.org
21 my @opt = <<'=back' =~ /B<--(\S+)>/g;
25 defaults to C<http://www.nntp.perl.org/group>.
29 defaults to C<perl.cpan.testers>.
33 defaults to something like ~/var/...
39 https://svn.develooper.com/projects/cnntp/ is the code base that
40 produces the pages. I found the link on
41 http://www.nntp.perl.org/about/
43 We want to download all metadata of all postings of the current month,
44 i.e. subject, from, id and write it to a permanent place. We are not
45 interested much in data from the month before, so we should parse out
46 which month we are in and use it as a label for the resulting DB file.
47 We are not interested in the page id because this changes every hour.
49 Maybe we do not want to parse the html but rather some RSS feed? I see
50 in the svn log that there were atom and rss activities but I find no
53 /group/$1/rss/posts.xml in apache/sites/cnntp.tmpl sounds interesting.
54 Indeed, http://www.nntp.perl.org/group/perl.cpan.testers/rss/posts.xml
55 gives me something RSSish starting with
56 http://www.nntp.perl.org/group/perl.cpan.testers/2009/01/msg2987597.html
58 http://www.nntp.perl.org/group/perl.cpan.testers/2009/01/msg2987558.html
59 But the month has started with
60 http://www.nntp.perl.org/group/perl.cpan.testers/2009/01/msg2967157.html
65 B<month>, name, article_min, article_max, current_paging. The month
66 changes only when we visit the default page,
67 http://www.nntp.perl.org/group/perl.cpan.testers/
69 current_paging which can change at any time: I just clicked on 116
70 which was the last, but on the resulting page I see now 118 is the
71 last. Two consecutive clicks are not necessarily providing adjacent
72 datasets. current_paging has attributes: max (int), seen (hash)
74 articles can be thread leaders or not. If they are thread leaders we
75 must read them to get at the other articles in the thread. We have the
76 subject already but not the author. article_subject, article_from,
77 article_id. whether an article is a thread leader or not can change
78 over time, even while we are reading; we should not record the fact.
79 Oh well, we must, of course, record it as long as we have not read
82 =head2 first round algorithm fails
84 I thought I'd get all articles in a range and be finished when I
85 actually have them. But neither can I determine the two borders
86 reliably nor can I get intervals complete.
88 So the algorithm must change. Read 1..n. Stop when either 20 minutes
89 have passed or n is reached or end is reached. Then sleep. Good
98 =item how to deal with holes
100 How come that we have holes? For example 3081767 is missing today. It
101 was a posting on Jan 17th. All could be so correct and simple without
102 the holes. 17th January is today. 3081851 is current max. That's 84
103 below max. The current first page goes down to 3081762 but I do not
106 PASS local-lib-1.003001 i686-linux-thread-multi-64int-ld 2.6.16.38
108 The string "local-lib" is not on pages 1-10, so this is not a
109 threading issue. The posting is simply missing in the pages. The
110 missing postings are not a fault of this program it seems.
112 We just passed a full wallclock hour. And nntp.perl.org has expired
113 all pages. Reading page1 again, gives a max of 3081942, that is 92
114 more. So I should find x1767 on page one or two. But it isn't. Page 1
115 has visible 1830-1942, page2 1594-1668!!! No shift-reload corrects for
116 that. Apparently immediately after the hour there are new pages and
117 old pages mixed. Of course my reading algorithm cannot cope with that.
119 Potential solution: goto nntp to fill holes.
123 NNTP xhdr is far superior to the method deployed here. xpat would--in
124 theory--be even better but it is extremely slow so that I believe
125 they first filter by pattern and then by range.
128 use Net::NNTP; use YAML::XS qw(Dump);
129 use List::Pairwise qw(mapp grepp);
130 my $nntp=Net::NNTP->new("nntp.perl.org");
131 my(undef,undef,$last) = $nntp->group("perl.cpan.testers");
132 my %fail = grepp { $b =~ /^FAIL / } %{$nntp->xhdr("Subject", [$last-200, $last])};
136 '4346542': FAIL Log-Accounting-SVN-0.01 i686-linux-thread-multi-ld 2.6.28-11-generic
137 '4346547': FAIL Parse-SVNDiff-0.03 i686-linux-thread-multi 2.6.28-11-generic
138 '4346552': FAIL Tie-Scalar-MarginOfError-0.03 i686-linux-thread-multi-ld 2.6.28-11-generic
139 '4346556': FAIL Pointer-0.11 i686-linux-thread-multi 2.6.28-11-generic
141 perl -le 0.08s user 0.02s system 0% cpu 10.059 total
143 =item why not a database?
145 With a database we could slowly fill the full 4 or 5 millions and
146 already use it for a web page. But then we should really work with
147 jifty or catalyst again. Or download the existing sqlite database and
148 then start adding to it?
150 Abandoning the current codebase because I'd have to delete nearly
151 everything here. Calling it cnntp-refresh.pl
158 use File
::Basename
qw(dirname);
159 use File
::Path
qw(mkpath);
164 use List
::Util
qw(max min);
165 use Pod
::Usage
qw(pod2usage);
166 use Set
::IntSpan
::Fast
::XS
();
167 use Set
::IntSpan
::Fast
();
# --- main program -------------------------------------------------------
# NOTE(review): this chunk is extraction-garbled; statements are split
# across lines and several original lines are missing (e.g. the chdir
# that pairs with the "or die" below, and the COLLECT loop header).
# Comments are hedged accordingly.
# Fill in option defaults: which newsgroup to scrape and the base URI of
# the nntp.perl.org web frontend.
177 $Opt{group
} ||= "perl.cpan.testers";
178 $Opt{baseuri
} ||= "http://www.nntp.perl.org/group";
# Working directory defaults to <vardir>/workdir; the "or die" below
# presumably belongs to a chdir() on a missing line -- TODO confirm.
180 my $workdir = $Opt{workdir
} ||= File
::Spec
->catdir(vardir
(\
%Opt),"workdir");
182 or die("Couldn't change to $workdir: $!");
# Fetch the group's default page; it yields the current month and the
# list of paging URIs for that month.
185 my $page1 = read_default_page
(\
%Opt);
186 my $page_uris = $page1->{current_paging
};
187 my $month = $page1->{month
};
# Load (or create) the per-month YAML database and merge page 1 into it.
188 my $db = read_local_db_for_month
(\
%Opt,$month);
189 $db->{workdir
} = $workdir;
# min_article is captured before merging so we can detect a fresh month.
190 my $min_article = $db->{min_article
}; # must be before merging!
191 merge_page_into_db
($page1,$db);
193 $pages_seen{"page1.html"} = 1;
194 $db->{pages_seen
} = \
%pages_seen;
# First visit in a fresh month: also read the last (oldest) page so we
# know the month's lower article boundary.
195 unless ($min_article) {
196 # first time here in this month
197 my($last_page_uri) = $page_uris->[-1];
198 my $page2 = read_a_page
($last_page_uri);
199 merge_page_into_db
($page2,$db);
200 my($tpagename) = $last_page_uri =~ m
|.*/([^/]+)$|;
201 $pages_seen{$tpagename} = 1;
# Page-reading loop; the enclosing COLLECT loop header is on a line
# missing from this chunk -- TODO confirm.
204 PAGE
: for my $uri (@
$page_uris) {
205 last COLLECT
if have_seen_enough
($db); # the only way out
207 my($tpagename) = $uri =~ m
|.*/([^/]+)$|;
208 next PAGE
if $pages_seen{$tpagename};
210 my $page2 = read_a_page
($uri);
211 merge_page_into_db
($page2,$db);
# Persist after every page so an abort loses at most one page of work.
212 write_current_month
(\
%Opt,$db);
# The site renumbers its pages every hour; when the freshly fetched page
# reports a higher max page id than the list we started with, switch to
# the fresher paging list.
213 my $freshest_page_uris = $page2->{current_paging
};
214 my($higest_page_id_outer) = $page_uris->[-1] =~ /page(\d+)\.html/;
215 my($higest_page_id_inner) = $freshest_page_uris->[-1] =~ /page(\d+)\.html/;
216 if ($higest_page_id_outer < $higest_page_id_inner) {
217 # they have renumbered
218 $page_uris = $freshest_page_uris;
222 $pages_seen{$tpagename} = 1;
# Presumably reached only when the page list is exhausted without
# have_seen_enough() firing -- TODO confirm; surrounding lines missing.
224 warn " ### highly suspect! ###";
# Final flush of the in-memory DB to disk.
227 write_current_month
(\
%Opt,$db);
# Lazily-built singletons (the sub wrappers, e.g. _ua/_xp, are on lines
# missing from this chunk -- TODO confirm): an LWP::UserAgent used for
# mirroring pages and an XML::LibXML parser for the fetched HTML.
235 $ua = LWP
::UserAgent
->new
250 $xp = XML
::LibXML
->new;
# Tolerate sloppy namespace declarations in the scraped HTML.
252 $xp->clean_namespaces(1);
253 #my $catalog = __FILE__;
254 #$catalog =~ s|ParseReport.pm$|ParseReport/catalog|;
255 #$xp->load_catalog($catalog);
# write_current_month(\%Opt, $db): persist the in-memory DB for the
# current month atomically (dump to "$path.new", then rename over
# $path).  The argument-unpacking line is missing from this chunk --
# TODO confirm it sets ($opt,$db).
260 sub write_current_month
{
262 my $month = $db->{month
};
263 my $path = dbpath
($opt,$month);
264 mkpath dirname
$path;
# Rebuild the integer set of seen article ids from its string form;
# the loop body that adds each id to $set is on missing lines -- TODO
# confirm.
265 my $set = Set
::IntSpan
::Fast
->new($db->{set_as_string
}||"");
266 for my $a (keys %{$db->{articles
}}) {
269 $db->{set_as_string
} = $set->as_string;
# Write-then-rename so a crash mid-dump never corrupts the YAML db.
270 YAML
::Syck
::DumpFile
("$path.new", $db);
271 rename "$path.new", $path or die "Could not rename: $!";
# have_seen_enough($db): heuristic stop condition for the page-reading
# loop.  Scans min_article..max_article for holes and declares success
# when fewer than 10% of the expected articles are missing, or when we
# have already walked back as far as page24.  The lines that compute
# $miss and $shouldhave are missing from this chunk -- TODO confirm.
274 sub have_seen_enough
{
276 return unless exists $db->{min_article
};
279 for (my $i = $db->{min_article
}; $i <= $db->{max_article
}; $i++) {
281 if (!exists $db->{articles
}{$i}) {
285 warn "missing[$miss]shouldhave[$shouldhave]\n";
286 return 1 if $miss < 0.1 * $shouldhave;
287 return 1 if $db->{pages_seen
}{"page24.html"};
# fetch_and_parse($uri[, $as]): politely mirror $uri into local file
# $as (default: basename of the uri), cache the response headers next
# to it in "$as.header", and return the page parsed as HTML via
# XML::LibXML.  Handles 304 (refresh cached header mtime) and >=500
# (the sleep-and-retry branch is on lines missing from this chunk --
# TODO confirm).
297 sub fetch_and_parse
{
299 $as ||= mybasename
($uri);
# Fixed delay between requests to be gentle to nntp.perl.org.
301 sleep 6; # let them breathe
302 print STDERR
"****Fetching $uri as $as...";
303 my $resp = _ua
->mirror($uri,$as);
304 my $cheaders = "$as.header";
305 if ($resp->is_success) {
306 print STDERR
"DONE\n";
# Keep the response headers on disk for later inspection/debugging.
307 open my $fh, ">", $cheaders or die;
308 for ($resp->headers->as_string) {
312 } elsif (304 == $resp->code) {
313 print STDERR
"DONE (not modified)\n";
# Touch the cached header file so its mtime reflects this check.
314 my $atime = my $mtime = time;
315 utime $atime, $mtime, $cheaders;
316 } elsif ($resp->code >= 500) {
320 "%s on downloading %s: %s; sleeping %d, then retrying",
331 "unexpected error on downloading %s: %s",
# Parse the mirrored file as (possibly sloppy) HTML.
339 my $xml = _xp
->parse_html_file($as);
# Body of a helper (its sub header is on a missing line -- TODO confirm
# the name) that rewrites $file in place, replacing raw control
# characters with Dumpvalue's printable "unctrl" escapes so the HTML
# parser does not choke, while preserving the file's original mtime.
344 my $content = do {open my $fh, $file or die; local $/; <$fh> };
345 my $dumper = Dumpvalue
->new(unctrl
=> "unctrl");
# \x0a (newline) and \x0d (CR) are deliberately excluded from the class.
346 if ($content =~ s/([\x00-\x09\x0b\x0c\x0e-\x1f])/ $dumper->stringify($1,1) /ge) {
347 my(@stat) = stat $file or die;
348 open my $fh, ">", $file or die;
# Restore the original mtime (stat[9]) so mirror() caching still works.
351 utime $stat[9], $stat[9], $file;
# read_default_page(\%Opt): fetch the group's index page (which always
# reflects the current month) into "index.html" and run it through
# understand_a_page().  Argument unpacking and the return statement are
# on lines missing from this chunk -- TODO confirm.
355 sub read_default_page
{
357 my $uri = "$opt->{baseuri}/$opt->{group}/";
358 my $xml = fetch_and_parse
($uri,"index.html");
359 my $page = understand_a_page
($xml,$uri);
# Body of read_a_page($uri) (its sub header is on a missing line --
# TODO confirm): fetch an arbitrary paging URI and parse it into the
# same page structure understand_a_page() produces.
364 my $xml = fetch_and_parse
($uri);
365 my $page = understand_a_page
($xml,$uri);
# understand_a_page($xml, $baseuri): extract from one parsed HTML page
# the month label, the absolute paging URIs, and the article list
# (subject, id, from, thread_leader).  Thread leaders are followed via
# read_thread_leader() to pick up the rest of each thread.  Several
# lines (map/push bodies, closing braces, the returned hashref's
# opening) are missing from this chunk -- comments are hedged.
368 sub understand_a_page
{
369 my($xml,$baseuri) = @_;
372 Carp
::confess
("understand_a_page called without or with an undefined \$xml argument");
# Paging links live in the <p> whose first anchor is "Newest"; strip
# leading non-page links and the trailing "Oldest" link.
374 # want month, pages, articles(subject,id,from,thread_leader)
375 my($p1) = $xml->findnodes("/html/body/p[a[1] = 'Newest']");
376 my @p1a = $p1->findnodes("a");
377 shift @p1a while $p1a[0] && $p1a[0]->getAttribute("href") !~ m
|/page\d
|;
378 pop @p1a while $p1a[-1] && $p1a[-1]->textContent =~ m
|Oldest
|;
380 URI
->new($_->getAttribute("href"))->abs($baseuri)
382 my($month) = $pages[0] =~ m
|/(\d+/\d
+)/page\d
|;
# Articles sit in the table with class "article_list": td[1] holds the
# link+subject, td[2] the thread size, td[3] the author.
385 my($table1) = $xml->findnodes("/html/body/table[\@class = 'article_list']");
386 my @table1tr = $table1->findnodes("tr");
388 my($td1a) = $_->findnodes("td[1]/a");
389 my $href = $td1a->getAttribute("href");
390 my($id) = $href =~ /msg(\d+)\.html$/;
391 my($td2) = $_->findnodes("td[2]");
392 my($td3) = $_->findnodes("td[3]");
393 my $from = $td3->textContent;
399 subject
=> $td1a->textContent,
400 thread_leader
=> $td2->textContent =~ /^1\s/ ?
0 : 1,
# For every thread leader, pull the thread page and append the articles
# found there.
404 for my $article (grep {$_->{thread_leader
}>0} @articles) {
405 my $thr_articles = read_thread_leader
($article,$baseuri);
406 push @articles, @
$thr_articles;
# Returned page structure (hashref opening is on a missing line --
# TODO confirm).
409 baseuri
=> $baseuri, # for debugging
411 current_paging
=> \
@pages,
412 articles
=> \
@articles,
# read_thread_leader($article, $baseuri): fetch a thread-leader
# article's page and collect the other articles of the thread from the
# nested <ul> list (id, trimmed subject, author parsed from the "by"
# text).  Several lines (branch conditions around the findnodes
# fallback, loop headers, the return) are missing from this chunk --
# comments are hedged.
416 sub read_thread_leader
{
417 my($article,$baseuri) = @_;
418 my $uri = URI
->new($article->{href
})->abs($baseuri);
419 my $xml = fetch_and_parse
($uri->as_string);
# Thread list is normally the nested ul; fall back to the outer ul
# (the guarding condition is on a missing line -- TODO confirm).
421 my($ul) = $xml->findnodes("/html/body/ul/ul");
423 ($ul) = $xml->findnodes("/html/body/ul");
# Leftover debug dump of the problematic article/page (marked XXX);
# candidate for removal once the unparsable-HTML case is understood.
426 my $localtime = localtime;
427 require YAML
::Syck
; print STDERR
"Line " . __LINE__
. ", File: " . __FILE__
. "\n" . YAML
::Syck
::Dump
({article
=> $article, baseuri
=> $baseuri, localtime => $localtime}); # XXX
428 # e.g. http://www.nntp.perl.org/group/perl.cpan.testers/2009/01/msg3036775.html
429 # which has some extreme html that cannot be parsed.
432 my(@ul_li) = $ul->findnodes("li");
435 my($a) = $_->findnodes("a") or next;;
436 my $href = $a->getAttribute("href");
437 my($id) = $href =~ /msg(\d+)\.html$/;
438 my $subject = $a->textContent;
439 $subject =~ s/^\s+//;
440 $subject =~ s/\s+\Z//;
# Author: everything after the last "by" in the li's text content.
441 my $from = $_->textContent;
442 $from =~ s/.*by\s+//s;
# Fragments of dbpath($opt,$month) -- which maps a month label to
# "<vardir>/<month>.yml" -- and of vardir(); the sub headers and the
# sprintf arguments around them are on missing lines -- TODO confirm.
455 my($opt,$month) = @_;
456 sprintf "%s/%s.yml", vardir
($opt), $month;
463 "%s/var/cnntp-scrape/%s",
# read_local_db_for_month(\%Opt, $month): load the month's YAML db from
# dbpath() when it exists; presumably returns a fresh structure
# otherwise (that branch, and the file-existence guard, are on missing
# lines -- TODO confirm).
469 sub read_local_db_for_month
{
470 my($opt,$month) = @_;
471 my $path = dbpath
($opt,$month);
474 $y = YAML
::Syck
::LoadFile
($path);
# merge_page_into_db($page, $db): fold one parsed page into the month
# db -- refuses to mix months, widens min_article/max_article, and
# records each article keyed by id (||= keeps the first-seen record).
# This sub runs past the end of the visible chunk and several interior
# lines are missing (argument unpacking, closing braces, the record
# hash) -- comments are hedged.
481 sub merge_page_into_db
{
484 die "Alert: mixing two months not allowed" if $page->{month
} ne $db->{month
};
486 $db->{month
} = $page->{month
};
# Widen the upper article boundary if this page exceeds it.
488 my $max_page_article_id = max
map {$_->{id
}} @
{$page->{articles
}};
489 if (!$db->{max_article
} or $max_page_article_id > $db->{max_article
}) {
490 $db->{max_article
} = $max_page_article_id;
# Sanity guard: ids below 2900000 indicate a broken page (the month's
# range starts well above that -- see the notes at the top of the file).
492 my $min_page_article_id = min
map {$_->{id
}} @
{$page->{articles
}};
493 if ($min_page_article_id < 2900000) {
495 warn "broken page/baseuri[$page->{baseuri}] min_page_article_id[$min_page_article_id]";
497 if (!$db->{min_article
} or $min_page_article_id < $db->{min_article
}) {
498 $db->{min_article
} = $min_page_article_id;
# Record each article once; ||= means an existing record is never
# overwritten by a later sighting.
501 for my $article (@
{$page->{articles
}}) {
502 my $id = $article->{id
};
503 $db->{articles
}{$id} ||=
505 from
=> $article->{from
},
506 subject
=> $article->{subject
},