new issue
[andk-cpan-tools.git] / bin / distlslt.pl
blob07508d0ce7cac7881afb8111516edb1de7de5ade
1 use Sys::Hostname;
2 die "not yet ported to ds8143" if hostname eq "ds8143";
4 use strict;
5 use warnings;
7 =head1 NAME
9 distlslt -
11 =head1 SYNOPSIS
13 =head1 OPTIONS
15 =over 2
17 =cut
19 my $optpod = <<'=back';
21 =item --fulldate!
23 Display date on each line instead of age in days. Currently disables
24 google chart.
26 =item --help!
28 Describe usage.
30 =item --n=n
32 Number of tiles to cut from the population. In other words: if n=100
33 we calculate percentiles, if n=4 we calculate quartiles. Defaults to
34 40.
36 =item --distribution_based!
38 If this option is set then the population is not modules but
39 distributions on CPAN. See below for a discussion.
41 =item --with-distro
43 Accompany each reported line for a module with the distro it is in.
45 =item --root=s
47 Directory where the CPAN mirror is on disk.
49 =item --withcpan!
51 Look up if we have the module installed for every reported line.
53 =back
55 =head1 DESCRIPTION
57 This is the script I used to prepare the posting I<How fresh is the
58 CPAN> at http://use.perl.org/~LaPerla/journal/36320 .
60 It iterates over all modules indexed in
61 $CPAN/modules/02packages.details.txt (where $CPAN denotes the path to
62 a CPAN mirror) and measures their age as the age of the distribution
63 they are contained in. For that the script requires a full CPAN mirror
64 below the directory given in --root parameter to measure the
65 modification time of each distro.
67 The output consists of a table of 1/N-tiles (N as given in --n
68 parameter) and a measurement of quartiles based on months since the
69 release.
71 If Google::Chart is installed we produce a link to a google chart.
72 It's recommended to call it with '-n=12' because higher numbers are
73 not that well painted by google.
75 If the option --distribution_based is given then the population is
76 distros instead of modules. Interestingly it gives a completely
77 different picture. The median of the age of a distro is much higher
78 than the median of the age of a module (2009: 26.8 vs. 17.8). This
79 indicates that younger distros tend to be complexer and contain more
80 modules. The single-module-distro is out nowadays.
82 When I first calculated these statistics 2008-05 I chose modules as
83 the natural basis. Only later (2009-09) it occurred to me that I could
84 add the --distribution_based switch to see if there is a difference.
85 So what is better suited to represent the freshness of the CPAN--a
86 module based statistics or distribution based one?
88 I don't know. There are distros that contain many modules and are
89 still perceived as a single unit. Others are perceived as collection
90 of modules.
92 =head1 HISTORY
94 Calculated 1/40-tiles, of all modules Apr 10, 2008. This was the
95 output then:
97 Found 53394 modules
98 1 Thu Apr 10 20:15:53 2008 Test::A8N::File
99 2 Sun Apr 6 15:21:12 2008 WebService::ISBNDB::Iterator
100 3 Sat Mar 29 17:29:52 2008 VS::Chart::Color
101 4 Sun Mar 23 00:35:16 2008 Module::ScanDeps::DataFeed
102 5 Tue Mar 11 18:32:08 2008 Curses::UI::Popupmenu
103 6 Sat Mar 1 00:15:57 2008 Paranoid
104 7 Mon Feb 18 16:58:22 2008 Config::IniHash
105 8 Mon Feb 4 15:46:18 2008 Geo::Proj::Japan
106 9 Tue Jan 22 08:31:13 2008 Spreadsheet::Engine::Function::WEEKDAY
107 10 Thu Jan 3 17:55:37 2008 SystemC::Vregs::Language
108 11 Sat Dec 15 20:10:17 2007 Ogre::SceneManager
109 12 Thu Nov 22 19:12:58 2007 Alzabo::Create
110 13 Sat Oct 27 21:35:43 2007 KinoSearch::Search::HitQueue
111 14 Thu Sep 27 22:42:46 2007 Business::PayPal::API::CaptureRequest
112 15 Sun Aug 26 01:19:30 2007 Wx::DemoModules::wxStaticText
113 16 Sat Jul 28 20:52:11 2007 CommandParser::Vcs
114 17 Fri Jun 8 16:31:15 2007 HTML::FromMail::Page
115 18 Thu Apr 19 03:44:27 2007 Chemistry::OpenBabel
116 19 Sat Mar 17 23:17:11 2007 Java::JCR::Version::VersionHistory
117 20 Sun Jan 21 01:42:01 2007 Authen::Passphrase::Clear
118 21 Mon Nov 27 20:40:52 2006 CQL::PrefixNode
119 22 Fri Sep 22 05:50:59 2006 Template::Magic::Zone
120 23 Tue Jul 4 23:41:45 2006 Time::TCB
121 24 Fri May 12 14:49:12 2006 Test::Unit::GTestRunner
122 25 Tue Feb 28 00:42:21 2006 DBIx::Class::Loader::Generic
123 26 Thu Dec 1 17:55:02 2005 IO::Handle::Rewind
124 27 Sun Sep 4 02:39:20 2005 Games::Sudoku::OO::Set
125 28 Wed Jun 1 23:31:26 2005 Parse::EBNF::Token
126 29 Mon Feb 28 05:23:57 2005 Class::StrongSingleton
127 30 Tue Nov 30 11:40:10 2004 CGI::Wiki::Formatter::UseMod
128 31 Thu Aug 12 22:14:32 2004 DBomb::Meta::OneToMany
129 32 Tue Apr 20 20:57:27 2004 Apache::AuthenNIS
130 33 Tue Dec 23 10:47:30 2003 Bio::Tools::Run::PiseApplication::descseq
131 34 Tue Oct 7 14:00:51 2003 Apache::Profiler
132 35 Mon Jun 30 15:52:49 2003 WWW::BookBot::Test
133 36 Sun Jan 19 16:04:55 2003 Introspector::MetaInheritance
134 37 Fri Sep 20 14:35:14 2002 Anarres::Mud::Driver::Efun::MudOS
135 38 Sun Feb 10 06:55:51 2002 Math::MVPoly::Monomial
136 39 Tue Apr 17 13:41:26 2001 CGI::Test::Form::Widget::Menu::List
137 40 Mon Dec 20 00:05:25 1999 Wizard::LDAP::User
139 Quartile 1 was about newyear 2008, quartile 2 in January 2007,
140 quartile 3 in November 2004.
142 2009-09-05 redid it with -n=12 for an re-post of the article.
144 Found 68753 modules
145 1 0 Parse::HTTP::UserAgent::Base::Parsers
146 2 29 Wx::DocView
147 3 77 Coat::Persistent::Constraint
148 4 168 PDF::API2::Basic::PDF::Page
149 5 261 DJabberd::XMLElement
150 6 375 eBay::API::XML::DataType::LocalMarketAutoAcceptEnabledDefinitionType
151 7 533 Bit::MorseSignals::Receiver
152 8 743 Data::ICal::TimeZone::Object::Europe::Helsinki
153 9 1044 Email::MIME::CreateHTML::Resolver::LWP
154 10 1465 Config::Setting::IniParser
155 11 1947 SVG::SVG2zinc::Backend::Print
156 12 2451 POE::Component::Server::PreforkTCP
157 http://chart.apis.google.com/chart?chs=300x120&chd=s:97642zvqiYMAA,93ytojeZUPJEA&cht=lxy&chls=3&chxt=x,r&chxl=0:|5.6m|17.8m|48.8m|1:|0|25|50|75|100&chxp=0,93,78,40&chm=c,FF0000,0,3,10|c,FF0000,0,6,10|c,FF0000,0,9,10&chf=c,ls,90,999999,0.25,AAAAAA,0.25,CCCCCC,0.25,EEEEEE,0.25
160 Also 2009-09-05 I introduced the --distribution_based switch which
161 lead to this result (this was an hour later, so not exactly
162 corresponding lines):
164 Found 68754 modules
165 1 0 4 GPHAT/Data-Verifier-0.07.tar.gz
166 2 59 9201 BBURCH/Net-Int-Stats-1.03.tar.gz
167 3 142 15967 BINGOS/POE-Filter-KennySpeak-1.00.tar.gz
168 4 269 23441 ANDYA/Set-IntSpan-Fast-1.15.tar.gz
169 5 422 30323 ADAMK/HTML-Location-1.03.tar.gz
170 6 573 35486 YUKINOBU/Math-Business-Stochastic-0.03.tar.gz
171 7 803 41192 BRIANL/Statistics-Lite-3.2.tar.gz
172 8 1095 46536 HDP/Proc-Daemontools-Service-0.02.tar.gz
173 9 1404 50813 WONKO/CGI-Application-MailPage-1.6.tar.gz
174 10 1752 55193 LGODDARD/Image-Maps-Plot-FromLatLong-0.12.tar.gz
175 11 2171 60024 MADWOLF/OpenCA-DB-2.0.5.tar.gz
176 12 2736 64733 JRED/JaM-1.0.10.tar.gz
177 http://chart.apis.google.com/chart?chs=300x120&chd=s:9752zwqjdVMAA,93ytojeZUPJEA&cht=lxy&chls=3&chxt=x,r&chxl=0:|9.0m|26.8m|58.4m|1:|0|25|50|75|100&chxp=0,90,70,35&chm=c,FF0000,0,3,10|c,FF0000,0,6,10|c,FF0000,0,9,10&chf=c,ls,90,999999,0.25,AAAAAA,0.25,CCCCCC,0.25,EEEEEE,0.25
179 The posting for these results is at ...
181 =cut
184 use DateTime;
185 use Getopt::Long;
186 use Time::Progress;
187 use Pod::Usage qw(pod2usage);
189 our %Opt;
190 my @opt = $optpod =~ /^=item --(\S+)/mg;
191 GetOptions(\%Opt,
192 @opt,
193 ) or pod2usage(2);
195 if ($Opt{help}) {
196 pod2usage(0);
198 if ($Opt{withcpan}) {
199 require CPAN;
201 $Opt{n} ||= 40;
202 die "N[$Opt{n}] must be divideable by 4 to get the graphics correct" if $Opt{n} % 4;
203 $Opt{root} ||= "/home/ftp/pub/PAUSE";
204 open my $fh, "zcat $Opt{root}/modules/02packages.details.txt.gz|" or die;
205 my(%module_age,%distro_age,%distro_containing_module,%modules_contained_in_distro);
206 my $state = "header";
207 my $current_line = 0;
208 my $tp;
209 if (-t *STDOUT) {
210 $tp = Time::Progress->new();
212 my $lines;
213 $| = 1;
214 while (<$fh>) {
215 if ($state eq "header") {
216 if (/^\s*$/){
217 $state = "body";
218 next;
219 } elsif (/^Line-Count:\s*(\d+)/) {
220 $lines = $1;
221 if ($tp) {
222 $tp->attr( min => 1, max => $lines );
223 } else {
224 print "Found $lines modules";
227 } elsif ($state eq "body") {
228 chomp;
229 $current_line++;
230 my($m,$v,$d) = split " ", $_;
231 unless (-e "$Opt{root}/authors/id/$d"){
232 warn "could not find '$Opt{root}/authors/id/$d' for '$m'";
233 next;
235 my $module_age = $distro_age{$d} ||= -M _;
236 $module_age{$m} = $module_age;
237 $distro_containing_module{$m} = $d;
238 my $a = $modules_contained_in_distro{$d} ||= [];
239 push @$a, $m;
240 if ($lines==$current_line || !($current_line % 100)) {
241 my $formatted_current_line = sprintf "%8d", $current_line;
242 print $tp->report("\r$formatted_current_line %p over: %l s, left %e s; ETA: %f", $current_line ) if $tp;
244 } else {
245 die "illegal state $state";
248 print "\n";
249 my @items;
250 if ($Opt{distribution_based}) {
251 @items = sort { $distro_age{$a} <=> $distro_age{$b} } keys %distro_age;
252 } else {
253 @items = sort { $module_age{$a} <=> $module_age{$b} } keys %module_age;
256 my $painted = 0;
257 if ($Opt{withcpan}) {
258 CPAN::Index->reload;
260 my $now = DateTime->now;
261 my $value_sets = [];
262 my @t_index;
263 my $modules_covered = 0;
264 for my $i (0..$#items) {
265 if ($Opt{distribution_based}) {
266 my $modules_contained = @{$modules_contained_in_distro{$items[$i]}};
267 $modules_covered += $modules_contained;
269 while (($painted/$Opt{n}) < ($i/@items)) {
270 my $mtime;
271 if ($Opt{distribution_based}) {
272 $mtime = $^T-86400*$distro_age{$items[$i]};
273 } else {
274 my $module_age = $module_age{$items[$i]};
275 $mtime = $^T-86400*$module_age;
277 my $dt = DateTime->from_epoch(epoch => $mtime);
278 my $lt = $dt->ymd;
279 my $age_days = int(($now->epoch - $dt->epoch)/86400);
280 my $have = "";
281 my $have_format = "%s";
282 my $display_date = $age_days;
283 my $date_format = "%4d";
284 if ($Opt{withcpan} &&! $Opt{distribution_based}) {
285 $have_format = " %-5s";
286 my $mod = CPAN::Shell->expand("Module",$items[$i]);
287 $have = CPAN::Shell->expand("Module",$items[$i])->inst_version if $mod;
288 $have = "" unless defined $have;
290 if ($Opt{fulldate}) {
291 $date_format = "%-10s";
292 $display_date = $lt;
294 $painted++;
295 my $fullquart = int($Opt{n}/4+.01);
296 if ($painted>1 && ((($painted-1) % $fullquart) == 0)) {
297 push @t_index, $painted-1;
299 if ($Opt{distribution_based}) {
300 printf "%2d $date_format$have_format %5d %-20s\n", $painted, $display_date, $have, $modules_covered, substr($items[$i],5);
301 } else {
302 my $distro = "";
303 if ($Opt{"with-distro"}) {
304 $distro = sprintf " %s", substr($distro_containing_module{$items[$i]},5);
306 printf "%2d $date_format$have_format %-20s%s\n", $painted, $display_date, $have, $items[$i], $distro;
308 push @{$value_sets->[0]}, $Opt{fulldate} ? $display_date : -$display_date;
309 push @{$value_sets->[1]}, 1-(($painted-1)/$Opt{n});
312 XAXIS: push @{$value_sets->[0]}, $value_sets->[0][-1]; # must use the 0 for proper scaling
313 YAXIS: push @{$value_sets->[1]}, 0;
314 my @txlabel;
315 unless ($Opt{fulldate}) {
316 my $display_months = 1;
317 if ($display_months) {
318 @txlabel = map { sprintf "%.1fm", -$_/30 } @{$value_sets->[0]}[@t_index];
319 } else {
320 @txlabel = map { sprintf "%dd", -$_ } @{$value_sets->[0]}[@t_index];
322 warn sprintf "DEBUG: t_index[%s]txlabel[%s]modules[%d]distros[%d]", join(",",@t_index), join(",",@txlabel), scalar keys %distro_containing_module, scalar keys %distro_age;
323 my $HAVE_GOOGLE_CHARTS;
324 my @txpos;
325 unless ($Opt{fulldate}) {
326 $HAVE_GOOGLE_CHARTS = eval { require Google::Chart; 1; };
327 use List::Util qw(min max);
328 for my $vs (@$value_sets) {
329 my $min = min @$vs;
330 my $max = max @$vs;
331 my $range = $max - $min;
332 $vs = [ map { int(100 * ($_ - $min)/$range) } @$vs ];
334 @txpos = @{$value_sets->[0]}[@t_index];
336 if ($HAVE_GOOGLE_CHARTS) {
337 my $chart = Google::Chart->new(
338 type_name => 'type_line_xy',
339 set_size => [ 300, 120 ],
340 data_spec => {
341 encoding => 'data_simple_encoding',
342 max_value => 100,
343 value_sets => $value_sets,
346 print $chart->get_url, "&chls=3&chxt=x,r&chxl=0:|$txlabel[0]|$txlabel[1]|$txlabel[2]|1:|0|25|50|75|100&chxp=0,$txpos[0],$txpos[1],$txpos[2]&chm=c,FF0000,0,$t_index[0],10|c,FF0000,0,$t_index[1],10|c,FF0000,0,$t_index[2],10&chf=c,ls,90,999999,0.25,AAAAAA,0.25,CCCCCC,0.25,EEEEEE,0.25\n";