wiki.pl: Port some fixes from upstream
[Orgmuse.git] / age-vs-popularity
blobb36e69e5a20da1d1b65c26cef9db76b9f3bd8bf7
1 #! /usr/bin/perl
2 # Copyright (C) 2006 Alex Schroeder <alex@emacswiki.org>
4 # This program is free software; you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation; either version 2 of the License, or
7 # (at your option) any later version.
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
14 # You should have received a copy of the GNU General Public License
15 # along with this program; if not, write to the
16 # Free Software Foundation, Inc.
17 # 59 Temple Place, Suite 330
18 # Boston, MA 02111-1307 USA
20 use Time::ParseDate;
21 use Term::ProgressBar;
22 use Encode;
23 use Unicode::Normalize;
25 my $PageDir = 'page';
26 my $LogFile = 'access.log';
27 my $ReportFile = 'age-vs-popularity.csv';
28 my $Now = time;
29 my $Verbose = 1;
31 # $UrlFilter must match the requested URL, and $1 must be the pagename
32 my $FreeLinkPattern = "([-,.()' _0-9A-Za-z\x80-\xff]+)";
33 my $UrlFilter = "^/(?:cw|en|de|fr)[/?]$FreeLinkPattern\$";
34 warn "URL filter: $UrlFilter\n";
36 # namespaces
37 # my $InterSitePattern = '[A-Z\x80-\xff]+[A-Za-z\x80-\xff]+';
39 sub UrlDecode {
40 my $str = shift;
41 $str =~ s/%([0-9a-f][0-9a-f])/chr(hex($1))/ge;
42 return $str;
45 sub ParseLogLine {
46 my $line = shift;
47 my %result;
48 $line =~ m/"(\S+)\s+(\S+)\s+HTTP\/[10.]+"\s+(\d+)/ or die "Cannot parse:\n$_";
49 my $type = $1;
50 my $url = UrlDecode($2);
51 my $code = $3;
52 return unless $type eq 'GET';
53 return unless $code == 200; # Forget 304 Not Modified
54 return $1 if $url =~ m!$UrlFilter!;
55 # namespaces
56 # return $url if $url =~ m!^/odd/$InterSitePattern/$FreeLinkPattern$!;
57 return;
60 sub ParseData {
61 my $data = shift;
62 my %result;
63 while ($data =~ /(\S+?): (.*?)(?=\n[^ \t]|\Z)/sg) {
64 my ($key, $value) = ($1, $2);
65 $value =~ s/\n\t/\n/g;
66 $result{$key} = $value;
68 return %result;
71 my %Age = ();
72 my %Hits = ();
74 sub ParseLog {
75 my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
76 $atime,$mtime,$ctime,$blksize,$blocks)
77 = stat($LogFile);
78 my $progress = Term::ProgressBar->new({name => 'Log',
79 count => $size,
80 ETA => linear, });
81 $progress->max_update_rate(1);
82 my $next_update = 0;
83 my $count = 0;
84 open(F, $LogFile) or die "Cannot read $Logfile: $!";
85 while ($_ = <F>) {
86 $count += length;
87 my $page = ParseLogLine($_);
88 next unless $page;
89 $Hits{$page}++;
90 $next_update = $progress->update($count) if $count++ >= $next_update;
92 close(F);
93 $progress->update($size) if $size >= $next_update;
96 sub ParsePages {
97 # include dotfiles!
98 my @files = glob("$PageDir/*/*.pg $PageDir/*/.*.pg");
99 my $progress = Term::ProgressBar->new({name => 'Pages',
100 count => $#files,
101 ETA => linear, });
102 $progress->max_update_rate(1);
103 my $next_update = 0;
104 my $count = 0;
105 foreach my $file (@files) {
106 next unless $file =~ m|/.*/(.+)\.pg$|;
107 my $page = encode_utf8(NFC(decode_utf8($1))); # normalize on HFS+ filesystems
108 local $/ = undef; # Read complete files
109 open(F, $file) or die "Cannot read $page file: $!";
110 my $data = <F>;
111 close(F);
112 my %result = ParseData($data);
113 my $days = ($Now - $result{ts}) / (24 * 60 * 60);
114 $Age{$page} = $days;
115 $next_update = $progress->update($count) if $count++ >= $next_update;
117 $progress->update($#files) if $#files >= $next_update;
120 sub WriteReport {
121 open(F, "> $ReportFile") or die "Cannot write $ReportFile: $!";
122 print F "Days,Hits,Name\n";
123 for my $page (keys %Age) {
124 print F "$Age{$page},$Hits{$page},$page\n";
126 close(F);
129 ParseLog();
130 ParsePages();
131 WriteReport();