#! /usr/bin/perl # Copyright (C) 2006 Alex Schroeder # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the # Free Software Foundation, Inc. # 59 Temple Place, Suite 330 # Boston, MA 02111-1307 USA use Time::ParseDate; use Term::ProgressBar; use Encode; use Unicode::Normalize; my $PageDir = 'page'; my $LogFile = 'access.log'; my $ReportFile = 'age-vs-popularity.csv'; my $Now = time; my $Verbose = 1; # $UrlFilter must match the requested URL, and $1 must be the pagename my $FreeLinkPattern = "([-,.()' _0-9A-Za-z\x80-\xff]+)"; my $UrlFilter = "^/(?:cw|en|de|fr)[/?]$FreeLinkPattern\$"; warn "URL filter: $UrlFilter\n"; # namespaces # my $InterSitePattern = '[A-Z\x80-\xff]+[A-Za-z\x80-\xff]+'; sub UrlDecode { my $str = shift; $str =~ s/%([0-9a-f][0-9a-f])/chr(hex($1))/ge; return $str; } sub ParseLogLine { my $line = shift; my %result; $line =~ m/"(\S+)\s+(\S+)\s+HTTP\/[10.]+"\s+(\d+)/ or die "Cannot parse:\n$_"; my $type = $1; my $url = UrlDecode($2); my $code = $3; return unless $type eq 'GET'; return unless $code == 200; # Forget 304 Not Modified return $1 if $url =~ m!$UrlFilter!; # namespaces # return $url if $url =~ m!^/odd/$InterSitePattern/$FreeLinkPattern$!; return; } sub ParseData { my $data = shift; my %result; while ($data =~ /(\S+?): (.*?)(?=\n[^ \t]|\Z)/sg) { my ($key, $value) = ($1, $2); $value =~ s/\n\t/\n/g; $result{$key} = $value; } return %result; } my %Age = (); my %Hits = (); sub ParseLog { my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size, $atime,$mtime,$ctime,$blksize,$blocks) = stat($LogFile); my $progress = Term::ProgressBar->new({name => 'Log', count => $size, ETA => linear, }); $progress->max_update_rate(1); my $next_update = 0; my $count = 0; open(F, $LogFile) or die "Cannot read $Logfile: $!"; while ($_ = ) { $count += length; my $page = ParseLogLine($_); next unless $page; $Hits{$page}++; $next_update = $progress->update($count) if $count++ >= $next_update; } close(F); $progress->update($size) if $size >= $next_update; } sub ParsePages { # include dotfiles! my @files = glob("$PageDir/*/*.pg $PageDir/*/.*.pg"); my $progress = Term::ProgressBar->new({name => 'Pages', count => $#files, ETA => linear, }); $progress->max_update_rate(1); my $next_update = 0; my $count = 0; foreach my $file (@files) { next unless $file =~ m|/.*/(.+)\.pg$|; my $page = encode_utf8(NFC(decode_utf8($1))); # normalize on HFS+ filesystems local $/ = undef; # Read complete files open(F, $file) or die "Cannot read $page file: $!"; my $data = ; close(F); my %result = ParseData($data); my $days = ($Now - $result{ts}) / (24 * 60 * 60); $Age{$page} = $days; $next_update = $progress->update($count) if $count++ >= $next_update; } $progress->update($#files) if $#files >= $next_update; } sub WriteReport { open(F, "> $ReportFile") or die "Cannot write $ReportFile: $!"; print F "Days,Hits,Name\n"; for my $page (keys %Age) { print F "$Age{$page},$Hits{$page},$page\n"; } close(F); } ParseLog(); ParsePages(); WriteReport();