#!/usr/bin/perl -w

# Print http results by publication year for the
# last trial (2005-02-27).
# This includes duplicate urls from other articles.
#
# Output is tab separated, one row per publication year.  Example:
#
#   year  total articles  urls per article  total urls  available  unavailable
#   1995  5               50.0              250         125        125
#   1996  23              20.0              459         324        135
#   1997  43              12.3              529         379        150
#   ...
#
# Input files (all read from the current directory):
#   urls_by_year.txt                      "<year>\t<url>" per line
#   num_of_articles_per_year.txt          "<year>\t<article count>" per line
#   http_responses/2005-02-27result.txt   "<url> <length> <http code>" per line

use strict;

my $urls_fn         = "urls_by_year.txt";
my $last_result     = "http_responses/2005-02-27result.txt";
my $num_articles_fn = "num_of_articles_per_year.txt";

# Lookup table built from $last_result on the first call to found_url():
# url => 1 (was available) or 0 (was not available).
my %availability_of;
my $results_loaded = 0;

# Get all urls for each publication year.
# Hash of arrays: year => [ url, url, ... ]
my %seen;
open(my $urls_fh, '<', $urls_fn) or die("Can't open $urls_fn: $!\n");
while (my $line = <$urls_fh>) {
    chomp($line);
    my ($year, $url) = split(/\t/, $line);

    # Skip blank or malformed lines instead of filing them under an
    # undefined year/url.
    next unless defined $year && defined $url;
    push(@{ $seen{$year} }, $url);
}
close($urls_fh);

# Number of articles published in each year: year => count
my %years;
open(my $articles_fh, '<', $num_articles_fn)
    or die("Can't open $num_articles_fn: $!\n");
while (my $line = <$articles_fh>) {
    chomp($line);
    my ($year, $num_articles) = split(/\t/, $line);
    next unless defined $year && defined $num_articles;
    $years{$year} = $num_articles;
}
close($articles_fh);

print "year\ttotal articles\turls per article\ttotal urls\tavailable\tunavailable\n";

foreach my $year (sort keys %seen) {
    print "$year\t";

    my $total_urls    = 0;
    my $total_avail   = 0;
    my $total_unavail = 0;

    foreach my $url (@{ $seen{$year} }) {
        # See if url was found last time
        my $result = found_url($url);
        if ($result == -1) {
            print "Problem: can't find [$url]\n";
        }
        elsif ($result == 1) {
            $total_avail++;
        }
        else {
            $total_unavail++;
        }

        # Urls missing from the result file still count towards the total.
        $total_urls++;
    }

    # A year with urls but no (or a zero) article count would otherwise
    # abort the whole report with "Illegal division by zero".
    my $num_articles     = defined $years{$year} ? $years{$year} : 0;
    my $urls_per_article = 0;
    if ($num_articles > 0) {
        $urls_per_article = $total_urls / $num_articles;
    }
    else {
        warn "No article count for year $year in $num_articles_fn\n";
    }

    printf "%d\t%0.1f\t%d\t%d\t%d\n", $num_articles, $urls_per_article,
        $total_urls, $total_avail, $total_unavail;
}

# found_url($url)
#
# Returns 1 if $url was available in the last trial, 0 if it was not,
# and -1 if the url does not appear in the result file at all.
#
# The result file is parsed once, on the first call, rather than being
# re-read from disk for every url.  Dies if the result file cannot be
# opened.
sub found_url {
    my $url_find = shift;

    load_last_results() unless $results_loaded;

    return exists $availability_of{$url_find}
        ? $availability_of{$url_find}
        : -1;
}

# load_last_results()
#
# Fills %availability_of from $last_result.  Lines whose url starts with
# the D-Lib or D-Lib DOI prefixes are ignored.  When a url is listed more
# than once, the first line wins.
sub load_last_results {
    open(my $result_fh, '<', $last_result)
        or die("Can't open $last_result: $!\n");

    while (my $line = <$result_fh>) {
        chomp($line);
        next if $line =~ m|^http://www.dlib.org|
             || $line =~ m|^http://dx.doi.org/10.1045/|;

        my ($url, $len, $code) = split(/ /, $line);
        next unless defined $url;
        next if exists $availability_of{$url};

        $availability_of{$url} = response_is_available($len, $code);
    }
    close($result_fh);

    $results_loaded = 1;
    return;
}

# response_is_available($len, $code)
#
# Returns 1 when the recorded response counts as "available", 0 otherwise.
# Codes 301, 302 and 304 are treated like 200; a 200 with a length of 0
# counts as unavailable.  A line missing the length or the code counts as
# unavailable.
sub response_is_available {
    my ($len, $code) = @_;

    return 0 unless defined $len && defined $code;

    $code = 200 if ($code == 304 || $code == 301 || $code == 302);
    return 0 if ($code == 200 && $len == 0);

    # String comparison on purpose: only an exact "200" is a success.
    return $code eq '200' ? 1 : 0;
}