#!/usr/bin/perl -w

# Print HTTP results by publication year for the
# last trial (2005-02-27).
#
# Example output:
#   year total available unavailable
#   1995 250 125 125
#   1996 459 349 135
#   1997 529 400 150
#   ...
#
# Inputs (paths are relative to the current working directory):
#   urls_by_year.txt
#       one "<year><TAB><url>" pair per line
#   http_responses/2005-02-27result.txt
#       one "<url> <length> <code>" record per line, separated by single spaces
#
# NOTE(review): an earlier revision carried a second report loop after
# found_url(). It was unreachable (it followed the top-level `exit`), iterated
# a hash that was never filled, and dereferenced the per-year URL arrays as
# hashes, so it has been removed.

use strict;
use warnings;

my $urls_fn     = "urls_by_year.txt";
my $last_result = "http_responses/2005-02-27result.txt";

# Response codes that count as "available". 301, 302 and 304 are treated the
# same as 200.
my %is_ok_code = map { $_ => 1 } qw(200 301 302 304);

# year => [ url, url, ... ] in the order the URLs appear in $urls_fn.
my %seen;

# url => 1 (available) or 0 (unavailable); filled on the first call to
# found_url() so the result file is read once instead of once per URL.
my %availability_of;
my $results_loaded = 0;

# Get all urls for each publication year (hash of arrays).
open(my $urls_fh, '<', $urls_fn) or die("Can't open $urls_fn: $!\n");
while (my $line = <$urls_fh>) {
    chomp($line);
    my ($year, $url) = split(/\t/, $line);

    # A blank line or a line without a tab yields no usable pair.
    if (!defined $year || !defined $url) {
        warn "Skipping malformed line $. in $urls_fn\n";
        next;
    }
    push(@{ $seen{$year} }, $url);
}
close $urls_fh;

print "year total available unavailable\n";
foreach my $year (sort keys %seen) {
    print "$year\t";

    my $total_urls    = 0;
    my $total_avail   = 0;
    my $total_unavail = 0;

    foreach my $url (@{ $seen{$year} }) {
        # See if url was found last time.
        my $result = found_url($url);
        if ($result == -1) {
            # Reported inline on STDOUT; the URL still counts toward the
            # total but toward neither availability column.
            print "Problem: can't find [$url]\n";
        }
        elsif ($result == 1) {
            $total_avail++;
        }
        else {
            $total_unavail++;
        }
        $total_urls++;
    }

    print "$total_urls\t$total_avail\t$total_unavail\n";
}

exit;

# found_url($url)
#
# Returns 1 if $url was available in the last trial, 0 if it was recorded
# but unavailable, and -1 if the result file has no usable record for it.
# Dies if the result file cannot be opened (on the first call only).
sub found_url {
    my $url_find = shift;

    _load_results() unless $results_loaded;

    return exists $availability_of{$url_find}
        ? $availability_of{$url_find}
        : -1;
}

# Read $last_result once and fill %availability_of.
#
# Records whose line starts with http://www.dlib.org or
# http://dx.doi.org/10.1045/ are ignored, so those URLs are reported as
# "not found". When a URL occurs more than once, the first record wins.
sub _load_results {
    open(my $results_fh, '<', $last_result)
        or die("Can't open $last_result: $!\n");

    while (my $line = <$results_fh>) {
        chomp($line);

        next if $line =~ m{\Ahttp://www\.dlib\.org};
        next if $line =~ m{\Ahttp://dx\.doi\.org/10\.1045/};

        my ($url, $len, $code) = split(/ /, $line);
        next unless defined $url;                 # blank line
        next if exists $availability_of{$url};    # keep the first record

        # A missing or non-numeric length is treated as zero.
        my $length = (defined $len && $len =~ /\A-?\d+\z/) ? $len : 0;
        my $code_ok = defined $code && $is_ok_code{$code};

        # An OK status with a zero-length body counts as unavailable.
        $availability_of{$url} = ($code_ok && $length != 0) ? 1 : 0;
    }
    close $results_fh;

    $results_loaded = 1;
    return;
}