#!/usr/bin/perl -w
# Count availability based on file extension
#
# Reads two lists of URLs (one per line): the URLs that were available and
# the URLs that were unavailable.  Every URL is classified by its file
# extension and a tab-separated table of accessible / inaccessible / total
# counts per type is printed.  Types seen fewer than $MIN_TOTAL times overall
# are folded into a single "other" column.

use strict;
use warnings;

my $fn1 = "available_urls_on_2005-02-27.txt";
my $fn2 = "unavailable_urls_on_2005-02-27.txt";

# A type needs at least this many URLs (available + unavailable) to get its
# own column in the table.
my $MIN_TOTAL = 30;

# File-level state shared with count_types():
#   @lines     - the URL list currently being counted
#   %types     - per-type counts for the current list
#   %all_types - per-type counts accumulated over both lists
my %all_types;
my %types;
my @lines;

@lines = read_lines($fn1);
count_types();
my %avail_types = %types;

@lines = read_lines($fn2);
%types = ();
count_types();
my %unavail_types = %types;

# Print table
# file ext    html/htm   / others   pdf etc.
# gone urls   75         240        411
# found urls  805        556        884
# total urls  880        796        1295

# Get rid of unpopular file extensions
foreach my $type (sort keys %all_types) {
    $avail_types{$type}   = 0 if (!defined $avail_types{$type});
    $unavail_types{$type} = 0 if (!defined $unavail_types{$type});

    # "other" is the bucket the rare types are folded into.  If a URL really
    # ended in ".other", folding that key into itself would double its counts,
    # so it is never folded.
    next if ($type eq 'other');

    my $total = $avail_types{$type} + $unavail_types{$type};
    if ($total < $MIN_TOTAL) {
        $all_types{$type} = -1;    # -1 marks a type as dropped from the table
        $all_types{other} = 1;     # make sure the "other" column is shown
        $avail_types{other}   += $avail_types{$type};
        $unavail_types{other} += $unavail_types{$type};
    }
}

# Column order: every type that was not marked as dropped, alphabetically.
my @types;
foreach my $type (sort keys %all_types) {
    push(@types, $type) if ($all_types{$type} != -1);
}

print "\t";
foreach my $type (@types) {
    print "$type\t";
}
print "\n";

print "Accessible\t";
foreach my $type (@types) {
    $avail_types{$type} = 0 if (!defined $avail_types{$type});
    print $avail_types{$type} . "\t";
}

print "\nInaccessible\t";
foreach my $type (@types) {
    $unavail_types{$type} = 0 if (!defined $unavail_types{$type});
    print $unavail_types{$type} . "\t";
}

print "\nTotal URLs\t";
foreach my $type (@types) {
    my $total = $avail_types{$type} + $unavail_types{$type};
    print "$total\t";
}
print "\n";

#####################################################################

# Return all lines of $file as a list (line endings still attached).
# Dies with the OS error message if the file cannot be opened.
sub read_lines {
    my ($file) = @_;

    open(my $fh, '<', $file) or die("Can't open $file: $!");
    my @content = <$fh>;
    close($fh);

    return @content;
}

# Map a URL to the type label it is counted under:
#   'NONE'  - the URL has a query string ("?" followed by at least one
#             character), or none of the other rules match
#   <ext>   - the lower-cased run of word characters after the last "." at
#             the end of the URL; "htm" is reported as "html"
#   'SLASH' - the URL ends in "/"
# NOTE(review): the extension pattern is applied to the whole URL, so a URL
# without a path such as "http://example.com" is labelled "com" -- confirm
# that this is intended.
sub classify_url {
    my ($url) = @_;

    # see if url has a query string
    return 'NONE' if ($url =~ m|\?.+|);

    if ($url =~ m|\.(\w+)$|) {
        my $ext = lc $1;
        $ext = 'html' if ($ext eq 'htm');    # put these two together
        return $ext;
    }

    return 'SLASH' if ($url =~ m|/$|);
    return 'NONE';
}

# Classify every URL in the file-level @lines and bump the matching counters
# in %types and %all_types.  Takes no arguments and returns nothing useful.
# Each line is chomped in place, and "<type>\t<url>" is echoed to STDOUT for
# every URL processed.
sub count_types {
    foreach my $line (@lines) {
        chomp($line);
        my $url = $line;
        my $ext = classify_url($url);

        print "$ext\t$url\n";

        $types{$ext}++;
        $all_types{$ext}++;
    }
}