#!/usr/bin/perl -w

# Count URL availability by top-level domain name.
#
# Reads two files of URLs (one URL per line): the URLs that were found and
# the URLs that were missing.  Prints a tab-separated table with one row per
# top-level domain giving the number of missing, found and total URLs,
# followed by a TOTAL row.
#
# Top-level domains seen fewer than $min_count times across both files are
# folded into a single "other" row.  "other" also collects URLs whose host
# starts with a dotted-quad IP address and URLs whose host has no
# recognisable top-level domain.

use strict;

my $fn1 = "available_urls_on_2005-02-27.txt";
my $fn2 = "unavailable_urls_on_2005-02-27.txt";

# Domains with fewer occurrences than this are merged into "other".
my $min_count = 30;

# Occurrences of each top-level domain across BOTH input files.
my %all_domains_top_level;

# Working state shared with count_dn(): the lines of the file currently
# being processed, and the per-domain counts for that file only.
my %dn;
my @lines;

@lines = read_urls($fn1);
count_dn();
my %avail_dn = %dn;

@lines = read_urls($fn2);
%dn = ();
count_dn();
my %unavail_dn = %dn;

# Place all the lesser-used domain names into the "other" bin.  A folded
# domain is marked with -1 so that the report loop below skips it.
foreach my $top (keys %all_domains_top_level) {
    # Never fold "other" into itself: doing so would double its count and
    # then mark the bin itself as removed.
    next if $top eq "other";
    next if $all_domains_top_level{$top} >= $min_count;

    $all_domains_top_level{"other"} += $all_domains_top_level{$top};
    $all_domains_top_level{$top} = -1;
    $avail_dn{"other"}   += $avail_dn{$top}   if (defined $avail_dn{$top});
    $unavail_dn{"other"} += $unavail_dn{$top} if (defined $unavail_dn{$top});
}

# Print table, one row per remaining top-level domain, e.g.
#   top level   missing urls   found urls   total urls
#   com         75             805          880
print "top level\tmissing urls\tfound urls\ttotal urls\n";

my $total_missing = 0;
my $total_found   = 0;
my $total_urls    = 0;

foreach my $top (sort keys %all_domains_top_level) {
    next if $all_domains_top_level{$top} == -1;    # folded into "other"

    # A domain may appear in only one of the two files.
    $avail_dn{$top}   = 0 if (!defined $avail_dn{$top});
    $unavail_dn{$top} = 0 if (!defined $unavail_dn{$top});

    my $total = $avail_dn{$top} + $unavail_dn{$top};
    print "$top\t" . $unavail_dn{$top} . "\t" . $avail_dn{$top} .
        "\t$total\n";

    $total_missing += $unavail_dn{$top};
    $total_found   += $avail_dn{$top};
    $total_urls    += $total;
}

print "TOTAL\t$total_missing\t$total_found\t$total_urls\n";

#####################################################################

# read_urls($filename)
#
# Returns every line of $filename as a list (line endings still attached).
# Dies with the system error message if the file cannot be opened.
sub read_urls {
    my ($filename) = @_;

    open(my $fh, '<', $filename) or die("Can't open $filename: $!");
    my @contents = <$fh>;
    close($fh);

    return @contents;
}

# count_dn()
#
# Takes no arguments.  For every URL in the file-level @lines array,
# determines its top-level domain and increments both $dn{$tld} (counts for
# the current file) and $all_domains_top_level{$tld} (counts for all files).
# Each element of @lines is chomped in place.  Blank lines are ignored.
# Diagnostics for URLs without a recognisable top-level domain are printed
# to STDOUT and such URLs are counted under "other".
sub count_dn {
    foreach my $line (@lines) {
        chomp($line);
        next if $line !~ /\S/;    # a blank line is not a URL

        my $new_line = $line;

        # grab top level domain name for this url
        $new_line =~ s|^\w+://||;    # remove http://
        my ($domain) = split(/\//, $new_line);
        $domain = '' if (!defined $domain);    # e.g. the line was "http:///"

        # remove optional port number before looking for the domain, so that
        # "host.com.:80" still yields "com"
        $domain =~ s/:\d+$//;

        my $top = "other";

        # if ip address is used, use "other" domain name
        if ($domain !~ /^\d+\.\d+\.\d+\.\d+/) {
            # pull off the end.  Some entries contained a bad period at the end
            ($top) = $domain =~ /\.([^.]+)\.?$/;

            if (!defined $top) {
                # Host without any dot (e.g. "localhost"): report it and
                # count it as "other" rather than under an undefined key.
                print "NOT DEFINED: $line\n";
                $top = "other";
            }
        }

        # Debugging aid for a suspicious all-numeric "domain".
        if ($top eq '101') {
            print "line=$line\ndomain=$domain\n";
        }

        $dn{$top}++;
        $all_domains_top_level{$top}++;
    }
}