#!/usr/bin/perl
# Count path depth
#
# Reads two lists of URLs (one URL per line): the URLs that were found to be
# available and the URLs that were not.  For each list the path depth of
# every URL is computed, and a tab-separated table of URL counts per path
# depth is printed to STDOUT.

use strict;
use warnings;

my $fn1 = "available_urls_on_2005-02-27.txt";
my $fn2 = "unavailable_urls_on_2005-02-27.txt";

# Path depths that get a column in the printed table.
# NOTE: URLs deeper than the last column are still counted by
# count_path_depth(), but they are neither shown in the table nor included
# in the "total" column.
my @path_depth = (0..8);

# Run the report only when executed as a script, so that the file can also
# be loaded (require/do) to call count_path_depth() without the input files.
main() unless caller;

#####################################################################

# Read both URL lists, count the path depths and print the table.
# Dies if either input file cannot be opened.
sub main {
    my %avail_path_depth   = count_path_depth(read_lines($fn1));
    my %unavail_path_depth = count_path_depth(read_lines($fn2));

    # A depth that never occurred has no hash entry; show it as 0.
    my @gone  = map { $unavail_path_depth{$_} || 0 } @path_depth;
    my @found = map { $avail_path_depth{$_}   || 0 } @path_depth;
    my @all   = map { $gone[$_] + $found[$_] } 0 .. $#path_depth;

    # Print table
    # path depth    0       1       2
    # gone urls     75      240     411
    # found urls    805     556     884
    # total urls    880     796     1295
    print join("\t", "path depth", @path_depth, "total"), "\n";
    print_row("gone urls",  @gone);
    print_row("found urls", @found);
    print_row("total urls", @all);
    return;
}

# Return all lines of the file named $file (line endings included).
# Dies with "Can't open <file>: <reason>" if the file cannot be opened.
sub read_lines {
    my ($file) = @_;

    open(my $fh, '<', $file) or die("Can't open $file: $!");
    my @lines = <$fh>;
    close $fh;

    return @lines;
}

# Print one table row: the label, every value, then the sum of the values,
# all separated by tabs and terminated by a newline.
sub print_row {
    my ($label, @values) = @_;

    my $total = 0;
    $total += $_ foreach @values;

    print join("\t", $label, @values, $total), "\n";
    return;
}

# Count how many of the given URLs have each path depth.
#
# Arguments: a list of URL strings; a trailing newline (or other trailing
#            whitespace, e.g. "\r") on each one is ignored.
# Returns:   a hash (as a key/value list) mapping path depth => number of
#            URLs with that depth.
#
# http://foo.org and http://foo.org/ both have path depth 0, and
# http://foo.org/a/b has path depth 2.  A non-empty query string counts as
# exactly one additional path component, no matter how many "/" characters
# it contains.  Blank lines are skipped.
sub count_path_depth {
    my @urls = @_;    # copy, so the caller's strings are not modified

    my %count_for;
    foreach my $url (@urls) {
        my $path = $url;
        $path =~ s/\s+\z//;    # drop the line ending ("\n" or "\r\n")
        next if $path eq '';   # a blank line is not a URL

        # Remove the leading "http://".  Only a "//" that comes before any
        # other "/" is treated as the scheme separator, so a "//" further
        # along (e.g. a URL embedded in the query string) is left alone.
        $path =~ s{^[^/]*//}{};

        # See if there's a query string.  If there is, remove all / chars
        # from it and append it as a single path component so it will count
        # as just one path count.
        if ($path =~ m{\A ([^?]*) \? (.+) \z}xs) {
            my ($before, $query_string) = ($1, $2);
            $query_string =~ tr{/}{}d;
            $path = "$before/$query_string";
        }

        # split() drops trailing empty fields, so "foo.org/" yields a single
        # item just like "foo.org".
        my @items = split m{/}, $path;
        next unless @items;    # nothing but slashes: no host to count

        my $depth = @items - 1;    # assume http://foo.org/ is path depth 0
        $count_for{$depth}++;
    }

    return %count_for;
}