#!/usr/bin/perl
#
# Count the URLs listed under each article and print one line per article:
#
#     <year> TAB <link count> TAB <article>
#
# Input format (urls_by_article_complete.txt):
#   - a line that does NOT start with a tab is an article heading;
#   - each following line that starts with a tab is one URL of that article.
#
# The publication year is taken from the two digits that end the issue
# component of the article path, i.e. the "NN" in "dlib/<issue>NN/".
#
# NOTE(review): the original header said the input includes all dlib URLs
# but not the footer URLs, and that URLs used more than once in an article
# are removed.  This script itself counts every URL line it sees, so the
# de-duplication presumably happened when the input file was built --
# confirm against whatever produces that file.

use strict;
use warnings;

# Two-digit years at or above this pivot are read as 19xx, below it as 20xx.
# The previous test ("> 4") turned 05..89 into 1905..1989; results for
# 95..99 and 00..04 are unchanged by this pivot.
use constant CENTURY_PIVOT => 90;

my $fn = "urls_by_article_complete.txt";

# Return the four-digit year encoded in an article path, or nothing when
# the path has no "dlib/<issue>NN/" component.
sub article_year {
    my ($article) = @_;

    my ($yy) = $article =~ m{dlib/\w+(\d\d)/};
    return unless defined $yy;

    return ( $yy >= CENTURY_PIVOT ? '19' : '20' ) . $yy;
}

# Print the summary line for one finished article.
sub report_article {
    my ( $year, $link_count, $article ) = @_;

    print "$year\t$link_count\t$article\n";
    return;
}

open( my $urls, '<', $fn ) or die("Can't open $fn: $!\n");

my $article;           # heading currently being counted (undef before the first)
my $year;              # year string printed for $article
my $link_count = 0;    # URL lines seen so far under $article

# Single pass: every iteration consumes exactly one line, so no input shape
# (tab-only lines, URL lines before the first heading) can stall the loop.
while ( defined( my $line = <$urls> ) ) {
    chomp $line;

    if ( $line =~ /^\t./ ) {
        # URL line: tab followed by at least one character.
        if ( defined $article ) {
            $link_count++;
        }
        else {
            warn "$fn line $.: URL line before any article heading, ignored\n";
        }
    }
    elsif ( $line !~ /\S/ ) {
        # Blank or whitespace-only line: neither a heading nor a URL.
        next;
    }
    else {
        # New heading: flush the previous article, then start counting anew.
        report_article( $year, $link_count, $article ) if defined $article;

        $article    = $line;
        $link_count = 0;
        $year       = article_year($article);
        if ( !defined $year ) {
            warn "$fn line $.: no year found in '$article'\n";
            $year = 'unknown';
        }
    }
}

# Flush the last article; it has no following heading to trigger the report.
report_article( $year, $link_count, $article ) if defined $article;

close($urls);