#!/usr/bin/perl
# Print out urls based on publication year.
#
# Input ("urls_by_article.txt"): article lines are not indented; every
# tab-indented line that follows an article line is one of that article's urls.
# Output (STDOUT): one "<year>\t<url>" line per kept url, ordered by year and,
# within a year, in the order the urls appear in the input.
use strict;
use warnings;

# Two-digit years at or above this value belong to the 1900s, everything
# below it to the 2000s.
# NOTE(review): assumes the journal's first issues are from 1995 -- confirm.
use constant FIRST_TWO_DIGIT_YEAR => 95;

# Run the report only when executed as a script, so that the helper subs can
# be loaded on their own without touching the input file.
main() unless caller;

# Read the input file and print its non-dlib urls per publication year.
# Dies if the input file cannot be opened.
sub main {
    my $fn = "urls_by_article.txt";

    open(my $urls_fh, '<', $fn) or die("Can't open $fn: $!\n");
    my $urls_of = collect_urls_by_year($urls_fh);
    close $urls_fh;

    # Print urls per publication year
    foreach my $year (sort { $a <=> $b } keys %{$urls_of}) {
        foreach my $url (@{ $urls_of->{$year} }) {
            print "$year\t$url\n";
        }
    }
    return;
}

# Extract the four-digit publication year from an article line.
#   $article - article line containing a "dlib/<month><yy>/" path component
# Returns the year as a string (e.g. "1995"), or nothing when the line has no
# such component.
sub publication_year {
    my ($article) = @_;

    my ($two_digits) = $article =~ m{dlib/\w+(\d\d)/};
    return if !defined $two_digits;

    my $century = $two_digits >= FIRST_TWO_DIGIT_YEAR ? '19' : '20';
    return $century . $two_digits;
}

# True when $url points back at dlib itself (site url or one of its DOIs).
sub is_dlib_url {
    my ($url) = @_;

    return 1 if $url =~ m{^http://www\.dlib\.org};
    return 1 if $url =~ m{http://dx\.doi\.org/10\.1045/};
    return 0;
}

# Group the urls read from an open filehandle by article publication year.
#   $fh - readable filehandle positioned at the start of the article/url list
# Returns a hash ref mapping year => array ref of urls in input order.
# dlib urls are ignored.  Urls that precede the first article line, or that
# belong to an article without a recognisable year, are skipped.
sub collect_urls_by_year {
    my ($fh) = @_;

    my %urls_of;
    my $year;    # year of the article the current urls belong to

    # Every iteration reads a fresh line, so skipping a line with "next" can
    # never re-test the same input again.
    LINE:
    while (my $line = <$fh>) {
        chomp $line;

        # Not indented: a new article starts here.
        if ($line !~ /^\t/) {
            $year = publication_year($line);
            warn "No publication year found in article line: $line\n"
                if !defined $year && $line =~ /\S/;
            next LINE;
        }

        next LINE if !defined $year;            # url without a usable article
        next LINE if $line !~ /^\t(.+)$/;       # tab with nothing after it
        my $url = $1;

        # ignore dlib urls
        next LINE if is_dlib_url($url);

        push @{ $urls_of{$year} }, $url;
    }

    return \%urls_of;
}