#!/usr/bin/perl
# Create a list of unique urls from the set of all urls
# and place publication year at beginning.
# Sort by pub year (ties broken by url).
# Example:
#   1995	http://www.abc.com
#   1995	http://www.bbb.edu
#
# Input (url_id_list.txt) contains two kinds of lines:
#   N.M   article-url    an article heading; its url carries the issue as
#                        .../dlib/<month><yy>/...
#   N.M.K url            a url belonging to article N.M
# Lines matching neither form are ignored.

use strict;
use warnings;

my $input_file = 'url_id_list.txt';

# Two-digit years greater than this pivot are treated as 19xx, the rest
# as 20xx.
# NOTE(review): with a pivot of 4, an article from 2005 or later would be
# labelled 1905+ -- confirm the input never extends past 2004.
my $last_2000s_yy = 4;

open(my $urls_fh, '<', $input_file) or die($!);
my @lines = <$urls_fh>;
close($urls_fh);
chomp(@lines);

# Pass 1: map each article id ("N.M") to its four-digit publication year.
# If an id appears on more than one heading, keep the earliest year so the
# result matches checking every heading individually.
my %year_for;
foreach my $line (@lines) {
    next unless $line =~ /^(\d+\.\d+) (.+)$/;
    my $article_id = $1;

    # \w+ is greedy but backtracks, so "january05" yields yy = "05".
    my ($yy) = $line =~ m{dlib/\w+(\d\d)/};
    if (!defined $yy) {
        # Without an issue segment there is no year to attach; skip the
        # article rather than emit a bogus year.
        warn "No publication year found in article line: $line\n";
        next;
    }

    my $year = ($yy > $last_2000s_yy) ? "19$yy" : "20$yy";
    if (!defined $year_for{$article_id} || $year_for{$article_id} > $year) {
        $year_for{$article_id} = $year;
    }
}

# Pass 2: record every url under the earliest publication year of any
# article that references it.
my %seen;
foreach my $line (@lines) {
    next unless $line =~ /^(\d+\.\d+)\.\d+ (.+)$/;
    my ($article_id, $url) = ($1, $2);

    my $year = $year_for{$article_id};
    next unless defined $year;    # url whose article heading was not usable

    if (!defined $seen{$url} || $seen{$url} > $year) {
        $seen{$url} = $year;
    }
}

# Emit "year<TAB>url", ordered by year and then by url.
foreach my $url (sort { $seen{$a} <=> $seen{$b} || $a cmp $b } keys %seen) {
    print $seen{$url} . "\t$url\n";
}