#!/usr/bin/perl -w
# Print the unique external URLs referenced by each article.
#
# Input: url_id_list.txt, one record per line, in one of two shapes:
#   "<sec>.<art> <article url>"    article heading, e.g. "1.2 http://..."
#   "<sec>.<art>.<n> <url>"        a URL found in article <sec>.<art>
#
# Output (STDOUT): each article URL on its own line, in file order, followed
# by that article's URLs, one per line, each prefixed with a tab.  URLs that
# start with http://www.dlib.org or http://dx.doi.org/10.1045/ are skipped,
# and a URL is printed at most once per article.
use strict;
use warnings;

my $input_file = 'url_id_list.txt';

# Three-argument open with a lexical handle; report which file failed.
open(my $urls_fh, '<', $input_file)
    or die("Cannot open $input_file: $!");
chomp(my @lines = <$urls_fh>);
close($urls_fh);

my @articles;    # [ "sec.art" id, article url ] pairs, in file order
my %urls_for;    # "sec.art" id => [ kept urls, in file order ]
my %seen_for;    # "sec.art" id => { url => count }, for de-duplication

# Single pass over the input: a line is either an article heading or a URL
# record (the two patterns cannot both match the same line).  URL records
# are grouped by the id of the article they belong to, so no rescan of the
# whole file is needed per article.
foreach my $line (@lines) {
    if ($line =~ /^(\d+\.\d+) (.+)$/) {
        push(@articles, [ $1, $2 ]);
    }
    elsif ($line =~ /^(\d+\.\d+)\.\d+ (.+)$/) {
        my ($id, $url) = ($1, $2);

        # Skip the excluded URL prefixes.  Dots are escaped so that they
        # match a literal '.' only, not any character.
        next if $url =~ m{^http://www\.dlib\.org};
        next if $url =~ m{^http://dx\.doi\.org/10\.1045/};

        # Keep only the first occurrence of a URL within an article.
        next if $seen_for{$id}{$url}++;

        push(@{ $urls_for{$id} }, $url);
    }
}

# Emit the report.  An article with no kept URLs prints just its heading;
# URL records whose id has no heading line are never printed.
foreach my $article (@articles) {
    my ($id, $article_url) = @{$article};

    print "$article_url\n";

    foreach my $url (@{ $urls_for{$id} || [] }) {
        print "\t$url\n";
    }
}