#!/usr/bin/perl
#
# Print the unique URLs found in each article, one article at a time.
#
# Input is url_id_list_complete.txt, which holds two kinds of lines:
#
#   <n>.<m> <article url>         an article heading
#   <n>.<m>.<k> <url>             a URL found inside article <n>.<m>
#
# Output is the article URL on its own line, followed by each distinct URL
# of that article on a tab-indented line, in the order first encountered.
#
# D-Lib URLs are included, except for the navigation links that appear at
# the foot of every article and are not real references:
#
#   http://www.dlib.org/author-index.html
#   http://www.dlib.org/dlib/july03/07contents.html   *** (path varies)
#   http://www.dlib.org/Hypernews/get/dlib_responses.html
#   http://www.dlib.org/back.html
#   http://www.dlib.org/title-index.html
#   http://www.dlib.org/Architext/AT-dlib2query.html
#   http://www.dlib.org/dlib.html
#   http://www.dlib.org/access.html

use strict;
use warnings;

my $input_file = "url_id_list_complete.txt";

# This URL is likely just a link from the bottom of the page, so it only
# counts as a reference when it shows up a second time in the same article.
my $doi_url = "http://www.doi.org/";

# Footer/navigation links to drop. Anchored at the start only, so anything
# beginning with one of these paths is discarded.
my $footer_url_re = qr{
    ^http://www\.dlib\.org/
    (?:
        author-index\.html
      | dlib/.+/\d\dcontents\.html
      | Hypernews/get/dlib_responses\.html
      | back\.html
      | title-index\.html
      | Architext/AT-dlib2query\.html
      | dlib\.html
      | access\.html
    )
}x;

open(my $urls_fh, '<', $input_file)
    or die("Cannot open $input_file: $!");

# Single pass over the input: remember article headings in file order and
# bucket every URL line under the "<n>.<m>" id of the article it belongs to.
my @articles;    # each entry: [ "<n>.<m>", article url ]
my %urls_for;    # "<n>.<m>" => [ urls in file order ]

while (my $line = <$urls_fh>) {
    chomp($line);

    if ($line =~ /^(\d+\.\d+) (.+)$/) {
        push(@articles, [ $1, $2 ]);
    }
    elsif ($line =~ /^(\d+\.\d+)\.\d+ (.+)$/) {
        push(@{ $urls_for{$1} }, $2);
    }
}

close($urls_fh);

foreach my $article (@articles) {
    my ($article_id, $article_url) = @{$article};

    # print article url
    print "$article_url\n";

    # Occurrence count per URL, reset for every article.
    my %times_seen;

    foreach my $url (@{ $urls_for{$article_id} || [] }) {
        next if ($url =~ $footer_url_re);

        # Ordinary URLs are printed on their first occurrence; the DOI
        # home page is printed on its second occurrence. Either way a URL
        # is printed at most once per article.
        my $print_on = ($url eq $doi_url) ? 2 : 1;

        if (++$times_seen{$url} == $print_on) {
            print "\t$url\n";
        }
    }
}