#!/usr/bin/perl -w

# Count the number of dc identifiers in each repository and
# group into several characteristics.
#
# Reads $url_file, looks only at tab-indented lines (the non-article urls),
# and classifies each identifier as a PURL, a Handle and/or a DOI.  The
# checks are independent, so one identifier may be counted in more than one
# category.  Every classified identifier is written to the matching output
# file as "<id>\t<status>", where <status> is the value returned by
# found_url().  A header line and the three totals go to STDOUT.

use strict;
use warnings;

my $last_result = "http_responses/2005-02-27result.txt";
my $url_file    = "urls_by_article_complete.txt";

my $purl_ids   = "all_purl_ids.txt";
my $doi_ids    = "all_doi_ids.txt";
my $handle_ids = "all_handle_ids.txt";

# Lookup table built from $last_result the first time found_url() is called,
# so the result file is read once instead of once per identifier.
my %status_of;
my $results_loaded = 0;

open(my $purl_out, '>', $purl_ids)
    or die("Can't open $purl_ids: $!");
open(my $doi_out, '>', $doi_ids)
    or die("Can't open $doi_ids: $!");
open(my $handle_out, '>', $handle_ids)
    or die("Can't open $handle_ids: $!");

print "PURLs\tHandles\tDOIs\n";

my $purl_count   = 0;
my $handle_count = 0;
my $doi_count    = 0;

open(my $url_in, '<', $url_file)
    or die("Can't open $url_file: $!");

while (my $line = <$url_in>) {
    chomp($line);

    # Only look at non-article urls.  A line holding nothing but the tab
    # carries no identifier and is skipped.
    next unless $line =~ /^\t(.+)/;
    my $id = $1;

    if ($id =~ m{^http://purl\..+/.+} || $id =~ m{^http://bibpurl\.}) {
        $purl_count++;

        # See if url was found at last check
        print {$purl_out} "$id\t" . found_url($id) . "\n";
    }

    if ($id =~ m{^http://hdl\.handle\.net/}) {
        $handle_count++;
        print {$handle_out} "$id\t" . found_url($id) . "\n";
    }

    # DOIs have a prefix and suffix.  Example: 10.1000.10/123456
    # Prefix must start with 10, prefix can have any number of ##.##,
    # suffix can have any number and type of chars
    if ($id =~ m{/10(?:\.[0-9]+)+/.+}) {
        $doi_count++;
        print {$doi_out} "$id\t" . found_url($id) . "\n";
    }
}

close($url_in);

print "$purl_count\t$handle_count\t$doi_count\n";

# Buffered write errors only surface at close, so check the output handles.
close($handle_out) or die("Can't close $handle_ids: $!");
close($purl_out)   or die("Can't close $purl_ids: $!");
close($doi_out)    or die("Can't close $doi_ids: $!");

# found_url($url)
#
# Report how $url fared according to $last_result.
#
# Returns:
#    1  the url is listed and its entry counts as found
#    0  the url is listed but its entry does not count as found
#   -1  the url is not listed at all
#
# Dies if $last_result cannot be opened.  The file is read on the first call
# only; later calls are answered from %status_of.
sub found_url {
    my $url_find = shift;

    load_last_results() unless $results_loaded;

    return exists $status_of{$url_find} ? $status_of{$url_find} : -1;
}

# Fill %status_of from $last_result.  Each line is split on single spaces
# into url, length and code.  When a url is listed more than once, the first
# entry wins.
sub load_last_results {
    open(my $in, '<', $last_result)
        or die("Can't open $last_result: $!\n");

    while (my $line = <$in>) {
        chomp($line);
        my ($url, $len, $code) = split(/ /, $line);

        next unless defined $url;           # blank line
        next if exists $status_of{$url};    # keep the first entry

        $status_of{$url} = response_ok($code, $len);
    }

    close($in);
    $results_loaded = 1;
    return;
}

# Decide whether one result entry counts as found: return 1 when the code is
# 200, 301, 302 or 304 and the length is not zero, otherwise 0.
# (Presumably code is the HTTP status and len the body length -- the result
# file format is not defined in this script.)
sub response_ok {
    my ($code, $len) = @_;

    # A truncated line has no code and/or length; treat the missing field
    # as 0, which yields "not found" without uninitialized-value warnings.
    $code = 0 unless defined $code;
    $len  = 0 unless defined $len;

    # 301, 302 and 304 are counted the same as 200.
    $code = 200 if ($code == 304 || $code == 301 || $code == 302);

    # A 200 with zero length does not count.
    return 0 if ($code == 200 && $len == 0);

    return $code eq '200' ? 1 : 0;
}