#!/usr/bin/perl -w
use strict;
use warnings;
use Getopt::Std;
use File::Basename;
use WWW::Google::PageRank;

# Author: Dean Wilson
# Created on: 2004/09/08
# License: GPL
# Version: 0.2
# Homepage: http://www.unixdaemon.net/miniprojects.html#getpagerank

# This short application takes a list of URLs from the file passed in on
# the command line and then gets the page rank for each one.

my %opts;

# s = summary (headers)    u or h = show help    d = delimiter
getopts('uhsd:', \%opts);

# Use this to look up the pagerank. Only one handle is needed.
my $pr = WWW::Google::PageRank->new;

# First non-option argument left in @ARGV after getopts().
my $urlfile = shift;

# Sanity checks.
# Called with parens rather than as '&usage' so the call does not
# implicitly forward the current @_.
usage() if ($opts{'u'} || $opts{'h'});

die "Please supply a file containing URLs\n" unless $urlfile;
die "No file found at '$urlfile'!\n"         unless -e $urlfile;

# Get the URLs from the config file and clean them up.
my $urls = get_urls($urlfile);

# This is the simple header, toggled by a flag.
# Off by default due to the batch nature of the program.
if ($opts{'s'}) {
    print "Google PageRank\n\n";
    print "URL, PageRank\n";
    print "----------------\n";
}

# Field separator for the output. Tested with defined/length instead of
# plain truth so that a delimiter of "0" is honoured; an absent or empty
# delimiter falls back to a single space.
my $delim = (defined $opts{'d'} && length $opts{'d'}) ? $opts{'d'} : ' ';

foreach my $url (sort keys %$urls) {
    # If you want to customise the output, this populates the hash:
    # $urls->{$url} = $pr->get($url);

    # Scalar context, matching the original string concatenation.
    my $rank = $pr->get($url);

    # NOTE(review): the module documentation says get() returns undef when
    # the query fails; print an empty rank rather than emit an
    # "uninitialized value" warning. Confirm against the installed version.
    $rank = '' unless defined $rank;

    print $url, $delim, $rank, "\n";
}

#########################################
# subs, funcs and other util methods.
#########################################

# get_urls($urlfile)
#
# Extract the URLs from the named file, assuming one per line.
#
#   - blank lines and lines whose first non-space character is '#'
#     are skipped;
#   - only the first whitespace-delimited token of a line is used;
#   - 'http://' is prepended unless the token already starts with
#     http:// or https://.
#
# Returns a reference to a hash whose keys are the cleaned URLs (so
# duplicates collapse) and whose values are all 0.
# Dies if the file cannot be opened.
sub get_urls {
    my $urlfile = shift;
    my %urls;

    # Three-arg open with a lexical handle: the file name can never be
    # interpreted as an open mode, and no global bareword handle leaks.
    open(my $fh, '<', $urlfile)
        or die "Failed to open '$urlfile': $!";

    while (my $line = <$fh>) {
        chomp $line;

        next if $line =~ /^\s*#/;    # comment, possibly indented
        next if $line =~ /^\s*$/;    # blank

        # First run of non-space characters; a non-blank line always has one.
        my ($url) = $line =~ /(\S+)/;

        # Google doesn't play nice without a scheme. Leave https:// URLs
        # alone instead of turning them into 'http://https://...'.
        unless ($url =~ m{^https?://}i) {
            $url = 'http://' . $url;
        }

        $urls{$url} = 0;
    }

    close $fh;

    return \%urls;
}

##########################################

sub usage {
    # simple help/usage display
    my $app = basename($0);

    print<