#!/usr/bin/perl -w # Dedicated to Matt Westerman # A good man and a good friend, may he rest in peace. # Send questions, comments, suggestions, bugs, or fixes to # teTsu at plasmacow dot com # This program provided as is with no warrenty # suggested or implied. It is published under the same # Artistic License as Perl itself. If you have a perl # interpreter running, you should have a copy, otherwise # head to http://www.perl.com/language/misc/Artistic.html # to get one. # Changelog from .5 to .5.25 # fixed the problem with HEAD HTTP headers, instead we nopw do a # GET with a max size of 1 byte :) # TODO's # -- Be a little more specific as to why a link is bad such as the error # code, and perhaps the corresponding error message for the http code # anything else one can think of, let me know! # teTsu at plasmacow dot com (replace word with symbol of course ;~) # standard Perl moduiles usually distributed with a current # version of Perl, and/or well known distos of Linux, otherwise you may # want to become familiar with CPAN. use Getopt::Std; # Command line argument processing use URI; # URI manipulation use LWP::UserAgent; # HTTP client use HTTP::Status; # readable HTTP Status messages use HTTP::Headers; # Class for constructing HTTP headers use HTML::LinkExtor; # extract links from HTML use strict; my $VERSION = '0.5.25'; my $basename = $0; $basename =~ s/.*[\/\\](.+)$/$1/; # strip the path info if present # argument processing here my %args; getopts('hVvt:f:r:u:', \%args); usage() if($args{'h'}); usage(22, 'You must specify a host to be checked!') unless($args{'r'}); if($args{'V'}) { print "$basename version $VERSION\n"; exit 0; } my $email = $args{'f'} || 'root@localhost'; my $timeout = $args{'t'} || 15; my $verbose = $args{'v'} || 0; my $agent = $args{'u'} || $basename . '/' . $VERSION; my $root = new Link($args{'r'}); my $rooturl = $args{'r'}; my @pending = ( $root ); # waiting queue my (@good, @nogood); # lists of good and bad links # Our user agent my $ua = new LWP::UserAgent( timeout => $timeout, keep_alive => 1, protocols_allowed => ['http', 'https'], agent => $agent, from => $email ); print "$basename version $VERSION scannning ", $root->uri, "\n\n"; while($pending[0]) { $ua->max_size(1); my $res = $ua->get($pending[0]->uri); if($res->is_success && $pending[0]->uri =~ m/$rooturl/io && $res->content_type eq 'text/html') { # reset the max_size to unlimited $ua->max_size(undef); print "GET ", $pending[0]->uri, "\n" if($verbose > 0); my $p = new HTML::LinkExtor(\&set_links); $res=$ua->request(new HTTP::Request(GET=>$pending[0]->uri), sub { $p->parse($_[0]) }, 4096 ); $pending[0]->code($res->code); if($res->is_success) { push @good, shift(@pending); } else { push @nogood, shift(@pending); } } elsif($res->is_success) { # link isn't html or is on another server print "TEST ", $pending[0]->uri, " Successful\n" if($verbose > 0); $pending[0]->code($res->code); push @good, shift(@pending); } else { # link was no good print "TEST ", $pending[0]->uri, " UNsuccessful\n" if($verbose > 0); $pending[0]->code($res->code); push @nogood, shift(@pending); } } if($verbose > 0) { print "\nGood links checked:\n"; foreach (@good) { print $_->uri, "\n"; } } print "Bad links:\n"; foreach (@nogood) { print $_->uri, "\trefered by:\n"; foreach my $ref ($_->referer) { print "\t", $$ref, "\n"; } } # End of Main # Callback function used to extract links found on the page, see if they have # already been checked, and if not, queue them to be scanned in the pending # array sub set_links { my($tag, %links) = @_; foreach (values %links) { next if m/mailto/i; # skip any mailto links next if /^javascript/i; # quick hack to avoid javascript links s/(.*)#\w+$/$1/; # remove any anchors in URL my $link = new Link($_, $pending[0]->uri); my $slot = exists_in($link, \@pending, \@good, \@nogood); if($slot) { $$slot->referer($pending[0]->uri); } else { push @pending, $link; } } } # if the Link exists in one of the arrays supplies, return a reference to it, # otherwise return false sub exists_in { my $chk = shift; while(my $aref = shift) { foreach my $link (@$aref) { return \$link if($chk->uri eq $link->uri) } } return undef; } # I wonder what this does??? sub usage { print "$_[1]\n" if(defined($_[1])); # custom message print << "USAGE"; usage: $basename -r [root URL] r : (required) Starting host URL, FQDN only! Optional arguments h : This usage statement f : The person's email responsible for this user agent default: root\@localhost t : Timeout, time (in seconds) to wait for slow or dead hosts u : User agent's name, defualt: $basename/$VERSION v : Increase verbosity V : Print version number and exit example: $basename -r http://www.example.com/ -t 10 -f me\@example.com USAGE exit (defined($_[0])) ? $_[0] : 0; # did we specify an exit code? } # Link Class, takes care of certain redundent link management routines, # inherits the URI class via the simple HASA relationship. package Link; sub _new_uri # private funtion for setting a URI via { # two different URI constuctors methods my ($uri, $referer) = @_; $uri = ($referer) ? new_abs URI($uri, $referer) : new URI($uri); $uri->scheme('http') unless($uri->scheme); $uri->path('/') unless($uri->path); return $uri; } # contructor, MUST supply a uri! sub new { my($type, $uri, $referer) = @_; my $self = {}; die "No URI specified, exiting" unless($uri); $self->{URI} = _new_uri($uri, $referer); if($referer) { push(@{$self->{REFERERS}}, $referer); } else { $self->{REFERERS} = [ ]; # empty anonymouse array } bless $self, $type; } # HASA inheritance, i.e Link HASA URI sub uri { my($self, $uri, $referer) = @_; # if we would like to reset the uri $self->{URI} = _new_uri($uri, $referer) if($uri); return $self->{URI}; } # return the list of referers and/or add one to the list, # these are references to existing `Link' object, no instantiation sub referer { my($self, $referer) = @_; my $check = 0; if($referer) { push(@{$self->{REFERERS}}, $referer); } return ($self->{REFERERS}) ? @{$self->{REFERERS}} : undef; } # report and/or set the protocol response code to our request sub code { my($self, $newcode) = @_; $self->{CODE} = $newcode if($newcode); return ($self->{CODE}) ? $self->{CODE} : undef; } 0;