This code has not been tested in the last 2 years…
Here is the code:
#!/usr/bin/perl
use Time::HiRes;
use LWP::UserAgent;
use HTML::LinkExtor;
my $start_begin = [ Time::HiRes::gettimeofday( ) ];
my $base_url = $ARGV[0];
my $virtual_host = $ARGV[1];
my $a = LWP::UserAgent->new;
$a->timeout(2);
$a->default_header( "Host" => $virtual_host );
my $start = [ Time::HiRes::gettimeofday( ) ];
my $content = $a->get($base_url);
printf("GET: [%s] | %3s | %.4fs\n",substr($base_url,0,60),$content->code, Time::HiRes::tv_interval($start) );
if ( $content->code == 200 ) {
$parser = HTML::LinkExtor->new();
$parser->parse($content->content);
@links = $parser->links;
foreach $linkarray (@links) {
my @element = @$linkarray;
my $elt_type = shift @element;
#if ( $elt_type =~ /a|img/ ) {
if ( $elt_type =~ /a/i ) {
while (@element) {
my ($attr_name , $attr_value) = splice(@element, 0, 2);
#$seen{$attr_value}++ if ( $attr_value =~ /^$base_url/ );
$seen{$attr_value}++ if ( $attr_value =~ /$virtual_host/ );
#print "[$elt_type] $attr_name $attr_value\n";
}
} # else { print "Excluded [$elt_type] $attr_name $attr_value\n"; }
}
$i=0;
for(sort keys %seen) {
$start = [ Time::HiRes::gettimeofday( ) ];
my $content = $a->get($_);
printf("%02d | [%s] | %3s | %.4fs\n",++$i,substr($_,0,60),$content->code, Time::HiRes::tv_interval($start));
}
}
printf("Total time = %.4fs\n",Time::HiRes::tv_interval($start_begin));