#!/usr/local/bin/perl -w

# XLinks.pl
# Complete documentation at http://www.sfu.ca/~ajdelore/XLinks/
# Anthony DeLorenzo <ajdelore@cpan.org>

use strict;

my $VERSION = "1.2";

#### USER CONFIGURATION OPTIONS ####

# Don't forget to change the path to perl on the first line.

# The base directory of the site, without trailing slash
my $base = "/home/ajdelore/pub_html";

# The name of the file to start at, in the base directory.
my $startfile = "index.html";

# The name of your default index file for directories
my $indexfile = "index.html";

# Valid file extensions for XHTML documents -- others will not be 
# crawled or parsed.
my @extensions = ('html');

# Should the program print output to screen(0) or file (1);
# Include a name and path for a logfile, if set to 1. 
# Do not comment out if not using.
my $output_to_file = 0;
my $logfile = "";
 
# Should program output be: 
#      0: minimal (print failed links only) 
#      1: verbose (print all links checked)
my $verbose = 1;

#### END CONFIGURATION ####

use URI::file;
use XML::XPath;
use XML::XPath::XMLParser;
use LWP::UserAgent;

if ($output_to_file) {
  open (LOGFILE,">$logfile") or die "Couldn't open logfile";
  select LOGFILE;
}

chdir ($base) or die "Couldn't access directory $base";
$startfile = URI::file->new_abs($startfile,$base);

my (@files, @files_checked, %uris_ok, %uris_failed);
my $uris_checked = 0;

push @files, ($startfile);
push @files_checked, ($startfile);

my $ua = LWP::UserAgent->new;

PARSE: while ( scalar @files > 0 ) {
  my $file_uri = URI->new(pop @files);
  my @path_segments = $file_uri->path_segments; 
  my $filename = pop @path_segments;
  my $ext = (split /\./,$filename)[1];
  next PARSE unless ( grep { $_ eq $ext } @extensions );
  chdir (join('/', @path_segments));
  my $base_uri = URI::file->cwd;
  print "Trying to parse $file_uri\n";
  my $parser = XML::XPath->new(filename => $filename);

  my $nofollow = 0;
  my $path = ("/html/head/meta[\@name='XLinks']");
  foreach my $meta ( ($parser->find($path))->get_nodelist) {
    my $content = lc $meta->getAttribute('content');
    if ($content eq 'nocheck') {
      print "  Found meta 'nocheck' directive. Ignoring file.\n\n"; 
      next PARSE; 
    }
    elsif ($content eq 'nofollow') {
      print "  Found meta 'nofollow' directive.\n";
      $nofollow = 1;
    }
  }

  foreach my $path ('//a','//link','//img') {
    CHECK: foreach my $node ( ($parser->find($path))->get_nodelist) {
      next CHECK if $node->getAttribute('check') eq 'no'; 
      unless ( $path eq '//img' ) {
        my $href = $node->getAttribute('href');
        next if $href =~ /\#[\w\d]+$/;
        my $uri = URI->new_abs($href, $base_uri);
        next if $uri->scheme eq 'mailto';
        if ( $uri->scheme eq 'file' ) {
          if ( $uri =~ /\/$/ ) { $uri .= $indexfile }
          if (check_uri($uri)) {
            $nofollow = 1 if $node->getAttribute('check') eq 'nofollow';
            next CHECK if $nofollow;
            foreach (@files_checked) { next CHECK if URI::eq($_, $uri) }
            push @files, ($uri);
            push @files_checked, ($uri);
          }
        }
        else { check_uri($uri) }
      }
      else {
        my $src = $node->findvalue('@src');
        my $uri = URI->new_abs($src, $base_uri);
        check_uri($uri);
      }
    }
  }
  print "\n";
}

print "\nSUMMARY\n";
print "Pages Checked: ", scalar(@files_checked), "\n";
print "Links Checked: ", $uris_checked, "\n";
print "Unique Links Checked: ", 
      scalar(keys(%uris_ok)) + scalar(keys(%uris_failed)), 
      "\n";
print "Unique Links Pass/Fail: ",
      scalar(keys(%uris_ok)),
      " / ",
      scalar(keys(%uris_failed)),
      "\n\n";

sub check_uri {
  $uris_checked++;
  my $uri = shift;
  if ( defined $uris_ok{$uri} ) {
    print "  Valid ($uris_ok{$uri}) $uri\n" if $verbose;
    return 1;
  }

  elsif ( defined $uris_failed{$uri} ) {
    print "  Failed ($uris_failed{$uri}) $uri\n" if $verbose;
    return 0;
  }
    
  else {
    my $req = HTTP::Request-> new ('HEAD',$uri);
    my $res = $ua->request($req);

    if ( $res->is_success ) {
      $uris_ok{$uri} = $res->code;
      print "  Valid ($uris_ok{$uri}) $uri\n" if $verbose;
      return 1;
    }

    else {
      $uris_failed{$uri} = $res->code;
      print "  Failed ($uris_failed{$uri}) $uri\n" if $verbose;
      return 0;
    }
  }
}


