#!/usr/local/bin/perl -w
#
# robots.pl -- parse /robots.txt file
#
# $Id: norobots.pl,v 1.1 1994/06/20 11:09:47 mak Exp mak $
#
# Martijn Koster <m.koster@nexor.co.uk>

# bugs/features: the parser is lenient: excess leading/trailing space
# is removed, the colon may be preceded by space, the User-agent lines
# need not come first in a record, and all records whose User-agent
# lines match are used, with their Disallow lines added together.
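#
# For example, these (hypothetical) lines are all accepted and parse
# the same way:
#
#   User-agent: fish
#   user-agent : fish
#      USER-AGENT:fish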

# This package can be used as a library, by simply defining
# $norobots'lib=1 before 'require'ing it. (Under Perl 5 you could test
# caller() instead, which returns false when running standalone.)
# Of course you can simply kill the "if (!defined $lib)" scope
# altogether.
#
# If you run it on its own it runs a test on hardcoded robots files,
# with the user-agent set to $ARGV[0], or 'fish' by default.
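#
# For example (the robot name is arbitrary):
#
#   perl norobots.pl WebCrawler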
#
# It's designed to be extended into a test suite, but that hasn't happened
# yet. Feel free to use any bits of this code.
#
# Thanks to murphy@scotty.dccs.upenn.edu (Linda A Murphy) for bugfixes.
#
# I'd write this completely differently for Perl5...

package norobots;

if (!defined $lib) {

    %norobots = ( # test it out on these robots.txt's:
		 'web.nexor.co.uk', './robots.txt',
		 );
    %tests = (
	      'web.nexor.co.uk', '/~mak/ /nexor /borland/index.html rfcindex?mail',
	      'freeforall.edu', '/~guppie/ /welcome.html',
	      );

    $debug=1;			# let's see what's happening

    $iam = 'fish';		# default
    $iam = shift(@ARGV) if @ARGV; # User-agent given on command line
    print "I am '$iam'\n" if ($debug);

    for $host (keys %norobots) {
	&parse($host, $norobots{$host}, $iam); # read the files
    }
    for $host (keys %tests) {	# do the tests
	print "$host:\n";
	for $url (split(' ', $tests{$host})) {
	    $rule = &denied($host, $url);
	    if ($rule) {
		print "  $url denied ($rule)\n";
	    } else {
		print "  $url ok\n";
	    }
	}
    }
} # end standalone

# library:

# Parse a /robots.txt file
# into a global %disallowed
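# (for example, after a parse $disallowed{'web.nexor.co.uk'} might hold
# "/cgi-bin\n/tmp", i.e. one disallowed path prefix per line; the
# prefixes here are made up for illustration)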
sub parse {
    local($host, $file, $useragent) = @_;
    @disallowlines =();		# rules for this record

    undef($matched);
    undef($field);
    undef($value);
    undef(@forrobots);
    undef(@default);
    undef(@disallowed);		# rules collected across matching records

    open(F, $file) || die "Cannot open $file: $!\n";
    while(<F>) {
	next if /^\s*\#/;	# ignore full comment line

	s/\#.*$//;		# nuke comments
	s/^\s+//;		# nuke leading space
	s/\s+$//;		# nuke trailing space
	if ($_ ne '') {		# got a line
	    if (!/:/) {
		warn "Invalid line '$_'\n";
		next;
	    }
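	    # split on the first colon only, so any later colons
	    # stay in the value (e.g. a path containing ':')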
	    ($field, $value) = split(/\s*:\s*/, $_, 2);

	    # match  case-insensitively
	    if ($field =~ /^User-agent$/i) {
		push(@forrobots, $value);
	    }
	    elsif ($field =~ /^Disallow$/i) {
		push(@disallowlines, $value);
	    }
	    else {
		warn "Invalid field '$field'\n";
	    }
	}
	else {			# empty, maybe end of record
	    &record;

	    undef(@forrobots);
	    undef(@disallowlines);
	}
    }
    close(F);
    # end of file, maybe of record
    &record;

    if ($matched) {
	$disallowed{$host} = join("\n", @disallowed);
    } else {
	print "I'm a '*'\n" if ($debug);
	$disallowed{$host} = join("\n", @default);
    }
    print "\nDisallowed for $host:\n  ", 
    join("\n  ", split("\n", $disallowed{$host})), 
    "\n\n" if ($debug);
}

# this isn't a proper function, it's part of &parse: it relies on
# &parse's local()s (e.g. $useragent) being visible via dynamic scoping
sub record {
    # so let's see if this record was meant for us
    return unless @forrobots;	# no User-agent lines in this record
    for(@forrobots) {
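	# note: $useragent is used as a regex pattern here, so any
	# regex metacharacters in the robot's name are special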
	if (/$useragent/i) { # yes
	    $matched++;
	    print "I'm a $_\n" if ($debug);
	    push(@disallowed, @disallowlines);
	}
	elsif ($_ eq '*') {
	    push(@default, @disallowlines);
	}
    }
}

# check if this URL is disallowed by the host;
# returns the rule on which the URL fails, or 0 if it's allowed
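# (for example, with a made-up host and rules: if '/tmp' is among the
# prefixes disallowed for some.host, &denied('some.host', '/tmp/x.html')
# returns '/tmp', while &denied('some.host', '/pub/x.html') returns 0)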
sub denied {
    local($host, $url) = @_;
    return 0 unless defined $disallowed{$host}; # nothing parsed for this host
    for $rule (split("\n", $disallowed{$host})) {
#	print "checking $url against $rule\n" if ($debug);
	if (index($url, $rule) == 0) {	# rule is a literal path prefix
	    return $rule;		# match, disallowed
	}
    }
    return 0;			# fine
}

1;
