#!/usr/bin/perl
# ************************************************************************** #
# robots.prl written by Chris Evans (C.Evans@sghms.ac.uk)                    #
# program generates a robot.txt file containing the necessary text to warn   #
# robots who respect the protocol requiring them to read this file, off the  #
# files that you specify. Specifications can be as global filename masks,    #
# as exclusions from particular directories or as exclusions from            #
# directories and all the subdirectories below them                          #
#                                                                            #
# The program takes as input in the variables declared immediately below     #
#     $filemask -- file mask (appropriate to the OS) for the find command    #
#     $outputdir -- the directory into which to put these files              #
#     $menu -- filename for the menu to all other reports                    #
#     $report -- filename for the directory order output                     #
#     $time_rep -- filename for the modification time sorted output          #
#     $other_rep -- filename for the files other than *.htm?                 #
#     $startdir -- directory from which to start the recursive search        #
#     $www_offset -- any additional directory reference that accounts for    #
#         a possible difference between the root used by unix for the        #
#         search versus that shown to the world by the HTTP server in use    #
#     This last variable may seem odd but on our local machine the root      #
#     shown to the outside world by the server is at:                        #
#         /usr1/www/pages/                                                   #
#     but my pages (from which I want the search for *.htm files to          #
#     start is at:                                                           #
#         /usr1/www/pages/mhs/psychotherapy/                                 #
#     giving the value for $www_offset of /usr1/www/pages                    #
#                                                                            #
# file written by Chris Evans (C.Evans@sghms.ac.uk) 20.iii.96                #
# copyright Chris Evans _BUT_ feel free to distribute this subject to the    #
# following requirements:                                                    #
#     1) you do not make a profit on the distribution                        #
#     2) you retain this header information in full                          #
# I would also very much appreciate feedback (I know the programming is      #
# awful, I've not written much Perl and I'm a psychodynamic psychotherapyist #
# with no intention of giving up my day job so no flames if                  #
# you can resist the temptation) and a copy of enhanced versions if you do   #
# hack this into something better                                            #
#                                                                            #
# Chris Evans (C.Evans@sghms.ac.uk) Section of Psychotherapy,                #
# St. George's Hospital Medical School, Cranmer Terrace,                     #
# London, SW17 0RE, Britain                                                  #
# ************************************************************************** #
#
# NOTE(review): the header above is retained verbatim as its licence asks.
# Its variable list ($filemask, $outputdir, $menu, $report, $time_rep,
# $other_rep) does not match this script, which only uses the variables
# declared below -- presumably the header was carried over from a sibling
# script; confirm with the author before relying on it.
#
# Run this script from the directory that corresponds to $startdir: every
# search below is relative to the current directory (".").

use strict;
use warnings;
use File::Copy qw(copy);

# The Robots Exclusion Protocol only ever looks for "/robots.txt", so the
# output MUST carry exactly this name or crawlers will never read it.
my $robotfil = "robots.txt";
my $backname = "robot.bak";
my $hostname = "http://www.sghms.ac.uk/";   # hostname (will be inserted in file)
my $startdir = "/mhs/psychotherapy";        # whence all searches start
my $www_offset = "/usr1/www/pages";         # offset between the Unix host root
                                            # and the server root
my $finalfil = $www_offset . "/" . $robotfil;

# exclusions by file names, uses Unix wildcards
my @target = ("PUSAGE.HTM", "P19*.HTM", "ERRORS.HTM", "psylog.log",
              "mylog.log", "errlog.log", "P19*.TXT", "TOTAL.TXT",
              "TOTAL.HTM", "*.bak", ".glimpse*");

# now the directories you are going to exclude in their entireties
my @dirs = ("./dir", "./wwwstats", "./todo", "./keyw");

# and those where you're also excluding any of their subdirectories
# the relative form of the reference is important
my @rdirs = ("./u", "./todo", "./excite");

# *************** O.K. that's the end of the user input ******************* #

# Paths already written to the output, so that no Disallow line is repeated
# even when a file is matched by more than one rule.
my %out;

# keep the previous output (if any) as a backup
if (-e $robotfil) {
    rename($robotfil, $backname)
        or die "Couldn't rename $robotfil as $backname: $!\n";
}

open(my $robot, '>', $robotfil)
    or die "Couldn't open new $robotfil: $!\n";

# *** the first lines are fixed by protocol
print {$robot} "# robots.txt for $hostname\n\n";
print {$robot} "User-agent: *\n";   # at present I am doing this for ALL robots

# now run through all subdirectories looking for the files by names
foreach my $target (@target) {
    print "excluding files of form: $target\n";
    # List-form pipe open: no shell is involved, so the wildcard reaches
    # find(1) untouched and nothing in the mask can be misread as shell syntax.
    open(my $find, '-|', 'find', '.', '-name', $target, '-print')
        or die "Couldn't run find: $!\n";
    while (my $filename = <$find>) {
        chomp($filename);
        disallow($robot, server_path($filename));
    }
    # find exits non-zero for e.g. unreadable directories; report, don't abort
    close($find) or warn "find for $target did not finish cleanly\n";
}

# now run through the directories for exclusion
# (but where their subdirectories are left accessible
# to robots)
foreach my $dir (@dirs) {
    print "excluding files in directory: $dir\n";
    my $dh;
    unless (opendir($dh, $dir)) {
        warn "Couldn't read directory $dir: $!\n";
        next;
    }
    # Only the directory's own non-hidden, non-directory entries are listed.
    # A Disallow line on a subdirectory name would block that whole subtree
    # (robots match by path prefix), which this section must not do.
    my @files = sort grep { !/^\./ && !-d "$dir/$_" } readdir($dh);
    closedir($dh);

    my $ndir = $dir;
    $ndir =~ s/^\.//;
    foreach my $filename (@files) {
        print "$filename $ndir\n";
        disallow($robot, $startdir . $ndir . "/" . $filename);
    }
}

# now run through directories to exclude with all their subdirectories
foreach my $rdir (@rdirs) {
    print "excluding files in directory: $rdir (and all subdirectories off this)\n";
    # -type f picks up every file, including those with no "." in the name.
    open(my $find, '-|', 'find', $rdir, '-type', 'f', '-print')
        or die "Couldn't run find: $!\n";
    while (my $filename = <$find>) {
        chomp($filename);
        disallow($robot, server_path($filename));
    }
    close($find) or warn "find in $rdir did not finish cleanly\n";
}

# buffered write errors only surface at close, so check it
close($robot) or die "Couldn't finish writing $robotfil: $!\n";

# publish the finished file at the server root
copy($robotfil, $finalfil)
    or die "Couldn't copy $robotfil to $finalfil: $!\n";

# Convert a path relative to the current directory ("./a/b.htm") into the
# form the HTTP server shows, by replacing the leading "." with $startdir.
# Paths that do not start with "." are returned unchanged.
sub server_path {
    my ($path) = @_;
    (my $href = $path) =~ s|^\.|$startdir|;
    return $href;
}

# Write one "Disallow:" line for $href to the handle $fh, unless that path
# has already been written during this run (tracked in %out).
sub disallow {
    my ($fh, $href) = @_;
    return if $out{$href};
    $out{$href} = 1;
    print {$fh} "Disallow: $href\n";
    return;
}