#!/usr/bin/perl # ************************************************************************** # # words.prl written by Chris Evans (C.Evans@sghms.ac.uk) # # program collects all the words in the parts of HTML files # # below a cerain directory # # # # The program takes as input in the variables declared immediately below # # $filemask -- file mask (appropriate to the OS) for the find command # # $outputdir -- the directory into which to put these files # # $menu -- filename for the menu to all other reports # # $report -- filename for the directory order output # # $time_rep -- filename for the modification time sorted output # # $other_rep -- filename for the files other than *.htm? # # $startdir -- directory from which to start the recursive search # # $www_offset -- any additional directory reference that accounts for # # a possible difference between the root used by unix for the # # search versus that shown to the world by the HTTP server in use # # This last variable may seem odd but on our local machine the root # # shown to the outside world by the server is at: # # /usr1/www/pages/ # # but my pages (from which I want the search for *.htm files to # # start is at: # # /usr1/www/pages/mhs/psychotherapy/ # # giving the value for $www_offset of /usr1/www/pages # # # # file written by Chris Evans (C.Evans@sghms.ac.uk) 20.iii.96 # # copyright Chris Evans _BUT_ feel free to distribute this subject to the # # following requirements: # # 1) you do not make a profit on the distribution # # 2) you retain this header information in full # # I would also very much appreciate feedback (I know the programming is # # awful, it's the first piece of PERL I've written and I'm a psychodynamic # # psychotherapist with no intention of giving up my day job so no flames if # # you can resist the temptation) and a copy of enhanced versions if you do # # hack this into something better # # # # Chris Evans (C.Evans@sghms.ac.uk) Section of Psychotherapy, # # St. George's Hospital Medical School, Cranmer Terrace, # # London, SW17 0RE, Britain # # ************************************************************************** # $outputdir = "/usr1/www/pages/mhs/psychotherapy/dir/"; $menu = "menu.htm"; $report = "htm_dir.htm"; $time_rep = "time_dir.dat"; $startdir = "/mhs/psychotherapy"; $www_offset = "/usr1/www/pages"; @filemask = ("*.htm"); # sort out the location that the server will show for files $http_rep_dir = $outputdir; $http_rep_dir =~ s|$www_offset||o; # o switch as the value of $www_offset is fixed $head = < Index of text words in Psychotherapy Section pages: $letter

Use [find] button on your browser to find word of interest to you

END $btm = <
File created using Perl script: words.prl by Chris Evans C.Evans@sghms.ac.uk END # **** that's the end of user input # open listing of words to ignore open(NOT,"words.lst") || die "Cannot open words.lst"; while () { ($out, $word, $n) = split; if ($out eq "b") { $not{$word} = 1; } } # program will exclude itself from all files I've marked in robot.txt open(ROBOT,"/usr1/www/pages/mhs/psychotherapy/robot.txt") || die "Cannot open robot.txt\n"; print "Reading files to ignore from robot.txt\n"; while () { if (/Disallow: (.*)/) { $file = $1; $file =~ s/$startdir/./; $norobfile{$file} = 1; } } close(ROBOT); open(PROB,">wordprob.lst") || die "Problem creating file wordprob.lst\n"; print "Finding files...\n"; $filenum = 0; # run through all subdirectories looking for files foreach $mask (@filemask) { print "Processing files of type: $mask\n\n"; open(FIND, "find . -name \"$mask\" -print |") || die "Couldn't run find: $!\n"; while ($filename = ) { chop($filename); if ($norobfile{$filename}) { print "skipping file = $filename\n"; # don't process the file if it's excluded next; } print "\nProcessing file $filename...\n"; $filenum++; $href = $filename; $href =~ s|.|$startdir|o; $href{$filenum} = $href; $linenum = 0; open (IN, "$filename") || die "Couldn't open $filename\n"; $body = 0; while () { $linenum++; $body = 1 if (//i); $body = 0 if (/<\/BODY>/i); if ($body) { # if line ends with a hyphen, concatenate with next if (s/-\s*\n$//) { $start = $_; $_ = ; $linenum++; $_ = $start.$_; } s/<.*>//g; # strip closed HTML tags if (/; $linenum++; $_ = $start.$_; # if line ends with a hyphen, concatenate with next if (s/-\s*\n$//) { $start = $_; $_ = ; $linenum++; $_ = $start.$_; } s/<.*>//g; # strip closed HTML tags if (/ 1) ) { # if result has non-numeric, non-space contents .. $word_n{$word}++; # count it $filenum{$word} .= $filenum." "; } } } } } close (IN); } } open(OUT,">words.dat") || die "couldn't open words.dat\n"; foreach $word (sort byfreq keys(%word_n)) { if ($not{$word}) { $tag = "b "; } else { $tag = "n "; } printf OUT "%2s %30s %-s\n",$tag, $word, $word_n{$word}; } print "\n\n\n\nNOW creating the index itself\n"; $letter = "a"; print "$letter\n"; $htm = "htm"; $first = 1; open(LET,">dir/$letter.$htm") || die "Can't open dir/$letter.$htm\n"; print LET "$head"; foreach $word (sort bylwrcase keys %filenum) { $w_let = substr($word,0,1); $w_let =~ tr/A-Z/a-z/; if (($w_let =~ /[a-z]/) && (!$not{$word})) { # only handle if word starts with a letter and is not a skip word if ($w_let ne $letter) { while ($w_let ne $letter) { $letter++; print "$letter\n"; $first = 1; } print LET $btm; close(LET); open(LET,">dir/$letter.$htm") || die "Can't open dir/$letter.$htm\n"; print LET "$head"; } if ($first) { print LET "
    \n
  • $word\n
      \n"; $first = 0; } else { print LET "
    \n
  • $word\n
      \n"; } undef @files; undef %n_refs; @files = split(/ /,$filenum{$word}); foreach $file_n (@files) { $n_refs{$file_n}++; } foreach $filenum (keys %n_refs) { print LET "
    • $href{$filenum}"; if ($n_refs{$filenum} == 1) { print LET " (occurs once)\n"; } else { print LET " (occurs $n_refs{$filenum} times)\n"; } } } } #open(NOT,">dir/dropword.dat") || die "couldn't open dir/dropword.dat in write mode\n"; foreach $word (keys %notword) { $notword{$word} = $word_n{$word}; print "$word $word_n{$word}\n"; } # --- subroutines --- sub byfreq { # sorts by frequency, most frequent first $word_n{$b} <=> $word_n{$a}; } sub bylwrcase { # compare by lowercase of the strings local($stringa) = $a; local($stringb) = $b; $stringa =~ tr/A-Z/a-z/; $stringb =~ tr/A-Z/a-z/; $stringa cmp $stringb; }