#!/usr/bin/perl
# ************************************************************************** #
# words.prl written by Chris Evans (C.Evans@sghms.ac.uk) #
# program collects all the words in the
parts of HTML files #
# below a cerain directory #
# #
# The program takes as input in the variables declared immediately below #
# $filemask -- file mask (appropriate to the OS) for the find command #
# $outputdir -- the directory into which to put these files #
# $menu -- filename for the menu to all other reports #
# $report -- filename for the directory order output #
# $time_rep -- filename for the modification time sorted output #
# $other_rep -- filename for the files other than *.htm? #
# $startdir -- directory from which to start the recursive search #
# $www_offset -- any additional directory reference that accounts for #
# a possible difference between the root used by unix for the #
# search versus that shown to the world by the HTTP server in use #
# This last variable may seem odd but on our local machine the root #
# shown to the outside world by the server is at: #
# /usr1/www/pages/ #
# but my pages (from which I want the search for *.htm files to #
# start is at: #
# /usr1/www/pages/mhs/psychotherapy/ #
# giving the value for $www_offset of /usr1/www/pages #
# #
# file written by Chris Evans (C.Evans@sghms.ac.uk) 20.iii.96 #
# copyright Chris Evans _BUT_ feel free to distribute this subject to the #
# following requirements: #
# 1) you do not make a profit on the distribution #
# 2) you retain this header information in full #
# I would also very much appreciate feedback (I know the programming is #
# awful, it's the first piece of PERL I've written and I'm a psychodynamic #
# psychotherapist with no intention of giving up my day job so no flames if #
# you can resist the temptation) and a copy of enhanced versions if you do #
# hack this into something better #
# #
# Chris Evans (C.Evans@sghms.ac.uk) Section of Psychotherapy, #
# St. George's Hospital Medical School, Cranmer Terrace, #
# London, SW17 0RE, Britain #
# ************************************************************************** #
$outputdir = "/usr1/www/pages/mhs/psychotherapy/dir/";
$menu = "menu.htm";
$report = "htm_dir.htm";
$time_rep = "time_dir.dat";
$startdir = "/mhs/psychotherapy";
$www_offset = "/usr1/www/pages";
@filemask = ("*.htm");
# sort out the location that the server will show for files
$http_rep_dir = $outputdir;
$http_rep_dir =~ s|$www_offset||o; # o switch as the value of $www_offset is fixed
$head = <
Index of text words in Psychotherapy Section pages: $letter
Use [find] button on your browser to find word of interest to you
END
$btm = <
File created using Perl script:
words.prl by Chris Evans
C.Evans@sghms.ac.uk
END
# **** that's the end of user input
# open listing of words to ignore
open(NOT,"words.lst") || die "Cannot open words.lst";
while () {
($out, $word, $n) = split;
if ($out eq "b") {
$not{$word} = 1;
}
}
# program will exclude itself from all files I've marked in robot.txt
open(ROBOT,"/usr1/www/pages/mhs/psychotherapy/robot.txt") || die "Cannot open robot.txt\n";
print "Reading files to ignore from robot.txt\n";
while () {
if (/Disallow: (.*)/) {
$file = $1;
$file =~ s/$startdir/./;
$norobfile{$file} = 1;
}
}
close(ROBOT);
open(PROB,">wordprob.lst") || die "Problem creating file wordprob.lst\n";
print "Finding files...\n";
$filenum = 0;
# run through all subdirectories looking for files
foreach $mask (@filemask) {
print "Processing files of type: $mask\n\n";
open(FIND, "find . -name \"$mask\" -print |") || die "Couldn't run find: $!\n";
while ($filename = )
{
chop($filename);
if ($norobfile{$filename}) {
print "skipping file = $filename\n";
# don't process the file if it's excluded
next;
}
print "\nProcessing file $filename...\n";
$filenum++;
$href = $filename;
$href =~ s|.|$startdir|o;
$href{$filenum} = $href;
$linenum = 0;
open (IN, "$filename") || die "Couldn't open $filename\n";
$body = 0;
while () {
$linenum++;
$body = 1 if (//i);
$body = 0 if (/<\/BODY>/i);
if ($body) {
# if line ends with a hyphen, concatenate with next
if (s/-\s*\n$//) {
$start = $_;
$_ = ;
$linenum++;
$_ = $start.$_;
}
s/<.*>//g; # strip closed HTML tags
if (/) {
# still an HTML tag in there
$start = chop($_);
$_ = ;
$linenum++;
$_ = $start.$_;
# if line ends with a hyphen, concatenate with next
if (s/-\s*\n$//) {
$start = $_;
$_ = ;
$linenum++;
$_ = $start.$_;
}
s/<.*>//g; # strip closed HTML tags
if (/) {
# still an unbalanced <
print PROB "\n\n!!!Unbalanced < at $linenum in $filename !!!\n\n";
}
}
s/_+/_/g; # compress multiple underlines
s/-+/-/g; # compress multiple hyphens
s/&(\w+);//g; # remove HTML special characters
s/\d{1,2}\.[ivx]+\.\d{1,4}//g; # removes dates
s/[,;?!:'"\*\]\[\(\)\}\{]/ /g; # replace puncuation
# other than "." with spaces
s/\s+/ /g; # compress multiple spaces
if ($_) {
@tmp = split(/ +/, $_);
foreach $word (@tmp) {
$word =~ s/^_//; # get rid of initial underlines
$word =~ s/^-//; # get rid of initial hyphens
$endchar = substr($word,-1);
chop($word) if ($endchar =~ /[_\.]/);
if (
!($word =~ /[\.\d]/) &&
($word =~ /\S/) &&
(length($word) > 1)
) {
# if result has non-numeric, non-space contents ..
$word_n{$word}++; # count it
$filenum{$word} .= $filenum." ";
}
}
}
}
}
close (IN);
}
}
open(OUT,">words.dat") || die "couldn't open words.dat\n";
foreach $word (sort byfreq keys(%word_n)) {
if ($not{$word}) {
$tag = "b ";
} else {
$tag = "n ";
}
printf OUT "%2s %30s %-s\n",$tag, $word, $word_n{$word};
}
print "\n\n\n\nNOW creating the index itself\n";
$letter = "a";
print "$letter\n";
$htm = "htm";
$first = 1;
open(LET,">dir/$letter.$htm") || die "Can't open dir/$letter.$htm\n";
print LET "$head";
foreach $word (sort bylwrcase keys %filenum) {
$w_let = substr($word,0,1);
$w_let =~ tr/A-Z/a-z/;
if (($w_let =~ /[a-z]/) && (!$not{$word})) {
# only handle if word starts with a letter and is not a skip word
if ($w_let ne $letter) {
while ($w_let ne $letter) {
$letter++;
print "$letter\n";
$first = 1;
}
print LET $btm;
close(LET);
open(LET,">dir/$letter.$htm") || die "Can't open dir/$letter.$htm\n";
print LET "$head";
}
if ($first) {
print LET "\n- $word\n
\n";
$first = 0;
} else {
print LET "
\n - $word\n
\n";
}
undef @files;
undef %n_refs;
@files = split(/ /,$filenum{$word});
foreach $file_n (@files) {
$n_refs{$file_n}++;
}
foreach $filenum (keys %n_refs) {
print LET "- $href{$filenum}";
if ($n_refs{$filenum} == 1) {
print LET " (occurs once)\n";
} else {
print LET " (occurs $n_refs{$filenum} times)\n";
}
}
}
}
#open(NOT,">dir/dropword.dat") || die "couldn't open dir/dropword.dat in write mode\n";
foreach $word (keys %notword) {
$notword{$word} = $word_n{$word};
print "$word $word_n{$word}\n";
}
# --- subroutines ---
sub byfreq {
# sorts by frequency, most frequent first
$word_n{$b} <=> $word_n{$a};
}
sub bylwrcase {
# compare by lowercase of the strings
local($stringa) = $a;
local($stringb) = $b;
$stringa =~ tr/A-Z/a-z/;
$stringb =~ tr/A-Z/a-z/;
$stringa cmp $stringb;
}