#! /usr/local/bin/perl # site-index.pl --- constructs IAFA templates for a Web site running # NCSA httpd; puts them in DocumentRoot/site.idx # # version 0.2 # # Copyright 1994 Robert S. Thau. Unlimited distribution permitted so # long as this notice is preserved. # Configuration --- organization info. $site_desc = 'Web server, St. George's Hospital Medical School'; $site_keys = 'Psychotherapy Section pages'; # Directory containing /conf and /logs subdirs; same as -d opt to httpd. # This is NOT your DocumentRoot --- we get that out of srm.conf, just like # the server does. $server_dir = '/usr1/www'; # Talkiness level ... 0 for no output except for errors, # 1 for some progress reports, 2 for more than you could conceivably want... $verbose = 2; # Distributions we recognize ... %distributions = ( 'global', 'site.idx', 'local', 'local.idx', ); $default_distribution = 'local'; # Flags controlling the indexing process. The '$require_meta' # and '$index_users' vars are binary flags --- set them to 1 to # enable the feature (e.g., to require information), 0 to # disable (e.g., to index any .html file with a ). $require_meta = 0; # only index files with <META>information $index_users = 1; # index /~user/... areas $follow_dir_symlinks = 0; # Follow symlinks to directories? $follow_file_symlinks = 0; # Follow symlinks to files? # Directory for output (if defined) --- default is to use DocumentRoot. # You must uncomment the assignment to set the variable! # # $output_dir = '/com/doc/web-support/test'; # We don't index any directory which has a file matching this regexp. # (Note that it is a regexp, not a wildcarded filename). Only has an # effect if uncommented, of course... # $stop_rx = '\.dont_index'; # stop when we see a file matching this rx ################################################################ # Code. # # You should not have to modify anything below. $* = 1; &handle_ncsa_toplev; ################################################################ # # Parsing the server config files. *All* of the NCSA dependency # is here; the rest of the code ought to transfer pretty directly # to indexing, say, a CERN-httpd or Plexus setup. sub handle_ncsa_toplev { open (HTTPD_CONF, "$server_dir/conf/httpd.conf") || die "No httpd.conf???"; while (<HTTPD_CONF>) { $server_name = $1 if (/^\s*ServerName\s+(\S+)/i); } close (HTTPD_CONF); open (SRM, "$server_dir/conf/srm.conf") || die "No srm.conf???"; while (<SRM>) { $doc_root = $1 if (/^\s*DocumentRoot\s+(\S+)/i); $dir_index = $1 if (/^\s*DirectoryIndex\s+(\S+)/i); $user_dir = $1 if (/^\s*UserDir\s+(\S+)/i); push (@aliases, $2 . "\000" . $1) if (/^\s*Alias\s+(\S+)\s+(\S+)/) } close (SRM); die ("Can't find Web root") if (!defined ($doc_root)); if (!defined ($server_name)) { chop($server_name = `hostname`); chop($server_name = "$server_name." . `domainname`) if (index ($hostname, '.') == -1); } if (!defined ($output_dir)) { $output_dir = $doc_root } &open_site_idx; &blab (1, "Indexing from DocumentRoot\n"); &index_directory ($doc_root, ''); &blab (1, "Indexing from Aliases\n"); foreach (@aliases) { &index_directory (split (/\000/)) }; if (defined ($user_dir) && $index_users) { &blab (1, "Indexing from UserDirs\n"); while(($name, $pwd, $uid, $gid, $quota, $comment, $gcos, $home_dir, $shell) = getpwent()) { &index_directory ("$home_dir/$user_dir", "/~$name") if (-d "$home_dir/$user_dir") } } &close_site_idx (); } ################################################################ # # This function walks a directory hierarchy, calling itself recursively # to deal with subdirectories, and calling &index_file to deal with # (potentially) relevant files. There are two arguments, the physical # path to the directory and the corresponding partial URI. # # N.B. that &index_file (and this function by extension) expect the # output index files to already have been opened (by &open_site_idx). sub index_directory { local ($phys_path, $uri_path) = @_; local (@subdirs, $file, $phys_name, $uri_name); opendir (phys_dir, $phys_path) || return; local (@files) = readdir (phys_dir); closedir (phys_dir); do { &blab (1, "Stopped by $file at $phys_path\n"); return } if (defined ($stop_rx) && grep (/$stop_rx/o ? (($file = $_), 1) : 0, @files)); &blab (2, "Indexing $phys_path\n"); foreach $file (@files) { $phys_name = "$phys_path/$file"; $uri_name = "$uri_path/$file"; if ($file =~ /\.html$/) { &index_file ($phys_name, $uri_name) } elsif ($file !~ /\.+/ && (-d $phys_name || ($follow_dir_symlinks && -l $phys_name))) { push (@subdirs, $file) } } foreach $file (@subdirs) { &index_directory ( "$phys_path/$file", "$uri_path/$file" ) } } ################################################################ # # This function constructs the index entry for a file (if any), and # writes it out to the appropriate index. There are two arguments; # the physical path to the file (in the local filesystem), and the # corresponding partial URI. # # This function finds the file handle corresponding to the document's # distribution (if any) by looking in the %handle aarray, which is set # up by &open_site_idx. That function also opens the files; it must # therefore be invoked before any call to &index_directory or &index_file. sub index_file { local ($phys_path, $uri_path) = @_; return if ((! $follow_file_symlinks) && (-l $phys_path)); open (THE_DOC, $phys_path) || return; local($_) = join ('', <THE_DOC>); close (THE_DOC); tr/\n/\000/; s/<\/title>/\001/i; local($title) = $1 if /<title>([^\001]*)\001/i; $title =~ s/[\s\000]*$//; $title =~ s/<([^">]|"[^"]*")*>//g; $title =~ tr/\000/\n/; local($meta, $name, $val, $desc, $keywords, $type, $dist); while (/<meta ([^">]|"[^"]*")*>/gi) { $meta = $&; $name = ($meta =~ /name="([^"]+)"/ ? "\L$1" : ''); $val = ($meta =~ /content="([^"]+)"/ ? $1 : ''); next if (! $name || ! $val); $val =~ tr/\000/\n/; $desc = $val if $name =~ /description/i; $keywords = $val if $name =~ /keywords/i; $type = $val if $name =~ /(resource|iafa)-type/i; $dist = $val if $name =~ /distribution/i; } return if (!defined ($title) || ($require_meta && !defined($desc) && !defined($type) && !defined($keywords))); if (!defined ($dist)) { $dist = $default_distribution } if (!defined ($handle{$dist})) { print "Bogus distribution $dist in $phys_path\n"; return } $desc =~ s/^/ /g; $desc =~ s/^\s*//; $title =~ s/^/ /g; $title =~ s/^\s*//; $keywords =~ s/^/ /g; $keywords =~ s/^\s*//; if ($type =~ /service/i) { &print_service_entry ($handle{$dist}) } else { &print_file_entry ($handle{$dist}) } } ################################################################ # # These next few functions are basically there so I can have all # the information on the actual format of the index files in one # place. Note that they rely heavily on variables set by their # callers... could I persuade you to consider them elaborations # of the functions that invoke them, a la Knuth's WEAVE & TANGLE? sub print_file_entry { local ($handle) = @_; (print $handle (<<EOF)) || die "Can't write $handle!!!"; Template-Type: DOCUMENT Title: $title URI: $uri_path Description: $desc Keywords: $keywords EOF } sub print_service_entry { local ($handle) = @_; (print $handle (<<EOF)) || die "Can't write $handle!!!"; Template-Type: SERVICE Name: $title URI: $uri_path Description: $desc Keywords: $keywords EOF } sub print_site_entry { local ($handle) = @_; (print $handle (<<EOF)) || die "Can't write $handle!!!"; Template-Type: SITEINFO Host-Name: $server_name URI: / Description: $site_desc Keywords: $site_keys EOF } ################################################################ # # These functions manage the index files themselves, and the internal # file handles. As a side effect, &open_site_idx creates two aarrays # which are used by &close_site_idx and &index_file. These are: # # %handle --- maps distributions onto the file handles # %fhand --- maps filenames onto the file handles # # Note also that these functions are a little careful about making # sure that a good index is left around someplace. Instead of writing # directly to 'blah.idx', the script writes to 'blah.idx.new'; if it's # interrupted for any reason (or can't write due to, e.g., a full disk), # the old indexes are unaltered. When the job is finished, we rename # 'blah.idx.new' to 'blah.idx', but only after renaming the old one to # 'blah.idx.bak', allowing easy manual recovery from lossage which the # script itself could not automatically detect. sub open_site_idx { local ($dist, $file); while (($dist, $file) = each (%distributions)) { $file = "$output_dir/$file"; if (defined ($fhand {$file})) { $handle {$dist} = $fhand {$file} } else { $handle = "xxx.$file.xxx"; open ($handle, ">$file.new") || die ("Can't open $file.new!!!"); $handle{$dist} = $handle; $fhand{$file} = $handle; &print_site_entry ($handle); } } } sub close_site_idx { while (($file, $handle) = each (%fhand)) { close ($handle); rename ("$file", "$file.bak"); rename ("$file.new", "$file"); } } ################################################################ # # General utility --- print a message, but only if we're being # sufficiently verbose. sub blab { local ($vkey) = shift (@_); print @_ if $vkey <= $verbose }