#! /usr/local/bin/perl
# site-index.pl --- constructs IAFA templates for a Web site running
# NCSA httpd; puts them in DocumentRoot/site.idx
#
# version 0.2
#
# Copyright 1994 Robert S. Thau. Unlimited distribution permitted so
# long as this notice is preserved.
# Configuration --- organization info.
$site_desc = 'Web server at St. George\'s Hospital Medical School, University of London, UK';
$site_keys = 'Psychotherapy Section';
# Directory containing /conf and /logs subdirs; same as -d opt to httpd.
# This is NOT your DocumentRoot --- we get that out of srm.conf, just like
# the server does.
$server_dir = '/usr1/www';
# Talkiness level ... 0 for no output except for errors,
# 1 for some progress reports, 2 for more than you could conceivably want...
$verbose = 1;
# Distributions we recognize ...
%distributions = (
'global', 'site.idx',
# 'local', 'local.idx',
# 'MIT', 'local.idx'
);
$default_distribution = 'global';
# Flags controlling the indexing process. The '$require_meta'
# and '$index_users' vars are binary flags --- set them to 1 to
# enable the feature (e.g., to require information), 0 to
# disable (e.g., to index any .html file with a
).
$require_meta = 0; # only index files with information
$index_users = 1; # index /~user/... areas
$follow_dir_symlinks = 1; # Follow symlinks to directories?
$follow_file_symlinks = 1; # Follow symlinks to files?
# Directory for output (if defined) --- default is to use DocumentRoot.
# You must uncomment the assignment to set the variable!
#
$output_dir = '/usr1/www/pages/mhs/psychotherapy';
# We don't index any directory which has a file matching this regexp.
# (Note that it is a regexp, not a wildcarded filename). Only has an
# effect if uncommented, of course...
#
# $stop_rx = '\.htaccess'; # stop when we see a file matching this rx
################################################################
# Code.
#
# You should not have to modify anything below.
$* = 1;
&handle_ncsa_toplev;
################################################################
#
# Parsing the server config files. *All* of the NCSA dependency
# is here; the rest of the code ought to transfer pretty directly
# to indexing, say, a CERN-httpd or Plexus setup.
sub handle_ncsa_toplev
{
open (HTTPD_CONF, "/etc/httpd.conf") || die "No httpd.conf???";
while ()
{
$server_name = $1 if (/^\s*ServerName\s+(\S+)/i);
}
close (HTTPD_CONF);
open (SRM, "$server_dir/conf/srm.conf-dist") || die "No srm.conf???";
while ()
{
$doc_root = $1 if (/^\s*DocumentRoot\s+(\S+)/i);
$doc_root = "/usr1/www/pages";
$dir_index = $1 if (/^\s*DirectoryIndex\s+(\S+)/i);
$user_dir = $1 if (/^\s*UserDir\s+(\S+)/i);
push (@aliases, $2 . "\000" . $1) if (/^\s*Alias\s+(\S+)\s+(\S+)/)
}
close (SRM);
die ("Can't find Web root") if (!defined ($doc_root));
if (!defined ($server_name))
{
chop($server_name = `hostname`);
chop($server_name = "$server_name." . `domainname`)
if (index ($hostname, '.') == -1);
}
if (!defined ($output_dir)) { $output_dir = $doc_root }
&open_site_idx;
&blab (1, "Indexing from DocumentRoot\n");
&index_directory ($doc_root, '');
&blab (1, "Indexing from Aliases\n");
foreach (@aliases) { &index_directory (split (/\000/)) };
if (defined ($user_dir) && $index_users)
{
&blab (1, "Indexing from UserDirs\n");
while(($name, $pwd, $uid, $gid, $quota, $comment, $gcos, $home_dir, $shell)
= getpwent())
{
&index_directory ("$home_dir/$user_dir", "/~$name")
if (-d "$home_dir/$user_dir")
}
}
&close_site_idx ();
}
################################################################
#
# This function walks a directory hierarchy, calling itself recursively
# to deal with subdirectories, and calling &index_file to deal with
# (potentially) relevant files. There are two arguments, the physical
# path to the directory and the corresponding partial URI.
#
# N.B. that &index_file (and this function by extension) expect the
# output index files to already have been opened (by &open_site_idx).
sub index_directory
{
local ($phys_path, $uri_path) = @_;
local (@subdirs, $file, $phys_name, $uri_name);
opendir (phys_dir, $phys_path) || return;
local (@files) = readdir (phys_dir); closedir (phys_dir);
do { &blab (1, "Stopped by $file at $phys_path\n"); return }
if (defined ($stop_rx) &&
grep (/$stop_rx/o ? (($file = $_), 1) : 0,
@files));
&blab (2, "Indexing $phys_path\n");
foreach $file (@files)
{
$phys_name = "$phys_path/$file"; $uri_name = "$uri_path/$file";
if ($file =~ /\.htm$/)
{ &index_file ($phys_name, $uri_name) }
elsif ($file !~ /\.+/
&& (-d $phys_name || ($follow_dir_symlinks && -l $phys_name)))
{ push (@subdirs, $file) }
}
foreach $file (@subdirs)
{ &index_directory ( "$phys_path/$file", "$uri_path/$file" ) }
}
################################################################
#
# This function constructs the index entry for a file (if any), and
# writes it out to the appropriate index. There are two arguments;
# the physical path to the file (in the local filesystem), and the
# corresponding partial URI.
#
# This function finds the file handle corresponding to the document's
# distribution (if any) by looking in the %handle aarray, which is set
# up by &open_site_idx. That function also opens the files; it must
# therefore be invoked before any call to &index_directory or &index_file.
sub index_file
{
local ($phys_path, $uri_path) = @_;
return if ((! $follow_file_symlinks) && (-l $phys_path));
open (THE_DOC, $phys_path) || return;
local($_) = join ('', );
close (THE_DOC);
tr/\n/\000/;
s/<\/title>/\001/i;
local($title) = $1 if /([^\001]*)\001/i;
$title =~ s/[\s\000]*$//;
$title =~ s/<([^">]|"[^"]*")*>//g;
$title =~ tr/\000/\n/;
local($meta, $name, $val, $desc, $keywords, $type, $dist);
while (/]|"[^"]*")*>/gi)
{
$meta = $&;
$name = ($meta =~ /name="([^"]+)"/ ? "\L$1" : '');
$val = ($meta =~ /content="([^"]+)"/ ? $1 : '');
next if (! $name || ! $val);
$val =~ tr/\000/\n/;
$desc = $val if $name =~ /description/i;
$keywords = $val if $name =~ /keywords/i;
$type = $val if $name =~ /(resource|iafa)-type/i;
$dist = $val if $name =~ /distribution/i;
}
return if (!defined ($title)
|| ($require_meta && !defined($desc) && !defined($type)
&& !defined($keywords)));
if (!defined ($dist)) { $dist = $default_distribution }
if (!defined ($handle{$dist}))
{ print "Bogus distribution $dist in $phys_path\n"; return }
$desc =~ s/^/ /g; $desc =~ s/^\s*//;
$title =~ s/^/ /g; $title =~ s/^\s*//;
$keywords =~ s/^/ /g; $keywords =~ s/^\s*//;
if ($type =~ /service/i) { &print_service_entry ($handle{$dist}) }
else { &print_file_entry ($handle{$dist}) }
}
################################################################
#
# These next few functions are basically there so I can have all
# the information on the actual format of the index files in one
# place. Note that they rely heavily on variables set by their
# callers... could I persuade you to consider them elaborations
# of the functions that invoke them, a la Knuth's WEAVE & TANGLE?
sub print_file_entry
{
local ($handle) = @_;
(print $handle (<$file.new") || die ("Can't open $file.new!!!");
$handle{$dist} = $handle;
$fhand{$file} = $handle;
&print_site_entry ($handle);
}
}
}
sub close_site_idx
{
while (($file, $handle) = each (%fhand))
{
close ($handle);
rename ("$file", "$file.bak");
rename ("$file.new", "$file");
}
}
################################################################
#
# General utility --- print a message, but only if we're being
# sufficiently verbose.
sub blab { local ($vkey) = shift (@_); print @_ if $vkey <= $verbose }