#! /usr/local/bin/perl

# site-index.pl --- constructs IAFA templates for a Web site running
#                   NCSA httpd; puts them in DocumentRoot/site.idx
#
# version 0.2
#
# Copyright 1994 Robert S. Thau.  Unlimited distribution permitted so
# long as this notice is preserved.

# Configuration --- organization info.

$site_desc = 'Web server at the BYU Computer Science Department';
$site_keys = 'CS, Computer Science, BYU';

# Directory containing /conf and /logs subdirs; same as -d opt to httpd.
# This is NOT your DocumentRoot --- we get that out of srm.conf, just like
# the server does.

$server_dir = '/usr/users/www/www/httpd_1.1';  

# Talkiness level ... 0 for no output except for errors,
# 1 for some progress reports, 2 for more than you could conceivably want...

$verbose = 2;

# Distributions we recognize ...

%distributions = (
                   'global', 'site.idx',
                   'local', 'local.idx',
                 );

$default_distribution = 'local';                
        
# Flags controlling the indexing process.  The '$require_meta'
# and '$index_users' vars are binary flags --- set them to 1 to
# enable the feature (e.g., to require <META>information), 0 to
# disable (e.g., to index any .html file with a <TITLE>).

$require_meta = 0;             # only index files with <META>information  
$index_users = 1;              # index /~user/... areas

$follow_dir_symlinks = 0;      # Follow symlinks to directories?
$follow_file_symlinks = 0;     # Follow symlinks to files?

# Directory for output (if defined) --- default is to use DocumentRoot.
# You must uncomment the assignment to set the variable!
#
# $output_dir = '/com/doc/web-support/test';

# We don't index any directory which has a file matching this regexp.
# (Note that it is a regexp, not a wildcarded filename).  Only has an
# effect if uncommented, of course...
#       
$stop_rx = '\.dont_index';       # stop when we see a file matching this rx

################################################################
# Code.
#
# You should not have to modify anything below.

$* = 1;
&handle_ncsa_toplev;

################################################################
#
# Parsing the server config files.  *All* of the NCSA dependency
# is here; the rest of the code ought to transfer pretty directly
# to indexing, say, a CERN-httpd or Plexus setup.

sub handle_ncsa_toplev
{
  open (HTTPD_CONF, "$server_dir/conf/httpd.conf") || die "No httpd.conf???";

  while (<HTTPD_CONF>)
  {
    $server_name = $1 if (/^\s*ServerName\s+(\S+)/i);
  }

  close (HTTPD_CONF);

  open (SRM, "$server_dir/conf/srm.conf") || die "No srm.conf???";

  while (<SRM>)
  {
    $doc_root = $1 if (/^\s*DocumentRoot\s+(\S+)/i);
    $dir_index = $1 if (/^\s*DirectoryIndex\s+(\S+)/i);
    $user_dir = $1 if (/^\s*UserDir\s+(\S+)/i);
    push (@aliases, $2 . "\000" . $1) if (/^\s*Alias\s+(\S+)\s+(\S+)/)
  }

  close (SRM);

  die ("Can't find Web root") if (!defined ($doc_root));

  if (!defined ($server_name))
  { 
    chop($server_name = `hostname`);
    chop($server_name = "$server_name." . `domainname`)
        if (index ($hostname, '.') == -1);
  }

  if (!defined ($output_dir)) { $output_dir = $doc_root }

  &open_site_idx;  

  &blab (1, "Indexing from DocumentRoot\n");
  &index_directory ($doc_root, '');

  &blab (1, "Indexing from Aliases\n");
  foreach (@aliases) { &index_directory (split (/\000/)) };

  if (defined ($user_dir) && $index_users)
  {
    &blab (1, "Indexing from UserDirs\n");

    while(($name, $pwd, $uid, $gid, $quota, $comment, $gcos, $home_dir, $shell)
          = getpwent())
    {
      &index_directory ("$home_dir/$user_dir", "/~$name")
        if (-d "$home_dir/$user_dir")
    }
  }

  &close_site_idx ();
}

################################################################
#
# This function walks a directory hierarchy, calling itself recursively
# to deal with subdirectories, and calling &index_file to deal with
# (potentially) relevant files.  There are two arguments, the physical
# path to the directory and the corresponding partial URI.
#
# N.B. that &index_file (and this function by extension) expect the
# output index files to already have been opened (by &open_site_idx).

sub index_directory
{
  local ($phys_path, $uri_path) = @_;
  local (@subdirs, $file, $phys_name, $uri_name);

  opendir (phys_dir, $phys_path) || return;
  local (@files) = readdir (phys_dir);  closedir (phys_dir);

  do { &blab (1, "Stopped by $file at $phys_path\n"); return }
     if (defined ($stop_rx) &&
         grep (/$stop_rx/o ? (($file = $_), 1) : 0,
               @files));

  &blab (2, "Indexing $phys_path\n");

  foreach $file (@files)
  {
    $phys_name = "$phys_path/$file";     $uri_name = "$uri_path/$file";

    if ($file =~ /\.html$/)
      { &index_file ($phys_name, $uri_name) }
    elsif ($file !~ /\.+/
           && (-d $phys_name || ($follow_dir_symlinks && -l $phys_name)))
      { push (@subdirs, $file) }
  }

  foreach $file (@subdirs)
    { &index_directory ( "$phys_path/$file", "$uri_path/$file" ) }
}

################################################################
#
# This function constructs the index entry for a file (if any), and
# writes it out to the appropriate index.  There are two arguments;
# the physical path to the file (in the local filesystem), and the
# corresponding partial URI.
#
# This function finds the file handle corresponding to the document's
# distribution (if any) by looking in the %handle aarray, which is set
# up by &open_site_idx.  That function also opens the files; it must
# therefore be invoked before any call to &index_directory or &index_file.

sub index_file
{
  local ($phys_path, $uri_path) = @_;

  return if ((! $follow_file_symlinks) && (-l $phys_path));

  open (THE_DOC, $phys_path) || return;
  local($_) = join ('', <THE_DOC>);
  close (THE_DOC);

  tr/\n/\000/;

  s/<\/title>/\001/i;
  local($title) = $1 if /<title>([^\001]*)\001/i;

  $title =~ s/[\s\000]*$//;
  $title =~ s/<([^">]|"[^"]*")*>//g;
  $title =~ tr/\000/\n/;

  local($meta, $name, $val, $desc, $keywords, $type, $dist);

  while (/<meta ([^">]|"[^"]*")*>/gi)
  {
    $meta = $&;

    $name = ($meta =~ /name="([^"]+)"/ ? "\L$1" : '');
    $val = ($meta =~ /value="([^"]+)"/ ? $1 : '');

    next if (! $name || ! $val);

    $val =~ tr/\000/\n/;

    $desc = $val if $name =~ /description/i;
    $keywords = $val if $name =~ /keywords/i;
    $type = $val if $name =~ /(resource|iafa)-type/i;
    $dist = $val if $name =~ /distribution/i;
  }

  return if (!defined ($title)
             || ($require_meta && !defined($desc) && !defined($type)
                               && !defined($keywords)));

  if (!defined ($dist)) { $dist = $default_distribution }

  if (!defined ($handle{$dist}))
    { print "Bogus distribution $dist in $phys_path\n"; return }

  $desc =~ s/^/      /g;         $desc =~ s/^\s*//;
  $title =~ s/^/      /g;        $title =~ s/^\s*//;
  $keywords =~ s/^/      /g;     $keywords =~ s/^\s*//;

  if ($type =~ /service/i) { &print_service_entry ($handle{$dist}) }
  else { &print_file_entry ($handle{$dist}) }
}

################################################################
#
# These next few functions are basically there so I can have all
# the information on the actual format of the index files in one
# place.  Note that they rely heavily on variables set by their
# callers... could I persuade you to consider them elaborations
# of the functions that invoke them, a la Knuth's WEAVE & TANGLE?

sub print_file_entry
{
  local ($handle) = @_;

  (print $handle (<<EOF)) || die "Can't write $handle!!!";
Template-Type: DOCUMENT
Title: $title
URI: $uri_path
Description: $desc
Keywords: $keywords

EOF
}

sub print_service_entry
{
  local ($handle) = @_;

  (print $handle (<<EOF)) || die "Can't write $handle!!!";
Template-Type: SERVICE
Name: $title
URI: $uri_path
Description: $desc
Keywords: $keywords

EOF
}

sub print_site_entry
{
  local ($handle) = @_;

  (print $handle (<<EOF)) || die "Can't write $handle!!!";
Template-Type: SITEINFO
Host-Name: $server_name
URI: /
Description: $site_desc
Keywords: $site_keys

EOF
}

################################################################
#
# These functions manage the index files themselves, and the internal
# file handles.  As a side effect, &open_site_idx creates two aarrays
# which are used by &close_site_idx and &index_file.  These are:
#
# %handle --- maps distributions onto the file handles
# %fhand --- maps filenames onto the file handles
#
# Note also that these functions are a little careful about making
# sure that a good index is left around someplace.  Instead of writing
# directly to 'blah.idx', the script writes to 'blah.idx.new'; if it's
# interrupted for any reason (or can't write due to, e.g., a full disk),
# the old indexes are unaltered.  When the job is finished, we rename
# 'blah.idx.new' to 'blah.idx', but only after renaming the old one to
# 'blah.idx.bak', allowing easy manual recovery from lossage which the
# script itself could not automatically detect.

sub open_site_idx
{
  local ($dist, $file);

  while (($dist, $file) = each (%distributions))
  {
    $file = "$output_dir/$file";

    if (defined ($fhand {$file}))
      { $handle {$dist} = $fhand {$file} }
    else {
      $handle = "xxx.$file.xxx";
      open ($handle, ">$file.new") || die ("Can't open $file.new!!!");

      $handle{$dist} = $handle;
      $fhand{$file} = $handle;

      &print_site_entry ($handle);
    }
  }
}

sub close_site_idx
{
  while (($file, $handle) = each (%fhand))
  {
    close ($handle);
    rename ("$file", "$file.bak");
    rename ("$file.new", "$file");
  }
}

################################################################
#
# General utility --- print a message, but only if we're being
# sufficiently verbose.

sub blab { local ($vkey) = shift (@_); print @_ if $vkey <= $verbose }

