#!/usr/bin/perl
# program to convert a large NETSCAPE bookmark.htm file to a directory of smaller files
# copyright, Chris Evans 30.i.96
# program takes a Netscape bookmarks file,
# strips <P>
tags and blank lines from it # saves it under a new name # opens and runs through this creating a file of HREF pointers # to other files it creates (usually in another directory) # each of which has a name reflecting the header in the bookmark file # has the title of the header in the HREF pointer to that file # and contains all the HREFs actually stored in the Netscape # bookmarks file under that header. # the file of pointers has a nested list structure reflecting # the nested headers structure in the bookmark file # because of the clumsy way in which I've done this # the pointers file is opened after saving and the redundant # pairs between headers on the same level which have # no subsidiary headers are stripped out before saving the file # again. I'm sure there's a better way around this! # file names and directories are all set here $verbose = 1; #to get feedback on what it's doing! $dir1 = "./"; # location of the Netscape bookmark file $bookmark = "bookmark.htm"; # name of that file $bookmk = "bookmk.htm"; # name of file after stripping
and space $dir2 = "./bkmks/"; # directory for the files created for each # header in the bookmark file as seen by OS $dir3 = "/mhs/psychotherapy/bkmks/"; # as seen via the WWW server $ptrs1 = "ptrs1.htm"; # file of pointers to new files in $dir2 # before stripping pairs $ptrs = "ptrs.htm"; # and after stripping # I put this next bit at the bottom of all files in dir2 # you will want to change it to reflect your setup $btm = <
File created using Perl script: ptrs.prl written by Chris Evans C.Evans\@sghms.ac.uk EOF $btm1 = < File created using Perl script: ptrs.prl written by Chris Evans C.Evans\@sghms.ac.uk EOF # and this bit goes at the top of each file in dir2 $intro = <This file is created automatically from one category of bookmarks within a copy of my NETSCAPE bookmark file using a Perl script, ptrs.prl written by myself, Chris Evans C.Evans\@sghms.ac.uk
EOF # and this bit goes at the top of the pointer file $intro1 = <This file is created automatically from a copy of my NETSCAPE bookmark file using a Perl script, ptrs.prl written by myself, Chris Evans C.Evans\@sghms.ac.uk
EOF # set up utility variables $n = 0; $n_hrefs = 0; $n_http = 0; $n_file = 0; $n_goph = 0; $n_ftp = 0; $n_locl = 0; $n_file = 0; $n_site = 0; $htm = ".htm"; $head1 = "\n\n"; $head2 = "\n\n\n"; @opens = (); # ======= now start doing something for real! ====================== # open the bookmark file and strip the redundant stuff out of it open (BOOKMARK, "$dir1$bookmark") || die "Can't open $dir1$bookmark\n"; open (BOOKMK, ">$dir1$bookmk") || die "Can't create $dir1$bookmk\n"; while () { $_ =~ s/
//g; # drop
tags # deal with entirely numeric addresses if ($_ =~ /
(.*)<\/A>/) { $n_hrefs++; print "Dealing with HREF number $n_hrefs\n" if ($verbose); $call = $1; $call =~ tr/A-Z/a-z/; $n_http++ if ($call eq "http"); $n_goph++ if ($call eq "gopher"); $n_ftp++ if ($call eq "ftp"); $site = $2; $site =~ tr/A-Z/a-z/; $n_site++ if (!$n_site{$site}); $n_site{$site}++; $file = $site.$3; $n_file++ if (!$n_file{$file}); $n_file{$file}++; $call = $call.$file; $n_call++ if (!$n_call{$call}); $n_call{$call}++; ($asec,$amin,$ahr,$amday,$amon,$ayear,$awday,$ayday,$aisdat) = localtime($4); $aday = (Sun,Mon,Tue,Wed,Thu,Fri,Sat,Sun)[$awday]; $amon = (Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec)[$amon]; $added = $aday." ".$amday.".".$amon.".".$ayear." ".$ahr.":".$amin; ($usec,$umin,$uhr,$umday,$umon,$uyear,$uwday,$uyday,$uisdat) = localtime($5); $uday = (Sun,Mon,Tue,Wed,Thu,Fri,Sat,Sun)[$uwday]; $umon = (Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec)[$umon]; $upped = $uday." ".$umday.".".$umon.".".$uyear." ".$uhr.":".$umin; print BOOKMK "\n

\n"; print BOOKMK; # print the original HREF print BOOKMK " (numeric address: "; print BOOKMK "$1://$2/$3 is the one stored) "; } elsif ($_ =~ /

(.*)<\/A>/) { $n_hrefs++; print "Dealing with HREF number $n_hrefs\n" if ($verbose); $call = $1; $call =~ tr/A-Z/a-z/; $n_http++ if ($call eq "http"); $n_goph++ if ($call eq "gopher"); $n_ftp++ if ($call eq "ftp"); $site = $2; $site =~ tr/A-Z/a-z/; $n_site++ if (!$n_site{$site}); $n_site{$site}++; $file = $site.$3; $n_file++ if (!$n_file{$file}); $n_file{$file}++; $call = $call.$file; $n_call++ if (!$n_call{$call}); $n_call{$call}++; ($asec,$amin,$ahr,$amday,$amon,$ayear,$awday,$ayday,$aisdat) = localtime($4); $aday = (Sun,Mon,Tue,Wed,Thu,Fri,Sat,Sun)[$awday]; $amon = (Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec)[$amon]; $added = $aday." ".$amday.".".$amon.".".$ayear." ".$ahr.":".$amin; ($usec,$umin,$uhr,$umday,$umon,$uyear,$uwday,$uyday,$uisdat) = localtime($5); $uday = (Sun,Mon,Tue,Wed,Thu,Fri,Sat,Sun)[$uwday]; $umon = (Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec)[$umon]; $upped = $uday." ".$umday.".".$umon.".".$uyear." ".$uhr.":".$umin; if (!$numeric{$site}) { print "Looking up site number $n_site\n" if ($verbose); ($name,$aliases,$addrtype,$length,@addrs) = gethostbyname($2); ($add_a,$add_b,$add_c,$add_d) = unpack('C4',$addrs[0]); $address = $add_a.".".$add_b.".".$add_c.".".$add_d; $numeric{$site} = $address; } if ($numeric{$site} ne "...") { print BOOKMK "\n

\n"; print BOOKMK; # print the original HREF print BOOKMK " numeric address:"; # and add the numeric print BOOKMK "\n$1://$numeric{$site}/$3\n"; } else { print BOOKMK "\n

\n"; print BOOKMK; # print the original HREF print BOOKMK "(no numeric address found today, sorry!) "; } } elsif ($_ =~ /

>>>>> $1"; } else { print BOOKMK if /\S/; # print if there's some non-space character left } } close (BOOKMARK) || die "Can't close bookmark.htm\n"; close (BOOKMK) || die "Can't close bookmk.htm\n"; open (HREFS, ">bkmkstat.htm") || die "Can't open bkmkstat.htm\n"; print "\n\nWriting summary statistics to file\n" if ($verbose); print HREFS $head1; print HREFS "Info. about my BOOKMARK file"; print HREFS $head2; $dups = $n_hrefs - $n_call; print HREFS "

Information on my pointers elsewhere from my bookmark.htm file

\n"; print HREFS "My bookmark.htm file contains $n_hrefs pointers in total,"; if ($dups == 0) { print HREFS "None of these are duplicate pointers, i.e. all $n_call are unique pointers.\n"; } elsif ($dups == 1) { print HREFS "One is duplicated, leaving $n_call unique pointers.
\n"; } else { print HREFS "$dups are duplicates leaving $n_call unique pointers.
\n"; } print HREFS "Of these $n_locl are to local files and no use to anyone but me\n"; print HREFS "(sorry, still useful to me to store shortcuts to these somewhere!)\n"; print HREFS "A further $n_ftp are FTP calls, $n_goph are gopher service calls\n"; print HREFS "leaving $n_http as typical WWW http calls.\n

"; print HREFS "For the amusement of anyone who is interested, I have mounted\n"; print HREFS "the list of sites with how many references\n"; print HREFS "there are to each, when I've time I'll do a domain breakdown on this\n

\n"; print HREFS "More usefully, in a pinch is the\n"; print HREFS "listing of numeric addresses for\n"; print HREFS "each site which might come in useful to you if the DNS (Domain Name Service)\n"; print HREFS "goes wrong (again).\n<\BODY>\n<\HTML>\n"; close (HREFS) || die "Can't close hrefs.htm\n"; open (SITE,">sites.txt") || die "Can't open sites.txt\n"; print "\nWriting site call counts\n" if ($verbose); foreach $site (sort bysitefreq keys %n_site) { printf SITE "%50s %12s\n", $site,$n_site{$site}; } close (SITE) || die "Can't close sites.txt\n"; open (NUM,">numeric.txt") || die "Can't open numeric.txt\n"; print "\nWriting numeric addresses to file\n" if ($verbose); foreach $site (sort keys %numeric) { printf NUM "%50s %-16s\n", $site, $numeric{$site}; } close (NUM) || die "Can't close sites.txt\n"; # this next bit is clumsy but I don't know any better! open (BOOKMK, "$dir1$bookmk") || die "Can't open $dir1$bookmk\n"; # now create the pointers file open (PTRS1, ">$dir1$ptrs1") || die "Can't open $dir1$ptrs1\n"; print PTRS1 "\n\nPointers file: digest of $bookmark\n"; print PTRS1 "\n\n\n"; print PTRS1 "

Pointers: digest of $bookmark

\n"; print PTRS1 $intro1; $n++; $curr = "remain01"; open ($curr, ">$dir2$curr$htm") || die "Can't create $dir2$curr$htm"; unshift(@opens,$curr); $files{"remain"} = 1; $title = "The remainder"; print $curr "$head1\n"; print $curr "$title\n"; print $curr "$head2\n"; $body = 1; while ($inp = ) { # read from the input file if ($inp =~ /<\/DL>/) { # is this end of entries for a header? if ($curr ne "remain01") { # do nothing if you're at the end, otherwise print PTRS1 "<\/UL>\n"; # close that list in PTRS1 file } print $curr $btm; # put the finishing stuff in that current output file close ($curr) || die "Can't close $curr\n"; shift(@opens); # shift that filename out of the array of open files $curr = $opens[0]; # and reset the current output file to the next in that array } elsif ($inp =~ /]*([^<]*)/) { # that odd looking thing finds the contents of H3 headers # so this is a new header $title = $1; # take the title $title =~ s/>//; # strip the > you had in it! $file = $title; # going to use the title to make a filename # this next few lines knocks this into shape $file =~ tr/&/n/; # replace & with n $file =~ s/\//_/; # replace / with _ $file =~ s/\.//g; # drop any full stops $file =~ s/\s+/_/g; # squash any white space to one _ $file = substr($file,0,6); # pad to length 6 with _ if (length($file) < 6) { $file = $file."_______"; $file = substr($file,0,6); } # O.K. you've got a usable 6 character basic name $files{$file}++; # increment counter for that name if (length($files{$file}) > 2) { die "Too many headers whose titles form the filename stem $file\n"; } elsif (length($files{$file}) eq 1) { $filen = $file."0".$files{$file}; } else { $filen = $file.$files{$file}; # pad name to length 8 with the index number of that name } $n++; # counts the files created (not really used) unshift(@opens,$filen); # put file name into array of open files # print a pointer to it in the current file print $curr "\n

Subsidiary file of references:$title<\/A><\/H4>\n"; $curr = $filen; # now make it the current file to write to open ($filen, ">$dir2$filen$htm") || die "Can't create $dir2$filen$htm\n"; $body = 0; print PTRS1 "