#!/l/local/bin/perl
# $Log: makeWordWheelFiles.pl,v $
# Revision 1.4 2000/06/29 17:51:26 pfarber
# fix mdash ('--') butted up against words, map to
# [space] and map sgml to space instead of '' which
# created bogus 'words' like boo148bar
#
# Revision 1.3 2000/06/23 16:01:28 pfarber
# Cleanup
#
# Revision 1.2 2000/06/02 19:09:42 pfarber
# Checkpoint at completion of WW coding
#
# Revision 1.1 2000/04/24 16:22:16 pfarber
# Initial commit
#
use strict;
# **********************************************************************
# take a config file and generate wordwheel files.
# The config file will contain global variables that point this engine
# to various places for the collection's dd file, where to put
# the generated wordwheel file, etc.
# The config file will also have definitions of "realm" names
# and what regions in the sgml file correspond to these.
#
# Write out the form expected by WordWheel2.pm, which
# is an SGML file for pat50 indexing. Please see WordWheel2.pm (in the
# usual cgi-lib area (/l1/lib).
# **********************************************************************
# **********************************************************************
BEGIN
{
unshift (@INC, "/l1/lib");
}
use strict;
use vars (
# ****************************************
# globals set in config file
# ****************************************
'$gOutFileDir', '$gOutFileSuffix',
'$gDataDict', '%gCollNameToRegion',
'@gRealms', '%gRealmRegions', '%gRealmRegionFormats',
'$gSeparateDirPerColl', '$gLocale',
);
use Pat2;
use Set2;
use SSP2;
## -------------------- Main --------------------
# ****************************************
# other globals used herein
# ****************************************
my $gConfigFile = shift(@ARGV);
my $gWordCount;
my %gAllWords;
my $debugflag = 1 if $ENV{'MWWDEBUG'};
require $gConfigFile;
require 5.004;
use POSIX qw(locale_h);
use locale;
setlocale(LC_CTYPE, $gLocale);
setlocale(LC_COLLATE, $gLocale);
my $gPat = &PATInit ($gDataDict);
my $coll;
foreach $coll ( keys %gCollNameToRegion )
{
&GatherWordsFromFiles( $coll );
&OutputWordWheelFile( $coll );
print '- ' x 20;
print "\n";
}
exit;
##############################
########## SUBROUTINES ##########
##################################################
sub GatherWordsFromFiles
{
my $coll = shift;
my $realm;
foreach $realm ( @gRealms)
{
$gWordCount = 0;
&CreateRealmResults( $realm, $coll );
}
}
##################################################
sub CheckInput
{
if ( $#ARGV != 0 )
{
print STDOUT "Need name of config file to use as only command line parameter.\n";
exit;
}
}
##################################################
sub CleanResults
{
my $r = shift;
my @List;
$$r =~ s,^,,;
$$r =~ s,$,,;
$$r =~ s,.*?,,g;
$$r =~ s,.*?,,g;
$$r =~ s,.*?,,g;
$$r =~ s,.*?,,g;
$$r =~ s,^,,;
$$r =~ s,$,,;
@List = split ( /<\/Raw>/ , $$r) ;
# Remove tags replacing them by space to avoid creating 'words'
# from sgml like e.g. 'all249when' from :
# ...and that all249When the Queen
my $i;
for ( $i = 0; $i <= $#List; $i++ )
{
$List[$i] =~ s,<[^>]*?>, ,g;
}
return ( @List );
}
##################################################
sub CreateRealmResults
{
my ( $realm, $coll ) = @_;
my $format = $gRealmRegionFormats{$realm};
my $region = $gRealmRegions{$realm};
print STDOUT "Retrieving region $region within $coll from $gDataDict. This could take a while ...\n\n";
my $collRegion = $gCollNameToRegion{$coll};
my $search = qq{pr.region.$format ($region within $collRegion);};
my $resultString = $gPat->lowendQuery($search);
my @results = &CleanResults(\$resultString);
my $line;
my $lineCount = 0;
foreach $line ( @results )
{
# Split on whitespace. Later we need to fix up cruft
# e.g. "mothers,fathers". Also map "<", ">", and
# "&" etc. to " " since they are basically punctuation but
# cannot appear as "<", ">", "&" in SGML.
$line =~ s,\&\;, ,g;
$line =~ s,\<\;, ,g;
$line =~ s,\>\;, ,g;
$line =~ s,\&emsp\;, ,g;
$line =~ s,\&ensp\;, ,g;
$line =~ s,\&prime\;, ,g;
$line =~ s,\&plus\;, ,g;
$line =~ s,\&ndash\;, ,g;
$line =~ s,\&mdash\;, ,g;
$line =~ s,\&nldr\;, ,g;
$line =~ s,\&mldr\;, ,g;
$line =~ s,\&hellip\;, ,g;
$line =~ s,\&tilde\;, ,g;
$line =~ s,\&lcub\;, ,g;
$line =~ s,\&rcub\;, ,g;
$line =~ s,\&verbar\;, ,g;
$line =~ s,\ \;, ,g;
$line =~ s,\&\;, ,g;
$line =~ s,\<\;, ,g;
$line =~ s,\>\;, ,g;
# Special case: (from Yeats): "acquirements--self-possession,
# adaptability, how to dress well, how even to play tennis
# decently--you would ..." Replace '--' with ' ' but allow
# "his 'idiot' enemies: A--- who against ..."
$line =~ s,([^-])--([^-]),$1 $2,g;
my $part;
foreach $part ( split( /\s+/, $line ) )
{
&ProcessWord( $part, $realm );
}
$lineCount++;
if ( ($lineCount % 1000 ) == 0 )
{
print STDOUT "Finished line $lineCount\n....";
}
# debugging: last if ( $lineCount > 200 );
}
}
# ----------------------------------------------------------------------
# NAME : StoreWord
# PURPOSE :
# CALLED BY :
# CALLS :
# INPUT : "word" obtained by splitting on whitespace and punctuation
# excluding "-" and "'"
# GLOBALS :
# SIDE-EFFECTS :
# NOTES : Split again on "-" and "'"
# ----------------------------------------------------------------------
sub StoreWord
{
my ( $realm, $word ) = @_;
return if ( ! $word );
$gWordCount++;
# Words containing "-" and "'" appear here. This includes leading
# chars e.g. "-five" and "'a". Step on these chars so the word
# will sort within a-z.
$word =~ s,^[\'\-],,g;
# Identify the character type by the first character
$word =~ m,^(.),;
my $character = $1;
my $chartype;
# Sort results into the bins. Sort "&entref;foo" into the MISC bin for
# now until we define a sort order for character entity references.
if ( POSIX::isalpha( $character ) )
{
# Strict alpha or words beginning with character entity refs.
# If alpha word comes in with non-alpha word chars, store
# it. Later we'll split and store it's constituents too.
$chartype = 'ALPHA';
}
elsif ( POSIX::isdigit( $character ) )
{
$chartype = 'NUM';
# If numeric $word came in with non-alpha word chars ( 1775-1776 e.g.)
# skip it. ProcessWord() will split it on "-" later and we'll
# process 1775 and 1776 separately
return if ( $word =~ m,[\'\-], );
# Get rid of any non-numeric characters mixed in. Non-alphanumeric
# chars have already been eliminated by this point
$word = &Flatten8bitChars( $word );
$word =~ s,[a-z],,g;
}
else
{
$chartype = 'MISC';
}
# Keep count of all words
$gAllWords{$realm}{$chartype}{$word}++;
if ( $gWordCount % 20000 == 0 )
{
print STDOUT "$gWordCount words processed\n";
}
}
# ----------------------------------------------------------------------
# NAME : SplitWordCandidate
# PURPOSE :
# CALLED BY :
# CALLS :
# INPUT : "word" obtained by splitting on whitespace
# GLOBALS :
# SIDE-EFFECTS :
# NOTES : Split on leading, trailing and embedded punctuation
# ----------------------------------------------------------------------
sub SplitWordCandidate
{
my $candidate = shift;
$candidate = lc( $candidate );
# 7 bit non-alphanumeric but part of "words"
# ----> \'\-
# 7 bit non-alphanumeric chars (except "'", "-" )
# ----> \&\!\"\#\$\%\(\)\*\+\,\.\/\:\;\<\=\>\?\@\[\\\]\^\`\{\|\}\~\s
# *All* printable 7 bit non-alphanumeric chars (including "'", "-" )
# ----> \&\!\"\#\$\%\(\)\*\+\,\.\/\:\;\<\=\>\?\@\[\\\]\^\`\{\|\}\~\s\'\-
# NOTE: Should agree with ww2-idx
# Remove *All* non-alphanumeric at beginning
while ( $candidate =~ s/^[\!\"\#\$\%\(\)\*\+\,\.\/\:\;\<\=\>\?\@\[\\\]\^\`\{\|\}\~\s\'\-]+// )
{;}
# Remove *All* non-alphanumeric at end
while ( $candidate =~ s/[\!\"\#\$\%\(\)\*\+\,\.\/\:\;\<\=\>\?\@\[\\\]\^\`\{\|\}\~\s\'\-]+$// )
{;}
# Split on *some* embedded non-alphanumeric. Not '-' because of
# e.g. forty-two and not "'" because we treat "d'une" as a
# word but catch "mary,john" or hello(goodbye) e.g.
my @parts = split( /[\&\!\"\#\$\%\(\)\*\+\,\.\/\:\;\<\=\>\?\@\[\\\]\^\`\{\|\}\~\s]+/, $candidate);
return @parts;
}
# ----------------------------------------------------------------------
# NAME : StoreTrueWords
# PURPOSE : Store "true words" and re-split on non-alpha word chars
# and store again
# CALLED BY :
# CALLS :
# INPUT : list from SplitWordCandidate
# GLOBALS :
# SIDE-EFFECTS :
# NOTES :
# ----------------------------------------------------------------------
sub StoreTrueWords
{
my $realm = shift;
my $trueWordsRef = shift;
foreach my $trueWord ( @$trueWordsRef )
{
&StoreWord( $realm, $trueWord );
# At this point we may be looking at words like "d'une" and
# "forty-five" which have been saved as words. Split them
# one more time and save their constituents as "words" too.
if ( $trueWord =~ m,[\'\-]+, )
{
my @subtrueWords = split( /[\'\-]+/, $trueWord );
foreach my $subtrueWord ( @subtrueWords )
{
&StoreWord( $realm, $subtrueWord );
}
}
}
}
# ----------------------------------------------------------------------
# NAME : ProcessWord
# PURPOSE :
# CALLED BY :
# CALLS :
# INPUT : "word" obtained by splitting on whitespace
# GLOBALS :
# SIDE-EFFECTS :
# NOTES : Split on whitespace so we get
# [punctuation|wordchar]+
# remove leading or trailing punctuation so we get
# wordchar+[wordchar|punctuation]*wordchar+
# remove embedded punctuation except for '-' for dates
# and numbers (forty-two) and "'" for contractions
# when the lemma is formed we remove "'" and "-"
# and flatten 8bit chars. When we take input from
# the user we also remove "-" and "'" so we can search
# solely on the lemma (but we display the word).
# NOTE: Lemma formation and user input processing have
# to agree on what is a word
# ----------------------------------------------------------------------
sub ProcessWord
{
my ( $word, $realm ) = @_;
my $entityCt;
my @entityArr;
my $wordHasEntity = 0;
print "ProcessWord: $word\n" if $debugflag;
# If the input "word" contains a character entity, parse the
# word and substitute "sp0t" e.g. to leave the entity intact
$entityCt = 0;
while ( $word =~ m/(\&[a-zA-Z][a-zA-Z0-9\-]{0,7}\;)/g )
{
$wordHasEntity = 1;
push( @entityArr, $1 );
my $mark = 'sp' . $entityCt . 't';
$word =~ s,$1,$mark,;
$entityCt++;
}
if ( $wordHasEntity )
{
# Now can split this "word" without destroying entity
# boundaries '&' and ';'
my @parts = &SplitWordCandidate( $word );
# Put the entities back in
foreach my $p ( @parts )
{
$entityCt = 0;
foreach my $e ( @entityArr )
{
my $mark = 'sp' . $entityCt . 't';
$p =~ s,$mark,$e,;
$entityCt++;
}
}
&StoreTrueWords( $realm, \@parts );
}
else
{
my @parts = SplitWordCandidate( $word );
&StoreTrueWords( $realm, \@parts );
}
}
##################################################
sub Flatten8bitChars
{
my $word = shift;
$word =~ tr/ΛηεΜιζθνκλμρξοΝ
―τςσ §ΏΰΨ/aaaaaaceeeeiiiinoooooouuuuysaaaaaaceeeeiiiinoooooouuuuyy/;
return $word;
}
# ----------------------------------------------------------------------
# NAME : OutputWordWheelFile
# PURPOSE : Write the words marked up w/SGML which adheres to the wordwheel.dtd
# to the output file
# CALLED BY :
# CALLS :
# INPUT :
# GLOBALS :
# SIDE-EFFECTS :
# NOTES :
# ----------------------------------------------------------------------
sub OutputWordWheelFile
{
my $coll = shift;
my $dir;
if ($gSeparateDirPerColl)
{
$dir = $gOutFileDir . $coll . q{/};
}
else
{
$dir = $gOutFileDir . q{/};
}
if ( ! -e $dir )
{
`/bin/mkdir -m 2775 -p $dir`;
}
my $outFile = $dir . $coll . $gOutFileSuffix;
open(OUTFILE, ">$outFile") || die(qq{Cannot open file $outFile for writing: $!});;
print STDOUT "\nStarting to output to $outFile\n\n";
print OUTFILE qq{\n};
print OUTFILE qq{$coll\n};
my $chartype;
my $realm;
foreach $realm ( sort ( @gRealms ) )
{
print STDOUT "Working on realm $realm...\n";
my $sequence = 0; ## ----- for numbers in I tag -----
print OUTFILE qq{\n$realm\n};
foreach $chartype ( sort ( keys ( %{$gAllWords{$realm}} ) ) )
{
print STDOUT "Working on character type $chartype...\n";
print OUTFILE qq{<$chartype>\n};
foreach my $word ( sort
{
if ( $a =~ m,^[0-9], ) { $a <=> $b }
else { $a cmp $b }
}
( keys( %{ $gAllWords{$realm}{$chartype} } ) ) )
{
# flatten 8 bit chars and save word in the lemma. Used for
# consistent lowest common denominator word sorting across
# collections that come from different languages
if ( lc( $gLocale ) ne 'c' )
{
my $lemma = Flatten8bitChars( $word );
if ( $lemma )
{
print OUTFILE ( qq{$word \n} );
$sequence++;
}
}
else
{
if ( $word )
{
print OUTFILE ( qq{$word \n} );
$sequence++;
}
}
}
print OUTFILE qq{$chartype>\n};
}
print OUTFILE qq{\n};
}
print OUTFILE qq{\n};
close(OUTFILE);
}
##################################################
__END__;