#!/l/local/bin/perl # © 2004, The Regents of The University of Michigan, All Rights Reserved # # Permission is hereby granted, free of charge, to any person obtaining # a copy of this software and associated documentation files (the # "Software"), to deal in the Software without restriction, including # without limitation the rights to use, copy, modify, merge, publish, # distribute, sublicense, and/or sell copies of the Software, and to # permit persons to whom the Software is furnished to do so, subject # to the following conditions: # # The above copyright notice and this permission notice shall be # included in all copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
require 5.8.3;    # For best Unicode handling
use strict;
use Encode;

#
# Title:       OAI data Translation engine
# Description: Acquires data from the UIUC OAI Harvester and changes the
#              OAI xml to bib class xml
# Copyright:   Copyright (c) 2004
# Company:     The University of Michigan
# author:      Phillip Farber based on Java version by Michael Burek
# version:     1.0
#

# ------------------------------------------------------------
#                 R o o t s
#
# Every path below is derived from the DLXSROOT environment
# variable; the main program refuses to run when it is unset.
# ------------------------------------------------------------
my $DLXSROOT               = $ENV{'DLXSROOT'};

my $OAI_BIN_ROOT           = "$DLXSROOT/bin/o/oaister";
my $OAI_ERR_FILES_ROOT     = "$OAI_BIN_ROOT/errors";
my $OAI_BIN_TRANSFORM_ROOT = "$OAI_BIN_ROOT/oaitransform";

my $OAI_PREP_ROOT          = "$DLXSROOT/prep";
my $OAI_DATA_ROOT          = "$OAI_PREP_ROOT/h/harvester";
my $OAI_BIB_FILES_ROOT     = "$OAI_PREP_ROOT/o/oaister";

# ------------------------------------------------------------
#                 F i l e s
# ------------------------------------------------------------

# Perl friendly list of short and long names to be used by OaiBibApp.pm
my $REPOSITORY_NAMES        = "$OAI_BIN_TRANSFORM_ROOT/repositoryNames.pl";

# Lookup tables read at start-up
my $REPOSITORY_LOOKUP_TABLE = "$OAI_BIN_TRANSFORM_ROOT/repository_table.txt";
my $NORMALIZATION_TABLE     = "$OAI_BIN_TRANSFORM_ROOT/normal_types.txt";

# XSL stylesheet
my $OAI2BIB_XSL_FILE        = "$OAI_BIN_TRANSFORM_ROOT/oai-bibclass3.xsl";

# Error reporting
my $REJECTED_REC_FILE       = "$OAI_ERR_FILES_ROOT/rejected_records.txt";
my $NORM_FAIL_FILE          = "$OAI_ERR_FILES_ROOT/normalization_errors.txt";

# ------------------------------------------------------------
#                 S y s t e m   L i m i t s
# ------------------------------------------------------------

# Max number of records to concatenate into a single bib file
my $maxXmlFiles             = 2000;

# Max number of title elements before rejecting the record
my $maxNumOfTitleElements   = 100;

# Max number of <creator> + <contributor> (author) elements before
# rejecting the record
my $maxNumOfAuthorElements  = 100;

# Max number of <subject> elements before rejecting the record
my $maxNumOfSubjectElements = 100;

# ------------------------------------------------------------
#
#                 M A I N
#                 P R O G R A M
#
# ------------------------------------------------------------

main::ASSERT( $DLXSROOT, qq{You must set the DLXSROOT environment variable} );

# Instantiate an OAI Transform object (ot).  Note that this loads both
# lookup tables and (re)creates the two error report files, even for -h.
my $ot = OAITransform->new();

# Work out which repository directories under OAI_DATA_ROOT (the root of
# the UIUC harvested files) to process:
#   -g       all repositories, but only (re)generate repositoryNames.pl
#   -h       print usage and exit
#   dirname  a single repository directory
#   (none)   all repositories
my @dirs;
if ( $ARGV[0] ) {
    if ( $ARGV[0] eq '-g' ) {
        @dirs = main::OpenOaiDataRoot();
        $ot->SetGeneratingPerl();
    }
    elsif ( $ARGV[0] eq '-h' ) {
        print( qq{Usage: OAITransform [-h | -g | dirname ]\n\n}
             . qq{OAITransform uses the following files:\n\n}
             . qq{\t"oai-bibclass3.xsl" to transform simple DC tags \n\tinto DLXS BibClass tags\n\n}
             . qq{\t"normal_types.txt" to normalize the simple DC TYPE\n}
             . qq{\ttag into 4 normalized types (text, image, audio, video) \n}
             . qq{\twhich are put in a DLXS BibClass NORM tag\n\n}
             . qq{\t"repository_table.txt" which contains all the repository IDs\n}
             . qq{\tand full names of repositories (use repository_table.sample.txt\n}
             . qq{\tto get started)\n\n}
             . qq{OAITransform generates "repositoryNames.pl" which is a list of \n}
             . qq{repository IDs and full names of repositories, built from the \n}
             . qq{directory list of repository names living at "$DLXSROOT/prep/h/harvester"\n}
             . qq{and "repository_table.txt"\n\n} );
        exit 0;
    }
    else {
        if ( -d "$OAI_DATA_ROOT/$ARGV[0]" ) {
            @dirs = $ARGV[0];
        }
        else {
            print qq{$ARGV[0] is not a repository directory under $OAI_DATA_ROOT\n};
            exit 1;
        }
    }
}
else {
    @dirs = main::OpenOaiDataRoot();
}

if ( $ot->GeneratingPerl() ) {
    foreach my $dir ( @dirs ) {
        $ot->AddArchiveName( $dir, $ot->GetRepositoryName( $dir ) );
    }
    $ot->GeneratePerlRepositoryNameHashFile();
}
else {
    print qq{\nStarting OAI transform program...\n};

    foreach my $dir ( @dirs ) {
        print( qq{\nrepository identifier: $dir\norganization: }
             . $ot->GetRepositoryName( $dir )
             . qq{\n} );

        $ot->AddArchiveName( $dir, $ot->GetRepositoryName( $dir ) );
        $ot->SetCurrentArchive( $dir );
        $ot->SetFileNo( 0 );

        # Start every repository with a fresh per-file record count so that
        # its first raw file is not cut short by the previous repository.
        $ot->SetCountOfXMLFiles( 0 );

        $ot->InitRawXmlFile( "$OAI_BIB_FILES_ROOT/"
                             . $ot->GetCurrentArchive()
                             . "_"
                             . $ot->GetFileNo()
                             . "_raw.xml" );
        $ot->DescendAndProcessFiles( "$OAI_DATA_ROOT/$dir" );
        $ot->FinishUpWithOutputFile();
        $ot->TranslateToBib( "$OAI_BIB_FILES_ROOT/" . $ot->GetCurrentArchive() . "_bib.xml",
                             $ot->GetCurrentArchive() );
        $ot->ProcessOneRepositoryResults( $dir );
    }

    if ( scalar( @dirs ) > 1 ) {
        $ot->PrintSessionResults();
    }
}

exit 0;

# ------------------------------------------------------------
#
#          e n d   M A I N   P R O G R A M
#
# ------------------------------------------------------------

# Utilities

# ----------------------------------------------------------------------
# NAME         : ASSERT
# PURPOSE      : die with $msg unless $condition is true
# INPUT        : $condition, $msg
# RETURNS      : nothing useful; does not return when $condition is false
# ----------------------------------------------------------------------
sub ASSERT {
    my ( $condition, $msg ) = @_;

    die $msg if ( ! $condition );
}

# ----------------------------------------------------------------------
# NAME         : OpenOaiDataRoot
# PURPOSE      : List the repository directories under $OAI_DATA_ROOT
# RETURNS      : names (not paths) of the non-dot sub-directories, in
#                readdir order
# NOTES        : dies via ASSERT when the directory cannot be opened
# ----------------------------------------------------------------------
sub OpenOaiDataRoot {
    main::ASSERT( opendir( my $dataRootHandle, $OAI_DATA_ROOT ),
                  qq{Unable to opendir $OAI_DATA_ROOT: $!} );
    my @dirs = grep { ( ! /^\./ ) && -d "$OAI_DATA_ROOT/$_" } readdir( $dataRootHandle );
    closedir( $dataRootHandle );

    return @dirs;
}

package OAITransform;

# Private Class data.
# NOTE(review): only %repositoryNames is used through these lexicals; the
# normalization table, its regexp and the session info are accessed below
# as fully qualified package variables (%OAITransform::normalizationValues,
# $OAITransform::normalizationRE, @OAITransform::sessionInfo).
my %repositoryNames;
my %normalizationValues;
my $normalizationRE;
my @sessionInfo;

# ----------------------------------------------------------------------
# NAME         : new
# PURPOSE      : Instantiate an OAITransform object (ot)
# CALLS        : LoadRepositoryLookTable, LoadNormalizationTable,
#                Transformer->new
# INPUT        : class name
# RETURNS      : blessed hash reference with all counters at zero
# GLOBALS      : opens the NORM_FAIL_FILE and REJECTED_REC_FILE handles
# SIDE-EFFECTS : truncates both error report files
# NOTES        : dies via ASSERT when a table or report file cannot be
#                opened
# ----------------------------------------------------------------------
sub new {
    my $class = shift;

    my $self = {};
    bless $self, $class;

    $self->LoadRepositoryLookTable();
    $self->LoadNormalizationTable();

    $self->{'transformer'}            = Transformer->new();
    $self->{'archivenames'}           = [];
    $self->{'countofxmlfiles'}        = 0;
    $self->{'fileno'}                 = 0;
    $self->{'xmlwithurls'}            = 0;
    $self->{'xmlwithouturls'}         = 0;
    $self->{'totalsessionrecords'}    = 0;
    $self->{'totalrepositoryrecords'} = 0;
    $self->{'xmltoobigsorejected'}    = 0;
    $self->{'deletedrecords'}         = 0;
    $self->{'normfailcounttot'}       = 0;
    $self->{'reposnormfailcount'}     = 0;
    $self->{'numfailedrawfiles'}      = 0;
    $self->{'normalization_failures'} = undef;

    # Both report files are opened for write (truncate), not append; the
    # messages now say so.
    main::ASSERT( open( NORM_FAIL_FILE, '>', $NORM_FAIL_FILE ),
                  qq{Unable to open file "$NORM_FAIL_FILE" for write: $!} );
    main::ASSERT( open( REJECTED_REC_FILE, '>', $REJECTED_REC_FILE ),
                  qq{Unable to open file "$REJECTED_REC_FILE" for write: $!} );

    return $self;
}

# Close the error report files and delete whichever one stayed empty
# (no normalization failures / no rejected records in the session).
sub DESTROY {
    my $self = shift;

    close( NORM_FAIL_FILE );
    unlink $NORM_FAIL_FILE if ( $self->GetSessionNormFailCt() == 0 );

    close( REJECTED_REC_FILE );
    unlink $REJECTED_REC_FILE if ( $self->GetRejectedRecordCt() == 0 );
}

# ----------------------------------------------------------------------
# NAME         : AddArchiveName
# PURPOSE      : Remember a repository's short (directory) and long name
# INPUT        : $shortName, $longName
# SIDE-EFFECTS : appends { short => ..., long => ... } to 'archivenames'
# ----------------------------------------------------------------------
sub AddArchiveName {
    my $self = shift;
    my ( $shortName, $longName ) = @_;

    push( @{ $self->{'archivenames'} },
          { 'long' => $longName, 'short' => $shortName } );
}

# Number of archive names recorded so far.
sub GetArchiveNamesCt {
    my $self = shift;
    return scalar( @{ $self->{'archivenames'} } );
}

# Short name of the archive at index $i; dies when there is none.
sub GetArchiveShortNameAt {
    my $self = shift;
    my $i    = shift;

    main::ASSERT( $self->{'archivenames'}[$i], "Archive name at $i not defined" );
    return $self->{'archivenames'}[$i]{'short'};
}

# Long name of the archive at index $i; dies when there is none.
sub GetArchiveLongNameAt {
    my $self = shift;
    my $i    = shift;

    main::ASSERT( $self->{'archivenames'}[$i], "Archive name at $i not defined" );
    return $self->{'archivenames'}[$i]{'long'};
}

# sort() comparator: case-insensitive comparison of the long names.
sub SAN_Helper {
    return lc( $a->{'long'} ) cmp lc( $b->{'long'} );
}

# Re-order the recorded archive names by long name.
sub SortArchiveNames {
    my $self = shift;

    my @sorted = sort SAN_Helper @{ $self->{'archivenames'} };
    $self->{'archivenames'} = \@sorted;
}

# Escape a value so it can be embedded in a double-quoted Perl string.
sub _EscapeForDoubleQuotes {
    my $text = shift;

    $text =~ s/([\\"\$\@])/\\$1/g;
    return $text;
}

# ----------------------------------------------------------------------
# NAME         : GeneratePerlRepositoryNameHashFile
# PURPOSE      : Write $REPOSITORY_NAMES, a Perl fragment defining
#                @shortnames and %longnames, sorted by long name
# SIDE-EFFECTS : overwrites $REPOSITORY_NAMES; progress goes to STDOUT
# NOTES        : names are escaped so that quotes, backslashes, '$' and
#                '@' in a repository name cannot break or alter the
#                generated Perl
# ----------------------------------------------------------------------
sub GeneratePerlRepositoryNameHashFile {
    my $self = shift;

    print( "Generating $REPOSITORY_NAMES from ... " );

    main::ASSERT( open( REPOSITORY_NAMES, '>', $REPOSITORY_NAMES ),
                  qq{Unable to open file "$REPOSITORY_NAMES" for write: $!} );

    $self->SortArchiveNames();
    my $lastIndex = $self->GetArchiveNamesCt() - 1;

    print( REPOSITORY_NAMES qq{\@shortnames = \(\n} );
    foreach my $i ( 0 .. $lastIndex ) {
        my $shortName = _EscapeForDoubleQuotes( $self->GetArchiveShortNameAt( $i ) );
        print( REPOSITORY_NAMES qq{"$shortName",\n} );
    }
    print( REPOSITORY_NAMES qq{\);\n\n} );

    print( REPOSITORY_NAMES qq{\%longnames = \(\n} );
    foreach my $i ( 0 .. $lastIndex ) {
        my $rawShortName = $self->GetArchiveShortNameAt( $i );
        my $shortName    = _EscapeForDoubleQuotes( $rawShortName );
        my $longName     = _EscapeForDoubleQuotes( $self->GetArchiveLongNameAt( $i ) );
        print( REPOSITORY_NAMES qq{"$shortName" => "$longName",\n} );
        print( qq{ $rawShortName } );
    }
    print( REPOSITORY_NAMES qq{\);\n\n} );

    main::ASSERT( close( REPOSITORY_NAMES ),
                  qq{Problem closing file "$REPOSITORY_NAMES": $!} );

    print( " ... done.\n" );
}

# ----------------------------------------------------------------------
# NAME         : InitRawXmlFile
# PURPOSE      : Start a new raw XML output file for the current archive
# INPUT        : $newFile - path of the file to create
# GLOBALS      : (re)opens the RAWXMLFILESTREAM handle
# SIDE-EFFECTS : writes the XML declaration and the opening Group tag
# ----------------------------------------------------------------------
sub InitRawXmlFile {
    my $self    = shift;
    my $newFile = shift;

    print( qq{\t\tnew XML file: $newFile\n} );

    main::ASSERT( open( RAWXMLFILESTREAM, '>', $newFile ),
                  qq{Unable to open new raw XML file "$newFile" for write: $!} );

    my $msg = q{Problem with file write: };
    main::ASSERT( ( print( RAWXMLFILESTREAM qq{<?xml version="1.0" encoding="UTF-8"?>\n\n} )
                    && print( RAWXMLFILESTREAM qq{<Group name=\"} . $self->GetCurrentArchive() . qq{\">\n} ) ),
                  qq{$msg$!} );
}

# Write the closing Group tag to the current raw XML file and close it.
# The close is checked because buffered write errors only surface there.
sub FinishUpWithOutputFile {
    my $self = shift;

    my $msg = q{Problem with file write: };
    main::ASSERT( print( RAWXMLFILESTREAM qq{</Group>\n} ), qq{$msg$!} );
    main::ASSERT( close( RAWXMLFILESTREAM ), qq{$msg$!} );
}

# ----------------------------------------------------------------------
# NAME         : ConvertFileFrom
# PURPOSE      : Re-encode file $name in place from $encoding to UTF-8
# INPUT        : $encoding, $name   (plain function, not a method)
# SIDE-EFFECTS : overwrites $name
# NOTES        : The progress message names ISO-8859-1 regardless of
#                $encoding (the only caller passes "iso-8859-1").
#                The write-back is checked: this is the only copy of the
#                data, so a failed write must not pass silently.
# ----------------------------------------------------------------------
sub ConvertFileFrom {
    my ( $encoding, $name ) = @_;

    print qq{===> Attempting conversion from ISO-8859-1 to UTF-8 on $name\n};

    main::ASSERT( open( my $in, '<', $name ),
                  qq{ConvertFileFrom: Unable to open file "$name" for reading: $!} );
    my $xml = do { local $/; <$in> };
    main::ASSERT( $xml,
                  qq{ConvertFileFrom: Input XML file $name empty or I/O error reading: $!} );
    close( $in );

    Encode::from_to( $xml, $encoding, "utf8" );

    main::ASSERT( open( my $out, '>', $name ),
                  qq{ConvertFileFrom: Unable to open file "$name" for write: $!} );
    main::ASSERT( ( print( $out $xml ) && close( $out ) ),
                  qq{ConvertFileFrom: Problem writing file "$name": $!} );
}

# ----------------------------------------------------------------------
# NAME         : TranslateToBib
# PURPOSE      : Run every raw XML file of the current archive (file
#                numbers 0 .. GetFileNo) through the XSL stylesheet and
#                concatenate the results into one BibClass file
# INPUT        : $institutionBibXmlFileName - output path
#                $collectionName            - value of the GROUP NAME
# CALLS        : Transformer::OaiToBib, ConvertFileFrom
# NOTES        : A raw file that fails to transform is re-encoded from
#                ISO-8859-1 and tried once more; a second failure is
#                counted with IncrNumFailedRawFiles.
# ----------------------------------------------------------------------
sub TranslateToBib {
    my $self = shift;
    my ( $institutionBibXmlFileName, $collectionName ) = @_;

    main::ASSERT( open( OUTXMLFILESTREAM, '>', $institutionBibXmlFileName ),
                  qq{Unable to open new raw XML file "$institutionBibXmlFileName" for write: $!} );

    my $msg = q{Problem with file write: };
    main::ASSERT( ( print( OUTXMLFILESTREAM qq{<?xml version="1.0" encoding="UTF-8"?>\n} )
                    && print( OUTXMLFILESTREAM qq{<BIBDB><GROUP NAME="$collectionName">} ) ),
                  qq{$msg$!} );

    foreach my $x ( 0 .. $self->GetFileNo() ) {
        my $name = qq{$OAI_BIB_FILES_ROOT/} . $self->GetCurrentArchive() . q{_} . $x . q{_raw.xml};
        print( qq{\ttransform: $name => $institutionBibXmlFileName ...\n} );

        # Transform OAI XML to BibClass DTD
        next if ( $self->MyTransformer()->OaiToBib( $name, $OAI2BIB_XSL_FILE, \*OUTXMLFILESTREAM ) );

        # Try a conversion
        OAITransform::ConvertFileFrom( "iso-8859-1", $name );
        if ( $self->MyTransformer()->OaiToBib( $name, $OAI2BIB_XSL_FILE, \*OUTXMLFILESTREAM ) ) {
            print qq{===> Parse of $name after conversion ** SUCCESSFUL **\n};
        }
        else {
            $self->IncrNumFailedRawFiles();
            print qq{===> Parse of $name after conversion ** FAILED **\n};
        }
    }

    print( qq{done with translation for archive: } . $self->GetCurrentArchive() . qq{\n} );

    main::ASSERT( print( OUTXMLFILESTREAM qq{</GROUP></BIBDB>} ), qq{$msg$!} );
    main::ASSERT( close( OUTXMLFILESTREAM ), qq{$msg$!} );
}

# ----------------------------------------------------------------------
# NAME         : DescendAndProcessFiles
# PURPOSE      : Walk $dir recursively; parse every *.xml record into the
#                current raw XML file and count *.del files as deleted
#                records
# CALLS        : ReadXmlFile, ParseXML, FinishUpWithOutputFile,
#                InitRawXmlFile
# INPUT        : $dir - directory to process
# SIDE-EFFECTS : rolls over to a new raw XML file (file number + 1) each
#                time $maxXmlFiles records have been written to one file
# NOTES        : files with any other extension are reported and skipped
# ----------------------------------------------------------------------
sub DescendAndProcessFiles {
    my $self = shift;
    my $dir  = shift;

    print( qq{\tprocessing files in $dir\n} );

    main::ASSERT( ( -d $dir ), qq{Argument to DescendAndProcessFiles not a directory: $dir} );

    # List files in this dir.  The listing is read completely and the
    # handle closed before recursing.
    main::ASSERT( opendir( my $dirHandle, $dir ), qq{Unable to opendir $dir: $!} );
    my @filesOrDirs = grep { ( ! /^\./ ) } readdir( $dirHandle );
    closedir( $dirHandle );

    foreach my $f ( @filesOrDirs ) {
        my $fOd = "$dir/$f";

        if ( -d $fOd ) {
            # Recurse
            $self->DescendAndProcessFiles( $fOd );
        }
        elsif ( $fOd =~ m,\.xml$, ) {
            my $xmlFileCount  = $self->GetCountOfXMLFiles();
            my $currentXmlRef = $self->ReadXmlFile( $fOd );
            if ( $currentXmlRef ) {
                $self->ParseXML( $currentXmlRef, $fOd );
                $self->SetCountOfXMLFiles( ++$xmlFileCount );
            }

            if ( $xmlFileCount >= $maxXmlFiles ) {
                # Current raw file is full: close it and open the next one
                $self->FinishUpWithOutputFile();
                my $currFileNo = $self->GetFileNo();
                $self->SetFileNo( ++$currFileNo );
                $self->InitRawXmlFile( "$OAI_BIB_FILES_ROOT/"
                                       . $self->GetCurrentArchive()
                                       . "_"
                                       . $self->GetFileNo()
                                       . "_raw.xml" );
                $self->SetCountOfXMLFiles( 0 );
            }
        }
        elsif ( $fOd =~ m,\.del$, ) {
            $self->IncrDeletedRecordsCt();
        }
        else {
            print qq{UNKNOWN extension. Skipping file: $fOd\n};
        }
    }
}

# Slurp $file and return a reference to its content; dies when the file
# cannot be opened.  Three-argument open: the name comes from readdir and
# must never be interpreted as a mode or a pipe.
sub ReadXmlFile {
    my $self = shift;
    my $file = shift;

    main::ASSERT( open( my $fh, '<', $file ), qq{Unable to open $file: $file ($!)} );
    my $buf = do { local $/; <$fh> };
    close( $fh );

    return \$buf;
}

# Long name for repository id $key; dies when the lookup table has no
# (true) entry for it.
sub GetRepositoryName {
    my $self = shift;
    my $key  = shift;

    main::ASSERT( $repositoryNames{$key}, qq{No repository name for repository id: $key} );
    return $repositoryNames{$key};
}

# ----------------------------------------------------------------------
# NAME         : LoadRepositoryLookTable
# PURPOSE      : Fill %repositoryNames from $REPOSITORY_LOOKUP_TABLE,
#                one "id=long name" entry per line
# NOTES        : Each line is split on its FIRST '=' only, so a long name
#                may itself contain '='.  Lines without '=' (e.g. blank
#                lines) are skipped instead of shifting every later
#                key/value pair.
# ----------------------------------------------------------------------
sub LoadRepositoryLookTable {
    my $self = shift;

    main::ASSERT( open( my $fh, '<', $REPOSITORY_LOOKUP_TABLE ),
                  qq{No repository identifier file named }
                  . $REPOSITORY_LOOKUP_TABLE
                  . qq{ was found in the current working directory: $!} );
    my $buf = do { local $/; <$fh> };
    close( $fh );

    %repositoryNames = ();
    foreach my $line ( split( /\n/, ( defined( $buf ) ? $buf : '' ) ) ) {
        next if ( $line !~ /=/ );
        my ( $id, $longName ) = split( /=/, $line, 2 );
        $repositoryNames{$id} = $longName;
    }
}

# ----------------------------------------------------------------------
# NAME         : LoadNormalizationTable
# PURPOSE      : Read $NORMALIZATION_TABLE (tab separated "type<TAB>norm"
#                lines) into %OAITransform::normalizationValues and build
#                $OAITransform::normalizationRE, a case-insensitive
#                alternation of all the type keys
# NOTES        : NOTE(review): keys go into the regexp unescaped and are
#                looked up lower-cased by NormHelper, so the table is
#                presumably expected to hold lower-case literal keys -
#                confirm against normal_types.txt.
# ----------------------------------------------------------------------
sub LoadNormalizationTable {
    my $self = shift;

    main::ASSERT( open( my $fh, '<', $NORMALIZATION_TABLE ),
                  qq{No normalization file named }
                  . $NORMALIZATION_TABLE
                  . qq{ was found in the current working directory: $!} );
    my $buf = do { local $/; <$fh> };
    close( $fh );

    %OAITransform::normalizationValues = split( "\t+|\n", $buf );

    my $alternation = join( '|', keys( %OAITransform::normalizationValues ) );
    $OAITransform::normalizationRE = qr/$alternation/i;
}

# ----------------------------------------------------------------------
# NAME         : ReportNormalizationFailures
# PURPOSE      : Append this repository's normalization failures (type
#                value and occurrence count) to the NORM_FAIL_FILE report
# INPUT        : $dir - repository identifier used in the report heading
# NOTES        : writes nothing when the repository had no failures
# ----------------------------------------------------------------------
sub ReportNormalizationFailures {
    my $self = shift;
    my $dir  = shift;

    return if ( $self->GetReposNormFailCt() == 0 );

    print( NORM_FAIL_FILE
           qq{--------------------------------------------------------------\n}
           . qq{Normalization Type Error Report for archive: $dir }
           . qq{\n--------------------------------------------------------------\n} );

    my $nfHashRef = $self->{'normalization_failures'};
    foreach my $nfType ( keys %{ $nfHashRef } ) {
        print( NORM_FAIL_FILE qq{\t\ttype error: "$nfType", ($$nfHashRef{$nfType}) occurrences\n} );
    }
}

# ----------------------------------------------------------------------
# NAME         : ReportRejectedRecord
# PURPOSE      : Log the file name of a rejected record to the
#                REJECTED_REC_FILE report
# INPUT        : $xmlFilename
# ----------------------------------------------------------------------
sub ReportRejectedRecord {
    my $self        = shift;
    my $xmlFilename = shift;

    print( REJECTED_REC_FILE qq{Rejected record file = $xmlFilename\n} );
}

# ----------------------------------------------------------------------
# NAME         : NormHelper
# PURPOSE      : Map the content of a type element to its normalized
#                value using the normalization table
# INPUT        : $type - element content
# RETURNS      : the table value for the first key found in $type, or the
#                marker string "NORMALIZATIONFAILURE"
# SIDE-EFFECTS : on failure, counts the failure per type value and bumps
#                the session/repository failure counters
# ----------------------------------------------------------------------
sub NormHelper {
    my $self = shift;
    my $type = shift;

    my @matches;
    if ( @matches = $type =~ m,$OAITransform::normalizationRE,g ) {
        return $OAITransform::normalizationValues{lc( $matches[0] )};
    }

    # Failed. No normalization
    $self->{'normalization_failures'}{$type}++;
    $self->IncrSessionNormFailCt();

    return "NORMALIZATIONFAILURE";
}

# ----------------------------------------------------------------------
# NAME         : CleanXML
# PURPOSE      : In-place clean-up of one harvested record: drop the dc
#                namespace prefixes, empty elements, singleton elements
#                and attributes, and rename url elements to identifier
# INPUT        : $bibRecordRef - reference to the record text
# NOTES        : Careful. Order matters here.
#                NOTE(review): the "elements without content" pattern
#                ends in <$1>, which interpolates a capture left over
#                from an earlier match; it presumably was meant to match
#                the closing tag of the same element.  Kept exactly as
#                found - confirm against the upstream file.
# ----------------------------------------------------------------------
sub CleanXML {
    my $self         = shift;
    my $bibRecordRef = shift;

    # XML element name
    my $pat = qr/[a-zA-Z_:][a-zA-Z0-9.:_-]*/;

    # get rid of the dc:, oai_dc: namespaces
    $$bibRecordRef =~ s,(</?)(dc:|oai_dc:),$1,go;

    # turn oaidc element into the dc element
    $$bibRecordRef =~ s,(</?)oaidc,$1dc,go;

    # elements without content not wanted
    $$bibRecordRef =~ s,<($pat)[^>]*?>\s*<$1>,,gs;

    # remove singletons
    $$bibRecordRef =~ s,<$pat[^>]*/>,,gs;

    # remove attributes from what remains
    $$bibRecordRef =~ s,<($pat)[^>]*?>,<$1>,gs;

    # turn url into identifier
    $$bibRecordRef =~ s,(</?)url,$1identifier,go;
}

# ----------------------------------------------------------------------
# NAME         : ParseXML
# PURPOSE      : Condition one harvested record and append it to the
#                current raw XML file
# CALLS        : CleanXML, NormHelper, ReportRejectedRecord
# INPUT        : $bibRecordRef - reference to the record text
#                $xmlFileName  - source file, used in the rejection report
# GLOBALS      : writes to RAWXMLFILESTREAM
# SIDE-EFFECTS : updates the session / repository / rejected / with-URL /
#                without-URL counters; replaces $$bibRecordRef
# NOTES        : NOTE(review): several patterns and string literals below
#                (subject count, URL test, identifier handling, record
#                wrapper, institution element, type normalization) look
#                truncated, as if markup-like text had been stripped from
#                this copy of the file.  They are reproduced exactly as
#                found because the intended element names cannot all be
#                recovered from this file alone; restore them from the
#                upstream source before relying on this routine.
# ----------------------------------------------------------------------
sub ParseXML {
    my $self = shift;
    my ( $bibRecordRef, $xmlFileName ) = @_;

    # Count me in the session and this repository
    $self->IncrSessionRecordCt();
    $self->IncrRepositoryRecordCt();

    # Initial cleanup. Mainly getting rid of namespaces
    $self->CleanXML( $bibRecordRef );

    my ( $metadataElement ) = ( $$bibRecordRef =~ m,(<metadata[^>]*>.*?</metadata>),os );
    my ( $headerElement )   = ( $$bibRecordRef =~ m,(<header[^>]*>.*?</header>),os );

    # Check for too many elements in the record (Perl hack to count matches)
    my $titleCount   = () = $metadataElement =~ m,<title[^>]*>.*?,gos;
    my $authorCount  = () = $metadataElement =~ m,<(creator|contributor)[^>]*>.*?,gos;
    my $subjectCount = () = $metadataElement =~ m,]*>.*?,gos;

    if ( $titleCount > $maxNumOfTitleElements
         || $subjectCount > $maxNumOfSubjectElements
         || $authorCount > $maxNumOfAuthorElements ) {
        print( qq{record rejected: title count = $titleCount, author count = $authorCount, subject count = $subjectCount\n} );
        $self->IncrRejectedRecordCt();
        $self->ReportRejectedRecord( $xmlFileName );
        return;
    }

    # Check the record has a child element containing a URL
    my $hasUrl = ( $metadataElement =~ m,]*>(https?|ftp)://.*?,os );
    if ( ! $hasUrl ) {
        $self->IncrXmlWithoutUrlsCt();
        return;
    }

    # Replace the other matched elements that do not contain a URL by
    # their bare content, so that the XSL does not treat them like the
    # URL-bearing ones.
    my @identifiers = ( $metadataElement =~ m,(]*>.*?),gos );
    foreach my $id ( @identifiers ) {
        if ( $id !~ m,(https?|ftp)://,os ) {
            my ( $content ) = $id =~ m,]*>(.*?),os;
            $metadataElement =~ s,\Q$id\E,$content,s;
        }
    }

    # Count me as a record containing a URL, possibly together with
    # children that do not contain URLs
    $self->IncrXmlWithUrlsCt();

    # -----------------------------------------
    # The record has a URL and was not rejected

    # Condition the record: only the header and metadata elements are of
    # interest, so synthesize a record containing just these two.  In the
    # process the XML declaration PI and the xmlns attribute are dropped.
    $$bibRecordRef = qq{\n$headerElement\n$metadataElement\n};

    # Insert non-dc-compliant institution element for UI purposes in BibClass
    my $inst = q{} . $self->GetCurrentArchiveLong() . q{};
    $$bibRecordRef =~ s,,\n$inst,s;

    # Normalize the case-insensitive element content and append the
    # normalized value.  Record failures.
    $$bibRecordRef =~ s|]*>(.*?)|"$1\n" . $self->NormHelper( $1 ) . ""|ges;

    # Remove normalization failures, if any
    $$bibRecordRef =~ s,\nNORMALIZATIONFAILURE,,gs;

    # Emit the record
    print( RAWXMLFILESTREAM "$$bibRecordRef\n" );
}

# ----------------------------------------------------------------------
# Simple accessors and counters
# ----------------------------------------------------------------------

# The Transformer object created by new().
sub MyTransformer {
    my $self = shift;
    return $self->{'transformer'};
}

# Flag / query the "-g" mode (only generate repositoryNames.pl).
sub SetGeneratingPerl {
    my $self = shift;
    $self->{'generateperl'} = 1;
}

sub GeneratingPerl {
    my $self = shift;
    return $self->{'generateperl'};
}

# Select the repository being processed; also caches its long name
# (dies when the lookup table has no entry for it).
sub SetCurrentArchive {
    my $self = shift;
    my $arg  = shift;

    $self->{'currentarchive'}         = $arg;
    $self->{'currentarchivelongname'} = $self->GetRepositoryName( $arg );
}

sub GetCurrentArchive {
    my $self = shift;

    main::ASSERT( $self->{'currentarchive'}, qq{Current archive not set} );
    return $self->{'currentarchive'};
}

sub GetCurrentArchiveLong {
    my $self = shift;

    main::ASSERT( $self->{'currentarchivelongname'}, qq{Current archive long name not set} );
    return $self->{'currentarchivelongname'};
}

# Number of records written to the current raw XML file.
sub SetCountOfXMLFiles {
    my $self = shift;
    my $arg  = shift;
    $self->{'countofxmlfiles'} = $arg;
}

sub GetCountOfXMLFiles {
    my $self = shift;
    return $self->{'countofxmlfiles'};
}

# Sequence number of the current raw XML file; the setter returns the
# value it stored.
sub SetFileNo {
    my $self = shift;
    my $fn   = shift;

    $self->{'fileno'} = $fn;
    return $fn;
}

sub GetFileNo {
    my $self = shift;
    return $self->{'fileno'};
}

# Records with a URL (per repository).
sub ResetXmlWithUrlsCt {
    my $self = shift;
    $self->{'xmlwithurls'} = 0;
}

sub IncrXmlWithUrlsCt {
    my $self = shift;
    $self->{'xmlwithurls'}++;
}

sub GetXmlWithUrlsCt {
    my $self = shift;
    return $self->{'xmlwithurls'};
}

# Records without a URL (per repository).
sub ResetXmlWithoutUrlsCt {
    my $self = shift;
    $self->{'xmlwithouturls'} = 0;
}

sub IncrXmlWithoutUrlsCt {
    my $self = shift;
    $self->{'xmlwithouturls'}++;
}

sub GetXmlWithoutUrlsCt {
    my $self = shift;
    return $self->{'xmlwithouturls'};
}

# Records seen in the whole session (never reset).
sub IncrSessionRecordCt {
    my $self = shift;
    $self->{'totalsessionrecords'}++;
}

sub GetSessionRecordCt {
    my $self = shift;
    return $self->{'totalsessionrecords'};
}

# Records seen in the current repository.
sub IncrRepositoryRecordCt {
    my $self = shift;
    $self->{'totalrepositoryrecords'}++;
}

sub ResetRepositoryRecordCt {
    my $self = shift;
    $self->{'totalrepositoryrecords'} = 0;
}

sub GetRepositoryRecordCt {
    my $self = shift;
    return $self->{'totalrepositoryrecords'};
}

# Rejected (too many elements) records.  This counter has no reset, so
# it accumulates over the session; DESTROY uses it to decide whether to
# keep the rejected-records report.
sub IncrRejectedRecordCt {
    my $self = shift;
    $self->{'xmltoobigsorejected'}++;
}

sub GetRejectedRecordCt {
    my $self = shift;
    return $self->{'xmltoobigsorejected'};
}

# Normalization failures.  Note that the session increment bumps the
# per-repository counter as well.
sub IncrSessionNormFailCt {
    my $self = shift;
    $self->{'normfailcounttot'}++;
    $self->{'reposnormfailcount'}++;
}

sub GetSessionNormFailCt {
    my $self = shift;
    return $self->{'normfailcounttot'};
}

sub IncrReposNormFailCt {
    my $self = shift;
    $self->{'reposnormfailcount'}++;
}

sub GetReposNormFailCt {
    my $self = shift;
    return $self->{'reposnormfailcount'};
}

# Also forgets the per-type failure counts of the repository.
sub ResetReposNormFailCt {
    my $self = shift;
    $self->{'reposnormfailcount'}     = 0;
    $self->{'normalization_failures'} = undef;
}

# Raw XML files that could not be transformed even after re-encoding.
sub IncrNumFailedRawFiles {
    my $self = shift;
    $self->{'numfailedrawfiles'}++;
}

sub ResetNumFailedRawFiles {
    my $self = shift;
    $self->{'numfailedrawfiles'} = 0;
}

sub GetNumFailedRawFiles {
    my $self = shift;
    return $self->{'numfailedrawfiles'};
}

# Deleted (.del) records of the current repository.
sub ResetDeletedRecordsCt {
    my $self = shift;
    $self->{'deletedrecords'} = 0;
}

sub IncrDeletedRecordsCt {
    my $self = shift;
    $self->{'deletedrecords'}++;
}

sub GetDeletedRecordsCt {
    my $self = shift;
    return $self->{'deletedrecords'};
}

# ----------------------------------------------------------------------
# NAME         : ProcessOneRepositoryResults
# PURPOSE      : Print the report for repository $dir, remember its
#                URL / no-URL counts for the session report, write its
#                normalization failures and reset the per-repository
#                counters
# INPUT        : $dir - repository identifier
# SIDE-EFFECTS : pushes an ArchiveInfo onto @OAITransform::sessionInfo
# NOTES        : The success rate is reported as 0 for a repository that
#                yielded no records (previously a division by zero).  It
#                is truncated, not rounded, to two decimals.
# ----------------------------------------------------------------------
sub ProcessOneRepositoryResults {
    my $self = shift;
    my $dir  = shift;

    my $withURL    = $self->GetXmlWithUrlsCt();
    my $withoutURL = $self->GetXmlWithoutUrlsCt();
    my $rejected   = $self->GetRejectedRecordCt();
    my $total      = $self->GetRepositoryRecordCt();

    my $successRate = ( $total > 0 ) ? ( $withURL / $total ) * 100 : 0;
    $successRate =~ s,(.*?\.\d\d?)\d*,$1,;

    print( qq{\nRepository Report: $dir\n}
           . qq{\trecords with URLs = $withURL\n}
           . qq{\trecords without URLs = $withoutURL\n}
           . qq{\trepository records = $total\n}
           . qq{\tsuccess rate = $successRate%\n}
           . qq{\t------------------------\n}
           . qq{\trecords rejected = $rejected\n}
           . qq{\tdeleted records (.del) = } . $self->GetDeletedRecordsCt() . qq{\n}
           . qq{\tnormalization errors = } . $self->GetReposNormFailCt() . qq{\n}
           . qq{\traw parse failures = } . $self->GetNumFailedRawFiles() . qq{\n} );

    my $ai = ArchiveInfo->new( $dir, $self->GetXmlWithUrlsCt(), $self->GetXmlWithoutUrlsCt() );
    push( @OAITransform::sessionInfo, $ai );

    $self->ReportNormalizationFailures( $dir );

    $self->ResetReposNormFailCt();
    $self->ResetNumFailedRawFiles();
    $self->ResetRepositoryRecordCt();
    $self->ResetXmlWithUrlsCt();
    $self->ResetXmlWithoutUrlsCt();
    $self->ResetDeletedRecordsCt();
}

# Print the per-repository URL counts collected during the session and
# the total number of records seen.
sub PrintSessionResults {
    my $self = shift;

    print( qq{\nSession Report\n} );

    foreach my $ai ( @OAITransform::sessionInfo ) {
        my ( $name, $noUrlCt, $withUrlCt ) = $ai->GetArchiveInfoData();
        print( qq{\trepository: $name\n\t\telements with URLs = $withUrlCt\n\t\telements without URLs = $noUrlCt\n} );
    }

    my $total = $self->GetSessionRecordCt();
    print( qq{\n\ntotal session records = $total\n\n} );
    print( qq{Program end.\n} );
}

# ------------------------------------------------------------
#
#   XSLT transformation of raw OAI XML to BibClass DTD XML
#
# ------------------------------------------------------------
package Transformer;

use XML::LibXML;
use XML::LibXSLT;

# Create a Transformer holding an XML parser, an XSLT processor and an
# (initially empty) slot for the compiled stylesheet.
sub new {
    my $class = shift;

    my $self = {};
    bless $self, $class;

    $self->{'xmlparser'}          = XML::LibXML->new();
    $self->{'xslttransformer'}    = XML::LibXSLT->new();
    $self->{'compiledstylesheet'} = undef;

    return $self;
}

# ----------------------------------------------------------------------
# NAME         :
# PURPOSE      :
# CALLS        :
# INPUT        :
# RETURNS      :
# GLOBALS      :
# SIDE-EFFECTS :
# NOTES        :
# ----------------------------------------------------------------------
# OaiToBib: transform one raw OAI XML file with the XSL stylesheet and
# append the result to an already open output handle.
#
# INPUT   : $inXmlFileName    - raw XML file to read
#           $xslFileName      - stylesheet file
#           $outXmlFileHandle - handle (glob reference) to print to
# RETURNS : 1 on success; 0 when the transform died or the result could
#           not be written (the reason is printed to STDOUT)
# NOTES   : dies via ASSERT when the input file cannot be opened or is
#           empty.
sub OaiToBib {
    my $self = shift;
    my ( $inXmlFileName, $xslFileName, $outXmlFileHandle ) = @_;

    main::ASSERT( open( my $inFh, '<', $inXmlFileName ),
                  qq{OaiToBib: Unable to open file "$inXmlFileName" for reading: $!} );

    # Slurp the whole file
    my $xml = do { local $/; <$inFh> };
    main::ASSERT( $xml,
                  qq{OaiToBib: Input XML file $inXmlFileName empty or I/O error reading: $!} );
    close( $inFh );

    my $xformedXmlRef;
    eval {
        $xformedXmlRef = $self->XsltTransformData( $inXmlFileName, \$xml, $xslFileName );
    };
    if ( $@ ) {
        print( qq{\ttransform ** FAILED **\n\tReason: $@\n} );
        return 0;
    }

    if ( ! print {$outXmlFileHandle} $$xformedXmlRef ) {
        print( qq{\ttransform ** FAILED **\n\tReason: write error: $!\n} );
        return 0;
    }

    return 1;
}

# ----------------------------------------------------------------------
# NAME         : GetStyleSheet
# PURPOSE      : Return the compiled XSLT stylesheet for $xslFileName,
#                parsing and compiling it on first use
# INPUT        : $xslFileName
# RETURNS      : compiled stylesheet object
# SIDE-EFFECTS : caches the stylesheet (and the file name it was built
#                from) on the object
# NOTES        : The cache is reused only for the same file name, so a
#                call with a different stylesheet no longer gets a stale
#                one.  Dies via ASSERT when the file cannot be parsed or
#                compiled.
# ----------------------------------------------------------------------
sub GetStyleSheet {
    my $self        = shift;
    my $xslFileName = shift;

    my $cachedFor = $self->{'compiledstylesheetfile'};
    if ( $self->{'compiledstylesheet'}
         && ( ! defined( $cachedFor ) || $cachedFor eq $xslFileName ) ) {
        return $self->{'compiledstylesheet'};
    }

    my $xmlParser = $self->{'xmlparser'};
    my $parsedXslDocument;
    eval {
        $parsedXslDocument = $xmlParser->parse_file( $xslFileName );
    };
    main::ASSERT( ! $@, qq{Error parsing XSL file: $xslFileName. Parser details: $@} );

    my $xsltTransformer = $self->{'xslttransformer'};
    my $stylesheet;
    eval {
        $stylesheet = $xsltTransformer->parse_stylesheet( $parsedXslDocument );
    };
    main::ASSERT( ! $@, qq{Error processing XML tree for stylesheet: $xslFileName. Details: $@} );

    $self->{'compiledstylesheet'}     = $stylesheet;
    $self->{'compiledstylesheetfile'} = $xslFileName;

    return $stylesheet;
}

# ----------------------------------------------------------------------
# NAME         : XsltTransformData
# PURPOSE      : Does a XSLT transform of data
# CALLS        : XSLT and XML libraries, GetStyleSheet
# INPUT        : $inXmlFileName - used in error messages only
#                $xmlRef        - reference to the XML text to transform
#                $xslFileName   - stylesheet file
# RETURNS      : reference to the transformed output string
# NOTES        : dies (via ASSERT) with a descriptive message when
#                parsing, transforming or serializing fails
# ----------------------------------------------------------------------
sub XsltTransformData {
    my $self = shift;
    my ( $inXmlFileName, $xmlRef, $xslFileName ) = @_;

    my $xmlParser = $self->{'xmlparser'};
    my $parsedXmlPage;
    eval {
        $parsedXmlPage = $xmlParser->parse_string( $$xmlRef );
    };
    main::ASSERT( ! $@, qq{Error parsing input XML file: $inXmlFileName. Parser details: $@} );

    my $stylesheet = $self->GetStyleSheet( $xslFileName );

    my $results;
    eval {
        $results = $stylesheet->transform( $parsedXmlPage );
    };
    main::ASSERT( ! $@, qq{Error transforming XML tree from $inXmlFileName using stylesheet $xslFileName. Details: $@} );

    my $output;
    eval {
        $output = $stylesheet->output_string( $results );
    };
    main::ASSERT( ! $@, qq{Error on output request to XSLT object built from $xslFileName. Details: $@} );

    return \$output;
}

# ----------------------------------------------------------------------
# Private (ai) class that holds status information about the program run
# ----------------------------------------------------------------------
package ArchiveInfo;

# INPUT   : $name, $withUrlCt, $noUrlCt
sub new {
    my $class = shift;
    my ( $name, $withUrlCt, $noUrlCt ) = @_;

    my $self = {
        'name'      => $name,
        'nourlct'   => $noUrlCt,
        'withurlct' => $withUrlCt,
    };
    bless $self, $class;

    return $self;
}

# RETURNS : ( name, count without URLs, count with URLs ) - note that the
#           order of the two counts is the reverse of the constructor's.
sub GetArchiveInfoData {
    my $self = shift;

    return ( $self->{'name'}, $self->{'nourlct'}, $self->{'withurlct'} );
}