#!/l/local/bin/perl
# © 2004, The Regents of The University of Michigan, All Rights Reserved
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject
# to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
require 5.8.3; # For best Unicode handling
use strict;
use Encode;
#
# Title: OAI data Translation engine
# Description: Acquires data from the UIUC OAI Harvester, changes the OAI xml to bib class xml
# Copyright: Copyright (c) 2004
# Company: The University of Michigan
# author: Phillip Farber based on Java version by Michael Burek
# version: 1.0
#
# ------------------------------------------------------------
# R o o t s
# ------------------------------------------------------------
# DLXS installation root, taken from the environment.  Every other path
# below is derived from it; the main program asserts that it is set.
my $DLXSROOT = $ENV{'DLXSROOT'};
my $OAI_BIN_ROOT = "$DLXSROOT/bin/o/oaister";
my $OAI_ERR_FILES_ROOT = "$OAI_BIN_ROOT/errors";
my $OAI_BIN_TRANSFORM_ROOT = "$OAI_BIN_ROOT/oaitransform";
my $OAI_PREP_ROOT = "$DLXSROOT/prep";
# Root of the harvested repository directories (input)
my $OAI_DATA_ROOT = "$OAI_PREP_ROOT/h/harvester";
# Directory that receives the raw and BibClass XML files (output)
my $OAI_BIB_FILES_ROOT = "$OAI_PREP_ROOT/o/oaister";
# ------------------------------------------------------------
# F i l e s
# ------------------------------------------------------------
# Perl friendly list of short and longnames to be used by OaiBibApp.pm
my $REPOSITORY_NAMES = "$OAI_BIN_TRANSFORM_ROOT/repositoryNames.pl";
my $REPOSITORY_LOOKUP_TABLE = "$OAI_BIN_TRANSFORM_ROOT/repository_table.txt";
my $NORMALIZATION_TABLE = "$OAI_BIN_TRANSFORM_ROOT/normal_types.txt";
# XSL stylesheet
my $OAI2BIB_XSL_FILE = "$OAI_BIN_TRANSFORM_ROOT/oai-bibclass3.xsl";
# Error reporting
my $REJECTED_REC_FILE = "$OAI_ERR_FILES_ROOT/rejected_records.txt";
my $NORM_FAIL_FILE = "$OAI_ERR_FILES_ROOT/normalization_errors.txt";
# -------------
# System Limits
# -------------
# Max number of records to read into a single raw XML file before a new
# raw file is started
my $maxXmlFiles = 2000;
# Max number of <title> elements before rejecting the record
my $maxNumOfTitleElements = 100;
# Max number of <creator> + <contributor> (author) elements before
# rejecting the record
my $maxNumOfAuthorElements = 100;
# Max number of <subject> elements before rejecting the record
my $maxNumOfSubjectElements = 100;
# ------------------------------------------------------------
#
# M A I N P R O G R A M
#
# ------------------------------------------------------------
&::ASSERT( $DLXSROOT, qq{You must set the DLXSROOT environment variable} );

# Build the OAI Transform object (ot).  Construction loads the lookup
# tables and opens the error report files, so it happens before any
# argument handling (including -h).
my $ot = OAITransform->new();

# Work out which repository directories under OAI_DATA_ROOT (the root of
# the UIUC harvested files) are to be handled in this run.
my @dirs;
my $option = $ARGV[0];
if ( ! $option )
{
    # No argument: every repository directory
    @dirs = &::OpenOaiDataRoot();
}
elsif ( $option eq '-g' )
{
    # -g: only (re)generate the Perl repository-name file
    @dirs = &::OpenOaiDataRoot();
    $ot->SetGeneratingPerl();
}
elsif ( $option eq '-h' )
{
    print( join( q{},
                 qq{Usage: OAITransform [-h | -g | dirname ]\n\n},
                 qq{OAITransform uses the following files:\n\n},
                 qq{\t"oai-bibclass3.xsl" to transform simple DC tags \n\tinto DLXS BibClass tags\n\n},
                 qq{\t"normal_types.txt" to normalize the simple DC TYPE\n},
                 qq{\ttag into 4 normalized types (text, image, audio, video) \n},
                 qq{\twhich are put in a DLXS BibClass NORM tag\n\n},
                 qq{\t"repository_table.txt" which contains all the repository IDs\n},
                 qq{\tand full names of repositories (use repository_table.sample.txt\n},
                 qq{\tto get started)\n\n},
                 qq{OAITransform generates "repositoryNames.pl" which is a list of \n},
                 qq{repository IDs and full names of repositories, built from the \n},
                 qq{directory list of repository names living at "$DLXSROOT/prep/h/harvester"\n},
                 qq{and "repository_table.txt"\n\n} ) );
    exit 0;
}
elsif ( -d "$OAI_DATA_ROOT/$option" )
{
    # A single named repository directory
    @dirs = ( $option );
}
else
{
    print qq{$option is not a repository directory under $OAI_DATA_ROOT\n};
    exit 1;
}

if ( $ot->GeneratingPerl() )
{
    # Name-file generation only: collect the names, write the file, stop.
    foreach my $repository ( @dirs )
    {
        $ot->AddArchiveName( $repository, $ot->GetRepositoryName( $repository ) );
    }
    $ot->GeneratePerlRepositoryNameHashFile();
    exit 0;
}

print qq{\nStarting OAI transform program...\n};
foreach my $repository ( @dirs )
{
    my $organization = $ot->GetRepositoryName( $repository );
    print( qq{\nrepository identifier: $repository\norganization: } . $organization . qq{\n} );
    $ot->AddArchiveName( $repository, $organization );

    # Start raw output file number 0 for this repository
    $ot->SetCurrentArchive( $repository );
    $ot->SetFileNo( 0 );
    my $firstRawFile = "$OAI_BIB_FILES_ROOT/" . $ot->GetCurrentArchive() . "_" . $ot->GetFileNo() . "_raw.xml";
    $ot->InitRawXmlFile( $firstRawFile );

    # Gather the records, then transform the raw file(s) to BibClass
    $ot->DescendAndProcessFiles( "$OAI_DATA_ROOT/$repository" );
    $ot->FinishUpWithOutputFile();
    my $bibFile = "$OAI_BIB_FILES_ROOT/" . $ot->GetCurrentArchive() . "_bib.xml";
    $ot->TranslateToBib( $bibFile, $ot->GetCurrentArchive() );

    $ot->ProcessOneRepositoryResults( $repository );
}

# A session summary is only printed when more than one repository ran
$ot->PrintSessionResults() if ( scalar( @dirs ) > 1 );
exit 0;
# ------------------------------------------------------------
#
# e n d M A I N P R O G R A M
#
# ------------------------------------------------------------
# Utilities
# Die with $msg unless $condition is true.
#   $condition - value tested for truth
#   $msg       - text handed to die() when the test fails
sub ASSERT
{
    my $condition = shift;
    my $msg = shift;
    ( ! $condition ) && die $msg;
}
# List the repository directories found directly under $OAI_DATA_ROOT.
# RETURNS : sorted list of sub-directory names (entries beginning with "."
#           and anything that is not a directory are left out)
# Dies via ASSERT when $OAI_DATA_ROOT cannot be opened.
sub OpenOaiDataRoot
{
    # A lexical handle avoids a global bareword handle that shares its
    # name with the $OAI_DATA_ROOT path variable.
    my $rootHandle;
    &::ASSERT( opendir( $rootHandle, $OAI_DATA_ROOT ),
               qq{Unable to opendir $OAI_DATA_ROOT: $!} );
    my @dirs = grep { ( ! /^\./ ) && -d "$OAI_DATA_ROOT/$_" } readdir( $rootHandle );
    closedir( $rootHandle );

    # readdir order depends on the file system; sorting makes the order in
    # which repositories are processed and reported repeatable.
    return sort @dirs;
}
package OAITransform;
# Private Class data
# Repository id => repository name.  Filled by LoadRepositoryLookTable()
# and read by GetRepositoryName().
my %repositoryNames;
# NOTE(review): the three lexicals below look unused.  The methods of this
# package read and write the package globals
# %OAITransform::normalizationValues, $OAITransform::normalizationRE and
# @OAITransform::sessionInfo, which are different variables from these.
my %normalizationValues;
my $normalizationRE;
my @sessionInfo;
# ----------------------------------------------------------------------
# NAME : new
# PURPOSE : Instantiate an OAITransform object (ot)
# CALLS : LoadRepositoryLookTable, LoadNormalizationTable,
#         Transformer->new
# INPUT : class name
# RETURNS : blessed hash reference with all counters set to zero
# GLOBALS : $NORM_FAIL_FILE, $REJECTED_REC_FILE and the NORM_FAIL_FILE /
#           REJECTED_REC_FILE file handles
# SIDE-EFFECTS : creates (truncating) both error report files and leaves
#                them open; dies via ASSERT if either cannot be opened
# NOTES : the report files are closed again in DESTROY
# ----------------------------------------------------------------------
sub new
{
    my $class = shift;
    my $self = {};
    bless $self, $class;

    $self->LoadRepositoryLookTable();
    $self->LoadNormalizationTable();

    $self->{'transformer'} = Transformer->new();
    $self->{'archivenames'} = [];
    $self->{'countofxmlfiles'} = 0;
    $self->{'fileno'} = 0;
    $self->{'xmlwithurls'} = 0;
    $self->{'xmlwithouturls'} = 0;
    $self->{'totalsessionrecords'} = 0;
    $self->{'totalrepositoryrecords'} = 0;
    $self->{'xmltoobigsorejected'} = 0;
    $self->{'deletedrecords'} = 0;
    $self->{'normfailcounttot'} = 0;
    $self->{'reposnormfailcount'} = 0;
    $self->{'numfailedrawfiles'} = 0;
    $self->{'normalization_failures'} = undef;

    # Three-argument open keeps the path from being parsed for a mode.
    # The files are truncated, not appended to, so the messages say
    # "write".  The handles stay barewords because other methods print to
    # them by name.
    &::ASSERT( open( NORM_FAIL_FILE, '>', $NORM_FAIL_FILE ),
               qq{Unable to open file "$NORM_FAIL_FILE" for write: $!} );
    &::ASSERT( open( REJECTED_REC_FILE, '>', $REJECTED_REC_FILE ),
               qq{Unable to open file "$REJECTED_REC_FILE" for write: $!} );

    return $self;
}
# Destructor: close both error report files and delete whichever of them
# has nothing to report (session counter still zero).
sub DESTROY
{
    my ( $self ) = @_;

    close( NORM_FAIL_FILE );
    if ( $self->GetSessionNormFailCt() == 0 )
    {
        unlink $NORM_FAIL_FILE;
    }

    close( REJECTED_REC_FILE );
    if ( $self->GetRejectedRecordCt() == 0 )
    {
        unlink $REJECTED_REC_FILE;
    }
}
# ----------------------------------------------------------------------
# NAME : AddArchiveName
# PURPOSE : Append one short-name/long-name pair to the object's list of
#           archive names
# CALLS :
# INPUT : short name, long name
# RETURNS : result of push (the new length of the list)
# GLOBALS :
# SIDE-EFFECTS : grows $self->{'archivenames'}
# NOTES : each entry is a hash ref with the keys 'long' and 'short'
# ----------------------------------------------------------------------
sub AddArchiveName
{
    my ( $self, $shortName, $longName ) = @_;
    my $entry = { 'long' => $longName, 'short' => $shortName };
    push( @{ $self->{'archivenames'} }, $entry );
}
# Number of entries currently on the archive-name list.
sub GetArchiveNamesCt
{
    my ( $self ) = @_;
    my $names = $self->{'archivenames'};
    return scalar( @{ $names } );
}
# Short name of the archive-name entry at index $i.
# Dies via ASSERT when there is no entry at that index.
sub GetArchiveShortNameAt
{
    my ( $self, $i ) = @_;
    my $entry = $self->{'archivenames'}[$i];
    &::ASSERT( $entry, "Archive name at $i not defined" );
    return $entry->{'short'};
}
# Long name of the archive-name entry at index $i.
# Dies via ASSERT when there is no entry at that index.
sub GetArchiveLongNameAt
{
    my ( $self, $i ) = @_;
    my $entry = $self->{'archivenames'}[$i];
    &::ASSERT( $entry, "Archive name at $i not defined" );
    return $entry->{'long'};
}
# sort() comparator: orders two archive-name entries ($a, $b) by their
# long names, ignoring case.
sub SAN_Helper
{
    my $left = lc( $a->{'long'} );
    my $right = lc( $b->{'long'} );
    return $left cmp $right;
}
# Replace the archive-name list with a copy sorted by long name
# (case-insensitive, see SAN_Helper).
sub SortArchiveNames
{
    my ( $self ) = @_;
    $self->{'archivenames'} = [ sort SAN_Helper @{ $self->{'archivenames'} } ];
}
# Write the file named by $REPOSITORY_NAMES: Perl source that assigns
# @shortnames (short names ordered by long name) and %longnames
# (short name => long name) from the archive-name list.
# INPUT : none besides the object
# SIDE-EFFECTS : sorts the archive-name list; creates/truncates the file;
#                prints progress to STDOUT
# Dies via ASSERT when the file cannot be opened or closed.
sub GeneratePerlRepositoryNameHashFile
{
    my $self = shift;

    print( "Generating $REPOSITORY_NAMES from ... " );
    my $out;
    &::ASSERT( open( $out, '>', $REPOSITORY_NAMES ),
               qq{Unable to open file "$REPOSITORY_NAMES" for write: $!} );

    $self->SortArchiveNames();
    my $archiveNamesCounter = $self->GetArchiveNamesCt();

    # Names are written inside double-quoted Perl strings, so backslash,
    # quote and the interpolating sigils must be escaped or the generated
    # file would not evaluate back to the same text.
    my $escape = sub
    {
        my $text = shift;
        $text =~ s/([\\"\$\@])/\\$1/g;
        return $text;
    };

    print( {$out} qq{\@shortnames = \(\n} );
    for ( my $i = 0; $i < $archiveNamesCounter; $i++ )
    {
        my $shortName = $escape->( $self->GetArchiveShortNameAt( $i ) );
        print( {$out} qq{"$shortName",\n} );
    }
    print( {$out} qq{\);\n\n} );

    print( {$out} qq{\%longnames = \(\n} );
    for ( my $i = 0; $i < $archiveNamesCounter; $i++ )
    {
        my $rawShortName = $self->GetArchiveShortNameAt( $i );
        my $shortName = $escape->( $rawShortName );
        my $longName = $escape->( $self->GetArchiveLongNameAt( $i ) );
        print( {$out} qq{"$shortName" => "$longName",\n} );
        # Progress display on STDOUT
        print( qq{ $rawShortName } );
    }
    print( {$out} qq{\);\n\n} );

    # Buffered write errors only show up at close time
    &::ASSERT( close( $out ),
               qq{Problem closing file "$REPOSITORY_NAMES": $!} );
    print( " ... done.\n" );
}
# Open a new raw XML output file on the shared RAWXMLFILESTREAM handle and
# write its opening lines.
# INPUT : $newFile - path of the file to create (opened with ">", so an
#         existing file is truncated)
# Dies via ASSERT when the file cannot be opened or a write fails.
sub InitRawXmlFile
{
my $self = shift;
my $newFile = shift;
print( qq{\t\tnew XML file: $newFile\n} );
&::ASSERT( open( RAWXMLFILESTREAM, ">$newFile" ),
qq{Unable to open new raw XML file "$newFile" for write: $!} );
my $msg = q{Problem with file write: };
# NOTE(review): the two string literals in the statement below look
# truncated -- the second one no longer closes, and the start of a method
# call on $self appears to be missing.  Presumably they held an XML
# declaration and an opening wrapper element naming the current archive.
# TODO: restore from the original script.
&::ASSERT(
( print( RAWXMLFILESTREAM
qq{\n\n} ) &&
print( RAWXMLFILESTREAM
qq{GetCurrentArchive() . qq{\">\n} ) ),
qq{$msg$!} );
}
# Write the final line of the current raw XML file and close the shared
# RAWXMLFILESTREAM handle.  Dies via ASSERT when the write fails.
sub FinishUpWithOutputFile
{
    my ( $self ) = @_;

    # NOTE(review): only a newline is written here; presumably a closing
    # wrapper element was part of this literal originally -- TODO confirm.
    my $written = print( RAWXMLFILESTREAM qq{\n} );
    &::ASSERT( $written, q{Problem with file write: } . $! );
    close( RAWXMLFILESTREAM );
}
# Re-encode a file in place from $encoding to UTF-8.
# INPUT : $encoding - source encoding name understood by Encode
#         $name     - path of the file to rewrite
# SIDE-EFFECTS : overwrites $name; prints a progress line to STDOUT (the
#                line names ISO-8859-1 whatever $encoding is)
# Dies via ASSERT when the file cannot be read, is empty, or cannot be
# written back.
# NOTE: called as a plain function, not as a method.
sub ConvertFileFrom
{
    my ( $encoding, $name ) = @_;

    print qq{===> Attempting conversion from ISO-8859-1 to UTF-8 on $name\n};

    my $in;
    &::ASSERT( open( $in, '<', $name ),
               qq{ConvertFileFrom: Unable to open file "$name" for reading: $!} );
    # Undefined record separator: read the whole file in one go
    local $/;
    my $xml = <$in>;
    &::ASSERT( $xml, qq{ConvertFileFrom: Input XML file $name empty or I/O error reading: $!} );
    close( $in );

    Encode::from_to( $xml, $encoding, "utf8" );

    # The original content is replaced, so a failed or short write must
    # not pass unnoticed.
    my $out;
    &::ASSERT( open( $out, '>', $name ),
               qq{ConvertFileFrom: Unable to open file "$name" for write: $!} );
    &::ASSERT( print( {$out} $xml ),
               qq{ConvertFileFrom: Problem writing file "$name": $!} );
    &::ASSERT( close( $out ),
               qq{ConvertFileFrom: Problem closing file "$name": $!} );
}
# Transform every raw XML file written for the current archive (file
# numbers 0 through GetFileNo()) with the $OAI2BIB_XSL_FILE stylesheet and
# collect the output in one BibClass XML file.
# INPUT : $institutionBibXmlFileName - path of the output file to create
#         $collectionName - accepted but not referenced in this body
# SIDE-EFFECTS : creates/truncates the output file; a raw file that fails
#                to transform is rewritten in place by ConvertFileFrom and
#                tried once more; prints progress to STDOUT
# Dies via ASSERT when the output file cannot be opened or written.
sub TranslateToBib
{
my $self = shift;
my ( $institutionBibXmlFileName, $collectionName ) = @_;
&::ASSERT( open( OUTXMLFILESTREAM, ">$institutionBibXmlFileName" ),
qq{Unable to open new raw XML file "$institutionBibXmlFileName" for write: $!} );
my $msg = q{Problem with file write: };
# NOTE(review): the literals printed here (a lone newline and an empty
# string) look truncated; presumably they held an XML declaration and the
# opening tag of a wrapper element.  TODO: restore from the original.
&::ASSERT(
( print( OUTXMLFILESTREAM qq{\n} ) &&
print( OUTXMLFILESTREAM qq{} ) ),
qq{$msg$!} );
# One pass per raw file produced for this archive
for ( my $x = 0; $x <= $self->GetFileNo(); $x++ )
{
my $name = qq{$OAI_BIB_FILES_ROOT/} . $self->GetCurrentArchive() . q{_} . $x . q{_raw.xml};
print( qq{\ttransform: $name => $institutionBibXmlFileName ...\n} );
# Transform OAI XML to BibClass DTD
if ( ! $self->MyTransformer()->OaiToBib( $name, $OAI2BIB_XSL_FILE, \*OUTXMLFILESTREAM ) )
{
# Try a conversion: re-encode the raw file as UTF-8, treating its
# content as ISO-8859-1, then transform once more
OAITransform::ConvertFileFrom( "iso-8859-1", $name );
if ( ! $self->MyTransformer()->OaiToBib( $name, $OAI2BIB_XSL_FILE, \*OUTXMLFILESTREAM ) )
{
# Second failure: count the raw file as failed and move on
$self->IncrNumFailedRawFiles();
print qq{===> Parse of $name after conversion ** FAILED **\n};
}
else
{
print qq{===> Parse of $name after conversion ** SUCCESSFUL **\n};
}
}
}
print( qq{done with translation for archive: } . $self->GetCurrentArchive() . qq{\n} );
# NOTE(review): the empty literal below presumably held the closing tag of
# the wrapper element -- TODO confirm.
&::ASSERT(
print( OUTXMLFILESTREAM qq{} ), qq{$msg$!} );
close( OUTXMLFILESTREAM );
}
#
#
# ----------------------------------------------------------------------
# NAME : DescendAndProcessFiles
# PURPOSE : Walk a directory tree, handing every .xml file to ParseXML
#           and counting every .del file as a deleted record
# CALLS : ReadXmlFile, ParseXML, FinishUpWithOutputFile, InitRawXmlFile,
#         IncrDeletedRecordsCt, and itself for sub-directories
# INPUT : path of the directory to process
# RETURNS : nothing meaningful
# GLOBALS : $maxXmlFiles, $OAI_BIB_FILES_ROOT, the DIR directory handle
# SIDE-EFFECTS : closes the current raw XML file and starts the next
#                numbered one each time $maxXmlFiles files have been read
#                into it; prints progress to STDOUT
# NOTES : entries whose names begin with "." are ignored; files with any
#         other extension are reported and skipped
# ----------------------------------------------------------------------
sub DescendAndProcessFiles
{
    my ( $self, $dir ) = @_;

    print( qq{\tprocessing files in $dir\n} );
    &::ASSERT( ( -d $dir ), qq{Argument to DescendAndProcessFiles not a directory: $dir} );

    # Read the whole listing first so the handle is closed before recursing
    &::ASSERT( opendir( DIR, $dir ), qq{Unable to opendir $dir: $!} );
    my @entries = grep { ( ! /^\./ ) } readdir( DIR );
    closedir( DIR );

    foreach my $entry ( @entries )
    {
        my $path = "$dir/$entry";

        if ( -d $path )
        {
            # Recurse
            $self->DescendAndProcessFiles( $path );
            next;
        }

        if ( $path =~ m,\.del$, )
        {
            $self->IncrDeletedRecordsCt();
            next;
        }

        if ( $path !~ m,\.xml$, )
        {
            print qq{UNKNOWN extension. Skipping file: $path\n};
            next;
        }

        # An .xml record
        my $filesSoFar = $self->GetCountOfXMLFiles();
        my $recordRef = $self->ReadXmlFile( $path );
        if ( $recordRef )
        {
            $self->ParseXML( $recordRef, $path );
            $filesSoFar++;
            $self->SetCountOfXMLFiles( $filesSoFar );
        }

        # Current raw file is full: close it and open the next one
        if ( $filesSoFar >= $maxXmlFiles )
        {
            $self->FinishUpWithOutputFile();
            $self->SetFileNo( $self->GetFileNo() + 1 );
            $self->InitRawXmlFile( "$OAI_BIB_FILES_ROOT/" . $self->GetCurrentArchive() . "_" . $self->GetFileNo() . "_raw.xml" );
            $self->SetCountOfXMLFiles( 0 );
        }
    }
}
# Read a whole file into memory.
# INPUT : $file - path of the file to read
# RETURNS : reference to a scalar holding the file content
# Dies via ASSERT when the file cannot be opened.
sub ReadXmlFile
{
    my $self = shift;
    my $file = shift;

    # Three-argument open: harvested file names are outside our control
    # and must never be parsed for a mode or a pipe.
    my $in;
    &::ASSERT( open( $in, '<', $file ), qq{Unable to open $file: $file ($!)} );
    # Undefined record separator: one read returns the entire file
    local $/;
    my $buf = <$in>;
    close( $in );

    return \$buf;
}
# Look up the repository name registered for a repository id.
# INPUT : $key - repository id
# RETURNS : the name held in the repository lookup table
# Dies via ASSERT when the id has no (true) entry in the table.
sub GetRepositoryName
{
    my ( $self, $key ) = @_;
    my $name = $repositoryNames{$key};
    &::ASSERT( $name, qq{No repository name for repository id: $key} );
    return $name;
}
# Load the repository lookup table file into %repositoryNames.
# The file content is split on "=" and newline and the resulting fields
# are taken pairwise as key => value.
# Dies via ASSERT when the file cannot be opened.
sub LoadRepositoryLookTable
{
    my $self = shift;

    my $table;
    &::ASSERT( open( $table, '<', $REPOSITORY_LOOKUP_TABLE ),
               qq{No repository identifier file named }
               . $REPOSITORY_LOOKUP_TABLE
               . qq{ was found in the current working directory: $!} );
    # Undefined record separator: read the whole table at once
    local $/;
    my $buf = <$table>;
    # NOTE(review): a blank line or an "=" inside a name would shift the
    # key/value pairing -- the table is assumed to be well formed.
    %repositoryNames = split( "=|\n", $buf );
    close( $table );
}
# Load the normalization table file into
# %OAITransform::normalizationValues and compile
# $OAITransform::normalizationRE, a case-insensitive alternation of the
# table's keys.
# The file content is split on runs of tabs and on newlines and the
# resulting fields are taken pairwise as key => value.
# Dies via ASSERT when the file cannot be opened.
sub LoadNormalizationTable
{
    my $self = shift;

    my $table;
    &::ASSERT( open( $table, '<', $NORMALIZATION_TABLE ),
               qq{No normalization file named }
               . $NORMALIZATION_TABLE
               . qq{ was found in the current working directory: $!} );
    # Undefined record separator: read the whole table at once
    local $/;
    my $buf = <$table>;
    close( $table );

    %OAITransform::normalizationValues = split( "\t+|\n", $buf );

    # Hash order changes from run to run, and an alternation takes the
    # first alternative that matches.  Listing longer keys first makes the
    # result repeatable and lets the most specific key win.  The matched
    # text is later used as a hash key, so keys are literal strings and
    # are escaped as such.
    my @alternatives =
        map  { quotemeta( $_ ) }
        sort { length( $b ) <=> length( $a ) || $a cmp $b }
        keys( %OAITransform::normalizationValues );
    my $pattern = join( '|', @alternatives );
    $OAITransform::normalizationRE = qr/$pattern/i;
}
# ----------------------------------------------------------------------
# NAME : ReportNormalizationFailures
# PURPOSE : Append a per-archive section to the normalization error
#           report, one line per distinct type value that failed
# CALLS : GetReposNormFailCt
# INPUT : archive (repository directory) name used in the section title
# RETURNS : nothing meaningful
# GLOBALS : the NORM_FAIL_FILE file handle
# SIDE-EFFECTS : writes to NORM_FAIL_FILE
# NOTES : writes nothing when the archive had no normalization failures
# ----------------------------------------------------------------------
sub ReportNormalizationFailures
{
    my ( $self, $dir ) = @_;

    if ( $self->GetReposNormFailCt() == 0 )
    {
        return;
    }

    print( NORM_FAIL_FILE
           qq{--------------------------------------------------------------\n}
           . qq{Normalization Type Error Report for archive: $dir }
           . qq{\n--------------------------------------------------------------\n} );

    # Failed type value => number of times it was seen
    my $failures = $self->{'normalization_failures'};
    foreach my $nfType ( keys %{ $failures } )
    {
        my $occurrences = $failures->{$nfType};
        print( NORM_FAIL_FILE qq{\t\ttype error: "$nfType", ($occurrences) occurrences\n} );
    }
}
# ----------------------------------------------------------------------
# NAME : ReportRejectedRecord
# PURPOSE : Log the file name of a rejected record
# CALLS :
# INPUT : name of the XML file holding the rejected record
# RETURNS : result of print
# GLOBALS : the REJECTED_REC_FILE file handle
# SIDE-EFFECTS : writes one line to REJECTED_REC_FILE
# NOTES :
# ----------------------------------------------------------------------
sub ReportRejectedRecord
{
    my ( $self, $xmlFilename ) = @_;
    print( REJECTED_REC_FILE qq{Rejected record file = $xmlFilename\n} );
}
# ----------------------------------------------------------------------
# NAME : NormHelper
# PURPOSE : Map a type value onto its normalized form
# CALLS : IncrSessionNormFailCt
# INPUT : the type value to normalize
# RETURNS : the table value for the first part of the input matched by
#           $OAITransform::normalizationRE, or the marker string
#           "NORMALIZATIONFAILURE" when nothing matches
# GLOBALS : $OAITransform::normalizationRE,
#           %OAITransform::normalizationValues
# SIDE-EFFECTS : on failure, counts the value in
#                $self->{'normalization_failures'} and bumps the
#                failure counters
# NOTES : the matched text is lower-cased before the table lookup
# ----------------------------------------------------------------------
sub NormHelper
{
    my ( $self, $type ) = @_;

    my @matches = ( $type =~ m,$OAITransform::normalizationRE,g );
    unless ( @matches )
    {
        # Failed. No normalization
        $self->{'normalization_failures'}{$type}++;
        $self->IncrSessionNormFailCt();
        return "NORMALIZATIONFAILURE";
    }

    my $key = lc( $matches[0] );
    return $OAITransform::normalizationValues{$key};
}
# Simplify a record's XML in place so the later pattern matching can work
# on bare element names.
# INPUT : $bibRecordRef - reference to the scalar holding the record text
# RETURNS : result of the last substitution (not meaningful to callers)
# SIDE-EFFECTS : modifies the referenced text
sub CleanXML
{
    my $self = shift;
    my $bibRecordRef = shift;

    # NOTE: Careful. Order matters here

    # XML element name
    my $pat = qr/[a-zA-Z_:][a-zA-Z0-9.:_-]*/;

    # get rid of the dc:, oai_dc: namespaces.  The first group captures
    # the tag opener ("<" or "</") so it survives the substitution.
    $$bibRecordRef =~ s,(</?)(?:dc:|oai_dc:),$1,g;

    # turn oaidc element into the dc element
    $$bibRecordRef =~ s,(</?)oaidc,${1}dc,g;

    # elements without content not wanted.  \1 refers to the element name
    # captured in this same pattern; an interpolated $1 would be whatever
    # an earlier match left behind.
    $$bibRecordRef =~ s,<($pat)[^>]*?>\s*</\1>,,gs;

    # remove singletons
    $$bibRecordRef =~ s,<$pat[^>]*/>,,gs;

    # remove attributes from what remains
    $$bibRecordRef =~ s,<($pat)[^>]*?>,<$1>,gs;

    # turn url into identifier
    $$bibRecordRef =~ s,(</?)url,${1}identifier,g;
}
# Screen one harvested record and, when it qualifies, append a conditioned
# copy of it to the current raw XML file.
# INPUT : $bibRecordRef - reference to the record's XML text (modified in
#                         place)
#         $xmlFileName  - file the record came from; used when a rejection
#                         is reported
# RETURNS : nothing meaningful; returns early for rejected records and for
#           records without a URL
# SIDE-EFFECTS : updates the session/repository counters; writes to
#                RAWXMLFILESTREAM; may write to the rejected-record report
# NOTE(review): several patterns and string literals in this body look
# truncated (element names and tags are missing).  The comments below
# describe the apparent intent only -- TODO: restore the literals from the
# original script.
sub ParseXML
{
my $self = shift;
my ( $bibRecordRef, $xmlFileName ) = @_;
# Count me in the session and this repository
$self->IncrSessionRecordCt();
$self->IncrRepositoryRecordCt();
# Initial cleanup. Mainly getting rid of namespaces
$self->CleanXML( $bibRecordRef );
# Pull out the two parts of the record that are kept.  Presumably these
# patterns captured the metadata element and the header element.
my ( $metadataElement ) = ( $$bibRecordRef =~ m,(]*>.*?),os );
my ( $headerElement ) = ( $$bibRecordRef =~ m,(),os );
# Check for too many elements in the record (Perl hack to count matches:
# assigning the match list to () in scalar context yields the number of
# matches).  Presumably title, creator/contributor and subject elements.
my $titleCount = () = $metadataElement =~ m,]*>.*?,gos;
my $authorCount = () = $metadataElement =~ m,<(creator|contributor)[^>]*>.*?\1>,gos;
my $subjectCount = () = $metadataElement =~ m,]*>.*?,gos;
if ( $titleCount > $maxNumOfTitleElements ||
$subjectCount > $maxNumOfSubjectElements ||
$authorCount > $maxNumOfAuthorElements )
{
print( qq{record rejected: title count = $titleCount, author count = $authorCount, subject count = $subjectCount\n} );
$self->IncrRejectedRecordCt();
$self->ReportRejectedRecord( $xmlFileName );
return;
}
# Check the record has a child element whose content starts with an
# http, https or ftp URL (presumably an identifier element).
my $hasUrl = ( $metadataElement =~ m,]*>(https?|ftp)://.*?,os );
if ( ! $hasUrl )
{
$self->IncrXmlWithoutUrlsCt();
return;
}
# Replace the other matched elements (presumably identifiers) that do not
# contain a URL so the XSL will not wrap them with tags.  Presumably
# they were changed into a pseudo-DC element to be transformed to
# bibclass by the stylesheet; as the code stands, the element is replaced
# by its bare content.
my @identifiers = ( $metadataElement =~ m,(]*>.*?),gos );
foreach my $id ( @identifiers )
{
if ( $id !~ m,(https?|ftp)://,os )
{
my ( $content ) = $id =~ m,]*>(.*?),os;
# \Q...\E: the element text is matched literally, not as a pattern
$metadataElement =~ s,\Q$id\E,$content,s;
}
}
# Count me as a record containing a URL, possibly with
# children that do not contain URLs
$self->IncrXmlWithUrlsCt();
# -----------------------------------------
# The record has a URL and was not rejected
# Condition the record: currently only the header and metadata parts
# captured above are of interest, so a record containing only these is
# synthesized.  In the process the XML declaration PI and the xmlns
# attribute are dropped as well.
$$bibRecordRef = qq{\n$headerElement\n$metadataElement\n};
# Insert non-dc-compliant institution element for UI purposes in BibClass
# (the long name of the current archive)
my $inst = q{} . $self->GetCurrentArchiveLong() . q{};
$$bibRecordRef =~ s,,\n$inst,s;
# Normalize the case-insensitive element content (presumably the type
# element) via NormHelper and add the normalized value after it.
# Record failures.
$$bibRecordRef =~ s|]*>(.*?)|"$1\n" . $self->NormHelper( $1 ) . ""|ges;
# Remove normalization failures, if any (the marker NormHelper returns)
$$bibRecordRef =~ s,\nNORMALIZATIONFAILURE,,gs;
# Emit the record
print( RAWXMLFILESTREAM "$$bibRecordRef\n" );
}
# The Transformer object created for this OAITransform in new().
sub MyTransformer
{
    my ( $self ) = @_;
    return $self->{'transformer'};
}
# Switch the object into "generate the Perl repository-name file" mode.
sub SetGeneratingPerl
{
    my ( $self ) = @_;
    return $self->{'generateperl'} = 1;
}
# True when SetGeneratingPerl() has been called on this object.
sub GeneratingPerl
{
    my ( $self ) = @_;
    return $self->{'generateperl'};
}
# Make $arg the current archive and remember its long name as well.
# Dies (in GetRepositoryName) when the archive id has no registered name.
sub SetCurrentArchive
{
    my ( $self, $arg ) = @_;
    $self->{'currentarchive'} = $arg;
    my $longName = $self->GetRepositoryName( $arg );
    return $self->{'currentarchivelongname'} = $longName;
}
# Id of the current archive.  Dies via ASSERT when none has been set.
sub GetCurrentArchive
{
    my ( $self ) = @_;
    my $archive = $self->{'currentarchive'};
    &::ASSERT( $archive, qq{Current archive not set} );
    return $self->{'currentarchive'};
}
# Long name of the current archive.  Dies via ASSERT when none is set.
sub GetCurrentArchiveLong
{
    my ( $self ) = @_;
    my $longName = $self->{'currentarchivelongname'};
    &::ASSERT( $longName, qq{Current archive long name not set} );
    return $self->{'currentarchivelongname'};
}
# Store the number of XML files read into the current raw file so far.
sub SetCountOfXMLFiles
{
    my ( $self, $count ) = @_;
    return $self->{'countofxmlfiles'} = $count;
}
# Number of XML files read into the current raw file so far.
sub GetCountOfXMLFiles
{
    my ( $self ) = @_;
    return $self->{'countofxmlfiles'};
}
# Store the number of the raw XML file being written and return it.
sub SetFileNo
{
    my ( $self, $fn ) = @_;
    $self->{'fileno'} = $fn;
    return $fn;
}
# Number of the raw XML file being written.
sub GetFileNo
{
    my ( $self ) = @_;
    return $self->{'fileno'};
}
# Zero the count of records that contained a URL.
sub ResetXmlWithUrlsCt
{
    my ( $self ) = @_;
    return $self->{'xmlwithurls'} = 0;
}
# Add one to the count of records that contained a URL.
# Returns the value the counter had before the increment.
sub IncrXmlWithUrlsCt
{
    my ( $self ) = @_;
    return $self->{'xmlwithurls'}++;
}
# Count of records that contained a URL.
sub GetXmlWithUrlsCt
{
    my ( $self ) = @_;
    return $self->{'xmlwithurls'};
}
# Zero the count of records that contained no URL.
sub ResetXmlWithoutUrlsCt
{
    my ( $self ) = @_;
    return $self->{'xmlwithouturls'} = 0;
}
# Add one to the count of records that contained no URL.
# Returns the value the counter had before the increment.
sub IncrXmlWithoutUrlsCt
{
    my ( $self ) = @_;
    return $self->{'xmlwithouturls'}++;
}
# Count of records that contained no URL.
sub GetXmlWithoutUrlsCt
{
    my ( $self ) = @_;
    return $self->{'xmlwithouturls'};
}
# Add one to the number of records seen in the whole session.
# Returns the value the counter had before the increment.
sub IncrSessionRecordCt
{
    my ( $self ) = @_;
    return $self->{'totalsessionrecords'}++;
}
# Number of records seen in the whole session.
sub GetSessionRecordCt
{
    my ( $self ) = @_;
    return $self->{'totalsessionrecords'};
}
# Add one to the number of records seen in the current repository.
# Returns the value the counter had before the increment.
sub IncrRepositoryRecordCt
{
    my ( $self ) = @_;
    return $self->{'totalrepositoryrecords'}++;
}
# Zero the number of records seen in the current repository.
sub ResetRepositoryRecordCt
{
    my ( $self ) = @_;
    return $self->{'totalrepositoryrecords'} = 0;
}
# Number of records seen in the current repository.
sub GetRepositoryRecordCt
{
    my ( $self ) = @_;
    return $self->{'totalrepositoryrecords'};
}
# Add one to the number of rejected records.
# Returns the value the counter had before the increment.
sub IncrRejectedRecordCt
{
    my ( $self ) = @_;
    return $self->{'xmltoobigsorejected'}++;
}
# Number of rejected records.
sub GetRejectedRecordCt
{
    my ( $self ) = @_;
    return $self->{'xmltoobigsorejected'};
}
# Count one normalization failure in both the session total and the
# current repository's total.
# Returns the repository counter's value before the increment.
sub IncrSessionNormFailCt
{
    my ( $self ) = @_;
    $self->{'normfailcounttot'}++;
    return $self->{'reposnormfailcount'}++;
}
# Number of normalization failures in the whole session.
sub GetSessionNormFailCt
{
    my ( $self ) = @_;
    return $self->{'normfailcounttot'};
}
# Add one to the current repository's normalization failure count only.
# Returns the value the counter had before the increment.
sub IncrReposNormFailCt
{
    my ( $self ) = @_;
    return $self->{'reposnormfailcount'}++;
}
# Number of normalization failures in the current repository.
sub GetReposNormFailCt
{
    my ( $self ) = @_;
    return $self->{'reposnormfailcount'};
}
# Zero the current repository's normalization failure count and forget
# the failed type values collected for it.
sub ResetReposNormFailCt
{
    my ( $self ) = @_;
    $self->{'reposnormfailcount'} = 0;
    return $self->{'normalization_failures'} = undef;
}
# Add one to the number of raw files that failed to transform.
# Returns the value the counter had before the increment.
sub IncrNumFailedRawFiles
{
    my ( $self ) = @_;
    return $self->{'numfailedrawfiles'}++;
}
# Zero the number of raw files that failed to transform.
sub ResetNumFailedRawFiles
{
    my ( $self ) = @_;
    return $self->{'numfailedrawfiles'} = 0;
}
# Number of raw files that failed to transform.
sub GetNumFailedRawFiles
{
    my ( $self ) = @_;
    return $self->{'numfailedrawfiles'};
}
# Zero the count of deleted (.del) records.
sub ResetDeletedRecordsCt
{
    my ( $self ) = @_;
    return $self->{'deletedrecords'} = 0;
}
# Add one to the count of deleted (.del) records.
# Returns the value the counter had before the increment.
sub IncrDeletedRecordsCt
{
    my ( $self ) = @_;
    return $self->{'deletedrecords'}++;
}
# Count of deleted (.del) records.
sub GetDeletedRecordsCt
{
    my ( $self ) = @_;
    return $self->{'deletedrecords'};
}
# Print the report for one repository, remember its URL counts for the
# session report, write its normalization failures to the error report and
# reset the per-repository counters.
# INPUT : $dir - repository directory name, used as the report title
# SIDE-EFFECTS : prints to STDOUT; appends an ArchiveInfo object to
#                @OAITransform::sessionInfo; may write to NORM_FAIL_FILE
# NOTE(review): the rejected-record count is not reset here, so the value
# printed is the running total for the session -- TODO confirm intended.
sub ProcessOneRepositoryResults
{
    my $self = shift;
    my $dir = shift;

    my $withURL = $self->GetXmlWithUrlsCt();
    my $withoutURL = $self->GetXmlWithoutUrlsCt();
    my $rejected = $self->GetRejectedRecordCt();
    my $total = $self->GetRepositoryRecordCt();

    # A repository with no records at all (empty, or .del files only)
    # would otherwise abort the whole session with a division by zero.
    my $successRate = $total ? ( $withURL / $total ) * 100 : 0;
    # Keep at most two decimal places (truncated, not rounded)
    $successRate =~ s,(.*?\.\d\d?)\d*,$1,;

    print( qq{\nRepository Report: $dir\n}
           . qq{\trecords with URLs = $withURL\n}
           . qq{\trecords without URLs = $withoutURL\n}
           . qq{\trepository records = $total\n}
           . qq{\tsuccess rate = $successRate%\n}
           . qq{\t------------------------\n}
           . qq{\trecords rejected = $rejected\n}
           . qq{\tdeleted records (.del) = } . $self->GetDeletedRecordsCt() . qq{\n}
           . qq{\tnormalization errors = } . $self->GetReposNormFailCt() . qq{\n}
           . qq{\traw parse failures = } . $self->GetNumFailedRawFiles() . qq{\n}
         );

    my $ai = ArchiveInfo->new( $dir, $withURL, $withoutURL );
    push( @OAITransform::sessionInfo, $ai );

    # Must run before the counters it reports on are reset below
    $self->ReportNormalizationFailures( $dir );

    $self->ResetReposNormFailCt();
    $self->ResetNumFailedRawFiles();
    $self->ResetRepositoryRecordCt();
    $self->ResetXmlWithUrlsCt();
    $self->ResetXmlWithoutUrlsCt();
    $self->ResetDeletedRecordsCt();
}
# Print the session report: the URL counts remembered for every repository
# processed, followed by the total number of records seen.
sub PrintSessionResults
{
    my ( $self ) = @_;

    print( qq{\nSession Report\n} );
    for my $info ( @OAITransform::sessionInfo )
    {
        # GetArchiveInfoData returns name, no-URL count, with-URL count
        my ( $name, $noUrlCt, $withUrlCt ) = $info->GetArchiveInfoData();
        print( qq{\trepository: $name\n\t\telements with URLs = $withUrlCt\n\t\telements without URLs = $noUrlCt\n} );
    }

    my $total = $self->GetSessionRecordCt();
    print( qq{\n\ntotal session records = $total\n\n} );
    print( qq{Program end.\n} );
}
# ------------------------------------------------------------
#
# XSLT transformation of raw OAI XML to BibClass DTD XML
#
# ------------------------------------------------------------
package Transformer;
use XML::LibXML;
use XML::LibXSLT;
# Constructor: creates the XML parser and the XSLT processor this object
# works with.  The compiled stylesheet slot starts out empty and is filled
# on first use by GetStyleSheet().
sub new
{
    my $class = shift;
    my $self = {};
    bless $self, $class;

    # Direct method calls instead of the indirect "new Class" form, whose
    # parsing depends on what else is in scope.
    $self->{'xmlparser'} = XML::LibXML->new();
    $self->{'xslttransformer'} = XML::LibXSLT->new();
    $self->{'compiledstylesheet'} = undef;

    return $self;
}
# ----------------------------------------------------------------------
# NAME : OaiToBib
# PURPOSE : Transform one XML file with an XSL stylesheet and write the
#           result to an already open file handle
# CALLS : XsltTransformData
# INPUT : input XML file name, XSL file name, output file handle
# RETURNS : 1 when the transform succeeded and its output was written,
#           0 when the parse or the transform failed
# GLOBALS :
# SIDE-EFFECTS : prints the failure reason to STDOUT; writes to the
#                output handle on success only
# NOTES : dies via ASSERT when the input file cannot be opened or is
#         empty; errors raised inside XsltTransformData are caught here
# ----------------------------------------------------------------------
sub OaiToBib
{
    my $self = shift;
    my ( $inXmlFileName, $xslFileName, $outXmlFileHandle ) = @_;

    my $in;
    &::ASSERT( open( $in, '<', $inXmlFileName ),
               qq{OaiToBib: Unable to open file "$inXmlFileName" for reading: $!} );
    # Undefined record separator: read the whole file in one go
    local $/;
    my $xml = <$in>;
    &::ASSERT( $xml, qq{OaiToBib: Input XML file $inXmlFileName empty or I/O error reading: $!} );
    close( $in );

    my $xformedXmlRef;
    eval
    {
        $xformedXmlRef = $self->XsltTransformData( $inXmlFileName, \$xml, $xslFileName );
    };
    if ( $@ )
    {
        print( qq{\ttransform ** FAILED **\n\tReason: $@\n} );
        return 0;
    }

    print {$outXmlFileHandle} $$xformedXmlRef;
    return 1;
}
# ----------------------------------------------------------------------
# NAME : GetStyleSheet
# PURPOSE : Return the compiled stylesheet, compiling it on first use
# CALLS : XML::LibXML parse_file, XML::LibXSLT parse_stylesheet
# INPUT : XSL file name
# RETURNS : the compiled stylesheet object
# GLOBALS :
# SIDE-EFFECTS : stores the compiled stylesheet in the object
# NOTES : once a stylesheet is stored it is returned for every later
#         call, whatever file name is passed; dies via ASSERT when the
#         XSL file cannot be parsed or compiled
# ----------------------------------------------------------------------
sub GetStyleSheet
{
    my ( $self, $xslFileName ) = @_;

    my $cached = $self->{'compiledstylesheet'};
    return $cached if $cached;

    # Parse the XSL file into a document tree
    my $xslDocument;
    eval { $xslDocument = $self->{'xmlparser'}->parse_file( $xslFileName ); };
    &::ASSERT( ! $@, qq{Error parsing XSL file: $xslFileName. Parser details: $@} );

    # Compile the tree into a stylesheet
    my $stylesheet;
    eval { $stylesheet = $self->{'xslttransformer'}->parse_stylesheet( $xslDocument ); };
    &::ASSERT( ! $@, qq{Error processing XML tree for stylesheet: $xslFileName. Details: $@} );

    $self->{'compiledstylesheet'} = $stylesheet;
    return $stylesheet;
}
# ----------------------------------------------------------------------
# NAME : XsltTransformData
# PURPOSE : Does a XSLT transform of data
# CALLS : XSLT and XML libraries, GetStyleSheet
# INPUT : input file name (used in messages only), reference to the XML
#         text, XSL file name
# RETURNS : reference to the transformed output string
# NOTES : dies via ASSERT when parsing, transforming or serializing
#         fails
# ----------------------------------------------------------------------
sub XsltTransformData
{
    my ( $self, $inXmlFileName, $xmlRef, $xslFileName ) = @_;

    # Step 1: parse the XML text into a document tree
    my $document;
    eval { $document = $self->{'xmlparser'}->parse_string( $$xmlRef ); };
    &::ASSERT( ! $@, qq{Error parsing input XML file: $inXmlFileName. Parser details: $@} );

    # Step 2: apply the (cached) stylesheet
    my $stylesheet = $self->GetStyleSheet( $xslFileName );
    my $transformed;
    eval { $transformed = $stylesheet->transform( $document ); };
    &::ASSERT( ! $@, qq{Error transforming XML tree from $inXmlFileName using stylesheet $xslFileName. Details: $@} );

    # Step 3: serialize the result
    my $output;
    eval { $output = $stylesheet->output_string( $transformed ); };
    &::ASSERT( ! $@, qq{Error on output request to XSLT object built from $xslFileName. Details: $@} );

    return \$output;
}
# ----------------------------------------------------------------------
# Private (ai) class that holds status information about the program run
# ----------------------------------------------------------------------
package ArchiveInfo;
# Constructor.
# INPUT : archive name, count of records with URLs, count of records
#         without URLs (in that order)
sub new
{
    my ( $class, $name, $withUrlCt, $noUrlCt ) = @_;
    my %info = (
        'name'      => $name,
        'nourlct'   => $noUrlCt,
        'withurlct' => $withUrlCt,
    );
    return bless \%info, $class;
}
# Returns the stored data as ( name, count without URLs, count with URLs ).
# Note that the two counts come back in the opposite order to the one the
# constructor takes them in.
sub GetArchiveInfoData
{
    my ( $self ) = @_;
    return @{ $self }{ 'name', 'nourlct', 'withurlct' };
}