###########################################################################
#
# MetadataXMLPlugin.pm --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 2006 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
# MetadataXMLPlugin process metadata.xml files in a collection
# Here's an example of a metadata file that uses three FileSet structures
# (ignore the # characters):
#
#
#
#
# nugget.*
#
# Nugget Point, The Catlins
# Nugget Point
#
#
#
# nugget-point-1.jpg
#
# Nugget Point Lighthouse, The Catlins
# Lighthouse
#
#
#
# kaka-point-dir
#
# Kaka Point, The Catlins
#
#
#
# Metadata elements are read and applied to files in the order they appear
# in the file.
#
# The FileName element describes the subfiles in the directory that the
# metadata applies to as a perl regular expression (a FileSet group may
# contain multiple FileName elements). So, nugget.*
# indicates that the metadata records in the following Description block
# apply to every subfile that starts with "nugget". For these files, a
# Title metadata element is set, overriding any old value that the Title
# might have had.
#
# Occasionally, we want to have multiple metadata values applied to a
# document; in this case we use the "mode=accumulate" attribute of the
# particular Metadata element. In the second metadata element of the first
# FileSet above, the "Place" metadata is accumulating, and may therefore be
# given several values. If we wanted to override these values and use a
# single metadata element again, we could set the mode attribute to
# "override" instead. Remember: every element is assumed to be in override
# mode unless you specify otherwise, so if you want to accumulate metadata
# for some field, every occurance must have "mode=accumulate" specified.
#
# The second FileSet element above applies to a specific file, called
# nugget-point-1.jpg. This element overrides the Title metadata set in the
# first FileSet, and adds a "Subject" metadata field.
#
# The third and final FileSet sets metadata for a subdirectory rather than
# a file. The metadata specified (a Title) will be passed into the
# subdirectory and applied to every file that occurs in the subdirectory
# (and to every subsubdirectory and its contents, and so on) unless the
# metadata is explictly overridden later in the import.
package MetadataXMLPlugin;
use strict;
no strict 'refs';
use BasePlugin;
use util;
use metadatautil;
sub BEGIN {
@MetadataXMLPlugin::ISA = ('BasePlugin');
unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
}
use XMLParser;
my $arguments = [
{ 'name' => "process_exp",
'desc' => "{BasePlugin.process_exp}",
'type' => "regexp",
'reqd' => "no",
'deft' => &get_default_process_exp() }
];
my $options = { 'name' => "MetadataXMLPlugin",
'desc' => "{MetadataXMLPlugin.desc}",
'abstract' => "no",
'inherits' => "yes",
'args' => $arguments };
my ($self);
sub new {
my ($class) = shift (@_);
my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
push(@$pluginlist, $class);
push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
push(@{$hashArgOptLists->{"OptList"}},$options);
$self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
if ($self->{'info_only'}) {
# don't worry about any options or initialisations etc
return bless $self, $class;
}
# create XML::Parser object for parsing metadata.xml files
my $parser;
if ($]<5.008) {
# Perl 5.6
$parser = new XML::Parser('Style' => 'Stream',
'Handlers' => {'Char' => \&Char,
'Doctype' => \&Doctype
});
}
else {
# Perl 5.8
$parser = new XML::Parser('Style' => 'Stream',
'ProtocolEncoding' => 'ISO-8859-1',
'Handlers' => {'Char' => \&Char,
'Doctype' => \&Doctype
});
}
$self->{'parser'} = $parser;
$self->{'in_filename'} = 0;
return bless $self, $class;
}
sub get_default_process_exp
{
return q^metadata\.xml$^;
}
sub metadata_read
{
my $self = shift (@_);
my ($pluginfo, $base_dir, $file, $block_hash,
$extrametakeys, $extrametadata,$extrametafile,
$processor, $maxdocs, $gli) = @_;
my $filename = &util::filename_cat($base_dir, $file);
if ($filename !~ /metadata\.xml$/ || !-f $filename) {
return undef;
}
$self->{'metadata-file'} = $file;
$self->{'metadata-filename'} = $filename;
my $outhandle = $self->{'outhandle'};
print STDERR "\n\n" if ($gli);
print $outhandle "MetadataXMLPlugin: processing $file\n" if ($self->{'verbosity'})> 1;
# add the file to the block list so that it won't be processed in read, as we will do all we can with it here
$block_hash->{'file_blocks'}->{$filename} = 1;
$self->{'metadataref'} = $extrametadata;
$self->{'metafileref'} = $extrametafile;
$self->{'metakeysref'} = $extrametakeys;
eval {
$self->{'parser'}->parsefile($filename);
};
if ($@) {
my $plugin_name = ref ($self);
print $outhandle "$plugin_name failed to process $file ($@)\n";
return -1; #error
}
return 1;
}
sub Doctype {
my ($expat, $name, $sysid, $pubid, $internal) = @_;
# allow the short-lived and badly named "GreenstoneDirectoryMetadata" files
# to be processed as well as the "DirectoryMetadata" files which should now
# be created by import.pl
die if ($name !~ /^(Greenstone)?DirectoryMetadata$/);
}
sub StartTag {
my ($expat, $element) = @_;
if ($element eq "FileSet") {
$self->{'saved_targets'} = [];
$self->{'saved_metadata'} = {};
}
elsif ($element eq "FileName") {
$self->{'in_filename'} = 1;
}
elsif ($element eq "Metadata") {
$self->{'metadata_name'} = $_{'name'};
$self->{'metadata_value'} = "";
if ((defined $_{'mode'}) && ($_{'mode'} eq "accumulate")) {
$self->{'metadata_accumulate'} = 1;
} else {
$self->{'metadata_accumulate'} = 0;
}
}
}
sub EndTag {
my ($expat, $element) = @_;
if ($element eq "FileSet") {
foreach my $target (@{$self->{'saved_targets'}}) {
my $file_metadata = $self->{'metadataref'}->{$target};
my $saved_metadata = $self->{'saved_metadata'};
if (!defined $file_metadata) {
$self->{'metadataref'}->{$target} = $saved_metadata;
# not had target before
push (@{$self->{'metakeysref'}}, $target);
}
else {
&metadatautil::combine_metadata_structures($file_metadata,$saved_metadata);
}
# now record which metadata.xml file it came from
my $file = $self->{'metadata-file'};
my $filename = $self->{'metadata-filename'};
if (!defined $self->{'metafileref'}->{$target}) {
$self->{'metafileref'}->{$target} = {};
}
$self->{'metafileref'}->{$target}->{$file} = $filename
}
}
elsif ($element eq "FileName") {
$self->{'in_filename'} = 0;
}
elsif ($element eq "Metadata") {
&metadatautil::store_saved_metadata($self,$self->{'metadata_name'}, $self->{'metadata_value'}, $self->{'metadata_accumulate'});
$self->{'metadata_name'} = "";
}
}
sub Text {
if ($self->{'in_filename'}) {
# $_ == FileName content
push (@{$self->{'saved_targets'}}, $_);
}
elsif (defined ($self->{'metadata_name'}) && $self->{'metadata_name'} ne "") {
# $_ == Metadata content
$self->{'metadata_value'} = $_;
}
}
# This Char function overrides the one in XML::Parser::Stream to overcome a
# problem where $expat->{Text} is treated as the return value, slowing
# things down significantly in some cases.
sub Char {
if ($]<5.008) {
use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+ and Perl 5.6
}
$_[0]->{'Text'} .= $_[1];
return undef;
}
#sub combine_metadata_structures
#{
# my $self = shift(@_);
#
# my ($mdref1, $mdref2) = @_;
# &metadatautil::combine_metadata_structures($mdref1, $mdref2);
#}
1;