###########################################################################
#
# basebuilder.pm -- base class for collection builders
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
package basebuilder;
use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa
use classify;
use cfgread;
use colcfg;
use dbutil;
use plugin;
use util;
# Compile-time setup: both standard streams are switched to autoflush as
# soon as this module is compiled.
BEGIN {
# set autoflush on for STDERR and STDOUT so that mgpp
# doesn't get out of sync with plugins
STDOUT->autoflush(1);
STDERR->autoflush(1);
}
# Turn autoflush back off for both streams when the interpreter shuts down.
END {
STDOUT->autoflush(0);
STDERR->autoflush(0);
}
# Package variable (reachable as $basebuilder::maxdocsize); it is not read
# anywhere in the visible part of this file.
# NOTE(review): presumably a document size limit consulted by other
# modules - confirm before changing.
our $maxdocsize = 12000;
# used to signify "gs2"(default) or "gs3"
# File-scoped lexical shared by every builder object: new() overwrites it
# with the mode returned by colcfg::get_collect_cfg_name, and
# make_auxiliary_files() reads it to choose which build config to write.
my $gs_mode = "gs2";
# Constructor for a collection builder.
#
# Positional arguments (after the class name): collection, source_dir,
# build_dir, verbosity, maxdocs, debug, keepold, incremental,
# remove_empty_classifications, outhandle, no_text, failhandle, gli,
# disable_OAI.  outhandle and failhandle default to STDERR; no_text, gli
# and disable_OAI default to 0.
#
# Reads the collection configuration, loads the plugins and classifiers it
# names, and returns the blessed builder.  Dies (after printing a message
# to outhandle) when no plugins could be loaded.
sub new {
    my $class = shift(@_);
    my ($collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli, $disable_OAI) = @_;

    # defaults for the optional trailing arguments
    $outhandle  = *STDERR unless defined $outhandle;
    $failhandle = *STDERR unless defined $failhandle;
    $no_text    = 0 unless defined $no_text;
    $gli        = 0 unless defined $gli;
    # disable_OAI applies to greenstone 3 only and is only passed to
    # &colcfg::write_build_cfg_xml (then cfgread4gs3::write_cfg_file) when
    # writing the buildConfig.xml
    $disable_OAI = 0 unless defined $disable_OAI;

    # create a builder object
    my $self = {
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'incremental'                  => $incremental,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {},    # indexes not built
        'gli'                          => $gli,
        'disable_OAI'                  => $disable_OAI,
    };
    bless $self, $class;

    # Read in the collection configuration file; this also records whether
    # we are running in "gs2" or "gs3" mode.
    my $colcfgname;
    ($colcfgname, $gs_mode) = &colcfg::get_collect_cfg_name($outhandle);
    if ($gs_mode eq "gs2") {
        $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);
    }
    elsif ($gs_mode eq "gs3") {
        $self->{'collect_cfg'} = &colcfg::read_collection_cfg_xml ($colcfgname);
        # 'collect_cfg_preserve' is used for gs3 only: it is handed to
        # &colcfg::write_build_cfg_xml in make_auxiliary_files later on.
        # A second, untouched copy is kept because
        # $self->{'collect_cfg'}->{'classify'} gets modified during the
        # call to &classify::load_classifiers.
        $self->{'collect_cfg_preserve'} = &colcfg::read_collection_cfg_xml ($colcfgname);
    }

    # get the database type for this collection from the collect.cfg file
    # (may be undefined, in which case the default type is used)
    $self->{'infodbtype'} = $self->{'collect_cfg'}->{'infodbtype'} || &dbutil::get_default_infodb_type();

    my $collect_cfg = $self->{'collect_cfg'};

    # get the list of plugins for this collection
    my $plugins = defined $collect_cfg->{'plugin'} ? $collect_cfg->{'plugin'} : [];

    # build up the extra global options for the plugins
    my @global_opts = ();
    my $cjk_setting = $collect_cfg->{'separate_cjk'};
    if (defined $cjk_setting && $cjk_setting =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle, $failhandle, \@global_opts, $keepold);
    unless (scalar(@{$self->{'pluginfo'}})) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection and load them
    my $classifiers = defined $collect_cfg->{'classify'} ? $collect_cfg->{'classify'} : [];
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontdb fields
    $self->{'dontdb'} = {};
    if (defined ($collect_cfg->{'dontgdbm'})) {
        $self->{'dontdb'}->{$_} = 1 foreach @{$collect_cfg->{'dontgdbm'}};
    }

    $self->{'maxnumeric'} = 4;

    return $self;
}
# Second-stage initialisation.  This lives outside new() so that subclass
# methods (generate_index_list, default_buildproc, ...) can be used.
#
# Steps:
#   1. let the subclass generate the index list
#   2. expand 'indexes' by 'indexsubcollections' and 'languages', then drop
#      duplicate entries (first occurrence wins)
#   3. locate, load and construct the buildproc document processor
#   4. apply the index options
#   5. unless debugging or keeping an old build, wipe and recreate the
#      build directory (and its text subdirectory)
#   6. run the incremental-build hook when 'incremental' is set
#
# Dies if the buildproc module cannot be loaded or constructed.
sub init {
    my ($self) = @_;

    $self->generate_index_list();

    # make sure there is a config hash to work on
    my $cfg = ($self->{'collect_cfg'} ||= {});

    my $indexes = $cfg->{'indexes'};
    if (defined $indexes) {
        # sort out subcollection indexes
        if (defined $cfg->{'indexsubcollections'}) {
            my @expanded = ();
            foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
                foreach my $index (@$indexes) {
                    push(@expanded, "$index:$subcollection");
                }
            }
            $cfg->{'indexes'} = \@expanded;
        }

        # sort out language subindexes
        if (defined $cfg->{'languages'}) {
            my @base = @{$cfg->{'indexes'}};
            # without subcollections an empty subcollection field is added
            my $separator = defined($cfg->{'indexsubcollections'}) ? ":" : "::";
            my @expanded = ();
            foreach my $language (@{$cfg->{'languages'}}) {
                foreach my $index (@base) {
                    push(@expanded, $index . $separator . $language);
                }
            }
            $cfg->{'indexes'} = \@expanded;
        }
    }

    # make sure that the same index isn't specified more than once
    # (an undefined list simply becomes an empty one)
    my %seen = ();
    $cfg->{'indexes'} = [ grep { !$seen{$_}++ } @{ $cfg->{'indexes'} || [] } ];

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc supplied by the subclass
    my $collection = $self->{'collection'};
    my $collectdir = $ENV{'GSDLCOLLECTDIR'};
    my @candidates = (
        [ "$collectdir/custom/${collection}/perllib", "custombuildproc" ],
        [ "$collectdir/perllib",                      "custombuildproc" ],
        [ "$collectdir/perllib",                      "${collection}buildproc" ],
    );
    my ($buildprocdir, $buildproctype);
    foreach my $candidate (@candidates) {
        my ($dir, $type) = @$candidate;
        next unless -e "$dir/$type.pm";
        ($buildprocdir, $buildproctype) = ($dir, $type);
        last;
    }
    unless (defined $buildproctype) {
        $buildprocdir  = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = $self->default_buildproc();
    }
    require "$buildprocdir/$buildproctype.pm";

    # Call the constructor directly on the class name held in
    # $buildproctype; no string eval is needed, and a collection-derived
    # name is never compiled as Perl code.  Any failure propagates as a
    # normal die.
    $self->{'buildproc'} = $buildproctype->new($self->{'collection'},
                                               $self->{'source_dir'},
                                               $self->{'build_dir'},
                                               $self->{'keepold'},
                                               $self->{'verbosity'},
                                               $self->{'outhandle'});

    $self->generate_index_options();

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }

    if ($self->{'incremental'}) {
        # some classes may need to do some additional initialisation
        $self->init_for_incremental_build();
    }
}
# implement this in subclass if want to do additional initialisation for an
# incremental build
# Hook: init() calls this as its last step when $self->{'incremental'} is
# true.  The base implementation only unpacks $self and does nothing else.
sub init_for_incremental_build {
my $self = shift (@_);
}
# Hands the loaded plugins and the buildproc to plugin::deinit.
sub deinit {
    my ($self) = @_;
    &plugin::deinit($self->{'pluginfo'}, $self->{'buildproc'});
}
# Works out the index options from the collection configuration.
# Currently only one option is recognised: if any entry of 'indexoptions'
# contains the text "separate_cjk" the flag is switched on.  The flag is
# passed to the buildproc and remembered on the builder for build.cfg.
sub generate_index_options {
    my ($self) = @_;

    my $options = $self->{'collect_cfg'}->{'indexoptions'};
    my $separate_cjk = 0;
    if (defined($options)) {
        $separate_cjk = 1 if grep { $_ =~ /separate_cjk/ } @$options;
    }

    # set this for building
    $self->{'buildproc'}->set_separate_cjk($separate_cjk);
    # record it for build.cfg
    $self->{'separate_cjk'} = $separate_cjk;
}
# Forwards the given setting straight to the buildproc's method of the
# same name.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;
    $self->{'buildproc'}->set_sections_index_document_metadata($index);
}
# Stores the given value as the builder's 'maxnumeric' setting (new()
# initialises it to 4; it is later written into the build config).
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;
    $self->{'maxnumeric'} = $maxnumeric;
}
# Records the strip_html setting on the builder and passes the same value
# on to the buildproc.
sub set_strip_html {
    my ($self, $strip) = @_;
    $self->{'strip_html'} = $strip;
    $self->{'buildproc'}->set_strip_html($strip);
}
# Placeholder: the base class cannot compress text itself, so a subclass
# is expected to override this.  The base version only reports that fact
# on STDERR and returns nothing.
#
# $textindex - the value passed by the caller; unused here.
sub compress_text {
    my ($self, $textindex) = @_;

    # terminate the message with a newline (as the other "implement in
    # subclass" placeholders do) so later output starts on its own line
    print STDERR "compress_text() should be implemented in subclass!!\n";
    return;
}
# Builds the collection's indexes.
#
# $indexname - optional; when it contains at least one word character only
#              that index is built, otherwise every index listed in the
#              collection configuration is considered.
#
# Indexes rejected by want_built() are skipped (and reported when
# verbosity >= 1).  build_indexes_extra() is called once at the end.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes;
    if (defined $indexname && $indexname =~ /\w/) {
        $indexes = [ $indexname ];
    }
    else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }
        if ($self->{'verbosity'} >= 1) {
            print $outhandle "\n*** building index $index in subdirectory $self->{'index_mapping'}->{$index}\n";
        }
        print STDERR "\n" if $self->{'gli'};
        $self->build_index($index);
    }

    $self->build_indexes_extra();
}
# implement this in subclass if want to do extra stuff at the end of building
# all the indexes
# Hook: build_indexes() calls this once, after its loop over the indexes
# has finished.  The base implementation only unpacks $self.
sub build_indexes_extra {
my $self = shift(@_);
}
# Placeholder for building a single index; build_indexes() calls it for
# every index that is wanted.  The base version only complains on STDERR
# and returns nothing.
#
# $index - the index description; unused here.
sub build_index {
    my ($self, $index) = @_;
    print STDERR "build_index should be implemented in subclass\n";
    return;
}
# Creates the collection's info database and processes associated files.
#
# Outline:
#   - make sure <build_dir>/text and <build_dir>/assoc exist
#   - initialise the classifiers
#   - when 'keepold' is set, reconstruct the metadata of already-built
#     documents from the existing database and feed them to the buildproc
#   - open the database for writing (in 'debug' mode entries go to STDOUT)
#   - put the buildproc into 'infodb' mode and run all plugins over the
#     source directory
#   - write the collection entry, the classification info and a
#     "browselist" entry listing every document
#
# Dies if the info database write handle cannot be opened.
sub make_infodatabase {
my $self = shift (@_);
my $outhandle = $self->{'outhandle'};
print STDERR "BuildDir: $self->{'build_dir'}\n";
my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
&util::mk_all_dir ($textdir);
&util::mk_all_dir ($assocdir);
# Get info database file path
my $infodb_file_path = &dbutil::get_infodb_file_path($self->{'infodbtype'}, $self->{'collection'}, $textdir);
print $outhandle "\n*** creating the info database and processing associated files\n"
if ($self->{'verbosity'} >= 1);
print STDERR "\n" if $self->{'gli'};
# init all the classifiers
&classify::init_classifiers ($self->{'classifiers'});
# stays undef unless 'keepold' is set
my $reconstructed_docs = undef;
if ($self->{'keepold'}) {
# reconstruct doc_obj metadata from database for all docs
# (done before the write handle is opened below)
$reconstructed_docs = &classify::reconstruct_doc_objs_metadata($self->{'infodbtype'}, $infodb_file_path);
}
# set up the document processor
my ($infodb_handle);
if ($self->{'debug'}) {
# debug mode: entries are written to STDOUT, no database is opened
$infodb_handle = *STDOUT;
}
else {
$infodb_handle = &dbutil::open_infodb_write_handle($self->{'infodbtype'}, $infodb_file_path);
if (!defined($infodb_handle))
{
print STDERR "\n\n" if $self->{'gli'};
die "builder::make_infodatabase - couldn't open infodb write handle\n";
}
}
# configure the buildproc for the info database pass
$self->{'buildproc'}->set_infodbtype ($self->{'infodbtype'});
$self->{'buildproc'}->set_output_handle ($infodb_handle);
$self->{'buildproc'}->set_mode ('infodb');
$self->{'buildproc'}->set_assocdir ($assocdir);
$self->{'buildproc'}->set_dontdb ($self->{'dontdb'});
$self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
$self->{'buildproc'}->set_indexing_text (0);
$self->{'buildproc'}->set_store_text(1);
$self->{'buildproc'}->set_store_metadata_coverage ($self->{'collect_cfg'}->{'store_metadata_coverage'});
# make_infodatabase needs full reset even for incremental build
# as incremental works by reconstructing all docs from the database and
# then adding in the new ones
$self->{'buildproc'}->zero_reset();
# start with an empty record of metadata prefixes/fields; it is read
# back by get_collection_meta_sets()
$self->{'buildproc'}->{'mdprefix_fields'} = {};
if ($self->{'keepold'}) {
# create flat classify structure, ready for new docs to be added
foreach my $doc_obj ( @$reconstructed_docs ) {
print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
$self->{'buildproc'}->process($doc_obj,undef);
}
}
# run the plugins over the source directory, feeding the buildproc
&plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
"", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});
# this has changed to only output collection meta if its
# not in the config file
$self->output_collection_meta($infodb_handle);
# output classification information
&classify::output_classify_info ($self->{'classifiers'}, $self->{'infodbtype'}, $infodb_handle,
$self->{'remove_empty_classifications'},
$self->{'gli'});
# Output classifier reverse lookup, used in incremental deletion
####&classify::print_reverse_lookup($infodb_handle);
# output doclist
my @doc_list = $self->{'buildproc'}->get_doc_list();
my $browselist_infodb = { 'hastxt' => [ "0" ],
'childtype' => [ "VList" ],
'numleafdocs' => [ scalar(@doc_list) ],
'thistype' => [ "Invisible" ],
'contains' => [ join(";", @doc_list) ] };
&dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "browselist", $browselist_infodb);
# the handle is STDOUT in debug mode, so only close a real database handle
&dbutil::close_infodb_write_handle($self->{'infodbtype'}, $infodb_handle) if !$self->{'debug'};
print STDERR "\n" if $self->{'gli'};
}
# Assembles the build configuration and writes it into the build
# directory: build.cfg in gs2 mode, buildConfig.xml in gs3 mode.
#
# The configuration records the build date and type, index stem, document
# / section / byte counts, the index, subcollection and language maps,
# the indexes that were not built, maxnumeric and the info database type.
# Subclasses can pre-populate $self->{'build_cfg'} and add further
# entries through build_cfg_extra().
sub make_auxiliary_files {
    my ($self) = @_;

    # subclasses may have already defined stuff in here
    my $build_cfg = defined $self->{'build_cfg'} ? $self->{'build_cfg'} : {};

    my $outhandle = $self->{'outhandle'};
    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = &util::get_dirsep_tail($self->{'collection'});
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};
    $build_cfg->{'separate_cjk'} = "true" if $self->{'separate_cjk'};

    # store the number of documents, sections and bytes
    foreach my $stat ('docs', 'sections', 'bytes') {
        my $getter = "get_num_$stat";
        $build_cfg->{"num$stat"} = $self->{'buildproc'}->$getter();
    }

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any
    # that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        next if defined ($self->{'notbuilt'}->{$index});
        push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
    }
    $build_cfg->{'indexmap'} = \@indexmap if scalar (@indexmap);

    # the subcollection and language maps are built the same way
    foreach my $kind ('subcollection', 'language') {
        my @entries = ();
        foreach my $name (@{$self->{'index_mapping'}->{"${kind}maporder"}}) {
            push (@entries, "$name\-\>" .
                  $self->{'index_mapping'}->{"${kind}map"}->{$name});
        }
        $build_cfg->{"${kind}map"} = \@entries if scalar (@entries);
    }

    # list the indexes that were skipped
    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};
    $build_cfg->{'infodbtype'} = $self->{'infodbtype'};

    # give subclasses the chance to add their own entries
    $self->build_cfg_extra($build_cfg);

    if ($gs_mode eq "gs2") {
        &colcfg::write_build_cfg("$self->{'build_dir'}/build.cfg", $build_cfg);
    }
    elsif ($gs_mode eq "gs3") {
        &colcfg::write_build_cfg_xml("$self->{'build_dir'}/buildConfig.xml", $build_cfg, $self->{'collect_cfg_preserve'}, $self->{'disable_OAI'});
    }

    print STDERR "\n" if $self->{'gli'};
}
# implement this in subclass if want to add extra stuff to build.cfg
# Hook: make_auxiliary_files() calls this with the build config hash
# reference just before the config is written out.  The base
# implementation only unpacks its arguments and adds nothing.
sub build_cfg_extra {
my $self = shift(@_);
my ($build_cfg) = @_;
}
# Does nothing in the base class beyond unpacking $self, and is not called
# from anywhere in the visible part of this file.
# NOTE(review): presumably a hook for collection-specific processing that
# subclasses override - confirm against the callers.
sub collect_specific {
my $self = shift (@_);
}
# Decides whether an index should be built.
#
# Each entry of the collection's 'dontbuild' list is used as a regular
# expression that must match the whole index description.  On the first
# match the index is recorded in $self->{'notbuilt'} and 0 is returned;
# otherwise (or when there is no 'dontbuild' list) 1 is returned.
sub want_built {
    my ($self, $index) = @_;

    my $patterns = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($patterns);

    foreach my $checkstr (@$patterns) {
        next unless $index =~ /^$checkstr$/;
        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }
    return 1;
}
# Placeholder for creating the mapping between index descriptions and
# directory names; build_indexes() stores the result in
# $self->{'index_mapping'}.  The base version complains on STDERR and
# returns a reference to an empty hash.
#
# $indexes - reference to the list of index descriptions; unused here.
sub create_index_mapping {
    my ($self, $indexes) = @_;
    print STDERR "create_index_mapping should be implemented in subclass\n";
    return {};
}
# Returns a two-character processed version of a field.
#   - if the field has two or more comma-separated components, the result
#     is the first letdig (\w) character of each of the first two
#     components
#   - otherwise it is the first two letdig characters of the field
# A character that cannot be found is replaced by 'a' (first) or '0'
# (second).  In the single-component case both characters come from one
# pattern match, so a field holding fewer than two letdigs gives "a0".
# An undefined, empty or whitespace-only field gives "".
sub process_field {
    my ($self, $field) = @_;

    if (!defined ($field) || $field !~ /\S/) {
        return "";
    }

    my @components = split /,/, $field;
    my ($first, $second);
    if (scalar (@components) < 2) {
        # pick the first two letdig chars
        ($first, $second) = $field =~ /^[^\w]*(\w)[^\w]*?(\w)/i;
    }
    else {
        # pick the first letdig from the first two field names
        ($first) = $components[0] =~ /^[^\w]*(\w)/;
        ($second) = $components[1] =~ /^[^\w]*(\w)/;
    }

    # there may not have been any letdigs...
    $first = 'a' unless defined $first;
    $second = '0' unless defined $second;

    return $first . $second;
}
# Advances the version suffix of a name in place.
#
# $nameref - reference to the name string, which is modified:
#   - ends in two digits: that pair is incremented (Perl's string
#     increment is used, so "05" becomes "06" and "99" becomes "100")
#   - ends in one digit: the digit is incremented, 9 becoming 10
#   - otherwise: the last character is replaced by 0
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $version = $1;
        $version++;
        $$nameref =~ s/\d\d$/$version/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        my $digit = $1;
        my $next = ($digit == 9) ? 10 : $digit + 1;
        $$nameref =~ s/\d$/$next/;
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}
# Adds the metadata sets seen by the buildproc to a collection info hash.
#
# $collection_infodb - hash reference to extend.  For every prefix in the
# buildproc's 'mdprefix_fields' the prefix is appended to "metadataset";
# for every field under it the field name is appended to
# "metadatalist-<prefix>" and its stored value to
# "metadatafreq-<prefix>-<field>".
sub get_collection_meta_sets
{
    my ($self, $collection_infodb) = @_;

    my $mdprefix_fields = $self->{'buildproc'}->{'mdprefix_fields'};
    foreach my $prefix (keys %$mdprefix_fields)
    {
        push(@{$collection_infodb->{"metadataset"}}, $prefix);

        my $fields = $mdprefix_fields->{$prefix};
        foreach my $field (keys %{$fields})
        {
            push(@{$collection_infodb->{"metadatalist-$prefix"}}, $field);
            push(@{$collection_infodb->{"metadatafreq-$prefix-$field"}}, $fields->{$field});
        }
    }
}
# default is to output the metadata sets (prefixes) used in collection
# The sets are gathered with get_collection_meta_sets() and written to
# the given info database handle as the "collection" entry.
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);
    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
# sometimes we need to read in an existing build.cfg - for example,
# if doing each stage of building separately, or when doing incremental
# building
# Looks for build.cfg in the build directory first and then in
# $GSDLCOLLECTDIR/index.  Returns whatever colcfg::read_build_cfg gives
# for the file found, or undef when neither file exists.
sub read_build_cfg {
    my ($self) = @_;

    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, "build.cfg");
    unless (-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", "build.cfg");

        # we cant find a config file - just ignore the field list
        return undef unless (-e $buildconfigfile);
    }

    my $buildcfg = &colcfg::read_build_cfg( $buildconfigfile);
    return $buildcfg;
}
# Prints a short statistics report for the current buildproc pass to the
# builder's output handle: which index was created (or compressed from),
# the total bytes in the collection and the bytes processed for the index.
#
# When fewer than 50 bytes were processed, and text was expected (indexing
# text, or 'no_text' not set), an extra note is printed:
#   - with 'keepold' set, only when nothing at all was processed
#   - otherwise a warning banner asking whether this was intended
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {
        if (!$self->{'keepold'}) {
            print $outhandle "***************\n";
            if ($indexing_text) {
                print $outhandle "WARNING: There is very little or no text to process for $index\n";
            }
            elsif (!$self->{'no_text'}) {
                print $outhandle "WARNING: There is very little or no text to compress\n";
            }
            print $outhandle " Was this your intention?\n";
            print $outhandle "***************\n";
        }
        elsif ($num_processed_bytes == 0) {
            if ($indexing_text) {
                print $outhandle "No additional text was added to $index\n";
            }
            elsif (!$self->{'no_text'}) {
                print $outhandle "No additional text was compressed\n";
            }
        }
    }
}
1;