###########################################################################
#
# lucenebuildproc.pm -- perl wrapper for building index with Lucene
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
package lucenebuildproc;
# This document processor outputs a document for Lucene to process.
# It uses the same basic XML structure set up by mgppbuilder/mgppbuildproc.
use mgppbuildproc;
use ghtml;
use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa
use IncrementalBuildUtils;
# Establish inheritance at compile time: lucenebuildproc is a subclass
# of mgppbuildproc, so any method not defined here is looked up there.
BEGIN {
    @lucenebuildproc::ISA = qw(mgppbuildproc);
}
# Constructor.
#
# Builds the object with the mgppbuildproc constructor, forwarding every
# argument after the class name unchanged, then initialises the
# 'numincdocs' counter to 0 and re-blesses the result into $class.
#
# Returns the new object, blessed into $class.
sub new {
    my $class = shift @_;

    # Explicit method-call syntax instead of the indirect-object form
    # ("new mgppbuildproc (...)"), which relies on parser heuristics and
    # is disabled under "use v5.36".
    my $self = mgppbuildproc->new(@_);

    $self->{'numincdocs'} = 0;

    return bless $self, $class;
}
# Reports whether this build processor can build incrementally.
# Always returns 1: unlike MG and MGPP, Lucene supports incremental
# building.
sub is_incremental_capable {
    my ($self) = @_;
    return 1;
}
sub textedit {
my $self = shift (@_);
my ($doc_obj,$file,$edit_mode) = @_;
my $lucenehandle = $self->{'output_handle'};
my $outhandle = $self->{'outhandle'};
# only output this document if it is one to be indexed
return if ($doc_obj->get_doc_type() ne "indexed_doc");
# skip this document if in "compress-text" mode and asked to delete it
return if (!$self->get_indexing_text() && ($edit_mode eq "delete"));
my $indexed_doc = $self->is_subcollection_doc($doc_obj);
# this is another document
if (($edit_mode eq "add") || ($edit_mode eq "update")) {
$self->{'num_docs'} += 1;
}
else {
$self->{'num_docs'} -= 1;
}
# get the parameters for the output
# split on : just in case there is subcoll and lang stuff
my ($fields) = split (/:/, $self->{'index'});
my $doc_tag_name = $mgppbuildproc::level_map{'document'};
my $levels = $self->{'levels'};
my $ldoc_level = $levels->{'document'};
my $lsec_level = $levels->{'section'};
# gs2_id should be deprecated #####
my $gs2_id = "";
if ($ldoc_level)
{
if ($self->{'db_level'} eq 'document')
{
$gs2_id = $self->{'num_docs'};
}
else
{
# default is section level
$gs2_id = $self->{'num_sections'} + 1;
}
}
my $gs2_docOID = $doc_obj->get_OID();
my $documenttag = undef;
my $documentendtag = undef;
$documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:id=\"$gs2_id\" gs2:docOID=\"$gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
$documentendtag = "\n$doc_tag_name>\n";
my $sec_tag_name = "";
if ($lsec_level)
{
$sec_tag_name = $mgppbuildproc::level_map{'section'};
}
my $doc_section = 0; # just for this document
my $text = "";
$text .= $documenttag;
# get the text for this document
my $section = $doc_obj->get_top_section();
while (defined $section)
{
# update a few statistics
$doc_section++;
$self->{'num_sections'}++;
my $sec_gs2_id = $self->{'num_sections'};
my $sec_gs2_docOID = $gs2_docOID;
$sec_gs2_docOID .= ".$section" if ($section ne "");
# if we are doing subcollections, then some docs shouldn't be indexed.
# but we need to put the section tag placeholders in there so the
# sections match up with database
my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
if ($sec_tag_name ne "") {
$text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"ignore\">\n";
$text .= "\n$sec_tag_name>\n"
}
$section = $doc_obj->get_next_section($section);
next;
}
if ($sec_tag_name ne "")
{
$text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
}
if (($edit_mode eq "add") || ($edit_mode eq "update")) {
$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
}
else {
# delete
$self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
}
# has the user added a 'metadata' index?
my $all_metadata_specified = 0;
# which fields have already been indexed? (same as fields, but in a map)
my $specified_fields = {};
# do we have an allfields index??
my $allfields_index = 0;
# collect up all the text for it in here
my $allfields_text = "";
foreach my $field (split (/;/, $fields)) {
if ($field eq "allfields") {
$allfields_index = 1;
} elsif ($field eq "metadata") {
$all_metadata_specified = 1;
}
}
foreach my $field (split (/;/, $fields)) {
# only deal with this field if it doesn't start with top or
# this is the first section
my $real_field = $field;
next if (($real_field =~ s/^top//) && ($doc_section != 1));
# process these two later
next if ($real_field eq "allfields" || $real_field eq "metadata");
#individual metadata and or text specified - could be a comma separated list
$specified_fields->{$real_field} = 1;
my $shortname="";
my $new_field = 0; # have we found a new field name?
if (defined $self->{'indexfieldmap'}->{$real_field}) {
$shortname = $self->{'indexfieldmap'}->{$real_field};
}
else {
$shortname = $self->create_shortname($real_field);
$new_field = 1;
}
my @metadata_list = (); # put any metadata values in here
my $section_text = ""; # put the text in here
foreach my $submeta (split /,/, $real_field) {
if ($submeta eq "text") {
# no point in indexing text more than once
if ($section_text eq "") {
$section_text = $doc_obj->get_text($section);
if ($self->{'indexing_text'}) {
# we always strip html
$section_text = $self->preprocess_text($section_text, 1, "");
}
else {
# leave html stuff in, but escape the tags
&ghtml::htmlsafe($section_text);
}
}
}
else {
# its a metadata element
my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
}
}
push (@metadata_list, @section_metadata);
}
} # for each field in this one index
# now we add the text and/or metadata into new_text
if ($section_text ne "" || scalar(@metadata_list)) {
my $new_text = "";
if ($section_text ne "") {
$new_text .= "$section_text ";
}
foreach my $item (@metadata_list) {
&ghtml::htmlsafe($item);
$new_text .= "$item ";
}
if ($allfields_index) {
$allfields_text .= $new_text;
}
if ($self->{'indexing_text'}) {
# add the tag
$new_text = "<$shortname index=\"1\">$new_text$shortname>";
}
# filter the text
$new_text = $self->filter_text ($field, $new_text);
if (($edit_mode eq "add") || ($edit_mode eq "update")) {
$self->{'num_processed_bytes'} += length ($new_text);
$text .= "$new_text";
}
else {
# delete
$self->{'num_processed_bytes'} -= length ($new_text);
}
if ($self->{'indexing_text'} && $new_field) {
# we need to add to the list in indexfields
$self->{'indexfieldmap'}->{$real_field} = $shortname;
$self->{'indexfieldmap'}->{$shortname} = 1;
}
}
} # foreach field
if ($all_metadata_specified) {
my $new_text = "";
my $shortname = "";
my $metadata = $doc_obj->get_all_metadata ($section);
foreach my $pair (@$metadata) {
my ($mfield, $mvalue) = (@$pair);
# no value
next unless defined $mvalue && $mvalue ne "";
# we have already indexed this
next if defined ($specified_fields->{$mfield});
# check fields here, maybe others dont want - change to use dontindex!!
next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
next if ($mfield =~ /^gsdl/);
&ghtml::htmlsafe($mvalue);
if (defined $self->{'indexfieldmap'}->{$mfield}) {
$shortname = $self->{'indexfieldmap'}->{$mfield};
}
else {
$shortname = $self->create_shortname($mfield);
$self->{'indexfieldmap'}->{$mfield} = $shortname;
$self->{'indexfieldmap'}->{$shortname} = 1;
}
$new_text .= "<$shortname index=\"1\">$mvalue$shortname>\n";
if ($allfields_index) {
$allfields_text .= "$mvalue ";
}
if (!defined $self->{'indexfields'}->{$mfield}) {
$self->{'indexfields'}->{$mfield} = 1;
}
}
# filter the text
$new_text = $self->filter_text ("metadata", $new_text);
if (($edit_mode eq "add") || ($edit_mode eq "update")) {
$self->{'num_processed_bytes'} += length ($new_text);
$text .= "$new_text";
}
else {
# delete
$self->{'num_processed_bytes'} -= length ($new_text);
}
}
if ($allfields_index) {
# add the index name mapping
$self->{'indexfieldmap'}->{"allfields"} = "ZZ";
$self->{'indexfieldmap'}->{"ZZ"} = 1;
my $new_text = "
EDWARD.." into "farmingedward" # (example from demo collection b20cre) # Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com) sub preprocess_text { my $self = shift (@_); my ($text, $strip_html, $para) = @_; # at this stage, we do not do paragraph tags unless have strip_html - # it will result in a huge mess of non-xml return unless $strip_html; my $new_text = $text; # if we have
tags, we can have < > inside them, need to delete # the <> before stripping tags $new_text =~ s/(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse; if ($para eq "") { # just remove all tags $new_text =~ s/<[^>]*>/ /gs; } else { # strip all tags excepttags which get turned into $para $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse; } # It's important that we remove name entities because otherwise the text passed to Lucene for indexing # may not be valid XML (eg. if HTML-only entities like are used) $new_text =~ s/&\w{1,10};//g; # Remove stray '&' characters, except in nnnn; or hhhh; entities (which are valid XML) $new_text =~ s/&([^\#])/ $1/g; return $new_text; } 1;