###########################################################################
#
# MediaWikiPlugin.pm -- html plugin with extra facilities for wiki page
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
# This plugin processes an HTML file from a MediaWiki website that was downloaded by
# the MediaWikiDownload plugin. It trims MediaWiki functional sections such as
# login, discussion, history, etc. Only the navigation and search sections can be preserved.
# The search box is modified to search the Greenstone collection instead of the website.
# The plugin can also automatically add the table of contents from the website's Main_Page to the
# collection's Home page.
package MediaWikiPlugin;
use HTMLPlugin;
use unicode;
use strict; # every perl program should have this!
no strict 'refs'; # make an exception so we can use variables as filehandles
# Set up inheritance at compile time: MediaWikiPlugin derives from HTMLPlugin.
BEGIN {
    @MediaWikiPlugin::ISA = qw(HTMLPlugin);
}
# Plugin-specific argument descriptors. new() appends this list to the
# "ArgList" entry of the hash it is given. Each descriptor carries a 'name',
# a 'desc' (a brace-wrapped key, presumably resolved to display text
# elsewhere -- verify), a 'type', a 'reqd' setting and, for some entries, a
# default value in 'deft'.
#
# NOTE(review): the 'deft' patterns of toc_exp, nav_div_exp and
# searchbox_div_exp each contain "]*)" ahead of "id=" with no matching opening
# parenthesis, and the toc_exp and searchbox_div_exp strings contain raw line
# breaks. This looks like the leading markup of each pattern was lost at some
# point -- TODO confirm against the upstream source before relying on these
# defaults.
my $arguments =
[
# show the table of contents on collection's home page
{ 'name' => "show_toc",
'desc' => "{MediaWikiPlugin.show_toc}",
'type' => "flag",
'reqd' => "no"},
# set to delete the table of contents section on each MediaWiki page
{ 'name' => "delete_toc",
'desc' => "{MediaWikiPlugin.delete_toc}",
'type' => "flag",
'reqd' => "no"},
# regexp to match the table of contents
{ 'name' => "toc_exp",
'desc' => "{MediaWikiPlugin.toc_exp}",
'type' => "regexp",
'reqd' => "no",
'deft' => "
]*)id=(\\\"|')toc(\\\"|')(.|\\n)*?
\\n" },
# set to delete the navigation section
{ 'name' => "delete_nav",
'desc' => "{MediaWikiPlugin.delete_nav}",
'type' => "flag",
'reqd' => "no",
'deft' => ""},
# regexp to match the navigation section
{ 'name' => "nav_div_exp",
'desc' => "{MediaWikiPlugin.nav_div_exp}",
'type' => "regexp",
'reqd' => "no",
'deft' => "]*)id=(\\\"|')p-navigation(\\\"|')(.|\\n)*?<\/div>" },
# set to delete the searchbox section
{ 'name' => "delete_searchbox",
'desc' => "{MediaWikiPlugin.delete_searchbox}",
'type' => "flag",
'reqd' => "no",
'deft' => ""},
# regexp to match the searchbox section
{ 'name' => "searchbox_div_exp",
'desc' => "{MediaWikiPlugin.searchbox_div_exp}",
'type' => "regexp",
'reqd' => "no",
'deft' => "
]*)id=(\\\"|')p-search(\\\"|')(.|\\n)*?<\/div>"},
# regexp to match a suffix of the page title that should be removed
# (the title_sub option in HTMLPlugin can't be used instead
# because title_sub always matches from the beginning)
{ 'name' => "remove_title_suffix_exp",
'desc' => "{MediaWikiPlugin.remove_title_suffix_exp}",
'type' => "regexp",
'reqd' => "no",
'deft' => ""}
];
# Option record for this plugin. new() appends it to the "OptList" entry of
# the hash it is given; 'args' points at the argument descriptor list above.
my $options = {
    'name'     => "MediaWikiPlugin",
    'desc'     => "{MediaWikiPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the HTMLPlugin constructor
#   $hashArgOptLists - hash ref; this plugin's argument descriptors are
#                      appended to its "ArgList" entry and its option record
#                      to its "OptList" entry
#
# Returns the object built by the HTMLPlugin constructor, re-blessed into
# the requested class.
sub new {
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    # Register this plugin and its arguments/options before delegating, so
    # the parent constructor sees them in the lists it is handed.
    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    # Use an explicit class-method call rather than the indirect-object form
    # ("new HTMLPlugin(...)"), whose parsing is ambiguous in a package that
    # defines its own sub named "new".
    my $self = HTMLPlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
sub process {
my $self = shift (@_);
my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
my $outhandle = $self->{'outhandle'};
my @head_and_body = split(/(.+)<\/title>/i;
my $doctitle = $1 if defined $1;
if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
my @doc_properties = split(/
/i,$head);
my $doc_heading = shift(@doc_properties);
my $rest_doc_properties = join(" ", @doc_properties);
my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
my $extracted_metadata = shift (@extracted_metadata);
$self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
}
# set the title here if we haven't found it yet
if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
if (defined $doctitle && $doctitle =~ /\S/) {
# remove suffix in title if required
my $remove_suffix_exp = $self->{'remove_title_suffix_exp'};
if (defined $remove_suffix_exp && $remove_suffix_exp =~ /\S/){
$doctitle =~ s/$remove_suffix_exp//i;
}
$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle);
} else {
$self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
}
}
# we are only interested in the column-contents div
# remove header section, it may contain header images or additional search boxes
my $header_exp = "