########################################################################### # # HTMLPlugin.pm -- basic html plugin # # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### # # Note that this plugin handles frames only in a very simple way # i.e. each frame is treated as a separate document. This means # search results will contain links to individual frames rather # than linking to the top level frameset. # There may also be some problems caused by the _parent target # (it's removed by this plugin) # package HTMLPlugin; use ReadTextFile; use HBPlugin; use ghtml; use unicode; use util; use XMLParser; use Image::Size; use File::Copy; sub BEGIN { @HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin'); } use strict; # every perl program should have this! no strict 'refs'; # make an exception so we can use variables as filehandles my $arguments = [ { 'name' => "process_exp", 'desc' => "{BasePlugin.process_exp}", 'type' => "regexp", 'deft' => &get_default_process_exp() }, { 'name' => "block_exp", 'desc' => "{BasePlugin.block_exp}", 'type' => 'regexp', 'deft' => &get_default_block_exp() }, { 'name' => "nolinks", 'desc' => "{HTMLPlugin.nolinks}", 'type' => "flag" }, { 'name' => "keep_head", 'desc' => "{HTMLPlugin.keep_head}", 'type' => "flag" }, { 'name' => "no_metadata", 'desc' => "{HTMLPlugin.no_metadata}", 'type' => "flag" }, { 'name' => "metadata_fields", 'desc' => "{HTMLPlugin.metadata_fields}", 'type' => "string", 'deft' => "Title" }, { 'name' => "hunt_creator_metadata", 'desc' => "{HTMLPlugin.hunt_creator_metadata}", 'type' => "flag" }, { 'name' => "file_is_url", 'desc' => "{HTMLPlugin.file_is_url}", 'type' => "flag" }, { 'name' => "assoc_files", 'desc' => "{HTMLPlugin.assoc_files}", 'type' => "regexp", 'deft' => &get_default_block_exp() }, { 'name' => "rename_assoc_files", 'desc' => "{HTMLPlugin.rename_assoc_files}", 'type' => "flag" }, { 'name' => "title_sub", 'desc' => "{HTMLPlugin.title_sub}", 'type' => "string", 'deft' => "" }, { 'name' => "description_tags", 'desc' => "{HTMLPlugin.description_tags}", 'type' => "flag" }, # retain this for backward compatibility (w3mir option was replaced by # file_is_url) { 'name' => "w3mir", # 'desc' => "{HTMLPlugin.w3mir}", 'type' => "flag", 'hiddengli' => "yes"}, { 'name' => "no_strip_metadata_html", 'desc' => "{HTMLPlugin.no_strip_metadata_html}", 'type' => "string", 'deft' => "", 'reqd' => "no"}, { 'name' => "sectionalise_using_h_tags", 'desc' => "{HTMLPlugin.sectionalise_using_h_tags}", 'type' => "flag" }, { 'name' => "use_realistic_book", 'desc' => "{HTMLPlugin.tidy_html}", 'type' => "flag"}, { 'name' => "old_style_HDL", 'desc' => "{HTMLPlugin.old_style_HDL}", 'type' => "flag"} ]; my $options = { 'name' => "HTMLPlugin", 'desc' => "{HTMLPlugin.desc}", 'abstract' => "no", 'inherits' => "yes", 'args' => $arguments }; sub HB_read_html_file { my $self = shift (@_); my ($htmlfile, $text) = @_; # load in the file if (!open (FILE, $htmlfile)) { print STDERR "ERROR - could not open $htmlfile\n"; return; } my $foundbody = 0; $self->HB_gettext (\$foundbody, $text, "FILE"); close FILE; # just in case there was no
tag if (!$foundbody) { $foundbody = 1; open (FILE, $htmlfile) || return; $self->HB_gettext (\$foundbody, $text, "FILE"); close FILE; } # text is in utf8 } # converts the text to utf8, as ghtml does that for é etc. sub HB_gettext { my $self = shift (@_); my ($foundbody, $text, $handle) = @_; my $line = ""; while (defined ($line = <$handle>)) { # look for body tag if (!$$foundbody) { if ($line =~ s/^.*]*>//i) { $$foundbody = 1; } else { next; } } # check for symbol fonts if ($line =~ m/]*?face\s*=\s*\"?(\w+)\"?/i) { my $font = $1; print STDERR "HBPlug::HB_gettext - warning removed font $font\n" if ($font !~ m/^arial$/i); } $$text .= $line; } if ($self->{'input_encoding'} eq "iso_8859_1") { # convert to utf-8 $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text)); } # convert any alphanumeric character entities to their utf-8 # equivalent for indexing purposes #&ghtml::convertcharentities ($$text); $$text =~ s/\s+/ /g; # remove \n's } sub HB_clean_section { my $self = shift (@_); my ($section) = @_; # remove tags without a starting tag from the section my ($tag, $tagstart); while ($section =~ m/<\/([^>]{1,10})>/) { $tag = $1; $tagstart = index($section, "<$tag"); last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag"))); $section =~ s/<\/$tag>//; } # remove extra paragraph tags while ($section =~ s/]*>\s*
|||
]*>| |\s)$//i) {} # add a newline at the beginning of each paragraph $section =~ s/(.)\s*
]*?src=\"?([^\">]+)\"?[^>]*>/
]*>)?((|||\s)*)<<TOC(\d+)>>\s*(.*?)
]*>)?((|||\s)*)<<TOC\d+>>)/$2/i) {
$sectiontext = $1;
} else {
$sectiontext = $html;
$html = "";
}
# remove tags and extra spaces from the title
$title =~ s/<\/?[^>]+>//g;
$title =~ s/^\s+|\s+$//g;
# close any sections below the current level and
# create a new section (special case for the firstsection)
print PROD "\n";
print PROD "\n\n{$_}"} } keys %$attr;
print PROD ">";
}
# is it a font tag
else
{
if (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
{
# remove font tag
print PROD "";
}
else
{
# print without changes
print PROD $token->as_is;
}
}
}
close (PROD) || die("Error Closing File: $tmp_filename $!");
# run html-tidy on the tmp file to make it a proper XML file
my $tidyfile = `tidy -utf8 -wrap 0 -asxml "$tmp_filename"`;
# write result back to the tmp file
open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
print PROD $tidyfile;
close (PROD) || die("Error Closing File: $tmp_filename $!");
# return the output filename
return $tmp_filename;
}
sub read_into_doc_obj
{
my $self = shift (@_);
my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
# get the input file
my $input_filename = $file;
my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
$suffix = lc($suffix);
if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
{
# because the document has to be sectionalized set the description tags
$self->{'description_tags'} = 1;
# set the file to be tidied
$input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ m/\w/;
# get the tidied file
#my $tidy_filename = $self->tmp_tidy_file($input_filename);
my $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
# derive tmp filename from input filename
my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
# set the new input file and base_dir to be from the tidied file
$file = "$tailname$suffix";
$base_dir = $dirname;
}
# call the parent read_into_doc_obj
my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
return ($process_status,$doc_obj);
}
sub new {
my ($class) = shift (@_);
my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
push(@$pluginlist, $class);
push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
push(@{$hashArgOptLists->{"OptList"}},$options);
my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
if ($self->{'w3mir'}) {
$self->{'file_is_url'} = 1;
}
$self->{'aux_files'} = {};
$self->{'dir_num'} = 0;
$self->{'file_num'} = 0;
return bless $self, $class;
}
# may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
# if have eg @ @sig;
$tmptext =~ s/<[^>]*>/ /g;
$tmptext =~ s/ / /g;
$tmptext =~ s/^\s+//;
$tmptext =~ s/\s+$//;
$tmptext =~ s/\s+/ /gs;
$tmptext = &unicode::substr ($tmptext, 0, $size);
$tmptext =~ s/\s\S*$/…/; # adds an ellipse (...)
$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
}
}
sub extract_metadata {
my $self = shift (@_);
my ($textref, $metadata, $doc_obj, $section) = @_;
my $outhandle = $self->{'outhandle'};
# if we don't want metadata, we may as well not be here ...
return if (!defined $self->{'metadata_fields'});
# metadata fields to extract/save. 'key' is the (lowercase) name of the
# html meta, 'value' is the metadata name for greenstone to use
my %find_fields = ();
my %creator_fields = (); # short-cut for lookups
foreach my $field (split /,/, $self->{'metadata_fields'}) {
$field =~ s/^\s+//; # remove leading whitespace
$field =~ s/\s+$//; # remove trailing whitespace
# support tag
and
tags into "H1" metadata.
foreach my $field (keys %find_fields) {
if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
my $tag = $1;
if ($$textref =~ m@<$tag[^>]*>(.*?)$tag[^>]*>@g) {
my $content = $1;
$content =~ s/ / /g;
$content =~ s/<[^>]*>/ /g;
$content =~ s/^\s+//;
$content =~ s/\s+$//;
$content =~ s/\s+/ /gs;
if ($content) {
$tag=$find_fields{"tag$tag"}; # get the user's capitalisation
$tag =~ s/^tag//i;
$doc_obj->add_utf8_metadata ($section, $tag, $content);
print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
if ($self->{'verbosity'} > 2);
}
}
}
}
# evaluate any "../" to next directory up
# evaluate any "./" as here
sub eval_dir_dots {
my $self = shift (@_);
my ($filename) = @_;
my $dirsep_os = &util::get_os_dirsep();
my @dirsep = split(/$dirsep_os/,$filename);
my @eval_dirs = ();
foreach my $d (@dirsep) {
if ($d eq "..") {
pop(@eval_dirs);
} elsif ($d eq ".") {
# do nothing!
} else {
push(@eval_dirs,$d);
}
}
# Need to fiddle with number of elements in @eval_dirs if the
# first one is the empty string. This is because of a
# modification to util::filename_cat that supresses the addition
# of a leading '/' character (or \ if windows) (intended to help
# filename cat with relative paths) if the first entry in the
# array is the empty string. Making the array start with *two*
# empty strings is a way to defeat this "smart" option.
#
if (scalar(@eval_dirs) > 0) {
if ($eval_dirs[0] eq ""){
unshift(@eval_dirs,"");
}
}
my $evaluated_filename = (scalar @eval_dirs > 0) ? &util::filename_cat(@eval_dirs) : "";
return $evaluated_filename;
}
sub replace_usemap_links {
my $self = shift (@_);
my ($front, $link, $back) = @_;
# remove quotes from link at start and end if necessary
if ($link=~/^[\"\']/) {
$link=~s/^[\"\']//;
$link=~s/[\"\']$//;
$front.='"';
$back="\"$back";
}
$link =~ s/^\.\///;
return $front . $link . $back;
}
sub inc_filecount {
my $self = shift (@_);
if ($self->{'file_num'} == 1000) {
$self->{'dir_num'} ++;
$self->{'file_num'} = 0;
} else {
$self->{'file_num'} ++;
}
}
# Extend read_file so that strings like é are
# converted to UTF8 internally.
#
# We don't convert < or > or & or " in case
# they interfere with the GML files
sub read_file {
my $self = shift(@_);
my ($filename, $encoding, $language, $textref) = @_;
$self->SUPER::read_file($filename, $encoding, $language, $textref);
# Convert entities to their UTF8 equivalents
$$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
$$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
$$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
}
1;