###########################################################################
#
# HTMLImagePlugin.pm -- Context-based image indexing plugin for HTML documents
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 2001 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
# DESCRIPTION:
#
# Extracts images and associated text and metadata from
# web pages as individual documents for indexing. Thumbnails
# are created from each image for browsing purposes.
#
# Options are available for configuring how aggressively the
# associated text is extracted. A higher level of aggressiveness
# extracts more text, which may lower accuracy (precision), but
# may also retrieve more of the relevant images in the collection
# (recall). Lower levels of aggressiveness may result in slightly
# faster collection builds at the import stage.
#
# HTMLImagePlugin is a subclass of HTMLPlugin (i.e. it can also index
# the pages themselves if required). It can be used in place of
# HTMLPlugin to index both pages and their images.
#
# REQUIREMENTS:
#
# The ImageMagick image manipulation suite is used to create
# thumbnails and to extract some image metadata. (Available
# from http://www.imagemagick.org/)
#
# Unix:
# Many Linux distributions contain ImageMagick.
#
# Windows:
# ImageMagick can be downloaded from the website above.
# Make sure the system path includes the ImageMagick binaries
# before using HTMLImagePlugin.
#
# NOTE: NT/2000/XP contain a filesystem utility 'convert.exe'
# with the same name as the image conversion utility. The
# ImageMagick FAQ recommends renaming the filesystem
# utility (e.g. to 'fsconvert.exe') to avoid this clash.
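#
# A quick way to confirm that ImageMagick (and the right 'convert')
# is on the path is to run, for example:
#
#    identify -version
#    convert -version
#
# Both should print an ImageMagick version banner. On Windows, if the
# filesystem 'convert.exe' is picked up instead, the second command
# will not report an ImageMagick version.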
#
# USAGE:
#
# An image document consists of metadata elements:
#
# OriginalFilename, FilePath, Filename, FileExt, FileSize,
# Width, Height, URL, PageURL, ThumbURL, CacheURL, CachePageURL
# ImageText, PageTitle
#
# Most of these are only useful in format strings (e.g. ThumbURL,
# Filename, URL, PageURL, CachePageURL).
#
# ImageText, as the name suggests, contains the indexable text
# (unless the -document_text plugin option is used, in which case
# the extracted text is stored as the document text instead).
#
# Since image documents are made up of metadata elements
# alone, format strings are needed to display them properly.
# NOTE: The receptionist will only display results (e.g. thumbnails)
# in 4 columns if the format string begins with "<td>".
#
# The example below takes the user to the image within the
# source HTML document rather than using a format string
# on DocumentText to display the image document itself.
#
# Example collect.cfg:
#
# ...
#
# indexes document:ImageText document:text
# defaultindex document:ImageText
#
# collectionmeta .document:ImageText "images"
# collectionmeta .document:text "documents"
#
# ...
#
# plugin HTMLImagePlugin -index_pages -aggressiveness 6
#
# ...
#
# format SearchVList '<td>{If}{[Title],<a href="[CachePageURL]"><img src="[ThumbURL]"></a><br>[Title],
#                     <a href="[CachePageURL]"><img src="[ThumbURL]"></a>}</td>'
#
# ...
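#
# If you would rather display the image document itself, a DocumentText
# format string along the following lines could be used instead (an
# illustrative sketch only; it assumes images are being cached, i.e.
# the -no_cache_images option is not set):
#
# format DocumentText '<center><img src="[CacheURL]"><br>[ImageText]</center>'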
#
package HTMLImagePlugin;
use HTMLPlugin;
use ghtml;
use unicode;
use util;
use strict; # 'subs';
no strict 'refs'; # allow filehandles to be variables and vice versa
sub BEGIN {
@HTMLImagePlugin::ISA = qw( HTMLPlugin );
}
my $aggressiveness_list =
[ { 'name' => "1",
'desc' => "{HTMLImagePlugin.aggressiveness.1}" },
{ 'name' => "2",
'desc' => "{HTMLImagePlugin.aggressiveness.2}" },
{ 'name' => "3",
'desc' => "{HTMLImagePlugin.aggressiveness.3}" },
{ 'name' => "4",
'desc' => "{HTMLImagePlugin.aggressiveness.4}" },
{ 'name' => "5",
'desc' => "{HTMLImagePlugin.aggressiveness.5}" },
{ 'name' => "6",
'desc' => "{HTMLImagePlugin.aggressiveness.6}" },
{ 'name' => "7",
'desc' => "{HTMLImagePlugin.aggressiveness.7}" },
{ 'name' => "8",
'desc' => "{HTMLImagePlugin.aggressiveness.8}" },
{ 'name' => "9",
'desc' => "{HTMLImagePlugin.aggressiveness.9}" } ];
my $arguments =
[ { 'name' => "aggressiveness",
'desc' => "{HTMLImagePlugin.aggressiveness}",
'type' => "int",
'list' => $aggressiveness_list,
'deft' => "3",
'reqd' => "no" },
{ 'name' => "index_pages",
'desc' => "{HTMLImagePlugin.index_pages}",
'type' => "flag",
'reqd' => "no" },
{ 'name' => "no_cache_images",
'desc' => "{HTMLImagePlugin.no_cache_images}",
'type' => "flag",
'reqd' => "no" },
{ 'name' => "min_size",
'desc' => "{HTMLImagePlugin.min_size}",
'type' => "int",
'deft' => "2000",
'reqd' => "no" },
{ 'name' => "min_width",
'desc' => "{HTMLImagePlugin.min_width}",
'type' => "int",
'deft' => "50",
'reqd' => "no" },
{ 'name' => "min_height",
'desc' => "{HTMLImagePlugin.min_height}",
'type' => "int",
'deft' => "50",
'reqd' => "no" },
{ 'name' => "thumb_size",
'desc' => "{HTMLImagePlugin.thumb_size}",
'type' => "int",
'deft' => "100",
'reqd' => "no" },
{ 'name' => "convert_params",
'desc' => "{HTMLImagePlugin.convert_params}",
'type' => "string",
'deft' => "",
'reqd' => "no" },
{ 'name' => "min_near_text",
'desc' => "{HTMLImagePlugin.min_near_text}",
'type' => "int",
'deft' => "10",
'reqd' => "no" },
{ 'name' => "max_near_text",
'desc' => "{HTMLImagePlugin.max_near_text}",
'type' => "int",
'deft' => "400",
'reqd' => "no" },
{ 'name' => "smallpage_threshold",
'desc' => "{HTMLImagePlugin.smallpage_threshold}",
'type' => "int",
'deft' => "2048",
'reqd' => "no" },
{ 'name' => "textrefs_threshold",
'desc' => "{HTMLImagePlugin.textrefs_threshold}",
'type' => "int",
'deft' => "2",
'reqd' => "no" },
{ 'name' => "caption_length",
'desc' => "{HTMLImagePlugin.caption_length}",
'type' => "int",
'deft' => "80",
'reqd' => "no" },
{ 'name' => "neartext_length",
'desc' => "{HTMLImagePlugin.neartext_length}",
'type' => "int",
'deft' => "300",
'reqd' => "no" },
{ 'name' => "document_text",
'desc' => "{HTMLImagePlugin.document_text}",
'type' => "flag",
'reqd' => "no" } ];
my $options = { 'name' => "HTMLImagePlugin",
'desc' => "{HTMLImagePlugin.desc}",
'abstract' => "no",
'inherits' => "yes",
'args' => $arguments };
sub new {
my ($class) = shift (@_);
my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
push(@$pluginlist, $class);
push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
push(@{$hashArgOptLists->{"OptList"}},$options);
my $self = new HTMLPlugin($pluginlist, $inputargs, $hashArgOptLists);
# init class variables
$self->{'textref'} = undef; # init by read_file fn
$self->{'htdoc_obj'} = undef; # init by process fn
$self->{'htpath'} = undef; # init by process fn
$self->{'hturl'} = undef; # init by process fn
$self->{'plaintext'} = undef; # HTML stripped version - only init if needed by raw_neartext sub
$self->{'smallpage'} = 0; # set by process fn
$self->{'images_indexed'} = undef; # num of images indexed - if 1 or 2 then we know page is small
$self->{'initialised'} = undef; # flag (see set_extraction_options())
return bless $self, $class;
}
# if indexing pages, let HTMLPlugin do its stuff
# image extraction done through read()
sub process {
my $self = shift(@_);
my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
$self->{'imglist'} = ();
if ( $self->{'index_pages'} ) {
my $ok = $self->SUPER::process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
if ( ! $ok ) { return $ok }
$self->{'htdoc_obj'} = $doc_obj;
}
# else use URL for referencing
#if ( $file =~ /(.*)[\/\\]/ ) { $self->{'htpath'} = $1; } else { $self->{'htpath'} = $file; }
$self->{'htpath'} = $base_dir if (-d $base_dir);
if ( $file =~ /(.*)[\/\\]/ ) { $self->{'htpath'} .= "/$1"; }
$self->{'htpath'} =~ s/\\/\//g; # replace \ with /
$self->{'hturl'} = "http://$file";
$self->{'hturl'} =~ s/\\/\//g; # for windows
($self->{'filename'}) = $file =~ /.*[\/\\](.*)/;
($self->{'base_path'}) = $file =~ /(.*)[\/\\]/i;
if ( ( -s "$base_dir/$file") <= $self->{'smallpage_threshold'} ) {
$self->{'smallpage'} = 1;
} else { $self->{'smallpage'} = 0; }
if ( defined($self->{'initialised'}) ) { return 1; }
else {
$self->{'initialised'} = $self->set_extraction_options($base_dir =~ /^(.*?)\/import/i);
return $self->{'initialised'};
}
}
# get complex configuration options from configuration files
# -- $GSDLCOLLECTION/etc/HTMLImagePlugin.cfg (tag sets for aggr 2+)
# -- $GSDLHOME/etc/packages/phind/stopword/en/brown.sw (stopwords for aggr 5+)
# If there's no HTMLImagePlugin.cfg file we'll use the following default values
my $defaultcfg = '
<delimitertagset>
  <setname>Caption</setname>
  <taggroup>font</taggroup>
  <taggroup>tt</taggroup>
  <taggroup>small</taggroup>
  <taggroup>b</taggroup>
  <taggroup>i</taggroup>
  <taggroup>u</taggroup>
  <taggroup>em</taggroup>
  <taggroup>td</taggroup>
  <taggroup>li</taggroup>
  <taggroup>a</taggroup>
  <taggroup>p</taggroup>
  <taggroup>tr</taggroup>
  <taggroup>center</taggroup>
  <taggroup>div</taggroup>
  <taggroup>caption</taggroup>
  <taggroup>br</taggroup>
  <taggroup>ul</taggroup>
  <taggroup>ol</taggroup>
  <taggroup>table</taggroup>
  <taggroup>hr</taggroup>
</delimitertagset>
<delimitertagset>
  <setname>Neartext</setname>
  <taggroup>tr|hr|table|h\d|img|body</taggroup>
  <taggroup>td|tr|hr|table|h\d|img|body</taggroup>
  <taggroup>p|br|td|tr|hr|table|h\d|img|body</taggroup>
  <taggroup>font|p|i|b|em|img</taggroup>
</delimitertagset>
';
sub set_extraction_options() {
my ($self, $collpath) = @_;
my ($filepath);
print {$self->{'outhandle'}} "HTMLImagePlugin: Initialising\n"
if $self->{'verbosity'} > 1;
# etc/HTMLImagePlugin.cfg (XML)
# tag sets for captions and neartext
if ( $self->{'aggressiveness'} > 1 && $self->{'aggressiveness'} != 9 ) {
$self->{'delims'} = [];
$self->{'cdelims'} = [];
my ($cfg, @tagsets, $tagset, $type, @delims);
$filepath = "$collpath/etc/HTMLImagePlugin.cfg";
if ( open CFG, "<$filepath" ) {
while (<CFG>) { $cfg .= $_ }
close CFG;
} else {
$cfg = $defaultcfg;
}
(@tagsets) =
    $cfg =~ /<delimitertagset>(.*?)<\/delimitertagset>/igs;
foreach $tagset ( @tagsets ) {
($type) = $tagset =~ /<setname>(.*?)<\/setname>/i;
if ( lc($type) eq "caption" ) {
(@{$self->{'cdelims'}}) = $tagset =~ /<taggroup>(.*?)<\/taggroup>/igs;
}
elsif ( lc($type) eq "neartext" ) {
(@{$self->{'delims'}}) = $tagset =~ /<taggroup>(.*?)<\/taggroup>/igs;
}
}
# output a warning if there seem to be no delimiters
if ( scalar(@{$self->{'cdelims'}}) == 0 ) {
print {$self->{'outhandle'}} "HTMLImagePlugin: Warning: no caption delimiters found in $filepath\n";
}
if ( scalar(@{$self->{'delims'}}) == 0 ) {
print {$self->{'outhandle'}} "HTMLImagePlugin: Warning: no neartext delimiters found in $filepath\n";
}
}
# get stop words for textual reference extraction
# TODO: warnings scroll off. Would be best to output them again at end of import
if ( $self->{'aggressiveness'} >=5 && $self->{'aggressiveness'} != 9 ) {
$self->{'stopwords'} = ();
$filepath = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword", "en", "brown.sw");
if ( open STOPWORDS, "<$filepath" ) {
while ( <STOPWORDS> ) {
chomp;
$self->{'stopwords'}{$_} = 1;
}
close STOPWORDS;
} else {
print {$self->{'outhandle'}} "HTMLImagePlugin: Warning: couldn't open stopwords file at $filepath ($!)\n";
}
}
if ( $self->{'neartext_length'} > $self->{'max_near_text'} ) {
$self->{'max_near_text'} = $self->{'neartext_length'} * 1.33;
print {$self->{'outhandle'}} "HTMLImagePlugin: Warning: adjusted max_near_text to $self->{'max_near_text'}\n";
}
if ( $self->{'caption_length'} > $self->{'max_near_text'} ) {
$self->{'max_near_text'} = $self->{'caption_length'} * 1.33;
print {$self->{'outhandle'}} "HTMLImagePlugin: Warning: adjusted max_near_text to $self->{'max_near_text'}\n";
}
return 1;
}
# return number of files processed, undef if can't recognise, -1 if
# cant process
# Note that $base_dir might be "" and that $file might
# include directories
sub read {
my ($self, $pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = (@_);
my ($doc_obj, $section, $filepath, $imgtag, $pos, $context, $numdocs, $tndir, $imgs);
# forward normal read (runs HTMLPlugin if index_pages T)
my $ok = $self->SUPER::read($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
if ( ! $ok ) { return $ok } # what is this returning??
my $outhandle = $self->{'outhandle'};
my $textref = $self->{'textref'};
my $htdoc_obj = $self->{'htdoc_obj'};
$numdocs = 0;
$base_dir =~ /(.*)\/.*/;
$tndir = "$1/archives/thumbnails"; # TODO: this path shouldn't be hardcoded?
&util::mk_all_dir($tndir) unless -e "$tndir";
$imgs = \%{$self->{'imglist'}};
my $nimgs = $self->get_img_list($textref);
$self->{'images_indexed'} = $nimgs;
if ( $nimgs > 0 ) {
my @fplist = (sort { $imgs->{$a}{'pos'} <=> $imgs->{$b}{'pos'} } keys %{$imgs});
my $i = 0;
foreach $filepath ( @fplist ) {
$pos = $imgs->{$filepath}{'pos'};
$context = substr ($$textref, $pos - 50, $pos + 50); # grab context (quicker)
($imgtag) = ($context =~ /(<(?:img|a|body)\s[^>]*$filepath[^>]*>)/is );
if (! defined($imgtag)) { $imgtag = $filepath }
print $outhandle "HTMLImagePlugin: extracting $filepath\n"
if ( $self->{'verbosity'} > 1 );
$doc_obj = new doc ("", "indexed_doc", $self->{'file_rename_method'});
$section = $doc_obj->get_top_section();
my $prevpos = ( $i == 0 ? 0 : $imgs->{$fplist[$i - 1]}{'pos'});
my $nextpos = ( $i >= ($nimgs -1) ? -1 : $imgs->{$fplist[$i + 1]}{'pos'} );
$self->extract_image_info($imgtag, $filepath, $textref, $doc_obj, $section, $tndir, $prevpos, $nextpos);
$processor->process($doc_obj);
$numdocs++;
$i++;
}
return $numdocs;
} else {
print $outhandle "HTMLImagePlugin: No images from $file indexed\n"
if ( $self->{'verbosity'} > 2 );
return 1;
}
}
# for every valid image tag
# 1. extract related text and image metadata
# 2. add this as document meta-data
# 3. add assoc image(s) as files
#
sub extract_image_info {
my $self = shift (@_);
my ($tag, $id, $textref, $doc_obj, $section, $tndir, $prevpos, $nextpos) = (@_);
my ($filename, $orig_fp, $fn, $ext, $reltext, $relreltext, $crcid, $imgs,
$thumbfp, $pagetitle, $alttext, $filepath, $aggr);
$aggr = $self->{'aggressiveness'};
$imgs = \%{$self->{'imglist'}};
$filepath = $imgs->{$id}{'relpath'};
($filename) = $filepath =~ /([^\/\\]+)$/s;
($orig_fp) = "$self->{'base_path'}/$filepath";
$orig_fp =~ tr/+/ /;
$orig_fp =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; # decode URL-encoded characters, e.g. %20 -> space
$orig_fp =~ s/\\/\//g;
$filepath = "$self->{'htpath'}/$filepath";
my ($onlyfn) = $filename =~ /([^\\\/]*)$/;
($fn, $ext) = $onlyfn =~ /(.*)\.(.*)/;
$fn = lc $fn; $ext = lc $ext;
($reltext) = "GifComment | " . `identify $filepath -ping -format "%c"` . " | \n"
if ($ext eq "gif");
$reltext .= "FilePath | $orig_fp | \n";
if ($ENV{'GSDLOS'} =~ /^windows$/i) {
$crcid = "$fn.$ext." . $self->{'next_crcid'}++;
} else { ($crcid) = `cksum $filepath` =~ /^(\d+)/; }
$thumbfp = "$tndir/tn_$crcid.jpg";
`convert -flatten -filter Hanning $self->{'convert_params'} -geometry "$self->{'thumb_size'}x$self->{'thumb_size'}>" $filepath $thumbfp` unless -e $thumbfp;
if ( ! (-e $thumbfp) ) {
	print STDERR "HTMLImagePlugin: 'convert' failed. Check that the ImageMagick binaries are installed and working correctly\n";
	return 0;
}
# shove in full text (tag stripped or unstripped) if settings require it
if ( $aggr == 9) {
$reltext = "AllPage | " . $$textref . " | \n"; # level 9 (all text, verbatim)
} else {
$pagetitle = $self->get_meta_value("title", $textref);
($alttext) = $tag =~ /\salt\s*=\s*(?:\"|\')(.+?)(?:\"|\')/is;
if ( defined($alttext) && length($alttext) > 1) {
$reltext .= " ALTtext | $alttext | \n"; }
$reltext .= "SplitCapitalisation | " .
$self->split_filepath($orig_fp) . " | \n";
# get caption/tag based near text (if appropriate)
if ( $aggr > 1 ) {
if ( $aggr >= 2 ) {
$reltext .=
$self->extract_caption_text($tag, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
$relreltext = $reltext;
}
# repeat the filepath, alt-text, caption, etc
if ( $aggr == 8 ) {
$reltext .= $relreltext;
}
if ( $aggr >= 3 ) {
$reltext .=
$self->extract_near_text($tag, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
}
# get page metadata (if appropriate)
if ( $aggr >= 6 || ( $aggr >= 2 &&
( $self->{'images_indexed'} < 2 ||
($self->{'smallpage'} == 1 && $self->{'images_indexed'} < 6 )))) {
$reltext .= $self->get_page_metadata($textref);
}
# textual references
if ( $aggr == 5 || $aggr >= 7) {
if ( length($relreltext) > ($self->{'caption_length'} * 2) ) {
$reltext .= $self->get_textrefs($relreltext, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos); }
else {
$reltext .= $self->get_textrefs($reltext, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
}
}
} # aggr > 1
} # aggr != 9
$doc_obj->set_OID($crcid);
$doc_obj->associate_file($thumbfp, "$fn.thumb.jpg", undef, $section);
$doc_obj->add_metadata($section, "OriginalFilename", $filename);
$doc_obj->add_metadata($section, "FilePath", $orig_fp);
$doc_obj->add_metadata($section, "Filename", $fn);
$doc_obj->add_metadata($section, "FileExt", $ext);
$doc_obj->add_metadata($section, "FileSize", $imgs->{$id}{'filesize'});
$doc_obj->add_metadata($section, "Width", $imgs->{$id}{'width'});
$doc_obj->add_metadata($section, "Height", $imgs->{$id}{'height'});
$doc_obj->add_metadata($section, "URL", "http://$orig_fp");
$doc_obj->add_metadata($section, "PageURL", $self->{'hturl'});
$doc_obj->add_metadata($section, "PageTitle", $pagetitle);
$doc_obj->add_metadata($section, "ThumbURL",
"_httpprefix_/collect/[collection]/index/assoc/[archivedir]/$fn.thumb.jpg");
$doc_obj->add_metadata($section, "FileFormat", "W3Img");
# store the extracted text, either as the document text itself or
# as ImageText metadata, depending on the -document_text option
if ( $self->{'document_text'} ) {
	$doc_obj->add_utf8_text($section, $reltext);
} else {
	$doc_obj->add_metadata($section, "ImageText", $reltext);
}
if ( $self->{'index_pages'} ) {
my ($cache_url) = "_httpdoc_&d=" . $self->{'htdoc_obj'}->get_OID();
if ( $imgs->{$id}{'anchored'} ) {
my $a_name = $id;
$a_name =~ s/[\/\\\:\&]/_/g;
$cache_url .= "#gsdl_$a_name" ;
}
$doc_obj->add_utf8_metadata($section, "CachePageURL", $cache_url);
}
if ( ! $self->{'no_cache_images'} ) {
$onlyfn = lc $onlyfn;
$doc_obj->associate_file($filepath, $onlyfn, undef, $section);
$doc_obj->add_utf8_metadata($section, "CacheURL",
"_httpprefix_/collect/[collection]/index/assoc/[archivedir]/$onlyfn");
}
return 1;
}
sub get_page_metadata {
my ($self, $textref) = (@_);
my (@rval);
$rval[0] = $self->get_meta_value("title", $textref);
$rval[1] = $self->get_meta_value("keywords", $textref);
$rval[2] = $self->get_meta_value("description", $textref);
$rval[3] = $self->{'filename'};
return wantarray ? @rval : "PageMeta | @rval | \n" ;
}
# turns LargeCatFish into Large,Cat,Fish so MG sees the separate words
sub split_filepath {
my ($self, $filepath) = (@_);
my (@words) = $filepath =~ /([A-Z][a-z]+)/g;
return join(',', @words);
}
# finds and extracts sentences
# that seem to be on the same topic
# as other related text (correlations)
# and textual references (e.g. in figure 3 ...)
sub get_textrefs {
my ($self, $reltext, $textref, $prevpos, $pos, $nextpos) = (@_);
my ($maxtext, $mintext, $startpos, $context_size, $context);
my (@relwords, @refwords, %sentences, @pagemeta);
# extract larger context
$maxtext = $self->{'max_near_text'};
$startpos = $pos - ($maxtext * 4);
$context_size = $maxtext*10;
if ($startpos < $prevpos ) { $startpos = $prevpos }
if ($nextpos != -1 && $context_size > ( $nextpos - $startpos )) { $context_size = ($nextpos - $startpos) }
$context = substr ( $$textref, $startpos, $context_size );
$context =~ s/<.*?>//gs;       # strip complete tags
$context =~ s/^.*>(.*)/$1/gs;  # strip a truncated tag at the start
$context =~ s/(.*)<.*$/$1/gs;  # strip a truncated tag at the end
# get page meta-data (if not already included)
if ( $self->{'aggressiveness'} == 5 && ! $self->{'smallpage'} ) {
@pagemeta = $self->get_page_metadata($textref);
foreach my $value ( @pagemeta ) {
$context .= "$value."; # make each into psuedo-sentence
}
}
# TODO: this list is not exhaustive
@refwords = ( '(?:is|are)? ?(?:show(?:s|n)|demonstrate(?:d|s)|explains|features) (?:in|by|below|above|here)',
'(?:see)? (?:figure|table)? (?:below|above)');
# extract general references
foreach my $rw ( @refwords ) {
while ( $context =~ /[\.\?\!\,](.*?$rw\W.*?[\.\?\!\,])/ig ) {
my $sentence = $1;
$sentence =~ s/\s+/ /g;
$sentences{$sentence}+=2;
}
}
# extract specific (figure, table) references by number
my ($fignum) = $context =~ /[\.\?\!].*?(?:figure|table)s?[\-\_\ \.](\d+\w*)\W.*?[\.\?\!]/ig;
if ( $fignum ) {
foreach my $rw ( @refwords ) {
while ( $context =~ /[\.\?\!](.*?(figure|table)[\-\_\ \.]$fignum\W.*?[\.\?\!])/ig ) {
my $sentence = $1;
$sentence =~ s/\s+/ /g;
$sentences{$sentence}+=4;
}
}
}
# sentences with occurrences of important words
@relwords = $reltext =~ /([a-zA-Z]{4,})/g; # take out small words
foreach my $word ( @relwords ) {
if ( $self->{'stopwords'}{$word} ) { next } # skip stop words
while ( $context =~ /([^\.\?\!]*?$word\W.*?[\.\?\!])/ig ) {
my $sentence = $1;
$sentence =~ s/\s+/ /g;
$sentences{$sentence}++;
}
}
foreach my $sentence ( keys %sentences ) {
if ($sentences{$sentence} < $self->{'textrefs_threshold'}) {
delete $sentences{$sentence};
}
}
my ($rval) = join " \n", (keys %sentences);
if ( $rval && length($rval) > 5 ) {
return ( "TextualReferences | " . $rval . " | \n") }
else { return "" }
}
# handles caption extraction by calling the extractor
# with different tag sets and choosing the best
# candidate caption
sub extract_caption_text {
my ($self, $tag, $textref, $prevpos, $pos, $nextpos) = (@_);
my (@neartext, $len, $hdelim, $mintext, $goodlen,
$startpos, $context, $context_size);
$mintext = $self->{'min_near_text'};
$goodlen = $self->{'caption_length'};
# extract a context to extract near text from (faster)
$context_size = $self->{'max_near_text'}*3;
$startpos = $pos - ($context_size / 2);
if ($startpos < $prevpos ) { $startpos = $prevpos }
if ($nextpos != -1 && $context_size > ( $nextpos - $startpos ))
{ $context_size = ($nextpos - $startpos) }
$context = substr ( $$textref, $startpos, $context_size );
$context =~ s/<!--.*?-->//gs;    # strip complete HTML comments
$context =~ s/^.*-->(.*)/$1/gs;  # strip a truncated comment at the start
$context =~ s/(.*)<!--.*$/$1/gs; # strip a truncated comment at the end
$context =~ s/^.*-->(.*)/$1/gs;
$context =~ s/(.*) |