###########################################################################
#
# HTMLImagePlugin.pm -- Context-based image indexing plugin for HTML documents
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 2001 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
# DESCRIPTION:
#
# Extracts images and associated text and metadata from
# web pages as individual documents for indexing. Thumbnails
# are created from each image for browsing purposes.
#
# Options are available for configuring how aggressively the
# associated text is extracted. A higher level of aggressiveness
# extracts more text, which may lower accuracy (precision), but
# may also retrieve more of the relevant images in the collection
# (recall). Lower levels of aggressiveness may result in slightly
# faster collection builds at the import stage.
#
# HTMLImagePlugin is a subclass of HTMLPlugin (i.e. it can also index
# the pages themselves if required). It can be used in place of
# HTMLPlugin to index both pages and their images.
#
# REQUIREMENTS:
#
# The ImageMagick image manipulation suite is used to create
# thumbnails and to extract some image metadata. (Available
# from http://www.imagemagick.org/)
#
# Unix:
# Many Linux distributions contain ImageMagick.
#
# Windows:
# ImageMagick can be downloaded from the website above.
# Make sure the system path includes the ImageMagick binaries
# before using HTMLImagePlugin.
#
# NOTE: NT/2000/XP contain a filesystem utility 'convert.exe'
# with the same name as the image conversion utility. The
# ImageMagick FAQ recommends renaming the filesystem
# utility (e.g. to 'fsconvert.exe') to avoid this clash.
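#
# A quick way to confirm that ImageMagick (and the right 'convert')
# is on the path is to run, for example:
#
#    identify -version
#    convert -version
#
# Both should print an ImageMagick version banner. On Windows, if the
# filesystem 'convert.exe' is picked up instead, the second command
# will not report an ImageMagick version.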
#
# USAGE:
#
# An image document consists of metadata elements:
#
# OriginalFilename, FilePath, Filename, FileExt, FileSize,
# Width, Height, URL, PageURL, ThumbURL, CacheURL, CachePageURL
# ImageText, PageTitle
#
# Most of these are only useful in format strings (e.g. ThumbURL,
# Filename, URL, PageURL, CachePageURL).
#
# ImageText, as the name suggests, contains the indexable text
# (unless the -document_text plugin option is used, in which case
# the extracted text is stored as the document text instead).
#
# Since image documents are made up of metadata elements
# alone, format strings are needed to display them properly.
# NOTE: The receptionist will only display results (e.g. thumbnails)
# in 4 columns if the format string begins with "<td>".
#
# The example below takes the user to the image within the
# source HTML document rather than using a format string
# on DocumentText to display the image document itself.
#
# Example collect.cfg:
#
# ...
#
# indexes document:ImageText document:text
# defaultindex document:ImageText
#
# collectionmeta .document:ImageText "images"
# collectionmeta .document:text "documents"
#
# ...
#
# plugin HTMLImagePlugin -index_pages -aggressiveness 6
#
# ...
#
# format SearchVList '<td>{If}{[Title],<a href="[CachePageURL]"><img src="[ThumbURL]"></a><br>[Title],
#                     <a href="[CachePageURL]"><img src="[ThumbURL]"></a>}</td>'
#
# ...
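#
# If you would rather display the image document itself, a DocumentText
# format string along the following lines could be used instead (an
# illustrative sketch only; it assumes images are being cached, i.e.
# the -no_cache_images option is not set):
#
# format DocumentText '<center><img src="[CacheURL]"><br>[ImageText]</center>'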
#
package HTMLImagePlugin;
use HTMLPlugin;
use ghtml;
use unicode;
use util;
use strict; # 'subs';
no strict 'refs'; # allow filehandles to be variables and vice versa
sub BEGIN {
@HTMLImagePlugin::ISA = qw( HTMLPlugin );
}
my $aggressiveness_list =
[ { 'name' => "1",
'desc' => "{HTMLImagePlugin.aggressiveness.1}" },
{ 'name' => "2",
'desc' => "{HTMLImagePlugin.aggressiveness.2}" },
{ 'name' => "3",
'desc' => "{HTMLImagePlugin.aggressiveness.3}" },
{ 'name' => "4",
'desc' => "{HTMLImagePlugin.aggressiveness.4}" },
{ 'name' => "5",
'desc' => "{HTMLImagePlugin.aggressiveness.5}" },
{ 'name' => "6",
'desc' => "{HTMLImagePlugin.aggressiveness.6}" },
{ 'name' => "7",
'desc' => "{HTMLImagePlugin.aggressiveness.7}" },
{ 'name' => "8",
'desc' => "{HTMLImagePlugin.aggressiveness.8}" },
{ 'name' => "9",
'desc' => "{HTMLImagePlugin.aggressiveness.9}" } ];
my $arguments =
[ { 'name' => "aggressiveness",
'desc' => "{HTMLImagePlugin.aggressiveness}",
'type' => "int",
'list' => $aggressiveness_list,
'deft' => "3",
'reqd' => "no" },
{ 'name' => "index_pages",
'desc' => "{HTMLImagePlugin.index_pages}",
'type' => "flag",
'reqd' => "no" },
{ 'name' => "no_cache_images",
'desc' => "{HTMLImagePlugin.no_cache_images}",
'type' => "flag",
'reqd' => "no" },
{ 'name' => "min_size",
'desc' => "{HTMLImagePlugin.min_size}",
'type' => "int",
'deft' => "2000",
'reqd' => "no" },
{ 'name' => "min_width",
'desc' => "{HTMLImagePlugin.min_width}",
'type' => "int",
'deft' => "50",
'reqd' => "no" },
{ 'name' => "min_height",
'desc' => "{HTMLImagePlugin.min_height}",
'type' => "int",
'deft' => "50",
'reqd' => "no" },
{ 'name' => "thumb_size",
'desc' => "{HTMLImagePlugin.thumb_size}",
'type' => "int",
'deft' => "100",
'reqd' => "no" },
{ 'name' => "convert_params",
'desc' => "{HTMLImagePlugin.convert_params}",
'type' => "string",
'deft' => "",
'reqd' => "no" },
{ 'name' => "min_near_text",
'desc' => "{HTMLImagePlugin.min_near_text}",
'type' => "int",
'deft' => "10",
'reqd' => "no" },
{ 'name' => "max_near_text",
'desc' => "{HTMLImagePlugin.max_near_text}",
'type' => "int",
'deft' => "400",
'reqd' => "no" },
{ 'name' => "smallpage_threshold",
'desc' => "{HTMLImagePlugin.smallpage_threshold}",
'type' => "int",
'deft' => "2048",
'reqd' => "no" },
{ 'name' => "textrefs_threshold",
'desc' => "{HTMLImagePlugin.textrefs_threshold}",
'type' => "int",
'deft' => "2",
'reqd' => "no" },
{ 'name' => "caption_length",
'desc' => "{HTMLImagePlugin.caption_length}",
'type' => "int",
'deft' => "80",
'reqd' => "no" },
{ 'name' => "neartext_length",
'desc' => "{HTMLImagePlugin.neartext_length}",
'type' => "int",
'deft' => "300",
'reqd' => "no" },
{ 'name' => "document_text",
'desc' => "{HTMLImagePlugin.document_text}",
'type' => "flag",
'reqd' => "no" } ];
my $options = { 'name' => "HTMLImagePlugin",
'desc' => "{HTMLImagePlugin.desc}",
'abstract' => "no",
'inherits' => "yes",
'args' => $arguments };
sub new {
my ($class) = shift (@_);
my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
push(@$pluginlist, $class);
push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
push(@{$hashArgOptLists->{"OptList"}},$options);
my $self = new HTMLPlugin($pluginlist, $inputargs, $hashArgOptLists);
# init class variables
$self->{'textref'} = undef; # init by read_file fn
$self->{'htdoc_obj'} = undef; # init by process fn
$self->{'htpath'} = undef; # init by process fn
$self->{'hturl'} = undef; # init by process fn
$self->{'plaintext'} = undef; # HTML stripped version - only init if needed by raw_neartext sub
$self->{'smallpage'} = 0; # set by process fn
$self->{'images_indexed'} = undef; # num of images indexed - if 1 or 2 then we know page is small
$self->{'initialised'} = undef; # flag (see set_extraction_options())
return bless $self, $class;
}
# if indexing pages, let HTMLPlugin do its stuff
# image extraction done through read()
sub process {
my $self = shift(@_);
my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
$self->{'imglist'} = ();
if ( $self->{'index_pages'} ) {
my $ok = $self->SUPER::process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
if ( ! $ok ) { return $ok }
$self->{'htdoc_obj'} = $doc_obj;
}
# else use URL for referencing
#if ( $file =~ /(.*)[\/\\]/ ) { $self->{'htpath'} = $1; } else { $self->{'htpath'} = $file; }
$self->{'htpath'} = $base_dir if (-d $base_dir);
if ( $file =~ /(.*)[\/\\]/ ) { $self->{'htpath'} .= "/$1"; }
$self->{'htpath'} =~ s/\\/\//g; # replace \ with /
$self->{'hturl'} = "http://$file";
$self->{'hturl'} =~ s/\\/\//g; # for windows
($self->{'filename'}) = $file =~ /.*[\/\\](.*)/;
($self->{'base_path'}) = $file =~ /(.*)[\/\\]/i;
if ( ( -s "$base_dir/$file") <= $self->{'smallpage_threshold'} ) {
$self->{'smallpage'} = 1;
} else { $self->{'smallpage'} = 0; }
if ( defined($self->{'initialised'}) ) { return 1; }
else {
$self->{'initialised'} = $self->set_extraction_options($base_dir =~ /^(.*?)\/import/i);
return $self->{'initialised'};
}
}
# get complex configuration options from configuration files
# -- $GSDLCOLLECTION/etc/HTMLImagePlugin.cfg (tag sets for aggr 2+)
# -- $GSDLHOME/etc/packages/phind/stopword/en/brown.sw (stopwords for aggr 5+)
# If there's no HTMLImagePlugin.cfg file we'll use the following default values
my $defaultcfg = '
<delimitertagset>
  <setname>Caption</setname>
  <taggroup>font</taggroup>
  <taggroup>tt</taggroup>
  <taggroup>small</taggroup>
  <taggroup>b</taggroup>
  <taggroup>i</taggroup>
  <taggroup>u</taggroup>
  <taggroup>em</taggroup>
  <taggroup>td</taggroup>
  <taggroup>li</taggroup>
  <taggroup>a</taggroup>
  <taggroup>p</taggroup>
  <taggroup>tr</taggroup>
  <taggroup>center</taggroup>
  <taggroup>div</taggroup>
  <taggroup>caption</taggroup>
  <taggroup>br</taggroup>
  <taggroup>ul</taggroup>
  <taggroup>ol</taggroup>
  <taggroup>table</taggroup>
  <taggroup>hr</taggroup>
</delimitertagset>
<delimitertagset>
  <setname>Neartext</setname>
  <taggroup>tr|hr|table|h\d|img|body</taggroup>
  <taggroup>td|tr|hr|table|h\d|img|body</taggroup>
  <taggroup>p|br|td|tr|hr|table|h\d|img|body</taggroup>
  <taggroup>font|p|i|b|em|img</taggroup>
</delimitertagset>
';
sub set_extraction_options() {
my ($self, $collpath) = @_;
my ($filepath);
print {$self->{'outhandle'}} "HTMLImagePlugin: Initialising\n"
if $self->{'verbosity'} > 1;
# etc/HTMLImagePlugin.cfg (XML)
# tag sets for captions and neartext
if ( $self->{'aggressiveness'} > 1 && $self->{'aggressiveness'} != 9 ) {
$self->{'delims'} = [];
$self->{'cdelims'} = [];
my ($cfg, @tagsets, $tagset, $type, @delims);
$filepath = "$collpath/etc/HTMLImagePlugin.cfg";
if ( open CFG, "<$filepath" ) {
while (<CFG>) { $cfg .= $_ }
close CFG;
} else {
$cfg = $defaultcfg;
}
(@tagsets) =
    $cfg =~ /<delimitertagset>(.*?)<\/delimitertagset>/igs;
foreach $tagset ( @tagsets ) {
($type) = $tagset =~ /<setname>(.*?)<\/setname>/i;
if ( lc($type) eq "caption" ) {
(@{$self->{'cdelims'}}) = $tagset =~ /<taggroup>(.*?)<\/taggroup>/igs;
}
elsif ( lc($type) eq "neartext" ) {
(@{$self->{'delims'}}) = $tagset =~ /<taggroup>(.*?)<\/taggroup>/igs;
}
}
# output a warning if there seem to be no delimiters
if ( scalar(@{$self->{'cdelims'}}) == 0 ) {
print {$self->{'outhandle'}} "HTMLImagePlugin: Warning: no caption delimiters found in $filepath\n";
}
if ( scalar(@{$self->{'delims'}}) == 0 ) {
print {$self->{'outhandle'}} "HTMLImagePlugin: Warning: no neartext delimiters found in $filepath\n";
}
}
# get stop words for textual reference extraction
# TODO: warnings scroll off. Would be best to output them again at end of import
if ( $self->{'aggressiveness'} >=5 && $self->{'aggressiveness'} != 9 ) {
$self->{'stopwords'} = ();
$filepath = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword", "en", "brown.sw");
if ( open STOPWORDS, "<$filepath" ) {
while ( <STOPWORDS> ) {
chomp;
$self->{'stopwords'}{$_} = 1;
}
close STOPWORDS;
} else {
print {$self->{'outhandle'}} "HTMLImagePlugin: Warning: couldn't open stopwords file at $filepath ($!)\n";
}
}
if ( $self->{'neartext_length'} > $self->{'max_near_text'} ) {
$self->{'max_near_text'} = $self->{'neartext_length'} * 1.33;
print {$self->{'outhandle'}} "HTMLImagePlugin: Warning: adjusted max_near_text to $self->{'max_near_text'}\n";
}
if ( $self->{'caption_length'} > $self->{'max_near_text'} ) {
$self->{'max_near_text'} = $self->{'caption_length'} * 1.33;
print {$self->{'outhandle'}} "HTMLImagePlugin: Warning: adjusted max_near_text to $self->{'max_near_text'}\n";
}
return 1;
}
# return number of files processed, undef if can't recognise, -1 if
# cant process
# Note that $base_dir might be "" and that $file might
# include directories
sub read {
my ($self, $pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = (@_);
my ($doc_obj, $section, $filepath, $imgtag, $pos, $context, $numdocs, $tndir, $imgs);
# forward normal read (runs HTMLPlugin if index_pages T)
my $ok = $self->SUPER::read($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
if ( ! $ok ) { return $ok } # what is this returning??
my $outhandle = $self->{'outhandle'};
my $textref = $self->{'textref'};
my $htdoc_obj = $self->{'htdoc_obj'};
$numdocs = 0;
$base_dir =~ /(.*)\/.*/;
$tndir = "$1/archives/thumbnails"; # TODO: this path shouldn't be hardcoded?
&util::mk_all_dir($tndir) unless -e "$tndir";
$imgs = \%{$self->{'imglist'}};
my $nimgs = $self->get_img_list($textref);
$self->{'images_indexed'} = $nimgs;
if ( $nimgs > 0 ) {
my @fplist = (sort { $imgs->{$a}{'pos'} <=> $imgs->{$b}{'pos'} } keys %{$imgs});
my $i = 0;
foreach $filepath ( @fplist ) {
$pos = $imgs->{$filepath}{'pos'};
$context = substr ($$textref, $pos - 50, $pos + 50); # grab context (quicker)
($imgtag) = ($context =~ /(<(?:img|a|body)\s[^>]*$filepath[^>]*>)/is );
if (! defined($imgtag)) { $imgtag = $filepath }
print $outhandle "HTMLImagePlugin: extracting $filepath\n"
if ( $self->{'verbosity'} > 1 );
$doc_obj = new doc ("", "indexed_doc", $self->{'file_rename_method'});
$section = $doc_obj->get_top_section();
my $prevpos = ( $i == 0 ? 0 : $imgs->{$fplist[$i - 1]}{'pos'});
my $nextpos = ( $i >= ($nimgs -1) ? -1 : $imgs->{$fplist[$i + 1]}{'pos'} );
$self->extract_image_info($imgtag, $filepath, $textref, $doc_obj, $section, $tndir, $prevpos, $nextpos);
$processor->process($doc_obj);
$numdocs++;
$i++;
}
return $numdocs;
} else {
print $outhandle "HTMLImagePlugin: No images from $file indexed\n"
if ( $self->{'verbosity'} > 2 );
return 1;
}
}
# for every valid image tag
# 1. extract related text and image metadata
# 2. add this as document meta-data
# 3. add assoc image(s) as files
#
sub extract_image_info {
my $self = shift (@_);
my ($tag, $id, $textref, $doc_obj, $section, $tndir, $prevpos, $nextpos) = (@_);
my ($filename, $orig_fp, $fn, $ext, $reltext, $relreltext, $crcid, $imgs,
$thumbfp, $pagetitle, $alttext, $filepath, $aggr);
$aggr = $self->{'aggressiveness'};
$imgs = \%{$self->{'imglist'}};
$filepath = $imgs->{$id}{'relpath'};
($filename) = $filepath =~ /([^\/\\]+)$/s;
($orig_fp) = "$self->{'base_path'}/$filepath";
$orig_fp =~ tr/+/ /;
$orig_fp =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; # decode URL-encoded characters, e.g. %20 -> space
$orig_fp =~ s/\\/\//g;
$filepath = "$self->{'htpath'}/$filepath";
my ($onlyfn) = $filename =~ /([^\\\/]*)$/;
($fn, $ext) = $onlyfn =~ /(.*)\.(.*)/;
$fn = lc $fn; $ext = lc $ext;
($reltext) = "GifComment | " . `identify $filepath -ping -format "%c"` . " | \n"
if ($ext eq "gif");
$reltext .= "FilePath | $orig_fp | \n";
if ($ENV{'GSDLOS'} =~ /^windows$/i) {
$crcid = "$fn.$ext." . $self->{'next_crcid'}++;
} else { ($crcid) = `cksum $filepath` =~ /^(\d+)/; }
$thumbfp = "$tndir/tn_$crcid.jpg";
`convert -flatten -filter Hanning $self->{'convert_params'} -geometry "$self->{'thumb_size'}x$self->{'thumb_size'}>" $filepath $thumbfp` unless -e $thumbfp;
if ( ! (-e $thumbfp) ) {
	print STDERR "HTMLImagePlugin: 'convert' failed. Check that the ImageMagick binaries are installed and working correctly\n";
	return 0;
}
# shove in full text (tag stripped or unstripped) if settings require it
if ( $aggr == 9) {
$reltext = "AllPage | " . $$textref . " | \n"; # level 9 (all text, verbatim)
} else {
$pagetitle = $self->get_meta_value("title", $textref);
($alttext) = $tag =~ /\salt\s*=\s*(?:\"|\')(.+?)(?:\"|\')/is;
if ( defined($alttext) && length($alttext) > 1) {
$reltext .= " ALTtext | $alttext | \n"; }
$reltext .= "SplitCapitalisation | " .
$self->split_filepath($orig_fp) . " | \n";
# get caption/tag based near text (if appropriate)
if ( $aggr > 1 ) {
if ( $aggr >= 2 ) {
$reltext .=
$self->extract_caption_text($tag, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
$relreltext = $reltext;
}
# repeat the filepath, alt-text, caption, etc
if ( $aggr == 8 ) {
$reltext .= $relreltext;
}
if ( $aggr >= 3 ) {
$reltext .=
$self->extract_near_text($tag, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
}
# get page metadata (if appropriate)
if ( $aggr >= 6 || ( $aggr >= 2 &&
( $self->{'images_indexed'} < 2 ||
($self->{'smallpage'} == 1 && $self->{'images_indexed'} < 6 )))) {
$reltext .= $self->get_page_metadata($textref);
}
# textual references
if ( $aggr == 5 || $aggr >= 7) {
if ( length($relreltext) > ($self->{'caption_length'} * 2) ) {
$reltext .= $self->get_textrefs($relreltext, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos); }
else {
$reltext .= $self->get_textrefs($reltext, $textref, $prevpos, $imgs->{$id}{'pos'}, $nextpos);
}
}
} # aggr > 1
} # aggr != 9
$doc_obj->set_OID($crcid);
$doc_obj->associate_file($thumbfp, "$fn.thumb.jpg", undef, $section);
$doc_obj->add_metadata($section, "OriginalFilename", $filename);
$doc_obj->add_metadata($section, "FilePath", $orig_fp);
$doc_obj->add_metadata($section, "Filename", $fn);
$doc_obj->add_metadata($section, "FileExt", $ext);
$doc_obj->add_metadata($section, "FileSize", $imgs->{$id}{'filesize'});
$doc_obj->add_metadata($section, "Width", $imgs->{$id}{'width'});
$doc_obj->add_metadata($section, "Height", $imgs->{$id}{'height'});
$doc_obj->add_metadata($section, "URL", "http://$orig_fp");
$doc_obj->add_metadata($section, "PageURL", $self->{'hturl'});
$doc_obj->add_metadata($section, "PageTitle", $pagetitle);
$doc_obj->add_metadata($section, "ThumbURL",
"_httpprefix_/collect/[collection]/index/assoc/[archivedir]/$fn.thumb.jpg");
$doc_obj->add_metadata($section, "FileFormat", "W3Img");
# store the extracted text, either as the document text itself or
# as ImageText metadata, depending on the -document_text option
if ( $self->{'document_text'} ) {
	$doc_obj->add_utf8_text($section, $reltext);
} else {
	$doc_obj->add_metadata($section, "ImageText", $reltext);
}
if ( $self->{'index_pages'} ) {
my ($cache_url) = "_httpdoc_&d=" . $self->{'htdoc_obj'}->get_OID();
if ( $imgs->{$id}{'anchored'} ) {
my $a_name = $id;
$a_name =~ s/[\/\\\:\&]/_/g;
$cache_url .= "#gsdl_$a_name" ;
}
$doc_obj->add_utf8_metadata($section, "CachePageURL", $cache_url);
}
if ( ! $self->{'no_cache_images'} ) {
$onlyfn = lc $onlyfn;
$doc_obj->associate_file($filepath, $onlyfn, undef, $section);
$doc_obj->add_utf8_metadata($section, "CacheURL",
"_httpprefix_/collect/[collection]/index/assoc/[archivedir]/$onlyfn");
}
return 1;
}
sub get_page_metadata {
my ($self, $textref) = (@_);
my (@rval);
$rval[0] = $self->get_meta_value("title", $textref);
$rval[1] = $self->get_meta_value("keywords", $textref);
$rval[2] = $self->get_meta_value("description", $textref);
$rval[3] = $self->{'filename'};
return wantarray ? @rval : "PageMeta | @rval | \n" ;
}
# turns LargeCatFish into Large,Cat,Fish so MG sees the separate words
sub split_filepath {
my ($self, $filepath) = (@_);
my (@words) = $filepath =~ /([A-Z][a-z]+)/g;
return join(',', @words);
}
# finds and extracts sentences
# that seem to be on the same topic
# as other related text (correlations)
# and textual references (e.g. in figure 3 ...)
sub get_textrefs {
my ($self, $reltext, $textref, $prevpos, $pos, $nextpos) = (@_);
my ($maxtext, $mintext, $startpos, $context_size, $context);
my (@relwords, @refwords, %sentences, @pagemeta);
# extract larger context
$maxtext = $self->{'max_near_text'};
$startpos = $pos - ($maxtext * 4);
$context_size = $maxtext*10;
if ($startpos < $prevpos ) { $startpos = $prevpos }
if ($nextpos != -1 && $context_size > ( $nextpos - $startpos )) { $context_size = ($nextpos - $startpos) }
$context = substr ( $$textref, $startpos, $context_size );
$context =~ s/<.*?>//gs;       # strip complete tags
$context =~ s/^.*>(.*)/$1/gs;  # strip a truncated tag at the start
$context =~ s/(.*)<.*$/$1/gs;  # strip a truncated tag at the end
# get page meta-data (if not already included)
if ( $self->{'aggressiveness'} == 5 && ! $self->{'smallpage'} ) {
@pagemeta = $self->get_page_metadata($textref);
foreach my $value ( @pagemeta ) {
$context .= "$value."; # make each into psuedo-sentence
}
}
# TODO: this list is not exhaustive
@refwords = ( '(?:is|are)? ?(?:show(?:s|n)|demonstrate(?:d|s)|explains|features) (?:in|by|below|above|here)',
'(?:see)? (?:figure|table)? (?:below|above)');
# extract general references
foreach my $rw ( @refwords ) {
while ( $context =~ /[\.\?\!\,](.*?$rw\W.*?[\.\?\!\,])/ig ) {
my $sentence = $1;
$sentence =~ s/\s+/ /g;
$sentences{$sentence}+=2;
}
}
# extract specific (figure, table) references by number
my ($fignum) = $context =~ /[\.\?\!].*?(?:figure|table)s?[\-\_\ \.](\d+\w*)\W.*?[\.\?\!]/ig;
if ( $fignum ) {
foreach my $rw ( @refwords ) {
while ( $context =~ /[\.\?\!](.*?(figure|table)[\-\_\ \.]$fignum\W.*?[\.\?\!])/ig ) {
my $sentence = $1;
$sentence =~ s/\s+/ /g;
$sentences{$sentence}+=4;
}
}
}
# sentences with occurrences of important words
@relwords = $reltext =~ /([a-zA-Z]{4,})/g; # take out small words
foreach my $word ( @relwords ) {
if ( $self->{'stopwords'}{$word} ) { next } # skip stop words
while ( $context =~ /([^\.\?\!]*?$word\W.*?[\.\?\!])/ig ) {
my $sentence = $1;
$sentence =~ s/\s+/ /g;
$sentences{$sentence}++;
}
}
foreach my $sentence ( keys %sentences ) {
if ($sentences{$sentence} < $self->{'textrefs_threshold'}) {
delete $sentences{$sentence};
}
}
my ($rval) = join " \n", (keys %sentences);
if ( $rval && length($rval) > 5 ) {
return ( "TextualReferences | " . $rval . " | \n") }
else { return "" }
}
# handles caption extraction by calling the extractor
# with different tag sets and choosing the best
# candidate caption
sub extract_caption_text {
my ($self, $tag, $textref, $prevpos, $pos, $nextpos) = (@_);
my (@neartext, $len, $hdelim, $mintext, $goodlen,
$startpos, $context, $context_size);
$mintext = $self->{'min_near_text'};
$goodlen = $self->{'caption_length'};
# extract a context to extract near text from (faster)
$context_size = $self->{'max_near_text'}*3;
$startpos = $pos - ($context_size / 2);
if ($startpos < $prevpos ) { $startpos = $prevpos }
if ($nextpos != -1 && $context_size > ( $nextpos - $startpos ))
{ $context_size = ($nextpos - $startpos) }
$context = substr ( $$textref, $startpos, $context_size );
$context =~ s/<!--.*?-->//gs;    # strip complete HTML comments
$context =~ s/^.*-->(.*)/$1/gs;  # strip a truncated comment at the start
$context =~ s/(.*)<!--.*$/$1/gs; # strip a truncated comment at the end
$context =~ s/^.*-->(.*)/$1/gs;
$context =~ s/(.*) |