Files
oddmuse/modules/wordstem.pl
2015-04-11 23:41:33 +03:00

159 lines
4.7 KiB
Perl

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.
# 59 Temple Place, Suite 330
# Boston, MA 02111-1307 USA
#
# Porter stemming algorithm code copied verbatim from http://www.tartarus.org/~martin/PorterStemmer/
use strict;
AddModuleDescription('wordstem.pl', 'WordStemming');
*OldStemmingResolveId = \&ResolveId;
*ResolveId = \&NewStemmingResolveId;
initialise();
my %StemmedPages = ();
sub NewStemmingResolveId {
my $id = shift;
my ($class, $resolved, $title, $exists) = OldStemmingResolveId($id);
return ($class, $resolved, $title, $exists) if $resolved;
if (not %StemmedPages) {
foreach my $page (AllPagesList()) {
$StemmedPages{&stemWord($page)} = $page;
}
}
my $page = &stemWord($id);
if ($StemmedPages{$page}) {
return ('local stemmed', $StemmedPages{$page}, $StemmedPages{$page}, undef);
}
}
my %step2list;
my %step3list;
my ($c, $v, $C, $V, $mgr0, $meq1, $mgr1, $_v);
sub stem
{ my ($stem, $suffix, $firstch);
my $w = shift;
if (length($w) < 3) { return $w; } # length at least 3
# now map initial y to Y so that the patterns never treat it as vowel:
$w =~ /^./; $firstch = $&;
if ($firstch =~ /^y/) { $w = ucfirst $w; }
# Step 1a
if ($w =~ /(ss|i)es$/) { $w=$`.$1; }
elsif ($w =~ /([^s])s$/) { $w=$`.$1; }
# Step 1b
if ($w =~ /eed$/) { if ($` =~ /$mgr0/o) { chop($w); } }
elsif ($w =~ /(ed|ing)$/)
{ $stem = $`;
if ($stem =~ /$_v/o)
{ $w = $stem;
if ($w =~ /(at|bl|iz)$/) { $w .= "e"; }
elsif ($w =~ /([^aeiouylsz])\1$/) { chop($w); }
elsif ($w =~ /^${C}${v}[^aeiouwxy]$/o) { $w .= "e"; }
}
}
# Step 1c
if ($w =~ /y$/) { $stem = $`; if ($stem =~ /$_v/o) { $w = $stem."i"; } }
# Step 2
if ($w =~ /(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/)
{ $stem = $`; $suffix = $1;
if ($stem =~ /$mgr0/o) { $w = $stem . $step2list{$suffix}; }
}
# Step 3
if ($w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/)
{ $stem = $`; $suffix = $1;
if ($stem =~ /$mgr0/o) { $w = $stem . $step3list{$suffix}; }
}
# Step 4
if ($w =~ /(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/)
{ $stem = $`; if ($stem =~ /$mgr1/o) { $w = $stem; } }
elsif ($w =~ /(s|t)(ion)$/)
{ $stem = $` . $1; if ($stem =~ /$mgr1/o) { $w = $stem; } }
# Step 5
if ($w =~ /e$/)
{ $stem = $`;
if ($stem =~ /$mgr1/o or
($stem =~ /$meq1/o and not $stem =~ /^${C}${v}[^aeiouwxy]$/o))
{ $w = $stem; }
}
if ($w =~ /ll$/ and $w =~ /$mgr1/o) { chop($w); }
# and turn initial Y back to y
if ($firstch =~ /^y/) { $w = lcfirst $w; }
return $w;
}
sub initialise {
%step2list =
( 'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance', 'izer'=>'ize', 'bli'=>'ble',
'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous', 'ization'=>'ize', 'ation'=>'ate',
'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful', 'ousness'=>'ous', 'aliti'=>'al',
'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log');
%step3list =
('icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic', 'ical'=>'ic', 'ful'=>'', 'ness'=>'');
$c = "[^aeiou]"; # consonant
$v = "[aeiouy]"; # vowel
$C = "${c}[^aeiouy]*"; # consonant sequence
$V = "${v}[aeiou]*"; # vowel sequence
$mgr0 = "^(${C})?${V}${C}"; # [C]VC... is m>0
$meq1 = "^(${C})?${V}${C}(${V})?" . '$'; # [C]VC[V] is m=1
$mgr1 = "^(${C})?${V}${C}${V}${C}"; # [C]VCVC... is m>1
$_v = "^(${C})?${v}"; # vowel in stem
}
sub stemWord {
my $page = shift;
my $oldpage = $page;
$page = "";
# Split the word up at case changes and stem each subword
my @words = split(/([a-z]*)([A-Z]+[a-z]+)/,$oldpage);
foreach my $w(@words) {
if ($w) {
if ($w =~ /_/) { # Possible word separated by _
my @subwords = split(/_/,$w);
foreach my $w(@subwords) {
if ($w) {
$page .= lc(&stem($w)); #Force case changes to not matter
}
}
}
else{
$page .= lc(&stem($w));
}
}
}
return $page;
}