forked from github/kensanata.oddmuse
159 lines
4.7 KiB
Perl
159 lines
4.7 KiB
Perl
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program; if not, write to the
|
|
# Free Software Foundation, Inc.
|
|
# 59 Temple Place, Suite 330
|
|
# Boston, MA 02111-1307 USA
|
|
#
|
|
# Porter stemming algorithm code copied verbatim from http://www.tartarus.org/~martin/PorterStemmer/
|
|
|
|
use strict;
|
|
|
|
AddModuleDescription('wordstem.pl', 'WordStemming');
|
|
|
|
*OldStemmingResolveId = \&ResolveId;
|
|
*ResolveId = \&NewStemmingResolveId;
|
|
|
|
initialise();
|
|
my %StemmedPages = ();
|
|
|
|
sub NewStemmingResolveId {
|
|
my $id = shift;
|
|
my ($class, $resolved, $title, $exists) = OldStemmingResolveId($id);
|
|
return ($class, $resolved, $title, $exists) if $resolved;
|
|
if (not %StemmedPages) {
|
|
foreach my $page (AllPagesList()) {
|
|
$StemmedPages{&stemWord($page)} = $page;
|
|
}
|
|
}
|
|
my $page = &stemWord($id);
|
|
if ($StemmedPages{$page}) {
|
|
return ('local stemmed', $StemmedPages{$page}, $StemmedPages{$page}, undef);
|
|
}
|
|
|
|
}
|
|
|
|
|
|
my %step2list;
|
|
my %step3list;
|
|
my ($c, $v, $C, $V, $mgr0, $meq1, $mgr1, $_v);
|
|
|
|
|
|
sub stem
|
|
{ my ($stem, $suffix, $firstch);
|
|
my $w = shift;
|
|
if (length($w) < 3) { return $w; } # length at least 3
|
|
# now map initial y to Y so that the patterns never treat it as vowel:
|
|
$w =~ /^./; $firstch = $&;
|
|
if ($firstch =~ /^y/) { $w = ucfirst $w; }
|
|
|
|
# Step 1a
|
|
if ($w =~ /(ss|i)es$/) { $w=$`.$1; }
|
|
elsif ($w =~ /([^s])s$/) { $w=$`.$1; }
|
|
# Step 1b
|
|
if ($w =~ /eed$/) { if ($` =~ /$mgr0/o) { chop($w); } }
|
|
elsif ($w =~ /(ed|ing)$/)
|
|
{ $stem = $`;
|
|
if ($stem =~ /$_v/o)
|
|
{ $w = $stem;
|
|
if ($w =~ /(at|bl|iz)$/) { $w .= "e"; }
|
|
elsif ($w =~ /([^aeiouylsz])\1$/) { chop($w); }
|
|
elsif ($w =~ /^${C}${v}[^aeiouwxy]$/o) { $w .= "e"; }
|
|
}
|
|
}
|
|
# Step 1c
|
|
if ($w =~ /y$/) { $stem = $`; if ($stem =~ /$_v/o) { $w = $stem."i"; } }
|
|
|
|
# Step 2
|
|
if ($w =~ /(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/)
|
|
{ $stem = $`; $suffix = $1;
|
|
if ($stem =~ /$mgr0/o) { $w = $stem . $step2list{$suffix}; }
|
|
}
|
|
|
|
# Step 3
|
|
|
|
if ($w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/)
|
|
{ $stem = $`; $suffix = $1;
|
|
if ($stem =~ /$mgr0/o) { $w = $stem . $step3list{$suffix}; }
|
|
}
|
|
|
|
# Step 4
|
|
|
|
if ($w =~ /(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/)
|
|
{ $stem = $`; if ($stem =~ /$mgr1/o) { $w = $stem; } }
|
|
elsif ($w =~ /(s|t)(ion)$/)
|
|
{ $stem = $` . $1; if ($stem =~ /$mgr1/o) { $w = $stem; } }
|
|
|
|
|
|
# Step 5
|
|
|
|
if ($w =~ /e$/)
|
|
{ $stem = $`;
|
|
if ($stem =~ /$mgr1/o or
|
|
($stem =~ /$meq1/o and not $stem =~ /^${C}${v}[^aeiouwxy]$/o))
|
|
{ $w = $stem; }
|
|
}
|
|
if ($w =~ /ll$/ and $w =~ /$mgr1/o) { chop($w); }
|
|
|
|
# and turn initial Y back to y
|
|
if ($firstch =~ /^y/) { $w = lcfirst $w; }
|
|
return $w;
|
|
}
|
|
|
|
sub initialise {
|
|
|
|
%step2list =
|
|
( 'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance', 'izer'=>'ize', 'bli'=>'ble',
|
|
'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous', 'ization'=>'ize', 'ation'=>'ate',
|
|
'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful', 'ousness'=>'ous', 'aliti'=>'al',
|
|
'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log');
|
|
|
|
%step3list =
|
|
('icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic', 'ical'=>'ic', 'ful'=>'', 'ness'=>'');
|
|
|
|
|
|
$c = "[^aeiou]"; # consonant
|
|
$v = "[aeiouy]"; # vowel
|
|
$C = "${c}[^aeiouy]*"; # consonant sequence
|
|
$V = "${v}[aeiou]*"; # vowel sequence
|
|
|
|
$mgr0 = "^(${C})?${V}${C}"; # [C]VC... is m>0
|
|
$meq1 = "^(${C})?${V}${C}(${V})?" . '$'; # [C]VC[V] is m=1
|
|
$mgr1 = "^(${C})?${V}${C}${V}${C}"; # [C]VCVC... is m>1
|
|
$_v = "^(${C})?${v}"; # vowel in stem
|
|
|
|
}
|
|
|
|
sub stemWord {
|
|
my $page = shift;
|
|
my $oldpage = $page;
|
|
$page = "";
|
|
# Split the word up at case changes and stem each subword
|
|
my @words = split(/([a-z]*)([A-Z]+[a-z]+)/,$oldpage);
|
|
foreach my $w(@words) {
|
|
if ($w) {
|
|
if ($w =~ /_/) { # Possible word separated by _
|
|
my @subwords = split(/_/,$w);
|
|
foreach my $w(@subwords) {
|
|
if ($w) {
|
|
$page .= lc(&stem($w)); #Force case changes to not matter
|
|
}
|
|
}
|
|
}
|
|
else{
|
|
$page .= lc(&stem($w));
|
|
}
|
|
}
|
|
}
|
|
return $page;
|
|
}
|