Files
oddmuse/modules/wordstem.pl
Alex Schroeder f230a64e7d Changed nearly all modules from GPLv2 to GPLv3
There were some modules that did not offer "or (at your option) any
later version" in their license and these had to be left alone.
This should solve the incorrect FSF address issue #4 on GitHub.
2016-08-16 15:04:47 +02:00

155 lines
4.5 KiB
Perl

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
use strict;
use v5.10;
# Register this module with the host wiki. AddModuleDescription is defined
# outside this file; presumably it feeds the wiki's list of installed modules
# -- confirm against the core script.
AddModuleDescription('wordstem.pl', 'WordStemming');
# Hook page-id resolution: keep the currently installed ResolveId reachable as
# OldStemmingResolveId, then make ResolveId point at the stemming-aware
# wrapper defined below.
*OldStemmingResolveId = \&ResolveId;
*ResolveId = \&NewStemmingResolveId;
# Fill the suffix tables and regex source strings that stem() reads.
# NOTE(review): this call runs before the `my` declarations of those variables
# (further down the file) are reached at run time; it appears to rely on
# file-scoped lexicals already existing after compilation -- confirm before
# reordering anything here.
initialise();
# Maps a stemmed page name to an actual page name. Starts empty and is filled
# on first use by NewStemmingResolveId.
my %StemmedPages = ();
# Resolve a page id, falling back to a word-stem match.
#
# The original resolver (saved as OldStemmingResolveId) is asked first; if it
# resolves the id, its answer is passed through untouched. Otherwise the id is
# stemmed with stemWord() and looked up in %StemmedPages, which is built once,
# on first need, from AllPagesList().
#
# Parameters:
#   $id - the page id to resolve.
# Returns a four-element list (class, resolved id, title, exists):
#   - the original resolver's list when it resolved the id;
#   - ('local stemmed', $page, $page, undef) when a page with the same stem
#     exists;
#   - the original resolver's (unresolved) list otherwise, so callers always
#     get the same shape they would get without this module.
sub NewStemmingResolveId {
  my $id = shift;
  my ($class, $resolved, $title, $exists) = OldStemmingResolveId($id);
  return ($class, $resolved, $title, $exists) if $resolved;

  # Build the stem => page index lazily. If several pages share a stem, the
  # one that AllPagesList() yields last wins.
  if (not %StemmedPages) {
    foreach my $page (AllPagesList()) {
      $StemmedPages{stemWord($page)} = $page;
    }
  }

  my $match = $StemmedPages{stemWord($id)};
  return ('local stemmed', $match, $match, undef) if $match;

  # No stem match either: hand back the original answer explicitly instead of
  # falling off the end of the sub, whose return value would be unspecified.
  return ($class, $resolved, $title, $exists);
}
# Suffix => replacement tables consulted by steps 2 and 3 of stem().
# Both are filled in by initialise().
my %step2list;
my %step3list;
# Regex source strings used by stem(), all assigned in initialise():
#   $c, $v  - character class for one consonant / one vowel
#   $C, $V  - a consonant sequence / a vowel sequence
#   $mgr0   - stem shape [C]VC...    (measure m > 0)
#   $meq1   - stem shape [C]VC[V]    (measure m = 1)
#   $mgr1   - stem shape [C]VCVC...  (measure m > 1)
#   $_v     - stem contains a vowel
my ($c, $v, $C, $V, $mgr0, $meq1, $mgr1, $_v);
# Reduce a single word to its stem by applying a fixed sequence of
# suffix-stripping steps (1a, 1b, 1c, 2, 3, 4, 5). The steps look like the
# Porter stemming algorithm -- NOTE(review): confirm against the reference
# definition before changing any rule.
#
# Parameters:
#   $w (first argument) - the word to stem.
# Returns the stemmed word; words shorter than three characters are returned
# unchanged.
#
# Depends on the file-level tables and pattern strings filled in by
# initialise(). All patterns are written with lower-case letters only, so an
# upper-case letter is never matched as a vowel or as part of a suffix.
# Throughout, $` is the text before the most recent successful match, i.e.
# the word with the matched suffix removed; statement order therefore matters.
sub stem
{ my ($stem, $suffix, $firstch);
my $w = shift;
if (length($w) < 3) { return $w; } # length at least 3
# now map initial y to Y so that the patterns never treat it as vowel:
$w =~ /^./; $firstch = $&;
if ($firstch =~ /^y/) { $w = ucfirst $w; }
# Step 1a
# "sses" -> "ss" and "ies" -> "i"; otherwise drop a final "s" that is not
# preceded by another "s".
if ($w =~ /(ss|i)es$/) { $w=$`.$1; }
elsif ($w =~ /([^s])s$/) { $w=$`.$1; }
# Step 1b
# "eed" -> "ee" when the part before it matches $mgr0. Otherwise strip "ed" or
# "ing" when what remains contains a vowel, then tidy the result: add "e"
# after at/bl/iz, undo a doubled final letter (the class excludes vowels, y,
# l, s and z), or add "e" to a short consonant-vowel-consonant stem.
if ($w =~ /eed$/) { if ($` =~ /$mgr0/) { chop($w); } }
elsif ($w =~ /(ed|ing)$/)
{ $stem = $`;
if ($stem =~ /$_v/)
{ $w = $stem;
if ($w =~ /(at|bl|iz)$/) { $w .= "e"; }
elsif ($w =~ /([^aeiouylsz])\1$/) { chop($w); }
elsif ($w =~ /^${C}${v}[^aeiouwxy]$/) { $w .= "e"; }
}
}
# Step 1c
# Final "y" -> "i" when the rest of the word contains a vowel.
if ($w =~ /y$/) { $stem = $`; if ($stem =~ /$_v/) { $w = $stem."i"; } }
# Step 2
# Replace the matched suffix with its %step2list entry when the remaining stem
# matches $mgr0.
if ($w =~ /(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/)
{ $stem = $`; $suffix = $1;
if ($stem =~ /$mgr0/) { $w = $stem . $step2list{$suffix}; }
}
# Step 3
# Same scheme as step 2, using %step3list.
if ($w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/)
{ $stem = $`; $suffix = $1;
if ($stem =~ /$mgr0/) { $w = $stem . $step3list{$suffix}; }
}
# Step 4
# Drop the suffix outright when the remaining stem matches $mgr1. "ion" is
# only tried when none of the listed suffixes matched, and only after "s" or
# "t", which is kept as part of the stem.
if ($w =~ /(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/)
{ $stem = $`; if ($stem =~ /$mgr1/) { $w = $stem; } }
elsif ($w =~ /(s|t)(ion)$/)
{ $stem = $` . $1; if ($stem =~ /$mgr1/) { $w = $stem; } }
# Step 5
# Drop a final "e" when the stem matches $mgr1, or matches $meq1 without
# having the short consonant-vowel-consonant shape. Then reduce a final "ll"
# to "l" when the word matches $mgr1.
if ($w =~ /e$/)
{ $stem = $`;
if ($stem =~ /$mgr1/ or
($stem =~ /$meq1/ and not $stem =~ /^${C}${v}[^aeiouwxy]$/))
{ $w = $stem; }
}
if ($w =~ /ll$/ and $w =~ /$mgr1/) { chop($w); }
# and turn initial Y back to y
if ($firstch =~ /^y/) { $w = lcfirst $w; }
return $w;
}
# Set up everything stem() looks up at run time: the two suffix-replacement
# tables and the regex source strings built from the consonant and vowel
# character classes. Takes no arguments and writes only the file-level
# lexicals %step2list, %step3list, $c, $v, $C, $V, $mgr0, $meq1, $mgr1, $_v.
sub initialise {
  # Step 2: suffix => replacement.
  %step2list = (
    ational => 'ate',
    tional  => 'tion',
    enci    => 'ence',
    anci    => 'ance',
    izer    => 'ize',
    bli     => 'ble',
    alli    => 'al',
    entli   => 'ent',
    eli     => 'e',
    ousli   => 'ous',
    ization => 'ize',
    ation   => 'ate',
    ator    => 'ate',
    alism   => 'al',
    iveness => 'ive',
    fulness => 'ful',
    ousness => 'ous',
    aliti   => 'al',
    iviti   => 'ive',
    biliti  => 'ble',
    logi    => 'log',
  );

  # Step 3: suffix => replacement (an empty string removes the suffix).
  %step3list = (
    icate => 'ic',
    ative => '',
    alize => 'al',
    iciti => 'ic',
    ical  => 'ic',
    ful   => '',
    ness  => '',
  );

  # Building blocks: one consonant, one vowel, then runs of each.
  ($c, $v) = ('[^aeiou]', '[aeiouy]');
  $C = $c . '[^aeiouy]*';    # consonant sequence
  $V = $v . '[aeiou]*';      # vowel sequence

  # Every stem-shape pattern starts with an optional leading consonant run.
  my $optional_lead = '^(' . $C . ')?';

  $mgr0 = $optional_lead . $V . $C;                      # [C]VC...   : m > 0
  $meq1 = $optional_lead . $V . $C . '(' . $V . ')?$';   # [C]VC[V]   : m = 1
  $mgr1 = $optional_lead . $V . $C . $V . $C;            # [C]VCVC... : m > 1
  $_v = $optional_lead . $v;                             # vowel in stem
}
# Turn a page name into a case-insensitive key made of stemmed subwords.
#
# The name is cut into subwords at lower-to-upper case changes and at
# underscores; each subword is lower-cased, stemmed with stem(), and the
# results are concatenated.
#
# Parameters:
#   $page (first argument) - the page name.
# Returns the concatenated, lower-case stems ('' when nothing is left).
sub stemWord {
  my $page = shift;
  my $key = '';
  # The capture groups make split() return the matched pieces as well: an
  # optional lower-case lead-in and each Capitalised run. Text that matches
  # neither (underscores, digits, ...) comes back as the in-between fields.
  foreach my $chunk (split(/([a-z]*)([A-Z]+[a-z]+)/, $page)) {
    next unless defined $chunk;
    # Splitting on "_" leaves a chunk without underscores as a single field,
    # so one loop covers both cases.
    foreach my $word (split(/_/, $chunk)) {
      # Test the length, not the truth value, so a subword "0" is kept.
      next unless length $word;
      # Lower-case BEFORE stemming: stem() only recognises lower-case vowels
      # and suffixes, so mixed-case input would otherwise stem differently
      # from its lower-case spelling. The outer lc() is a safety net only.
      $key .= lc(stem(lc $word));
    }
  }
  return $key;
}