forked from github/kensanata.oddmuse
There were some modules that did not offer "or (at your option) any later version" in their license and these had to be left alone. This should solve the incorrect FSF address issue #4 on GitHub.
155 lines
4.5 KiB
Perl
155 lines
4.5 KiB
Perl
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation; either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
use strict;
|
|
use v5.10;
|
|
|
|
AddModuleDescription('wordstem.pl', 'WordStemming');
|
|
|
|
*OldStemmingResolveId = \&ResolveId;
|
|
*ResolveId = \&NewStemmingResolveId;
|
|
|
|
initialise();
|
|
my %StemmedPages = ();
|
|
|
|
sub NewStemmingResolveId {
|
|
my $id = shift;
|
|
my ($class, $resolved, $title, $exists) = OldStemmingResolveId($id);
|
|
return ($class, $resolved, $title, $exists) if $resolved;
|
|
if (not %StemmedPages) {
|
|
foreach my $page (AllPagesList()) {
|
|
$StemmedPages{&stemWord($page)} = $page;
|
|
}
|
|
}
|
|
my $page = &stemWord($id);
|
|
if ($StemmedPages{$page}) {
|
|
return ('local stemmed', $StemmedPages{$page}, $StemmedPages{$page}, undef);
|
|
}
|
|
|
|
}
|
|
|
|
|
|
my %step2list;
|
|
my %step3list;
|
|
my ($c, $v, $C, $V, $mgr0, $meq1, $mgr1, $_v);
|
|
|
|
|
|
sub stem
|
|
{ my ($stem, $suffix, $firstch);
|
|
my $w = shift;
|
|
if (length($w) < 3) { return $w; } # length at least 3
|
|
# now map initial y to Y so that the patterns never treat it as vowel:
|
|
$w =~ /^./; $firstch = $&;
|
|
if ($firstch =~ /^y/) { $w = ucfirst $w; }
|
|
|
|
# Step 1a
|
|
if ($w =~ /(ss|i)es$/) { $w=$`.$1; }
|
|
elsif ($w =~ /([^s])s$/) { $w=$`.$1; }
|
|
# Step 1b
|
|
if ($w =~ /eed$/) { if ($` =~ /$mgr0/) { chop($w); } }
|
|
elsif ($w =~ /(ed|ing)$/)
|
|
{ $stem = $`;
|
|
if ($stem =~ /$_v/)
|
|
{ $w = $stem;
|
|
if ($w =~ /(at|bl|iz)$/) { $w .= "e"; }
|
|
elsif ($w =~ /([^aeiouylsz])\1$/) { chop($w); }
|
|
elsif ($w =~ /^${C}${v}[^aeiouwxy]$/) { $w .= "e"; }
|
|
}
|
|
}
|
|
# Step 1c
|
|
if ($w =~ /y$/) { $stem = $`; if ($stem =~ /$_v/) { $w = $stem."i"; } }
|
|
|
|
# Step 2
|
|
if ($w =~ /(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/)
|
|
{ $stem = $`; $suffix = $1;
|
|
if ($stem =~ /$mgr0/) { $w = $stem . $step2list{$suffix}; }
|
|
}
|
|
|
|
# Step 3
|
|
|
|
if ($w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/)
|
|
{ $stem = $`; $suffix = $1;
|
|
if ($stem =~ /$mgr0/) { $w = $stem . $step3list{$suffix}; }
|
|
}
|
|
|
|
# Step 4
|
|
|
|
if ($w =~ /(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/)
|
|
{ $stem = $`; if ($stem =~ /$mgr1/) { $w = $stem; } }
|
|
elsif ($w =~ /(s|t)(ion)$/)
|
|
{ $stem = $` . $1; if ($stem =~ /$mgr1/) { $w = $stem; } }
|
|
|
|
|
|
# Step 5
|
|
|
|
if ($w =~ /e$/)
|
|
{ $stem = $`;
|
|
if ($stem =~ /$mgr1/ or
|
|
($stem =~ /$meq1/ and not $stem =~ /^${C}${v}[^aeiouwxy]$/))
|
|
{ $w = $stem; }
|
|
}
|
|
if ($w =~ /ll$/ and $w =~ /$mgr1/) { chop($w); }
|
|
|
|
# and turn initial Y back to y
|
|
if ($firstch =~ /^y/) { $w = lcfirst $w; }
|
|
return $w;
|
|
}
|
|
|
|
sub initialise {
|
|
|
|
%step2list =
|
|
( 'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance', 'izer'=>'ize', 'bli'=>'ble',
|
|
'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous', 'ization'=>'ize', 'ation'=>'ate',
|
|
'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful', 'ousness'=>'ous', 'aliti'=>'al',
|
|
'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log');
|
|
|
|
%step3list =
|
|
('icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic', 'ical'=>'ic', 'ful'=>'', 'ness'=>'');
|
|
|
|
|
|
$c = "[^aeiou]"; # consonant
|
|
$v = "[aeiouy]"; # vowel
|
|
$C = "${c}[^aeiouy]*"; # consonant sequence
|
|
$V = "${v}[aeiou]*"; # vowel sequence
|
|
|
|
$mgr0 = "^(${C})?${V}${C}"; # [C]VC... is m>0
|
|
$meq1 = "^(${C})?${V}${C}(${V})?" . '$'; # [C]VC[V] is m=1
|
|
$mgr1 = "^(${C})?${V}${C}${V}${C}"; # [C]VCVC... is m>1
|
|
$_v = "^(${C})?${v}"; # vowel in stem
|
|
|
|
}
|
|
|
|
sub stemWord {
|
|
my $page = shift;
|
|
my $oldpage = $page;
|
|
$page = "";
|
|
# Split the word up at case changes and stem each subword
|
|
my @words = split(/([a-z]*)([A-Z]+[a-z]+)/,$oldpage);
|
|
foreach my $w(@words) {
|
|
if ($w) {
|
|
if ($w =~ /_/) { # Possible word separated by _
|
|
my @subwords = split(/_/,$w);
|
|
foreach my $w(@subwords) {
|
|
if ($w) {
|
|
$page .= lc(&stem($w)); #Force case changes to not matter
|
|
}
|
|
}
|
|
}
|
|
else{
|
|
$page .= lc(&stem($w));
|
|
}
|
|
}
|
|
}
|
|
return $page;
|
|
}
|