Files
lancaster_stemmer/src/lancaster_stemmer.gleam
2025-11-01 15:16:28 +00:00

486 lines
14 KiB
Gleam

import gleam/bool
import gleam/dict
import gleam/int
import gleam/io
import gleam/list
import gleam/option.{None, Some}
import gleam/result
import gleam/string
import simplifile
import splitter
/// A single stemming rule.
///
/// The type is opaque: values are created only inside this module, either
/// from the built-in table or by parsing a rule file with `load_rules`.
pub opaque type Rule {
Rule(
// Textual identifier of the form "(<number>:<rule text>)".
id: String,
// Final letter of the ending this rule handles; rules are grouped in the
// `Rules` dictionary under this key.
letter: String,
// The word ending the rule matches, in normal reading order.
suffix: String,
// When True the rule is only considered while the word is still intact,
// i.e. no earlier rule has modified it during this stemming run.
intact: Bool,
// How many trailing graphemes are dropped from the word.
remove: Int,
// Text appended after the removal (may be empty).
append: String,
// True when stemming continues on the result, False when it stops.
restem: Bool,
)
}
/// A complete rule set: each key is the final letter of a word and maps to
/// the list of rules tried, in list order, for words ending in that letter.
pub type Rules =
dict.Dict(String, List(Rule))
/// Built-in rule table, as a list of `#(letter, rules)` pairs that
/// `default_rules` turns into a dictionary.
///
/// Each `Rule` is written positionally as
/// `Rule(id, letter, suffix, intact, remove, append, restem)`.
/// The `id` string uses the same "(number:rule-text)" layout that
/// `process_rule` generates for rules loaded from a file, where the rule text
/// spells the suffix backwards. Rules inside a section are tried in the order
/// listed and the first one that matches and yields an acceptable stem wins,
/// so more specific endings come before shorter ones. Entries with a removal
/// count of 0 and an empty append leave the word unchanged and stop stemming.
const default_rules_list = [
#(
"a",
[
Rule("(1:ai*2.)", "a", "ia", True, 2, "", False),
Rule("(2:a*1.)", "a", "a", True, 1, "", False),
],
),
#("b", [Rule("(3:bb1.)", "b", "bb", False, 1, "", False)]),
#(
"c",
[
Rule("(4:city3s.)", "c", "ytic", False, 3, "s", False),
Rule("(5:ci2>)", "c", "ic", False, 2, "", True),
Rule("(6:cn1t>)", "c", "nc", False, 1, "t", True),
],
),
#(
"d",
[
Rule("(7:dd1.)", "d", "dd", False, 1, "", False),
Rule("(8:dei3y>)", "d", "ied", False, 3, "y", True),
Rule("(9:deec2ss.)", "d", "ceed", False, 2, "ss", False),
Rule("(10:dee1.)", "d", "eed", False, 1, "", False),
Rule("(11:de2>)", "d", "ed", False, 2, "", True),
Rule("(12:dooh4>)", "d", "hood", False, 4, "", True),
],
),
#("e", [Rule("(13:e1>)", "e", "e", False, 1, "", True)]),
#(
"f",
[
Rule("(14:feil1v.)", "f", "lief", False, 1, "v", False),
Rule("(15:fi2>)", "f", "if", False, 2, "", True),
],
),
#(
"g",
[
Rule("(16:gni3>)", "g", "ing", False, 3, "", True),
Rule("(17:gai3y.)", "g", "iag", False, 3, "y", False),
Rule("(18:ga2>)", "g", "ag", False, 2, "", True),
Rule("(19:gg1.)", "g", "gg", False, 1, "", False),
],
),
#(
"h",
[
Rule("(20:ht*2.)", "h", "th", True, 2, "", False),
Rule("(21:hsiug5ct.)", "h", "guish", False, 5, "ct", False),
Rule("(22:hsi3>)", "h", "ish", False, 3, "", True),
],
),
#(
"i",
[
Rule("(23:i*1.)", "i", "i", True, 1, "", False),
Rule("(24:i1y>)", "i", "i", False, 1, "y", True),
],
),
#(
"j",
[
Rule("(25:ji1d.)", "j", "ij", False, 1, "d", False),
Rule("(26:juf1s.)", "j", "fuj", False, 1, "s", False),
Rule("(27:ju1d.)", "j", "uj", False, 1, "d", False),
Rule("(28:jo1d.)", "j", "oj", False, 1, "d", False),
Rule("(29:jeh1r.)", "j", "hej", False, 1, "r", False),
Rule("(30:jrev1t.)", "j", "verj", False, 1, "t", False),
Rule("(31:jsim2t.)", "j", "misj", False, 2, "t", False),
Rule("(32:jn1d.)", "j", "nj", False, 1, "d", False),
Rule("(33:j1s.)", "j", "j", False, 1, "s", False),
],
),
#(
"l",
[
Rule("(34:lbaifi6.)", "l", "ifiabl", False, 6, "", False),
Rule("(35:lbai4y.)", "l", "iabl", False, 4, "y", False),
Rule("(36:lba3>)", "l", "abl", False, 3, "", True),
Rule("(37:lbi3.)", "l", "ibl", False, 3, "", False),
Rule("(38:lib2l>)", "l", "bil", False, 2, "l", True),
Rule("(39:lc1.)", "l", "cl", False, 1, "", False),
Rule("(40:lufi4y.)", "l", "iful", False, 4, "y", False),
Rule("(41:luf3>)", "l", "ful", False, 3, "", True),
Rule("(42:lu2.)", "l", "ul", False, 2, "", False),
Rule("(43:lai3>)", "l", "ial", False, 3, "", True),
Rule("(44:lau3>)", "l", "ual", False, 3, "", True),
Rule("(45:la2>)", "l", "al", False, 2, "", True),
Rule("(46:ll1.)", "l", "ll", False, 1, "", False),
],
),
#(
"m",
[
Rule("(47:mui3.)", "m", "ium", False, 3, "", False),
Rule("(48:mu*2.)", "m", "um", True, 2, "", False),
Rule("(49:msi3>)", "m", "ism", False, 3, "", True),
Rule("(50:mm1.)", "m", "mm", False, 1, "", False),
],
),
#(
"n",
[
Rule("(51:nois4j>)", "n", "sion", False, 4, "j", True),
Rule("(52:noix4ct.)", "n", "xion", False, 4, "ct", False),
Rule("(53:noi3>)", "n", "ion", False, 3, "", True),
Rule("(54:nai3>)", "n", "ian", False, 3, "", True),
Rule("(55:na2>)", "n", "an", False, 2, "", True),
Rule("(56:nee0.)", "n", "een", False, 0, "", False),
Rule("(57:ne2>)", "n", "en", False, 2, "", True),
Rule("(58:nn1.)", "n", "nn", False, 1, "", False),
],
),
#(
"p",
[
Rule("(59:pihs4>)", "p", "ship", False, 4, "", True),
Rule("(60:pp1.)", "p", "pp", False, 1, "", False),
],
),
#(
"r",
[
Rule("(61:re2>)", "r", "er", False, 2, "", True),
Rule("(62:rae0.)", "r", "ear", False, 0, "", False),
Rule("(63:ra2.)", "r", "ar", False, 2, "", False),
Rule("(64:ro2>)", "r", "or", False, 2, "", True),
Rule("(65:ru2>)", "r", "ur", False, 2, "", True),
Rule("(66:rr1.)", "r", "rr", False, 1, "", False),
Rule("(67:rt1>)", "r", "tr", False, 1, "", True),
Rule("(68:rei3y>)", "r", "ier", False, 3, "y", True),
],
),
#(
"s",
[
Rule("(69:sei3y>)", "s", "ies", False, 3, "y", True),
Rule("(70:sis2.)", "s", "sis", False, 2, "", False),
Rule("(71:si2>)", "s", "is", False, 2, "", True),
Rule("(72:ssen4>)", "s", "ness", False, 4, "", True),
Rule("(73:ss0.)", "s", "ss", False, 0, "", False),
Rule("(74:suo3>)", "s", "ous", False, 3, "", True),
Rule("(75:su*2.)", "s", "us", True, 2, "", False),
Rule("(76:s*1>)", "s", "s", True, 1, "", True),
Rule("(77:s0.)", "s", "s", False, 0, "", False),
],
),
#(
"t",
[
Rule("(78:tacilp4y.)", "t", "plicat", False, 4, "y", False),
Rule("(79:ta2>)", "t", "at", False, 2, "", True),
Rule("(80:tnem4>)", "t", "ment", False, 4, "", True),
Rule("(81:tne3>)", "t", "ent", False, 3, "", True),
Rule("(82:tna3>)", "t", "ant", False, 3, "", True),
Rule("(83:tpir2b.)", "t", "ript", False, 2, "b", False),
Rule("(84:tpro2b.)", "t", "orpt", False, 2, "b", False),
Rule("(85:tcud1.)", "t", "duct", False, 1, "", False),
Rule("(86:tpmus2.)", "t", "sumpt", False, 2, "", False),
Rule("(87:tpec2iv.)", "t", "cept", False, 2, "iv", False),
Rule("(88:tulo2v.)", "t", "olut", False, 2, "v", False),
Rule("(89:tsis0.)", "t", "sist", False, 0, "", False),
Rule("(90:tsi3>)", "t", "ist", False, 3, "", True),
Rule("(91:tt1.)", "t", "tt", False, 1, "", False),
],
),
#(
"u",
[
Rule("(92:uqi3.)", "u", "iqu", False, 3, "", False),
Rule("(93:ugo1.)", "u", "ogu", False, 1, "", False),
],
),
#(
"v",
[
Rule("(94:vis3j>)", "v", "siv", False, 3, "j", True),
Rule("(95:vie0.)", "v", "eiv", False, 0, "", False),
Rule("(96:vi2>)", "v", "iv", False, 2, "", True),
],
),
#(
"y",
[
Rule("(97:ylb1>)", "y", "bly", False, 1, "", True),
Rule("(98:yli3y>)", "y", "ily", False, 3, "y", True),
Rule("(99:ylp0.)", "y", "ply", False, 0, "", False),
Rule("(100:yl2>)", "y", "ly", False, 2, "", True),
Rule("(101:ygo1.)", "y", "ogy", False, 1, "", False),
Rule("(102:yhp1.)", "y", "phy", False, 1, "", False),
Rule("(103:ymo1.)", "y", "omy", False, 1, "", False),
Rule("(104:ypo1.)", "y", "opy", False, 1, "", False),
Rule("(105:yti3>)", "y", "ity", False, 3, "", True),
Rule("(106:yte3>)", "y", "ety", False, 3, "", True),
Rule("(107:ytl2.)", "y", "lty", False, 2, "", False),
Rule("(108:yrtsi5.)", "y", "istry", False, 5, "", False),
Rule("(109:yra3>)", "y", "ary", False, 3, "", True),
Rule("(110:yro3>)", "y", "ory", False, 3, "", True),
Rule("(111:yfi3.)", "y", "ify", False, 3, "", False),
Rule("(112:ycn2t>)", "y", "ncy", False, 2, "t", True),
Rule("(113:yca3>)", "y", "acy", False, 3, "", True),
],
),
#(
"z",
[
Rule("(114:zi2>)", "z", "iz", False, 2, "", True),
Rule("(115:zy1s.)", "z", "yz", False, 1, "s", False),
],
),
]
/// Returns the built-in rule set, indexed by the final letter of a word.
pub fn default_rules() -> Rules {
  default_rules_list
  |> dict.from_list
}
/// Stems `word` using `rules`.
///
/// The word is lowercased first. If the lowercased word does not pass the
/// acceptability check it is returned as-is (still lowercased); otherwise the
/// rules are applied starting from an intact word.
pub fn stem(word: String, rules: Rules) -> String {
  let lowered = string.lowercase(word)
  // Words that are not acceptable to begin with are never stemmed.
  use <- bool.guard(when: !is_valid(lowered), return: lowered)
  do_stem(lowered, rules, True)
}
/// Runs one stemming pass over `word` and recurses while the applied rule
/// requests further stemming.
///
/// - `word`: the current (lowercase) form of the word.
/// - `rules`: rule set indexed by final letter.
/// - `intact`: True while no rule has modified the word yet; rules flagged as
///   intact-only are skipped once this is False.
///
/// Returns `word` unchanged when it is empty, when there is no rule section
/// for its final letter, or when no rule in that section both matches and
/// produces an acceptable stem.
fn do_stem(word: String, rules: Rules, intact: Bool) -> String {
  // `string.last` yields the final letter directly; there is no need to build
  // a reversed copy of the whole word on every pass just to read one letter.
  let section = {
    use letter <- result.try(string.last(word))
    dict.get(rules, letter)
  }
  case section {
    Error(_) -> word
    Ok(specific_rules) -> {
      // Rules are tried in order; the fold stops at the first rule that
      // matches and whose result is still an acceptable stem.
      let #(stem, restem, intact) =
        list.fold_until(
          specific_rules,
          #(word, False, intact),
          fn(state, rule) {
            case rule_matches(rule, word, intact) {
              False -> list.Continue(state)
              True -> {
                let candidate = apply_rule(rule, word)
                case is_valid(candidate) {
                  // Applying a rule means the word is no longer intact.
                  True -> list.Stop(#(candidate, rule.restem, False))
                  False -> list.Continue(state)
                }
              }
            }
          },
        )
      case restem {
        True -> do_stem(stem, rules, intact)
        False -> stem
      }
    }
  }
}
/// Tells whether `rule` may be applied to `word`.
///
/// A rule flagged as intact-only is rejected once the stem has been modified
/// (`stem_intact` is False); otherwise the rule matches when `word` ends with
/// the rule's suffix.
fn rule_matches(rule: Rule, word: String, stem_intact: Bool) -> Bool {
  let allowed = stem_intact || !rule.intact
  allowed && string.ends_with(word, rule.suffix)
}
/// Applies `rule` to `word`: drops `rule.remove` graphemes from the end and
/// then appends `rule.append`.
fn apply_rule(rule: Rule, word: String) -> String {
  let trimmed = string.drop_end(word, rule.remove)
  string.append(trimmed, rule.append)
}
/// Acceptability check for a stem.
///
/// An empty string is never acceptable. A word starting with a, e, i, o or u
/// is acceptable when at least one more character follows. Any other word is
/// delegated to `is_valid_internal`.
fn is_valid(word: String) -> Bool {
  use <- bool.guard(when: word == "", return: False)
  case word {
    "a" <> tail | "e" <> tail | "i" <> tail | "o" <> tail | "u" <> tail ->
      tail != ""
    _ -> is_valid_internal(word, 0)
  }
}
/// Scans `word` for its first vowel (a, e, i, o, u or y).
///
/// `length` is the number of characters already skipped. When a vowel is
/// found, the result is True if the characters skipped, the vowel itself and
/// everything after it add up to at least three. Returns False when the word
/// runs out without containing a vowel.
fn is_valid_internal(word: String, length: Int) -> Bool {
  case word {
    "a" <> tail
    | "e" <> tail
    | "i" <> tail
    | "o" <> tail
    | "u" <> tail
    | "y" <> tail -> {
      let total = length + 1 + string.length(tail)
      total >= 3
    }
    "" -> False
    _ -> {
      let remaining = string.drop_start(word, 1)
      is_valid_internal(remaining, length + 1)
    }
  }
}
/// Reads a rule file and builds a rule set from it.
///
/// Returns `Error(Nil)` when the file cannot be read or when its contents
/// cannot be parsed by `process_rules`. On success the rules are grouped by
/// their `letter` field.
pub fn load_rules(filename: String) -> Result(Rules, Nil) {
  use contents <- result.try(
    simplifile.read(filename)
    |> result.replace_error(Nil),
  )
  let line_splitter = splitter.new(["\n", "\r\n"])
  use parsed <- result.map(process_rules(contents, line_splitter, [], 1))
  // Each rule is pushed onto the front of the list for its letter.
  list.fold(parsed, dict.new(), fn(by_letter, rule) {
    dict.upsert(by_letter, rule.letter, fn(existing) {
      case existing {
        None -> [rule]
        Some(others) -> [rule, ..others]
      }
    })
  })
}
/// Parses rule lines from `rules` one at a time until the `end0.` terminator.
///
/// - `rules`: the text still to be parsed.
/// - `split`: splitter used to cut the next line off `rules`.
/// - `acc`: rules parsed so far, most recent first.
/// - `id`: number given to the next rule.
///
/// Returns the accumulated rules (most recent first) once a line starting
/// with `end0.` is found, or `Error(Nil)` as soon as a line fails to parse.
fn process_rules(
  rules: String,
  split: splitter.Splitter,
  acc: List(Rule),
  id: Int,
) -> Result(List(Rule), Nil) {
  let #(line, _, rest) = splitter.split(split, rules)
  // Rule lines are trimmed before being parsed, so the terminator must be
  // recognised on the trimmed line too. Otherwise an indented "end0." would
  // be read as an ordinary rule and parsing would run past the end marker.
  let line = string.trim(line)
  case line {
    "end0." <> _ -> Ok(acc)
    _ -> {
      use rule <- result.try(process_rule(line, id))
      process_rules(rest, split, [rule, ..acc], id + 1)
    }
  }
}
/// Parses one rule line into a `Rule`.
///
/// The expected layout is: the ending spelled backwards (letters), an
/// optional `*` marking an intact-only rule, the number of characters to
/// remove (digits), optional letters to append, and finally `>` (continue
/// stemming) or `.` (stop). Anything after that final symbol is ignored.
///
/// - `rule`: the raw line.
/// - `id`: the rule's number, embedded in the generated id string.
///
/// Returns `Error(Nil)` when the ending is empty, the removal count is
/// missing, or the final symbol is neither `>` nor `.`.
fn process_rule(rule: String, id: Int) -> Result(Rule, Nil) {
  // Words are lowercased before any rule is applied, so a rule written with
  // capital letters would be stored under an uppercase key and could never
  // match. Normalise the rule text to lowercase up front.
  let rule =
    rule
    |> string.trim
    |> string.lowercase
  use #(suffix, rule) <- result.try(parse_rule_text(rule, ""))
  use <- bool.guard(when: suffix == "", return: Error(Nil))
  let #(intact, rule) = case rule {
    "*" <> rule -> #(True, rule)
    _ -> #(False, rule)
  }
  use #(remove, rule) <- result.try(parse_rule_remove(rule, ""))
  use #(append, rule) <- result.try(parse_rule_text(rule, ""))
  use restem <- result.try(case rule {
    ">" <> _ -> Ok(True)
    "." <> _ -> Ok(False)
    _ -> Error(Nil)
  })
  let intact_marker = case intact {
    True -> "*"
    False -> ""
  }
  let restem_marker = case restem {
    True -> ">"
    False -> "."
  }
  // Rebuild the rule's textual form as "(<id>:<rule text>)".
  let id =
    "("
    <> int.to_string(id)
    <> ":"
    <> suffix
    <> intact_marker
    <> int.to_string(remove)
    <> append
    <> restem_marker
    <> ")"
  // The rule text is reversed, so its first letter is the last letter of the
  // ending and therefore the key the rule is filed under.
  use letter <- result.try(string.first(suffix))
  let suffix = string.reverse(suffix)
  Ok(Rule(id, letter, suffix, intact, remove, append, restem))
}
/// Reads the leading run of ASCII digits from `rule`.
///
/// `acc` holds the digits collected so far. Returns the parsed number together
/// with the unconsumed remainder, or `Error(Nil)` when `rule` does not start
/// with a digit and nothing has been collected.
fn parse_rule_remove(rule: String, acc: String) -> Result(#(Int, String), Nil) {
  case rule {
    "0" as digit <> tail
    | "1" as digit <> tail
    | "2" as digit <> tail
    | "3" as digit <> tail
    | "4" as digit <> tail
    | "5" as digit <> tail
    | "6" as digit <> tail
    | "7" as digit <> tail
    | "8" as digit <> tail
    | "9" as digit <> tail -> parse_rule_remove(tail, acc <> digit)
    _ ->
      case acc {
        // No digits at all: the removal count is mandatory.
        "" -> Error(Nil)
        digits ->
          int.parse(digits)
          |> result.map(fn(count) { #(count, rule) })
      }
  }
}
/// Reads the leading run of ASCII letters (a-z, A-Z) from `rule`.
///
/// `acc` holds the letters collected so far. Always succeeds: the result is
/// the collected letters (possibly empty) and the unconsumed remainder.
fn parse_rule_text(rule: String, acc: String) -> Result(#(String, String), Nil) {
  case rule {
    // Lowercase letters.
    "a" as ch <> tail
    | "b" as ch <> tail
    | "c" as ch <> tail
    | "d" as ch <> tail
    | "e" as ch <> tail
    | "f" as ch <> tail
    | "g" as ch <> tail
    | "h" as ch <> tail
    | "i" as ch <> tail
    | "j" as ch <> tail
    | "k" as ch <> tail
    | "l" as ch <> tail
    | "m" as ch <> tail
    | "n" as ch <> tail
    | "o" as ch <> tail
    | "p" as ch <> tail
    | "q" as ch <> tail
    | "r" as ch <> tail
    | "s" as ch <> tail
    | "t" as ch <> tail
    | "u" as ch <> tail
    | "v" as ch <> tail
    | "w" as ch <> tail
    | "x" as ch <> tail
    | "y" as ch <> tail
    | "z" as ch <> tail -> parse_rule_text(tail, acc <> ch)
    // Uppercase letters.
    "A" as ch <> tail
    | "B" as ch <> tail
    | "C" as ch <> tail
    | "D" as ch <> tail
    | "E" as ch <> tail
    | "F" as ch <> tail
    | "G" as ch <> tail
    | "H" as ch <> tail
    | "I" as ch <> tail
    | "J" as ch <> tail
    | "K" as ch <> tail
    | "L" as ch <> tail
    | "M" as ch <> tail
    | "N" as ch <> tail
    | "O" as ch <> tail
    | "P" as ch <> tail
    | "Q" as ch <> tail
    | "R" as ch <> tail
    | "S" as ch <> tail
    | "T" as ch <> tail
    | "U" as ch <> tail
    | "V" as ch <> tail
    | "W" as ch <> tail
    | "X" as ch <> tail
    | "Y" as ch <> tail
    | "Z" as ch <> tail -> parse_rule_text(tail, acc <> ch)
    // First non-letter (or end of input): hand back what was collected.
    _ -> Ok(#(acc, rule))
  }
}