This commit is contained in:
23
.github/workflows/test.yml
vendored
Normal file
23
.github/workflows/test.yml
vendored
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
name: test
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- master
|
||||||
|
- main
|
||||||
|
pull_request:
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
test:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- uses: erlef/setup-beam@v1
|
||||||
|
with:
|
||||||
|
otp-version: "28"
|
||||||
|
gleam-version: "1.13.0"
|
||||||
|
rebar3-version: "3"
|
||||||
|
# elixir-version: "1"
|
||||||
|
- run: gleam deps download
|
||||||
|
- run: gleam test
|
||||||
|
- run: gleam format --check src test
|
||||||
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
*.beam
|
||||||
|
*.ez
|
||||||
|
/build
|
||||||
|
erl_crash.dump
|
||||||
21
gleam.toml
Normal file
21
gleam.toml
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
name = "lancaster_stemmer"
|
||||||
|
version = "1.0.0"
|
||||||
|
|
||||||
|
# Fill out these fields if you intend to generate HTML documentation or publish
|
||||||
|
# your project to the Hex package manager.
|
||||||
|
#
|
||||||
|
description = "Gleam implementation of the Lancaster (Paice/Husk) Stemmer"
|
||||||
|
licences = ["Apache-2.0"]
|
||||||
|
repository = { type = "gitea", host = "git.pendleton.ie", user = "pendletong", repo = "lancaster_stemmer" }
|
||||||
|
links = [{ title = "Lancaster Stemmer Specs", href = "https://dl.acm.org/doi/10.1145/101306.101310" }]
|
||||||
|
#
|
||||||
|
# For a full reference of all the available options, you can have a look at
|
||||||
|
# https://gleam.run/writing-gleam/gleam-toml/.
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
gleam_stdlib = ">= 0.44.0 and < 2.0.0"
|
||||||
|
simplifile = ">= 2.3.0 and < 3.0.0"
|
||||||
|
splitter = ">= 1.1.0 and < 2.0.0"
|
||||||
|
|
||||||
|
[dev-dependencies]
|
||||||
|
gleeunit = ">= 1.0.0 and < 2.0.0"
|
||||||
16
manifest.toml
Normal file
16
manifest.toml
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
# This file was generated by Gleam
|
||||||
|
# You typically do not need to edit this file
|
||||||
|
|
||||||
|
packages = [
|
||||||
|
{ name = "filepath", version = "1.1.2", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "filepath", source = "hex", outer_checksum = "B06A9AF0BF10E51401D64B98E4B627F1D2E48C154967DA7AF4D0914780A6D40A" },
|
||||||
|
{ name = "gleam_stdlib", version = "0.65.0", build_tools = ["gleam"], requirements = [], otp_app = "gleam_stdlib", source = "hex", outer_checksum = "7C69C71D8C493AE11A5184828A77110EB05A7786EBF8B25B36A72F879C3EE107" },
|
||||||
|
{ name = "gleeunit", version = "1.7.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleeunit", source = "hex", outer_checksum = "CD701726CBCE5588B375D157B4391CFD0F2F134CD12D9B6998A395484DE05C58" },
|
||||||
|
{ name = "simplifile", version = "2.3.0", build_tools = ["gleam"], requirements = ["filepath", "gleam_stdlib"], otp_app = "simplifile", source = "hex", outer_checksum = "0A868DAC6063D9E983477981839810DC2E553285AB4588B87E3E9C96A7FB4CB4" },
|
||||||
|
{ name = "splitter", version = "1.1.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "splitter", source = "hex", outer_checksum = "05564A381580395DCDEFF4F88A64B021E8DAFA6540AE99B4623962F52976AA9D" },
|
||||||
|
]
|
||||||
|
|
||||||
|
[requirements]
|
||||||
|
gleam_stdlib = { version = ">= 0.44.0 and < 2.0.0" }
|
||||||
|
gleeunit = { version = ">= 1.0.0 and < 2.0.0" }
|
||||||
|
simplifile = { version = ">= 2.3.0 and < 3.0.0" }
|
||||||
|
splitter = { version = ">= 1.1.0 and < 2.0.0" }
|
||||||
116
paice-husk-rules.txt
Normal file
116
paice-husk-rules.txt
Normal file
@@ -0,0 +1,116 @@
|
|||||||
|
ai*2. { -ia > - if intact }
|
||||||
|
a*1. { -a > - if intact }
|
||||||
|
bb1. { -bb > -b }
|
||||||
|
city3s. { -ytic > -ys }
|
||||||
|
ci2> { -ic > - }
|
||||||
|
cn1t> { -nc > -nt }
|
||||||
|
dd1. { -dd > -d }
|
||||||
|
dei3y> { -ied > -y }
|
||||||
|
deec2ss. { -ceed > -cess }
|
||||||
|
dee1. { -eed > -ee }
|
||||||
|
de2> { -ed > - }
|
||||||
|
dooh4> { -hood > - }
|
||||||
|
e1> { -e > - }
|
||||||
|
feil1v. { -lief > -liev }
|
||||||
|
fi2> { -if > - }
|
||||||
|
gni3> { -ing > - }
|
||||||
|
gai3y. { -iag > -y }
|
||||||
|
ga2> { -ag > - }
|
||||||
|
gg1. { -gg > -g }
|
||||||
|
ht*2. { -th > - if intact }
|
||||||
|
hsiug5ct. { -guish > -ct }
|
||||||
|
hsi3> { -ish > - }
|
||||||
|
i*1. { -i > - if intact }
|
||||||
|
i1y> { -i > -y }
|
||||||
|
ji1d. { -ij > -id -- see nois4j> & vis3j> }
|
||||||
|
juf1s. { -fuj > -fus }
|
||||||
|
ju1d. { -uj > -ud }
|
||||||
|
jo1d. { -oj > -od }
|
||||||
|
jeh1r. { -hej > -her }
|
||||||
|
jrev1t. { -verj > -vert }
|
||||||
|
jsim2t. { -misj > -mit }
|
||||||
|
jn1d. { -nj > -nd }
|
||||||
|
j1s. { -j > -s }
|
||||||
|
lbaifi6. { -ifiabl > - }
|
||||||
|
lbai4y. { -iabl > -y }
|
||||||
|
lba3> { -abl > - }
|
||||||
|
lbi3. { -ibl > - }
|
||||||
|
lib2l> { -bil > -bl }
|
||||||
|
lc1. { -cl > c }
|
||||||
|
lufi4y. { -iful > -y }
|
||||||
|
luf3> { -ful > - }
|
||||||
|
lu2. { -ul > - }
|
||||||
|
lai3> { -ial > - }
|
||||||
|
lau3> { -ual > - }
|
||||||
|
la2> { -al > - }
|
||||||
|
ll1. { -ll > -l }
|
||||||
|
mui3. { -ium > - }
|
||||||
|
mu*2. { -um > - if intact }
|
||||||
|
msi3> { -ism > - }
|
||||||
|
mm1. { -mm > -m }
|
||||||
|
nois4j> { -sion > -j }
|
||||||
|
noix4ct. { -xion > -ct }
|
||||||
|
noi3> { -ion > - }
|
||||||
|
nai3> { -ian > - }
|
||||||
|
na2> { -an > - }
|
||||||
|
nee0. { protect -een }
|
||||||
|
ne2> { -en > - }
|
||||||
|
nn1. { -nn > -n }
|
||||||
|
pihs4> { -ship > - }
|
||||||
|
pp1. { -pp > -p }
|
||||||
|
re2> { -er > - }
|
||||||
|
rae0. { protect -ear }
|
||||||
|
ra2. { -ar > - }
|
||||||
|
ro2> { -or > - }
|
||||||
|
ru2> { -ur > - }
|
||||||
|
rr1. { -rr > -r }
|
||||||
|
rt1> { -tr > -t }
|
||||||
|
rei3y> { -ier > -y }
|
||||||
|
sei3y> { -ies > -y }
|
||||||
|
sis2. { -sis > -s }
|
||||||
|
si2> { -is > - }
|
||||||
|
ssen4> { -ness > - }
|
||||||
|
ss0. { protect -ss }
|
||||||
|
suo3> { -ous > - }
|
||||||
|
su*2. { -us > - if intact }
|
||||||
|
s*1> { -s > - if intact }
|
||||||
|
s0. { -s > -s }
|
||||||
|
tacilp4y. { -plicat > -ply }
|
||||||
|
ta2> { -at > - }
|
||||||
|
tnem4> { -ment > - }
|
||||||
|
tne3> { -ent > - }
|
||||||
|
tna3> { -ant > - }
|
||||||
|
tpir2b. { -ript > -rib }
|
||||||
|
tpro2b. { -orpt > -orb }
|
||||||
|
tcud1. { -duct > -duc }
|
||||||
|
tpmus2. { -sumpt > -sum }
|
||||||
|
tpec2iv. { -cept > -ceiv }
|
||||||
|
tulo2v. { -olut > -olv }
|
||||||
|
tsis0. { protect -sist }
|
||||||
|
tsi3> { -ist > - }
|
||||||
|
tt1. { -tt > -t }
|
||||||
|
uqi3. { -iqu > - }
|
||||||
|
ugo1. { -ogu > -og }
|
||||||
|
vis3j> { -siv > -j }
|
||||||
|
vie0. { protect -eiv }
|
||||||
|
vi2> { -iv > - }
|
||||||
|
ylb1> { -bly > -bl }
|
||||||
|
yli3y> { -ily > -y }
|
||||||
|
ylp0. { protect -ply }
|
||||||
|
yl2> { -ly > - }
|
||||||
|
ygo1. { -ogy > -og }
|
||||||
|
yhp1. { -phy > -ph }
|
||||||
|
ymo1. { -omy > -om }
|
||||||
|
ypo1. { -opy > -op }
|
||||||
|
yti3> { -ity > - }
|
||||||
|
yte3> { -ety > - }
|
||||||
|
ytl2. { -lty > -l }
|
||||||
|
yrtsi5. { -istry > - }
|
||||||
|
yra3> { -ary > - }
|
||||||
|
yro3> { -ory > - }
|
||||||
|
yfi3. { -ify > - }
|
||||||
|
ycn2t> { -ncy > -nt }
|
||||||
|
yca3> { -acy > - }
|
||||||
|
zi2> { -iz > - }
|
||||||
|
zy1s. { -yz > -ys }
|
||||||
|
end0.
|
||||||
491
src/lancaster_stemmer.gleam
Normal file
491
src/lancaster_stemmer.gleam
Normal file
@@ -0,0 +1,491 @@
|
|||||||
|
import gleam/bool
|
||||||
|
import gleam/dict
|
||||||
|
import gleam/int
|
||||||
|
import gleam/io
|
||||||
|
import gleam/list
|
||||||
|
import gleam/option.{None, Some}
|
||||||
|
import gleam/result
|
||||||
|
import gleam/string
|
||||||
|
import simplifile
|
||||||
|
import splitter
|
||||||
|
|
||||||
|
pub opaque type Rule {
|
||||||
|
Rule(
|
||||||
|
id: String,
|
||||||
|
letter: String,
|
||||||
|
suffix: String,
|
||||||
|
intact: Bool,
|
||||||
|
remove: Int,
|
||||||
|
append: String,
|
||||||
|
restem: Bool,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub type Rules =
|
||||||
|
dict.Dict(String, List(Rule))
|
||||||
|
|
||||||
|
const default_rules_list = [
|
||||||
|
#(
|
||||||
|
"a",
|
||||||
|
[
|
||||||
|
Rule("(1:ai*2.)", "a", "ia", True, 2, "", False),
|
||||||
|
Rule("(2:a*1.)", "a", "a", True, 1, "", False),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
#("b", [Rule("(3:bb1.)", "b", "bb", False, 1, "", False)]),
|
||||||
|
#(
|
||||||
|
"c",
|
||||||
|
[
|
||||||
|
Rule("(4:city3s.)", "c", "ytic", False, 3, "s", False),
|
||||||
|
Rule("(5:ci2>)", "c", "ic", False, 2, "", True),
|
||||||
|
Rule("(6:cn1t>)", "c", "nc", False, 1, "t", True),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
#(
|
||||||
|
"d",
|
||||||
|
[
|
||||||
|
Rule("(7:dd1.)", "d", "dd", False, 1, "", False),
|
||||||
|
Rule("(8:dei3y>)", "d", "ied", False, 3, "y", True),
|
||||||
|
Rule("(9:deec2ss.)", "d", "ceed", False, 2, "ss", False),
|
||||||
|
Rule("(10:dee1.)", "d", "eed", False, 1, "", False),
|
||||||
|
Rule("(11:de2>)", "d", "ed", False, 2, "", True),
|
||||||
|
Rule("(12:dooh4>)", "d", "hood", False, 4, "", True),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
#("e", [Rule("(13:e1>)", "e", "e", False, 1, "", True)]),
|
||||||
|
#(
|
||||||
|
"f",
|
||||||
|
[
|
||||||
|
Rule("(14:feil1v.)", "f", "lief", False, 1, "v", False),
|
||||||
|
Rule("(15:fi2>)", "f", "if", False, 2, "", True),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
#(
|
||||||
|
"g",
|
||||||
|
[
|
||||||
|
Rule("(16:gni3>)", "g", "ing", False, 3, "", True),
|
||||||
|
Rule("(17:gai3y.)", "g", "iag", False, 3, "y", False),
|
||||||
|
Rule("(18:ga2>)", "g", "ag", False, 2, "", True),
|
||||||
|
Rule("(19:gg1.)", "g", "gg", False, 1, "", False),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
#(
|
||||||
|
"h",
|
||||||
|
[
|
||||||
|
Rule("(20:ht*2.)", "h", "th", True, 2, "", False),
|
||||||
|
Rule("(21:hsiug5ct.)", "h", "guish", False, 5, "ct", False),
|
||||||
|
Rule("(22:hsi3>)", "h", "ish", False, 3, "", True),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
#(
|
||||||
|
"i",
|
||||||
|
[
|
||||||
|
Rule("(23:i*1.)", "i", "i", True, 1, "", False),
|
||||||
|
Rule("(24:i1y>)", "i", "i", False, 1, "y", True),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
#(
|
||||||
|
"j",
|
||||||
|
[
|
||||||
|
Rule("(25:ji1d.)", "j", "ij", False, 1, "d", False),
|
||||||
|
Rule("(26:juf1s.)", "j", "fuj", False, 1, "s", False),
|
||||||
|
Rule("(27:ju1d.)", "j", "uj", False, 1, "d", False),
|
||||||
|
Rule("(28:jo1d.)", "j", "oj", False, 1, "d", False),
|
||||||
|
Rule("(29:jeh1r.)", "j", "hej", False, 1, "r", False),
|
||||||
|
Rule("(30:jrev1t.)", "j", "verj", False, 1, "t", False),
|
||||||
|
Rule("(31:jsim2t.)", "j", "misj", False, 2, "t", False),
|
||||||
|
Rule("(32:jn1d.)", "j", "nj", False, 1, "d", False),
|
||||||
|
Rule("(33:j1s.)", "j", "j", False, 1, "s", False),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
#(
|
||||||
|
"l",
|
||||||
|
[
|
||||||
|
Rule("(34:lbaifi6.)", "l", "ifiabl", False, 6, "", False),
|
||||||
|
Rule("(35:lbai4y.)", "l", "iabl", False, 4, "y", False),
|
||||||
|
Rule("(36:lba3>)", "l", "abl", False, 3, "", True),
|
||||||
|
Rule("(37:lbi3.)", "l", "ibl", False, 3, "", False),
|
||||||
|
Rule("(38:lib2l>)", "l", "bil", False, 2, "l", True),
|
||||||
|
Rule("(39:lc1.)", "l", "cl", False, 1, "", False),
|
||||||
|
Rule("(40:lufi4y.)", "l", "iful", False, 4, "y", False),
|
||||||
|
Rule("(41:luf3>)", "l", "ful", False, 3, "", True),
|
||||||
|
Rule("(42:lu2.)", "l", "ul", False, 2, "", False),
|
||||||
|
Rule("(43:lai3>)", "l", "ial", False, 3, "", True),
|
||||||
|
Rule("(44:lau3>)", "l", "ual", False, 3, "", True),
|
||||||
|
Rule("(45:la2>)", "l", "al", False, 2, "", True),
|
||||||
|
Rule("(46:ll1.)", "l", "ll", False, 1, "", False),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
#(
|
||||||
|
"m",
|
||||||
|
[
|
||||||
|
Rule("(47:mui3.)", "m", "ium", False, 3, "", False),
|
||||||
|
Rule("(48:mu*2.)", "m", "um", True, 2, "", False),
|
||||||
|
Rule("(49:msi3>)", "m", "ism", False, 3, "", True),
|
||||||
|
Rule("(50:mm1.)", "m", "mm", False, 1, "", False),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
#(
|
||||||
|
"n",
|
||||||
|
[
|
||||||
|
Rule("(51:nois4j>)", "n", "sion", False, 4, "j", True),
|
||||||
|
Rule("(52:noix4ct.)", "n", "xion", False, 4, "ct", False),
|
||||||
|
Rule("(53:noi3>)", "n", "ion", False, 3, "", True),
|
||||||
|
Rule("(54:nai3>)", "n", "ian", False, 3, "", True),
|
||||||
|
Rule("(55:na2>)", "n", "an", False, 2, "", True),
|
||||||
|
Rule("(56:nee0.)", "n", "een", False, 0, "", False),
|
||||||
|
Rule("(57:ne2>)", "n", "en", False, 2, "", True),
|
||||||
|
Rule("(58:nn1.)", "n", "nn", False, 1, "", False),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
#(
|
||||||
|
"p",
|
||||||
|
[
|
||||||
|
Rule("(59:pihs4>)", "p", "ship", False, 4, "", True),
|
||||||
|
Rule("(60:pp1.)", "p", "pp", False, 1, "", False),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
#(
|
||||||
|
"r",
|
||||||
|
[
|
||||||
|
Rule("(61:re2>)", "r", "er", False, 2, "", True),
|
||||||
|
Rule("(62:rae0.)", "r", "ear", False, 0, "", False),
|
||||||
|
Rule("(63:ra2.)", "r", "ar", False, 2, "", False),
|
||||||
|
Rule("(64:ro2>)", "r", "or", False, 2, "", True),
|
||||||
|
Rule("(65:ru2>)", "r", "ur", False, 2, "", True),
|
||||||
|
Rule("(66:rr1.)", "r", "rr", False, 1, "", False),
|
||||||
|
Rule("(67:rt1>)", "r", "tr", False, 1, "", True),
|
||||||
|
Rule("(68:rei3y>)", "r", "ier", False, 3, "y", True),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
#(
|
||||||
|
"s",
|
||||||
|
[
|
||||||
|
Rule("(69:sei3y>)", "s", "ies", False, 3, "y", True),
|
||||||
|
Rule("(70:sis2.)", "s", "sis", False, 2, "", False),
|
||||||
|
Rule("(71:si2>)", "s", "is", False, 2, "", True),
|
||||||
|
Rule("(72:ssen4>)", "s", "ness", False, 4, "", True),
|
||||||
|
Rule("(73:ss0.)", "s", "ss", False, 0, "", False),
|
||||||
|
Rule("(74:suo3>)", "s", "ous", False, 3, "", True),
|
||||||
|
Rule("(75:su*2.)", "s", "us", True, 2, "", False),
|
||||||
|
Rule("(76:s*1>)", "s", "s", True, 1, "", True),
|
||||||
|
Rule("(77:s0.)", "s", "s", False, 0, "", False),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
#(
|
||||||
|
"t",
|
||||||
|
[
|
||||||
|
Rule("(78:tacilp4y.)", "t", "plicat", False, 4, "y", False),
|
||||||
|
Rule("(79:ta2>)", "t", "at", False, 2, "", True),
|
||||||
|
Rule("(80:tnem4>)", "t", "ment", False, 4, "", True),
|
||||||
|
Rule("(81:tne3>)", "t", "ent", False, 3, "", True),
|
||||||
|
Rule("(82:tna3>)", "t", "ant", False, 3, "", True),
|
||||||
|
Rule("(83:tpir2b.)", "t", "ript", False, 2, "b", False),
|
||||||
|
Rule("(84:tpro2b.)", "t", "orpt", False, 2, "b", False),
|
||||||
|
Rule("(85:tcud1.)", "t", "duct", False, 1, "", False),
|
||||||
|
Rule("(86:tpmus2.)", "t", "sumpt", False, 2, "", False),
|
||||||
|
Rule("(87:tpec2iv.)", "t", "cept", False, 2, "iv", False),
|
||||||
|
Rule("(88:tulo2v.)", "t", "olut", False, 2, "v", False),
|
||||||
|
Rule("(89:tsis0.)", "t", "sist", False, 0, "", False),
|
||||||
|
Rule("(90:tsi3>)", "t", "ist", False, 3, "", True),
|
||||||
|
Rule("(91:tt1.)", "t", "tt", False, 1, "", False),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
#(
|
||||||
|
"u",
|
||||||
|
[
|
||||||
|
Rule("(92:uqi3.)", "u", "iqu", False, 3, "", False),
|
||||||
|
Rule("(93:ugo1.)", "u", "ogu", False, 1, "", False),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
#(
|
||||||
|
"v",
|
||||||
|
[
|
||||||
|
Rule("(94:vis3j>)", "v", "siv", False, 3, "j", True),
|
||||||
|
Rule("(95:vie0.)", "v", "eiv", False, 0, "", False),
|
||||||
|
Rule("(96:vi2>)", "v", "iv", False, 2, "", True),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
#(
|
||||||
|
"y",
|
||||||
|
[
|
||||||
|
Rule("(97:ylb1>)", "y", "bly", False, 1, "", True),
|
||||||
|
Rule("(98:yli3y>)", "y", "ily", False, 3, "y", True),
|
||||||
|
Rule("(99:ylp0.)", "y", "ply", False, 0, "", False),
|
||||||
|
Rule("(100:yl2>)", "y", "ly", False, 2, "", True),
|
||||||
|
Rule("(101:ygo1.)", "y", "ogy", False, 1, "", False),
|
||||||
|
Rule("(102:yhp1.)", "y", "phy", False, 1, "", False),
|
||||||
|
Rule("(103:ymo1.)", "y", "omy", False, 1, "", False),
|
||||||
|
Rule("(104:ypo1.)", "y", "opy", False, 1, "", False),
|
||||||
|
Rule("(105:yti3>)", "y", "ity", False, 3, "", True),
|
||||||
|
Rule("(106:yte3>)", "y", "ety", False, 3, "", True),
|
||||||
|
Rule("(107:ytl2.)", "y", "lty", False, 2, "", False),
|
||||||
|
Rule("(108:yrtsi5.)", "y", "istry", False, 5, "", False),
|
||||||
|
Rule("(109:yra3>)", "y", "ary", False, 3, "", True),
|
||||||
|
Rule("(110:yro3>)", "y", "ory", False, 3, "", True),
|
||||||
|
Rule("(111:yfi3.)", "y", "ify", False, 3, "", False),
|
||||||
|
Rule("(112:ycn2t>)", "y", "ncy", False, 2, "t", True),
|
||||||
|
Rule("(113:yca3>)", "y", "acy", False, 3, "", True),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
#(
|
||||||
|
"z",
|
||||||
|
[
|
||||||
|
Rule("(114:zi2>)", "z", "iz", False, 2, "", True),
|
||||||
|
Rule("(115:zy1s.)", "z", "yz", False, 1, "s", False),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
pub fn main() {
|
||||||
|
io.println("Hello from paicehusk!")
|
||||||
|
let assert Ok(rules) = load_rules("paice-husk-rules.txt")
|
||||||
|
stem("abominable", rules)
|
||||||
|
|> echo
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn default_rules() -> Rules {
|
||||||
|
dict.from_list(default_rules_list)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn stem(word: String, rules: Rules) -> String {
|
||||||
|
case is_valid(word) {
|
||||||
|
True -> {
|
||||||
|
do_stem(word, rules, True)
|
||||||
|
}
|
||||||
|
False -> word
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn do_stem(word: String, rules: Rules, intact: Bool) -> String {
|
||||||
|
case string.reverse(word) |> string.pop_grapheme {
|
||||||
|
Ok(#(letter, _)) -> {
|
||||||
|
case dict.get(rules, letter) {
|
||||||
|
Ok(specific_rules) -> {
|
||||||
|
let #(stem, restem, intact) =
|
||||||
|
list.fold_until(
|
||||||
|
specific_rules,
|
||||||
|
#(word, False, intact),
|
||||||
|
fn(state, rule) {
|
||||||
|
case rule_matches(rule, word, intact) {
|
||||||
|
True -> {
|
||||||
|
let result = apply_rule(rule, word)
|
||||||
|
case is_valid(result) {
|
||||||
|
False -> list.Continue(state)
|
||||||
|
True -> {
|
||||||
|
list.Stop(#(result, rule.restem, False))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
False -> list.Continue(state)
|
||||||
|
}
|
||||||
|
},
|
||||||
|
)
|
||||||
|
case restem {
|
||||||
|
True -> do_stem(stem, rules, intact)
|
||||||
|
False -> stem
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Error(_) -> word
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Error(_) -> word
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn rule_matches(rule: Rule, word: String, stem_intact: Bool) -> Bool {
|
||||||
|
case !stem_intact && rule.intact {
|
||||||
|
True -> False
|
||||||
|
False -> string.ends_with(word, rule.suffix)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn apply_rule(rule: Rule, word: String) -> String {
|
||||||
|
string.drop_end(word, rule.remove) <> rule.append
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_valid(word: String) -> Bool {
|
||||||
|
case word {
|
||||||
|
"" -> False
|
||||||
|
"a" <> rest | "e" <> rest | "i" <> rest | "o" <> rest | "u" <> rest -> {
|
||||||
|
rest != ""
|
||||||
|
}
|
||||||
|
_ -> {
|
||||||
|
is_valid_internal(word, 0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_valid_internal(word: String, length: Int) -> Bool {
|
||||||
|
case word {
|
||||||
|
"" -> False
|
||||||
|
"a" <> rest
|
||||||
|
| "e" <> rest
|
||||||
|
| "i" <> rest
|
||||||
|
| "o" <> rest
|
||||||
|
| "u" <> rest
|
||||||
|
| "y" <> rest -> {
|
||||||
|
{ length + 1 + string.length(rest) } >= 3
|
||||||
|
}
|
||||||
|
_ -> {
|
||||||
|
is_valid_internal(string.drop_start(word, 1), length + 1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn load_rules(filename: String) -> Result(Rules, Nil) {
|
||||||
|
case simplifile.read(filename) {
|
||||||
|
Error(_) -> Error(Nil)
|
||||||
|
Ok(rules) -> {
|
||||||
|
let split = splitter.new(["\n", "\r\n"])
|
||||||
|
use rules <- result.try(process_rules(rules, split, [], 1))
|
||||||
|
list.fold(rules, dict.new(), fn(acc, rule) {
|
||||||
|
dict.upsert(acc, rule.letter, fn(rules) {
|
||||||
|
case rules {
|
||||||
|
Some(rules) -> {
|
||||||
|
[rule, ..rules]
|
||||||
|
}
|
||||||
|
None -> [rule]
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|> Ok
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn process_rules(
|
||||||
|
rules: String,
|
||||||
|
split: splitter.Splitter,
|
||||||
|
acc: List(Rule),
|
||||||
|
id: Int,
|
||||||
|
) -> Result(List(Rule), Nil) {
|
||||||
|
case splitter.split(split, rules) {
|
||||||
|
#("end0." <> _, _, _) -> Ok(acc)
|
||||||
|
#(rule, _, rest) -> {
|
||||||
|
use rule <- result.try(process_rule(rule, id))
|
||||||
|
process_rules(rest, split, [rule, ..acc], id + 1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn process_rule(rule: String, id: Int) -> Result(Rule, Nil) {
|
||||||
|
let rule = string.trim(rule)
|
||||||
|
|
||||||
|
use #(suffix, rule) <- result.try(parse_rule_text(rule, ""))
|
||||||
|
|
||||||
|
use <- bool.guard(when: suffix == "", return: Error(Nil))
|
||||||
|
let #(intact, rule) = case rule {
|
||||||
|
"*" <> rule -> #(True, rule)
|
||||||
|
_ -> #(False, rule)
|
||||||
|
}
|
||||||
|
use #(remove, rule) <- result.try(parse_rule_remove(rule, ""))
|
||||||
|
use #(append, rule) <- result.try(parse_rule_text(rule, ""))
|
||||||
|
use restem <- result.try(case rule {
|
||||||
|
">" <> _ -> Ok(True)
|
||||||
|
"." <> _ -> Ok(False)
|
||||||
|
_ -> Error(Nil)
|
||||||
|
})
|
||||||
|
|
||||||
|
let id =
|
||||||
|
"("
|
||||||
|
<> int.to_string(id)
|
||||||
|
<> ":"
|
||||||
|
<> suffix
|
||||||
|
<> {
|
||||||
|
case intact {
|
||||||
|
True -> "*"
|
||||||
|
False -> ""
|
||||||
|
}
|
||||||
|
}
|
||||||
|
<> int.to_string(remove)
|
||||||
|
<> append
|
||||||
|
<> {
|
||||||
|
case restem {
|
||||||
|
True -> ">"
|
||||||
|
False -> "."
|
||||||
|
}
|
||||||
|
}
|
||||||
|
<> ")"
|
||||||
|
use letter <- result.try(string.first(suffix))
|
||||||
|
let suffix = string.reverse(suffix)
|
||||||
|
Ok(Rule(id, letter, suffix, intact, remove, append, restem))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn parse_rule_remove(rule: String, acc: String) -> Result(#(Int, String), Nil) {
|
||||||
|
case rule {
|
||||||
|
"0" as number <> rest
|
||||||
|
| "1" as number <> rest
|
||||||
|
| "2" as number <> rest
|
||||||
|
| "3" as number <> rest
|
||||||
|
| "4" as number <> rest
|
||||||
|
| "5" as number <> rest
|
||||||
|
| "6" as number <> rest
|
||||||
|
| "7" as number <> rest
|
||||||
|
| "8" as number <> rest
|
||||||
|
| "9" as number <> rest -> parse_rule_remove(rest, acc <> number)
|
||||||
|
_ if acc == "" -> Error(Nil)
|
||||||
|
_ -> {
|
||||||
|
use i <- result.try(int.parse(acc))
|
||||||
|
Ok(#(i, rule))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn parse_rule_text(rule: String, acc: String) -> Result(#(String, String), Nil) {
|
||||||
|
case rule {
|
||||||
|
"a" as letter <> rest
|
||||||
|
| "b" as letter <> rest
|
||||||
|
| "c" as letter <> rest
|
||||||
|
| "d" as letter <> rest
|
||||||
|
| "e" as letter <> rest
|
||||||
|
| "f" as letter <> rest
|
||||||
|
| "g" as letter <> rest
|
||||||
|
| "h" as letter <> rest
|
||||||
|
| "i" as letter <> rest
|
||||||
|
| "j" as letter <> rest
|
||||||
|
| "k" as letter <> rest
|
||||||
|
| "l" as letter <> rest
|
||||||
|
| "m" as letter <> rest
|
||||||
|
| "n" as letter <> rest
|
||||||
|
| "o" as letter <> rest
|
||||||
|
| "p" as letter <> rest
|
||||||
|
| "q" as letter <> rest
|
||||||
|
| "r" as letter <> rest
|
||||||
|
| "s" as letter <> rest
|
||||||
|
| "t" as letter <> rest
|
||||||
|
| "u" as letter <> rest
|
||||||
|
| "v" as letter <> rest
|
||||||
|
| "w" as letter <> rest
|
||||||
|
| "x" as letter <> rest
|
||||||
|
| "y" as letter <> rest
|
||||||
|
| "z" as letter <> rest
|
||||||
|
| "A" as letter <> rest
|
||||||
|
| "B" as letter <> rest
|
||||||
|
| "C" as letter <> rest
|
||||||
|
| "D" as letter <> rest
|
||||||
|
| "E" as letter <> rest
|
||||||
|
| "F" as letter <> rest
|
||||||
|
| "G" as letter <> rest
|
||||||
|
| "H" as letter <> rest
|
||||||
|
| "I" as letter <> rest
|
||||||
|
| "J" as letter <> rest
|
||||||
|
| "K" as letter <> rest
|
||||||
|
| "L" as letter <> rest
|
||||||
|
| "M" as letter <> rest
|
||||||
|
| "N" as letter <> rest
|
||||||
|
| "O" as letter <> rest
|
||||||
|
| "P" as letter <> rest
|
||||||
|
| "Q" as letter <> rest
|
||||||
|
| "R" as letter <> rest
|
||||||
|
| "S" as letter <> rest
|
||||||
|
| "T" as letter <> rest
|
||||||
|
| "U" as letter <> rest
|
||||||
|
| "V" as letter <> rest
|
||||||
|
| "W" as letter <> rest
|
||||||
|
| "X" as letter <> rest
|
||||||
|
| "Y" as letter <> rest
|
||||||
|
| "Z" as letter <> rest -> parse_rule_text(rest, acc <> letter)
|
||||||
|
_ -> Ok(#(acc, rule))
|
||||||
|
}
|
||||||
|
}
|
||||||
13
test/lancaster_stemmer_test.gleam
Normal file
13
test/lancaster_stemmer_test.gleam
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
import gleeunit
|
||||||
|
|
||||||
|
pub fn main() -> Nil {
|
||||||
|
gleeunit.main()
|
||||||
|
}
|
||||||
|
|
||||||
|
// gleeunit test functions end in `_test`
|
||||||
|
pub fn hello_world_test() {
|
||||||
|
let name = "Joe"
|
||||||
|
let greeting = "Hello, " <> name <> "!"
|
||||||
|
|
||||||
|
assert greeting == "Hello, Joe!"
|
||||||
|
}
|
||||||
25143
wordlist.txt
Normal file
25143
wordlist.txt
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user