Compare commits

...

12 Commits

Author SHA1 Message Date
36e14f0070 perf: Minor tweak to length check
Some checks are pending
test / test (push) Waiting to run
2025-11-04 15:17:01 +00:00
3569a0a1af test: Add words with non-vowel initial letters to benchmark 2025-11-04 15:04:29 +00:00
52fc5b6f5b perf: Improve performance by pattern matching on a letter 2025-11-04 15:04:23 +00:00
b30c9f39fd perf: Change .length to .byte_size
Some checks failed
test / test (push) Has been cancelled
2025-11-04 14:15:21 +00:00
5feb4de7a4 perf: Removed pop_grapheme and used pattern matching first letter
Some checks failed
test / test (push) Has been cancelled
2025-11-04 13:57:29 +00:00
7793bbb3a3 test: Added benchmark test 2025-11-04 13:56:55 +00:00
6634406a78 test: Removed echo from test 2025-11-04 13:56:28 +00:00
2d8532b40e docs: Added CHANGELOG
Some checks failed
test / test (push) Has been cancelled
2025-11-01 15:42:43 +00:00
c848824583 docs: Update README 2025-11-01 15:41:40 +00:00
b66b696391 docs: Added fn docs 2025-11-01 15:40:42 +00:00
516066d322 test: Added wordlist tests
Some checks failed
test / test (push) Has been cancelled
2025-11-01 15:16:50 +00:00
85f1377328 fix: Removed main 2025-11-01 15:16:28 +00:00
9 changed files with 25420 additions and 50 deletions

5
CHANGELOG.md Normal file
View File

@@ -0,0 +1,5 @@
# Changelog
## v1.0.0
- Initial release

View File

@@ -12,7 +12,7 @@ gleam add lancaster_stemmer@1
import lancaster_stemmer import lancaster_stemmer
pub fn main() -> Nil { pub fn main() -> Nil {
// TODO: An example of the project in use lancaster_stemmer.stem("breathe", lancaster_stemmer.default_rules())
} }
``` ```

46
dev/benchmark.gleam Normal file
View File

@@ -0,0 +1,46 @@
import gleam/list
import glychee/benchmark
import glychee/configuration
import lancaster_stemmer
import porter_stemmer
/// Benchmark entry point (Erlang target only): initialises glychee's
/// configuration, then runs the stemmer comparison defined in `benchmark`.
@target(erlang)
pub fn main() {
configuration.initialize()
// Warmup and Parallel are both set to 2 — presumably the warmup duration and
// the number of parallel runs handed to the benchmarking backend; confirm
// against glychee's configuration documentation.
configuration.set_pair(configuration.Warmup, 2)
configuration.set_pair(configuration.Parallel, 2)
benchmark()
}
/// Times the Lancaster stemmer against the Porter stemmer on one shared
/// list of ten words. The default Lancaster ruleset is built once, outside
/// the timed closures, so only the stemming itself is measured.
@target(erlang)
fn benchmark() {
  let rules = lancaster_stemmer.default_rules()

  // Each candidate receives the word list and returns the thunk to be timed.
  let candidates = [
    benchmark.Function("Lancaster", fn(words) {
      fn() {
        list.each(words, fn(word) { lancaster_stemmer.stem(word, rules) })
      }
    }),
    benchmark.Function("Porter", fn(words) {
      fn() { list.each(words, fn(word) { porter_stemmer.stem(word) }) }
    }),
  ]

  let inputs = [
    benchmark.Data("10 words", [
      "abbreviate",
      "aberdeen",
      "abode",
      "abovementioned",
      "blemish",
      "christensen",
      "christendom",
      "flatiron",
      "mountainside",
      "zygote",
    ]),
  ]

  benchmark.run(candidates, inputs)
}

View File

@@ -19,3 +19,5 @@ splitter = ">= 1.1.0 and < 2.0.0"
[dev-dependencies] [dev-dependencies]
gleeunit = ">= 1.0.0 and < 2.0.0" gleeunit = ">= 1.0.0 and < 2.0.0"
glychee = ">= 1.1.2 and < 2.0.0"
porter_stemmer = ">= 1.0.0 and < 2.0.0"

View File

@@ -2,15 +2,23 @@
# You typically do not need to edit this file # You typically do not need to edit this file
packages = [ packages = [
{ name = "benchee", version = "1.5.0", build_tools = ["mix"], requirements = ["deep_merge", "statistex", "table"], otp_app = "benchee", source = "hex", outer_checksum = "5B075393AEA81B8AE74EADD1C28B1D87E8A63696C649D8293DB7C4DF3EB67535" },
{ name = "deep_merge", version = "1.0.0", build_tools = ["mix"], requirements = [], otp_app = "deep_merge", source = "hex", outer_checksum = "CE708E5F094B9CD4E8F2BE4F00D2F4250C4095BE93F8CD6D018C753894885430" },
{ name = "filepath", version = "1.1.2", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "filepath", source = "hex", outer_checksum = "B06A9AF0BF10E51401D64B98E4B627F1D2E48C154967DA7AF4D0914780A6D40A" }, { name = "filepath", version = "1.1.2", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "filepath", source = "hex", outer_checksum = "B06A9AF0BF10E51401D64B98E4B627F1D2E48C154967DA7AF4D0914780A6D40A" },
{ name = "gleam_stdlib", version = "0.65.0", build_tools = ["gleam"], requirements = [], otp_app = "gleam_stdlib", source = "hex", outer_checksum = "7C69C71D8C493AE11A5184828A77110EB05A7786EBF8B25B36A72F879C3EE107" }, { name = "gleam_stdlib", version = "0.65.0", build_tools = ["gleam"], requirements = [], otp_app = "gleam_stdlib", source = "hex", outer_checksum = "7C69C71D8C493AE11A5184828A77110EB05A7786EBF8B25B36A72F879C3EE107" },
{ name = "gleeunit", version = "1.7.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleeunit", source = "hex", outer_checksum = "CD701726CBCE5588B375D157B4391CFD0F2F134CD12D9B6998A395484DE05C58" }, { name = "gleeunit", version = "1.8.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleeunit", source = "hex", outer_checksum = "7AE0F64B26CC065ED705FF7CA5F4EDAB8015E72A883736FE251E46FACCCE1E08" },
{ name = "glychee", version = "1.1.2", build_tools = ["gleam"], requirements = ["benchee"], otp_app = "glychee", source = "hex", outer_checksum = "41784216C213F223095BB3FC3EDDB60CC537835B2340A868EA3931193F7F3824" },
{ name = "porter_stemmer", version = "1.0.0", build_tools = ["gleam"], requirements = ["porter_stemming"], otp_app = "porter_stemmer", source = "hex", outer_checksum = "02248CA76802B75BE1EE7EE1878BAD088088E67E791ECE6813128B965560C99C" },
{ name = "porter_stemming", version = "1.0.1", build_tools = ["rebar3"], requirements = [], otp_app = "porter_stemming", source = "hex", outer_checksum = "8531E709A731C9A6A52477C44175411A6B5F5327CF55C18D9B9F5FD701C606B0" },
{ name = "simplifile", version = "2.3.0", build_tools = ["gleam"], requirements = ["filepath", "gleam_stdlib"], otp_app = "simplifile", source = "hex", outer_checksum = "0A868DAC6063D9E983477981839810DC2E553285AB4588B87E3E9C96A7FB4CB4" }, { name = "simplifile", version = "2.3.0", build_tools = ["gleam"], requirements = ["filepath", "gleam_stdlib"], otp_app = "simplifile", source = "hex", outer_checksum = "0A868DAC6063D9E983477981839810DC2E553285AB4588B87E3E9C96A7FB4CB4" },
{ name = "splitter", version = "1.1.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "splitter", source = "hex", outer_checksum = "05564A381580395DCDEFF4F88A64B021E8DAFA6540AE99B4623962F52976AA9D" }, { name = "splitter", version = "1.1.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "splitter", source = "hex", outer_checksum = "05564A381580395DCDEFF4F88A64B021E8DAFA6540AE99B4623962F52976AA9D" },
{ name = "statistex", version = "1.1.0", build_tools = ["mix"], requirements = [], otp_app = "statistex", source = "hex", outer_checksum = "F5950EA26AD43246BA2CCE54324AC394A4E7408FDCF98B8E230F503A0CBA9CF5" },
] ]
[requirements] [requirements]
gleam_stdlib = { version = ">= 0.44.0 and < 2.0.0" } gleam_stdlib = { version = ">= 0.44.0 and < 2.0.0" }
gleeunit = { version = ">= 1.0.0 and < 2.0.0" } gleeunit = { version = ">= 1.0.0 and < 2.0.0" }
glychee = { version = ">= 1.1.2 and < 2.0.0" }
porter_stemmer = { version = ">= 1.0.0 and < 2.0.0" }
simplifile = { version = ">= 2.3.0 and < 3.0.0" } simplifile = { version = ">= 2.3.0 and < 3.0.0" }
splitter = { version = ">= 1.1.0 and < 2.0.0" } splitter = { version = ">= 1.1.0 and < 2.0.0" }

View File

@@ -1,7 +1,6 @@
import gleam/bool import gleam/bool
import gleam/dict import gleam/dict
import gleam/int import gleam/int
import gleam/io
import gleam/list import gleam/list
import gleam/option.{None, Some} import gleam/option.{None, Some}
import gleam/result import gleam/result
@@ -237,18 +236,26 @@ const default_rules_list = [
), ),
] ]
pub fn main() { /// Constructs the default ruleset
io.println("Hello from paicehusk!")
let assert Ok(rules) = load_rules("paice-husk-rules.txt")
stem("abominable", rules)
|> echo
}
pub fn default_rules() -> Rules { pub fn default_rules() -> Rules {
dict.from_list(default_rules_list) dict.from_list(default_rules_list)
} }
/// Lancaster (Paice-Husk) stemming algorithm
///
/// ## Example
///
/// ```gleam
/// lancaster_stemmer.stem("Gleam", lancaster_stemmer.default_rules())
/// // -> gleam
/// ```
///
/// ```gleam
/// lancaster_stemmer.stem("fancy", lancaster_stemmer.default_rules())
/// // -> fant
/// ```
pub fn stem(word: String, rules: Rules) -> String { pub fn stem(word: String, rules: Rules) -> String {
let word = string.lowercase(word)
case is_valid(word) { case is_valid(word) {
True -> { True -> {
do_stem(word, rules, True) do_stem(word, rules, True)
@@ -258,45 +265,75 @@ pub fn stem(word: String, rules: Rules) -> String {
} }
fn do_stem(word: String, rules: Rules, intact: Bool) -> String { fn do_stem(word: String, rules: Rules, intact: Bool) -> String {
case string.reverse(word) |> string.pop_grapheme { case string.reverse(word) {
Ok(#(letter, _)) -> { "a" as letter <> _
case dict.get(rules, letter) { | "b" as letter <> _
Ok(specific_rules) -> { | "c" as letter <> _
let #(stem, restem, intact) = | "d" as letter <> _
list.fold_until( | "e" as letter <> _
specific_rules, | "f" as letter <> _
#(word, False, intact), | "g" as letter <> _
fn(state, rule) { | "h" as letter <> _
case rule_matches(rule, word, intact) { | "i" as letter <> _
True -> { | "j" as letter <> _
let result = apply_rule(rule, word) | "k" as letter <> _
case is_valid(result) { | "l" as letter <> _
False -> list.Continue(state) | "m" as letter <> _
True -> { | "n" as letter <> _
list.Stop(#(result, rule.restem, False)) | "o" as letter <> _
} | "p" as letter <> _
} | "q" as letter <> _
} | "r" as letter <> _
False -> list.Continue(state) | "s" as letter <> _
} | "t" as letter <> _
}, | "u" as letter <> _
) | "v" as letter <> _
case restem { | "w" as letter <> _
True -> do_stem(stem, rules, intact) | "x" as letter <> _
False -> stem | "y" as letter <> _
} | "z" as letter <> _ -> {
} case stem_letter(rules, letter, word, intact) {
Error(_) -> word #(stem, True, intact) -> do_stem(stem, rules, intact)
#(stem, _, _) -> stem
} }
} }
Error(_) -> word
_ -> word
}
}
fn stem_letter(
rules: dict.Dict(String, List(Rule)),
letter: String,
word: String,
intact: Bool,
) -> #(String, Bool, Bool) {
case dict.get(rules, letter) {
Ok(specific_rules) -> {
// let #(stem, restem, intact) =
list.fold_until(specific_rules, #(word, False, intact), fn(state, rule) {
case rule_matches(rule, word, intact) {
True -> {
let result = apply_rule(rule, word)
case is_valid(result) {
False -> list.Continue(state)
True -> {
list.Stop(#(result, rule.restem, False))
}
}
}
False -> list.Continue(state)
}
})
}
Error(_) -> #(word, False, False)
} }
} }
fn rule_matches(rule: Rule, word: String, stem_intact: Bool) -> Bool { fn rule_matches(rule: Rule, word: String, stem_intact: Bool) -> Bool {
case !stem_intact && rule.intact { case stem_intact || !rule.intact {
True -> False True -> string.ends_with(word, rule.suffix)
False -> string.ends_with(word, rule.suffix) False -> False
} }
} }
@@ -310,9 +347,30 @@ fn is_valid(word: String) -> Bool {
"a" <> rest | "e" <> rest | "i" <> rest | "o" <> rest | "u" <> rest -> { "a" <> rest | "e" <> rest | "i" <> rest | "o" <> rest | "u" <> rest -> {
rest != "" rest != ""
} }
_ -> { "b" <> _
| "c" <> _
| "d" <> _
| "f" <> _
| "g" <> _
| "h" <> _
| "j" <> _
| "k" <> _
| "l" <> _
| "m" <> _
| "n" <> _
| "p" <> _
| "q" <> _
| "r" <> _
| "s" <> _
| "t" <> _
| "v" <> _
| "w" <> _
| "x" <> _
| "y" <> _
| "z" <> _ -> {
is_valid_internal(word, 0) is_valid_internal(word, 0)
} }
_ -> False
} }
} }
@@ -325,14 +383,55 @@ fn is_valid_internal(word: String, length: Int) -> Bool {
| "o" <> rest | "o" <> rest
| "u" <> rest | "u" <> rest
| "y" <> rest -> { | "y" <> rest -> {
{ length + 1 + string.length(rest) } >= 3 case length {
0 -> string.byte_size(rest) >= 2
1 -> rest != ""
_ -> True
}
}
"b" <> rest
| "c" <> rest
| "d" <> rest
| "f" <> rest
| "g" <> rest
| "h" <> rest
| "j" <> rest
| "k" <> rest
| "l" <> rest
| "m" <> rest
| "n" <> rest
| "p" <> rest
| "q" <> rest
| "r" <> rest
| "s" <> rest
| "t" <> rest
| "v" <> rest
| "w" <> rest
| "x" <> rest
| "z" <> rest -> {
is_valid_internal(rest, length + 1)
} }
_ -> { _ -> {
is_valid_internal(string.drop_start(word, 1), length + 1) False
} }
} }
} }
/// Constructs a ruleset from the specified file
///
/// Format of the file is as follows:
/// Each line contains a specific rule (order matters)
/// The rule consists of a string made up of the following parts
/// | Rule part | Description |
/// | ------ | ------ |
/// |suffix|the required suffix written in reverse, e.g. the suffix `ing` of `winning` is specified as `gni`|
/// |* (optional)|if the rule is only to be used if a previous rule has not been applied then add an asterisk. For example ht*2. only applies if th is the final suffix, so the stem of breath would be brea but the stem of breathe would be breath because the suffix e has already been removed|
/// |number of chars to remove|this is the number of characters to remove after the suffix has been matched. For example psychoanalytic has the suffix ytic of which 3 characters should be removed to retain psychoanaly, this would be 'city3'. This can be 0|
/// |append string (optional)|this is the characters that are appended after the match and removal of characters|
/// |> or .|If > then you can continue stemming process after this one, if . then stemming stops|
///
/// So for example, to stem `psychoanalytic` to `psychoanalys` the rule would be `city3s.` (the suffix `ytic` reversed, 3 characters removed, `s` appended, then stop).
///
pub fn load_rules(filename: String) -> Result(Rules, Nil) { pub fn load_rules(filename: String) -> Result(Rules, Nil) {
case simplifile.read(filename) { case simplifile.read(filename) {
Error(_) -> Error(Nil) Error(_) -> Error(Nil)

View File

@@ -1,4 +1,7 @@
import gleeunit import gleeunit
import lancaster_stemmer
import simplifile
import splitter
pub fn main() -> Nil { pub fn main() -> Nil {
gleeunit.main() gleeunit.main()
@@ -6,8 +9,29 @@ pub fn main() -> Nil {
// gleeunit test functions end in `_test` // gleeunit test functions end in `_test`
pub fn hello_world_test() { pub fn hello_world_test() {
let name = "Joe" let line_split = splitter.new(["\n", "\r\n"])
let greeting = "Hello, " <> name <> "!" let row_split = splitter.new([" ", "\t"])
let rules = lancaster_stemmer.default_rules()
assert greeting == "Hello, Joe!" let assert Ok(tests) = simplifile.read("./test/wordlist.txt")
run_test(tests, line_split, row_split, rules)
}
fn run_test(
tests: String,
line_split: splitter.Splitter,
row_split: splitter.Splitter,
rules: lancaster_stemmer.Rules,
) -> Nil {
case splitter.split(line_split, tests) {
#("", "", "") -> Nil
#(line, _, rest) -> {
case splitter.split(row_split, line) {
#("", "", "") -> Nil
#(word, _, stem) -> {
assert lancaster_stemmer.stem(word, rules) == stem
}
}
run_test(rest, line_split, row_split, rules)
}
}
} }

51
test/stem_cases.gleam Normal file
View File

@@ -0,0 +1,51 @@
import gleeunit
import lancaster_stemmer
/// Test entry point for this module: delegates to gleeunit's runner.
pub fn main() -> Nil {
gleeunit.main()
}
/// With the default rules, "abbas" stems to "abba".
pub fn stem_abbas_test() {
  let stemmed =
    lancaster_stemmer.stem("abbas", lancaster_stemmer.default_rules())
  assert stemmed == "abba"
}
/// Mixed-case input "AbBaS" yields the same lowercase stem, "abba".
pub fn stem_abbas_case_test() {
  let stemmed =
    lancaster_stemmer.stem("AbBaS", lancaster_stemmer.default_rules())
  assert stemmed == "abba"
}
/// With the default rules, "accomplish" stems to "accompl".
pub fn stem_accomplish_test() {
  let stemmed =
    lancaster_stemmer.stem("accomplish", lancaster_stemmer.default_rules())
  assert stemmed == "accompl"
}
/// Upper-case input "ACCOMPLISH" yields the lowercase stem "accompl".
pub fn stem_accomplish_upper_test() {
  let stemmed =
    lancaster_stemmer.stem("ACCOMPLISH", lancaster_stemmer.default_rules())
  assert stemmed == "accompl"
}
/// With the default rules, "accompaniment" stems to "accompany".
pub fn stem_accompaniment_test() {
  let stemmed =
    lancaster_stemmer.stem("accompaniment", lancaster_stemmer.default_rules())
  assert stemmed == "accompany"
}
/// With the default rules, "test" is returned unchanged.
pub fn stem_test_test() {
  let stemmed =
    lancaster_stemmer.stem("test", lancaster_stemmer.default_rules())
  assert stemmed == "test"
}
/// With the default rules, "tessellate" stems to "tessel".
pub fn stem_tessellate_test() {
  let stemmed =
    lancaster_stemmer.stem("tessellate", lancaster_stemmer.default_rules())
  assert stemmed == "tessel"
}
/// The single-letter word "a" comes back unchanged.
/// NOTE(review): the test name suggests such input is treated as not
/// stemmable — confirm against `is_valid` in the library.
pub fn stem_a_invalid_test() {
  let stemmed = lancaster_stemmer.stem("a", lancaster_stemmer.default_rules())
  assert stemmed == "a"
}
/// The single-letter word "i" comes back unchanged.
/// NOTE(review): the test name suggests such input is treated as not
/// stemmable — confirm against `is_valid` in the library.
pub fn stem_i_invalid_test() {
  let stemmed = lancaster_stemmer.stem("i", lancaster_stemmer.default_rules())
  assert stemmed == "i"
}

25135
test/wordlist.txt Normal file

File diff suppressed because it is too large Load Diff