diff --git a/README.md b/README.md
index a6a56f9..b2ec930 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,7 @@ This crate implements some stemmer algorithms found in the [snowball project](ht
 
 - Arabic
 - Armenian
+- Czech
 - Danish
 - Dutch
 - English
diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl
new file mode 100644
index 0000000..cdd79a8
--- /dev/null
+++ b/algorithms/czech.sbl
@@ -0,0 +1,255 @@
+/*
+ * Czech language stemmer
+ *
+ * Source obtained from https://snowballstem.org/algorithms/czech/stemmer.html,
+ * created by Ljiljana Dolamic & Jacques Savoy in 2009 (thank you :)
+ */
+
+routines (
+  RV R1
+  palatalise
+  mark_regions
+  do_possessive
+  do_case
+  do_comparative
+  do_diminutive
+  do_augmentative
+  do_derivational
+  do_deriv_single
+  do_aggressive
+)
+
+externals ( stem ) // stem is the only routine exported to callers
+
+integers ( pV p1 ) // region marks written by mark_regions, read by RV and R1
+
+groupings ( v )
+
+stringescapes {} // {name} inside a string literal expands a stringdef below
+
+stringdef a' '{U+00E1}'
+stringdef c^ '{U+010D}'
+stringdef d^ '{U+010F}'
+stringdef e' '{U+00E9}'
+stringdef e^ '{U+011B}'
+stringdef i' '{U+00ED}'
+stringdef n^ '{U+0148}'
+stringdef o' '{U+00F3}'
+stringdef r^ '{U+0159}'
+stringdef s^ '{U+0161}'
+stringdef t^ '{U+0165}'
+stringdef u' '{U+00FA}'
+stringdef u* '{U+016F}'
+stringdef y' '{U+00FD}'
+stringdef z^ '{U+017E}'
+
+define v 'aeiouy{a'}{e^}{e'}{i'}{o'}{u'}{u*}{y'}' // the vowel grouping tested by mark_regions
+
+define mark_regions as (
+
+  $pV = limit // both marks default to limit, so RV/R1 fail everywhere except at the very end of the word
+  $p1 = limit
+
+  do ( // wrapped in do: a failing gopast keeps the marks set so far and mark_regions still succeeds
+    gopast non-v setmark pV // pV: just past the first non-vowel
+    gopast non-v gopast v setmark p1 // p1: past the next non-vowel and then the next vowel
+  )
+)
+
+backwardmode (
+
+  define RV as $pV <= cursor // test: cursor is at or beyond pV
+  define R1 as $p1 <= cursor // test: cursor is at or beyond p1
+
+  define palatalise as ( // rewrites a palatalised ending to k / h / ck / sk; fails when nothing matches
+    [substring] RV among ( // the matched suffix must start at or beyond pV
+      'ci' 'ce' '{c^}i' '{c^}' // NOTE(review): '{c^}' carries no vowel, unlike '{z^}i' '{z^}e' below - possibly a truncated '{c^}e'; confirm against the reference implementation
+      (<- 'k')
+      'zi' 'ze' '{z^}i' '{z^}e'
+      (<- 'h')
+      '{c^}t{e^}' '{c^}ti' '{c^}t{e'}'
+      (<- 'ck')
+      '{s^}t{e^}' '{s^}ti' '{s^}t{e'}'
+      (<- 'sk')
+    )
+  )
+
+  define do_possessive as ( // removes the endings 'ov', '{u*}v', 'in'
+    [substring] RV among (
+      'ov' '{u*}v'
+      (delete)
+      'in'
+      (
+        delete
+        try palatalise // optional: a palatalise failure does not fail do_possessive
+      )
+    )
+  )
+
+  define do_case as ( // removes case endings
+    [substring] among ( // no RV/R1 test here: a match anywhere in the word is accepted
+      'atech'
+      '{e^}tem' 'at{u*}m'
+      '{a'}ch' '{y'}ch' 'ov{e'}' '{y'}mi'
+      'ata' 'aty' 'ama' 'ami' 'ovi'
+      'at' '{a'}m' 'os' 'us' '{y'}m' 'mi' 'ou'
+      'u' 'y' '{u*}' 'a' 'o' '{a'}' '{e'}' '{y'}'
+      (delete)
+      'ech' 'ich' '{i'}ch'
+      '{e'}ho' '{e^}mi' '{e'}mu' '{e^}te' '{e^}ti' '{i'}ho' '{i'}mi'
+      'emi' 'iho' 'imu'
+      '{e'}m' '{i'}m' 'es'
+      'e' 'i' '{i'}' '{e^}'
+      (
+        delete // the whole ending, vowel included, is removed before palatalise runs
+        try palatalise
+      )
+      'em'
+      (
+        <- 'e' // keep the vowel so palatalise can see endings such as 'ce'
+        try palatalise
+      )
+    )
+  )
+
+  define do_derivational as ( // removes derivational suffixes
+    [substring] R1 among ( // the matched suffix must start at or beyond p1
+      'obinec'
+      'ovisk' 'ovstv' 'ovi{s^}t' 'ovn{i'}k'
+      '{a'}sek' 'loun' 'nost' 'teln' 'ovec' 'ov{i'}k' 'ovtv' 'ovin' '{s^}tin'
+      '{a'}rn' 'och' 'ost' 'ovn' 'oun' 'out' 'ou{s^}' 'u{s^}k'
+      'kyn' '{c^}an' 'k{a'}{r^}' 'n{e'}{r^}' 'n{i'}k' 'ctv' 'stv'
+      '{a'}{c^}' 'a{c^}' '{a'}n' 'an' '{a'}{r^}' 'as'
+      'ob' 'ot' 'ov' 'o{n^}' 'ul' 'yn'
+      '{c^}k' '{c^}n' 'dl' 'nk' 'tv' 'tk' 'vk'
+      (delete)
+      'ion{a'}{r^}'
+      'inec' 'itel'
+      'i{a'}n' 'ist' 'isk' 'i{s^}k' 'itb'
+      'ic' 'in' 'it' 'iv'
+      (
+        <- 'i'
+        palatalise // not wrapped in try: if palatalise fails, do_derivational fails after the replacement was made
+      )
+      'enic' 'ec' 'en'
+      (
+        <- 'e'
+        palatalise
+      )
+      '{e'}{r^}'
+      (
+        <- '{e'}'
+        palatalise
+      )
+      '{e^}n'
+      (
+        <- '{e^}'
+        palatalise
+      )
+      '{i'}rn'
+      '{i'}{r^}' '{i'}n'
+      (
+        <- '{i'}'
+        palatalise
+      )
+    )
+  )
+  define do_deriv_single as ( // fallback used by do_aggressive: drops one final consonant from the list
+    [substring] among (
+      'c' '{c^}' 'k' 'l' 'n' 't'
+      (delete)
+    )
+  )
+
+
+  define do_augmentative as ( // removes augmentative suffixes
+    [substring] among (
+      'ajzn' '{a'}k'
+      (delete)
+      'izn' 'isk'
+      (
+        <- 'i'
+        palatalise
+      )
+    )
+  )
+
+  define do_diminutive as ( // removes diminutive suffixes
+    [substring] among (
+      'ou{s^}ek' '{a'}{c^}ek' 'a{c^}ek' 'o{c^}ek' 'u{c^}ek'
+      'anek' 'onek' 'unek' '{a'}nek'
+      'e{c^}k' '{e'}{c^}k' 'i{c^}k' '{i'}{c^}k' 'enk' '{e'}nk' 'ink' '{i'}nk'
+      '{a'}{c^}k' 'a{c^}k' 'o{c^}k' 'u{c^}k' 'ank' 'onk' 'unk'
+      '{a'}tk' '{a'}nk' 'u{s^}k'
+      'k'
+      (delete)
+      'e{c^}ek' 'enek' 'ek'
+      (
+        <- 'e'
+        palatalise
+      )
+      '{e'}{c^}ek' '{e'}k'
+      (
+        <- '{e'}'
+        palatalise
+      )
+      'i{c^}ek' 'inek' 'ik'
+      (
+        <- 'i'
+        palatalise
+      )
+      '{i'}{c^}ek' '{i'}k'
+      (
+        <- '{i'}'
+        palatalise
+      )
+      '{a'}k'
+      (<- '{a'}') // the last four groups keep the vowel and drop only the 'k'; no palatalise
+      'ak'
+      (<- 'a')
+      'ok'
+      (<- 'o')
+      'uk'
+      (<- 'u')
+    )
+  )
+  define do_comparative as ( // reduces the comparative endings to their leading vowel
+    [substring] among (
+      '{e^}j{s^}'
+      (
+        <- '{e^}'
+        palatalise
+      )
+      'ej{s^}'
+      (
+        <- 'e'
+        palatalise
+      )
+    )
+  )
+
+  define do_aggressive as (
+    do do_comparative // each "do" step may fail without failing do_aggressive
+    do do_diminutive
+    do do_augmentative
+    do_derivational or do_deriv_single // do_aggressive fails only when neither of the two applies
+  )
+)
+
+define stem as (
+  do mark_regions
+  backwards (
+    do_case // no "do": when do_case fails, stem fails and the steps below are not run
+    do_possessive // likewise mandatory
+    // light and aggressive are the same to this point
+    // comment next line for light stemmer
+    do_aggressive
+  )
+)
+
+// Ljiljana Dolamic and Jacques Savoy. 2009.
+// Indexing and stemming approaches for the Czech language.
+// Inf. Process. Manage. 45, 6 (November 2009), 714-720.
+// http://members.unine.ch/jacques.savoy/clef/CzechStemmerLight.txt
+// http://members.unine.ch/jacques.savoy/clef/CzechStemmerAgressive.txt
diff --git a/src/lib.rs b/src/lib.rs
index 38c2c03..633c407 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -37,6 +37,7 @@ use snowball::algorithms;
 pub enum Algorithm {
     Arabic,
     Armenian,
+    Czech,
     Danish,
     Dutch,
     English,
@@ -67,6 +68,7 @@ impl Stemmer {
         match lang {
             Algorithm::Arabic => Stemmer { stemmer: algorithms::arabic::stem },
             Algorithm::Armenian => Stemmer { stemmer: algorithms::armenian::stem },
+            Algorithm::Czech => Stemmer { stemmer: algorithms::czech::stem },
             Algorithm::Danish => Stemmer { stemmer: algorithms::danish::stem },
             Algorithm::Dutch => Stemmer { stemmer: algorithms::dutch::stem },
             Algorithm::English => Stemmer { stemmer: algorithms::english::stem },
diff --git a/src/snowball/algorithms/czech.rs b/src/snowball/algorithms/czech.rs
new file mode 100644
index 0000000..849902c
--- /dev/null
+++ b/src/snowball/algorithms/czech.rs
@@ -0,0 +1,652 @@
+//! Generated by Snowball 2.2.0 - https://snowballstem.org/
+
+#![allow(non_snake_case)]
+#![allow(non_upper_case_globals)]
+#![allow(unused_mut)]
+#![allow(unused_parens)]
+#![allow(unused_variables)]
+use snowball::SnowballEnv;
+use snowball::Among;
+
+static A_0: &'static [Among; 14] = &[ // searched by r_palatalise; the third field is the match-arm number used there
+    Among("ce", -1, 1, None),
+    Among("ze", -1, 2, None),
+    Among("\u{017E}e", -1, 2, None),
+    Among("ci", -1, 1, None),
+    Among("\u{010D}ti", -1, 3, None),
+    Among("\u{0161}ti", -1, 4, None),
+    Among("zi", -1, 2, None),
+    Among("\u{010D}i", -1, 1, None),
+    Among("\u{017E}i", -1, 2, None),
+    Among("\u{010D}", -1, 1, None), // NOTE(review): generated from the bare '{c^}' entry of palatalise in czech.sbl - confirm it is intended
+    Among("\u{010D}t\u{011B}", -1, 3, None),
+    Among("\u{0161}t\u{011B}", -1, 4, None),
+    Among("\u{010D}t\u{00E9}", -1, 3, None),
+    Among("\u{0161}t\u{00E9}", -1, 4, None),
+];
+
+static A_1: &'static [Among; 3] = &[ // searched by r_do_possessive
+    Among("in", -1, 2, None),
+    Among("ov", -1, 1, None),
+    Among("\u{016F}v", -1, 1, None),
+];
+
+static A_2: &'static [Among; 48] = &[ // searched by r_do_case; second field presumably indexes a shorter entry that is a suffix of this one - TODO confirm against the Among definition
+    Among("a", -1, 1, None),
+    Among("ama", 0, 1, None),
+    Among("ata", 0, 1, None),
+    Among("e", -1, 2, None),
+    Among("\u{011B}te", 3, 2, None),
+    Among("ech", -1, 2, None),
+    Among("atech", 5, 1, None),
+    Among("ich", -1, 2, None),
+    Among("\u{00E1}ch", -1, 1, None),
+    Among("\u{00ED}ch", -1, 2, None),
+    Among("\u{00FD}ch", -1, 1, None),
+    Among("i", -1, 2, None),
+    Among("mi", 11, 1, None),
+    Among("ami", 12, 1, None),
+    Among("emi", 12, 2, None),
+    Among("\u{011B}mi", 12, 2, None),
+    Among("\u{00ED}mi", 12, 2, None),
+    Among("\u{00FD}mi", 12, 1, None),
+    Among("\u{011B}ti", 11, 2, None),
+    Among("ovi", 11, 1, None),
+    Among("em", -1, 3, None),
+    Among("\u{011B}tem", 20, 1, None),
+    Among("\u{00E1}m", -1, 1, None),
+    Among("\u{00E9}m", -1, 2, None),
+    Among("\u{00ED}m", -1, 2, None),
+    Among("at\u{016F}m", -1, 1, None),
+    Among("\u{00FD}m", -1, 1, None),
+    Among("o", -1, 1, None),
+    Among("iho", 27, 2, None),
+    Among("\u{00E9}ho", 27, 2, None),
+    Among("\u{00ED}ho", 27, 2, None),
+    Among("es", -1, 2, None),
+    Among("os", -1, 1, None),
+    Among("us", -1, 1, None),
+    Among("at", -1, 1, None),
+    Among("u", -1, 1, None),
+    Among("imu", 35, 2, None),
+    Among("\u{00E9}mu", 35, 2, None),
+    Among("ou", 35, 1, None),
+    Among("y", -1, 1, None),
+    Among("aty", 39, 1, None),
+    Among("\u{011B}", -1, 2, None),
+    Among("\u{00E1}", -1, 1, None),
+    Among("\u{00E9}", -1, 1, None),
+    Among("ov\u{00E9}", 43, 1, None),
+    Among("\u{00ED}", -1, 2, None),
+    Among("\u{016F}", -1, 1, None),
+    Among("\u{00FD}", -1, 1, None),
+];
+
+static A_3: &'static [Among; 68] = &[ // searched by r_do_derivational
+    Among("ob", -1, 1, None),
+    Among("itb", -1, 2, None),
+    Among("ec", -1, 3, None),
+    Among("inec", 2, 2, None),
+    Among("obinec", 3, 1, None),
+    Among("ovec", 2, 1, None),
+    Among("ic", -1, 2, None),
+    Among("enic", 6, 3, None),
+    Among("och", -1, 1, None),
+    Among("\u{00E1}sek", -1, 1, None),
+    Among("nk", -1, 1, None),
+    Among("isk", -1, 2, None),
+    Among("ovisk", 11, 1, None),
+    Among("tk", -1, 1, None),
+    Among("vk", -1, 1, None),
+    Among("\u{010D}k", -1, 1, None),
+    Among("i\u{0161}k", -1, 2, None),
+    Among("u\u{0161}k", -1, 1, None),
+    Among("n\u{00ED}k", -1, 1, None),
+    Among("ovn\u{00ED}k", 18, 1, None),
+    Among("ov\u{00ED}k", -1, 1, None),
+    Among("dl", -1, 1, None),
+    Among("itel", -1, 2, None),
+    Among("ul", -1, 1, None),
+    Among("an", -1, 1, None),
+    Among("\u{010D}an", 24, 1, None),
+    Among("en", -1, 3, None),
+    Among("in", -1, 2, None),
+    Among("\u{0161}tin", 27, 1, None),
+    Among("ovin", 27, 1, None),
+    Among("teln", -1, 1, None),
+    Among("\u{00E1}rn", -1, 1, None),
+    Among("\u{00ED}rn", -1, 6, None),
+    Among("oun", -1, 1, None),
+    Among("loun", 33, 1, None),
+    Among("ovn", -1, 1, None),
+    Among("yn", -1, 1, None),
+    Among("kyn", 36, 1, None),
+    Among("\u{010D}n", -1, 1, None),
+    Among("\u{011B}n", -1, 5, None),
+    Among("\u{00E1}n", -1, 1, None),
+    Among("i\u{00E1}n", 40, 2, None),
+    Among("\u{00ED}n", -1, 6, None),
+    Among("as", -1, 1, None),
+    Among("it", -1, 2, None),
+    Among("ot", -1, 1, None),
+    Among("ist", -1, 2, None),
+    Among("ost", -1, 1, None),
+    Among("nost", 47, 1, None),
+    Among("out", -1, 1, None),
+    Among("ovi\u{0161}t", -1, 1, None),
+    Among("iv", -1, 2, None),
+    Among("ov", -1, 1, None),
+    Among("tv", -1, 1, None),
+    Among("ctv", 53, 1, None),
+    Among("stv", 53, 1, None),
+    Among("ovstv", 55, 1, None),
+    Among("ovtv", 53, 1, None),
+    Among("o\u{0148}", -1, 1, None),
+    Among("a\u{010D}", -1, 1, None),
+    Among("\u{00E1}\u{010D}", -1, 1, None),
+    Among("\u{00E1}\u{0159}", -1, 1, None),
+    Among("k\u{00E1}\u{0159}", 61, 1, None),
+    Among("ion\u{00E1}\u{0159}", 61, 2, None),
+    Among("\u{00E9}\u{0159}", -1, 4, None),
+    Among("n\u{00E9}\u{0159}", 64, 1, None),
+    Among("\u{00ED}\u{0159}", -1, 6, None),
+    Among("ou\u{0161}", -1, 1, None),
+];
+
+static A_4: &'static [Among; 6] = &[ // searched by r_do_deriv_single
+    Among("c", -1, 1, None),
+    Among("k", -1, 1, None),
+    Among("l", -1, 1, None),
+    Among("n", -1, 1, None),
+    Among("t", -1, 1, None),
+    Among("\u{010D}", -1, 1, None),
+];
+
+static A_5: &'static [Among; 4] = &[ // searched by r_do_augmentative
+    Among("isk", -1, 2, None),
+    Among("\u{00E1}k", -1, 1, None),
+    Among("izn", -1, 2, None),
+    Among("ajzn", -1, 1, None),
+];
+
+static A_6: &'static [Among; 42] = &[ // searched by r_do_diminutive
+    Among("k", -1, 1, None),
+    Among("ak", 0, 7, None),
+    Among("ek", 0, 2, None),
+    Among("anek", 2, 1, None),
+    Among("enek", 2, 2, None),
+    Among("inek", 2, 4, None),
+    Among("onek", 2, 1, None),
+    Among("unek", 2, 1, None),
+    Among("\u{00E1}nek", 2, 1, None),
+    Among("a\u{010D}ek", 2, 1, None),
+    Among("e\u{010D}ek", 2, 2, None),
+    Among("i\u{010D}ek", 2, 4, None),
+    Among("o\u{010D}ek", 2, 1, None),
+    Among("u\u{010D}ek", 2, 1, None),
+    Among("\u{00E1}\u{010D}ek", 2, 1, None),
+    Among("\u{00E9}\u{010D}ek", 2, 3, None),
+    Among("\u{00ED}\u{010D}ek", 2, 5, None),
+    Among("ou\u{0161}ek", 2, 1, None),
+    Among("ik", 0, 4, None),
+    Among("ank", 0, 1, None),
+    Among("enk", 0, 1, None),
+    Among("ink", 0, 1, None),
+    Among("onk", 0, 1, None),
+    Among("unk", 0, 1, None),
+    Among("\u{00E1}nk", 0, 1, None),
+    Among("\u{00E9}nk", 0, 1, None),
+    Among("\u{00ED}nk", 0, 1, None),
+    Among("ok", 0, 8, None),
+    Among("\u{00E1}tk", 0, 1, None),
+    Among("uk", 0, 9, None),
+    Among("a\u{010D}k", 0, 1, None),
+    Among("e\u{010D}k", 0, 1, None),
+    Among("i\u{010D}k", 0, 1, None),
+    Among("o\u{010D}k", 0, 1, None),
+    Among("u\u{010D}k", 0, 1, None),
+    Among("\u{00E1}\u{010D}k", 0, 1, None),
+    Among("\u{00E9}\u{010D}k", 0, 1, None),
+    Among("\u{00ED}\u{010D}k", 0, 1, None),
+    Among("\u{00E1}k", 0, 6, None),
+    Among("u\u{0161}k", 0, 1, None),
+    Among("\u{00E9}k", 0, 3, None),
+    Among("\u{00ED}k", 0, 5, None),
+];
+
+static A_7: &'static [Among; 2] = &[ // searched by r_do_comparative
+    Among("ej\u{0161}", -1, 2, None),
+    Among("\u{011B}j\u{0161}", -1, 1, None),
+];
+
+static G_v: &'static [u8; 34] = &[17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 17, 4, 18, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64]; // data for grouping v; r_mark_regions passes it with the bounds 97 and 367
+
+#[derive(Clone)]
+struct Context { // per-call state: the two region marks
+    i_p1: usize, // mark p1; compared with the cursor in r_R1
+    i_pV: usize, // mark pV; compared with the cursor in r_RV
+}
+
+fn r_mark_regions(env: &mut SnowballEnv, context: &mut Context) -> bool { // sets i_pV and i_p1; always returns true
+    context.i_pV = env.limit; // both marks start at env.limit
+    context.i_p1 = env.limit;
+    let v_1 = env.cursor; // saved so the cursor can be restored before returning
+    'lab0: loop { // single-pass block: every path leaves through break 'lab0
+        'golab1: loop { // advance one character at a time until out_grouping succeeds
+            'lab2: loop {
+                if !env.out_grouping(G_v, 97, 367) {
+                    break 'lab2;
+                }
+                break 'golab1;
+            }
+            if env.cursor >= env.limit { // ran out of input: leave the marks as they are
+                break 'lab0;
+            }
+            env.next_char();
+        }
+        context.i_pV = env.cursor;
+        'golab3: loop { // same scan again, from the pV position
+            'lab4: loop {
+                if !env.out_grouping(G_v, 97, 367) {
+                    break 'lab4;
+                }
+                break 'golab3;
+            }
+            if env.cursor >= env.limit {
+                break 'lab0;
+            }
+            env.next_char();
+        }
+        'golab5: loop { // then advance until in_grouping succeeds
+            'lab6: loop {
+                if !env.in_grouping(G_v, 97, 367) {
+                    break 'lab6;
+                }
+                break 'golab5;
+            }
+            if env.cursor >= env.limit {
+                break 'lab0;
+            }
+            env.next_char();
+        }
+        context.i_p1 = env.cursor;
+        break 'lab0;
+    }
+    env.cursor = v_1; // restore the cursor saved on entry
+    return true
+}
+
+fn r_RV(env: &mut SnowballEnv, context: &mut Context) -> bool { // true when the cursor is at or beyond i_pV
+    return context.i_pV <= env.cursor
+}
+
+fn r_R1(env: &mut SnowballEnv, context: &mut Context) -> bool { // true when the cursor is at or beyond i_p1
+    return context.i_p1 <= env.cursor
+}
+
+fn r_palatalise(env: &mut SnowballEnv, context: &mut Context) -> bool { // rewrites a suffix from A_0 to "k", "h", "ck" or "sk"; false when nothing was rewritten
+    let mut among_var;
+    env.ket = env.cursor; // ket: end of the slice, the cursor before the backward search
+    among_var = env.find_among_b(A_0, context);
+    if among_var == 0 { // 0 is treated as "no entry matched"
+        return false;
+    }
+    env.bra = env.cursor; // bra: start of the slice, the cursor after the search
+    if !r_RV(env, context) { // the match is rejected unless the cursor is at or beyond i_pV
+        return false;
+    }
+    match among_var {
+        1 => {
+            if !env.slice_from("k") {
+                return false;
+            }
+        }
+        2 => {
+            if !env.slice_from("h") {
+                return false;
+            }
+        }
+        3 => {
+            if !env.slice_from("ck") {
+                return false;
+            }
+        }
+        4 => {
+            if !env.slice_from("sk") {
+                return false;
+            }
+        }
+        _ => ()
+    }
+    return true
+}
+
+fn r_do_possessive(env: &mut SnowballEnv, context: &mut Context) -> bool { // deletes a suffix from A_1; false when none matches or r_RV rejects it
+    let mut among_var;
+    env.ket = env.cursor;
+    among_var = env.find_among_b(A_1, context);
+    if among_var == 0 {
+        return false;
+    }
+    env.bra = env.cursor;
+    if !r_RV(env, context) {
+        return false;
+    }
+    match among_var {
+        1 => {
+            if !env.slice_del() {
+                return false;
+            }
+        }
+        2 => {
+            if !env.slice_del() {
+                return false;
+            }
+            let v_1 = env.limit - env.cursor; // cursor saved as a distance from env.limit
+            'lab0: loop { // optional step: runs once, a failure is absorbed
+                if !r_palatalise(env, context) {
+                    env.cursor = env.limit - v_1; // palatalise failed: restore the cursor and carry on
+                    break 'lab0;
+                }
+                break 'lab0;
+            }
+        }
+        _ => ()
+    }
+    return true
+}
+
+fn r_do_case(env: &mut SnowballEnv, context: &mut Context) -> bool { // handles a suffix from A_2; unlike its neighbours it makes no r_RV / r_R1 test
+    let mut among_var;
+    env.ket = env.cursor;
+    among_var = env.find_among_b(A_2, context);
+    if among_var == 0 {
+        return false;
+    }
+    env.bra = env.cursor;
+    match among_var {
+        1 => { // plain deletion
+            if !env.slice_del() {
+                return false;
+            }
+        }
+        2 => { // deletion followed by an optional palatalise
+            if !env.slice_del() {
+                return false;
+            }
+            let v_1 = env.limit - env.cursor;
+            'lab0: loop {
+                if !r_palatalise(env, context) {
+                    env.cursor = env.limit - v_1;
+                    break 'lab0;
+                }
+                break 'lab0;
+            }
+        }
+        3 => { // suffix replaced by "e", then an optional palatalise
+            if !env.slice_from("e") {
+                return false;
+            }
+            let v_2 = env.limit - env.cursor;
+            'lab1: loop {
+                if !r_palatalise(env, context) {
+                    env.cursor = env.limit - v_2;
+                    break 'lab1;
+                }
+                break 'lab1;
+            }
+        }
+        _ => ()
+    }
+    return true
+}
+
+fn r_do_derivational(env: &mut SnowballEnv, context: &mut Context) -> bool { // handles a suffix from A_3; the match is rejected unless r_R1 holds
+    let mut among_var;
+    env.ket = env.cursor;
+    among_var = env.find_among_b(A_3, context);
+    if among_var == 0 {
+        return false;
+    }
+    env.bra = env.cursor;
+    if !r_R1(env, context) {
+        return false;
+    }
+    match among_var {
+        1 => {
+            if !env.slice_del() {
+                return false;
+            }
+        }
+        2 => {
+            if !env.slice_from("i") {
+                return false;
+            }
+            if !r_palatalise(env, context) { // mandatory here: this failure is returned, and nothing in this function undoes the slice_from above
+                return false;
+            }
+        }
+        3 => {
+            if !env.slice_from("e") {
+                return false;
+            }
+            if !r_palatalise(env, context) {
+                return false;
+            }
+        }
+        4 => {
+            if !env.slice_from("\u{00E9}") {
+                return false;
+            }
+            if !r_palatalise(env, context) {
+                return false;
+            }
+        }
+        5 => {
+            if !env.slice_from("\u{011B}") {
+                return false;
+            }
+            if !r_palatalise(env, context) {
+                return false;
+            }
+        }
+        6 => {
+            if !env.slice_from("\u{00ED}") {
+                return false;
+            }
+            if !r_palatalise(env, context) {
+                return false;
+            }
+        }
+        _ => ()
+    }
+    return true
+}
+
+fn r_do_deriv_single(env: &mut SnowballEnv, context: &mut Context) -> bool { // deletes a suffix from A_4; false when none matches
+    env.ket = env.cursor;
+    if env.find_among_b(A_4, context) == 0 {
+        return false;
+    }
+    env.bra = env.cursor;
+    if !env.slice_del() {
+        return false;
+    }
+    return true
+}
+
+fn r_do_augmentative(env: &mut SnowballEnv, context: &mut Context) -> bool { // handles a suffix from A_5
+    let mut among_var;
+    env.ket = env.cursor;
+    among_var = env.find_among_b(A_5, context);
+    if among_var == 0 {
+        return false;
+    }
+    env.bra = env.cursor;
+    match among_var {
+        1 => {
+            if !env.slice_del() {
+                return false;
+            }
+        }
+        2 => {
+            if !env.slice_from("i") {
+                return false;
+            }
+            if !r_palatalise(env, context) {
+                return false;
+            }
+        }
+        _ => ()
+    }
+    return true
+}
+
+fn r_do_diminutive(env: &mut SnowballEnv, context: &mut Context) -> bool { // handles a suffix from A_6
+    let mut among_var;
+    env.ket = env.cursor;
+    among_var = env.find_among_b(A_6, context);
+    if among_var == 0 {
+        return false;
+    }
+    env.bra = env.cursor;
+    match among_var {
+        1 => {
+            if !env.slice_del() {
+                return false;
+            }
+        }
+        2 => {
+            if !env.slice_from("e") {
+                return false;
+            }
+            if !r_palatalise(env, context) {
+                return false;
+            }
+        }
+        3 => {
+            if !env.slice_from("\u{00E9}") {
+                return false;
+            }
+            if !r_palatalise(env, context) {
+                return false;
+            }
+        }
+        4 => {
+            if !env.slice_from("i") {
+                return false;
+            }
+            if !r_palatalise(env, context) {
+                return false;
+            }
+        }
+        5 => {
+            if !env.slice_from("\u{00ED}") {
+                return false;
+            }
+            if !r_palatalise(env, context) {
+                return false;
+            }
+        }
+        6 => { // arms 6 to 9 only replace the suffix by a vowel; r_palatalise is not called
+            if !env.slice_from("\u{00E1}") {
+                return false;
+            }
+        }
+        7 => {
+            if !env.slice_from("a") {
+                return false;
+            }
+        }
+        8 => {
+            if !env.slice_from("o") {
+                return false;
+            }
+        }
+        9 => {
+            if !env.slice_from("u") {
+                return false;
+            }
+        }
+        _ => ()
+    }
+    return true
+}
+
+fn r_do_comparative(env: &mut SnowballEnv, context: &mut Context) -> bool { // handles a suffix from A_7
+    let mut among_var;
+    env.ket = env.cursor;
+    among_var = env.find_among_b(A_7, context);
+    if among_var == 0 {
+        return false;
+    }
+    env.bra = env.cursor;
+    match among_var {
+        1 => {
+            if !env.slice_from("\u{011B}") {
+                return false;
+            }
+            if !r_palatalise(env, context) {
+                return false;
+            }
+        }
+        2 => {
+            if !env.slice_from("e") {
+                return false;
+            }
+            if !r_palatalise(env, context) {
+                return false;
+            }
+        }
+        _ => ()
+    }
+    return true
+}
+
+fn r_do_aggressive(env: &mut SnowballEnv, context: &mut Context) -> bool { // three optional steps, then derivational-or-single; false only when both of the last two fail
+    let v_1 = env.limit - env.cursor;
+    r_do_comparative(env, context); // result ignored: the step is optional
+    env.cursor = env.limit - v_1; // cursor restored after each optional step
+    let v_2 = env.limit - env.cursor;
+    r_do_diminutive(env, context);
+    env.cursor = env.limit - v_2;
+    let v_3 = env.limit - env.cursor;
+    r_do_augmentative(env, context);
+    env.cursor = env.limit - v_3;
+    'lab0: loop {
+        let v_4 = env.limit - env.cursor;
+        'lab1: loop {
+            if !r_do_derivational(env, context) {
+                break 'lab1;
+            }
+            break 'lab0; // derivational step succeeded: skip the fallback
+        }
+        env.cursor = env.limit - v_4; // derivational step failed: restore the cursor and try the fallback
+        if !r_do_deriv_single(env, context) {
+            return false;
+        }
+        break 'lab0;
+    }
+    return true
+}
+
+pub fn stem(env: &mut SnowballEnv) -> bool { // entry point referenced from lib.rs as algorithms::czech::stem
+    let mut context = &mut Context {
+        i_p1: 0,
+        i_pV: 0,
+    };
+    r_mark_regions(env, context); // result ignored
+    env.limit_backward = env.cursor; // the steps below run from env.limit back towards this position
+    env.cursor = env.limit;
+    if !r_do_case(env, context) { // a failing step returns here, before the cursor reset at the end of this function
+        return false;
+    }
+    if !r_do_possessive(env, context) {
+        return false;
+    }
+    if !r_do_aggressive(env, context) {
+        return false;
+    }
+    env.cursor = env.limit_backward;
+    return true
+}
diff --git a/src/snowball/algorithms/mod.rs b/src/snowball/algorithms/mod.rs
index c1c4073..dec6c8d 100644
--- a/src/snowball/algorithms/mod.rs
+++ b/src/snowball/algorithms/mod.rs
@@ -1,5 +1,6 @@
 pub mod arabic;
 pub mod armenian;
+pub mod czech;
 pub mod danish;
 pub mod dutch;
 pub mod english;