// Split the text into overlapping 2-grams (n = 2).
let text = "你好世界";
let grams: Vec<&str> = ngrams(text, 2).into_iter().collect();

assert_eq!(grams, vec!["你好", "好世", "世界"]);
extract_consecutive_chinese_chars
// Only the Chinese runs are expected back; the Latin text between them is not part of the result.
let text = "foo中文bar字符baz";
let chinese_runs: Vec<&str> = extract_consecutive_chinese_chars(text)
    .into_iter()
    .collect();

assert_eq!(chinese_runs, vec!["中文", "字符"]);
extract_consecutive_letters
// The commas separate the expected runs; Latin and Chinese letters stay together in one run.
let text = "foo中文,bar,字符baz";
let letter_runs: Vec<&str> = extract_consecutive_letters(text)
    .into_iter()
    .collect();

assert_eq!(letter_runs, vec!["foo中文", "bar", "字符baz"]);
cedarwood (slow, low memory usage)
Ignore unmatched contents
use ultra_nlp::BehaviorForUnmatched,use ultra_nlp::cedarwood::{
segment_fully,ForwardDictionary,};let text = " 南京市长江大桥, hello world ";let dict = ForwardDictionary::new(vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]).unwrap();let result = segment_fully(
text,&dict,BehaviorForUnmatched::Ignore);assert_eq!(
result
.iter()
.map(|x| x.range().extract(text))
.collect::<Vec<_>>(),
vec!["南京", "南京市", "市长", "长江", "大桥"]);