ultra-nlp

Install

cargo add ultra-nlp

Usage

ngrams

let text = "你好世界";

let result = ngrams(text, 2);

assert_eq!(
    result
        .into_iter()
        .collect::<Vec<&str>>(),
    vec!["你好", "好世", "世界"]
);

extract_consecutive_chinese_chars

let text = "foo中文bar字符baz";

let result = extract_consecutive_chinese_chars(text);

assert_eq!(
    result
        .into_iter()
        .collect::<Vec<&str>>(),
    vec!["中文", "字符"]
);

extract_consecutive_letters

let text = "foo中文，bar,字符baz";

let result = extract_consecutive_letters(text);

assert_eq!(
    result
        .into_iter()
        .collect::<Vec<&str>>(),
    vec!["foo中文", "bar", "字符baz"]
);

cedarwood(slow, low memory usage)

Ignore unmatched contents

use ultra_nlp::BehaviorForUnmatched;
use ultra_nlp::cedarwood::{
    segment_fully,
    ForwardDictionary,
};

let text = " 南京市长江大桥, hello world ";
let dict = ForwardDictionary::new(
    vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
).unwrap();

let result = segment_fully(
    text,
    &dict,
    BehaviorForUnmatched::Ignore
);

assert_eq!(
    result
        .iter()
        .map(|x| x.range().extract(text))
        .collect::<Vec<_>>(),
    vec!["南京", "南京市", "市长", "长江", "大桥"]
);

Keep unmatched contents as chars

use ultra_nlp::BehaviorForUnmatched;
use ultra_nlp::cedarwood::{
    segment_fully,
    ForwardDictionary,
};

let text = " 南京市长江大桥, hello world ";
let dict = ForwardDictionary::new(
    vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
).unwrap();

let result = segment_fully(
    text,
    &dict,
    BehaviorForUnmatched::KeepAsChars
);

assert_eq!(
    result
        .iter()
        .map(|x| x.range().extract(text))
        .collect::<Vec<_>>(),
    vec![
        " ", "南京", "南京市", "市长", "长江", "大桥", ",", " ", "h", "e", "l", "l", "o", " ", "w", "o", "r", "l", "d", " ",
    ]
);

Keep unmatched contents as words

use ultra_nlp::BehaviorForUnmatched;
use ultra_nlp::cedarwood::{
    segment_fully,
    ForwardDictionary,
};

let text = " 南京市长江大桥, hello world ";
let dict = ForwardDictionary::new(
    vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
).unwrap();

let result = segment_fully(
    text,
    &dict,
    BehaviorForUnmatched::KeepAsWords
);

assert_eq!(
    result
        .iter()
        .map(|x| x.range().extract(text))
        .collect::<Vec<_>>(),
    vec![
        " ", "南京", "南京市", "市长", "长江", "大桥", ", hello world ",
    ]
);

daachorse(fast, high memory usage)

Ignore unmatched contents

use ultra_nlp::BehaviorForUnmatched;
use ultra_nlp::daachorse::{
    segment_fully,
    StandardDictionary,
};

let text = " 南京市长江大桥, hello world ";
let dict = StandardDictionary::new(
    vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
).unwrap();

let result = segment_fully(text, &dict, BehaviorForUnmatched::Ignore);

assert_eq!(
    result
        .iter()
        .map(|x| x.range().extract(text))
        .collect::<Vec<_>>(),
    vec![
      "南京", "南京市", "市长", "长江", "大桥",
    ]
);

Keep unmatched contents as chars

use ultra_nlp::BehaviorForUnmatched;
use ultra_nlp::daachorse::{
    segment_fully,
    StandardDictionary,
};

let text = " 南京市长江大桥, hello world ";
let dict = StandardDictionary::new(
    vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
).unwrap();

let result = segment_fully(text, &dict, BehaviorForUnmatched::KeepAsChars);

assert_eq!(
    result
        .iter()
        .map(|x| x.range().extract(text))
        .collect::<Vec<_>>(),
    vec![
        " ", "南京", "南京市", "市长", "长江", "大桥", ",", " ", "h", "e", "l", "l", "o", " ", "w", "o", "r", "l", "d", " ",
    ]
);

Keep unmatched contents as words

use ultra_nlp::BehaviorForUnmatched;
use ultra_nlp::daachorse::{
    segment_fully,
    StandardDictionary,
};

let text = " 南京市长江大桥, hello world ";
let dict = StandardDictionary::new(
    vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
).unwrap();

let result = segment_fully(text, &dict, BehaviorForUnmatched::KeepAsWords);

assert_eq!(
    result
        .iter()
        .map(|x| x.range().extract(text))
        .collect::<Vec<_>>(),
    vec![
        " ", "南京", "南京市", "市长", "长江", "大桥", ", hello world ",
    ]
);

BlackGlory / ultra-nlp

ultra-nlp

Install

Usage

ngrams

extract_consecutive_chinese_chars

extract_consecutive_letters

cedarwood(slow, low memory usage)

Ignore unmatched contents

Keep unmatched contents as chars

Keep unmatched contents as words

daachorse(fast, high memory usage)

Ignore unmatched contents

Keep unmatched contents as chars

Keep unmatched contents as words

About

Languages