Versatile tokenizer that automatically tags each token with its type
Tokenize sentences and automatically tag each token as a word, email, Twitter handle, and more using wink-tokenizer.
It is a part of wink — a growing family of high-quality packages for Statistical Analysis, Natural Language Processing and Machine Learning in NodeJS.
Use npm to install:
npm install wink-tokenizer --save
// Load tokenizer.
var tokenizer = require( 'wink-tokenizer' );
// Create its instance.
var myTokenizer = tokenizer();
// Just tokenize the sentence...
var s = '@superman: hit me up on my email r2d2@gmail.com, 2 of us plan party🎉 tom at 3pm:) #fun';
myTokenizer.tokenize( s );
// -> [ { token: '@superman', tag: 'mention' },
// { token: ':', tag: 'punctuation' },
// { token: 'hit', tag: 'word' },
// { token: 'me', tag: 'word' },
// { token: 'up', tag: 'word' },
// { token: 'on', tag: 'word' },
// { token: 'my', tag: 'word' },
// { token: 'email', tag: 'word' },
// { token: 'r2d2@gmail.com', tag: 'email' },
// { token: ',', tag: 'punctuation' },
// { token: '2', tag: 'number' },
// { token: 'of', tag: 'word' },
// { token: 'us', tag: 'word' },
// { token: 'plan', tag: 'word' },
// { token: 'party', tag: 'word' },
// { token: '🎉', tag: 'emoji' },
// { token: 'tom', tag: 'word' },
// { token: 'at', tag: 'word' },
// { token: '3pm', tag: 'time' },
// { token: ':)', tag: 'emoticon' },
// { token: '#fun', tag: 'hashtag' } ]
For detailed API docs, check out the documentation at http://winkjs.org/wink-tokenizer/
If you spot a bug that has not yet been reported, raise a new issue or consider fixing it and sending a pull request.
wink-tokenizer is copyright 2017 GRAYPE Systems Private Limited.
It is licensed under the terms of the GNU Affero General Public License as published by the Free Software Foundation, version 3 of the License.