JSON output option for tokenize command
KEINOS opened this issue · comments
I wish to have a -json
option to enhance command-line usage, which makes the command emit its output in JSON.
$ # Default
$ echo "私は鰻" | kagome
私 名詞,代名詞,一般,*,*,*,私,ワタシ,ワタシ
は 助詞,係助詞,*,*,*,*,は,ハ,ワ
鰻 名詞,一般,*,*,*,*,鰻,ウナギ,ウナギ
EOS
$ # With -json option
$ echo "私は鰻" | kagome -json
[
{"surface":"BOS","features":null},
{"surface":"私","features":["名詞","代名詞","一般","*","*","*","私","ワタシ","ワタシ"]},
{"surface":"は","features":["助詞","係助詞","*","*","*","*","は","ハ","ワ"]},
{"surface":"鰻","features":["名詞","一般","*","*","*","*","鰻","ウナギ","ウナギ"]},
{"surface":"EOS","features":null}
]
- Use cases
$ # Cooperate with other commands
$ echo "私は鰻" | go run . -json | jq .
[
{
"surface": "BOS",
"features": null
},
{
"surface": "私",
"features": [
"名詞",
"代名詞",
"一般",
"*",
"*",
"*",
"私",
"ワタシ",
"ワタシ"
]
},
{
"surface": "は",
"features": [
"助詞",
"係助詞",
"*",
"*",
"*",
"*",
"は",
"ハ",
"ワ"
]
},
{
"surface": "鰻",
"features": [
"名詞",
"一般",
"*",
"*",
"*",
"*",
"鰻",
"ウナギ",
"ウナギ"
]
},
{
"surface": "EOS",
"features": null
}
]
$ echo "私は鰻" | go run . -json | jq -r '.[].features[8] | select(. != null)'
ワタシ
ワ
ウナギ
$ # TTS for example
$ echo "私は鰻" | go run . -json | jq -r '.[].features[8] | select(. != null)' | say
Sample implementation
Lines 158 to 171 in 92102e0
+ // TODO: Capture option flag
+ var flagJSONIsUp = true
- for s.Scan() {
- sen := s.Text()
- tokens := t.Analyze(sen, mode)
- for i, size := 1, len(tokens); i < size; i++ {
- tok := tokens[i]
- c := tok.Features()
- if tok.Class == tokenizer.DUMMY {
- fmt.Printf("%s\n", tok.Surface)
- } else {
- fmt.Printf("%s\t%v\n", tok.Surface, strings.Join(c, ","))
- }
- }
- }
- return s.Err()
+ return ScanTokens(s, t, mode, flagJSONIsUp)
- kagome/cmd/tokenize/PrintToken.go
package tokenize
import (
"bufio"
"encoding/json"
"fmt"
"strings"
"github.com/ikawaha/kagome/v2/tokenizer"
)
// TokenedJSON is the JSON representation of a single token:
// its surface form plus the raw feature list.
type TokenedJSON struct {
	Surface  string   `json:"surface"`
	Features []string `json:"features"`
}

// parseToJSON marshals one token's surface and features into a JSON object.
// A nil feature slice is encoded as JSON null (e.g. for BOS/EOS tokens).
func parseToJSON(surface string, features []string) ([]byte, error) {
	tok := TokenedJSON{Surface: surface, Features: features}
	return json.Marshal(tok)
}
// PrintDefault writes tokens in kagome's default text format: one token per
// line as "surface<TAB>comma-joined-features". Dummy tokens (e.g. EOS) print
// only their surface. It returns any error reported by the scanner.
func PrintDefault(s *bufio.Scanner, t *tokenizer.Tokenizer, mode tokenizer.TokenizeMode) error {
	for s.Scan() {
		for i, tok := range t.Analyze(s.Text(), mode) {
			if i == 0 {
				continue // the leading token is skipped, as in the original output
			}
			if tok.Class == tokenizer.DUMMY {
				fmt.Println(tok.Surface)
				continue
			}
			fmt.Printf("%s\t%s\n", tok.Surface, strings.Join(tok.Features(), ","))
		}
	}
	return s.Err()
}
// PrintJSON streams all tokens to stdout as one JSON array. It always holds
// the most recently marshaled token back in a buffer, so that every element
// except the last can be printed with a trailing comma. On a scanner error
// the closing bracket is not emitted and the error is returned.
func PrintJSON(s *bufio.Scanner, t *tokenizer.Tokenizer, mode tokenizer.TokenizeMode) (err error) {
	var pending []byte

	fmt.Println("[") // opening bracket of the JSON array
	for s.Scan() {
		for _, tok := range t.Analyze(s.Text(), mode) {
			// Flush the previously buffered element; it is not the last
			// one, so it gets a comma.
			if len(pending) > 0 {
				fmt.Printf("%s,\n", pending)
			}
			if pending, err = parseToJSON(tok.Surface, tok.Features()); err != nil {
				return err
			}
		}
	}
	if s.Err() == nil {
		fmt.Printf("%s\n", pending) // last element, no trailing comma
		fmt.Println("]")            // closing bracket
	}
	return s.Err()
}
// ScanTokens tokenizes everything read from s and prints it either as JSON
// (jsonOut == true) or in the default tab-separated text format.
func ScanTokens(s *bufio.Scanner, t *tokenizer.Tokenizer, mode tokenizer.TokenizeMode, jsonOut bool) error {
	if jsonOut {
		return PrintJSON(s, t, mode)
	}
	return PrintDefault(s, t, mode)
}
If the way of the above sample implementation sounds ok for you, I would be happy to draft a PR.
That sounds good! 👍
How about formatting the token JSON the same way as server mode response format of kagome?
{
"id": 304999,
"start": 0,
"end": 1,
"surface": "私",
"class": "KNOWN",
"pos": [
"名詞",
"代名詞",
"一般",
"*"
],
"base_form": "私",
"reading": "ワタシ",
"pronunciation": "ワタシ",
"features": [
"名詞",
"代名詞",
"一般",
"*",
"*",
"*",
"私",
"ワタシ",
"ワタシ"
]
}
Lines 64 to 75 in 92102e0
How about formatting the token JSON the same way as server mode response format of kagome?
I agree! 👍
That's more uniform and informative!
$ echo "私" | go run . -json | jq -r .
[
{
"id": 304999,
"start": 0,
"end": 1,
"surface": "私",
"class": "KNOWN",
"pos": [
"名詞",
"代名詞",
"一般",
"*"
],
"base_form": "私",
"reading": "ワタシ",
"pronunciation": "ワタシ",
"features": [
"名詞",
"代名詞",
"一般",
"*",
"*",
"*",
"私",
"ワタシ",
"ワタシ"
]
}
]
Fixed Code
// kagome/cmd/tokenize/PrintToken.go
package tokenize
import (
"bufio"
"encoding/json"
"fmt"
"strings"
"github.com/ikawaha/kagome/v2/tokenizer"
)
// TokenedJSON is the JSON shape of one token. The field set and tags mirror
// the response format of kagome's server mode (see the sample response in
// this thread), so the CLI -json output stays consistent with the HTTP API.
type TokenedJSON struct {
	ID            int      `json:"id"`            // token ID (presumably the dictionary entry ID — confirm against tokenizer docs)
	Start         int      `json:"start"`         // start position of the token in the input
	End           int      `json:"end"`           // end position of the token in the input
	Surface       string   `json:"surface"`       // surface form as it appears in the input
	Class         string   `json:"class"`         // token class rendered as a string, e.g. "KNOWN"
	POS           []string `json:"pos"`           // part-of-speech elements
	BaseForm      string   `json:"base_form"`     // base (dictionary) form
	Reading       string   `json:"reading"`       // reading (katakana in the sample output)
	Pronunciation string   `json:"pronunciation"` // pronunciation (katakana in the sample output)
	Features      []string `json:"features"`      // full raw feature list
}
// ParseTokenToJSON parses the token to JSON in the same format as the server
// mode response does. Lookups that may fail (base form, reading,
// pronunciation) are best-effort: on failure the corresponding JSON field is
// left as the empty string.
func ParseTokenToJSON(tok tokenizer.Token) ([]byte, error) {
	baseForm, _ := tok.BaseForm()           // ignore ok-flag: empty string when absent
	reading, _ := tok.Reading()             // ditto
	pronunciation, _ := tok.Pronunciation() // ditto

	return json.Marshal(TokenedJSON{
		ID:            tok.ID,
		Start:         tok.Start,
		End:           tok.End,
		Surface:       tok.Surface,
		Class:         fmt.Sprintf("%v", tok.Class),
		POS:           tok.POS(),
		BaseForm:      baseForm,
		Reading:       reading,
		Pronunciation: pronunciation,
		Features:      tok.Features(),
	})
}
// PrintTokensDefault writes tokens in kagome's default text format: one token
// per line as "surface<TAB>comma-joined-features". Dummy tokens print only
// their surface. It returns any error reported by the scanner.
func PrintTokensDefault(s *bufio.Scanner, t *tokenizer.Tokenizer, mode tokenizer.TokenizeMode) error {
	for s.Scan() {
		tokens := t.Analyze(s.Text(), mode)
		// Start at index 1: the leading token is skipped, as in the
		// original kagome output.
		for i := 1; i < len(tokens); i++ {
			tok := tokens[i]
			switch {
			case tok.Class == tokenizer.DUMMY:
				fmt.Println(tok.Surface)
			default:
				fmt.Printf("%s\t%s\n", tok.Surface, strings.Join(tok.Features(), ","))
			}
		}
	}
	return s.Err()
}
// PrintTokensInJSON streams all non-BOS/EOS tokens to stdout as one JSON
// array, each element in the server-mode response format. The most recently
// marshaled token is held back in a buffer so that every element except the
// last is printed with a trailing comma. On a scanner error the closing
// bracket is not emitted and the error is returned.
func PrintTokensInJSON(s *bufio.Scanner, t *tokenizer.Tokenizer, mode tokenizer.TokenizeMode) (err error) {
	var pending []byte

	fmt.Println("[") // opening bracket of the JSON array
	for s.Scan() {
		for _, tok := range t.Analyze(s.Text(), mode) {
			// BOS/EOS markers carry no useful features; drop them.
			if tok.ID == tokenizer.BosEosID {
				continue
			}
			// Flush the previously buffered element; it is not the last
			// one, so it gets a comma.
			if len(pending) > 0 {
				fmt.Printf("%s,\n", pending)
			}
			if pending, err = ParseTokenToJSON(tok); err != nil {
				return err
			}
		}
	}
	if s.Err() == nil {
		fmt.Printf("%s\n", pending) // last element, no trailing comma
		fmt.Println("]")            // closing bracket
	}
	return s.Err()
}
// ScanTokens tokenizes everything read from s and prints it either as JSON
// (jsonOut == true) or in the default tab-separated text format.
func ScanTokens(s *bufio.Scanner, t *tokenizer.Tokenizer, mode tokenizer.TokenizeMode, jsonOut bool) error {
	if jsonOut {
		return PrintTokensInJSON(s, t, mode)
	}
	return PrintTokensDefault(s, t, mode)
}
I would like to draft a PR for the above changes. Which branch should I point to? Is the v2 branch OK?
🙆♀️ Please create your PR against the v2 branch.