ikawaha / kagome

Self-contained Japanese Morphological Analyzer written in pure Go

JSON output option for tokenize command

KEINOS opened this issue

I would like a -json option to make the command-line usage more powerful, by letting the command emit its output as JSON.

$ # Default
$ echo "私は鰻" | kagome
私      名詞,代名詞,一般,*,*,*,私,ワタシ,ワタシ
は      助詞,係助詞,*,*,*,*,は,ハ,ワ
鰻      名詞,一般,*,*,*,*,鰻,ウナギ,ウナギ
EOS

$ # With -json option
$ echo "私は鰻" | kagome -json
[
{"surface":"BOS","features":null},
{"surface":"私","features":["名詞","代名詞","一般","*","*","*","私","ワタシ","ワタシ"]},
{"surface":"は","features":["助詞","係助詞","*","*","*","*","は","ハ","ワ"]},
{"surface":"鰻","features":["名詞","一般","*","*","*","*","鰻","ウナギ","ウナギ"]},
{"surface":"EOS","features":null}
]
  • Use cases
$ # Cooperate with other commands
$ echo "私は鰻" | go run . -json | jq .
[
  {
    "surface": "BOS",
    "features": null
  },
  {
    "surface": "私",
    "features": [
      "名詞",
      "代名詞",
      "一般",
      "*",
      "*",
      "*",
      "私",
      "ワタシ",
      "ワタシ"
    ]
  },
  {
    "surface": "は",
    "features": [
      "助詞",
      "係助詞",
      "*",
      "*",
      "*",
      "*",
      "は",
      "ハ",
      "ワ"
    ]
  },
  {
    "surface": "鰻",
    "features": [
      "名詞",
      "一般",
      "*",
      "*",
      "*",
      "*",
      "鰻",
      "ウナギ",
      "ウナギ"
    ]
  },
  {
    "surface": "EOS",
    "features": null
  }
]

$ echo "私は鰻" | go run . -json | jq -r '.[].features[8] | select(. != null)'
ワタシ
ワ
ウナギ

$ # TTS for example
$ echo "私は鰻" | go run . -json | jq -r '.[].features[8] | select(. != null)' | say

Sample implementation
  • kagome/cmd/tokenize/cmd.go

    Lines 158 to 171 in 92102e0

for s.Scan() {
	sen := s.Text()
	tokens := t.Analyze(sen, mode)
	for i, size := 1, len(tokens); i < size; i++ {
		tok := tokens[i]
		c := tok.Features()
		if tok.Class == tokenizer.DUMMY {
			fmt.Printf("%s\n", tok.Surface)
		} else {
			fmt.Printf("%s\t%v\n", tok.Surface, strings.Join(c, ","))
		}
	}
}
return s.Err()
+	// TODO: Capture option flag
+	var flagJSONIsUp = true

-	for s.Scan() {
-		sen := s.Text()
-		tokens := t.Analyze(sen, mode)
-		for i, size := 1, len(tokens); i < size; i++ {
-			tok := tokens[i]
-			c := tok.Features()
-			if tok.Class == tokenizer.DUMMY {
-				fmt.Printf("%s\n", tok.Surface)
-			} else {
-				fmt.Printf("%s\t%v\n", tok.Surface, strings.Join(c, ","))
-			}
-		}
-	}
-	return s.Err()

+	return ScanTokens(s, t, mode, flagJSONIsUp)
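
To resolve the TODO above, the boolean could come from the command's option parsing. Below is a minimal sketch using the standard flag package (the flag name json, the args variable, and the surrounding wiring are illustrative assumptions, not the actual structure of cmd.go):

	// Hypothetical sketch only: register and read a -json boolean flag.
	// args stands for whatever arguments the tokenize subcommand receives.
	fs := flag.NewFlagSet("tokenize", flag.ContinueOnError)
	jsonOut := fs.Bool("json", false, "output the tokens as a JSON array")
	if err := fs.Parse(args); err != nil {
		return err
	}
	// ... set up the scanner s, tokenizer t, and mode as before ...
	return ScanTokens(s, t, mode, *jsonOut)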
  • kagome/cmd/tokenize/PrintToken.go
package tokenize

import (
	"bufio"
	"encoding/json"
	"fmt"
	"strings"

	"github.com/ikawaha/kagome/v2/tokenizer"
)

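// TokenedJSON represents a token in its JSON output form.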
type TokenedJSON struct {
	Surface  string   `json:"surface"`
	Features []string `json:"features"`
}

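// parseToJSON marshals a surface form and its features into a JSON object.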
func parseToJSON(surface string, features []string) ([]byte, error) {
	return json.Marshal(TokenedJSON{
		Surface:  surface,
		Features: features,
	})
}

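// PrintDefault prints each token in the default tab-separated format.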
func PrintDefault(s *bufio.Scanner, t *tokenizer.Tokenizer, mode tokenizer.TokenizeMode) error {
	for s.Scan() {
		sen := s.Text()
		tokens := t.Analyze(sen, mode)

		for i, size := 1, len(tokens); i < size; i++ {
			tok := tokens[i]
			c := tok.Features()
			if tok.Class == tokenizer.DUMMY {
				fmt.Printf("%s\n", tok.Surface)
			} else {
				fmt.Printf("%s\t%v\n", tok.Surface, strings.Join(c, ","))
			}
		}
	}

	return s.Err()
}

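// PrintJSON prints the tokens of all scanned lines as a single JSON array.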
func PrintJSON(s *bufio.Scanner, t *tokenizer.Tokenizer, mode tokenizer.TokenizeMode) (err error) {
	var buff []byte

	fmt.Println("[") // Begin bracket

	for s.Scan() {
		sen := s.Text()
		tokens := t.Analyze(sen, mode)

		for _, tok := range tokens {
			c := tok.Features()

			if len(buff) > 0 {
				fmt.Printf("%s,\n", buff) // Print with comma
			}

			if buff, err = parseToJSON(tok.Surface, c); err != nil {
				return err
			}

		}
	}

	if s.Err() == nil {
		fmt.Printf("%s\n", buff) // Spit the last buffer w/no comma
		fmt.Println("]")         // End bracket
	}

	return s.Err()

}

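// ScanTokens tokenizes each scanned line and prints the result, as JSON when jsonOut is set.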
func ScanTokens(s *bufio.Scanner, t *tokenizer.Tokenizer, mode tokenizer.TokenizeMode, jsonOut bool) error {
	if !jsonOut {
		return PrintDefault(s, t, mode)
	}

	return PrintJSON(s, t, mode)
}
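
For context, here is a minimal usage sketch showing how ScanTokens could be driven end to end. It assumes the github.com/ikawaha/kagome-dict/ipa dictionary module and an "os" import; the runExample wrapper itself is hypothetical, not part of the proposal:

// runExample is a hypothetical driver: it reads stdin line by line and
// prints every token, switching the output format on jsonOut.
func runExample(jsonOut bool) error {
	t, err := tokenizer.New(ipa.Dict())
	if err != nil {
		return err
	}
	s := bufio.NewScanner(os.Stdin)
	return ScanTokens(s, t, tokenizer.Normal, jsonOut)
}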

If the above sample implementation sounds OK to you, I would be happy to draft a PR.

That sounds good! 👍
How about formatting the token JSON the same way as kagome's server mode response format?

   {
      "id": 304999,
      "start": 0,
      "end": 1,
      "surface": "",
      "class": "KNOWN",
      "pos": [
        "名詞",
        "代名詞",
        "一般",
        "*"
      ],
      "base_form": "",
      "reading": "ワタシ",
      "pronunciation": "ワタシ",
      "features": [
        "名詞",
        "代名詞",
        "一般",
        "*",
        "*",
        "*",
        "",
        "ワタシ",
        "ワタシ"
      ]
    }

kagome/cmd/server/api.go

Lines 64 to 75 in 92102e0

m := record{
	ID:       tok.ID,
	Start:    tok.Start,
	End:      tok.End,
	Surface:  tok.Surface,
	Class:    fmt.Sprintf("%v", tok.Class),
	POS:      tok.POS(),
	Features: tok.Features(),
}
m.BaseForm, _ = tok.BaseForm()
m.Reading, _ = tok.Reading()
m.Pronunciation, _ = tok.Pronunciation()

How about formatting the token JSON the same way as kagome's server mode response format?

I agree! 👍

That's more uniform and informative!

$ echo "" | go run . -json | jq -r .
[
  {
    "id": 304999,
    "start": 0,
    "end": 1,
    "surface": "私",
    "class": "KNOWN",
    "pos": [
      "名詞",
      "代名詞",
      "一般",
      "*"
    ],
    "base_form": "私",
    "reading": "ワタシ",
    "pronunciation": "ワタシ",
    "features": [
      "名詞",
      "代名詞",
      "一般",
      "*",
      "*",
      "*",
      "私",
      "ワタシ",
      "ワタシ"
    ]
  }
]
Fixed Code
// kagome/cmd/tokenize/PrintToken.go
package tokenize

import (
	"bufio"
	"encoding/json"
	"fmt"
	"strings"

	"github.com/ikawaha/kagome/v2/tokenizer"
)

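// TokenedJSON represents a token in the same JSON form as the server mode response.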
type TokenedJSON struct {
	ID            int      `json:"id"`
	Start         int      `json:"start"`
	End           int      `json:"end"`
	Surface       string   `json:"surface"`
	Class         string   `json:"class"`
	POS           []string `json:"pos"`
	BaseForm      string   `json:"base_form"`
	Reading       string   `json:"reading"`
	Pronunciation string   `json:"pronunciation"`
	Features      []string `json:"features"`
}

// ParseTokenToJSON parses the token to JSON in the same format as the server mode response does.
func ParseTokenToJSON(tok tokenizer.Token) ([]byte, error) {
	j := TokenedJSON{
		ID:       tok.ID,
		Start:    tok.Start,
		End:      tok.End,
		Surface:  tok.Surface,
		Class:    fmt.Sprintf("%v", tok.Class),
		POS:      tok.POS(),
		Features: tok.Features(),
	}

	j.BaseForm, _ = tok.BaseForm()
	j.Reading, _ = tok.Reading()
	j.Pronunciation, _ = tok.Pronunciation()

	return json.Marshal(j)
}

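// PrintTokensDefault prints each token in the default tab-separated format.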
func PrintTokensDefault(s *bufio.Scanner, t *tokenizer.Tokenizer, mode tokenizer.TokenizeMode) error {
	for s.Scan() {
		sen := s.Text()
		tokens := t.Analyze(sen, mode)

		for i, size := 1, len(tokens); i < size; i++ {
			tok := tokens[i]
			c := tok.Features()
			if tok.Class == tokenizer.DUMMY {
				fmt.Printf("%s\n", tok.Surface)
			} else {
				fmt.Printf("%s\t%v\n", tok.Surface, strings.Join(c, ","))
			}
		}
	}

	return s.Err()
}

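// PrintTokensInJSON prints all tokens as a single JSON array, skipping BOS/EOS entries.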
func PrintTokensInJSON(s *bufio.Scanner, t *tokenizer.Tokenizer, mode tokenizer.TokenizeMode) (err error) {
	var buff []byte

	fmt.Println("[") // Begin bracket

	for s.Scan() {
		sen := s.Text()
		tokens := t.Analyze(sen, mode)

		for _, tok := range tokens {
			if tok.ID == tokenizer.BosEosID {
				continue
			}

			if len(buff) > 0 {
				fmt.Printf("%s,\n", buff) // Print with comma
			}

			if buff, err = ParseTokenToJSON(tok); err != nil {
				return err
			}
		}
	}

	if s.Err() == nil {
		fmt.Printf("%s\n", buff) // Spit the last buffer w/no comma
		fmt.Println("]")         // End bracket
	}

	return s.Err()

}

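// ScanTokens tokenizes each scanned line and prints the result, as JSON when jsonOut is set.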
func ScanTokens(s *bufio.Scanner, t *tokenizer.Tokenizer, mode tokenizer.TokenizeMode, jsonOut bool) error {
	if !jsonOut {
		return PrintTokensDefault(s, t, mode)
	}

	return PrintTokensInJSON(s, t, mode)
}
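
As a design note, PrintTokensInJSON streams its output: it keeps only one marshaled token in buff and flushes it (with a trailing comma) after the next token proves more output is coming, so memory use stays flat however large the input is. A simpler alternative would buffer everything and marshal the slice in one call; here is a hypothetical sketch of that trade-off, reusing ParseTokenToJSON:

// printTokensInJSONBuffered is a hypothetical alternative, not part of the
// proposal: it collects every token and marshals the whole array at once.
// The comma bookkeeping disappears, but all tokens are held in memory.
func printTokensInJSONBuffered(s *bufio.Scanner, t *tokenizer.Tokenizer, mode tokenizer.TokenizeMode) error {
	out := []json.RawMessage{} // non-nil so empty input still prints "[]"
	for s.Scan() {
		for _, tok := range t.Analyze(s.Text(), mode) {
			if tok.ID == tokenizer.BosEosID {
				continue // skip BOS/EOS dummy tokens, as above
			}
			b, err := ParseTokenToJSON(tok)
			if err != nil {
				return err
			}
			out = append(out, b)
		}
	}
	if err := s.Err(); err != nil {
		return err
	}
	buf, err := json.MarshalIndent(out, "", "  ")
	if err != nil {
		return err
	}
	fmt.Printf("%s\n", buf)
	return nil
}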

@ikawaha

I would like to draft a PR for the above changes. Which branch should I target? Is v2 OK?

🙆‍♀️ Please create your PR against the v2 branch.

Featured in PR #247! 🎉

Closing 〜