ikawaha / kagome

Self-contained Japanese Morphological Analyzer written in pure Go

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

JSON array goes wrong on interactive mode

KEINOS opened this issue · comments

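As of v2.6.0, in interactive mode the opening bracket is printed before any input is entered, and the last token of each sentence is not printed until the next sentence is entered, like so.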

$ kagome version
2.6.0
   ipa v1.0.3
   uni v1.1.2

$ kagome -json
[
すもももももももものうち
{"id":36163,"start":0,"end":3,"surface":"すもも","class":"KNOWN","pos":["名詞","一般","*","*"],"base_form":"すもも","reading":"スモモ","pronunciation":"スモモ","features":["名詞","一般","*","*","*","*","すもも","スモモ","スモモ"]},
{"id":73244,"start":3,"end":4,"surface":"も","class":"KNOWN","pos":["助詞","係助詞","*","*"],"base_form":"も","reading":"モ","pronunciation":"モ","features":["助詞","係助詞","*","*","*","*","も","モ","モ"]},
{"id":74988,"start":4,"end":6,"surface":"もも","class":"KNOWN","pos":["名詞","一般","*","*"],"base_form":"もも","reading":"モモ","pronunciation":"モモ","features":["名詞","一般","*","*","*","*","もも","モモ","モモ"]},
{"id":73244,"start":6,"end":7,"surface":"も","class":"KNOWN","pos":["助詞","係助詞","*","*"],"base_form":"も","reading":"モ","pronunciation":"モ","features":["助詞","係助詞","*","*","*","*","も","モ","モ"]},
{"id":74988,"start":7,"end":9,"surface":"もも","class":"KNOWN","pos":["名詞","一般","*","*"],"base_form":"もも","reading":"モモ","pronunciation":"モモ","features":["名詞","一般","*","*","*","*","もも","モモ","モモ"]},
{"id":55829,"start":9,"end":10,"surface":"の","class":"KNOWN","pos":["助詞","連体化","*","*"],"base_form":"の","reading":"ノ","pronunciation":"ノ","features":["助詞","連体化","*","*","*","*","の","ノ","ノ"]},
私は鰻
{"id":8027,"start":8,"end":10,"surface":"うち","class":"KNOWN","pos":["名詞","非自立","副詞可能","*"],"base_form":"うち","reading":"ウチ","pronunciation":"ウチ","features":["名詞","非自立","副詞可能","*","*","*","うち","ウチ","ウチ"]},
{"id":304999,"start":0,"end":1,"surface":"私","class":"KNOWN","pos":["名詞","代名詞","一般","*"],"base_form":"私","reading":"ワタシ","pronunciation":"ワタシ","features":["名詞","代名詞","一般","*","*","*","私","ワタシ","ワタシ"]},
{"id":57061,"start":1,"end":2,"surface":"は","class":"KNOWN","pos":["助詞","係助詞","*","*"],"base_form":"は","reading":"ハ","pronunciation":"ワ","features":["助詞","係助詞","*","*","*","*","は","ハ","ワ"]},
^Csignal: interrupt

Note the position of the [ bracket and the first token printed after entering "私は鰻".

The last token of the previous sentence "すもももももももものうち", namely "うち" (ID 8027), is printed only after the next sentence has been entered.

The expected behavior may be as below.

$ kagome -json
すもももももももものうち
[
{"id":36163,"start":0,"end":3,"surface":"すもも","class":"KNOWN","pos":["名詞","一般","*","*"],"base_form":"すもも","reading":"スモモ","pronunciation":"スモモ","features":["名詞","一般","*","*","*","*","すもも","スモモ","スモモ"]},
{"id":73244,"start":3,"end":4,"surface":"も","class":"KNOWN","pos":["助詞","係助詞","*","*"],"base_form":"も","reading":"モ","pronunciation":"モ","features":["助詞","係助詞","*","*","*","*","も","モ","モ"]},
{"id":74988,"start":4,"end":6,"surface":"もも","class":"KNOWN","pos":["名詞","一般","*","*"],"base_form":"もも","reading":"モモ","pronunciation":"モモ","features":["名詞","一般","*","*","*","*","もも","モモ","モモ"]},
{"id":73244,"start":6,"end":7,"surface":"も","class":"KNOWN","pos":["助詞","係助詞","*","*"],"base_form":"も","reading":"モ","pronunciation":"モ","features":["助詞","係助詞","*","*","*","*","も","モ","モ"]},
{"id":74988,"start":7,"end":9,"surface":"もも","class":"KNOWN","pos":["名詞","一般","*","*"],"base_form":"もも","reading":"モモ","pronunciation":"モモ","features":["名詞","一般","*","*","*","*","もも","モモ","モモ"]},
{"id":55829,"start":9,"end":10,"surface":"の","class":"KNOWN","pos":["助詞","連体化","*","*"],"base_form":"の","reading":"ノ","pronunciation":"ノ","features":["助詞","連体化","*","*","*","*","の","ノ","ノ"]},
{"id":8027,"start":10,"end":12,"surface":"うち","class":"KNOWN","pos":["名詞","非自立","副詞可能","*"],"base_form":"うち","reading":"ウチ","pronunciation":"ウチ","features":["名詞","非自立","副詞可能","*","*","*","うち","ウチ","ウチ"]}
]
私は鰻
[
{"id":304999,"start":0,"end":1,"surface":"私","class":"KNOWN","pos":["名詞","代名詞","一般","*"],"base_form":"私","reading":"ワタシ","pronunciation":"ワタシ","features":["名詞","代名詞","一般","*","*","*","私","ワタシ","ワタシ"]},
{"id":57061,"start":1,"end":2,"surface":"は","class":"KNOWN","pos":["助詞","係助詞","*","*"],"base_form":"は","reading":"ハ","pronunciation":"ワ","features":["助詞","係助詞","*","*","*","*","は","ハ","ワ"]},
{"id":387420,"start":2,"end":3,"surface":"鰻","class":"KNOWN","pos":["名詞","一般","*","*"],"base_form":"鰻","reading":"ウナギ","pronunciation":"ウナギ","features":["名詞","一般","*","*","*","*","鰻","ウナギ","ウナギ"]}
]
^Csignal: interrupt

This is my fault, sorry. Here's the fix; I will open a PR for it as soon as possible.

// printTokensInJSON reads sentences from s line by line, analyzes each one with
// t in the given mode, and prints the tokens of every sentence as its own JSON
// array: an opening "[" line, one encoded token per line separated by commas,
// and a closing "]" line. Tokens whose ID is tokenizer.BosEosID are skipped.
//
// It returns the first error reported by parseTokenToJSON; otherwise it returns
// the scanner's error, which is nil when the input was read to the end.
func printTokensInJSON(s *bufio.Scanner, t *tokenizer.Tokenizer, mode tokenizer.TokenizeMode) (err error) {
	// buff holds the most recently encoded token. Each token is printed one step
	// late so that the final element of an array can be written without a
	// trailing comma.
	var buff []byte

-	fmtPrintF("[\n") // Begin array bracket

	for s.Scan() {
+		fmtPrintF("[\n") // Begin array bracket
+		// Forget the previous sentence's last token; otherwise it would be
+		// printed again as the first element of this sentence's array.
+		buff = nil

		sen := s.Text()
		tokens := t.Analyze(sen, mode)

		for _, tok := range tokens {
			if tok.ID == tokenizer.BosEosID {
				continue
			}

			if len(buff) > 0 {
				fmtPrintF("%s,\n", buff) // Print array element (JSON with comma)
			}

			if buff, err = parseTokenToJSON(tok); err != nil {
				return err
			}
		}

+		fmtPrintF("%s\n", buff) // Spit out the last buffer without comma to close the array
+		fmtPrintF("]\n")        // End array bracket
	}

-	if s.Err() == nil {
-		fmtPrintF("%s\n", buff) // Spit out the last buffer without comma to close the array
-		fmtPrintF("]\n")        // End array bracket
-	}

	return s.Err()
}