leopku / bleve-gse-tokenizer

Gse plugin for the Bleve search engine. Supports English, Chinese and Japanese.

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

Some particular Chinese phrases cannot be found

wolf1860 opened this issue · comments

For example, searching for 审计 returns no matches, even though it appears in both 审计工作规定 and 审计工作. My code:

package main

import (
"fmt"
"github.com/blevesearch/bleve/v2"
_ "github.com/leopku/bleve-gse-tokenizer/v2"
"os"
)

func main(){
INDEX_DIR := "bleve.gse"
messages :=[]struct{
Id string json:"id"
Content string json:"content"
Specialty string json:"specialty"
} {
{"1","审计工作","军交运输"},
{"2","审计工作规定","智慧城市"},
}

mapping := bleve.NewIndexMapping()
os.RemoveAll(INDEX_DIR)
defer os.RemoveAll(INDEX_DIR)

if err := mapping.AddCustomTokenizer("gse", map[string]interface{}{
	"type":       "gse",
	"user_dicts": "./dict/zh/dict.txt",  // <-- MUST specified, otherwise panic would occurred.
}); err != nil {
	panic(err)
}
if err := mapping.AddCustomAnalyzer("gse", map[string]interface{}{
	"type":      "gse",
	"tokenizer": "gse",
}); err != nil {
	panic(err)
}
mapping.DefaultAnalyzer = "gse"

index, err := bleve.New(INDEX_DIR, mapping)
if err != nil {
	panic(err)
}
if err := index.Index("1", &messages[0]); err != nil {
	panic(err)
}

if err := index.Index("2", &messages[1]); err != nil {
	panic(err)
}

query := "content:审计"
req := bleve.NewSearchRequest(bleve.NewQueryStringQuery(query))
req.Highlight = bleve.NewHighlight()
res, err := index.Search(req)
if err != nil {
	panic(err)
}
fmt.Printf("Result of: '%s': %d matches\n", query, res.Total)
for i, hit := range res.Hits {
	rv := fmt.Sprintf("%d. %s, (%f)\n", i+res.Request.From+1, hit.ID, hit.Score)
	for fragmentField, fragments := range hit.Fragments {
		rv += fmt.Sprintf("%s: ", fragmentField)
		for _, fragment := range fragments {
			rv += fmt.Sprintf("%s", fragment)
		}
	}
	fmt.Printf("%s\n", rv)
}

index.Close()

}

result:
Result of: 'content:审计': 0 matches