donglixp / coarse2fine

Missing "ent" in annotation

SivilTaram opened this issue:

Hi Li,

When I tried to preprocess some new data for the model, I ran into an error complaining about a missing "ent" field. I think there is an omission in wikisql/annotate.py, and here is my fix:

def annotate(sentence, lower=True):
    global client
    if client is None:
        # also request the 'pos' annotator so that t.pos is populated
        client = CoreNLPClient(default_annotators=['ssplit', 'tokenize', 'pos'])
    words, gloss, after, ents = [], [], [], []
    for s in client.annotate(sentence):
        for t in s:
            words.append(t.word)
            gloss.append(t.originalText)
            after.append(t.after)
            ents.append(t.pos)  # the downstream code reads POS tags from 'ent'
    if lower:
        words = [w.lower() for w in words]
    return {
        'gloss': gloss,
        'words': words,
        'after': after,
        'ent': ents
    }
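
For a quick sanity check (a hypothetical snippet; it assumes a CoreNLP server is running and reachable by the client), the patched function should return four parallel lists of equal length:

out = annotate("How many engine types did Val Musetti use?")
assert len(out['ent']) == len(out['words']) == len(out['gloss']) == len(out['after'])
print(out['ent'])  # PTB-style POS tags, one per token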

Hope it helps others, thanks :-D

Hi @SivilTaram ,

Thanks for the code! As an alternative, here is a script that back-fills the "ent" field for already-annotated files using spaCy's POS tagger:

import spacy
import codecs
import json
from spacy.tokens import Doc

nlp = spacy.load('en_core_web_lg')

def anno_main(anno_path):
    with codecs.open(anno_path.replace('annotated', 'annotated_ent'), "w", "utf-8") as f_out:
        with codecs.open(anno_path, "r", "utf-8") as f_in:
            for line in f_in:
                js = json.loads(line)
                # rebuild a Doc from the stored tokenization so the new tags stay aligned
                w_list = js['question']['gloss']
                # 'after' holds the text following each token; True where it is whitespace
                ws_list = [it.isspace() for it in js['question']['after']]
                doc = Doc(nlp.vocab, words=w_list, spaces=ws_list)
                # apply the pipeline components (tagger etc.) to the pre-tokenized Doc
                for name, proc in nlp.pipeline:
                    doc = proc(doc)
                # fill 'ent' with spaCy's fine-grained (PTB-style) POS tags
                js['question']['ent'] = [tk.tag_ for tk in doc]
                assert len(js['question']['ent']) == len(js['question']['words'])
                f_out.write(json.dumps(js))
                f_out.write('\n')


for split in ('train','dev','test'):
    anno_main("data_path/WikiSQL/annotated/{}.jsonl".format(split))
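
One thing to watch: the script writes to a sibling annotated_ent directory, which must already exist. A quick spot-check of the output (a hypothetical snippet, reusing the same placeholder path) could look like:

import codecs
import json

with codecs.open("data_path/WikiSQL/annotated_ent/dev.jsonl", "r", "utf-8") as f:
    js = json.loads(f.readline())
# each (lowercased) word should be paired with a PTB-style tag, e.g. ('how', 'WRB')
print(list(zip(js['question']['words'], js['question']['ent'])))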