Missing "ent" in annotation
SivilTaram opened this issue · comments
Qian commented
Hi Li,
when I want to preprocess some new data for the model, I have encountered an error which warns me about missing the "ent" field. I think there may be an omission in wikisql/annotate.py
, and here is my solution:
def annotate(sentence, lower=True):
    """Tokenize *sentence* with CoreNLP and return annotation lists.

    Returns a dict with parallel per-token lists:
    'gloss' (original text), 'words' (tokens, lowercased if *lower*),
    'after' (trailing whitespace), and 'ent' (here filled with POS tags
    from the 'pos' annotator, standing in for entity labels).
    """
    global client
    # Lazily create the CoreNLP client on first use.
    if client is None:
        client = CoreNLPClient(default_annotators=['ssplit', 'tokenize', 'pos'])

    # Flatten all sentences into a single token stream, then build the
    # parallel lists in one pass each.
    tokens = [t for s in client.annotate(sentence) for t in s]
    gloss = [t.originalText for t in tokens]
    after = [t.after for t in tokens]
    ents = [t.pos for t in tokens]
    words = [t.word for t in tokens]
    if lower:
        words = [w.lower() for w in words]
    return {
        'gloss': gloss,
        'words': words,
        'after': after,
        'ent': ents
    }
Hope it could help others, thanks :-D
Li Dong commented
Hi @SivilTaram ,
Thanks for the code!
import spacy
import codecs
import json
from spacy.tokens import Doc
nlp = spacy.load('en_core_web_lg')
def anno_main(anno_path):
    """Add an 'ent' field to every question in a WikiSQL annotated .jsonl file.

    Reads *anno_path* line by line, rebuilds each question as a spaCy ``Doc``
    from its existing 'gloss' tokens and 'after' whitespace markers, runs the
    spaCy pipeline over it, and stores the resulting per-token tags
    (``tag_``) as ``question['ent']``.  The augmented records are written to
    a sibling path with 'annotated' replaced by 'annotated_ent'.

    Raises:
        ValueError: if the number of produced tags does not match the number
            of question tokens (tokenization mismatch).
    """
    # NOTE(review): str.replace rewrites EVERY occurrence of 'annotated' in
    # the path, not just the directory name — confirm paths contain it once.
    out_path = anno_path.replace('annotated', 'annotated_ent')
    with codecs.open(out_path, "w", "utf-8") as f_out:
        with codecs.open(anno_path, "r", "utf-8") as f_in:
            for line in f_in:
                js = json.loads(line)
                w_list = js['question']['gloss']
                # spaCy's Doc wants one bool per token: is the token
                # followed by whitespace?
                ws_list = [it.isspace() for it in js['question']['after']]
                doc = Doc(nlp.vocab, words=w_list, spaces=ws_list)
                for name, proc in nlp.pipeline:
                    doc = proc(doc)
                js['question']['ent'] = [tk.tag_ for tk in doc]
                # Explicit check instead of `assert`, which is stripped
                # when Python runs with -O.
                if len(js['question']['ent']) != len(js['question']['words']):
                    raise ValueError(
                        "token/tag count mismatch in {}".format(anno_path))
                # Single write per record instead of two.
                f_out.write(json.dumps(js) + '\n')
# Augment every WikiSQL split with the 'ent' field.
for part in ['train', 'dev', 'test']:
    anno_main("data_path/WikiSQL/annotated/{}.jsonl".format(part))