syl-based content shelving and reinsertion
drupchen opened this issue · comments
This is to solve cross-line tokens such as "ཝ་ཡེ། བཀྲ་\nཤིས་ཡིན་པས།" where "བཀྲ་ཤིས་" should be counted as a token.
Stripping the \n is a bad idea for large documents, and splitting the tokens in the output is also a bad idea for most use cases.
The default behaviour should be to shift the \n to the end of the current token, so that we get "[ཝ་ཡེ] [།] [བཀྲ་ཤིས་] [\n] [ཡིན་] [པས] [།]"
import botok
def get_chunks(raw_string):
    """Chunk *raw_string* with botok and return the human-readable chunk list.

    Each readable chunk is a ``(marker, text)`` pair as produced by
    ``Chunks.get_readable``.
    """
    chunker = botok.Chunks(raw_string)
    raw = chunker.make_chunks()
    return chunker.get_readable(raw)
def shelve_info(chunks, transparent_chars='\n\t'):
    """Split *chunks* into tokenizable chunks and shelved, re-insertable info.

    Two kinds of content are shelved so they can be reinserted after
    tokenization, each recorded as ``(syl_index, string_to_reinsert)``:

    1. "transparent" characters (``\\n`` and ``\\t`` by default) found inside
       TEXT/PUNCT chunks — they are stripped from the chunk's text;
    2. whole non-TEXT/non-PUNCT chunks (e.g. non-Tibetan spans) — their
       text is shelved untouched, including any transparent chars.

    :param chunks: iterable of ``(marker, text)`` pairs (botok readable chunks)
    :param transparent_chars: characters treated as transparent inside
        TEXT/PUNCT chunks (keyword added to implement the ``\\t`` TODO;
        default keeps the original ``\\n`` behaviour)
    :return: ``(clean_chunks, shelved)`` where ``clean_chunks`` holds only
        TEXT/PUNCT chunks with transparent chars removed, and ``shelved``
        holds the ``(syl_index, string)`` pairs to reinsert.
    """
    shelved = []
    clean_chunks = []
    syl_count = 0
    for marker, text in chunks:
        if marker in ('TEXT', 'PUNCT'):
            # Each TEXT/PUNCT chunk counts as one syllable position.
            syl_count += 1
            # Collect every transparent char in order of appearance, so a
            # chunk with several of them reinserts all of them (the
            # original code shelved a single '\n' regardless of count).
            removed = ''.join(c for c in text if c in transparent_chars)
            if removed:
                for c in set(removed):
                    text = text.replace(c, '')
                shelved.append((syl_count, removed))
            clean_chunks.append((marker, text))
        else:
            # Non-bo chunks are shelved whole. Checking the marker FIRST
            # fixes the original bug where a non-bo chunk containing '\n'
            # was cleaned and leaked into clean_chunks instead.
            shelved.append((syl_count, text))
    return clean_chunks, shelved
# Sample input: Tibetan text with embedded newlines and one non-Tibetan span.
test = "བཀྲ་ཤིས་བདེ་ལེགས་\nཕུན་སུམ་ཚོགས། this is non-bo text རྟག་ཏུ་བདེ་\nབ་ཐོབ་པ\nར་ཤོག"
# 1. get chunks
chunks = get_chunks(test)
# 2. shelve needed info
chunks, shelved = shelve_info(chunks)
##############################################################################################
# 3. tokenize the cleaned text
str_for_botok = ''.join(text for _, text in chunks)
tok = botok.WordTokenizer()
raw_tokens = tok.tokenize(str_for_botok)
# Keep (text, syllable_count) per token; punctuation counts as one syllable.
tokens = []
for t in raw_tokens:
    syl_amount = 1 if t.chunk_type == 'PUNCT' else len(t.syls)
    tokens.append((t.text, syl_amount))
##############################################################################################
# 4. reinsert shelved tokens
# at this point, the only thing left is to merge shelved with tokens in accordance with the indices
Here is the content of the two lists at this point of execution:
shelved = [(4, '\n'), (8, 'this is non-bo text '), (11, '\n'), (14, '\n'), (15, '\n')]
format: [(syl_index, string_to_reinsert), ...]
tokens = [('བཀྲ་ཤིས་', 2), ('བདེ་ལེགས་', 2), ('ཕུན་སུམ་', 2), ('ཚོགས', 1), ('། ', 1), ('རྟག་', 1), ('ཏུ་', 1), ('བདེ་བ་', 2), ('ཐོབ་པ', 2), ('ར་', 1), ('ཤོག', 1)]
format: [(token_text, syl_amount), ...]