Retrieval Enhanced Generative Question Answering with OpenAI - Dictionary problem
Avs-safety opened this issue · comments
Avs-safety commented
Hi,
I've been following the code here: https://github.com/pinecone-io/examples/blob/master/generation/generative-qa/openai/gen-qa-openai/gen-qa-openai.ipynb with a different dataset, and I get an error (TypeError: string indices must be integers) at the line ids_batch = [x['id'] for x in meta_batch], and again further down where the metadata is cleaned up.
from tqdm.auto import tqdm
from time import sleep
# Embed `new_data` in batches with OpenAI and upsert the vectors to Pinecone.
batch_size = 100  # how many embeddings we create and insert at once
max_retries = 10  # give up after this many failed retries instead of looping forever
retry_delay = 5  # seconds to wait between attempts (rate limits are transient)

for i in tqdm(range(0, len(new_data), batch_size)):
    # find end of batch
    i_end = min(len(new_data), i + batch_size)
    # NOTE(review): the comprehensions below assume this slice is a sequence
    # of per-row dicts. If slicing `new_data` instead returns a column-oriented
    # mapping, iterating it yields column-name strings and x['id'] raises
    # "TypeError: string indices must be integers" -- confirm the data type.
    meta_batch = new_data[i:i_end]
    # get ids
    ids_batch = [x['id'] for x in meta_batch]
    # get texts to encode
    texts = [x['text'] for x in meta_batch]
    # create embeddings, retrying on failure (added to ride out RateLimitError).
    # `except Exception` (not a bare except) keeps Ctrl-C working, and the
    # attempt cap surfaces permanent errors instead of spinning silently.
    for attempt in range(max_retries + 1):
        try:
            res = openai.Embedding.create(input=texts, engine=embed_model)
            break
        except Exception:
            if attempt == max_retries:
                raise
            sleep(retry_delay)
    embeds = [record['embedding'] for record in res['data']]
    # cleanup metadata: keep only the fields we want stored alongside each vector
    meta_batch = [{
        'start': x['start'],
        'end': x['end'],
        'title': x['title'],
        'text': x['text'],
        'url': x['url'],
        'published': x['published'],
        'channel_id': x['channel_id'],
    } for x in meta_batch]
    # one (id, embedding, metadata) tuple per row of the batch
    to_upsert = list(zip(ids_batch, embeds, meta_batch))
    # upsert to Pinecone
    index.upsert(vectors=to_upsert)
I have worked around this by amending the code as shown below. However, I'm not sure why there is a problem with accessing the dictionary keys, and I'm also not sure whether my amendments make the upserted vectors inaccurate. I'd appreciate any help you can provide!
from tqdm.auto import tqdm
from time import sleep
# Embed `data` in batches with OpenAI and upsert the vectors to Pinecone.
batch_size = 100  # how many embeddings we create and insert at once
max_retries = 10  # give up after this many failed retries instead of looping forever
retry_delay = 5  # seconds to wait between attempts (rate limits are transient)

id_column = 'acn_num_ACN'
text_column = 'Report 1_Narrative'
# columns stored as metadata next to each vector
meta_columns = [
    'acn_num_ACN',
    'Time_Date',
    'Report 1_Narrative',
    'Report 1.2_Synopsis',
]

for i in tqdm(range(0, len(data), batch_size)):
    # find end of batch
    i_end = min(len(data), i + batch_size)
    # NOTE(review): presumably this slice is column-oriented (indexing it by a
    # column name yields that column's values for the batch) -- confirm.
    meta_batch = data[i:i_end]
    # get ids; Pinecone vector ids must be strings
    ids_batch = [str(value) for value in meta_batch[id_column]]
    # get texts to encode, as a plain list so the request can be serialized
    texts = list(meta_batch[text_column])
    # create embeddings, retrying on failure (added to ride out RateLimitError).
    # `except Exception` (not a bare except) keeps Ctrl-C working, and the
    # attempt cap surfaces permanent errors instead of spinning silently.
    for attempt in range(max_retries + 1):
        try:
            res = openai.Embedding.create(input=texts, engine=embed_model)
            break
        except Exception:
            if attempt == max_retries:
                raise
            sleep(retry_delay)
    embeds = [record['embedding'] for record in res['data']]
    # Build one metadata dict per row from this batch's columns. Zipping with
    # `data` itself would pair the vectors with the whole dataset (or its
    # column names) rather than with the rows that were just embedded.
    columns = [list(meta_batch[name]) for name in meta_columns]
    meta_rows = [dict(zip(meta_columns, values)) for values in zip(*columns)]
    # one (id, embedding, metadata) tuple per row of the batch
    to_upsert = list(zip(ids_batch, embeds, meta_rows))
    # upsert to Pinecone
    index.upsert(vectors=to_upsert)