Retrieval Enhanced Generative Question Answering with OpenAI - Dictionary problem

Question

Retrieval Enhanced Generative Question Answering with OpenAI - Dictionary problem

Avs-safety opened this issue a year ago · comments

Hi,

I've been following the code here: https://github.com/pinecone-io/examples/blob/master/generation/generative-qa/openai/gen-qa-openai/gen-qa-openai.ipynb with a different dataset, and I get an error (`TypeError: string indices must be integers`) at `ids_batch = [x['id'] for x in meta_batch]` and again further down where the metadata is cleaned up.

from tqdm.auto import tqdm
from time import sleep

batch_size = 100  # how many embeddings we create and insert at once

# Embed `new_data` in batches and upsert (id, embedding, metadata) tuples.
#
# Every record is indexed by string key below (x['id'], x['text'], ...), so
# iterating a slice of `new_data` must yield dict-like rows. If the slice
# iterates as strings instead (for example a container that yields its
# column names), x['id'] raises
# "TypeError: string indices must be integers".
for i in tqdm(range(0, len(new_data), batch_size)):
    # find end of batch
    i_end = min(len(new_data), i + batch_size)
    meta_batch = new_data[i:i_end]
    # get ids
    ids_batch = [x['id'] for x in meta_batch]
    # get texts to encode
    texts = [x['text'] for x in meta_batch]
    # create embeddings; retry only when rate limited so that any other
    # failure (bad credentials, invalid input, Ctrl-C) surfaces immediately
    # instead of being retried forever
    while True:
        try:
            res = openai.Embedding.create(input=texts, engine=embed_model)
            break
        except openai.error.RateLimitError:
            sleep(5)
    embeds = [record['embedding'] for record in res['data']]
    # cleanup metadata: keep only the fields stored alongside each vector
    meta_batch = [{
        'start': x['start'],
        'end': x['end'],
        'title': x['title'],
        'text': x['text'],
        'url': x['url'],
        'published': x['published'],
        'channel_id': x['channel_id'],
    } for x in meta_batch]
    to_upsert = list(zip(ids_batch, embeds, meta_batch))
    # upsert to Pinecone
    index.upsert(vectors=to_upsert)

I have worked around this by amending the code as shown below. However, I'm not sure why accessing the dictionary keys fails, and I'm not sure whether my amendments make the vectors inaccurate. I'd appreciate any help you can provide!

from tqdm.auto import tqdm
from time import sleep

batch_size = 100  # how many embeddings we create and insert at once

# columns copied into the metadata stored with each vector
meta_fields = [
    'acn_num_ACN',
    'Time_Date',
    'Report 1_Narrative',
    'Report 1.2_Synopsis',
]


def _column(batch, name):
    """Return column `name` of a column-indexed batch as a plain list."""
    values = batch[name]
    # array-like columns expose tolist(); plain sequences are copied as-is
    return values.tolist() if hasattr(values, 'tolist') else list(values)


# Embed `data` in batches and upsert (id, embedding, metadata) tuples.
#
# NOTE(review): assumes data[i:i_end] is a row slice that is indexed by
# column name (batch['col'] yields that column's values for the batch) --
# confirm against how `data` is loaded.
for i in tqdm(range(0, len(data), batch_size)):
    # find end of batch
    i_end = min(len(data), i + batch_size)
    batch = data[i:i_end]
    columns = {name: _column(batch, name) for name in meta_fields}
    # get ids (Pinecone expects string ids, so coerce numeric ones)
    ids_batch = [str(value) for value in columns['acn_num_ACN']]
    # get texts to encode
    texts = columns['Report 1_Narrative']
    # create embeddings; retry only when rate limited so that any other
    # failure (bad credentials, invalid input, Ctrl-C) surfaces immediately
    # instead of being retried forever
    while True:
        try:
            res = openai.Embedding.create(input=texts, engine=embed_model)
            break
        except openai.error.RateLimitError:
            sleep(5)
    embeds = [record['embedding'] for record in res['data']]
    # Build one metadata dict per row of THIS batch. Zipping against the
    # whole `data` object instead pairs each vector with whatever iterating
    # `data` yields, not with the row the embedding was computed from.
    meta_batch = [
        dict(zip(meta_fields, row))
        for row in zip(*(columns[name] for name in meta_fields))
    ]
    to_upsert = list(zip(ids_batch, embeds, meta_batch))
    # upsert to Pinecone
    index.upsert(vectors=to_upsert)