import pinecone
from tqdm.auto import tqdm

# NOTE: `retriever` (a sentence-embedding model exposing an .encode() method
# that returns 768-dim vectors) and `df` (a DataFrame with a "passage_text"
# column) are assumed to be defined earlier in the app.

# initialise the connection to pinecone (replace with your own credentials)
pinecone.init(api_key="YOUR_API_KEY", environment="YOUR_ENVIRONMENT")

index_name = "abstractive-question-answering"

# check if the abstractive-question-answering index exists
if index_name not in pinecone.list_indexes():
    # create the index if it does not exist
    pinecone.create_index(
        index_name,
        dimension=768,
        metric="cosine"
    )

# connect to the abstractive-question-answering index we created
index = pinecone.Index(index_name)

# we will upsert in batches of 64
batch_size = 64

for i in tqdm(range(0, len(df), batch_size)):
    # find end of batch
    i_end = min(i + batch_size, len(df))
    # extract batch
    batch = df.iloc[i:i_end]
    # generate embeddings for the batch
    emb = retriever.encode(batch["passage_text"].tolist()).tolist()
    # get metadata
    meta = batch.to_dict(orient="records")
    # create unique IDs
    ids = [f"{idx}" for idx in range(i, i_end)]
    # add all to the upsert list
    to_upsert = list(zip(ids, emb, meta))
    # upsert/insert these records to pinecone
    _ = index.upsert(vectors=to_upsert)

# check that we have all vectors in the index
index.describe_index_stats()
# from transformers import BartTokenizer, BartForConditionalGeneration

# # load the BART tokenizer and model from Hugging Face
# tokenizer = BartTokenizer.from_pretrained('vblagoje/bart_lfqa')
# generator = BartForConditionalGeneration.from_pretrained('vblagoje/bart_lfqa')

# def query_pinecone(query, top_k):
#     # generate an embedding for the query
#     xq = retriever.encode([query]).tolist()
#     # search the pinecone index for context passages that may contain the answer
#     xc = index.query(xq, top_k=top_k, include_metadata=True)
#     return xc

# def format_query(query, context):
#     # extract passage_text from each Pinecone match
#     context = [f" {m['metadata']['passage_text']}" for m in context]
#     # concatenate all context passages
#     context = " ".join(context)
#     # concatenate the query and the context passages into a single BART input
#     query = f"question: {query} context: {context}"
#     return query
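
# --- Sketch (not part of the original file): one way the commented-out
# --- generator and helper functions above could be combined into an
# --- end-to-end answer step if re-enabled. The decoding parameters
# --- (num_beams, min_length, max_length) are illustrative assumptions.
# def generate_answer(query, top_k=3):
#     # retrieve relevant passages and build the BART input string
#     result = query_pinecone(query, top_k)
#     prompt = format_query(query, result["matches"])
#     # tokenize the prompt and generate a long-form answer
#     inputs = tokenizer([prompt], max_length=1024, truncation=True, return_tensors="pt")
#     ids = generator.generate(inputs["input_ids"], num_beams=2, min_length=20, max_length=64)
#     answer = tokenizer.batch_decode(ids, skip_special_tokens=True)
#     return answer[0]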