import openai
import os
import langchain
import pinecone
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI
from dotenv import load_dotenv

load_dotenv()

## Let's read the documents
def read_doc(directory):
    file_loader = PyPDFDirectoryLoader(directory)
    documents = file_loader.load()
    return documents

doc = read_doc('documents/')  # directory that holds the PDFs
len(doc)

## Divide the docs into chunks
### https://api.python.langchain.com/en/latest/text_splitter/langchain.text_splitter.RecursiveCharacterTextSplitter.html
def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(docs)  # return the split chunks, not the original docs
    return docs

documents = chunk_data(docs=doc)  # split the loaded PDFs into chunks
len(documents)

embeddings = OpenAIEmbeddings(openai_api_key=os.environ['OPENAI_API_KEY'])
embeddings

vectors = embeddings.embed_query("How are you?")
len(vectors)

pinecone.init(
    api_key=os.environ['PINECONE_API_KEY'],  # keep the Pinecone key in the environment, not in source
    environment="gcp-starter"
)
index_name = "knowledgebase"
index = Pinecone.from_documents(documents, embeddings, index_name=index_name)  # index the chunked documents

## Cosine similarity: retrieve results from the vector DB
def retrieve_query(query, k=2):
    matching_results = index.similarity_search(query, k=k)
    return matching_results

from langchain.chains.question_answering import load_qa_chain

llm = OpenAI(model_name="text-davinci-003", temperature=0.5)
chain = load_qa_chain(llm, chain_type="stuff")

## Search answers from the vector DB
def retrieve_answers(query):
    doc_search = retrieve_query(query)
    print(doc_search)
    response = chain.run(input_documents=doc_search, question=query)
    return response

our_query = "What is my name?"
answer = retrieve_answers(our_query)
print(answer)