import openai
import json
import os
import re
import requests
import numpy as np
import pinecone
from pypdf import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import (
    Docx2txtLoader,
    PyPDFLoader,
    TextLoader,
    UnstructuredHTMLLoader,
    UnstructuredMarkdownLoader,
)
from langchain.schema import Document
from transformers import BertTokenizerFast, BertLMHeadModel, pipeline


# Extract the raw text from a PDF file
def get_pdf_text(filename):
    text = ""
    pdf_ = PdfReader(filename)
    for page in pdf_.pages:
        text += page.extract_text()
    return text


# Iterate over the files the user uploaded, one by one, pick the loader that
# matches each file extension, and wrap the content in a LangChain Document
def create_docs(user_file_list, unique_id):
    docs = []
    for filename in user_file_list:
        ext = filename.split(".")[-1].lower()

        # Use TextLoader for .txt files
        if ext == "txt":
            loader = TextLoader(filename)
            doc = loader.load()
        # Use UnstructuredHTMLLoader for .html files
        elif ext == "html":
            loader = UnstructuredHTMLLoader(filename)
            doc = loader.load()
        # Use PyPDFLoader for .pdf files
        elif ext == "pdf":
            loader = PyPDFLoader(filename)
            doc = loader.load()
        # Use Docx2txtLoader for .docx files
        elif ext == "docx":
            loader = Docx2txtLoader(filename)
            doc = loader.load()
        # Use UnstructuredMarkdownLoader for .md files
        elif ext == "md":
            loader = UnstructuredMarkdownLoader(filename)
            doc = loader.load()
        # Skip other file types
        else:
            continue

        docs.append(
            Document(
                page_content=doc[0].page_content,
                metadata={"name": f"{filename}", "unique_id": unique_id},
            )
        )
    return docs


# Earlier PDF-only version, kept for reference:
# def create_docs(user_pdf_list, unique_id):
#     docs = []
#     for filename in user_pdf_list:
#         docs.append(Document(page_content=get_pdf_text(filename),
#                              metadata={"name": f"{filename}", "unique_id": unique_id}))
#     return docs


# Create the embeddings instance
def create_embeddings_load_data():
    # embeddings = OpenAIEmbeddings()
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")  # 384-dimensional vectors
    return embeddings


# Push documents to the vector store - Pinecone here
def push_to_pinecone(pinecone_apikey, pinecone_environment, pinecone_index_name, embeddings, docs):
    pinecone.init(api_key=pinecone_apikey, environment=pinecone_environment)
    Pinecone.from_documents(docs, embeddings, index_name=pinecone_index_name)


# Pull information from the vector store - Pinecone here
def pull_from_pinecone(pinecone_apikey, pinecone_environment, pinecone_index_name, embeddings):
    pinecone.init(api_key=pinecone_apikey, environment=pinecone_environment)
    index = Pinecone.from_existing_index(pinecone_index_name, embeddings)
    return index


# Score a list of documents against a query with the Hugging Face Inference API
# (sentence-similarity endpoint of all-MiniLM-L6-v2) and return them sorted by score
def similar_docs_hf(query, final_docs_list, k):
    # Read the token from the environment instead of hard-coding it
    hf_key = os.environ.get("HUGGINGFACEHUB_API_TOKEN", "")
    headers = {"Authorization": f"Bearer {hf_key}"}
    API_URL = "https://api-inference.huggingface.co/models/sentence-transformers/all-MiniLM-L6-v2"
    payload = {
        "inputs": {
            "source_sentence": query,
            "sentences": final_docs_list,
        }
    }
    response = requests.post(API_URL, headers=headers, json=payload)
    score_list = response.json()

    # Pair each score with its document
    pairs = list(zip(score_list, final_docs_list))
    # Sort the pairs in descending order of score
    pairs.sort(key=lambda x: x[0], reverse=True)
    # Unzip the pairs back into two lists
    score_list, final_docs_list = zip(*pairs)
    # Optionally truncate to the top k results: score_list[:k], final_docs_list[:k]
    return score_list, final_docs_list


# Get the relevant documents from the vector store, based on the user's query
def similar_docs(query, k, pinecone_apikey, pinecone_environment, pinecone_index_name, embeddings, unique_id):
    index = pull_from_pinecone(pinecone_apikey, pinecone_environment, pinecone_index_name, embeddings)
    similar_docs = index.similarity_search_with_score(
        query, k=int(k), filter={"unique_id": unique_id}
    )
    # print(similar_docs)
    return similar_docs


# Collect the similarity score of each (document, score) pair
def get_score(relevant_docs):
    scores = []
    for doc in relevant_docs:
        scores.append(doc[1])
    return scores


# Extract the file name stored in each document's metadata; the regex expects
# the stored "name" value to contain a fragment of the form name='<filename>'
def metadata_filename(document):
    names = []
    for doc in document:
        text = str(doc[0].metadata["name"])
        pattern = r"name='(.*?)'"
        matches = re.findall(pattern, text)
        names.append(matches)
    return names


# Collect the page content of each relevant document
def docs_content(relevant_docs):
    content = []
    for doc in relevant_docs:
        content.append(doc[0].page_content)
    return content


# Collect the page content of each relevant document so it can be summarised
def docs_summary(relevant_docs):
    documents = []
    summary = []
    for doc in relevant_docs:
        documents.append(doc[0].page_content)
    for document in documents:
        summary.append(document)
    return summary


# Summarize a text locally with a transformers summarization pipeline
def get_summary_hf(target):
    # Specify the model name
    model_name = "bert-base-uncased"
    # Load the BERT tokenizer and model
    tokenizer = BertTokenizerFast.from_pretrained(model_name)
    model = BertLMHeadModel.from_pretrained(model_name)
    # Initialize the summarization pipeline
    summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
    # Use the pipeline to summarize the text
    summary = summarizer(str(target), max_length=150, min_length=25, do_sample=False)
    return summary


# Earlier remote variant, kept for reference: summarize via the Hugging Face
# Inference API (facebook/bart-large-cnn) instead of a local pipeline.
# def get_summary_hf(document):
#     hf_key = os.environ.get("HUGGINGFACEHUB_API_TOKEN", "")
#     API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-cnn"
#     headers = {"Authorization": f"Bearer {hf_key}"}
#     payload = {
#         "inputs": document,
#         "parameters": {"do_sample": False},
#     }
#     response = requests.post(API_URL, headers=headers, json=payload)
#     return response.json()


# Get the summary of a single document with an LLM
def get_summary(current_doc):
    llm = OpenAI(temperature=0)
    # llm = HuggingFaceHub(repo_id="bigscience/bloom", model_kwargs={"temperature": 1e-10})
    chain = load_summarize_chain(llm, chain_type="map_reduce")
    summary = chain.run([current_doc])
    return summary


# Earlier variant, kept for reference: call the OpenAI chat completions
# endpoint directly instead of going through a LangChain summarize chain.
# def get_summary(current_doc):
#     url = "https://api.openai.com/v1/chat/completions"
#     headers = {
#         "Content-Type": "application/json",
#         "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY', '')}",
#     }
#     data = {
#         "model": "gpt-3.5-turbo",
#         "messages": [
#             {"role": "user", "content": f"Summarize this text: {current_doc}"}
#         ],
#         "temperature": 0.7,
#     }
#     response = requests.post(url, headers=headers, data=json.dumps(data))
#     return response.json()["choices"][0]["message"]["content"]
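
# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): shows how the
# helpers above are intended to chain together. The file names, index name,
# query string, and environment variables below are placeholders.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Placeholder inputs - swap in the real uploaded files and credentials
    uploaded_files = ["resume_1.pdf", "resume_2.docx"]
    session_id = "demo-session-001"
    pinecone_key = os.environ.get("PINECONE_API_KEY", "")
    pinecone_env = os.environ.get("PINECONE_ENVIRONMENT", "")
    index_name = "demo-index"

    # 1. Load the uploaded files into LangChain Document objects
    documents = create_docs(uploaded_files, session_id)

    # 2. Build the sentence-transformer embeddings and push the docs to Pinecone
    embeddings = create_embeddings_load_data()
    push_to_pinecone(pinecone_key, pinecone_env, index_name, embeddings, documents)

    # 3. Retrieve the documents most similar to a query, then summarise each one
    results = similar_docs(
        "Python developer with NLP experience", 2,
        pinecone_key, pinecone_env, index_name, embeddings, session_id,
    )
    for doc, score in results:
        print(doc.metadata["name"], score)
        print(get_summary(doc))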