# streamlit-llm-app / utils.py
import os
import re

import pinecone
import requests
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import (
    Docx2txtLoader,
    PyPDFLoader,
    TextLoader,
    UnstructuredHTMLLoader,
    UnstructuredMarkdownLoader,
)
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.llms import OpenAI
from langchain.schema import Document
from langchain.vectorstores import Pinecone
from pypdf import PdfReader
from transformers import pipeline
# Extract the raw text from a PDF file.
def get_pdf_text(filename):
    text = ""
    pdf_ = PdfReader(filename)
    for page in pdf_.pages:
        # extract_text() can return None for image-only pages
        text += page.extract_text() or ""
    return text
# Iterate over the files the user uploaded and wrap each one in a
# langchain Document, tagged with the session's unique_id.
def create_docs(user_file_list, unique_id):
    docs = []
    for filename in user_file_list:
        ext = filename.split(".")[-1].lower()
        # Pick a loader based on the file extension; skip unsupported types.
        if ext == "txt":
            loader = TextLoader(filename)
        elif ext == "html":
            loader = UnstructuredHTMLLoader(filename)
        elif ext == "pdf":
            loader = PyPDFLoader(filename)
        elif ext == "docx":
            loader = Docx2txtLoader(filename)
        elif ext == "md":
            loader = UnstructuredMarkdownLoader(filename)
        else:
            continue
        doc = loader.load()
        # Join every loaded element (e.g. one Document per PDF page) so that
        # multi-page files are not truncated to their first page.
        docs.append(
            Document(
                page_content="\n".join(d.page_content for d in doc),
                metadata={"name": f"{filename}", "unique_id": unique_id},
            )
        )
    return docs
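# Example usage (hypothetical filenames; assumes the files exist on disk):
#   docs = create_docs(["report.pdf", "notes.md"], unique_id="session-42")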
# Create the embeddings instance (a local sentence-transformers model that
# produces 384-dimensional vectors).
def create_embeddings_load_data():
    # embeddings = OpenAIEmbeddings()
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    return embeddings
# Function to push data to the vector store - Pinecone here.
def push_to_pinecone(pinecone_apikey, pinecone_environment, pinecone_index_name, embeddings, docs):
    pinecone.init(
        api_key=pinecone_apikey,
        environment=pinecone_environment,
    )
    Pinecone.from_documents(docs, embeddings, index_name=pinecone_index_name)
# Function to pull information from the vector store - Pinecone here.
def pull_from_pinecone(pinecone_apikey, pinecone_environment, pinecone_index_name, embeddings):
    pinecone.init(
        api_key=pinecone_apikey,
        environment=pinecone_environment,
    )
    index = Pinecone.from_existing_index(pinecone_index_name, embeddings)
    return index
# Rank candidate documents against a query using the Hugging Face Inference
# API's sentence-similarity endpoint.
def similar_docs_hf(query, final_docs_list, k):
    # Read the API token from the environment (assumed variable name) rather
    # than hardcoding it in source control.
    HF_KEY = os.environ["HF_API_KEY"]
    headers = {"Authorization": f"Bearer {HF_KEY}"}
    API_URL = "https://api-inference.huggingface.co/models/sentence-transformers/all-MiniLM-L6-v2"
    payload = {
        "inputs": {
            "source_sentence": query,
            "sentences": final_docs_list,
        }
    }
    response = requests.post(API_URL, headers=headers, json=payload)
    score_list = response.json()
    pairs = list(zip(score_list, final_docs_list))
    # Sort the pairs in descending order of similarity score
    pairs.sort(key=lambda x: x[0], reverse=True)
    # Unzip the pairs back into two lists and keep only the top k
    score_list, final_docs_list = zip(*pairs)
    return score_list[:k], final_docs_list[:k]
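# Example usage (hypothetical inputs; needs HF_API_KEY set in the environment):
#   scores, ranked = similar_docs_hf("refund policy", ["doc one", "doc two"], 2)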
# Function to help us get relevant documents from the vector store, based on
# the user's query and filtered to the current session's unique_id.
def similar_docs(query, k, pinecone_apikey, pinecone_environment, pinecone_index_name, embeddings, unique_id):
    # pull_from_pinecone already calls pinecone.init, so no separate init here
    index = pull_from_pinecone(pinecone_apikey, pinecone_environment, pinecone_index_name, embeddings)
    matches = index.similarity_search_with_score(query, int(k), {"unique_id": unique_id})
    return matches
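# Example usage (hypothetical credentials and index name); each returned
# match is a (Document, score) pair:
#   matches = similar_docs("refund policy?", 3, api_key, env, "demo-index",
#                          embeddings, "session-42")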
# Collect the similarity score from each (Document, score) pair.
def get_score(relevant_docs):
    scores = []
    for doc in relevant_docs:
        scores.append(doc[1])
    return scores
# Recover the original filenames from the "name" metadata, which stores the
# repr of an uploaded-file object (e.g. "... name='report.pdf' ...").
def metadata_filename(document):
    names = []
    for doc in document:
        text = str(doc[0].metadata["name"])
        pattern = r"name=\'(.*?)\'"
        matches = re.findall(pattern, text)
        names.append(matches)
    return names
# Collect the page content from each (Document, score) pair.
def docs_content(relevant_docs):
    content = []
    for doc in relevant_docs:
        content.append(doc[0].page_content)
    return content
# Collect the text of each relevant document so it can be summarized.
def docs_summary(relevant_docs):
    summary = [doc[0].page_content for doc in relevant_docs]
    return summary
# Summarize text locally with a transformers pipeline. bert-base-uncased has
# no sequence-to-sequence head and cannot summarize, so a distilled BART
# summarization checkpoint is used instead.
def get_summary_hf(target):
    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
    # Use the pipeline to summarize the text
    summary = summarizer(str(target), max_length=150, min_length=25, do_sample=False)
    return summary
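# Example usage, summarizing the top match returned by similar_docs:
#   print(get_summary_hf(matches[0][0].page_content))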
# Helps us get the summary of a document with an OpenAI LLM via langchain's
# map_reduce summarization chain (expects OPENAI_API_KEY in the environment).
def get_summary(current_doc):
    llm = OpenAI(temperature=0)
    chain = load_summarize_chain(llm, chain_type="map_reduce")
    summary = chain.run([current_doc])
    return summary
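# ---------------------------------------------------------------------------
# Minimal end-to-end sketch of how these helpers fit together. Illustrative
# only: the filename, index name, and environment variable names below are
# hypothetical placeholders, not part of the app.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    embeddings = create_embeddings_load_data()
    docs = create_docs(["example.pdf"], unique_id="demo-1")  # hypothetical file
    push_to_pinecone(
        os.environ["PINECONE_API_KEY"],  # assumed env var name
        os.environ["PINECONE_ENV"],      # assumed env var name
        "demo-index",                    # hypothetical index name
        embeddings,
        docs,
    )
    matches = similar_docs(
        "What is this document about?", 3,
        os.environ["PINECONE_API_KEY"], os.environ["PINECONE_ENV"],
        "demo-index", embeddings, "demo-1",
    )
    for doc, score in matches:
        print(score, get_summary(doc))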