Spaces:
Build error
Build error
Create utils.py
Browse files
utils.py
ADDED
@@ -0,0 +1,276 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import openai
|
2 |
+
from langchain.embeddings.openai import OpenAIEmbeddings
|
3 |
+
from langchain.vectorstores import Pinecone
|
4 |
+
from langchain.llms import OpenAI
|
5 |
+
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
|
6 |
+
from langchain.document_loaders import UnstructuredHTMLLoader
|
7 |
+
from langchain.document_loaders import UnstructuredMarkdownLoader
|
8 |
+
from langchain.document_loaders import PyPDFLoader
|
9 |
+
from langchain.document_loaders import Docx2txtLoader
|
10 |
+
from langchain.schema import Document
|
11 |
+
import requests
|
12 |
+
import json
|
13 |
+
import pinecone
|
14 |
+
from pypdf import PdfReader
|
15 |
+
from langchain.llms.openai import OpenAI
|
16 |
+
from langchain.chains.summarize import load_summarize_chain
|
17 |
+
import numpy as np
|
18 |
+
import re
|
19 |
+
import requests
|
20 |
+
from transformers import BertTokenizerFast, BertLMHeadModel
|
21 |
+
from transformers import pipeline
|
22 |
+
|
23 |
+
#Extract Information from PDF file
|
24 |
+
def get_pdf_text(filename):
    """Return the text of every page of a PDF, concatenated in page order.

    ``filename`` is passed straight to ``PdfReader``. The result of each
    page's ``extract_text()`` is appended with no separator between pages.
    """
    reader = PdfReader(filename)
    return "".join(page.extract_text() for page in reader.pages)
|
30 |
+
|
31 |
+
|
32 |
+
|
33 |
+
# Iterate over the files the user uploaded
|
34 |
+
# (txt, html, pdf, docx or md), one by one, and load each into a Document
|
35 |
+
|
36 |
+
def _loader_class_for(extension):
    """Return the loader class for a lower-case file extension, or None."""
    if extension == "txt":
        # TextLoader is not among this module's top-level imports, so the
        # .txt branch used to fail with NameError; import it lazily here.
        from langchain.document_loaders import TextLoader
        return TextLoader
    if extension == "html":
        return UnstructuredHTMLLoader
    if extension == "pdf":
        return PyPDFLoader
    if extension == "docx":
        return Docx2txtLoader
    if extension == "md":
        return UnstructuredMarkdownLoader
    return None


def create_docs(user_file_list, unique_id):
    """Load each supported file into a single ``Document``.

    Args:
        user_file_list: Iterable of file paths (strings). The loader is
            chosen from the text after the last ".": txt, html, pdf, docx
            or md (matched case-insensitively). Other files are skipped.
        unique_id: Value stored under the "unique_id" metadata key of every
            document created by this call.

    Returns:
        A list with one ``Document`` per successfully loaded file. Its
        ``page_content`` is the newline-joined content of everything the
        loader returned, and its metadata holds "name" (the file name as a
        string) and "unique_id".
    """
    docs = []
    for filename in user_file_list:
        # Lower-case so "REPORT.PDF" is treated like "report.pdf".
        ext = filename.split(".")[-1].lower()

        loader_class = _loader_class_for(ext)
        if loader_class is None:
            # Skip other file types
            continue

        loaded_parts = loader_class(filename).load()
        if not loaded_parts:
            # Nothing could be extracted; indexing [0] here used to raise.
            continue

        # A loader may return several Documents (PyPDFLoader yields one per
        # page). Keep all of them instead of only the first.
        full_text = "\n".join(part.page_content for part in loaded_parts)
        docs.append(
            Document(
                page_content=full_text,
                metadata={"name": f"{filename}", "unique_id": unique_id},
            )
        )

    return docs
|
71 |
+
|
72 |
+
|
73 |
+
# def create_docs(user_pdf_list, unique_id):
|
74 |
+
# docs = []
|
75 |
+
# for filename in user_pdf_list:
|
76 |
+
# docs.append(Document( page_content= get_pdf_text(filename), metadata={"name": f"{filename}" , "unique_id":unique_id } ) )
|
77 |
+
# docs.append(get_pdf_text(filename))
|
78 |
+
|
79 |
+
# return docs
|
80 |
+
|
81 |
+
|
82 |
+
|
83 |
+
#Create embeddings instance
|
84 |
+
def create_embeddings_load_data():
    """Create the embedding backend used for indexing and querying.

    Returns a ``SentenceTransformerEmbeddings`` instance built with the
    "all-MiniLM-L6-v2" model. The original inline note says "384",
    presumably the embedding dimension (confirm against the index config).
    """
    return SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
|
88 |
+
|
89 |
+
|
90 |
+
#Function to push data to Vector Store - Pinecone here
|
91 |
+
def push_to_pinecone(pinecone_apikey, pinecone_environment, pinecone_index_name, embeddings, docs):
    """Initialise the Pinecone client and send ``docs`` to the named index.

    Calls ``pinecone.init`` with the given API key and environment, prints
    a progress marker, then hands ``docs`` and ``embeddings`` to
    ``Pinecone.from_documents`` for ``pinecone_index_name``.
    Returns nothing.
    """
    client_settings = {
        "api_key": pinecone_apikey,
        "environment": pinecone_environment,
    }
    pinecone.init(**client_settings)
    print("done......2")
    Pinecone.from_documents(docs, embeddings, index_name=pinecone_index_name)
|
99 |
+
|
100 |
+
|
101 |
+
|
102 |
+
#Function to pull information from Vector Store - Pinecone here
|
103 |
+
def pull_from_pinecone(pinecone_apikey, pinecone_environment, pinecone_index_name, embeddings):
    """Initialise the Pinecone client and open an existing index.

    Returns the object produced by ``Pinecone.from_existing_index`` for
    ``pinecone_index_name`` and ``embeddings``.
    """
    client_settings = {
        "api_key": pinecone_apikey,
        "environment": pinecone_environment,
    }
    pinecone.init(**client_settings)
    return Pinecone.from_existing_index(pinecone_index_name, embeddings)
|
114 |
+
|
115 |
+
|
116 |
+
def similar_docs_hf(query, final_docs_list, k):
    """Rank documents by similarity to ``query`` using the HF Inference API.

    Posts ``query`` and ``final_docs_list`` to the hosted
    sentence-transformers/all-MiniLM-L6-v2 endpoint and sorts the documents
    by the returned scores, highest first.

    Args:
        query: Source sentence to compare against.
        final_docs_list: List of candidate sentences/documents.
        k: Not used; the complete ranking is returned. Kept so existing
            callers keep working.

    Returns:
        ``(scores, docs)``: two tuples sorted by descending score. Both are
        empty when ``final_docs_list`` is empty.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        ValueError: If the response is not a list with one score per
            document (for example an ``{"error": ...}`` payload).
    """
    import os

    if not final_docs_list:
        # Nothing to rank. Unpacking zip(*[]) below would fail, and there
        # is no point calling the API.
        return (), ()

    # SECURITY: the literal token below was committed to source control and
    # should be revoked. Prefer supplying a token through the environment;
    # the literal is kept only as a fallback so current deployments work.
    HF_KEY = os.environ.get(
        "HUGGINGFACEHUB_API_TOKEN", "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"
    )

    headers = {"Authorization": f"Bearer {HF_KEY}"}
    API_URL = "https://api-inference.huggingface.co/models/sentence-transformers/all-MiniLM-L6-v2"

    payload = {
        "inputs": {
            "source_sentence": query,  # query
            "sentences": final_docs_list,
        }
    }
    # A timeout keeps a stalled connection from hanging the caller forever.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
    response.raise_for_status()

    score_list = response.json()
    if not isinstance(score_list, list) or len(score_list) != len(final_docs_list):
        # Zipping an error payload with the documents would silently
        # produce a meaningless ranking, so fail loudly instead.
        raise ValueError(f"Unexpected similarity response: {score_list!r}")

    # Sort (score, doc) pairs in descending order of score.
    pairs = sorted(
        zip(score_list, final_docs_list), key=lambda pair: pair[0], reverse=True
    )

    # Unzip the pairs back into two parallel tuples.
    score_list, final_docs_list = zip(*pairs)
    return score_list, final_docs_list
|
143 |
+
|
144 |
+
|
145 |
+
#Function to help us get relevant documents from vector store - based on user input
|
146 |
+
def similar_docs(query, k, pinecone_apikey, pinecone_environment, pinecone_index_name, embeddings, unique_id):
    """Query the Pinecone index for the documents most similar to ``query``.

    Initialises the Pinecone client, opens the index through
    ``pull_from_pinecone`` and returns the result of
    ``similarity_search_with_score`` called with ``query``, ``int(k)`` and
    ``{"unique_id": unique_id}`` (presumably a metadata filter limiting
    hits to one upload batch; confirm against the vector store API).
    """
    pinecone.init(api_key=pinecone_apikey, environment=pinecone_environment)

    vector_store = pull_from_pinecone(
        pinecone_apikey, pinecone_environment, pinecone_index_name, embeddings
    )
    matches = vector_store.similarity_search_with_score(
        query, int(k), {"unique_id": unique_id}
    )
    return matches
|
159 |
+
|
160 |
+
|
161 |
+
def get_score(relevant_docs):
    """Return the second element (the score) of every entry, in order.

    Each entry of ``relevant_docs`` is indexed with ``[1]``, which matches
    the ``(document, score)`` pairs other helpers in this module consume.
    """
    return [entry[1] for entry in relevant_docs]
|
167 |
+
|
168 |
+
|
169 |
+
def metadata_filename(document):
    """Extract ``name='...'`` values from each entry's "name" metadata.

    For every entry, the "name" metadata of its first element is converted
    to a string and searched for ``name='...'`` fragments. The result has
    one list of matches per entry; that list is empty when the string
    contains no such fragment.
    """
    name_pattern = re.compile(r"name=\'(.*?)\'")
    return [
        name_pattern.findall(str(entry[0].metadata["name"]))
        for entry in document
    ]
|
180 |
+
|
181 |
+
def docs_content(relevant_docs):
    """Return the ``page_content`` of each entry's first element, in order."""
    return [entry[0].page_content for entry in relevant_docs]
|
187 |
+
|
188 |
+
def docs_summary(relevant_docs):
    """Return each entry's ``page_content`` unchanged, in input order.

    Despite the name, no summarisation happens here: the text of each
    entry's first element is collected and returned as a new list.
    """
    page_texts = (entry[0].page_content for entry in relevant_docs)
    return list(page_texts)
|
198 |
+
|
199 |
+
|
200 |
+
def get_summary_hf(target):
    """Run a local transformers "summarization" pipeline over ``target``.

    The "bert-base-uncased" tokenizer and ``BertLMHeadModel`` weights are
    loaded on every call. ``target`` is converted with ``str`` and passed
    to the pipeline with ``max_length=150``, ``min_length=25`` and
    ``do_sample=False``. The pipeline's return value is passed back
    unchanged.

    NOTE(review): a BERT LM head is not obviously a summarization model;
    confirm this pipeline produces usable summaries.
    """
    checkpoint = "bert-base-uncased"

    # Keyword order keeps the tokenizer loading before the model.
    summarizer = pipeline(
        'summarization',
        tokenizer=BertTokenizerFast.from_pretrained(checkpoint),
        model=BertLMHeadModel.from_pretrained(checkpoint),
    )

    return summarizer(str(target), max_length=150, min_length=25, do_sample=False)
|
217 |
+
|
218 |
+
|
219 |
+
# def get_summary_hf( document ):
|
220 |
+
|
221 |
+
# HF_KEY = "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"
|
222 |
+
# API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-cnn"
|
223 |
+
# headers = {"Authorization": f"Bearer {HF_KEY}"}
|
224 |
+
# payload = {
|
225 |
+
# "inputs": {
|
226 |
+
# "inputs": document ,
|
227 |
+
# "parameters": {"do_sample": False}
|
228 |
+
# }
|
229 |
+
# }
|
230 |
+
|
231 |
+
# response = requests.post(API_URL, headers=headers, json=payload)
|
232 |
+
# return response.json()
|
233 |
+
|
234 |
+
# Helps us get the summary of a document
|
235 |
+
|
236 |
+
|
237 |
+
def get_summary(current_doc):
    """Summarise one document with a map-reduce summarize chain.

    Builds an ``OpenAI`` LLM with ``temperature=0``, wraps it with
    ``load_summarize_chain(..., chain_type="map_reduce")`` and runs the
    chain on a one-element list containing ``current_doc``. The chain's
    output is returned.
    """
    summarize_chain = load_summarize_chain(
        OpenAI(temperature=0), chain_type="map_reduce"
    )
    return summarize_chain.run([current_doc])
|
265 |
+
|
266 |
+
|
267 |
+
# client = OpenAI()
|
268 |
+
# response = client.chat.completions.create(
|
269 |
+
# model="gpt-3.5-turbo",
|
270 |
+
# messages=[
|
271 |
+
# {"role": "system", "content": f"{current_doc}" },
|
272 |
+
# {"role": "user", "content": "Summarize the following text: '{text_to_summarize}'"},
|
273 |
+
# ])
|
274 |
+
|
275 |
+
# return response['choices'][0]['message']['content']
|
276 |
+
#
|