aliyan22 committed on
Commit
4157c65
1 Parent(s): bf39cb5

Create utils.py

Browse files
Files changed (1) hide show
  1. utils.py +276 -0
utils.py ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ from langchain.embeddings.openai import OpenAIEmbeddings
3
+ from langchain.vectorstores import Pinecone
4
+ from langchain.llms import OpenAI
5
+ from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
6
+ from langchain.document_loaders import UnstructuredHTMLLoader
7
+ from langchain.document_loaders import UnstructuredMarkdownLoader
8
+ from langchain.document_loaders import PyPDFLoader
9
+ from langchain.document_loaders import Docx2txtLoader
10
+ from langchain.schema import Document
11
+ import requests
12
+ import json
13
+ import pinecone
14
+ from pypdf import PdfReader
15
+ from langchain.llms.openai import OpenAI
16
+ from langchain.chains.summarize import load_summarize_chain
17
+ import numpy as np
18
+ import re
19
+ import requests
20
+ from transformers import BertTokenizerFast, BertLMHeadModel
21
+ from transformers import pipeline
22
+
23
+ #Extract Information from PDF file
24
def get_pdf_text(filename):
    """Return the text of every page of a PDF, concatenated in page order.

    ``filename`` is passed straight to ``PdfReader``. The result of each
    page's ``extract_text()`` is joined with no separator between pages.
    """
    reader = PdfReader(filename)
    return "".join(page.extract_text() for page in reader.pages)
30
+
31
+
32
+
33
+ # Iterate over the files the user uploaded, one by one,
34
+ # loading each supported file type into a Document.
35
+
36
def create_docs(user_file_list, unique_id):
    """Load each supported file and wrap its text in a single ``Document``.

    Supported extensions (matched case-insensitively): ``txt``, ``html``,
    ``pdf``, ``docx`` and ``md``. Files with any other extension are skipped.

    Args:
        user_file_list: Iterable of file path strings.
        unique_id: Value stored under the ``"unique_id"`` metadata key of
            every produced document.

    Returns:
        A list with one ``Document`` per successfully loaded file. Its
        ``page_content`` is the text of *all* parts returned by the loader,
        joined by newlines, and its metadata holds ``"name"`` (the file name)
        and ``"unique_id"``.
    """
    docs = []
    for filename in user_file_list:
        # Lower-case the extension so "report.PDF" is treated like "report.pdf".
        ext = filename.rsplit(".", 1)[-1].lower()

        if ext == "txt":
            # TextLoader is not imported at module level, so import it here;
            # without this the .txt branch fails with NameError.
            from langchain.document_loaders import TextLoader

            loader = TextLoader(filename)
        elif ext == "html":
            loader = UnstructuredHTMLLoader(filename)
        elif ext == "pdf":
            loader = PyPDFLoader(filename)
        elif ext == "docx":
            loader = Docx2txtLoader(filename)
        elif ext == "md":
            loader = UnstructuredMarkdownLoader(filename)
        else:
            # Unsupported file type.
            continue

        loaded_parts = loader.load()
        if not loaded_parts:
            # Nothing could be extracted; skip instead of raising IndexError.
            continue

        # Loaders may return several parts (PyPDFLoader yields one per page).
        # Keep all of them rather than only the first.
        full_text = "\n".join(part.page_content for part in loaded_parts)
        docs.append(
            Document(
                page_content=full_text,
                metadata={"name": f"{filename}", "unique_id": unique_id},
            )
        )

    return docs
71
+
72
+
73
+ # def create_docs(user_pdf_list, unique_id):
74
+ # docs = []
75
+ # for filename in user_pdf_list:
76
+ # docs.append(Document( page_content= get_pdf_text(filename), metadata={"name": f"{filename}" , "unique_id":unique_id } ) )
77
+ # docs.append(get_pdf_text(filename))
78
+
79
+ # return docs
80
+
81
+
82
+
83
+ #Create embeddings instance
84
def create_embeddings_load_data():
    """Create the embedding model instance used by the rest of this module.

    Returns a ``SentenceTransformerEmbeddings`` built for the
    ``all-MiniLM-L6-v2`` model. The original author annotated this model
    with "384", presumably its embedding dimension.
    """
    # Alternative backend kept for reference: OpenAIEmbeddings()
    return SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
88
+
89
+
90
+ #Function to push data to Vector Store - Pinecone here
91
def push_to_pinecone(pinecone_apikey, pinecone_environment, pinecone_index_name, embeddings, docs):
    """Initialise the Pinecone client and upload ``docs`` to an index.

    Args:
        pinecone_apikey: API key passed to ``pinecone.init``.
        pinecone_environment: Environment name passed to ``pinecone.init``.
        pinecone_index_name: Name of the index the documents are written to.
        embeddings: Embedding object handed to ``Pinecone.from_documents``.
        docs: Documents to embed and store.

    Returns:
        None. The vector store created by ``from_documents`` is discarded.
    """
    connection_settings = {
        "api_key": pinecone_apikey,
        "environment": pinecone_environment,
    }
    pinecone.init(**connection_settings)
    print("done......2")
    Pinecone.from_documents(docs, embeddings, index_name=pinecone_index_name)
99
+
100
+
101
+
102
+ #Function to pull information from Vector Store - Pinecone here
103
def pull_from_pinecone(pinecone_apikey, pinecone_environment, pinecone_index_name, embeddings):
    """Initialise the Pinecone client and open an existing index.

    Args:
        pinecone_apikey: API key passed to ``pinecone.init``.
        pinecone_environment: Environment name passed to ``pinecone.init``.
        pinecone_index_name: Name of the existing index to open.
        embeddings: Embedding object handed to ``Pinecone.from_existing_index``.

    Returns:
        The object returned by ``Pinecone.from_existing_index``.
    """
    pinecone.init(environment=pinecone_environment, api_key=pinecone_apikey)
    return Pinecone.from_existing_index(pinecone_index_name, embeddings)
114
+
115
+
116
def similar_docs_hf(query, final_docs_list, k):
    """Rank documents by similarity to ``query`` via the Hugging Face Inference API.

    The texts are scored against ``query`` by the hosted
    ``sentence-transformers/all-MiniLM-L6-v2`` model and returned sorted from
    highest to lowest score.

    Args:
        query: The source sentence to compare against.
        final_docs_list: Sequence of candidate texts.
        k: Currently unused; every document is returned (the original
            top-k slicing was commented out). Kept for interface
            compatibility.

    Returns:
        ``(scores, documents)``: two tuples of equal length, ordered by
        descending score. Both are empty when ``final_docs_list`` is empty.

    Raises:
        RuntimeError: If no API token is configured, or the API response is
            not a list with one score per document.
        requests.HTTPError: If the API answers with an error status code.
    """
    import os

    # Nothing to rank: avoid a pointless request and the unpacking error that
    # ``zip(*pairs)`` raises on an empty list.
    if not final_docs_list:
        return (), ()

    # The token must come from the environment; secrets do not belong in
    # source control.
    hf_key = os.environ.get("HUGGINGFACEHUB_API_TOKEN") or os.environ.get("HF_TOKEN")
    if not hf_key:
        raise RuntimeError(
            "Hugging Face API token not found; set HUGGINGFACEHUB_API_TOKEN "
            "(or HF_TOKEN) in the environment."
        )

    headers = {"Authorization": f"Bearer {hf_key}"}
    api_url = "https://api-inference.huggingface.co/models/sentence-transformers/all-MiniLM-L6-v2"
    payload = {
        "inputs": {
            "source_sentence": query,
            "sentences": final_docs_list,
        }
    }

    # A timeout keeps a stalled connection from blocking the caller forever.
    response = requests.post(api_url, headers=headers, json=payload, timeout=30)
    response.raise_for_status()

    score_list = response.json()
    # On failure the API can answer with a JSON object instead of a list of
    # scores; zipping that against the documents would silently produce
    # garbage pairs.
    if not isinstance(score_list, list) or len(score_list) != len(final_docs_list):
        raise RuntimeError(
            f"Unexpected response from Hugging Face similarity API: {score_list!r}"
        )

    # Sort (score, document) pairs by score only, highest first.
    pairs = sorted(
        zip(score_list, final_docs_list),
        key=lambda pair: pair[0],
        reverse=True,
    )
    sorted_scores, sorted_docs = zip(*pairs)
    return sorted_scores, sorted_docs
143
+
144
+
145
+ #Function to help us get relevant documents from vector store - based on user input
146
def similar_docs(query, k, pinecone_apikey, pinecone_environment, pinecone_index_name, embeddings, unique_id):
    """Search the Pinecone index for the documents most similar to ``query``.

    Args:
        query: Search text.
        k: Number of results requested; converted with ``int()``.
        pinecone_apikey: API key passed to ``pinecone.init``.
        pinecone_environment: Environment name passed to ``pinecone.init``.
        pinecone_index_name: Name of the index to search.
        embeddings: Embedding object used to open the index.
        unique_id: Value used as the ``"unique_id"`` filter for the search.

    Returns:
        Whatever ``similarity_search_with_score`` returns for the query.
    """
    pinecone.init(environment=pinecone_environment, api_key=pinecone_apikey)

    vector_store = pull_from_pinecone(
        pinecone_apikey,
        pinecone_environment,
        pinecone_index_name,
        embeddings,
    )
    # Third positional argument: restrict matches to this unique_id.
    id_filter = {"unique_id": unique_id}
    return vector_store.similarity_search_with_score(query, int(k), id_filter)
159
+
160
+
161
def get_score(relevant_docs):
    """Return the second element (index 1) of every entry in ``relevant_docs``.

    Entries are indexable pairs; callers in this module pass
    ``(document, score)`` results, so this yields the list of scores.
    """
    return [entry[1] for entry in relevant_docs]
167
+
168
+
169
def metadata_filename(document):
    """Extract ``name='...'`` values from each result's ``"name"`` metadata.

    For every entry, the ``"name"`` metadata of its first element is turned
    into a string and searched for ``name='<value>'`` occurrences.

    Returns:
        A list with one inner list per entry, holding all captured values
        (empty when the metadata string contains no such pattern).
    """
    name_pattern = re.compile(r"name=\'(.*?)\'")
    return [
        name_pattern.findall(str(hit[0].metadata["name"]))
        for hit in document
    ]
180
+
181
def docs_content(relevant_docs):
    """Return the ``page_content`` of the first element of every entry."""
    return [hit[0].page_content for hit in relevant_docs]
187
+
188
def docs_summary(relevant_docs):
    """Return the ``page_content`` of the first element of every entry.

    Despite the name, no summarisation is performed: the page contents are
    collected and returned unchanged, in order.
    """
    page_texts = [hit[0].page_content for hit in relevant_docs]
    # The second pass of the original was a plain copy of the collected texts.
    return list(page_texts)
198
+
199
+
200
# Summarization pipelines keyed by model name, so tokenizer and model weights
# are loaded once per process instead of on every call.
_HF_SUMMARIZER_CACHE = {}


def _load_hf_summarizer(model_name):
    """Return the cached summarization pipeline for ``model_name``, building it on first use."""
    if model_name not in _HF_SUMMARIZER_CACHE:
        tokenizer = BertTokenizerFast.from_pretrained(model_name)
        model = BertLMHeadModel.from_pretrained(model_name)
        _HF_SUMMARIZER_CACHE[model_name] = pipeline(
            'summarization', model=model, tokenizer=tokenizer
        )
    return _HF_SUMMARIZER_CACHE[model_name]


def get_summary_hf(target):
    """Summarise ``target`` with a local Hugging Face summarization pipeline.

    Args:
        target: Object to summarise; it is converted with ``str()`` first.

    Returns:
        The raw pipeline output for the text, generated with
        ``max_length=150``, ``min_length=25`` and ``do_sample=False``.
    """
    # NOTE(review): "bert-base-uncased" loaded as BertLMHeadModel is not a
    # checkpoint trained for summarization. Confirm this is intended, or
    # switch to a seq2seq summarization model.
    summarizer = _load_hf_summarizer("bert-base-uncased")
    return summarizer(str(target), max_length=150, min_length=25, do_sample=False)
217
+
218
+
219
+ # def get_summary_hf( document ):
220
+
221
+ # HF_KEY = "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"
222
+ # API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-cnn"
223
+ # headers = {"Authorization": f"Bearer {HF_KEY}"}
224
+ # payload = {
225
+ # "inputs": {
226
+ # "inputs": document ,
227
+ # "parameters": {"do_sample": False}
228
+ # }
229
+ # }
230
+
231
+ # response = requests.post(API_URL, headers=headers, json=payload)
232
+ # return response.json()
233
+
234
+ # Helps us get the summary of a document
235
+
236
+
237
def get_summary(current_doc):
    """Summarise one document with an OpenAI-backed map-reduce chain.

    Args:
        current_doc: The document to summarise; it is passed to the chain
            wrapped in a single-element list.

    Returns:
        The value returned by the chain's ``run`` call.
    """
    # temperature=0 is passed to the OpenAI LLM wrapper.
    summarize_chain = load_summarize_chain(OpenAI(temperature=0), chain_type="map_reduce")
    return summarize_chain.run([current_doc])
265
+
266
+
267
+ # client = OpenAI()
268
+ # response = client.chat.completions.create(
269
+ # model="gpt-3.5-turbo",
270
+ # messages=[
271
+ # {"role": "system", "content": f"{current_doc}" },
272
+ # {"role": "user", "content": "Summarize the following text: '{text_to_summarize}'"},
273
+ # ])
274
+
275
+ # return response['choices'][0]['message']['content']
276
+ #