YchKhan committed
Commit 3219c03
1 Parent(s): 78aefdd

Create app.py

Files changed (1): app.py (+544, -0)
app.py ADDED
@@ -0,0 +1,544 @@
+ import langchain
+ from langchain.embeddings import SentenceTransformerEmbeddings
+ from langchain.chains.question_answering import load_qa_chain
+ from langchain.document_loaders import UnstructuredPDFLoader, UnstructuredWordDocumentLoader
+ from langchain.indexes import VectorstoreIndexCreator
+ from langchain.vectorstores import FAISS
+ from langchain import HuggingFaceHub
+ from langchain import PromptTemplate
+ from langchain.chat_models import ChatOpenAI
+ from zipfile import ZipFile
+ import gradio as gr
+ import openpyxl
+ import os
+ import shutil
+ from langchain.schema import Document
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ import tiktoken
+ import secrets
+ import openai
+ import time
+ from duckduckgo_search import DDGS
+ import requests
+ import tempfile
+ import pandas as pd
+ import numpy as np
+ from openai import OpenAI
+
+
+ tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
+
+ # create the length function
+ def tiktoken_len(text):
+     tokens = tokenizer.encode(
+         text,
+         disallowed_special=()
+     )
+     return len(tokens)
+
+ text_splitter = RecursiveCharacterTextSplitter(
+     chunk_size=600,
+     chunk_overlap=200,
+     length_function=tiktoken_len,
+     separators=["\n\n", "\n", " ", ""]
+ )
+
+ embeddings = SentenceTransformerEmbeddings(model_name="thenlper/gte-base")
+ foo = Document(page_content='foo is fou!', metadata={"source": 'foo source'})
+
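+ # Delete the on-disk FAISS folder and zip archive associated with a session id.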
+ def reset_database(ui_session_id):
+     session_id = f"PDFAISS-{ui_session_id}"
+     if 'drive' in session_id:
+         print("RESET DATABASE: session_id contains 'drive' !!")
+         return None
+
+     try:
+         shutil.rmtree(session_id)
+     except:
+         print(f'no {session_id} directory present')
+
+     try:
+         os.remove(f"{session_id}.zip")
+     except:
+         print(f"no {session_id}.zip present")
+
+     return None
+
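+ # Heuristic duplicate check: sum the top-1 similarity scores of the first few chunks
+ # against the existing index; a near-zero total means the document is already there.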
+ def is_duplicate(split_docs, db):
+     epsilon = 0.0
+     print(f"DUPLICATE: Treating: {split_docs[0].metadata['source'].split('/')[-1]}")
+     for i in range(min(3, len(split_docs))):
+         query = split_docs[i].page_content
+         docs = db.similarity_search_with_score(query, k=1)
+         _, score = docs[0]
+         epsilon += score
+     print(f"DUPLICATE: epsilon: {epsilon}")
+     return epsilon < 0.1
+
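+ # Build a FAISS index from the split chunks in batches of 10 and save it locally.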
+ def merge_split_docs_to_db(split_docs, db, progress, progress_step=0.1):
+     progress(progress_step, desc="merging docs")
+     if len(split_docs) == 0:
+         print("MERGE to db: NO docs!!")
+         return
+
+     filename = split_docs[0].metadata['source']
+     # if is_duplicate(split_docs,db): #todo handle duplicate management
+     #     print(f"MERGE: Document is duplicated: {filename}")
+     #     return
+     # print(f"MERGE: number of split docs: {len(split_docs)}")
+     batch = 10
+     db1 = None
+     for i in range(0, len(split_docs), batch):
+         progress(i/len(split_docs), desc=f"added {i} chunks of {len(split_docs)} chunks")
+         if db1:
+             db1.add_documents(split_docs[i:i+batch])
+         else:
+             db1 = FAISS.from_documents(split_docs[i:i+batch], embeddings)
+
+     db1.save_local(split_docs[-1].metadata["source"].split(".")[-1])  # create an index with the same name as the file
+     # db.merge_from(db1) #we do not merge anymore, instead, we create a new index for each file
+     return db1
+
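+ # File loaders: read a PDF / DOCX / TXT file, tag chunks with the source filename,
+ # split the text and hand the chunks to merge_split_docs_to_db.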
+ def merge_pdf_to_db(filename, session_folder, progress, progress_step=0.1):
+     progress_step += 0.05
+     progress(progress_step, 'unpacking pdf')
+     doc = UnstructuredPDFLoader(filename).load()
+     doc[0].metadata['source'] = filename.split('/')[-1]
+     split_docs = text_splitter.split_documents(doc)
+     progress_step += 0.3
+     progress(progress_step, 'pdf unpacked')
+     return merge_split_docs_to_db(split_docs, session_folder, progress, progress_step)
+
+ def merge_docx_to_db(filename, session_folder, progress, progress_step=0.1):
+     progress_step += 0.05
+     progress(progress_step, 'unpacking docx')
+     doc = UnstructuredWordDocumentLoader(filename).load()
+     doc[0].metadata['source'] = filename.split('/')[-1]
+     split_docs = text_splitter.split_documents(doc)
+     progress_step += 0.3
+     progress(progress_step, 'docx unpacked')
+     return merge_split_docs_to_db(split_docs, session_folder, progress, progress_step)
+
+ def merge_txt_to_db(filename, session_folder, progress, progress_step=0.1):
+     progress_step += 0.05
+     progress(progress_step, 'unpacking txt')
+     with open(filename) as f:
+         docs = text_splitter.split_text(f.read())
+         split_docs = [Document(page_content=doc, metadata={'source': filename.split('/')[-1]}) for doc in docs]
+     progress_step += 0.3
+     progress(progress_step, 'txt unpacked')
+     return merge_split_docs_to_db(split_docs, session_folder, progress, progress_step)
+
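+ # Unpack an uploaded zip: merge a contained FAISS index directly, otherwise embed each supported file it contains.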
+ def unpack_zip_file(filename, db, progress):
+     with ZipFile(filename, 'r') as zipObj:
+         contents = zipObj.namelist()
+     print(f"unpack zip: contents: {contents}")
+     tmp_directory = filename.split('/')[-1].split('.')[-2]
+     shutil.unpack_archive(filename, tmp_directory)
+
+     if 'index.faiss' in [item.lower() for item in contents]:
+         db2 = FAISS.load_local(tmp_directory, embeddings)
+         db.merge_from(db2)
+         return db
+
+     for file in contents:
+         if file.lower().endswith('.docx'):
+             db = merge_docx_to_db(f"{tmp_directory}/{file}", db, progress)
+         if file.lower().endswith('.pdf'):
+             db = merge_pdf_to_db(f"{tmp_directory}/{file}", db, progress)
+         if file.lower().endswith('.txt'):
+             db = merge_txt_to_db(f"{tmp_directory}/{file}", db, progress)
+     return db
+
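+ # Session archive helpers: restore a previously downloaded database zip and re-zip a session folder for download.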
+ def unzip_db(filename, ui_session_id):
+     with ZipFile(filename, 'r') as zipObj:
+         contents = zipObj.namelist()
+     print(f"unzip: contents: {contents}")
+     tmp_directory = f"PDFAISS-{ui_session_id}"
+     shutil.unpack_archive(filename, tmp_directory)
+
+ def add_files_to_zip(session_id):
+     zip_file_name = f"{session_id}.zip"
+     with ZipFile(zip_file_name, "w") as zipObj:
+         for root, dirs, files in os.walk(session_id):
+             for file_name in files:
+                 file_path = os.path.join(root, file_name)
+                 arcname = os.path.relpath(file_path, session_id)
+                 zipObj.write(file_path, arcname)
+
+ ## Search files functions ##
+
+ def search_docs(topic, max_references):
+     print(f"SEARCH PDF : {topic}")
+     doc_list = []
+     with DDGS() as ddgs:
+         i = 0
+         for r in ddgs.text('{} filetype:pdf'.format(topic), region='wt-wt', safesearch='On', timelimit='n'):
+             #doc_list.append(str(r))
+             if i >= max_references:
+                 break
+             doc_list.append("TITLE : " + r['title'] + " -- BODY : " + r['body'] + " -- URL : " + r['href'])
+             i += 1
+     return doc_list
+
+
+ def store_files(references, ret_names=False):
+     url_list = []
+     temp_files = []
+     for ref in references:
+         url_list.append(ref.split(" ")[-1])
+     for url in url_list:
+         response = requests.get(url)
+         if response.status_code == 200:
+             filename = url.split('/')[-1]
+             if filename.split('.')[-1] == 'pdf':
+                 filename = filename[:-4]
+                 print('File name.pdf :', filename)
+                 temp_file = tempfile.NamedTemporaryFile(delete=False, prefix=filename, suffix='.pdf')
+             else:
+                 print('File name :', filename)
+                 temp_file = tempfile.NamedTemporaryFile(delete=False, prefix=filename, suffix='.pdf')
+             temp_file.write(response.content)
+             temp_file.close()
+             if ret_names:
+                 temp_files.append(temp_file.name)
+             else:
+                 temp_files.append(temp_file)
+
+     return temp_files
+
+ ## Summary functions ##
+
+ ## Load each doc from the vector store
+ def load_docs(ui_session_id):
+     session_id_global_db = f"PDFAISS-{ui_session_id}"
+     try:
+         db = FAISS.load_local(session_id_global_db, embeddings)
+         print("load_docs after loading global db:", session_id_global_db, len(db.index_to_docstore_id))
+     except:
+         return f"SESSION: {session_id_global_db} database does not exist", "", ""
+     docs = []
+     for i in range(1, len(db.index_to_docstore_id)):
+         docs.append(db.docstore.search(db.index_to_docstore_id[i]))
+     return docs
+
+
+ # summarize with gpt 3.5 turbo
+ def summarize_gpt(doc, system='provide a summary of the following document: ', first_tokens=600):
+     doc = doc.replace('\n\n\n', '').replace('---', '').replace('...', '').replace('___', '')
+     encoded = tokenizer.encode(doc)
+     print("\n TOKENIZED : ", encoded)
+     decoded = tokenizer.decode(encoded[:min(first_tokens, len(encoded))])
+     print("\n DOC SHORTEN", min(first_tokens, len(encoded)), " : ", decoded)
+     completion = openai.ChatCompletion.create(
+         model="gpt-3.5-turbo",
+         messages=[
+             {"role": "system", "content": system},
+             {"role": "user", "content": decoded}
+         ]
+     )
+     return completion.choices[0].message["content"]
+
+
+ def summarize_docs_generator(apikey_input, session_id):
+     openai.api_key = apikey_input
+     docs = load_docs(session_id)
+     print("################# DOCS LOADED ##################", "docs type : ", type(docs[0]))
+
+     try:
+         fail = docs[0].page_content
+     except:
+         return docs[0]
+
+     source = ""
+     summaries = ""
+     i = 0
+     while i < len(docs):
+         doc = docs[i]
+         unique_doc = ""
+         if source != doc.metadata:
+             unique_doc = ''.join([doc.page_content for doc in docs[i:i+3]])
+             print("\n\n****Open AI API called****\n\n")
+             if i == 0:
+                 try:
+                     summary = summarize_gpt(unique_doc)
+                 except:
+                     return f"ERROR : Try checking the validity of the provided OpenAI API Key"
+             else:
+                 try:
+                     summary = summarize_gpt(unique_doc)
+                 except:
+                     print(f"ERROR : There was an error but it is not linked with the validity of api key, taking a 20s nap")
+                     yield summaries + f"\n\n °°° OpenAI error, please wait 20 sec of cooldown. °°°"
+                     time.sleep(20)
+                     summary = summarize_gpt(unique_doc)
+
+             print("SUMMARY : ", summary)
+             summaries += f"Source : {doc.metadata['source'].split('/')[-1]}\n{summary} \n\n"
+             source = doc.metadata
+             yield summaries
+         i += 1
+     yield summaries
+
+
+ def summarize_docs(apikey_input, session_id):
+     gen = summarize_docs_generator(apikey_input, session_id)
+     while True:
+         try:
+             yield str(next(gen))
+         except StopIteration:
+             return
+
+ #### UI Functions ####
+
+ def update_df(ui_session_id):
+     df = pd.DataFrame(columns=["File name", "Question 1"])
+     session_folder = f"PDFAISS-{ui_session_id}"
+     file_names = os.listdir(session_folder)
+     for i, file_name in enumerate(file_names):
+         new_row = {'File name': str(file_name), 'Question': " ", 'Generated answer': " ", 'Sources': " "}
+         df.loc[i] = new_row
+     return df
+
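+ # Main ingestion entry point used by the UI: create the session folder if needed, embed each
+ # uploaded file into its own FAISS index, move the original file into a store/ subfolder,
+ # then zip the session folder and refresh the Q&A dataframe.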
+ def embed_files(files, ui_session_id, progress=gr.Progress(), progress_step=0.05):
+     print(files)
+     progress(progress_step, desc="Starting...")
+     split_docs = []
+     if len(ui_session_id) == 0:
+         ui_session_id = secrets.token_urlsafe(16)
+     session_folder = f"PDFAISS-{ui_session_id}"
+
+     if os.path.exists(session_folder) and os.path.isdir(session_folder):
+         databases = os.listdir(session_folder)
+         # db = FAISS.load_local(databases[0],embeddings)
+     else:
+         try:
+             os.makedirs(session_folder)
+             print(f"The folder '{session_folder}' has been created.")
+         except OSError as e:
+             print(f"Failed to create the folder '{session_folder}': {e}")
+         # db = FAISS.from_documents([foo], embeddings)
+         # db.save_local(session_id)
+         # print(f"SESSION: {session_id} database created")
+
+     #print("EMBEDDED, before embeddeding: ",session_id,len(db.index_to_docstore_id))
+     for file_id, file in enumerate(files):
+         print("ID : ", file_id, "FILE : ", file)
+         file_type = file.name.split('.')[-1].lower()
+         source = file.name.split('/')[-1]
+         print(f"current file: {source}")
+         progress(file_id/len(files), desc=f"Treating {source}")
+
+         if file_type == 'zip':
+             unzip_db(file.name, ui_session_id)
+             add_files_to_zip(session_folder)
+             return f"{session_folder}.zip", ui_session_id, update_df(ui_session_id)
+
+         db2 = None  # keep db2 defined even when the file type is not handled below
+         if file_type == 'pdf':
+             db2 = merge_pdf_to_db(file.name, session_folder, progress)
+
+         if file_type == 'txt':
+             db2 = merge_txt_to_db(file.name, session_folder, progress)
+
+         if file_type == 'docx':
+             db2 = merge_docx_to_db(file.name, session_folder, progress)
+
+         if db2 is not None:
+             # db = db2
+             # db.save_local(session_id)
+             db2.save_local(f"{session_folder}/{source}")
+             ### move file to store ###
+             progress(progress_step, desc='moving file to store')
+             directory_path = f"{session_folder}/{source}/store/"
+             if not os.path.exists(directory_path):
+                 os.makedirs(directory_path)
+             try:
+                 shutil.move(file.name, directory_path)
+             except:
+                 pass
+
+     ### load the updated db and zip it ###
+     progress(progress_step, desc='loading db')
+     # db = FAISS.load_local(session_id,embeddings)
+     # print("EMBEDDED, after embeddeding: ",session_id,len(db.index_to_docstore_id))
+     progress(progress_step, desc='zipping db for download')
+     add_files_to_zip(session_folder)
+     print(f"EMBEDDED: db zipped")
+     progress(progress_step, desc='db zipped')
+
+
+     return f"{session_folder}.zip", ui_session_id, update_df(ui_session_id)
+
+
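+ # Download the selected web references and feed them through embed_files.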
+ def add_to_db(references, ui_session_id):
+     files = store_files(references)
+     return embed_files(files, ui_session_id)
+
+ def export_files(references):
+     files = store_files(references, ret_names=True)
+     #paths = [file.name for file in files]
+     return files
+
+
+ def display_docs(docs):
+     output_str = ''
+     for i, doc in enumerate(docs):
+         source = doc.metadata['source'].split('/')[-1]
+         output_str += f"Ref: {i+1}\n{repr(doc.page_content)}\nSource: {source}\n\n"
+     return output_str
+
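+ # Retrieve the most similar chunks for the query; with an API key, answer via a "stuff" QA chain,
+ # otherwise return only the references.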
+ def ask_gpt(query, apikey, history, ui_session_id):
+     session_id = f"PDFAISS-{ui_session_id}"
+     try:
+         db = FAISS.load_local(session_id, embeddings)
+         print("ASKGPT after loading", session_id, len(db.index_to_docstore_id))
+     except:
+         print(f"SESSION: {session_id} database does not exist")
+         return f"SESSION: {session_id} database does not exist", "", ""
+
+     docs = db.similarity_search(query)
+     history += f"[query]\n{query}\n[answer]\n"
+     if apikey == "":
+         history += f"None\n[references]\n{display_docs(docs)}\n\n"
+         return "No answer from GPT", display_docs(docs), history
+     else:
+         llm = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo', openai_api_key=apikey)
+         chain = load_qa_chain(llm, chain_type="stuff")
+         answer = chain.run(input_documents=docs, question=query, verbose=True)
+         history += f"{answer}\n[references]\n{display_docs(docs)}\n\n"
+         return answer, display_docs(docs), history
+
+
+ # tmp functions to move somewhere else
+
+
+ # new api query format
+ def gpt_answer(api_key, query, model="gpt-3.5-turbo-1106", system_prompt="Use the provided References to answer the user Question. If the provided documents do not contain the elements to answer the user question, just say 'No information.'."):
+     client = OpenAI(
+         api_key=api_key,
+     )
+
+     chat_completion = client.chat.completions.create(
+         messages=[
+             {"role": "system", "content": system_prompt},
+             {"role": "user", "content": query},
+         ],
+         model=model,
+     )
+     return chat_completion.choices[0].message.content
+
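+ # Answer the question held in the last dataframe column against each file's own FAISS index;
+ # retrieved references are written to a text file and linked as "Sources" in the answer.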
+ def ask_df(df, api_key, model, ui_session_id):
+     answers = []
+     session_folder = f"PDFAISS-{ui_session_id}"
+     question_column = df.columns[-1]
+     if len(df.at[0, question_column]) < 2:  # df.columns[-1] ==> last column label, last question
+         return df
+     for index, row in df.iterrows():
+         question = row.iloc[-1]
+         print(f"Question: {question}")
+         if len(question) < 2:
+             question = df.at[index-1, question_column].split("\n---\n")[0]
+         db_folder = "/".join([session_folder, row["File name"]])
+         db = FAISS.load_local(db_folder, embeddings)
+         docs = db.similarity_search(question)
+         references = '\n******************************\n'.join([d.page_content for d in docs])
+         print(f"REFERENCES: {references}")
+         sources_file = f"{secrets.token_urlsafe(16)}.txt"
+         with open(sources_file, 'w') as file:
+             file.write(references)
+         try:
+             source = f"https://organizedprogrammers-pdfaiss-2-3-4.hf.space/file={sources_file}"
+         except:
+             source = "ERROR WHILE GETTING THE SOURCES FILE"
+         query = f"## USER QUESTION:\n{question}\n\n## REFERENCES:\n{references}\n\nANSWER:\n\n"
+         try:
+             answer = gpt_answer(api_key, query, model)
+         except Exception as e:
+             answer = "ERROR WHILE ANSWERING THE QUESTION"
+             print("ERROR: ", e)
+         complete_answer = "\n---\n".join(["## " + question, answer, "[Sources](" + source + ")"])
+         answers.append(complete_answer)
+         print(complete_answer)
+     df[question_column] = answers
+     return df
+
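+ # Export the Q&A dataframe in the selected format (xlsx, pkl or csv).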
+ def export_df(df, ftype):
+     fname = secrets.token_urlsafe(16)
+     if ftype == "xlsx":
+         df.to_excel(f"{fname}.xlsx", index=False)
+         return f"{fname}.xlsx"
+     if ftype == "pkl":
+         df.to_pickle(f"{fname}.pkl")
+         return f"{fname}.pkl"
+     if ftype == "csv":
+         df.to_csv(f"{fname}.csv", index=False)
+         return f"{fname}.csv"
+
+
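+ # Gradio UI: API key accordion, upload/search tab with the Q&A dataframe, a summarize tab and an ask tab.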
+ with gr.Blocks() as demo:
+     gr.Markdown("Upload your documents and question them.")
+     with gr.Accordion("Open to enter your API key", open=False):
+         apikey_input = gr.Textbox(placeholder="Type here your OpenAI API key to use Summarization and Q&A", label="OpenAI API Key", type='password')
+         dd_model = gr.Dropdown(["gpt-3.5-turbo-1106", "gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4-1106-preview", "gpt-4", "gpt-4-32k"], value="gpt-3.5-turbo-1106", label='List of models', allow_custom_value=True, scale=1)
+
+     with gr.Tab("Upload PDF & TXT"):
+         with gr.Accordion("Get files from the web", open=False):
+             with gr.Column():
+                 topic_input = gr.Textbox(placeholder="Type your research", label="Research")
+                 with gr.Row():
+                     max_files = gr.Slider(1, 30, step=1, value=10, label="Maximum number of files")
+                     btn_search = gr.Button("Search")
+                 dd_documents = gr.Dropdown(label='List of documents', info='Click to remove from selection', multiselect=True)
+                 with gr.Row():
+                     btn_dl = gr.Button("Add these files to the Database")
+                     btn_export = gr.Button("⬇ Export selected files ⬇")
+
+         tb_session_id = gr.Textbox(label='session id')
+         docs_input = gr.File(file_count="multiple", file_types=[".txt", ".pdf", ".zip", ".docx"])
+         db_output = gr.File(label="Download zipped database")
+         btn_generate_db = gr.Button("Generate database")
+         btn_reset_db = gr.Button("Reset database")
+         df_qna = gr.Dataframe(interactive=True, datatype="markdown")
+         with gr.Row():
+             btn_clear_df = gr.Button("Clear df")
+             btn_fill_answers = gr.Button("Fill table with generated answers")
+         with gr.Accordion("Export dataframe", open=False):
+             with gr.Row():
+                 btn_export_df = gr.Button("Export df as", scale=1)
+                 r_format = gr.Radio(["xlsx", "pkl", "csv"], label="File type", value="xlsx", scale=2)
+                 file_df = gr.File(scale=1)
+
+         btn_clear_df.click(update_df, inputs=[tb_session_id], outputs=df_qna)
+         btn_fill_answers.click(ask_df, inputs=[df_qna, apikey_input, dd_model, tb_session_id], outputs=df_qna)
+         btn_export_df.click(export_df, inputs=[df_qna, r_format], outputs=[file_df])
+
+     with gr.Tab("Summarize PDF"):
+         with gr.Column():
+             summary_output = gr.Textbox(label='Summarized files')
+             btn_summary = gr.Button("Summarize")
+
+     with gr.Tab("Ask PDF"):
+         with gr.Column():
+             query_input = gr.Textbox(placeholder="Type your question", label="Question")
+             btn_askGPT = gr.Button("Answer")
+             answer_output = gr.Textbox(label='GPT 3.5 answer')
+             sources = gr.Textbox(label='Sources')
+             history = gr.Textbox(label='History')
+
+     topic_input.submit(search_docs, inputs=[topic_input, max_files], outputs=dd_documents)
+     btn_search.click(search_docs, inputs=[topic_input, max_files], outputs=dd_documents)
+     btn_dl.click(add_to_db, inputs=[dd_documents, tb_session_id], outputs=[db_output, tb_session_id])
+     btn_export.click(export_files, inputs=dd_documents, outputs=docs_input)
+     btn_generate_db.click(embed_files, inputs=[docs_input, tb_session_id], outputs=[db_output, tb_session_id, df_qna])
+     btn_reset_db.click(reset_database, inputs=[tb_session_id], outputs=[db_output])
+     btn_summary.click(summarize_docs, inputs=[apikey_input, tb_session_id], outputs=summary_output)
+     btn_askGPT.click(ask_gpt, inputs=[query_input, apikey_input, history, tb_session_id], outputs=[answer_output, sources, history])
+
+
+ # demo.queue(concurrency_count=10)
+ demo.launch(debug=False, share=False)