import json
import os
import re
import statistics
from tempfile import NamedTemporaryFile

import gradio as gr
import pandas as pd
from langchain.document_loaders import OnlinePDFLoader
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
)
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Placeholder key (truncated in the source); nothing in this file calls the OpenAI API.
os.environ["OPENAI_API_KEY"] = "sk-"
def pdf_parser(uploaded_file):
    # Alternative approach (disabled): copy the upload into a named temp file and
    # parse it with PyPDFLoader (would need `from langchain.document_loaders import PyPDFLoader`):
    #
    #   bytes_data = uploaded_file.read()
    #   with NamedTemporaryFile(delete=False) as tmp:
    #       tmp.write(bytes_data)
    #   pdf_loader = PyPDFLoader(tmp.name)
    #
    # Instead, load the PDF straight from the uploaded file's path; pattern from
    # https://huggingface.co/spaces/fffiloni/langchain-chat-with-pdf/blob/main/app.py
    pdf_loader = OnlinePDFLoader(uploaded_file.name)
    documents = pdf_loader.load()
    documents_text = [d.page_content for d in documents]
    # Split the text into overlapping ~600-character chunks so each one fits
    # comfortably in the T5 model's input window.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=600,
        chunk_overlap=200,
        length_function=len,
        is_separator_regex=False,
    )
    texts = text_splitter.create_documents(documents_text)
    # os.remove(tmp.name)  # only needed with the temp-file approach above
    return texts
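# Hypothetical smoke test (not part of the original Space): pdf_parser only needs
# an object exposing a .name path to a PDF, which is what Gradio's file component
# provides, so any stand-in object works for local testing.
#
#   from types import SimpleNamespace
#   chunks = pdf_parser(SimpleNamespace(name="sample.pdf"))
#   print(len(chunks), chunks[0].page_content[:80])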
def qa_generator(texts):
    # T5 model fine-tuned on SQuAD that emits "question <sep> answer" for a given context.
    question_tokenizer = AutoTokenizer.from_pretrained(
        "potsawee/t5-large-generation-squad-QuestionAnswer"
    )
    question_model = AutoModelForSeq2SeqLM.from_pretrained(
        "potsawee/t5-large-generation-squad-QuestionAnswer"
    )
    question_answer_dic = {}
    for chunk in tqdm(texts):
        context = chunk.page_content
        try:
            inputs = question_tokenizer(context, return_tensors="pt")
            outputs = question_model.generate(**inputs, max_length=100)
            # Decode with special tokens kept so the <sep> between question
            # and answer survives, then strip pad/eos manually.
            question_answer = question_tokenizer.decode(
                outputs[0], skip_special_tokens=False
            )
            question_answer = question_answer.replace(
                question_tokenizer.pad_token, ""
            ).replace(question_tokenizer.eos_token, "")
            question, answer = question_answer.split(question_tokenizer.sep_token)
            question_answer_dic[question.strip()] = answer.strip()
        except Exception as exc:
            # Skip chunks whose generation lacks a <sep> token or otherwise fails.
            print(f"Skipping chunk: {exc}")
    # Tabulate the pairs and return them as a list of records for the UI.
    qa_notes_df = pd.DataFrame(
        {
            "No": range(1, len(question_answer_dic) + 1),
            "Question": list(question_answer_dic.keys()),
            "Answer": list(question_answer_dic.values()),
        }
    )
    qa_notes_json = qa_notes_df.to_dict("records")
    return qa_notes_json
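# The Space's UI wiring is not shown in the source above; the sketch below is an
# assumption about how pdf_parser and qa_generator could be hooked into the
# imported gradio. The names generate_qa and demo are illustrative, not from the source.
def generate_qa(uploaded_file):
    texts = pdf_parser(uploaded_file)
    return qa_generator(texts)

demo = gr.Interface(
    fn=generate_qa,
    inputs=gr.File(label="Upload a PDF", file_types=[".pdf"]),
    outputs=gr.JSON(label="Generated Q&A pairs"),
    title="PDF Q&A generator",
)

if __name__ == "__main__":
    demo.launch()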