Spaces:

Sawon2023
/

llm-pdf-qa

Runtime error

Sawon2023 committed on Sep 19, 2023

Commit

38b91e9

•

1 Parent(s): a0ca735

Updated the file reading for the app

Error Fix: https://stackoverflow.com/questions/51337167/typeerror-stat-path-should-be-string-bytes-os-pathlike-or-integer-not-io-t

Files changed (1) hide show

pdftoqa_generator.py CHANGED Viewed

@@ -11,29 +11,35 @@ from langchain.text_splitter import (
     RecursiveCharacterTextSplitter,
 )
 from tqdm import tqdm
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 os.environ["OPENAI_API_KEY"] = "sk-"
 def pdf_parser(file_path):
-    pdf_loader = PyPDFLoader(file_path)
-    documents = pdf_loader.load()
-    documents_text = [d.page_content for d in documents]
-    text_splitter = RecursiveCharacterTextSplitter(
-        # chunk_size and chunk_overlap are measured with length_function (len), i.e. in characters.
-        chunk_size=600,
-        chunk_overlap=200,
-        length_function=len,
-        is_separator_regex=False,
-    )
-    # Split the text of every loaded document into overlapping chunks.
-    texts = text_splitter.create_documents(documents_text)
-    return texts
 def qa_generator(texts):

     RecursiveCharacterTextSplitter,
 )
 from tqdm import tqdm
+from tempfile import NamedTemporaryFile
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 os.environ["OPENAI_API_KEY"] = "sk-"
 def pdf_parser(file_path):
+    bytes_data = uploaded_file.read()
+    with NamedTemporaryFile(delete=False) as tmp:  # open a named temporary file
+        tmp.write(bytes_data)                      # Write data from the uploaded file into it
+        pdf_loader = PyPDFLoader(tmp.name)        # <---- now it works!
+        #pdf_loader = PyPDFLoader(file_path) only for file path offline
+        documents = pdf_loader.load()
+        documents_text = [d.page_content for d in documents]
+        text_splitter = RecursiveCharacterTextSplitter(
+            # Set a really small chunk size, just to show.
+            chunk_size=600,
+            chunk_overlap=200,
+            length_function=len,
+            is_separator_regex=False,
+        )
+        # Split the text into chunks
+        texts = text_splitter.create_documents(documents_text)
+        os.remove(tmp.name)                            # remove temp file
+        return texts
 def qa_generator(texts):