Sawon2023 committed on
Commit
38b91e9
1 Parent(s): a0ca735

Updated the file reading for the app

Browse files

Error Fix: https://stackoverflow.com/questions/51337167/typeerror-stat-path-should-be-string-bytes-os-pathlike-or-integer-not-io-t

Files changed (1) hide show
  1. pdftoqa_generator.py +22 -16
pdftoqa_generator.py CHANGED
@@ -11,29 +11,35 @@ from langchain.text_splitter import (
11
  RecursiveCharacterTextSplitter,
12
  )
13
  from tqdm import tqdm
 
14
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
15
 
16
  os.environ["OPENAI_API_KEY"] = "sk-"
17
 
18
 
19
  def pdf_parser(file_path):
20
- pdf_loader = PyPDFLoader(file_path)
 
 
 
21
 
22
- documents = pdf_loader.load()
23
- documents_text = [d.page_content for d in documents]
24
-
25
- text_splitter = RecursiveCharacterTextSplitter(
26
- # Set a really small chunk size, just to show.
27
- chunk_size=600,
28
- chunk_overlap=200,
29
- length_function=len,
30
- is_separator_regex=False,
31
- )
32
-
33
- # Split the text into chunks
34
- texts = text_splitter.create_documents(documents_text)
35
-
36
- return texts
 
 
37
 
38
 
39
  def qa_generator(texts):
 
11
  RecursiveCharacterTextSplitter,
12
  )
13
  from tqdm import tqdm
14
+ from tempfile import NamedTemporaryFile
15
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
16
 
17
  os.environ["OPENAI_API_KEY"] = "sk-"
18
 
19
 
20
  def pdf_parser(file_path):
21
+ bytes_data = uploaded_file.read()
22
+ with NamedTemporaryFile(delete=False) as tmp: # open a named temporary file
23
+ tmp.write(bytes_data) # Write data from the uploaded file into it
24
+ pdf_loader = PyPDFLoader(tmp.name) # <---- now it works!
25
 
26
+ #pdf_loader = PyPDFLoader(file_path) only for file path offline
27
+
28
+ documents = pdf_loader.load()
29
+ documents_text = [d.page_content for d in documents]
30
+
31
+ text_splitter = RecursiveCharacterTextSplitter(
32
+ # Set a really small chunk size, just to show.
33
+ chunk_size=600,
34
+ chunk_overlap=200,
35
+ length_function=len,
36
+ is_separator_regex=False,
37
+ )
38
+
39
+ # Split the text into chunks
40
+ texts = text_splitter.create_documents(documents_text)
41
+ os.remove(tmp.name) # remove temp file
42
+ return texts
43
 
44
 
45
  def qa_generator(texts):