Spaces:
Runtime error
Runtime error
changes to app and sambaparse
Browse files
- utils/parsing/sambaparse.py +11 -0
utils/parsing/sambaparse.py
CHANGED
@@ -8,6 +8,7 @@ from dotenv import load_dotenv
|
|
8 |
from langchain.docstore.document import Document
|
9 |
import shutil
|
10 |
from langchain_community.document_loaders import PyMuPDFLoader
|
|
|
11 |
|
12 |
load_dotenv()
|
13 |
|
@@ -303,6 +304,16 @@ class SambaParse:
|
|
303 |
loader = PyMuPDFLoader(file_path)
|
304 |
docs = loader.load()
|
305 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
306 |
for doc in docs:
|
307 |
text = doc.page_content
|
308 |
metadata = doc.metadata
|
|
|
8 |
from langchain.docstore.document import Document
|
9 |
import shutil
|
10 |
from langchain_community.document_loaders import PyMuPDFLoader
|
11 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
12 |
|
13 |
load_dotenv()
|
14 |
|
|
|
304 |
loader = PyMuPDFLoader(file_path)
|
305 |
docs = loader.load()
|
306 |
|
307 |
+
splitter = RecursiveCharacterTextSplitter(
|
308 |
+
chunk_size=1000,
|
309 |
+
chunk_overlap=200,
|
310 |
+
length_function=len,
|
311 |
+
separators=['\n\n', '\n', ' ', ''],
|
312 |
+
is_separator_regex=False,
|
313 |
+
)
|
314 |
+
|
315 |
+
docs = splitter.split_documents(docs)
|
316 |
+
|
317 |
for doc in docs:
|
318 |
text = doc.page_content
|
319 |
metadata = doc.metadata
|