petrojm committed on
Commit
2c44136
1 Parent(s): e18dfac

changes to app and sambaparse

Browse files
Files changed (1) hide show
  1. utils/parsing/sambaparse.py +11 -0
utils/parsing/sambaparse.py CHANGED
@@ -8,6 +8,7 @@ from dotenv import load_dotenv
8
  from langchain.docstore.document import Document
9
  import shutil
10
  from langchain_community.document_loaders import PyMuPDFLoader
 
11
 
12
  load_dotenv()
13
 
@@ -303,6 +304,16 @@ class SambaParse:
303
  loader = PyMuPDFLoader(file_path)
304
  docs = loader.load()
305
 
 
 
 
 
 
 
 
 
 
 
306
  for doc in docs:
307
  text = doc.page_content
308
  metadata = doc.metadata
 
8
  from langchain.docstore.document import Document
9
  import shutil
10
  from langchain_community.document_loaders import PyMuPDFLoader
11
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
12
 
13
  load_dotenv()
14
 
 
304
  loader = PyMuPDFLoader(file_path)
305
  docs = loader.load()
306
 
307
+ splitter = RecursiveCharacterTextSplitter(
308
+ chunk_size=1000,
309
+ chunk_overlap=200,
310
+ length_function=len,
311
+ separators=['\n\n', '\n', ' ', ''],
312
+ is_separator_regex=False,
313
+ )
314
+
315
+ docs = splitter.split_documents(docs)
316
+
317
  for doc in docs:
318
  text = doc.page_content
319
  metadata = doc.metadata