Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -17,6 +17,7 @@ from langchain_core.output_parsers import StrOutputParser
|
|
17 |
from langchain_community.document_loaders import PyMuPDFLoader
|
18 |
from langchain_openai import OpenAIEmbeddings
|
19 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
|
20 |
from langchain_core.prompts import ChatPromptTemplate
|
21 |
from langchain_openai import ChatOpenAI
|
22 |
import re
|
@@ -73,8 +74,8 @@ def summarize_pdf(pdf_file_path, num_clusters=10):
|
|
73 |
docs = loader.load()
|
74 |
full_text = "\n".join(doc.page_content for doc in docs)
|
75 |
cleaned_full_text = clean_text(remove_references(full_text))
|
76 |
-
|
77 |
-
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0, separators=["\n\n", "\n", ".", " "])
|
78 |
split_contents = text_splitter.split_text(cleaned_full_text)
|
79 |
embeddings = embeddings_model.embed_documents(split_contents)
|
80 |
|
@@ -103,8 +104,9 @@ def qa_pdf(pdf_file_path, query, num_clusters=5, similarity_threshold=0.6):
|
|
103 |
docs = loader.load()
|
104 |
full_text = "\n".join(doc.page_content for doc in docs)
|
105 |
cleaned_full_text = clean_text(remove_references(full_text))
|
|
|
106 |
|
107 |
-
text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=0, separators=["\n\n", "\n", ".", " "])
|
108 |
split_contents = text_splitter.split_text(cleaned_full_text)
|
109 |
embeddings = embeddings_model.embed_documents(split_contents)
|
110 |
|
|
|
17 |
from langchain_community.document_loaders import PyMuPDFLoader
|
18 |
from langchain_openai import OpenAIEmbeddings
|
19 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
20 |
+
from langchain_text_splitters import SpacyTextSplitter
|
21 |
from langchain_core.prompts import ChatPromptTemplate
|
22 |
from langchain_openai import ChatOpenAI
|
23 |
import re
|
|
|
74 |
docs = loader.load()
|
75 |
full_text = "\n".join(doc.page_content for doc in docs)
|
76 |
cleaned_full_text = clean_text(remove_references(full_text))
|
77 |
+
text_splitter = SpacyTextSplitter(chunk_size=500)
|
78 |
+
#text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0, separators=["\n\n", "\n", ".", " "])
|
79 |
split_contents = text_splitter.split_text(cleaned_full_text)
|
80 |
embeddings = embeddings_model.embed_documents(split_contents)
|
81 |
|
|
|
104 |
docs = loader.load()
|
105 |
full_text = "\n".join(doc.page_content for doc in docs)
|
106 |
cleaned_full_text = clean_text(remove_references(full_text))
|
107 |
+
text_splitter = SpacyTextSplitter(chunk_size=500)
|
108 |
|
109 |
+
#text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=0, separators=["\n\n", "\n", ".", " "])
|
110 |
split_contents = text_splitter.split_text(cleaned_full_text)
|
111 |
embeddings = embeddings_model.embed_documents(split_contents)
|
112 |
|