Commit 647b702
Parent(s): 91c6b27
dsmultimedika committed

fix : improve error llamaparse

Files changed:
- api/function.py +8 -8
- core/book_enabler/__init__.py +0 -0
- core/journal_reading/__init__.py +0 -0
- core/journal_reading/extractor.py +0 -8
- core/journal_reading/prompt.py +0 -0
- core/journal_reading/upload.py +0 -86
- core/module_creator/__init__.py +0 -0
- core/summarization/__init__.py +0 -0
- core/summarization/summarizer.py +0 -135
- script/document_uploader.py +6 -34
- service/reader_v4.py +7 -2

api/function.py
CHANGED
@@ -29,15 +29,9 @@ async def data_ingestion(reference, file: UploadFile) -> Any:
         user_id="admin_book_uploaded",
     )
 
-    # # Upload to AWS
-    file_name = f"{reference['title']}"
-    aws_loader = Loader()
-
-    file_obj = file
-    aws_loader.upload_to_s3(file_obj, file_name)
-
     uploader = Uploader(reference, file)
-
+    nodes_with_metadata, file_stream = await uploader.process_documents()
+
     nodes_with_metadata = await uploader.process_documents()
     if isinstance(nodes_with_metadata, JSONResponse):
         return nodes_with_metadata  # Return the error response directly
@@ -45,6 +39,12 @@ async def data_ingestion(reference, file: UploadFile) -> Any:
     # Build indexes using IndexManager
     index = IndexManager()
     index.build_indexes(nodes_with_metadata)
+
+    # Upload AWS
+    file_name = f"{reference['title']}"
+    aws_loader = Loader()
+
+    aws_loader.upload_to_s3(file_stream, file_name)
 
     return json.dumps(
         {"status": "success", "message": "Vector Index loaded successfully."}
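
Read as a whole, the api/function.py change moves the S3 upload to after parsing and indexing, and feeds it the file_stream now returned by process_documents() instead of the original UploadFile, whose body has already been consumed by the parser. Below is a minimal sketch of that intended flow, assuming the Uploader, IndexManager and Loader interfaces shown in the diff; the sketch folds the two process_documents() calls visible in the diff into one and checks for the error JSONResponse before unpacking the pair. It is an illustration, not the literal file contents.

# Sketch only. Uploader, IndexManager and Loader come from this application;
# their import paths are not visible in the diff and are omitted here.
import json

from fastapi import UploadFile
from fastapi.responses import JSONResponse


async def data_ingestion(reference: dict, file: UploadFile):
    uploader = Uploader(reference, file)

    # process_documents() now returns (nodes, BytesIO stream) on success,
    # or a JSONResponse describing the parsing error.
    result = await uploader.process_documents()
    if isinstance(result, JSONResponse):
        return result
    nodes_with_metadata, file_stream = result

    # Build the vector indexes first...
    index = IndexManager()
    index.build_indexes(nodes_with_metadata)

    # ...then upload the original bytes from the saved stream, since the
    # UploadFile itself was already read during parsing.
    aws_loader = Loader()
    aws_loader.upload_to_s3(file_stream, f"{reference['title']}")

    return json.dumps(
        {"status": "success", "message": "Vector Index loaded successfully."}
    )
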
core/book_enabler/__init__.py
DELETED
File without changes

core/journal_reading/__init__.py
DELETED
File without changes

core/journal_reading/extractor.py
DELETED
@@ -1,8 +0,0 @@
-
-
-
-
-class Extractor():
-    def __init__(self):
-        pass
-

core/journal_reading/prompt.py
DELETED
File without changes

core/journal_reading/upload.py
DELETED
@@ -1,86 +0,0 @@
-import os
-import nest_asyncio
-
-from llama_parse import LlamaParse
-from llama_index.core.node_parser import SimpleNodeParser
-from dotenv import load_dotenv
-from fastapi import UploadFile, HTTPException, File
-from fastapi.responses import JSONResponse
-import fitz
-
-from script.get_metadata import Metadata
-
-load_dotenv()
-nest_asyncio.apply()
-
-
-async def parse_journal(content: bytes, file_name: str):
-    """Parse the journal using LlamaParse."""
-    try:
-        # Initialize the parser
-        parser = LlamaParse(
-            api_key=os.getenv("LLAMA_PARSE_API_KEY"),
-            result_type="markdown",
-            max_timeout=5000,
-        )
-
-        # Load and process the document
-        llama_parse_documents = parser.load_data(
-            content, extra_info={"file_name": file_name}
-        )
-
-        return llama_parse_documents
-
-    except Exception as e:
-        return JSONResponse(status_code=400, content=f"Error processing file: {e}")
-
-
-async def extract_metadata(content: bytes):
-    """Extract metadata from the PDF content."""
-    try:
-        # Open the binary content with PyMuPDF
-        pdf_document = fitz.open("pdf", content)  # "pdf" specifies the format
-
-        # Extract metadata
-        metadata = pdf_document.metadata
-
-        # Prepare metadata dictionary with default values for missing fields
-        metadata_dict = {
-            "title": metadata.get("title", "N/A"),
-            "author": metadata.get("author", "N/A"),
-            "subject": metadata.get("subject", "N/A"),
-            "keywords": metadata.get("keywords", "N/A"),
-            "creation_date": metadata.get("created", "N/A"),
-            "modification_date": metadata.get("modified", "N/A"),
-        }
-
-        return metadata_dict
-
-    except Exception as e:
-        return JSONResponse(status_code=500, content=f"Error inputting metadata: {e}")
-
-
-async def upload_file(file: UploadFile = File(...)):
-    try:
-        # Read the binary content of the uploaded file once
-        content = await file.read()
-        # Parse the journal
-        parsed_documents = await parse_journal(content, file.filename)
-        # Extract metadata
-        metadata_dict = await extract_metadata(content)
-
-        print("Metadata Dictionary : \n\n", metadata_dict)
-
-        metadata_gen = Metadata(metadata_dict)
-        documents_with_metadata = metadata_gen.add_metadata(
-            parsed_documents, metadata_dict
-        )
-
-        print("Document with Metadata : \n\n", documents_with_metadata)
-        print("Banyak documents : \n", len(documents_with_metadata))
-
-        # Return both parsed documents and metadata
-        return {"status": "SUCCESS"}
-
-    except Exception as e:
-        return JSONResponse(status_code=500, content=f"Error processing file: {e}")
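
The deleted parse_journal above shows the error-handling pattern this commit is concerned with: LlamaParse failures are converted into a JSONResponse instead of propagating as exceptions, and callers check for that response type before using the result. A condensed sketch of the pattern follows, reusing only the calls visible in the deleted file; the caller function and the exact status codes and message wording are illustrative.

import os

from dotenv import load_dotenv
from fastapi import UploadFile
from fastapi.responses import JSONResponse
from llama_parse import LlamaParse

load_dotenv()


async def parse_journal(content: bytes, file_name: str):
    """Parse journal bytes with LlamaParse; return a JSONResponse on failure."""
    try:
        parser = LlamaParse(
            api_key=os.getenv("LLAMA_PARSE_API_KEY"),
            result_type="markdown",
            max_timeout=5000,
        )
        # extra_info carries the original file name alongside the raw bytes.
        return parser.load_data(content, extra_info={"file_name": file_name})
    except Exception as e:
        # Surface the parser error to the API caller instead of raising.
        return JSONResponse(status_code=400, content=f"Error processing file: {e}")


async def ingest(file: UploadFile):
    # Illustrative caller: propagate the error response unchanged.
    content = await file.read()
    documents = await parse_journal(content, file.filename)
    if isinstance(documents, JSONResponse):
        return documents
    return {"status": "SUCCESS", "documents": len(documents)}
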
core/module_creator/__init__.py
DELETED
File without changes

core/summarization/__init__.py
DELETED
File without changes

core/summarization/summarizer.py
DELETED
@@ -1,135 +0,0 @@
-from io import BytesIO
-import os
-import base64
-import fitz
-
-from fastapi.responses import JSONResponse
-from llama_index.core.vector_stores import (
-    MetadataFilter,
-    MetadataFilters,
-    FilterCondition,
-)
-
-from llama_index.core import load_index_from_storage
-from llama_index.core.storage import StorageContext
-from llama_index.llms.openai import OpenAI
-from core.parser import parse_topics_to_dict
-from llama_index.core.llms import ChatMessage
-from core.prompt import (
-    SYSTEM_TOPIC_TEMPLATE,
-    USER_TOPIC_TEMPLATE,
-    REFINED_GET_TOPIC_TEMPLATE,
-)
-
-# from langfuse.openai import openai
-
-
-class SummarizeGenerator:
-    def __init__(self, references):
-
-        self.references = references
-        self.llm = OpenAI(temperature=0, model="gpt-4o-mini", max_tokens=4096)
-
-    def extract_pages(self, content_table):
-        try:
-            content_bytes = content_table.file.read()
-            print(content_bytes)
-            # Open the PDF file
-            content_table = fitz.open(stream=content_bytes, filetype="pdf")
-            print(content_table)
-            # content_table = fitz.open(topics_image)
-        except Exception as e:
-            return JSONResponse(status_code=400, content=f"Error opening PDF file: {e}")
-
-        # Initialize a list to collect base64 encoded images
-        pix_encoded_combined = []
-
-        # Iterate over each page to extract images
-        for page_number in range(len(content_table)):
-            try:
-                page = content_table.load_page(page_number)
-                pix_encoded = self._extract_image_as_base64(page)
-                pix_encoded_combined.append(pix_encoded)
-                # print("pix encoded combined", pix_encoded_combined)
-
-            except Exception as e:
-                print(f"Error processing page {page_number}: {e}")
-                continue  # Skip to the next page if there's an error
-
-        if not pix_encoded_combined:
-            return JSONResponse(status_code=404, content="No images found in the PDF")
-
-        return pix_encoded_combined
-
-    def extract_content_table(self, content_table):
-        try:
-            images = self.extract_pages(content_table)
-
-            image_messages = [
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": f"data:image/jpeg;base64,{image}",
-                    },
-                }
-                for image in images
-            ]
-
-            messages = [
-                ChatMessage(
-                    role="system",
-                    content=[{"type": "text", "text": SYSTEM_TOPIC_TEMPLATE}],
-                ),
-                ChatMessage(
-                    role="user",
-                    content=[
-                        {"type": "text", "text": USER_TOPIC_TEMPLATE},
-                        *image_messages,
-                    ],
-                ),
-            ]
-
-            extractor_output = self.llm.chat(messages)
-            print("extractor output : ", extractor_output)
-            refined_extractor_output = self.llm.complete(
-                REFINED_GET_TOPIC_TEMPLATE.format(topics=str(extractor_output))
-            )
-
-            print("refined extractor output : ", str(refined_extractor_output))
-
-            extractor_dics = dict(parse_topics_to_dict(str(refined_extractor_output)))
-
-            return str(refined_extractor_output), extractor_dics
-
-        except Exception as e:
-            return JSONResponse(status_code=500, content=f"An error occurred: {e}")
-
-    def _extract_image_as_base64(self, page):
-        try:
-            pix = page.get_pixmap()
-            pix_bytes = pix.tobytes()
-            return base64.b64encode(pix_bytes).decode("utf-8")
-        except Exception as e:
-            return JSONResponse(status_code=500, content=f"Error extracting image: {e}")
-
-    def index_summarizer_engine(self, topic, subtopic, index):
-        filters = MetadataFilters(
-            filters=[
-                MetadataFilter(key="title", value=topic),
-                MetadataFilter(key="category", value=subtopic),
-            ],
-            condition=FilterCondition.AND,
-        )
-
-        # Create the QueryEngineTool with the index and filters
-        kwargs = {"similarity_top_k": 5, "filters": filters}
-
-        query_engine = index.as_query_engine(**kwargs)
-
-        return query_engine
-
-    def get_summarizer_engine(self, topic, subtopic):
-        pass
-
-    def prepare_summaries(self):
-        pass

script/document_uploader.py
CHANGED
@@ -1,8 +1,10 @@
+import logging
+import random
+
 from llama_index.core.ingestion import IngestionPipeline
 from llama_index.embeddings.openai import OpenAIEmbedding
 from config import PINECONE_CONFIG
 from pinecone.grpc import PineconeGRPC as Pinecone
-# from service.reader import Reader
 from script.get_metadata import Metadata
 from fastapi import UploadFile, status
 from fastapi.responses import JSONResponse
@@ -12,32 +14,17 @@ from llama_index.core.node_parser import (
     SemanticSplitterNodeParser,
 )
 from llama_index.core import Settings
-# from service.reader_v3 import upload_file
 from service.reader_v4 import upload_file
 
-# from script.get_topic import extract_topic
 
-import logging
-import random
 
 
 class Uploader:
-    # def __init__(self, reference, file: UploadFile, content_table: UploadFile):
     def __init__(self, reference, file: UploadFile):
         self.file = file
-        # self.content_table = content_table
-        # self.reader = Reader()
         self.reference = reference
         self.metadata = Metadata(reference)
 
-    # async def ingest_documents(self, file: UploadFile):
-    #     """Load documents from the storage path."""
-    #     documents = await self.reader.read_from_uploadfile(file)
-    #     print("Banyak document : ", len(documents))
-    #     print("document successfully ingested")
-
-    #     return documents
-
     def check_existing_metadata(self, pinecone_index, title, random_vector):
         try:
             result = pinecone_index.query(
@@ -56,20 +43,11 @@ class Uploader:
 
     async def process_documents(self):
         # Ingest documents
-
-
-        # Get metadata
-        # documents_with_metadata = self.metadata.apply_metadata(documents)
-        documents_with_metadata = await upload_file(self.reference, self.file)
+        documents_with_metadata, file_stream = await upload_file(self.reference, self.file)
 
         if isinstance(documents_with_metadata, JSONResponse):
             return documents_with_metadata  # Return the error response directly
 
-        # Get Topic
-        # topic_extractor = extract_topic(self.reference, self.content_table)
-        # document_filtered = self.filter_document(documents_with_metadata)
-
-        # embed_model = OpenAIEmbedding()
         embed_model = OpenAIEmbedding(model="text-embedding-3-large")
         Settings.embed_model = embed_model
         # Set up the ingestion pipeline
@@ -80,20 +58,14 @@ class Uploader:
                     breakpoint_percentile_threshold=95,
                     embed_model=embed_model,
                 ),
-                # topic_extractor,
             ]
         )
 
-        # splitter = SemanticSplitterNodeParser(
-        #     buffer_size=1, breakpoint_percentile_threshold=95, embed_model=embed_model
-        # )
-
         # Run the pipeline
        try:
            print("Pipeline processing completed with Semantic Spliter.")
            nodes_with_metadata = pipeline.run(documents=documents_with_metadata)
-
-            return nodes_with_metadata
+            return nodes_with_metadata, file_stream
 
        except Exception as e:
            try:
@@ -103,7 +75,7 @@ class Uploader:
                    documents_with_metadata
                )
                print("Pipeline processing completed with SentenceSplitter fallback.")
-                return nodes_with_metadata
+                return nodes_with_metadata, file_stream
            except Exception as fallback_error:
                # Log the second error and return JSONResponse for FastAPI
                logging.error(f"Error with SentenceSplitter fallback: {fallback_error}")

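
Collecting the scattered hunks: Uploader.process_documents now returns the parsed nodes together with the file_stream from upload_file, on both the semantic-splitter path and the SentenceSplitter fallback. The following condensed sketch restates that flow as a standalone coroutine under the interfaces shown in the diff; the fallback splitter's exact configuration and the response returned after the fallback fails are not fully visible here and are assumptions, and the sketch checks the error response before unpacking the pair.

import logging

from fastapi import UploadFile
from fastapi.responses import JSONResponse
from llama_index.core import Settings
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SemanticSplitterNodeParser, SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding

from service.reader_v4 import upload_file  # changed in this commit


async def process_documents(reference: dict, file: UploadFile):
    # upload_file() returns (documents, BytesIO stream) on success, or a JSONResponse.
    result = await upload_file(reference, file)
    if isinstance(result, JSONResponse):
        return result
    documents_with_metadata, file_stream = result

    embed_model = OpenAIEmbedding(model="text-embedding-3-large")
    Settings.embed_model = embed_model

    pipeline = IngestionPipeline(
        transformations=[
            SemanticSplitterNodeParser(
                buffer_size=1,
                breakpoint_percentile_threshold=95,
                embed_model=embed_model,
            ),
        ]
    )

    try:
        # Preferred path: semantic splitting.
        nodes_with_metadata = pipeline.run(documents=documents_with_metadata)
        return nodes_with_metadata, file_stream
    except Exception:
        try:
            # Fallback path: plain sentence splitting (configuration assumed).
            nodes_with_metadata = SentenceSplitter().get_nodes_from_documents(
                documents_with_metadata
            )
            return nodes_with_metadata, file_stream
        except Exception as fallback_error:
            logging.error(f"Error with SentenceSplitter fallback: {fallback_error}")
            # The actual response returned here is not visible in the diff; a 500 is assumed.
            return JSONResponse(status_code=500, content="Error processing documents.")
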
service/reader_v4.py
CHANGED
@@ -1,7 +1,8 @@
 import os
 import nest_asyncio
-from
+from io import BytesIO
 
+from typing import List
 from dotenv import load_dotenv
 from fastapi import UploadFile
 import joblib
@@ -85,6 +86,10 @@ async def upload_file(reference, file: UploadFile):
     try:
         # Read the binary content of the uploaded file once
         content = await file.read()
+
+        # Store the file content in a BytesIO stream for reuse later
+        file_stream = BytesIO(content)
+
         # Parse the journal
         title = reference["title"]
 
@@ -97,7 +102,7 @@ async def upload_file(reference, file: UploadFile):
         print("Banyak documents : \n", len(documents_with_metadata))
 
         # Return both parsed documents and metadata
-        return documents_with_metadata
+        return documents_with_metadata, file_stream
 
     except Exception as e:
         print("error ", e)

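
The reader_v4.py change is the root of the new return shape: a FastAPI UploadFile body can only be read once, so the raw bytes are captured in a BytesIO and handed back up the call chain, letting the S3 upload in api/function.py happen after parsing and indexing. A small illustration of that pattern follows; the boto3-style upload call in the trailing comment is an assumption about what Loader.upload_to_s3 does, since that code is not shown in this diff.

from io import BytesIO

from fastapi import UploadFile


async def read_once_keep_stream(file: UploadFile) -> tuple[bytes, BytesIO]:
    """Read an UploadFile a single time and keep a rewindable copy of its bytes."""
    content = await file.read()      # consumes the underlying stream
    file_stream = BytesIO(content)   # reusable, seekable copy for later consumers
    return content, file_stream


# Later, after parsing and indexing, the same bytes can still be pushed to S3, e.g.:
#
#   file_stream.seek(0)                                   # rewind before handing off
#   s3_client.upload_fileobj(file_stream, bucket, key)    # assumed boto3-style call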