dsmultimedika committed
Commit 647b702
1 Parent(s): 91c6b27

fix: improve LlamaParse error handling
api/function.py CHANGED
@@ -29,15 +29,9 @@ async def data_ingestion(reference, file: UploadFile) -> Any:
         user_id="admin_book_uploaded",
     )
 
-    # # Upload to AWS
-    file_name = f"{reference['title']}"
-    aws_loader = Loader()
-
-    file_obj = file
-    aws_loader.upload_to_s3(file_obj, file_name)
-
     uploader = Uploader(reference, file)
-
+    nodes_with_metadata, file_stream = await uploader.process_documents()
+
     nodes_with_metadata = await uploader.process_documents()
     if isinstance(nodes_with_metadata, JSONResponse):
         return nodes_with_metadata  # Return the error response directly
@@ -45,6 +39,12 @@ async def data_ingestion(reference, file: UploadFile) -> Any:
     # Build indexes using IndexManager
    index = IndexManager()
    index.build_indexes(nodes_with_metadata)
+
+    # Upload AWS
+    file_name = f"{reference['title']}"
+    aws_loader = Loader()
+
+    aws_loader.upload_to_s3(file_stream, file_name)
 
     return json.dumps(
         {"status": "success", "message": "Vector Index loaded successfully."}
     )
core/book_enabler/__init__.py DELETED
File without changes
core/journal_reading/__init__.py DELETED
File without changes
core/journal_reading/extractor.py DELETED
@@ -1,8 +0,0 @@
-
-
-
-
-class Extractor():
-    def __init__(self):
-        pass
-
core/journal_reading/prompt.py DELETED
File without changes
core/journal_reading/upload.py DELETED
@@ -1,86 +0,0 @@
-import os
-import nest_asyncio
-
-from llama_parse import LlamaParse
-from llama_index.core.node_parser import SimpleNodeParser
-from dotenv import load_dotenv
-from fastapi import UploadFile, HTTPException, File
-from fastapi.responses import JSONResponse
-import fitz
-
-from script.get_metadata import Metadata
-
-load_dotenv()
-nest_asyncio.apply()
-
-
-async def parse_journal(content: bytes, file_name: str):
-    """Parse the journal using LlamaParse."""
-    try:
-        # Initialize the parser
-        parser = LlamaParse(
-            api_key=os.getenv("LLAMA_PARSE_API_KEY"),
-            result_type="markdown",
-            max_timeout=5000,
-        )
-
-        # Load and process the document
-        llama_parse_documents = parser.load_data(
-            content, extra_info={"file_name": file_name}
-        )
-
-        return llama_parse_documents
-
-    except Exception as e:
-        return JSONResponse(status_code=400, content=f"Error processing file: {e}")
-
-
-async def extract_metadata(content: bytes):
-    """Extract metadata from the PDF content."""
-    try:
-        # Open the binary content with PyMuPDF
-        pdf_document = fitz.open("pdf", content)  # "pdf" specifies the format
-
-        # Extract metadata
-        metadata = pdf_document.metadata
-
-        # Prepare metadata dictionary with default values for missing fields
-        metadata_dict = {
-            "title": metadata.get("title", "N/A"),
-            "author": metadata.get("author", "N/A"),
-            "subject": metadata.get("subject", "N/A"),
-            "keywords": metadata.get("keywords", "N/A"),
-            "creation_date": metadata.get("created", "N/A"),
-            "modification_date": metadata.get("modified", "N/A"),
-        }
-
-        return metadata_dict
-
-    except Exception as e:
-        return JSONResponse(status_code=500, content=f"Error inputting metadata: {e}")
-
-
-async def upload_file(file: UploadFile = File(...)):
-    try:
-        # Read the binary content of the uploaded file once
-        content = await file.read()
-        # Parse the journal
-        parsed_documents = await parse_journal(content, file.filename)
-        # Extract metadata
-        metadata_dict = await extract_metadata(content)
-
-        print("Metadata Dictionary : \n\n", metadata_dict)
-
-        metadata_gen = Metadata(metadata_dict)
-        documents_with_metadata = metadata_gen.add_metadata(
-            parsed_documents, metadata_dict
-        )
-
-        print("Document with Metadata : \n\n", documents_with_metadata)
-        print("Banyak documents : \n", len(documents_with_metadata))
-
-        # Return both parsed documents and metadata
-        return {"status": "SUCCESS"}
-
-    except Exception as e:
-        return JSONResponse(status_code=500, content=f"Error processing file: {e}")
core/module_creator/__init__.py DELETED
File without changes
core/summarization/__init__.py DELETED
File without changes
core/summarization/summarizer.py DELETED
@@ -1,135 +0,0 @@
-from io import BytesIO
-import os
-import base64
-import fitz
-
-from fastapi.responses import JSONResponse
-from llama_index.core.vector_stores import (
-    MetadataFilter,
-    MetadataFilters,
-    FilterCondition,
-)
-
-from llama_index.core import load_index_from_storage
-from llama_index.core.storage import StorageContext
-from llama_index.llms.openai import OpenAI
-from core.parser import parse_topics_to_dict
-from llama_index.core.llms import ChatMessage
-from core.prompt import (
-    SYSTEM_TOPIC_TEMPLATE,
-    USER_TOPIC_TEMPLATE,
-    REFINED_GET_TOPIC_TEMPLATE,
-)
-
-# from langfuse.openai import openai
-
-
-class SummarizeGenerator:
-    def __init__(self, references):
-
-        self.references = references
-        self.llm = OpenAI(temperature=0, model="gpt-4o-mini", max_tokens=4096)
-
-    def extract_pages(self, content_table):
-        try:
-            content_bytes = content_table.file.read()
-            print(content_bytes)
-            # Open the PDF file
-            content_table = fitz.open(stream=content_bytes, filetype="pdf")
-            print(content_table)
-            # content_table = fitz.open(topics_image)
-        except Exception as e:
-            return JSONResponse(status_code=400, content=f"Error opening PDF file: {e}")
-
-        # Initialize a list to collect base64 encoded images
-        pix_encoded_combined = []
-
-        # Iterate over each page to extract images
-        for page_number in range(len(content_table)):
-            try:
-                page = content_table.load_page(page_number)
-                pix_encoded = self._extract_image_as_base64(page)
-                pix_encoded_combined.append(pix_encoded)
-                # print("pix encoded combined", pix_encoded_combined)
-
-            except Exception as e:
-                print(f"Error processing page {page_number}: {e}")
-                continue  # Skip to the next page if there's an error
-
-        if not pix_encoded_combined:
-            return JSONResponse(status_code=404, content="No images found in the PDF")
-
-        return pix_encoded_combined
-
-    def extract_content_table(self, content_table):
-        try:
-            images = self.extract_pages(content_table)
-
-            image_messages = [
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": f"data:image/jpeg;base64,{image}",
-                    },
-                }
-                for image in images
-            ]
-
-            messages = [
-                ChatMessage(
-                    role="system",
-                    content=[{"type": "text", "text": SYSTEM_TOPIC_TEMPLATE}],
-                ),
-                ChatMessage(
-                    role="user",
-                    content=[
-                        {"type": "text", "text": USER_TOPIC_TEMPLATE},
-                        *image_messages,
-                    ],
-                ),
-            ]
-
-            extractor_output = self.llm.chat(messages)
-            print("extractor output : ", extractor_output)
-            refined_extractor_output = self.llm.complete(
-                REFINED_GET_TOPIC_TEMPLATE.format(topics=str(extractor_output))
-            )
-
-            print("refined extractor output : ", str(refined_extractor_output))
-
-            extractor_dics = dict(parse_topics_to_dict(str(refined_extractor_output)))
-
-            return str(refined_extractor_output), extractor_dics
-
-        except Exception as e:
-            return JSONResponse(status_code=500, content=f"An error occurred: {e}")
-
-    def _extract_image_as_base64(self, page):
-        try:
-            pix = page.get_pixmap()
-            pix_bytes = pix.tobytes()
-            return base64.b64encode(pix_bytes).decode("utf-8")
-        except Exception as e:
-            return JSONResponse(status_code=500, content=f"Error extracting image: {e}")
-
-    def index_summarizer_engine(self, topic, subtopic, index):
-        filters = MetadataFilters(
-            filters=[
-                MetadataFilter(key="title", value=topic),
-                MetadataFilter(key="category", value=subtopic),
-            ],
-            condition=FilterCondition.AND,
-        )
-
-        # Create the QueryEngineTool with the index and filters
-        kwargs = {"similarity_top_k": 5, "filters": filters}
-
-        query_engine = index.as_query_engine(**kwargs)
-
-        return query_engine
-
-    def get_summarizer_engine(self, topic, subtopic):
-        pass
-
-    def prepare_summaries(self):
-        pass
script/document_uploader.py CHANGED
@@ -1,8 +1,10 @@
+import logging
+import random
+
 from llama_index.core.ingestion import IngestionPipeline
 from llama_index.embeddings.openai import OpenAIEmbedding
 from config import PINECONE_CONFIG
 from pinecone.grpc import PineconeGRPC as Pinecone
-# from service.reader import Reader
 from script.get_metadata import Metadata
 from fastapi import UploadFile, status
 from fastapi.responses import JSONResponse
@@ -12,32 +14,17 @@ from llama_index.core.node_parser import (
     SemanticSplitterNodeParser,
 )
 from llama_index.core import Settings
-# from service.reader_v3 import upload_file
 from service.reader_v4 import upload_file
 
-# from script.get_topic import extract_topic
 
-import logging
-import random
 
 
 class Uploader:
-    # def __init__(self, reference, file: UploadFile, content_table: UploadFile):
     def __init__(self, reference, file: UploadFile):
         self.file = file
-        # self.content_table = content_table
-        # self.reader = Reader()
         self.reference = reference
         self.metadata = Metadata(reference)
 
-    # async def ingest_documents(self, file: UploadFile):
-    #     """Load documents from the storage path."""
-    #     documents = await self.reader.read_from_uploadfile(file)
-    #     print("Banyak document : ", len(documents))
-    #     print("document successfully ingested")
-
-    #     return documents
-
     def check_existing_metadata(self, pinecone_index, title, random_vector):
         try:
             result = pinecone_index.query(
@@ -56,20 +43,11 @@ class Uploader:
 
     async def process_documents(self):
         # Ingest documents
-        # documents = await self.ingest_documents(self.file)
-
-        # Get metadata
-        # documents_with_metadata = self.metadata.apply_metadata(documents)
-        documents_with_metadata = await upload_file(self.reference, self.file)
+        documents_with_metadata, file_stream = await upload_file(self.reference, self.file)
 
         if isinstance(documents_with_metadata, JSONResponse):
             return documents_with_metadata  # Return the error response directly
 
-        # Get Topic
-        # topic_extractor = extract_topic(self.reference, self.content_table)
-        # document_filtered = self.filter_document(documents_with_metadata)
-
-        # embed_model = OpenAIEmbedding()
         embed_model = OpenAIEmbedding(model="text-embedding-3-large")
         Settings.embed_model = embed_model
         # Set up the ingestion pipeline
@@ -80,20 +58,14 @@ class Uploader:
                     breakpoint_percentile_threshold=95,
                     embed_model=embed_model,
                 ),
-                # topic_extractor,
             ]
         )
 
-        # splitter = SemanticSplitterNodeParser(
-        #     buffer_size=1, breakpoint_percentile_threshold=95, embed_model=embed_model
-        # )
-
         # Run the pipeline
         try:
            print("Pipeline processing completed with Semantic Spliter.")
            nodes_with_metadata = pipeline.run(documents=documents_with_metadata)
-            # nodes_with_metadata = splitter.get_nodes_from_documents(documents_with_metadata)
-            return nodes_with_metadata
+            return nodes_with_metadata, file_stream
 
        except Exception as e:
            try:
@@ -103,7 +75,7 @@ class Uploader:
                    documents_with_metadata
                )
                print("Pipeline processing completed with SentenceSplitter fallback.")
-                return nodes_with_metadata
+                return nodes_with_metadata, file_stream
            except Exception as fallback_error:
                # Log the second error and return JSONResponse for FastAPI
                logging.error(f"Error with SentenceSplitter fallback: {fallback_error}")
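Uploader.process_documents() keeps its two-stage splitting strategy, semantic splitting through the ingestion pipeline first with a SentenceSplitter fallback, and both return paths now carry the buffered file stream alongside the nodes. Below is a hedged, self-contained sketch of that fallback pattern; the SentenceSplitter chunk parameters are illustrative assumptions, not values taken from this repository.

# Illustrative sketch of the semantic-split-with-fallback pattern used above.
# chunk_size/chunk_overlap are assumptions; the repo's own fallback settings are not shown in this diff.
import logging

from llama_index.core import Settings
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SemanticSplitterNodeParser, SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding


def split_with_fallback(documents, file_stream):
    embed_model = OpenAIEmbedding(model="text-embedding-3-large")
    Settings.embed_model = embed_model

    pipeline = IngestionPipeline(
        transformations=[
            SemanticSplitterNodeParser(
                buffer_size=1,
                breakpoint_percentile_threshold=95,
                embed_model=embed_model,
            ),
        ]
    )

    try:
        # Preferred path: semantic chunking driven by embedding similarity.
        nodes = pipeline.run(documents=documents)
        return nodes, file_stream
    except Exception as error:
        # Fallback path: plain sentence splitting so ingestion still succeeds.
        logging.warning("Semantic splitting failed (%s); falling back to SentenceSplitter.", error)
        splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=64)
        nodes = splitter.get_nodes_from_documents(documents)
        return nodes, file_stream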
service/reader_v4.py CHANGED
@@ -1,7 +1,8 @@
 import os
 import nest_asyncio
-from typing import List
+from io import BytesIO
 
+from typing import List
 from dotenv import load_dotenv
 from fastapi import UploadFile
 import joblib
@@ -85,6 +86,10 @@ async def upload_file(reference, file: UploadFile):
     try:
         # Read the binary content of the uploaded file once
         content = await file.read()
+
+        # Store the file content in a BytesIO stream for reuse later
+        file_stream = BytesIO(content)
+
         # Parse the journal
         title = reference["title"]
 
@@ -97,7 +102,7 @@ async def upload_file(reference, file: UploadFile):
         print("Banyak documents : \n", len(documents_with_metadata))
 
         # Return both parsed documents and metadata
-        return documents_with_metadata
+        return documents_with_metadata, file_stream
 
     except Exception as e:
         print("error ", e)