Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -67,87 +67,182 @@ hybrid_rag = HybridColpaliRAG(
|
|
67 |
IngestResult = namedtuple("IngestResult", ["status_text", "progress_table"])
|
68 |
|
69 |
|
70 |
-
@spaces.GPU(duration=120)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
def ingest_data(pdf_files, use_ocr, chunk_size, progress=gr.Progress()):
|
72 |
file_paths = [pdf_file.name for pdf_file in pdf_files]
|
73 |
total_start_time = time.time()
|
74 |
progress_data = []
|
75 |
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
151 |
|
152 |
total_time = time.time() - total_start_time
|
153 |
progress_data.append({"Technique": "Total", "Time Taken (s)": f"{total_time:.2f}"})
|
@@ -313,6 +408,18 @@ Built on [VARAG](https://github.com/adithya-s-k/VARAG) - Vision-Augmented Retrie
|
|
313 |
)
|
314 |
|
315 |
with gr.Tab("Ingest Data"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
316 |
pdf_input = gr.File(
|
317 |
label="Upload PDF(s)", file_count="multiple", file_types=["pdf"]
|
318 |
)
|
|
|
67 |
IngestResult = namedtuple("IngestResult", ["status_text", "progress_table"])
|
68 |
|
69 |
|
70 |
+
# @spaces.GPU(duration=120)
|
71 |
+
# def ingest_data(pdf_files, use_ocr, chunk_size, progress=gr.Progress()):
|
72 |
+
# file_paths = [pdf_file.name for pdf_file in pdf_files]
|
73 |
+
# total_start_time = time.time()
|
74 |
+
# progress_data = []
|
75 |
+
|
76 |
+
# # SimpleRAG
|
77 |
+
# yield IngestResult(
|
78 |
+
# status_text="Starting SimpleRAG ingestion...\n",
|
79 |
+
# progress_table=pd.DataFrame(progress_data),
|
80 |
+
# )
|
81 |
+
# start_time = time.time()
|
82 |
+
# simple_rag.index(
|
83 |
+
# file_paths,
|
84 |
+
# recursive=False,
|
85 |
+
# chunking_strategy=FixedTokenChunker(chunk_size=chunk_size),
|
86 |
+
# metadata={"source": "gradio_upload"},
|
87 |
+
# overwrite=True,
|
88 |
+
# verbose=True,
|
89 |
+
# ocr=use_ocr,
|
90 |
+
# )
|
91 |
+
# simple_time = time.time() - start_time
|
92 |
+
# progress_data.append(
|
93 |
+
# {"Technique": "SimpleRAG", "Time Taken (s)": f"{simple_time:.2f}"}
|
94 |
+
# )
|
95 |
+
# yield IngestResult(
|
96 |
+
# status_text=f"SimpleRAG ingestion complete. Time taken: {simple_time:.2f} seconds\n\n",
|
97 |
+
# progress_table=pd.DataFrame(progress_data),
|
98 |
+
# )
|
99 |
+
# # progress(0.25, desc="SimpleRAG complete")
|
100 |
+
|
101 |
+
# # VisionRAG
|
102 |
+
# yield IngestResult(
|
103 |
+
# status_text="Starting VisionRAG ingestion...\n",
|
104 |
+
# progress_table=pd.DataFrame(progress_data),
|
105 |
+
# )
|
106 |
+
# start_time = time.time()
|
107 |
+
# vision_rag.index(file_paths, overwrite=False, recursive=False, verbose=True)
|
108 |
+
# vision_time = time.time() - start_time
|
109 |
+
# progress_data.append(
|
110 |
+
# {"Technique": "VisionRAG", "Time Taken (s)": f"{vision_time:.2f}"}
|
111 |
+
# )
|
112 |
+
# yield IngestResult(
|
113 |
+
# status_text=f"VisionRAG ingestion complete. Time taken: {vision_time:.2f} seconds\n\n",
|
114 |
+
# progress_table=pd.DataFrame(progress_data),
|
115 |
+
# )
|
116 |
+
# # progress(0.5, desc="VisionRAG complete")
|
117 |
+
|
118 |
+
# # ColpaliRAG
|
119 |
+
# yield IngestResult(
|
120 |
+
# status_text="Starting ColpaliRAG ingestion...\n",
|
121 |
+
# progress_table=pd.DataFrame(progress_data),
|
122 |
+
# )
|
123 |
+
# start_time = time.time()
|
124 |
+
# colpali_rag.index(file_paths, overwrite=False, recursive=False, verbose=True)
|
125 |
+
# colpali_time = time.time() - start_time
|
126 |
+
# progress_data.append(
|
127 |
+
# {"Technique": "ColpaliRAG", "Time Taken (s)": f"{colpali_time:.2f}"}
|
128 |
+
# )
|
129 |
+
# yield IngestResult(
|
130 |
+
# status_text=f"ColpaliRAG ingestion complete. Time taken: {colpali_time:.2f} seconds\n\n",
|
131 |
+
# progress_table=pd.DataFrame(progress_data),
|
132 |
+
# )
|
133 |
+
# # progress(0.75, desc="ColpaliRAG complete")
|
134 |
+
|
135 |
+
# # HybridColpaliRAG
|
136 |
+
# yield IngestResult(
|
137 |
+
# status_text="Starting HybridColpaliRAG ingestion...\n",
|
138 |
+
# progress_table=pd.DataFrame(progress_data),
|
139 |
+
# )
|
140 |
+
# start_time = time.time()
|
141 |
+
# hybrid_rag.index(file_paths, overwrite=False, recursive=False, verbose=True)
|
142 |
+
# hybrid_time = time.time() - start_time
|
143 |
+
# progress_data.append(
|
144 |
+
# {"Technique": "HybridColpaliRAG", "Time Taken (s)": f"{hybrid_time:.2f}"}
|
145 |
+
# )
|
146 |
+
# yield IngestResult(
|
147 |
+
# status_text=f"HybridColpaliRAG ingestion complete. Time taken: {hybrid_time:.2f} seconds\n\n",
|
148 |
+
# progress_table=pd.DataFrame(progress_data),
|
149 |
+
# )
|
150 |
+
# # progress(1.0, desc="HybridColpaliRAG complete")
|
151 |
+
|
152 |
+
# total_time = time.time() - total_start_time
|
153 |
+
# progress_data.append({"Technique": "Total", "Time Taken (s)": f"{total_time:.2f}"})
|
154 |
+
# yield IngestResult(
|
155 |
+
# status_text=f"Total ingestion time: {total_time:.2f} seconds",
|
156 |
+
# progress_table=pd.DataFrame(progress_data),
|
157 |
+
# )
|
158 |
+
|
159 |
+
|
160 |
def ingest_data(pdf_files, use_ocr, chunk_size, progress=gr.Progress()):
|
161 |
file_paths = [pdf_file.name for pdf_file in pdf_files]
|
162 |
total_start_time = time.time()
|
163 |
progress_data = []
|
164 |
|
165 |
+
@spaces.GPU(duration=120)
|
166 |
+
def ingest_simple_rag():
|
167 |
+
yield IngestResult(
|
168 |
+
status_text="Starting SimpleRAG ingestion...\n",
|
169 |
+
progress_table=pd.DataFrame(progress_data),
|
170 |
+
)
|
171 |
+
start_time = time.time()
|
172 |
+
simple_rag.index(
|
173 |
+
file_paths,
|
174 |
+
recursive=False,
|
175 |
+
chunking_strategy=FixedTokenChunker(chunk_size=chunk_size),
|
176 |
+
metadata={"source": "gradio_upload"},
|
177 |
+
overwrite=True,
|
178 |
+
verbose=True,
|
179 |
+
ocr=use_ocr,
|
180 |
+
)
|
181 |
+
simple_time = time.time() - start_time
|
182 |
+
progress_data.append(
|
183 |
+
{"Technique": "SimpleRAG", "Time Taken (s)": f"{simple_time:.2f}"}
|
184 |
+
)
|
185 |
+
yield IngestResult(
|
186 |
+
status_text=f"SimpleRAG ingestion complete. Time taken: {simple_time:.2f} seconds\n\n",
|
187 |
+
progress_table=pd.DataFrame(progress_data),
|
188 |
+
)
|
189 |
|
190 |
+
@spaces.GPU(duration=120)
|
191 |
+
def ingest_vision_rag():
|
192 |
+
yield IngestResult(
|
193 |
+
status_text="Starting VisionRAG ingestion...\n",
|
194 |
+
progress_table=pd.DataFrame(progress_data),
|
195 |
+
)
|
196 |
+
start_time = time.time()
|
197 |
+
vision_rag.index(file_paths, overwrite=False, recursive=False, verbose=True)
|
198 |
+
vision_time = time.time() - start_time
|
199 |
+
progress_data.append(
|
200 |
+
{"Technique": "VisionRAG", "Time Taken (s)": f"{vision_time:.2f}"}
|
201 |
+
)
|
202 |
+
yield IngestResult(
|
203 |
+
status_text=f"VisionRAG ingestion complete. Time taken: {vision_time:.2f} seconds\n\n",
|
204 |
+
progress_table=pd.DataFrame(progress_data),
|
205 |
+
)
|
206 |
|
207 |
+
@spaces.GPU(duration=120)
|
208 |
+
def ingest_colpali_rag():
|
209 |
+
yield IngestResult(
|
210 |
+
status_text="Starting ColpaliRAG ingestion...\n",
|
211 |
+
progress_table=pd.DataFrame(progress_data),
|
212 |
+
)
|
213 |
+
start_time = time.time()
|
214 |
+
colpali_rag.index(file_paths, overwrite=False, recursive=False, verbose=True)
|
215 |
+
colpali_time = time.time() - start_time
|
216 |
+
progress_data.append(
|
217 |
+
{"Technique": "ColpaliRAG", "Time Taken (s)": f"{colpali_time:.2f}"}
|
218 |
+
)
|
219 |
+
yield IngestResult(
|
220 |
+
status_text=f"ColpaliRAG ingestion complete. Time taken: {colpali_time:.2f} seconds\n\n",
|
221 |
+
progress_table=pd.DataFrame(progress_data),
|
222 |
+
)
|
223 |
|
224 |
+
@spaces.GPU(duration=120)
|
225 |
+
def ingest_hybrid_rag():
|
226 |
+
yield IngestResult(
|
227 |
+
status_text="Starting HybridColpaliRAG ingestion...\n",
|
228 |
+
progress_table=pd.DataFrame(progress_data),
|
229 |
+
)
|
230 |
+
start_time = time.time()
|
231 |
+
hybrid_rag.index(file_paths, overwrite=False, recursive=False, verbose=True)
|
232 |
+
hybrid_time = time.time() - start_time
|
233 |
+
progress_data.append(
|
234 |
+
{"Technique": "HybridColpaliRAG", "Time Taken (s)": f"{hybrid_time:.2f}"}
|
235 |
+
)
|
236 |
+
yield IngestResult(
|
237 |
+
status_text=f"HybridColpaliRAG ingestion complete. Time taken: {hybrid_time:.2f} seconds\n\n",
|
238 |
+
progress_table=pd.DataFrame(progress_data),
|
239 |
+
)
|
240 |
+
|
241 |
+
# Call each ingestion function
|
242 |
+
yield from ingest_simple_rag()
|
243 |
+
yield from ingest_vision_rag()
|
244 |
+
yield from ingest_colpali_rag()
|
245 |
+
yield from ingest_hybrid_rag()
|
246 |
|
247 |
total_time = time.time() - total_start_time
|
248 |
progress_data.append({"Technique": "Total", "Time Taken (s)": f"{total_time:.2f}"})
|
|
|
408 |
)
|
409 |
|
410 |
with gr.Tab("Ingest Data"):
|
411 |
+
gr.Markdown(
|
412 |
+
"""
|
413 |
+
## ⚠️ Important Note on Data Ingestion
|
414 |
+
|
415 |
+
This Space has a maximum GPU-enabled time of 120 seconds. It's recommended to try ingesting only 1 or 2 pdfs at a time.
|
416 |
+
|
417 |
+
If you want to ingest a larger amount of data, please try it out in a Google Colab notebook:
|
418 |
+
|
419 |
+
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/adithya-s-k/VARAG/blob/main/docs/demo.ipynb)
|
420 |
+
|
421 |
+
"""
|
422 |
+
)
|
423 |
pdf_input = gr.File(
|
424 |
label="Upload PDF(s)", file_count="multiple", file_types=["pdf"]
|
425 |
)
|