Sean-Case committed
Commit d4b0a2c • Parent(s): c2ff47a

Added csv/Excel file support

Files changed:
- app.py (+25 -15)
- chatfuncs/chatfuncs.py (+36 -17)
- chatfuncs/ingest.py (+114 -14)
app.py CHANGED

@@ -5,7 +5,7 @@ import os
 
 # Need to overwrite version of gradio present in Huggingface spaces as it doesn't have like buttons/avatars (Oct 2023)
 #os.system("pip uninstall -y gradio")
-os.system("pip install gradio==3.42.0")
+#os.system("pip install gradio==3.42.0")
 
 from typing import TypeVar
 from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings
@@ -25,7 +25,6 @@ PandasDataFrame = TypeVar('pd.core.frame.DataFrame')
 #from chatfuncs.chatfuncs import *
 import chatfuncs.ingest as ing
 
-
 ## Load preset embeddings, vectorstore, and model
 
 embeddings_name = "BAAI/bge-base-en-v1.5"
@@ -107,7 +106,7 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
 
     if model_type == "Flan Alpaca (small, fast)":
         # Huggingface chat model
-        hf_checkpoint = 'declare-lab/flan-alpaca-large'
+        hf_checkpoint = 'declare-lab/flan-alpaca-large'#'declare-lab/flan-alpaca-base' # # #
 
         def create_hf_model(model_name):
 
@@ -140,9 +139,8 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
     return model_type, load_confirmation, model_type
 
 # Both models are loaded on app initialisation so that users don't have to wait for the models to be downloaded
-model_type = "Mistral Open Orca (larger, slow)"
-
-load_model(model_type, chatf.gpu_layers, chatf.gpu_config, chatf.cpu_config, chatf.torch_device)
+#model_type = "Mistral Open Orca (larger, slow)"
+#load_model(model_type, chatf.gpu_layers, chatf.gpu_config, chatf.cpu_config, chatf.torch_device)
 
 model_type = "Flan Alpaca (small, fast)"
 load_model(model_type, 0, chatf.gpu_config, chatf.cpu_config, chatf.torch_device)
@@ -183,7 +181,7 @@ with block:
 
     gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")
 
-    gr.Markdown("Chat with PDF …
+    gr.Markdown("Chat with PDF, web page or (new) csv/Excel documents. The default is a small model (Flan Alpaca), that can only answer specific questions that are answered in the text. It cannot give overall impressions of, or summarise the document. The alternative (Mistral Open Orca (larger, slow)), can reason a little better, but is much slower (See Advanced tab).\n\nBy default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page, please select from the second tab. If switching topic, please click the 'Clear chat' button.\n\nCaution: This is a public app. Please ensure that the document you upload is not sensitive is any way as other users may see it! Also, please note that LLM chatbots may give incomplete or incorrect information, so please use with care.")
 
     with gr.Row():
         current_source = gr.Textbox(label="Current data source(s)", value="Lambeth_2030-Our_Future_Our_Lambeth.pdf", scale = 10)
@@ -192,10 +190,10 @@ with block:
     with gr.Tab("Chatbot"):
 
         with gr.Row():
-            chat_height = 500
-            chatbot = gr.Chatbot( …
+            #chat_height = 500
+            chatbot = gr.Chatbot(avatar_images=('user.jfif', 'bot.jpg'),bubble_full_width = False, scale = 1) # , height=chat_height
            with gr.Accordion("Open this tab to see the source paragraphs used to generate the answer", open = False):
-                sources = gr.HTML(value = "Source paragraphs with the most relevant text will appear here", …
+                sources = gr.HTML(value = "Source paragraphs with the most relevant text will appear here", scale = 1) # , height=chat_height
 
        with gr.Row():
            message = gr.Textbox(
@@ -219,18 +217,23 @@
 
 
 
-    with gr.Tab("Load in a different …
+    with gr.Tab("Load in a different file to chat with"):
        with gr.Accordion("PDF file", open = False):
            in_pdf = gr.File(label="Upload pdf", file_count="multiple", file_types=['.pdf'])
            load_pdf = gr.Button(value="Load in file", variant="secondary", scale=0)
 
        with gr.Accordion("Web page", open = False):
            with gr.Row():
-                in_web = gr.Textbox(label="Enter …
-                in_div = gr.Textbox(label="(Advanced) …
-            load_web = gr.Button(value="Load in webpage", variant="secondary", scale=0)
+                in_web = gr.Textbox(label="Enter web page url")
+                in_div = gr.Textbox(label="(Advanced) Web page div for text extraction", value="p", placeholder="p")
+            load_web = gr.Button(value="Load in webpage", variant="secondary", scale=0)
+
+        with gr.Accordion("CSV/Excel file", open = False):
+            in_csv = gr.File(label="Upload CSV/Excel file", file_count="multiple", file_types=['.csv', '.xlsx'])
+            in_text_column = gr.Textbox(label="Enter column name where text is stored")
+            load_csv = gr.Button(value="Load in CSV/Excel file", variant="secondary", scale=0)
 
-        ingest_embed_out = gr.Textbox(label="File/ …
+        ingest_embed_out = gr.Textbox(label="File/web page preparation progress")
 
     with gr.Tab("Advanced features"):
         with gr.Row():
@@ -264,6 +267,12 @@
         then(ing.html_text_to_docs, inputs=[ingest_text, ingest_metadata], outputs=[ingest_docs]).\
         then(docs_to_faiss_save, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state]).\
         then(chatf.hide_block, outputs = [examples_set])
+
+    # Load in a csv/excel file
+    load_csv_click = load_csv.click(ing.parse_csv_or_excel, inputs=[in_csv, in_text_column], outputs=[ingest_text, current_source]).\
+        then(ing.csv_excel_text_to_docs, inputs=[ingest_text, in_text_column], outputs=[ingest_docs]).\
+        then(docs_to_faiss_save, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state]).\
+        then(chatf.hide_block, outputs = [examples_set])
 
     # Load in a webpage
 
@@ -289,6 +298,7 @@
     clear.click(chatf.clear_chat, inputs=[chat_history_state, sources, message, current_topic], outputs=[chat_history_state, sources, message, current_topic])
     clear.click(lambda: None, None, chatbot, queue=False)
 
+    # Thumbs up or thumbs down voting function
     chatbot.like(chatf.vote, [chat_history_state, instruction_prompt_out, model_type_state], None)
 
 block.queue(concurrency_count=1).launch(debug=True)
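The new CSV/Excel loader reuses the event-chaining pattern already used for the PDF and web page loaders: `load_csv.click(...)` runs the parser, and each `.then(...)` feeds the previous step's outputs into the next function. Below is a minimal runnable sketch of that wiring with hypothetical stand-in functions in place of `ing.parse_csv_or_excel`, `ing.csv_excel_text_to_docs` and `docs_to_faiss_save`; the use of `gr.State` for the intermediate values and the stand-ins' return shapes are assumptions, not code from the commit.

import gradio as gr

def parse_stub(files, text_column):
    # Stand-in for ing.parse_csv_or_excel: returns (parsed table, source names)
    return "parsed-table-placeholder", ", ".join(f.name for f in files)

def to_docs_stub(table, text_column):
    # Stand-in for ing.csv_excel_text_to_docs: one document per row
    return ["doc-per-row-placeholder"]

def embed_stub(docs):
    # Stand-in for docs_to_faiss_save: embeds docs and reports progress
    return f"Embedded {len(docs)} documents", "vectorstore-placeholder"

with gr.Blocks() as demo:
    in_csv = gr.File(label="Upload CSV/Excel file", file_count="multiple", file_types=['.csv', '.xlsx'])
    in_text_column = gr.Textbox(label="Enter column name where text is stored")
    load_csv = gr.Button("Load in CSV/Excel file")
    ingest_embed_out = gr.Textbox(label="File/web page preparation progress")
    current_source = gr.Textbox(label="Current data source(s)")

    ingest_text = gr.State()
    ingest_docs = gr.State()
    vectorstore_state = gr.State()

    # Each step's outputs become the next step's inputs, as in the diff above
    load_csv.click(parse_stub, inputs=[in_csv, in_text_column], outputs=[ingest_text, current_source]).\
        then(to_docs_stub, inputs=[ingest_text, in_text_column], outputs=[ingest_docs]).\
        then(embed_stub, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state])

if __name__ == "__main__":
    demo.launch()

The same three-stage pattern (parse, convert to documents, embed) would extend to any further ingestion path without touching the chat logic.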
chatfuncs/chatfuncs.py CHANGED

@@ -1,4 +1,5 @@
 import re
+import os
 import datetime
 from typing import TypeVar, Dict, List, Tuple
 import time
@@ -66,7 +67,7 @@ ner_model = []#SpanMarkerModel.from_pretrained("tomaarsen/span-marker-mbert-base
 # Used to pull out keywords from chat history to add to user queries behind the scenes
 kw_model = pipeline("feature-extraction", model="sentence-transformers/all-MiniLM-L6-v2")
 
-
+# Currently set gpu_layers to 0 even with cuda due to persistent bugs in implementation with cuda
 if torch.cuda.is_available():
     torch_device = "cuda"
     gpu_layers = 0
@@ -136,18 +137,6 @@ gpu_config = CtransInitConfig_gpu()
 cpu_config = CtransInitConfig_cpu()
 
 
-#@dataclass
-#class CtransGenGenerationConfig:
-#    top_k: int = top_k
-#    top_p: float = top_p
-#    temperature: float = temperature
-#    repetition_penalty: float = tinyllama_repetition_penalty
-#    last_n_tokens: int = last_n_tokens
-#    seed: int = seed
-#    batch_size:int = batch_size
-#    threads: int = threads
-#    reset: bool = True
-
 class CtransGenGenerationConfig:
     def __init__(self, temperature=temperature,
                  top_k=top_k,
@@ -333,7 +322,11 @@ def generate_expanded_prompt(inputs: Dict[str, str], instruction_prompt, content
     #vectorstore=globals()["vectorstore"], embeddings=globals()["embeddings"])
 
     # Expand the found passages to the neighbouring context
-    docs_keep_as_doc, doc_df = get_expanded_passages(vectorstore, docs_keep_out, width=3)
+    file_type = determine_file_type(doc_df['meta_url'][0])
+
+    # Only expand passages if not tabular data
+    if (file_type != ".csv") & (file_type != ".xlsx"):
+        docs_keep_as_doc, doc_df = get_expanded_passages(vectorstore, docs_keep_out, width=3)
 
     if docs_keep_as_doc == []:
         {"answer": "I'm sorry, I couldn't find a relevant answer to this question.", "sources":"I'm sorry, I couldn't find a relevant source for this question."}
@@ -344,8 +337,9 @@ def generate_expanded_prompt(inputs: Dict[str, str], instruction_prompt, content
     doc_df['meta_clean'] = [f"<b>{' '.join(f'{k}: {v}' for k, v in d.items() if k != 'page_section')}</b>" for d in doc_df['metadata']]
     doc_df['content_meta'] = doc_df['meta_clean'].astype(str) + ".<br><br>" + doc_df['page_content'].astype(str)
 
-    modified_page_content = [f" SOURCE {i+1} - {word}" for i, word in enumerate(doc_df['page_content'])]
-    docs_content_string = '<br><br>'.join(modified_page_content)
+    #modified_page_content = [f" SOURCE {i+1} - {word}" for i, word in enumerate(doc_df['page_content'])]
+    modified_page_content = [f" SOURCE {i+1} - {word}" for i, word in enumerate(doc_df['content_meta'])]
+    docs_content_string = '<br><br>'.join(modified_page_content)
 
     sources_docs_content_string = '<br><br>'.join(doc_df['content_meta'])#.replace(" "," ")#.strip()
 
@@ -481,6 +475,19 @@ def adapt_q_from_chat_history(question, chat_history, extracted_memory, keyword_
 
     return new_question_kworded
 
+def determine_file_type(file_path):
+    """
+    Determine the file type based on its extension.
+
+    Parameters:
+        file_path (str): Path to the file.
+
+    Returns:
+        str: File extension (e.g., '.pdf', '.docx', '.txt', '.html').
+    """
+    return os.path.splitext(file_path)[1].lower()
+
+
 def create_doc_df(docs_keep_out):
     # Extract content and metadata from 'winning' passages.
     content=[]
@@ -489,11 +496,17 @@ def create_doc_df(docs_keep_out):
     page_section=[]
     score=[]
 
+
+
     for item in docs_keep_out:
         content.append(item[0].page_content)
         meta.append(item[0].metadata)
         meta_url.append(item[0].metadata['source'])
-        page_section.append(item[0].metadata['page_section'])
+
+        file_extension = determine_file_type(item[0].metadata['source'])
+        if (file_extension != ".csv") & (file_extension != ".xlsx"):
+            page_section.append(item[0].metadata['page_section'])
+        else: page_section.append("")
         score.append(item[1])
 
     # Create df from 'winning' passages
@@ -728,6 +741,12 @@ def get_expanded_passages(vectorstore, docs, width):
     expanded_docs = []
     for doc, score in docs:
         search_source = doc.metadata['source']
+
+
+        #if file_type == ".csv" | file_type == ".xlsx":
+        #    content_str, meta_first, meta_last = get_parent_content_and_meta(vstore_by_source[search_source], 0, search_index)
+
+        #else:
         search_section = doc.metadata['page_section']
         parent_vstore_meta_section = [doc.metadata['page_section'] for _, doc in vstore_by_source[search_source]]
         search_index = parent_vstore_meta_section.index(search_section) if search_section in parent_vstore_meta_section else -1
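Because `determine_file_type` lowercases the extension, the tabular-data guard in `generate_expanded_prompt` and `create_doc_df` also catches uppercase `.CSV`/`.XLSX` uploads. A small self-contained check of that guard; the function body is copied from the diff, and the sample paths are invented for illustration:

import os

def determine_file_type(file_path):
    # Same one-liner as added in the diff: the extension, lowercased
    return os.path.splitext(file_path)[1].lower()

for path in ["Lambeth_2030-Our_Future_Our_Lambeth.pdf", "responses.CSV", "survey.xlsx"]:
    file_type = determine_file_type(path)
    # Spreadsheet rows have no neighbouring context on a page, so passage
    # expansion is skipped for tabular sources
    expand_passages = (file_type != ".csv") & (file_type != ".xlsx")
    print(f"{path!r} -> {file_type}, expand passages: {expand_passages}")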
chatfuncs/ingest.py CHANGED

@@ -44,31 +44,32 @@ chunk_overlap = 0
 start_index = True
 
 ## Parse files
+def determine_file_type(file_path):
+    """
+    Determine the file type based on its extension.
+
+    Parameters:
+        file_path (str): Path to the file.
+
+    Returns:
+        str: File extension (e.g., '.pdf', '.docx', '.txt', '.html').
+    """
+    return os.path.splitext(file_path)[1].lower()
 
-def parse_file(file_paths):
+def parse_file(file_paths, text_column='text'):
     """
     Accepts a list of file paths, determines each file's type based on its extension,
     and passes it to the relevant parsing function.
 
     Parameters:
         file_paths (list): List of file paths.
-
+        text_column (str): Name of the column in CSV/Excel files that contains the text content.
 
     Returns:
         dict: A dictionary with file paths as keys and their parsed content (or error message) as values.
     """
 
-    def determine_file_type(file_path):
-        """
-        Determine the file type based on its extension.
-
-        Parameters:
-        file_path (str): Path to the file.
-
-        Returns:
-        str: File extension (e.g., '.pdf', '.docx', '.txt', '.html').
-        """
-        return os.path.splitext(file_path)[1].lower()
 
     if not isinstance(file_paths, list):
         raise ValueError("Expected a list of file paths.")
@@ -78,7 +79,9 @@ def parse_file(file_paths):
         '.docx': parse_docx,
         '.txt': parse_txt,
         '.html': parse_html,
-        '.htm': parse_html  # Considering both .html and .htm for HTML files
+        '.htm': parse_html,  # Considering both .html and .htm for HTML files
+        '.csv': lambda file_path: parse_csv_or_excel(file_path, text_column),
+        '.xlsx': lambda file_path: parse_csv_or_excel(file_path, text_column)
     }
 
     parsed_contents = {}
@@ -115,6 +118,64 @@ def text_regex_clean(text):
 
     return text
 
+def parse_csv_or_excel(file_paths, text_column = "text"):
+    """
+    Read in a CSV or Excel file.
+
+    Parameters:
+        file_path (str): Path to the CSV file.
+        text_column (str): Name of the column in the CSV file that contains the text content.
+
+    Returns:
+        Pandas DataFrame: Dataframe output from file read
+    """
+
+    file_names = []
+    out_df = pd.DataFrame()
+
+    for file_path in file_paths:
+        file_extension = determine_file_type(file_path.name)
+        file_name = get_file_path_end(file_path.name)
+
+        if file_extension == ".csv":
+            df = pd.read_csv(file_path.name)
+            if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
+            df['source'] = file_name
+            df['page_section'] = ""
+        elif file_extension == ".xlsx":
+            df = pd.read_excel(file_path.name, engine='openpyxl')
+            if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
+            df['source'] = file_name
+            df['page_section'] = ""
+        else:
+            print(f"Unsupported file type: {file_extension}")
+            return pd.DataFrame(), ['Please choose a valid file type']
+
+        file_names.append(file_name)
+        out_df = pd.concat([out_df, df])
+
+    #if text_column not in df.columns:
+    #    return f"Column '{text_column}' not found in {file_path}"
+    #text_out = " ".join(df[text_column].dropna().astype(str))
+    return out_df, file_names
+
+def parse_excel(file_path, text_column):
+    """
+    Read text from an Excel file.
+
+    Parameters:
+        file_path (str): Path to the Excel file.
+        text_column (str): Name of the column in the Excel file that contains the text content.
+
+    Returns:
+        Pandas DataFrame: Dataframe output from file read
+    """
+    df = pd.read_excel(file_path, engine='openpyxl')
+    #if text_column not in df.columns:
+    #    return f"Column '{text_column}' not found in {file_path}"
+    #text_out = " ".join(df[text_column].dropna().astype(str))
+    return df
+
 def parse_pdf(file) -> List[str]:
 
     """
@@ -308,8 +369,9 @@ def text_to_docs(text_dict: dict, chunk_size: int = chunk_size) -> List[Document
         if ext == '.pdf':
             docs, page_docs = pdf_text_to_docs(content, chunk_size)
         elif ext in ['.html', '.htm', '.txt', '.docx']:
-            # Assuming you want to process HTML similarly to PDF in this context
             docs = html_text_to_docs(content, chunk_size)
+        elif ext in ['.csv', '.xlsx']:
+            docs, page_docs = csv_excel_text_to_docs(content, chunk_size)
         else:
             print(f"Unsupported file type {ext} for {file_path}. Skipping.")
             continue
@@ -400,6 +462,44 @@ def html_text_to_docs(texts, metadatas, chunk_size:int = chunk_size):
 
     return documents
 
+def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Document]:
+    """Converts a DataFrame's content to a list of Documents with metadata."""
+
+    doc_sections = []
+    df[text_column] = df[text_column].astype(str)  # Ensure column is a string column
+
+    # For each row in the dataframe
+    for idx, row in df.iterrows():
+        # Extract the text content for the document
+        doc_content = row[text_column]
+
+        # Generate metadata containing other columns' data
+        metadata = {"row": idx + 1}
+        for col, value in row.items():
+            if col != text_column:
+                metadata[col] = value
+
+        # If chunk_size is provided, split the text into chunks
+        if chunk_size:
+            # Assuming you have a text splitter function similar to the PDF handling
+            text_splitter = RecursiveCharacterTextSplitter(
+                chunk_size=chunk_size,
+                # Other arguments as required by the splitter
+            )
+            sections = text_splitter.split_text(doc_content)
+
+            # For each section, create a Document object
+            for i, section in enumerate(sections):
+                doc = Document(page_content=section,
+                               metadata={**metadata, "section": i, "row_section": f"{metadata['row']}-{i}"})
+                doc_sections.append(doc)
+        else:
+            # If no chunk_size is provided, create a single Document object for the row
+            doc = Document(page_content=doc_content, metadata=metadata)
+            doc_sections.append(doc)
+
+    return doc_sections
+
 # # Functions for working with documents after loading them back in
 
 def pull_out_data(series):
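End to end, the new ingestion path reads each uploaded file into one DataFrame, tags every row with its source file, and (with no `chunk_size`) emits one `Document` per row whose metadata carries the remaining columns. A hedged usage sketch, assuming a local `reviews.csv` with a `text` column exists; `SimpleNamespace` mimics the Gradio upload object, which exposes the file path via `.name`:

from types import SimpleNamespace

import chatfuncs.ingest as ing

# Gradio hands uploaded files over as objects with a .name path attribute
files = [SimpleNamespace(name="reviews.csv")]

df, file_names = ing.parse_csv_or_excel(files, text_column="text")
docs = ing.csv_excel_text_to_docs(df, text_column="text")  # one Document per row

print(file_names)                   # source names derived from the file paths
print(len(docs), docs[0].metadata)  # row number plus the non-text columns

Keeping each row as its own Document fits the guard added in chatfuncs.py: rows are independent records, so there is no neighbouring passage to expand into.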