Sean-Case committed
Commit ee77123 (1 parent: a462256)

Improved prompting for csv/excel files

Files changed:
- app.py +9 -9
- chatfuncs/chatfuncs.py +12 -6
- chatfuncs/ingest.py +25 -10
app.py
CHANGED
@@ -8,7 +8,7 @@ import os
 os.system("pip install gradio==3.42.0")
 
 from typing import TypeVar
-from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.embeddings import HuggingFaceEmbeddings#, HuggingFaceInstructEmbeddings
 from langchain.vectorstores import FAISS
 import gradio as gr
 
@@ -29,17 +29,17 @@ import chatfuncs.ingest as ing
 
 embeddings_name = "BAAI/bge-base-en-v1.5"
 
-def load_embeddings(embeddings_name = "
+def load_embeddings(embeddings_name = "BAAI/bge-base-en-v1.5"):
 
 
-    if embeddings_name == "hkunlp/instructor-large":
-        embeddings_func = HuggingFaceInstructEmbeddings(model_name=embeddings_name,
-        embed_instruction="Represent the paragraph for retrieval: ",
-        query_instruction="Represent the question for retrieving supporting documents: "
-        )
+    #if embeddings_name == "hkunlp/instructor-large":
+    #    embeddings_func = HuggingFaceInstructEmbeddings(model_name=embeddings_name,
+    #    embed_instruction="Represent the paragraph for retrieval: ",
+    #    query_instruction="Represent the question for retrieving supporting documents: "
+    #    )
 
-    else:
-        embeddings_func = HuggingFaceEmbeddings(model_name=embeddings_name)
+    #else:
+    embeddings_func = HuggingFaceEmbeddings(model_name=embeddings_name)
 
     global embeddings
 
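Note: after this change, every model name goes through the plain `HuggingFaceEmbeddings` path; the `hkunlp/instructor-large` branch is commented out. A minimal sketch of the resulting loader behaviour (illustrative, not a verbatim copy of app.py; assumes `langchain` and `sentence-transformers` are installed):

```python
# Sketch of the simplified embedding loader after this commit.
from langchain.embeddings import HuggingFaceEmbeddings

def load_embeddings(embeddings_name="BAAI/bge-base-en-v1.5"):
    # All models now load through the generic HuggingFaceEmbeddings wrapper;
    # the instructor-large special case is gone.
    return HuggingFaceEmbeddings(model_name=embeddings_name)

embeddings = load_embeddings()
vector = embeddings.embed_query("What does the census say about housing?")
print(len(vector))  # bge-base models produce 768-dimensional vectors
```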
chatfuncs/chatfuncs.py
CHANGED
@@ -308,6 +308,10 @@ QUESTION: {question}
 
     return INSTRUCTION_PROMPT, CONTENT_PROMPT
 
+def write_out_metadata_as_string(metadata_in):
+    metadata_string = [f"{' '.join(f'{k}: {v}' for k, v in d.items() if k != 'page_section')}" for d in metadata_in] # ['metadata']
+    return metadata_string
+
 def generate_expanded_prompt(inputs: Dict[str, str], instruction_prompt, content_prompt, extracted_memory, vectorstore, embeddings): # ,
 
     question = inputs["question"]
 
@@ -317,7 +321,7 @@ def generate_expanded_prompt(inputs: Dict[str, str], instruction_prompt, content
     new_question_kworded = adapt_q_from_chat_history(question, chat_history, extracted_memory) # new_question_keywords,
 
 
-    docs_keep_as_doc, doc_df, docs_keep_out = hybrid_retrieval(new_question_kworded, vectorstore, embeddings, k_val =
+    docs_keep_as_doc, doc_df, docs_keep_out = hybrid_retrieval(new_question_kworded, vectorstore, embeddings, k_val = 25, out_passages = 2,
                                vec_score_cut_off = 1, vec_weight = 1, bm25_weight = 1, svm_weight = 1)#,
                                #vectorstore=globals()["vectorstore"], embeddings=globals()["embeddings"])
 
@@ -333,12 +337,14 @@ def generate_expanded_prompt(inputs: Dict[str, str], instruction_prompt, content
 
 
     # Build up sources content to add to user display
+    doc_df['meta_clean'] = write_out_metadata_as_string(doc_df["metadata"]) # [f"<b>{' '.join(f'{k}: {v}' for k, v in d.items() if k != 'page_section')}</b>" for d in doc_df['metadata']]
+
+    # Remove meta text from the page content if it already exists there
+    doc_df['page_content_no_meta'] = doc_df.apply(lambda row: row['page_content'].replace(row['meta_clean'] + ". ", ""), axis=1)
+    doc_df['content_meta'] = doc_df['meta_clean'].astype(str) + ".<br><br>" + doc_df['page_content_no_meta'].astype(str)
 
-
-
-
-    #modified_page_content = [f" SOURCE {i+1} - {word}" for i, word in enumerate(doc_df['page_content'])]
-    modified_page_content = [f" SOURCE {i+1} - {word}" for i, word in enumerate(doc_df['content_meta'])]
+    #modified_page_content = [f" Document {i+1} - {word}" for i, word in enumerate(doc_df['page_content'])]
+    modified_page_content = [f" Document {i+1} - {word}" for i, word in enumerate(doc_df['content_meta'])]
     docs_content_string = '<br><br>'.join(modified_page_content)
 
     sources_docs_content_string = '<br><br>'.join(doc_df['content_meta'])#.replace(" "," ")#.strip()
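The net effect here is that each retrieved passage shown to the model is prefixed with its row metadata and labelled "Document N" instead of "SOURCE N". A small sketch of the metadata-prefixing step, using a hypothetical one-row `doc_df` in place of real retrieval output:

```python
import pandas as pd

def write_out_metadata_as_string(metadata_in):
    # One "key: value key: value" string per metadata dict, skipping page_section
    return [' '.join(f'{k}: {v}' for k, v in d.items() if k != 'page_section')
            for d in metadata_in]

# Hypothetical retrieval output shaped like doc_df in generate_expanded_prompt
doc_df = pd.DataFrame({
    "page_content": ["row: 3 year: 2021. Population grew by 2%."],
    "metadata": [{"row": 3, "year": 2021, "page_section": "3-0"}],
})

doc_df['meta_clean'] = write_out_metadata_as_string(doc_df["metadata"])
# Strip the metadata prefix when it is already baked into the page text
doc_df['page_content_no_meta'] = doc_df.apply(
    lambda row: row['page_content'].replace(row['meta_clean'] + ". ", ""), axis=1)
doc_df['content_meta'] = doc_df['meta_clean'] + ".<br><br>" + doc_df['page_content_no_meta']

print(doc_df['content_meta'][0])
# row: 3 year: 2021.<br><br>Population grew by 2%.
```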
chatfuncs/ingest.py
CHANGED
@@ -25,7 +25,7 @@ import pandas as pd
 import dateutil.parser
 from typing import TypeVar, List
 
-from langchain.embeddings import HuggingFaceInstructEmbeddings,
+from langchain.embeddings import HuggingFaceEmbeddings # HuggingFaceInstructEmbeddings,
 from langchain.vectorstores.faiss import FAISS
 from langchain.vectorstores import Chroma
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 
@@ -462,6 +462,14 @@ def html_text_to_docs(texts, metadatas, chunk_size:int = chunk_size):
 
     return documents
 
+def write_out_metadata_as_string(metadata_in):
+    # If metadata_in is a single dictionary, wrap it in a list
+    if isinstance(metadata_in, dict):
+        metadata_in = [metadata_in]
+
+    metadata_string = [f"{' '.join(f'{k}: {v}' for k, v in d.items() if k != 'page_section')}" for d in metadata_in] # ['metadata']
+    return metadata_string
+
 def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Document]:
     """Converts a DataFrame's content to a list of Documents with metadata."""
 
@@ -479,6 +487,10 @@ def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Docu
         if col != text_column:
             metadata[col] = value
 
+    metadata_string = write_out_metadata_as_string(metadata)[0]
+
+
+
     # If chunk_size is provided, split the text into chunks
     if chunk_size:
         # Assuming you have a text splitter function similar to the PDF handling
@@ -487,14 +499,17 @@ def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Docu
             # Other arguments as required by the splitter
         )
         sections = text_splitter.split_text(doc_content)
+
 
         # For each section, create a Document object
         for i, section in enumerate(sections):
+            section = '. '.join([metadata_string, section])
            doc = Document(page_content=section,
                           metadata={**metadata, "section": i, "row_section": f"{metadata['row']}-{i}"})
            doc_sections.append(doc)
     else:
         # If no chunk_size is provided, create a single Document object for the row
+        doc_content = '. '.join([metadata_string, doc_content])
         doc = Document(page_content=doc_content, metadata=metadata)
         doc_sections.append(doc)
 
@@ -559,16 +574,16 @@ def docs_elements_from_csv_save(docs_path="documents.csv"):
 
 # ## Create embeddings and save faiss vector store to the path specified in `save_to`
 
-def load_embeddings(model_name = "thenlper/gte-base"):
+def load_embeddings(model_name = "BAAI/bge-base-en-v1.5"):
 
-    if model_name == "hkunlp/instructor-large":
-        embeddings_func = HuggingFaceInstructEmbeddings(model_name=model_name,
-        embed_instruction="Represent the paragraph for retrieval: ",
-        query_instruction="Represent the question for retrieving supporting documents: "
-        )
+    #if model_name == "hkunlp/instructor-large":
+    #    embeddings_func = HuggingFaceInstructEmbeddings(model_name=model_name,
+    #    embed_instruction="Represent the paragraph for retrieval: ",
+    #    query_instruction="Represent the question for retrieving supporting documents: "
+    #    )
 
-    else:
-        embeddings_func = HuggingFaceEmbeddings(model_name=model_name)
+    #else:
+    embeddings_func = HuggingFaceEmbeddings(model_name=model_name)
 
     global embeddings
 
@@ -576,7 +591,7 @@ def load_embeddings(model_name = "thenlper/gte-base"):
 
     return embeddings_func
 
-def embed_faiss_save_to_zip(docs_out, save_to="faiss_lambeth_census_embedding", model_name = "
+def embed_faiss_save_to_zip(docs_out, save_to="faiss_lambeth_census_embedding", model_name = "BAAI/bge-base-en-v1.5"):
 
     load_embeddings(model_name=model_name)
 
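Taken together, the ingest.py changes mean each CSV/Excel row is now indexed with its column metadata written into the text itself, so the embedding model sees values such as row numbers and dates alongside the prose. A sketch of the per-row behaviour in the no-chunking path, using a hypothetical two-row dataframe standing in for a parsed CSV/Excel file:

```python
import pandas as pd
from langchain.docstore.document import Document

def write_out_metadata_as_string(metadata_in):
    # Same helper as in the diff above: wrap a lone dict, then flatten to strings
    if isinstance(metadata_in, dict):
        metadata_in = [metadata_in]
    return [' '.join(f'{k}: {v}' for k, v in d.items() if k != 'page_section')
            for d in metadata_in]

# Hypothetical dataframe; "ward" and "row" become metadata columns
df = pd.DataFrame({
    "text": ["Population grew by 2%.", "Housing stock fell slightly."],
    "ward": ["Brixton", "Clapham"],
    "row":  [1, 2],
})

doc_sections = []
for _, r in df.iterrows():
    metadata = {col: val for col, val in r.items() if col != "text"}
    metadata_string = write_out_metadata_as_string(metadata)[0]
    # No chunk_size here, so the whole row becomes one metadata-prefixed Document
    doc_content = '. '.join([metadata_string, r["text"]])
    doc_sections.append(Document(page_content=doc_content, metadata=metadata))

print(doc_sections[0].page_content)
# ward: Brixton row: 1. Population grew by 2%.
```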