Sean-Case committed on
Commit
ee77123
1 Parent(s): a462256

Improved prompting for csv/excel files

Browse files
Files changed (3) hide show
  1. app.py +9 -9
  2. chatfuncs/chatfuncs.py +12 -6
  3. chatfuncs/ingest.py +25 -10
app.py CHANGED
@@ -8,7 +8,7 @@ import os
8
  os.system("pip install gradio==3.42.0")
9
 
10
  from typing import TypeVar
11
- from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings
12
  from langchain.vectorstores import FAISS
13
  import gradio as gr
14
 
@@ -29,17 +29,17 @@ import chatfuncs.ingest as ing
29
 
30
  embeddings_name = "BAAI/bge-base-en-v1.5"
31
 
32
- def load_embeddings(embeddings_name = "thenlper/gte-base"):
33
 
34
 
35
- if embeddings_name == "hkunlp/instructor-large":
36
- embeddings_func = HuggingFaceInstructEmbeddings(model_name=embeddings_name,
37
- embed_instruction="Represent the paragraph for retrieval: ",
38
- query_instruction="Represent the question for retrieving supporting documents: "
39
- )
40
 
41
- else:
42
- embeddings_func = HuggingFaceEmbeddings(model_name=embeddings_name)
43
 
44
  global embeddings
45
 
 
8
  os.system("pip install gradio==3.42.0")
9
 
10
  from typing import TypeVar
11
+ from langchain.embeddings import HuggingFaceEmbeddings#, HuggingFaceInstructEmbeddings
12
  from langchain.vectorstores import FAISS
13
  import gradio as gr
14
 
 
29
 
30
  embeddings_name = "BAAI/bge-base-en-v1.5"
31
 
32
+ def load_embeddings(embeddings_name = "BAAI/bge-base-en-v1.5"):
33
 
34
 
35
+ #if embeddings_name == "hkunlp/instructor-large":
36
+ # embeddings_func = HuggingFaceInstructEmbeddings(model_name=embeddings_name,
37
+ # embed_instruction="Represent the paragraph for retrieval: ",
38
+ # query_instruction="Represent the question for retrieving supporting documents: "
39
+ # )
40
 
41
+ #else:
42
+ embeddings_func = HuggingFaceEmbeddings(model_name=embeddings_name)
43
 
44
  global embeddings
45
 
chatfuncs/chatfuncs.py CHANGED
@@ -308,6 +308,10 @@ QUESTION: {question}
308
 
309
  return INSTRUCTION_PROMPT, CONTENT_PROMPT
310
 
 
 
 
 
311
  def generate_expanded_prompt(inputs: Dict[str, str], instruction_prompt, content_prompt, extracted_memory, vectorstore, embeddings): # ,
312
 
313
  question = inputs["question"]
@@ -317,7 +321,7 @@ def generate_expanded_prompt(inputs: Dict[str, str], instruction_prompt, content
317
  new_question_kworded = adapt_q_from_chat_history(question, chat_history, extracted_memory) # new_question_keywords,
318
 
319
 
320
- docs_keep_as_doc, doc_df, docs_keep_out = hybrid_retrieval(new_question_kworded, vectorstore, embeddings, k_val = 10, out_passages = 2,
321
  vec_score_cut_off = 1, vec_weight = 1, bm25_weight = 1, svm_weight = 1)#,
322
  #vectorstore=globals()["vectorstore"], embeddings=globals()["embeddings"])
323
 
@@ -333,12 +337,14 @@ def generate_expanded_prompt(inputs: Dict[str, str], instruction_prompt, content
333
 
334
 
335
  # Build up sources content to add to user display
 
 
 
 
 
336
 
337
- doc_df['meta_clean'] = [f"<b>{' '.join(f'{k}: {v}' for k, v in d.items() if k != 'page_section')}</b>" for d in doc_df['metadata']]
338
- doc_df['content_meta'] = doc_df['meta_clean'].astype(str) + ".<br><br>" + doc_df['page_content'].astype(str)
339
-
340
- #modified_page_content = [f" SOURCE {i+1} - {word}" for i, word in enumerate(doc_df['page_content'])]
341
- modified_page_content = [f" SOURCE {i+1} - {word}" for i, word in enumerate(doc_df['content_meta'])]
342
  docs_content_string = '<br><br>'.join(modified_page_content)
343
 
344
  sources_docs_content_string = '<br><br>'.join(doc_df['content_meta'])#.replace(" "," ")#.strip()
 
308
 
309
  return INSTRUCTION_PROMPT, CONTENT_PROMPT
310
 
311
+ def write_out_metadata_as_string(metadata_in):
312
+ metadata_string = [f"{' '.join(f'{k}: {v}' for k, v in d.items() if k != 'page_section')}" for d in metadata_in] # ['metadata']
313
+ return metadata_string
314
+
315
  def generate_expanded_prompt(inputs: Dict[str, str], instruction_prompt, content_prompt, extracted_memory, vectorstore, embeddings): # ,
316
 
317
  question = inputs["question"]
 
321
  new_question_kworded = adapt_q_from_chat_history(question, chat_history, extracted_memory) # new_question_keywords,
322
 
323
 
324
+ docs_keep_as_doc, doc_df, docs_keep_out = hybrid_retrieval(new_question_kworded, vectorstore, embeddings, k_val = 25, out_passages = 2,
325
  vec_score_cut_off = 1, vec_weight = 1, bm25_weight = 1, svm_weight = 1)#,
326
  #vectorstore=globals()["vectorstore"], embeddings=globals()["embeddings"])
327
 
 
337
 
338
 
339
  # Build up sources content to add to user display
340
+ doc_df['meta_clean'] = write_out_metadata_as_string(doc_df["metadata"]) # [f"<b>{' '.join(f'{k}: {v}' for k, v in d.items() if k != 'page_section')}</b>" for d in doc_df['metadata']]
341
+
342
+ # Remove meta text from the page content if it already exists there
343
+ doc_df['page_content_no_meta'] = doc_df.apply(lambda row: row['page_content'].replace(row['meta_clean'] + ". ", ""), axis=1)
344
+ doc_df['content_meta'] = doc_df['meta_clean'].astype(str) + ".<br><br>" + doc_df['page_content_no_meta'].astype(str)
345
 
346
+ #modified_page_content = [f" Document {i+1} - {word}" for i, word in enumerate(doc_df['page_content'])]
347
+ modified_page_content = [f" Document {i+1} - {word}" for i, word in enumerate(doc_df['content_meta'])]
 
 
 
348
  docs_content_string = '<br><br>'.join(modified_page_content)
349
 
350
  sources_docs_content_string = '<br><br>'.join(doc_df['content_meta'])#.replace(" "," ")#.strip()
chatfuncs/ingest.py CHANGED
@@ -25,7 +25,7 @@ import pandas as pd
25
  import dateutil.parser
26
  from typing import TypeVar, List
27
 
28
- from langchain.embeddings import HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings
29
  from langchain.vectorstores.faiss import FAISS
30
  from langchain.vectorstores import Chroma
31
  from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -462,6 +462,14 @@ def html_text_to_docs(texts, metadatas, chunk_size:int = chunk_size):
462
 
463
  return documents
464
 
 
 
 
 
 
 
 
 
465
  def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Document]:
466
  """Converts a DataFrame's content to a list of Documents with metadata."""
467
 
@@ -479,6 +487,10 @@ def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Docu
479
  if col != text_column:
480
  metadata[col] = value
481
 
 
 
 
 
482
  # If chunk_size is provided, split the text into chunks
483
  if chunk_size:
484
  # Assuming you have a text splitter function similar to the PDF handling
@@ -487,14 +499,17 @@ def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Docu
487
  # Other arguments as required by the splitter
488
  )
489
  sections = text_splitter.split_text(doc_content)
 
490
 
491
  # For each section, create a Document object
492
  for i, section in enumerate(sections):
 
493
  doc = Document(page_content=section,
494
  metadata={**metadata, "section": i, "row_section": f"{metadata['row']}-{i}"})
495
  doc_sections.append(doc)
496
  else:
497
  # If no chunk_size is provided, create a single Document object for the row
 
498
  doc = Document(page_content=doc_content, metadata=metadata)
499
  doc_sections.append(doc)
500
 
@@ -559,16 +574,16 @@ def docs_elements_from_csv_save(docs_path="documents.csv"):
559
 
560
  # ## Create embeddings and save faiss vector store to the path specified in `save_to`
561
 
562
- def load_embeddings(model_name = "thenlper/gte-base"):
563
 
564
- if model_name == "hkunlp/instructor-large":
565
- embeddings_func = HuggingFaceInstructEmbeddings(model_name=model_name,
566
- embed_instruction="Represent the paragraph for retrieval: ",
567
- query_instruction="Represent the question for retrieving supporting documents: "
568
- )
569
 
570
- else:
571
- embeddings_func = HuggingFaceEmbeddings(model_name=model_name)
572
 
573
  global embeddings
574
 
@@ -576,7 +591,7 @@ def load_embeddings(model_name = "thenlper/gte-base"):
576
 
577
  return embeddings_func
578
 
579
- def embed_faiss_save_to_zip(docs_out, save_to="faiss_lambeth_census_embedding", model_name = "thenlper/gte-base"):
580
 
581
  load_embeddings(model_name=model_name)
582
 
 
25
  import dateutil.parser
26
  from typing import TypeVar, List
27
 
28
+ from langchain.embeddings import HuggingFaceEmbeddings # HuggingFaceInstructEmbeddings,
29
  from langchain.vectorstores.faiss import FAISS
30
  from langchain.vectorstores import Chroma
31
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
462
 
463
  return documents
464
 
465
+ def write_out_metadata_as_string(metadata_in):
466
+ # If metadata_in is a single dictionary, wrap it in a list
467
+ if isinstance(metadata_in, dict):
468
+ metadata_in = [metadata_in]
469
+
470
+ metadata_string = [f"{' '.join(f'{k}: {v}' for k, v in d.items() if k != 'page_section')}" for d in metadata_in] # ['metadata']
471
+ return metadata_string
472
+
473
  def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Document]:
474
  """Converts a DataFrame's content to a list of Documents with metadata."""
475
 
 
487
  if col != text_column:
488
  metadata[col] = value
489
 
490
+ metadata_string = write_out_metadata_as_string(metadata)[0]
491
+
492
+
493
+
494
  # If chunk_size is provided, split the text into chunks
495
  if chunk_size:
496
  # Assuming you have a text splitter function similar to the PDF handling
 
499
  # Other arguments as required by the splitter
500
  )
501
  sections = text_splitter.split_text(doc_content)
502
+
503
 
504
  # For each section, create a Document object
505
  for i, section in enumerate(sections):
506
+ section = '. '.join([metadata_string, section])
507
  doc = Document(page_content=section,
508
  metadata={**metadata, "section": i, "row_section": f"{metadata['row']}-{i}"})
509
  doc_sections.append(doc)
510
  else:
511
  # If no chunk_size is provided, create a single Document object for the row
512
+ doc_content = '. '.join([metadata_string, doc_content])
513
  doc = Document(page_content=doc_content, metadata=metadata)
514
  doc_sections.append(doc)
515
 
 
574
 
575
  # ## Create embeddings and save faiss vector store to the path specified in `save_to`
576
 
577
+ def load_embeddings(model_name = "BAAI/bge-base-en-v1.5"):
578
 
579
+ #if model_name == "hkunlp/instructor-large":
580
+ # embeddings_func = HuggingFaceInstructEmbeddings(model_name=model_name,
581
+ # embed_instruction="Represent the paragraph for retrieval: ",
582
+ # query_instruction="Represent the question for retrieving supporting documents: "
583
+ # )
584
 
585
+ #else:
586
+ embeddings_func = HuggingFaceEmbeddings(model_name=model_name)
587
 
588
  global embeddings
589
 
 
591
 
592
  return embeddings_func
593
 
594
+ def embed_faiss_save_to_zip(docs_out, save_to="faiss_lambeth_census_embedding", model_name = "BAAI/bge-base-en-v1.5"):
595
 
596
  load_embeddings(model_name=model_name)
597