Sean-Case committed
Commit d4b0a2c • Parent(s): c2ff47a

Added csv/Excel file support

Files changed:
- app.py (+25 -15)
- chatfuncs/chatfuncs.py (+36 -17)
- chatfuncs/ingest.py (+114 -14)
app.py CHANGED

@@ -5,7 +5,7 @@ import os
 
 # Need to overwrite version of gradio present in Huggingface spaces as it doesn't have like buttons/avatars (Oct 2023)
 #os.system("pip uninstall -y gradio")
-os.system("pip install gradio==3.42.0")
+#os.system("pip install gradio==3.42.0")
 
 from typing import TypeVar
 from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings
@@ -25,7 +25,6 @@ PandasDataFrame = TypeVar('pd.core.frame.DataFrame')
 #from chatfuncs.chatfuncs import *
 import chatfuncs.ingest as ing
 
-
 ## Load preset embeddings, vectorstore, and model
 
 embeddings_name = "BAAI/bge-base-en-v1.5"
@@ -107,7 +106,7 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
 
     if model_type == "Flan Alpaca (small, fast)":
         # Huggingface chat model
-        hf_checkpoint = 'declare-lab/flan-alpaca-large'
+        hf_checkpoint = 'declare-lab/flan-alpaca-large'#'declare-lab/flan-alpaca-base' # # #
 
         def create_hf_model(model_name):
 
@@ -140,9 +139,8 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
     return model_type, load_confirmation, model_type
 
 # Both models are loaded on app initialisation so that users don't have to wait for the models to be downloaded
-model_type = "Mistral Open Orca (larger, slow)"
-
-load_model(model_type, chatf.gpu_layers, chatf.gpu_config, chatf.cpu_config, chatf.torch_device)
+#model_type = "Mistral Open Orca (larger, slow)"
+#load_model(model_type, chatf.gpu_layers, chatf.gpu_config, chatf.cpu_config, chatf.torch_device)
 
 model_type = "Flan Alpaca (small, fast)"
 load_model(model_type, 0, chatf.gpu_config, chatf.cpu_config, chatf.torch_device)
@@ -183,7 +181,7 @@ with block:
 
     gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")
 
-    gr.Markdown("Chat with PDF …
+    gr.Markdown("Chat with PDF, web page or (new) csv/Excel documents. The default is a small model (Flan Alpaca), that can only answer specific questions that are answered in the text. It cannot give overall impressions of, or summarise the document. The alternative (Mistral Open Orca (larger, slow)), can reason a little better, but is much slower (See Advanced tab).\n\nBy default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page, please select from the second tab. If switching topic, please click the 'Clear chat' button.\n\nCaution: This is a public app. Please ensure that the document you upload is not sensitive is any way as other users may see it! Also, please note that LLM chatbots may give incomplete or incorrect information, so please use with care.")
 
     with gr.Row():
         current_source = gr.Textbox(label="Current data source(s)", value="Lambeth_2030-Our_Future_Our_Lambeth.pdf", scale = 10)
@@ -192,10 +190,10 @@ with block:
     with gr.Tab("Chatbot"):
 
         with gr.Row():
-            chat_height = 500
-            chatbot = gr.Chatbot( …
+            #chat_height = 500
+            chatbot = gr.Chatbot(avatar_images=('user.jfif', 'bot.jpg'),bubble_full_width = False, scale = 1) # , height=chat_height
            with gr.Accordion("Open this tab to see the source paragraphs used to generate the answer", open = False):
-                sources = gr.HTML(value = "Source paragraphs with the most relevant text will appear here", …
+                sources = gr.HTML(value = "Source paragraphs with the most relevant text will appear here", scale = 1) # , height=chat_height
 
        with gr.Row():
            message = gr.Textbox(
@@ -219,18 +217,23 @@
 
 
 
-    with gr.Tab("Load in a different …
+    with gr.Tab("Load in a different file to chat with"):
        with gr.Accordion("PDF file", open = False):
            in_pdf = gr.File(label="Upload pdf", file_count="multiple", file_types=['.pdf'])
            load_pdf = gr.Button(value="Load in file", variant="secondary", scale=0)
 
        with gr.Accordion("Web page", open = False):
            with gr.Row():
-                in_web = gr.Textbox(label="Enter …
-                in_div = gr.Textbox(label="(Advanced) …
-            load_web = gr.Button(value="Load in webpage", variant="secondary", scale=0)
+                in_web = gr.Textbox(label="Enter web page url")
+                in_div = gr.Textbox(label="(Advanced) Web page div for text extraction", value="p", placeholder="p")
+            load_web = gr.Button(value="Load in webpage", variant="secondary", scale=0)
+
+        with gr.Accordion("CSV/Excel file", open = False):
+            in_csv = gr.File(label="Upload CSV/Excel file", file_count="multiple", file_types=['.csv', '.xlsx'])
+            in_text_column = gr.Textbox(label="Enter column name where text is stored")
+            load_csv = gr.Button(value="Load in CSV/Excel file", variant="secondary", scale=0)
 
-        ingest_embed_out = gr.Textbox(label="File/ …
+        ingest_embed_out = gr.Textbox(label="File/web page preparation progress")
 
     with gr.Tab("Advanced features"):
         with gr.Row():
@@ -264,6 +267,12 @@
         then(ing.html_text_to_docs, inputs=[ingest_text, ingest_metadata], outputs=[ingest_docs]).\
         then(docs_to_faiss_save, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state]).\
         then(chatf.hide_block, outputs = [examples_set])
+
+    # Load in a csv/excel file
+    load_csv_click = load_csv.click(ing.parse_csv_or_excel, inputs=[in_csv, in_text_column], outputs=[ingest_text, current_source]).\
+        then(ing.csv_excel_text_to_docs, inputs=[ingest_text, in_text_column], outputs=[ingest_docs]).\
+        then(docs_to_faiss_save, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state]).\
+        then(chatf.hide_block, outputs = [examples_set])
 
     # Load in a webpage
 
@@ -289,6 +298,7 @@
     clear.click(chatf.clear_chat, inputs=[chat_history_state, sources, message, current_topic], outputs=[chat_history_state, sources, message, current_topic])
     clear.click(lambda: None, None, chatbot, queue=False)
 
+    # Thumbs up or thumbs down voting function
     chatbot.like(chatf.vote, [chat_history_state, instruction_prompt_out, model_type_state], None)
 
 block.queue(concurrency_count=1).launch(debug=True)
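The new CSV/Excel loader reuses the event-chaining pattern already used for the PDF and web page loaders: `load_csv.click(...)` runs the parser, and each `.then(...)` feeds the previous step's outputs into the next function. Below is a minimal runnable sketch of that wiring with hypothetical stand-in functions in place of `ing.parse_csv_or_excel`, `ing.csv_excel_text_to_docs` and `docs_to_faiss_save`; the use of `gr.State` for the intermediate values and the stand-ins' return shapes are assumptions, not code from the commit.

import gradio as gr

def parse_stub(files, text_column):
    # Stand-in for ing.parse_csv_or_excel: returns (parsed table, source names)
    return "parsed-table-placeholder", ", ".join(f.name for f in files)

def to_docs_stub(table, text_column):
    # Stand-in for ing.csv_excel_text_to_docs: one document per row
    return ["doc-per-row-placeholder"]

def embed_stub(docs):
    # Stand-in for docs_to_faiss_save: embeds docs and reports progress
    return f"Embedded {len(docs)} documents", "vectorstore-placeholder"

with gr.Blocks() as demo:
    in_csv = gr.File(label="Upload CSV/Excel file", file_count="multiple", file_types=['.csv', '.xlsx'])
    in_text_column = gr.Textbox(label="Enter column name where text is stored")
    load_csv = gr.Button("Load in CSV/Excel file")
    ingest_embed_out = gr.Textbox(label="File/web page preparation progress")
    current_source = gr.Textbox(label="Current data source(s)")

    ingest_text = gr.State()
    ingest_docs = gr.State()
    vectorstore_state = gr.State()

    # Each step's outputs become the next step's inputs, as in the diff above
    load_csv.click(parse_stub, inputs=[in_csv, in_text_column], outputs=[ingest_text, current_source]).\
        then(to_docs_stub, inputs=[ingest_text, in_text_column], outputs=[ingest_docs]).\
        then(embed_stub, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state])

if __name__ == "__main__":
    demo.launch()

The same three-stage pattern (parse, convert to documents, embed) would extend to any further ingestion path without touching the chat logic.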
chatfuncs/chatfuncs.py CHANGED

@@ -1,4 +1,5 @@
 import re
+import os
 import datetime
 from typing import TypeVar, Dict, List, Tuple
 import time
@@ -66,7 +67,7 @@ ner_model = []#SpanMarkerModel.from_pretrained("tomaarsen/span-marker-mbert-base
 # Used to pull out keywords from chat history to add to user queries behind the scenes
 kw_model = pipeline("feature-extraction", model="sentence-transformers/all-MiniLM-L6-v2")
 
-
+# Currently set gpu_layers to 0 even with cuda due to persistent bugs in implementation with cuda
 if torch.cuda.is_available():
     torch_device = "cuda"
     gpu_layers = 0
@@ -136,18 +137,6 @@ gpu_config = CtransInitConfig_gpu()
 cpu_config = CtransInitConfig_cpu()
 
 
-#@dataclass
-#class CtransGenGenerationConfig:
-#    top_k: int = top_k
-#    top_p: float = top_p
-#    temperature: float = temperature
-#    repetition_penalty: float = tinyllama_repetition_penalty
-#    last_n_tokens: int = last_n_tokens
-#    seed: int = seed
-#    batch_size:int = batch_size
-#    threads: int = threads
-#    reset: bool = True
-
 class CtransGenGenerationConfig:
     def __init__(self, temperature=temperature,
                  top_k=top_k,
@@ -333,7 +322,11 @@ def generate_expanded_prompt(inputs: Dict[str, str], instruction_prompt, content
     #vectorstore=globals()["vectorstore"], embeddings=globals()["embeddings"])
 
     # Expand the found passages to the neighbouring context
-    docs_keep_as_doc, doc_df = get_expanded_passages(vectorstore, docs_keep_out, width=3)
+    file_type = determine_file_type(doc_df['meta_url'][0])
+
+    # Only expand passages if not tabular data
+    if (file_type != ".csv") & (file_type != ".xlsx"):
+        docs_keep_as_doc, doc_df = get_expanded_passages(vectorstore, docs_keep_out, width=3)
 
     if docs_keep_as_doc == []:
         {"answer": "I'm sorry, I couldn't find a relevant answer to this question.", "sources":"I'm sorry, I couldn't find a relevant source for this question."}
@@ -344,8 +337,9 @@ def generate_expanded_prompt(inputs: Dict[str, str], instruction_prompt, content
     doc_df['meta_clean'] = [f"<b>{' '.join(f'{k}: {v}' for k, v in d.items() if k != 'page_section')}</b>" for d in doc_df['metadata']]
     doc_df['content_meta'] = doc_df['meta_clean'].astype(str) + ".<br><br>" + doc_df['page_content'].astype(str)
 
-    modified_page_content = [f" SOURCE {i+1} - {word}" for i, word in enumerate(doc_df['page_content'])]
-    docs_content_string = '<br><br>'.join(modified_page_content)
+    #modified_page_content = [f" SOURCE {i+1} - {word}" for i, word in enumerate(doc_df['page_content'])]
+    modified_page_content = [f" SOURCE {i+1} - {word}" for i, word in enumerate(doc_df['content_meta'])]
+    docs_content_string = '<br><br>'.join(modified_page_content)
 
     sources_docs_content_string = '<br><br>'.join(doc_df['content_meta'])#.replace(" "," ")#.strip()
 
@@ -481,6 +475,19 @@ def adapt_q_from_chat_history(question, chat_history, extracted_memory, keyword_
 
     return new_question_kworded
 
+def determine_file_type(file_path):
+    """
+    Determine the file type based on its extension.
+
+    Parameters:
+        file_path (str): Path to the file.
+
+    Returns:
+        str: File extension (e.g., '.pdf', '.docx', '.txt', '.html').
+    """
+    return os.path.splitext(file_path)[1].lower()
+
+
 def create_doc_df(docs_keep_out):
     # Extract content and metadata from 'winning' passages.
     content=[]
@@ -489,11 +496,17 @@ def create_doc_df(docs_keep_out):
     page_section=[]
     score=[]
 
+
+
     for item in docs_keep_out:
         content.append(item[0].page_content)
         meta.append(item[0].metadata)
         meta_url.append(item[0].metadata['source'])
-        page_section.append(item[0].metadata['page_section'])
+
+        file_extension = determine_file_type(item[0].metadata['source'])
+        if (file_extension != ".csv") & (file_extension != ".xlsx"):
+            page_section.append(item[0].metadata['page_section'])
+        else: page_section.append("")
         score.append(item[1])
 
     # Create df from 'winning' passages
@@ -728,6 +741,12 @@ def get_expanded_passages(vectorstore, docs, width):
     expanded_docs = []
     for doc, score in docs:
         search_source = doc.metadata['source']
+
+
+        #if file_type == ".csv" | file_type == ".xlsx":
+        #    content_str, meta_first, meta_last = get_parent_content_and_meta(vstore_by_source[search_source], 0, search_index)
+
+        #else:
         search_section = doc.metadata['page_section']
         parent_vstore_meta_section = [doc.metadata['page_section'] for _, doc in vstore_by_source[search_source]]
         search_index = parent_vstore_meta_section.index(search_section) if search_section in parent_vstore_meta_section else -1
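Because `determine_file_type` lowercases the extension, the tabular-data guard in `generate_expanded_prompt` and `create_doc_df` also catches uppercase `.CSV`/`.XLSX` uploads. A small self-contained check of that guard; the function body is copied from the diff, and the sample paths are invented for illustration:

import os

def determine_file_type(file_path):
    # Same one-liner as added in the diff: the extension, lowercased
    return os.path.splitext(file_path)[1].lower()

for path in ["Lambeth_2030-Our_Future_Our_Lambeth.pdf", "responses.CSV", "survey.xlsx"]:
    file_type = determine_file_type(path)
    # Spreadsheet rows have no neighbouring context on a page, so passage
    # expansion is skipped for tabular sources
    expand_passages = (file_type != ".csv") & (file_type != ".xlsx")
    print(f"{path!r} -> {file_type}, expand passages: {expand_passages}")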
chatfuncs/ingest.py CHANGED

@@ -44,31 +44,32 @@ chunk_overlap = 0
 start_index = True
 
 ## Parse files
+def determine_file_type(file_path):
+    """
+    Determine the file type based on its extension.
+
+    Parameters:
+        file_path (str): Path to the file.
+
+    Returns:
+        str: File extension (e.g., '.pdf', '.docx', '.txt', '.html').
+    """
+    return os.path.splitext(file_path)[1].lower()
 
-def parse_file(file_paths):
+def parse_file(file_paths, text_column='text'):
     """
     Accepts a list of file paths, determines each file's type based on its extension,
     and passes it to the relevant parsing function.
 
     Parameters:
         file_paths (list): List of file paths.
-
+        text_column (str): Name of the column in CSV/Excel files that contains the text content.
 
     Returns:
         dict: A dictionary with file paths as keys and their parsed content (or error message) as values.
     """
 
-    def determine_file_type(file_path):
-        """
-        Determine the file type based on its extension.
-
-        Parameters:
-        file_path (str): Path to the file.
-
-        Returns:
-        str: File extension (e.g., '.pdf', '.docx', '.txt', '.html').
-        """
-        return os.path.splitext(file_path)[1].lower()
 
     if not isinstance(file_paths, list):
         raise ValueError("Expected a list of file paths.")
@@ -78,7 +79,9 @@ def parse_file(file_paths):
         '.docx': parse_docx,
         '.txt': parse_txt,
         '.html': parse_html,
-        '.htm': parse_html  # Considering both .html and .htm for HTML files
+        '.htm': parse_html,  # Considering both .html and .htm for HTML files
+        '.csv': lambda file_path: parse_csv_or_excel(file_path, text_column),
+        '.xlsx': lambda file_path: parse_csv_or_excel(file_path, text_column)
     }
 
     parsed_contents = {}
@@ -115,6 +118,64 @@ def text_regex_clean(text):
 
     return text
 
+def parse_csv_or_excel(file_paths, text_column = "text"):
+    """
+    Read in a CSV or Excel file.
+
+    Parameters:
+        file_path (str): Path to the CSV file.
+        text_column (str): Name of the column in the CSV file that contains the text content.
+
+    Returns:
+        Pandas DataFrame: Dataframe output from file read
+    """
+
+    file_names = []
+    out_df = pd.DataFrame()
+
+    for file_path in file_paths:
+        file_extension = determine_file_type(file_path.name)
+        file_name = get_file_path_end(file_path.name)
+
+        if file_extension == ".csv":
+            df = pd.read_csv(file_path.name)
+            if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
+            df['source'] = file_name
+            df['page_section'] = ""
+        elif file_extension == ".xlsx":
+            df = pd.read_excel(file_path.name, engine='openpyxl')
+            if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
+            df['source'] = file_name
+            df['page_section'] = ""
+        else:
+            print(f"Unsupported file type: {file_extension}")
+            return pd.DataFrame(), ['Please choose a valid file type']
+
+        file_names.append(file_name)
+        out_df = pd.concat([out_df, df])
+
+    #if text_column not in df.columns:
+    #    return f"Column '{text_column}' not found in {file_path}"
+    #text_out = " ".join(df[text_column].dropna().astype(str))
+    return out_df, file_names
+
+def parse_excel(file_path, text_column):
+    """
+    Read text from an Excel file.
+
+    Parameters:
+        file_path (str): Path to the Excel file.
+        text_column (str): Name of the column in the Excel file that contains the text content.
+
+    Returns:
+        Pandas DataFrame: Dataframe output from file read
+    """
+    df = pd.read_excel(file_path, engine='openpyxl')
+    #if text_column not in df.columns:
+    #    return f"Column '{text_column}' not found in {file_path}"
+    #text_out = " ".join(df[text_column].dropna().astype(str))
+    return df
+
 def parse_pdf(file) -> List[str]:
 
     """
@@ -308,8 +369,9 @@ def text_to_docs(text_dict: dict, chunk_size: int = chunk_size) -> List[Document
         if ext == '.pdf':
             docs, page_docs = pdf_text_to_docs(content, chunk_size)
         elif ext in ['.html', '.htm', '.txt', '.docx']:
-            # Assuming you want to process HTML similarly to PDF in this context
             docs = html_text_to_docs(content, chunk_size)
+        elif ext in ['.csv', '.xlsx']:
+            docs, page_docs = csv_excel_text_to_docs(content, chunk_size)
         else:
             print(f"Unsupported file type {ext} for {file_path}. Skipping.")
             continue
@@ -400,6 +462,44 @@ def html_text_to_docs(texts, metadatas, chunk_size:int = chunk_size):
 
     return documents
 
+def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Document]:
+    """Converts a DataFrame's content to a list of Documents with metadata."""
+
+    doc_sections = []
+    df[text_column] = df[text_column].astype(str)  # Ensure column is a string column
+
+    # For each row in the dataframe
+    for idx, row in df.iterrows():
+        # Extract the text content for the document
+        doc_content = row[text_column]
+
+        # Generate metadata containing other columns' data
+        metadata = {"row": idx + 1}
+        for col, value in row.items():
+            if col != text_column:
+                metadata[col] = value
+
+        # If chunk_size is provided, split the text into chunks
+        if chunk_size:
+            # Assuming you have a text splitter function similar to the PDF handling
+            text_splitter = RecursiveCharacterTextSplitter(
+                chunk_size=chunk_size,
+                # Other arguments as required by the splitter
+            )
+            sections = text_splitter.split_text(doc_content)
+
+            # For each section, create a Document object
+            for i, section in enumerate(sections):
+                doc = Document(page_content=section,
+                               metadata={**metadata, "section": i, "row_section": f"{metadata['row']}-{i}"})
+                doc_sections.append(doc)
+        else:
+            # If no chunk_size is provided, create a single Document object for the row
+            doc = Document(page_content=doc_content, metadata=metadata)
+            doc_sections.append(doc)
+
+    return doc_sections
+
 # # Functions for working with documents after loading them back in
 
 def pull_out_data(series):
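End to end, the new ingestion path reads each uploaded file into one DataFrame, tags every row with its source file, and (with no `chunk_size`) emits one `Document` per row whose metadata carries the remaining columns. A hedged usage sketch, assuming a local `reviews.csv` with a `text` column exists; `SimpleNamespace` mimics the Gradio upload object, which exposes the file path via `.name`:

from types import SimpleNamespace

import chatfuncs.ingest as ing

# Gradio hands uploaded files over as objects with a .name path attribute
files = [SimpleNamespace(name="reviews.csv")]

df, file_names = ing.parse_csv_or_excel(files, text_column="text")
docs = ing.csv_excel_text_to_docs(df, text_column="text")  # one Document per row

print(file_names)                   # source names derived from the file paths
print(len(docs), docs[0].metadata)  # row number plus the non-text columns

Keeping each row as its own Document fits the guard added in chatfuncs.py: rows are independent records, so there is no neighbouring passage to expand into.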