Spaces:

Ekimetrics
/

climate-question-answering

Running

App Files Files Community

TheoLvs committed on Feb 1

Commit

4b4bf28

•

1 Parent(s): 24f8d00

Updated v1.3 with images

Browse files

Files changed (8) hide show

app.py +161 -91
climateqa/engine/prompts.py +44 -3
climateqa/engine/rag.py +43 -7
climateqa/engine/retriever.py +5 -4
climateqa/engine/vectorstore.py +19 -10
climateqa/sample_questions.py +6 -0
climateqa/utils.py +1 -1
style.css +25 -1

app.py CHANGED Viewed

@@ -9,6 +9,10 @@ import os
 import time
 import re
 import json
 from datetime import datetime
 from azure.storage.fileshare import ShareServiceClient
@@ -64,8 +68,6 @@ file_share_name = "climateqa"
 service = ShareServiceClient(account_url=account_url, credential=credential)
 share_client = service.get_share_client(file_share_name)
-print("YO",account_url,credential)
 user_id = create_user_id()
@@ -145,18 +147,12 @@ async def chat(query,history,audience,sources,reports):
         reports = []
-    retriever = ClimateQARetriever(vectorstore=vectorstore,sources = sources,reports = reports,k_summary = 3,k_total = 10,threshold=0.5)
     rag_chain = make_rag_chain(retriever,llm)
-    source_string = ""
     # gradio_format = make_pairs([a.content for a in history]) + [(query, "")]
     # history = history + [(query,"")]
     # print(history)
     # print(gradio_format)
     # # reset memory
@@ -227,7 +223,7 @@ async def chat(query,history,audience,sources,reports):
                     output_language = op['value']["language"] # str
                     output_query = op["value"]["question"]
                 except Exception as e:
-                    raise gr.Error(f"ClimateQ&A Error: {e}\nThe error has been noted, try another question and if the error remains, you can contact us :)")
             elif op['path'] == retriever_path_id: # documents
                 try:
@@ -267,8 +263,7 @@ async def chat(query,history,audience,sources,reports):
             yield history,docs_html,output_query,output_language,gallery
     except Exception as e:
-        print(f"Error in fallback iterator: {e}")
-        raise gr.Error(f"ClimateQ&A Error: {e}\nThe error has been noted, try another question and if the error remains, you can contact us :)")
     try:
@@ -282,6 +277,7 @@ async def chat(query,history,audience,sources,reports):
                 "prompt": prompt,
                 "query": prompt,
                 "question":output_query,
                 "docs":serialize_docs(docs),
                 "answer": history[-1][1],
                 "time": timestamp,
@@ -289,8 +285,43 @@ async def chat(query,history,audience,sources,reports):
             log_on_azure(file, logs, share_client)
     except Exception as e:
         print(f"Error logging on Azure Blob Storage: {e}")
-        raise gr.Error(f"ClimateQ&A Error: {str(e)[:100]}\nThe error has been noted, try another question and if the error remains, you can contact us :)")
     # gallery = [x.metadata["image_path"] for x in docs if (len(x.metadata["image_path"]) > 0 and "IAS" in x.metadata["image_path"])]
     # if len(gallery) > 0:
@@ -334,21 +365,66 @@ def make_html_source(source,i):
     meta = source.metadata
     # content = source.page_content.split(":",1)[1].strip()
     content = source.page_content.strip()
-    return f"""
-<div class="card">
-    <div class="card-content">
-        <h2>Doc {i} - {meta['short_name']} - Page {int(meta['page_number'])}</h2>
-        <p>{content}</p>
-    </div>
-    <div class="card-footer">
-        <span>{meta['name']}</span>
-        <a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
-            <span role="img" aria-label="Open PDF">🔗</span>
-        </a>
     </div>
-</div>
-"""
@@ -501,71 +577,6 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main
                         output_query = gr.Textbox(label="Query used for retrieval",show_label = True,elem_id = "reformulated-query",lines = 2,interactive = False)
                         output_language = gr.Textbox(label="Language",show_label = True,elem_id = "language",lines = 1,interactive = False)
-                    with gr.Tab("Figures",elem_id = "tab-images",id = 3):
-                        gallery = gr.Gallery()
-                def start_chat(query,history):
-                    history = history + [(query,"")]
-                    history = [tuple(x) for x in history]
-                    print(history)
-                    return (gr.update(interactive = False),gr.update(selected=1),history)
-                def finish_chat():
-                    return (gr.update(interactive = True,value = ""))
-                (textbox
-                    .submit(start_chat, [textbox,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_textbox")
-                    .then(chat, [textbox,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery],concurrency_limit = 8,api_name = "chat_textbox")
-                    .then(finish_chat, None, [textbox],api_name = "finish_chat_textbox")
-                )
-                (examples_hidden
-                    .change(start_chat, [examples_hidden,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_examples")
-                    .then(chat, [examples_hidden,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery],concurrency_limit = 8,api_name = "chat_examples")
-                    .then(finish_chat, None, [textbox],api_name = "finish_chat_examples")
-                )
-                def change_sample_questions(key):
-                    index = list(QUESTIONS.keys()).index(key)
-                    visible_bools = [False] * len(samples)
-                    visible_bools[index] = True
-                    return [gr.update(visible=visible_bools[i]) for i in range(len(samples))]
-                dropdown_samples.change(change_sample_questions,dropdown_samples,samples)
-                # # textbox.submit(predict_climateqa,[textbox,bot],[None,bot,sources_textbox])
-                # (textbox
-                #     .submit(answer_user, [textbox,examples_hidden, bot], [textbox, bot],queue = False)
-                #     .success(change_tab,None,tabs)
-                #     .success(fetch_sources,[textbox,dropdown_sources], [textbox,sources_textbox,docs_textbox,output_query,output_language])
-                #     .success(answer_bot, [textbox,bot,docs_textbox,output_query,output_language,dropdown_audience], [textbox,bot],queue = True)
-                #     .success(lambda x : textbox,[textbox],[textbox])
-                # )
-                # (examples_hidden
-                #     .change(answer_user_example, [textbox,examples_hidden, bot], [textbox, bot],queue = False)
-                #     .success(change_tab,None,tabs)
-                #     .success(fetch_sources,[textbox,dropdown_sources], [textbox,sources_textbox,docs_textbox,output_query,output_language])
-                #     .success(answer_bot, [textbox,bot,docs_textbox,output_query,output_language,dropdown_audience], [textbox,bot],queue=True)
-                #     .success(lambda x : textbox,[textbox],[textbox])
-                # )
-                # submit_button.click(answer_user, [textbox, bot], [textbox, bot], queue=True).then(
-                #         answer_bot, [textbox,bot,dropdown_audience,dropdown_sources], [textbox,bot,sources_textbox]
-                #     )
@@ -575,6 +586,9 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main
 #---------------------------------------------------------------------------------------
     with gr.Tab("About ClimateQ&A",elem_classes = "max-height other-tabs"):
         with gr.Row():
             with gr.Column(scale=1):
@@ -758,6 +772,62 @@ Or around 2 to 4 times more than a typical Google search.
 """
     )
     demo.queue()
 demo.launch()

 import time
 import re
 import json
+from io import BytesIO
+import base64
 from datetime import datetime
 from azure.storage.fileshare import ShareServiceClient
 service = ShareServiceClient(account_url=account_url, credential=credential)
 share_client = service.get_share_client(file_share_name)
 user_id = create_user_id()
         reports = []
+    retriever = ClimateQARetriever(vectorstore=vectorstore,sources = sources,min_size = 200,reports = reports,k_summary = 3,k_total = 15,threshold=0.5)
     rag_chain = make_rag_chain(retriever,llm)
     # gradio_format = make_pairs([a.content for a in history]) + [(query, "")]
     # history = history + [(query,"")]
     # print(history)
     # print(gradio_format)
     # # reset memory
                     output_language = op['value']["language"] # str
                     output_query = op["value"]["question"]
                 except Exception as e:
+                    raise gr.Error(f"ClimateQ&A Error: {e} - The error has been noted, try another question and if the error remains, you can contact us :)")
             elif op['path'] == retriever_path_id: # documents
                 try:
             yield history,docs_html,output_query,output_language,gallery
     except Exception as e:
+        raise gr.Error(f"ClimateQ&A Error: {e}</br>The error has been noted, try another question and if the error remains, you can contact us :)")
     try:
                 "prompt": prompt,
                 "query": prompt,
                 "question":output_query,
+                "sources":sources,
                 "docs":serialize_docs(docs),
                 "answer": history[-1][1],
                 "time": timestamp,
             log_on_azure(file, logs, share_client)
     except Exception as e:
         print(f"Error logging on Azure Blob Storage: {e}")
+        raise gr.Error(f"ClimateQ&A Error: {str(e)[:100]}</br>The error has been noted, try another question and if the error remains, you can contact us :)")
+    image_dict = {}
+    for i,doc in enumerate(docs):
+        if doc.metadata["chunk_type"] == "image":
+            try:
+                key = f"Image {i}"
+                image_path = doc.metadata["image_path"].split("documents/")[1]
+                img = get_image_from_azure_blob_storage(image_path)
+                # Convert the image to a byte buffer
+                buffered = BytesIO()
+                img.save(buffered, format="PNG")
+                img_str = base64.b64encode(buffered.getvalue()).decode()
+                # Embedding the base64 string in Markdown
+                markdown_image = f"![Alt text](data:image/png;base64,{img_str})"
+                image_dict[key] = {"img":img,"md":markdown_image,"caption":doc.page_content,"key":key,"figure_code":doc.metadata["figure_code"]}
+            except Exception as e:
+                print(f"Skipped adding image {i} because of {e}")
+    if len(image_dict) > 0:
+        gallery = [x["img"] for x in list(image_dict.values())]
+        img = list(image_dict.values())[0]
+        img_md = img["md"]
+        img_caption = img["caption"]
+        img_code = img["figure_code"]
+        if img_code != "N/A":
+            img_name = f"{img['key']} - {img['figure_code']}"
+        else:
+            img_name = f"{img['key']}"
+        answer_yet = history[-1][1] + f"\n\n{img_md}\n<p class='chatbot-caption'><b>{img_name}</b> - {img_caption}</p>"
+        history[-1] = (history[-1][0],answer_yet)
+        history = [tuple(x) for x in history]
     # gallery = [x.metadata["image_path"] for x in docs if (len(x.metadata["image_path"]) > 0 and "IAS" in x.metadata["image_path"])]
     # if len(gallery) > 0:
     meta = source.metadata
     # content = source.page_content.split(":",1)[1].strip()
     content = source.page_content.strip()
+    toc_levels = []
+    for j in range(2):
+        level = meta[f"toc_level{j}"]
+        if level != "N/A":
+            toc_levels.append(level)
+        else:
+            break
+    toc_levels = " > ".join(toc_levels)
+    print(toc_levels)
+    if len(toc_levels) > 0:
+        name = f"<b>{toc_levels}</b><br/>{meta['name']}"
+    else:
+        name = meta['name']
+    print(name)
+    if meta["chunk_type"] == "text":
+        card = f"""
+    <div class="card">
+        <div class="card-content">
+            <h2>Doc {i} - {meta['short_name']} - Page {int(meta['page_number'])}</h2>
+            <p>{content}</p>
+        </div>
+        <div class="card-footer">
+            <span>{name}</span>
+            <a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
+                <span role="img" aria-label="Open PDF">🔗</span>
+            </a>
+        </div>
     </div>
+    """
+    else:
+        if meta["figure_code"] != "N/A":
+            title = f"{meta['figure_code']} - {meta['short_name']}"
+        else:
+            title = f"{meta['short_name']}"
+        card = f"""
+    <div class="card card-image">
+        <div class="card-content">
+            <h2>Image {i} - {title} - Page {int(meta['page_number'])}</h2>
+            <p>{content}</p>
+            <p class='ai-generated'>AI-generated description</p>
+        </div>
+        <div class="card-footer">
+            <span>{name}</span>
+            <a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
+                <span role="img" aria-label="Open PDF">🔗</span>
+            </a>
+        </div>
+    </div>
+    """
+    return card
                         output_query = gr.Textbox(label="Query used for retrieval",show_label = True,elem_id = "reformulated-query",lines = 2,interactive = False)
                         output_language = gr.Textbox(label="Language",show_label = True,elem_id = "language",lines = 1,interactive = False)
 #---------------------------------------------------------------------------------------
+    with gr.Tab("Figures",elem_id = "tab-images",elem_classes = "max-height other-tabs"):
+        gallery_component = gr.Gallery()
     with gr.Tab("About ClimateQ&A",elem_classes = "max-height other-tabs"):
         with gr.Row():
             with gr.Column(scale=1):
 """
     )
+    def start_chat(query,history):
+        history = history + [(query,"")]
+        history = [tuple(x) for x in history]
+        print(history)
+        return (gr.update(interactive = False),gr.update(selected=1),history)
+    def finish_chat():
+        return (gr.update(interactive = True,value = ""))
+    (textbox
+        .submit(start_chat, [textbox,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_textbox")
+        .then(chat, [textbox,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery_component],concurrency_limit = 8,api_name = "chat_textbox")
+        .then(finish_chat, None, [textbox],api_name = "finish_chat_textbox")
+    )
+    (examples_hidden
+        .change(start_chat, [examples_hidden,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_examples")
+        .then(chat, [examples_hidden,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery_component],concurrency_limit = 8,api_name = "chat_examples")
+        .then(finish_chat, None, [textbox],api_name = "finish_chat_examples")
+    )
+    def change_sample_questions(key):
+        index = list(QUESTIONS.keys()).index(key)
+        visible_bools = [False] * len(samples)
+        visible_bools[index] = True
+        return [gr.update(visible=visible_bools[i]) for i in range(len(samples))]
+    dropdown_samples.change(change_sample_questions,dropdown_samples,samples)
+    # # textbox.submit(predict_climateqa,[textbox,bot],[None,bot,sources_textbox])
+    # (textbox
+    #     .submit(answer_user, [textbox,examples_hidden, bot], [textbox, bot],queue = False)
+    #     .success(change_tab,None,tabs)
+    #     .success(fetch_sources,[textbox,dropdown_sources], [textbox,sources_textbox,docs_textbox,output_query,output_language])
+    #     .success(answer_bot, [textbox,bot,docs_textbox,output_query,output_language,dropdown_audience], [textbox,bot],queue = True)
+    #     .success(lambda x : textbox,[textbox],[textbox])
+    # )
+    # (examples_hidden
+    #     .change(answer_user_example, [textbox,examples_hidden, bot], [textbox, bot],queue = False)
+    #     .success(change_tab,None,tabs)
+    #     .success(fetch_sources,[textbox,dropdown_sources], [textbox,sources_textbox,docs_textbox,output_query,output_language])
+    #     .success(answer_bot, [textbox,bot,docs_textbox,output_query,output_language,dropdown_audience], [textbox,bot],queue=True)
+    #     .success(lambda x : textbox,[textbox],[textbox])
+    # )
+    # submit_button.click(answer_user, [textbox, bot], [textbox, bot], queue=True).then(
+    #         answer_bot, [textbox,bot,dropdown_audience,dropdown_sources], [textbox,bot,sources_textbox]
+    #     )
     demo.queue()
 demo.launch()

climateqa/engine/prompts.py CHANGED Viewed

@@ -49,6 +49,7 @@ Guidelines:
 - You do not need to use every passage. Only use the ones that help answer the question.
 - If the documents do not have the information needed to answer the question, just say you do not have enough information.
 - Consider by default that the question is about the past century unless it is specified otherwise.
 -----------------------
 Passages:
@@ -59,18 +60,58 @@ Question: {question} - Explained to {audience}
 Answer in {language} with the passages citations:
 """
 answer_prompt_without_docs_template = """
 You are ClimateQ&A, an AI Assistant created by Ekimetrics. Your role is to explain climate-related questions using info from the IPCC and/or IPBES reports.
 Always stay true to climate science and do not make up information. If you do not know the answer, just say you do not know.
 Guidelines:
-- Start by explaining clearly that you could not find the answer in the IPCC/IPBES reports, so your answer is based on your own knowledge and must be taken with great caution because it's AI generated.
 - Also explain that the user should reformulate the question to be more specific to the IPCC/IPBES reports
 - Because your role is to explain climate science, you can never be skeptical about climate change.
-- If the question is not related to environmental issues, never never answer it. Say it's not your role.
-Question: {question} - Explained to {audience}
 Answer in {language}:
 """

 - You do not need to use every passage. Only use the ones that help answer the question.
 - If the documents do not have the information needed to answer the question, just say you do not have enough information.
 - Consider by default that the question is about the past century unless it is specified otherwise.
+- If the passage is the caption of a picture, you can still use it as part of your answer as any other document.
 -----------------------
 Passages:
 Answer in {language} with the passages citations:
 """
+answer_prompt_images_template = """
+You are ClimateQ&A, an AI Assistant created by Ekimetrics.
+You are given the answer to a environmental question based on passages from the IPCC and IPBES reports and image captions.
+Generate a follow-up and illustrated explanation to the existing answer using the content of the image caption.
+The actual images will be inserted in the user interface afterward.
+Guidelines:
+- Don't summarize the previous answer or make an introduction, you only need to illustrate with the images.
+- Mention the image using similar sentence : "Indeed, as we see in this picture ...", "In the following image, it is shown that ...", but without mentioning the Image number
+- Insert a placeholder like this [Image i] and by skipping to a new line before and after, where the image will be displayed within your explanation
+For example :
+```
+Sea rise is projected to endanger isolated islands by 2050.
+In the figure below, we see an projection of sea level rise:
+[Image 1]
+This image is depicting the urgency depicted in the passages.
+```
+-----------------------
+Image captions:
+{images}
+-----------------------
+Question:
+{question}
+-----------------------
+Answer:
+{answer}
+-----------------------
+Follow-up explanation in {language} explained to {audience}:
+"""
 answer_prompt_without_docs_template = """
 You are ClimateQ&A, an AI Assistant created by Ekimetrics. Your role is to explain climate-related questions using info from the IPCC and/or IPBES reports.
 Always stay true to climate science and do not make up information. If you do not know the answer, just say you do not know.
 Guidelines:
+- If it's a conversational question such as "hello", "who are you", ..., you can answer directly
+- Start by explaining clearly that you could not find any passages to answer in the IPCC/IPBES reports, but it can be because of the search engine, and not because it's not there. So your answer is based on your own knowledge and must be taken with great caution because it's AI generated and you prefer to use sources to answer.
 - Also explain that the user should reformulate the question to be more specific to the IPCC/IPBES reports
 - Because your role is to explain climate science, you can never be skeptical about climate change.
+- If the question is not related to environmental issues, never never answer it. Say it's not your role.
+- Make paragraphs by starting new lines to make your answers more readable.
+Question: {question}
 Answer in {language}:
 """

climateqa/engine/rag.py CHANGED Viewed

@@ -7,7 +7,7 @@ from langchain_core.prompts.prompt import PromptTemplate
 from langchain_core.prompts.base import format_document
 from climateqa.engine.reformulation import make_reformulation_chain
-from climateqa.engine.prompts import answer_prompt_template,answer_prompt_without_docs_template
 from climateqa.engine.utils import pass_values, flatten_dict
@@ -16,10 +16,26 @@ DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}"
 def _combine_documents(
     docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, sep="\n\n"
 ):
-    doc_strings = [f"Doc {i+1}: " + format_document(doc, document_prompt) for i,doc in enumerate(docs)]
     return sep.join(doc_strings)
 def make_rag_chain(retriever,llm):
@@ -51,22 +67,29 @@ def make_rag_chain(retriever,llm):
         **pass_values(["question","audience","language"])
     }
-    # Generate the answer
     answer_with_docs = {
         "answer": input_documents | prompt | llm | StrOutputParser(),
-        **pass_values(["question","audience","language","query","docs"])
     }
     answer_without_docs = {
         "answer":  prompt_without_docs | llm | StrOutputParser(),
-        **pass_values(["question","audience","language","query","docs"])
     }
     answer = RunnableBranch(
-        (lambda x: len(x["docs"]) > 0, answer_with_docs),
         answer_without_docs,
     )
@@ -77,3 +100,16 @@ def make_rag_chain(retriever,llm):
     return rag_chain

 from langchain_core.prompts.base import format_document
 from climateqa.engine.reformulation import make_reformulation_chain
+from climateqa.engine.prompts import answer_prompt_template,answer_prompt_without_docs_template,answer_prompt_images_template
 from climateqa.engine.utils import pass_values, flatten_dict
 def _combine_documents(
     docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, sep="\n\n"
 ):
+    doc_strings =  []
+    for i,doc in enumerate(docs):
+        # chunk_type = "Doc" if doc.metadata["chunk_type"] == "text" else "Image"
+        chunk_type = "Doc"
+        doc_string = f"{chunk_type} {i+1}: " + format_document(doc, document_prompt)
+        doc_string = doc_string.replace("\n"," ")
+        doc_strings.append(doc_string)
     return sep.join(doc_strings)
def get_text_docs(x):
    """Return the documents of ``x`` whose ``chunk_type`` metadata equals ``"text"``."""
    return list(filter(lambda doc: doc.metadata["chunk_type"] == "text", x))
def get_image_docs(x):
    """Return the documents of ``x`` whose ``chunk_type`` metadata equals ``"image"``."""
    images = []
    for doc in x:
        if doc.metadata["chunk_type"] == "image":
            images.append(doc)
    return images
 def make_rag_chain(retriever,llm):
         **pass_values(["question","audience","language"])
     }
+    # ------- CHAIN 3
+    # Bot answer
     answer_with_docs = {
         "answer": input_documents | prompt | llm | StrOutputParser(),
+        **pass_values(["question","audience","language","query","docs"]),
     }
     answer_without_docs = {
         "answer":  prompt_without_docs | llm | StrOutputParser(),
+        **pass_values(["question","audience","language","query","docs"]),
     }
+    # def has_images(x):
+    #     image_docs = [doc for doc in x["docs"] if doc.metadata["chunk_type"]=="image"]
+    #     return len(image_docs) > 0
+    def has_docs(x):
+        return len(x["docs"]) > 0
     answer = RunnableBranch(
+        (lambda x: has_docs(x), answer_with_docs),
         answer_without_docs,
     )
     return rag_chain
def make_illustration_chain(llm):
    """Build the chain that writes an image-based follow-up to an answer.

    The ``images`` prompt variable is produced by combining the image
    documents found under the input's ``docs`` key; ``question``,
    ``audience``, ``language`` and ``answer`` are forwarded through
    ``pass_values``. The prompt built from ``answer_prompt_images_template``
    is piped into ``llm`` and the result into a ``StrOutputParser``.
    """
    prompt_inputs = {
        "images": lambda x: _combine_documents(get_image_docs(x["docs"])),
        **pass_values(["question","audience","language","answer"]),
    }
    image_prompt = ChatPromptTemplate.from_template(answer_prompt_images_template)
    return prompt_inputs | image_prompt | llm | StrOutputParser()

climateqa/engine/retriever.py CHANGED Viewed

@@ -18,7 +18,8 @@ class ClimateQARetriever(BaseRetriever):
     threshold:float = 0.6
     k_summary:int = 3
     k_total:int = 10
-    namespace:str = "vectors"
     def _get_relevant_documents(
@@ -31,8 +32,8 @@ class ClimateQARetriever(BaseRetriever):
         assert self.k_total > self.k_summary, "k_total should be greater than k_summary"
         # Prepare base search kwargs
         filters = {}
         if len(self.reports) > 0:
             filters["short_name"] = {"$in":self.reports}
         else:
@@ -59,14 +60,14 @@ class ClimateQARetriever(BaseRetriever):
         docs = docs_summaries + docs_full
         # Filter if scores are below threshold
-        docs = [x for x in docs if x[1] > self.threshold]
         # Add score to metadata
         results = []
         for i,(doc,score) in enumerate(docs):
             doc.metadata["similarity_score"] = score
             doc.metadata["content"] = doc.page_content
-            doc.metadata["page_number"] = int(doc.metadata["page_number"])
             # doc.page_content = f"""Doc {i+1} - {doc.metadata['short_name']}: {doc.page_content}"""
             results.append(doc)

     threshold:float = 0.6
     k_summary:int = 3
     k_total:int = 10
+    namespace:str = "vectors",
+    min_size:int = 200,
     def _get_relevant_documents(
         assert self.k_total > self.k_summary, "k_total should be greater than k_summary"
         # Prepare base search kwargs
         filters = {}
         if len(self.reports) > 0:
             filters["short_name"] = {"$in":self.reports}
         else:
         docs = docs_summaries + docs_full
         # Filter if scores are below threshold
+        # docs = [x for x in docs if x[1] > self.threshold and len(x[0].page_content) > self.min_size]
         # Add score to metadata
         results = []
         for i,(doc,score) in enumerate(docs):
             doc.metadata["similarity_score"] = score
             doc.metadata["content"] = doc.page_content
+            doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
             # doc.page_content = f"""Doc {i+1} - {doc.metadata['short_name']}: {doc.page_content}"""
             results.append(doc)

climateqa/engine/vectorstore.py CHANGED Viewed

@@ -2,8 +2,8 @@
 # More info at https://docs.pinecone.io/docs/langchain
 # And https://python.langchain.com/docs/integrations/vectorstores/pinecone
 import os
-import pinecone
-from langchain_community.vectorstores import Pinecone
 # LOAD ENVIRONMENT VARIABLES
 try:
@@ -13,20 +13,29 @@ except:
     pass
-def get_pinecone_vectorstore(embeddings,text_key = "text"):
-    # initialize pinecone
-    pinecone.init(
-        api_key=os.getenv("PINECONE_API_KEY"),  # find at app.pinecone.io
-        environment=os.getenv("PINECONE_API_ENVIRONMENT"),  # next to api key in console
-    )
-    index_name = os.getenv("PINECONE_API_INDEX")
-    vectorstore = Pinecone.from_existing_index(index_name, embeddings,text_key = text_key)
     return vectorstore
 # def get_pinecone_retriever(vectorstore,k = 10,namespace = "vectors",sources = ["IPBES","IPCC"]):
 #     assert isinstance(sources,list)

 # More info at https://docs.pinecone.io/docs/langchain
 # And https://python.langchain.com/docs/integrations/vectorstores/pinecone
 import os
+from pinecone import Pinecone
+from langchain_community.vectorstores import Pinecone as PineconeVectorstore
 # LOAD ENVIRONMENT VARIABLES
 try:
     pass
def get_pinecone_vectorstore(embeddings,text_key = "content"):
    """Return a LangChain Pinecone vector store bound to the configured index.

    The API key and the index name are read from the ``PINECONE_API_KEY`` and
    ``PINECONE_API_INDEX`` environment variables. ``embeddings`` and
    ``text_key`` are passed through to the vector store constructor.
    """
    api_key = os.getenv("PINECONE_API_KEY")
    index_name = os.getenv("PINECONE_API_INDEX")
    # The index handle is obtained from a Pinecone client instance.
    client = Pinecone(api_key=api_key)
    pinecone_index = client.Index(index_name)
    return PineconeVectorstore(pinecone_index, embeddings, text_key)
 # def get_pinecone_retriever(vectorstore,k = 10,namespace = "vectors",sources = ["IPBES","IPCC"]):
 #     assert isinstance(sources,list)

climateqa/sample_questions.py CHANGED Viewed

@@ -73,6 +73,12 @@ QUESTIONS = {
         "What are the impacts of invasive alien species on Indigenous Peoples and local communities?",
         "What technologies and tools are available for managing invasive alien species?",
         "How do economic and land-use changes facilitate the introduction and spread of invasive alien species?"
     ]
 }

         "What are the impacts of invasive alien species on Indigenous Peoples and local communities?",
         "What technologies and tools are available for managing invasive alien species?",
         "How do economic and land-use changes facilitate the introduction and spread of invasive alien species?"
+    ],
+    "Experimental images":[
+        "Is warming unprecedented in the past 200 years ?",
+        "Are human activities causing global warming?",
+        "What is the distribution of uncertainty in projected precipitation changes across different time frames ?",
+        "What are the anticipated changes in the global water cycle by the end of the 21st century under an intermediate emissions scenario ?",
     ]
 }

climateqa/utils.py CHANGED Viewed

@@ -15,7 +15,7 @@ def get_file_from_azure_blob_storage(path):
 def get_image_from_azure_blob_storage(path):
-    base_path = "search_demo/climateq&a/processed_image/"
     path = os.path.join(base_path, path)
     file_object = get_file_from_azure_blob_storage(path)
     image = Image.open(file_object)

 def get_image_from_azure_blob_storage(path):
+    base_path = "climateqa/documents/"
     path = os.path.join(base_path, path)
     file_object = get_file_from_azure_blob_storage(path)
     image = Image.open(file_object)

style.css CHANGED Viewed

@@ -295,4 +295,28 @@ body.dark .card-footer span {
     white-space: normal !important; /* Allow the text to wrap */
     word-break: break-word !important; /* Break words to prevent overflow */
     overflow-wrap: break-word !important; /* Break long words if necessary */
-  }

     white-space: normal !important; /* Allow the text to wrap */
     word-break: break-word !important; /* Break words to prevent overflow */
     overflow-wrap: break-word !important; /* Break long words if necessary */
+  }
/* Images that are direct children of a paragraph inside span.chatbot:
   extra top spacing, no height cap, at most 80% width, square corners. */
span.chatbot > p > img {
    border-radius: 0px !important;
    max-width: 80% !important;
    max-height: none !important;
    margin-top: 40px !important;
}
/* Caption text: small, italic, blue-grey. */
.chatbot-caption {
    color: #508094;
    font-style: italic;
    font-size: 11px;
}
/* "AI-generated" notice: small, italic, light blue (size and colour forced). */
.ai-generated {
    color: #73b8d4 !important;
    font-style: italic;
    font-size: 11px !important;
}
/* Content area of an image card gets a pale blue background. */
.card-image > .card-content {
    background-color: #f1f7fa !important;
}