Updated v1.3 with images
Browse files- app.py +161 -91
- climateqa/engine/prompts.py +44 -3
- climateqa/engine/rag.py +43 -7
- climateqa/engine/retriever.py +5 -4
- climateqa/engine/vectorstore.py +19 -10
- climateqa/sample_questions.py +6 -0
- climateqa/utils.py +1 -1
- style.css +25 -1
app.py
CHANGED
@@ -9,6 +9,10 @@ import os
|
|
9 |
import time
|
10 |
import re
|
11 |
import json
|
|
|
|
|
|
|
|
|
12 |
from datetime import datetime
|
13 |
from azure.storage.fileshare import ShareServiceClient
|
14 |
|
@@ -64,8 +68,6 @@ file_share_name = "climateqa"
|
|
64 |
service = ShareServiceClient(account_url=account_url, credential=credential)
|
65 |
share_client = service.get_share_client(file_share_name)
|
66 |
|
67 |
-
print("YO",account_url,credential)
|
68 |
-
|
69 |
user_id = create_user_id()
|
70 |
|
71 |
|
@@ -145,18 +147,12 @@ async def chat(query,history,audience,sources,reports):
|
|
145 |
reports = []
|
146 |
|
147 |
|
148 |
-
retriever = ClimateQARetriever(vectorstore=vectorstore,sources = sources,reports = reports,k_summary = 3,k_total =
|
149 |
rag_chain = make_rag_chain(retriever,llm)
|
150 |
|
151 |
-
source_string = ""
|
152 |
-
|
153 |
-
|
154 |
# gradio_format = make_pairs([a.content for a in history]) + [(query, "")]
|
155 |
-
|
156 |
# history = history + [(query,"")]
|
157 |
-
|
158 |
# print(history)
|
159 |
-
|
160 |
# print(gradio_format)
|
161 |
|
162 |
# # reset memory
|
@@ -227,7 +223,7 @@ async def chat(query,history,audience,sources,reports):
|
|
227 |
output_language = op['value']["language"] # str
|
228 |
output_query = op["value"]["question"]
|
229 |
except Exception as e:
|
230 |
-
raise gr.Error(f"ClimateQ&A Error: {e}
|
231 |
|
232 |
elif op['path'] == retriever_path_id: # documents
|
233 |
try:
|
@@ -267,8 +263,7 @@ async def chat(query,history,audience,sources,reports):
|
|
267 |
yield history,docs_html,output_query,output_language,gallery
|
268 |
|
269 |
except Exception as e:
|
270 |
-
|
271 |
-
raise gr.Error(f"ClimateQ&A Error: {e}\nThe error has been noted, try another question and if the error remains, you can contact us :)")
|
272 |
|
273 |
|
274 |
try:
|
@@ -282,6 +277,7 @@ async def chat(query,history,audience,sources,reports):
|
|
282 |
"prompt": prompt,
|
283 |
"query": prompt,
|
284 |
"question":output_query,
|
|
|
285 |
"docs":serialize_docs(docs),
|
286 |
"answer": history[-1][1],
|
287 |
"time": timestamp,
|
@@ -289,8 +285,43 @@ async def chat(query,history,audience,sources,reports):
|
|
289 |
log_on_azure(file, logs, share_client)
|
290 |
except Exception as e:
|
291 |
print(f"Error logging on Azure Blob Storage: {e}")
|
292 |
-
raise gr.Error(f"ClimateQ&A Error: {str(e)[:100]}
|
293 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
294 |
|
295 |
# gallery = [x.metadata["image_path"] for x in docs if (len(x.metadata["image_path"]) > 0 and "IAS" in x.metadata["image_path"])]
|
296 |
# if len(gallery) > 0:
|
@@ -334,21 +365,66 @@ def make_html_source(source,i):
|
|
334 |
meta = source.metadata
|
335 |
# content = source.page_content.split(":",1)[1].strip()
|
336 |
content = source.page_content.strip()
|
337 |
-
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
348 |
</div>
|
349 |
-
|
350 |
-
|
|
|
351 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
352 |
|
353 |
|
354 |
|
@@ -501,71 +577,6 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main
|
|
501 |
output_query = gr.Textbox(label="Query used for retrieval",show_label = True,elem_id = "reformulated-query",lines = 2,interactive = False)
|
502 |
output_language = gr.Textbox(label="Language",show_label = True,elem_id = "language",lines = 1,interactive = False)
|
503 |
|
504 |
-
with gr.Tab("Figures",elem_id = "tab-images",id = 3):
|
505 |
-
gallery = gr.Gallery()
|
506 |
-
|
507 |
-
|
508 |
-
def start_chat(query,history):
|
509 |
-
history = history + [(query,"")]
|
510 |
-
history = [tuple(x) for x in history]
|
511 |
-
print(history)
|
512 |
-
return (gr.update(interactive = False),gr.update(selected=1),history)
|
513 |
-
|
514 |
-
def finish_chat():
|
515 |
-
return (gr.update(interactive = True,value = ""))
|
516 |
-
|
517 |
-
(textbox
|
518 |
-
.submit(start_chat, [textbox,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_textbox")
|
519 |
-
.then(chat, [textbox,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery],concurrency_limit = 8,api_name = "chat_textbox")
|
520 |
-
.then(finish_chat, None, [textbox],api_name = "finish_chat_textbox")
|
521 |
-
)
|
522 |
-
|
523 |
-
(examples_hidden
|
524 |
-
.change(start_chat, [examples_hidden,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_examples")
|
525 |
-
.then(chat, [examples_hidden,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery],concurrency_limit = 8,api_name = "chat_examples")
|
526 |
-
.then(finish_chat, None, [textbox],api_name = "finish_chat_examples")
|
527 |
-
)
|
528 |
-
|
529 |
-
|
530 |
-
def change_sample_questions(key):
|
531 |
-
index = list(QUESTIONS.keys()).index(key)
|
532 |
-
visible_bools = [False] * len(samples)
|
533 |
-
visible_bools[index] = True
|
534 |
-
return [gr.update(visible=visible_bools[i]) for i in range(len(samples))]
|
535 |
-
|
536 |
-
|
537 |
-
|
538 |
-
dropdown_samples.change(change_sample_questions,dropdown_samples,samples)
|
539 |
-
|
540 |
-
# # textbox.submit(predict_climateqa,[textbox,bot],[None,bot,sources_textbox])
|
541 |
-
# (textbox
|
542 |
-
# .submit(answer_user, [textbox,examples_hidden, bot], [textbox, bot],queue = False)
|
543 |
-
# .success(change_tab,None,tabs)
|
544 |
-
# .success(fetch_sources,[textbox,dropdown_sources], [textbox,sources_textbox,docs_textbox,output_query,output_language])
|
545 |
-
# .success(answer_bot, [textbox,bot,docs_textbox,output_query,output_language,dropdown_audience], [textbox,bot],queue = True)
|
546 |
-
# .success(lambda x : textbox,[textbox],[textbox])
|
547 |
-
# )
|
548 |
-
|
549 |
-
# (examples_hidden
|
550 |
-
# .change(answer_user_example, [textbox,examples_hidden, bot], [textbox, bot],queue = False)
|
551 |
-
# .success(change_tab,None,tabs)
|
552 |
-
# .success(fetch_sources,[textbox,dropdown_sources], [textbox,sources_textbox,docs_textbox,output_query,output_language])
|
553 |
-
# .success(answer_bot, [textbox,bot,docs_textbox,output_query,output_language,dropdown_audience], [textbox,bot],queue=True)
|
554 |
-
# .success(lambda x : textbox,[textbox],[textbox])
|
555 |
-
# )
|
556 |
-
# submit_button.click(answer_user, [textbox, bot], [textbox, bot], queue=True).then(
|
557 |
-
# answer_bot, [textbox,bot,dropdown_audience,dropdown_sources], [textbox,bot,sources_textbox]
|
558 |
-
# )
|
559 |
-
|
560 |
-
|
561 |
-
|
562 |
-
|
563 |
-
|
564 |
-
|
565 |
-
|
566 |
-
|
567 |
-
|
568 |
-
|
569 |
|
570 |
|
571 |
|
@@ -575,6 +586,9 @@ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main
|
|
575 |
#---------------------------------------------------------------------------------------
|
576 |
|
577 |
|
|
|
|
|
|
|
578 |
with gr.Tab("About ClimateQ&A",elem_classes = "max-height other-tabs"):
|
579 |
with gr.Row():
|
580 |
with gr.Column(scale=1):
|
@@ -758,6 +772,62 @@ Or around 2 to 4 times more than a typical Google search.
|
|
758 |
"""
|
759 |
)
|
760 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
761 |
demo.queue()
|
762 |
|
763 |
demo.launch()
|
|
|
9 |
import time
|
10 |
import re
|
11 |
import json
|
12 |
+
|
13 |
+
from io import BytesIO
|
14 |
+
import base64
|
15 |
+
|
16 |
from datetime import datetime
|
17 |
from azure.storage.fileshare import ShareServiceClient
|
18 |
|
|
|
68 |
service = ShareServiceClient(account_url=account_url, credential=credential)
|
69 |
share_client = service.get_share_client(file_share_name)
|
70 |
|
|
|
|
|
71 |
user_id = create_user_id()
|
72 |
|
73 |
|
|
|
147 |
reports = []
|
148 |
|
149 |
|
150 |
+
retriever = ClimateQARetriever(vectorstore=vectorstore,sources = sources,min_size = 200,reports = reports,k_summary = 3,k_total = 15,threshold=0.5)
|
151 |
rag_chain = make_rag_chain(retriever,llm)
|
152 |
|
|
|
|
|
|
|
153 |
# gradio_format = make_pairs([a.content for a in history]) + [(query, "")]
|
|
|
154 |
# history = history + [(query,"")]
|
|
|
155 |
# print(history)
|
|
|
156 |
# print(gradio_format)
|
157 |
|
158 |
# # reset memory
|
|
|
223 |
output_language = op['value']["language"] # str
|
224 |
output_query = op["value"]["question"]
|
225 |
except Exception as e:
|
226 |
+
raise gr.Error(f"ClimateQ&A Error: {e} - The error has been noted, try another question and if the error remains, you can contact us :)")
|
227 |
|
228 |
elif op['path'] == retriever_path_id: # documents
|
229 |
try:
|
|
|
263 |
yield history,docs_html,output_query,output_language,gallery
|
264 |
|
265 |
except Exception as e:
|
266 |
+
raise gr.Error(f"ClimateQ&A Error: {e}</br>The error has been noted, try another question and if the error remains, you can contact us :)")
|
|
|
267 |
|
268 |
|
269 |
try:
|
|
|
277 |
"prompt": prompt,
|
278 |
"query": prompt,
|
279 |
"question":output_query,
|
280 |
+
"sources":sources,
|
281 |
"docs":serialize_docs(docs),
|
282 |
"answer": history[-1][1],
|
283 |
"time": timestamp,
|
|
|
285 |
log_on_azure(file, logs, share_client)
|
286 |
except Exception as e:
|
287 |
print(f"Error logging on Azure Blob Storage: {e}")
|
288 |
+
raise gr.Error(f"ClimateQ&A Error: {str(e)[:100]}</br>The error has been noted, try another question and if the error remains, you can contact us :)")
|
289 |
|
290 |
+
image_dict = {}
|
291 |
+
for i,doc in enumerate(docs):
|
292 |
+
|
293 |
+
if doc.metadata["chunk_type"] == "image":
|
294 |
+
try:
|
295 |
+
key = f"Image {i}"
|
296 |
+
image_path = doc.metadata["image_path"].split("documents/")[1]
|
297 |
+
img = get_image_from_azure_blob_storage(image_path)
|
298 |
+
|
299 |
+
# Convert the image to a byte buffer
|
300 |
+
buffered = BytesIO()
|
301 |
+
img.save(buffered, format="PNG")
|
302 |
+
img_str = base64.b64encode(buffered.getvalue()).decode()
|
303 |
+
|
304 |
+
# Embedding the base64 string in Markdown
|
305 |
+
markdown_image = f"![Alt text](data:image/png;base64,{img_str})"
|
306 |
+
image_dict[key] = {"img":img,"md":markdown_image,"caption":doc.page_content,"key":key,"figure_code":doc.metadata["figure_code"]}
|
307 |
+
except Exception as e:
|
308 |
+
print(f"Skipped adding image {i} because of {e}")
|
309 |
+
|
310 |
+
if len(image_dict) > 0:
|
311 |
+
|
312 |
+
gallery = [x["img"] for x in list(image_dict.values())]
|
313 |
+
img = list(image_dict.values())[0]
|
314 |
+
img_md = img["md"]
|
315 |
+
img_caption = img["caption"]
|
316 |
+
img_code = img["figure_code"]
|
317 |
+
if img_code != "N/A":
|
318 |
+
img_name = f"{img['key']} - {img['figure_code']}"
|
319 |
+
else:
|
320 |
+
img_name = f"{img['key']}"
|
321 |
+
|
322 |
+
answer_yet = history[-1][1] + f"\n\n{img_md}\n<p class='chatbot-caption'><b>{img_name}</b> - {img_caption}</p>"
|
323 |
+
history[-1] = (history[-1][0],answer_yet)
|
324 |
+
history = [tuple(x) for x in history]
|
325 |
|
326 |
# gallery = [x.metadata["image_path"] for x in docs if (len(x.metadata["image_path"]) > 0 and "IAS" in x.metadata["image_path"])]
|
327 |
# if len(gallery) > 0:
|
|
|
365 |
meta = source.metadata
|
366 |
# content = source.page_content.split(":",1)[1].strip()
|
367 |
content = source.page_content.strip()
|
368 |
+
|
369 |
+
toc_levels = []
|
370 |
+
for j in range(2):
|
371 |
+
level = meta[f"toc_level{j}"]
|
372 |
+
if level != "N/A":
|
373 |
+
toc_levels.append(level)
|
374 |
+
else:
|
375 |
+
break
|
376 |
+
toc_levels = " > ".join(toc_levels)
|
377 |
+
print(toc_levels)
|
378 |
+
|
379 |
+
if len(toc_levels) > 0:
|
380 |
+
name = f"<b>{toc_levels}</b><br/>{meta['name']}"
|
381 |
+
else:
|
382 |
+
name = meta['name']
|
383 |
+
|
384 |
+
print(name)
|
385 |
+
|
386 |
+
|
387 |
+
if meta["chunk_type"] == "text":
|
388 |
+
|
389 |
+
card = f"""
|
390 |
+
<div class="card">
|
391 |
+
<div class="card-content">
|
392 |
+
<h2>Doc {i} - {meta['short_name']} - Page {int(meta['page_number'])}</h2>
|
393 |
+
<p>{content}</p>
|
394 |
+
</div>
|
395 |
+
<div class="card-footer">
|
396 |
+
<span>{name}</span>
|
397 |
+
<a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
|
398 |
+
<span role="img" aria-label="Open PDF">🔗</span>
|
399 |
+
</a>
|
400 |
+
</div>
|
401 |
</div>
|
402 |
+
"""
|
403 |
+
|
404 |
+
else:
|
405 |
|
406 |
+
if meta["figure_code"] != "N/A":
|
407 |
+
title = f"{meta['figure_code']} - {meta['short_name']}"
|
408 |
+
else:
|
409 |
+
title = f"{meta['short_name']}"
|
410 |
+
|
411 |
+
card = f"""
|
412 |
+
<div class="card card-image">
|
413 |
+
<div class="card-content">
|
414 |
+
<h2>Image {i} - {title} - Page {int(meta['page_number'])}</h2>
|
415 |
+
<p>{content}</p>
|
416 |
+
<p class='ai-generated'>AI-generated description</p>
|
417 |
+
</div>
|
418 |
+
<div class="card-footer">
|
419 |
+
<span>{name}</span>
|
420 |
+
<a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
|
421 |
+
<span role="img" aria-label="Open PDF">🔗</span>
|
422 |
+
</a>
|
423 |
+
</div>
|
424 |
+
</div>
|
425 |
+
"""
|
426 |
+
|
427 |
+
return card
|
428 |
|
429 |
|
430 |
|
|
|
577 |
output_query = gr.Textbox(label="Query used for retrieval",show_label = True,elem_id = "reformulated-query",lines = 2,interactive = False)
|
578 |
output_language = gr.Textbox(label="Language",show_label = True,elem_id = "language",lines = 1,interactive = False)
|
579 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
580 |
|
581 |
|
582 |
|
|
|
586 |
#---------------------------------------------------------------------------------------
|
587 |
|
588 |
|
589 |
+
with gr.Tab("Figures",elem_id = "tab-images",elem_classes = "max-height other-tabs"):
|
590 |
+
gallery_component = gr.Gallery()
|
591 |
+
|
592 |
with gr.Tab("About ClimateQ&A",elem_classes = "max-height other-tabs"):
|
593 |
with gr.Row():
|
594 |
with gr.Column(scale=1):
|
|
|
772 |
"""
|
773 |
)
|
774 |
|
775 |
+
|
776 |
+
|
777 |
+
|
778 |
+
def start_chat(query,history):
|
779 |
+
history = history + [(query,"")]
|
780 |
+
history = [tuple(x) for x in history]
|
781 |
+
print(history)
|
782 |
+
return (gr.update(interactive = False),gr.update(selected=1),history)
|
783 |
+
|
784 |
+
def finish_chat():
|
785 |
+
return (gr.update(interactive = True,value = ""))
|
786 |
+
|
787 |
+
(textbox
|
788 |
+
.submit(start_chat, [textbox,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_textbox")
|
789 |
+
.then(chat, [textbox,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery_component],concurrency_limit = 8,api_name = "chat_textbox")
|
790 |
+
.then(finish_chat, None, [textbox],api_name = "finish_chat_textbox")
|
791 |
+
)
|
792 |
+
|
793 |
+
(examples_hidden
|
794 |
+
.change(start_chat, [examples_hidden,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_examples")
|
795 |
+
.then(chat, [examples_hidden,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery_component],concurrency_limit = 8,api_name = "chat_examples")
|
796 |
+
.then(finish_chat, None, [textbox],api_name = "finish_chat_examples")
|
797 |
+
)
|
798 |
+
|
799 |
+
|
800 |
+
def change_sample_questions(key):
|
801 |
+
index = list(QUESTIONS.keys()).index(key)
|
802 |
+
visible_bools = [False] * len(samples)
|
803 |
+
visible_bools[index] = True
|
804 |
+
return [gr.update(visible=visible_bools[i]) for i in range(len(samples))]
|
805 |
+
|
806 |
+
|
807 |
+
|
808 |
+
dropdown_samples.change(change_sample_questions,dropdown_samples,samples)
|
809 |
+
|
810 |
+
# # textbox.submit(predict_climateqa,[textbox,bot],[None,bot,sources_textbox])
|
811 |
+
# (textbox
|
812 |
+
# .submit(answer_user, [textbox,examples_hidden, bot], [textbox, bot],queue = False)
|
813 |
+
# .success(change_tab,None,tabs)
|
814 |
+
# .success(fetch_sources,[textbox,dropdown_sources], [textbox,sources_textbox,docs_textbox,output_query,output_language])
|
815 |
+
# .success(answer_bot, [textbox,bot,docs_textbox,output_query,output_language,dropdown_audience], [textbox,bot],queue = True)
|
816 |
+
# .success(lambda x : textbox,[textbox],[textbox])
|
817 |
+
# )
|
818 |
+
|
819 |
+
# (examples_hidden
|
820 |
+
# .change(answer_user_example, [textbox,examples_hidden, bot], [textbox, bot],queue = False)
|
821 |
+
# .success(change_tab,None,tabs)
|
822 |
+
# .success(fetch_sources,[textbox,dropdown_sources], [textbox,sources_textbox,docs_textbox,output_query,output_language])
|
823 |
+
# .success(answer_bot, [textbox,bot,docs_textbox,output_query,output_language,dropdown_audience], [textbox,bot],queue=True)
|
824 |
+
# .success(lambda x : textbox,[textbox],[textbox])
|
825 |
+
# )
|
826 |
+
# submit_button.click(answer_user, [textbox, bot], [textbox, bot], queue=True).then(
|
827 |
+
# answer_bot, [textbox,bot,dropdown_audience,dropdown_sources], [textbox,bot,sources_textbox]
|
828 |
+
# )
|
829 |
+
|
830 |
+
|
831 |
demo.queue()
|
832 |
|
833 |
demo.launch()
|
climateqa/engine/prompts.py
CHANGED
@@ -49,6 +49,7 @@ Guidelines:
|
|
49 |
- You do not need to use every passage. Only use the ones that help answer the question.
|
50 |
- If the documents do not have the information needed to answer the question, just say you do not have enough information.
|
51 |
- Consider by default that the question is about the past century unless it is specified otherwise.
|
|
|
52 |
|
53 |
-----------------------
|
54 |
Passages:
|
@@ -59,18 +60,58 @@ Question: {question} - Explained to {audience}
|
|
59 |
Answer in {language} with the passages citations:
|
60 |
"""
|
61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
|
63 |
answer_prompt_without_docs_template = """
|
64 |
You are ClimateQ&A, an AI Assistant created by Ekimetrics. Your role is to explain climate-related questions using info from the IPCC and/or IPBES reports.
|
65 |
Always stay true to climate science and do not make up information. If you do not know the answer, just say you do not know.
|
66 |
|
67 |
Guidelines:
|
68 |
-
-
|
|
|
69 |
- Also explain that the user should reformulate the question to be more specific to the IPCC/IPBES reports
|
70 |
- Because your role is to explain climate science, you can never be skeptical about climate change.
|
71 |
-
- If the question is not related to environmental issues, never never answer it. Say it's not your role.
|
|
|
72 |
|
73 |
-
Question: {question}
|
74 |
Answer in {language}:
|
75 |
"""
|
76 |
|
|
|
49 |
- You do not need to use every passage. Only use the ones that help answer the question.
|
50 |
- If the documents do not have the information needed to answer the question, just say you do not have enough information.
|
51 |
- Consider by default that the question is about the past century unless it is specified otherwise.
|
52 |
+
- If the passage is the caption of a picture, you can still use it as part of your answer as any other document.
|
53 |
|
54 |
-----------------------
|
55 |
Passages:
|
|
|
60 |
Answer in {language} with the passages citations:
|
61 |
"""
|
62 |
|
63 |
+
answer_prompt_images_template = """
|
64 |
+
You are ClimateQ&A, an AI Assistant created by Ekimetrics.
|
65 |
+
You are given the answer to a environmental question based on passages from the IPCC and IPBES reports and image captions.
|
66 |
+
|
67 |
+
Generate a follow-up and illustrated explanation to the existing answer using the content of the image caption.
|
68 |
+
The actual images will be inserted in the user interface afterward.
|
69 |
+
|
70 |
+
|
71 |
+
Guidelines:
|
72 |
+
- Don't summarize the previous answer or make an introduction, you only need to illustrate with the images.
|
73 |
+
- Mention the image using similar sentence : "Indeed, as we see in this picture ...", "In the following image, it is shown that ...", but without mentioning the Image number
|
74 |
+
- Insert a placeholder like this [Image i] and by skipping to a new line before and after, where the image will be displayed within your explanation
|
75 |
+
|
76 |
+
For example :
|
77 |
+
```
|
78 |
+
Sea rise is projected to endanger isolated islands by 2050.
|
79 |
+
In the figure below, we see an projection of sea level rise:
|
80 |
+
[Image 1]
|
81 |
+
This image is depicting the urgency depicted in the passages.
|
82 |
+
```
|
83 |
+
|
84 |
+
-----------------------
|
85 |
+
Image captions:
|
86 |
+
{images}
|
87 |
+
|
88 |
+
-----------------------
|
89 |
+
Question:
|
90 |
+
{question}
|
91 |
+
|
92 |
+
-----------------------
|
93 |
+
Answer:
|
94 |
+
{answer}
|
95 |
+
|
96 |
+
-----------------------
|
97 |
+
|
98 |
+
Follow-up explanation in {language} explained to {audience}:
|
99 |
+
"""
|
100 |
+
|
101 |
|
102 |
answer_prompt_without_docs_template = """
|
103 |
You are ClimateQ&A, an AI Assistant created by Ekimetrics. Your role is to explain climate-related questions using info from the IPCC and/or IPBES reports.
|
104 |
Always stay true to climate science and do not make up information. If you do not know the answer, just say you do not know.
|
105 |
|
106 |
Guidelines:
|
107 |
+
- If it's a conversational question such as "hello", "who are you", ..., you can answer directly
|
108 |
+
- Start by explaining clearly that you could not find any passages to answer in the IPCC/IPBES reports, but it can be because of the search engine, and not because it's not there. So your answer is based on your own knowledge and must be taken with great caution because it's AI generated and you prefer to use sources to answer.
|
109 |
- Also explain that the user should reformulate the question to be more specific to the IPCC/IPBES reports
|
110 |
- Because your role is to explain climate science, you can never be skeptical about climate change.
|
111 |
+
- If the question is not related to environmental issues, never never answer it. Say it's not your role.
|
112 |
+
- Make paragraphs by starting new lines to make your answers more readable.
|
113 |
|
114 |
+
Question: {question}
|
115 |
Answer in {language}:
|
116 |
"""
|
117 |
|
climateqa/engine/rag.py
CHANGED
@@ -7,7 +7,7 @@ from langchain_core.prompts.prompt import PromptTemplate
|
|
7 |
from langchain_core.prompts.base import format_document
|
8 |
|
9 |
from climateqa.engine.reformulation import make_reformulation_chain
|
10 |
-
from climateqa.engine.prompts import answer_prompt_template,answer_prompt_without_docs_template
|
11 |
from climateqa.engine.utils import pass_values, flatten_dict
|
12 |
|
13 |
|
@@ -16,10 +16,26 @@ DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}"
|
|
16 |
def _combine_documents(
|
17 |
docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, sep="\n\n"
|
18 |
):
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
return sep.join(doc_strings)
|
21 |
|
22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
def make_rag_chain(retriever,llm):
|
24 |
|
25 |
|
@@ -51,22 +67,29 @@ def make_rag_chain(retriever,llm):
|
|
51 |
**pass_values(["question","audience","language"])
|
52 |
}
|
53 |
|
54 |
-
#
|
55 |
-
|
56 |
|
57 |
|
58 |
answer_with_docs = {
|
59 |
"answer": input_documents | prompt | llm | StrOutputParser(),
|
60 |
-
**pass_values(["question","audience","language","query","docs"])
|
61 |
}
|
62 |
|
63 |
answer_without_docs = {
|
64 |
"answer": prompt_without_docs | llm | StrOutputParser(),
|
65 |
-
**pass_values(["question","audience","language","query","docs"])
|
66 |
}
|
67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
answer = RunnableBranch(
|
69 |
-
(lambda x:
|
70 |
answer_without_docs,
|
71 |
)
|
72 |
|
@@ -77,3 +100,16 @@ def make_rag_chain(retriever,llm):
|
|
77 |
|
78 |
return rag_chain
|
79 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
from langchain_core.prompts.base import format_document
|
8 |
|
9 |
from climateqa.engine.reformulation import make_reformulation_chain
|
10 |
+
from climateqa.engine.prompts import answer_prompt_template,answer_prompt_without_docs_template,answer_prompt_images_template
|
11 |
from climateqa.engine.utils import pass_values, flatten_dict
|
12 |
|
13 |
|
|
|
16 |
def _combine_documents(
    docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, sep="\n\n"
):
    """Render retrieved documents as one numbered block of text for a prompt.

    Each document is formatted with ``document_prompt``, prefixed with
    ``"Doc <n>: "`` (numbering starts at 1), flattened onto a single line by
    replacing newlines with spaces, and the entries are joined with ``sep``.

    Args:
        docs: Iterable of documents accepted by ``format_document``.
        document_prompt: Prompt template used to render each document.
        sep: Separator inserted between the rendered documents.

    Returns:
        The combined string; empty when ``docs`` is empty.
    """
    # Every chunk is labelled "Doc", image captions included, so the passage
    # numbering seen by the model is uniform across chunk types.
    return sep.join(
        f"Doc {position}: {format_document(doc, document_prompt)}".replace("\n", " ")
        for position, doc in enumerate(docs, start=1)
    )
|
30 |
|
31 |
|
32 |
+
def get_text_docs(x):
    """Return the documents of ``x`` that are text chunks, in their original order.

    Args:
        x: Iterable of documents exposing a ``metadata`` mapping.

    Returns:
        A list of the documents whose ``metadata["chunk_type"]`` is ``"text"``.
    """
    # A document without a "chunk_type" entry is treated as text instead of
    # raising KeyError.
    # NOTE(review): assumes untagged documents are plain text passages -
    # confirm against the contents of the index.
    return [doc for doc in x if doc.metadata.get("chunk_type", "text") == "text"]
|
34 |
+
|
35 |
+
def get_image_docs(x):
    """Return the documents of ``x`` that are image chunks, in their original order.

    Args:
        x: Iterable of documents exposing a ``metadata`` mapping.

    Returns:
        A list of the documents whose ``metadata["chunk_type"]`` is ``"image"``.
    """
    # ``.get`` keeps a document that has no "chunk_type" entry from raising
    # KeyError; such a document is simply not an image.
    return [doc for doc in x if doc.metadata.get("chunk_type") == "image"]
|
37 |
+
|
38 |
+
|
39 |
def make_rag_chain(retriever,llm):
|
40 |
|
41 |
|
|
|
67 |
**pass_values(["question","audience","language"])
|
68 |
}
|
69 |
|
70 |
+
# ------- CHAIN 3
|
71 |
+
# Bot answer
|
72 |
|
73 |
|
74 |
answer_with_docs = {
|
75 |
"answer": input_documents | prompt | llm | StrOutputParser(),
|
76 |
+
**pass_values(["question","audience","language","query","docs"]),
|
77 |
}
|
78 |
|
79 |
answer_without_docs = {
|
80 |
"answer": prompt_without_docs | llm | StrOutputParser(),
|
81 |
+
**pass_values(["question","audience","language","query","docs"]),
|
82 |
}
|
83 |
|
84 |
+
# def has_images(x):
|
85 |
+
# image_docs = [doc for doc in x["docs"] if doc.metadata["chunk_type"]=="image"]
|
86 |
+
# return len(image_docs) > 0
|
87 |
+
|
88 |
+
def has_docs(x):
|
89 |
+
return len(x["docs"]) > 0
|
90 |
+
|
91 |
answer = RunnableBranch(
|
92 |
+
(lambda x: has_docs(x), answer_with_docs),
|
93 |
answer_without_docs,
|
94 |
)
|
95 |
|
|
|
100 |
|
101 |
return rag_chain
|
102 |
|
103 |
+
|
104 |
+
|
105 |
+
def make_illustration_chain(llm):
    """Build the chain that writes an illustrated follow-up to an existing answer.

    The chain input is a mapping. Its ``"docs"`` entry is reduced to the image
    chunks and rendered into the ``images`` prompt variable; ``"question"``,
    ``"audience"``, ``"language"`` and ``"answer"`` are supplied through
    ``pass_values`` (presumably forwarded unchanged - verify against
    ``climateqa.engine.utils``).

    Args:
        llm: Language model runnable placed after the prompt.

    Returns:
        A runnable producing the follow-up explanation as a string.
    """

    def describe_images(inputs):
        # Only image chunks feed the "Image captions" section of the prompt.
        return _combine_documents(get_image_docs(inputs["docs"]))

    chain_inputs = {"images": describe_images}
    chain_inputs.update(pass_values(["question","audience","language","answer"]))

    illustration_prompt = ChatPromptTemplate.from_template(answer_prompt_images_template)

    return chain_inputs | illustration_prompt | llm | StrOutputParser()
|
climateqa/engine/retriever.py
CHANGED
@@ -18,7 +18,8 @@ class ClimateQARetriever(BaseRetriever):
|
|
18 |
threshold:float = 0.6
|
19 |
k_summary:int = 3
|
20 |
k_total:int = 10
|
21 |
-
namespace:str = "vectors"
|
|
|
22 |
|
23 |
|
24 |
def _get_relevant_documents(
|
@@ -31,8 +32,8 @@ class ClimateQARetriever(BaseRetriever):
|
|
31 |
assert self.k_total > self.k_summary, "k_total should be greater than k_summary"
|
32 |
|
33 |
# Prepare base search kwargs
|
34 |
-
|
35 |
filters = {}
|
|
|
36 |
if len(self.reports) > 0:
|
37 |
filters["short_name"] = {"$in":self.reports}
|
38 |
else:
|
@@ -59,14 +60,14 @@ class ClimateQARetriever(BaseRetriever):
|
|
59 |
docs = docs_summaries + docs_full
|
60 |
|
61 |
# Filter if scores are below threshold
|
62 |
-
docs = [x for x in docs if x[1] > self.threshold]
|
63 |
|
64 |
# Add score to metadata
|
65 |
results = []
|
66 |
for i,(doc,score) in enumerate(docs):
|
67 |
doc.metadata["similarity_score"] = score
|
68 |
doc.metadata["content"] = doc.page_content
|
69 |
-
doc.metadata["page_number"] = int(doc.metadata["page_number"])
|
70 |
# doc.page_content = f"""Doc {i+1} - {doc.metadata['short_name']}: {doc.page_content}"""
|
71 |
results.append(doc)
|
72 |
|
|
|
18 |
threshold:float = 0.6
|
19 |
k_summary:int = 3
|
20 |
k_total:int = 10
|
21 |
+
# Default namespace name. No trailing comma here: `x: str = "vectors",` would
# bind the 1-tuple ("vectors",) instead of the string.
namespace:str = "vectors"
# Minimum passage length in characters.
# NOTE(review): the only use visible in this change is a commented-out filter
# on len(page_content) - confirm the value is still applied somewhere.
min_size:int = 200
|
23 |
|
24 |
|
25 |
def _get_relevant_documents(
|
|
|
32 |
assert self.k_total > self.k_summary, "k_total should be greater than k_summary"
|
33 |
|
34 |
# Prepare base search kwargs
|
|
|
35 |
filters = {}
|
36 |
+
|
37 |
if len(self.reports) > 0:
|
38 |
filters["short_name"] = {"$in":self.reports}
|
39 |
else:
|
|
|
60 |
docs = docs_summaries + docs_full
|
61 |
|
62 |
# Filter if scores are below threshold
|
63 |
+
# docs = [x for x in docs if x[1] > self.threshold and len(x[0].page_content) > self.min_size]
|
64 |
|
65 |
# Add score to metadata
|
66 |
results = []
|
67 |
for i,(doc,score) in enumerate(docs):
|
68 |
doc.metadata["similarity_score"] = score
|
69 |
doc.metadata["content"] = doc.page_content
|
70 |
+
doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
|
71 |
# doc.page_content = f"""Doc {i+1} - {doc.metadata['short_name']}: {doc.page_content}"""
|
72 |
results.append(doc)
|
73 |
|
climateqa/engine/vectorstore.py
CHANGED
@@ -2,8 +2,8 @@
|
|
2 |
# More info at https://docs.pinecone.io/docs/langchain
|
3 |
# And https://python.langchain.com/docs/integrations/vectorstores/pinecone
|
4 |
import os
|
5 |
-
import
|
6 |
-
from langchain_community.vectorstores import Pinecone
|
7 |
|
8 |
# LOAD ENVIRONMENT VARIABLES
|
9 |
try:
|
@@ -13,20 +13,29 @@ except:
|
|
13 |
pass
|
14 |
|
15 |
|
16 |
-
def get_pinecone_vectorstore(embeddings,text_key = "
|
17 |
|
18 |
-
# initialize pinecone
|
19 |
-
pinecone.init(
|
20 |
-
|
21 |
-
|
22 |
-
)
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
-
|
25 |
-
|
26 |
|
|
|
|
|
|
|
27 |
return vectorstore
|
28 |
|
29 |
|
|
|
30 |
# def get_pinecone_retriever(vectorstore,k = 10,namespace = "vectors",sources = ["IPBES","IPCC"]):
|
31 |
|
32 |
# assert isinstance(sources,list)
|
|
|
2 |
# More info at https://docs.pinecone.io/docs/langchain
|
3 |
# And https://python.langchain.com/docs/integrations/vectorstores/pinecone
|
4 |
import os
|
5 |
+
from pinecone import Pinecone
|
6 |
+
from langchain_community.vectorstores import Pinecone as PineconeVectorstore
|
7 |
|
8 |
# LOAD ENVIRONMENT VARIABLES
|
9 |
try:
|
|
|
13 |
pass
|
14 |
|
15 |
|
16 |
+
def get_pinecone_vectorstore(embeddings,text_key = "content"):
    """Connect to the existing Pinecone index and wrap it as a LangChain vector store.

    The API key is read from the ``PINECONE_API_KEY`` environment variable and
    the index name from ``PINECONE_API_INDEX``.

    Args:
        embeddings: Embedding model handed to the LangChain Pinecone wrapper.
        text_key: Key passed to the wrapper as ``text_key`` (the metadata
            entry it reads the passage text from).

    Returns:
        The ``PineconeVectorstore`` bound to the configured index.
    """
    # Client-object API (``Pinecone(...)`` then ``.Index(...)``); it supersedes
    # the earlier ``pinecone.init(...)`` + ``from_existing_index`` setup.
    client = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
    index = client.Index(os.getenv("PINECONE_API_INDEX"))

    return PineconeVectorstore(index, embeddings, text_key=text_key)
|
36 |
|
37 |
|
38 |
+
|
39 |
# def get_pinecone_retriever(vectorstore,k = 10,namespace = "vectors",sources = ["IPBES","IPCC"]):
|
40 |
|
41 |
# assert isinstance(sources,list)
|
climateqa/sample_questions.py
CHANGED
@@ -73,6 +73,12 @@ QUESTIONS = {
|
|
73 |
"What are the impacts of invasive alien species on Indigenous Peoples and local communities?",
|
74 |
"What technologies and tools are available for managing invasive alien species?",
|
75 |
"How do economic and land-use changes facilitate the introduction and spread of invasive alien species?"
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
]
|
77 |
|
78 |
}
|
|
|
73 |
"What are the impacts of invasive alien species on Indigenous Peoples and local communities?",
|
74 |
"What technologies and tools are available for managing invasive alien species?",
|
75 |
"How do economic and land-use changes facilitate the introduction and spread of invasive alien species?"
|
76 |
+
],
|
77 |
+
"Experimental images":[
|
78 |
+
"Is warming unprecedented in the past 200 years ?",
|
79 |
+
"Are human activities causing global warming?",
|
80 |
+
"What is the distribution of uncertainty in projected precipitation changes across different time frames ?",
|
81 |
+
"What are the anticipated changes in the global water cycle by the end of the 21st century under an intermediate emissions scenario ?",
|
82 |
]
|
83 |
|
84 |
}
|
climateqa/utils.py
CHANGED
@@ -15,7 +15,7 @@ def get_file_from_azure_blob_storage(path):
|
|
15 |
|
16 |
|
17 |
def get_image_from_azure_blob_storage(path):
|
18 |
-
base_path = "
|
19 |
path = os.path.join(base_path, path)
|
20 |
file_object = get_file_from_azure_blob_storage(path)
|
21 |
image = Image.open(file_object)
|
|
|
15 |
|
16 |
|
17 |
def get_image_from_azure_blob_storage(path):
|
18 |
+
base_path = "climateqa/documents/"
|
19 |
path = os.path.join(base_path, path)
|
20 |
file_object = get_file_from_azure_blob_storage(path)
|
21 |
image = Image.open(file_object)
|
style.css
CHANGED
@@ -295,4 +295,28 @@ body.dark .card-footer span {
|
|
295 |
white-space: normal !important; /* Allow the text to wrap */
|
296 |
word-break: break-word !important; /* Break words to prevent overflow */
|
297 |
overflow-wrap: break-word !important; /* Break long words if necessary */
|
298 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
295 |
white-space: normal !important; /* Allow the text to wrap */
|
296 |
word-break: break-word !important; /* Break words to prevent overflow */
|
297 |
overflow-wrap: break-word !important; /* Break long words if necessary */
|
298 |
+
}
|
299 |
+
|
300 |
+
span.chatbot > p > img{
|
301 |
+
margin-top:40px !important;
|
302 |
+
max-height: none !important;
|
303 |
+
max-width: 80% !important;
|
304 |
+
border-radius:0px !important;
|
305 |
+
}
|
306 |
+
|
307 |
+
|
308 |
+
.chatbot-caption{
|
309 |
+
font-size:11px;
|
310 |
+
font-style:italic;
|
311 |
+
color:#508094;
|
312 |
+
}
|
313 |
+
|
314 |
+
.ai-generated{
|
315 |
+
font-size:11px!important;
|
316 |
+
font-style:italic;
|
317 |
+
color:#73b8d4 !important;
|
318 |
+
}
|
319 |
+
|
320 |
+
.card-image > .card-content{
|
321 |
+
background-color:#f1f7fa !important;
|
322 |
+
}
|