"""Streamlit demo: multi-document narrative generation.

The page walks through three button-driven stages -- Retrieve, Organize and
Summarize -- for one of two bundled examples. Streamlit re-executes this
script from the top on every interaction, so stage progress is kept in
``st.session_state`` and intermediate results are passed between stages
through JSON files in the working directory.
"""

import csv
import difflib
import html
import json
import os
import pickle
import random
import re
import string
import sys
import time
from collections import defaultdict
from typing import List

import nltk
import numpy as np
import streamlit as st
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from tenacity import retry, stop_after_attempt, wait_random_exponential
from tqdm import tqdm

from retrieve import get_retrieved_results, get_slide
from utils import AzureModels, write_to_file, read_from_file

# Tokenizer models / corpora required by nltk.sent_tokenize and stopwords.
nltk.download('punkt')
nltk.download('stopwords')

# Section titles used to label organized sections and generated summaries.
TOPICS = ["Introduction", "History", "Document Cloud"]

# Files used to hand results from one stage to the next across reruns.
RETRIEVED_PATH = "retrieved_docs.json"
RETRIEVED_TRAIN_PATH = "retrieved_docs_train.json"
ORGANIZED_PATH = "organized_docs.json"

LABEL_TEMPLATE = "{} : Document {} for the '{}' Section of the Article titled '{}'"

# Per-example configuration. "primary_path" holds the documents shown in the
# UI; "secondary_path" is the other example's file, passed alongside it to
# get_retrieved_results (presumably as in-context examples -- confirm against
# the retrieve module).
EXAMPLES = {
    "Adobe Firefly": {
        "primary_path": "wiki_1.json",
        "secondary_path": "wiki_2.json",
        "section_names": ["Introduction"] * 7 + ["History"] * 2,
        "ref_doc_indices": list(range(1, 8)) + list(range(1, 3)),
    },
    "Adobe Acrobat": {
        "primary_path": "wiki_2.json",
        "secondary_path": "wiki_1.json",
        "section_names": (
            ["Introduction"] * 3 + ["History"] * 3 + ["Document Cloud"] * 2
        ),
        "ref_doc_indices": (
            list(range(1, 4)) + list(range(1, 4)) + list(range(1, 3))
        ),
    },
}


def calculate_similarity(sentence1: str, sentence2: str) -> float:
    """Return the difflib similarity ratio of two sentences, in [0, 1]."""
    return difflib.SequenceMatcher(None, sentence1, sentence2).ratio()


def highlight_sentences(predicted: str, ground_truth: str) -> str:
    """Return *predicted* as HTML with each sentence shaded by similarity.

    Each sentence of ``predicted`` is compared against every sentence of
    ``ground_truth``; the best similarity ratio becomes the opacity of a
    green background on that sentence. The result is meant to be rendered
    with ``st.markdown(..., unsafe_allow_html=True)``.

    Args:
        predicted: Generated text to highlight.
        ground_truth: Reference text the sentences are compared against.

    Returns:
        One ``<span>`` per predicted sentence, each followed by a space.
        Empty string when ``predicted`` contains no sentences.
    """
    ground_truth_sentences = nltk.sent_tokenize(ground_truth)
    pieces = []
    for pred_sentence in nltk.sent_tokenize(predicted):
        # default=0 covers an empty reference: nothing to match, no shading.
        shade = max(
            (
                calculate_similarity(pred_sentence, gt_sentence)
                for gt_sentence in ground_truth_sentences
            ),
            default=0,
        )
        # Escape the sentence so stray '<' or '&' cannot break the markup.
        pieces.append(
            f'<span style="background-color: rgba(0, 255, 0, {shade:.2f})">'
            f'{html.escape(pred_sentence, quote=False)}</span> '
        )
    return "".join(pieces)


def _load_json(path):
    """Load and return the JSON content of *path*."""
    with open(path, 'r', encoding="utf-8") as fr:
        return json.load(fr)


def _show_documents(header, keys, documents, document_name, example):
    """Render one labelled text area per configured section entry."""
    st.header(header)
    section_names = example["section_names"]
    ref_doc_indices = example["ref_doc_indices"]
    # Indexed on purpose: a length mismatch between the configuration and
    # the loaded data should fail loudly rather than be silently truncated.
    for i in range(len(section_names)):
        st.text_area(
            LABEL_TEMPLATE.format(
                keys[i], ref_doc_indices[i], section_names[i], document_name
            ),
            value=documents[i],
        )


def _run_retrieval(primary_docs, secondary_docs):
    """Run retrieval in both directions, persist both, return abstracts.

    Returns the ``'abstract'`` field of every ``ref_abstract`` entry in the
    retrieval result for the primary documents, in iteration order.
    """
    retrieved_out = get_retrieved_results(
        "gpt4o", 0, "fixed", secondary_docs, primary_docs
    )
    write_to_file(RETRIEVED_PATH, retrieved_out)
    retrieved_out_train = get_retrieved_results(
        "gpt4o", 0, "fixed", primary_docs, secondary_docs
    )
    write_to_file(RETRIEVED_TRAIN_PATH, retrieved_out_train)
    return [
        ret_item['ref_abstract'][key]['abstract']
        for ret_item in retrieved_out
        for key in ret_item['ref_abstract']
    ]


def _run_organize():
    """Organize the retrieved documents per section, show and persist them."""
    st.header("Organization of the documents in the narrative")
    test_list = read_from_file(RETRIEVED_PATH)
    train_list = read_from_file(RETRIEVED_TRAIN_PATH)
    organize_out = get_retrieved_results(
        "gpt4o", 1, "fixed", train_list, test_list, True
    )
    # Indexed access: the container type returned by get_retrieved_results
    # is not visible here, only that it supports len() and [i].
    for i in range(len(organize_out)):
        st.text_area("Section: " + TOPICS[i], value=organize_out[i])
    write_to_file(ORGANIZED_PATH, organize_out)


def _run_summarize():
    """Generate per-section summaries, show them highlighted, then slides."""
    st.header("Intent-based multi-document summary")
    test_list = read_from_file(RETRIEVED_PATH)
    train_list = read_from_file(RETRIEVED_TRAIN_PATH)
    organize_out = read_from_file(ORGANIZED_PATH)
    gen_summary_dict = get_retrieved_results(
        "gpt4o", 1, "fixed", train_list, test_list, False, organize_out
    )
    slides_list = []
    # Indexed access for the same reason as in _run_organize.
    for i in range(len(gen_summary_dict)):
        highlighted_summary = highlight_sentences(
            gen_summary_dict[i], test_list[i]['abstract']
        )
        slides_list.append(get_slide(TOPICS[i], gen_summary_dict[i]))
        st.markdown(f"## {TOPICS[i]}")
        st.markdown(highlighted_summary, unsafe_allow_html=True)

    st.header("Generated Narrative")
    for slide in slides_list:
        st.markdown("---")
        st.markdown(slide)
        st.markdown("---")


st.title('Multi-Document Narrative Generation')

options = ["Select", "Adobe Firefly", "Adobe Acrobat"]
selection = st.selectbox('Select an example', options)

# Create every stage flag up front so that reading one can never raise,
# regardless of which buttons have been pressed in this session.
for _flag in ("retrieve_clicked", "organize_clicked", "summarize_clicked"):
    if _flag not in st.session_state:
        st.session_state[_flag] = False

if selection != "Select":
    example = EXAMPLES[selection]
    document_name = selection
    list_1 = _load_json(example["primary_path"])
    list_2 = _load_json(example["secondary_path"])

    # Flatten every reference document of the primary list, with its key.
    inp_doc_list = []
    inp_keys_list = []
    for item in list_1:
        for key in item['ref_abstract']:
            inp_doc_list.append(item['ref_abstract'][key])
            inp_keys_list.append(key)

    _show_documents(
        'Input Documents', inp_keys_list, inp_doc_list, document_name, example
    )

    if st.button('Retrieve'):
        retrieved_doc_list = _run_retrieval(list_1, list_2)
        # Only mark the stage done once retrieval has succeeded.
        st.session_state.retrieve_clicked = True
        _show_documents(
            'Retrieved Documents',
            inp_keys_list,
            retrieved_doc_list,
            document_name,
            example,
        )

    # A button is only True on the rerun triggered by its own click, so the
    # later stages are gated on the persisted flags rather than nested
    # under the earlier buttons.
    if st.session_state.retrieve_clicked:
        if st.button('Organize'):
            st.session_state.organize_clicked = True
            _run_organize()

        if st.session_state.organize_clicked:
            if st.button("Summarize"):
                st.session_state.summarize_clicked = True
                _run_summarize()

# TODO: add a "Narrative" stage gated on st.session_state.summarize_clicked.