import difflib
import json

import nltk
import numpy as np
import streamlit as st

from retrieve import get_retrieved_results, get_slide
from utils import write_to_file, read_from_file

# Ensure the sentence tokenizer models are available.
nltk.download('punkt')
nltk.download('punkt_tab')


# Similarity between two sentences as a character-level sequence ratio in [0, 1].
def calculate_similarity(sentence1: str, sentence2: str) -> float:
    return difflib.SequenceMatcher(None, sentence1, sentence2).ratio()


# Highlight each predicted sentence according to its best match against the
# ground truth: the closer the match, the deeper the green background. The
# returned HTML is meant to be rendered via st.markdown(..., unsafe_allow_html=True).
def highlight_sentences(predicted: str, ground_truth: str) -> str:
    ground_truth_sentences = nltk.sent_tokenize(ground_truth)
    predicted_sentences = nltk.sent_tokenize(predicted)
    highlighted_text = ""
    for pred_sentence in predicted_sentences:
        max_similarity = 0.0
        for gt_sentence in ground_truth_sentences:
            similarity = calculate_similarity(pred_sentence, gt_sentence)
            if similarity > max_similarity:
                max_similarity = similarity
        # Shade of green; max_similarity is already in [0, 1].
        shade = max_similarity
        highlighted_text += (
            f'<span style="background-color: rgba(0, 255, 0, {shade:.2f});">'
            f'{pred_sentence}</span> '
        )
    return highlighted_text


st.title('Multi-Document Narrative Generation')

# options = ["Select", "Adobe Firefly", "Adobe Acrobat"]
# selection = st.selectbox('Select an example', options)
selection = "Adobe Firefly"

presentation_title = st.text_input("Presentation Title")
slide_title = st.text_input("Slide Title")

# Folder upload is simulated by allowing multiple file uploads.
uploaded_files = st.file_uploader(
    "Upload source documents (multiple .txt files allowed)",
    accept_multiple_files=True,
    type="txt"
)

if selection == "Select":
    pass
elif selection == "Adobe Firefly":
    with open('wiki_2.json', 'r') as fr:
        list_2 = json.load(fr)
    # Build a single query item from the uploaded documents.
    tmp_ref_abstract = {}
    file_count = 0
    for uploaded_file in uploaded_files:
        tmp_filename = uploaded_file.name
        tmp_content = uploaded_file.read().decode('utf-8').strip()
        tmp_ref_abstract[tmp_filename] = tmp_content
        file_count += 1
    document_name = presentation_title
    section_names = [slide_title] * file_count
    ref_doc_indices = np.arange(1, file_count + 1).tolist()
    list_1 = [
        {
            "abstract": "Write the '{}' section of the article titled '{}'.".format(
                slide_title, presentation_title
            ),
            "ref_abstract": tmp_ref_abstract,
            "related_work": ""
        }
    ]
else:
    # "Adobe Acrobat" example with fixed sections and document counts.
    with open('wiki_2.json', 'r') as fr:
        list_1 = json.load(fr)
    with open('wiki_1.json', 'r') as fr:
        list_2 = json.load(fr)
    document_name = "Adobe Acrobat"
    section_names = ["Introduction"] * 3 + ["History"] * 3 + ["Document Cloud"] * 2
    ref_doc_indices = np.arange(1, 4).tolist() + np.arange(1, 4).tolist() + np.arange(1, 3).tolist()

# Initialize all session-state flags for the staged button flow up front, so
# that checking a later stage's flag never raises before its stage is reached.
for flag in ('submit_clicked', 'retrieve_clicked', 'organize_clicked', 'summarize_clicked'):
    if flag not in st.session_state:
        st.session_state[flag] = False
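# The UI walks a four-stage pipeline, one button per stage: Submit (collect
# the input documents) -> Retrieve (fetch related documents) -> Organize
# (order the documents within the narrative) -> Summarize (generate the
# intent-based summaries and slides). Each button is revealed only after the
# previous stage's session-state flag has been set.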
inp_doc_list = []
inp_keys_list = []
retrieved_doc_list = []

if st.button('Submit'):
    st.session_state.submit_clicked = True
    # Flatten the reference documents of each query item into parallel lists.
    for item in list_1:
        for key in item['ref_abstract']:
            inp_doc_list.append(item['ref_abstract'][key])
            inp_keys_list.append(key)
    retrieve_prompt_template = "{} : Document {} for the '{}' Section of the Article titled '{}'"
    ui_doc_list = []
    # One editable text box per input document.
    st.header('Input Documents')
    for i in range(len(section_names)):
        ui_doc_list.append(st.text_area(
            retrieve_prompt_template.format(
                inp_keys_list[i], ref_doc_indices[i], section_names[i], document_name
            ),
            value=inp_doc_list[i]
        ))
    # Persist the inputs so later stages can reload them across reruns.
    write_to_file("inp_keys_list.json", inp_keys_list)
    write_to_file("section_names.json", section_names)
    write_to_file("document_name.pickle", document_name)

if st.session_state.submit_clicked:
    if st.button('Retrieve'):
        st.session_state.retrieve_clicked = True
        inp_keys_list = read_from_file("inp_keys_list.json")
        section_names = read_from_file("section_names.json")
        document_name = read_from_file("document_name.pickle")
        # Retrieve related documents in both directions and cache the results.
        retrieved_out = get_retrieved_results("gpt4o", 0, "fixed", list_2, list_1)
        write_to_file("retrieved_docs.json", retrieved_out)
        retrieved_out_train = get_retrieved_results("gpt4o", 0, "fixed", list_1, list_2)
        write_to_file("retrieved_docs_train.json", retrieved_out_train)
        for ret_item in retrieved_out:
            for key in ret_item['ref_abstract']:
                retrieved_doc_list.append(ret_item['ref_abstract'][key]['abstract'])
        st.header('Retrieved Documents')
        retrieve_prompt_template = "{} : Document {} for the '{}' Section of the Article titled '{}'"
        ui_retrieved_doc_list = []
        for i in range(len(section_names)):
            ui_retrieved_doc_list.append(st.text_area(
                retrieve_prompt_template.format(
                    inp_keys_list[i], ref_doc_indices[i], section_names[i], document_name
                ),
                value=retrieved_doc_list[i]
            ))

if st.session_state.retrieve_clicked:
    if st.button('Organize'):
        st.session_state.organize_clicked = True
        st.header("Organization of the documents in the narrative")
        topics_list = [slide_title]
        organize_list = []
        ui_organize_list = []
        test_list = read_from_file("retrieved_docs.json")
        train_list = read_from_file("retrieved_docs_train.json")
        organize_out = get_retrieved_results("gpt4o", 1, "fixed", train_list, test_list, True)
        # Show one editable organization per section topic.
        for i in range(len(organize_out)):
            organize_list.append(organize_out[i])
            ui_organize_list.append(st.text_area("Section: " + topics_list[i], value=organize_out[i]))
        write_to_file("organized_docs.json", organize_out)

if st.session_state.organize_clicked:
    if st.button("Summarize"):
        st.session_state.summarize_clicked = True
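        # Final stage: generate one intent-based summary per section from the
        # organized documents, then turn each summary into a slide. The
        # positional flags passed to get_retrieved_results below mirror the
        # earlier retrieve/organize calls; their exact meaning is assumed
        # from that usage, since retrieve.py is not shown here.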
st.header("Intent-based multi-document summary") topics_list = [slide_title] generate_list = [] ui_generate_list = [] slides_list = [] test_list = read_from_file("retrieved_docs.json") train_list = read_from_file("retrieved_docs_train.json") organize_out = read_from_file("organized_docs.json") gen_summary_dict = get_retrieved_results("gpt4o", 1, "fixed", train_list, test_list, False, organize_out) for i in range(len(gen_summary_dict)): # highlighted_summary = highlight_sentences(gen_summary_dict[i], test_list[i]['abstract']) slides_list.append(get_slide(topics_list[i], gen_summary_dict[i])) # generate_list.append(.format(topics_list[i], gen_summary_dict[i])) st.markdown(f"## {topics_list[i]}") st.markdown(f"{gen_summary_dict[i]}") # st.markdown(highlighted_summary, unsafe_allow_html=True) st.header("Generated Narrative") for i in range(len(slides_list)): st.markdown("---") st.markdown(slides_list[i]) st.markdown("---") # if st.session_state.summarize_clicked: # if st.button("Narrative"): # st.session_state.narrative_clicked = True