Spaces:
Sleeping
Sleeping
File size: 7,957 Bytes
2b935c8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 |
from tenacity import retry, stop_after_attempt, wait_random_exponential
from tqdm import tqdm
import time
import sys
# import openai
import time
# import pandas as pd
import random
import csv
import os
import pickle
import json
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import string
from typing import List
import difflib
# import tiktoken
import re
from nltk.tokenize import sent_tokenize
from collections import defaultdict
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import numpy as np
from retrieve import get_retrieved_results, get_slide
# Ensure you have downloaded the 'punkt' tokenizer models
nltk.download('punkt')
import streamlit as st
# Get the parent directory
# parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# Add the parent directory to the system path
# sys.path.append(parent_dir)
from utils import AzureModels, write_to_file, read_from_file
# from utils_open import OpenModels
# Function to calculate similarity
def calculate_similarity(sentence1: str, sentence2: str) -> float:
    """Return the ``difflib`` similarity ratio of two sentences.

    The ratio is ``2 * M / T`` where ``M`` is the number of matching
    characters and ``T`` the combined length of both strings, so the result
    lies in ``[0, 1]`` (``1.0`` for identical strings).
    """
    matcher = difflib.SequenceMatcher(a=sentence1, b=sentence2)
    return matcher.ratio()
# Function to highlight sentences based on similarity
def highlight_sentences(predicted: str, ground_truth: str) -> str:
    """Wrap each predicted sentence in a green ``<span>`` shaded by similarity.

    Both texts are split into sentences with ``nltk.sent_tokenize``.  Every
    predicted sentence is compared with every ground-truth sentence via
    ``calculate_similarity`` and the best score becomes the alpha channel of
    the span's ``rgba(0, 255, 0, alpha)`` background.

    Args:
        predicted: Generated text to be highlighted.
        ground_truth: Reference text the generated text is compared against.

    Returns:
        An HTML fragment with one ``<span>`` per predicted sentence, each
        followed by a single space.  An empty ``predicted`` yields ``""``.
    """
    # Local import keeps this function self-contained; the sentence text is
    # rendered as raw HTML by the caller, so it must be escaped first.
    import html

    ground_truth_sentences = nltk.sent_tokenize(ground_truth)

    spans = []
    for pred_sentence in nltk.sent_tokenize(predicted):
        # Seeding with 0 keeps the result well-defined when there are no
        # ground-truth sentences, and makes a sentence with no similarity at
        # all render with alpha ``0``.
        shade = max(
            [0]
            + [
                calculate_similarity(pred_sentence, gt_sentence)
                for gt_sentence in ground_truth_sentences
            ]
        )
        # Escape &, < and > so markup inside the generated text is displayed
        # literally instead of being injected into the page.
        safe_sentence = html.escape(pred_sentence, quote=False)
        spans.append(
            f'<span style="background-color: rgba(0, 255, 0, {shade})">'
            f'{safe_sentence}</span> '
        )

    # Joining once avoids the quadratic cost of repeated ``+=`` on a string.
    return "".join(spans)
# ---------------------------------------------------------------------------
# Streamlit app: pick an example, then run the three stages
# Retrieve -> Organize -> Summarize.  Streamlit re-runs this script on every
# interaction, so progress between stages is tracked in ``st.session_state``
# and intermediate results are handed over through JSON files on disk.
#
# NOTE(review): the nesting below was reconstructed from statement order
# because the indentation of this script was not available -- confirm it
# against the deployed app.
# ---------------------------------------------------------------------------


def _load_json(path):
    """Return the parsed contents of the JSON file at ``path``."""
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as fr:
        return json.load(fr)


st.title('Multi-Document Narrative Generation')

options = ["Select", "Adobe Firefly", "Adobe Acrobat"]
selection = st.selectbox('Select an example', options)

# Section titles used to label the organized groups and the summaries.
# NOTE(review): the same three titles are used for both examples even though
# "Adobe Firefly" only declares two sections below -- confirm this matches the
# number of entries get_retrieved_results returns for that example.
topics_list = ["Introduction", "History", "Document Cloud"]

# Create every stage flag up front.  Previously two of them were only created
# inside a button handler, so reading them depended on which buttons had been
# pressed earlier in the session.
for flag_name in ('retrieve_clicked', 'organize_clicked', 'summarize_clicked'):
    if flag_name not in st.session_state:
        st.session_state[flag_name] = False

inp_doc_list = []
inp_keys_list = []
retrieved_doc_list = []

# Nothing below can run until an example has been chosen: the example defines
# the documents, the article title and the section layout.
if selection != 'Select':
    if selection == "Adobe Firefly":
        list_1 = _load_json('wiki_1.json')
        list_2 = _load_json('wiki_2.json')
        document_name = "Adobe Firefly"
        section_names = ["Introduction"] * 7 + ["History"] * 2
        ref_doc_indices = list(range(1, 8)) + list(range(1, 3))
    else:
        # "Adobe Acrobat": same two files with their roles swapped.
        list_1 = _load_json('wiki_2.json')
        list_2 = _load_json('wiki_1.json')
        document_name = "Adobe Acrobat"
        section_names = ["Introduction"] * 3 + ["History"] * 3 + ["Document Cloud"] * 2
        ref_doc_indices = list(range(1, 4)) + list(range(1, 4)) + list(range(1, 3))

    # Flatten the reference documents of the selected article, remembering
    # the key each one was stored under.
    for item in list_1:
        for key in item['ref_abstract']:
            inp_doc_list.append(item['ref_abstract'][key])
            inp_keys_list.append(key)

    retrieve_prompt_template = "{} : Document {} for the '{}' Section of the Article titled '{}'"
    ui_doc_list = []
    ui_retrieved_doc_list = []

    # One text box per input document.
    st.header('Input Documents')
    for i, section_name in enumerate(section_names):
        label = retrieve_prompt_template.format(
            inp_keys_list[i], ref_doc_indices[i], section_name, document_name
        )
        ui_doc_list.append(st.text_area(label, value=inp_doc_list[i]))

    # ---- Stage 1: retrieve ------------------------------------------------
    if st.button('Retrieve'):
        retrieved_out = get_retrieved_results("gpt4o", 0, "fixed", list_2, list_1)
        write_to_file("retrieved_docs.json", retrieved_out)
        retrieved_out_train = get_retrieved_results("gpt4o", 0, "fixed", list_1, list_2)
        write_to_file("retrieved_docs_train.json", retrieved_out_train)

        for ret_item in retrieved_out:
            for key in ret_item['ref_abstract']:
                retrieved_doc_list.append(ret_item['ref_abstract'][key]['abstract'])

        st.session_state.retrieve_clicked = True

        st.header('Retrieved Documents')
        for i, section_name in enumerate(section_names):
            label = retrieve_prompt_template.format(
                inp_keys_list[i], ref_doc_indices[i], section_name, document_name
            )
            ui_retrieved_doc_list.append(st.text_area(label, value=retrieved_doc_list[i]))

    # ---- Stage 2: organize (offered once retrieval has run) ---------------
    if st.session_state.retrieve_clicked and st.button('Organize'):
        st.session_state.organize_clicked = True
        st.header("Organization of the documents in the narrative")
        ui_organize_list = []
        test_list = read_from_file("retrieved_docs.json")
        train_list = read_from_file("retrieved_docs_train.json")
        organize_out = get_retrieved_results("gpt4o", 1, "fixed", train_list, test_list, True)
        # Index access is kept on purpose: the container type returned by
        # get_retrieved_results is not visible here, only that it supports
        # len() and integer indexing.
        for i in range(len(organize_out)):
            ui_organize_list.append(
                st.text_area("Section: " + topics_list[i], value=organize_out[i])
            )
        write_to_file("organized_docs.json", organize_out)

    # ---- Stage 3: summarize (offered once organization has run) -----------
    if st.session_state.organize_clicked and st.button("Summarize"):
        st.session_state.summarize_clicked = True
        st.header("Intent-based multi-document summary")
        slides_list = []
        test_list = read_from_file("retrieved_docs.json")
        train_list = read_from_file("retrieved_docs_train.json")
        organize_out = read_from_file("organized_docs.json")
        gen_summary_dict = get_retrieved_results(
            "gpt4o", 1, "fixed", train_list, test_list, False, organize_out
        )
        # Index access kept for the same reason as in the organize stage.
        for i in range(len(gen_summary_dict)):
            highlighted_summary = highlight_sentences(
                gen_summary_dict[i], test_list[i]['abstract']
            )
            slides_list.append(get_slide(topics_list[i], gen_summary_dict[i]))
            st.markdown(f"## {topics_list[i]}")
            # The highlighted summary is an HTML fragment, hence the flag.
            st.markdown(highlighted_summary, unsafe_allow_html=True)

        st.header("Generated Narrative")
        for slide in slides_list:
            st.markdown("---")
            st.markdown(slide)
            st.markdown("---")

    # TODO: a further "Narrative" stage (narrative_clicked flag plus button)
    # was sketched here but is not implemented.