import os from io import StringIO import joblib from copy import deepcopy from pypdf import PdfReader import pandas as pd import plotly.express as px from huggingface_hub import hf_hub_download, snapshot_download import streamlit as st import streamlit_analytics from utils import add_logo_to_sidebar, add_footer, add_email_signup_form HF_TOKEN = os.environ.get("HF_TOKEN") MODEL_REPO_ID = "simplexico/cuad-sklearn-contract-clustering" DATA_REPO_ID = "simplexico/cuad-top-ten" MODEL_FILENAME = "cuad_tfidf_umap_kmeans.pkl" DATA_FILENAME = "cuad_top_ten_popular_contract_types.json" streamlit_analytics.start_tracking() st.set_page_config( page_title="Organise Demo", page_icon="🗂", layout="wide", initial_sidebar_state="expanded", menu_items={ 'Get Help': 'mailto:hello@simplexico.ai', 'Report a bug': None, 'About': "## This a demo showcasing different Legal AI Actions" } ) add_logo_to_sidebar() st.title('🗂 Organise Demo') st.write(""" This demo shows how AI can be used to organise a collection of texts. We've trained a model to group documents into similar types. The plot below shows a sample set of contracts that have been automatically grouped together. Each point in the plot represents how the model interprets a contract, the closer together a pair of points are, the more similar they appear to the model. Similar documents are grouped by color. \n**TIP:** Hover over each point to see the filename of the contract. Groups can be added or removed by clicking on the symbol in the plot legend. """) st.info("👈 Upload your own documents on the left (as .txt or .pdf files) to see how your own documents can be organised using AI.") @st.cache(allow_output_mutation=True) def load_model(): model = joblib.load( hf_hub_download(repo_id=MODEL_REPO_ID, filename=MODEL_FILENAME, token=HF_TOKEN) ) return model @st.cache(allow_output_mutation=True) def load_dataset(): snapshot_download(repo_id=DATA_REPO_ID, token=HF_TOKEN, local_dir='./', repo_type='dataset') df = pd.read_json(DATA_FILENAME) return df def get_transform_and_predictions(model, X): y = model.predict(X) X_transform = model[:2].transform(X) return X_transform, y def generate_plot(X, y, filenames): fig = px.scatter_3d( x=X[:, 0], y=X[:, 1], z=X[:, 2], color=[str(y_i) for y_i in y], hover_name=filenames) fig.update_traces( marker_size=8, marker_line=dict(width=2), selector=dict(mode='markers') ) fig.update_layout( legend=dict( title='grouping', yanchor="top", y=0.99, xanchor="left", x=0.01 ), width=1100, height=900 ) return fig @st.cache(allow_output_mutation=True) def prepare_figure(model, df): X = [text[:500] for text in df['text'].to_list()] filenames = df['filename'].to_list() X_transform, y = get_transform_and_predictions(model, X) fig = generate_plot(X_transform, y, filenames) return fig @st.cache() def prepare_page(): model = load_model() df = load_dataset() X = [text[:500] for text in df['text'].to_list()] filenames = df['filename'].to_list() X_transform, y = get_transform_and_predictions(model, X) fig = prepare_figure(model, df) return fig, model uploaded_files = st.sidebar.file_uploader("Upload your documents", accept_multiple_files=True, type=['pdf', 'txt'], help="Upload your own documents. Don't worry we don't store any data.") # button = st.sidebar.button('Organise Contracts', type='primary', use_container_width=True) with st.spinner('⚙️ Loading model...'): fig, cuad_tfidf_umap_kmeans = prepare_page() figure = st.plotly_chart(fig, use_container_width=True) if uploaded_files: figure.empty() filenames = [] X_train = [] if len(uploaded_files) < 5: st.error('### 💔 Please upload more than 4 files.') else: with st.spinner('⚙️ Training model...'): for uploaded_file in uploaded_files: print(uploaded_file.name) if '.pdf' in uploaded_file.name.lower(): reader = PdfReader(uploaded_file) page_texts = [page.extract_text() for page in reader.pages] text = "\n".join(page_texts) if '.txt' in uploaded_file.name.lower(): stringio = StringIO(uploaded_file.getvalue().decode("utf-8")) text = stringio.read() X_train.append(text[:500]) filenames.append(uploaded_file.name) if len(uploaded_files) < 10: n_clusters = 3 else: n_clusters = 8 tfidf_umap_kmeans = deepcopy(cuad_tfidf_umap_kmeans) tfidf_umap_kmeans.set_params(kmeans__n_clusters=n_clusters) tfidf_umap_kmeans.fit(X_train) X_transform, y = get_transform_and_predictions(cuad_tfidf_umap_kmeans, X_train) fig = generate_plot(X_transform, y, filenames) st.markdown("## 🗂 Your Organised Documents") st.plotly_chart(fig, use_container_width=True) add_email_signup_form() add_footer() streamlit_analytics.stop_tracking(unsafe_password=os.environ["ANALYTICS_PASSWORD"])