Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
import os | |
import streamlit as st | |
import difflib | |
import spacy | |
import streamlit_analytics | |
from utils import add_logo_to_sidebar, add_footer | |
def load_model():
    """Load and return the spaCy ``en_core_web_md`` pipeline.

    Returns:
        The object returned by ``spacy.load('en_core_web_md')``.
    """
    return spacy.load('en_core_web_md')
# Everything between start_tracking() here and stop_tracking() at the end of
# the script runs inside the streamlit_analytics tracking window.
streamlit_analytics.start_tracking()

## Layout stuff
st.set_page_config(
    page_title="Compare Demo",
    page_icon="π",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items={
        # NOTE(review): this address looks like an obfuscation placeholder --
        # confirm the real contact address before shipping.
        'Get Help': 'mailto:[email protected]',
        'Report a bug': None,
        'About': "## This a demo showcasing different Legal AI Actions"
    }
)
add_logo_to_sidebar()

## Page header and instructions
st.title('π Compare Demo')
st.write("""This demo shows how AI can be used to compare passages of text.""")
st.write("**π Enter two passages of text on the left** and hit the button **Compare** to see the demo in action")

# Show a spinner while the spaCy pipeline is being loaded.
with st.spinner('βοΈ Loading model...'):
    nlp = load_model()

## Default passages pre-filled into the two sidebar inputs
EXAMPLE_TEXT_1 = """This Agreement shall be governed by and interpreted under the laws of the
State of Delaware without regard to its conflicts of law provisions."""
EXAMPLE_TEXT_2 = """This agreement will be governed by and must be construed in accordance with the laws of the State of Israel."""

## Sidebar inputs: two text areas and the button that triggers the comparison
text_1 = st.sidebar.text_area('Enter a passage of text', value=EXAMPLE_TEXT_1, height=150, key='input1')
text_2 = st.sidebar.text_area('Enter a second passage of text', value=EXAMPLE_TEXT_2, height=150, key='input2')
button = st.sidebar.button('Compare', type='primary', use_container_width=True)
def get_tokens(doc):
    """Return the ``lower`` attribute of every token in ``doc``, in order.

    Args:
        doc: An iterable of token objects exposing a ``lower`` attribute
            (this script passes the result of ``nlp(text)``).

    Returns:
        A list holding one ``token.lower`` value per token.
    """
    # NOTE(review): on spaCy tokens ``lower`` is presumably the integer ID of
    # the lowercased form (``lower_`` being the string). Either serves as a
    # case-insensitive comparison key -- confirm against the spaCy Token API.
    return [token.lower for token in doc]
def get_pos_tags(doc):
    """Return the ``pos_`` attribute of every token in ``doc``, in order.

    Args:
        doc: An iterable of token objects exposing a ``pos_`` attribute.

    Returns:
        A list holding one ``token.pos_`` value per token.
    """
    return [token.pos_ for token in doc]
def add_md_color(text, match):
    """Wrap ``text`` in a ``:color[...]`` markdown directive.

    Args:
        text: The text to wrap.
        match: Truthy selects green, falsy selects red.

    Returns:
        ``":green[<text>]"`` when ``match`` is truthy, otherwise
        ``":red[<text>]"``.
    """
    color = 'green' if match else 'red'
    return f":{color}[{text}]"
def add_em(text, match):
    """Wrap ``text`` in markdown emphasis markers.

    Args:
        text: The text to wrap.
        match: Truthy selects double asterisks, falsy selects single ones.

    Returns:
        ``"**<text>**"`` when ``match`` is truthy, otherwise ``"*<text>*"``.
    """
    return f"**{text}**" if match else f"*{text}*"
def create_str_output(doc, idxs):
    """Render ``doc`` as one markdown string with every token colored.

    Args:
        doc: An iterable of token objects exposing ``text`` and ``i``
            (the token's position) attributes.
        idxs: An iterable of ``(start, end)`` half-open index ranges that
            mark matching tokens.

    Returns:
        The colored tokens joined by single spaces. A token is colored as a
        match when its ``i`` falls inside any ``(start, end)`` range, and as
        a difference otherwise.
    """
    out = []
    for token in doc:
        # highlight word diff: a token counts as matched when its position
        # lies in at least one of the ranges
        is_match = any(token.i in range(start, end) for start, end in idxs)
        out.append(add_md_color(token.text, match=is_match))
    return ' '.join(out)
def get_matching_idxs(items_1, items_2):
    """Find the index ranges where two sequences match.

    Args:
        items_1: First sequence of hashable items.
        items_2: Second sequence of hashable items.

    Returns:
        A pair ``(ranges_1, ranges_2)``. Each is a list of ``(start, end)``
        half-open ranges, one per block reported by
        ``difflib.SequenceMatcher.get_matching_blocks()``, indexing into
        ``items_1`` and ``items_2`` respectively.
    """
    sm = difflib.SequenceMatcher(None, items_1, items_2)
    doc_1_matching_idxs = []
    doc_2_matching_idxs = []
    # The final block from get_matching_blocks() is a zero-length sentinel; it
    # yields an empty (len, len) range, which matches no index.
    for a, b, n in sm.get_matching_blocks():
        doc_1_matching_idxs.append((a, a + n))
        doc_2_matching_idxs.append((b, b + n))
    return doc_1_matching_idxs, doc_2_matching_idxs
# Run the comparison only on the script run in which "Compare" was clicked.
if button:
    # NOTE(review): the original indentation was lost; only the two nlp()
    # calls are placed under the spinner here -- confirm the intended scope.
    with st.spinner('βοΈ Comparing Texts...'):
        doc_1 = nlp(text_1)
        doc_2 = nlp(text_2)

    st.header('π§ͺ Comparison')
    st.markdown('We can highlight the :green[similarities] and :red[differences] in **wording** across the two texts.')

    # Compute the token lists once; they feed both the highlighting and the
    # textual-similarity score below.
    tokens_1 = get_tokens(doc_1)
    tokens_2 = get_tokens(doc_2)
    doc_1_token_idxs, doc_2_token_idxs = get_matching_idxs(tokens_1, tokens_2)

    # Side-by-side view of both passages with per-token coloring.
    col1, col2 = st.columns(2)
    with col1:
        st.markdown(create_str_output(doc_1, doc_1_token_idxs))
    with col2:
        st.markdown(create_str_output(doc_2, doc_2_token_idxs))

    # Three similarity scores, one per column.
    col1, col2, col3 = st.columns(3)
    with col1:
        # perform simple sequence matching
        sm = difflib.SequenceMatcher(None, tokens_1, tokens_2)
        st.subheader('π Textual Similarity')
        st.markdown('We can measure the similarity based on the *wording* of the two texts.')
        st.metric(label='Textual Similarity', value=f"{sm.ratio() * 100:.1f}%")
    with col2:
        st.subheader('π Linguistic Similarity')
        st.markdown('We can measure the similarity based on the *linguistic features* of the two texts.')
        # same sequence matching, applied to the part-of-speech tag sequences
        postags_1 = get_pos_tags(doc_1)
        postags_2 = get_pos_tags(doc_2)
        sm = difflib.SequenceMatcher(None, postags_1, postags_2)
        st.metric(label='Linguistic Similarity', value=f"{sm.ratio() * 100:.1f}%")
    with col3:
        st.subheader('π Semantic Similarity')
        st.markdown('We can measure the similarity based on the *meaning* of the two texts.')
        st.metric(label='Semantic Similarity', value=f"{doc_1.similarity(doc_2) * 100:.1f}%")

add_footer()
# Raises KeyError if the ANALYTICS_PASSWORD environment variable is not set.
streamlit_analytics.stop_tracking(unsafe_password=os.environ["ANALYTICS_PASSWORD"])