from io import BytesIO
import base64
import json

import streamlit as st
from graphviz import Digraph
from transformers import AutoModel, AutoTokenizer

# Column names of a CoNLL-U (Universal Dependencies) token line.
_UD_COLUMNS = ["ID", "FORM", "LEMMA", "UPOS", "XPOS",
               "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"]


def display_tree(output):
    """Render a dependency tree as an SVG via graphviz and embed it in the page.

    Parameters
    ----------
    output : list[dict]
        One dict per token with keys ``word``, ``dep_head_idx`` (0-based index
        of the syntactic head, ``-1`` for the root) and ``dep_func`` (the
        dependency-relation label used on the edge).
    """
    # Scale the image width with the number of tokens; height fixed at 5 in.
    size = f'{len(output)},5'
    dpi = '300'
    img_format = 'svg'  # renamed from `format` to avoid shadowing the builtin

    dot = Digraph(engine='dot', format=img_format)
    dot.attr('graph', rankdir='LR', rank='same', size=size, dpi=dpi)

    for i, word_info in enumerate(output):
        dot.node(str(i), word_info['word'])
        # Invisible edge chains consecutive words so `dot` preserves the
        # original sentence order left-to-right.
        if i > 0:
            dot.edge(str(i), str(i - 1), style='invis')
        head_idx = word_info['dep_head_idx']
        # -1 marks the sentence root, which has no incoming dependency edge.
        if head_idx != -1:
            # constraint='False' keeps dependency edges from influencing the
            # node ranking (layout is driven by the invisible chain above).
            dot.edge(str(head_idx), str(i),
                     label=word_info['dep_func'], constraint='False')

    dot.render('syntax_tree', format=img_format, cleanup=True)

    # Embed the rendered SVG as a base64 data URI inside a horizontally
    # scrollable container — wide trees overflow Streamlit's fixed layout.
    # NOTE(review): the original inline HTML was lost in extraction; this
    # markup reconstructs the intended scrollable <img> wrapper (the unused
    # `base64` import indicates a data-URI embed) — confirm against the
    # deployed app.
    with open('syntax_tree.' + img_format, 'rb') as f:
        svg_b64 = base64.b64encode(f.read()).decode('utf-8')
    st.markdown(
        f"""
<div style="overflow-x: auto; direction: ltr;">
    <img src="data:image/svg+xml;base64,{svg_b64}" alt="Syntax tree"/>
</div>
""",
        unsafe_allow_html=True)


def display_download(disp_string):
    """Offer *disp_string* for download as a UTF-8 plain-text file."""
    to_download = BytesIO(disp_string.encode())
    st.download_button(label="⬇️ Download text file",
                       data=to_download,
                       file_name="parsed_output.txt",
                       mime="text/plain")


@st.cache_resource
def _load_model(hf_token):
    """Load and cache the DictaBERT-Joint tokenizer and model.

    Cached with ``st.cache_resource`` so the (large) model is loaded once per
    server process rather than on every Streamlit rerun.
    """
    # NOTE(review): `use_auth_token` is deprecated in newer transformers in
    # favor of `token`; kept for compatibility with the pinned version.
    tokenizer = AutoTokenizer.from_pretrained('dicta-il/dictabert-joint',
                                              use_auth_token=hf_token)
    model = AutoModel.from_pretrained('dicta-il/dictabert-joint',
                                      use_auth_token=hf_token,
                                      trust_remote_code=True)
    model.eval()
    return tokenizer, model


# ---------------------------------------------------------------------------
# Streamlit page
# ---------------------------------------------------------------------------
st.title('DictaBERT-Joint Visualizer')

# The model requires a Hugging Face access token, stored in Streamlit secrets.
hf_token = st.secrets["HF_TOKEN"]
tokenizer, model = _load_model(hf_token)

# UI controls.
compute_mst = st.checkbox('Compute Maximum Spanning Tree', value=True)
output_style = st.selectbox('Output Style: ',
                            ('JSON', 'UD', 'IAHLT_UD'),
                            index=1).lower()

sentence = st.text_input('Enter a sentence to analyze:')
if sentence:
    # Echo the input sentence.
    st.text(sentence)

    # Joint morpho-syntactic prediction; the model's custom `predict` comes
    # from trust_remote_code. [0] selects the single sentence's result.
    output = model.predict([sentence], tokenizer,
                           compute_syntax_mst=compute_mst,
                           output_style=output_style)[0]

    if output_style in ('ud', 'iahlt_ud'):
        # `output` is a list of CoNLL-U lines: two '#' comment headers
        # followed by one tab-separated line per token.
        ud_output = output

        # Convert to the [{'word', 'dep_head_idx', 'dep_func'}] format that
        # display_tree() expects.
        tree = []
        for line in ud_output[2:]:
            parts = line.split('\t')
            # Skip multi-word token ranges such as "1-2".
            if '-' in parts[0]:
                continue
            # UD HEAD is 1-based with 0 = root; shift to 0-based, -1 = root.
            tree.append(dict(word=parts[1],
                             dep_head_idx=int(parts[6]) - 1,
                             dep_func=parts[7]))
        display_tree(tree)
        display_download('\n'.join(ud_output))

        # Build the whole table as a single Markdown string wrapped in an RTL
        # container (Hebrew renders right-to-left).
        # NOTE(review): the original <div> markup was lost in extraction;
        # reconstructed from the surviving "Close the RTL div" comment.
        table_md = '<div dir="rtl">\n\n'
        # Surface the UD comment headers (e.g. "# text = ...") as headings.
        table_md += "##" + ud_output[0] + "\n"
        table_md += "##" + ud_output[1] + "\n"
        table_md += "| " + " | ".join(_UD_COLUMNS) + " |\n"
        table_md += "| " + " | ".join(["---"] * len(_UD_COLUMNS)) + " |\n"
        for line in ud_output[2:]:
            # Escape Markdown-significant characters: '_' would italicize, and
            # '|' (the FEATS/MISC feature separator) would break the row into
            # extra table cells.
            cells = line.replace('_', '\\_').replace('|', '\\|').split('\t')
            table_md += "| " + " | ".join(cells) + " |\n"
        table_md += '</div>'

        # Single markdown call so the table renders as one element.
        st.markdown(table_md, unsafe_allow_html=True)
    else:
        # JSON style: per-token syntax dicts are nested under 'tokens'.
        tree = [w['syntax'] for w in output['tokens']]
        display_tree(tree)

        json_output = json.dumps(output, ensure_ascii=False, indent=2)
        display_download(json_output)
        # Show the full parse as a fenced JSON code block.
        st.markdown("```json\n" + json_output + "\n```")