import streamlit as st
import time
from annotated_text import annotated_text
from io import StringIO
from transformers import AutoTokenizer, AutoModelForTokenClassification
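# Local helper modules (wildcard imports) supplying get_text_from_ocr_engine(), the get_paragraphs_for_* helpers,
# and the annotation-tool sample-data helpers used below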
from text_extractor import *
from text_annotatator import *
import os
from streamlit_text_annotation import text_annotation

# Allow duplicate OpenMP runtimes (avoids libiomp "Error #15" crashes when several libraries bundle their own copy)
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

import plotly.express as px
from streamlit_option_menu import option_menu

st.set_page_config(layout="wide")

from transformers import pipeline
import pandas as pd
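
# The Hugging Face pipelines below are cached so each model loads once per session rather than on every Streamlit rerun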

@st.cache(allow_output_mutation = True)
def init_text_summarization_model():
    MODEL = 'facebook/bart-large-cnn'
    pipe = pipeline("summarization", model=MODEL)
    return pipe

@st.cache(allow_output_mutation = True)
def init_zsl_topic_classification():
    MODEL = 'facebook/bart-large-mnli'
    pipe = pipeline("zero-shot-classification", model=MODEL)
    template = "This text is about {}."
    return pipe, template

@st.cache(allow_output_mutation = True)
def init_ner_pipeline():
    tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")
    model = AutoModelForTokenClassification.from_pretrained("d4data/biomedical-ner-all")
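    # aggregation_strategy="simple" merges word pieces into whole entities, each with start/end offsets and an entity_group label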
    pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple") # pass device=0 if using gpu
    return pipe

@st.cache(allow_output_mutation = True)
def init_qa_pipeline():
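    # Extractive QA (RoBERTa fine-tuned on SQuAD 2.0); returns an answer span together with a confidence score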
    question_answerer_pipe = pipeline("question-answering", model='deepset/roberta-base-squad2')
    return question_answerer_pipe

def get_formatted_text_for_annotation(output, text):
    """Convert NER pipeline output for `text` into (span, label, colour) tuples for annotated_text()."""
    colour_map = {'Coreference': '#29D93B',
                  'Severity': '#FCF3CF',
                  'Sex': '#E9F7EF',
                  'Sign_symptom': '#EAF2F8',
                  'Detailed_description': '#078E8B',
                  'Date': '#F5EEF8',
                  'History': '#FDEDEC',
                  'Medication': '#F4F6F6',
                  'Therapeutic_procedure': '#A3E4D7',
                  'Age': '#85C1E9',
                  'Subject': '#D7BDE2',
                  'Biological_structure': '#AF7AC5',
                  'Activity': '#B2BABB',
                  'Lab_value': '#E6B0AA',
                  'Family_history': '#2471A3',
                  'Diagnostic_procedure': '#CCD1D1',
                  'Other_event': '#239B56',
                  'Occupation': '#B3B6B7'}

    annotated_texts = []
    next_index = 0
    for entity in output:
        if entity['start'] > next_index:
            # Plain (unannotated) text between the previous entity and this one
            annotated_texts.append(text[next_index:entity['start']])
        extracted_text = text[entity['start']:entity['end']]
        colour = colour_map.get(entity['entity_group'], '#F2F3F4')  # neutral fallback for unmapped entity types
        annotated_texts.append((extracted_text, entity['entity_group'], colour))
        next_index = entity['end']

    # Any plain text remaining after the last entity
    if next_index < len(text):
        annotated_texts.append(text[next_index:])

    return tuple(annotated_texts)
    
# Model initialization    
pipeline_summarization = init_text_summarization_model()
pipeline_zsl, template = init_zsl_topic_classification()
pipeline_ner = init_ner_pipeline()
pipeline_qa = init_qa_pipeline()

st.header("Intelligent Document Automation")



with st.sidebar:
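    # Navigation menu; the selected option drives the workflow branches below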
    selected_menu = option_menu("Select Option", 
    ["Upload Document", "Extract Text", "Summarize Document", "Extract Entities","Detected Barriers","Get Answers","Annotation Tool",
    "Claim Status Report"], 
        menu_icon="cast", default_index=0)
    

if selected_menu == "Upload Document":
    uploaded_file = st.file_uploader("Choose a file")        
    if uploaded_file is not None:
        ocr_text = get_text_from_ocr_engine()
        st.write("Upload Successful")
        
elif selected_menu == "Extract Text":
    with st.spinner("Extracting Text..."):
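        # time.sleep simulates processing time before the extracted OCR text is displayed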
        time.sleep(6)
        st.write(get_text_from_ocr_engine())
        
elif selected_menu == "Summarize Document":
    paragraphs= get_paragraphs_for_summaries()
    
    with st.spinner("Finding Topics..."):
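        # Topic tags are hard-coded for the demo; pipeline_zsl (zero-shot classification) is loaded above and could produce these tags instead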
        tags_found = ["Injury Details", "Past Medical Conditions", "Injury Management Plan", "GP Correspondence"]
        time.sleep(5)
        st.write("This document is about:")
        st.markdown("  ".join(["**#" + tag + "**" for tag in tags_found]))
        st.markdown("""---""")
        
    with st.spinner("Summarizing Document..."):
        
        
        for text in paragraphs:
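            # Summarize each paragraph with BART, capping summaries at 130 tokens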
            summary_text = pipeline_summarization(text, max_length=130, min_length=30, do_sample=False)
            # Show output
            st.write(summary_text[0]['summary_text'])
            st.markdown("""---""")
     
        
elif selected_menu == "Extract Entities":
    paragraphs= get_paragraphs_for_entities()
    
    with st.spinner("Extracting Entities..."):
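        # Run biomedical NER on each paragraph and render the entities as colour-coded inline annotations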
        for text in paragraphs:
            output = pipeline_ner(text)
            entities_text = get_formatted_text_for_annotation(output, text)
            annotated_text(*entities_text)
            st.markdown("""---""")
            
elif selected_menu == "Detected Barriers":
    # Each barrier is phrased as an extractive-QA question asked against the full document text
    barriers_to_detect = {"Chronic Pain": "Is the patient experiencing chronic pain?",
                          "Mental Health Issues": "Does the patient have any mental health issues?",
                          "Prior History": "What is the patient's prior medical history?",
                          "Smoking": "Does the patient smoke?",
                          "Drinking": "Does the patient drink?",
                          "Comorbidities": "Does the patient have any comorbidities?"}
    
    with st.spinner("Detecting Barriers..."):
        # Fetch the OCR context once; it is reused for every barrier question
        context = get_text_from_ocr_engine()
        for barrier, question_text in barriers_to_detect.items():
            if question_text:
                result = pipeline_qa(question=question_text, context=context)
                st.subheader(barrier)
                # Treat low-confidence answers as "not found" rather than reporting a spurious span
                if result['score'] < 0.3:
                    st.text("Not Found")
                else:
                    st.text(result['answer'])

elif selected_menu == "Get Answers":
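    # Free-form extractive QA: the user's question is answered against the full OCR text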
    st.subheader('Question')
    question_text = st.text_input("Type your question")
    context = get_text_from_ocr_engine()
  
    if question_text:
        with st.spinner("Finding Answer(s)..."):
            result = pipeline_qa(question=question_text, context=context)
            st.subheader('Answer')
            st.text(result['answer'])
            
elif selected_menu == "Annotation Tool":
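    # streamlit_text_annotation demo: read-only display in two label orientations, plus an editable mode that returns the user's edits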
   
    display_only_data = get_display_only_data()
    editable_data = get_editable_data()
    
    st.subheader("Display Mode:")
    left, right = st.columns(2)
    with left:
        st.text("Vertical labels:")
        text_annotation(display_only_data)
    with right:
        st.text("Horizontal labels:")
        display_only_data["labelOrientation"] = "horizontal"
        text_annotation(display_only_data)


    st.subheader("Edit Mode:")
    data = text_annotation(editable_data)
    if data:
        # Streamlit "magic": a bare expression is written to the page, showing the returned annotation data
        "Returned data:", data
elif selected_menu == "Claim Status Report":
    claim_number = st.text_input("Enter the Claim Number")