Spaces:

Seetha
/

IMA-pipeline-streamlit

Sleeping

App Files Files Community

gseetha04 committed on Jun 1, 2023

Commit

5fcdc9c

•

1 Parent(s): 9bdb04e

scriptcomm

Browse files

Files changed (1) hide show

ST_BusT2KG_demo_final.py +544 -0

ST_BusT2KG_demo_final.py ADDED Viewed

	@@ -0,0 +1,544 @@

+# import all packages
+import requests
+import streamlit as st
+from sklearn.model_selection import StratifiedKFold
+from sklearn.model_selection import train_test_split
+from sklearn.model_selection import KFold
+# tokenizer
+from transformers import AutoTokenizer, DistilBertTokenizerFast
+# sequence tagging model + training-related
+from transformers import DistilBertForTokenClassification, Trainer, TrainingArguments
+import numpy as np
+import pandas as pd
+import torch
+import json
+import sys
+import os
+#from datasets import load_metric
+from sklearn.metrics import classification_report
+from pandas import read_csv
+from sklearn.linear_model import LogisticRegression
+import sklearn.model_selection
+from sklearn.feature_extraction.text import TfidfTransformer
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.model_selection import GridSearchCV
+from sklearn.pipeline import Pipeline, FeatureUnion
+import math
+from sklearn.metrics import accuracy_score
+from sklearn.metrics import precision_recall_fscore_support
+from sklearn.model_selection import train_test_split
+import json
+import re
+import numpy as np
+import pandas as pd
+import re
+import nltk
+#stemmer = nltk.SnowballStemmer("english")
+#from nltk.corpus import stopwords
+import string
+from sklearn.model_selection import train_test_split
+# import seaborn as sns
+# from sklearn.metrics import confusion_matrix
+# from sklearn.metrics import classification_report, ConfusionMatrixDisplay
+from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoConfig
+import torch
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+import itertools
+import json
+import glob
+from transformers import TextClassificationPipeline, TFAutoModelForSequenceClassification, AutoTokenizer
+from transformers import pipeline
+import pickle
+import urllib.request
+from sklearn.feature_extraction.text import TfidfTransformer
+from sklearn.feature_extraction.text import CountVectorizer
+#from PyPDF2 import PdfReader
+#from urllib.request import urlopen
+#from tabulate import tabulate
+import csv
+import gdown
+import zipfile
+import wget
+import pdfplumber
+import pathlib
+import shutil
+import webbrowser
+from streamlit.components.v1 import html
+import streamlit.components.v1 as components
+from PyPDF2 import PdfReader
+#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+# from git import Repo
+# Repo.clone_from('https://github.com/gseetha04/IMA-weights.git', branch='master')
+def main():
+  st.title("Text to Causal Knowledge Graph")
+  st.sidebar.title("Please upload your text documents in one file here:")
+  k=2
+  seed = 1
+  k1= 5
+  uploaded_file = st.sidebar.file_uploader("Choose a file", type = "pdf")
+  text_list = []
+  causal_sents = []
+  reader = PdfReader(uploaded_file)
+  for page in reader.pages:
+    text = page.extract_text()
+    text_list.append(text)
+  text_list_final = [x.replace('\n', '') for x in text_list]
+  text_list_final = re.sub('"', '', str(text_list_final))
+  sentences = nltk.sent_tokenize(text_list_final)
+  result =[]
+  for i in sentences:
+    result1 = i.lower()
+    result2 = re.sub(r'[^\w\s]','',result1)
+    result.append(result2)
+  tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+  model_path = "checkpoint-2850"
+  model = AutoModelForSequenceClassification.from_pretrained(model_path,id2label={0:'non-causal',1:'causal'})
+  pipe1 = pipeline("text-classification", model=model,tokenizer=tokenizer)
+  for sent in result:
+    pred = pipe1(sent)
+    for lab in pred:
+        if lab['label'] == 'causal': #causal
+            causal_sents.append(sent)
+  model_name = "distilbert-base-uncased"
+  tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
+  model_path1 = "DistilBertforTokenClassification"
+  model = DistilBertForTokenClassification.from_pretrained(model_path1, id2label={0:'CT',1:'E',2:'C',3:'O'}) #len(unique_tags),, num_labels= 7,
+  pipe = pipeline('ner', model=model, tokenizer=tokenizer,aggregation_strategy='simple') #grouped_entities=True
+  sentence_pred = []
+  class_list = []
+  entity_list = []
+  for k in causal_sents:
+    pred= pipe(k)
+    #st.write(pred)
+    for i in pred:
+      sentence_pred.append(k)
+      class_list.append(i['word'])
+      entity_list.append(i['entity_group'])
+  filename = 'Checkpoint-classification.sav'
+  count_vect = CountVectorizer(ngram_range=[1,3])
+  tfidf_transformer=TfidfTransformer()
+  loaded_model = pickle.load(open(filename, 'rb'))
+  loaded_vectorizer = pickle.load(open('vectorizefile_classification.pickle', 'rb'))
+  pipeline_test_output = loaded_vectorizer.transform(class_list)
+  predicted = loaded_model.predict(pipeline_test_output)
+  pred1 = predicted
+  level0 = []
+  count =0
+  for i in predicted:
+    if i == 3:
+      level0.append('Non-Performance')
+      count +=1
+    else:
+      level0.append('Performance')
+      count +=1
+  list_pred = {0: 'Customers',1:'Employees',2:'Investors',3:'Non-performance',4:'Society',5:'Unclassified'}
+  pred_val = [list_pred[i] for i in pred1]
+  #print('count',count)
+  sent_id, unique = pd.factorize(sentence_pred)
+  final_list = pd.DataFrame(
+      {'Id': sent_id,
+       'Full sentence': sentence_pred,
+       'Component': class_list,
+       'cause/effect': entity_list,
+       'Label_level1': level0,
+       'Label_level2': pred_val
+      })
+  s = final_list['Component'].shift(-1)
+  m = s.str.startswith('##', na=False)
+  final_list.loc[m, 'Component'] += (' ' + s[m])
+  final_list1 = final_list[~final_list['Component'].astype(str).str.startswith('##')]
+  li = []
+  uni = final_list1['Id'].unique()
+  for i in uni:
+    df_new = final_list1[final_list1['Id'] == i]
+    uni1 = df_new['Id'].unique()
+    if 'E' not in df_new.values:
+      li.append(uni1)
+  out = np.concatenate(li).ravel()
+  li_pan = pd.DataFrame(out,columns=['Id'])
+  df3 = pd.merge(final_list1, li_pan[['Id']], on='Id', how='left', indicator=True) \
+              .query("_merge == 'left_only'") \
+              .drop('_merge',1)
+  df = df3.groupby(['Id','Full sentence','cause/effect', 'Label_level1', 'Label_level2'])['Component'].apply(', '.join).reset_index()
+  df["cause/effect"].replace({"C": "cause", "E": "effect"}, inplace=True)
+  df_final = df[df['cause/effect'] != 'CT']
+  df['New string'] = df_final['Component'].replace(r'[##]+', ' ', regex=True)
+  df_final = df_final.drop('Component',1)
+  df_final.insert(2, "Component", df['New string'], True)
+  df_final.to_csv('predictions.csv')
+  count_NP_NP = 0
+  count_NP_investor = 0
+  count_NP_customer = 0
+  count_NP_employees = 0
+  count_NP_society = 0
+  count_inv_np = 0
+  count_inv_investor = 0
+  count_inv_customer = 0
+  count_inv_employee = 0
+  count_inv_society = 0
+  count_cus_np = 0
+  count_cus_investor = 0
+  count_cus_customer = 0
+  count_cus_employee = 0
+  count_cus_society = 0
+  count_emp_np = 0
+  count_emp_investor = 0
+  count_emp_customer = 0
+  count_emp_employee = 0
+  count_emp_society = 0
+  count_soc_np = 0
+  count_soc_investor = 0
+  count_soc_customer = 0
+  count_soc_employee = 0
+  count_soc_society = 0
+  for i in range(0,df_final['Id'].max()):
+    j = df_final.loc[df_final['Id'] == i]
+    cause_tab = j.loc[j['cause/effect'] == 'cause']
+    effect_tab = j.loc[j['cause/effect'] == 'effect']
+    cause_coun_NP = (cause_tab.Label_level2 == 'Non-performance').sum()
+    effect_coun_NP = (effect_tab.Label_level2 == 'Non-performance').sum()
+    if (cause_coun_NP > 0) and (effect_coun_NP > 0):
+        count_NP = cause_coun_NP if cause_coun_NP >= effect_coun_NP else effect_coun_NP
+    else:
+        count_NP = 0
+    effect_NP_inv = (effect_tab.Label_level2 == 'Investors').sum()
+    if (cause_coun_NP > 0) and (effect_NP_inv > 0):
+        count_NP_inv = cause_coun_NP if cause_coun_NP >= effect_NP_inv else effect_NP_inv
+    else:
+        count_NP_inv = 0
+    effect_NP_cus = (effect_tab.Label_level2 == 'Customers').sum()
+    if (cause_coun_NP > 0) and (effect_NP_cus > 0):
+        count_NP_cus = cause_coun_NP if cause_coun_NP >= effect_NP_cus else effect_NP_cus
+    else:
+        count_NP_cus = 0
+    effect_NP_emp = (effect_tab.Label_level2 == 'Employees').sum()
+    if (cause_coun_NP > 0) and (effect_NP_emp > 0):
+        count_NP_emp = cause_coun_NP if cause_coun_NP >= effect_NP_emp else effect_NP_emp
+    else:
+        count_NP_emp = 0
+    effect_NP_soc = (effect_tab.Label_level2 == 'Society').sum()
+    if (cause_coun_NP > 0) and (effect_NP_soc > 0):
+        count_NP_soc = cause_coun_NP if cause_coun_NP >= effect_NP_soc else effect_NP_soc
+    else:
+        count_NP_soc = 0
+    cause_coun_inv = (cause_tab.Label_level2 == 'Investors').sum()
+    effect_coun_inv = (effect_tab.Label_level2 == 'Non-performance').sum()
+    if (cause_coun_inv > 0) and (effect_coun_inv > 0):
+        count_NP_inv = cause_coun_inv if cause_coun_inv >= effect_coun_inv else effect_coun_inv
+    else:
+        count_NP_inv = 0
+    effect_inv_inv = (effect_tab.Label_level2 == 'Investors').sum()
+    if (cause_coun_inv > 0) and (effect_inv_inv > 0):
+        count_inv_inv = cause_coun_inv if cause_coun_inv >= effect_inv_inv else effect_inv_inv
+    else:
+        count_inv_inv = 0
+    effect_inv_cus = (effect_tab.Label_level2 == 'Customers').sum()
+    if (cause_coun_inv > 0) and (effect_inv_cus > 0):
+        count_inv_cus = cause_coun_inv if cause_coun_inv >= effect_inv_cus else effect_inv_cus
+    else:
+        count_inv_cus = 0
+    effect_inv_emp = (effect_tab.Label_level2 == 'Employees').sum()
+    if (cause_coun_inv > 0) and (effect_inv_emp > 0):
+        count_inv_emp = cause_coun_inv if cause_coun_inv >= effect_inv_emp else effect_inv_emp
+    else:
+        count_inv_emp = 0
+    effect_inv_soc = (effect_tab.Label_level2 == 'Society').sum()
+    if (cause_coun_inv > 0) and (effect_inv_soc > 0):
+        count_inv_soc = cause_coun_inv if cause_coun_inv >= effect_inv_soc else effect_inv_soc
+    else:
+        count_inv_soc = 0
+    cause_coun_cus = (cause_tab.Label_level2 == 'Customers').sum()
+    effect_coun_cus = (effect_tab.Label_level2 == 'Non-performance').sum()
+    if (cause_coun_cus > 0) and (effect_coun_cus > 0):
+        count_NP_cus = cause_coun_cus if cause_coun_cus >= effect_coun_cus else effect_coun_cus
+    else:
+        count_NP_cus = 0
+    effect_cus_inv = (effect_tab.Label_level2 == 'Investors').sum()
+    if (cause_coun_cus > 0) and (effect_cus_inv > 0):
+        count_cus_inv = cause_coun_cus if cause_coun_cus >= effect_cus_inv else effect_cus_inv
+    else:
+        count_cus_inv = 0
+    effect_cus_cus = (effect_tab.Label_level2 == 'Customers').sum()
+    if (cause_coun_cus > 0) and (effect_cus_cus > 0):
+        count_cus_cus = cause_coun_cus if cause_coun_cus >= effect_cus_cus else effect_cus_cus
+    else:
+        count_cus_cus = 0
+    effect_cus_emp = (effect_tab.Label_level2 == 'Employees').sum()
+    if (cause_coun_cus > 0) and (effect_cus_emp > 0):
+        count_cus_emp = cause_coun_cus if cause_coun_cus >= effect_cus_emp else effect_cus_emp
+    else:
+        count_cus_emp = 0
+    effect_cus_soc = (effect_tab.Label_level2 == 'Society').sum()
+    if (cause_coun_cus > 0) and (effect_cus_soc > 0):
+        count_cus_soc = cause_coun_cus if cause_coun_cus >= effect_cus_soc else effect_cus_soc
+    else:
+        count_cus_soc = 0
+    cause_coun_emp = (cause_tab.Label_level2 == 'Employees').sum()
+    effect_coun_emp = (effect_tab.Label_level2 == 'Non-performance').sum()
+    if (cause_coun_emp > 0) and (effect_coun_emp > 0):
+        count_NP_emp = cause_coun_emp if cause_coun_emp >= effect_coun_emp else effect_coun_emp
+    else:
+        count_NP_emp = 0
+    effect_emp_inv = (effect_tab.Label_level2 == 'Investors').sum()
+    if (cause_coun_emp > 0) and (effect_emp_inv > 0):
+        count_emp_inv = cause_coun_emp if cause_coun_emp >= effect_emp_inv else effect_emp_inv
+    else:
+        count_emp_inv = 0
+    effect_emp_cus = (effect_tab.Label_level2 == 'Customers').sum()
+    if (cause_coun_emp > 0) and (effect_emp_cus > 0):
+        count_emp_cus = cause_coun_emp if cause_coun_emp >= effect_emp_cus else effect_emp_cus
+    else:
+        count_emp_cus = 0
+    effect_emp_emp = (effect_tab.Label_level2 == 'Employees').sum()
+    if (cause_coun_emp > 0) and (effect_emp_emp > 0):
+        count_emp_emp = cause_coun_emp if cause_coun_emp >= effect_emp_emp else effect_emp_emp
+    else:
+        count_emp_emp = 0
+    effect_emp_soc = (effect_tab.Label_level2 == 'Society').sum()
+    if (cause_coun_emp > 0) and (effect_emp_soc > 0):
+        count_emp_soc = cause_coun_emp if cause_coun_emp >= effect_emp_soc else effect_emp_soc
+    else:
+        count_emp_soc = 0
+    cause_coun_soc = (cause_tab.Label_level2 == 'Society').sum()
+    effect_coun_soc = (effect_tab.Label_level2 == 'Non-performance').sum()
+    if (cause_coun_soc > 0) and (effect_coun_soc > 0):
+        count_NP_soc = cause_coun_soc if cause_coun_soc >= effect_coun_soc else effect_coun_soc
+    else:
+        count_NP_soc = 0
+    effect_soc_inv = (effect_tab.Label_level2 == 'Investors').sum()
+    if (cause_coun_soc > 0) and (effect_soc_inv > 0):
+        count_soc_inv = cause_coun_soc if cause_coun_soc >= effect_soc_inv else effect_soc_inv
+    else:
+        count_soc_inv = 0
+    effect_soc_cus = (effect_tab.Label_level2 == 'Customers').sum()
+    if (cause_coun_soc > 0) and (effect_soc_cus > 0):
+        count_soc_cus = cause_coun_soc if cause_coun_soc >= effect_soc_cus else effect_soc_cus
+    else:
+        count_soc_cus = 0
+    effect_soc_emp = (effect_tab.Label_level2 == 'Employees').sum()
+    if (cause_coun_soc > 0) and (effect_soc_emp > 0):
+        count_soc_emp = cause_coun_soc if cause_coun_soc >= effect_soc_emp else effect_soc_emp
+    else:
+        count_soc_emp = 0
+    effect_soc_soc = (effect_tab.Label_level2 == 'Society').sum()
+    if (cause_coun_soc > 0) and (effect_soc_soc > 0):
+        count_soc_soc = cause_coun_soc if cause_coun_soc >= effect_soc_soc else effect_soc_soc
+    else:
+        count_soc_soc = 0
+    count_NP_NP = count_NP_NP + count_NP
+    count_NP_investor = count_NP_investor + count_NP_inv
+    count_NP_customer = count_NP_customer + count_NP_cus
+    count_NP_employees = count_NP_employees + count_NP_emp
+    count_NP_society = count_NP_society + count_NP_soc
+    count_inv_np = count_inv_np + count_NP_inv
+    count_inv_investor = count_inv_investor + count_inv_inv
+    count_inv_customer = count_inv_customer + count_inv_cus
+    count_inv_employee = count_inv_employee + count_inv_emp
+    count_inv_society = count_inv_society + count_inv_soc
+    count_cus_np = count_cus_np + count_NP_cus
+    count_cus_investor = count_cus_investor + count_cus_inv
+    count_cus_customer = count_cus_customer + count_cus_cus
+    count_cus_employee = count_cus_employee + count_cus_emp
+    count_cus_society = count_cus_society + count_cus_soc
+    count_emp_np = count_emp_np + count_NP_emp
+    count_emp_investor = count_emp_investor + count_emp_inv
+    count_emp_customer = count_emp_customer + count_emp_cus
+    count_emp_employee = count_emp_employee + count_emp_emp
+    count_emp_society = count_emp_society + count_emp_soc
+    count_soc_np = count_soc_np + count_NP_soc
+    count_soc_investor = count_soc_investor + count_soc_inv
+    count_soc_customer = count_soc_customer + count_soc_cus
+    count_soc_employee = count_soc_employee + count_soc_emp
+    count_soc_society = count_soc_society + count_soc_soc
+    df_tab = pd.DataFrame(columns = ['Non-performance', 'Investors', 'Customers', 'Employees', 'Society'],index=['Non-performance', 'Investors', 'Customers', 'Employees', 'Society'], dtype=object)
+    df_tab.loc['Non-performance'] = [count_NP_NP, count_NP_investor, count_NP_customer, count_NP_employees, count_NP_society]
+    df_tab.loc['Investors'] = [count_inv_np, count_inv_investor, count_inv_customer, count_inv_employee, count_inv_society]
+    df_tab.loc['Customers'] = [count_cus_np, count_cus_investor, count_cus_customer, count_cus_employee, count_cus_society]
+    df_tab.loc['Employees'] = [count_emp_np, count_emp_investor, count_emp_customer, count_emp_employee, count_emp_society]
+    df_tab.loc['Society'] = [count_soc_np, count_soc_investor, count_soc_customer, count_soc_employee, count_soc_society]
+#  df_tab = pd.DataFrame({
+#      'Non-performance': [count_NP_NP, count_NP_investor, count_NP_customer, count_NP_employees, count_NP_society],
+#      'Investors': [count_inv_np, count_inv_investor, count_inv_customer, count_inv_employee, count_inv_society],
+#      'Customers': [count_cus_np, count_cus_investor, count_cus_customer, count_cus_employee, count_cus_society],
+#      'Employees': [count_emp_np, count_emp_investor, count_emp_customer, count_emp_employee, count_emp_society],
+#      'Society': [count_soc_np, count_soc_investor, count_soc_customer, count_soc_employee, count_soc_society]},
+#       index=['Non-performance', 'Investors', 'Customers', 'Employees', 'Society'])
+  df_tab.to_csv('final_data.csv')
+  df = pd.read_csv('final_data.csv', index_col=0)
+  # Convert to JSON format
+  json_data = []
+  for row in df.index:
+    for col in df.columns:
+      json_data.append({
+            'source': row,
+            'target': col,
+            'value': int(df.loc[row, col])
+        })
+  # Write JSON to file
+  with open('smalljson.json', 'w') as f:
+    json.dump(json_data, f)
+  csv_file = "predictions.csv"
+  json_file = "ch.json"
+  # Open the CSV file and read the data
+  with open(csv_file, "r") as f:
+    csv_data = csv.DictReader(f)
+    # Convert the CSV data to a list of dictionaries
+    data_list = []
+    for row in csv_data:
+        data_list.append(dict(row))
+  # Convert the list of dictionaries to JSON
+  json_data = json.dumps(data_list)
+  # Write the JSON data to a file
+  with open(json_file, "w") as f:
+    f.write(json_data)
+  def convert_df(df):
+  #IMPORTANT: Cache the conversion to prevent computation on every rerun
+    return df.to_csv().encode('utf-8')
+  csv1 = convert_df(df_final.astype(str))
+  csv2 = convert_df(df_tab.astype(str))
+  with st.container():
+    st.download_button(label="Download the detailed result table",data=csv1,file_name='results.csv',mime='text/csv')
+    st.download_button(label="Download the result table",data=csv2,file_name='final_data.csv',mime='text/csv')
+#   # LINK TO THE CSS FILE
+#  def tree_css(file_name):
+#   with open('/Users/seetha/Downloads/tree.css')as f:
+#    st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html = True)
+#
+#  def div_css(file_name):
+#   with open('/Users/seetha/Downloads/div.css')as f:
+#    st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html = True)
+#
+#  def side_css(file_name):
+#   with open('/Users/seetha/Downloads/side.css')as f:
+#    st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html = True)
+#
+#  tree_css('tree.css')
+#  div_css('div.css')
+#  side_css('side.css')
+    STREAMLIT_STATIC_PATH = pathlib.Path(st.__path__[0]) / 'static'
+    CSS_PATH = (STREAMLIT_STATIC_PATH / "css1")
+    if not CSS_PATH.is_dir():
+      CSS_PATH.mkdir()
+    css_file = CSS_PATH / "tree.css"
+    css_file1 = CSS_PATH / "div.css"
+    css_file2 = CSS_PATH / "side.css"
+    jso_file = CSS_PATH / "smalljson.json"
+    if not css_file.exists():
+      shutil.copy("tree.css", css_file)
+      shutil.copy("div.css", css_file1)
+      shutil.copy("side.css", css_file2)
+      shutil.copy("smalljson.json", jso_file)
+  HtmlFile = open("index.html", 'r', encoding='utf-8')
+  source_code = HtmlFile.read()
+  #print(source_code)
+  components.html(source_code)
+#   # Define your javascript
+#   my_js = """
+#     alert("Hello World");
+#   """
+  # Wrapt the javascript as html code
+  #my_html = f"<script>{my_js}</script>"
+#   with st.container():
+#     # Execute your app
+#     st.title("Visualization example")
+# #     components.html(source_code)
+#     #html(my_html)
+#     #webbrowser.open('https://webpages.charlotte.edu/ltotapal/')
+#     # embed streamlit docs in a streamlit app
+#     #components.iframe("https://webpages.charlotte.edu/ltotapal/")
+#     st.markdown('<a href="https://webpages.charlotte.edu/ltotapal/" target="_self">Text to Knowledge graph link</a>', unsafe_allow_html=True)
+# Only start the app when this file is executed as a script, not when it is imported.
+if __name__ == '__main__':
+    main()