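"""Streamlit page: check the coherence of a policy document with a country's NDCs.

Paragraphs of an uploaded (or sample) policy document are embedded with a
sentence-transformer and matched against country-specific NDC statements on
climate change adaptation and mitigation.
"""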
# set path so the shared preprocessing helpers are importable
import glob, os, sys; sys.path.append('../udfPreprocess')

# import helpers
import udfPreprocess.docPreprocessing as pre
import udfPreprocess.cleaning as clean

# import needed libraries
import seaborn as sns
from pandas import DataFrame
from sentence_transformers import SentenceTransformer, CrossEncoder, util
from sklearn.metrics.pairwise import cosine_similarity
# from keybert import KeyBERT
from transformers import pipeline
import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
import pandas as pd
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import urllib.request
import ast
import tempfile
import sqlite3
import json

def app():

    # Sidebar: country selection
    st.sidebar.title('Check Coherence')
    st.sidebar.write(' ')
    with open('ndcs/countryList.txt') as dfile:
        countryList = dfile.read()
    countryList = ast.literal_eval(countryList)
    countrynames = list(countryList.keys())

    option = st.sidebar.selectbox('Select Country', countrynames)
    countryCode = countryList[option]
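    # countryList.txt holds a python-literal {country name: country code}
    # mapping, e.g. {'South Africa': 'ZAF', ...} (the exact code format is an
    # assumption); the code is used below to look the country up in the NDC
    # dataset.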

    with st.container():
        st.markdown("<h1 style='text-align: center; color: black;'> Check Coherence of Policy Document with NDCs</h1>", unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=True):
        st.write(
            """
            The *Check Coherence* app is an easy-to-use interface built in Streamlit
            for analysing a policy document and checking its coherence with the
            country's NDCs (including new and updated NDCs). Developed by GIZ Data
            and the Sustainable Development Solutions Network (SDSN).
            """
        )

    st.markdown("")
    st.markdown("")
    st.markdown("## 📌 Step One: Upload document of the country selected ")
    with st.container():
        docs = None
        # ask the user to either upload a document or try an example document
        choice = st.radio(label='Select the Document',
                          help='You can upload the document '
                               'or else you can try an example document.',
                          options=('Upload Document', 'Try Example'),
                          horizontal=True)

        if choice == 'Upload Document':
            uploaded_file = st.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
            if uploaded_file is not None:
                with tempfile.NamedTemporaryFile(mode="wb") as temp:
                    bytes_data = uploaded_file.getvalue()
                    temp.write(bytes_data)
                    st.write("Uploaded Filename: ", uploaded_file.name)
                    file_name = uploaded_file.name
                    file_path = temp.name
                    docs = pre.load_document(file_path, file_name)
                    haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
        else:
            # list the example documents
            option = st.selectbox('Select the example document',
                                  ('South Africa:Low Emission strategy',
                                   'Ethiopia: 10 Year Development Plan'))
            if option == 'South Africa:Low Emission strategy':
                file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
                countryCode = countryList['South Africa']
                st.write("Selected document:", file_name.split('/')[1])
            else:
                file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
                countryCode = countryList['Ethiopia']
                st.write("Selected document:", file_name.split('/')[1])

            if option is not None:
                docs = pre.load_document(file_path, file_name)
                haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
    # load the NDC label sentences for climate change adaptation (CCA) and
    # mitigation (CCM), stored as python literals
    with open('ndcs/cca.txt', encoding='utf-8', errors='ignore') as dfile:
        cca_sent = dfile.read()
    cca_sent = ast.literal_eval(cca_sent)

    with open('ndcs/ccm.txt', encoding='utf-8', errors='ignore') as dfile:
        ccm_sent = dfile.read()
    ccm_sent = ast.literal_eval(ccm_sent)

    with open('ndcs/countryList.txt') as dfile:
        countryList = dfile.read()
    countryList = ast.literal_eval(countryList)
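    # Shape assumed for both label files (inferred from how
    # countrySpecificCCA/CCM index into them below):
    #   label -> {'id': {classification_id: label_sentence, ...}}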
    def get_document(countryCode: str):
        # fetch the NDC Explorer open dataset and reshape the entries for the
        # selected country into {category: {subcategory: classification}}
        link = "https://klimalog.die-gdi.de/ndc/open-data/dataset.json"
        with urllib.request.urlopen(link) as urlfile:
            data = json.loads(urlfile.read())
        categoriesData = {}
        categoriesData['categories'] = data['categories']
        categoriesData['subcategories'] = data['subcategories']
        keys_sub = categoriesData['subcategories'].keys()
        documentType = 'NDCs'
        if documentType in data.keys():
            if countryCode in data[documentType].keys():
                get_dict = {}
                for key, value in data[documentType][countryCode].items():
                    if key not in ['country_name', 'region_id', 'region_name']:
                        get_dict[key] = value['classification']
                    else:
                        get_dict[key] = value
            else:
                return None
        else:
            return None

        country = {}
        for key in categoriesData['categories']:
            country[key] = {}
        for key, value in categoriesData['subcategories'].items():
            country[value['category']][key] = get_dict[key]
        return country
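    # The returned mapping looks like (hypothetical excerpt, keys taken from
    # the lookups below):
    #   {'climate change adaptation': {'<subcategory>': {'id': 2, ...}, ...},
    #    'climate change mitigation': {...}, ...}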
    # country_ndc = get_document(countryList[option])
    def countrySpecificCCA(cca_sent, threshold, countryCode):
        # keep only the CCA labels whose NDC classification id for this
        # country exceeds the threshold
        temp = {}
        doc = get_document(countryCode)
        for key, value in cca_sent.items():
            id_ = doc['climate change adaptation'][key]['id']
            if id_ > threshold:
                temp[key] = value['id'][id_]
        return temp

    def countrySpecificCCM(ccm_sent, threshold, countryCode):
        # same filtering for the mitigation labels
        temp = {}
        doc = get_document(countryCode)
        for key, value in ccm_sent.items():
            id_ = doc['climate change mitigation'][key]['id']
            if id_ > threshold:
                temp[key] = value['id'][id_]
        return temp
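    # Example (shapes inferred from the code, values hypothetical): with
    # threshold=1, a label only survives if its classification id is 2+, e.g.
    #   cca_sent = {'some label': {'id': {2: 'adaptation sentence'}}}
    #   doc['climate change adaptation']['some label'] == {'id': 2, ...}
    #   countrySpecificCCA(cca_sent, 1, code) == {'some label': 'adaptation sentence'}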
    if docs is not None:
        sent_cca = countrySpecificCCA(cca_sent, 1, countryCode)
        sent_ccm = countrySpecificCCM(ccm_sent, 1, countryCode)
        # st.write(sent_ccm)

        @st.cache(allow_output_mutation=True)
        def load_sentenceTransformer(name):
            return SentenceTransformer(name)

        model = load_sentenceTransformer('all-MiniLM-L6-v2')
        document_embeddings = model.encode(paraList, show_progress_bar=True)
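        # all-MiniLM-L6-v2 maps each paragraph to a 384-dimensional vector;
        # the same encoder is reused below for the NDC label sentences so the
        # two sets of embeddings live in the same space.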
        genre = st.radio("Select Category", ('Climate Change Adaptation', 'Climate Change Mitigation'))
        if genre == 'Climate Change Adaptation':
            sent_dict = sent_cca
        else:
            sent_dict = sent_ccm

        # embed the country-specific NDC labels and keep every
        # (label, paragraph) pair whose cosine similarity clears the threshold
        sent_labels = list(sent_dict.values())
        label_embeddings = model.encode(sent_labels, show_progress_bar=True)
        similarity_high_threshold = 0.55
        similarity_matrix = cosine_similarity(label_embeddings, document_embeddings)
        label_indices, paragraph_indices = np.where(similarity_matrix > similarity_high_threshold)
        positive_indices = list(zip(label_indices.tolist(), paragraph_indices.tolist()))
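        # np.where on the boolean mask returns aligned row/column index
        # arrays, e.g. a 2x3 matrix [[0.2, 0.6, 0.1], [0.7, 0.3, 0.56]] with
        # threshold 0.55 yields the pairs (0, 1), (1, 0), (1, 2)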
        for _label_idx, _paragraph_idx in positive_indices:
            st.write("This paragraph: \n")
            st.write(paraList[_paragraph_idx])
            st.write(f"Is relevant to: \n {list(sent_dict.keys())[_label_idx]}")
            st.write('-' * 10)
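
# Usage sketch (assumption: like the other appStore pages, this module is
# imported by the Space's entry point and rendered via its app() function):
#   from appStore import coherence
#   coherence.app()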