|
|
|
import glob, os, sys; sys.path.append('../udfPreprocess') |
|
|
|
|
|
import udfPreprocess.docPreprocessing as pre |
|
import udfPreprocess.cleaning as clean |
|
|
|
|
|
import seaborn as sns |
|
from pandas import DataFrame |
|
from sentence_transformers import SentenceTransformer, CrossEncoder, util |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
|
|
from transformers import pipeline |
|
import matplotlib.pyplot as plt |
|
import numpy as np |
|
import streamlit as st |
|
import pandas as pd |
|
from rank_bm25 import BM25Okapi |
|
from sklearn.feature_extraction import _stop_words |
|
import string |
|
from tqdm.autonotebook import tqdm |
|
import numpy as np |
|
import urllib.request |
|
import ast |
|
import tempfile |
|
import sqlite3 |
|
import json |
|
import urllib.request |
|
import ast |
|
def app():
    """Streamlit page: check coherence of a policy document with a country's NDCs.

    Flow:
      1. Sidebar: pick a country (name -> ISO code from ndcs/countryList.txt).
      2. Main pane: upload a document (pdf/docx/txt) or pick a bundled example.
      3. Load country-specific CCA/CCM reference sentences, embed both the
         reference labels and the document paragraphs with a
         SentenceTransformer, and show every paragraph whose cosine similarity
         to a label exceeds a fixed threshold.
    """

    st.sidebar.title('Check Coherence')
    st.sidebar.write(' ')

    # countryList.txt holds the repr of a dict {country name: country code};
    # ast.literal_eval parses it safely (no arbitrary code execution).
    with open('ndcs/countryList.txt') as dfile:
        countryList = dfile.read()

    countryList = ast.literal_eval(countryList)
    countrynames = list(countryList.keys())

    option = st.sidebar.selectbox('Select Country', (countrynames))
    countryCode = countryList[option]

    with st.container():
        st.markdown("<h1 style='text-align: center; color: black;'> Check Coherence of Policy Document with NDCs</h1>", unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=True):
        st.write(
            """
            The *Check Coherence* app is an easy-to-use interface built in Streamlit for doing analysis of policy document and finding the coherence between NDCs/New-Updated NDCs- developed by GIZ Data and the Sustainable Development Solution Network.
            """
        )

    st.markdown("")
    st.markdown("")
    st.markdown("## 📌 Step One: Upload document of the country selected ")

    with st.container():
        docs = None

        choice = st.radio(label='Select the Document',
                          help='You can upload the document or else you can try a example document.',
                          options=('Upload Document', 'Try Example'),
                          horizontal=True)

        if choice == 'Upload Document':
            uploaded_file = st.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
            if uploaded_file is not None:
                # Persist the upload to a real file because the downstream
                # loader works on paths; processing happens inside the `with`
                # so the temp file still exists when it is read.
                with tempfile.NamedTemporaryFile(mode="wb") as temp:
                    bytes_data = uploaded_file.getvalue()
                    temp.write(bytes_data)

                    st.write("Uploaded Filename: ", uploaded_file.name)
                    file_name = uploaded_file.name
                    file_path = temp.name
                    docs = pre.load_document(file_path, file_name)
                    haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)

        else:
            option = st.selectbox('Select the example document',
                                  ('South Africa:Low Emission strategy',
                                   'Ethiopia: 10 Year Development Plan'))
            # BUG FIX: the original compared strings with `is` (identity),
            # which is implementation-dependent; use `==` (equality).
            if option == 'South Africa:Low Emission strategy':
                file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
                # Example docs override the sidebar country selection.
                countryCode = countryList['South Africa']
                st.write("Selected document:", file_name.split('/')[1])
            else:
                file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
                countryCode = countryList['Ethiopia']
                st.write("Selected document:", file_name.split('/')[1])

            if option is not None:
                docs = pre.load_document(file_path, file_name)
                haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)

    # Reference sentences per NDC subcategory, stored as dict reprs.
    with open('ndcs/cca.txt', encoding='utf-8', errors='ignore') as dfile:
        cca_sent = dfile.read()

    cca_sent = ast.literal_eval(cca_sent)

    with open('ndcs/ccm.txt', encoding='utf-8', errors='ignore') as dfile:
        ccm_sent = dfile.read()

    ccm_sent = ast.literal_eval(ccm_sent)

    with open('ndcs/countryList.txt') as dfile:
        countryList = dfile.read()

    countryList = ast.literal_eval(countryList)

    def get_document(countryCode: str):
        """Fetch one country's NDC classification from the klimalog open-data set.

        Returns a nested dict {category: {subcategory: classification}}, or
        None when the 'NDCs' document type or the country code is absent.
        """
        link = "https://klimalog.die-gdi.de/ndc/open-data/dataset.json"
        with urllib.request.urlopen(link) as urlfile:
            data = json.loads(urlfile.read())
        categoriesData = {}
        categoriesData['categories'] = data['categories']
        categoriesData['subcategories'] = data['subcategories']
        documentType = 'NDCs'
        if documentType in data.keys():
            if countryCode in data[documentType].keys():
                get_dict = {}
                for key, value in data[documentType][countryCode].items():
                    # Metadata keys are kept verbatim; everything else is a
                    # subcategory whose payload carries a 'classification'.
                    if key not in ['country_name', 'region_id', 'region_name']:
                        get_dict[key] = value['classification']
                    else:
                        get_dict[key] = value
            else:
                return None
        else:
            return None

        # Regroup flat subcategory results under their parent category.
        country = {}
        for key in categoriesData['categories']:
            country[key] = {}
        for key, value in categoriesData['subcategories'].items():
            country[value['category']][key] = get_dict[key]

        return country

    def _countrySpecific(sent_dict, category, threshold, countryCode):
        """Shared filter: keep sentences whose classification id exceeds threshold.

        For each subcategory key, look up the country's classification id under
        `category` and, when id > threshold, select the sentence variant at
        that id from the reference dict.
        """
        temp = {}
        doc = get_document(countryCode)
        for key, value in sent_dict.items():
            id_ = doc[category][key]['id']
            if id_ > threshold:
                temp[key] = value['id'][id_]
        return temp

    def countrySpecificCCA(cca_sent, threshold, countryCode):
        """Country-specific Climate Change Adaptation reference sentences."""
        return _countrySpecific(cca_sent, 'climate change adaptation', threshold, countryCode)

    def countrySpecificCCM(ccm_sent, threshold, countryCode):
        """Country-specific Climate Change Mitigation reference sentences."""
        return _countrySpecific(ccm_sent, 'climate change mitigation', threshold, countryCode)

    if docs is not None:
        sent_cca = countrySpecificCCA(cca_sent, 1, countryCode)
        sent_ccm = countrySpecificCCM(ccm_sent, 1, countryCode)

        # Cache the (expensive) model load across Streamlit reruns.
        @st.cache(allow_output_mutation=True)
        def load_sentenceTransformer(name):
            return SentenceTransformer(name)

        model = load_sentenceTransformer('all-MiniLM-L6-v2')

        document_embeddings = model.encode(paraList, show_progress_bar=True)

        genre = st.radio("Select Category", ('Climate Change Adaptation', 'Climate Change Mitigation'))
        # Only the reference dict differs between the two categories; the
        # matching pipeline below is identical (the original duplicated it).
        if genre == 'Climate Change Adaptation':
            sent_dict = sent_cca
        else:
            sent_dict = sent_ccm

        sent_labels = list(sent_dict.values())
        label_embeddings = model.encode(sent_labels, show_progress_bar=True)
        similarity_high_threshold = 0.55
        similarity_matrix = cosine_similarity(label_embeddings, document_embeddings)
        # Indices of every (label, paragraph) pair above the threshold.
        label_indices, paragraph_indices = np.where(similarity_matrix > similarity_high_threshold)

        positive_indices = list(zip(label_indices.tolist(), paragraph_indices.tolist()))

        for _label_idx, _paragraph_idx in positive_indices:
            st.write("This paragraph: \n")
            st.write(paraList[_paragraph_idx])
            st.write(f"Is relevant to: \n {list(sent_dict.keys())[_label_idx]}")
            st.write('-'*10)