# Check Coherence of Policy Document with NDCs

# set path
import glob, os, sys; sys.path.append('../udfPreprocess')

#import helper
import udfPreprocess.docPreprocessing as pre
import udfPreprocess.cleaning as clean

#import needed libraries
import seaborn as sns
from pandas import DataFrame
from sentence_transformers import SentenceTransformer, CrossEncoder, util
from sklearn.metrics.pairwise import cosine_similarity
# from keybert import KeyBERT
from transformers import pipeline
import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
import pandas as pd 
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import numpy as np
import urllib.request
import ast
import tempfile
import sqlite3
import json
import urllib.request
import ast
def app() -> None:
    """Render the "Check Coherence" Streamlit page.

    The page lets the user pick a country, upload a policy document (or use
    one of the bundled examples), and then lists every paragraph of that
    document whose sentence embedding is similar (cosine similarity > 0.55)
    to one of the country's NDC statements for the selected category
    (climate change adaptation or mitigation).

    The function returns nothing; all output goes to the Streamlit page.
    """

    def load_literal(path, **open_kwargs):
        # The ndcs/*.txt files hold Python literals (dicts); literal_eval
        # parses them without executing code.
        with open(path, **open_kwargs) as dfile:
            return ast.literal_eval(dfile.read())

    def get_document(countryCode: str):
        """Download the NDC dataset and return the country's classification.

        Returns a dict ``{category: {subcategory: classification}}`` or
        ``None`` when the dataset has no 'NDCs' entry for ``countryCode``.
        """
        link = "https://klimalog.die-gdi.de/ndc/open-data/dataset.json"
        with urllib.request.urlopen(link) as urlfile:
            data = json.loads(urlfile.read())

        documentType = 'NDCs'
        if documentType not in data or countryCode not in data[documentType]:
            return None

        # These three entries are stored as plain values; every other entry
        # carries its payload under the 'classification' key.
        plain_keys = ('country_name', 'region_id', 'region_name')
        classifications = {
            key: (value if key in plain_keys else value['classification'])
            for key, value in data[documentType][countryCode].items()
        }

        country = {category: {} for category in data['categories']}
        for sub_key, sub_info in data['subcategories'].items():
            country[sub_info['category']][sub_key] = classifications[sub_key]
        return country

    def country_specific_sentences(sentences, category, threshold, ndc_doc):
        """Pick, per subcategory, the sentence matching the country's id.

        Only subcategories whose classification id in ``ndc_doc`` is greater
        than ``threshold`` are kept.
        """
        selected = {}
        for key, value in sentences.items():
            id_ = ndc_doc[category][key]['id']
            if id_ > threshold:
                selected[key] = value['id'][id_]
        return selected

    # Sidebar
    st.sidebar.title('Check Coherence')
    st.sidebar.write(' ')
    countryList = load_literal('ndcs/countryList.txt')
    countrynames = list(countryList.keys())

    selected_country = st.sidebar.selectbox('Select Country', countrynames)
    countryCode = countryList[selected_country]

    with st.container():
        st.markdown("<h1 style='text-align: center; color: black;'> Check Coherence of Policy Document with NDCs</h1>", unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=True):

        st.write(
            """     
            The *Check Coherence* app is an easy-to-use interface built in Streamlit for doing analysis of policy document and finding the coherence between NDCs/New-Updated NDCs- developed by GIZ Data and the Sustainable Development Solution Network.
            """
        )

        st.markdown("")

    st.markdown("")
    st.markdown("##  📌 Step One: Upload document of the country selected ")

    with st.container():
        docs = None
        paraList = []

        # asking user for either upload or select existing doc
        choice = st.radio(
            label='Select the Document',
            help='You can upload the document \
                              or else you can try a example document.',
            options=('Upload Document', 'Try Example'),
            horizontal=True,
        )

        if choice == 'Upload Document':
            uploaded_file = st.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
            if uploaded_file is not None:
                with tempfile.NamedTemporaryFile(mode="wb") as temp:
                    temp.write(uploaded_file.getvalue())
                    # The loader re-opens the file by path, so the buffered
                    # bytes must reach the disk before it is read back.
                    temp.flush()

                    st.write("Uploaded Filename: ", uploaded_file.name)
                    file_name = uploaded_file.name
                    file_path = temp.name
                    docs = pre.load_document(file_path, file_name)
                    _, _, _, paraList = clean.preprocessing(docs)
        else:
            # Example label -> (bundled file, country the example belongs to)
            examples = {
                'South Africa:Low Emission strategy': (
                    'sample/South Africa_s Low Emission Development Strategy.txt',
                    'South Africa',
                ),
                'Ethiopia: 10 Year Development Plan': (
                    'sample/Ethiopia_s_2021_10 Year Development Plan.txt',
                    'Ethiopia',
                ),
            }
            example = st.selectbox('Select the example document', tuple(examples))
            if example is not None:
                # Lookup by equality; comparing strings with `is` tests
                # object identity and may pick the wrong example.
                file_path, example_country = examples[example]
                file_name = file_path
                # The example document overrides the sidebar country.
                countryCode = countryList[example_country]
                st.write("Selected document:", file_name.split('/')[1])
                docs = pre.load_document(file_path, file_name)
                _, _, _, paraList = clean.preprocessing(docs)

        cca_sent = load_literal('ndcs/cca.txt', encoding='utf-8', errors='ignore')
        ccm_sent = load_literal('ndcs/ccm.txt', encoding='utf-8', errors='ignore')

        if docs is None:
            return

        # Fetch the remote dataset once per run; both categories share it.
        ndc_doc = get_document(countryCode)
        if ndc_doc is None:
            st.warning("No NDC data is available for the selected country.")
            return

        if len(paraList) == 0:
            st.warning("No paragraphs could be extracted from the document.")
            return

        @st.cache(allow_output_mutation=True)
        def load_sentenceTransformer(name):
            return SentenceTransformer(name)

        model = load_sentenceTransformer('all-MiniLM-L6-v2')
        document_embeddings = model.encode(paraList, show_progress_bar=True)

        genre = st.radio("Select Category", ('Climate Change Adaptation', 'Climate Change Mitigation'))
        if genre == 'Climate Change Adaptation':
            sent_dict = country_specific_sentences(
                cca_sent, 'climate change adaptation', 1, ndc_doc)
        else:
            sent_dict = country_specific_sentences(
                ccm_sent, 'climate change mitigation', 1, ndc_doc)

        label_names = list(sent_dict)
        sent_labels = list(sent_dict.values())
        if not sent_labels:
            # Nothing to compare against; cosine_similarity rejects empty input.
            st.warning("No NDC statements found for this country and category.")
            return

        label_embeddings = model.encode(sent_labels, show_progress_bar=True)
        similarity_high_threshold = 0.55
        similarity_matrix = cosine_similarity(label_embeddings, document_embeddings)
        # Rows index NDC statements, columns index document paragraphs.
        label_indices, paragraph_indices = np.where(
            similarity_matrix > similarity_high_threshold)

        for label_idx, paragraph_idx in zip(label_indices.tolist(),
                                            paragraph_indices.tolist()):
            st.write("This paragraph: \n")
            st.write(paraList[paragraph_idx])
            st.write(f"Is relevant to: \n {label_names[label_idx]}")
            st.write('-'*10)