ppsingh committed on
Commit 467673e
1 Parent(s): 2451417

Delete utils

utils/__init__.py DELETED
@@ -1 +0,0 @@
- # adding for package implementation
 
 
utils/adapmit_classifier.py DELETED
@@ -1,99 +0,0 @@
- from typing import List, Tuple
- from typing_extensions import Literal
- import logging
- import pandas as pd
- from pandas import DataFrame, Series
- from utils.config import getconfig
- from utils.preprocessing import processingpipeline
- import streamlit as st
- from transformers import pipeline
-
- @st.cache_resource
- def load_adapmitClassifier(config_file:str = None, classifier_name:str = None):
-     """
-     Loads the document classifier as a transformers text-classification
-     pipeline; the model name/path on the HF Hub is used to fetch the model
-     object. Either a config file or a model name must be passed.
-     1. https://docs.haystack.deepset.ai/reference/document-classifier-api
-     2. https://docs.haystack.deepset.ai/docs/document_classifier
-     Params
-     --------
-     config_file: config file path from which to read the model name
-     classifier_name: if a model name is passed it takes priority; if not
-         given, the model name is read from the config file, else a warning
-         is logged.
-     Return: document classifier model
-     """
-     if not classifier_name:
-         if not config_file:
-             logging.warning("Pass either model name or config file")
-             return
-         else:
-             config = getconfig(config_file)
-             classifier_name = config.get('adapmit','MODEL')
-
-     logging.info("Loading Adaptation Mitigation classifier")
-     doc_classifier = pipeline("text-classification",
-                               model=classifier_name,
-                               return_all_scores=True,
-                               function_to_apply="sigmoid")
-
-     return doc_classifier
-
-
- @st.cache_data
- def adapmit_classification(haystack_doc:pd.DataFrame,
-                            threshold:float = 0.5,
-                            classifier_model:pipeline = None
-                            ) -> DataFrame:
-     """
-     Text classification on the texts provided. The multilabel classifier
-     assigns each paragraph the Adaptation/Mitigation categories whose score
-     is at or above the threshold.
-     Params
-     ---------
-     haystack_doc: Dataframe from the output of the preprocessing pipeline;
-         the 'text' column holds the paragraphs to classify.
-     threshold: minimum score for a label to be kept in the results
-     classifier_model: the classifier model can be passed directly, which takes
-         priority; otherwise the model is fetched from the streamlit session.
-         When running within streamlit avoid passing the model directly.
-     Returns
-     ----------
-     haystack_doc: the input dataframe with an added 'Adapt-Mitig Label' column
-         holding, per paragraph, the list of labels above the threshold.
-     """
-     logging.info("Working on Adaptation-Mitigation Identification")
-     haystack_doc['Adapt-Mitig Label'] = 'NA'
-     # df1 = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
-     # df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
-
-     if not classifier_model:
-         classifier_model = st.session_state['adapmit_classifier']
-
-     predictions = classifier_model(list(haystack_doc.text))
-     # converting the predictions to the desired format
-     list_ = []
-     for i in range(len(predictions)):
-         temp = predictions[i]
-         placeholder = {}
-         for j in range(len(temp)):
-             placeholder[temp[j]['label']] = temp[j]['score']
-         list_.append(placeholder)
-     labels_ = [{**list_[l]} for l in range(len(predictions))]
-     truth_df = DataFrame.from_dict(labels_)
-     truth_df = truth_df.round(2)
-     truth_df = truth_df.astype(float) >= threshold
-     truth_df = truth_df.astype(str)
-     categories = list(truth_df.columns)
-     truth_df['Adapt-Mitig Label'] = truth_df.apply(lambda x: {i if x[i]=='True'
-                                         else None for i in categories}, axis=1)
-     truth_df['Adapt-Mitig Label'] = truth_df.apply(lambda x:
-                                         list(x['Adapt-Mitig Label'] - {None}), axis=1)
-     haystack_doc['Adapt-Mitig Label'] = list(truth_df['Adapt-Mitig Label'])
-     # df = pd.concat([df,df1])
-
-     return haystack_doc
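
For reference, a minimal usage sketch of the module above. The code reads 'paramconfig.cfg' and an [adapmit] MODEL entry, but the config contents are not part of this commit, and the sample paragraph below is made up:

# Standalone sketch (outside streamlit): load the multilabel classifier and
# label a toy paragraph; the config file is assumed to define [adapmit] MODEL.
import pandas as pd
from utils.adapmit_classifier import load_adapmitClassifier, adapmit_classification

classifier = load_adapmitClassifier(config_file='paramconfig.cfg')
paras = pd.DataFrame({'text': ["We will expand flood defences and retrofit buildings for energy efficiency."]})
result = adapmit_classification(paras, threshold=0.5, classifier_model=classifier)
print(result['Adapt-Mitig Label'].tolist())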
 
utils/conditional_classifier.py DELETED
@@ -1,95 +0,0 @@
- from typing import List, Tuple
- from typing_extensions import Literal
- import logging
- import pandas as pd
- from pandas import DataFrame, Series
- from utils.config import getconfig
- from utils.preprocessing import processingpipeline
- import streamlit as st
- from transformers import pipeline
-
-
- @st.cache_resource
- def load_conditionalClassifier(config_file:str = None, classifier_name:str = None):
-     """
-     Loads the document classifier as a transformers text-classification
-     pipeline; the model name/path on the HF Hub is used to fetch the model
-     object. Either a config file or a model name must be passed.
-     1. https://docs.haystack.deepset.ai/reference/document-classifier-api
-     2. https://docs.haystack.deepset.ai/docs/document_classifier
-     Params
-     --------
-     config_file: config file path from which to read the model name
-     classifier_name: if a model name is passed it takes priority; if not
-         given, the model name is read from the config file, else a warning
-         is logged.
-     Return: document classifier model
-     """
-     if not classifier_name:
-         if not config_file:
-             logging.warning("Pass either model name or config file")
-             return
-         else:
-             config = getconfig(config_file)
-             classifier_name = config.get('conditional','MODEL')
-
-     logging.info("Loading conditional classifier")
-     doc_classifier = pipeline("text-classification",
-                               model=classifier_name,
-                               top_k=1)
-
-     return doc_classifier
-
-
- @st.cache_data
- def conditional_classification(haystack_doc:pd.DataFrame,
-                                threshold:float = 0.8,
-                                classifier_model:pipeline = None
-                                ) -> DataFrame:
-     """
-     Text classification on the texts provided. The classifier provides the
-     most appropriate label for each text, indicating whether the paragraph
-     describes a conditional or an unconditional target/action.
-     Params
-     ---------
-     haystack_doc: Dataframe from the output of the preprocessing pipeline;
-         the 'text' column holds the paragraphs to classify.
-     threshold: minimum score for the model to keep the classifier results
-     classifier_model: the classifier model can be passed directly, which takes
-         priority; otherwise the model is fetched from the streamlit session.
-         When running within streamlit avoid passing the model directly.
-     Returns
-     ----------
-     df: Dataframe with added 'Conditional Label' and 'Conditional Score' columns
-     """
-     logging.info("Working on Conditionality Identification")
-     haystack_doc['Conditional Label'] = 'NA'
-     haystack_doc['Conditional Score'] = 0.0
-     haystack_doc['cond_check'] = False
-     haystack_doc['PA_check'] = haystack_doc['Policy-Action Label'].apply(lambda x: True if len(x) != 0 else False)
-
-     #df1 = haystack_doc[haystack_doc['PA_check'] == True]
-     #df = haystack_doc[haystack_doc['PA_check'] == False]
-     haystack_doc['cond_check'] = haystack_doc.apply(lambda x: True if (
-                     (x['Target Label'] == 'TARGET') | (x['PA_check'] == True)) else
-                     False, axis=1)
-     # we apply the conditionality classifier only to paragraphs classified as
-     # 'Target' related or carrying a Policy-Action label
-     temp = haystack_doc[haystack_doc['cond_check'] == True]
-     temp = temp.reset_index(drop=True)
-     df = haystack_doc[haystack_doc['cond_check'] == False]
-     df = df.reset_index(drop=True)
-
-     if not classifier_model:
-         classifier_model = st.session_state['conditional_classifier']
-
-     results = classifier_model(list(temp.text))
-     labels_ = [(l[0]['label'], l[0]['score']) for l in results]
-     temp['Conditional Label'], temp['Conditional Score'] = zip(*labels_)
-     # temp[' Label'] = temp['Netzero Label'].apply(lambda x: _lab_dict[x])
-     # merging the checked paragraphs back with the remaining dataframe
-     df = pd.concat([df,temp])
-     df = df.drop(columns = ['cond_check','PA_check'])
-     df = df.reset_index(drop=True)
-     df.index += 1
-
-     return df
 
utils/config.py DELETED
@@ -1,31 +0,0 @@
- import configparser
- import logging
-
- def getconfig(configfile_path:str):
-     """
-     configfile_path: file path of .cfg file
-     """
-
-     config = configparser.ConfigParser()
-
-     try:
-         config.read_file(open(configfile_path))
-         return config
-     except:
-         logging.warning("config file not found")
-
-
- # Declare all the necessary variables
- def get_classifier_params(model_name):
-     config = getconfig('paramconfig.cfg')
-     params = {}
-     params['model_name'] = config.get(model_name,'MODEL')
-     params['split_by'] = config.get(model_name,'SPLIT_BY')
-     params['split_length'] = int(config.get(model_name,'SPLIT_LENGTH'))
-     params['split_overlap'] = int(config.get(model_name,'SPLIT_OVERLAP'))
-     params['remove_punc'] = bool(int(config.get(model_name,'REMOVE_PUNC')))
-     params['split_respect_sentence_boundary'] = bool(int(config.get(model_name,'RESPECT_SENTENCE_BOUNDARY')))
-     params['threshold'] = float(config.get(model_name,'THRESHOLD'))
-     params['top_n'] = int(config.get(model_name,'TOP_KEY'))
-
-     return params
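
The keys read by get_classifier_params imply a config layout like the sketch below; the section name mirrors the ones used by the loaders (e.g. 'target'), but the values shown are placeholders and are not part of this commit:

# Write an illustrative paramconfig.cfg and read it back; only the key names
# come from the code above, the values are assumptions.
from utils.config import get_classifier_params

cfg_text = """[target]
MODEL = some-org/target-classifier
SPLIT_BY = word
SPLIT_LENGTH = 60
SPLIT_OVERLAP = 10
REMOVE_PUNC = 0
RESPECT_SENTENCE_BOUNDARY = 1
THRESHOLD = 0.50
TOP_KEY = 10
"""

with open('paramconfig.cfg', 'w') as f:
    f.write(cfg_text)

params = get_classifier_params('target')
print(params['model_name'], params['threshold'], params['top_n'])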
 
utils/ghg_classifier.py DELETED
@@ -1,96 +0,0 @@
- from typing import List, Tuple
- from typing_extensions import Literal
- import logging
- import pandas as pd
- from pandas import DataFrame, Series
- from utils.config import getconfig
- from utils.preprocessing import processingpipeline
- import streamlit as st
- from transformers import pipeline
-
- ### Labels dictionary ###
- _lab_dict = {
-     'GHG':'GHG',
-     'NOT_GHG':'NON GHG TRANSPORT TARGET',
-     'NEGATIVE':'OTHERS',
- }
-
-
- @st.cache_resource
- def load_ghgClassifier(config_file:str = None, classifier_name:str = None):
-     """
-     Loads the document classifier as a transformers text-classification
-     pipeline; the model name/path on the HF Hub is used to fetch the model
-     object. Either a config file or a model name must be passed.
-     1. https://docs.haystack.deepset.ai/reference/document-classifier-api
-     2. https://docs.haystack.deepset.ai/docs/document_classifier
-     Params
-     --------
-     config_file: config file path from which to read the model name
-     classifier_name: if a model name is passed it takes priority; if not
-         given, the model name is read from the config file, else a warning
-         is logged.
-     Return: document classifier model
-     """
-     if not classifier_name:
-         if not config_file:
-             logging.warning("Pass either model name or config file")
-             return
-         else:
-             config = getconfig(config_file)
-             classifier_name = config.get('ghg','MODEL')
-
-     logging.info("Loading ghg classifier")
-     doc_classifier = pipeline("text-classification",
-                               model=classifier_name,
-                               top_k=1)
-
-     return doc_classifier
-
-
- @st.cache_data
- def ghg_classification(haystack_doc:pd.DataFrame,
-                        threshold:float = 0.5,
-                        classifier_model:pipeline = None
-                        ) -> DataFrame:
-     """
-     Text classification on the texts provided. The classifier labels each
-     'Target' paragraph according to whether it contains a GHG target
-     (see _lab_dict for the human-readable label names).
-     Params
-     ---------
-     haystack_doc: Dataframe from the output of the preprocessing pipeline;
-         the 'text' column holds the paragraphs to classify.
-     threshold: minimum score for the model to keep the classifier results
-     classifier_model: the classifier model can be passed directly, which takes
-         priority; otherwise the model is fetched from the streamlit session.
-         When running within streamlit avoid passing the model directly.
-     Returns
-     ----------
-     df: Dataframe with added 'GHG Label' and 'GHG Score' columns
-     """
-     logging.info("Working on GHG Extraction")
-     haystack_doc['GHG Label'] = 'NA'
-     haystack_doc['GHG Score'] = 0.0
-     # applying the GHG identifier only to paragraphs classified as 'Target'
-     temp = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
-     temp = temp.reset_index(drop=True)
-     df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
-     df = df.reset_index(drop=True)
-
-     if not classifier_model:
-         classifier_model = st.session_state['ghg_classifier']
-
-     results = classifier_model(list(temp.text))
-     labels_ = [(l[0]['label'], l[0]['score']) for l in results]
-     temp['GHG Label'], temp['GHG Score'] = zip(*labels_)
-     temp['GHG Label'] = temp['GHG Label'].apply(lambda x: _lab_dict[x])
-     # merge the Target and non-Target dataframes back together
-     df = pd.concat([df,temp])
-     df = df.reset_index(drop=True)
-     df['GHG Score'] = df['GHG Score'].round(2)
-     df.index += 1
-
-     return df
 
utils/indicator_classifier.py DELETED
@@ -1,109 +0,0 @@
- from typing import List, Tuple
- from typing_extensions import Literal
- import logging
- import pandas as pd
- from pandas import DataFrame, Series
- from utils.config import getconfig
- from utils.preprocessing import processingpipeline
- import streamlit as st
- from transformers import pipeline
-
-
- @st.cache_resource
- def load_indicatorClassifier(config_file:str = None, classifier_name:str = None):
-     """
-     Loads the document classifier as a transformers text-classification
-     pipeline; the model name/path on the HF Hub is used to fetch the model
-     object. Either a config file or a model name must be passed.
-     1. https://docs.haystack.deepset.ai/reference/document-classifier-api
-     2. https://docs.haystack.deepset.ai/docs/document_classifier
-     Params
-     --------
-     config_file: config file path from which to read the model name
-     classifier_name: if a model name is passed it takes priority; if not
-         given, the model name is read from the config file, else a warning
-         is logged.
-     Return: document classifier model
-     """
-     if not classifier_name:
-         if not config_file:
-             logging.warning("Pass either model name or config file")
-             return
-         else:
-             config = getconfig(config_file)
-             classifier_name = config.get('indicator','MODEL')
-
-     logging.info("Loading indicator classifier")
-     # we use the transformers pipeline because the model is multilabel and the
-     # DocumentClassifier from Haystack doesn't support multilabel;
-     # 'sigmoid' tells the pipeline explicitly to score the labels independently,
-     # otherwise it would default to softmax, which is not desired here.
-     # doc_classifier = TransformersDocumentClassifier(
-     #     model_name_or_path=classifier_name,
-     #     task="text-classification",
-     #     top_k = None)
-
-     doc_classifier = pipeline("text-classification",
-                               model=classifier_name,
-                               return_all_scores=True,
-                               function_to_apply="sigmoid")
-
-     return doc_classifier
-
-
- @st.cache_data
- def indicator_classification(haystack_doc:pd.DataFrame,
-                              threshold:float = 0.5,
-                              classifier_model:pipeline = None
-                              ) -> DataFrame:
-     """
-     Text classification on the texts provided. The multilabel classifier
-     assigns indicator categories to the paragraphs that already carry a
-     Policy-Action label, keeping every label whose score is at or above
-     the threshold.
-     Params
-     ---------
-     haystack_doc: Dataframe from the output of the preprocessing pipeline;
-         the 'text' column holds the paragraphs to classify.
-     threshold: minimum score for a label to be kept in the results
-     classifier_model: the classifier model can be passed directly, which takes
-         priority; otherwise the model is fetched from the streamlit session.
-         When running within streamlit avoid passing the model directly.
-     Returns
-     ----------
-     df: the input dataframe with an added 'Indicator Label' column holding,
-         per paragraph, the list of labels above the threshold.
-     """
-     logging.info("Working on Indicator Identification")
-     haystack_doc['Indicator Label'] = 'NA'
-     haystack_doc['PA_check'] = haystack_doc['Policy-Action Label'].apply(lambda x: True if len(x) != 0 else False)
-
-     df1 = haystack_doc[haystack_doc['PA_check'] == True]
-     df = haystack_doc[haystack_doc['PA_check'] == False]
-     if not classifier_model:
-         classifier_model = st.session_state['indicator_classifier']
-
-     predictions = classifier_model(list(df1.text))
-
-     list_ = []
-     for i in range(len(predictions)):
-         temp = predictions[i]
-         placeholder = {}
-         for j in range(len(temp)):
-             placeholder[temp[j]['label']] = temp[j]['score']
-         list_.append(placeholder)
-     labels_ = [{**list_[l]} for l in range(len(predictions))]
-     truth_df = DataFrame.from_dict(labels_)
-     truth_df = truth_df.round(2)
-     truth_df = truth_df.astype(float) >= threshold
-     truth_df = truth_df.astype(str)
-     categories = list(truth_df.columns)
-     truth_df['Indicator Label'] = truth_df.apply(lambda x: {i if x[i]=='True' else
-                                       None for i in categories}, axis=1)
-     truth_df['Indicator Label'] = truth_df.apply(lambda x: list(x['Indicator Label']
-                                       - {None}), axis=1)
-     df1['Indicator Label'] = list(truth_df['Indicator Label'])
-     df = pd.concat([df,df1])
-     df = df.drop(columns = ['PA_check'])
-     return df
 
utils/netzero_classifier.py DELETED
@@ -1,88 +0,0 @@
- from typing import List, Tuple
- from typing_extensions import Literal
- import logging
- import pandas as pd
- from pandas import DataFrame, Series
- from utils.config import getconfig
- from utils.preprocessing import processingpipeline
- import streamlit as st
- from transformers import pipeline
-
- ### Labels dictionary ###
- _lab_dict = {
-     'NEGATIVE':'NO NETZERO TARGET',
-     'NETZERO':'NETZERO TARGET',
- }
-
- @st.cache_resource
- def load_netzeroClassifier(config_file:str = None, classifier_name:str = None):
-     """
-     Loads the document classifier as a transformers text-classification
-     pipeline; the model name/path on the HF Hub is used to fetch the model
-     object. Either a config file or a model name must be passed.
-     1. https://docs.haystack.deepset.ai/reference/document-classifier-api
-     2. https://docs.haystack.deepset.ai/docs/document_classifier
-     Params
-     --------
-     config_file: config file path from which to read the model name
-     classifier_name: if a model name is passed it takes priority; if not
-         given, the model name is read from the config file, else a warning
-         is logged.
-     Return: document classifier model
-     """
-     if not classifier_name:
-         if not config_file:
-             logging.warning("Pass either model name or config file")
-             return
-         else:
-             config = getconfig(config_file)
-             classifier_name = config.get('netzero','MODEL')
-
-     logging.info("Loading netzero classifier")
-     doc_classifier = pipeline("text-classification",
-                               model=classifier_name,
-                               top_k=1)
-
-     return doc_classifier
-
-
- @st.cache_data
- def netzero_classification(haystack_doc:pd.DataFrame,
-                            threshold:float = 0.8,
-                            classifier_model:pipeline = None
-                            ) -> DataFrame:
-     """
-     Text classification on the texts provided. The classifier labels each
-     'Target' paragraph according to whether it contains a net-zero target
-     (see _lab_dict for the human-readable label names).
-     Params
-     ---------
-     haystack_doc: Dataframe from the output of the preprocessing pipeline;
-         the 'text' column holds the paragraphs to classify.
-     threshold: minimum score for the model to keep the classifier results
-     classifier_model: the classifier model can be passed directly, which takes
-         priority; otherwise the model is fetched from the streamlit session.
-         When running within streamlit avoid passing the model directly.
-     Returns
-     ----------
-     df: Dataframe with added 'Netzero Label' and 'Netzero Score' columns
-     """
-     logging.info("Working on Netzero Extraction")
-     haystack_doc['Netzero Label'] = 'NA'
-     haystack_doc['Netzero Score'] = 'NA'
-     # applying the netzero classifier only to paragraphs classified as 'Target'
-     temp = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
-     df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
-
-     if not classifier_model:
-         classifier_model = st.session_state['netzero_classifier']
-
-     results = classifier_model(list(temp.text))
-     labels_ = [(l[0]['label'], l[0]['score']) for l in results]
-     temp['Netzero Label'], temp['Netzero Score'] = zip(*labels_)
-     df = pd.concat([df,temp])
-     df = df.reset_index(drop=True)
-     df.index += 1
-
-     return df
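
The single-label classifiers in this commit (target, netzero, ghg, conditional) all index the pipeline output as l[0]['label'] / l[0]['score'], which relies on the nested shape the transformers text-classification pipeline returns when top_k=1 is passed explicitly. A small sketch of that shape, using a generic sentiment checkpoint purely as a stand-in for the project's own models:

# Placeholder checkpoint; any single-label text-classification model behaves the same.
from transformers import pipeline

clf = pipeline("text-classification",
               model="distilbert-base-uncased-finetuned-sst-2-english",
               top_k=1)
results = clf(["We commit to reach net zero emissions by 2050."])
# results looks roughly like: [[{'label': 'POSITIVE', 'score': 0.99}]]
label, score = results[0][0]['label'], results[0][0]['score']
print(label, round(score, 2))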
 
utils/policyaction_classifier.py DELETED
@@ -1,101 +0,0 @@
- from typing import List, Tuple
- from typing_extensions import Literal
- import logging
- import pandas as pd
- from pandas import DataFrame, Series
- from utils.config import getconfig
- from utils.preprocessing import processingpipeline
- import streamlit as st
- from transformers import pipeline
-
- ## Labels dictionary ###
- _lab_dict = {
-     'NEGATIVE':'NO TARGET INFO',
-     'TARGET':'TARGET',
- }
-
- @st.cache_resource
- def load_policyactionClassifier(config_file:str = None, classifier_name:str = None):
-     """
-     Loads the document classifier as a transformers text-classification
-     pipeline; the model name/path on the HF Hub is used to fetch the model
-     object. Either a config file or a model name must be passed.
-     1. https://docs.haystack.deepset.ai/reference/document-classifier-api
-     2. https://docs.haystack.deepset.ai/docs/document_classifier
-     Params
-     --------
-     config_file: config file path from which to read the model name
-     classifier_name: if a model name is passed it takes priority; if not
-         given, the model name is read from the config file, else a warning
-         is logged.
-     Return: document classifier model
-     """
-     if not classifier_name:
-         if not config_file:
-             logging.warning("Pass either model name or config file")
-             return
-         else:
-             config = getconfig(config_file)
-             classifier_name = config.get('policyaction','MODEL')
-
-     logging.info("Loading classifier")
-
-     doc_classifier = pipeline("text-classification",
-                               model=classifier_name,
-                               return_all_scores=True,
-                               function_to_apply="sigmoid")
-
-     return doc_classifier
-
-
- @st.cache_data
- def policyaction_classification(haystack_doc:pd.DataFrame,
-                                 threshold:float = 0.5,
-                                 classifier_model:pipeline = None
-                                 ) -> DataFrame:
-     """
-     Text classification on the texts provided. The multilabel classifier
-     assigns each paragraph the Policy/Action categories whose score is at
-     or above the threshold.
-     Params
-     ---------
-     haystack_doc: Dataframe from the output of the preprocessing pipeline;
-         the 'text' column holds the paragraphs to classify.
-     threshold: minimum score for a label to be kept in the results
-     classifier_model: the classifier model can be passed directly, which takes
-         priority; otherwise the model is fetched from the streamlit session.
-         When running within streamlit avoid passing the model directly.
-     Returns
-     ----------
-     haystack_doc: the input dataframe with an added 'Policy-Action Label'
-         column holding, per paragraph, the list of labels above the threshold.
-     """
-     logging.info("Working on Policy/Action Extraction")
-     haystack_doc['Policy-Action Label'] = 'NA'
-     if not classifier_model:
-         classifier_model = st.session_state['policyaction_classifier']
-
-     predictions = classifier_model(list(haystack_doc.text))
-     list_ = []
-     for i in range(len(predictions)):
-         temp = predictions[i]
-         placeholder = {}
-         for j in range(len(temp)):
-             placeholder[temp[j]['label']] = temp[j]['score']
-         list_.append(placeholder)
-     labels_ = [{**list_[l]} for l in range(len(predictions))]
-     truth_df = DataFrame.from_dict(labels_)
-     truth_df = truth_df.round(2)
-     truth_df = truth_df.astype(float) >= threshold
-     truth_df = truth_df.astype(str)
-     categories = list(truth_df.columns)
-     truth_df['Policy-Action Label'] = truth_df.apply(lambda x: {i if x[i]=='True'
-                                           else None for i in categories}, axis=1)
-     truth_df['Policy-Action Label'] = truth_df.apply(lambda x:
-                                           list(x['Policy-Action Label'] - {None}), axis=1)
-
-     haystack_doc['Policy-Action Label'] = list(truth_df['Policy-Action Label'])
-
-     return haystack_doc
 
utils/preprocessing.py DELETED
@@ -1,307 +0,0 @@
- from haystack.nodes.base import BaseComponent
- from haystack.schema import Document
- from haystack.nodes import ImageToTextConverter, PDFToTextConverter
- from haystack.nodes import TextConverter, DocxToTextConverter, PreProcessor
- from pdf2image import convert_from_path
- from typing import Callable, Dict, List, Optional, Text, Tuple, Union
- from typing_extensions import Literal
- import pandas as pd
- import logging
- import re
- import string
- from haystack.pipelines import Pipeline
- import streamlit as st
-
- @st.cache_data
- def useOCR(file_path: str) -> Text:
-     """
-     Converts image PDFs into text using farm-haystack[OCR].
-
-     Params
-     ----------
-     file_path: file path of the uploaded file, returned by the add_upload
-         function in uploadAndExample.py
-
-     Returns the text of the file as a string.
-     """
-     # the pdf file first needs to be converted into image files,
-     # one image per page
-     images = convert_from_path(pdf_path = file_path)
-     list_ = []
-     # save the image files in the cache and read them one by one to pass to OCR
-     for i, pdf in enumerate(images):
-         # Save pages as images of the pdf
-         pdf.save(f'PDF\image_converted_{i+1}.png', 'PNG')
-         list_.append(f'PDF\image_converted_{i+1}.png')
-
-     converter = ImageToTextConverter(remove_numeric_tables=True,
-                                      valid_languages=["eng"])
-     # placeholder to collect the text from each page
-     placeholder = []
-     for file in list_:
-         document = converter.convert(
-             file_path=file, meta=None,
-         )[0]
-
-         text = document.content
-         placeholder.append(text)
-     # join the text from each page by the page separator
-     text = '\x0c'.join(placeholder)
-     return text
-
-
-
- class FileConverter(BaseComponent):
-     """
-     Wrapper class to convert an uploaded document into text by calling the
-     appropriate converter class; internally uses the haystack OCR converter
-     in case of image PDFs. The FileTypeClassifier from haystack cannot be
-     used here as it doesn't have a label/output class for images.
-     1. https://haystack.deepset.ai/pipeline_nodes/custom-nodes
-     2. https://docs.haystack.deepset.ai/docs/file_converters
-     3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/file_converter
-     4. https://docs.haystack.deepset.ai/reference/file-converters-api
-     """
-
-     outgoing_edges = 1
-
-     def run(self, file_name: str, file_path: str, encoding: Optional[str]=None,
-             id_hash_keys: Optional[List[str]] = None,
-             ) -> Tuple[dict,str]:
-         """ This method is required to invoke the component in
-         the pipeline implementation.
-
-         Params
-         ----------
-         file_name: name of the file
-         file_path: file path of the uploaded file, returned by the add_upload
-             function in uploadAndExample.py
-
-         See the links provided in the class docstring for the other params.
-
-         Return
-         ---------
-         output: dictionary, with the key as identifier and the value being
-             whatever we need to return; in this case the list of Haystack Documents
-
-         output_1: as there is only one outgoing edge, we pass the 'output_1' string
-         """
-         try:
-             if file_name.endswith('.pdf'):
-                 converter = PDFToTextConverter(remove_numeric_tables=True)
-             if file_name.endswith('.txt'):
-                 converter = TextConverter(remove_numeric_tables=True)
-             if file_name.endswith('.docx'):
-                 converter = DocxToTextConverter()
-         except Exception as e:
-             logging.error(e)
-             return
-
-
-
-         documents = []
-
-         document = converter.convert(
-             file_path=file_path, meta=None,
-             encoding=encoding, id_hash_keys=id_hash_keys
-         )[0]
-
-         text = document.content
-
-         # in case of a scanned/image-only PDF the content might contain only
-         # the page separator (\f or \x0c); if so, we use OCR to get the text.
-         filtered = re.sub(r'\x0c', '', text)
-
-         if filtered == "":
-             logging.info("Using OCR")
-             text = useOCR(file_path)
-
-         documents.append(Document(content=text,
-                                   meta={"name": file_name},
-                                   id_hash_keys=id_hash_keys))
-
-
-
-         logging.info('file conversion successful')
-         output = {'documents': documents}
-         return output, 'output_1'
-
-     def run_batch(self):
-         """
-         There is no requirement to process multiple files in one go, so this
-         is a stub; the method must exist for the custom node to be usable.
-         """
-
-         return
-
-
- def basic(s:str, remove_punc:bool = False):
-
-     """
-     Performs basic cleaning of text.
-     Params
-     ----------
-     s: string to be processed
-     remove_punc: whether to remove all punctuation, including ',' and '.'
-
-     Returns: processed string; see the comments in the source code for more info
-     """
-
-     # Remove URLs
-     s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
-     s = re.sub(r"http\S+", " ", s)
-
-     # Remove new line characters
-     s = re.sub('\n', ' ', s)
-
-     # Remove punctuation
-     if remove_punc == True:
-         translator = str.maketrans(' ', ' ', string.punctuation)
-         s = s.translate(translator)
-     # Remove distracting single quotes and dotted patterns
-     s = re.sub("\'", " ", s)
-     s = s.replace("..","")
-
-     return s.strip()
-
-
- def paraLengthCheck(paraList, max_len = 100):
-     """
-     There are cases where the preprocessor cannot respect the word limit when
-     the respect-sentence-boundary flag is used, due to missing sentence
-     boundaries. Therefore we run one more round of splitting for those paragraphs.
-
-     Params
-     ---------------
-     paraList : list of paragraphs/text
-     max_len : max length to be respected by sentences which bypassed the
-         preprocessor strategy
-
-     """
-     new_para_list = []
-     for passage in paraList:
-         # check if the paragraph exceeds the word limit
-         if len(passage.content.split()) > max_len:
-             # we might need a few iterations, e.g. if the paragraph has 512 tokens
-             # we need to iterate 5 times to reduce it to the size limit of 100
-             iterations = int(len(passage.content.split())/max_len)
-             for i in range(iterations):
-                 temp = " ".join(passage.content.split()[max_len*i:max_len*(i+1)])
-                 new_para_list.append((temp,passage.meta['page']))
-             # append whatever is left after the last full chunk
-             temp = " ".join(passage.content.split()[max_len*iterations:])
-             new_para_list.append((temp,passage.meta['page']))
-         else:
-             # paragraphs which don't need any splitting
-             new_para_list.append((passage.content, passage.meta['page']))
-
-     logging.info("New paragraphs length {}".format(len(new_para_list)))
-     return new_para_list
-
- class UdfPreProcessor(BaseComponent):
-     """
-     Class to preprocess the document returned by FileConverter. It checks
-     the splitting strategy, splits the document by word or sentence, and then
-     synthetically creates the paragraphs.
-     1. https://docs.haystack.deepset.ai/docs/preprocessor
-     2. https://docs.haystack.deepset.ai/reference/preprocessor-api
-     3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/preprocessor
-     """
-     outgoing_edges = 1
-
-     def run(self, documents:List[Document], remove_punc:bool=False, apply_clean = True,
-             split_by: Literal["sentence", "word"] = 'sentence',
-             split_length:int = 2, split_respect_sentence_boundary:bool = False,
-             split_overlap:int = 0):
-
-         """ This method is required to invoke the component in
-         the pipeline implementation.
-
-         Params
-         ----------
-         documents: documents from the output dictionary returned by FileConverter
-         remove_punc: whether to remove all punctuation, including ',' and '.'
-         split_by: document splitting strategy, either by word or by sentence
-         split_length: when synthetically creating the paragraphs from the document,
-             defines the length of a paragraph.
-         split_respect_sentence_boundary: used with the 'word' splitting strategy.
-         split_overlap: number of words or sentences that overlap when creating
-             the paragraphs. This is done because one sentence or a few words often
-             only make sense when read together with their neighbours.
-
-         Return
-         ---------
-         output: dictionary, with the key as identifier and the value being
-             whatever we need to return; in this case the output contains 4 objects:
-             the paragraph texts as a list, the Haystack documents, a dataframe and
-             one raw text string.
-
-         output_1: as there is only one outgoing edge, we pass the 'output_1' string
-
-         """
-
-         if split_by == 'sentence':
-             split_respect_sentence_boundary = False
-
-         else:
-             split_respect_sentence_boundary = split_respect_sentence_boundary
-
-         preprocessor = PreProcessor(
-             clean_empty_lines=True,
-             clean_whitespace=True,
-             clean_header_footer=True,
-             split_by=split_by,
-             split_length=split_length,
-             split_respect_sentence_boundary= split_respect_sentence_boundary,
-             split_overlap=split_overlap,
-
-             # will add the page number only for PDFs, not for text/docx files.
-             add_page_number=True
-         )
-
-         for i in documents:
-             # # basic cleaning before passing it to the preprocessor.
-             # i = basic(i)
-             docs_processed = preprocessor.process([i])
-             if apply_clean:
-                 for item in docs_processed:
-                     item.content = basic(item.content, remove_punc= remove_punc)
-             else:
-                 pass
-
-         df = pd.DataFrame(docs_processed)
-         all_text = " ".join(df.content.to_list())
-         para_list = df.content.to_list()
-         logging.info('document split into {} paragraphs'.format(len(para_list)))
-         output = {'documents': docs_processed,
-                   'dataframe': df,
-                   'text': all_text,
-                   'paraList': para_list
-                   }
-         return output, "output_1"
-
-     def run_batch(self):
-         """
-         There is no requirement to process multiple files in one go, so this
-         is a stub; the method must exist for the custom node to be usable.
-         """
-         return
-
- def processingpipeline():
-     """
-     Returns the preprocessing pipeline, built from FileConverter and
-     UdfPreProcessor in utils.preprocessing.
-     """
-
-     preprocessing_pipeline = Pipeline()
-     file_converter = FileConverter()
-     custom_preprocessor = UdfPreProcessor()
-
-     preprocessing_pipeline.add_node(component=file_converter,
-                                     name="FileConverter", inputs=["File"])
-     preprocessing_pipeline.add_node(component = custom_preprocessor,
-                                     name ='UdfPreProcessor', inputs=["FileConverter"])
-
-     return preprocessing_pipeline
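
A rough sketch of how the pipeline above was meant to be invoked; the file name and parameter values below are placeholders (in the app they come from get_classifier_params), only the node names and output keys are taken from the code:

# Run a single (placeholder) PDF through the preprocessing pipeline.
from utils.preprocessing import processingpipeline

pipeline_ = processingpipeline()
output = pipeline_.run(file_paths="sample.pdf",
                       params={"FileConverter": {"file_path": "sample.pdf",
                                                 "file_name": "sample.pdf"},
                               "UdfPreProcessor": {"remove_punc": False,
                                                   "split_by": "word",
                                                   "split_length": 60,
                                                   "split_overlap": 10,
                                                   "split_respect_sentence_boundary": True}})
paragraphs = output['dataframe']    # pandas DataFrame with one row per paragraph
print(len(output['paraList']), 'paragraphs')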
 
utils/sector_classifier.py DELETED
@@ -1,106 +0,0 @@
- from typing import List, Tuple
- from typing_extensions import Literal
- import logging
- import pandas as pd
- from pandas import DataFrame, Series
- from utils.config import getconfig
- from utils.preprocessing import processingpipeline
- import streamlit as st
- from transformers import pipeline
-
-
- @st.cache_resource
- def load_sectorClassifier(config_file:str = None, classifier_name:str = None):
-     """
-     Loads the document classifier as a transformers text-classification
-     pipeline; the model name/path on the HF Hub is used to fetch the model
-     object. Either a config file or a model name must be passed.
-     1. https://docs.haystack.deepset.ai/reference/document-classifier-api
-     2. https://docs.haystack.deepset.ai/docs/document_classifier
-     Params
-     --------
-     config_file: config file path from which to read the model name
-     classifier_name: if a model name is passed it takes priority; if not
-         given, the model name is read from the config file, else a warning
-         is logged.
-     Return: document classifier model
-     """
-     if not classifier_name:
-         if not config_file:
-             logging.warning("Pass either model name or config file")
-             return
-         else:
-             config = getconfig(config_file)
-             classifier_name = config.get('sector','MODEL')
-
-     logging.info("Loading sector classifier")
-     # we use the transformers pipeline because the model is multilabel and the
-     # DocumentClassifier from Haystack doesn't support multilabel;
-     # 'sigmoid' tells the pipeline explicitly to score the labels independently,
-     # otherwise it would default to softmax, which is not desired here.
-     # doc_classifier = TransformersDocumentClassifier(
-     #     model_name_or_path=classifier_name,
-     #     task="text-classification",
-     #     top_k = None)
-
-     doc_classifier = pipeline("text-classification",
-                               model=classifier_name,
-                               return_all_scores=True,
-                               function_to_apply="sigmoid")
-
-     return doc_classifier
-
-
- @st.cache_data
- def sector_classification(haystack_doc:pd.DataFrame,
-                           threshold:float = 0.5,
-                           classifier_model:pipeline = None
-                           ) -> DataFrame:
-     """
-     Text classification on the texts provided. The multilabel classifier
-     assigns each paragraph the sector categories whose score is at or above
-     the threshold.
-     Params
-     ---------
-     haystack_doc: Dataframe from the output of the preprocessing pipeline;
-         the 'text' column holds the paragraphs to classify.
-     threshold: minimum score for a label to be kept in the results
-     classifier_model: the classifier model can be passed directly, which takes
-         priority; otherwise the model is fetched from the streamlit session.
-         When running within streamlit avoid passing the model directly.
-     Returns
-     ----------
-     haystack_doc: the input dataframe with an added 'Sector Label' column
-         holding, per paragraph, the list of labels above the threshold.
-     """
-     logging.info("Working on Sector Identification")
-     haystack_doc['Sector Label'] = 'NA'
-     # df1 = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
-     # df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
-     if not classifier_model:
-         classifier_model = st.session_state['sector_classifier']
-
-     predictions = classifier_model(list(haystack_doc.text))
-
-     list_ = []
-     for i in range(len(predictions)):
-         temp = predictions[i]
-         placeholder = {}
-         for j in range(len(temp)):
-             placeholder[temp[j]['label']] = temp[j]['score']
-         list_.append(placeholder)
-     labels_ = [{**list_[l]} for l in range(len(predictions))]
-     truth_df = DataFrame.from_dict(labels_)
-     truth_df = truth_df.round(2)
-     truth_df = truth_df.astype(float) >= threshold
-     truth_df = truth_df.astype(str)
-     categories = list(truth_df.columns)
-     truth_df['Sector Label'] = truth_df.apply(lambda x: {i if x[i]=='True' else
-                                    None for i in categories}, axis=1)
-     truth_df['Sector Label'] = truth_df.apply(lambda x: list(x['Sector Label']
-                                    - {None}), axis=1)
-     haystack_doc['Sector Label'] = list(truth_df['Sector Label'])
-     # df = pd.concat([df,df1])
-     return haystack_doc
 
utils/target_classifier.py DELETED
@@ -1,89 +0,0 @@
- from typing import List, Tuple
- from typing_extensions import Literal
- import logging
- import pandas as pd
- from pandas import DataFrame, Series
- from utils.config import getconfig
- from utils.preprocessing import processingpipeline
- import streamlit as st
- from transformers import pipeline
-
- ## Labels dictionary ###
- _lab_dict = {
-     'NEGATIVE':'NO TARGET INFO',
-     'TARGET':'TARGET',
- }
-
- @st.cache_resource
- def load_targetClassifier(config_file:str = None, classifier_name:str = None):
-     """
-     Loads the document classifier as a transformers text-classification
-     pipeline; the model name/path on the HF Hub is used to fetch the model
-     object. Either a config file or a model name must be passed.
-     1. https://docs.haystack.deepset.ai/reference/document-classifier-api
-     2. https://docs.haystack.deepset.ai/docs/document_classifier
-     Params
-     --------
-     config_file: config file path from which to read the model name
-     classifier_name: if a model name is passed it takes priority; if not
-         given, the model name is read from the config file, else a warning
-         is logged.
-     Return: document classifier model
-     """
-     if not classifier_name:
-         if not config_file:
-             logging.warning("Pass either model name or config file")
-             return
-         else:
-             config = getconfig(config_file)
-             classifier_name = config.get('target','MODEL')
-
-     logging.info("Loading classifier")
-
-     doc_classifier = pipeline("text-classification",
-                               model=classifier_name,
-                               top_k=1)
-
-     return doc_classifier
-
-
- @st.cache_data
- def target_classification(haystack_doc:pd.DataFrame,
-                           threshold:float = 0.5,
-                           classifier_model:pipeline = None
-                           ) -> DataFrame:
-     """
-     Text classification on the texts provided. The classifier labels each
-     paragraph according to whether it contains target information
-     ('TARGET' vs 'NEGATIVE', see _lab_dict).
-     Params
-     ---------
-     haystack_doc: Dataframe from the output of the preprocessing pipeline;
-         the 'text' column holds the paragraphs to classify.
-     threshold: minimum score for the model to keep the classifier results
-     classifier_model: the classifier model can be passed directly, which takes
-         priority; otherwise the model is fetched from the streamlit session.
-         When running within streamlit avoid passing the model directly.
-     Returns
-     ----------
-     df: Dataframe with added 'Target Label', 'Relevancy' and 'Label_def'
-         columns, sorted by relevancy.
-     """
-     logging.info("Working on Target Extraction")
-     if not classifier_model:
-         classifier_model = st.session_state['target_classifier']
-
-     results = classifier_model(list(haystack_doc.text))
-     labels_ = [(l[0]['label'],
-                 l[0]['score']) for l in results]
-
-
-     df1 = DataFrame(labels_, columns=["Target Label","Relevancy"])
-     df = pd.concat([haystack_doc,df1], axis=1)
-
-     df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
-     df.index += 1
-     df['Label_def'] = df['Target Label'].apply(lambda i: _lab_dict[i])
-
-     return df
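
Taken together, the deleted classifiers imply an ordering: target_classification and policyaction_classification must run first, because netzero/ghg filter on 'Target Label' while indicator and conditional filter on 'Policy-Action Label' and/or 'Target Label'. A rough orchestration sketch; the config file contents and the sample paragraph are placeholders, and the downstream steps assume the paragraph receives a TARGET and a Policy-Action label (the modules raise on an empty selection):

# Hypothetical end-to-end run over one toy paragraph, outside streamlit.
import pandas as pd
from utils.target_classifier import load_targetClassifier, target_classification
from utils.policyaction_classifier import load_policyactionClassifier, policyaction_classification
from utils.netzero_classifier import load_netzeroClassifier, netzero_classification
from utils.ghg_classifier import load_ghgClassifier, ghg_classification
from utils.conditional_classifier import load_conditionalClassifier, conditional_classification
from utils.indicator_classifier import load_indicatorClassifier, indicator_classification
from utils.sector_classifier import load_sectorClassifier, sector_classification
from utils.adapmit_classifier import load_adapmitClassifier, adapmit_classification

cfg = 'paramconfig.cfg'   # assumed to define a MODEL entry per section
df = pd.DataFrame({'text': ["Reduce transport GHG emissions by 40% by 2030, subject to international support."]})

df = target_classification(df, classifier_model=load_targetClassifier(config_file=cfg))              # adds 'Target Label'
df = policyaction_classification(df, classifier_model=load_policyactionClassifier(config_file=cfg))  # adds 'Policy-Action Label'
df = netzero_classification(df, classifier_model=load_netzeroClassifier(config_file=cfg))            # needs 'Target Label'
df = ghg_classification(df, classifier_model=load_ghgClassifier(config_file=cfg))                    # needs 'Target Label'
df = conditional_classification(df, classifier_model=load_conditionalClassifier(config_file=cfg))    # needs both columns
df = indicator_classification(df, classifier_model=load_indicatorClassifier(config_file=cfg))        # needs 'Policy-Action Label'
df = sector_classification(df, classifier_model=load_sectorClassifier(config_file=cfg))
df = adapmit_classification(df, classifier_model=load_adapmitClassifier(config_file=cfg))
print(df.columns.tolist())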
 
utils/uploadAndExample.py DELETED
@@ -1,39 +0,0 @@
- import streamlit as st
- import tempfile
- import json
-
- def add_upload(choice):
-     """
-     Provides the user with the choice to either 'Upload Document' or 'Try Example'.
-     Based on the user's choice, runs the streamlit processes and saves the path
-     and name of the 'file' to the streamlit session_state, from which they can
-     be fetched later.
-
-     """
-
-     if choice == 'Upload Document':
-
-         # if 'filename' in st.session_state:
-         #     # Delete all the items in Session state
-         #     for key in st.session_state.keys():
-         #         del st.session_state[key]
-
-         uploaded_file = st.sidebar.file_uploader('Upload the File',
-                                                  type=['pdf', 'docx', 'txt'])
-         if uploaded_file is not None:
-             with tempfile.NamedTemporaryFile(mode="wb", delete = False) as temp:
-                 bytes_data = uploaded_file.getvalue()
-                 temp.write(bytes_data)
-                 st.session_state['filename'] = uploaded_file.name
-                 st.session_state['filepath'] = temp.name
-
-
-     else:
-         # listing the options
-         with open('docStore/sample/files.json','r') as json_file:
-             files = json.load(json_file)
-
-         option = st.sidebar.selectbox('Select the example document',
-                                       list(files.keys()))
-         file_name = file_path = files[option]
-         st.session_state['filename'] = file_name
-         st.session_state['filepath'] = file_path
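
add_upload expects docStore/sample/files.json to map a display name (shown in the selectbox) to the path of the corresponding sample file; the same value is stored as both filename and filepath. A minimal illustrative layout, with made-up entries since the sample files themselves are not part of this commit:

# Create an illustrative docStore/sample/files.json; names and paths are placeholders.
import json
import os

os.makedirs('docStore/sample', exist_ok=True)
files = {
    "Example NDC (2021)": "docStore/sample/example_ndc.pdf",
    "Example Climate Strategy": "docStore/sample/example_strategy.pdf",
}
with open('docStore/sample/files.json', 'w') as f:
    json.dump(files, f, indent=2)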