"""Utility functions for a Streamlit job-posting analyzer: text cleaning with
neattext, category prediction with a ktrain DistilBERT predictor, and skill
extraction with skillNer."""

import collections
import subprocess
import sys

import ktrain
import pandas as pd
import streamlit as st
from annotated_text import annotation
from neattext import TextFrame


# Mapping from model output indices to human-readable category names.
label_path = "./data/labels.txt"
cols = ['cat', 'code']
label_df = pd.read_csv(label_path, names=cols, header=0)


def default_text():
    """Return the bundled sample text used to pre-fill the input box."""
    with open("./data/sample.txt", 'r') as fs:
        text = fs.read()
    return text

@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def load_model():
    """Load the fine-tuned DistilBERT predictor once and cache it across reruns."""
    model_path = "./models/distilbert/"
    model = ktrain.load_predictor(model_path)
    return model

@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def load_skill_extractor():
    """Build the skillNer extractor; st.cache makes this run only on the first call."""
    # Imports are kept local so the heavy spaCy model is only loaded on demand.
    import spacy
    from spacy.matcher import PhraseMatcher

    from skillNer.general_params import SKILL_DB
    from skillNer.skill_extractor_class import SkillExtractor

    print('load model')
    nlp = spacy.load('en_core_web_lg')

    print('load matcher')
    skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)
    return skill_extractor



def clean_text(text):
    """Strip noise (emails, URLs, dates, HTML tags, numbers, punctuation,
    stopwords, special characters) from `text` before classification."""
    try:
        docx = TextFrame(text)
        result = (docx.remove_emails()
                      .remove_urls()
                      .remove_dates()
                      .remove_html_tags()
                      .remove_numbers()
                      .remove_puncts()
                      .remove_stopwords()
                      .remove_special_characters())
        return result.text
    except Exception as e:
        print(e)
        return None


def predict_cat(model, text):
    """Return (confidence %, category name) for the highest-probability class."""
    logits = model.predict(text, return_proba=True)
    prob = int(logits.max() * 100)
    cat = label_df.iloc[logits.argmax()].values[0]
    return prob, cat


def grouper(iterable):
    """Yield runs of consecutive integers from an already-sorted iterable."""
    prev = None
    group = []
    for item in iterable:
        # `prev is None` (rather than `not prev`) so that a leading index of 0
        # is not merged into a non-consecutive run.
        if prev is None or item - prev <= 1:
            group.append(item)
        else:
            yield group
            group = [item]
        prev = item
    if group:
        yield group
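
# Illustration (hypothetical input): consecutive token indices collapse into
# runs, which is what lets non-annotated spans be rebuilt as contiguous strings.
# >>> list(grouper([0, 1, 2, 5, 6, 9]))
# [[0, 1, 2], [5, 6], [9]]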


def get_skill(annotations, skill_extractor):
    """Collect soft- and hard-skill surface forms from skillNer annotations and
    return them as one deduplicated, space-separated string.

    `skill_extractor` must be passed in explicitly (e.g. the object returned by
    load_skill_extractor); its skills_db maps skill ids to metadata."""
    try:
        soft_skill = []
        hard_skill = []
        for item in annotations['results']['ngram_scored']:
            skill_id = item['skill_id']
            skill_type = skill_extractor.skills_db[skill_id]['skill_type']
            if skill_type == 'Soft Skill' and item['doc_node_value']:
                soft_skill.append(item['doc_node_value'])
            if skill_type == 'Hard Skill' and item['doc_node_value']:
                hard_skill.append(item['doc_node_value'])
        sk = " ".join(set(soft_skill))
        hk = " ".join(set(hard_skill))
        return (sk + " " + hk).strip()
    except Exception as e:
        print(e)
        return None


def install(package):
    """Install `package` with pip, using sys.executable so the dependency
    lands in the same environment as the running interpreter."""
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])


def create_ann_list(text, results):
    """Turn skillNer match results into a position-ordered mix of annotation
    objects (matched skills) and plain strings (everything else)."""
    try:
        from skillNer.general_params import SKILL_DB
    except ImportError:
        # Install skillNer on first use, then retry the import.
        install('skillner')
        from skillNer.general_params import SKILL_DB

    type_to_color = {'Hard Skill': '#faa',
                     'Soft Skill': '#afa',
                     'Certification': '#ff4'}
    text_tokens = text.split(' ')
    annots = {}
    all_res = results['ngram_scored'] + results['full_matches']
    ids_done = []
    # Create annotations from matches, keyed by the first token index of each span.
    for match in all_res:
        id_ = match['skill_id']
        type_ = SKILL_DB[id_]['skill_type']
        span_str = ' '.join([text_tokens[i] for i in match['doc_node_id']])
        annot = annotation(span_str, type_, background=type_to_color[type_],
                           color="#333", margin='2px')
        annots[match['doc_node_id'][0]] = annot
        ids_done.extend(match['doc_node_id'])
    # Plain strings for the runs of tokens that were not matched.
    non_match_ids = [i for i, _ in enumerate(text_tokens) if i not in ids_done]
    for group in grouper(non_match_ids):
        annots[group[0]] = ' '.join([text_tokens[i] for i in group])
    # Re-order everything by token position.
    return collections.OrderedDict(sorted(annots.items())).values()
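
# Output-shape sketch (hypothetical match): for "python developer wanted" with
# "python" matched as a Hard Skill, create_ann_list returns roughly
#   [annotation('python', 'Hard Skill', ...), 'developer wanted']
# which a caller can splat into annotated_text(*create_ann_list(...)).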


def create_dfs(results):
    """Build two DataFrames from skillNer results: one for exact full matches,
    one for scored n-gram (partial) matches."""
    try:
        from skillNer.general_params import SKILL_DB
    except ImportError:
        # Install skillNer on first use, then retry the import.
        install('skillner')
        from skillNer.general_params import SKILL_DB

    f_arr = []
    for match in results['full_matches']:
        id_ = match['skill_id']
        f_arr.append([id_, SKILL_DB[id_]['skill_name'],
                      SKILL_DB[id_]['skill_type']])
    s_arr = []
    for match in results['ngram_scored']:
        id_ = match['skill_id']
        s_arr.append([id_, SKILL_DB[id_]['skill_name'],
                      SKILL_DB[id_]['skill_type'], match['score']])
    full_df = pd.DataFrame(
        f_arr, columns=['skill id', 'skill name', 'skill type'])
    sub_df = pd.DataFrame(
        s_arr, columns=['skill id', 'skill name', 'skill type', 'score'])
    return full_df, sub_df
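

# A minimal sketch of how the helpers above could be wired together in a
# Streamlit page. This is illustrative only (the module does not define the
# app's actual UI), and _demo is defined but never called.
def _demo():
    from annotated_text import annotated_text  # renders annotation lists

    st.title("Job posting analyzer")
    raw = st.text_area("Paste a job description", default_text())
    if st.button("Analyze"):
        model = load_model()
        skill_extractor = load_skill_extractor()
        prob, cat = predict_cat(model, clean_text(raw))
        st.write(f"Predicted category: {cat} ({prob}% confidence)")
        annotations = skill_extractor.annotate(raw)
        annotated_text(*create_ann_list(raw, annotations['results']))
        full_df, sub_df = create_dfs(annotations['results'])
        st.dataframe(full_df)
        st.dataframe(sub_df)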