File size: 3,947 Bytes
91ec262
 
 
 
aba59d0
 
91ec262
 
 
 
 
 
 
 
aba59d0
 
91ec262
 
aba59d0
 
91ec262
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aba59d0
91ec262
 
 
aba59d0
91ec262
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aba59d0
 
cdd41d2
 
 
 
aba59d0
 
cdd41d2
91ec262
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aba59d0
 
 
 
 
 
 
 
 
91ec262
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aba59d0
91ec262
 
 
 
aba59d0
 
91ec262
 
 
 
 
 
 
aba59d0
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import sys
import subprocess
import streamlit as st
import numpy as np
import ast
# from annotated_text import annotation
import collections
import ktrain
import pandas as pd
import os
import neattext.functions as nfx


# Data files: label-code mapping and the per-category top hard-skills table.
label_path = "./data/labels.txt"
top_skills = "./data/top_50_hard_skills.csv"

cols = ['cat', 'code']
skcols = ['cat', 'skills']
label_df = pd.read_csv(label_path, names=cols, header=0)
top_skill_df = pd.read_csv(top_skills, names=skcols, header=0)


def default_text():
    """Return the bundled sample text used to pre-fill the app's input box."""
    with open("./data/sample.txt", 'r') as sample:
        return sample.read()

@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def load_model():
    """Load the ktrain distilbert predictor from disk (cached by Streamlit)."""
    return ktrain.load_predictor("./models/distilbert/")

@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def load_skill_extractor():
    """Build a skillNer SkillExtractor over spaCy's large English model.

    Cached by Streamlit, so the heavy imports and model load happen only on
    the first call; imports are deferred into the function body for the same
    reason.
    """
    import spacy
    from spacy.matcher import PhraseMatcher

    from skillNer.general_params import SKILL_DB
    from skillNer.skill_extractor_class import SkillExtractor

    language_model = spacy.load('en_core_web_lg')
    return SkillExtractor(language_model, SKILL_DB, PhraseMatcher)



def clean_text(text):
    """Strip noise from *text* with neattext: emails, URLs, dates, HTML tags,
    numbers, punctuation, stopwords, and special characters.

    Returns the cleaned string, or None if any step of the pipeline fails
    (best-effort: the error is printed, not raised).
    """
    try:
        frame = nfx.TextFrame(text)
        frame = frame.remove_emails().remove_urls().remove_dates()
        frame = frame.remove_html_tags().remove_numbers().remove_puncts()
        frame = frame.remove_stopwords().remove_special_characters()
        return frame.text
    except Exception as err:
        print(err)
        return None


def predict_cat(model, text):
    """Classify *text* with *model* and map the top class index to a label.

    Returns a (probability_percent, category_name) pair, where the category
    comes from the module-level ``label_df`` lookup table.
    """
    probabilities = model.predict(text, return_proba=True)
    confidence = int(probabilities.max() * 100)
    category = label_df.iloc[probabilities.argmax()].values[0]
    return confidence, category


def grouper(iterable):
    """Yield runs of consecutive numbers from *iterable* as lists.

    An item within distance 1 of its predecessor joins the current run; a
    larger gap starts a new run. Assumes the input is sorted ascending.

    Bug fix: the previous check used truthiness (``not prev``), so a
    preceding value of 0 was treated as "no previous item" and the run was
    never split after a 0; compare against the None sentinel instead.
    """
    prev = None
    group = []
    for item in iterable:
        if prev is None or item - prev <= 1:
            group.append(item)
        else:
            yield group
            group = [item]
        prev = item
    if group:
        yield group


def get_match(job_cat, cv_skills):
    """Return the percentage of *job_cat*'s top skills found in *cv_skills*.

    Looks up the category's skill list in the module-level ``top_skill_df``,
    parses it from its stored string form with ``ast.literal_eval``, and
    intersects it with the candidate's skills.

    Bug fix: the ratio previously divided by a hard-coded 10 even though the
    real denominator ``len(top_skills)`` was computed and left unused — with
    more than 10 matches the score exceeded 100%. Divide by the actual top-
    skill count, guarding against an empty set.
    """
    skills = top_skill_df[top_skill_df['cat'] == job_cat]['skills']
    top_skills = set(ast.literal_eval(",".join(skills)))
    if not top_skills:
        return 0.0
    matched_skills = top_skills.intersection(set(cv_skills))
    return round(len(matched_skills) / len(top_skills) * 100, 2)


def install(package):
    """Install *package* into the running interpreter's environment via pip."""
    pip_cmd = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_cmd)



def create_dfs(results):
    """Collect the unique hard-skill names from a skillNer annotation dict.

    *results* is presumably the dict returned by SkillExtractor.annotate()
    (confirm against the caller): both the exact matches ('full_matches')
    and the fuzzy matches ('ngram_scored') are scanned, and every skill
    whose SKILL_DB type is 'Hard Skill' is kept.

    Returns a de-duplicated list of hard-skill names (order unspecified).

    Fixes: the bare ``except:`` now catches only ImportError; the on-the-fly
    installation uses this module's ``install()`` helper instead of a raw
    ``os.system`` shell string; the two copy-pasted match loops are merged
    and the unused ``score`` extraction is dropped.
    """
    try:
        from skillNer.general_params import SKILL_DB
    except ImportError:
        # skillNer is missing — install it, then retry the import.
        install('skillner')
        from skillNer.general_params import SKILL_DB

    hard_skills = set()
    # Both match lists carry dicts with a 'skill_id' key into SKILL_DB.
    for source in ('full_matches', 'ngram_scored'):
        for match in results[source]:
            entry = SKILL_DB[match['skill_id']]
            if entry['skill_type'] == 'Hard Skill':
                hard_skills.add(entry['skill_name'])
    return list(hard_skills)