# job_skill_cat/utils.py
import sys
import subprocess
import streamlit as st
import numpy as np
from annotated_text import annotation
import collections
import ktrain
import pandas as pd
import os
import neattext.functions as nfx
label_path = ("./data/labels.txt")
cols = ['cat', 'code']
label_df = pd.read_csv(label_path, names=cols, header=0)
def default_text():
    with open("./data/sample.txt", 'r') as fs:
        text = fs.read()
    return text
@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def load_model():
    model_path = "./models/distilbert/"
    model = ktrain.load_predictor(model_path)
    return model
@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def load_skill_extractor():
    # This function will only be run the first time it's called
    import spacy
    from skillNer.skill_extractor_class import SkillExtractor
    from skillNer.general_params import SKILL_DB
    from spacy.matcher import PhraseMatcher
    # init params of skill extractor
    print('load model')
    nlp = spacy.load('en_core_web_lg')
    print('load matcher')
    # init skill extractor
    skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)
    return skill_extractor
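# Note: spacy.load('en_core_web_lg') assumes the large English model is
# already installed in the environment; if it is missing, it can be
# fetched with:
#   python -m spacy download en_core_web_lg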
def clean_text(text):
    try:
        docx = nfx.TextFrame(text)
        # strip noise (emails, URLs, dates, HTML tags, numbers, punctuation,
        # stopwords, special characters) before classification
        result = (docx.remove_emails()
                      .remove_urls()
                      .remove_dates()
                      .remove_html_tags()
                      .remove_numbers()
                      .remove_puncts()
                      .remove_stopwords()
                      .remove_special_characters())
        # doc = nlp(result.text)
        # empty_list = []
        # for token in doc:
        #     empty_list.append(token.lemma_)
        # final_string = ' '.join(map(str, empty_list))
        return result.text
    except Exception as e:
        print(e)
        return None
def predict_cat(model, text):
    # with return_proba=True, predict returns one probability per label
    logits = model.predict(text, return_proba=True)
    prob = int(logits.max() * 100)
    cat = label_df.iloc[logits.argmax()].values[0]
    return prob, cat
def grouper(iterable):
    # group runs of consecutive (or adjacent) integers together
    prev = None
    group = []
    for item in iterable:
        # compare against None explicitly: `not prev` misfires when prev == 0
        if prev is None or item - prev <= 1:
            group.append(item)
        else:
            yield group
            group = [item]
        prev = item
    if group:
        yield group
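# Illustrative example of grouper on token indices:
#   list(grouper([1, 2, 3, 7, 8, 10])) -> [[1, 2, 3], [7, 8], [10]]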
def get_skill(annotations):
    try:
        # annotations = skill_extractor.annotate(text, tresh=0.5)
        # skill_dict = {"Soft Skill": [], "Hard Skill": []}
        from skillNer.general_params import SKILL_DB
        soft_skill = []
        hard_skill = []
        for item in annotations['results']['ngram_scored']:
            skill_id = item['skill_id']
            # look the skill type up in SKILL_DB directly, rather than via a
            # global skill_extractor that is not defined in this module
            skill_type = SKILL_DB[skill_id]['skill_type']
            if skill_type == 'Soft Skill' and item['doc_node_value']:
                soft_skill.append(item['doc_node_value'])
            if skill_type == 'Hard Skill':
                hard_skill.append(item['doc_node_value'])
        # skill_dict['Soft Skill'] = set(soft_skill)
        sk = " ".join(set(soft_skill))
        hk = " ".join(set(hard_skill))
        # st.write(skill_extractor.describe(annotations))
        # join with a space so the two skill strings don't run together
        return (sk + " " + hk).strip()
    except Exception as e:
        return None
def install(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])
def create_ann_list(text, results):
    try:
        from skillNer.general_params import SKILL_DB
    except ImportError:
        # install skillner if not done yet
        install('skillner')
        from skillNer.general_params import SKILL_DB
    type_to_color = {'Hard Skill': "#faa",
                     'Soft Skill': '#afa', 'Certification': '#ff4'}
    text_tokens = text.split(' ')
    annots = {}
    all_res = results['ngram_scored'] + results['full_matches']
    ids_done = []
    # create annotations from matches
    for match in all_res:
        id_ = match['skill_id']
        type_ = SKILL_DB[id_]['skill_type']
        span_str = ' '.join([text_tokens[i] for i in match['doc_node_id']])
        annot = annotation(span_str, type_, background=type_to_color[type_],
                           color="#333", margin='2px')
        annots[match['doc_node_id'][0]] = annot
        for i in match['doc_node_id']:
            ids_done.append(i)
    # create strs for non annotated text
    non_match_ids = [i for i, _ in enumerate(text_tokens) if i not in ids_done]
    dict_ = dict(enumerate(grouper(non_match_ids), 1))
    for v in dict_.values():
        span = ' '.join([text_tokens[i] for i in v])
        annots[v[0]] = span
        # annotation(token, color="#fff", background="transparent")
    print(dict_)
    print('-----')
    # print(collections.OrderedDict(sorted(annots.items())))
    # sort by token position so annotations render in document order
    annots_ = collections.OrderedDict(sorted(annots.items())).values()
    return annots_
def create_dfs(results):
    try:
        from skillNer.general_params import SKILL_DB
    except ImportError:
        # install skillner if not done yet
        install('skillner')
        from skillNer.general_params import SKILL_DB
    # full matches: exact phrase hits, no score attached
    f_matches = results['full_matches']
    f_arr = []
    for match in f_matches:
        id_ = match['skill_id']
        full_name = SKILL_DB[id_]['skill_name']
        type_ = SKILL_DB[id_]['skill_type']
        f_arr.append([id_, full_name, type_])
    # ngram-scored matches: fuzzy hits that carry a confidence score
    s_matches = results['ngram_scored']
    s_arr = []
    for match in s_matches:
        id_ = match['skill_id']
        full_name = SKILL_DB[id_]['skill_name']
        type_ = SKILL_DB[id_]['skill_type']
        score = match['score']
        s_arr.append([id_, full_name, type_, score])
    full_df = pd.DataFrame(
        f_arr, columns=['skill id', 'skill name', 'skill type'])
    sub_df = pd.DataFrame(
        s_arr, columns=['skill id', 'skill name', 'skill type', 'score'])
    return full_df, sub_df
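# A minimal usage sketch tying the helpers together, assuming the model
# directory, label file, and sample text referenced above exist. The
# tresh value passed to annotate() mirrors the commented call in
# get_skill and is illustrative, not tuned.
if __name__ == "__main__":
    model = load_model()
    skill_extractor = load_skill_extractor()
    text = default_text()
    cleaned = clean_text(text)
    prob, cat = predict_cat(model, cleaned)
    print(f"Predicted category: {cat} ({prob}%)")
    annotations = skill_extractor.annotate(text, tresh=0.5)
    print(get_skill(annotations))
    full_df, sub_df = create_dfs(annotations['results'])
    print(full_df.head())
    print(sub_df.head())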