import collections
import os
import subprocess
import sys

import ktrain
import neattext.functions as nfx
import numpy as np
import pandas as pd
import streamlit as st
from annotated_text import annotation
# Category labels for the classifier, one row per class
label_path = "./data/labels.txt"
cols = ['cat', 'code']
label_df = pd.read_csv(label_path, names=cols, header=0)
def default_text():
    with open("./data/sample.txt", 'r') as fs:
        text = fs.read()
    return text
def load_model():
    # Load the fine-tuned DistilBERT classifier saved with ktrain
    model_path = "./models/distilbert/"
    model = ktrain.load_predictor(model_path)
    return model
def load_skill_extractor():
    # Imported lazily so the heavy spaCy/skillNer stack is only loaded on first
    # use (in the app this is presumably wrapped in a Streamlit cache decorator,
    # which is why it "will only be run the first time it's called").
    import spacy
    from skillNer.skill_extractor_class import SkillExtractor
    from skillNer.general_params import SKILL_DB
    from spacy.matcher import PhraseMatcher

    # Init params of the skill extractor
    print('load model')
    nlp = spacy.load('en_core_web_lg')
    print('load matcher')
    # Init skill extractor
    skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)
    return skill_extractor
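# A minimal usage sketch, kept as comments so nothing runs at import time;
# `job_text` is a hypothetical variable holding the posting to analyze, and the
# annotate() call mirrors the one noted in get_skill below:
# skill_extractor = load_skill_extractor()
# annotations = skill_extractor.annotate(job_text)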
def clean_text(text):
    # Strip noise (emails, URLs, dates, HTML tags, numbers, punctuation,
    # stopwords, special characters) before classification
    try:
        docx = nfx.TextFrame(text)
        result = (docx.remove_emails()
                      .remove_urls()
                      .remove_dates()
                      .remove_html_tags()
                      .remove_numbers()
                      .remove_puncts()
                      .remove_stopwords()
                      .remove_special_characters())
        # Optional lemmatization step, currently disabled:
        # doc = nlp(result.text)
        # final_string = ' '.join(token.lemma_ for token in doc)
        return result.text
    except Exception as e:
        print(e)
        return None
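# Usage sketch (comments only; the exact output depends on neattext's filters,
# and the sample string is illustrative):
# cleaned = clean_text("Contact me at jane@corp.com about the Python role!")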
def predict_cat(model, text):
    # return_proba=True makes the ktrain predictor return class probabilities
    probs = model.predict(text, return_proba=True)
    prob = int(probs.max() * 100)
    cat = label_df.iloc[probs.argmax()].values[0]
    return prob, cat
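# Usage sketch (comments only; assumes the model directory and sample text
# shipped with the Space are present on disk):
# model = load_model()
# prob, cat = predict_cat(model, clean_text(default_text()))
# st.write(f"{cat} ({prob}%)")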
def grouper(iterable):
    # Yield runs of consecutive integers, e.g. [1, 2, 3, 7, 8] -> [1, 2, 3], [7, 8].
    # `prev is None` (rather than `not prev`) so a leading index of 0 is handled.
    prev = None
    group = []
    for item in iterable:
        if prev is None or item - prev <= 1:
            group.append(item)
        else:
            yield group
            group = [item]
        prev = item
    if group:
        yield group
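# Quick check of grouper (doctest-style, illustrative indices only):
# >>> list(grouper([0, 1, 2, 5, 6, 9]))
# [[0, 1, 2], [5, 6], [9]]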
def get_skill(annotations, skill_extractor):
    # Collect soft/hard skills from the scored n-gram matches and return them as
    # one string. `skill_extractor` is now an explicit parameter (the original
    # code referenced it as a global); `annotations` comes from
    # skill_extractor.annotate(text, tresh=0.5).
    try:
        soft_skill = []
        hard_skill = []
        for item in annotations['results']['ngram_scored']:
            skill_id = item['skill_id']
            skill_type = skill_extractor.skills_db[skill_id]['skill_type']
            if skill_type == 'Soft Skill' and item['doc_node_value']:
                soft_skill.append(item['doc_node_value'])
            if skill_type == 'Hard Skill':
                hard_skill.append(item['doc_node_value'])
        sk = " ".join(set(soft_skill))
        hk = " ".join(set(hard_skill))
        # Join with a space so the last soft skill and first hard skill don't fuse
        return (sk + " " + hk).strip()
    except Exception:
        return None
def install(package):
    # Install a package into the current interpreter's environment
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])
def create_ann_list(text, results):
    try:
        from skillNer.general_params import SKILL_DB
    except ImportError:
        # Install skillner into this interpreter if it's missing, then retry
        install('skillner')
        from skillNer.general_params import SKILL_DB
    type_to_color = {'Hard Skill': '#faa',
                     'Soft Skill': '#afa',
                     'Certification': '#ff4'}
    text_tokens = text.split(' ')
    annots = {}
    all_res = results['ngram_scored'] + results['full_matches']
    ids_done = []
    # Create annotations from matches
    for match in all_res:
        id_ = match['skill_id']
        type_ = SKILL_DB[id_]['skill_type']
        span_str = ' '.join([text_tokens[i] for i in match['doc_node_id']])
        annot = annotation(span_str, type_, background=type_to_color[type_],
                           color="#333", margin='2px')
        annots[match['doc_node_id'][0]] = annot
        for i in match['doc_node_id']:
            ids_done.append(i)
    # Create plain strings for the non-annotated stretches of text
    non_match_ids = [i for i, _ in enumerate(text_tokens) if i not in ids_done]
    dict_ = dict(enumerate(grouper(non_match_ids), 1))
    for v in dict_.values():
        span = ' '.join([text_tokens[i] for i in v])
        annots[v[0]] = span
    # Interleave annotated and plain spans in document order
    annots_ = collections.OrderedDict(sorted(annots.items())).values()
    return annots_
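# Rendering sketch (comments only; `annotated_text` is the companion renderer
# to `annotation` in the st-annotated-text package, and `job_text`/`annotations`
# are hypothetical):
# from annotated_text import annotated_text
# annotated_text(*create_ann_list(job_text, annotations['results']))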
def create_dfs(results):
    try:
        from skillNer.general_params import SKILL_DB
    except ImportError:
        # Install skillner into this interpreter if it's missing, then retry
        install('skillner')
        from skillNer.general_params import SKILL_DB
    # Exact (full) matches
    f_arr = []
    for match in results['full_matches']:
        id_ = match['skill_id']
        full_name = SKILL_DB[id_]['skill_name']
        type_ = SKILL_DB[id_]['skill_type']
        f_arr.append([id_, full_name, type_])
    # Scored n-gram (partial) matches
    s_arr = []
    for match in results['ngram_scored']:
        id_ = match['skill_id']
        full_name = SKILL_DB[id_]['skill_name']
        type_ = SKILL_DB[id_]['skill_type']
        score = match['score']
        s_arr.append([id_, full_name, type_, score])
    full_df = pd.DataFrame(
        f_arr, columns=['skill id', 'skill name', 'skill type'])
    sub_df = pd.DataFrame(
        s_arr, columns=['skill id', 'skill name', 'skill type', 'score'])
    return full_df, sub_df
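# Display sketch (comments only; reuses the hypothetical `annotations` dict
# from the sketches above):
# full_df, sub_df = create_dfs(annotations['results'])
# st.dataframe(full_df)
# st.dataframe(sub_df)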