import collections
import os
import subprocess
import sys

import ktrain
import neattext.functions as nfx
import numpy as np
import pandas as pd
import streamlit as st
from annotated_text import annotation
# Category labels for the classifier: a two-column CSV of
# (category name, label code).
label_path = "./data/labels.txt"
cols = ['cat', 'code']
label_df = pd.read_csv(label_path, names=cols, header=0)
def default_text():
    # Load the bundled sample text shown on first run.
    with open("./data/sample.txt", 'r') as fs:
        text = fs.read()
    return text
@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def load_model():
    # Load the fine-tuned DistilBERT predictor once and cache it
    # across Streamlit reruns.
    model_path = "./models/distilbert/"
    model = ktrain.load_predictor(model_path)
    return model
@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def load_skill_extractor():
    # This function will only be run the first time it's called.
    import spacy
    from skillNer.skill_extractor_class import SkillExtractor
    from skillNer.general_params import SKILL_DB
    from spacy.matcher import PhraseMatcher

    # Init params of skill extractor.
    print('load model')
    nlp = spacy.load('en_core_web_lg')
    print('load matcher')
    # Init skill extractor.
    skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)
    return skill_extractor
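# The extractor is used downstream roughly as sketched here: skillNer's
# annotate() returns a dict whose 'results' key holds the 'full_matches'
# and 'ngram_scored' lists that the helpers below rely on.
#     skill_extractor = load_skill_extractor()
#     annotations = skill_extractor.annotate(text)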
def clean_text(text):
    # Strip noise (emails, URLs, dates, HTML, numbers, punctuation,
    # stopwords, special characters) before classification.
    try:
        docx = nfx.TextFrame(text)
        result = (docx.remove_emails()
                      .remove_urls()
                      .remove_dates()
                      .remove_html_tags()
                      .remove_numbers()
                      .remove_puncts()
                      .remove_stopwords()
                      .remove_special_characters())
        # (Optional spaCy lemmatization was drafted here but left disabled.)
        return result.text
    except Exception as e:
        print(e)
        return None
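# For example, clean_text("Email me at jane@corp.com in 2021!") drops the
# address, the number, the punctuation, and the stopwords, leaving roughly
# "Email" (the exact output depends on neattext's stopword list).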
def predict_cat(model, text):
    # Classify the text; return the top category and its confidence
    # as an integer percentage.
    probs = model.predict(text, return_proba=True)
    prob = int(probs.max() * 100)
    cat = label_df.iloc[probs.argmax()].values[0]
    return prob, cat
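# Typical call (a sketch; `model` comes from load_model() above):
#     prob, cat = predict_cat(load_model(), clean_text(default_text()))
#     st.write(f"{cat} ({prob}%)")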
def grouper(iterable):
    # Group consecutive integers (token indices) into runs.
    prev = None
    group = []
    for item in iterable:
        # `prev is None` rather than `not prev`, so that index 0 starts a
        # group correctly instead of being treated as "no previous item".
        if prev is None or item - prev <= 1:
            group.append(item)
        else:
            yield group
            group = [item]
        prev = item
    if group:
        yield group
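# Worked example:
#     >>> list(grouper([0, 1, 2, 7, 8, 15]))
#     [[0, 1, 2], [7, 8], [15]]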
def get_skill(annotations):
    # Collect the unique soft and hard skills found by skillNer. Relies on
    # the module-level `skill_extractor` created via load_skill_extractor().
    try:
        soft_skill = []
        hard_skill = []
        for item in annotations['results']['ngram_scored']:
            skill_id = item['skill_id']
            skill_type = skill_extractor.skills_db[skill_id]['skill_type']
            if skill_type == 'Soft Skill' and item['doc_node_value']:
                soft_skill.append(item['doc_node_value'])
            if skill_type == 'Hard Skill':
                hard_skill.append(item['doc_node_value'])
        sk = " ".join(set(soft_skill))
        hk = " ".join(set(hard_skill))
        # Join with a space so the last soft skill and the first hard
        # skill don't run together.
        return sk + " " + hk
    except Exception as e:
        print(e)
        return None
def install(package):
    # Install a missing package into the current interpreter's environment.
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])
def create_ann_list(text, results):
    try:
        from skillNer.general_params import SKILL_DB
    except ImportError:
        # Install skillNer if it is not available yet, then retry.
        install('skillner')
        from skillNer.general_params import SKILL_DB
    type_to_color = {'Hard Skill': '#faa',
                     'Soft Skill': '#afa',
                     'Certification': '#ff4'}
    text_tokens = text.split(' ')
    annots = {}
    all_res = results['ngram_scored'] + results['full_matches']
    ids_done = []
    # Create annotations from matches, keyed by their first token index.
    for match in all_res:
        id_ = match['skill_id']
        type_ = SKILL_DB[id_]['skill_type']
        span_str = ' '.join([text_tokens[i] for i in match['doc_node_id']])
        annot = annotation(span_str, type_, background=type_to_color[type_],
                           color="#333", margin='2px')
        annots[match['doc_node_id'][0]] = annot
        for i in match['doc_node_id']:
            ids_done.append(i)
    # Create plain strings for the non-annotated stretches of text.
    non_match_ids = [i for i, _ in enumerate(text_tokens) if i not in ids_done]
    dict_ = dict(enumerate(grouper(non_match_ids), 1))
    for v in dict_.values():
        span = ' '.join([text_tokens[i] for i in v])
        annots[v[0]] = span
    # Sort by first token index so the pieces read back in document order.
    annots_ = collections.OrderedDict(sorted(annots.items())).values()
    return annots_
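# The ordered mix of annotation() objects and plain strings returned here is
# shaped for the st-annotated-text component, presumably rendered elsewhere
# in this app along these lines:
#     from annotated_text import annotated_text
#     annotated_text(*create_ann_list(text, annotations['results']))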
def create_dfs(results):
    try:
        from skillNer.general_params import SKILL_DB
    except ImportError:
        # Install skillNer if it is not available yet, then retry.
        install('skillner')
        from skillNer.general_params import SKILL_DB
    # Exact (full) matches carry an id, name, and type.
    f_matches = results['full_matches']
    f_arr = []
    for match in f_matches:
        id_ = match['skill_id']
        full_name = SKILL_DB[id_]['skill_name']
        type_ = SKILL_DB[id_]['skill_type']
        f_arr.append([id_, full_name, type_])
    # Scored n-gram matches additionally carry a confidence score.
    s_matches = results['ngram_scored']
    s_arr = []
    for match in s_matches:
        id_ = match['skill_id']
        full_name = SKILL_DB[id_]['skill_name']
        type_ = SKILL_DB[id_]['skill_type']
        score = match['score']
        s_arr.append([id_, full_name, type_, score])
    full_df = pd.DataFrame(
        f_arr, columns=['skill id', 'skill name', 'skill type'])
    sub_df = pd.DataFrame(
        s_arr, columns=['skill id', 'skill name', 'skill type', 'score'])
    return full_df, sub_df
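# A minimal sketch of how these helpers are presumably wired together in the
# Streamlit app body (the remainder of this file is not shown here, so the
# exact calls and names below are assumptions, not the app's actual code):
#
#     model = load_model()
#     skill_extractor = load_skill_extractor()
#     text = st.text_area("Job description", default_text())
#     prob, cat = predict_cat(model, clean_text(text))
#     annotations = skill_extractor.annotate(text)
#     full_df, sub_df = create_dfs(annotations['results'])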