# job_skill_cat/utils.py
import sys
import subprocess
import streamlit as st
import numpy as np
from annotated_text import annotation
import collections
import ktrain
import pandas as pd
import os
import neattext.functions as nfx
label_path = ("./data/labels.txt")
cols = ['cat', 'code']
label_df = pd.read_csv(label_path, names=cols, header=0)
def default_text():
    with open("./data/sample.txt", 'r') as fs:
        text = fs.read()
    return text
@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def load_model():
    model_path = "./models/distilbert/"
    model = ktrain.load_predictor(model_path)
    return model
@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def load_skill_extractor():
    # This function will only be run the first time it's called
    import spacy
    from skillNer.skill_extractor_class import SkillExtractor
    from skillNer.general_params import SKILL_DB
    from spacy.matcher import PhraseMatcher
    # init params of skill extractor
    print('load model')
    nlp = spacy.load('en_core_web_lg')
    print('load matcher')
    # init skill extractor
    skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)
    return skill_extractor
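# Note: spacy.load('en_core_web_lg') assumes the large English model is
# already installed in the environment; if it is missing, it can be
# fetched with:
#   python -m spacy download en_core_web_lg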
def clean_text(text):
    try:
        docx = nfx.TextFrame(text)
        # strip noise (emails, URLs, dates, HTML tags, numbers, punctuation,
        # stopwords, special characters) before classification
        result = (docx.remove_emails()
                      .remove_urls()
                      .remove_dates()
                      .remove_html_tags()
                      .remove_numbers()
                      .remove_puncts()
                      .remove_stopwords()
                      .remove_special_characters())
        # doc = nlp(result.text)
        # empty_list = []
        # for token in doc:
        #     empty_list.append(token.lemma_)
        # final_string = ' '.join(map(str, empty_list))
        return result.text
    except Exception as e:
        print(e)
        return None
def predict_cat(model, text):
    # with return_proba=True, predict returns one probability per label
    logits = model.predict(text, return_proba=True)
    prob = int(logits.max() * 100)
    cat = label_df.iloc[logits.argmax()].values[0]
    return prob, cat
def grouper(iterable):
    # group runs of consecutive (or adjacent) integers together
    prev = None
    group = []
    for item in iterable:
        # compare against None explicitly: `not prev` misfires when prev == 0
        if prev is None or item - prev <= 1:
            group.append(item)
        else:
            yield group
            group = [item]
        prev = item
    if group:
        yield group
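# Illustrative example of grouper on token indices:
#   list(grouper([1, 2, 3, 7, 8, 10])) -> [[1, 2, 3], [7, 8], [10]]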
def get_skill(annotations):
    try:
        # annotations = skill_extractor.annotate(text, tresh=0.5)
        # skill_dict = {"Soft Skill": [], "Hard Skill": []}
        from skillNer.general_params import SKILL_DB
        soft_skill = []
        hard_skill = []
        for item in annotations['results']['ngram_scored']:
            skill_id = item['skill_id']
            # look the skill type up in SKILL_DB directly, rather than via a
            # global skill_extractor that is not defined in this module
            skill_type = SKILL_DB[skill_id]['skill_type']
            if skill_type == 'Soft Skill' and item['doc_node_value']:
                soft_skill.append(item['doc_node_value'])
            if skill_type == 'Hard Skill':
                hard_skill.append(item['doc_node_value'])
        # skill_dict['Soft Skill'] = set(soft_skill)
        sk = " ".join(set(soft_skill))
        hk = " ".join(set(hard_skill))
        # st.write(skill_extractor.describe(annotations))
        # join with a space so the two skill strings don't run together
        return (sk + " " + hk).strip()
    except Exception as e:
        return None
def install(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])
def create_ann_list(text, results):
    try:
        from skillNer.general_params import SKILL_DB
    except ImportError:
        # install skillner if not done yet
        install('skillner')
        from skillNer.general_params import SKILL_DB
    type_to_color = {'Hard Skill': "#faa",
                     'Soft Skill': '#afa', 'Certification': '#ff4'}
    text_tokens = text.split(' ')
    annots = {}
    all_res = results['ngram_scored'] + results['full_matches']
    ids_done = []
    # create annotations from matches
    for match in all_res:
        id_ = match['skill_id']
        type_ = SKILL_DB[id_]['skill_type']
        span_str = ' '.join([text_tokens[i] for i in match['doc_node_id']])
        annot = annotation(span_str, type_, background=type_to_color[type_],
                           color="#333", margin='2px')
        annots[match['doc_node_id'][0]] = annot
        for i in match['doc_node_id']:
            ids_done.append(i)
    # create strs for non annotated text
    non_match_ids = [i for i, _ in enumerate(text_tokens) if i not in ids_done]
    dict_ = dict(enumerate(grouper(non_match_ids), 1))
    for v in dict_.values():
        span = ' '.join([text_tokens[i] for i in v])
        annots[v[0]] = span
        # annotation(token, color="#fff", background="transparent")
    print(dict_)
    print('-----')
    # print(collections.OrderedDict(sorted(annots.items())))
    # sort by token position so annotations render in document order
    annots_ = collections.OrderedDict(sorted(annots.items())).values()
    return annots_
def create_dfs(results):
    try:
        from skillNer.general_params import SKILL_DB
    except ImportError:
        # install skillner if not done yet
        install('skillner')
        from skillNer.general_params import SKILL_DB
    # full matches: exact phrase hits, no score attached
    f_matches = results['full_matches']
    f_arr = []
    for match in f_matches:
        id_ = match['skill_id']
        full_name = SKILL_DB[id_]['skill_name']
        type_ = SKILL_DB[id_]['skill_type']
        f_arr.append([id_, full_name, type_])
    # ngram-scored matches: fuzzy hits that carry a confidence score
    s_matches = results['ngram_scored']
    s_arr = []
    for match in s_matches:
        id_ = match['skill_id']
        full_name = SKILL_DB[id_]['skill_name']
        type_ = SKILL_DB[id_]['skill_type']
        score = match['score']
        s_arr.append([id_, full_name, type_, score])
    full_df = pd.DataFrame(
        f_arr, columns=['skill id', 'skill name', 'skill type'])
    sub_df = pd.DataFrame(
        s_arr, columns=['skill id', 'skill name', 'skill type', 'score'])
    return full_df, sub_df
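# A minimal usage sketch tying the helpers together, assuming the model
# directory, label file, and sample text referenced above exist. The
# tresh value passed to annotate() mirrors the commented call in
# get_skill and is illustrative, not tuned.
if __name__ == "__main__":
    model = load_model()
    skill_extractor = load_skill_extractor()
    text = default_text()
    cleaned = clean_text(text)
    prob, cat = predict_cat(model, cleaned)
    print(f"Predicted category: {cat} ({prob}%)")
    annotations = skill_extractor.annotate(text, tresh=0.5)
    print(get_skill(annotations))
    full_df, sub_df = create_dfs(annotations['results'])
    print(full_df.head())
    print(sub_df.head())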