import sys
import subprocess
import os
import collections

import streamlit as st
import numpy as np
import pandas as pd
import ktrain
import neattext.functions as nfx
from annotated_text import annotation

# category labels used by the classifier (one row per category)
label_path = "./data/labels.txt"
cols = ['cat', 'code']
label_df = pd.read_csv(label_path, names=cols, header=0)


def default_text():
    """Return the bundled sample text shown on first load."""
    with open("./data/sample.txt", 'r') as fs:
        text = fs.read()
    return text


@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def load_model():
    """Load the fine-tuned DistilBERT predictor saved by ktrain."""
    model_path = "./models/distilbert/"
    model = ktrain.load_predictor(model_path)
    return model


@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def load_skill_extractor():
    # This function will only be run the first time it's called
    import spacy
    from skillNer.skill_extractor_class import SkillExtractor
    from skillNer.general_params import SKILL_DB
    from spacy.matcher import PhraseMatcher

    # init params of skill extractor
    print('load model')
    nlp = spacy.load('en_core_web_lg')
    print('load matcher')
    # init skill extractor
    skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)
    return skill_extractor


def clean_text(text):
    """Strip emails, URLs, dates, HTML, numbers, punctuation and stopwords."""
    try:
        docx = nfx.TextFrame(text)
        result = (
            docx.remove_emails()
            .remove_urls()
            .remove_dates()
            .remove_html_tags()
            .remove_numbers()
            .remove_puncts()
            .remove_stopwords()
            .remove_special_characters()
        )
        # doc = nlp(result.text)
        # empty_list = []
        # for token in doc:
        #     empty_list.append(token.lemma_)
        # final_string = ' '.join(map(str, empty_list))
        return result.text
    except Exception as e:
        print(e)
        return None


def predict_cat(model, text):
    """Return (confidence in %, category name) for the given text."""
    probs = model.predict(text, return_proba=True)
    prob = int(probs.max() * 100)
    cat = label_df.iloc[probs.argmax()].values[0]
    return prob, cat


def grouper(iterable):
    """Group consecutive integers, e.g. [1, 2, 5, 6] -> [[1, 2], [5, 6]]."""
    prev = None
    group = []
    for item in iterable:
        if prev is None or item - prev <= 1:
            group.append(item)
        else:
            yield group
            group = [item]
        prev = item
    if group:
        yield group


def get_skill(annotations):
    """Collect soft/hard skill surface forms from a skillNer annotation dict.

    Relies on a module-level `skill_extractor` (see load_skill_extractor).
    """
    try:
        # annotations = skill_extractor.annotate(text, tresh=0.5)
        # skill_dict = {"Soft Skill": [], "Hard Skill": []}
        soft_skill = []
        hard_skill = []
        for item in annotations['results']['ngram_scored']:
            skill_id = item['skill_id']
            skill_type = skill_extractor.skills_db[skill_id]['skill_type']
            if skill_type == 'Soft Skill' and item['doc_node_value']:
                soft_skill.append(item['doc_node_value'])
            if skill_type == 'Hard Skill':
                hard_skill.append(item['doc_node_value'])
        # skill_dict['Soft Skill'] = set(soft_skill)
        sk = " ".join(set(soft_skill))
        hk = " ".join(set(hard_skill))
        # st.write(skill_extractor.describe(annotations))
        return (sk + " " + hk).strip()  # keep a space between soft and hard skills
    except Exception as e:
        print(e)
        return None


def install(package):
    """Install a package into the current interpreter's environment."""
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])


def create_ann_list(text, results):
    """Build the argument list for annotated_text(): annotation objects for
    matched skill spans and plain strings for everything in between."""
    try:
        from skillNer.general_params import SKILL_DB
    except ImportError:
        # install skillner if not done yet
        os.system('pip install skillner')
        from skillNer.general_params import SKILL_DB

    type_to_color = {'Hard Skill': "#faa", 'Soft Skill': '#afa', 'Certification': '#ff4'}
    text_tokens = text.split(' ')
    annots = {}
    all_res = results['ngram_scored'] + results['full_matches']
    ids_done = []

    # create annotations from matches
    for match in all_res:
        id_ = match['skill_id']
        type_ = SKILL_DB[id_]['skill_type']
        span_str = ' '.join([text_tokens[i] for i in match['doc_node_id']])
        annot = annotation(span_str, type_,
                           background=type_to_color[type_], color="#333", margin='2px')
        annots[match['doc_node_id'][0]] = annot
        for i in match['doc_node_id']:
            ids_done.append(i)

    # create strs for non annotated text
    non_match_ids = [i for i, _ in enumerate(text_tokens) if i not in ids_done]
    dict_ = dict(enumerate(grouper(non_match_ids), 1))
    for v in dict_.values():
        span = ' '.join([text_tokens[i] for i in v])
        annots[v[0]] = span
        # annotation(token, color="#fff", background="transparent")
    print(dict_)
    print('-----')
    # print(collections.OrderedDict(sorted(annots.items())))
    annots_ = collections.OrderedDict(sorted(annots.items())).values()
    return annots_


def create_dfs(results):
    """Return (full-match df, ngram-scored df) built from skillNer results."""
    try:
        from skillNer.general_params import SKILL_DB
    except ImportError:
        # install skillner if not done yet
        os.system('pip install skillner')
        from skillNer.general_params import SKILL_DB

    f_matches = results['full_matches']
    f_arr = []
    for match in f_matches:
        id_ = match['skill_id']
        full_name = SKILL_DB[id_]['skill_name']
        type_ = SKILL_DB[id_]['skill_type']
        f_arr.append([id_, full_name, type_])

    s_matches = results['ngram_scored']
    s_arr = []
    for match in s_matches:
        id_ = match['skill_id']
        full_name = SKILL_DB[id_]['skill_name']
        type_ = SKILL_DB[id_]['skill_type']
        score = match['score']
        s_arr.append([id_, full_name, type_, score])

    full_df = pd.DataFrame(
        f_arr, columns=['skill id', 'skill name', 'skill type'])
    sub_df = pd.DataFrame(
        s_arr, columns=['skill id', 'skill name', 'skill type', 'score'])
    return full_df, sub_df
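

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): one possible way to wire the helpers above
# into a Streamlit page. The function name, widget labels, button text and
# layout below are assumptions for demonstration, not part of the original app.
# ---------------------------------------------------------------------------
def _demo_page():
    from annotated_text import annotated_text  # renders the annotation spans inline

    model = load_model()
    skill_extractor = load_skill_extractor()

    raw_text = st.text_area("Paste a resume or job description", default_text())
    if st.button("Analyse"):
        # classify the cleaned document into one of the categories in labels.txt
        cleaned = clean_text(raw_text)
        prob, cat = predict_cat(model, cleaned)
        st.write(f"Predicted category: {cat} ({prob}% confidence)")

        # extract skills, highlight them in the text and show them as tables
        annotations = skill_extractor.annotate(raw_text)
        annotated_text(*create_ann_list(raw_text, annotations['results']))
        full_df, sub_df = create_dfs(annotations['results'])
        st.dataframe(full_df)
        st.dataframe(sub_df)


# _demo_page()  # uncomment to render the demo layout with `streamlit run <this file>`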