Basanth committed
Commit aba59d0
1 Parent(s): 1702418

match percentage added

Files changed (1):
  1. utils.py +32 -72

utils.py CHANGED
@@ -2,7 +2,8 @@ import sys
 import subprocess
 import streamlit as st
 import numpy as np
-from annotated_text import annotation
+import ast
+# from annotated_text import annotation
 import collections
 import ktrain
 import pandas as pd
@@ -11,8 +12,12 @@ import neattext.functions as nfx
 
 
 label_path = ("./data/labels.txt")
+top_skills = ("./data/top_50_hard_skills.csv")
+
 cols = ['cat', 'code']
 label_df = pd.read_csv(label_path, names=cols, header=0)
+skcols = ['cat', 'skills']
+top_skill_df = pd.read_csv(top_skills, names=skcols, header=0)
 
 
 def default_text():
@@ -36,11 +41,11 @@ def load_skill_extractor():
 
     from spacy.matcher import PhraseMatcher
     # init params of skill extractor
-    print('load model')
+    # print('load model')
 
     nlp = spacy.load('en_core_web_lg')
 
-    print('load matcher')
+    # print('load matcher')
     # init skill extractor
     skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher,)
     return skill_extractor
@@ -63,10 +68,14 @@ def clean_text(text):
 
 
 def predict_cat(model, text):
+    # p = int(model.predict(text,return_proba=True).max()*100)
+    # cat = model.predict(text)
 
     logits = model.predict(text,return_proba=True)
     prob = int(logits.max()*100)
     cat= label_df.iloc[logits.argmax()].values[0]
+
+
     return prob,cat
 
 
@@ -84,27 +93,15 @@ def grouper(iterable):
         yield group
 
 
-def get_skill(annotations):
-    try:
-        # annotations = skill_extractor.annotate(text,tresh=0.5)
-        # skill_dict = {"Soft Skill": [], "Hard Skill": []}
-        soft_skill = []
-        hard_skill = []
-
-        for item in annotations['results']['ngram_scored']:
-            skill_id = item['skill_id']
-            skill_type = skill_extractor.skills_db[skill_id]['skill_type']
-            if skill_type == 'Soft Skill' and item['doc_node_value']:
-                soft_skill.append(item['doc_node_value'])
-            if skill_type == 'Hard Skill':
-                hard_skill.append(item['doc_node_value'])
-        # skill_dict['Soft Skill'] =set(soft_skill)
-        sk = " ".join(list(set(soft_skill)))
-        hk = " ".join(list(set(hard_skill)))
-        # st.write(skill_extractor.describe(annotations))
-        return sk+hk
-    except Exception as e:
-        return None
+def get_match(job_cat, cv_skills):
+    skills = top_skill_df[top_skill_df['cat'] == job_cat]['skills']
+    top_skills = set(ast.literal_eval(",".join(skills)))
+    cv_skills = set(cv_skills)
+    matched_skills = top_skills.intersection(cv_skills)
+    m = len(matched_skills)
+    d = len(top_skills)
+    match_p = round((m/10*100), 2)
+    return match_p
 
 
 def install(package):
@@ -112,46 +109,6 @@ def install(package):
 
 
 
-
-
-def create_ann_list(text, results):
-    try:
-        from skillNer.general_params import SKILL_DB
-    except:
-        # install skillner if not done yet
-        os.system('pip install skillner')
-        from skillNer.general_params import SKILL_DB
-
-    type_to_color = {'Hard Skill': "#faa",
-                     'Soft Skill': '#afa', 'Certification': '#ff4'}
-    text_tokens = text.split(' ')
-    annots = {}
-    all_res = results['ngram_scored']+results['full_matches']
-    ids_done = []
-    # create annotations from matches
-    for match in all_res:
-        id_ = match['skill_id']
-        type_ = SKILL_DB[id_]['skill_type']
-        span_str = ' '.join([text_tokens[i] for i in match['doc_node_id']])
-        annot = annotation(span_str, type_, background=type_to_color[type_],
-                           color="#333", margin='2px')
-        annots[match['doc_node_id'][0]] = annot
-        for i in match['doc_node_id']:
-            ids_done.append(i)
-    # create strs for non annotated text
-    non_match_ids = [i for i, _ in enumerate(text_tokens) if i not in ids_done]
-    dict_ = dict(enumerate(grouper(non_match_ids), 1))
-    for v in dict_.values():
-        span = ' '.join([text_tokens[i] for i in v])
-        annots[v[0]] = span
-        # annotation(token,color="#fff", background="transparent",)
-    print(dict_)
-    print('-----')
-    # print(collections.OrderedDict(sorted(annots.items())))
-    annots_ = collections.OrderedDict(sorted(annots.items())).values()
-    return annots_
-
-
 def create_dfs(results):
     try:
         from skillNer.general_params import SKILL_DB
@@ -161,12 +118,13 @@ def create_dfs(results):
         from skillNer.general_params import SKILL_DB
 
     f_matches = results['full_matches']
-    f_arr = []
+    hard_skills = []
    for match in f_matches:
        id_ = match['skill_id']
        full_name = SKILL_DB[id_]['skill_name']
        type_ = SKILL_DB[id_]['skill_type']
-       f_arr.append([id_, full_name, type_])
+       if type_ == 'Hard Skill':
+           hard_skills.append(full_name)
    s_matches = results['ngram_scored']
    s_arr = []
    for match in s_matches:
@@ -174,9 +132,11 @@ def create_dfs(results):
        full_name = SKILL_DB[id_]['skill_name']
        type_ = SKILL_DB[id_]['skill_type']
        score = match['score']
-       s_arr.append([id_, full_name, type_, score])
-    full_df = pd.DataFrame(
-        f_arr, columns=['skill id', 'skill name', 'skill type'])
-    sub_df = pd.DataFrame(
-        s_arr, columns=['skill id', 'skill name', 'skill type', 'score'])
-    return full_df, sub_df
+       if type_ == 'Hard Skill':
+           hard_skills.append(full_name)
+    hard_skills = list(set(hard_skills))
+    # df = pd.DataFrame(
+    #     # f_arr, columns=['skill id', 'skill name', 'skill type'])
+    #     hard_skills, columns=['skill name'])
+
+    return hard_skills
 
 
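For reference, a minimal, self-contained sketch of what the new get_match helper computes. The CSV layout is an assumption inferred from the code: the ast.literal_eval call implies each row's skills column stores a Python list literal. Note that d = len(top_skills) is assigned but never used; the denominator is hardcoded to 10, so the result reads as "matched skills out of ten" expressed as a percentage.

import ast
import pandas as pd

# Toy stand-in for top_skill_df; the real data lives in
# ./data/top_50_hard_skills.csv, whose contents are not part of this diff.
top_skill_df = pd.DataFrame({
    'cat': ['data science'],
    'skills': ["['python', 'sql', 'machine learning', 'statistics', 'spark']"],
})

def get_match(job_cat, cv_skills):
    # Rows for the predicted category; joining the Series collapses it
    # to the stored list literal, which literal_eval parses into a list.
    skills = top_skill_df[top_skill_df['cat'] == job_cat]['skills']
    top_skills = set(ast.literal_eval(",".join(skills)))
    matched_skills = top_skills.intersection(set(cv_skills))
    # Hardcoded denominator of 10, exactly as in the commit.
    return round(len(matched_skills) / 10 * 100, 2)

print(get_match('data science', ['python', 'sql', 'excel']))  # 20.0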
 
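Taken together with the reworked create_dfs, which now returns a de-duplicated list of hard-skill names instead of two DataFrames, the commit forms a small end-to-end flow. A hedged wiring sketch follows: model, resume_text, and the display line are illustrative app-side names not shown in this diff, and annotations['results'] follows the shape the existing code already indexes.

# Illustrative wiring only; model and resume_text are assumed to come
# from the surrounding Streamlit app.
skill_extractor = load_skill_extractor()
annotations = skill_extractor.annotate(resume_text)

# create_dfs now returns hard-skill names (see the last two hunks above).
hard_skills = create_dfs(annotations['results'])

prob, cat = predict_cat(model, clean_text(resume_text))

# Assumes the categories in labels.txt line up with the 'cat' column of
# top_50_hard_skills.csv.
match_p = get_match(cat, hard_skills)
st.write(f"Predicted category: {cat} ({prob}%), skill match: {match_p}%")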