# NOTE(review): removed stray extraction artifacts ("Spaces:" / "Running") that were not part of the source.
from flashtext import KeywordProcessor
import json
import nltk
from nltk.tokenize import word_tokenize, LineTokenizer
from utils import get_average_words_per_line, get_average_line_len
import wordninja
nltk.download('punkt')
class ResumeSegmenter():
    """Splits a resume (given as a list of text lines) into labeled sections.

    Section headers are matched against keyword lists loaded from
    ``models/prototype/sections.json`` via a flashtext ``KeywordProcessor``.
    """

    def __init__(self):
        # NOTE(review): per-resume state lives on the instance, so a fresh
        # instance (or an explicit reset) is required for each resume — see
        # the original author's comment; get_parsed_sections does not reset.
        self.resume_segments = {
            'objective': [],
            'work_and_employment': [],
            'education_and_training': [],
            'skills': [],
            'accomplishments': [],
            'misc': []
        }
        # Line indices of every detected section header, in document order.
        self.resume_indices = []
        # 'with' closes the file on exit; the explicit f.close() was redundant.
        with open(r"models/prototype/sections.json") as f:
            data = json.load(f)
        self.section_headers = data["section_headers"]
        self.keyword_processor = KeywordProcessor()
        self.keyword_processor.add_keywords_from_dict(keyword_dict=self.section_headers)

    def find_segment_indices(self, text_list):
        """Scan ``text_list`` and record header lines.

        Appends each header's line index to ``self.resume_indices`` and to the
        matching section bucket in ``self.resume_segments``. May normalize a
        line in-place in ``text_list`` (re-joined NLTK tokenization) when that
        normalization is what makes the keyword match succeed.
        """
        average_words_per_line = get_average_words_per_line(text_list)
        average_sentence_length = get_average_line_len(text_list)
        for i, line in enumerate(text_list):
            # Guard: an empty line would raise IndexError on line[0]/line[-1].
            if not line:
                continue
            # Heuristic: headers neither start lowercase nor end with a period.
            if line[0].islower() or line[-1] == '.':
                continue
            kys = self.keyword_processor.extract_keywords(line)
            # Retry with token-normalized text (e.g. separates punctuation),
            # and keep the normalized form if it produces a match.
            if self.keyword_processor.extract_keywords(' '.join(word_tokenize(line))) != []:
                text_list[i] = line = ' '.join(word_tokenize(line))
                kys = self.keyword_processor.extract_keywords(line)
            if len(kys) > 0:
                # Long lines are likely body text that merely mentions a
                # header keyword, not an actual header — skip them.
                if len(word_tokenize(line)) > average_words_per_line * 0.75 and len(line) > average_sentence_length:
                    continue
                self.resume_indices.append(i)
                self.resume_segments[kys[0]].append(i)

    def slice_segments(self, lines):
        """Return {section_name: (start_line, end_line)} computed from the
        header indices found by find_segment_indices, or None when no headers
        were detected. Lines before the first header become "basics_info".
        """
        sections = {}
        if len(self.resume_indices) == 0:
            return None
        for section, points in self.resume_segments.items():
            if len(points) == 0: continue
            start_point = points[0]
            tmp_end_point = points[-1]
            # The section ends where the next detected header begins.
            end_point = self.resume_indices[min(self.resume_indices.index(tmp_end_point)+1,
                                            len(self.resume_indices)-1)]
            # Fix: if this section's LAST header is the final header in the
            # document, the section runs to the end of the resume. The
            # original compared start_point, truncating multi-header sections
            # whose last header closed the document.
            if tmp_end_point == self.resume_indices[-1]:
                end_point = len(lines)
            sections[section] = (start_point, end_point)
        sections["basics_info"] = (0, self.resume_indices[0])
        return sections

    def get_interval_intersection(self, sections, interval):
        """Return ([start, end], section) for the first section whose interval
        overlaps ``interval``, or None when no section overlaps.

        Fix: the original returned None as soon as the FIRST section failed to
        overlap, never examining the rest; now the scan continues.
        """
        for section in sections:
            s = section[1]
            if s[0] >= interval[1] or interval[0] >= s[1]:
                continue
            start = max(s[0], interval[0])
            end = min(s[1], interval[1])
            return [start, end], section
        return None

    def segment(self, resume_lines):
        """Detect headers in ``resume_lines`` and return the section map from
        slice_segments (None when no headers are found)."""
        self.find_segment_indices(resume_lines)
        sections = self.slice_segments(resume_lines)
        if sections is None:
            return None
        # TODO: overlapping intervals still need cleaning (zero-shot
        # classifier + interval subtraction) — see get_interval_intersection.
        return sections

    def get_parsed_sections(self, resume_lines):
        """Return (text_segments, sections): the lines of each section keyed by
        header title, plus the raw (start, end) index map. Both are None when
        no sections were detected."""
        text_segments = {}
        sections = self.segment(resume_lines)
        if sections is None:
            return None, None
        for header_title, section in sections.items():
            lines = resume_lines[section[0]:section[1]]
            text_segments[header_title] = lines
        return text_segments, sections