Upload 8 files
- app.py +16 -0
- models.py +49 -0
- output_model.py +32 -0
- parser.py +125 -0
- reader.py +25 -0
- requirements.txt +483 -0
- sections.json +127 -0
- segmenter.py +105 -0
app.py
ADDED
@@ -0,0 +1,16 @@
import gradio
from main import Main

main = Main()

def parse(cv):
    return main.parse(cv.name)

description = "This is a demo of the resume parser. \
Upload a resume and it will return a JSON object with detailed parsed resume data."
article = "Demo of detailed resume parser"
file_input = gradio.inputs.File(file_count="single", type="file", label="Upload your pdf resume (en)")
iface = gradio.Interface(fn=parse, inputs=file_input, outputs="json",
                         title="Detailed Resume Parser", description=description, article=article)
iface.launch()
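Note: app.py imports Main from a main module that is not part of this upload. A minimal sketch of what it presumably contains, assuming it only wires ResumeReader and ResumeParser together (the class name and parse signature come from the import and call site above; everything else is an assumption):

# main.py -- hypothetical glue module, not included in this commit
from reader import ResumeReader
from parser import ResumeParser

class Main:
    def __init__(self):
        self.reader = ResumeReader()
        self.parser = ResumeParser()

    def parse(self, pdf_path):
        # read_pdf returns the cleaned, non-empty lines of the resume
        resume_lines = self.reader.read_pdf(pdf_path)
        return self.parser.parse(resume_lines)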
models.py
ADDED
@@ -0,0 +1,49 @@
import torch
import sentencepiece  # needed by the slow (sentencepiece-based) tokenizer below
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain import PromptTemplate, LLMChain, HuggingFacePipeline
import ast


class Models():
    def __init__(self) -> None:
        self.template = """
A virtual assistant answers questions from a user based on the provided text.
USER: Text: {input_text}
ASSISTANT: I've read this text.
USER: What describes {entity_type} in the text?
ASSISTANT:
"""
        self.load_trained_models()

    def load_trained_models(self):
        # is it best to keep this in memory? why not pickle?
        checkpoint = "Universal-NER/UniNER-7B-all"

        ner_model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", torch_dtype=torch.float16, offload_folder="offload", offload_state_dict=True)
        tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False, padding="max_length")
        # don't shadow the imported transformers.pipeline factory
        ner_pipeline = pipeline(
            "text-generation",  # task
            model=ner_model,
            max_length=1000,
            tokenizer=tokenizer,
            trust_remote_code=True,
            do_sample=True,
            top_k=10,
            num_return_sequences=1
        )

        self.llm = HuggingFacePipeline(pipeline=ner_pipeline, model_kwargs={'temperature': 0})
        self.prompt = PromptTemplate(template=self.template, input_variables=["input_text", "entity_type"])
        self.llm_chain = LLMChain(prompt=self.prompt, llm=self.llm)

    def extract_ner(self, context, entity_type):
        # UniNER answers with a Python-style list literal, e.g. '["Acme Corp"]'
        return ast.literal_eval(self.llm_chain.run({"input_text": context, "entity_type": entity_type}))

    def get_ner(self, clean_lines, entity):
        tokens = []
        try_num = 0
        # sampling is enabled, so retrying the same query can yield new output
        while try_num < 5 and tokens == []:
            try_num += 1
            tokens = self.extract_ner(' '.join(clean_lines), entity)
        if len(tokens) == 0:
            raise ValueError(f"Couldn't extract {entity}")
        return tokens
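extract_ner works because UniNER completes the conversation template with a Python-style list literal, which ast.literal_eval converts into a list of strings. A small illustration of that round trip (the completion string is a made-up example, not real model output):

import ast

# hypothetical completion for the "company" entity query
completion = '["Acme Corp", "Globex"]'
companies = ast.literal_eval(completion)
assert companies == ["Acme Corp", "Globex"]
# a completion that isn't a valid list literal makes ast.literal_eval
# raise ValueError/SyntaxError instead of returning junk strings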
output_model.py
ADDED
@@ -0,0 +1,32 @@
from __future__ import annotations

from typing import List, Optional

from pydantic import BaseModel


class Work_experience(BaseModel):
    position: List[str]
    company: List[str]
    start_date: Optional[str] = ""
    end_date: Optional[str] = ""
    description: Optional[str] = ""
    location: Optional[List[str]] = []


class Education(BaseModel):
    degree: str = ""
    major: List[str] = []
    university: List[str] = []
    start_date: Optional[str] = ""
    end_date: Optional[str] = ""
    location: Optional[List[str]] = []


class Basic_info(BaseModel):
    name: str
    email: Optional[str] = ""
    phone: Optional[str] = ""


class ModelOutput(BaseModel):
    basic_info: Basic_info
    education: Optional[List[Education]] = None
    work_experience: Optional[List[Work_experience]] = None
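These pydantic models are not wired into the parser yet (the corresponding import in parser.py is commented out), but they document the intended shape of parse()'s output. A hedged example of validating a parsed dict against them (the field values are illustrative):

from output_model import ModelOutput

parsed = {
    "basic_info": {"name": "Jane Doe", "email": "jane@example.com"},
    "education": [{"degree": "BSc", "university": ["Example University"]}],
    "work_experience": [{"position": ["Engineer"], "company": ["Acme Corp"]}],
}
validated = ModelOutput(**parsed)  # raises ValidationError on a bad shape
print(validated.basic_info.name)   # Jane Doe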
parser.py
ADDED
@@ -0,0 +1,125 @@
from itertools import chain
from models.prototype.models import Models
#from output_model import OutputModel, WorkExperience
from models.prototype.segmenter import ResumeSegmenter
from flashtext import KeywordProcessor
from collections import defaultdict


class ResumeParser():
    def __init__(self) -> None:
        self.resumeSegmenter = ResumeSegmenter()
        self.models = Models()

    def get_date_index(self, clean_resume_lines, date):
        indexes = [i for i, line in enumerate(clean_resume_lines) if date in line]
        return indexes

    # better suited to a utils file
    def sort_tokens_table(self, tokens_data):
        # invert [(entity, [tokens])] into {token: entity}
        table = {}
        for key, tokens in tokens_data:
            for token in tokens:
                table[token] = key
        return table

    def split_work_exp(self, resume_lines, start_index, end_index, work_dates):
        dates_indexes = [self.get_date_index(resume_lines[start_index:end_index], work_date) for work_date in work_dates]
        dates_indexes = list(chain.from_iterable(dates_indexes))
        dates_indexes = [i + start_index for i in dates_indexes]
        # this list should be unique and ordered
        dates_indexes = sorted(set([start_index + 1] + dates_indexes + [end_index]))

        list_single_work_exp = []
        for i in range(len(dates_indexes) - 1):
            index = dates_indexes[i]
            next_index = dates_indexes[i + 1]
            section = resume_lines[index:next_index]
            if len(section) == 0:
                continue
            list_single_work_exp.append(section)
        return list_single_work_exp

    def extract_section_text(self, resume_lines, section_header="work_and_employment"):
        text_segments, sections = self.resumeSegmenter.get_parsed_sections(resume_lines)
        start_index = sections[section_header][0]
        end_index = sections[section_header][1]
        # on the basis that dates would be unique
        return start_index, end_index

    def format_output(self, keywords, work_section_list, isWorkExp=True):
        if isWorkExp:
            headlines = [text[0] for text in work_section_list]
        else:
            headlines = work_section_list
        table = self.sort_tokens_table(keywords)
        tokens_processor = KeywordProcessor()
        list_keywords = list(chain.from_iterable([tokens[1] for tokens in keywords]))
        tokens_processor.add_keywords_from_list(list_keywords)
        data = []
        for i, header in enumerate(headlines):
            current_data = defaultdict(list)
            tokens = tokens_processor.extract_keywords(header)
            for token in tokens:
                current_data[table[token]].append(token)
            if isWorkExp:
                current_data["description"] = work_section_list[i][1:]
            data.append(dict(current_data))
        return data

    def parse_work_history(self, resume_lines):
        start_index, end_index = self.extract_section_text(resume_lines)
        work_dates = self.models.get_ner(resume_lines[start_index:end_index], "date")
        single_work_experiences = self.split_work_exp(resume_lines, start_index, end_index, work_dates)
        job_positions = self.models.get_ner(resume_lines[start_index:end_index], "job title")
        companies = self.models.get_ner(resume_lines[start_index:end_index], "company")
        keywords = [("date", work_dates), ("title", job_positions), ("company", companies)]
        return self.format_output(keywords, single_work_experiences)

    def parse_education(self, resume_lines):
        start_index, end_index = self.extract_section_text(resume_lines, "education_and_training")
        tokens = ["degree", "university", "degree field", "date", "location"]
        # collect an (entity, matches) pair per entity type instead of
        # overwriting keywords on every iteration
        keywords = [(token, self.models.get_ner(resume_lines[start_index + 1:end_index], token)) for token in tokens]
        output = self.format_output(keywords, resume_lines[start_index:end_index], False)
        output = [res for res in output if res]
        return output

    def parse_basic_info(self, resume_lines):
        start_index, end_index = self.extract_section_text(resume_lines, "basics_info")
        #tokens = ["person", "email", "phone"]
        tokens = ["person"]
        keywords = [(token, self.models.get_ner(resume_lines[start_index:end_index], token)) for token in tokens]
        output = {}
        for token, result in keywords:
            if len(result) > 0:
                output[token] = result[0]
        return output

    def parse(self, resume_lines):
        jobs = self.parse_work_history(resume_lines)
        education = self.parse_education(resume_lines)
        basic_info = self.parse_basic_info(resume_lines)

        return {"basic_info": basic_info, "education": education, "work_experience": jobs}
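End to end, ResumeParser consumes the line list produced by ResumeReader and returns the dict that app.py serves as JSON. A usage sketch (the sample path is hypothetical, and constructing ResumeParser loads the 7B NER model, so in practice this needs a GPU):

from models.prototype.parser import ResumeParser
from reader import ResumeReader

reader = ResumeReader()
parser = ResumeParser()

resume_lines = reader.read_pdf("samples/resume.pdf")  # hypothetical path
result = parser.parse(resume_lines)
# {"basic_info": {...}, "education": [...], "work_experience": [...]}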
reader.py
ADDED
@@ -0,0 +1,25 @@
import pypdfium2 as pdfium
import re


class ResumeReader:

    def clean_text(self, raw_text):
        clean_text = re.sub(r'\n+', '\n', raw_text)
        clean_text = clean_text.replace("\r", "\n")
        clean_text = clean_text.replace("\t", " ")
        clean_text = re.sub(r"\uf0b7", " ", clean_text)  # private-use bullet glyph
        clean_text = re.sub(r'[^\x00-\x7F]+', '', clean_text)  # remove non-ascii
        clean_text = re.sub(r"\(cid:\d{0,3}\)", " ", clean_text)  # PDF font artifacts
        clean_text = re.sub(r'• ', " ", clean_text)
        return clean_text

    def read_pdf(self, path_file):
        raw_text = ""
        pdf = pdfium.PdfDocument(path_file)
        for page in pdf:
            raw_text += page.get_textpage().get_text_range()
        clean_text = self.clean_text(raw_text)
        resume_lines = clean_text.splitlines(True)
        # collapse runs of whitespace and drop empty lines
        resume_lines = [re.sub(r'\s+', ' ', line.strip()) for line in resume_lines if line.strip()]
        return resume_lines
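read_pdf returns one cleaned string per non-empty line of the PDF, which is the unit every downstream component (segmenter, parser) operates on. Illustrative output:

reader = ResumeReader()
lines = reader.read_pdf("resume.pdf")  # hypothetical file
# e.g. ["Jane Doe", "jane@example.com", "Work Experience",
#       "Software Engineer, Acme Corp Jan 2020 - Present", ...]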
requirements.txt
ADDED
@@ -0,0 +1,483 @@
absl-py==1.4.0
accelerate==0.24.1
aiohttp==3.8.6
aiosignal==1.3.1
alabaster==0.7.13
albumentations==1.3.1
altair==4.2.2
anyio==3.7.1
appdirs==1.4.4
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
array-record==0.5.0
arviz==0.15.1
astropy==5.3.4
astunparse==1.6.3
async-timeout==4.0.3
atpublic==4.0
attrs==23.1.0
audioread==3.0.1
autograd==1.6.2
Babel==2.13.1
backcall==0.2.0
beautifulsoup4==4.11.2
bidict==0.22.1
bigframes==0.13.0
bleach==6.1.0
blinker==1.4
blis==0.7.11
blosc2==2.0.0
bokeh==3.3.0
bqplot==0.12.42
branca==0.7.0
build==1.0.3
CacheControl==0.13.1
cachetools==5.3.2
catalogue==2.0.10
certifi==2023.7.22
cffi==1.16.0
chardet==5.2.0
charset-normalizer==3.3.2
chex==0.1.7
click==8.1.7
click-plugins==1.1.1
cligj==0.7.2
cloudpickle==2.2.1
cmake==3.27.7
cmdstanpy==1.2.0
colorcet==3.0.1
colorlover==0.3.0
colour==0.1.5
community==1.0.0b1
confection==0.1.3
cons==0.4.6
contextlib2==21.6.0
contourpy==1.2.0
cryptography==41.0.5
cufflinks==0.17.3
cupy-cuda11x==11.0.0
cvxopt==1.3.2
cvxpy==1.3.2
cycler==0.12.1
cymem==2.0.8
Cython==3.0.5
dask==2023.8.1
dataclasses-json==0.6.2
datascience==0.17.6
db-dtypes==1.1.1
dbus-python==1.2.18
debugpy==1.6.6
decorator==4.4.2
defusedxml==0.7.1
diskcache==5.6.3
distributed==2023.8.1
distro==1.7.0
dlib==19.24.2
dm-tree==0.1.8
docutils==0.18.1
dopamine-rl==4.0.6
duckdb==0.9.1
earthengine-api==0.1.377
easydict==1.11
ecos==2.0.12
editdistance==0.6.2
eerepr==0.0.4
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl#sha256=83276fc78a70045627144786b52e1f2728ad5e29e5e43916ec37ea9c26a11212
entrypoints==0.4
et-xmlfile==1.1.0
etils==1.5.2
etuples==0.3.9
exceptiongroup==1.1.3
fastai==2.7.13
fastcore==1.5.29
fastdownload==0.0.7
fastjsonschema==2.18.1
fastprogress==1.0.3
fastrlock==0.8.2
filelock==3.13.1
fiona==1.9.5
firebase-admin==5.3.0
flashtext==2.7
Flask==2.2.5
flatbuffers==23.5.26
flax==0.7.5
folium==0.14.0
fonttools==4.44.0
frozendict==2.3.8
frozenlist==1.4.0
fsspec==2023.6.0
future==0.18.3
gast==0.5.4
gcsfs==2023.6.0
GDAL==3.4.3
gdown==4.6.6
geemap==0.28.2
gensim==4.3.2
geocoder==1.38.1
geographiclib==2.0
geopandas==0.13.2
geopy==2.3.0
gin-config==0.5.0
glob2==0.7
google==2.0.3
google-api-core==2.11.1
google-api-python-client==2.84.0
google-auth==2.17.3
google-auth-httplib2==0.1.1
google-auth-oauthlib==1.0.0
google-cloud-bigquery==3.12.0
google-cloud-bigquery-connection==1.12.1
google-cloud-bigquery-storage==2.22.0
google-cloud-core==2.3.3
google-cloud-datastore==2.15.2
google-cloud-firestore==2.11.1
google-cloud-functions==1.13.3
google-cloud-iam==2.12.2
google-cloud-language==2.9.1
google-cloud-resource-manager==1.10.4
google-cloud-storage==2.8.0
google-cloud-translate==3.11.3
google-colab @ file:///colabtools/dist/google-colab-1.0.0.tar.gz#sha256=a7913e00463ccd8df75a61e36d8582af57905f6b05b88aa768c70a0d631990ef
google-crc32c==1.5.0
google-pasta==0.2.0
google-resumable-media==2.6.0
googleapis-common-protos==1.61.0
googledrivedownloader==0.4
graphviz==0.20.1
greenlet==3.0.1
grpc-google-iam-v1==0.12.6
grpcio==1.59.2
grpcio-status==1.48.2
gspread==3.4.2
gspread-dataframe==3.3.1
gym==0.25.2
gym-notices==0.0.8
h5netcdf==1.3.0
h5py==3.9.0
holidays==0.36
holoviews==1.17.1
html5lib==1.1
httpimport==1.3.1
httplib2==0.22.0
huggingface-hub==0.17.3
humanize==4.7.0
hyperopt==0.2.7
ibis-framework==6.2.0
idna==3.4
imageio==2.31.6
imageio-ffmpeg==0.4.9
imagesize==1.4.1
imbalanced-learn==0.10.1
imgaug==0.4.0
importlib-metadata==6.8.0
importlib-resources==6.1.1
imutils==0.5.4
inflect==7.0.0
iniconfig==2.0.0
install==1.3.5
intel-openmp==2023.2.0
ipyevents==2.0.2
ipyfilechooser==0.6.0
ipykernel==5.5.6
ipyleaflet==0.17.4
ipython==7.34.0
ipython-genutils==0.2.0
ipython-sql==0.5.0
ipytree==0.2.2
ipywidgets==7.7.1
itsdangerous==2.1.2
jax==0.4.20
jaxlib @ https://storage.googleapis.com/jax-releases/cuda11/jaxlib-0.4.20+cuda11.cudnn86-cp310-cp310-manylinux2014_x86_64.whl#sha256=01be66238133f884bf5adf15cd7eaaf8445f9d4b056c5c64df28a997a6aff2fe
jeepney==0.7.1
jieba==0.42.1
Jinja2==3.1.2
joblib==1.3.2
jsonpatch==1.33
jsonpickle==3.0.2
jsonpointer==2.4
jsonschema==4.19.2
jsonschema-specifications==2023.7.1
jupyter-client==6.1.12
jupyter-console==6.1.0
jupyter-server==1.24.0
jupyter_core==5.5.0
jupyterlab-pygments==0.2.2
jupyterlab-widgets==3.0.9
kaggle==1.5.16
keras==2.14.0
keyring==23.5.0
kiwisolver==1.4.5
langchain==0.0.334
langcodes==3.3.0
langsmith==0.0.63
launchpadlib==1.10.16
lazr.restfulclient==0.14.4
lazr.uri==1.0.6
lazy_loader==0.3
libclang==16.0.6
librosa==0.10.1
lida==0.0.10
lightgbm==4.1.0
linkify-it-py==2.0.2
llmx==0.0.15a0
llvmlite==0.41.1
locket==1.0.0
logical-unification==0.4.6
lxml==4.9.3
malloy==2023.1064
Markdown==3.5.1
markdown-it-py==3.0.0
MarkupSafe==2.1.3
marshmallow==3.20.1
matplotlib==3.7.1
matplotlib-inline==0.1.6
matplotlib-venn==0.11.9
mdit-py-plugins==0.4.0
mdurl==0.1.2
miniKanren==1.0.3
missingno==0.5.2
mistune==0.8.4
mizani==0.9.3
mkl==2023.2.0
ml-dtypes==0.2.0
mlxtend==0.22.0
more-itertools==10.1.0
moviepy==1.0.3
mpmath==1.3.0
msgpack==1.0.7
multidict==6.0.4
multipledispatch==1.0.0
multitasking==0.0.11
murmurhash==1.0.10
music21==9.1.0
mypy-extensions==1.0.0
natsort==8.4.0
nbclassic==1.0.0
nbclient==0.9.0
nbconvert==6.5.4
nbformat==5.9.2
nest-asyncio==1.5.8
networkx==3.2.1
nibabel==4.0.2
nltk==3.8.1
notebook==6.5.5
notebook_shim==0.2.3
numba==0.58.1
numexpr==2.8.7
numpy==1.23.5
oauth2client==4.1.3
oauthlib==3.2.2
opencv-contrib-python==4.8.0.76
opencv-python==4.8.0.76
opencv-python-headless==4.8.1.78
openpyxl==3.1.2
opt-einsum==3.3.0
optax==0.1.7
orbax-checkpoint==0.4.2
osqp==0.6.2.post8
packaging==23.2
pandas==1.5.3
pandas-datareader==0.10.0
pandas-gbq==0.17.9
pandas-stubs==1.5.3.230304
pandocfilters==1.5.0
panel==1.3.1
param==2.0.0
parso==0.8.3
parsy==2.1
partd==1.4.1
pathlib==1.0.1
pathy==0.10.3
patsy==0.5.3
peewee==3.17.0
pexpect==4.8.0
pickleshare==0.7.5
Pillow==9.4.0
pip-tools==6.13.0
platformdirs==3.11.0
plotly==5.15.0
plotnine==0.12.4
pluggy==1.3.0
polars==0.17.3
pooch==1.8.0
portpicker==1.5.2
prefetch-generator==1.0.3
preshed==3.0.9
prettytable==3.9.0
proglog==0.1.10
progressbar2==4.2.0
prometheus-client==0.18.0
promise==2.3
prompt-toolkit==3.0.39
prophet==1.1.5
proto-plus==1.22.3
protobuf==3.20.3
psutil==5.9.5
psycopg2==2.9.9
ptyprocess==0.7.0
py-cpuinfo==9.0.0
py4j==0.10.9.7
pyarrow==9.0.0
pyasn1==0.5.0
pyasn1-modules==0.3.0
pycocotools==2.0.7
pycparser==2.21
pyct==0.5.0
pydantic==1.10.13
pydata-google-auth==1.8.2
pydot==1.4.2
pydot-ng==2.0.0
pydotplus==2.0.2
PyDrive==1.3.1
PyDrive2==1.6.3
pyerfa==2.0.1.1
pygame==2.5.2
Pygments==2.16.1
PyGObject==3.42.1
PyJWT==2.3.0
pymc==5.7.2
pymystem3==0.2.0
PyOpenGL==3.1.7
pyOpenSSL==23.3.0
pyparsing==3.1.1
pypdfium2==4.24.0
pyperclip==1.8.2
pyproj==3.6.1
pyproject_hooks==1.0.0
pyshp==2.3.1
PySocks==1.7.1
pytensor==2.14.2
pytest==7.4.3
python-apt==0.0.0
python-box==7.1.1
python-dateutil==2.8.2
python-louvain==0.16
python-slugify==8.0.1
python-utils==3.8.1
pytz==2023.3.post1
pyviz_comms==3.0.0
PyWavelets==1.4.1
PyYAML==6.0.1
pyzmq==23.2.1
qdldl==0.1.7.post0
qudida==0.0.4
ratelim==0.1.6
referencing==0.30.2
regex==2023.6.3
requests==2.31.0
requests-oauthlib==1.3.1
requirements-parser==0.5.0
rich==13.6.0
rpds-py==0.12.0
rpy2==3.4.2
rsa==4.9
safetensors==0.4.0
scikit-image==0.19.3
scikit-learn==1.2.2
scipy==1.11.3
scooby==0.9.2
scs==3.2.4
seaborn==0.12.2
SecretStorage==3.3.1
Send2Trash==1.8.2
sentencepiece==0.1.99
shapely==2.0.2
six==1.16.0
sklearn-pandas==2.2.0
smart-open==6.4.0
sniffio==1.3.0
snowballstemmer==2.2.0
sortedcontainers==2.4.0
soundfile==0.12.1
soupsieve==2.5
soxr==0.3.7
spacy==3.6.1
spacy-legacy==3.0.12
spacy-loggers==1.0.5
Sphinx==5.0.2
sphinxcontrib-applehelp==1.0.7
sphinxcontrib-devhelp==1.0.5
sphinxcontrib-htmlhelp==2.0.4
sphinxcontrib-jsmath==1.0.1
sphinxcontrib-qthelp==1.0.6
sphinxcontrib-serializinghtml==1.1.9
SQLAlchemy==2.0.23
sqlglot==17.16.2
sqlparse==0.4.4
srsly==2.4.8
stanio==0.3.0
statsmodels==0.14.0
sympy==1.12
tables==3.8.0
tabulate==0.9.0
tbb==2021.10.0
tblib==3.0.0
tenacity==8.2.3
tensorboard==2.14.1
tensorboard-data-server==0.7.2
tensorflow==2.14.0
tensorflow-datasets==4.9.3
tensorflow-estimator==2.14.0
tensorflow-gcs-config==2.14.0
tensorflow-hub==0.15.0
tensorflow-io-gcs-filesystem==0.34.0
tensorflow-metadata==1.14.0
tensorflow-probability==0.22.0
tensorstore==0.1.45
termcolor==2.3.0
terminado==0.17.1
text-unidecode==1.3
textblob==0.17.1
tf-slim==1.1.0
thinc==8.1.12
threadpoolctl==3.2.0
tifffile==2023.9.26
tinycss2==1.2.1
tokenizers==0.14.1
toml==0.10.2
tomli==2.0.1
toolz==0.12.0
torch @ https://download.pytorch.org/whl/cu118/torch-2.1.0%2Bcu118-cp310-cp310-linux_x86_64.whl#sha256=a81b554184492005543ddc32e96469f9369d778dedd195d73bda9bed407d6589
torchaudio @ https://download.pytorch.org/whl/cu118/torchaudio-2.1.0%2Bcu118-cp310-cp310-linux_x86_64.whl#sha256=cdfd0a129406155eee595f408cafbb92589652da4090d1d2040f5453d4cae71f
torchdata==0.7.0
torchsummary==1.5.1
torchtext==0.16.0
torchvision @ https://download.pytorch.org/whl/cu118/torchvision-0.16.0%2Bcu118-cp310-cp310-linux_x86_64.whl#sha256=033712f65d45afe806676c4129dfe601ad1321d9e092df62b15847c02d4061dc
tornado==6.3.2
tqdm==4.66.1
traitlets==5.7.1
traittypes==0.2.1
transformers==4.35.0
triton==2.1.0
tweepy==4.14.0
typer==0.9.0
types-pytz==2023.3.1.1
types-setuptools==68.2.0.1
typing-inspect==0.9.0
typing_extensions==4.5.0
tzlocal==5.2
uc-micro-py==1.0.2
uritemplate==4.1.1
urllib3==2.0.7
vega-datasets==0.9.0
wadllib==1.3.6
wasabi==1.1.2
wcwidth==0.2.9
webcolors==1.13
webencodings==0.5.1
websocket-client==1.6.4
Werkzeug==3.0.1
widgetsnbextension==3.6.6
wordcloud==1.9.2
wrapt==1.14.1
xarray==2023.7.0
xarray-einstats==0.6.0
xgboost==2.0.1
xlrd==2.0.1
xxhash==3.4.1
xyzservices==2023.10.1
yarl==1.9.2
yellowbrick==1.5
yfinance==0.2.31
zict==3.0.0
zipp==3.17.0
sections.json
ADDED
@@ -0,0 +1,127 @@
{
  "section_headers": {
    "objective": [
      "career goal",
      "objective",
      "career objective",
      "employment objective",
      "professional objective",
      "summary",
      "summary of qualifications"
    ],
    "work_and_employment": [
      "employment history",
      "employment data",
      "career summary",
      "work history",
      "working history",
      "work experience",
      "experience",
      "professional experience",
      "professional background",
      "professional employment",
      "additional experience",
      "career related experience",
      "professional employment history",
      "related experience",
      "relevant experience",
      "programming experience",
      "freelance",
      "freelance experience",
      "army experience",
      "military experience",
      "military background"
    ],
    "education_and_training": [
      "academic background",
      "academic experience",
      "programs",
      "courses",
      "related courses",
      "education",
      "educational background",
      "educational qualifications",
      "educational training",
      "education and training",
      "training",
      "academic training",
      "Academic Qualification",
      "professional training",
      "course project experience",
      "related course projects",
      "internship experience",
      "internships",
      "apprenticeships",
      "college activities",
      "certifications",
      "special training"
    ],
    "skills": [
      "credentials",
      "qualifications",
      "areas of experience",
      "areas of expertise",
      "areas of knowledge",
      "skills",
      "Skills",
      "other skills",
      "other abilities",
      "career related skills",
      "professional skills",
      "specialized skills",
      "technical skills",
      "computer skills",
      "personal skills",
      "computer knowledge",
      "technologies",
      "technical experience",
      "proficiencies",
      "languages",
      "language competencies and skills",
      "programming languages",
      "competencies"
    ],
    "misc": [
      "activities and honors",
      "activities",
      "affiliations",
      "professional affiliations",
      "associations",
      "professional associations",
      "memberships",
      "professional memberships",
      "athletic involvement",
      "community involvement",
      "refere",
      "civic activities",
      "extra-Curricular activities",
      "professional activities",
      "volunteer work",
      "volunteer experience",
      "additional information",
      "interests"
    ],
    "accomplishments": [
      "awards",
      "achievement",
      "awards and achievements",
      "licenses",
      "presentations",
      "conference presentations",
      "conventions",
      "dissertations",
      "exhibits",
      "papers",
      "publications",
      "professional publications",
      "research experience",
      "research grants",
      "projects",
      "research projects",
      "personal projects",
      "current research interests",
      "thesis",
      "theses"
    ]
  }
}
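The segmenter matches these phrases with flashtext, which is case-insensitive by default, and maps each hit back to its JSON key, which becomes the segment label. A quick illustration of how the file is consumed:

import json
from flashtext import KeywordProcessor

with open("sections.json") as f:
    section_headers = json.load(f)["section_headers"]

keyword_processor = KeywordProcessor()  # case-insensitive by default
keyword_processor.add_keywords_from_dict(section_headers)
print(keyword_processor.extract_keywords("Professional Experience"))
# ['work_and_employment']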
segmenter.py
ADDED
@@ -0,0 +1,105 @@
from flashtext import KeywordProcessor
import json


class ResumeSegmenter():
    def __init__(self):
        self.resume_segments = {
            'objective': [],
            'work_and_employment': [],
            'education_and_training': [],
            'skills': [],
            'accomplishments': [],
            'misc': []
        }
        self.resume_indices = []

    def get_average_line_len(self, lines):
        total = 0
        for line in lines:
            total += len(line)
        return total / len(lines)

    def get_average_words_per_line(self, lines):
        total = 0
        for line in lines:
            # other stopwords too?
            total += len(line.split(' '))
        return total / len(lines)

    def find_segment_indices(self, text_list):
        with open(r"./sections.json") as f:
            data = json.load(f)
            section_headers = data["section_headers"]
        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_dict(keyword_dict=section_headers)
        average_words_per_line = self.get_average_words_per_line(text_list)

        for i, line in enumerate(text_list):
            # header lines don't start lowercase and don't end with a period
            if line[0].islower() or line[-1] == '.':
                continue
            kys = keyword_processor.extract_keywords(line)
            if len(kys) > 0:
                # other stopwords? from where? nltk lib? pos tagger?
                # lines much longer than average are body text, not headers
                if len(line.split(" ")) > average_words_per_line * 0.75:
                    continue
                # is it necessary to keep the actual raw keyword?
                self.resume_indices.append(i)
                self.resume_segments[kys[0]].append(i)

    def slice_segments(self, lines):
        sections = {}
        if len(self.resume_indices) == 0:
            return None

        for section, points in self.resume_segments.items():
            if len(points) == 0:
                continue
            start_point = points[0]
            tmp_end_point = points[-1]
            end_point = self.resume_indices[min(self.resume_indices.index(tmp_end_point) + 1,
                                                len(self.resume_indices) - 1)]
            if start_point == self.resume_indices[-1]:
                end_point = len(lines)
            sections[section] = (start_point, end_point)
        sections["basics_info"] = (0, self.resume_indices[0])
        return sections

    def get_interval_intersection(self, sections, interval):
        for section in sections:
            s = section[1]
            if s[0] >= interval[1] or interval[0] >= s[1]:
                # no overlap with this section, keep looking
                continue
            start = max(s[0], interval[0])
            end = min(s[1], interval[1])
            return [start, end], section
        return None

    def segment(self, resume_lines):
        self.find_segment_indices(resume_lines)
        sections = self.slice_segments(resume_lines)
        # what's the naming convention here, sections_list or list_sections?
        sections_list = [(k, v) for k, v in sections.items() if len(v) > 0]
        intersection_intervals = []

        for i, s in enumerate(sections_list[:-1]):
            result = self.get_interval_intersection(sections_list[i+1:], s[1])
            if result is None:
                continue
            a, b = result
            print(a, b, s[0])
            intersection_intervals.append((a, b, s[0]))

        if len(intersection_intervals) > 0:
            print("there are intersections", intersection_intervals)
            # needs a last cleaning pass for overlapping intervals: a zero-shot
            # classifier + interval subtraction
        return sections

    def get_parsed_sections(self, resume_lines):
        text_segments = {}
        sections = self.segment(resume_lines)
        for header_title, section in sections.items():
            lines = resume_lines[section[0]:section[1]]
            text_segments[header_title] = lines

        return text_segments, sections
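segment() returns a dict mapping each detected header to a (start, end) interval over the resume's line indices, with basics_info covering everything above the first detected header. Illustrative output for a short resume (the line numbers are made up):

segmenter = ResumeSegmenter()
sections = segmenter.segment(resume_lines)
# e.g. {"work_and_employment": (4, 12),
#       "education_and_training": (12, 18),
#       "basics_info": (0, 4)}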