Spaces:
Sleeping
Sleeping
from PyPDF2 import PdfReader | |
import streamlit as st | |
from embeddings import text_embedding | |
import scipy | |
def extract_and_rank(resumes, job_description):
    """Extract text from each PDF resume, embed it, and rank against a query.

    Args:
        resumes: Iterable of objects that ``PdfReader`` can open and that
            expose a ``name`` attribute; ``name`` is used as the dict key.
        job_description: Text passed to ``rankings`` as the query.

    Returns:
        A 3-tuple ``(ranked_output, out_embed_dict, out_text_dict)``: the
        result of ``rankings``, a mapping of resume name to its embedding,
        and a mapping of resume name to its extracted text.
    """
    # NOTE(review): the source lost its indentation; the ranking call is
    # assumed to run once, after every resume has been processed.
    embeddings_by_name = {}
    texts_by_name = {}
    for pdf_file in resumes:
        pages = PdfReader(pdf_file).pages
        # Page texts are concatenated with no separator between pages.
        page_texts = [page.extract_text() for page in pages]
        full_text = "".join(page_texts)
        vector = text_embedding(full_text)
        embeddings_by_name[pdf_file.name] = vector
        texts_by_name[pdf_file.name] = full_text
    ranking = rankings(out_dict=embeddings_by_name, query=job_description)
    return ranking, embeddings_by_name, texts_by_name
def get_sim(query_embedding, average_vec):
    """Return the cosine similarity of two vectors as a one-element list.

    Args:
        query_embedding: 1-D numeric vector for the query.
        average_vec: 1-D numeric vector to compare against the query.

    Returns:
        ``[similarity]`` where ``similarity = 1 - cosine_distance``. Falls
        back to ``[0]`` when the similarity cannot be computed (for example
        mismatched vector lengths) or evaluates to NaN.
    """
    import math

    # Import the submodule explicitly: a plain ``import scipy`` does not
    # guarantee ``scipy.spatial`` is loaded on every SciPy version, and the
    # resulting AttributeError used to be swallowed, scoring everything 0.
    from scipy.spatial import distance

    try:
        sim = 1 - distance.cosine(query_embedding, average_vec)
    except Exception:
        # Best-effort fallback kept from the original, but no longer traps
        # KeyboardInterrupt / SystemExit the way a bare ``except:`` did.
        return [0]
    # NaN (e.g. from a zero-norm vector) would make sorting by this score
    # unpredictable, so treat it like any other failed comparison.
    return [0] if math.isnan(sim) else [sim]
def rankings(out_dict, query):
    """Score every entry of ``out_dict`` against ``query`` and sort them.

    Args:
        out_dict: Mapping of key to embedding vector.
        query: Text that is embedded with ``text_embedding`` before scoring.

    Returns:
        A list of ``(key, score)`` tuples, where ``score`` is whatever
        ``get_sim`` returns for that entry, ordered from highest score to
        lowest.
    """
    query_vec = text_embedding(query)
    scored = [(name, get_sim(query_vec, vec)) for name, vec in out_dict.items()]
    # In-place sort is stable, so tied entries keep their dict order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored
# def data_clean(text): | |
# pattern = r'[^a-zA-Z0-9\s]' | |
# text = re.sub(pattern,'',' '.join(text)) | |
# tokens = [token.strip() for token in text.split()] | |
# filtered = [token for token in tokens if token.lower() not in stopword_list] | |
# filtered = ' '.join(filtered) | |
# return filtered | |
# def embeddings(word): | |
# # print(word) | |
# if word in wv.key_to_index: | |
# return wv.get_vector(word) | |
# else: | |
# return np.zeros(300) | |