Spaces:
No application file
No application file
import requests | |
import time | |
import json | |
import os | |
from dotenv import load_dotenv | |
# Load .env values into the process environment (python-dotenv) before the
# credentials below are looked up.
load_dotenv()

# Authentication headers attached to every Mathpix API request made in this
# module. Placeholder strings are used when the variables are not set.
HEADERS = dict(
    app_id=os.getenv('MATHPIX_APP_ID', 'default_app_id'),
    app_key=os.getenv('MATHPIX_APP_KEY', 'default_app_key'),
)
def extract_text(
    file_path: str,
    *,
    poll_interval: float = 1.0,
    max_wait: float = 300.0,
    request_timeout: float = 60.0,
) -> str:
    """Convert a PDF to Mathpix Markdown and return the converted text.

    The file is uploaded to the Mathpix ``v3/pdf`` endpoint, the conversion
    status is polled until it reports ``completed``, and the resulting
    ``.mmd`` document is downloaded. A copy is also written to
    ``<cwd>/resume_mmds/<original file name>.mmd``.

    Args:
        file_path: Path of the PDF to convert.
        poll_interval: Seconds to wait between status checks.
        max_wait: Maximum number of seconds to wait for the conversion to
            finish before giving up.
        request_timeout: Timeout in seconds applied to each HTTP request.

    Returns:
        The converted document text.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        TimeoutError: If the conversion is not completed within ``max_wait``.
        Exception: If the upload, status check, conversion, or download fails.
    """
    print("Parsing resume")
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file at {file_path} does not exist.")

    file_name = os.path.basename(file_path)
    upload_url = 'https://api.mathpix.com/v3/pdf'
    options = {
        "conversion_formats": {"md": True},
        "math_inline_delimiters": ["$", "$"],
        "rm_spaces": True,
    }

    # The upload must happen while the file handle is still open.
    with open(file_path, 'rb') as file:
        upload_resp = requests.post(
            upload_url,
            headers=HEADERS,
            files={'file': file},
            data={'options_json': json.dumps(options)},
            timeout=request_timeout,
        )
    if upload_resp.status_code != 200:
        raise Exception(f"Failed to upload PDF: {upload_resp.text}")

    pdf_id = upload_resp.json().get('pdf_id')
    if not pdf_id:
        raise Exception("Failed to retrieve PDF ID from response.")

    # Poll until the conversion finishes. A deadline and an explicit check
    # for a failed conversion keep this from spinning forever.
    status_url = f'https://api.mathpix.com/v3/pdf/{pdf_id}'
    deadline = time.monotonic() + max_wait
    while True:
        time.sleep(poll_interval)
        status_resp = requests.get(
            status_url, headers=HEADERS, timeout=request_timeout
        )
        if status_resp.status_code != 200:
            raise Exception(
                f"Failed to check conversion status: {status_resp.text}"
            )
        status = status_resp.json().get('status')
        if status == 'completed':
            break
        if status == 'error':
            raise Exception(f"PDF conversion failed: {status_resp.text}")
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"PDF conversion did not complete within {max_wait} seconds "
                f"(last status: {status!r})."
            )

    download_url = f'https://api.mathpix.com/v3/pdf/{pdf_id}.mmd'
    contents = requests.get(
        download_url, headers=HEADERS, timeout=request_timeout
    )
    if contents.status_code != 200:
        raise Exception(f"Failed to download converted file: {contents.text}")

    # Create the output directory on demand so a fresh checkout does not lose
    # a finished conversion, and write UTF-8 explicitly because the converted
    # text can contain characters outside the platform's default encoding.
    output_dir = os.path.join(os.getcwd(), "resume_mmds")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, str(file_name) + '.mmd')
    with open(output_path, "w", encoding="utf-8") as out_file:
        out_file.write(contents.text)

    return contents.text