from fastapi import FastAPI, File, UploadFile, HTTPException, BackgroundTasks
from pydantic import BaseModel
import fitz  # PyMuPDF
import tempfile
import os
import json
import logging
from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration

# Initialize FastAPI app and logging
app = FastAPI()
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base", legacy=False)
qa_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=-1)  # CPU-based inference


# Define the structured response model
class Education(BaseModel):
    degree: str
    university: str
    graduation_year: str


class ExtractedInfo(BaseModel):
    work_experience: str
    education: Education
    professional_course_detail: str
    software_usage: str
    safety_course_detail: str
    hse_description: str
    good_conduct_certificate: str


def extract_text_from_pdf(pdf_path: str) -> str:
    """Extracts text from the uploaded PDF."""
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc).strip()


def chunk_text(text: str, max_tokens: int = 512) -> list:
    """Splits the text into manageable chunks that fit within the token limit."""
    tokens = tokenizer.encode(text, add_special_tokens=False)
    return [
        tokenizer.decode(tokens[i:i + max_tokens], skip_special_tokens=True)
        for i in range(0, len(tokens), max_tokens)
    ]


def process_chunk(chunk: str) -> dict:
    """Uses the model to extract structured JSON data from a text chunk."""
    prompt = f"""
Extract the following information in JSON format:
{{
  "work_experience": "",
  "education": {{"degree": "", "university": "", "graduation_year": ""}},
  "professional_course_detail": "",
  "software_usage": "",
  "safety_course_detail": "",
  "hse_description": "",
  "good_conduct_certificate": ""
}}

Resume text: {chunk}
"""
    try:
        response = qa_pipeline(prompt, max_new_tokens=150)
        generated_text = response[0]["generated_text"]
        # Extract the JSON object embedded in the generated text
        json_start = generated_text.find("{")
        json_end = generated_text.rfind("}") + 1
        if json_start == -1 or json_end == 0:
            logger.warning("No JSON object found in model output.")
            return {}
        return json.loads(generated_text[json_start:json_end])
    except (json.JSONDecodeError, IndexError, KeyError) as e:
        logger.warning(f"Failed to parse JSON: {e}")
        return {}


def merge_outputs(chunks: list) -> dict:
    """Combines multiple chunk outputs into a single structured result."""
    merged = {
        "work_experience": "",
        "education": {"degree": "", "university": "", "graduation_year": ""},
        "professional_course_detail": "",
        "software_usage": "",
        "safety_course_detail": "",
        "hse_description": "",
        "good_conduct_certificate": "",
    }
    for chunk in chunks:
        chunk_output = process_chunk(chunk)
        for key, value in chunk_output.items():
            if key not in merged:
                continue  # Ignore keys the model invented
            if isinstance(value, dict) and isinstance(merged[key], dict):
                # Only fill sub-fields that are still empty
                for sub_key, sub_value in value.items():
                    if sub_key in merged[key] and not merged[key][sub_key]:
                        merged[key][sub_key] = sub_value
            elif not merged[key]:
                merged[key] = value
    return merged


@app.post("/process_cv/", response_model=ExtractedInfo)
async def process_cv(background_tasks: BackgroundTasks, file: UploadFile = File(...)):
    """Processes a PDF resume and returns structured information in JSON format."""
    if not file.filename.endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are allowed.")

    # Write the upload to a named temporary file (tempfile.mktemp is deprecated and insecure)
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(await file.read())
        temp_path = tmp.name

    try:
        # Extract text from the uploaded PDF
        text = extract_text_from_pdf(temp_path)
        if not text:
            raise HTTPException(status_code=400, detail="No extractable text found in the PDF.")

        # Process the text in chunks and merge the output
        chunks = chunk_text(text)
        structured_data = merge_outputs(chunks)
        return ExtractedInfo(**structured_data)
    finally:
        # Remove this request's file now; sweep any leftover PDFs after the response is sent
        if os.path.exists(temp_path):
            os.remove(temp_path)
        background_tasks.add_task(clean_temp_files)


def clean_temp_files():
    """Cleans up temporary PDF files left in the system temp directory."""
    for filename in os.listdir(tempfile.gettempdir()):
        if filename.endswith(".pdf"):
            try:
                os.remove(os.path.join(tempfile.gettempdir(), filename))
            except OSError as e:
                logger.warning(f"Failed to delete {filename}: {e}")
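

# --- Usage sketch (assumption: not part of the original service) ---
# A minimal way to run and exercise the endpoint locally, assuming uvicorn is
# installed and this module is saved as main.py:
#
#     uvicorn main:app --reload
#
#     curl -X POST "http://127.0.0.1:8000/process_cv/" \
#          -F "file=@resume.pdf;type=application/pdf"
#
# Alternatively, start the server programmatically:
if __name__ == "__main__":
    import uvicorn

    # Host and port are illustrative defaults, not values taken from the original code.
    uvicorn.run(app, host="0.0.0.0", port=8000)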