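# Spaces: Sleeping
# (The line above appears to be hosting-page status text captured along with
# the source; it is not part of the program.)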
import PyPDF2 | |
from docx import Document | |
import io | |
async def extract_text_from_pdf(pdf_data):
    """Extract the text of every page of an in-memory PDF.

    Args:
        pdf_data: Raw bytes of the PDF document.

    Returns:
        The text of all pages concatenated in page order. No separator is
        inserted between pages. A page that yields no text contributes
        nothing to the result.
    """
    with io.BytesIO(pdf_data) as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # `or ""` is a defensive guard: if extract_text() ever yields None or
        # an empty value for a page (e.g. an image-only page), the join must
        # not fail with a TypeError part-way through the document.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
async def extract_text_from_docx(docx_data):
    """Return the paragraph text of an in-memory .docx document.

    Args:
        docx_data: Raw bytes of the .docx file.

    Returns:
        The text of each entry in the document's ``paragraphs`` collection,
        in order, with a newline appended after every paragraph. An empty
        string when the document has no paragraphs.
    """
    document = Document(io.BytesIO(docx_data))
    # Only `document.paragraphs` is read; each paragraph is newline-terminated.
    return "".join(f"{paragraph.text}\n" for paragraph in document.paragraphs)
async def extract_text_from_attachment(filename, data):
    """Extract text from an attachment, choosing the parser by file extension.

    Args:
        filename: Name of the attachment. The extension is matched
            case-insensitively; a missing (``None``) or empty name is treated
            as an unsupported type.
        data: Raw bytes of the attachment.

    Returns:
        The extracted text for ``.pdf`` and ``.docx`` attachments, or the
        literal string ``"Unsupported document type"`` for anything else.
    """
    # Attachment names can arrive with arbitrary casing ("REPORT.PDF") or be
    # absent altogether, so normalise before comparing suffixes.
    normalized_name = (filename or "").lower()

    if normalized_name.endswith('.pdf'):
        return await extract_text_from_pdf(data)
    if normalized_name.endswith('.docx'):
        return await extract_text_from_docx(data)

    # Add handling for other document types if needed
    return "Unsupported document type"