Omkar008's picture
Upload 17 files
d1a66a2 verified
raw
history blame
No virus
889 Bytes
import PyPDF2
from docx import Document
import io
async def extract_text_from_pdf(pdf_data):
    """Return the concatenated text of every page in an in-memory PDF.

    Args:
        pdf_data: Raw bytes of the PDF document (wrapped in ``io.BytesIO``).

    Returns:
        The text of all pages joined in page order, with no separator
        inserted between pages. An empty string when the document has no
        pages or no extractable text.

    Note:
        The body contains no ``await``; parsing runs synchronously inside
        the coroutine.
    """
    with io.BytesIO(pdf_data) as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Iterate the pages directly and join once instead of growing a
        # string with ``+=``. ``or ""`` guards against a page whose
        # extraction yields None (e.g. no text layer), which would
        # otherwise raise TypeError during concatenation.
        return "".join(
            (page.extract_text() or "") for page in pdf_reader.pages
        )
async def extract_text_from_docx(docx_data):
    """Return the text of every paragraph in an in-memory DOCX file.

    Args:
        docx_data: Raw bytes of the .docx document (wrapped in
            ``io.BytesIO`` before being handed to ``Document``).

    Returns:
        All paragraph texts in document order, each one followed by a
        newline character. An empty string when there are no paragraphs.
    """
    stream = io.BytesIO(docx_data)
    document = Document(stream)
    # Every paragraph contributes its text plus a trailing "\n".
    return "".join(paragraph.text + "\n" for paragraph in document.paragraphs)
async def extract_text_from_attachment(filename, data):
    """Extract text from an attachment, dispatching on its file extension.

    Args:
        filename: Name of the attachment; only its extension is inspected.
            The comparison is case-insensitive, so ``REPORT.PDF`` and
            ``report.pdf`` are handled the same way.
        data: Raw bytes of the attachment.

    Returns:
        The extracted text for ``.pdf`` and ``.docx`` files, or the literal
        string ``"Unsupported document type"`` for anything else, including
        a missing or non-string filename.
    """
    # A missing or non-string filename cannot be matched against an
    # extension; report it as unsupported rather than raising
    # AttributeError.
    if not isinstance(filename, str):
        return "Unsupported document type"

    # Extensions are frequently upper- or mixed-case in practice, so
    # normalise before comparing.
    lowered = filename.lower()
    if lowered.endswith('.pdf'):
        return await extract_text_from_pdf(data)
    if lowered.endswith('.docx'):
        return await extract_text_from_docx(data)

    # Add handling for other document types if needed
    return "Unsupported document type"