Spaces:
Sleeping
Sleeping
File size: 889 Bytes
d1a66a2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
import PyPDF2
from docx import Document
import io
async def extract_text_from_pdf(pdf_data):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_data: Bytes-like content of the PDF document; it is wrapped in
            an in-memory ``io.BytesIO`` buffer before being parsed.

    Returns:
        A single string made of each page's extracted text, with no
        separator inserted between pages.
    """
    with io.BytesIO(pdf_data) as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Walk the pages directly and join once, rather than indexing by
        # position and growing the result with repeated ``+=``.
        return "".join(page.extract_text() for page in pdf_reader.pages)
async def extract_text_from_docx(docx_data):
    """Return the paragraph text of a .docx document.

    Args:
        docx_data: Bytes-like content of the document; it is wrapped in an
            in-memory ``io.BytesIO`` buffer before being opened.

    Returns:
        The text of each entry in the document's ``paragraphs``, in order,
        with a newline appended after every paragraph.
    """
    buffer = io.BytesIO(docx_data)
    document = Document(buffer)
    # Every paragraph contributes its text followed by one newline, so the
    # result ends with a trailing newline whenever any paragraph exists.
    return "".join(f"{paragraph.text}\n" for paragraph in document.paragraphs)
async def extract_text_from_attachment(filename, data):
    """Extract text from an attachment, choosing the parser by extension.

    Args:
        filename: Attachment name. Only its ending is inspected, and the
            comparison ignores letter case.
        data: Content of the attachment, passed unchanged to the
            format-specific extractor.

    Returns:
        The extracted text for names ending in ``.pdf`` or ``.docx``, or
        the literal string ``"Unsupported document type"`` for anything
        else, including a missing or empty filename.
    """
    # A nameless attachment cannot be matched to a parser; report it as
    # unsupported instead of failing on the attribute lookup below.
    if not filename:
        return "Unsupported document type"

    # Normalise case so that "REPORT.PDF" is routed like "report.pdf".
    lowered = filename.lower()
    if lowered.endswith(".pdf"):
        return await extract_text_from_pdf(data)
    if lowered.endswith(".docx"):
        return await extract_text_from_docx(data)
    # Add handling for other document types if needed
    return "Unsupported document type"
|