File size: 889 Bytes
d1a66a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import PyPDF2
from docx import Document
import io

async def extract_text_from_pdf(pdf_data):
    """Extract the concatenated text of every page in a PDF.

    Args:
        pdf_data: Raw bytes of the PDF document.

    Returns:
        The text of all pages joined in page order, with no separator
        inserted between pages.

    Note:
        Although declared ``async``, the body contains no ``await``; the
        parsing runs synchronously once the coroutine is awaited.
    """
    with io.BytesIO(pdf_data) as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Fall back to an empty string when a page yields a falsy result,
        # so a single text-less page cannot abort the whole extraction
        # with a TypeError during concatenation.
        return "".join(
            page.extract_text() or "" for page in pdf_reader.pages
        )

async def extract_text_from_docx(docx_data):
    """Return the text of every paragraph in a DOCX file, one per line.

    Args:
        docx_data: Raw bytes of the DOCX document.

    Returns:
        Each paragraph's text followed by a newline, in document order
        (so a non-empty result ends with a trailing newline).
    """
    buffer = io.BytesIO(docx_data)
    document = Document(buffer)
    # Terminate (rather than separate) paragraphs with "\n" so the final
    # paragraph keeps its trailing newline.
    return "".join(paragraph.text + "\n" for paragraph in document.paragraphs)

async def extract_text_from_attachment(filename, data):
    """Extract text from an attachment, picking the parser by extension.

    The extension is matched case-insensitively, so ``REPORT.PDF`` and
    ``Notes.Docx`` are handled the same as their lowercase forms.

    Args:
        filename: Name of the attachment; only its suffix is inspected.
        data: Raw bytes of the attachment.

    Returns:
        The extracted text for ``.pdf`` and ``.docx`` files, or the literal
        string ``"Unsupported document type"`` for anything else.
    """
    # Normalise once so uppercase or mixed-case extensions are recognised.
    normalized_name = filename.lower()
    if normalized_name.endswith('.pdf'):
        return await extract_text_from_pdf(data)
    if normalized_name.endswith('.docx'):
        return await extract_text_from_docx(data)
    # Add handling for other document types if needed
    return "Unsupported document type"