AiContract

Sleeping

karthikeyan-r committed on Sep 5

Commit

4834106

•

1 Parent(s): a48b255

Create pdfProcessor.py

Files changed (1) hide show

pdfProcessor.py ADDED Viewed

+from langchain_community.document_loaders import PyPDFLoader
+import os
+from typing import List
class PDFProcessor:
    """
    Class for processing PDF files to extract text content.
    """

    def extract_text_from_pdfs(self, file_paths: List[str]) -> List[str]:
        """
        Extract text content from a list of PDF files.

        Every file is opened with ``PyPDFLoader`` and read through
        ``load_and_split()``; the ``page_content`` of each document the
        loader returns becomes one entry of the result. The result is
        therefore a flat list with one string per returned document, in
        input order, not one string per file.

        Processing is best effort: a file that raises is reported on
        stdout and skipped, and the remaining files are still processed.

        Args:
            file_paths (List[str]): A list of file paths to the PDF documents.
                A single path (``str`` or ``os.PathLike``) is accepted and
                treated as a one-element list; ``None`` is treated as an
                empty list.
        Returns:
            List[str]: The extracted text pieces. Empty when no input was
                given or when every file failed.
        """
        if file_paths is None:
            return []
        # A bare string is itself iterable; without this guard each character
        # would be treated as a separate (bogus) file path.
        if isinstance(file_paths, (str, os.PathLike)):
            file_paths = [file_paths]

        texts = []
        for file_path in file_paths:
            try:
                loader = PyPDFLoader(file_path)
                documents = loader.load_and_split()
                for document in documents:
                    content = document.page_content
                    if isinstance(content, bytes):
                        # Undecodable bytes are dropped rather than raising.
                        text = content.decode('utf-8', errors='ignore')
                    elif isinstance(content, str):
                        text = content
                    else:
                        print(f"Unexpected type: {type(content)}")
                        continue
                    texts.append(text)
            except Exception as e:
                # Deliberate catch-all: one unreadable PDF must not abort
                # the whole batch.
                print(f"Failed to process {file_path}: {e}")
        return texts