import spaces import gradio as gr import json import os from pathlib import Path import logging from docling.document_converter import DocumentConverter from docling.datamodel.base_models import InputFormat, DocumentStream from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode from docling.document_converter import PdfFormatOption import requests from urllib.parse import urlparse from datetime import datetime import tempfile from docx import Document from docx.shared import Inches import markdown # Set up logging logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger(__name__) def is_valid_url(url): try: result = urlparse(url) return all([result.scheme, result.netloc]) except: return False def markdown_to_docx(markdown_content): """Convert markdown content to DOCX format""" doc = Document() # Split content into lines lines = markdown_content.split('\n') for line in lines: # Handle headers if line.startswith('# '): doc.add_heading(line[2:], level=1) elif line.startswith('## '): doc.add_heading(line[3:], level=2) elif line.startswith('### '): doc.add_heading(line[4:], level=3) # Handle lists elif line.startswith('* ') or line.startswith('- '): doc.add_paragraph(line[2:], style='List Bullet') elif line.startswith('1. '): doc.add_paragraph(line[3:], style='List Number') # Handle normal text elif line.strip(): doc.add_paragraph(line) # Handle empty lines else: doc.add_paragraph() return doc def create_output_files(content, original_name): """Create temporary files for different formats and return their paths""" files = {} # Generate base filename base_name = Path(original_name).stem timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") # Create markdown file md_path = tempfile.NamedTemporaryFile(delete=False, suffix='.md').name with open(md_path, "w", encoding="utf-8") as f: f.write(content) files['markdown'] = md_path # Create JSON file json_content = { "title": original_name, "content": content, "metadata": { "conversion_date": datetime.now().isoformat() } } json_path = tempfile.NamedTemporaryFile(delete=False, suffix='.json').name with open(json_path, "w", encoding="utf-8") as f: json.dump(json_content, f, ensure_ascii=False, indent=2) files['json'] = json_path # Create proper DOCX file docx_path = tempfile.NamedTemporaryFile(delete=False, suffix='.docx').name doc = markdown_to_docx(content) doc.save(docx_path) files['docx'] = docx_path return files @spaces.GPU() def process_document(input_type, file_input, url_input, use_gpu, table_mode): try: logger.debug(f"Processing with input type: {input_type}") logger.debug(f"File input: {file_input}") # Configure pipeline pipeline_options = PdfPipelineOptions(do_table_structure=True) if table_mode: pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE else: pipeline_options.table_structure_options.mode = TableFormerMode.FAST converter = DocumentConverter( format_options={ InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) } ) # Handle different input types if input_type == "file": if file_input is None: return None, None, None, None, "Please upload a file" source = file_input original_name = Path(file_input).name elif input_type == "url": if not url_input or not is_valid_url(url_input): return None, None, None, None, "Please enter a valid URL" source = url_input original_name = Path(urlparse(url_input).path).name or "url_document" else: return None, None, None, None, "Invalid input type" # Convert document logger.debug(f"Converting document: {source}") result = converter.convert(source) # Get markdown content markdown_content = result.document.export_to_markdown() # Create output files output_files = create_output_files(markdown_content, original_name) return ( output_files['markdown'], output_files['json'], output_files['docx'], markdown_content, "Conversion completed successfully! Use the download buttons below to get your files." ) except Exception as e: logger.exception("Error occurred during conversion") return None, None, None, None, f"Error during conversion: {str(e)}\nCheck the console for detailed error logs." # Create title HTML with custom style and duplicate button CSS title_html = """
Convert documents from files or URLs to various formats
Please like this Space if you find it useful! Your support is appreciated 🙏
Made with 💖 by Pejman Ebrahimi