# Media_Wiki.py
# Description: This file contains the functions to import MediaWiki dumps into the media_db and Chroma databases.
#######################################################################################################################
#
# Imports
import json
import logging
import os
import re
from typing import List, Dict, Any, Iterator, Optional
# 3rd-Party Imports
import mwparserfromhell
import mwxml
import yaml
#
# Local Imports
from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords, check_media_exists
from App_Function_Libraries.RAG.ChromaDB_Library import process_and_store_content
#
#######################################################################################################################
#
# Functions:
def setup_logger(name: str, level: int = logging.INFO, log_file: Optional[str] = None) -> logging.Logger:
    """Set up and return a logger with the given name and level."""
    logger = logging.getLogger(name)
    logger.setLevel(level)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    # Only attach handlers once, so repeated calls don't produce duplicate log lines.
    if not logger.handlers:
        if log_file:
            file_handler = logging.FileHandler(log_file)
            file_handler.setFormatter(formatter)
            logger.addHandler(file_handler)
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)
    return logger


# Usage
logger = setup_logger('mediawiki_import', log_file='mediawiki_import.log')


# Load configuration
def load_mediawiki_import_config():
    with open(os.path.join('Config_Files', 'mediawiki_import_config.yaml'), 'r') as f:
        return yaml.safe_load(f)


config = load_mediawiki_import_config()
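# Assumed shape of Config_Files/mediawiki_import_config.yaml, inferred from the
# config['chunking'] / chunk_options['max_size'] lookups below (the real file
# may carry additional keys):
#
#   chunking:
#     max_size: 1000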


def parse_mediawiki_dump(file_path: str, namespaces: Optional[List[int]] = None,
                         skip_redirects: bool = False) -> Iterator[Dict[str, Any]]:
    """Parse a MediaWiki XML dump and yield one dict per revision, with wiki markup stripped."""
    with open(file_path, encoding='utf-8') as f:
        dump = mwxml.Dump.from_file(f)
        for page in dump.pages:
            if skip_redirects and page.redirect:
                continue
            if namespaces and page.namespace not in namespaces:
                continue
            for revision in page:
                # revision.text can be None (e.g. for deleted revisions); guard before parsing.
                code = mwparserfromhell.parse(revision.text or '')
                text = code.strip_code(normalize=True, collapse=True, keep_template_params=False)
                yield {
                    "title": page.title,
                    "content": text,
                    "namespace": page.namespace,
                    "page_id": page.id,
                    "revision_id": revision.id,
                    "timestamp": revision.timestamp
                }
                logger.debug(f"Yielded page: {page.title}")


def optimized_chunking(text: str, chunk_options: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Split wikitext into chunks along '== Section ==' headings, capped at chunk_options['max_size'] characters."""
    # re.split with a capture group yields [intro, title1, body1, title2, body2, ...],
    # so pair each captured title with the body that follows it.
    sections = re.split(r'\n==\s*(.*?)\s*==\n', text)
    pairs = [("Introduction", sections[0])] + list(zip(sections[1::2], sections[2::2]))
    chunks = []
    current_chunk = ""
    current_size = 0
    current_title = "Introduction"
    for section_title, section_content in pairs:
        if current_chunk and current_size + len(section_content) > chunk_options['max_size']:
            # Flush the accumulated chunk, labeled with its first section, and start a new one.
            chunks.append({"text": current_chunk, "metadata": {"section": current_title}})
            current_chunk = f"\n== {section_title} ==\n{section_content}"
            current_size = len(section_content)
            current_title = section_title
        else:
            if section_title != "Introduction":
                current_chunk += f"\n== {section_title} ==\n"
            current_chunk += section_content
            current_size += len(section_content)
    if current_chunk:
        chunks.append({"text": current_chunk, "metadata": {"section": current_title}})
    return chunks
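# Illustrative example (hypothetical input): for text = "Intro\n== History ==\nBody",
# re.split yields ['Intro', 'History', 'Body'], so with {'max_size': 1000} the
# function returns a single chunk containing the intro, the reconstructed
# '== History ==' heading, and the body, labeled "Introduction" after its
# first section.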


def process_single_item(content: str, title: str, wiki_name: str, chunk_options: Dict[str, Any],
                        is_combined: bool = False, item: Optional[Dict[str, Any]] = None):
    """Store one article (or a combined dump) in media_db, then chunk and index it in Chroma."""
    try:
        url = f"mediawiki:{wiki_name}" if is_combined else f"mediawiki:{wiki_name}:{title}"
        if not check_media_exists(title, url):
            media_id = add_media_with_keywords(
                url=url,
                title=title,
                media_type="mediawiki_dump" if is_combined else "mediawiki_article",
                content=content,
                keywords=f"mediawiki,{wiki_name}" + (",full_dump" if is_combined else ",article"),
                prompt="",
                summary="",
                transcription_model="",
                author="MediaWiki",
                ingestion_date=item['timestamp'].strftime('%Y-%m-%d') if item else None
            )
            chunks = optimized_chunking(content, chunk_options)
            for chunk in chunks:
                process_and_store_content(chunk['text'], f"mediawiki_{wiki_name}", media_id, title)
            logger.info(f"Successfully processed item: {title}")
        else:
            logger.info(f"Skipping existing article: {title}")
    except Exception as e:
        logger.error(f"Error processing item {title}: {str(e)}")


def load_checkpoint(file_path: str) -> int:
    """Return the page_id recorded in the checkpoint file, or 0 if none exists."""
    if os.path.exists(file_path):
        with open(file_path, 'r') as f:
            return json.load(f)['last_processed_id']
    return 0


def save_checkpoint(file_path: str, last_processed_id: int):
    """Persist the id of the last successfully processed page."""
    with open(file_path, 'w') as f:
        json.dump({'last_processed_id': last_processed_id}, f)


def import_mediawiki_dump(
        file_path: str,
        wiki_name: str,
        namespaces: Optional[List[int]] = None,
        skip_redirects: bool = False,
        chunk_options: Optional[Dict[str, Any]] = None,
        single_item: bool = False,
        progress_callback: Any = None
) -> Iterator[str]:
    """Import a MediaWiki XML dump, yielding human-readable progress messages as it goes."""
    try:
        if chunk_options is None:
            chunk_options = config['chunking']
        checkpoint_file = f"{wiki_name}_import_checkpoint.json"
        last_processed_id = load_checkpoint(checkpoint_file)
        total_pages = count_pages(file_path, namespaces, skip_redirects)
        processed_pages = 0
        yield f"Found {total_pages} pages to process."
        for item in parse_mediawiki_dump(file_path, namespaces, skip_redirects):
            if item['page_id'] <= last_processed_id:
                continue  # Already handled in a previous run; resume past the checkpoint.
            process_single_item(item['content'], item['title'], wiki_name, chunk_options, False, item)
            save_checkpoint(checkpoint_file, item['page_id'])
            processed_pages += 1
            # Guard against division by zero when count_pages() failed and returned 0.
            if progress_callback is not None and total_pages > 0:
                progress_callback(processed_pages / total_pages, f"Processed page: {item['title']}")
            yield f"Processed page {processed_pages}/{total_pages}: {item['title']}"
        os.remove(checkpoint_file)  # Remove checkpoint file after successful import
        yield f"Successfully imported and indexed MediaWiki dump: {wiki_name}"
    except FileNotFoundError:
        logger.error(f"MediaWiki dump file not found: {file_path}")
        yield f"Error: File not found - {file_path}"
    except PermissionError:
        logger.error(f"Permission denied when trying to read: {file_path}")
        yield f"Error: Permission denied - {file_path}"
    except Exception as e:
        logger.exception(f"Error during MediaWiki import: {str(e)}")
        yield f"Error during import: {str(e)}"


def count_pages(file_path: str, namespaces: Optional[List[int]] = None, skip_redirects: bool = False) -> int:
    """
    Count the number of pages in a MediaWiki XML dump file.

    Args:
        file_path (str): Path to the MediaWiki XML dump file.
        namespaces (List[int], optional): List of namespace IDs to include. If None, include all namespaces.
        skip_redirects (bool, optional): Whether to skip redirect pages.

    Returns:
        int: The number of pages in the dump file.
    """
    try:
        with open(file_path, encoding='utf-8') as f:
            dump = mwxml.Dump.from_file(f)
            count = 0
            for page in dump.pages:
                if skip_redirects and page.redirect:
                    continue
                if namespaces and page.namespace not in namespaces:
                    continue
                count += 1
            return count
    except Exception as e:
        logger.error(f"Error counting pages in MediaWiki dump: {str(e)}")
        return 0
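

# Minimal usage sketch (illustrative): the dump path and wiki name below are
# assumptions for demonstration, not values shipped with this module.
if __name__ == '__main__':
    for message in import_mediawiki_dump(
            file_path='dumps/enwiki-latest-pages-articles.xml',
            wiki_name='enwiki',
            namespaces=[0],  # main/article namespace only
            skip_redirects=True,
    ):
        print(message)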
#
# End of Media_Wiki.py
#######################################################################################################################