Spaces:

vitorcalvi
/

PDFToAudioBookSummary

Runtime error

File size: 4,936 Bytes

d7529f8

import gradio as gr
import PyPDF2
import nltk
from nltk.tokenize import sent_tokenize
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
import os
from pydub import AudioSegment
from concurrent.futures import ThreadPoolExecutor
from TTS.api import TTS

# Download necessary NLTK data
# (presumably the sentence-tokenizer models needed by sent_tokenize and by
# sumy's Tokenizer("english") — verify the resource name against the
# installed NLTK version)
nltk.download('punkt', quiet=True)

# Initialize TTS model using ONNX
# This runs at import time, so any failure here prevents the app from starting.
# NOTE(review): confirm that the installed TTS.api.TTS constructor accepts a
# `use_onnx` keyword; an unexpected keyword argument would raise TypeError
# before the UI is ever built.
tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", use_onnx=True)

# Set default speaker and language manually based on valid IDs obtained
# These two values are what gradio_interface passes down to process_pdf.
# NOTE(review): "en_speaker_1" looks like a placeholder — verify it against
# the speaker IDs the loaded model actually exposes.
default_speaker = "en_speaker_1"  # Replace with a valid speaker ID from the printed list
default_language = "en"  # Replace with a valid language code from the printed list

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, concatenated.

    Pages are joined in document order with no separator. Any failure while
    opening or parsing the file is printed to stdout and signalled to the
    caller by returning ``None`` instead of raising.
    """
    try:
        with open(pdf_path, 'rb') as pdf_stream:
            pages = PyPDF2.PdfReader(pdf_stream).pages
            # Extraction must happen while the underlying stream is still open.
            return ''.join(page.extract_text() for page in pages)
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
        return None

def summarize_text(text, summary_length):
    """Return an extractive LSA summary of ``text``.

    ``summary_length`` is handed to the summarizer as the number of sentences
    to select. The selected sentences are converted to strings and joined
    with single spaces.
    """
    document = PlaintextParser.from_string(text, Tokenizer("english")).document
    selected_sentences = LsaSummarizer()(document, summary_length)
    return ' '.join(map(str, selected_sentences))

def split_into_chapters(text, num_chapters):
    """Split ``text`` into at most ``num_chapters`` chapters of whole sentences.

    The text is tokenized into sentences, which are then dealt out in order
    so that chapter sizes differ by at most one sentence. When there are no
    more sentences than requested chapters, each sentence becomes its own
    chapter and the list may be shorter than ``num_chapters``.

    Args:
        text: The text to split.
        num_chapters: Desired number of chapters. Floats are truncated and
            values below 1 are treated as 1, so a UI control can pass its
            raw value.

    Returns:
        A list of chapter strings, in reading order.
    """
    sentences = sent_tokenize(text)
    # Normalise the count: a float would break range(), zero would divide by 0.
    num_chapters = max(1, int(num_chapters))
    if len(sentences) <= num_chapters:
        return sentences

    # Every chapter gets `base` sentences; the first `extra` chapters get one
    # more, so no single chapter absorbs the whole remainder.
    base, extra = divmod(len(sentences), num_chapters)
    chapters = []
    start = 0
    for chapter_index in range(num_chapters):
        end = start + base + (1 if chapter_index < extra else 0)
        chapters.append(' '.join(sentences[start:end]))
        start = end
    return chapters

def text_to_speech(text, output_path, speaker, language):
    """Synthesize ``text`` with the module-level ``tts`` model into a file.

    The audio is written to ``output_path`` using the given ``speaker`` and
    ``language``; the same path is returned for convenience.

    NOTE(review): callers in this file pass a ``.mp3`` path — confirm which
    audio container ``tts_to_file`` actually writes.
    """
    synthesis_args = {
        "text": text,
        "file_path": output_path,
        "speaker": speaker,
        "language": language,
    }
    tts.tts_to_file(**synthesis_args)
    return output_path

def adjust_audio_speed(input_path, output_path, target_duration):
    """Write a copy of ``input_path`` that lasts at most ``target_duration``.

    Audio longer than the target is sped up by ``current / target``; audio
    that already fits is re-exported unchanged, because ``speedup`` can only
    shorten a segment, never stretch it.

    Args:
        input_path: Path of the source audio file.
        output_path: Path the MP3 result is exported to.
        target_duration: Maximum desired length in milliseconds (the same
            unit as ``len()`` of an ``AudioSegment``).

    Returns:
        ``output_path`` when a file was written there, otherwise
        ``input_path`` (zero-length audio, non-positive target, or a failure
        during speed-up/export) so the caller can fall back to the original.

    Raises:
        Whatever the audio loader raises when ``input_path`` cannot be
        decoded; loading is deliberately left outside the fallback handling.
    """
    # Let the decoder detect the container instead of forcing MP3: the file
    # comes from the TTS step, whose output format is not guaranteed to match
    # the ".mp3" extension it was asked to write to.
    audio = AudioSegment.from_file(input_path)
    current_duration = len(audio)

    if current_duration == 0:
        print(f"Warning: Audio file {input_path} has zero duration. Skipping speed adjustment.")
        return input_path

    if target_duration <= 0:
        # Guard the division below; there is no meaningful speed for this.
        print(f"Warning: Invalid target duration {target_duration}. Skipping speed adjustment.")
        return input_path

    speed_factor = current_duration / target_duration
    # A speed-up of under 1% is inaudible and not worth re-chunking the audio.
    min_useful_speedup = 1.01

    try:
        if speed_factor >= min_useful_speedup:
            adjusted_audio = audio.speedup(playback_speed=speed_factor)
        else:
            adjusted_audio = audio
        adjusted_audio.export(output_path, format="mp3")
        return output_path
    except Exception as e:
        print(f"Error adjusting audio speed: {e}")
        return input_path

def process_chapter(chapter, i, speaker, language):
    """Turn one chapter of text into ``chapter_<i+1>.mp3`` and return its path.

    The chapter is synthesized to a temporary file, then fitted into three
    minutes by ``adjust_audio_speed``. If the adjuster reports that it fell
    back to the unmodified input, that temporary file is promoted to the
    final path so the returned path always refers to a file that exists.

    File names depend only on the chapter index, so concurrent runs in the
    same working directory overwrite each other's files.

    Args:
        chapter: Chapter text to synthesize.
        i: Zero-based chapter index (file names use ``i + 1``).
        speaker: Speaker ID forwarded to ``text_to_speech``.
        language: Language code forwarded to ``text_to_speech``.

    Returns:
        The output file path, or ``None`` when the chapter is blank or any
        step fails (the error is printed, not raised).
    """
    temp_path = None
    try:
        if not chapter.strip():
            print(f"Warning: Chapter {i+1} is empty. Skipping.")
            return None

        temp_path = f"temp_chapter_{i+1}.mp3"
        output_path = f"chapter_{i+1}.mp3"

        text_to_speech(chapter, temp_path, speaker, language)

        # Adjust speed to fit into 3 minutes
        result_path = adjust_audio_speed(temp_path, output_path, 3 * 60 * 1000)

        if result_path != output_path:
            # The adjuster wrote nothing to output_path; keep the raw audio
            # rather than returning a path to a file that was never created.
            os.replace(temp_path, output_path)
        return output_path
    except Exception as e:
        print(f"Error processing chapter {i+1}: {e}")
        return None
    finally:
        # Clean up the temporary file on every exit path, including failures.
        if temp_path is not None and os.path.exists(temp_path):
            os.remove(temp_path)

def process_pdf(pdf_path, num_chapters, speaker, language):
    """Convert a PDF into a list of summarized, spoken chapter audio files.

    Pipeline: extract the text, summarize it down to roughly fifteen minutes
    of speech, split the summary into chapters, and synthesize the chapters
    in a thread pool.

    Args:
        pdf_path: Path of the PDF file to read.
        num_chapters: Requested number of chapters.
        speaker: Speaker ID forwarded to ``process_chapter``.
        language: Language code forwarded to ``process_chapter``.

    Returns:
        Paths of the chapter audio files that were produced, in chapter
        order. Chapters that failed are omitted; the list is empty when no
        text could be extracted.
    """
    full_text = extract_text_from_pdf(pdf_path)
    if full_text is None or not full_text.strip():
        print("Error: Extracted text is empty or None")
        return []

    # Clean text to remove unwanted characters
    full_text = full_text.replace('\t', ' ')

    # Budget: 15 minutes at about 150 spoken words per minute. The summarizer
    # is sized in sentences, so convert the word budget using the document's
    # average sentence length (dividing the budget by the total word count
    # would shrink the summary as the document grows).
    target_words = 15 * 150
    word_count = len(full_text.split())
    sentence_count = max(1, len(sent_tokenize(full_text)))
    words_per_sentence = word_count / sentence_count
    summary_length = max(
        1, min(sentence_count, round(target_words / words_per_sentence))
    )
    summary = summarize_text(full_text, summary_length)

    chapters = split_into_chapters(summary, num_chapters)

    def render(indexed_chapter):
        index, chapter = indexed_chapter
        return process_chapter(chapter, index, speaker, language)

    with ThreadPoolExecutor() as executor:
        # executor.map preserves input order, so results stay in chapter order.
        chapter_audios = list(executor.map(render, enumerate(chapters)))

    return [audio for audio in chapter_audios if audio is not None]

def gradio_interface(pdf_file, num_chapters):
    """Gradio callback: map an uploaded PDF to the values for ten audio slots.

    Returns a list of chapter audio paths padded with ``None`` up to ten
    entries; with no upload, all ten entries are ``None``.
    """
    slot_count = 10
    if pdf_file is None:
        return [None] * slot_count

    audio_paths = process_pdf(pdf_file.name, num_chapters, default_speaker, default_language)
    padding = [None] * (slot_count - len(audio_paths))
    return audio_paths + padding

# UI wiring: a PDF upload and a chapter-count slider are passed to
# gradio_interface, and the list it returns fills the audio components below.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.File(label="Upload PDF Book"),
        gr.Slider(minimum=1, maximum=10, step=1, label="Number of Chapters", value=5)
    ],
    # Ten audio components, matching the slider maximum and the length of the
    # padded list gradio_interface returns.
    outputs=[gr.Audio(label=f"Chapter {i+1}") for i in range(10)],
    title="PDF Book to Audiobook Summary",
    description="Upload a PDF book to get a 15-minute audiobook summary split into chapters."
)

# Launch the app only when this file is executed as a script; share=True asks
# Gradio for a publicly reachable link in addition to the local server.
if __name__ == "__main__":
    iface.launch(share=True)