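"""Speech-to-text translation demo: records audio through a Gradio UI and
translates it with Meta's SeamlessM4T v2 large model
(facebook/seamless-m4t-v2-large)."""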
import torch
import torchaudio
import gradio as gr
from transformers import AutoProcessor, AutoModel
import warnings
import traceback
import gc

warnings.filterwarnings("ignore")

class OptimizedContinuousTranslator:
    def __init__(self, target_language="spa", chunk_duration=3, sample_rate=16000):
        self.target_language = target_language
        # Kept as configuration for chunked / continuous use
        self.chunk_duration = chunk_duration
        self.sample_rate = sample_rate
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        try:
            self.processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
            self.model = AutoModel.from_pretrained("facebook/seamless-m4t-v2-large")
            # Inputs are moved to self.device in translate_audio, so the model must live there too
            self.model.to(self.device)
        except Exception as e:
            print(f"Error loading model: {e}")
            self.processor = None
            self.model = None
    
    def wav_to_tensor(self, file_path, sampling_rate):
        """
        Loads a WAV file into a PyTorch tensor, resampled to the requested rate.

        Args:
            file_path (str): Path to the WAV file.
            sampling_rate (int): Desired sampling rate in Hz (e.g. 16000).

        Returns:
            torch.Tensor: Audio waveform tensor of shape (channels, samples).
            int: Sampling rate of the returned waveform (equal to `sampling_rate`).
        """
        # Load the WAV file
        waveform, orig_sample_rate = torchaudio.load(file_path)
        # Resample if the file's native sampling rate differs from the requested one
        if orig_sample_rate != sampling_rate:
            resampler = torchaudio.transforms.Resample(orig_freq=orig_sample_rate, new_freq=sampling_rate)
            waveform = resampler(waveform)

        return waveform, sampling_rate

    def translate_audio(self, audio_file_path):
        """
        Translate a recorded audio file into the target language.

        Args:
            audio_file_path (str): Path to the audio file to translate.

        Returns:
            str: Translated text, or an empty string on failure.
        """
        if audio_file_path is None or self.processor is None or self.model is None:
            print("No audio file provided or the model failed to load; skipping translation.")
            return ""

        try:
            # Prepare audio inputs
            waveform, sample_rate = self.wav_to_tensor(audio_file_path, 16000)
            audio_inputs = self.processor(audios=waveform.unsqueeze(0), return_tensors="pt", sampling_rate=sample_rate)

            # Move inputs to the same device as the model
            audio_inputs = {k: v.to(self.device) if isinstance(v, torch.Tensor) else v
                            for k, v in audio_inputs.items()}

            # Generate the translation (text only, no speech output)
            output_tokens = self.model.generate(
                **audio_inputs,
                tgt_lang=self.target_language,
                generate_speech=False
            )

            # Decode the translated text
            translated_text = self.processor.decode(
                output_tokens[0].tolist()[0],
                skip_special_tokens=True
            )
            return translated_text
        except Exception as e:
            print(f"Translation error: {e}\n{traceback.format_exc()}")
            return ""
        finally:
            # Aggressive memory cleanup
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            gc.collect()
    

# Web app: simple one-shot translator (no real-time streaming)
def create_translator_interface():
    """Create the optimized Gradio interface for the Continuous Translator"""
    # Initialize the translator
    translator = OptimizedContinuousTranslator()
    
    with gr.Blocks(title="Continuous Audio Translator") as demo:
        # Usage Instructions in a Markdown Dropdown
        gr.Markdown("""
## 🎙️ Audio Translator: How to Use

<details>
<summary>Click to view usage instructions</summary>

### 🌐 Translation Steps
1. **Select Target Language**: 
   - Choose the language you want to translate to from the dropdown menu

2. **Record Audio**:
   - Click on the microphone icon in the audio input area
   - Record your audio clearly and concisely
   - Ensure minimal background noise for best results

3. **Translate**:
   - After recording, click the "Translate" button
   - The translated text will appear in the transcript box below

### 💡 Tips
- Speak clearly and at a moderate pace
- Avoid complex or technical language for more accurate translations
- The translation works best with shorter, simpler sentences
- Maximum recommended recording time is around 30 seconds

### 🌍 Supported Languages
- Input: clearly spoken audio in any language the model supports
- Output: any of the target languages available in the dropdown
</details>
""")
        
        languages = {
            "afr": "Afrikaans",
            "amh": "Amharic",
            "arb": "Modern Standard Arabic",
            "ary": "Moroccan Arabic",
            "arz": "Egyptian Arabic",
            "asm": "Assamese",
            "ast": "Asturian",
            "azj": "North Azerbaijani",
            "bel": "Belarusian",
            "ben": "Bengali",
            "bos": "Bosnian",
            "bul": "Bulgarian",
            "cat": "Catalan",
            "ceb": "Cebuano",
            "ces": "Czech",
            "ckb": "Central Kurdish",
            "cmn": "Mandarin Chinese",
            "cmn_Hant": "Mandarin Chinese (Traditional)",
            "cym": "Welsh",
            "dan": "Danish",
            "deu": "German",
            "ell": "Greek",
            "eng": "English",
            "est": "Estonian",
            "eus": "Basque",
            "fin": "Finnish",
            "fra": "French",
            "fuv": "Nigerian Fulfulde",
            "gaz": "West Central Oromo",
            "gle": "Irish",
            "glg": "Galician",
            "guj": "Gujarati",
            "heb": "Hebrew",
            "hin": "Hindi",
            "hrv": "Croatian",
            "hun": "Hungarian",
            "hye": "Armenian",
            "ibo": "Igbo",
            "ind": "Indonesian",
            "isl": "Icelandic",
            "ita": "Italian",
            "jav": "Javanese",
            "jpn": "Japanese",
            "kam": "Kamba",
            "kan": "Kannada",
            "kat": "Georgian",
            "kaz": "Kazakh",
            "kea": "Kabuverdianu",
            "khk": "Halh Mongolian",
            "khm": "Khmer",
            "kir": "Kyrgyz",
            "kor": "Korean",
            "lao": "Lao",
            "lit": "Lithuanian",
            "ltz": "Luxembourgish",
            "lug": "Ganda",
            "luo": "Luo",
            "lvs": "Standard Latvian",
            "mai": "Maithili",
            "mal": "Malayalam",
            "mar": "Marathi",
            "mkd": "Macedonian",
            "mlt": "Maltese",
            "mni": "Meitei",
            "mya": "Burmese",
            "nld": "Dutch",
            "nno": "Norwegian Nynorsk",
            "nob": "Norwegian Bokmål",
            "npi": "Nepali",
            "nya": "Nyanja",
            "oci": "Occitan",
            "ory": "Odia",
            "pan": "Punjabi",
            "pbt": "Southern Pashto",
            "pes": "Western Persian",
            "pol": "Polish",
            "por": "Portuguese",
            "ron": "Romanian",
            "rus": "Russian",
            "slk": "Slovak",
            "slv": "Slovenian",
            "sna": "Shona",
            "snd": "Sindhi",
            "som": "Somali",
            "spa": "Spanish",
            "srp": "Serbian",
            "swe": "Swedish",
            "swh": "Swahili",
            "tam": "Tamil",
            "tel": "Telugu",
            "tgk": "Tajik",
            "tgl": "Tagalog",
            "tha": "Thai",
            "tur": "Turkish",
            "ukr": "Ukrainian",
            "urd": "Urdu",
            "uzn": "Northern Uzbek",
            "vie": "Vietnamese",
            "xho": "Xhosa",
            "yor": "Yoruba",
            "yue": "Cantonese",
            "zlm": "Colloquial Malay",
            "zsm": "Standard Malay",
            "zul": "Zulu",
        }

        # Language Dropdown
        with gr.Row():
            # Generate the choices for the dropdown: display names mapped to their keys
            language_choices = [(name, code) for code, name in languages.items()]
            
            language_dropdown = gr.Dropdown(
                choices=language_choices,  # Each choice is a (display, value) tuple
                value="spa",  # Default value corresponds to the key
                label="Target Language",
                scale=2
            )

        # Audio Input
        audio_input = gr.Audio(label="Record Audio", sources="microphone", type="filepath")
        
        # Display Components
        transcript_box = gr.Textbox(label="Full Transcript", lines=10, interactive=False)
        
        # Control Buttons
        with gr.Row():
            translate_btn = gr.Button("Translate")
        
        # Define the translation action
        def handle_translation(audio_file, target_language):
            """Handle the audio file and pass it to the translator for processing."""
            if not audio_file:
                return "No audio file provided. Please record and try again."
            
            translator.target_language = target_language  # Set the target language in the translator
            try:
                translated_text = translator.translate_audio(audio_file)
                return translated_text if translated_text else "Translation failed."
            except Exception as e:
                return f"Error: {str(e)}"
        
        # Set the Gradio action
        translate_btn.click(
            fn=handle_translation,
            inputs=[audio_input, language_dropdown],
            outputs=transcript_box
        )
    
    return demo


def main():
    """Launch the Gradio app with optimized settings"""
    interface = create_translator_interface()
    interface.launch(
        share=False, 
        show_error=True, 
        debug=True  # Helpful for development
    )

if __name__ == "__main__":
    main()
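
# Minimal standalone usage sketch (bypasses the Gradio UI). "sample.wav" is a
# placeholder path, not a file shipped with this script:
#
#   translator = OptimizedContinuousTranslator(target_language="fra")
#   print(translator.translate_audio("sample.wav"))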