File size: 3,094 Bytes
fd6c122
2109067
0d05344
 
 
 
 
2109067
 
555a546
2109067
 
0d05344
 
2109067
 
0d05344
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fd6c122
 
 
 
 
 
 
 
 
 
2109067
fd6c122
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import base64
import os
import tempfile

from flask import Flask, request, jsonify
from pydub import AudioSegment
import whisper
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Define cache directory for Hugging Face downloads.
# NOTE: must be set before any model is fetched so the hub libraries pick it up.
os.environ['HF_HOME'] = '/app/cache'

# Load the Whisper "base" speech-to-text model, caching weights under /app/cache.
whisper_model = whisper.load_model("base", download_root="/app/cache")

# Load the M2M100 (418M) many-to-many translation model and its tokenizer,
# using the same cache directory so repeated container starts reuse downloads.
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", cache_dir="/app/cache")
translation_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M", cache_dir="/app/cache")

def preprocess_audio(audio_path):
    """Resample the file at *audio_path* to 16 kHz mono WAV.

    Writes the result next to the input (suffix ``_processed.wav``) and
    returns the path of the newly written file.
    """
    segment = AudioSegment.from_file(audio_path)
    mono_16k = segment.set_frame_rate(16000).set_channels(1)  # Whisper expects 16 kHz mono
    out_path = f"{audio_path}_processed.wav"
    mono_16k.export(out_path, format="wav")
    return out_path

def transcribe_audio(audio_path, source_language=None):
    """Run Whisper speech-to-text on the file at *audio_path*.

    When *source_language* is truthy it is passed through as Whisper's
    ``language`` option; otherwise Whisper auto-detects the language.
    Returns the transcribed text.
    """
    if source_language:
        result = whisper_model.transcribe(audio_path, language=source_language)
    else:
        result = whisper_model.transcribe(audio_path)
    return result['text']

def translate_text(text, source_lang="en", target_lang="hi"):
    """Translate *text* from *source_lang* to *target_lang* via M2M100.

    Returns the decoded translation string (special tokens stripped).
    """
    # M2M100 reads the source language from the tokenizer's src_lang attribute.
    tokenizer.src_lang = source_lang
    encoded = tokenizer(text, return_tensors="pt")
    target_bos = tokenizer.get_lang_id(target_lang)  # force generation in the target language
    generated = translation_model.generate(**encoded, forced_bos_token_id=target_bos)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

def handle_request(audio_base64, source_lang, target_lang):
    """Decode base64 audio, transcribe it, and translate the transcript.

    Parameters:
        audio_base64: base64-encoded audio payload.
        source_lang: language code of the spoken audio.
        target_lang: language code to translate the transcript into.

    Returns:
        dict with keys 'transcribed_text' and 'translated_text'.

    Raises:
        binascii.Error: if *audio_base64* is not valid base64.
    """
    # Use a unique temp file rather than a fixed "temp_audio.wav" so
    # concurrent requests cannot clobber each other's audio.
    fd, audio_file_path = tempfile.mkstemp(suffix=".wav")
    processed_audio_file_name = None
    try:
        # Decode the base64 audio into the temp file.
        with os.fdopen(fd, "wb") as audio_file:
            audio_file.write(base64.b64decode(audio_base64))

        # Process the audio file, then transcribe and translate.
        processed_audio_file_name = preprocess_audio(audio_file_path)
        spoken_text = transcribe_audio(processed_audio_file_name, source_lang)
        translated_text = translate_text(spoken_text, source_lang, target_lang)
    finally:
        # Clean up temporary files even when transcription/translation fails
        # (the original version leaked both files on any exception).
        for path in (processed_audio_file_name, audio_file_path):
            if path and os.path.exists(path):
                os.remove(path)

    return {"transcribed_text": spoken_text, "translated_text": translated_text}

# Flask for handling external POST requests
app = Flask(__name__)

@app.route('/translate', methods=['POST'])
def translate():
    """API endpoint for handling audio translation.

    Expects a JSON body with 'audio' (base64), 'source_lang', and
    'target_lang'; responds with the transcription and translation.
    """
    # get_json(silent=True) returns None instead of raising on a missing or
    # malformed JSON body; the original `request.json` would turn a bad body
    # into a 500 (TypeError on `'audio' not in None`) rather than a 400.
    data = request.get_json(silent=True)
    if not data or 'audio' not in data or 'source_lang' not in data or 'target_lang' not in data:
        return jsonify({"error": "Invalid request format"}), 400

    audio_base64 = data['audio']
    source_lang = data['source_lang']
    target_lang = data['target_lang']

    # Call the handle_request function to process the request
    response = handle_request(audio_base64, source_lang, target_lang)
    return jsonify(response)

if __name__ == "__main__":
    # Bind to all interfaces so the app is reachable from outside a container.
    # NOTE(review): port 7860 is the Gradio/HF Spaces convention — confirm deployment target.
    app.run(host='0.0.0.0', port=7860)