import gradio as gr
import numpy as np
import torch

from transformers import pipeline, VitsModel, VitsTokenizer

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Transcribe English speech to English text
asr_pipe = pipeline("automatic-speech-recognition", model="asapp/sew-d-tiny-100k-ft-ls100h", device=device)
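# Note: sew-d-tiny is a small English-only CTC model, so this step only
# transcribes; the Russian translation happens in the next step. Quick
# sanity check (assumes a local English WAV file, hypothetical path):
#   print(asr_pipe("sample.wav")["text"])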

# Translate English text to Russian
translation_en_to_rus = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ru")
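# The translation pipeline returns a list of dicts, e.g.
# [{"translation_text": "..."}], which translate() below unpacks.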

# Synthesise speech from the Russian text
model = VitsModel.from_pretrained("facebook/mms-tts-rus")
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-rus")
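# The synthesiser's output rate is exposed as model.config.sampling_rate
# (16 kHz for MMS-TTS), matching the rate returned to Gradio below.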


def translate(audio):
    # CTC pipelines return a dict like {"text": "..."}; Whisper-style
    # generate_kwargs such as task="translate" do not apply here.
    transcription = asr_pipe(audio)
    translated_text = translation_en_to_rus(transcription["text"])
    return translated_text[0]["translation_text"]

def synthesise(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        # VitsModel returns a waveform of shape (batch, num_samples).
        speech = model(**inputs).waveform
    return speech.cpu()

def speech_to_speech_translation(audio):
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)
    # Convert the float waveform in [-1, 1] to 16-bit PCM for Gradio.
    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
    # MMS-TTS generates 16 kHz audio; drop the batch dimension.
    return 16000, synthesised_speech[0]

demo = gr.Blocks()

mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy")
)

file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    examples=[["./example.wav"]]
)


with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

demo.launch()
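
# Optional smoke test without the UI (run before demo.launch(); assumes
# example.wav is present and scipy is installed):
#   rate, audio = speech_to_speech_translation("example.wav")
#   from scipy.io import wavfile
#   wavfile.write("translated.wav", rate, audio)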