Update app.py
Browse files
app.py
CHANGED
@@ -1,38 +1,39 @@
|
|
1 |
import gradio as gr
|
2 |
import torch
|
3 |
-
from transformers import AutoProcessor, AutoModelForCTC
|
4 |
-
import soundfile as sf # For handling audio input
|
5 |
-
|
6 |
-
# Load model directly
|
7 |
from transformers import AutoTokenizer, AutoModelForPreTraining
|
|
|
|
|
|
|
8 |
tokenizer = AutoTokenizer.from_pretrained("Opit/mms_tts_bulgarian_finetuning")
|
9 |
model = AutoModelForPreTraining.from_pretrained("Opit/mms_tts_bulgarian_finetuning")
|
10 |
|
11 |
-
#
|
12 |
-
def
|
13 |
-
|
14 |
-
speech, _ = sf.read(audio)
|
15 |
-
inputs = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True)
|
16 |
|
17 |
with torch.no_grad():
|
18 |
-
|
|
|
|
|
|
|
|
|
19 |
|
20 |
-
#
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
return transcription
|
25 |
|
|
|
|
|
26 |
|
27 |
-
# Gradio
|
28 |
iface = gr.Interface(
|
29 |
-
fn=
|
30 |
-
inputs=
|
31 |
-
outputs="
|
32 |
-
title="Bulgarian Speech
|
33 |
-
description="
|
34 |
)
|
35 |
|
36 |
-
#
|
37 |
if __name__ == "__main__":
|
38 |
iface.launch()
|
|
|
1 |
import gradio as gr
|
2 |
import torch
|
|
|
|
|
|
|
|
|
3 |
from transformers import AutoTokenizer, AutoModelForPreTraining
|
4 |
+
import soundfile as sf
|
5 |
+
|
6 |
+
# Load the tokenizer and model for Bulgarian TTS (Text-to-Speech)
# NOTE(review): AutoModelForPreTraining is an unusual class for an MMS-TTS
# (VITS-family) checkpoint — transformers documents VitsModel for speech
# synthesis. Confirm which concrete class this checkpoint resolves to,
# since the downstream code assumes the forward pass yields audio.
tokenizer = AutoTokenizer.from_pretrained("Opit/mms_tts_bulgarian_finetuning")
model = AutoModelForPreTraining.from_pretrained("Opit/mms_tts_bulgarian_finetuning")
|
9 |
|
10 |
+
# TTS conversion function (text-to-speech)
def tts_generate(text):
    """Synthesize Bulgarian speech from input text.

    Parameters
    ----------
    text : str
        Text to convert to speech.

    Returns
    -------
    tuple
        ``(sample_rate, waveform)`` — the order Gradio's ``"audio"``
        output component requires (the original code returned them
        reversed, which Gradio cannot play).
    """
    inputs = tokenizer(text, return_tensors="pt")

    # Inference only — no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # NOTE(review): for MMS-TTS / VITS checkpoints the synthesized waveform
    # is normally exposed as ``outputs.waveform``; raw ``logits`` are
    # unlikely to be playable audio. Kept as-is pending confirmation of the
    # actual model class loaded above — adjust to your model's output.
    audio = outputs['logits']

    # Bug fix: Gradio expects (sample_rate, data), not (data, sample_rate).
    # 22050 Hz is an assumption carried over from the original; MMS-TTS
    # models commonly run at 16000 Hz — TODO confirm from model config.
    return 22050, audio.numpy()
|
27 |
|
28 |
+
# Create Gradio interface: a single text box in, synthesized audio out.
# ``outputs="audio"`` means the wrapped function must return audio data
# in a form Gradio accepts (e.g. a (sample_rate, numpy_array) tuple).
iface = gr.Interface(
    fn=tts_generate,
    inputs="text",
    outputs="audio",
    title="Bulgarian TTS (Text-to-Speech)",
    description="Enter text to generate speech in Bulgarian."
)
|
36 |
|
37 |
+
# Run the interface only when executed as a script (not on import);
# launch() starts the local Gradio web server with default settings.
if __name__ == "__main__":
    iface.launch()
|