ruslanmv committed on
Commit
2dac140
1 Parent(s): a479ddf
Files changed (1) hide show
  1. app.py +18 -15
app.py CHANGED
@@ -2,8 +2,8 @@ import streamlit as st
2
  import numpy as np
3
  import torch
4
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
5
- import soundfile as sf
6
  from io import StringIO
 
7
 
8
  # Load models outside of function calls for efficiency
9
  @st.cache_data
@@ -23,30 +23,31 @@ def get_speaker_embeddings():
23
 
24
  speaker_embeddings = get_speaker_embeddings()
25
 
26
- # Improved Styling (assuming style.css is present)
27
  def local_css(file_name):
28
  with open(file_name) as f:
29
  st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
30
 
31
- local_css("style.css") # Apply custom CSS styles
32
 
33
- # Streamlit Layout
34
  st.title("Text-to-Voice Conversion")
35
  st.markdown("Convert your text to speech using advanced AI models.")
36
 
37
  # Function to convert text to speech
38
  def text_to_speech(text):
39
  try:
 
40
  max_length = 100 # Set a max length as per model's capability
41
  segments = [text[i:i+max_length] for i in range(0, len(text), max_length)]
42
  audio_paths = []
43
 
44
- for i, segment in enumerate(segments):
45
  inputs = processor(text=segment, return_tensors="pt")
46
  spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
47
  with torch.no_grad():
48
  speech = vocoder(spectrogram)
49
- audio_path = f"speech_segment_{i}.wav"
50
  sf.write(audio_path, speech.numpy(), samplerate=16000)
51
  audio_paths.append(audio_path)
52
 
@@ -64,28 +65,30 @@ def combine_audio_segments(paths):
64
  sf.write("combined_speech.wav", np.array(combined_speech), samplerate)
65
  return "combined_speech.wav"
66
 
67
- # Text Input and Conversion Button
68
- text = st.text_area("Type your text here.")
69
 
 
70
  if st.button("Convert"):
71
  if text:
72
  audio_paths = text_to_speech(text)
73
  combined_audio_path = combine_audio_segments(audio_paths)
74
- audio_bytes = open(combined_audio_path, 'rb').read()
 
75
  st.audio(audio_bytes, format='audio/wav')
76
  else:
77
  st.error("Please enter some text to convert.")
78
 
79
- # File Uploader and Conversion Button
80
- uploaded_file = st.file_uploader("Upload a text file here", type=['txt'])
81
-
82
  if uploaded_file is not None:
83
  stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
84
  text = stringio.read()
85
  st.write(text)
86
 
87
- if st.button("Convert Uploaded File", key="upload"):
88
  audio_paths = text_to_speech(text)
89
  combined_audio_path = combine_audio_segments(audio_paths)
90
- audio_bytes = open(combined_audio_path, 'rb').read()
91
- st.audio(audio_bytes, format='audio/wav')
 
 
2
  import numpy as np
3
  import torch
4
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 
5
  from io import StringIO
6
+ import soundfile as sf
7
 
8
  # Load models outside of function calls for efficiency
9
  @st.cache_data
 
23
 
24
  speaker_embeddings = get_speaker_embeddings()
25
 
26
+ # Improved Styling
27
  def local_css(file_name):
28
  with open(file_name) as f:
29
  st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
30
 
31
+ local_css("style.css")
32
 
33
+ # Streamlined Layout
34
  st.title("Text-to-Voice Conversion")
35
  st.markdown("Convert your text to speech using advanced AI models.")
36
 
37
  # Function to convert text to speech
38
  def text_to_speech(text):
39
  try:
40
+ # Segment the text if it's too long
41
  max_length = 100 # Set a max length as per model's capability
42
  segments = [text[i:i+max_length] for i in range(0, len(text), max_length)]
43
  audio_paths = []
44
 
45
+ for segment in segments:
46
  inputs = processor(text=segment, return_tensors="pt")
47
  spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
48
  with torch.no_grad():
49
  speech = vocoder(spectrogram)
50
+ audio_path = f"speech_segment_{len(audio_paths)}.wav"
51
  sf.write(audio_path, speech.numpy(), samplerate=16000)
52
  audio_paths.append(audio_path)
53
 
 
65
  sf.write("combined_speech.wav", np.array(combined_speech), samplerate)
66
  return "combined_speech.wav"
67
 
68
+ # Text Input
69
+ text = st.text_area("Type your text or upload a text file below.")
70
 
71
+ # Convert Button
72
  if st.button("Convert"):
73
  if text:
74
  audio_paths = text_to_speech(text)
75
  combined_audio_path = combine_audio_segments(audio_paths)
76
+ audio_file = open(combined_audio_path, 'rb')
77
+ audio_bytes = audio_file.read()
78
  st.audio(audio_bytes, format='audio/wav')
79
  else:
80
  st.error("Please enter some text to convert.")
81
 
82
+ # File Uploader
83
+ uploaded_file = st.file_uploader("Upload your text file here", type=['txt'])
 
84
  if uploaded_file is not None:
85
  stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
86
  text = stringio.read()
87
  st.write(text)
88
 
89
+ if st.button("Convert Uploaded File", key=1):
90
  audio_paths = text_to_speech(text)
91
  combined_audio_path = combine_audio_segments(audio_paths)
92
+ audio_file = open(combined_audio_path, 'rb')
93
+ audio_bytes = audio_file.read()
94
+ st.audio(audio_bytes, format='audio/wav')