nikajoon committed on
Commit
1b8f0eb
1 Parent(s): 34e4a1a

Upload 12 files

Browse files
Hazm_correction.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hazm
2
+ import typing
3
+
4
# Shared hazm helpers, created once at import time and reused by every call.
normalizer = hazm.Normalizer()
sent_tokenizer = hazm.SentenceTokenizer()
word_tokenizer = hazm.WordTokenizer()

# Part-of-speech tagger; fix_words() relies on its tags to spot ezafe words.
# NOTE(review): the model path looks like a hub repository path rather than a
# local file -- confirm it resolves where this module runs.
tagger = hazm.POSTagger(model="gyroing/PersianTextCorrection_Hazm/pos_tagger.model")
11
+
12
def preprocess_text(text: str) -> str:
    """Normalize Persian text and mark ezafe words, returning one string.

    The text is normalized with hazm, split into sentences and then words;
    each sentence's words go through ``fix_words`` and are re-joined with
    single spaces. The processed sentences are joined with single spaces too.

    Args:
        text: Raw input text.

    Returns:
        The processed text as a single string (not a nested list of words).
    """
    text = normalizer.normalize(text)
    processed_sentences = []

    for sentence in sent_tokenizer.tokenize(text):
        words = word_tokenizer.tokenize(sentence)
        processed_words = fix_words(words)
        processed_sentences.append(" ".join(processed_words))

    return " ".join(processed_sentences)
23
+
24
def fix_words(words: typing.List[str]) -> typing.List[str]:
    """Append an explicit ezafe marker to words the POS tagger flags.

    A word whose tag ends in "Z" gets a trailing kasra, unless it already
    ends with one. If such a word ends in "ه" not preceded by "ا", a
    zero-width non-joiner plus "ی" is inserted before the kasra.

    Args:
        words: Tokens of a single sentence.

    Returns:
        A new list with the same number of tokens, some with the marker added.
    """
    if not words:
        # Nothing to tag; avoids handing an empty sentence to the tagger.
        return []

    fixed_words = []

    for word, pos in tagger.tag(words):
        # endswith() instead of negative indexing: a one-letter word such as
        # "ه", an empty token or an empty tag must not raise IndexError.
        if pos.endswith("Z") and word and not word.endswith("ِ"):
            if word.endswith("ه") and not word.endswith("اه"):
                # ZWNJ (U+200C) written as an escape so it stays visible.
                word += "\u200cی"
            word += "ِ"

        fixed_words.append(word)

    return fixed_words
app.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import wave
3
+ import numpy as np
4
+ from io import BytesIO
5
+ from huggingface_hub import hf_hub_download
6
+ from piper import PiperVoice
7
+ from transformers import pipeline
8
+ import hazm
9
+ import typing
10
+
11
# hazm text-processing helpers shared by preprocess_text() and fix_words().
normalizer = hazm.Normalizer()
sent_tokenizer = hazm.SentenceTokenizer()
word_tokenizer = hazm.WordTokenizer()

# Fetch the POS tagger model from the hub, then load it.
tagger_path = hf_hub_download(
    repo_id="gyroing/HAZM_POS_TAGGER",
    filename="pos_tagger.model",
)
tagger = hazm.POSTagger(model=tagger_path)

# Fetch the Piper voice (ONNX model plus its JSON config) and load it once.
model_path = hf_hub_download(
    repo_id="gyroing/Persian-Piper-Model-gyro",
    filename="fa_IR-gyro-medium.onnx",
)
config_path = hf_hub_download(
    repo_id="gyroing/Persian-Piper-Model-gyro",
    filename="fa_IR-gyro-medium.onnx.json",
)
voice = PiperVoice.load(model_path, config_path)
20
+
21
def preprocess_text(text: str) -> str:
    """Normalize Persian text and mark ezafe words, returning one string.

    The text is normalized with hazm, split into sentences and then words;
    each sentence's words go through ``fix_words`` and are re-joined with
    single spaces. The processed sentences are joined with single spaces too.

    Args:
        text: Raw input text.

    Returns:
        The processed text as a single string (not a nested list of words).
    """
    text = normalizer.normalize(text)
    processed_sentences = []

    for sentence in sent_tokenizer.tokenize(text):
        words = word_tokenizer.tokenize(sentence)
        processed_words = fix_words(words)
        processed_sentences.append(" ".join(processed_words))

    return " ".join(processed_sentences)
31
def fix_words(words: typing.List[str]) -> typing.List[str]:
    """Append an explicit ezafe marker to words the POS tagger flags.

    A word whose tag ends in "Z" gets a trailing kasra, unless it already
    ends with one. If such a word ends in "ه" not preceded by "ا", a
    zero-width non-joiner plus "ی" is inserted before the kasra.

    Args:
        words: Tokens of a single sentence.

    Returns:
        A new list with the same number of tokens, some with the marker added.
    """
    if not words:
        # Nothing to tag; avoids handing an empty sentence to the tagger.
        return []

    fixed_words = []

    for word, pos in tagger.tag(words):
        # endswith() instead of negative indexing: a one-letter word such as
        # "ه", an empty token or an empty tag must not raise IndexError.
        if pos.endswith("Z") and word and not word.endswith("ِ"):
            if word.endswith("ه") and not word.endswith("اه"):
                # ZWNJ (U+200C) written as an escape so it stays visible.
                word += "\u200cی"
            word += "ِ"

        fixed_words.append(word)

    return fixed_words
45
+
46
def synthesize_speech(text):
    """Synthesize ``text`` with the loaded Piper voice and return WAV bytes.

    The text is first run through ``preprocess_text``; the audio is written
    into an in-memory WAV container.

    Args:
        text: Input text from the UI.

    Returns:
        The complete WAV file (header and samples) as ``bytes``.
    """
    # Create an in-memory buffer for the WAV file
    buffer = BytesIO()
    with wave.open(buffer, "wb") as wav_file:
        wav_file.setframerate(voice.config.sample_rate)
        wav_file.setsampwidth(2)  # 16-bit
        wav_file.setnchannels(1)  # mono

        # Synthesize speech
        eztext = preprocess_text(text)
        voice.synthesize(eztext, wav_file)

    # Read the buffer only after the wave writer has closed, so the header is
    # final. The bytes are returned as-is: viewing them as int16 and
    # converting back produced the same bytes while treating the WAV header
    # as sample data.
    return buffer.getvalue()
65
+
66
# Gradio Blocks UI: one text input, one audio output, one trigger button.
with gr.Blocks(theme=gr.themes.Base()) as blocks:
    input_text = gr.Textbox(label="Input")
    output_audio = gr.Audio(label="Output", type="numpy")
    submit_button = gr.Button("Synthesize")

    # Clicking the button sends the textbox content through synthesize_speech.
    submit_button.click(
        synthesize_speech,
        inputs=input_text,
        outputs=[output_audio],
    )

# Start serving the app.
blocks.launch()
piper/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from .voice import PiperVoice
2
+
3
+ __all__ = [
4
+ "PiperVoice",
5
+ ]
piper/__main__.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import logging
3
+ import sys
4
+ import time
5
+ import wave
6
+ from pathlib import Path
7
+ from typing import Any, Dict
8
+
9
+ from . import PiperVoice
10
+ from .download import ensure_voice_exists, find_voice, get_voices
11
+
12
+ _FILE = Path(__file__)
13
+ _DIR = _FILE.parent
14
+ _LOGGER = logging.getLogger(_FILE.stem)
15
+
16
+
17
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the Piper command-line interface."""
    cli = argparse.ArgumentParser()
    cli.add_argument("-m", "--model", required=True, help="Path to Onnx model file")
    cli.add_argument("-c", "--config", help="Path to model config file")
    cli.add_argument(
        "-f",
        "--output-file",
        "--output_file",
        help="Path to output WAV file (default: stdout)",
    )
    cli.add_argument(
        "-d",
        "--output-dir",
        "--output_dir",
        help="Path to output directory (default: cwd)",
    )
    cli.add_argument(
        "--output-raw",
        "--output_raw",
        action="store_true",
        help="Stream raw audio to stdout",
    )

    # Synthesis parameters
    cli.add_argument("-s", "--speaker", type=int, help="Id of speaker (default: 0)")
    cli.add_argument(
        "--length-scale", "--length_scale", type=float, help="Phoneme length"
    )
    cli.add_argument(
        "--noise-scale", "--noise_scale", type=float, help="Generator noise"
    )
    cli.add_argument(
        "--noise-w", "--noise_w", type=float, help="Phoneme width noise"
    )

    cli.add_argument("--cuda", action="store_true", help="Use GPU")

    cli.add_argument(
        "--sentence-silence",
        "--sentence_silence",
        type=float,
        default=0.0,
        help="Seconds of silence after each sentence",
    )

    # Voice lookup / download locations
    cli.add_argument(
        "--data-dir",
        "--data_dir",
        action="append",
        default=[str(Path.cwd())],
        help="Data directory to check for downloaded models (default: current directory)",
    )
    cli.add_argument(
        "--download-dir",
        "--download_dir",
        help="Directory to download voices into (default: first data dir)",
    )

    cli.add_argument(
        "--update-voices",
        action="store_true",
        help="Download latest voices.json during startup",
    )

    cli.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    return cli


def main() -> None:
    """Run the Piper CLI: resolve the voice, then synthesize text from stdin.

    Output goes to exactly one destination: raw audio streamed to stdout
    (``--output-raw``), one WAV file per input line in ``--output-dir``, or a
    single WAV written to ``--output-file`` / stdout.
    """
    args = _build_arg_parser().parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
    _LOGGER.debug(args)

    # Without an explicit download directory, use the first data directory.
    if not args.download_dir:
        args.download_dir = args.data_dir[0]

    # If the model argument is not an existing path, treat it as a voice name.
    if not Path(args.model).exists():
        voices_info = get_voices(args.download_dir, update_voices=args.update_voices)

        # Old voice names stay usable: register every alias as its own entry.
        aliases_info: Dict[str, Any] = {
            alias: {"_is_alias": True, **info}
            for info in voices_info.values()
            for alias in info.get("aliases", [])
        }
        voices_info.update(aliases_info)

        ensure_voice_exists(args.model, args.data_dir, args.download_dir, voices_info)
        args.model, args.config = find_voice(args.model, args.data_dir)

    voice = PiperVoice.load(args.model, config_path=args.config, use_cuda=args.cuda)
    synthesize_args = {
        "speaker_id": args.speaker,
        "length_scale": args.length_scale,
        "noise_scale": args.noise_scale,
        "noise_w": args.noise_w,
        "sentence_silence": args.sentence_silence,
    }

    if args.output_raw:
        # One utterance per non-blank stdin line, streamed as it is produced.
        for raw_line in sys.stdin:
            line = raw_line.strip()
            if line:
                for audio_bytes in voice.synthesize_stream_raw(line, **synthesize_args):
                    sys.stdout.buffer.write(audio_bytes)
                    sys.stdout.buffer.flush()
        return

    if args.output_dir:
        output_dir = Path(args.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        # One WAV file per non-blank stdin line, named by a monotonic clock.
        for raw_line in sys.stdin:
            line = raw_line.strip()
            if not line:
                continue

            wav_path = output_dir / f"{time.monotonic_ns()}.wav"
            with wave.open(str(wav_path), "wb") as wav_file:
                voice.synthesize(line, wav_file, **synthesize_args)

            _LOGGER.info("Wrote %s", wav_path)
        return

    # Single-output mode: all of stdin becomes one WAV.
    text = sys.stdin.read()
    to_stdout = (not args.output_file) or (args.output_file == "-")
    destination = sys.stdout.buffer if to_stdout else args.output_file
    with wave.open(destination, "wb") as wav_file:
        voice.synthesize(text, wav_file, **synthesize_args)


if __name__ == "__main__":
    main()
piper/config.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Piper configuration"""
2
+ from dataclasses import dataclass
3
+ from enum import Enum
4
+ from typing import Any, Dict, Mapping, Sequence
5
+
6
+
7
class PhonemeType(str, Enum):
    """Phonemization method named by a voice config's ``phoneme_type``."""

    # Members are also plain strings, so they compare equal to config values.
    ESPEAK = "espeak"
    TEXT = "text"
10
+
11
+
12
@dataclass
class PiperConfig:
    """Settings of one Piper voice, as read from its JSON config."""

    # Number of phonemes
    num_symbols: int

    # Number of speakers
    num_speakers: int

    # Sample rate of output audio
    sample_rate: int

    # Name of espeak-ng voice or alphabet
    espeak_voice: str

    # Default inference scales, used when a call does not override them
    length_scale: float
    noise_scale: float
    noise_w: float

    # Phoneme -> [id,]
    phoneme_id_map: Mapping[str, Sequence[int]]

    # espeak or text
    phoneme_type: PhonemeType

    @staticmethod
    def from_dict(config: Dict[str, Any]) -> "PiperConfig":
        """Build a ``PiperConfig`` from a parsed voice config dictionary.

        Required keys raise ``KeyError`` when absent. The optional
        ``inference`` section and ``phoneme_type`` fall back to defaults.
        """
        inference = config.get("inference", {})

        # Lookups stay in the original order so a config with several
        # missing keys reports the same one first.
        num_symbols = config["num_symbols"]
        num_speakers = config["num_speakers"]
        sample_rate = config["audio"]["sample_rate"]
        noise_scale = inference.get("noise_scale", 0.667)
        length_scale = inference.get("length_scale", 1.0)
        noise_w = inference.get("noise_w", 0.8)
        espeak_voice = config["espeak"]["voice"]
        phoneme_id_map = config["phoneme_id_map"]
        phoneme_type = PhonemeType(config.get("phoneme_type", PhonemeType.ESPEAK))

        return PiperConfig(
            num_symbols=num_symbols,
            num_speakers=num_speakers,
            sample_rate=sample_rate,
            espeak_voice=espeak_voice,
            length_scale=length_scale,
            noise_scale=noise_scale,
            noise_w=noise_w,
            phoneme_id_map=phoneme_id_map,
            phoneme_type=phoneme_type,
        )
piper/const.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """Constants"""
2
+
3
# Special symbols looked up in a voice's phoneme id map.
PAD = "_"  # padding (0)
BOS = "^"  # beginning of sentence
EOS = "$"  # end of sentence
piper/download.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Utility for downloading Piper voices."""
2
+ import json
3
+ import logging
4
+ import shutil
5
+ from pathlib import Path
6
+ from typing import Any, Dict, Iterable, Set, Tuple, Union
7
+ from urllib.request import urlopen
8
+
9
+ from .file_hash import get_file_hash
10
+
11
+ URL_FORMAT = "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/{file}"
12
+
13
+ _DIR = Path(__file__).parent
14
+ _LOGGER = logging.getLogger(__name__)
15
+
16
+ _SKIP_FILES = {"MODEL_CARD"}
17
+
18
+
19
class VoiceNotFoundError(Exception):
    """Raised when a requested voice name is not in the voices index."""
21
+
22
+
23
def get_voices(
    download_dir: Union[str, Path], update_voices: bool = False
) -> Dict[str, Any]:
    """Load the voices index from a downloaded or embedded JSON file.

    Args:
        download_dir: Directory that may hold a downloaded ``voices.json``.
        update_voices: When true, download the latest ``voices.json`` into
            ``download_dir`` before loading.

    Returns:
        The parsed contents of ``voices.json``.
    """
    download_dir = Path(download_dir)
    voices_download = download_dir / "voices.json"

    if update_voices:
        # Download latest voices.json
        voices_url = URL_FORMAT.format(file="voices.json")
        _LOGGER.debug("Downloading %s to %s", voices_url, voices_download)

        # The target directory may not exist yet; opening the file for
        # writing would otherwise fail with FileNotFoundError.
        download_dir.mkdir(parents=True, exist_ok=True)
        with urlopen(voices_url) as response, open(
            voices_download, "wb"
        ) as download_file:
            shutil.copyfileobj(response, download_file)

    # Prefer downloaded file to embedded
    voices_embedded = _DIR / "voices.json"
    voices_path = voices_download if voices_download.exists() else voices_embedded

    _LOGGER.debug("Loading %s", voices_path)
    with open(voices_path, "r", encoding="utf-8") as voices_file:
        return json.load(voices_file)
46
+
47
+
48
def ensure_voice_exists(
    name: str,
    data_dirs: Iterable[Union[str, Path]],
    download_dir: Union[str, Path],
    voices_info: Dict[str, Any],
):
    """Make sure every file of a voice is present, downloading what is missing.

    Each data directory is checked for the voice's files; a file counts as
    present once any directory holds a copy whose size and MD5 digest match
    the voices index. Files with no valid copy anywhere are downloaded into
    ``download_dir``.

    Args:
        name: Voice name, a key of ``voices_info``.
        data_dirs: Directories to search for existing voice files.
        download_dir: Directory that receives downloaded files.
        voices_info: Voices index, as returned by ``get_voices``.

    Raises:
        VoiceNotFoundError: If ``name`` is not in ``voices_info``.
        ValueError: If the voice lists no files at all.
    """
    assert data_dirs, "No data dirs"
    if name not in voices_info:
        raise VoiceNotFoundError(name)

    voice_info = voices_info[name]
    voice_files = voice_info["files"]
    files_to_download: Set[str] = set()
    # Files already validated in some data directory. Without this, a file
    # missing from one directory was downloaded again even though another
    # directory held a good copy.
    verified_files: Set[str] = set()

    for data_dir in data_dirs:
        data_dir = Path(data_dir)

        # Check sizes/hashes
        for file_path, file_info in voice_files.items():
            if file_path in verified_files:
                # Already verified in a different data directory
                continue

            file_name = Path(file_path).name
            if file_name in _SKIP_FILES:
                continue

            data_file_path = data_dir / file_name
            _LOGGER.debug("Checking %s", data_file_path)
            if not data_file_path.exists():
                _LOGGER.debug("Missing %s", data_file_path)
                files_to_download.add(file_path)
                continue

            expected_size = file_info["size_bytes"]
            actual_size = data_file_path.stat().st_size
            if expected_size != actual_size:
                _LOGGER.warning(
                    "Wrong size (expected=%s, actual=%s) for %s",
                    expected_size,
                    actual_size,
                    data_file_path,
                )
                files_to_download.add(file_path)
                continue

            expected_hash = file_info["md5_digest"]
            actual_hash = get_file_hash(data_file_path)
            if expected_hash != actual_hash:
                _LOGGER.warning(
                    "Wrong hash (expected=%s, actual=%s) for %s",
                    expected_hash,
                    actual_hash,
                    data_file_path,
                )
                files_to_download.add(file_path)
                continue

            # A valid copy exists here: cancel any download queued because an
            # earlier directory lacked the file.
            verified_files.add(file_path)
            files_to_download.discard(file_path)

    if (not voice_files) and (not files_to_download):
        raise ValueError(f"Unable to find or download voice: {name}")

    # Download missing files
    download_dir = Path(download_dir)

    for file_path in files_to_download:
        file_name = Path(file_path).name
        if file_name in _SKIP_FILES:
            continue

        file_url = URL_FORMAT.format(file=file_path)
        download_file_path = download_dir / file_name
        download_file_path.parent.mkdir(parents=True, exist_ok=True)

        _LOGGER.debug("Downloading %s to %s", file_url, download_file_path)
        with urlopen(file_url) as response, open(
            download_file_path, "wb"
        ) as download_file:
            shutil.copyfileobj(response, download_file)

        _LOGGER.info("Downloaded %s (%s)", download_file_path, file_url)
128
+
129
+
130
def find_voice(name: str, data_dirs: Iterable[Union[str, Path]]) -> Tuple[Path, Path]:
    """Locate the model and config files of a voice.

    Returns the ``(<name>.onnx, <name>.onnx.json)`` paths from the first
    data directory that contains both files.

    Raises:
        ValueError: If no data directory holds both files.
    """
    model_name = f"{name}.onnx"
    config_name = f"{name}.onnx.json"

    for candidate_dir in map(Path, data_dirs):
        model_file = candidate_dir / model_name
        config_file = candidate_dir / config_name

        # Both files must be present in the same directory.
        if not (model_file.exists() and config_file.exists()):
            continue

        return model_file, config_file

    raise ValueError(f"Missing files for voice {name}")
piper/file_hash.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import hashlib
3
+ import json
4
+ import sys
5
+ from pathlib import Path
6
+ from typing import Union
7
+
8
+
9
def get_file_hash(path: Union[str, Path], bytes_per_chunk: int = 8192) -> str:
    """Return the hexadecimal MD5 digest of a file, read chunk by chunk."""
    digest = hashlib.md5()
    with open(path, "rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for block in iter(lambda: stream.read(bytes_per_chunk), b""):
            digest.update(block)

    return digest.hexdigest()
19
+
20
+
21
+ # -----------------------------------------------------------------------------
22
+
23
+
24
def main():
    """Print a JSON object mapping each given file to its MD5 digest.

    With ``--dir``, the keys are the file paths relative to that directory.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("file", nargs="+")
    cli.add_argument("--dir", help="Parent directory")
    args = cli.parse_args()

    parent_dir = Path(args.dir) if args.dir else None
    if parent_dir is not None:
        args.dir = parent_dir

    digests = {}
    for file_arg in args.file:
        file_path = Path(file_arg)
        # Hash first, using the path as given; only the key is made relative.
        file_digest = get_file_hash(file_path)
        key_path = file_path.relative_to(parent_dir) if parent_dir is not None else file_path
        digests[str(key_path)] = file_digest

    json.dump(digests, sys.stdout)


if __name__ == "__main__":
    main()
piper/util.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Utilities"""
2
+ import numpy as np
3
+
4
+
5
def audio_float_to_int16(
    audio: np.ndarray, max_wav_value: float = 32767.0
) -> np.ndarray:
    """Normalize audio to its peak and convert it to int16 samples.

    The signal is scaled so that its largest absolute value maps to
    ``max_wav_value``. The peak used for scaling is floored at 0.01 so that
    near-silent audio is not amplified without bound.

    Args:
        audio: Floating-point audio samples.
        max_wav_value: Target peak amplitude.

    Returns:
        The scaled samples as an ``int16`` array of the same shape.
    """
    if audio.size == 0:
        # np.max() raises on an empty array; an empty input has nothing to
        # scale, so return an empty int16 array of the same shape.
        return audio.astype("int16")

    audio_norm = audio * (max_wav_value / max(0.01, np.max(np.abs(audio))))
    audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value)
    audio_norm = audio_norm.astype("int16")
    return audio_norm
piper/voice.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import wave
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ from typing import Iterable, List, Optional, Union
7
+
8
+ import numpy as np
9
+ import onnxruntime
10
+ from piper_phonemize import phonemize_codepoints, phonemize_espeak, tashkeel_run
11
+
12
+ from .config import PhonemeType, PiperConfig
13
+ from .const import BOS, EOS, PAD
14
+ from .util import audio_float_to_int16
15
+
16
+ _LOGGER = logging.getLogger(__name__)
17
+
18
+
19
@dataclass
class PiperVoice:
    """A loaded Piper voice: an ONNX inference session plus its config."""

    # ONNX Runtime session that runs the voice model
    session: onnxruntime.InferenceSession
    # Parsed voice configuration
    config: PiperConfig

    @staticmethod
    def load(
        model_path: Union[str, Path],
        config_path: Optional[Union[str, Path]] = None,
        use_cuda: bool = False,
    ) -> "PiperVoice":
        """Load an ONNX model and config.

        Args:
            model_path: Path to the ONNX model file.
            config_path: Path to the JSON config; defaults to
                ``<model_path>.json``.
            use_cuda: Use the CUDA execution provider instead of the CPU one.
        """
        if config_path is None:
            config_path = f"{model_path}.json"

        with open(config_path, "r", encoding="utf-8") as config_file:
            config_dict = json.load(config_file)

        return PiperVoice(
            config=PiperConfig.from_dict(config_dict),
            session=onnxruntime.InferenceSession(
                str(model_path),
                sess_options=onnxruntime.SessionOptions(),
                providers=["CPUExecutionProvider"]
                if not use_cuda
                else ["CUDAExecutionProvider"],
            ),
        )

    def phonemize(self, text: str) -> List[List[str]]:
        """Text to phonemes grouped by sentence.

        Raises:
            ValueError: If the configured phoneme type is not handled.
        """
        if self.config.phoneme_type == PhonemeType.ESPEAK:
            if self.config.espeak_voice == "ar":
                # Arabic diacritization
                # https://github.com/mush42/libtashkeel/
                text = tashkeel_run(text)

            return phonemize_espeak(text, self.config.espeak_voice)

        if self.config.phoneme_type == PhonemeType.TEXT:
            return phonemize_codepoints(text)

        raise ValueError(f"Unexpected phoneme type: {self.config.phoneme_type}")

    def phonemes_to_ids(self, phonemes: List[str]) -> List[int]:
        """Phonemes to ids.

        The result starts with the BOS ids and ends with the EOS ids; the ids
        of every known phoneme are followed by the PAD ids. Phonemes missing
        from the id map are skipped with a warning.
        """
        id_map = self.config.phoneme_id_map
        ids: List[int] = list(id_map[BOS])

        for phoneme in phonemes:
            if phoneme not in id_map:
                _LOGGER.warning("Missing phoneme from id map: %s", phoneme)
                continue

            ids.extend(id_map[phoneme])
            ids.extend(id_map[PAD])

        ids.extend(id_map[EOS])

        return ids

    def synthesize(
        self,
        text: str,
        wav_file: wave.Wave_write,
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
        sentence_silence: float = 0.0,
    ):
        """Synthesize WAV audio from text.

        Sets the WAV parameters on ``wav_file`` (configured sample rate,
        16-bit, mono) and writes the audio of each sentence as frames.
        """
        wav_file.setframerate(self.config.sample_rate)
        wav_file.setsampwidth(2)  # 16-bit
        wav_file.setnchannels(1)  # mono

        for audio_bytes in self.synthesize_stream_raw(
            text,
            speaker_id=speaker_id,
            length_scale=length_scale,
            noise_scale=noise_scale,
            noise_w=noise_w,
            sentence_silence=sentence_silence,
        ):
            wav_file.writeframes(audio_bytes)

    def synthesize_stream_raw(
        self,
        text: str,
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
        sentence_silence: float = 0.0,
    ) -> Iterable[bytes]:
        """Synthesize raw audio per sentence from text.

        Yields one chunk of 16-bit mono audio bytes per sentence, each
        followed by ``sentence_silence`` seconds of silence.
        """
        sentence_phonemes = self.phonemize(text)

        # 16-bit mono: two zero bytes per silent sample
        num_silence_samples = int(sentence_silence * self.config.sample_rate)
        silence_bytes = bytes(num_silence_samples * 2)

        for phonemes in sentence_phonemes:
            phoneme_ids = self.phonemes_to_ids(phonemes)
            yield self.synthesize_ids_to_raw(
                phoneme_ids,
                speaker_id=speaker_id,
                length_scale=length_scale,
                noise_scale=noise_scale,
                noise_w=noise_w,
            ) + silence_bytes

    def synthesize_ids_to_raw(
        self,
        phoneme_ids: List[int],
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
    ) -> bytes:
        """Synthesize raw audio from phoneme ids.

        Scales left as ``None`` fall back to the values in the voice config.
        For voices with more than one speaker, a missing ``speaker_id``
        defaults to speaker 0.

        Returns:
            The synthesized audio as int16 sample bytes.
        """
        if length_scale is None:
            length_scale = self.config.length_scale

        if noise_scale is None:
            noise_scale = self.config.noise_scale

        if noise_w is None:
            noise_w = self.config.noise_w

        phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
        phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
        scales = np.array(
            [noise_scale, length_scale, noise_w],
            dtype=np.float32,
        )

        if (self.config.num_speakers > 1) and (speaker_id is None):
            # Default speaker
            speaker_id = 0

        feeds = {
            "input": phoneme_ids_array,
            "input_lengths": phoneme_ids_lengths,
            "scales": scales,
        }

        # Feed "sid" only when there is a speaker id, instead of passing an
        # explicit None and relying on the runtime to drop it.
        if speaker_id is not None:
            feeds["sid"] = np.array([speaker_id], dtype=np.int64)

        # Synthesize through Onnx
        # NOTE(review): squeeze((0, 1)) assumes the first output has two
        # leading singleton axes -- confirm against the exported model.
        audio = self.session.run(None, feeds)[0].squeeze((0, 1))
        audio = audio_float_to_int16(audio.squeeze())

        return audio.tobytes()
piper/voices.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ hazm
2
+ torch
3
+ transformers
4
+ piper-tts
5
+ piper-phonemize~=1.1.0
6
+ onnxruntime>=1.11.0,<2