Mahiruoshi
committed on
Commit
•
6c035be
1
Parent(s):
f0ca36a
Upload 25 files
Browse files- text/__pycache__/__init__.cpython-311.pyc +0 -0
- text/__pycache__/__init__.cpython-38.pyc +0 -0
- text/__pycache__/cleaners.cpython-311.pyc +0 -0
- text/__pycache__/cleaners.cpython-38.pyc +0 -0
- text/__pycache__/japanese.cpython-311.pyc +0 -0
- text/__pycache__/japanese.cpython-38.pyc +0 -0
- text/__pycache__/mandarin.cpython-38.pyc +0 -0
- text/__pycache__/symbols.cpython-38.pyc +0 -0
- text/cantonese.py +59 -0
- text/cleaners.py +51 -99
- text/korean.py +210 -0
- text/mandarin.py +5 -7
- text/ngu_dialect.py +30 -0
- text/shanghainese.py +64 -0
- text/symbols.py +12 -4
- text/thai.py +44 -0
text/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (2.89 kB). View file
|
|
text/__pycache__/__init__.cpython-38.pyc
CHANGED
Binary files a/text/__pycache__/__init__.cpython-38.pyc and b/text/__pycache__/__init__.cpython-38.pyc differ
|
|
text/__pycache__/cleaners.cpython-311.pyc
ADDED
Binary file (13.1 kB). View file
|
|
text/__pycache__/cleaners.cpython-38.pyc
CHANGED
Binary files a/text/__pycache__/cleaners.cpython-38.pyc and b/text/__pycache__/cleaners.cpython-38.pyc differ
|
|
text/__pycache__/japanese.cpython-311.pyc
ADDED
Binary file (8.36 kB). View file
|
|
text/__pycache__/japanese.cpython-38.pyc
CHANGED
Binary files a/text/__pycache__/japanese.cpython-38.pyc and b/text/__pycache__/japanese.cpython-38.pyc differ
|
|
text/__pycache__/mandarin.cpython-38.pyc
CHANGED
Binary files a/text/__pycache__/mandarin.cpython-38.pyc and b/text/__pycache__/mandarin.cpython-38.pyc differ
|
|
text/__pycache__/symbols.cpython-38.pyc
CHANGED
Binary files a/text/__pycache__/symbols.cpython-38.pyc and b/text/__pycache__/symbols.cpython-38.pyc differ
|
|
text/cantonese.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import cn2an
|
3 |
+
import opencc
|
4 |
+
|
5 |
+
|
6 |
+
# OpenCC converter built from the 'jyutjyu' configuration; cantonese_to_ipa()
# passes text through converter.convert() before normalizing punctuation.
converter = opencc.OpenCC('jyutjyu')
|
7 |
+
|
8 |
+
# List of (Latin alphabet, ipa) pairs:
|
9 |
+
# Each entry pairs a compiled pattern for one capital letter with the IPA
# string latin_to_ipa() substitutes for it.
_latin_to_ipa = [(re.compile(letter), ipa) for letter, ipa in (
    ('A', 'ei˥'),
    ('B', 'biː˥'),
    ('C', 'siː˥'),
    ('D', 'tiː˥'),
    ('E', 'iː˥'),
    ('F', 'e˥fuː˨˩'),
    ('G', 'tsiː˥'),
    ('H', 'ɪk̚˥tsʰyː˨˩'),
    ('I', 'ɐi˥'),
    ('J', 'tsei˥'),
    ('K', 'kʰei˥'),
    ('L', 'e˥llou˨˩'),
    ('M', 'ɛːm˥'),
    ('N', 'ɛːn˥'),
    ('O', 'ou˥'),
    ('P', 'pʰiː˥'),
    ('Q', 'kʰiːu˥'),
    ('R', 'aː˥lou˨˩'),
    ('S', 'ɛː˥siː˨˩'),
    ('T', 'tʰiː˥'),
    ('U', 'juː˥'),
    ('V', 'wiː˥'),
    ('W', 'tʊk̚˥piː˥juː˥'),
    ('X', 'ɪk̚˥siː˨˩'),
    ('Y', 'waːi˥'),
    ('Z', 'iː˨sɛːt̚˥'),
)]
|
37 |
+
|
38 |
+
|
39 |
+
def number_to_cantonese(text):
    '''Replace each digit run in ``text`` (optionally with a decimal part)
    by the result of ``cn2an.an2cn`` on that run.'''
    def _spell(match):
        return cn2an.an2cn(match.group())

    return re.sub(r'\d+(?:\.?\d+)?', _spell, text)
|
41 |
+
|
42 |
+
|
43 |
+
def latin_to_ipa(text):
    '''Apply every (pattern, IPA) pair of ``_latin_to_ipa`` to ``text`` in
    table order and return the result.'''
    for pattern, ipa in _latin_to_ipa:
        text = pattern.sub(ipa, text)
    return text
|
47 |
+
|
48 |
+
|
49 |
+
def cantonese_to_ipa(text):
    '''Convert Cantonese text to an IPA string with normalized punctuation.

    The text is upper-cased, digit runs are spelled out with
    number_to_cantonese(), and the result is passed through the module-level
    OpenCC ``converter``; '-' is removed and '$' becomes a space in its
    output.  Any capital Latin letter still present is replaced by its
    letter-name pronunciation plus a space.  List/clause marks become ', ',
    and '。', question marks and exclamation marks become '. ', '? ' and '! '.

    Args:
        text: Input string.

    Returns:
        The converted string with trailing whitespace removed.
    '''
    text = number_to_cantonese(text.upper())
    text = converter.convert(text).replace('-', '').replace('$', ' ')
    text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group()) + ' ', text)
    # Accept both the ASCII marks and their fullwidth forms (written as
    # \uXXXX escapes so the pattern survives width normalization).
    text = re.sub(r'[、;:\uff1b\uff1a]', ',', text)
    text = re.sub(r'\s*[,\uff0c]\s*', ', ', text)
    text = re.sub(r'\s*。\s*', '. ', text)
    # The question mark must sit in a character class: a bare '?' after
    # '\s*' is the lazy-quantifier suffix, and '\s*?\s*' matches the empty
    # string at every position.
    text = re.sub(r'\s*[?\uff1f]\s*', '? ', text)
    text = re.sub(r'\s*[!\uff01]\s*', '! ', text)
    text = re.sub(r'\s*$', '', text)
    return text
|
text/cleaners.py
CHANGED
@@ -1,19 +1,18 @@
|
|
1 |
import re
|
2 |
-
from text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
|
3 |
from text.japanese import japanese_to_romaji_with_accent, japanese_to_ipa, japanese_to_ipa2, japanese_to_ipa3
|
|
|
4 |
from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa, chinese_to_ipa2
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
|
12 |
|
13 |
def japanese_cleaners(text):
|
14 |
text = japanese_to_romaji_with_accent(text)
|
15 |
-
|
16 |
-
text += '.'
|
17 |
return text
|
18 |
|
19 |
|
@@ -26,8 +25,7 @@ def korean_cleaners(text):
|
|
26 |
text = latin_to_hangul(text)
|
27 |
text = number_to_hangul(text)
|
28 |
text = divide_hangul(text)
|
29 |
-
|
30 |
-
text += '.'
|
31 |
return text
|
32 |
|
33 |
|
@@ -36,110 +34,67 @@ def chinese_cleaners(text):
|
|
36 |
text = number_to_chinese(text)
|
37 |
text = chinese_to_bopomofo(text)
|
38 |
text = latin_to_bopomofo(text)
|
39 |
-
|
40 |
-
text += '。'
|
41 |
return text
|
42 |
|
43 |
|
44 |
def zh_ja_mixture_cleaners(text):
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
cleaned_text = japanese_to_romaji_with_accent(
|
52 |
-
japanese_text[4:-4]).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…')
|
53 |
-
text = text.replace(japanese_text, cleaned_text+' ', 1)
|
54 |
-
text = text[:-1]
|
55 |
-
if re.match('[A-Za-zɯɹəɥ→↓↑]', text[-1]):
|
56 |
-
text += '.'
|
57 |
return text
|
58 |
|
59 |
|
60 |
def sanskrit_cleaners(text):
|
61 |
text = text.replace('॥', '।').replace('ॐ', 'ओम्')
|
62 |
-
|
63 |
-
text += ' ।'
|
64 |
return text
|
65 |
|
66 |
|
67 |
def cjks_cleaners(text):
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
cleaned_text = korean_to_lazy_ipa(korean_text[4:-4])
|
81 |
-
text = text.replace(korean_text, cleaned_text+' ', 1)
|
82 |
-
for sanskrit_text in sanskrit_texts:
|
83 |
-
cleaned_text = devanagari_to_ipa(sanskrit_text[4:-4])
|
84 |
-
text = text.replace(sanskrit_text, cleaned_text+' ', 1)
|
85 |
-
for english_text in english_texts:
|
86 |
-
cleaned_text = english_to_lazy_ipa(english_text[4:-4])
|
87 |
-
text = text.replace(english_text, cleaned_text+' ', 1)
|
88 |
-
text = text[:-1]
|
89 |
-
if re.match(r'[^\.,!\?\-…~]', text[-1]):
|
90 |
-
text += '.'
|
91 |
return text
|
92 |
|
93 |
|
94 |
def cjke_cleaners(text):
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
cleaned_text = japanese_to_ipa(japanese_text[4:-4])
|
106 |
-
cleaned_text = cleaned_text.replace('ʧ', 'tʃ').replace(
|
107 |
-
'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz')
|
108 |
-
text = text.replace(japanese_text, cleaned_text+' ', 1)
|
109 |
-
for korean_text in korean_texts:
|
110 |
-
cleaned_text = korean_to_ipa(korean_text[4:-4])
|
111 |
-
text = text.replace(korean_text, cleaned_text+' ', 1)
|
112 |
-
for english_text in english_texts:
|
113 |
-
cleaned_text = english_to_ipa2(english_text[4:-4])
|
114 |
-
cleaned_text = cleaned_text.replace('ɑ', 'a').replace(
|
115 |
-
'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')
|
116 |
-
text = text.replace(english_text, cleaned_text+' ', 1)
|
117 |
-
text = text[:-1]
|
118 |
-
if re.match(r'[^\.,!\?\-…~]', text[-1]):
|
119 |
-
text += '.'
|
120 |
return text
|
121 |
|
122 |
|
123 |
def cjke_cleaners2(text):
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
for korean_text in korean_texts:
|
135 |
-
cleaned_text = korean_to_ipa(korean_text[4:-4])
|
136 |
-
text = text.replace(korean_text, cleaned_text+' ', 1)
|
137 |
-
for english_text in english_texts:
|
138 |
-
cleaned_text = english_to_ipa2(english_text[4:-4])
|
139 |
-
text = text.replace(english_text, cleaned_text+' ', 1)
|
140 |
-
text = text[:-1]
|
141 |
-
if re.match(r'[^\.,!\?\-…~]', text[-1]):
|
142 |
-
text += '.'
|
143 |
return text
|
144 |
|
145 |
|
@@ -151,16 +106,13 @@ def thai_cleaners(text):
|
|
151 |
|
152 |
def shanghainese_cleaners(text):
|
153 |
text = shanghainese_to_ipa(text)
|
154 |
-
|
155 |
-
text += '.'
|
156 |
return text
|
157 |
|
158 |
|
159 |
def chinese_dialect_cleaners(text):
|
160 |
-
text = re.sub(r'\[
|
161 |
lambda x: chinese_to_ipa2(x.group(1))+' ', text)
|
162 |
-
text = re.sub(r'\[TW\](.*?)\[TW\]',
|
163 |
-
lambda x: chinese_to_ipa2(x.group(1), True)+' ', text)
|
164 |
text = re.sub(r'\[JA\](.*?)\[JA\]',
|
165 |
lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ')+' ', text)
|
166 |
text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: shanghainese_to_ipa(x.group(1)).replace('1', '˥˧').replace('5',
|
|
|
1 |
import re
|
|
|
2 |
from text.japanese import japanese_to_romaji_with_accent, japanese_to_ipa, japanese_to_ipa2, japanese_to_ipa3
|
3 |
+
from text.korean import latin_to_hangul, number_to_hangul, divide_hangul, korean_to_lazy_ipa, korean_to_ipa
|
4 |
from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa, chinese_to_ipa2
|
5 |
+
from text.sanskrit import devanagari_to_ipa
|
6 |
+
from text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
|
7 |
+
from text.thai import num_to_thai, latin_to_thai
|
8 |
+
from text.shanghainese import shanghainese_to_ipa
|
9 |
+
from text.cantonese import cantonese_to_ipa
|
10 |
+
from text.ngu_dialect import ngu_dialect_to_ipa
|
11 |
|
12 |
|
13 |
def japanese_cleaners(text):
    '''Romanize ``text`` with japanese_to_romaji_with_accent() and append '.'
    when the result ends in an ASCII letter.'''
    romaji = japanese_to_romaji_with_accent(text)
    return re.sub(r'([A-Za-z])$', r'\1.', romaji)
|
17 |
|
18 |
|
|
|
25 |
text = latin_to_hangul(text)
|
26 |
text = number_to_hangul(text)
|
27 |
text = divide_hangul(text)
|
28 |
+
text = re.sub(r'([\u3131-\u3163])$', r'\1.', text)
|
|
|
29 |
return text
|
30 |
|
31 |
|
|
|
34 |
text = number_to_chinese(text)
|
35 |
text = chinese_to_bopomofo(text)
|
36 |
text = latin_to_bopomofo(text)
|
37 |
+
text = re.sub(r'([ˉˊˇˋ˙])$', r'\1。', text)
|
|
|
38 |
return text
|
39 |
|
40 |
|
41 |
def zh_ja_mixture_cleaners(text):
    '''Clean text whose segments are wrapped in [ZH]...[ZH] / [JA]...[JA].

    [ZH] segments go through chinese_to_romaji(); [JA] segments go through
    japanese_to_romaji_with_accent() followed by the 'ts', 'u' and '...'
    substitutions.  Each converted segment is followed by a space.  Trailing
    whitespace is removed and '.' is appended unless the text already ends in
    one of . , ! ? - … ~
    '''
    def _zh(match):
        return chinese_to_romaji(match.group(1)) + ' '

    def _ja(match):
        romaji = japanese_to_romaji_with_accent(match.group(1))
        romaji = romaji.replace('ts', 'ʦ').replace('u', 'ɯ')
        return romaji.replace('...', '…') + ' '

    text = re.sub(r'\[ZH\](.*?)\[ZH\]', _zh, text)
    text = re.sub(r'\[JA\](.*?)\[JA\]', _ja, text)
    text = re.sub(r'\s+$', '', text)
    return re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
49 |
|
50 |
|
51 |
def sanskrit_cleaners(text):
    '''Replace '॥' with '।' and 'ॐ' with 'ओम्', then append '।' unless the
    text already ends with it (empty text is returned unchanged).'''
    normalized = text.replace('॥', '।')
    normalized = normalized.replace('ॐ', 'ओम्')
    return re.sub(r'([^।])$', r'\1।', normalized)
|
55 |
|
56 |
|
57 |
def cjks_cleaners(text):
    '''Clean text tagged with [ZH], [JA], [KO], [SA] and [EN] segment pairs.

    Each tagged segment is replaced by the output of its converter
    (chinese_to_lazy_ipa, japanese_to_ipa, korean_to_lazy_ipa,
    devanagari_to_ipa, english_to_lazy_ipa) followed by a space.  Trailing
    whitespace is then removed and '.' is appended unless the text already
    ends in one of . , ! ? - … ~
    '''
    def _zh(match):
        return chinese_to_lazy_ipa(match.group(1)) + ' '

    def _ja(match):
        return japanese_to_ipa(match.group(1)) + ' '

    def _ko(match):
        return korean_to_lazy_ipa(match.group(1)) + ' '

    def _sa(match):
        return devanagari_to_ipa(match.group(1)) + ' '

    def _en(match):
        return english_to_lazy_ipa(match.group(1)) + ' '

    text = re.sub(r'\[ZH\](.*?)\[ZH\]', _zh, text)
    text = re.sub(r'\[JA\](.*?)\[JA\]', _ja, text)
    text = re.sub(r'\[KO\](.*?)\[KO\]', _ko, text)
    text = re.sub(r'\[SA\](.*?)\[SA\]', _sa, text)
    text = re.sub(r'\[EN\](.*?)\[EN\]', _en, text)
    text = re.sub(r'\s+$', '', text)
    return re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
71 |
|
72 |
|
73 |
def cjke_cleaners(text):
    '''Clean text tagged with [ZH], [JA], [KO] and [EN] segment pairs.

    Segments are converted with chinese_to_lazy_ipa, japanese_to_ipa,
    korean_to_ipa and english_to_ipa2 respectively; the Chinese, Japanese and
    English results additionally get the symbol substitutions spelled out
    below.  Each converted segment is followed by a space.  Trailing
    whitespace is removed and '.' is appended unless the text already ends in
    one of . , ! ? - … ~
    '''
    def _zh(match):
        ipa = chinese_to_lazy_ipa(match.group(1))
        ipa = ipa.replace('ʧ', 'tʃ').replace('ʦ', 'ts')
        return ipa.replace('ɥan', 'ɥæn') + ' '

    def _ja(match):
        ipa = japanese_to_ipa(match.group(1))
        ipa = ipa.replace('ʧ', 'tʃ').replace('ʦ', 'ts')
        ipa = ipa.replace('ɥan', 'ɥæn')
        return ipa.replace('ʥ', 'dz') + ' '

    def _ko(match):
        return korean_to_ipa(match.group(1)) + ' '

    def _en(match):
        ipa = english_to_ipa2(match.group(1))
        ipa = ipa.replace('ɑ', 'a').replace('ɔ', 'o').replace('ɛ', 'e')
        return ipa.replace('ɪ', 'i').replace('ʊ', 'u') + ' '

    text = re.sub(r'\[ZH\](.*?)\[ZH\]', _zh, text)
    text = re.sub(r'\[JA\](.*?)\[JA\]', _ja, text)
    text = re.sub(r'\[KO\](.*?)\[KO\]', _ko, text)
    text = re.sub(r'\[EN\](.*?)\[EN\]', _en, text)
    text = re.sub(r'\s+$', '', text)
    return re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
85 |
|
86 |
|
87 |
def cjke_cleaners2(text):
    '''Clean text tagged with [ZH], [JA], [KO] and [EN] segment pairs.

    Segments are converted with chinese_to_ipa, japanese_to_ipa2,
    korean_to_ipa and english_to_ipa2 respectively, each followed by a space.
    Trailing whitespace is removed and '.' is appended unless the text
    already ends in one of . , ! ? - … ~
    '''
    def _zh(match):
        return chinese_to_ipa(match.group(1)) + ' '

    def _ja(match):
        return japanese_to_ipa2(match.group(1)) + ' '

    def _ko(match):
        return korean_to_ipa(match.group(1)) + ' '

    def _en(match):
        return english_to_ipa2(match.group(1)) + ' '

    text = re.sub(r'\[ZH\](.*?)\[ZH\]', _zh, text)
    text = re.sub(r'\[JA\](.*?)\[JA\]', _ja, text)
    text = re.sub(r'\[KO\](.*?)\[KO\]', _ko, text)
    text = re.sub(r'\[EN\](.*?)\[EN\]', _en, text)
    text = re.sub(r'\s+$', '', text)
    return re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
99 |
|
100 |
|
|
|
106 |
|
107 |
def shanghainese_cleaners(text):
    '''Convert ``text`` with shanghainese_to_ipa() and append '.' unless the
    result already ends in one of . , ! ? - … ~'''
    ipa = shanghainese_to_ipa(text)
    return re.sub(r'([^\.,!\?\-…~])$', r'\1.', ipa)
|
111 |
|
112 |
|
113 |
def chinese_dialect_cleaners(text):
|
114 |
+
text = re.sub(r'\[ZH\](.*?)\[ZH\]',
|
115 |
lambda x: chinese_to_ipa2(x.group(1))+' ', text)
|
|
|
|
|
116 |
text = re.sub(r'\[JA\](.*?)\[JA\]',
|
117 |
lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ')+' ', text)
|
118 |
text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: shanghainese_to_ipa(x.group(1)).replace('1', '˥˧').replace('5',
|
text/korean.py
ADDED
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
from jamo import h2j, j2hcj
|
3 |
+
import ko_pron
|
4 |
+
|
5 |
+
|
6 |
+
# This is a list of Korean classifiers preceded by pure Korean numerals.
|
7 |
+
# NOTE: this is one space-separated string, not a list. ``x in
# _korean_classifiers`` is therefore a substring test; split() it first when
# an exact classifier match is wanted.
_korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통'
|
8 |
+
|
9 |
+
# List of (hangul, hangul divided) pairs:
|
10 |
+
# Each entry pairs a compiled pattern for one compound jamo with the two
# simple jamo divide_hangul() writes in its place.
_hangul_divided = [(re.compile(compound), parts) for compound, parts in (
    ('ㄳ', 'ㄱㅅ'),
    ('ㄵ', 'ㄴㅈ'),
    ('ㄶ', 'ㄴㅎ'),
    ('ㄺ', 'ㄹㄱ'),
    ('ㄻ', 'ㄹㅁ'),
    ('ㄼ', 'ㄹㅂ'),
    ('ㄽ', 'ㄹㅅ'),
    ('ㄾ', 'ㄹㅌ'),
    ('ㄿ', 'ㄹㅍ'),
    ('ㅀ', 'ㄹㅎ'),
    ('ㅄ', 'ㅂㅅ'),
    ('ㅘ', 'ㅗㅏ'),
    ('ㅙ', 'ㅗㅐ'),
    ('ㅚ', 'ㅗㅣ'),
    ('ㅝ', 'ㅜㅓ'),
    ('ㅞ', 'ㅜㅔ'),
    ('ㅟ', 'ㅜㅣ'),
    ('ㅢ', 'ㅡㅣ'),
    ('ㅑ', 'ㅣㅏ'),
    ('ㅒ', 'ㅣㅐ'),
    ('ㅕ', 'ㅣㅓ'),
    ('ㅖ', 'ㅣㅔ'),
    ('ㅛ', 'ㅣㅗ'),
    ('ㅠ', 'ㅣㅜ'),
)]
|
36 |
+
|
37 |
+
# List of (Latin alphabet, hangul) pairs:
|
38 |
+
# Each entry pairs a case-insensitive pattern for one Latin letter with the
# Hangul string latin_to_hangul() substitutes for it.
_latin_to_hangul = [
    (re.compile(letter, re.IGNORECASE), hangul) for letter, hangul in (
        ('a', '에이'),
        ('b', '비'),
        ('c', '시'),
        ('d', '디'),
        ('e', '이'),
        ('f', '에프'),
        ('g', '지'),
        ('h', '에이치'),
        ('i', '아이'),
        ('j', '제이'),
        ('k', '케이'),
        ('l', '엘'),
        ('m', '엠'),
        ('n', '엔'),
        ('o', '오'),
        ('p', '피'),
        ('q', '큐'),
        ('r', '아르'),
        ('s', '에스'),
        ('t', '티'),
        ('u', '유'),
        ('v', '브이'),
        ('w', '더블유'),
        ('x', '엑스'),
        ('y', '와이'),
        ('z', '제트'),
    )
]
|
66 |
+
|
67 |
+
# List of (ipa, lazy ipa) pairs:
|
68 |
+
# Each entry pairs a case-insensitive pattern for an IPA symbol with the
# replacement korean_to_lazy_ipa() writes; the last three entries delete
# combining marks outright.
_ipa_to_lazy_ipa = [
    (re.compile(symbol, re.IGNORECASE), lazy) for symbol, lazy in (
        ('t͡ɕ','ʧ'),
        ('d͡ʑ','ʥ'),
        ('ɲ','n^'),
        ('ɕ','ʃ'),
        ('ʷ','w'),
        ('ɭ','l`'),
        ('ʎ','ɾ'),
        ('ɣ','ŋ'),
        ('ɰ','ɯ'),
        ('ʝ','j'),
        ('ʌ','ə'),
        ('ɡ','g'),
        ('\u031a','#'),
        ('\u0348','='),
        ('\u031e',''),
        ('\u0320',''),
        ('\u0339',''),
    )
]
|
87 |
+
|
88 |
+
|
89 |
+
def latin_to_hangul(text):
    '''Apply every (pattern, Hangul) pair of ``_latin_to_hangul`` to ``text``
    in table order and return the result.'''
    for pattern, hangul in _latin_to_hangul:
        text = pattern.sub(hangul, text)
    return text
|
93 |
+
|
94 |
+
|
95 |
+
def divide_hangul(text):
    '''Run ``text`` through ``h2j`` then ``j2hcj``, then apply every
    (pattern, replacement) pair of ``_hangul_divided`` in table order.'''
    jamo_text = h2j(text)
    text = j2hcj(jamo_text)
    for pattern, parts in _hangul_divided:
        text = pattern.sub(parts, text)
    return text
|
100 |
+
|
101 |
+
|
102 |
+
def hangul_number(num, sino=True):
    '''Spell out a string of Arabic digits in Hangul.

    Reference https://github.com/Kyubyong/g2pK

    Args:
        num: Digit string; any ',' separators are removed first.
        sino: If true, the ones and tens places use 일, 이, ... and 십;
            otherwise they use the modifier forms 한, 두, ... and 열, 스물,
            ....  The hundreds place and above always use 일, 이, ....

    Returns:
        The spelled-out number.  '0' gives '영'; '20' with ``sino=False``
        gives '스무'.  A number with more than 16 significant digits (above
        the place-15 unit, the largest one in the table) is read digit by
        digit, because no unit is defined for those places.
    '''
    num = re.sub(',', '', num)

    if num == '0':
        return '영'
    if not sino and num == '20':
        return '스무'

    digits = '123456789'
    digit2name = dict(zip(digits, '일이삼사오육칠팔구'))
    digit2mod = dict(zip(digits, '한 두 세 네 다섯 여섯 일곱 여덟 아홉'.split()))
    digit2dec = dict(zip(digits, '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔'.split()))

    # Unit appended at each decimal place from 2 up to 15.
    units = {2: '백', 3: '천', 4: '만', 5: '십', 6: '백', 7: '천', 8: '억',
             9: '십', 10: '백', 11: '천', 12: '조', 13: '십', 14: '백', 15: '천'}
    # Places where a leading '일' is dropped ('일백' -> '백').
    # NOTE(review): places 9-11 and 13-15 keep it (e.g. '일십억'); that is the
    # existing behavior and is preserved here - confirm whether intended.
    drops_leading_one = {2, 3, 4, 5, 6, 7}

    if len(num.lstrip('0')) > 16:
        # No unit exists above place 15; without this guard the loop below
        # would reach append() with ``name`` unbound.
        return ''.join('영' if d == '0' else digit2name.get(d, '') for d in num)

    spelledout = []
    for offset, digit in enumerate(num):
        place = len(num) - offset - 1
        if digit == '0':
            # A zero is silent.  The exception is a group boundary (place
            # divisible by 4) where one of the previous three digits was
            # spoken: the 만/억/조 marker must still be emitted.
            if place % 4 != 0 or ''.join(spelledout[-3:]) == '':
                spelledout.append('')
                continue
        if place == 0:
            name = (digit2name if sino else digit2mod).get(digit, '')
        elif place == 1:
            if sino:
                name = (digit2name.get(digit, '') + '십').replace('일십', '십')
            else:
                name = digit2dec.get(digit, '')
        else:
            unit = units[place]
            name = digit2name.get(digit, '') + unit
            if place in drops_leading_one:
                name = name.replace('일' + unit, unit)
        spelledout.append(name)
    return ''.join(spelledout)
|
179 |
+
|
180 |
+
|
181 |
+
def number_to_hangul(text):
    '''Spell out the Arabic numerals in ``text`` in Hangul.

    Reference https://github.com/Kyubyong/g2pK

    A digit run (commas allowed) immediately followed by Hangul syllables is
    spelled as a whole number by hangul_number(): with ``sino=False`` when
    the first one or two following syllables are an entry of
    ``_korean_classifiers``, with ``sino=True`` otherwise.  Every digit left
    after that is read digit by digit.

    Args:
        text: Input string.

    Returns:
        ``text`` with all ASCII digits replaced by Hangul.
    '''
    # _korean_classifiers is a single space-separated string; split it so the
    # test below is an exact match and not a substring search (which would
    # accept fragments such as '데' from '군데').
    classifiers = set(_korean_classifiers.split())

    def _spell(match):
        num, classifier = match.groups()
        native = classifier[:2] in classifiers or classifier[0] in classifiers
        return hangul_number(num, sino=not native) + classifier

    # One regex pass rewrites each number where it was matched.  Replacing
    # token strings with str.replace() could also hit the tail of a longer
    # number ('1개' inside '11개'), depending on set iteration order.
    text = re.sub(r'(\d[\d,]*)([\uac00-\ud71f]+)', _spell, text)
    # digit by digit for remaining digits
    return text.translate(str.maketrans('0123456789', '영일이삼사오육칠팔구'))
|
197 |
+
|
198 |
+
|
199 |
+
def korean_to_lazy_ipa(text):
    '''Convert Korean text to the simplified ("lazy") IPA symbol set.

    Latin letters and numerals are first rewritten in Hangul, each Hangul run
    is romanised with ``ko_pron.romanise(..., 'ipa')``, and finally every
    (pattern, replacement) pair of ``_ipa_to_lazy_ipa`` is applied.
    '''
    def _romanise(match):
        ipa = ko_pron.romanise(match.group(0), 'ipa')
        # Keep only the part before the first '] ~ [' separator
        # (presumably the first of several alternative readings).
        return ipa.split('] ~ [')[0]

    text = number_to_hangul(latin_to_hangul(text))
    text = re.sub('[\uac00-\ud7af]+', _romanise, text)
    for pattern, lazy in _ipa_to_lazy_ipa:
        text = pattern.sub(lazy, text)
    return text
|
206 |
+
|
207 |
+
|
208 |
+
def korean_to_ipa(text):
    '''Return korean_to_lazy_ipa(text) with 'ʧ' expanded to 'tʃ' and 'ʥ'
    expanded to 'dʑ'.'''
    ipa = korean_to_lazy_ipa(text)
    ipa = ipa.replace('ʧ','tʃ')
    return ipa.replace('ʥ','dʑ')
|
text/mandarin.py
CHANGED
@@ -4,6 +4,7 @@ import re
|
|
4 |
from pypinyin import lazy_pinyin, BOPOMOFO
|
5 |
import jieba
|
6 |
import cn2an
|
|
|
7 |
|
8 |
|
9 |
# List of (Latin alphabet, bopomofo) pairs:
|
@@ -239,7 +240,7 @@ def number_to_chinese(text):
|
|
239 |
return text
|
240 |
|
241 |
|
242 |
-
def chinese_to_bopomofo(text
|
243 |
text = text.replace('、', ',').replace(';', ',').replace(':', ',')
|
244 |
words = jieba.lcut(text, cut_all=False)
|
245 |
text = ''
|
@@ -252,10 +253,7 @@ def chinese_to_bopomofo(text, taiwanese=False):
|
|
252 |
bopomofos[i] = re.sub(r'([\u3105-\u3129])$', r'\1ˉ', bopomofos[i])
|
253 |
if text != '':
|
254 |
text += ' '
|
255 |
-
|
256 |
-
text += '#'+'#'.join(bopomofos)
|
257 |
-
else:
|
258 |
-
text += ''.join(bopomofos)
|
259 |
return text
|
260 |
|
261 |
|
@@ -316,9 +314,9 @@ def chinese_to_ipa(text):
|
|
316 |
return text
|
317 |
|
318 |
|
319 |
-
def chinese_to_ipa2(text
|
320 |
text = number_to_chinese(text)
|
321 |
-
text = chinese_to_bopomofo(text
|
322 |
text = latin_to_bopomofo(text)
|
323 |
text = bopomofo_to_ipa2(text)
|
324 |
text = re.sub(r'i([aoe])', r'j\1', text)
|
|
|
4 |
from pypinyin import lazy_pinyin, BOPOMOFO
|
5 |
import jieba
|
6 |
import cn2an
|
7 |
+
import logging
|
8 |
|
9 |
|
10 |
# List of (Latin alphabet, bopomofo) pairs:
|
|
|
240 |
return text
|
241 |
|
242 |
|
243 |
+
def chinese_to_bopomofo(text):
|
244 |
text = text.replace('、', ',').replace(';', ',').replace(':', ',')
|
245 |
words = jieba.lcut(text, cut_all=False)
|
246 |
text = ''
|
|
|
253 |
bopomofos[i] = re.sub(r'([\u3105-\u3129])$', r'\1ˉ', bopomofos[i])
|
254 |
if text != '':
|
255 |
text += ' '
|
256 |
+
text += ''.join(bopomofos)
|
|
|
|
|
|
|
257 |
return text
|
258 |
|
259 |
|
|
|
314 |
return text
|
315 |
|
316 |
|
317 |
+
def chinese_to_ipa2(text):
|
318 |
text = number_to_chinese(text)
|
319 |
+
text = chinese_to_bopomofo(text)
|
320 |
text = latin_to_bopomofo(text)
|
321 |
text = bopomofo_to_ipa2(text)
|
322 |
text = re.sub(r'i([aoe])', r'j\1', text)
|
text/ngu_dialect.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import opencc
|
3 |
+
|
4 |
+
|
5 |
+
import logging

# Two-letter dialect codes mapped to the configuration name handed to
# opencc.OpenCC for that dialect.
dialects = {'SZ': 'suzhou', 'WX': 'wuxi', 'CZ': 'changzhou', 'HZ': 'hangzhou',
            'SX': 'shaoxing', 'NB': 'ningbo', 'JJ': 'jingjiang', 'YX': 'yixing',
            'JD': 'jiading', 'ZR': 'zhenru', 'PH': 'pinghu', 'TX': 'tongxiang',
            'JS': 'jiashan', 'HN': 'xiashi', 'LP': 'linping', 'XS': 'xiaoshan',
            'FY': 'fuyang', 'RA': 'ruao', 'CX': 'cixi', 'SM': 'sanmen',
            'TT': 'tiantai', 'WZ': 'wenzhou', 'SC': 'suichang', 'YB': 'youbu'}

# Converters keyed by configuration name.  Loading is best effort: a dialect
# whose converter cannot be built is simply absent from this dict.
converters = {}

for dialect in dialects.values():
    try:
        converters[dialect] = opencc.OpenCC(dialect)
    except Exception as err:
        # Keep importing (the remaining dialects still work) but say which
        # one failed.  ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        logging.getLogger(__name__).warning(
            'OpenCC configuration %r could not be loaded: %s', dialect, err)
|
19 |
+
|
20 |
+
|
21 |
+
def ngu_dialect_to_ipa(text, dialect):
    '''Convert text in one of the supported dialects to an IPA string.

    The text is passed through the OpenCC converter registered for the
    dialect; '-' is removed and '$' becomes a space in its output.  List and
    clause marks become ', ', and '。', question marks and exclamation marks
    become '. ', '? ' and '! '.

    Args:
        text: Input string.
        dialect: Two-letter code, a key of ``dialects`` (e.g. 'SZ').

    Returns:
        The converted string with trailing whitespace removed.

    Raises:
        KeyError: If ``dialect`` is not a key of ``dialects``, or no
            converter is present in ``converters`` for it.
    '''
    dialect = dialects[dialect]
    text = converters[dialect].convert(text).replace('-', '').replace('$', ' ')
    # Accept both the ASCII marks and their fullwidth forms (written as
    # \uXXXX escapes so the pattern survives width normalization).
    text = re.sub(r'[、;:\uff1b\uff1a]', ',', text)
    text = re.sub(r'\s*[,\uff0c]\s*', ', ', text)
    text = re.sub(r'\s*。\s*', '. ', text)
    # The question mark must sit in a character class: a bare '?' after
    # '\s*' is the lazy-quantifier suffix, and '\s*?\s*' matches the empty
    # string at every position.
    text = re.sub(r'\s*[?\uff1f]\s*', '? ', text)
    text = re.sub(r'\s*[!\uff01]\s*', '! ', text)
    text = re.sub(r'\s*$', '', text)
    return text
|
text/shanghainese.py
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import cn2an
|
3 |
+
import opencc
|
4 |
+
|
5 |
+
|
6 |
+
# OpenCC converter built from the 'zaonhe' configuration; shanghainese_to_ipa()
# passes text through converter.convert() before normalizing punctuation.
converter = opencc.OpenCC('zaonhe')
|
7 |
+
|
8 |
+
# List of (Latin alphabet, ipa) pairs:
|
9 |
+
# Each entry pairs a compiled pattern for one capital letter with the IPA
# string latin_to_ipa() substitutes for it.
_latin_to_ipa = [(re.compile(letter), ipa) for letter, ipa in (
    ('A', 'ᴇ'),
    ('B', 'bi'),
    ('C', 'si'),
    ('D', 'di'),
    ('E', 'i'),
    ('F', 'ᴇf'),
    ('G', 'dʑi'),
    ('H', 'ᴇtɕʰ'),
    ('I', 'ᴀi'),
    ('J', 'dʑᴇ'),
    ('K', 'kʰᴇ'),
    ('L', 'ᴇl'),
    ('M', 'ᴇm'),
    ('N', 'ᴇn'),
    ('O', 'o'),
    ('P', 'pʰi'),
    ('Q', 'kʰiu'),
    ('R', 'ᴀl'),
    ('S', 'ᴇs'),
    ('T', 'tʰi'),
    ('U', 'ɦiu'),
    ('V', 'vi'),
    ('W', 'dᴀbɤliu'),
    ('X', 'ᴇks'),
    ('Y', 'uᴀi'),
    ('Z', 'zᴇ'),
)]
|
37 |
+
|
38 |
+
|
39 |
+
def _number_to_shanghainese(num):
    '''Spell ``num`` with ``cn2an.an2cn`` and adjust the result: '一十' ->
    '十', '二十' -> '廿', every '二' -> '两', then '两' back to '二' where it
    follows '廿' or a '十' not preceded by one of 三四五六七八九.'''
    spelled = cn2an.an2cn(num)
    for old, new in (('一十', '十'), ('二十', '廿'), ('二', '两')):
        spelled = spelled.replace(old, new)
    return re.sub(r'((?:^|[^三四五六七八九])十|廿)两', r'\1二', spelled)
|
42 |
+
|
43 |
+
|
44 |
+
def number_to_shanghainese(text):
    '''Replace each digit run in ``text`` (optionally with a decimal part)
    by the result of _number_to_shanghainese() on that run.'''
    def _spell(match):
        return _number_to_shanghainese(match.group())

    return re.sub(r'\d+(?:\.?\d+)?', _spell, text)
|
46 |
+
|
47 |
+
|
48 |
+
def latin_to_ipa(text):
    '''Apply every (pattern, IPA) pair of ``_latin_to_ipa`` to ``text`` in
    table order and return the result.'''
    for pattern, ipa in _latin_to_ipa:
        text = pattern.sub(ipa, text)
    return text
|
52 |
+
|
53 |
+
|
54 |
+
def shanghainese_to_ipa(text):
    '''Convert Shanghainese text to an IPA string with normalized punctuation.

    The text is upper-cased, digit runs are spelled out with
    number_to_shanghainese(), and the result is passed through the
    module-level OpenCC ``converter``; '-' is removed and '$' becomes a space
    in its output.  Any capital Latin letter still present is replaced by its
    letter-name pronunciation plus a space.  List/clause marks become ', ',
    and '。', question marks and exclamation marks become '. ', '? ' and '! '.

    Args:
        text: Input string.

    Returns:
        The converted string with trailing whitespace removed.
    '''
    text = number_to_shanghainese(text.upper())
    text = converter.convert(text).replace('-', '').replace('$', ' ')
    text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group()) + ' ', text)
    # Accept both the ASCII marks and their fullwidth forms (written as
    # \uXXXX escapes so the pattern survives width normalization).
    text = re.sub(r'[、;:\uff1b\uff1a]', ',', text)
    text = re.sub(r'\s*[,\uff0c]\s*', ', ', text)
    text = re.sub(r'\s*。\s*', '. ', text)
    # The question mark must sit in a character class: a bare '?' after
    # '\s*' is the lazy-quantifier suffix, and '\s*?\s*' matches the empty
    # string at every position.
    text = re.sub(r'\s*[?\uff1f]\s*', '? ', text)
    text = re.sub(r'\s*[!\uff01]\s*', '! ', text)
    text = re.sub(r'\s*$', '', text)
    return text
|
text/symbols.py
CHANGED
@@ -1,15 +1,18 @@
|
|
1 |
'''
|
2 |
Defines the set of symbols used in text input to the model.
|
3 |
'''
|
|
|
|
|
4 |
_pad = '_'
|
5 |
-
_punctuation = '
|
6 |
-
_letters = '
|
7 |
'''
|
|
|
8 |
# japanese_cleaners2
|
9 |
_pad = '_'
|
10 |
_punctuation = ',.!?-~…'
|
11 |
_letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧʦ↓↑ '
|
12 |
-
|
13 |
|
14 |
'''# korean_cleaners
|
15 |
_pad = '_'
|
@@ -23,6 +26,11 @@ _punctuation = ',。!?—…'
|
|
23 |
_letters = 'ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩˉˊˇˋ˙ '
|
24 |
'''
|
25 |
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
'''# sanskrit_cleaners
|
28 |
_pad = '_'
|
@@ -57,7 +65,7 @@ _letters = 'abdfghiklmnopstuvyzøŋȵɑɔɕəɤɦɪɿʑʔʰ̩̃ᴀᴇ15678 '
|
|
57 |
'''# chinese_dialect_cleaners
|
58 |
_pad = '_'
|
59 |
_punctuation = ',.!?~…─'
|
60 |
-
_letters = '#Nabdefghijklmnoprstuvwxyz
|
61 |
'''
|
62 |
|
63 |
# Export all symbols:
|
|
|
1 |
'''
|
2 |
Defines the set of symbols used in text input to the model.
|
3 |
'''
|
4 |
+
|
5 |
+
'''# japanese_cleaners
|
6 |
_pad = '_'
|
7 |
+
_punctuation = ',.!?-'
|
8 |
+
_letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧ↓↑ '
|
9 |
'''
|
10 |
+
|
11 |
# japanese_cleaners2
|
12 |
_pad = '_'
|
13 |
_punctuation = ',.!?-~…'
|
14 |
_letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧʦ↓↑ '
|
15 |
+
|
16 |
|
17 |
'''# korean_cleaners
|
18 |
_pad = '_'
|
|
|
26 |
_letters = 'ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩˉˊˇˋ˙ '
|
27 |
'''
|
28 |
|
29 |
+
'''# zh_ja_mixture_cleaners
|
30 |
+
_pad = '_'
|
31 |
+
_punctuation = ',.!?-~…'
|
32 |
+
_letters = 'AEINOQUabdefghijklmnoprstuvwyzʃʧʦɯɹəɥ⁼ʰ`→↓↑ '
|
33 |
+
'''
|
34 |
|
35 |
'''# sanskrit_cleaners
|
36 |
_pad = '_'
|
|
|
65 |
'''# chinese_dialect_cleaners
|
66 |
_pad = '_'
|
67 |
_punctuation = ',.!?~…─'
|
68 |
+
_letters = '#Nabdefghijklmnoprstuvwxyzæçøŋœȵɐɑɒɓɔɕɗɘəɚɛɜɣɤɦɪɭɯɵɷɸɻɾɿʂʅʊʋʌʏʑʔʦʮʰʷˀː˥˦˧˨˩̥̩̃̚ᴀᴇ↑↓∅ⱼ '
|
69 |
'''
|
70 |
|
71 |
# Export all symbols:
|
text/thai.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
from num_thai.thainumbers import NumThai
|
3 |
+
|
4 |
+
|
5 |
+
# Shared NumThai instance; num_to_thai() calls its NumberToTextThai() method.
num = NumThai()
|
6 |
+
|
7 |
+
# List of (Latin alphabet, Thai) pairs:
|
8 |
+
# Each entry pairs a case-insensitive pattern for one Latin letter with the
# Thai string latin_to_thai() substitutes for it.
_latin_to_thai = [
    (re.compile(letter, re.IGNORECASE), thai) for letter, thai in (
        ('a', 'เอ'),
        ('b','บี'),
        ('c','ซี'),
        ('d','ดี'),
        ('e','อี'),
        ('f','เอฟ'),
        ('g','จี'),
        ('h','เอช'),
        ('i','ไอ'),
        ('j','เจ'),
        ('k','เค'),
        ('l','แอล'),
        ('m','เอ็ม'),
        ('n','เอ็น'),
        ('o','โอ'),
        ('p','พี'),
        ('q','คิว'),
        ('r','แอร์'),
        ('s','เอส'),
        ('t','ที'),
        ('u','ยู'),
        ('v','วี'),
        ('w','ดับเบิลยู'),
        ('x','เอ็กซ์'),
        ('y','วาย'),
        ('z','ซี'),
    )
]
|
36 |
+
|
37 |
+
|
38 |
+
def num_to_thai(text):
    '''Replace each number in ``text`` (digits with optional ',' separators
    and an optional decimal part) by the joined output of
    ``num.NumberToTextThai`` for its float value.'''
    def _spell(match):
        value = float(match.group(0).replace(',', ''))
        return ''.join(num.NumberToTextThai(value))

    return re.sub(r'(?:\d+(?:,?\d+)?)+(?:\.\d+(?:,?\d+)?)?', _spell, text)
|
40 |
+
|
41 |
+
def latin_to_thai(text):
    '''Apply every (pattern, Thai) pair of ``_latin_to_thai`` to ``text`` in
    table order and return the result.'''
    for pattern, thai in _latin_to_thai:
        text = pattern.sub(thai, text)
    return text
|