Yurii Paniv committed on
Commit
a2689f4
1 Parent(s): 2bd2514

Initial numbers

Browse files
crh_preprocessor/preprocessor.py CHANGED
@@ -13,7 +13,72 @@ mapping = {
13
  "w": "v",
14
  "x": "ks"
15
  }
16
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  def preprocess(text):
19
  text = text.lower() # always treat lowercase
@@ -26,21 +91,42 @@ def preprocess(text):
26
  for symbol in separators:
27
  text = text.replace(symbol, ".")
28
 
29
- numbers = {
30
- "0": "sıfır",
31
- "1": "bir",
32
- "2": "eki",
33
- "3": "üç",
34
- "4": "dört",
35
- "5": "beş",
36
- "6": "altı",
37
- "7": "yedi",
38
- "8": "sekiz",
39
- "9": "doquz",
40
- }
41
-
42
- for number in numbers.keys():
43
- text = text.replace(number, numbers[number] + " ")
44
-
45
- return text[1:-1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
 
13
  "w": "v",
14
  "x": "ks"
15
  }
16
+
17
# Word for the digit 0.  Stored separately from ``numbers_map``;
# ``spell_numbers`` merges the two tables when it spells digit by digit.
zero = {
    0: 'sıfır',
}

# Words for the single digits, the tens, and the scale values
# (hundred, thousand, million, milliard) from which ``num2word``
# assembles larger numbers.
numbers_map = {
    1: 'bir',
    2: 'eki',
    3: 'üç',
    4: 'dört',
    5: 'beş',
    6: 'altı',
    7: 'yedi',
    8: 'sekiz',
    9: 'doquz',
    10: 'on',
    20: 'yigirmi',
    30: 'otuz',
    40: 'qırq',
    50: 'elli',
    60: 'altmış',
    70: 'yetmiş',
    80: 'seksen',
    90: 'doqsan',
    100: 'yüz',
    1000: 'biñ',
    1_000_000: 'million',
    1_000_000_000: 'milliard',
}
45
+
46
+
47
def spell_numbers(numbers: str) -> str:
    """Spell the digits of ``numbers`` one at a time.

    Each ASCII digit is replaced by its word (taken from ``numbers_map``
    and ``zero``) followed by a single space; every other character is
    copied through unchanged.  Leading and trailing whitespace is removed
    from the result.
    """
    words_by_value = {**numbers_map, **zero}
    # Key the table by digit character so the text is handled in one pass.
    spoken = {str(digit): words_by_value[digit] + ' ' for digit in range(10)}
    return ''.join(spoken.get(char, char) for char in numbers).strip()
52
+
53
+
54
# Scale values tried from largest to smallest when splitting a number.
_NUM2WORD_SCALES = (1_000_000_000, 1_000_000, 1_000, 100)


def _num2word_positive(n):
    """Spell out an integer ``n`` with ``0 < n < 10**12``."""
    if n in numbers_map:
        return numbers_map[n]
    if n < 100:
        # Single digits and exact multiples of ten were returned by the
        # lookup above, so both the tens and the units part are non-zero.
        tens, units = divmod(n, 10)
        return numbers_map[tens * 10] + ' ' + numbers_map[units]
    for scale in _NUM2WORD_SCALES:
        if n >= scale:
            count, rest = divmod(n, scale)
            parts = [_num2word_positive(count), numbers_map[scale]]
            if rest:
                # A zero remainder contributes no words.
                parts.append(_num2word_positive(rest))
            return ' '.join(parts)


def num2word(n):
    """Return the integer ``n`` written out in words.

    Values that are keys of ``numbers_map`` are returned as the bare word
    (so ``1_000_000`` gives ``'million'``); other values are composed from
    a multiplier, the scale word and the remainder.

    Args:
        n: Integer to spell out.

    Returns:
        The spelled-out number.  Zero yields the word stored in ``zero``;
        negative values are spelled as ``'minus '`` followed by the
        spelling of the absolute value; values of 10**12 and above are
        spelled digit by digit via ``spell_numbers``.
    """
    if n < 0:
        # A negative value used to reach the tens/units lookup and raise
        # KeyError; spell the magnitude and prefix it instead.
        return 'minus ' + num2word(-n)
    if n == 0:
        # Zero is not a key of numbers_map and used to produce ''.
        return zero[0]
    if n >= 1_000_000_000_000:
        # No scale word beyond milliard is available.
        return spell_numbers(str(n))
    return _num2word_positive(n)
81
+
82
 
83
  def preprocess(text):
84
  text = text.lower() # always treat lowercase
 
91
  for symbol in separators:
92
  text = text.replace(symbol, ".")
93
 
94
+ while True:
95
+ number_match = re.search("-?\d+(\.|,)?(\d+)?", text)
96
+
97
+ if number_match is None:
98
+ break
99
+
100
+ print(number_match.string, number_match.start(), number_match.end())
101
+
102
+ number = number_match.string.strip()
103
+
104
+ prefix = ""
105
+
106
+ if number.startswith("-"):
107
+ prefix = "minus "
108
+ number = number.replace("-", "", 1)
109
+ elif number.startswith("+"):
110
+ prefix = "plüs "
111
+ number = number.replace("+", "", 1)
112
+
113
+ if "." in number:
114
+ number = number.split(".")
115
+ number = prefix + " noqta ".join((num2word(int(number[0])) if int(number[0]) != 0 else spell_numbers(number[0]), spell_numbers(number[1])))
116
+ text = text.replace(number_match.string.strip(), number, 1)
117
+ continue
118
+ elif "," in number:
119
+ number = number.split(",")
120
+ number = prefix + " virgül ".join((num2word(int(number[0])) if int(number[0]) != 0 else spell_numbers(number[0]), spell_numbers(number[1])))
121
+ text = text.replace(number_match.string.strip(), number, 1)
122
+ continue
123
+
124
+ if number.startswith("0"):
125
+ text = text.replace(number_match.string.strip(), prefix + spell_numbers(number), 1)
126
+ continue
127
+
128
+ text = text.replace(number_match.string.strip(), prefix + num2word(int(number)), 1)
129
+
130
+
131
+ return text.strip()
132
 
requirements-dev.txt DELETED
@@ -1,4 +0,0 @@
1
- -r requirements.txt
2
- -r requirements-test.txt
3
- huggingface_hub
4
- black
 
 
 
 
 
requirements-test.txt CHANGED
@@ -1,3 +1,5 @@
 
1
  pytest==7.1.3
2
  pytest-cov==4.0.0
3
- tabulate==0.8.10
 
 
1
+ -r requirements.txt
2
  pytest==7.1.3
3
  pytest-cov==4.0.0
4
+ tabulate==0.8.10
5
+ black
requirements.txt CHANGED
@@ -1,3 +1,4 @@
1
  gradio==3.6
2
  torch>=1.13
3
- TTS==0.9.0
 
 
1
  gradio==3.6
2
  torch>=1.13
3
+ TTS==0.9.0
4
+ huggingface_hub
tests/test_preprocessor.py CHANGED
@@ -1,7 +1,70 @@
1
- from crh_preprocessor.preprocessor import preprocess
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
 
4
  def test_preprocessor():
5
  assert (
6
  preprocess("İşanç Alla-Taalâğa.") == "işan\u04ab alla-taalâğa."
7
  ) # first i is two symbols (i without dot and dot)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from crh_preprocessor.preprocessor import preprocess, num2word
2
+
3
+
4
def test_num2word():
    """Check num2word against a few hand-written spellings."""
    expectations = (
        (16, "on altı"),
        (
            1324759813,
            "bir milliard üç yüz yigirmi dört million yedi yüz elli doquz biñ sekiz yüz on üç",
        ),
        (1_000_000, "million"),
    )
    for value, spelled in expectations:
        assert num2word(value) == spelled
14
 
15
 
16
  def test_preprocessor():
17
  assert (
18
  preprocess("İşanç Alla-Taalâğa.") == "işan\u04ab alla-taalâğa."
19
  ) # first i is two symbols (i without dot and dot)
20
+ assert (
21
+ preprocess("1000000") == "million"
22
+ )
23
+ assert (
24
+ preprocess("1324700000") == "bir milliard üç yüz yigirmi dört million yedi yüz biñ"
25
+ )
26
+ assert (
27
+ preprocess("1000002") == "bir million eki"
28
+ )
29
+ assert (
30
+ preprocess("16") == "on altı"
31
+ )
32
+ assert (
33
+ preprocess("001") == "sıfır sıfır bir"
34
+ )
35
+ assert (
36
+ preprocess("00") == "sıfır sıfır"
37
+ )
38
+ assert (
39
+ preprocess("10.02") == "on noqta sıfır eki"
40
+ )
41
+ assert (
42
+ preprocess("0.01") == "sıfır noqta sıfır bir"
43
+ )
44
+ assert (
45
+ preprocess("0,01") == "sıfır virgül sıfır bir"
46
+ )
47
+ assert (
48
+ preprocess("00,01") == "sıfır sıfır virgül sıfır bir"
49
+ )
50
+ assert (
51
+ preprocess("-10") == "minus on"
52
+ )
53
+ assert (
54
+ preprocess("+10") == "plüs on"
55
+ )
56
+ assert (
57
+ preprocess("+10.1400") == "plüs on noqta bir dört sıfır sıfır"
58
+ )
59
+ assert (
60
+ preprocess("-10.14156") == "minus on noqta bir dört bir beş altı"
61
+ )
62
+ assert (
63
+ preprocess("10,14156") == "on virgül bir dört bir beş altı"
64
+ )
65
+ assert (
66
+ preprocess("1, 2, 3, 4, 5, 6,7") == "bir virgül eki virgül"
67
+ )
68
+ assert (
69
+ preprocess("1,2,3,4,5,6,7") == "on altı"
70
+ )