lengyue233 committed on
Commit
8dfc341
1 Parent(s): c2603d3

Update fish_speech/text/clean.py

Browse files
Files changed (1) hide show
  1. fish_speech/text/clean.py +32 -1
fish_speech/text/clean.py CHANGED
@@ -1,6 +1,8 @@
1
  import re
2
 
3
  SYMBOLS_MAPPING = {
 
 
4
  "“": "'",
5
  "”": "'",
6
  "‘": "'",
@@ -13,7 +15,19 @@ SYMBOLS_MAPPING = {
13
  ")": "",
14
  "(": "",
15
  ")": "",
16
- "・": "·",
 
 
 
 
 
 
 
 
 
 
 
 
17
  }
18
 
19
  REPLACE_SYMBOL_REGEX = re.compile(
@@ -21,6 +35,17 @@ REPLACE_SYMBOL_REGEX = re.compile(
21
  )
22
 
23
 
 
 
 
 
 
 
 
 
 
 
 
24
  def clean_text(text):
25
  # Clean the text
26
  text = text.strip()
@@ -28,4 +53,10 @@ def clean_text(text):
28
  # Replace all chinese symbols with their english counterparts
29
  text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
30
 
 
 
 
 
 
 
31
  return text
 
1
  import re
2
 
3
  SYMBOLS_MAPPING = {
4
+ "\n": "",
5
+ "…": ".",
6
  "“": "'",
7
  "”": "'",
8
  "‘": "'",
 
15
  ")": "",
16
  "(": "",
17
  ")": "",
18
+ "・": "",
19
+ "·": "",
20
+ "「": "'",
21
+ "」": "'",
22
+ "《": "'",
23
+ "》": "'",
24
+ "—": "",
25
+ "~": "",
26
+ "~": "",
27
+ ":": ",",
28
+ ";": ",",
29
+ ";": ",",
30
+ ":": ",",
31
  }
32
 
33
  REPLACE_SYMBOL_REGEX = re.compile(
 
35
  )
36
 
37
 
38
+ EMOJI_REGEX = re.compile(
39
+ "["
40
+ "\U0001F600-\U0001F64F" # emoticons
41
+ "\U0001F300-\U0001F5FF" # symbols & pictographs
42
+ "\U0001F680-\U0001F6FF" # transport & map symbols
43
+ "\U0001F1E0-\U0001F1FF" # flags (iOS)
44
+ "]+",
45
+ flags=re.UNICODE,
46
+ )
47
+
48
+
49
  def clean_text(text):
50
  # Clean the text
51
  text = text.strip()
 
53
  # Replace all chinese symbols with their english counterparts
54
  text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
55
 
56
+ # Remove emojis
57
+ text = EMOJI_REGEX.sub(r"", text)
58
+
59
+ # Remove continuous periods (...) and commas (,,,)
60
+ text = re.sub(r"[.,]{2,}", lambda m: m.group()[0], text)
61
+
62
  return text