Yurii Paniv committed on
Commit
efc4c8b
1 Parent(s): f5aefe9

Add data logging

Browse files
Files changed (4) hide show
  1. README.md +1 -0
  2. app.py +28 -0
  3. data_logger.py +41 -0
  4. requirements-dev.txt +1 -0
README.md CHANGED
@@ -18,6 +18,7 @@ Text-to-Speech for Crimean Tatar language
18
  Source code: https://github.com/robinhad/qirimtatar-tts
19
  Online demo: https://huggingface.co/spaces/robinhad/qirimtatar-tts
20
  You're welcome to join UA Speech Recognition and Synthesis community: Telegram https://t.me/speech_recognition_uk
 
21
 
22
  ## Examples
23
  Test sentence:
 
18
  Source code: https://github.com/robinhad/qirimtatar-tts
19
  Online demo: https://huggingface.co/spaces/robinhad/qirimtatar-tts
20
  You're welcome to join UA Speech Recognition and Synthesis community: Telegram https://t.me/speech_recognition_uk
21
+ Note: demo input is saved to improve the Text-to-Speech engine and the demo experience. By using this demo, you consent to this.
22
 
23
  ## Examples
24
  Test sentence:
app.py CHANGED
@@ -1,3 +1,8 @@
 
 
 
 
 
1
  import gradio as gr
2
  from crh_transliterator.transliterator import transliterate
3
  from crh_preprocessor.preprocessor import preprocess
@@ -18,6 +23,27 @@ class VoiceOption(Enum):
18
  # Abibulla = "Абібулла (чоловічий) 👨"
19
 
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  print(f"CUDA available? {is_available()}")
22
 
23
 
@@ -42,6 +68,8 @@ def tts(text: str, voice: str):
42
  }
43
 
44
  speaker_name = voice_mapping[voice]
 
 
45
  text_limit = 7200
46
  text = (
47
  text if len(text) < text_limit else text[0:text_limit]
 
1
+ from os import getenv
2
+ from queue import Queue
3
+ from threading import Thread
4
+ from time import sleep
5
+ from data_logger import log_data
6
  import gradio as gr
7
  from crh_transliterator.transliterator import transliterate
8
  from crh_preprocessor.preprocessor import preprocess
 
23
  # Abibulla = "Абібулла (чоловічий) 👨"
24
 
25
 
26
def check_thread(logging_queue: Queue):
    """Periodically flush queued log records to the Hugging Face dataset.

    Runs forever. Every 60 seconds it drains ``logging_queue`` and passes
    the drained records, as a single batch, to the callback returned by
    ``log_data``. If the callback raises, the whole batch is put back on
    the queue so that the next cycle retries it.

    Args:
        logging_queue: Queue holding the records that wait to be uploaded.
    """
    logging_callback = log_data(
        hf_token=getenv("HF_API_TOKEN"),
        dataset_name="crh-tts-output",
        private=False,
    )
    while True:
        sleep(60)

        # Collect everything that accumulated since the previous cycle.
        batch = []
        while not logging_queue.empty():
            batch.append(logging_queue.get())

        if not batch:
            continue

        try:
            logging_callback(batch)
        except Exception as error:
            # ``except Exception`` instead of a bare ``except`` so that
            # SystemExit / KeyboardInterrupt are never swallowed, and the
            # cause is reported instead of being silently discarded.
            print("Error happened while pushing data to HF. Puttting items back in queue...")
            print(f"Cause: {error!r}")
            # NOTE(review): if the callback already appended these rows to
            # its local file before failing, the retry may write them
            # twice - confirm against data_logger.
            for item in batch:
                logging_queue.put(item)
41
+
42
# Start the background uploader only when a Hugging Face token is configured.
if getenv("HF_API_TOKEN") is not None:
    log_queue = Queue()
    # The worker loops forever, so it must be a daemon thread; a non-daemon
    # thread would keep the interpreter alive after the main thread ends.
    t = Thread(target=check_thread, args=(log_queue,), daemon=True)
    t.start()
46
+
47
  print(f"CUDA available? {is_available()}")
48
 
49
 
 
68
  }
69
 
70
  speaker_name = voice_mapping[voice]
71
+ if getenv("HF_API_TOKEN") is not None:
72
+ log_queue.put([text, speaker_name, str(datetime.utcnow())])
73
  text_limit = 7200
74
  text = (
75
  text if len(text) < text_limit else text[0:text_limit]
data_logger.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from gradio import utils
2
+ import os
3
+ import csv
4
+ import huggingface_hub
5
+
6
def log_data(hf_token: str, dataset_name: str, private=True):
    """Build a callback that appends rows to ``data.csv`` in a HF dataset repo.

    Creates the dataset repository (``exist_ok=True``), clones it into
    ``flagged/<dataset_name>`` and pulls it once.

    Args:
        hf_token: Token passed to ``huggingface_hub`` for repo creation and
            cloning.
        dataset_name: Name of the dataset repository; also used as the name
            of the local clone directory.
        private: Forwarded to ``create_repo`` as the ``private`` flag.

    Returns:
        A function ``log_function(data)`` that pulls the repo, appends every
        row of ``data`` to ``data.csv``, pushes the result, and returns the
        number of CSV records in the file minus one.
    """
    repo_url = huggingface_hub.create_repo(
        name=dataset_name,
        token=hf_token,
        private=private,
        repo_type="dataset",
        exist_ok=True,
    )
    local_clone = os.path.join("flagged", dataset_name)
    repo = huggingface_hub.Repository(
        local_dir=local_clone,
        clone_from=repo_url,
        use_auth_token=hf_token,
    )
    repo.git_pull(lfs=True)
    csv_path = os.path.join(local_clone, "data.csv")

    def log_function(data):
        # Sync with the remote before appending to the local copy.
        repo.git_pull(lfs=True)

        with open(csv_path, "a", newline="", encoding="utf-8") as sink:
            csv.writer(sink).writerows(
                utils.sanitize_list_for_csv(record) for record in data
            )

        # Re-read the file to count its records.
        with open(csv_path, "r", encoding="utf-8") as source:
            total_records = sum(1 for _ in csv.reader(source))
        # NOTE(review): the "- 1" presumably discounts a header row, but
        # this function never writes one - confirm against the dataset.
        line_count = total_records - 1

        repo.push_to_hub(commit_message="Flagged sample #{}".format(line_count))

        return line_count

    return log_function
41
+
requirements-dev.txt CHANGED
@@ -1,3 +1,4 @@
1
  -r requirements.txt
2
  -r requirements-test.txt
 
3
  black
 
1
  -r requirements.txt
2
  -r requirements-test.txt
3
+ huggingface_hub
4
  black