balaramas committed on
Commit
8e74bf4
0 Parent(s):

Duplicate from balaramas/indic_s2t

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Indic S2t
3
+ emoji: 🌖
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 3.36.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: other
11
+ duplicated_from: balaramas/indic_s2t
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio app that translates recorded English speech into text in a selected Indic language (Hindi, Gujarati, Bengali, Tamil, Nepali or Marathi)
3
+ Usage : python app.py (launches the web interface; the recording and the target language are chosen in the UI)
4
+ """
5
+
6
+
7
+
8
+ import gradio as gr
9
+ import sys
10
+ import os
11
+ import subprocess
12
+ from pydub import AudioSegment
13
+ from huggingface_hub import snapshot_download
14
+
15
def install_fairseq():
    """Install the packages the translation pipeline needs at runtime.

    Installs ``fairseq``, ``sentencepiece`` and ``soundfile`` one after the
    other, stopping at the first failure.

    Returns:
        str: A success message when every install exits cleanly, otherwise a
        message describing the pip failure.
    """
    try:
        for package in ("fairseq", "sentencepiece", "soundfile"):
            # Invoke pip through the running interpreter so the packages are
            # installed into the environment this app actually executes in,
            # not into whichever "pip" happens to be first on PATH.
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
    except subprocess.CalledProcessError as e:
        return f"An error occurred while installing fairseq: {str(e)}"
    return "fairseq successfully installed!"
24
+
25
def convert_audio_to_16k_wav(audio_input):
    """Return the path of a mono, 16 kHz WAV version of ``audio_input``.

    Args:
        audio_input: Path (str) of the audio file to prepare.

    Returns:
        str: ``audio_input`` itself when the file is already a mono 16 kHz
        ``.wav`` file; otherwise the name of a newly exported
        ``<stem>_16k.wav`` file written to the current working directory.
    """
    sound = AudioSegment.from_file(audio_input)
    print("original file is at:", audio_input)

    needs_mono = sound.channels > 1
    needs_resample = sound.frame_rate != 16000
    # The function promises a WAV file, so other containers are re-exported
    # even when their channel count and sample rate already match.
    is_wav = audio_input.lower().endswith(".wav")

    if is_wav and not (needs_mono or needs_resample):
        # Return the full path, not just the base name: the file usually lives
        # outside the working directory and a bare name would not resolve.
        return audio_input

    if needs_mono:
        sound = sound.set_channels(1)
    if needs_resample:
        sound = sound.set_frame_rate(16000)

    # Strip only the real extension; a plain str.replace(".wav", "") would
    # also remove ".wav" occurring in the middle of the name.
    stem, _ = os.path.splitext(os.path.basename(audio_input))
    filename = f"{stem}_16k.wav"
    sound.export(filename, format="wav")
    return filename
41
+
42
+
43
# Per-language assets as (model checkpoint, fairseq data root), keyed by the
# label selected in the UI.  The keys, including the "Gujrati" spelling, must
# stay identical to the Radio choices that are passed in as ``language``.
_LANGUAGE_ASSETS = {
    "Hindi": ("./models/hi_m.pt", "./lang/hi/"),
    "Gujrati": ("./models/gj_m.pt", "./lang/gj/"),
    "Bengali": ("./models/bn_m.pt", "./lang/bn/"),
    "Nepali": ("./models/ne_m.pt", "./lang/ne/"),
    "Tamil": ("./models/tm_m.pt", "./lang/tm/"),
    "Marathi": ("./models/mt_m.pt", "./lang/mt/"),
}


def run_my_code(input_text, language):
    """Translate one recorded audio clip into text in the chosen language.

    The clip is normalised with ``convert_audio_to_16k_wav``, its path is
    written to ``input.txt`` and ``fairseq-interactive`` is run on that file
    with the checkpoint and data directory of ``language``.

    Args:
        input_text: Path of the audio file to translate (despite the name,
            this is a file path, not text).
        language: One of the keys of ``_LANGUAGE_ASSETS``.

    Returns:
        str: The third tab-separated field of the first output line starting
        with ``D-0``, or an empty string when no such line is produced.

    Raises:
        ValueError: If ``language`` is not supported or no audio path is given.
    """
    # Validate before doing any work: previously a missing or unknown language
    # launched fairseq with empty paths and silently produced no output.
    if language not in _LANGUAGE_ASSETS:
        raise ValueError(f"Unsupported target language: {language!r}")
    if not input_text:
        raise ValueError("No audio was provided to translate.")
    model_checkpoint, data_root = _LANGUAGE_ASSETS[language]

    hi_wav = convert_audio_to_16k_wav(input_text)

    # The context manager guarantees the path is flushed to disk and the
    # handle is closed before the subprocess reads the file.
    with open("input.txt", "w", encoding="utf-8") as f:
        f.write(hi_wav)
    print(hi_wav)

    print("------Performing translation...")
    try:
        translation_result = subprocess.run(
            [
                "fairseq-interactive", data_root,
                "--config-yaml", "config_st.yaml",
                "--task", "speech_to_text",
                "--path", model_checkpoint,
                "--max-tokens", "50000",
                "--beam", "5",
                "--input", "input.txt",
            ],
            capture_output=True,
            text=True,
        )
    finally:
        # Always leave input.txt empty again, even if launching fairseq fails.
        with open("input.txt", "w", encoding="utf-8"):
            pass

    if translation_result.returncode != 0:
        # Surface the failure reason in the logs instead of discarding it.
        print(translation_result.stderr, file=sys.stderr)

    output_text = ""
    print("\n\n------Translation results are:")
    for line in translation_result.stdout.split("\n"):
        if line.startswith("D-0"):
            fields = line.split("\t")
            # Guard against a malformed line rather than raising IndexError.
            if len(fields) > 2:
                output_text = fields[2]
                print(output_text)
            break
    return output_text
108
+
109
# The runtime dependencies are installed at start-up.  Print the returned
# status so that a failed install shows up in the logs instead of being
# silently discarded.
print(install_fairseq())

# Components are created from the top-level gradio namespace; the
# gr.inputs / gr.outputs namespaces used previously are deprecated.
output_textbox = gr.Textbox(label="Translated Text")

# Create a Gradio interface.  The Radio choices are passed verbatim to
# run_my_code as ``language``, so their spelling must not change.
iface = gr.Interface(
    fn=run_my_code,
    inputs=[
        gr.Audio(source="microphone", type="filepath", label="Record something (in American English accent)"),
        gr.Radio(["Hindi", "Gujrati", "Bengali", "Tamil", "Nepali", "Marathi"], label="Language"),
    ],
    outputs=output_textbox,
    title="English to Indic Language Translator",
)

# Launch the interface
iface.launch()
input.txt ADDED
File without changes
lang/bn/config_st.yaml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ bpe_tokenizer:
2
+ bpe: sentencepiece
3
+ sentencepiece_model: ./spm_unigram8000_st.model
4
+ input_channels: 1
5
+ input_feat_per_channel: 80
6
+ specaugment:
7
+ freq_mask_F: 27
8
+ freq_mask_N: 1
9
+ time_mask_N: 1
10
+ time_mask_T: 100
11
+ time_mask_p: 1.0
12
+ time_wrap_W: 0
13
+ transforms:
14
+ '*':
15
+ - utterance_cmvn
16
+ _train:
17
+ - utterance_cmvn
18
+ - specaugment
19
+ vocab_filename: spm_unigram8000_st.txt
lang/bn/spm_unigram8000_st.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:892dd0398e561af3bd035798ff1682f9a35c2736378e041922a46e111c3d7a72
3
+ size 467219
lang/bn/spm_unigram8000_st.txt ADDED
The diff for this file is too large to render. See raw diff
 
lang/gj/config_st.yaml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ bpe_tokenizer:
2
+ bpe: sentencepiece
3
+ sentencepiece_model: ./spm_unigram8000_st.model
4
+ input_channels: 1
5
+ input_feat_per_channel: 80
6
+ specaugment:
7
+ freq_mask_F: 27
8
+ freq_mask_N: 1
9
+ time_mask_N: 1
10
+ time_mask_T: 100
11
+ time_mask_p: 1.0
12
+ time_wrap_W: 0
13
+ transforms:
14
+ '*':
15
+ - utterance_cmvn
16
+ _train:
17
+ - utterance_cmvn
18
+ - specaugment
19
+ vocab_filename: spm_unigram8000_st.txt
lang/gj/spm_unigram8000_st.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af23c356de872a60a32cfd6eacd7d5313934d7252b1b4ccc011bfc6992c2e904
3
+ size 454913
lang/gj/spm_unigram8000_st.txt ADDED
The diff for this file is too large to render. See raw diff
 
lang/hi/config_st.yaml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ bpe_tokenizer:
2
+ bpe: sentencepiece
3
+ sentencepiece_model: ./spm_unigram8000_st.model
4
+ input_channels: 1
5
+ input_feat_per_channel: 80
6
+ specaugment:
7
+ freq_mask_F: 27
8
+ freq_mask_N: 1
9
+ time_mask_N: 1
10
+ time_mask_T: 100
11
+ time_mask_p: 1.0
12
+ time_wrap_W: 0
13
+ transforms:
14
+ '*':
15
+ - utterance_cmvn
16
+ _train:
17
+ - utterance_cmvn
18
+ - specaugment
19
+ vocab_filename: spm_unigram8000_st.txt
lang/hi/spm_unigram8000_st.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf7b26c17db61dcd76400fbb74c5395d5f13837ed0fd5fa1098930de4f2a8202
3
+ size 449800
lang/hi/spm_unigram8000_st.txt ADDED
The diff for this file is too large to render. See raw diff
 
lang/mt/config_st.yaml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ bpe_tokenizer:
2
+ bpe: sentencepiece
3
+ sentencepiece_model: ./spm_unigram8000_st.model
4
+ input_channels: 1
5
+ input_feat_per_channel: 80
6
+ specaugment:
7
+ freq_mask_F: 27
8
+ freq_mask_N: 1
9
+ time_mask_N: 1
10
+ time_mask_T: 100
11
+ time_mask_p: 1.0
12
+ time_wrap_W: 0
13
+ transforms:
14
+ '*':
15
+ - utterance_cmvn
16
+ _train:
17
+ - utterance_cmvn
18
+ - specaugment
19
+ vocab_filename: spm_unigram8000_st.txt
lang/mt/spm_unigram8000_st.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ba6aa66df08e7b5614deadfe3fc08d5473dc7dcf672d15134ce0e4db6dd99e1
3
+ size 458987
lang/mt/spm_unigram8000_st.txt ADDED
The diff for this file is too large to render. See raw diff
 
lang/ne/config_st.yaml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ bpe_tokenizer:
2
+ bpe: sentencepiece
3
+ sentencepiece_model: ./spm_unigram8000_st.model
4
+ input_channels: 1
5
+ input_feat_per_channel: 80
6
+ specaugment:
7
+ freq_mask_F: 27
8
+ freq_mask_N: 1
9
+ time_mask_N: 1
10
+ time_mask_T: 100
11
+ time_mask_p: 1.0
12
+ time_wrap_W: 0
13
+ transforms:
14
+ '*':
15
+ - utterance_cmvn
16
+ _train:
17
+ - utterance_cmvn
18
+ - specaugment
19
+ vocab_filename: spm_unigram8000_st.txt
lang/ne/spm_unigram8000_st.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9b431e41320a5738c0af5368d23c5071a71899c897887f06a22f2efc087dd80
3
+ size 459775
lang/ne/spm_unigram8000_st.txt ADDED
The diff for this file is too large to render. See raw diff
 
lang/tm/config_st.yaml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ bpe_tokenizer:
2
+ bpe: sentencepiece
3
+ sentencepiece_model: ./spm_unigram8000_st.model
4
+ input_channels: 1
5
+ input_feat_per_channel: 80
6
+ specaugment:
7
+ freq_mask_F: 27
8
+ freq_mask_N: 1
9
+ time_mask_N: 1
10
+ time_mask_T: 100
11
+ time_mask_p: 1.0
12
+ time_wrap_W: 0
13
+ transforms:
14
+ '*':
15
+ - utterance_cmvn
16
+ _train:
17
+ - utterance_cmvn
18
+ - specaugment
19
+ vocab_filename: spm_unigram8000_st.txt
lang/tm/spm_unigram8000_st.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8111dca119a0b896f1a2f371fbe60682b804cf1e0f99281dd4cf410ea9e8bd29
3
+ size 500276
lang/tm/spm_unigram8000_st.txt ADDED
The diff for this file is too large to render. See raw diff
 
models/bn_m.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97a2b6d13570a7296bb8530ff4a97306c643dddfa8abff9197df53d20cd8b735
3
+ size 373237256
models/de_m.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbd8f22a2b8d90dc24ba9d4fc84df3c3b0bcf711366ac93bef27e0fe2deaa6cd
3
+ size 373237192
models/fr_m.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34199f96a7194ed36bdde18cf9137df39fff82f725e57923627909c369d75433
3
+ size 373237448
models/gj_m.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a7d7a21002847ec8e16dd1737b35495574e266fdf39aecfa9bb9126d8444a62
3
+ size 373237448
models/hi_m.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47e8bfef22034ac859da3a2726b142876793113cf18ac18bb6f6eb85415a7893
3
+ size 373227272
models/mt_m.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b300d014121494e5583ac83df275038b7a5728e25c25caf2d0a566f482f33a6
3
+ size 373237192
models/ne_m.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8ac42610702980cf090b41356e4b525ac1999b147c0564d8a45605b571b3018
3
+ size 373237192
models/tm_m.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:487b409230de732f76fad7bb40581490ff207054b13dcabf8cd52d6ed1334668
3
+ size 373237448
test.wav ADDED
Binary file (141 kB). View file
 
test2.wav ADDED
Binary file (126 kB). View file