Spaces:
Sleeping
Sleeping
Yurii Paniv
committed on
Commit
•
e0a3506
1
Parent(s):
13aac28
Implement recording functionality
Browse files- .gitignore +2 -0
- client.py +119 -0
- main.py +11 -3
- requirements.txt +2 -1
- templates/hello.html +65 -31
.gitignore
CHANGED
@@ -127,3 +127,5 @@ dmypy.json
|
|
127 |
|
128 |
# Pyre type checker
|
129 |
.pyre/
|
|
|
|
|
|
127 |
|
128 |
# Pyre type checker
|
129 |
.pyre/
|
130 |
+
|
131 |
+
*.tflite
|
client.py
ADDED
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
from __future__ import absolute_import, division, print_function
|
4 |
+
|
5 |
+
import argparse
|
6 |
+
import numpy as np
|
7 |
+
import shlex
|
8 |
+
import subprocess
|
9 |
+
import sys
|
10 |
+
import wave
|
11 |
+
import json
|
12 |
+
|
13 |
+
from deepspeech import Model, version
|
14 |
+
from timeit import default_timer as timer
|
15 |
+
|
16 |
+
try:
|
17 |
+
from shhlex import quote
|
18 |
+
except ImportError:
|
19 |
+
from pipes import quote
|
20 |
+
|
21 |
+
|
22 |
+
def convert_samplerate(audio_path, desired_sample_rate):
    """Resample an audio file to ``desired_sample_rate`` using SoX.

    Parameters
    ----------
    audio_path : str
        Path to the source audio file on disk.
    desired_sample_rate : int
        Target sample rate in Hz (the rate the model expects).

    Returns
    -------
    tuple[int, numpy.ndarray]
        The produced sample rate and the mono 16-bit PCM samples.

    Raises
    ------
    RuntimeError
        If SoX exits with a non-zero status.
    OSError
        If the SoX binary is not installed.
    """
    # Build the command as an argument list: subprocess runs it without a
    # shell, so no quote()/shlex.split() round-trip is needed and the file
    # path can never be mis-tokenized, whatever characters it contains.
    sox_cmd = [
        'sox', audio_path,
        '--type', 'raw',
        '--bits', '16',
        '--channels', '1',
        '--rate', str(desired_sample_rate),
        '--encoding', 'signed-integer',
        '--endian', 'little',
        '--compression', '0.0',
        '--no-dither',
        '-',  # write raw samples to stdout
    ]
    try:
        output = subprocess.check_output(sox_cmd, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        raise RuntimeError('SoX returned non-zero status: {}'.format(e.stderr))
    except OSError as e:
        raise OSError(e.errno, 'SoX not found, use {}hz files or install it: {}'.format(
            desired_sample_rate, e.strerror))

    return desired_sample_rate, np.frombuffer(output, np.int16)
|
35 |
+
|
36 |
+
|
37 |
+
def metadata_to_string(metadata):
    """Concatenate every token's character into the plain transcript text."""
    pieces = [token.text for token in metadata.tokens]
    return ''.join(pieces)
|
39 |
+
|
40 |
+
|
41 |
+
def words_from_candidate_transcript(metadata):
    """Group a candidate transcript's character tokens into timed words.

    Parameters
    ----------
    metadata : object
        A candidate transcript whose ``tokens`` each carry a single
        character ``text`` and a ``start_time`` in seconds.

    Returns
    -------
    list[dict]
        One dict per word with keys ``word``, ``start_time`` and
        ``duration`` (times rounded to 4 decimal places).
    """
    word = ""
    word_list = []
    word_start_time = 0
    # Loop through each character
    for i, token in enumerate(metadata.tokens):
        # Append character to word if it's not a space
        if token.text != " ":
            if len(word) == 0:
                # Log the start time of the new word
                word_start_time = token.start_time

            word = word + token.text
        # Word boundary is either a space or the last character in the array
        if token.text == " " or i == len(metadata.tokens) - 1:
            word_duration = token.start_time - word_start_time

            if word_duration < 0:
                word_duration = 0

            each_word = dict()
            each_word["word"] = word
            # BUG FIX: this key used to be "start_time " (trailing space),
            # which leaked into the JSON output; consumers expect "start_time".
            each_word["start_time"] = round(word_start_time, 4)
            each_word["duration"] = round(word_duration, 4)

            word_list.append(each_word)
            # Reset accumulators for the next word
            word = ""
            word_start_time = 0

    return word_list
|
72 |
+
|
73 |
+
|
74 |
+
def metadata_json_output(metadata):
    """Serialize every candidate transcript of *metadata* as a JSON string."""
    transcripts = []
    for transcript in metadata.transcripts:
        transcripts.append({
            "confidence": transcript.confidence,
            "words": words_from_candidate_transcript(transcript),
        })
    return json.dumps({"transcripts": transcripts}, indent=2)
|
81 |
+
|
82 |
+
|
83 |
+
class VersionAction(argparse.Action):
    """Argparse action that prints the DeepSpeech version and exits."""

    def __init__(self, *args, **kwargs):
        # The flag alone triggers the action, so it consumes no values.
        kwargs['nargs'] = 0
        super(VersionAction, self).__init__(*args, **kwargs)

    def __call__(self, *args, **kwargs):
        print('DeepSpeech ', version())
        exit(0)
|
90 |
+
|
91 |
+
|
92 |
+
def client(audio_file):
    """Transcribe a WAV source with the Ukrainian DeepSpeech model.

    Parameters
    ----------
    audio_file : str | file-like
        Anything ``wave.open`` accepts: a filesystem path or a seekable
        binary stream positioned at the start of WAV data.

    Returns
    -------
    str
        The recognized text.
    """
    model_load_start = timer()
    # Load the model once and cache it on the function object: model
    # construction is expensive and this function runs per web request
    # (see main.py's /recognize endpoint).
    if not hasattr(client, "_model"):
        # sphinx-doc: python_ref_model_start
        client._model = Model("./uk.tflite")
        # sphinx-doc: python_ref_model_stop
    ds = client._model
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    desired_sample_rate = ds.sampleRate()

    # Context manager guarantees the wave reader is closed even if a
    # read fails part-way through.
    with wave.open(audio_file, 'rb') as fin:
        fs_orig = fin.getframerate()
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        audio_length = fin.getnframes() * (1 / fs_orig)

    if fs_orig != desired_sample_rate:
        # The model expects desired_sample_rate; other rates degrade
        # recognition. Only warn here: the input may be an in-memory
        # stream, which the SoX-based convert_samplerate() (path-only)
        # cannot resample.
        print('Warning: original sample rate ({}) differs from the model '
              'rate ({}); recognition quality may suffer.'.format(
                  fs_orig, desired_sample_rate), file=sys.stderr)

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    # sphinx-doc: python_ref_inference_start
    result = ds.stt(audio)
    print(result)
    # sphinx-doc: python_ref_inference_stop
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length), file=sys.stderr)
    return result
|
main.py
CHANGED
@@ -1,5 +1,8 @@
|
|
1 |
-
from flask import Flask, render_template
|
2 |
-
|
|
|
|
|
|
|
3 |
|
4 |
|
5 |
@app.route('/')
|
@@ -9,4 +12,9 @@ def index():
|
|
9 |
|
10 |
@app.route('/recognize', methods=["POST"])
|
11 |
def recognize():
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from flask import Flask, render_template, request
|
2 |
+
from io import BytesIO
|
3 |
+
from client import client
|
4 |
+
|
5 |
+
app = Flask(__name__,)
|
6 |
|
7 |
|
8 |
@app.route('/')
|
|
|
12 |
|
13 |
@app.route('/recognize', methods=["POST"])
def recognize():
    """Accept a WAV upload under form field 'file' and return its transcript."""
    upload = request.files['file']
    # Copy the upload into a seekable in-memory buffer for wave.open().
    buffer = BytesIO()
    upload.save(buffer)
    buffer.seek(0)
    return client(buffer)
|
requirements.txt
CHANGED
@@ -1,2 +1,3 @@
|
|
1 |
Flask==1.1.2
|
2 |
-
deepspeech-tflite==0.7.3
|
|
|
|
1 |
Flask==1.1.2
|
2 |
+
deepspeech-tflite==0.7.3
|
3 |
+
numpy==1.17.0
|
templates/hello.html
CHANGED
@@ -13,49 +13,83 @@
|
|
13 |
<h1>Audio Recording Test</h1>
|
14 |
<p>Talk for 3 seconds, then you will hear your recording played back</p>
|
15 |
<button class="btn btn-primary" id="action" onclick="handleAction()">Start recording...</button>
|
|
|
|
|
16 |
<script>
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
const mediaRecorder = new MediaRecorder(stream, { audioBitsPerSecond: 16000 });
|
21 |
-
const audioChunks = [];
|
22 |
|
23 |
-
|
24 |
-
|
25 |
-
|
|
|
26 |
|
27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
|
29 |
-
|
30 |
-
|
31 |
-
mediaRecorder.addEventListener("stop", () => {
|
32 |
-
const audioBlob = new Blob(audioChunks);
|
33 |
|
34 |
-
|
35 |
-
|
36 |
-
.then(response => console.log(response.text()))
|
37 |
-
const audio = new Audio(audioUrl);
|
38 |
-
const play = () => audio.play();
|
39 |
-
resolve({ audioBlob, audioUrl, play });
|
40 |
-
});
|
41 |
|
42 |
-
|
43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
|
45 |
-
resolve({ start, stop });
|
46 |
-
});
|
47 |
|
48 |
const sleep = time => new Promise(resolve => setTimeout(resolve, time));
|
49 |
|
50 |
-
|
51 |
-
const recorder = await recordAudio();
|
52 |
const actionButton = document.getElementById('action');
|
53 |
actionButton.disabled = true;
|
54 |
-
|
55 |
-
await sleep(3000);
|
56 |
-
const audio = await recorder.stop();
|
57 |
-
audio.play();
|
58 |
-
await sleep(3000);
|
59 |
actionButton.disabled = false;
|
60 |
}
|
61 |
</script>
|
|
|
13 |
<h1>Audio Recording Test</h1>
|
14 |
<p>Talk for 3 seconds, then you will hear your recording played back</p>
|
15 |
<button class="btn btn-primary" id="action" onclick="handleAction()">Start recording...</button>
|
16 |
+
<div id="result"></div>
|
17 |
+
<script src="https://cdn.rawgit.com/mattdiamond/Recorderjs/08e7abd9/dist/recorder.js"></script>
|
18 |
<script>
|
19 |
+
var gumStream; //stream from getUserMedia()
|
20 |
+
var rec; //Recorder.js object
|
21 |
+
var input; //MediaStreamAudioSourceNode we'll be recording
|
|
|
|
|
22 |
|
23 |
+
// shim for AudioContext when it's not avb.
|
24 |
+
var AudioContext = window.AudioContext || window.webkitAudioContext;
|
25 |
+
var audioContext; //audio context to help us record
|
26 |
+
var resultNode = document.getElementById('result');
|
27 |
|
28 |
+
function resultProcess(data) {
    // Render the server's transcript into the #result div.
    // Labels are Ukrainian: "Довжина тексту" = text length, "Текст" = text.
    // NOTE(review): whitespace inside the template literal is shown
    // verbatim by textContent — confirm the intended layout.
    resultNode.textContent = `Довжина тексту: ${data.length} \n
    Текст: ${data}
    `
}
|
33 |
+
|
34 |
+
function exportWAV(blob) {
    // POST the recorded WAV to the server, then display the transcript.
    const formData = new FormData();
    formData.append('file', blob);
    fetch(`./recognize`, { method: "POST", body: formData })
        .then(response => response.text())
        .then(resultProcess);
}
|
41 |
+
function record() {
    // Ask for microphone access, start a mono Recorder.js capture, and
    // schedule an automatic stop after 3 seconds.
    // Returns the getUserMedia promise chain so callers MAY await it;
    // existing callers that ignore the return value are unaffected.
    var constraints = { audio: true, video: false }
    return navigator.mediaDevices.getUserMedia(constraints).then(function (stream) {
        console.log("getUserMedia() success, stream created, initializing Recorder.js ...");

        /*
            create an audio context after getUserMedia is called
            sampleRate might change after getUserMedia is called, like it does on macOS when recording through AirPods
            the sampleRate defaults to the one set in your OS for your playback device
        */
        audioContext = new AudioContext();

        /* assign to gumStream for later use */
        gumStream = stream;

        /* use the stream */
        input = audioContext.createMediaStreamSource(stream);

        /*
            Create the Recorder object and configure to record mono sound (1 channel)
            Recording 2 channels will double the file size
        */
        rec = new Recorder(input, { numChannels: 1 })

        //start the recording process
        rec.record()

        console.log("Recording started");
        return sleep(3000).then(stop);
    }).catch(function (err) {
        // BUG FIX: a denied microphone permission used to fail silently
        // as an unhandled rejection; surface it for debugging.
        console.error("getUserMedia() failed:", err);
    });
}
|
72 |
+
|
73 |
+
|
74 |
+
function stop() {
    // Stop capturing samples first so the WAV contains everything recorded.
    rec.stop();

    // Release the microphone (also removes the browser's recording indicator).
    const tracks = gumStream.getAudioTracks();
    tracks[0].stop();

    // Encode the captured audio as a WAV blob and hand it to the uploader.
    rec.exportWAV(exportWAV);
    console.log("Recording stopped")
}
|
85 |
|
|
|
|
|
86 |
|
87 |
const sleep = time => new Promise(resolve => setTimeout(resolve, time));
|
88 |
|
89 |
+
async function handleAction() {
    // Start a 3-second recording, keeping the button disabled meanwhile.
    const actionButton = document.getElementById('action');
    actionButton.disabled = true;
    record();
    // BUG FIX: the button used to be re-enabled immediately, while the
    // recording was still in progress; wait out the 3-second capture first.
    await sleep(3000);
    actionButton.disabled = false;
}
|
95 |
</script>
|