Upload source code
#1
by
Xenova
HF staff
- opened
- whisper-speaker-diarization/.eslintrc.cjs +21 -0
- whisper-speaker-diarization/.gitignore +24 -0
- whisper-speaker-diarization/README.md +8 -0
- whisper-speaker-diarization/index.html +12 -0
- whisper-speaker-diarization/package.json +30 -0
- whisper-speaker-diarization/postcss.config.js +6 -0
- whisper-speaker-diarization/src/App.jsx +218 -0
- whisper-speaker-diarization/src/components/LanguageSelector.jsx +134 -0
- whisper-speaker-diarization/src/components/MediaInput.jsx +194 -0
- whisper-speaker-diarization/src/components/Progress.jsx +15 -0
- whisper-speaker-diarization/src/components/Transcript.jsx +125 -0
- whisper-speaker-diarization/src/index.css +25 -0
- whisper-speaker-diarization/src/main.jsx +10 -0
- whisper-speaker-diarization/src/worker.js +124 -0
- whisper-speaker-diarization/tailwind.config.js +12 -0
- whisper-speaker-diarization/vite.config.js +7 -0
whisper-speaker-diarization/.eslintrc.cjs
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module.exports = {
|
2 |
+
root: true,
|
3 |
+
env: { browser: true, es2020: true },
|
4 |
+
extends: [
|
5 |
+
'eslint:recommended',
|
6 |
+
'plugin:react/recommended',
|
7 |
+
'plugin:react/jsx-runtime',
|
8 |
+
'plugin:react-hooks/recommended',
|
9 |
+
],
|
10 |
+
ignorePatterns: ['dist', '.eslintrc.cjs'],
|
11 |
+
parserOptions: { ecmaVersion: 'latest', sourceType: 'module' },
|
12 |
+
settings: { react: { version: '18.2' } },
|
13 |
+
plugins: ['react-refresh'],
|
14 |
+
rules: {
|
15 |
+
'react/jsx-no-target-blank': 'off',
|
16 |
+
'react-refresh/only-export-components': [
|
17 |
+
'warn',
|
18 |
+
{ allowConstantExport: true },
|
19 |
+
],
|
20 |
+
},
|
21 |
+
}
|
whisper-speaker-diarization/.gitignore
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Logs
|
2 |
+
logs
|
3 |
+
*.log
|
4 |
+
npm-debug.log*
|
5 |
+
yarn-debug.log*
|
6 |
+
yarn-error.log*
|
7 |
+
pnpm-debug.log*
|
8 |
+
lerna-debug.log*
|
9 |
+
|
10 |
+
node_modules
|
11 |
+
dist
|
12 |
+
dist-ssr
|
13 |
+
*.local
|
14 |
+
|
15 |
+
# Editor directories and files
|
16 |
+
.vscode/*
|
17 |
+
!.vscode/extensions.json
|
18 |
+
.idea
|
19 |
+
.DS_Store
|
20 |
+
*.suo
|
21 |
+
*.ntvs*
|
22 |
+
*.njsproj
|
23 |
+
*.sln
|
24 |
+
*.sw?
|
whisper-speaker-diarization/README.md
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# React + Vite
|
2 |
+
|
3 |
+
This template provides a minimal setup to get React working in Vite with HMR and some ESLint rules.
|
4 |
+
|
5 |
+
Currently, two official plugins are available:
|
6 |
+
|
7 |
+
- [@vitejs/plugin-react](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react/README.md) uses [Babel](https://babeljs.io/) for Fast Refresh
|
8 |
+
- [@vitejs/plugin-react-swc](https://github.com/vitejs/vite-plugin-react-swc) uses [SWC](https://swc.rs/) for Fast Refresh
|
whisper-speaker-diarization/index.html
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!doctype html>
|
2 |
+
<html lang="en">
|
3 |
+
<head>
|
4 |
+
<meta charset="UTF-8" />
|
5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
6 |
+
<title>Whisper Diarization</title>
|
7 |
+
</head>
|
8 |
+
<body>
|
9 |
+
<div id="root"></div>
|
10 |
+
<script type="module" src="/src/main.jsx"></script>
|
11 |
+
</body>
|
12 |
+
</html>
|
whisper-speaker-diarization/package.json
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"name": "whisper-speaker-diarization",
|
3 |
+
"private": true,
|
4 |
+
"version": "0.0.0",
|
5 |
+
"type": "module",
|
6 |
+
"scripts": {
|
7 |
+
"dev": "vite",
|
8 |
+
"build": "vite build",
|
9 |
+
"lint": "eslint . --ext js,jsx --report-unused-disable-directives --max-warnings 0",
|
10 |
+
"preview": "vite preview"
|
11 |
+
},
|
12 |
+
"dependencies": {
|
13 |
+
"@xenova/transformers": "github:xenova/transformers.js#v3",
|
14 |
+
"react": "^18.3.1",
|
15 |
+
"react-dom": "^18.3.1"
|
16 |
+
},
|
17 |
+
"devDependencies": {
|
18 |
+
"@types/react": "^18.3.3",
|
19 |
+
"@types/react-dom": "^18.3.0",
|
20 |
+
"@vitejs/plugin-react": "^4.3.1",
|
21 |
+
"autoprefixer": "^10.4.19",
|
22 |
+
"eslint": "^8.57.0",
|
23 |
+
"eslint-plugin-react": "^7.34.2",
|
24 |
+
"eslint-plugin-react-hooks": "^4.6.2",
|
25 |
+
"eslint-plugin-react-refresh": "^0.4.7",
|
26 |
+
"postcss": "^8.4.38",
|
27 |
+
"tailwindcss": "^3.4.4",
|
28 |
+
"vite": "^5.3.1"
|
29 |
+
}
|
30 |
+
}
|
whisper-speaker-diarization/postcss.config.js
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
export default {
|
2 |
+
plugins: {
|
3 |
+
tailwindcss: {},
|
4 |
+
autoprefixer: {},
|
5 |
+
},
|
6 |
+
}
|
whisper-speaker-diarization/src/App.jsx
ADDED
@@ -0,0 +1,218 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { useEffect, useState, useRef, useCallback } from 'react';
|
2 |
+
|
3 |
+
import Progress from './components/Progress';
|
4 |
+
import MediaInput from './components/MediaInput';
|
5 |
+
import Transcript from './components/Transcript';
|
6 |
+
import LanguageSelector from './components/LanguageSelector';
|
7 |
+
|
8 |
+
|
9 |
+
/**
 * Checks whether WebGPU can be used in the current browser.
 *
 * @returns {Promise<boolean>} `true` only when `navigator.gpu` exists and
 *   `requestAdapter()` resolves to a truthy adapter; `false` when the API is
 *   missing, no adapter is returned, or the request throws.
 */
async function hasWebGPU() {
    const gpu = navigator.gpu;
    if (gpu) {
        try {
            const adapter = await gpu.requestAdapter();
            return Boolean(adapter);
        } catch (e) {
            // Any failure while requesting an adapter counts as "not supported".
        }
    }
    return false;
}
|
20 |
+
|
21 |
+
function App() {
|
22 |
+
|
23 |
+
// Create a reference to the worker object.
|
24 |
+
const worker = useRef(null);
|
25 |
+
|
26 |
+
// Model loading and progress
|
27 |
+
const [status, setStatus] = useState(null);
|
28 |
+
const [loadingMessage, setLoadingMessage] = useState('');
|
29 |
+
const [progressItems, setProgressItems] = useState([]);
|
30 |
+
|
31 |
+
const mediaInputRef = useRef(null);
|
32 |
+
const [audio, setAudio] = useState(null);
|
33 |
+
const [language, setLanguage] = useState('en');
|
34 |
+
|
35 |
+
const [result, setResult] = useState(null);
|
36 |
+
const [time, setTime] = useState(null);
|
37 |
+
const [currentTime, setCurrentTime] = useState(0);
|
38 |
+
|
39 |
+
const [device, setDevice] = useState('webgpu'); // Try use WebGPU first
|
40 |
+
const [modelSize, setModelSize] = useState('gpu' in navigator ? 196 : 77); // WebGPU=196MB, WebAssembly=77MB
|
41 |
+
useEffect(() => {
|
42 |
+
hasWebGPU().then((b) => {
|
43 |
+
setModelSize(b ? 196 : 77);
|
44 |
+
setDevice(b ? 'webgpu' : 'wasm');
|
45 |
+
});
|
46 |
+
}, []);
|
47 |
+
|
48 |
+
// We use the `useEffect` hook to setup the worker as soon as the `App` component is mounted.
|
49 |
+
useEffect(() => {
|
50 |
+
if (!worker.current) {
|
51 |
+
// Create the worker if it does not yet exist.
|
52 |
+
worker.current = new Worker(new URL('./worker.js', import.meta.url), {
|
53 |
+
type: 'module'
|
54 |
+
});
|
55 |
+
}
|
56 |
+
|
57 |
+
// Create a callback function for messages from the worker thread.
|
58 |
+
const onMessageReceived = (e) => {
|
59 |
+
switch (e.data.status) {
|
60 |
+
case 'loading':
|
61 |
+
// Model file start load: add a new progress item to the list.
|
62 |
+
setStatus('loading');
|
63 |
+
setLoadingMessage(e.data.data);
|
64 |
+
break;
|
65 |
+
|
66 |
+
case 'initiate':
|
67 |
+
setProgressItems(prev => [...prev, e.data]);
|
68 |
+
break;
|
69 |
+
|
70 |
+
case 'progress':
|
71 |
+
// Model file progress: update one of the progress items.
|
72 |
+
setProgressItems(
|
73 |
+
prev => prev.map(item => {
|
74 |
+
if (item.file === e.data.file) {
|
75 |
+
return { ...item, ...e.data }
|
76 |
+
}
|
77 |
+
return item;
|
78 |
+
})
|
79 |
+
);
|
80 |
+
break;
|
81 |
+
|
82 |
+
case 'done':
|
83 |
+
// Model file loaded: remove the progress item from the list.
|
84 |
+
setProgressItems(
|
85 |
+
prev => prev.filter(item => item.file !== e.data.file)
|
86 |
+
);
|
87 |
+
break;
|
88 |
+
|
89 |
+
case 'loaded':
|
90 |
+
// Pipeline ready: the worker is ready to accept messages.
|
91 |
+
setStatus('ready');
|
92 |
+
break;
|
93 |
+
|
94 |
+
case 'complete':
|
95 |
+
setResult(e.data.result);
|
96 |
+
setTime(e.data.time);
|
97 |
+
setStatus('ready');
|
98 |
+
break;
|
99 |
+
}
|
100 |
+
};
|
101 |
+
|
102 |
+
// Attach the callback function as an event listener.
|
103 |
+
worker.current.addEventListener('message', onMessageReceived);
|
104 |
+
|
105 |
+
// Define a cleanup function for when the component is unmounted.
|
106 |
+
return () => {
|
107 |
+
worker.current.removeEventListener('message', onMessageReceived);
|
108 |
+
};
|
109 |
+
}, []);
|
110 |
+
|
111 |
+
const handleClick = useCallback(() => {
|
112 |
+
setResult(null);
|
113 |
+
setTime(null);
|
114 |
+
if (status === null) {
|
115 |
+
setStatus('loading');
|
116 |
+
worker.current.postMessage({ type: 'load', data: { device } });
|
117 |
+
} else {
|
118 |
+
setStatus('running');
|
119 |
+
worker.current.postMessage({
|
120 |
+
type: 'run', data: { audio, language }
|
121 |
+
});
|
122 |
+
}
|
123 |
+
}, [status, audio, language, device]);
|
124 |
+
|
125 |
+
return (
|
126 |
+
<div className="flex flex-col h-screen mx-auto text-gray-800 dark:text-gray-200 bg-white dark:bg-gray-900 max-w-[600px]">
|
127 |
+
|
128 |
+
{status === 'loading' && (
|
129 |
+
<div className="flex justify-center items-center fixed w-screen h-screen bg-black z-10 bg-opacity-[92%] top-0 left-0">
|
130 |
+
<div className="w-[500px]">
|
131 |
+
<p className="text-center mb-1 text-white text-md">{loadingMessage}</p>
|
132 |
+
{progressItems.map(({ file, progress, total }, i) => (
|
133 |
+
<Progress key={i} text={file} percentage={progress} total={total} />
|
134 |
+
))}
|
135 |
+
</div>
|
136 |
+
</div>
|
137 |
+
)}
|
138 |
+
<div className="my-auto">
|
139 |
+
<div className="flex flex-col items-center mb-2 text-center">
|
140 |
+
<h1 className="text-5xl font-bold mb-2">Whisper Diarization</h1>
|
141 |
+
<h2 className="text-xl font-semibold">In-browser automatic speech recognition w/ <br />word-level timestamps and speaker segmentation</h2>
|
142 |
+
</div>
|
143 |
+
|
144 |
+
<div className="w-full min-h-[220px] flex flex-col justify-center items-center">
|
145 |
+
{
|
146 |
+
!audio && (
|
147 |
+
<p className="mb-2">
|
148 |
+
You are about to download <a href="https://huggingface.co/onnx-community/whisper-base_timestamped" target="_blank" rel="noreferrer" className="font-medium underline">whisper-base</a> and <a href="https://huggingface.co/onnx-community/pyannote-segmentation-3.0" target="_blank" rel="noreferrer" className="font-medium underline">pyannote-segmentation-3.0</a>,
|
149 |
+
two powerful speech recognition models for generating word-level timestamps across 100 different languages and speaker segmentation, respectively.
|
150 |
+
Once loaded, the models ({modelSize}MB + 6MB) will be cached and reused when you revisit the page.<br />
|
151 |
+
<br />
|
152 |
+
Everything runs locally in your browser using <a href="https://huggingface.co/docs/transformers.js" target="_blank" rel="noreferrer" className="underline">🤗 Transformers.js</a> and ONNX Runtime Web,
|
153 |
+
meaning no API calls are made to a server for inference. You can even disconnect from the internet after the model has loaded!
|
154 |
+
</p>
|
155 |
+
)
|
156 |
+
}
|
157 |
+
|
158 |
+
<div className="flex flex-col w-full m-3 max-w-[520px]">
|
159 |
+
<span className="text-sm mb-0.5">Input audio/video</span>
|
160 |
+
<MediaInput
|
161 |
+
ref={mediaInputRef}
|
162 |
+
className="flex items-center border rounded-md cursor-pointer min-h-[100px] max-h-[500px] overflow-hidden"
|
163 |
+
onInputChange={(audio) => {
|
164 |
+
setResult(null);
|
165 |
+
setAudio(audio);
|
166 |
+
}}
|
167 |
+
onTimeUpdate={(time) => setCurrentTime(time)}
|
168 |
+
/>
|
169 |
+
</div>
|
170 |
+
|
171 |
+
<div className="relative w-full flex justify-center items-center">
|
172 |
+
<button
|
173 |
+
className="border px-4 py-2 rounded-lg bg-blue-400 text-white hover:bg-blue-500 disabled:bg-blue-100 disabled:cursor-not-allowed select-none"
|
174 |
+
onClick={handleClick}
|
175 |
+
disabled={status === 'running' || (status !== null && audio === null)}
|
176 |
+
>
|
177 |
+
{status === null ? 'Load model' :
|
178 |
+
status === 'running'
|
179 |
+
? 'Running...'
|
180 |
+
: 'Run model'
|
181 |
+
}
|
182 |
+
</button>
|
183 |
+
|
184 |
+
{status !== null &&
|
185 |
+
<div className='absolute right-0 bottom-0'>
|
186 |
+
<span className="text-xs">Language:</span>
|
187 |
+
<br />
|
188 |
+
<LanguageSelector className="border rounded-lg p-1 max-w-[100px]" language={language} setLanguage={setLanguage} />
|
189 |
+
</div>
|
190 |
+
}
|
191 |
+
</div>
|
192 |
+
|
193 |
+
{
|
194 |
+
result && time && (
|
195 |
+
<>
|
196 |
+
<div className="w-full mt-4 border rounded-md">
|
197 |
+
<Transcript
|
198 |
+
className="p-2 max-h-[200px] overflow-y-auto scrollbar-thin select-none"
|
199 |
+
transcript={result.transcript}
|
200 |
+
segments={result.segments}
|
201 |
+
currentTime={currentTime}
|
202 |
+
setCurrentTime={(time) => {
|
203 |
+
setCurrentTime(time);
|
204 |
+
mediaInputRef.current.setMediaTime(time);
|
205 |
+
}}
|
206 |
+
/>
|
207 |
+
</div>
|
208 |
+
<p className="text-sm text-gray-600 text-end p-1">Generation time: <span className="text-gray-800 font-semibold">{time.toFixed(2)}ms</span></p>
|
209 |
+
</>
|
210 |
+
)
|
211 |
+
}
|
212 |
+
</div>
|
213 |
+
</div>
|
214 |
+
</div >
|
215 |
+
)
|
216 |
+
}
|
217 |
+
|
218 |
+
export default App
|
whisper-speaker-diarization/src/components/LanguageSelector.jsx
ADDED
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
/**
 * Converts a string to title case: everything is lower-cased, then the first
 * character of each word is upper-cased.
 *
 * A "word" is a run of letters, combining marks, digits or underscores (in any
 * script). Everything between words (spaces, slashes, punctuation) is kept
 * exactly as it appears, so no characters are lost.
 *
 * @param {string} str - Text to convert.
 * @returns {string} The title-cased text.
 */
function titleCase(str) {
    return str
        .toLowerCase()
        .replace(/[\p{L}\p{M}\p{N}_]+/gu, (word) => {
            // Destructure by code point so an astral first letter is not split.
            const [first] = word;
            return first.toUpperCase() + word.slice(first.length);
        });
}
|
10 |
+
|
11 |
+
// List of supported languages:
|
12 |
+
// https://help.openai.com/en/articles/7031512-whisper-api-faq
|
13 |
+
// https://github.com/openai/whisper/blob/248b6cb124225dd263bb9bd32d060b6517e067f8/whisper/tokenizer.py#L79
|
14 |
+
const LANGUAGES = {
|
15 |
+
en: "english",
|
16 |
+
zh: "chinese",
|
17 |
+
de: "german",
|
18 |
+
es: "spanish/castilian",
|
19 |
+
ru: "russian",
|
20 |
+
ko: "korean",
|
21 |
+
fr: "french",
|
22 |
+
ja: "japanese",
|
23 |
+
pt: "portuguese",
|
24 |
+
tr: "turkish",
|
25 |
+
pl: "polish",
|
26 |
+
ca: "catalan/valencian",
|
27 |
+
nl: "dutch/flemish",
|
28 |
+
ar: "arabic",
|
29 |
+
sv: "swedish",
|
30 |
+
it: "italian",
|
31 |
+
id: "indonesian",
|
32 |
+
hi: "hindi",
|
33 |
+
fi: "finnish",
|
34 |
+
vi: "vietnamese",
|
35 |
+
he: "hebrew",
|
36 |
+
uk: "ukrainian",
|
37 |
+
el: "greek",
|
38 |
+
ms: "malay",
|
39 |
+
cs: "czech",
|
40 |
+
ro: "romanian/moldavian/moldovan",
|
41 |
+
da: "danish",
|
42 |
+
hu: "hungarian",
|
43 |
+
ta: "tamil",
|
44 |
+
no: "norwegian",
|
45 |
+
th: "thai",
|
46 |
+
ur: "urdu",
|
47 |
+
hr: "croatian",
|
48 |
+
bg: "bulgarian",
|
49 |
+
lt: "lithuanian",
|
50 |
+
la: "latin",
|
51 |
+
mi: "maori",
|
52 |
+
ml: "malayalam",
|
53 |
+
cy: "welsh",
|
54 |
+
sk: "slovak",
|
55 |
+
te: "telugu",
|
56 |
+
fa: "persian",
|
57 |
+
lv: "latvian",
|
58 |
+
bn: "bengali",
|
59 |
+
sr: "serbian",
|
60 |
+
az: "azerbaijani",
|
61 |
+
sl: "slovenian",
|
62 |
+
kn: "kannada",
|
63 |
+
et: "estonian",
|
64 |
+
mk: "macedonian",
|
65 |
+
br: "breton",
|
66 |
+
eu: "basque",
|
67 |
+
is: "icelandic",
|
68 |
+
hy: "armenian",
|
69 |
+
ne: "nepali",
|
70 |
+
mn: "mongolian",
|
71 |
+
bs: "bosnian",
|
72 |
+
kk: "kazakh",
|
73 |
+
sq: "albanian",
|
74 |
+
sw: "swahili",
|
75 |
+
gl: "galician",
|
76 |
+
mr: "marathi",
|
77 |
+
pa: "punjabi/panjabi",
|
78 |
+
si: "sinhala/sinhalese",
|
79 |
+
km: "khmer",
|
80 |
+
sn: "shona",
|
81 |
+
yo: "yoruba",
|
82 |
+
so: "somali",
|
83 |
+
af: "afrikaans",
|
84 |
+
oc: "occitan",
|
85 |
+
ka: "georgian",
|
86 |
+
be: "belarusian",
|
87 |
+
tg: "tajik",
|
88 |
+
sd: "sindhi",
|
89 |
+
gu: "gujarati",
|
90 |
+
am: "amharic",
|
91 |
+
yi: "yiddish",
|
92 |
+
lo: "lao",
|
93 |
+
uz: "uzbek",
|
94 |
+
fo: "faroese",
|
95 |
+
ht: "haitian creole/haitian",
|
96 |
+
ps: "pashto/pushto",
|
97 |
+
tk: "turkmen",
|
98 |
+
nn: "nynorsk",
|
99 |
+
mt: "maltese",
|
100 |
+
sa: "sanskrit",
|
101 |
+
lb: "luxembourgish/letzeburgesch",
|
102 |
+
my: "myanmar/burmese",
|
103 |
+
bo: "tibetan",
|
104 |
+
tl: "tagalog",
|
105 |
+
mg: "malagasy",
|
106 |
+
as: "assamese",
|
107 |
+
tt: "tatar",
|
108 |
+
haw: "hawaiian",
|
109 |
+
ln: "lingala",
|
110 |
+
ha: "hausa",
|
111 |
+
ba: "bashkir",
|
112 |
+
jw: "javanese",
|
113 |
+
su: "sundanese",
|
114 |
+
};
|
115 |
+
function LanguageSelector({ language, setLanguage, ...props }) {
|
116 |
+
const handleLanguageChange = (event) => {
|
117 |
+
setLanguage(event.target.value);
|
118 |
+
};
|
119 |
+
|
120 |
+
const names = Object.values(LANGUAGES).map(titleCase);
|
121 |
+
|
122 |
+
return (
|
123 |
+
<select
|
124 |
+
{...props}
|
125 |
+
value={language} onChange={handleLanguageChange}>
|
126 |
+
{Object.keys(LANGUAGES).map((key, i) => (
|
127 |
+
<option key={key} value={key}>
|
128 |
+
{names[i]}
|
129 |
+
</option>
|
130 |
+
))}
|
131 |
+
</select>
|
132 |
+
);
|
133 |
+
}
|
134 |
+
export default LanguageSelector
|
whisper-speaker-diarization/src/components/MediaInput.jsx
ADDED
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { useState, forwardRef, useRef, useImperativeHandle, useEffect, useCallback } from 'react';
|
2 |
+
|
3 |
+
const EXAMPLE_URL = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/hopper.webm';
|
4 |
+
|
5 |
+
const MediaInput = forwardRef(({ onInputChange, onTimeUpdate, ...props }, ref) => {
|
6 |
+
// UI states
|
7 |
+
const [dragging, setDragging] = useState(false);
|
8 |
+
const fileInputRef = useRef(null);
|
9 |
+
|
10 |
+
// Create a reference to the audio and video elements
|
11 |
+
const audioElement = useRef(null);
|
12 |
+
const videoElement = useRef(null);
|
13 |
+
|
14 |
+
const currentTimeRef = useRef(0);
|
15 |
+
useImperativeHandle(ref, () => ({
|
16 |
+
setMediaTime(time) {
|
17 |
+
if (audioElement.current?.src) {
|
18 |
+
audioElement.current.currentTime = time;
|
19 |
+
} else if (videoElement.current?.src) {
|
20 |
+
videoElement.current.currentTime = time;
|
21 |
+
}
|
22 |
+
currentTimeRef.current = time;
|
23 |
+
}
|
24 |
+
}));
|
25 |
+
|
26 |
+
const onBufferLoad = (arrayBuffer, type) => {
|
27 |
+
const blob = new Blob([arrayBuffer.slice(0)], { type: type });
|
28 |
+
const url = URL.createObjectURL(blob);
|
29 |
+
processFile(arrayBuffer);
|
30 |
+
|
31 |
+
// Create a URL for the Blob
|
32 |
+
if (type.startsWith('audio/')) {
|
33 |
+
// Dispose the previous source
|
34 |
+
videoElement.current.pause();
|
35 |
+
videoElement.current.removeAttribute('src');
|
36 |
+
videoElement.current.load();
|
37 |
+
|
38 |
+
audioElement.current.src = url;
|
39 |
+
} else if (type.startsWith('video/')) {
|
40 |
+
// Dispose the previous source
|
41 |
+
audioElement.current.pause();
|
42 |
+
audioElement.current.removeAttribute('src');
|
43 |
+
audioElement.current.load();
|
44 |
+
|
45 |
+
videoElement.current.src = url;
|
46 |
+
} else {
|
47 |
+
alert(`Unsupported file type: ${type}`);
|
48 |
+
}
|
49 |
+
}
|
50 |
+
|
51 |
+
const readFile = (file) => {
|
52 |
+
if (!file) return;
|
53 |
+
|
54 |
+
// file.type
|
55 |
+
const reader = new FileReader();
|
56 |
+
reader.onload = (e) => {
|
57 |
+
onBufferLoad(e.target.result, file.type);
|
58 |
+
}
|
59 |
+
reader.readAsArrayBuffer(file);
|
60 |
+
}
|
61 |
+
|
62 |
+
const handleInputChange = (event) => {
|
63 |
+
readFile(event.target.files[0]);
|
64 |
+
};
|
65 |
+
|
66 |
+
const handleDragOver = (event) => {
|
67 |
+
event.preventDefault();
|
68 |
+
};
|
69 |
+
|
70 |
+
const handleDrop = (event) => {
|
71 |
+
event.preventDefault();
|
72 |
+
setDragging(false);
|
73 |
+
readFile(event.dataTransfer.files[0]);
|
74 |
+
};
|
75 |
+
|
76 |
+
const handleClick = (e) => {
|
77 |
+
if (e.target.tagName === 'VIDEO' || e.target.tagName === 'AUDIO') {
|
78 |
+
e.preventDefault();
|
79 |
+
fileInputRef.current.click();
|
80 |
+
} else if (e.target.tagName === 'INPUT') {
|
81 |
+
e.stopPropagation();
|
82 |
+
} else {
|
83 |
+
fileInputRef.current.click();
|
84 |
+
e.stopPropagation();
|
85 |
+
}
|
86 |
+
};
|
87 |
+
|
88 |
+
const processFile = async (buffer) => {
|
89 |
+
const audioContext = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: 16_000 });
|
90 |
+
|
91 |
+
try {
|
92 |
+
const audioBuffer = await audioContext.decodeAudioData(buffer);
|
93 |
+
let audio;
|
94 |
+
if (audioBuffer.numberOfChannels === 2) {
|
95 |
+
// Merge channels
|
96 |
+
const SCALING_FACTOR = Math.sqrt(2);
|
97 |
+
const left = audioBuffer.getChannelData(0);
|
98 |
+
const right = audioBuffer.getChannelData(1);
|
99 |
+
audio = new Float32Array(left.length);
|
100 |
+
for (let i = 0; i < audioBuffer.length; ++i) {
|
101 |
+
audio[i] = SCALING_FACTOR * (left[i] + right[i]) / 2;
|
102 |
+
}
|
103 |
+
} else {
|
104 |
+
audio = audioBuffer.getChannelData(0);
|
105 |
+
}
|
106 |
+
onInputChange(audio);
|
107 |
+
|
108 |
+
} catch (e) {
|
109 |
+
alert(e);
|
110 |
+
}
|
111 |
+
};
|
112 |
+
|
113 |
+
const requestRef = useRef();
|
114 |
+
|
115 |
+
const updateTime = useCallback(() => {
|
116 |
+
let elem;
|
117 |
+
if (audioElement.current?.src) {
|
118 |
+
elem = audioElement.current;
|
119 |
+
|
120 |
+
} else if (videoElement.current?.src) {
|
121 |
+
elem = videoElement.current;
|
122 |
+
}
|
123 |
+
|
124 |
+
if (elem && currentTimeRef.current !== elem.currentTime) {
|
125 |
+
currentTimeRef.current = elem.currentTime;
|
126 |
+
onTimeUpdate(elem.currentTime);
|
127 |
+
}
|
128 |
+
|
129 |
+
// Request the next frame
|
130 |
+
requestRef.current = requestAnimationFrame(updateTime);
|
131 |
+
}, [onTimeUpdate]);
|
132 |
+
|
133 |
+
useEffect(() => {
|
134 |
+
// Start the animation
|
135 |
+
requestRef.current = requestAnimationFrame(updateTime);
|
136 |
+
|
137 |
+
return () => {
|
138 |
+
// Cleanup on component unmount
|
139 |
+
cancelAnimationFrame(requestRef.current);
|
140 |
+
};
|
141 |
+
}, [updateTime]);
|
142 |
+
return (
|
143 |
+
<div
|
144 |
+
{...props}
|
145 |
+
onClick={handleClick}
|
146 |
+
onDragOver={handleDragOver}
|
147 |
+
onDrop={handleDrop}
|
148 |
+
onDragEnter={(e) => setDragging(true)}
|
149 |
+
onDragLeave={(e) => setDragging(false)}
|
150 |
+
>
|
151 |
+
<input
|
152 |
+
type="file"
|
153 |
+
accept="audio/*,video/*"
|
154 |
+
onChange={handleInputChange}
|
155 |
+
ref={fileInputRef}
|
156 |
+
className="hidden"
|
157 |
+
/>
|
158 |
+
{
|
159 |
+
<audio
|
160 |
+
ref={audioElement}
|
161 |
+
controls
|
162 |
+
style={{ display: audioElement.current?.src ? 'block' : 'none' }}
|
163 |
+
className='w-full max-h-full'
|
164 |
+
/>
|
165 |
+
}
|
166 |
+
{
|
167 |
+
<video
|
168 |
+
ref={videoElement}
|
169 |
+
controls
|
170 |
+
style={{ display: videoElement.current?.src ? 'block' : 'none' }}
|
171 |
+
className='w-full max-h-full'
|
172 |
+
/>
|
173 |
+
}
|
174 |
+
{
|
175 |
+
!audioElement.current?.src && !videoElement.current?.src && (
|
176 |
+
<div className="w-full flex flex-col items-center justify-center border-2 border-dashed border-gray-300 rounded-md h-[250px]"
|
177 |
+
style={{ borderColor: dragging ? 'blue' : 'lightgray' }}
|
178 |
+
>
|
179 |
+
<span className="text-gray-600 text-center"><u>Drag & drop</u> or <u>click</u><br />to select media</span>
|
180 |
+
<span className="text-gray-500 text-sm hover:text-gray-800 mt-2" onClick={async (e) => {
|
181 |
+
e.stopPropagation();
|
182 |
+
const buffer = await fetch(EXAMPLE_URL).then((r) => r.arrayBuffer());
|
183 |
+
videoElement.current.src = URL.createObjectURL(new Blob([buffer], { type: 'video/mp4' }));
|
184 |
+
onBufferLoad(buffer, 'video/mp4');
|
185 |
+
}}>(or <u>try an example</u>)</span>
|
186 |
+
</div>
|
187 |
+
)
|
188 |
+
}
|
189 |
+
</div>
|
190 |
+
);
|
191 |
+
});
|
192 |
+
MediaInput.displayName = 'MediaInput';
|
193 |
+
|
194 |
+
export default MediaInput;
|
whisper-speaker-diarization/src/components/Progress.jsx
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/**
 * Formats a byte count as a short human-readable string using 1024-based
 * units, with at most two decimals and no trailing zeros, e.g. `1536` ->
 * `"1.5kB"`.
 *
 * The unit is clamped to the available range, so values below one byte are
 * shown in `B` and values of 1024^5 or more are shown in `TB`. Anything that
 * is not a positive finite number (`0`, `null`, `undefined`, `NaN`, negative
 * values) is formatted as `"0B"`.
 *
 * @param {number} size - Size in bytes.
 * @returns {string} The formatted size.
 */
function formatBytes(size) {
    const UNITS = ['B', 'kB', 'MB', 'GB', 'TB'];

    const bytes = Number(size);
    if (!Number.isFinite(bytes) || bytes <= 0) {
        return `0${UNITS[0]}`;
    }

    // Clamp so the index always refers to an existing unit.
    const rawIndex = Math.floor(Math.log(bytes) / Math.log(1024));
    const unitIndex = Math.min(Math.max(rawIndex, 0), UNITS.length - 1);

    // Number(...) strips the trailing zeros that toFixed adds ("1.50" -> 1.5).
    const value = Number((bytes / Math.pow(1024, unitIndex)).toFixed(2));
    return `${value}${UNITS[unitIndex]}`;
}
|
5 |
+
|
6 |
+
export default function Progress({ text, percentage, total }) {
|
7 |
+
percentage ??= 0;
|
8 |
+
return (
|
9 |
+
<div className="w-full bg-gray-100 dark:bg-gray-700 text-left rounded-lg overflow-hidden mb-0.5">
|
10 |
+
<div className="bg-blue-400 whitespace-nowrap px-1 text-sm" style={{ width: `${percentage}%` }}>
|
11 |
+
{text} ({percentage.toFixed(2)}%{isNaN(total) ? '' : ` of ${formatBytes(total)}`})
|
12 |
+
</div>
|
13 |
+
</div>
|
14 |
+
);
|
15 |
+
}
|
whisper-speaker-diarization/src/components/Transcript.jsx
ADDED
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { useEffect, useMemo, useRef } from "react";
|
2 |
+
|
3 |
+
const Chunk = ({ chunk, currentTime, onClick, ...props }) => {
|
4 |
+
const spanRef = useRef(null);
|
5 |
+
const { text, timestamp } = chunk;
|
6 |
+
const [start, end] = timestamp;
|
7 |
+
|
8 |
+
const bolded = start <= currentTime && currentTime < end;
|
9 |
+
|
10 |
+
useEffect(() => {
|
11 |
+
if (spanRef.current && bolded) { // scroll into view
|
12 |
+
spanRef.current.scrollIntoView({
|
13 |
+
behavior: 'smooth',
|
14 |
+
block: 'center',
|
15 |
+
inline: 'center',
|
16 |
+
});
|
17 |
+
}
|
18 |
+
}, [bolded]);
|
19 |
+
|
20 |
+
return (
|
21 |
+
<span {...props}>
|
22 |
+
{text.startsWith(' ') ? " " : ""}
|
23 |
+
<span
|
24 |
+
ref={spanRef}
|
25 |
+
onClick={onClick}
|
26 |
+
className="text-md text-gray-600 cursor-pointer hover:text-red-600"
|
27 |
+
title={timestamp.map(x => x.toFixed(2)).join(' → ')}
|
28 |
+
style={{
|
29 |
+
textDecoration: bolded ? 'underline' : 'none',
|
30 |
+
textShadow: bolded ? '0 0 1px #000' : 'none',
|
31 |
+
}}
|
32 |
+
>{text.trim()}</span>
|
33 |
+
</span>
|
34 |
+
)
|
35 |
+
}
|
36 |
+
|
37 |
+
const Transcript = ({ transcript, segments, currentTime, setCurrentTime, ...props }) => {
|
38 |
+
const jsonTranscript = useMemo(() => {
|
39 |
+
return JSON.stringify({
|
40 |
+
...transcript,
|
41 |
+
segments,
|
42 |
+
}, null, 2)
|
43 |
+
// post-process the JSON to make it more readable
|
44 |
+
.replace(/( {4}"timestamp": )\[\s+(\S+)\s+(\S+)\s+\]/gm, "$1[$2 $3]");
|
45 |
+
}, [transcript, segments]);
|
46 |
+
|
47 |
+
// Post-process the transcript to highlight speaker changes
|
48 |
+
const postProcessedTranscript = useMemo(() => {
|
49 |
+
let prev = 0;
|
50 |
+
const words = transcript.chunks;
|
51 |
+
|
52 |
+
const result = [];
|
53 |
+
for (const segment of segments) {
|
54 |
+
const { label, end } = segment;
|
55 |
+
if (label === 'NO_SPEAKER') continue;
|
56 |
+
|
57 |
+
// Collect all words within this segment
|
58 |
+
const segmentWords = [];
|
59 |
+
for (let i = prev; i < words.length; ++i) {
|
60 |
+
const word = words[i];
|
61 |
+
if (word.timestamp[1] <= end) {
|
62 |
+
segmentWords.push(word);
|
63 |
+
} else {
|
64 |
+
prev = i;
|
65 |
+
break;
|
66 |
+
}
|
67 |
+
}
|
68 |
+
if (segmentWords.length > 0) {
|
69 |
+
result.push({
|
70 |
+
...segment,
|
71 |
+
chunks: segmentWords,
|
72 |
+
})
|
73 |
+
}
|
74 |
+
}
|
75 |
+
return result;
|
76 |
+
}, [transcript, segments]);
|
77 |
+
|
78 |
+
const downloadTranscript = () => {
|
79 |
+
const blob = new Blob([jsonTranscript], { type: 'application/json' });
|
80 |
+
const url = URL.createObjectURL(blob);
|
81 |
+
const a = document.createElement('a');
|
82 |
+
a.href = url;
|
83 |
+
a.download = 'transcript.json';
|
84 |
+
a.click();
|
85 |
+
URL.revokeObjectURL(url);
|
86 |
+
}
|
87 |
+
|
88 |
+
return (<>
|
89 |
+
<div {...props}>
|
90 |
+
{
|
91 |
+
postProcessedTranscript.map(({ label, start, end, chunks }, i) => (
|
92 |
+
<div className="border-t py-2" key={i}>
|
93 |
+
<div className="flex justify-between">
|
94 |
+
<label className="text-xs font-medium">{label}</label>
|
95 |
+
<label className="text-xs">{start.toFixed(2)} → {end.toFixed(2)}</label>
|
96 |
+
</div>
|
97 |
+
<div>
|
98 |
+
{chunks.map((chunk, j) =>
|
99 |
+
<Chunk
|
100 |
+
key={j}
|
101 |
+
chunk={chunk}
|
102 |
+
currentTime={currentTime}
|
103 |
+
onClick={() => setCurrentTime(chunk.timestamp[0])} // Set to start of chunk
|
104 |
+
/>
|
105 |
+
)}
|
106 |
+
</div>
|
107 |
+
</div>
|
108 |
+
))
|
109 |
+
}
|
110 |
+
</div>
|
111 |
+
|
112 |
+
<div className="flex justify-center border-t text-sm text-gray-600 max-h-[150px] overflow-y-auto p-2 scrollbar-thin">
|
113 |
+
<button
|
114 |
+
className="flex items-center border px-2 py-1 rounded-lg bg-green-400 text-white hover:bg-green-500"
|
115 |
+
onClick={downloadTranscript}
|
116 |
+
>
|
117 |
+
<svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" strokeWidth={1.5} stroke="currentColor" className="size-6 mr-1">
|
118 |
+
<path strokeLinecap="round" strokeLinejoin="round" d="M3 16.5v2.25A2.25 2.25 0 0 0 5.25 21h13.5A2.25 2.25 0 0 0 21 18.75V16.5M16.5 12 12 16.5m0 0L7.5 12m4.5 4.5V3" />
|
119 |
+
</svg>
|
120 |
+
Download transcript
|
121 |
+
</button>
|
122 |
+
</div>
|
123 |
+
</>)
|
124 |
+
};
|
125 |
+
export default Transcript;
|
whisper-speaker-diarization/src/index.css
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
@tailwind base;
|
2 |
+
@tailwind components;
|
3 |
+
@tailwind utilities;
|
4 |
+
|
5 |
+
@layer utilities {
|
6 |
+
.scrollbar-thin::-webkit-scrollbar {
|
7 |
+
@apply w-2;
|
8 |
+
}
|
9 |
+
|
10 |
+
.scrollbar-thin::-webkit-scrollbar-track {
|
11 |
+
@apply rounded-full bg-gray-100 dark:bg-gray-700;
|
12 |
+
}
|
13 |
+
|
14 |
+
.scrollbar-thin::-webkit-scrollbar-thumb {
|
15 |
+
@apply rounded-full bg-gray-300 dark:bg-gray-600;
|
16 |
+
}
|
17 |
+
|
18 |
+
.scrollbar-thin::-webkit-scrollbar-thumb:hover {
|
19 |
+
@apply bg-gray-500;
|
20 |
+
}
|
21 |
+
}
|
22 |
+
|
23 |
+
html {
|
24 |
+
@apply scrollbar-thin;
|
25 |
+
}
|
whisper-speaker-diarization/src/main.jsx
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import React from 'react'
|
2 |
+
import ReactDOM from 'react-dom/client'
|
3 |
+
import App from './App.jsx'
|
4 |
+
import './index.css'
|
5 |
+
|
6 |
+
ReactDOM.createRoot(document.getElementById('root')).render(
|
7 |
+
<React.StrictMode>
|
8 |
+
<App />
|
9 |
+
</React.StrictMode>,
|
10 |
+
)
|
whisper-speaker-diarization/src/worker.js
ADDED
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import { pipeline, AutoProcessor, AutoModelForAudioFrameClassification } from '@xenova/transformers';
|
3 |
+
|
4 |
+
// Options spread into the speech-recognition pipeline call, keyed by the
// device name requested by the main thread.
const PER_DEVICE_CONFIG = {
    // WebGPU: a separate dtype is given for each of the two named sub-models.
    webgpu: {
        device: 'webgpu',
        dtype: {
            encoder_model: 'fp32',
            decoder_model_merged: 'q4',
        },
    },
    // WASM: one dtype for the whole model.
    wasm: {
        device: 'wasm',
        dtype: 'q8',
    },
};
|
17 |
+
|
18 |
+
/**
 * Lazily creates and caches the three objects this worker needs: the
 * speech-recognition pipeline, the segmentation processor and the
 * segmentation model. Each one is requested at most once while its load
 * is pending or has succeeded; every caller shares the same promise.
 *
 * A load that fails is NOT kept: its cached promise is cleared so that the
 * next `getInstance()` call starts a fresh attempt instead of replaying the
 * old rejection forever.
 */
class PipelineSingeton {
    static asr_model_id = 'onnx-community/whisper-base_timestamped';
    static asr_instance = null;

    static segmentation_model_id = 'onnx-community/pyannote-segmentation-3.0';
    static segmentation_instance = null;
    static segmentation_processor = null;

    /**
     * Returns the promise stored in the static field `field`, creating it
     * with `factory` when the field is still empty.
     *
     * @param {string} field - Name of the static field used as the cache slot.
     * @param {() => any} factory - Starts the load; called only when the slot is empty.
     * @returns {Promise<any>} The cached (possibly still pending) promise.
     */
    static #loadOnce(field, factory) {
        if (this[field] == null) {
            const pending = Promise.resolve(factory());
            this[field] = pending;

            // Evict a failed load so a later call can retry. The identity check
            // avoids clearing a newer attempt that replaced this one.
            pending.catch(() => {
                if (this[field] === pending) {
                    this[field] = null;
                }
            });
        }
        return this[field];
    }

    /**
     * Resolves to `[asr pipeline, segmentation processor, segmentation model]`.
     *
     * `progress_callback` and `device` are only used by the call that actually
     * starts a load; once an instance is cached they are ignored for it.
     *
     * @param {Function|null} [progress_callback=null] - Passed through to each loader.
     * @param {string} [device='webgpu'] - Key into `PER_DEVICE_CONFIG` for the ASR pipeline.
     * @returns {Promise<Array>} Rejects if any of the three loads fails.
     */
    static async getInstance(progress_callback = null, device = 'webgpu') {
        const asr = this.#loadOnce('asr_instance', () =>
            pipeline('automatic-speech-recognition', this.asr_model_id, {
                ...PER_DEVICE_CONFIG[device],
                progress_callback,
            }));

        const processor = this.#loadOnce('segmentation_processor', () =>
            AutoProcessor.from_pretrained(this.segmentation_model_id, {
                progress_callback,
            }));

        const model = this.#loadOnce('segmentation_instance', () =>
            AutoModelForAudioFrameClassification.from_pretrained(this.segmentation_model_id, {
                // NOTE: WebGPU is not currently supported for this model
                // See https://github.com/microsoft/onnxruntime/issues/21386
                device: 'wasm',
                dtype: 'fp32',
                progress_callback,
            }));

        return Promise.all([asr, processor, model]);
    }
}
|
49 |
+
|
50 |
+
/**
 * Handles a `load` request: loads the models for `device` and reports
 * progress to the main thread via `self.postMessage`.
 *
 * Messages posted, in order: a `loading` status, every progress event
 * emitted while loading, (WebGPU only) a second `loading` status before a
 * warm-up transcription, and finally `{ status: 'loaded' }`.
 *
 * @param {{ device: string }} options - Device requested by the main thread.
 */
async function load({ device }) {
    const reportLoading = (data) => self.postMessage({ status: 'loading', data });

    reportLoading(`Loading models (${device})...`);

    // Progress events from the loaders are relayed unchanged so the main
    // thread can track model loading.
    const relayProgress = (event) => {
        self.postMessage(event);
    };

    // Only the transcriber is needed here; the instances stay cached for later use.
    const [transcriber] = await PipelineSingeton.getInstance(relayProgress, device);

    if (device === 'webgpu') {
        reportLoading('Compiling shaders and warming up model...');

        // Warm-up run on 16,000 zero-valued samples.
        const warmupAudio = new Float32Array(16_000);
        await transcriber(warmupAudio, { language: 'en' });
    }

    self.postMessage({ status: 'loaded' });
}
|
76 |
+
|
77 |
+
/**
 * Runs speaker segmentation on `audio`.
 *
 * The audio is passed through `processor`, the result through `model`, and
 * the model's `logits` are post-processed (together with the audio length)
 * into segments. Each segment then gets a `label` looked up from
 * `model.config.id2label` by its `id`.
 *
 * @param {Function} processor - Callable processor exposing `post_process_speaker_diarization`.
 * @param {Function} model - Callable model exposing `config.id2label`.
 * @param {{ length: number }} audio - Audio samples.
 * @returns {Promise<Array>} The first entry of the post-processing result, with labels attached in place.
 */
async function segment(processor, model, audio) {
    const features = await processor(audio);
    const output = await model(features);

    const diarization = processor.post_process_speaker_diarization(output.logits, audio.length);
    const segments = diarization[0];

    // Attach labels
    segments.forEach((entry) => {
        entry.label = model.config.id2label[entry.id];
    });

    return segments;
}
|
89 |
+
|
90 |
+
/**
 * Handles a `run` request: transcribes `audio` with word-level timestamps
 * and segments it by speaker, then posts a single `complete` message with
 * both results and the elapsed time as measured by `performance.now()`.
 *
 * @param {{ audio: any, language: string }} request - Audio samples and the language passed to the transcriber.
 */
async function run({ audio, language }) {
    const [transcriber, segmentation_processor, segmentation_model] = await PipelineSingeton.getInstance();

    const startedAt = performance.now();

    // Both tasks are started before either is awaited, so they run in parallel.
    const transcription = transcriber(audio, {
        language,
        return_timestamps: 'word',
        chunk_length_s: 30,
    });
    const diarization = segment(segmentation_processor, segmentation_model, audio);

    const [transcript, segments] = await Promise.all([transcription, diarization]);
    console.table(segments, ['start', 'end', 'id', 'label', 'confidence']);

    const elapsed = performance.now() - startedAt;

    self.postMessage({
        status: 'complete',
        result: { transcript, segments },
        time: elapsed,
    });
}
|
110 |
+
|
111 |
+
// Route requests from the main thread: each message carries a `type`
// ('load' or 'run') and a `data` payload handed to the matching handler.
// Any other `type` is ignored.
self.addEventListener('message', async (event) => {
    const { type, data } = event.data;

    if (type === 'load') {
        load(data);
    } else if (type === 'run') {
        run(data);
    }
});
|
whisper-speaker-diarization/tailwind.config.js
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/** @type {import('tailwindcss').Config} */
const config = {
    // Files listed under `content`: the HTML entry point and all JS/TS/JSX/TSX sources under src/.
    content: [
        "./index.html",
        "./src/**/*.{js,ts,jsx,tsx}",
    ],
    // No theme extensions and no plugins are configured.
    theme: { extend: {} },
    plugins: [],
};

export default config;
|
12 |
+
|
whisper-speaker-diarization/vite.config.js
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { defineConfig } from 'vite'
|
2 |
+
import react from '@vitejs/plugin-react'
|
3 |
+
|
4 |
+
// https://vitejs.dev/config/
// The only configured plugin is the React plugin.
const plugins = [react()];

export default defineConfig({ plugins });
|