(py310) chin@192 driverless_car % python tests/test_whisper4.py
/Users/chin/anaconda3/envs/py310/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: clean_up_tokenization_spaces was not set. It will be set to True by default. This behavior will be depracted in transformers v4.45, and will be then set to False by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884
warnings.warn(
Using device: cpu
开始录音...
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
识别的中文文本: 该家屋是一个屋下的一个屋下屋。
识别的中文文本: 该建筑是一种一种的一个种。
识别的中文文本: 该建筑是一个建筑的一个建筑。
识别的中文文本: 该建筑是一种一种的变化。
识别的中文文本: 该家屋是一个位于美国的一个市镇。
识别的中文文本: 该建筑是一种一种的草为草属的草属。
识别的中文文本: 该建筑是一种一种的一个种。
识别的中文文本: 该建筑是一种一种的一个种。

代码

import sounddevice as sd
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import numpy as np
import queue
import threading
import time

录音参数

sample_rate = 16000
duration = 2 # 录音持续时间（秒）
block_size = 1024 * 2 # 每次处理 5 秒的音频数据

初始化音频队列

q = queue.Queue(maxsize=int(sample_rate / block_size * (9 + duration)))

定义回调函数来处理实时音频输入

def callback(indata, frames, time, status):
if status:
print(status)
try:
q.put(indata.copy(), block=False)
except queue.Full:
print("Queue is full, dropping frame.")

model_name = 'xmzhu/whisper-tiny-zh'

初始化 Whisper 模型和处理器

processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

检查是否有可用的 GPU

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Using device: {device}")

将模型移到适当的设备上

model.to(device)

用于处理音频的线程函数

def process_audio(stop_event):
while not stop_event.is_set():
# 收集足够长的音频数据
collected_data = []
try:
while len(collected_data) < int(sample_rate / block_size * duration):
collected_data.append(q.get(block=True, timeout=1))
except queue.Empty:
continue

    # 将收集的音频数据拼接成一个完整的音频片段
    full_audio_data = np.concatenate(collected_data, axis=0)
    
    # 将音频数据转换为 PyTorch 张量
    audio_input = torch.from_numpy(full_audio_data).squeeze().float()

    # 如果音频数据是在 GPU 上处理的，确保在传给处理器前将其移到 CPU
    if device.type == 'cuda':
        audio_input = audio_input.cpu()

    # 使用处理器处理音频数据
    input_features = processor(audio_input, sampling_rate=sample_rate, return_tensors="pt").input_features

    # 如果模型在 GPU 上，则将输入特征也移到 GPU
    if device.type == 'cuda':
        input_features = input_features.to(device)

    # 使用模型进行语音识别
    predicted_ids = model.generate(input_features)

    # 输出识别结果
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    print(f"识别的中文文本: {transcription}")

创建停止事件用于终止线程

stop_event = threading.Event()

创建并启动音频处理线程

audio_thread = threading.Thread(target=process_audio, args=(stop_event,), daemon=True)
audio_thread.start()

启动实时录音

try:
with sd.InputStream(
samplerate=sample_rate,
channels=1,
dtype=np.float32,
blocksize=block_size, # 每次读取的数据块大小
callback=callback
):
print("开始录音...")
# 让主线程保持活跃状态
while True:
time.sleep(1)
except KeyboardInterrupt:
print("停止录音...")
finally:
# 设置停止事件以终止处理线程
stop_event.set()
audio_thread.join()
print("线程已停止.")

xmzhu
/

whisper-tiny-zh

在没有任何语音输入的情况它不断识别出中文

录音参数

初始化音频队列

定义回调函数来处理实时音频输入

初始化 Whisper 模型和处理器

检查是否有可用的 GPU

将模型移到适当的设备上

用于处理音频的线程函数

创建停止事件用于终止线程

创建并启动音频处理线程

启动实时录音

在没有任何语音输入的情况 它不断识别出中文

录音参数

初始化音频队列

定义回调函数来处理实时音频输入

初始化 Whisper 模型和处理器

检查是否有可用的 GPU

将模型移到适当的设备上

用于处理音频的线程函数

创建停止事件用于终止线程

创建并启动音频处理线程

启动实时录音

在没有任何语音输入的情况它不断识别出中文