# MMESA-ZeroGPU/tabs/whisperVoiceMetrics_OK.py
import gradio as gr
from transformers import pipeline
import librosa
import numpy as np
import matplotlib
matplotlib.use("Agg")  # non-interactive backend so plotting works on a headless server
import matplotlib.pyplot as plt
# Load Whisper model using transformers pipeline
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
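# The ASR pipeline returns a dict with the transcript under the "text" key,
# e.g. transcriber("clip.wav") -> {"text": " Hello world."} ("clip.wav" is a hypothetical file).
# For recordings longer than ~30 s, chunked decoding can be enabled, e.g.:
# transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en", chunk_length_s=30)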
def analyze_audio(audio):
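    """Transcribe an audio file and compute simple prosodic metrics.

    audio: filepath supplied by the Gradio Audio component.
    Returns the transcription, tempo (BPM), speaking pace (words/sec),
    pitch variance, stress level (pitch std dev), and the path to a
    pitch-contour plot image.
    """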
    # Convert audio to text using Whisper
    transcription_result = transcriber(audio)
    transcription = transcription_result["text"]
    # Load the audio; sr=None preserves the file's native sampling rate
    y, sr = librosa.load(audio, sr=None)
    # Extract prosodic features; yin yields one fundamental-frequency estimate (Hz) per frame.
    # Passing sr explicitly matters here because the file was loaded at its native rate.
    pitch = librosa.yin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr)
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    tempo = float(np.atleast_1d(tempo)[0])  # tempo may be a scalar or a 1-element array depending on librosa version
    # Calculate pitch variance
    pitch_variance = float(np.var(pitch))
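    # Note: pitch variance is in Hz^2; larger values indicate wider pitch excursions.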
    # Estimate speaking pace. This is a rough proxy: it counts whitespace-separated
    # words in the transcription rather than true syllables.
    num_words = len(transcription.split())
    duration = librosa.get_duration(y=y, sr=sr)
    pace = num_words / duration if duration > 0 else 0.0
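    # A closer (still approximate) syllable estimate, not used here, could count vowel groups, e.g.:
    #   import re
    #   syllables = sum(max(1, len(re.findall(r'[aeiouy]+', w.lower()))) for w in transcription.split())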
    # Plot the pitch contour against time in seconds
    # (times_like assumes yin's default hop length of 512 samples)
    times = librosa.times_like(pitch, sr=sr)
    plt.figure(figsize=(10, 4))
    plt.plot(times, pitch, label='Pitch')
    plt.xlabel('Time (s)')
    plt.ylabel('Frequency (Hz)')
    plt.title('Pitch Over Time')
    plt.legend()
    pitch_plot_path = '/tmp/pitch_contour.png'
    plt.savefig(pitch_plot_path)
    plt.close()
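    # Note: '/tmp' assumes a Linux host; tempfile.NamedTemporaryFile(suffix=".png", delete=False).name
    # would be a more portable destination.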
    # Voice stress analysis (simplified): pitch standard deviation as a rough stress indicator
    stress_level = float(np.std(pitch))
    return transcription, tempo, pace, pitch_variance, stress_level, pitch_plot_path
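# Direct (non-UI) usage sketch, assuming a local file named "sample.wav" exists:
#   text, bpm, pace, variance, stress, plot_path = analyze_audio("sample.wav")
#   print(text, bpm, pace, variance, stress, plot_path)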
# Create Gradio interface
input_audio = gr.Audio(label="Input Audio", type="filepath")
iface = gr.Interface(
    fn=analyze_audio,
    inputs=input_audio,
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Number(label="Tempo (BPM)"),
        gr.Number(label="Speaking Pace (words/sec)"),
        gr.Number(label="Pitch Variance"),
        gr.Number(label="Stress Level (pitch std dev)"),
        gr.Image(label="Pitch Contour Plot")
    ],
    live=True
)
if __name__ == "__main__":
    iface.launch(share=False)  # set share=True for a temporary public link