sidiali_project / app.py
Muhammed_Kotb1
add audio files in chunks to avoid OOMs
233fed6
raw
history blame
9.96 kB
import torch
import torchaudio
import jiwer
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
# Load the Arabic-specific processor and model
model_name = "omarxadel/wav2vec2-large-xlsr-53-arabic-egyptian"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)
# Set the model to evaluation mode
model.eval()
# Define the chunk length in seconds
CHUNK_LENGTH = 30 # 30 seconds
SAMPLE_RATE = 16000 # 16 kHz
def transcribe_chunk(chunk):
"""
Transcribes a single audio chunk.
"""
# Process the audio chunk with the processor
inputs = processor(chunk, sampling_rate=SAMPLE_RATE, return_tensors="pt", padding=True)
# Get logits from the model
with torch.no_grad():
logits = model(inputs.input_values).logits
# Decode the predicted ids to text
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
return transcription.strip()
def transcribe_audio_in_chunks(audio_file):
"""
Loads an audio file, splits it into chunks, transcribes each chunk, and combines the transcriptions.
"""
# Load the audio file
print("Loading audio file...")
waveform, sr = torchaudio.load(audio_file)
print(f"Original Sample Rate: {sr}, Waveform shape: {waveform.shape}")
# Convert to mono if it's stereo
if waveform.shape[0] > 1:
waveform = torch.mean(waveform, dim=0, keepdim=True)
print(f"Converted to mono. New shape: {waveform.shape}")
# Resample if needed
if sr != SAMPLE_RATE:
print(f"Resampling from {sr} to {SAMPLE_RATE}...")
resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)
waveform = resampler(waveform)
print(f"Resampled waveform shape: {waveform.shape}")
# Calculate the number of samples per chunk
chunk_size = CHUNK_LENGTH * SAMPLE_RATE
total_samples = waveform.shape[1]
num_chunks = (total_samples + chunk_size - 1) // chunk_size # Ceiling division
print(f"Total samples: {total_samples}, Chunk size (in samples): {chunk_size}, Number of chunks: {num_chunks}")
transcriptions = []
for i in range(num_chunks):
start_sample = i * chunk_size
end_sample = min(start_sample + chunk_size, total_samples)
chunk = waveform[:, start_sample:end_sample].squeeze(0)
print(f"Processing chunk {i+1}/{num_chunks}, Samples {start_sample}:{end_sample}")
# Normalize the audio
chunk = chunk / torch.abs(chunk).max()
transcription = transcribe_chunk(chunk.numpy())
transcriptions.append(transcription)
print(f"Transcription for chunk {i+1}: {transcription}\n")
# Combine all transcriptions
full_transcription = ' '.join(transcriptions)
return full_transcription.strip()
# Define reference transcription for WER calculation
reference_transcription = "ู‚ุงู„ ุงู„ู…ุตู†ู ุฑุญู…ู‡ ุงู„ู„ู‡ ุชุนุงู„ู‰ ูˆู†ูุนู†ุง ุงู„ู„ู‡ ุจุนู„ูˆู…ู‡ ููŠ ุงู„ุฏุงุฑูŠู† ุงู…ูŠู† ูุตู„ ูˆุงู„ุฑุจุง ููŠ ุงู„ุฐู‡ุจ ูˆุงู„ูุถู‡ ูˆุงู„ู…ุทุนูˆู…ุงุช ุงู„ุฑุจุง ุดูƒู„ ู…ู† ุงุดูƒุงู„ ุงู„ุจูŠูˆุน ููŠู‡ ุชุจุงุฏู„ ู…ุซู„ ุงู„ุซู…ู† ูˆุงู„ู…ุซู…ู† ูู‡ู†ุง ููŠู‡ ู…ุงู„ ุจุงุฒุงุก ู…ุงู„ ูˆู„ุฐู„ูƒ ุงุฎุชู„ุท ุงู„ุงู…ุฑ ุนู„ู‰ ุงู„ู…ุดุฑูƒูŠู† ููŠ ู…ูƒู‡ ูู‚ุงู„ูˆุง ุงู†ู…ุง ุงู„ุจูŠุน ู…ุซู„ ุงู„ุฑุจุง ูŠุนู†ูŠ ู‡ุฐุง ููŠู‡ ู…ุจุงุฏู„ู‡ ูˆู‡ุฐุง ููŠู‡ ู…ุจุงุฏู„ู‡ ูˆููŠ ุงู„ู…ุจุงุฏู„ู‡ ุฒูŠุงุฏู‡ ุจูŠู† ุงู„ุซู…ู† ูˆ ุงู„ู…ุซู…ู† ูˆู‡ู†ุง ุฒูŠุงุฏู‡ ุจูŠู† ุงู„ุซู…ู† ูˆุงู„ู…ุซู…ู† ูู…ุง ุงู„ูุฑู‚ ู†ุญู† ู†ุจูŠุน ูˆู†ุดุชุฑูŠ ูˆุงู„ูุฑู‚ ู‡ูˆ ุงู† ุงู„ู„ู‡ ุณุจุญุงู†ู‡ ูˆุชุนุงู„ู‰ ุงู…ุฑ ุจุญุฑู…ุฉ ุงู„ุฑุจุง ู…ู† ุงุฌู„ ุงู„ุญูุงุธ ุนู„ู‰ ู†ุธุงู… ุงู„ุงู‚ุชุตุงุฏ ุงู„ู†ู‚ุฏูŠ ู…ู† ุฏุงุฎู„ู‡ ูู„ูˆ ุงุจุงุญ ุงู„ู„ู‡ ุงู„ุฑุจุง ู„ูƒุงู† ู‡ู†ุงูƒ ุฒูŠุงุฏู‡ ููŠ ูˆุณูŠุท ุงู„ุชุจุงุฏู„ ุฏูˆู† ุฒูŠุงุฏุฉ ุงู„ุงู†ุชุงุฌ ูˆูŠุญุฏุซ ุจุฐู„ูƒ ุงู„ุชุถุฎู… ู…ู† ุฏุงุฎู„ ุงู„ู†ุธุงู… ูุงุฐุง ุญุฏุซ ุงู„ุชุถุฎู… ู…ู† ุฐุงุช ุงู„ู†ุธุงู… ู ุงู„ุฐูŠ ูŠุถุงุฑ ุจุฐู„ูƒ ู…ุญุฏูˆุฏ ุงู„ุฏุฎู„ ุงู„ุฐูŠ ูŠุถุงุฑ ุจุฐู„ูƒ ุงู„ูู‚ูŠุฑ ูุญุฑู… ุงู„ู„ู‡ ุงู„ุฑุจุง ู…ู† ุงุฌู„ ุงู„ุง ูŠู‚ุน ุธู„ู… ุนู„ู‰ ุงู„ูู‚ูŠุฑ ูˆุงูˆุฌุจ ุงู„ุฒูƒุงู‡ ุญุชู‰ ูŠุชุนุงุฏู„ ุฏุฎู„ ุงู„ูู‚ูŠุฑ ู„ุงุฏุงุก ู…ู‚ุชุถูŠุงุช ุงู„ุญูŠุงู‡ ูุชุญุฑูŠู… ุงู„ุฑุจุง ู…ุฑุชุจุท ููŠ ุงู„ู†ุธุฑ ุงู„ุงู„ู‡ูŠ ู…ุน ุงูŠุฌุงุจ ุงู„ุฒูƒุงู‡ ุญุฑู… ุงู„ู„ู‡ ุงู„ุฑุจุง ูˆุงูˆุฌุจ ุญุฑู… ูˆุงูˆุฌุจ ุงู„ุฒูƒุงู‡ ู…ู† ุงุฌู„ ุญู…ุงูŠุฉ ุงู†ุณุงู†ูŠุฉ ุงู„ูู‚ูŠุฑ ู…ุญุฏูˆุฏ ุงู„ุฏุฎู„ ูู„ุง ูŠุฌุฏ ุงู„ุงุณุนุงุฑ ููˆู‚ ุงู„ุทุงู‚ู‡ ูˆูŠุณุชุทูŠุน ุงู† ูŠุญุตู„ ุจุงุณุชู‡ู„ุงูƒู‡ ู„ู…ุง ุงุฎุฐู‡ ู…ู† ุงู„ุฒูƒุงู‡ ู…ุง ูŠุฑูŠุฏ ู…ู† ู…ู‚ุชุถูŠุงุช ุงู„ุญูŠุงู‡ ูุงุฐุง ูƒุงู† ุงู„ุงู…ุฑ ูƒุฐู„ูƒ ูˆู‡ุฐุง ู†ุธุงู… ุฑุจุงู†ูŠ ุงู„ู‡ูŠ ูƒุงู† ุญุฑุง ู„ุง ูŠุณุชุทูŠุน ุงุญุฏ ุงู† ูŠุชุญูƒู… ููŠู‡ ู„ุง ููŠ ุงุฌุชู…ุงุน ูˆู„ุง ููŠ ุณูŠุงุณู‡ ูˆู„ุง ููŠ ุงู‚ุชุตุงุฏ ูŠูƒู…ู„ ุงู„ู…ู†ุธูˆู…ู‡ ุงู„ุตู„ุงู‡ ูุงุฐุง ุฐู‡ุจู‡ุง ุงู„ู†ุงุณ ูˆู‚ููˆุง ู…ุชุฌุงูˆุฑูŠู† ุงู„ุบู†ูŠ ุจุฌูˆุงุฑู‡ ุงู„ูู‚ูŠุฑ ูˆุงู„ูู‚ูŠุฑ ุจุฌูˆุงุฑู‡ ุงู„ู…ุณูƒูŠู† ูˆุงู„ู…ุณูƒูŠู† ุจุฌูˆุงุฑู‡ ุงู„ุบู†ูŠ ู„ุง ูุฑู‚ ุจูŠู†ู‡ู…ุง ูˆุงู„ุตู ุงู„ุงูˆู„ ู…ู†ุงุฎ ู…ู† ุณุจู‚ ุณุจู‚ ุงู„ู‰ ุงู„ุตู ุงู„ุงูˆู„ ุงู„ุบู†ูŠ ุงูˆ ุณุจู‚ ุงู„ู‰ ุงู„ุตู ุงู„ุซุงู†ูŠ ุงูˆ ุณุจู‚ ุงู„ู‰ ุงู„ุตู ุงู„ุงูˆู„ ุงู„ูู‚ูŠุฑ ู„ุง ูŠุณุชุทูŠุน ุงุญุฏ ุงู† ูŠุฌุฐุจู‡ ู…ู† ุงูˆู„ูŠุชู‡ ุญุชู‰ ูŠุชุงุฎุฑ ุจุณุจุจ ุบู†ุงู‡ ุงูˆ ุฌุงู‡ู‡ ุงูˆ ุณู„ุทุงู†ู‡ ู„ุงู† ู…ู† ุฌุงุก ููˆุฌุฏ ุงู„ุตู„ุงู‡ ูู„ูŠู„ุญู‚ ุจู‡ุง ุญูŠุซ ู…ุง ุงู†ุชู‡ู‰ ุจู‡ ุงู„ุตู ูˆูƒุฐู„ูƒ ููŠ ุทู„ุจ ุงู„ุนู„ู… ุญูŠุซ ู…ุง ุงู†ุชู‡ู‰ ุจู‡ ุงู„ู…ุฌู„ุณ ู…ู† ุงูŠู† ุชุชุงุชู‰ ู‡ุฐู‡ ุงู„ู†ูุณูŠู‡ ุงู„ุงุจูŠู‡ ู…ู† ุฏูˆู† ูƒุจุฑ ุงู†ู…ุง ุจุนุฒู‡ ูˆุฑู‚ูŠ ูˆุงู†ุณุงู†ูŠู‡ ุจูˆุฌูˆุจ ุงู„ุฒูƒุงู‡ ูุงู„ุฒูƒุงุฉ ู„ูŠุณุช ู…ู†ู‡ ูˆู„ุง ุนุทูŠู‡ ูˆู„ุง ู‡ูŠ ูŠุฏ ุนู„ูŠุง ูˆ ุจุญุฑู…ุฉ ุงู„ุฑุจุง ุญุชูŠ ูŠุญุงูุธ ุนู„ู‰ ุงู„ู†ุธุงู… ุงู„ุงู‚ุชุตุงุฏูŠ ู…ู† ุฏุงุฎู„ู‡ ู…ู† ุงู„ุชุถุฎู… ุฐู‡ุจุช ุงู„ุงูŠุงู… ูˆุฌุงุกุช ุงู„ุงูŠุงู… ูˆุชุณู„ุท ุนู„ูŠู†ุง ุงุจุงู„ุณู‡ ุงู„ุฌู† ูˆุงู„ุงู†ุณ ูˆุงุฎุฑุฌูˆุง ุงู„ุฐู‡ุจ ูˆุงู„ูุถู‡ ู…ู† ุงู„ุชุนุงู…ู„ ูˆุงุณุชุจุฏู„ูˆุง ุจู‡ุง ุญูŠู„ู‡ ุณุฎูŠูู‡ ูˆู‡ูŠ ุญูŠู„ู‡ ุงู„ูˆุฑู‚ ุงู„ู†ู‚ุฏูŠ ุฌุนู„ูˆุง ู„ู‡ ุงูˆู„ุง ุบุทุงุก ุซู… ุจุนุฏ ุฐู„ูƒ ุชู†ุงุณูˆุง ุงู„ุบุทุงุก ูุงุตุจุญ ูˆุฑู‚ุง ู„ุง ู‚ูŠู…ู‡ ู„ู‡ ููŠ ุงู†ุชุงุฌู‡ ู…ุน ู‚ูŠู…ุชู‡ ุฏูŠู†ุงุฑ ุงู„ุฐู‡ุจ 4 ุฌุฑุงู… ูˆุฑุจุน ู„ูˆ ุณูŠุญู†ุงู‡ ูŠุทู„ุน 4 ุฌุฑุงู… ูˆุฑุจุน ุฐู‡ุจ ู„ูˆ ุตูƒูŠู†ุงู‡ ูŠุทู„ุน 4 ุฌุฑุงู… ูˆุฑุจุน ุฐู‡ุจ ู„ูˆ ุจุนู†ุงู‡ ูŠุทู„ุน 4 ุฌุฑุงู… ูˆุฑุจุน ุงู„ุฐู‡ุจ ู‚ูŠู…ุชู‡ ููŠู‡ ุงู„ูˆุฑู‚ู‡ ุงู„ 200 ุฌู†ูŠู‡ ุงู„ู„ูŠ ููŠ ุฌูŠุจูƒ ู…ุชูƒู„ูู‡ 16 ู‚ุฑุด ุญุชุฉ ุงู„ูˆุฑู‚ู‡ ุฏูŠ ูˆุดูˆูŠุฉ ุงู„ุงุญุจุงุฑ ุงู„ู„ูŠ ุนู„ูŠู‡ุง 16 ู‚ุฑุด ุทุจ ุงู„ูุฑู‚ ุจูŠู† 16 ู‚ุฑุด ูˆ200 ุฌู†ูŠู‡ ููŠู† ุณู„ุทุฉ ุงู„ุฏูˆู„ู‡ ุณู„ุทุฉ ุงู„ุงุตุฏุงุฑ ูˆุณู„ุทุฉ ุงู„ุฏูˆู„ู‡ ุฏูŠ ุจุชุชุญุณุจ ุงุฒุงูŠ ููŠ ู†ูุณู‡ุง ุจู‡ุง ูŠุนู†ูŠ ุฌูˆู‡ ู…ุตุฑ ูƒุฏู‡ ูŠุจู‚ู‰ ุจู‡ุง ูŠุนู†ูŠ ุงู„ 200 ุฌู†ูŠู‡ 200 ุฌู†ูŠู‡ ุทุจ ูˆููŠ ุฎุงุฑุฌู‡ุง ูŠุจู‚ู‰ ุจู‚ูˆุฉ ุงู†ุชุงุฌู‡ุง ูุงู„ุฏูˆู„ุงุฑ ูŠุจู‚ู‰ ุจ 50 ุฌู†ูŠู‡ ุงู„ู„ูŠ ู‡ูˆ ูƒุงู† ุจ 39 ู‚ุฑุด ูˆู†ุต ูŠุนู†ูŠ ูƒุงู† ุงู„ุฌู†ูŠู‡ ุจ 5 ุฏูˆู„ุงุฑ ูˆุจุนุฏูŠู† ู„ู…ุง ุงู†ู‡ุงุฑ ุจู‚ู‰ 2 ูˆู†ุต ููŠ ุงู„ุงูˆู„ ููŠ ุงู„ุงุฑุจุนูŠู†ุงุช ูƒุงู† ุงู„ุฌู†ูŠู‡ ุงู„ู…ุตุฑูŠ ุจ 5 ุฏูˆู„ุงุฑ 5 ุฏูˆู„ุงุฑ ููŠ 50 ุฌู†ูŠู‡ ูŠุจู‚ู‰ ุจูƒุงู… ูŠุจู‚ู‰ ุจ 250 ู…ุฑู‡ ูŠุจู‚ู‰ ุงู„ุฌู†ูŠู‡ ุงุจูˆ 40 ุณู†ุฉ 40 ุจูŠุณุงูˆูŠ 250 ุฌู†ูŠู‡ ู…ู† ุงู„ุฌู†ูŠู‡ุงุช ุงู„ู„ูŠ ู…ุนุงู†ุง ุฏูŠ ุชุฎูŠู„ ุงุถุฑุจ ุฏุฎู„ูƒ ููŠ 250 ูู„ูˆ ุณุงู„ู†ุง ุงู„ุดูŠุฎ ูู„ุงู† ูˆู„ุง ุนู„ุงู† ุจุชุงุฎุฏ ูƒุงู… ูŠู‚ูˆู„ู„ูƒ ุนุจุงุฑู‡ ุนู† 6000 ุฌู†ูŠู‡ ููŠ ุงู„ุดู‡ุฑ ุทุจ 6000 ุฌู†ูŠู‡ ููŠ ุงู„ุดู‡ุฑ ุงุถุฑุจู‡ุง ููŠ 250 ูŠุจู‚ู‰ 150 ู‚ุตุงุฏู‡ุง ุตูุฑ ูŠุจู‚ู‰ ู…ู„ูŠูˆู† ูˆู†ุต ู‡ุง ุชุฎูŠู„ ู†ูุณูƒ ุจู‚ู‰ ุงู† ุงู†ุช ู…ุนุงูƒ ู…ู„ูŠูˆู† ูˆู†ุต ูƒู„ ุดู‡ุฑ ู‡ุชุนุฑู ุชุณูƒู† ู‡ุชุนุฑู ุชุดุชุฑูŠ ู„ูƒ ุนุฑุจูŠู‡ ู…ู† ุงู… 2 ู…ู„ูŠูˆู† ุฏูŠ ุงู‚ู„ ุญุงุฌู‡ ุญุชู‰ ุงู„ุนุฑุจูŠู‡ ุงู… 14 ู…ู„ูŠูˆู† ู…ู…ูƒู† ุชุฌูŠุจู‡ุง ุจุณ ุจุงู„ุชู‚ุณูŠุท ุงู„ู„ู‡ ู‡ูˆ ุงูŠู‡ ุงู„ู„ูŠ ุญุตู„ ุชุถุฎู… ูˆุงู„ุชุถุฎู… ุฏู‡ ู…ุนู†ุงุชู‡ ุงูŠู‡ ู…ุนู†ุงุชู‡ ุงู† ู‡ุฐุง ุงู„ูˆุณูŠุท ู„ู„ุชุจุงุฏู„ ู„ู… ูŠุนุฏ ุณุจุจุง ู„ุงุบู†ุงุก ุงู„ูู‚ูŠุฑ ุงู„ูƒู„ุงู… ุฏู‡ ูƒู„ู‡ ูŠุง ุงุฎูŠ ู„ู‚ูŠู†ุง ุงู„ุดุงูุนูŠ ูู‡ู…ู‡ ุงู„ูƒู„ุงู… ุงู„ู„ูŠ ุงุญู†ุง ู‚ู„ู†ุงู‡ ุฏู‡ ูˆุงู„ุงู…ุงู… ุงู„ุบุฒุงู„ูŠ ุฑูƒู† ุงู„ุดุงูุนูŠู‡ ูุงู‡ู…ูŠู†ู‡ ุจูŠูƒุชุจูˆู‡ ุนุงุฏูŠ ูƒุฏู‡ ููŠู‚ูˆู„ ุฑุถูŠ ุงู„ู„ู‡ ุชุนุงู„ู‰ ุนู†ู‡ ููŠ ุงู„ุงู… ูˆุนู„ุฉ ุงู„ุฑุจุง ุชุนุจุฏูŠู‡ ูุงู„ุฐู‡ุจ ูˆุงู„ูุถู‡ ุฌุฒุก ุนู„ู‡ ูŠุนู†ูŠ ุฌุฒุก ุงู„ุนู„ู‡ ูŠุนู†ูŠ ุงุฐุง ุฎุฑุฌ ุนู† ุงู„ุฐู‡ุจูŠู‡ ูˆุงู„ูุถูŠู‡ ูู„ุง ุฑุจุง ูˆูŠู‚ูˆู„ ุงุจู† ุงู„ุญุงุฌุจ ูˆู‡ู„ ุญุฑู… ุงู„ุฑุจุง ู„ุฌูˆู‡ุฑู‡ู…ุง ุงูˆ ู„ุฌูˆู‡ุฑูŠุชู‡ู…ุง ุงู„ุฐู‡ุจ ูˆุงู„ูุถู‡ ู„ุฌูˆู‡ุฑู‡ู…ุง ุนุดุงู† ู‡ู… ุฐู‡ุจ ูˆูุถู‡ ูˆู„ุง ู„ุงู†ู‡ู… ูˆุณูŠุท ู„ู„ุชุจุงุฏู„ ู‚ุงู„ ุจู„ ู„ุฌูˆู‡ุฑู‡ู…ุง ู„ุฌูˆู‡ุฑู‡ู…ุง ูŠุนู†ูŠ ู„ุงู†ู‡ู… ุฐู‡ุจ ูˆูุถู‡ ู…ุด ู„ุฌูˆู‡ุฑูŠุชู‡ู…ุง ูŠุนู†ูŠ ูƒูˆู†ู‡ู… ูˆุณูŠุท ู„ู„ุชุจุงุฏู„ ุจูŠู† ุงู„ู†ุงุณ ู„ูˆ ูƒุงู† ูˆุณูŠุท "
# Transcribe the audio file in chunks
audio_file = "sidiali_reba_zahab_feda.wav"
transcription = transcribe_audio_in_chunks(audio_file)
if transcription:
print("Final Transcription:\n", transcription)
# Calculate Word Error Rate (WER)
wer = jiwer.wer(reference_transcription, transcription)
print(f"\nWord Error Rate (WER): {wer}")
# Save the transcription and WER to a file
with open("evaluation_results.txt", "w", encoding="utf-8") as f:
f.write(f"Transcription:\n{transcription}\n\n")
f.write(f"Reference:\n{reference_transcription}\n\n")
f.write(f"Word Error Rate (WER): {wer}\n")
print("\nEvaluation results saved to evaluation_results.txt")
else:
print("Transcription failed.")