sidiali_project / app.py
Muhammed_Kotb1
test ziad model
5dd8287
raw
history blame
2.11 kB
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2CTCTokenizer
# Load the Arabic-specific processor and model.
# All three objects are loaded from the same checkpoint name and are created
# at import time; transcribe() reads `processor` and `model` as module-level
# globals.
model_name = "Zaid/wav2vec2-large-xlsr-53-arabic-egyptian"
# The tokenizer is built explicitly and then handed to the processor loader.
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name, tokenizer=tokenizer)
# CTC model whose logits transcribe() decodes by taking the argmax per frame.
model = Wav2Vec2ForCTC.from_pretrained(model_name)
def transcribe(audio_file):
    """Transcribe an audio file with the module-level Wav2Vec2 CTC model.

    The audio is loaded with torchaudio, resampled to 16 kHz when its native
    sample rate differs, reduced to a single channel, and decoded greedily
    (argmax over the CTC logits).

    Args:
        audio_file: Audio source passed straight to ``torchaudio.load``
            (typically a file path).

    Returns:
        The decoded transcription string, or ``None`` if any step raised an
        exception. The error is printed rather than re-raised.
    """
    target_sr = 16000
    try:
        # Load the audio file
        print("Loading audio file...")
        audio_input, sr = torchaudio.load(audio_file)
        print(f"Audio loaded: {audio_input.shape}, Sample rate: {sr}")

        # Resample if needed
        if sr != target_sr:
            print("Resampling audio...")
            resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
            audio_input = resampler(audio_input)
            sr = target_sr
            print(f"Audio shape after resampling: {audio_input.shape}, Sample rate: {sr}")

        # The processor is fed a single 1-D waveform. For multi-channel audio,
        # average the channels instead of keeping only the first one, so that
        # speech present on the other channels is not silently discarded.
        # Single-channel input takes the plain [0] path and is unchanged.
        if audio_input.shape[0] > 1:
            mono = audio_input.mean(dim=0)
        else:
            mono = audio_input[0]

        # Convert tensor to numpy array
        samples = mono.numpy()

        # Process audio input
        print("Processing audio input...")
        input_values = processor(samples, return_tensors="pt", sampling_rate=sr).input_values

        # Run model inference (no gradients needed for decoding)
        print("Running model inference...")
        with torch.no_grad():
            logits = model(input_values).logits

        # Decode transcription: pick the most likely token per frame
        print("Decoding transcription...")
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(
            predicted_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        return transcription[0]
    except Exception as e:
        # Best-effort contract: report the failure and signal it with None so
        # the caller can skip the output step.
        print(f"An error occurred: {e}")
        return None
# Transcribe the audio file
transcription = transcribe("sidiali.wav")
# transcribe() returns None on failure; an empty result is skipped as well.
if transcription:
    # The result is already a str, so it can be printed directly; an
    # encode/decode round-trip through UTF-8 would not change it.
    print(transcription)
    # Save the transcription to a file
    with open("transcription.txt", "w", encoding="utf-8") as f:
        f.write(transcription)
    print("Transcription saved to transcription.txt")