Ahmedalla committed
Commit
99b670d
1 Parent(s): f6f28e8

Upload 2 files

Files changed (2)
  1. app.py +69 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,69 @@
+ import gradio as gr
+ import numpy as np
+ import torch
+ import torchaudio
+ from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, WhisperTokenizer
+
+ # Path to the merged LoRA model and processor
+ model_name = "userdata/whisper-largeV2-03-ms-v11-LORA-Merged"
+
+ # Load the processor, tokenizer, and model
+ processor = AutoProcessor.from_pretrained(model_name)
+ tokenizer = WhisperTokenizer.from_pretrained(model_name)
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)
+
+ # Pick the device and a matching dtype; float16 is only safe on GPU
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ dtype = torch.float16 if device.type == "cuda" else torch.float32
+ model = model.to(device, dtype=dtype)
+ model.eval()
+
+ # Split a 1-D audio array into fixed-length chunks (the last one may be shorter)
+ def chunk_audio(audio, chunk_length):
+     num_chunks = len(audio) // chunk_length + (1 if len(audio) % chunk_length > 0 else 0)
+     return [audio[i * chunk_length:(i + 1) * chunk_length] for i in range(num_chunks)]
+
+ # Transcribe an audio file in 30-second chunks at 16 kHz
+ def transcribe(audio_path, chunk_length=16000 * 30):
+     # Load the audio and mix it down to mono
+     speech_array, sampling_rate = torchaudio.load(audio_path)
+     speech_array = speech_array.mean(dim=0)
+     # Resample to 16 kHz, the rate Whisper expects
+     if sampling_rate != 16000:
+         resampler = torchaudio.transforms.Resample(sampling_rate, 16000)
+         speech_array = resampler(speech_array)
+     speech = speech_array.numpy()
+
+     # Chunk the audio if it's too long
+     chunks = chunk_audio(speech, chunk_length)
+
+     # Transcribe each chunk
+     transcriptions = []
+     for chunk in chunks:
+         # Extract log-mel features and move them to the model's device and dtype
+         inputs = processor(chunk, sampling_rate=16000, return_tensors="pt")
+         input_features = inputs["input_features"].to(device, dtype=dtype)
+
+         # Generate token IDs
+         with torch.no_grad():
+             generated_ids = model.generate(input_features, max_length=448)
+
+         # Decode the token IDs to text
+         transcription = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
+         transcriptions.append(transcription)
+
+     # Combine the per-chunk transcriptions
+     full_transcription = ' '.join(transcriptions)
+     return full_transcription
+
+ # Create the Gradio interface
+ iface = gr.Interface(
+     fn=transcribe,
+     inputs=gr.Audio(type="filepath"),
+     outputs=gr.Textbox(),
+     title="Audio Transcription App",
+     description="Upload an audio file to get a transcription."
+ )
+
+ # Launch the Gradio interface
+ iface.launch()
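A quick way to sanity-check the chunk arithmetic above without downloading the model is to run the splitting logic on a synthetic signal. A minimal sketch (the function body is copied from app.py; the 65-second clip is a made-up example):

    import numpy as np

    def chunk_audio(audio, chunk_length):
        # same splitting logic as in app.py above
        num_chunks = len(audio) // chunk_length + (1 if len(audio) % chunk_length > 0 else 0)
        return [audio[i * chunk_length:(i + 1) * chunk_length] for i in range(num_chunks)]

    # a 65-second mono clip at 16 kHz should split into 30 s + 30 s + 5 s
    audio = np.zeros(16000 * 65, dtype=np.float32)
    chunks = chunk_audio(audio, 16000 * 30)
    print([len(c) for c in chunks])  # [480000, 480000, 80000]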
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ transformers==4.41.2
+ gradio
+ torch
+ torchaudio
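To run this Space locally, the usual workflow (not spelled out in the commit itself) is: pip install -r requirements.txt, then python app.py. Gradio's launch() serves the interface at http://localhost:7860 by default.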