# imports for the model, audio handling, and file paths
from datasets import Audio, Dataset
import torch
from transformers import pipeline
from pathlib import Path
# for the UI built with Gradio
import gradio as gr
import pandas as pd
# pick the device and dtype: half precision on GPU, full precision on CPU
if torch.cuda.is_available():
    DEVICE = "cuda:0"
    TORCH_DTYPE = torch.float16
else:
    DEVICE = "cpu"
    TORCH_DTYPE = torch.float32
# MODEL_NAME = 'openai/whisper-large-v3'
MODEL_NAME = 'openai/whisper-small'
pipe = pipeline(
    "automatic-speech-recognition",
    model=MODEL_NAME,
    torch_dtype=TORCH_DTYPE,
    device=DEVICE,
)
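# For clips longer than 30 s, the ASR pipeline can chunk the audio itself.
# A hedged sketch: chunk_length_s is a standard pipeline argument, but the
# 30 s value is an assumption, not a setting taken from this app.
# pipe = pipeline(
#     "automatic-speech-recognition",
#     model=MODEL_NAME,
#     torch_dtype=TORCH_DTYPE,
#     device=DEVICE,
#     chunk_length_s=30,
# )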
def convert_audio_2_array(files):
    """Wrap the uploaded file path in a Dataset and resample the audio to 16 kHz."""
    file_paths = files
    complaint_data = Dataset.from_dict({
        "audio": [file_paths]  # path supplied by the Gradio upload button
    }).cast_column("audio", Audio(sampling_rate=16000))
    file_name = Path(file_paths).name
    print(f"file_paths:\n {file_paths} and\n file_name: {file_name} and\n complaint_data:\n {complaint_data}")
    return file_name, complaint_data
def v2t_convertor(files):
    """Transcribe the complaint in its original language and translate it into English."""
    file_name, inputs_dict = convert_audio_2_array(files)
    sample = inputs_dict[0]  # single-file upload, so take the only row
    org_complain_dict = pipe(sample["audio"].copy(), max_new_tokens=256, generate_kwargs={"task": "transcribe"})
    print(f'{org_complain_dict}')
    org_complain = org_complain_dict['text']
    # lang_token = pipe.model.generate(input, max_new_tokens=1)[0, 1]
    # language_code = pipe.tokenizer.decode(lang_token)
    language_code = "hi"  # hard-coded for now; see the detection sketch below
    eng_complain = pipe(sample["audio"].copy(), max_new_tokens=256, generate_kwargs={"task": "translate"})['text']
    print(f"org_complain:\n {org_complain}\nand\n eng_complain:\n {eng_complain}\nlanguage_code: {language_code}")
    return [[file_name, org_complain, eng_complain, language_code]]
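# A hedged sketch of the language detection stubbed out above, following the
# commented-out generate() idea: with no prompt, the first token Whisper emits
# after <|startoftranscript|> is a language token such as '<|hi|>'. The name
# detect_language and the helper itself are illustrative, not part of the app.
def detect_language(audio_array, sampling_rate=16000):
    """Return Whisper's predicted language token for a raw audio array."""
    features = pipe.feature_extractor(
        audio_array, sampling_rate=sampling_rate, return_tensors="pt"
    ).input_features.to(DEVICE, dtype=TORCH_DTYPE)
    generated = pipe.model.generate(features, max_new_tokens=1)
    # position 0 is <|startoftranscript|>, position 1 is the language token
    return pipe.tokenizer.decode(generated[0, 1])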
def upload_file(files):  # the actual transcription and translation are triggered here
    """
    Takes the file uploaded through the UI and converts it to the format
    expected by the model for transcription.
    """
    # Define the column names
    columns = ["audio_id", "transcribed_text( in org lang )", "transcribed_text( in eng )", "language"]
    # The converter returns a single row per uploaded file, e.g.:
    # data = [ ["ca_1.wav", "बिना किसी पूर्व सूचना के विलंबित या रद्द की गई ट्रिनिक", "without any prior information or any delay or delay in the train journey", "hindi"]]
    data = v2t_convertor(files)
    # Create the output DataFrame
    if data is not None:
        df = pd.DataFrame(data, columns=columns)
    else:
        raise ValueError("Data is None. Cannot create DataFrame.")
    return df
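# Running the same flow outside Gradio (hypothetical local path):
#   df = upload_file("ca_1.wav")
#   print(df.to_string(index=False))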
with gr.Blocks() as demo:
    gr.Markdown("# V2T Engine")
    with gr.Accordion("Steps to run the App"):
        gr.Markdown("1. Click \"Click to Upload a File\" to open a dialog box and browse your local files.")
        gr.Markdown("2. The uploaded audio will be transcribed in the original language and translated into English.")
        gr.Markdown("3. The transcriptions and translations will be displayed in a table, which is how they will be stored in the database.")
    upload_button = gr.UploadButton("Click to Upload a File", file_types=["audio"], file_count="single")
    df_output = gr.Dataframe(
        headers=["audio_id", "transcribed_text( in org lang )", "transcribed_text( in eng )", "language"],
        datatype=["str", "str", "str", "str"],
        row_count=1,
        col_count=(4, "fixed"),
        wrap=True
    )
    # upload the audio file and send it to the processing function
    upload_button.upload(upload_file, upload_button, df_output, show_progress=True)
if __name__ == "__main__":
    demo.launch(debug=True)