# imports for data handling and the speech-recognition model
from datasets import Audio, Dataset
import torch
from transformers import pipeline
from pathlib import Path

# imports for the Gradio UI
import gradio as gr
import pandas as pd

# pick the device and dtype based on GPU availability
if torch.cuda.is_available():
    DEVICE = "cuda:0"
    TORCH_DTYPE = torch.float16
else:
    DEVICE = "cpu"
    TORCH_DTYPE = torch.float32

# MODEL_NAME = 'openai/whisper-large-v3'
MODEL_NAME = 'openai/whisper-small'

pipe = pipeline(
    "automatic-speech-recognition",
    model=MODEL_NAME,
    torch_dtype=TORCH_DTYPE,
    device=DEVICE,
)


def convert_audio_2_array(files):
    """Wrap the uploaded file in a datasets.Dataset and resample it to the
    16 kHz sampling rate that Whisper expects."""
    # Gradio may hand us a plain path string or a tempfile-like wrapper,
    # depending on the Gradio version; normalize to a path string.
    file_paths = files if isinstance(files, str) else files.name
    complaint_data = Dataset.from_dict({
        "audio": [file_paths]
    }).cast_column("audio", Audio(sampling_rate=16000))
    file_name = Path(file_paths).name
    print(f"file_paths:\n {file_paths}\nfile_name: {file_name}\ncomplaint_data:\n {complaint_data}")
    return file_name, complaint_data


def v2t_convertor(files):
    """Transcribe the audio in its original language, translate it to
    English, and return a single table row."""
    file_name, inputs_dict = convert_audio_2_array(files)
    sample = inputs_dict[0]  # selecting only the first (and only) input
    org_complain_dict = pipe(
        sample["audio"].copy(),
        max_new_tokens=256,
        generate_kwargs={"task": "transcribe"},
    )
    print(f'{org_complain_dict}')
    org_complain = org_complain_dict['text']
    # lang_token = pipe.model.generate(sample, max_new_tokens=1)[0, 1]
    # language_code = pipe.tokenizer.decode(lang_token)
    language_code = "hi"  # hard-coded for now; see the detection sketch at the end of this file
    eng_complain = pipe(
        sample["audio"].copy(),
        max_new_tokens=256,
        generate_kwargs={"task": "translate"},
    )['text']
    print(f"org_complain:\n {org_complain}\neng_complain:\n {eng_complain}\nlanguage_code: {language_code}")
    return [[file_name, org_complain, eng_complain, language_code]]


def upload_file(files):
    """Takes the file that comes from the UI, runs transcription and
    translation on it, and returns the results as a DataFrame."""
    # Define the column names
    columns = ["audio_id", "transcribed_text( in org lang )", "transcribed_text( in eng )", "language"]
    # The data arrives as a single row, e.g.:
    # data = [["ca_1.wav", "बिना किसी पूर्व सूचना के विलंबित या रद्द की गई ट्रिनिक",
    #          "without any prior information or any delay or delay in the train journey", "hindi"]]
    data = v2t_convertor(files)
    # Create the DataFrame output
    if data is not None:
        df = pd.DataFrame(data, columns=columns)
    else:
        raise ValueError("Data is None. Cannot create DataFrame.")
    return df


with gr.Blocks() as demo:
    gr.Markdown("# V2T Engine")
    with gr.Accordion("Steps to run the App"):
        gr.Markdown("1. Click \"Click to Upload a File\" to open a dialog box and browse your local files.")
        gr.Markdown("2. The uploaded audio will be transcribed in the original language and translated into English.")
        gr.Markdown("3. The transcriptions and translations will be displayed in a table, which is how they will be stored in the database.")
    upload_button = gr.UploadButton("Click to Upload a File", file_types=["audio"], file_count="single")
    df_output = gr.Dataframe(
        # column order matches the DataFrame built in upload_file
        headers=["audio_id", "transcribed_text( in org lang )", "transcribed_text( in eng )", "language"],
        datatype=["str", "str", "str", "str"],  # one dtype per column
        row_count=1,
        col_count=(4, "fixed"),
        wrap=True,
    )
    # upload the audio file and send it to the upload_file handler
    upload_button.upload(upload_file, upload_button, df_output, show_progress=True)

if __name__ == "__main__":
    demo.launch(debug=True)
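
# --- Optional: automatic language detection (a hedged sketch, not part of the
# original flow). The commented-out lines in v2t_convertor hint at decoding
# Whisper's language token: when no language is forced, the generated sequence
# starts with <|startoftranscript|> followed by a language token such as
# <|hi|>. One way to recover it, assuming pipe.feature_extractor mirrors the
# pipeline's own preprocessing (verify against your installed transformers
# version before relying on this):
#
#   features = pipe.feature_extractor(
#       sample["audio"]["array"], sampling_rate=16000, return_tensors="pt"
#   ).input_features.to(DEVICE, dtype=TORCH_DTYPE)
#   ids = pipe.model.generate(features, max_new_tokens=1)
#   language_code = pipe.tokenizer.decode(ids[0, 1]).strip("<|>")  # e.g. "hi"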