Spaces:

pknayak
/

bhashini_techathon

Sleeping

App Files Files Community

bhashini_techathon / app.py

pknayak

Update app.py

3295d39 verified 3 months ago

raw

history blame

3.6 kB


	# imports for audio loading and the speech-recognition model
	from datasets import Audio, load_dataset, Dataset
	import torch
	from transformers import pipeline

	from pathlib import Path

	# for the UI by Gradio
	import gradio as gr
	import pandas as pd


	# Pick the execution target: the first CUDA GPU in half precision when torch
	# can see one, otherwise the CPU in full precision.
	DEVICE, TORCH_DTYPE = (
	    ("cuda:0", torch.float16)
	    if torch.cuda.is_available()
	    else ("cpu", torch.float32)
	)

	# Checkpoint loaded by the speech-recognition pipeline below.
	MODEL_NAME = 'openai/whisper-large-v3'

	# Module-level pipeline, created once when the file is imported and reused
	# by every request.
	pipe = pipeline(
	    task="automatic-speech-recognition",
	    model=MODEL_NAME,
	    torch_dtype=TORCH_DTYPE,
	    device=DEVICE,
	)




	def convert_audio_2_array(files):
	    """Wrap one uploaded audio file in a single-row ``datasets.Dataset``.

	    Args:
	        files: Path of the uploaded audio file (``str`` or ``Path``).  An
	            object that exposes its path through a ``name`` attribute
	            (for example a temporary-file wrapper) is accepted as well.

	    Returns:
	        A ``(file_name, complaint_data)`` tuple: the base name of the file
	        and a dataset whose only column, ``"audio"``, holds that one file
	        and is cast to ``Audio(sampling_rate=16000)``.
	    """
	    # ``Path(...)`` below needs a path-like value, so unwrap file objects
	    # and normalise ``Path`` instances to plain strings first.
	    if isinstance(files, (str, Path)):
	        file_path = str(files)
	    else:
	        file_path = getattr(files, "name", files)

	    complaint_data = Dataset.from_dict(
	        {"audio": [file_path]}
	    ).cast_column("audio", Audio(sampling_rate=16000))
	    file_name = Path(file_path).name

	    # Debug trace of what was received and what was built from it.
	    print(f"file_paths: \n {file_path} and \n file_name: {file_name} and \n complaint_data : \n {complaint_data} ")
	    return file_name, complaint_data


	def v2t_convertor(files):
	    """Transcribe one uploaded audio file and translate it to English.

	    The module-level ``pipe`` is run twice on the same audio: once with
	    ``task="transcribe"`` and once with ``task="translate"``.

	    Args:
	        files: The uploaded audio file, forwarded unchanged to
	            ``convert_audio_2_array``.

	    Returns:
	        A one-row table ``[[file_name, org_complain, eng_complain,
	        language_code]]`` holding the file's base name, the ``"text"`` of
	        the transcribe run, the ``"text"`` of the translate run, and the
	        language code (currently always ``"hi"``).
	    """
	    file_name, inputs_dict = convert_audio_2_array(files)

	    # The dataset is built from a single file, so row 0 is the only row.
	    sample = inputs_dict[0]

	    # Each pipeline call receives its own copy of the audio dict.
	    # NOTE(review): presumably because the pipeline modifies the dict it is
	    # handed -- confirm before removing the copies.
	    org_complain_dict = pipe(
	        sample["audio"].copy(),
	        max_new_tokens=256,
	        generate_kwargs={"task": "transcribe"},
	    )
	    print(f"{org_complain_dict}")
	    org_complain = org_complain_dict["text"]

	    # TODO: detect the spoken language instead of hard-coding it. Earlier
	    # attempt, kept for reference:
	    # lang_token = pipe.model.generate(input, max_new_tokens=1)[0,1]
	    # language_code = pipe.tokenizer.decode(lang_token)
	    language_code = "hi"

	    eng_complain = pipe(
	        sample["audio"].copy(),
	        max_new_tokens=256,
	        generate_kwargs={"task": "translate"},
	    )["text"]

	    print(f"org_complain: \n {org_complain} \nand\n eng_complain:\n {eng_complain} \n language_code: {language_code}")

	    return [[file_name, org_complain, eng_complain, language_code]]



	def upload_file(files):
	    """Turn an uploaded audio file into a one-row results table.

	    The file received from the UI is handed to ``v2t_convertor``; the rows
	    it returns are wrapped in a ``pandas.DataFrame``.

	    Args:
	        files: The uploaded file as delivered by the upload button.

	    Returns:
	        A DataFrame with the columns ``audio_id``,
	        ``transcribed_text( in org lang )``, ``transcribed_text( in eng )``
	        and ``language``.

	    Raises:
	        ValueError: If ``v2t_convertor`` returns ``None``.
	    """
	    # Rows arrive as a list of lists, one inner list per audio file, e.g.
	    # [["ca_1.wav", "बिना किसी पूर्व सूचना के विलंबित या रद्द की गई ट्रिनिक", "without any prior information or any delay or delay in the train journey", "hindi"]]
	    rows = v2t_convertor(files)
	    if rows is None:
	        raise ValueError("Data is None. Cannot create DataFrame.")

	    column_names = [
	        "audio_id",
	        "transcribed_text( in org lang )",
	        "transcribed_text( in eng )",
	        "language",
	    ]
	    return pd.DataFrame(rows, columns=column_names)


	# UI layout: a title, collapsible usage steps, an upload button and a
	# one-row results table that is filled by ``upload_file``.
	with gr.Blocks() as demo:
	    gr.Markdown("# V2T Engine")
	    with gr.Accordion("Steps to run the App"):
	        gr.Markdown("1. Click the \"Click to Upload a File\" to get the dialog box to browse your local path.")
	        gr.Markdown("2. The audio will get uploaded, transcribed in its original language and translated into English.")
	        gr.Markdown("3. The results are shown in the table below.")

	    upload_button = gr.UploadButton("Click to Upload a File", file_types=["audio"], file_count="single")
	    df_output = gr.Dataframe(
	        # Same column order as the rows produced by ``upload_file``:
	        # original-language text first, English text second.
	        headers=["audio_id", "transcribed_text( in org lang )", "transcribed_text( in eng )", "language"],
	        # One datatype per column (four columns).
	        datatype=["str", "str", "str", "str"],
	        row_count=1,
	        col_count=(4, "fixed"),
	        wrap=True,
	    )
	    # On upload, pass the file to ``upload_file`` and show the returned
	    # table in ``df_output``. The listener is registered inside the Blocks
	    # context so it is attached to this app.
	    upload_button.upload(upload_file, upload_button, df_output, show_progress=True)



	# Start the Gradio server only when this file is executed as a script, so
	# importing the module does not launch the app.
	if __name__ == "__main__":
	    demo.launch(debug=True)