# Obj-to-voice / app.py
import gradio as gr
import torch
import os
import requests
import json
import cv2
from PIL import Image
from timeit import default_timer as timer
import numpy as np
import aiofiles
from transformers import AutoModel
from gtts import gTTS
import io
import time
from gtts.lang import _main_langs
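# generated speech files are written here and deleted once older than MAX_FILE_AGE seconds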
AUDIO_DIR = 'audio_files'
MAX_FILE_AGE = 24 * 60 * 60
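# pretrained YOLOv5s object detector loaded from the ultralytics hub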
model = torch.hub.load('ultralytics/yolov5','yolov5s', pretrained=True)
#model1 = AutoModel.from_pretrained(model)
cnt = 0  # counter used to name the per-frame JSON dumps written by turn_img_into_voice
def LCR(bbox, x_img, y_img):
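    """Classify the horizontal position of a detection as left / center / right.

    bbox is [x1, y1, x2, y2] in pixels; x_img and y_img are the image width and height.
    The x-extent is normalised to [0, 1]: boxes entirely inside the left 20% are
    "On the left", entirely inside the right 20% are "On the right", and borderline
    boxes are resolved by the midpoint of the normalised extent.
    """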
x1 = bbox[0]/x_img
x2 = bbox[2]/x_img
if x1 < 0.2 and x2 < 0.2 :
location = "On the left"
elif x1 > 0.8 and x2 > 0.8:
location = "On the right"
elif x1 < 0.2 and (x2 <= 0.8 and x2 >= 0.2):
if (x1 + x2) < 0.4:
location = "On the left"
else:
location = "At the center"
elif x2 > 0.8 and (x1 <= 0.8 and x1 >= 0.2):
if (x1 + x2) > 1.6:
location = "On the right"
else:
location = "At the center"
else:
location = "At the center"
print(f"x1 {x1} x2 {x2} bbox0 {bbox[0]} bbox2 {bbox[2]} x_img {x_img} LocationLCR {location}")
return location
def ACB(bbox, x_img, y_img, location):
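    """Refine an "At the center" label into top / center / bottom.

    Boxes already labelled left or right are returned unchanged; otherwise the
    normalised y-extent is compared against thirds of the image height, with
    borderline boxes resolved by the midpoint of the extent.
    """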
y1 = bbox[1]/y_img
y2 = bbox[3]/y_img
if location == "At the center":
if y1 < 0.33333 and y2 < 0.33333 :
location = "On the top"
elif y1 > 0.66667 and y2 > 0.66667:
location = "On the bottom"
elif y1 < 0.33333 and (y2 <= 0.66667 and y2 >= 0.33333):
if (y1 + y2) < 0.66667:
location = "On the top"
else:
location = "At the center"
elif y2 > 0.66667 and (y1 <= 0.66667 and y1 >= 0.33333):
if (y1 + y2) > 1.33333:
location = "On the bottom"
else:
location = "At the center"
else:
location = "At the center"
else:
pass
print(f"y1 {y1} y2 {y2} bbox1 {bbox[1]} bbox3 {bbox[3]} y_img {y_img} Location{location}")
return location
#print(bbox[0])
def image_to_text(data):
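    """Group detections by (location, class) and phrase each group as a short English sentence."""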
count = {}
    for infor in data:
        key = infor['Location'] + ':' + infor['Class']
        count[key] = count.get(key, 0) + 1
text = ""
    for key, value in count.items():
        location, class_name = key.split(":")
        if value > 1:
            verb = "are"
            class_name = "people" if class_name == "person" else class_name + "s"
        else:
            verb = "is"
        text += f"{location}, there {verb} {value} {class_name}. "
return text
def delete_old_audio_files():
# delete audio files older than MAX_FILE_AGE
now = time.time()
for file_name in os.listdir(AUDIO_DIR):
file_path = os.path.join(AUDIO_DIR, file_name)
if now - os.path.getmtime(file_path) > MAX_FILE_AGE:
os.remove(file_path)
# list of supported TLDs
tlds = [
"com",
"ad",
"ae",
"com.af",
"com.ag",
"com.ai",
"com.ar",
"as",
"at",
"com.au",
"az",
"ba",
"com.bd",
"be",
"bf",
"bg",
"bj",
"br",
"bs",
"bt",
"co.bw",
"by",
"com.bz",
"ca",
"cd",
"ch",
"ci",
"co.ck",
"cl",
"cm",
"cn",
"com.co",
"co.cr",
"cv",
"dj",
"dm",
"com.do",
"dz",
"com.ec",
"ee",
"com.eg",
"es",
"et",
"fi",
"com.fj",
"fm",
"fr",
"ga",
"ge",
"gg",
"com.gh",
"com.gi",
"gl",
"gm",
"gr",
"com.gt",
"gy",
"com.hk",
"hn",
"ht",
"hr",
"hu",
"co.id",
"ie",
"co.il",
"im",
"co.in",
"iq",
"is",
"it",
"iw",
"je",
"com.je",
"jo",
"co.jp",
"co.ke",
"com.kh",
"ki",
"kg",
"co.kr",
"com.kw",
"kz",
"la",
"com.lb",
"li",
"lk",
"co.ls",
"lt",
"lu",
"lv",
"com.ly",
"com.ma",
"md",
"me",
"mg",
"mk",
"ml",
"mm",
"mn",
"ms",
"com.mt",
"mu",
"mv",
"mw",
"com.mx",
"com.my",
"co.mz",
"na",
"ng",
"ni",
"ne",
"nl",
"no",
"com.np",
"nr",
"nu",
"co.nz",
"com.om",
"pa",
"pe",
"pg",
"ph",
"pk",
"pl",
"pn",
"com.pr",
"ps",
"pt",
"com.py",
"com.qa",
"ro",
"ru",
"rw",
"com.sa",
"com.sb",
"sc",
"se",
"com.sg",
"sh",
"si",
"sk",
"com.sl",
"sn",
"so",
"sm",
"sr",
"st",
"com.sv",
"td",
"tg",
"co.th",
"com.tj",
"tl",
"tm",
"tn",
"to",
"com.tr",
"tt",
"com.tw",
"co.tz",
"com.ua",
"co.ug",
"co.uk",
"com,uy",
"co.uz",
"com.vc",
"co.ve",
"vg",
"co.vi",
"com.vn",
"vu",
"ws",
"rs",
"co.za",
"co.zm",
"co.zw",
"cat",
]
def text_to_speech(text, lang, tld):
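    """Synthesise `text` with gTTS for the selected language name and Google TLD,
    save the audio under AUDIO_DIR and return the file path (twice, for the
    Audio and File outputs)."""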
# map the language name to its corresponding code
lang_codes = {lang_name: lang_code for lang_code, lang_name in _main_langs().items()}
lang_code = lang_codes[lang]
# create the text-to-speech audio
tts = gTTS(text, lang=lang_code, tld=tld)
fp = io.BytesIO()
tts.write_to_fp(fp)
fp.seek(0)
# create the audio directory if it does not exist
os.makedirs(AUDIO_DIR, exist_ok=True)
# generate a unique file name for the audio file
    file_name = str(time.time()) + '.mp3'  # gTTS produces MP3-encoded audio
file_path = os.path.join(AUDIO_DIR, file_name)
# save the audio stream to a file
with open(file_path, 'wb') as f:
f.write(fp.read())
# delete old audio files
delete_old_audio_files()
# return the file path
return file_path, f.name
def turn_img_into_voice(frame, lang, tld):
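    """Run YOLOv5 on the input PIL image, describe where each detected object sits,
    and synthesise that description as speech. Returns the audio file path (twice)
    and the prediction time in seconds."""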
start_time = timer()
x_img, y_img = frame.size
print(x_img,y_img)
global cnt
objects = []
prediction = model(frame)
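    # each row of prediction.xyxy[0] is [x1, y1, x2, y2, confidence, class_id]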
for det in prediction.xyxy[0]:
class_id = int(det[5])
class_name = model.names[class_id]
confidence = float(det[4])
bbox = det[:4].tolist()
        if confidence >= 0.5:  # keep reasonably confident detections only
location = LCR(bbox, x_img, y_img)
location = ACB(bbox, x_img, y_img, location)
# Save the results to the list
objects.append({
'Class': class_name,
#'BoundingBox': bbox,
'Location': location,
'Confidence': confidence
})
    with open('{:05d}.json'.format(cnt), 'w') as f:
        json.dump(objects, f)
    cnt += 1  # advance the counter so each frame gets its own JSON dump
    text = image_to_text(objects)
file_path, f_name = text_to_speech(text, lang, tld)
pred_time = round(timer() - start_time, 5)
return file_path, f_name, pred_time
#path = [["D:/cuoc_thi/test_img/364452351_843427357389915_7340823319235373312_n.jpg"],["D:/cuoc_thi/test_img/download.jpg"],["D:/cuoc_thi/test_img/tong-hop-cac-mau-gia-ke-de-bat-dia-thong-minh-hot-nhat-2.jpg"]]
iface = gr.Interface(fn=turn_img_into_voice,
                     inputs=[gr.Image(type="pil"),
                             gr.Dropdown(choices=list(_main_langs().values()), label="Select language:", value='English'),
                             gr.Dropdown(choices=tlds, label="Select TLD:", value="com")],
                     outputs=[gr.Audio(label="Audio", autoplay=True),
                              gr.File(label="Audio File"),
                              gr.Number(label="Prediction time (s)")],
                     #examples=path,
                     allow_flagging="never",
                     live=True)
iface.queue().launch(inline=False)