# Obj-to-voice / app.py
import gradio as gr
import torch
import os
import json
import io
import time
from timeit import default_timer as timer
from gtts import gTTS
from gtts.lang import _main_langs
AUDIO_DIR = 'audio_files'
MAX_FILE_AGE = 24 * 60 * 60  # delete generated audio older than 24 hours

# Pretrained YOLOv5s object detector from the ultralytics hub
model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)

cnt = 0  # counter used to name the per-image JSON detection dumps
def LCR(bbox, x_img, y_img):
    """Classify a detection as left / center / right from its normalized x-extent."""
    x1 = bbox[0] / x_img
    x2 = bbox[2] / x_img
    if x1 < 0.2 and x2 < 0.2:
        location = "On the left"
    elif x1 > 0.8 and x2 > 0.8:
        location = "On the right"
    elif x1 < 0.2 and 0.2 <= x2 <= 0.8:
        # box straddles the left band: decide by where its midpoint falls
        location = "On the left" if (x1 + x2) < 0.4 else "At the center"
    elif x2 > 0.8 and 0.2 <= x1 <= 0.8:
        # box straddles the right band: decide by where its midpoint falls
        location = "On the right" if (x1 + x2) > 1.6 else "At the center"
    else:
        location = "At the center"
    print(f"x1 {x1} x2 {x2} bbox0 {bbox[0]} bbox2 {bbox[2]} x_img {x_img} LocationLCR {location}")
    return location
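
# A minimal sanity-check sketch for LCR. The image size and boxes below are
# illustrative assumptions, not values from the original app; uncomment to verify
# the left/right/center bucketing for a hypothetical 1000x800 image:
#
#   assert LCR([50, 100, 150, 300], 1000, 800) == "On the left"    # x1=0.05, x2=0.15
#   assert LCR([850, 100, 950, 300], 1000, 800) == "On the right"  # x1=0.85, x2=0.95
#   assert LCR([400, 100, 600, 300], 1000, 800) == "At the center" # x1=0.40, x2=0.60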
def ACB(bbox, x_img, y_img, location):
    """Refine a centered detection into top / center / bottom from its normalized y-extent."""
    y1 = bbox[1] / y_img
    y2 = bbox[3] / y_img
    if location == "At the center":
        if y1 < 0.33333 and y2 < 0.33333:
            location = "On the top"
        elif y1 > 0.66667 and y2 > 0.66667:
            location = "On the bottom"
        elif y1 < 0.33333 and 0.33333 <= y2 <= 0.66667:
            # box straddles the top band: decide by where its midpoint falls
            location = "On the top" if (y1 + y2) < 0.66667 else "At the center"
        elif y2 > 0.66667 and 0.33333 <= y1 <= 0.66667:
            # box straddles the bottom band: decide by where its midpoint falls
            location = "On the bottom" if (y1 + y2) > 1.33333 else "At the center"
        else:
            location = "At the center"
    # left/right labels from LCR are kept unchanged
    print(f"y1 {y1} y2 {y2} bbox1 {bbox[1]} bbox3 {bbox[3]} y_img {y_img} Location {location}")
    return location
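
# A matching sketch for ACB (illustrative values, hypothetical 1000x900 image).
# Only boxes already labeled "At the center" are refined vertically:
#
#   assert ACB([400, 50, 600, 150], 1000, 900, "At the center") == "On the top"     # y1~0.06, y2~0.17
#   assert ACB([400, 700, 600, 850], 1000, 900, "At the center") == "On the bottom" # y1~0.78, y2~0.94
#   assert ACB([400, 50, 600, 150], 1000, 900, "On the left") == "On the left"      # passed through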
def image_to_text(data):
    """Aggregate detections into a spoken sentence, grouping by (location, class)."""
    count = {}
    for infor in data:
        key = infor['Location'] + ':' + infor['Class']
        count[key] = count.get(key, 0) + 1
    text = ""
    for key, value in count.items():
        location, class_name = key.split(":")
        if value > 1:
            verb = "are"
            # crude pluralization; "person" is the one irregular case handled
            class_name = 'people' if class_name == 'person' else class_name + 's'
        else:
            verb = "is"
        text += f"{location}, there {verb} {value} {class_name}. "
    return text
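
# A hedged usage sketch for image_to_text, fed hand-made detections rather than
# real model output:
#
#   image_to_text([
#       {'Class': 'person', 'Location': 'On the left', 'Confidence': 0.9},
#       {'Class': 'person', 'Location': 'On the left', 'Confidence': 0.8},
#       {'Class': 'chair', 'Location': 'At the center', 'Confidence': 0.7},
#   ])
#   # -> "On the left, there are 2 people. At the center, there is 1 chair. "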
def delete_old_audio_files():
    # delete audio files older than MAX_FILE_AGE
    if not os.path.isdir(AUDIO_DIR):
        return
    now = time.time()
    for file_name in os.listdir(AUDIO_DIR):
        file_path = os.path.join(AUDIO_DIR, file_name)
        if now - os.path.getmtime(file_path) > MAX_FILE_AGE:
            os.remove(file_path)
# Google Translate TLDs accepted by gTTS (the TLD selects the regional accent)
tlds = [
"com",
"ad",
"ae",
"com.af",
"com.ag",
"com.ai",
"com.ar",
"as",
"at",
"com.au",
"az",
"ba",
"com.bd",
"be",
"bf",
"bg",
"bj",
"br",
"bs",
"bt",
"co.bw",
"by",
"com.bz",
"ca",
"cd",
"ch",
"ci",
"co.ck",
"cl",
"cm",
"cn",
"com.co",
"co.cr",
"cv",
"dj",
"dm",
"com.do",
"dz",
"com.ec",
"ee",
"com.eg",
"es",
"et",
"fi",
"com.fj",
"fm",
"fr",
"ga",
"ge",
"gg",
"com.gh",
"com.gi",
"gl",
"gm",
"gr",
"com.gt",
"gy",
"com.hk",
"hn",
"ht",
"hr",
"hu",
"co.id",
"ie",
"co.il",
"im",
"co.in",
"iq",
"is",
"it",
"iw",
"je",
"com.je",
"jo",
"co.jp",
"co.ke",
"com.kh",
"ki",
"kg",
"co.kr",
"com.kw",
"kz",
"la",
"com.lb",
"li",
"lk",
"co.ls",
"lt",
"lu",
"lv",
"com.ly",
"com.ma",
"md",
"me",
"mg",
"mk",
"ml",
"mm",
"mn",
"ms",
"com.mt",
"mu",
"mv",
"mw",
"com.mx",
"com.my",
"co.mz",
"na",
"ng",
"ni",
"ne",
"nl",
"no",
"com.np",
"nr",
"nu",
"co.nz",
"com.om",
"pa",
"pe",
"pg",
"ph",
"pk",
"pl",
"pn",
"com.pr",
"ps",
"pt",
"com.py",
"com.qa",
"ro",
"ru",
"rw",
"com.sa",
"com.sb",
"sc",
"se",
"com.sg",
"sh",
"si",
"sk",
"com.sl",
"sn",
"so",
"sm",
"sr",
"st",
"com.sv",
"td",
"tg",
"co.th",
"com.tj",
"tl",
"tm",
"tn",
"to",
"com.tr",
"tt",
"com.tw",
"co.tz",
"com.ua",
"co.ug",
"co.uk",
"com,uy",
"co.uz",
"com.vc",
"co.ve",
"vg",
"co.vi",
"com.vn",
"vu",
"ws",
"rs",
"co.za",
"co.zm",
"co.zw",
"cat",
]
def text_to_speech(text, lang, tld):
    # map the language name to its corresponding gTTS code
    lang_codes = {lang_name: lang_code for lang_code, lang_name in _main_langs().items()}
    lang_code = lang_codes[lang]
    # create the text-to-speech audio in memory
    tts = gTTS(text, lang=lang_code, tld=tld)
    fp = io.BytesIO()
    tts.write_to_fp(fp)
    fp.seek(0)
    # create the audio directory if it does not exist
    os.makedirs(AUDIO_DIR, exist_ok=True)
    # generate a unique file name; gTTS produces MP3 data, so use an .mp3 extension
    file_name = str(time.time()) + '.mp3'
    file_path = os.path.join(AUDIO_DIR, file_name)
    # save the audio stream to a file
    with open(file_path, 'wb') as f:
        f.write(fp.read())
    # delete old audio files
    delete_old_audio_files()
    # return the file path (twice, to feed both the Audio and File outputs)
    return file_path, file_path
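
# Hedged usage sketch (requires network access to the Google TTS endpoint; the
# timestamped file name below is illustrative):
#
#   file_path, name = text_to_speech("Hello there", "English", "com")
#   # -> both values are something like 'audio_files/1700000000.12.mp3'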
def turn_img_into_voice(frame, lang, tld):
    start_time = timer()
    x_img, y_img = frame.size  # PIL gives (width, height)
    print(x_img, y_img)
    global cnt
    objects = []
    prediction = model(frame)
    # each detection row is [x1, y1, x2, y2, confidence, class_id]
    for det in prediction.xyxy[0]:
        class_id = int(det[5])
        class_name = model.names[class_id]
        confidence = float(det[4])
        bbox = det[:4].tolist()
        if confidence >= 0.5:
            location = LCR(bbox, x_img, y_img)
            location = ACB(bbox, x_img, y_img, location)
            # save the results to the list
            objects.append({
                'Class': class_name,
                'Location': location,
                'Confidence': confidence
            })
    with open('{:05d}.json'.format(cnt), 'w') as f:
        json.dump(objects, f)
    cnt += 1  # advance the counter so each image gets its own JSON dump
    text = image_to_text(objects)
    file_path, f_name = text_to_speech(text, lang, tld)
    pred_time = round(timer() - start_time, 5)
    return file_path, f_name, pred_time
# example images; these are local Windows paths and must exist on the machine running the app
path = [
    ["D:/cuoc_thi/test_img/364452351_843427357389915_7340823319235373312_n.jpg"],
    ["D:/cuoc_thi/test_img/download.jpg"],
    ["D:/cuoc_thi/test_img/tong-hop-cac-mau-gia-ke-de-bat-dia-thong-minh-hot-nhat-2.jpg"],
]
# gr.inputs.* and launch(enable_queue=...) are deprecated; use the current
# component API and queue() instead
iface = gr.Interface(
    fn=turn_img_into_voice,
    inputs=[
        gr.Image(type="pil", label="Input image"),
        gr.Dropdown(choices=list(_main_langs().values()), label="Select language:", value='English'),
        gr.Dropdown(choices=tlds, label="Select TLD:", value="com"),
    ],
    outputs=[
        gr.Audio(label="Audio", autoplay=True),
        gr.File(label="Audio File"),
        gr.Number(label="Prediction time (s)"),
    ],
    examples=path,
    allow_flagging="never",
    live=True,
)
iface.queue().launch(share=False)