# Obj-to-voice / app.py
import gradio as gr
import torch
import os
import requests
import json
import cv2
from PIL import Image
from timeit import default_timer as timer
import numpy as np
import aiofiles
from transformers import AutoModel
from gtts import gTTS
import io
import time
from gtts.lang import _main_langs
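# generated speech files are written here and deleted once older than MAX_FILE_AGE seconds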
AUDIO_DIR = 'audio_files'
MAX_FILE_AGE = 24 * 60 * 60
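# pretrained YOLOv5s object detector loaded from the ultralytics hub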
model = torch.hub.load('ultralytics/yolov5','yolov5s', pretrained=True)
#model1 = AutoModel.from_pretrained(model)
cnt = 0  # counter used to name the per-frame JSON dumps written by turn_img_into_voice
def LCR(bbox, x_img, y_img):
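    """Classify the horizontal position of a detection as left / center / right.

    bbox is [x1, y1, x2, y2] in pixels; x_img and y_img are the image width and height.
    The x-extent is normalised to [0, 1]: boxes entirely inside the left 20% are
    "On the left", entirely inside the right 20% are "On the right", and borderline
    boxes are resolved by the midpoint of the normalised extent.
    """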
x1 = bbox[0]/x_img
x2 = bbox[2]/x_img
if x1 < 0.2 and x2 < 0.2 :
location = "On the left"
elif x1 > 0.8 and x2 > 0.8:
location = "On the right"
elif x1 < 0.2 and (x2 <= 0.8 and x2 >= 0.2):
if (x1 + x2) < 0.4:
location = "On the left"
else:
location = "At the center"
elif x2 > 0.8 and (x1 <= 0.8 and x1 >= 0.2):
if (x1 + x2) > 1.6:
location = "On the right"
else:
location = "At the center"
else:
location = "At the center"
print(f"x1 {x1} x2 {x2} bbox0 {bbox[0]} bbox2 {bbox[2]} x_img {x_img} LocationLCR {location}")
return location
def ACB(bbox, x_img, y_img, location):
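    """Refine an "At the center" label into top / center / bottom.

    Boxes already labelled left or right are returned unchanged; otherwise the
    normalised y-extent is compared against thirds of the image height, with
    borderline boxes resolved by the midpoint of the extent.
    """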
y1 = bbox[1]/y_img
y2 = bbox[3]/y_img
if location == "At the center":
if y1 < 0.33333 and y2 < 0.33333 :
location = "On the top"
elif y1 > 0.66667 and y2 > 0.66667:
location = "On the bottom"
elif y1 < 0.33333 and (y2 <= 0.66667 and y2 >= 0.33333):
if (y1 + y2) < 0.66667:
location = "On the top"
else:
location = "At the center"
elif y2 > 0.66667 and (y1 <= 0.66667 and y1 >= 0.33333):
if (y1 + y2) > 1.33333:
location = "On the bottom"
else:
location = "At the center"
else:
location = "At the center"
else:
pass
print(f"y1 {y1} y2 {y2} bbox1 {bbox[1]} bbox3 {bbox[3]} y_img {y_img} Location{location}")
return location
#print(bbox[0])
def image_to_text(data):
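    """Group detections by (location, class) and phrase each group as a short English sentence."""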
count = {}
    for infor in data:
        key = infor['Location'] + ':' + infor['Class']
        count[key] = count.get(key, 0) + 1
text = ""
    for key, value in count.items():
        location, class_name = key.split(":")
        if value > 1:
            verb = "are"
            class_name = "people" if class_name == "person" else class_name + "s"
        else:
            verb = "is"
        text += f"{location}, there {verb} {value} {class_name}. "
return text
def delete_old_audio_files():
# delete audio files older than MAX_FILE_AGE
now = time.time()
for file_name in os.listdir(AUDIO_DIR):
file_path = os.path.join(AUDIO_DIR, file_name)
if now - os.path.getmtime(file_path) > MAX_FILE_AGE:
os.remove(file_path)
# list of supported TLDs
tlds = [
"com",
"ad",
"ae",
"com.af",
"com.ag",
"com.ai",
"com.ar",
"as",
"at",
"com.au",
"az",
"ba",
"com.bd",
"be",
"bf",
"bg",
"bj",
"br",
"bs",
"bt",
"co.bw",
"by",
"com.bz",
"ca",
"cd",
"ch",
"ci",
"co.ck",
"cl",
"cm",
"cn",
"com.co",
"co.cr",
"cv",
"dj",
"dm",
"com.do",
"dz",
"com.ec",
"ee",
"com.eg",
"es",
"et",
"fi",
"com.fj",
"fm",
"fr",
"ga",
"ge",
"gg",
"com.gh",
"com.gi",
"gl",
"gm",
"gr",
"com.gt",
"gy",
"com.hk",
"hn",
"ht",
"hr",
"hu",
"co.id",
"ie",
"co.il",
"im",
"co.in",
"iq",
"is",
"it",
"iw",
"je",
"com.je",
"jo",
"co.jp",
"co.ke",
"com.kh",
"ki",
"kg",
"co.kr",
"com.kw",
"kz",
"la",
"com.lb",
"li",
"lk",
"co.ls",
"lt",
"lu",
"lv",
"com.ly",
"com.ma",
"md",
"me",
"mg",
"mk",
"ml",
"mm",
"mn",
"ms",
"com.mt",
"mu",
"mv",
"mw",
"com.mx",
"com.my",
"co.mz",
"na",
"ng",
"ni",
"ne",
"nl",
"no",
"com.np",
"nr",
"nu",
"co.nz",
"com.om",
"pa",
"pe",
"pg",
"ph",
"pk",
"pl",
"pn",
"com.pr",
"ps",
"pt",
"com.py",
"com.qa",
"ro",
"ru",
"rw",
"com.sa",
"com.sb",
"sc",
"se",
"com.sg",
"sh",
"si",
"sk",
"com.sl",
"sn",
"so",
"sm",
"sr",
"st",
"com.sv",
"td",
"tg",
"co.th",
"com.tj",
"tl",
"tm",
"tn",
"to",
"com.tr",
"tt",
"com.tw",
"co.tz",
"com.ua",
"co.ug",
"co.uk",
"com,uy",
"co.uz",
"com.vc",
"co.ve",
"vg",
"co.vi",
"com.vn",
"vu",
"ws",
"rs",
"co.za",
"co.zm",
"co.zw",
"cat",
]
def text_to_speech(text, lang, tld):
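    """Synthesise `text` with gTTS for the selected language name and Google TLD,
    save the audio under AUDIO_DIR and return the file path (twice, for the
    Audio and File outputs)."""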
# map the language name to its corresponding code
lang_codes = {lang_name: lang_code for lang_code, lang_name in _main_langs().items()}
lang_code = lang_codes[lang]
# create the text-to-speech audio
tts = gTTS(text, lang=lang_code, tld=tld)
fp = io.BytesIO()
tts.write_to_fp(fp)
fp.seek(0)
# create the audio directory if it does not exist
os.makedirs(AUDIO_DIR, exist_ok=True)
# generate a unique file name for the audio file
    file_name = str(time.time()) + '.mp3'  # gTTS produces MP3-encoded audio
file_path = os.path.join(AUDIO_DIR, file_name)
# save the audio stream to a file
with open(file_path, 'wb') as f:
f.write(fp.read())
# delete old audio files
delete_old_audio_files()
# return the file path
return file_path, f.name
def turn_img_into_voice(frame, lang, tld):
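    """Run YOLOv5 on the input PIL image, describe where each detected object sits,
    and synthesise that description as speech. Returns the audio file path (twice)
    and the prediction time in seconds."""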
start_time = timer()
x_img, y_img = frame.size
print(x_img,y_img)
global cnt
objects = []
prediction = model(frame)
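    # each row of prediction.xyxy[0] is [x1, y1, x2, y2, confidence, class_id]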
for det in prediction.xyxy[0]:
class_id = int(det[5])
class_name = model.names[class_id]
confidence = float(det[4])
bbox = det[:4].tolist()
        if confidence >= 0.5:  # keep reasonably confident detections only
location = LCR(bbox, x_img, y_img)
location = ACB(bbox, x_img, y_img, location)
# Save the results to the list
objects.append({
'Class': class_name,
#'BoundingBox': bbox,
'Location': location,
'Confidence': confidence
})
    with open('{:05d}.json'.format(cnt), 'w') as f:
        json.dump(objects, f)
    cnt += 1  # advance the counter so each frame gets its own JSON dump
    text = image_to_text(objects)
file_path, f_name = text_to_speech(text, lang, tld)
pred_time = round(timer() - start_time, 5)
return file_path, f_name, pred_time
#path = [["D:/cuoc_thi/test_img/364452351_843427357389915_7340823319235373312_n.jpg"],["D:/cuoc_thi/test_img/download.jpg"],["D:/cuoc_thi/test_img/tong-hop-cac-mau-gia-ke-de-bat-dia-thong-minh-hot-nhat-2.jpg"]]
iface = gr.Interface(fn=turn_img_into_voice,
                     inputs=[gr.Image(type="pil"),
                             gr.Dropdown(choices=list(_main_langs().values()), label="Select language:", value='English'),
                             gr.Dropdown(choices=tlds, label="Select TLD:", value="com")],
                     outputs=[gr.Audio(label="Audio", autoplay=True),
                              gr.File(label="Audio File"),
                              gr.Number(label="Prediction time (s)")],
                     #examples=path,
                     allow_flagging="never",
                     live=True)
iface.queue().launch(inline=False)