# NOTE: page-header text captured together with the source (not part of the program):
#   Spaces: Runtime error
import gradio as gr | |
import torch | |
import os | |
import requests | |
import json | |
import cv2 | |
from PIL import Image | |
from timeit import default_timer as timer | |
import numpy as np | |
import aiofiles | |
from transformers import AutoModel | |
from gtts import gTTS | |
import io | |
import time | |
from gtts.lang import _main_langs | |
# Directory that receives the generated speech files.
AUDIO_DIR = 'audio_files'
# Generated files older than this many seconds (24 hours) are purged.
MAX_FILE_AGE = 24 * 60 * 60
# Object detector loaded through torch.hub; this downloads code and weights
# on first use, so importing this module needs network access.
model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)
# Index used to name the per-request JSON dump written by turn_img_into_voice.
cnt = 0
def LCR(bbox, x_img, y_img):
    """Classify a bounding box horizontally as left, center or right.

    Args:
        bbox: Sequence ``[x_min, y_min, x_max, y_max]`` in pixels; only the
            two x values are used.
        x_img: Image width in pixels, used to normalise the x values.
        y_img: Image height in pixels. Unused; kept so the signature
            parallels ``ACB``.

    Returns:
        ``"On the left"``, ``"On the right"`` or ``"At the center"``.
    """
    left_edge, right_edge = 0.2, 0.8
    x1 = bbox[0] / x_img
    x2 = bbox[2] / x_img

    # Anything that does not clearly belong to an outer band is central.
    location = "At the center"
    if x1 < left_edge and x2 < left_edge:
        location = "On the left"
    elif x1 > right_edge and x2 > right_edge:
        location = "On the right"
    elif x1 < left_edge and left_edge <= x2 <= right_edge:
        # Box straddles the left boundary: decide by where its midpoint falls.
        if (x1 + x2) < 2 * left_edge:
            location = "On the left"
    elif x2 > right_edge and left_edge <= x1 <= right_edge:
        # Box straddles the right boundary: decide by where its midpoint falls.
        if (x1 + x2) > 2 * right_edge:
            location = "On the right"

    print(f"x1 {x1} x2 {x2} bbox0 {bbox[0]} bbox2 {bbox[2]} x_img {x_img} LocationLCR {location}")
    return location
def ACB(bbox, x_img, y_img, location):
    """Refine a horizontally central box into top, center or bottom.

    Boxes whose ``location`` is anything other than ``"At the center"`` are
    returned unchanged.

    Args:
        bbox: Sequence ``[x_min, y_min, x_max, y_max]`` in pixels; only the
            two y values are used.
        x_img: Image width in pixels. Unused; kept so the signature parallels
            ``LCR``.
        y_img: Image height in pixels, used to normalise the y values.
        location: Horizontal label previously produced by ``LCR``.

    Returns:
        ``"On the top"``, ``"On the bottom"`` or ``"At the center"`` for
        central boxes, otherwise the incoming ``location``.
    """
    # Exact thirds keep the band edges consistent with the midpoint tests
    # below (the truncated 0.33333 / 0.66667 constants did not quite agree).
    top_edge = 1 / 3
    bottom_edge = 2 / 3
    y1 = bbox[1] / y_img
    y2 = bbox[3] / y_img

    if location == "At the center":
        if y1 < top_edge and y2 < top_edge:
            location = "On the top"
        elif y1 > bottom_edge and y2 > bottom_edge:
            location = "On the bottom"
        elif y1 < top_edge and top_edge <= y2 <= bottom_edge:
            # Box straddles the upper boundary: decide by its midpoint.
            if (y1 + y2) < 2 * top_edge:
                location = "On the top"
        elif y2 > bottom_edge and top_edge <= y1 <= bottom_edge:
            # Box straddles the lower boundary: decide by its midpoint.
            if (y1 + y2) > 2 * bottom_edge:
                location = "On the bottom"
        # Every other geometry stays "At the center".

    print(f"y1 {y1} y2 {y2} bbox1 {bbox[1]} bbox3 {bbox[3]} y_img {y_img} Location{location}")
    return location
#print(bbox[0]) | |
# Plurals that the suffix rules in _pluralize would get wrong.
_IRREGULAR_PLURALS = {
    'person': 'people',
    'mouse': 'mice',
    'knife': 'knives',
    'sheep': 'sheep',
    'skis': 'skis',
    'scissors': 'scissors',
    'broccoli': 'broccoli',
}


def _pluralize(name):
    """Return the English plural of a class name (e.g. ``bus`` -> ``buses``)."""
    if name in _IRREGULAR_PLURALS:
        return _IRREGULAR_PLURALS[name]
    if name.endswith(('s', 'x', 'z', 'ch', 'sh')):
        return name + 'es'
    if name.endswith('y') and len(name) > 1 and name[-2] not in 'aeiou':
        return name[:-1] + 'ies'
    return name + 's'


def imgae_to_text(data):
    """Summarise detections as English sentences grouped by location and class.

    Args:
        data: Iterable of dicts that each carry a ``'Location'`` and a
            ``'Class'`` string; other keys are ignored.

    Returns:
        One sentence per distinct (location, class) pair, in first-seen
        order, e.g. ``"On the left, there are 2 people. "``. Every sentence
        ends with ``". "``; an empty input yields ``""``.
    """
    # Tuple keys avoid re-splitting a joined string, which would break if a
    # location or class name ever contained the separator.
    counts = {}
    for item in data:
        key = (item['Location'], item['Class'])
        counts[key] = counts.get(key, 0) + 1

    sentences = []
    for (location, class_name), total in counts.items():
        if total > 1:
            verb, noun = "are", _pluralize(class_name)
        else:
            verb, noun = "is", class_name
        sentences.append(f"{location}, there {verb} {total} {noun}. ")
    return "".join(sentences)
def delete_old_audio_files(directory=None, max_age=None):
    """Delete regular files in the audio directory that are too old.

    Args:
        directory: Folder to clean. Defaults to ``AUDIO_DIR``.
        max_age: Maximum allowed age in seconds, measured from the file's
            modification time. Defaults to ``MAX_FILE_AGE``.

    A missing directory, sub-directories, and files that disappear while the
    scan is running are skipped instead of raising.
    """
    if directory is None:
        directory = AUDIO_DIR
    if max_age is None:
        max_age = MAX_FILE_AGE

    now = time.time()
    try:
        names = os.listdir(directory)
    except FileNotFoundError:
        # Nothing has been generated yet, so there is nothing to purge.
        return

    for name in names:
        path = os.path.join(directory, name)
        try:
            if os.path.isfile(path) and now - os.path.getmtime(path) > max_age:
                os.remove(path)
        except FileNotFoundError:
            # Another request removed the file between listing and deletion.
            continue
# Top-level domains offered in the "Select TLD" dropdown and passed to gTTS
# as its ``tld`` argument.
tlds = [
    "com",
    "ad",
    "ae",
    "com.af",
    "com.ag",
    "com.ai",
    "com.ar",
    "as",
    "at",
    "com.au",
    "az",
    "ba",
    "com.bd",
    "be",
    "bf",
    "bg",
    "bj",
    "br",
    "bs",
    "bt",
    "co.bw",
    "by",
    "com.bz",
    "ca",
    "cd",
    "ch",
    "ci",
    "co.ck",
    "cl",
    "cm",
    "cn",
    "com.co",
    "co.cr",
    "cv",
    "dj",
    "dm",
    "com.do",
    "dz",
    "com.ec",
    "ee",
    "com.eg",
    "es",
    "et",
    "fi",
    "com.fj",
    "fm",
    "fr",
    "ga",
    "ge",
    "gg",
    "com.gh",
    "com.gi",
    "gl",
    "gm",
    "gr",
    "com.gt",
    "gy",
    "com.hk",
    "hn",
    "ht",
    "hr",
    "hu",
    "co.id",
    "ie",
    "co.il",
    "im",
    "co.in",
    "iq",
    "is",
    "it",
    "iw",
    "je",
    "com.je",
    "jo",
    "co.jp",
    "co.ke",
    "com.kh",
    "ki",
    "kg",
    "co.kr",
    "com.kw",
    "kz",
    "la",
    "com.lb",
    "li",
    "lk",
    "co.ls",
    "lt",
    "lu",
    "lv",
    "com.ly",
    "com.ma",
    "md",
    "me",
    "mg",
    "mk",
    "ml",
    "mm",
    "mn",
    "ms",
    "com.mt",
    "mu",
    "mv",
    "mw",
    "com.mx",
    "com.my",
    "co.mz",
    "na",
    "ng",
    "ni",
    "ne",
    "nl",
    "no",
    "com.np",
    "nr",
    "nu",
    "co.nz",
    "com.om",
    "pa",
    "pe",
    "pg",
    "ph",
    "pk",
    "pl",
    "pn",
    "com.pr",
    "ps",
    "pt",
    "com.py",
    "com.qa",
    "ro",
    "ru",
    "rw",
    "com.sa",
    "com.sb",
    "sc",
    "se",
    "com.sg",
    "sh",
    "si",
    "sk",
    "com.sl",
    "sn",
    "so",
    "sm",
    "sr",
    "st",
    "com.sv",
    "td",
    "tg",
    "co.th",
    "com.tj",
    "tl",
    "tm",
    "tn",
    "to",
    "com.tr",
    "tt",
    "com.tw",
    "co.tz",
    "com.ua",
    "co.ug",
    "co.uk",
    "com.uy",
    "co.uz",
    "com.vc",
    "co.ve",
    "vg",
    "co.vi",
    "com.vn",
    "vu",
    "ws",
    "rs",
    "co.za",
    "co.zm",
    "co.zw",
    "cat",
]
def text_to_speech(text, lang, tld):
    """Synthesise ``text`` with gTTS and store the audio under ``AUDIO_DIR``.

    Args:
        text: Text to speak. gTTS rejects an empty string.
        lang: Human-readable language name, i.e. one of the values of
            ``_main_langs()`` (for example ``'English'``).
        tld: Google top-level domain forwarded to gTTS.

    Returns:
        A ``(file_path, file_path)`` tuple: the same path twice, one for the
        audio player output and one for the downloadable file output.

    Raises:
        KeyError: If ``lang`` is not a known language name.
    """
    # The UI shows language names; gTTS wants the language code.
    lang_codes = {name: code for code, name in _main_langs().items()}
    lang_code = lang_codes[lang]

    # Render into memory first so a failed request leaves no partial file.
    tts = gTTS(text, lang=lang_code, tld=tld)
    buffer = io.BytesIO()
    tts.write_to_fp(buffer)

    os.makedirs(AUDIO_DIR, exist_ok=True)
    # gTTS produces MP3 data, so the extension must say so (it used to be
    # '.wav', which mislabelled the downloaded file).
    file_path = os.path.join(AUDIO_DIR, str(time.time()) + '.mp3')
    with open(file_path, 'wb') as f:
        f.write(buffer.getvalue())

    # Opportunistically purge stale files on every request.
    delete_old_audio_files()

    return file_path, file_path
def turn_img_into_voice(frame, lang, tld):
    """Detect objects in an image and speak a description of where they are.

    Args:
        frame: PIL image supplied by the UI, or ``None`` when the input has
            been cleared.
        lang: Language name forwarded to ``text_to_speech``.
        tld: Google top-level domain forwarded to ``text_to_speech``.

    Returns:
        ``(audio_path, audio_path, seconds)`` where ``seconds`` is the
        processing time rounded to 5 decimals, or ``(None, None, 0.0)`` when
        there is no image.
    """
    start_time = timer()
    if frame is None:
        # The input was cleared; there is nothing to describe.
        return None, None, 0.0

    x_img, y_img = frame.size
    print(x_img, y_img)

    objects = []
    prediction = model(frame)
    # Each detection row is [x_min, y_min, x_max, y_max, confidence, class_id].
    for det in prediction.xyxy[0]:
        confidence = float(det[4])
        if confidence < 0.5:
            continue
        class_name = model.names[int(det[5])]
        bbox = det[:4].tolist()
        location = LCR(bbox, x_img, y_img)
        location = ACB(bbox, x_img, y_img, location)
        objects.append({
            'Class': class_name,
            'Location': location,
            'Confidence': confidence,
        })

    # NOTE(review): cnt is never incremented in the visible code, so this
    # overwrites the same file on every call - confirm that is intended.
    with open('{:05d}.json'.format(cnt), 'w') as f:
        json.dump(objects, f)

    # gTTS refuses empty text, so say something when nothing was detected.
    text = imgae_to_text(objects) or "No objects detected."
    file_path, f_name = text_to_speech(text, lang, tld)
    pred_time = round(timer() - start_time, 5)
    return file_path, f_name, pred_time
#path = [["D:/cuoc_thi/test_img/364452351_843427357389915_7340823319235373312_n.jpg"],["D:/cuoc_thi/test_img/download.jpg"],["D:/cuoc_thi/test_img/tong-hop-cac-mau-gia-ke-de-bat-dia-thong-minh-hot-nhat-2.jpg"]] | |
# Build the UI with top-level components: the gr.inputs namespace and the
# `default=` keyword are deprecated in Gradio 3 and removed in Gradio 4.
iface = gr.Interface(
    fn=turn_img_into_voice,
    inputs=[
        gr.Image(type="pil"),
        gr.Dropdown(choices=list(_main_langs().values()), label="Select language:", value='English'),
        gr.Dropdown(choices=list(tlds), label="Select TLD:", value="com"),
    ],
    outputs=[
        gr.Audio(label="Audio", autoplay=True),
        gr.File(label="Audio File"),
        gr.Number(label="Prediction time (s)"),
    ],
    #examples=path,
    allow_flagging="never",
    live=True,
)
# launch(enable_queue=True) was removed in Gradio 4; queue() is the supported
# way to enable the request queue and also works on Gradio 3.
iface.queue()
iface.launch(inline=False)