import gradio as gr
import torch
import os
import requests
import json
import cv2
from PIL import Image
from timeit import default_timer as timer
import numpy as np
import aiofiles
from transformers import AutoModel
from gtts import gTTS
import io
import time
from gtts.lang import _main_langs

AUDIO_DIR = 'audio_files'
MAX_FILE_AGE = 24 * 60 * 60  # delete generated audio files older than 24 hours

# Load the pretrained YOLOv5s detector from the Ultralytics hub
model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)
#model1 = AutoModel.from_pretrained(model)
cnt = 0


def LCR(bbox, x_img, y_img):
    """Classify a bounding box as left / center / right based on its normalized x-extent."""
    x1 = bbox[0] / x_img
    x2 = bbox[2] / x_img
    if x1 < 0.2 and x2 < 0.2:
        location = "On the left"
    elif x1 > 0.8 and x2 > 0.8:
        location = "On the right"
    elif x1 < 0.2 and 0.2 <= x2 <= 0.8:
        if (x1 + x2) < 0.4:
            location = "On the left"
        else:
            location = "At the center"
    elif x2 > 0.8 and 0.2 <= x1 <= 0.8:
        if (x1 + x2) > 1.6:
            location = "On the right"
        else:
            location = "At the center"
    else:
        location = "At the center"
    print(f"x1 {x1} x2 {x2} bbox0 {bbox[0]} bbox2 {bbox[2]} x_img {x_img} LocationLCR {location}")
    return location


def ACB(bbox, x_img, y_img, location):
    """Refine a centered box into top / center / bottom based on its normalized y-extent."""
    y1 = bbox[1] / y_img
    y2 = bbox[3] / y_img
    if location == "At the center":
        if y1 < 0.33333 and y2 < 0.33333:
            location = "On the top"
        elif y1 > 0.66667 and y2 > 0.66667:
            location = "On the bottom"
        elif y1 < 0.33333 and 0.33333 <= y2 <= 0.66667:
            if (y1 + y2) < 0.66667:
                location = "On the top"
            else:
                location = "At the center"
        elif y2 > 0.66667 and 0.33333 <= y1 <= 0.66667:
            if (y1 + y2) > 1.33333:
                location = "On the bottom"
            else:
                location = "At the center"
        else:
            location = "At the center"
    print(f"y1 {y1} y2 {y2} bbox1 {bbox[1]} bbox3 {bbox[3]} y_img {y_img} Location {location}")
    return location
    #print(bbox[0])


def image_to_text(data):
    """Turn the list of detections into a spoken-style summary sentence."""
    count = {}
    for infor in data:
        key = infor['Location'] + ':' + infor['Class']
        if key in count:
            count[key] += 1
        else:
            count[key] = 1
    text = ""
    for key, value in count.items():
        parts = key.split(":")
        if value > 1:
            verb = "are"
            if parts[1] == 'person':
                name_class = 'people'
            else:
                name_class = parts[1] + 's'
        else:
            verb = "is"
            name_class = parts[1]
        text += parts[0] + ", there" + " " + verb + " " + f"{value}" + " " + name_class + '.' + " "
    return text
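
# Illustrative sanity check (a sketch, not part of the app's runtime path): for a
# 640x480 frame, a box spanning x = [50, 110] normalizes to roughly [0.08, 0.17],
# so LCR() labels it "On the left"; two such 'person' detections are then summarized
# by image_to_text() as "On the left, there are 2 people. ". Uncomment to try it:
# _demo = [{'Class': 'person', 'Location': 'On the left', 'Confidence': 0.9},
#          {'Class': 'person', 'Location': 'On the left', 'Confidence': 0.8}]
# print(image_to_text(_demo))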
+" " return text def delete_old_audio_files(): # delete audio files older than MAX_FILE_AGE now = time.time() for file_name in os.listdir(AUDIO_DIR): file_path = os.path.join(AUDIO_DIR, file_name) if now - os.path.getmtime(file_path) > MAX_FILE_AGE: os.remove(file_path) # list of supported TLDs tlds = [ "com", "ad", "ae", "com.af", "com.ag", "com.ai", "com.ar", "as", "at", "com.au", "az", "ba", "com.bd", "be", "bf", "bg", "bj", "br", "bs", "bt", "co.bw", "by", "com.bz", "ca", "cd", "ch", "ci", "co.ck", "cl", "cm", "cn", "com.co", "co.cr", "cv", "dj", "dm", "com.do", "dz", "com.ec", "ee", "com.eg", "es", "et", "fi", "com.fj", "fm", "fr", "ga", "ge", "gg", "com.gh", "com.gi", "gl", "gm", "gr", "com.gt", "gy", "com.hk", "hn", "ht", "hr", "hu", "co.id", "ie", "co.il", "im", "co.in", "iq", "is", "it", "iw", "je", "com.je", "jo", "co.jp", "co.ke", "com.kh", "ki", "kg", "co.kr", "com.kw", "kz", "la", "com.lb", "li", "lk", "co.ls", "lt", "lu", "lv", "com.ly", "com.ma", "md", "me", "mg", "mk", "ml", "mm", "mn", "ms", "com.mt", "mu", "mv", "mw", "com.mx", "com.my", "co.mz", "na", "ng", "ni", "ne", "nl", "no", "com.np", "nr", "nu", "co.nz", "com.om", "pa", "pe", "pg", "ph", "pk", "pl", "pn", "com.pr", "ps", "pt", "com.py", "com.qa", "ro", "ru", "rw", "com.sa", "com.sb", "sc", "se", "com.sg", "sh", "si", "sk", "com.sl", "sn", "so", "sm", "sr", "st", "com.sv", "td", "tg", "co.th", "com.tj", "tl", "tm", "tn", "to", "com.tr", "tt", "com.tw", "co.tz", "com.ua", "co.ug", "co.uk", "com,uy", "co.uz", "com.vc", "co.ve", "vg", "co.vi", "com.vn", "vu", "ws", "rs", "co.za", "co.zm", "co.zw", "cat", ] def text_to_speech(text, lang, tld): # map the language name to its corresponding code lang_codes = {lang_name: lang_code for lang_code, lang_name in _main_langs().items()} lang_code = lang_codes[lang] # create the text-to-speech audio tts = gTTS(text, lang=lang_code, tld=tld) fp = io.BytesIO() tts.write_to_fp(fp) fp.seek(0) # create the audio directory if it does not exist os.makedirs(AUDIO_DIR, exist_ok=True) # generate a unique file name for the audio file file_name = str(time.time()) + '.wav' file_path = os.path.join(AUDIO_DIR, file_name) # save the audio stream to a file with open(file_path, 'wb') as f: f.write(fp.read()) # delete old audio files delete_old_audio_files() # return the file path return file_path, f.name def turn_img_into_voice(frame, lang, tld): start_time = timer() x_img, y_img = frame.size print(x_img,y_img) global cnt objects = [] prediction = model(frame) for det in prediction.xyxy[0]: class_id = int(det[5]) class_name = model.names[class_id] confidence = float(det[4]) bbox = det[:4].tolist() if(confidence >= 0.5): location = LCR(bbox, x_img, y_img) location = ACB(bbox, x_img, y_img, location) # Save the results to the list objects.append({ 'Class': class_name, #'BoundingBox': bbox, 'Location': location, 'Confidence': confidence }) with open('{:05d}.json'.format(cnt) , 'w') as f: json.dump(objects, f) text = imgae_to_text(objects) file_path, f_name = text_to_speech(text, lang, tld) pred_time = round(timer() - start_time, 5) return file_path, f_name, pred_time #path = [["D:/cuoc_thi/test_img/364452351_843427357389915_7340823319235373312_n.jpg"],["D:/cuoc_thi/test_img/download.jpg"],["D:/cuoc_thi/test_img/tong-hop-cac-mau-gia-ke-de-bat-dia-thong-minh-hot-nhat-2.jpg"]] iface = gr.Interface(fn=turn_img_into_voice, inputs=["pil", gr.inputs.Dropdown(choices=list(_main_langs().values()), label="Select language:", default='English'), gr.inputs.Dropdown(choices=[tld for tld in tlds], label="Select 
TLD:", default="com")], outputs=[gr.Audio(label="Audio", autoplay=True), gr.File(label="Audio File"), gr.Number(label="Prediction time (s)")], #examples=path, allow_flagging="never", live=True) iface.launch(enable_queue=True, inline=False)