Spaces:

Ryan-Pham
/

Obj-to-voice

Runtime error

File size: 7,930 Bytes

import gradio as gr
import torch
import os
import requests
import json
import cv2
from PIL import Image
from timeit import default_timer as timer
import numpy as np
import aiofiles
from transformers import AutoModel
from gtts import gTTS
import io
import time
from gtts.lang import _main_langs

# Directory where text_to_speech() writes the generated audio files.
AUDIO_DIR = 'audio_files'
# Age threshold in seconds (24 hours); delete_old_audio_files() removes files
# whose modification time is older than this.
MAX_FILE_AGE = 24 * 60 * 60
# YOLOv5s detector fetched through torch.hub when this module is imported;
# turn_img_into_voice() calls it on every input image.
model = torch.hub.load('ultralytics/yolov5','yolov5s', pretrained=True)
#model1 = AutoModel.from_pretrained(model)
# Number used to build the JSON dump file name in turn_img_into_voice().
# NOTE(review): nothing in the code visible here ever increments it.
cnt = 0
def LCR(bbox, x_img, y_img):
    """Classify a bounding box horizontally as left, center or right.

    Args:
        bbox: Sequence whose items 0 and 2 are the box's two x coordinates,
            in the same units as ``x_img``.
        x_img: Image width used to normalise the x coordinates.
        y_img: Image height; accepted for a uniform signature but not used.

    Returns:
        "On the left", "On the right" or "At the center".
    """
    x1 = bbox[0] / x_img
    x2 = bbox[2] / x_img

    starts_left = x1 < 0.2
    ends_right = x2 > 0.8

    if starts_left and x2 < 0.2:
        # Entire box sits inside the left 20 % band.
        location = "On the left"
    elif ends_right and x1 > 0.8:
        # Entire box sits inside the right 20 % band.
        location = "On the right"
    elif starts_left and 0.2 <= x2 <= 0.8 and (x1 + x2) < 0.4:
        # Box leaks out of the left band, but its midpoint is still inside it.
        location = "On the left"
    elif ends_right and 0.2 <= x1 <= 0.8 and (x1 + x2) > 1.6:
        # Box leaks out of the right band, but its midpoint is still inside it.
        location = "On the right"
    else:
        location = "At the center"

    print(f"x1 {x1} x2 {x2} bbox0 {bbox[0]} bbox2 {bbox[2]} x_img {x_img} LocationLCR {location}")
    return location

def ACB(bbox, x_img, y_img, location):
    """Refine a horizontally centred box into top, center or bottom.

    Args:
        bbox: Sequence whose items 1 and 3 are the box's two y coordinates,
            in the same units as ``y_img``.
        x_img: Image width; accepted for a uniform signature but not used.
        y_img: Image height used to normalise the y coordinates.
        location: Horizontal label produced earlier. Anything other than
            "At the center" is returned unchanged.

    Returns:
        "On the top", "On the bottom", "At the center", or the incoming
        ``location`` when it was not "At the center".
    """
    y1 = bbox[1] / y_img
    y2 = bbox[3] / y_img

    if location == "At the center":
        starts_high = y1 < 0.33333
        ends_low = y2 > 0.66667

        if starts_high and y2 < 0.33333:
            # Entire box sits inside the top third.
            location = "On the top"
        elif ends_low and y1 > 0.66667:
            # Entire box sits inside the bottom third.
            location = "On the bottom"
        elif starts_high and 0.33333 <= y2 <= 0.66667 and (y1 + y2) < 0.66667:
            # Box leaks out of the top third, but its midpoint is still in it.
            location = "On the top"
        elif ends_low and 0.33333 <= y1 <= 0.66667 and (y1 + y2) > 1.33333:
            # Box leaks out of the bottom third, but its midpoint is still in it.
            location = "On the bottom"
        # Every other case keeps the "At the center" label.

    print(f"y1 {y1} y2 {y2} bbox1 {bbox[1]} bbox3 {bbox[3]} y_img {y_img} Location{location}")
    return location
# Plurals that the suffix rules in _pluralize() would get wrong.
_IRREGULAR_PLURALS = {
    'person': 'people',
    'knife': 'knives',
    'mouse': 'mice',
    'sheep': 'sheep',
    'skis': 'skis',
    'scissors': 'scissors',
}


def _pluralize(noun):
    """Return a best-effort English plural of *noun* (may be multi-word)."""
    irregular = _IRREGULAR_PLURALS.get(noun)
    if irregular is not None:
        return irregular
    if noun.endswith(('s', 'x', 'z', 'ch', 'sh')):
        # "bus" -> "buses", "bench" -> "benches", "wine glass" -> "wine glasses"
        return noun + 'es'
    if len(noun) > 1 and noun.endswith('y') and noun[-2] not in 'aeiou':
        # consonant + y: "berry" -> "berries"
        return noun[:-1] + 'ies'
    return noun + 's'


def imgae_to_text(data):
    """Turn a list of detections into English sentences for speech output.

    Detections are grouped by (location, class). Each group produces one
    sentence such as "On the left, there are 2 people. ", and groups appear
    in the order they are first seen in *data*.

    Args:
        data: Iterable of dicts, each with a 'Location' and a 'Class' key.

    Returns:
        The concatenated sentences (each followed by one space), or an
        empty string when *data* is empty.
    """
    # Group on a tuple rather than a "location:class" string, so a label
    # containing ':' cannot corrupt the grouping.
    counts = {}
    for item in data:
        key = (item['Location'], item['Class'])
        counts[key] = counts.get(key, 0) + 1

    sentences = []
    for (location, class_name), value in counts.items():
        if value > 1:
            verb = "are"
            noun = _pluralize(class_name)
        else:
            verb = "is"
            noun = class_name
        sentences.append(f"{location}, there {verb} {value} {noun}. ")
    return "".join(sentences)


def delete_old_audio_files():
    """Remove files in AUDIO_DIR whose mtime is older than MAX_FILE_AGE.

    Cleanup is best-effort: a missing directory, sub-directories, and files
    that disappear while the scan is running are skipped rather than raised.
    Other OS errors (for example permission problems) still propagate.
    """
    now = time.time()
    try:
        file_names = os.listdir(AUDIO_DIR)
    except FileNotFoundError:
        # Nothing has been written yet, so there is nothing to clean up.
        return

    for file_name in file_names:
        file_path = os.path.join(AUDIO_DIR, file_name)
        if not os.path.isfile(file_path):
            # os.remove() cannot delete directories; leave them alone.
            continue
        try:
            if now - os.path.getmtime(file_path) > MAX_FILE_AGE:
                os.remove(file_path)
        except FileNotFoundError:
            # The file was removed between the listing and this check
            # (e.g. by another request running the same cleanup).
            continue

# Top-level domains offered in the "Select TLD:" dropdown and forwarded to
# gTTS as its `tld` argument. Each entry must use dots only.
tlds = [
    "com",
    "ad",
    "ae",
    "com.af",
    "com.ag",
    "com.ai",
    "com.ar",
    "as",
    "at",
    "com.au",
    "az",
    "ba",
    "com.bd",
    "be",
    "bf",
    "bg",
    "bj",
    "br",
    "bs",
    "bt",
    "co.bw",
    "by",
    "com.bz",
    "ca",
    "cd",
    "ch",
    "ci",
    "co.ck",
    "cl",
    "cm",
    "cn",
    "com.co",
    "co.cr",
    "cv",
    "dj",
    "dm",
    "com.do",
    "dz",
    "com.ec",
    "ee",
    "com.eg",
    "es",
    "et",
    "fi",
    "com.fj",
    "fm",
    "fr",
    "ga",
    "ge",
    "gg",
    "com.gh",
    "com.gi",
    "gl",
    "gm",
    "gr",
    "com.gt",
    "gy",
    "com.hk",
    "hn",
    "ht",
    "hr",
    "hu",
    "co.id",
    "ie",
    "co.il",
    "im",
    "co.in",
    "iq",
    "is",
    "it",
    "iw",
    "je",
    "com.je",
    "jo",
    "co.jp",
    "co.ke",
    "com.kh",
    "ki",
    "kg",
    "co.kr",
    "com.kw",
    "kz",
    "la",
    "com.lb",
    "li",
    "lk",
    "co.ls",
    "lt",
    "lu",
    "lv",
    "com.ly",
    "com.ma",
    "md",
    "me",
    "mg",
    "mk",
    "ml",
    "mm",
    "mn",
    "ms",
    "com.mt",
    "mu",
    "mv",
    "mw",
    "com.mx",
    "com.my",
    "co.mz",
    "na",
    "ng",
    "ni",
    "ne",
    "nl",
    "no",
    "com.np",
    "nr",
    "nu",
    "co.nz",
    "com.om",
    "pa",
    "pe",
    "pg",
    "ph",
    "pk",
    "pl",
    "pn",
    "com.pr",
    "ps",
    "pt",
    "com.py",
    "com.qa",
    "ro",
    "ru",
    "rw",
    "com.sa",
    "com.sb",
    "sc",
    "se",
    "com.sg",
    "sh",
    "si",
    "sk",
    "com.sl",
    "sn",
    "so",
    "sm",
    "sr",
    "st",
    "com.sv",
    "td",
    "tg",
    "co.th",
    "com.tj",
    "tl",
    "tm",
    "tn",
    "to",
    "com.tr",
    "tt",
    "com.tw",
    "co.tz",
    "com.ua",
    "co.ug",
    "co.uk",
    "com.uy",
    "co.uz",
    "com.vc",
    "co.ve",
    "vg",
    "co.vi",
    "com.vn",
    "vu",
    "ws",
    "rs",
    "co.za",
    "co.zm",
    "co.zw",
    "cat",
]

def text_to_speech(text, lang, tld):
    """Synthesise *text* with gTTS and store the audio under AUDIO_DIR.

    Args:
        text: Text to speak.
        lang: Language display name, i.e. one of the values of
            ``_main_langs()`` (the UI default is 'English').
        tld: Top-level domain passed through to gTTS.

    Returns:
        A ``(file_path, file_path)`` tuple: the saved file's path, given
        twice because the caller feeds it to two separate outputs.

    Raises:
        KeyError: If *lang* is not a known language name.
    """
    # _main_langs() maps code -> name; invert it to look up the code by name.
    lang_codes = {lang_name: lang_code for lang_code, lang_name in _main_langs().items()}
    lang_code = lang_codes[lang]

    # Synthesise into memory first so that a failed request does not leave
    # a partial file behind in AUDIO_DIR.
    tts = gTTS(text, lang=lang_code, tld=tld)
    buffer = io.BytesIO()
    tts.write_to_fp(buffer)

    # create the audio directory if it does not exist
    os.makedirs(AUDIO_DIR, exist_ok=True)

    # gTTS produces MP3 data, so the file gets an .mp3 extension (it was
    # previously mislabelled as .wav).
    file_name = str(time.time()) + '.mp3'
    file_path = os.path.join(AUDIO_DIR, file_name)

    with open(file_path, 'wb') as f:
        f.write(buffer.getvalue())

    # delete old audio files
    delete_old_audio_files()

    return file_path, file_path


def turn_img_into_voice(frame, lang, tld):
    """Detect objects in *frame* and return a spoken description of them.

    Args:
        frame: Input image exposing a ``size`` attribute of (width, height),
            or ``None`` when no image is supplied.
        lang: Language display name forwarded to text_to_speech().
        tld: Top-level domain forwarded to text_to_speech().

    Returns:
        ``(file_path, f_name, pred_time)``: the two values returned by
        text_to_speech() and the elapsed time in seconds rounded to 5
        decimals. ``(None, None, 0.0)`` when *frame* is ``None``.
    """
    if frame is None:
        # There is no image to analyse (for example after the input has been
        # cleared), so skip detection instead of failing on `frame.size`.
        return None, None, 0.0

    start_time = timer()
    x_img, y_img = frame.size
    print(x_img, y_img)
    objects = []

    prediction = model(frame)
    for det in prediction.xyxy[0]:
        # Each row is read as: four box coordinates, confidence, class id.
        confidence = float(det[4])
        if confidence < 0.5:
            continue
        class_name = model.names[int(det[5])]
        bbox = det[:4].tolist()
        location = LCR(bbox, x_img, y_img)
        location = ACB(bbox, x_img, y_img, location)
        # Save the results to the list
        objects.append({
            'Class': class_name,
            'Location': location,
            'Confidence': confidence
        })

    # Dump the detections once per call. This used to sit inside the loop,
    # which rewrote the file for every detection and skipped it entirely
    # when nothing was detected.
    # NOTE(review): `cnt` is never incremented in the code visible here, so
    # the same file is overwritten on every call.
    with open('{:05d}.json'.format(cnt), 'w') as f:
        json.dump(objects, f)

    # gTTS refuses empty text, so say something when nothing was detected.
    text = imgae_to_text(objects) or "No objects detected."

    file_path, f_name = text_to_speech(text, lang, tld)
    pred_time = round(timer() - start_time, 5)
    return file_path, f_name, pred_time


#path = [["D:/cuoc_thi/test_img/364452351_843427357389915_7340823319235373312_n.jpg"],["D:/cuoc_thi/test_img/download.jpg"],["D:/cuoc_thi/test_img/tong-hop-cac-mau-gia-ke-de-bat-dia-thong-minh-hot-nhat-2.jpg"]]


# Build the UI: an image plus language / TLD selectors in; the generated
# audio (player and downloadable file) and the processing time out.
# The dropdowns use the top-level gr.Dropdown component (default given via
# `value`) instead of the legacy `gr.inputs` namespace, to match the
# new-style output components already used below.
iface = gr.Interface(
    fn=turn_img_into_voice,
    inputs=[
        gr.Image(type="pil"),
        gr.Dropdown(choices=list(_main_langs().values()), label="Select language:", value='English'),
        gr.Dropdown(choices=list(tlds), label="Select TLD:", value="com"),
    ],
    outputs=[
        gr.Audio(label="Audio", autoplay=True),
        gr.File(label="Audio File"),
        gr.Number(label="Prediction time (s)"),
    ],
    #examples=path,
    allow_flagging="never",
    live=True,
)

# Queueing is enabled on the interface itself rather than through the
# legacy `enable_queue` launch argument.
iface.queue()
iface.launch(inline=False)