import argparse
import io
import logging
import os

import gradio as gr
import requests
from PIL import Image
from pillow_heif import register_heif_opener
from transformers import pipeline

os.environ.setdefault("GRADIO_ANALYTICS_ENABLED", "False")
LOG_LEVEL = os.getenv("LOG_LEVEL", "DEBUG")
MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", 200))
# https://huggingface.co/models?pipeline_tag=image-to-text&sort=likes
MODEL = os.getenv("MODEL", "Salesforce/blip-image-captioning-large")

register_heif_opener()

logging.basicConfig(level=LOG_LEVEL)
logger = logging.getLogger(__name__)


def setup_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--share", action="store_true", default=False)
    return parser.parse_args()


def load_image_from_url(url):
    """Download an image from a URL and return it as a PIL Image."""
    try:
        # timeout so a slow or unresponsive host cannot hang the request forever
        response = requests.get(url, timeout=30)
        if not response.ok:
            raise Exception("Error downloading image")
        image = Image.open(io.BytesIO(response.content))
        return image
    except Exception as e:
        logger.error("Error loading image from URL: %s", e)
        raise


def graptioner(image, url):
    """Caption the uploaded image, or the image fetched from the URL if one is given."""
    if url and url.strip():
        image = load_image_from_url(url)
    if image is None:
        raise Exception("No image provided")
    width, height = image.size
    if width < 1 or height < 1:
        raise Exception("Invalid image")
    logger.debug("Loaded image size: %sx%s", width, height)
    # generate caption
    result = captioner(image)
    return result[0]["generated_text"]


if __name__ == "__main__":
    args = setup_args()
    logger.info("Loading model...")
    # simpler model: "ydshieh/vit-gpt2-coco-en"
    captioner = pipeline(
        "image-to-text",
        model=MODEL,
        max_new_tokens=MAX_NEW_TOKENS,
    )
    logger.info("Done loading model.")
    iface = gr.Interface(
        fn=graptioner,
        inputs=[
            gr.Image(type="pil", label="Upload Image"),
            gr.Textbox(lines=1, placeholder="Image URL", label="Image URL"),
        ],
        outputs=["text"],
        allow_flagging="never",
    )
    iface.launch(share=args.share)
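
# Usage sketch (assumptions: the script is saved as app.py and gradio, requests,
# pillow, pillow-heif, and transformers are installed; MODEL, LOG_LEVEL, and
# MAX_NEW_TOKENS are the environment variables read above, and --share is the
# only CLI flag):
#
#   # run locally with the default BLIP captioning model
#   python app.py
#
#   # use the smaller model mentioned above and expose a public Gradio link
#   MODEL=ydshieh/vit-gpt2-coco-en MAX_NEW_TOKENS=50 python app.py --share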