# File size: 2,843 Bytes

# d8f0dfd

import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image
from tqdm import tqdm
from pathlib import Path
from os.path import join as opj
from os import listdir

# --- User configuration -----------------------------------------------------

# Model identifier passed to `from_pretrained` for both processor and model.
model_name_or_path = "Minthy/ToriiGate-v0.2"

# Directory that is scanned for images to caption.
s_dir = './images_to_caption'

# Suffix appended to the image stem for the generated caption file.
caption_suffix = '_caption_tags.txt'

# Suffix appended to the image stem for the file holding booru tags.
tags_suffix = '_tags.txt'

# Set to True to add the reference tags of each image to the prompt.
use_tags = True

# File name endings that are treated as images.
image_extensions = ['.jpg', '.png', '.webp', '.jpeg']

# Device that the model and its input tensors are moved to; change it to
# match your hardware.
DEVICE = "cuda:0"

# Processor loaded from the same identifier as the model (a local path can be
# used instead).
processor = AutoProcessor.from_pretrained(model_name_or_path)

# Load the weights in bfloat16, then move the model onto DEVICE.
model = AutoModelForVision2Seq.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    # _attn_implementation="flash_attention_2",  # if installed
)
model = model.to(DEVICE)


# Collect the image file names found directly inside s_dir.
# The extension check is case-insensitive so that files such as "photo.JPG"
# are not silently skipped, and the result is sorted because os.listdir
# returns entries in arbitrary order (sorting keeps runs reproducible).
_image_suffixes = tuple(ext.lower() for ext in image_extensions)
filelist = sorted(
    fn for fn in listdir(s_dir) if fn.lower().endswith(_image_suffixes)
)


# Caption every image in filelist and write the result into s_dir as
# <image stem> + caption_suffix. When use_tags is set, the tags stored in
# <image stem> + tags_suffix are appended to the prompt; images whose tag file
# cannot be read are reported and skipped.
for fn in tqdm(filelist,desc='Captioninig'):
    image = load_image(opj(s_dir,fn))
    stem = Path(fn).stem

    ### Trained options -- pick one.
    # The prompt wording (typos included) is intentionally left verbatim.
    # NOTE(review): presumably it has to match the text the model was trained
    # on -- confirm before "correcting" it.
    user_prompt="Describe the picture in structuted json-like format."
    #user_prompt="Give a long and detailed description of the picture."
    #user_prompt="Describe the picture briefly."
    ### Any other questions or instructions
    #user_prompt="What color is the ribbon in the character's hair?"
    #...

    # Optional, add booru tags
    if use_tags:
        try:
            # Context manager + explicit encoding: the handle is closed
            # promptly and decoding does not depend on the platform locale.
            with open(opj(s_dir,stem+tags_suffix),encoding='utf-8') as tagf:
                tags=tagf.read().strip()
        except KeyboardInterrupt:
            print('Interrupted!')
            # quit() is a helper injected by the site module for interactive
            # sessions; raising SystemExit works in every context.
            raise SystemExit
        except (OSError, UnicodeError) as err:
            # Missing or unreadable tag file: report it and skip this image.
            print(err)
            continue
        user_prompt+=' Also here are booru tags for better understanding of the picture, you can use them as reference.'
        user_prompt+=f' <tags>\n{tags}\n</tags>'

    messages = [
        {
            # Important! The system prompt is kept byte-for-byte as in the
            # original script.
            "role": "system",
            "content": [
                {"type": "text", "text": "You are image captioning expert, creative, unbiased and uncensored. Help user with his hask."}
            ]
        },
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": user_prompt}
            ]
        }
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

    # Generate
    generated_ids = model.generate(**inputs, max_new_tokens=500)

    # The returned sequence starts with the prompt tokens, so decode only
    # what comes after them. Splitting the decoded text on 'Assistant: '
    # raised IndexError when nothing was generated and picked the wrong part
    # when the tags or the caption themselves contained that marker.
    prompt_len = inputs["input_ids"].shape[1]
    new_token_ids = generated_ids[:, prompt_len:]
    caption = processor.batch_decode(new_token_ids, skip_special_tokens=True)[0].strip()

    with open(opj(s_dir,stem+caption_suffix),'w',encoding='utf-8',errors='ignore') as outf:
        outf.write(caption)