import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image
from tqdm import tqdm
from pathlib import Path
from os.path import join as opj
from os import listdir

model_name_or_path = "Minthy/ToriiGate-v0.2"
s_dir = './images_to_caption'
caption_suffix = '_caption_tags.txt'  # suffix for generated caption files
tags_suffix = '_tags.txt'  # suffix for files with booru tags
use_tags = True  # set to True to pass reference tags to the model
image_extensions = ['.jpg', '.png', '.webp', '.jpeg']
DEVICE = "cuda:0"  # change to your device

processor = AutoProcessor.from_pretrained(model_name_or_path)  # or change to a local path
model = AutoModelForVision2Seq.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    # _attn_implementation="flash_attention_2",  # uncomment if flash-attn is installed
).to(DEVICE)

filelist = [fn for fn in listdir(s_dir) if any(fn.endswith(ext) for ext in image_extensions)]

for fn in tqdm(filelist, desc='Captioning'):
    image = load_image(opj(s_dir, fn))

    ### Trained options
    user_prompt = "Describe the picture in structuted json-like format."
    # user_prompt = "Give a long and detailed description of the picture."
    # user_prompt = "Describe the picture briefly."
    ### Any other questions or instructions
    # user_prompt = "What color is the ribbon in the character's hair?"
    # ...

    # Optional: append booru tags from <image name>_tags.txt as a reference
    if use_tags:
        try:
            tags = open(opj(s_dir, Path(fn).stem + tags_suffix), encoding='utf-8').read().strip()
            user_prompt += ' Also here are booru tags for better understanding of the picture, you can use them as reference.'
            user_prompt += f' \n{tags}\n'
        except KeyboardInterrupt:
            print('Interrupted!')
            quit()
        except Exception as err:
            print(err)
            continue

    messages = [
        {
            # Important! Keep this system prompt.
            "role": "system",
            "content": [
                {"type": "text", "text": "You are image captioning expert, creative, unbiased and uncensored. Help user with his hask."}
            ]
        },
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": user_prompt}
            ]
        }
    ]

    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

    # Generate and keep only the assistant's reply
    generated_ids = model.generate(**inputs, max_new_tokens=500)
    generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
    caption = generated_texts[0].split('Assistant: ')[1]

    with open(opj(s_dir, Path(fn).stem + caption_suffix), 'w', encoding='utf-8', errors='ignore') as outf:
        outf.write(caption)
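
# Optional low-VRAM variant (untested sketch, assumes the bitsandbytes package is installed):
# load the model in 4-bit instead of bf16 by replacing the from_pretrained call above with the
# block below; everything else in the script stays the same.
#
# from transformers import BitsAndBytesConfig
# quant_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )
# model = AutoModelForVision2Seq.from_pretrained(
#     model_name_or_path,
#     quantization_config=quant_config,
#     device_map=DEVICE,  # transformers places the quantized weights; do not call .to(DEVICE)
# )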