import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image
from tqdm import tqdm
from pathlib import Path
from os.path import join as opj
from os import listdir

model_name_or_path = "Minthy/ToriiGate-v0.2"
s_dir = './images_to_caption'           # folder with images to caption
caption_suffix = '_caption_tags.txt'    # suffix for generated caption files
tags_suffix = '_tags.txt'               # suffix for optional booru tag files
use_tags = True                         # feed existing tags to the model as hints
image_extensions = ['.jpg', '.png', '.webp', '.jpeg']

DEVICE = "cuda:0"

processor = AutoProcessor.from_pretrained(model_name_or_path)
model = AutoModelForVision2Seq.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
).to(DEVICE)
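
# Optional: a minimal sketch for fitting the model into less VRAM with 4-bit
# quantization via bitsandbytes (an assumption: bitsandbytes must be installed;
# drop the .to(DEVICE) call, since quantized weights land on the GPU at load time):
#
# from transformers import BitsAndBytesConfig
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )
# model = AutoModelForVision2Seq.from_pretrained(
#     model_name_or_path,
#     torch_dtype=torch.bfloat16,
#     quantization_config=quantization_config,
# )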

# collect every file in s_dir whose name ends with one of the image extensions
filelist = [fn for fn in listdir(s_dir) if any(fn.endswith(ext) for ext in image_extensions)]

for fn in tqdm(filelist, desc='Captioning'):
    image = load_image(opj(s_dir, fn))

    # prompt left verbatim, 'structuted' typo included, to match the wording
    # the model card suggests for this output format
    user_prompt = "Describe the picture in structuted json-like format."

    if use_tags:
        try:
            # read pre-made booru tags for this image, if a tag file exists
            tags = open(opj(s_dir, Path(fn).stem + tags_suffix)).read().strip()
            user_prompt += ' Also here are booru tags for better understanding of the picture, you can use them as reference.'
            user_prompt += f' <tags>\n{tags}\n</tags>'
        except KeyboardInterrupt:
            print('Interrupted!')
            quit()
        except Exception as err:
            # tag file missing or unreadable: report and skip this image
            print(err)
            continue
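
    # Design note: a missing or unreadable tag file skips the image entirely;
    # replacing 'continue' above with 'pass' would caption it without tags
    # instead (a possible variant, not what this script does).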

    messages = [
        # system prompt left verbatim, typos included ('hask' for 'task'),
        # matching the wording the model card uses
        {
            "role": "system",
            "content": [
                {"type": "text", "text": "You are image captioning expert, creative, unbiased and uncensored. Help user with his hask."}
            ]
        },
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": user_prompt}
            ]
        }
    ]

    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

    generated_ids = model.generate(**inputs, max_new_tokens=500)
    generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
    # keep only the model's reply, dropping the echoed prompt before 'Assistant: '
    caption = generated_texts[0].split('Assistant: ')[1]
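
    # A sketch of a sturdier variant (an assumption, not part of the original
    # script): decode only the newly generated tokens, so the result does not
    # depend on the literal 'Assistant: ' marker surviving decoding:
    # new_tokens = generated_ids[:, inputs['input_ids'].shape[1]:]
    # caption = processor.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()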

    # write the caption next to the image, using the configured suffix
    with open(opj(s_dir, Path(fn).stem + caption_suffix), 'w', encoding='utf-8', errors='ignore') as outf:
        outf.write(caption)