photo-merge / app.py
keplersj's picture
typos
fe3afc1
raw
history blame contribute delete
No virus
2.16 kB
import streamlit as st
from PIL import Image
from transformers import pipeline as transformer
from diffusers import StableDiffusionPipeline
captions = []
with st.sidebar:
files = st.file_uploader("Upload images to blend", accept_multiple_files=True)
st.divider()
caption_model = st.selectbox("Caption Model", [
"Salesforce/blip-image-captioning-large",
"nlpconnect/vit-gpt2-image-captioning",
"microsoft/git-base",
"ydshieh/vit-gpt2-coco-en"
])
caption_max_tokens = st.number_input("Image Caption: Max Tokens", value=20)
st.divider()
caption_concat_joiner = st.text_input("Caption Concatenation Joiner", value=" ")
st.divider()
diffusion_model = st.selectbox("Diffusion Model", [
"stabilityai/stable-diffusion-xl-base-1.0",
"runwayml/stable-diffusion-v1-5",
"stabilityai/stable-diffusion-2-1",
"CompVis/stable-diffusion-v1-4"
])
image_gen_height = st.number_input("Stable Diffusion: Height", value=512)
image_gen_width = st.number_input("Stable Diffusion: Width", value=512)
image_gen_steps = st.slider("Stable Diffusion: Inference Steps", value=50)
image_gen_guidance = st.slider("Stable Diffusion: Guidance Scale", value=7.5)
image_gen_number = st.number_input("Stable Diffusion: Images Generated", value=1)
for file_name in files:
image = Image.open(file_name)
with st.spinner('Captioning Provided Image'):
captioner = transformer(model=caption_model)
caption = captioner(image, max_new_tokens=caption_max_tokens)[0]['generated_text']
captions.append(caption)
st.image(image, caption=caption)
if len(captions) > 0:
st.divider()
description = caption_concat_joiner.join(captions)
pipe = StableDiffusionPipeline.from_pretrained(diffusion_model)
with st.spinner(f'Generating Photo for "{description}"'):
images = pipe(description, height=image_gen_height, width=image_gen_width, num_inference_steps=image_gen_steps, guidance_scale=image_gen_guidance, num_images_per_prompt=image_gen_number).images
for image in images:
st.image(image, caption=description)