# Text to Video
# import torch
# from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
# from diffusers.utils import export_to_video
# import streamlit as st
# import numpy as np
#
# # Title and User Input
# st.title("Text-to-Video with Streamlit")
# prompt = st.text_input("Enter your text prompt:", "Spiderman is surfing")
#
# # Button to trigger generation
# if st.button("Generate Video"):
#     # Ensure you have 'accelerate' version 0.17.0 or higher
#     import accelerate
#     if accelerate.__version__ < "0.17.0":
#         st.warning("Please upgrade 'accelerate' to version 0.17.0 or higher for CPU offloading.")
#     else:
#         with st.spinner("Generating video..."):
#             # Define the pipeline for video generation
#             pipe = DiffusionPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b",
#                                                      torch_dtype=torch.float16, variant="fp16")
#             pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
#             pipe.enable_model_cpu_offload()
#
#             # Generate video frames
#             video_frames = pipe(prompt, num_inference_steps=25).frames
#
#             # Create dummy frames for testing (replace with the generated video_frames later)
#             dummy_frames = [np.ones((256, 256, 3), dtype=np.uint8) for _ in range(20)]
#
#             # Export to video
#             video_path = export_to_video(dummy_frames)
#
#             # Display the video in the Streamlit app
#             st.video(video_path)
# Text to 3D
# import streamlit as st
# import torch
# from diffusers import ShapEPipeline
# from diffusers.utils import export_to_gif
# from PIL import Image
# import numpy as np
#
# # Model loading (Ideally done once at the start for efficiency)
# ckpt_id = "openai/shap-e"
#
# @st.cache_resource  # Caches the model for faster subsequent runs
# def load_model():
#     return ShapEPipeline.from_pretrained(ckpt_id).to("cuda")
#
# def process_image_for_pil(image):
#     """Converts a model output frame (torch.Tensor or np.ndarray) to a PIL Image."""
#     if isinstance(image, torch.Tensor):
#         # Your PyTorch conversion logic here
#         raise NotImplementedError("Add PyTorch tensor conversion logic here.")
#     elif isinstance(image, np.ndarray):
#         image_array = image.astype("uint8")  # Assuming 8-bit conversion is needed
#         return Image.fromarray(image_array)
#     else:
#         raise TypeError("Unsupported image format. Please provide conversion logic.")
#
# def should_resize(image):
#     """Determines whether to resize images (replace with your own logic)."""
#     return image.width > 512 or image.height > 512
#
# test_image = np.random.randint(0, 256, size=(256, 256, 3), dtype=np.uint8)  # Placeholder image
# result = process_image_for_pil(test_image)
#
# pipe = load_model()
#
# # App Title
# st.title("Shark 3D Image Generator")
#
# # User Inputs
# prompt = st.text_input("Enter your prompt:", "a shark")
# guidance_scale = st.slider("Guidance Scale", 0.0, 20.0, 15.0, step=0.5)
#
# # Generate and Display Images
# if st.button("Generate"):
#     with st.spinner("Generating images..."):
#         images = pipe(prompt, guidance_scale=guidance_scale, num_inference_steps=64).images
#
#         # Process images for PIL conversion and resize if needed
#         pil_images = []
#         for image in images:
#             processed_image = process_image_for_pil(image)
#             if should_resize(processed_image):
#                 pil_images.append(processed_image.resize((256, 256)))
#             else:
#                 pil_images.append(processed_image)  # Append without resizing
#
#         gif_path = export_to_gif(pil_images, "shark_3d.gif")
#         st.image(pil_images[0])
#         st.success("GIF saved as shark_3d.gif")
# Visual QA
import requests
from PIL import Image
from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
import streamlit as st

# Load the Pix2Struct AI2D model and its processor once at module import
model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-ai2d-base")
processor = Pix2StructProcessor.from_pretrained("google/pix2struct-ai2d-base")

# Quick smoke test on a sample diagram from the transformers documentation images
image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
question = "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud"
inputs = processor(images=image, text=question, return_tensors="pt")
predictions = model.generate(**inputs, max_new_tokens=1000)
# print(processor.decode(predictions[0], skip_special_tokens=True))
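
# Helper sketch: wraps the Pix2Struct processor/model calls above so the chat
# handler below can reuse them. The name `answer_question` is a local
# convenience introduced here, not a transformers API.
def answer_question(img_url, question_text):
    """Fetch the image behind `img_url` and answer `question_text` with Pix2Struct."""
    qa_image = Image.open(requests.get(img_url, stream=True).raw)
    qa_inputs = processor(images=qa_image, text=question_text, return_tensors="pt")
    qa_predictions = model.generate(**qa_inputs, max_new_tokens=1000)
    return processor.decode(qa_predictions[0], skip_special_tokens=True)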
def load_image():
    with st.sidebar:
        url = st.text_input("Enter Image URL")
        preset = st.selectbox(
            "Select Image",
            (
                "https://images.unsplash.com/photo-1593466144596-8abd50ad2c52?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3434&q=80",
                "https://images.unsplash.com/photo-1566438480900-0609be27a4be?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3394&q=80",
            ),
        )
        img = url or preset  # A typed URL takes priority over the preset choice
        if st.button("Load Image"):
            st.write("Image Uploaded!")
            st.image(img)
        else:
            st.warning("Please enter an image URL and click 'Load Image' before asking a question.")
        return img
def visual_qna():
    st.title("Visual Q&A")
    img = load_image()
    if img:
        if query := st.chat_input("Enter your message"):
            # Run the Pix2Struct model on the selected image URL and the user's question
            response = answer_question(img, query)
            with st.chat_message("assistant"):
                st.write(response)
    else:
        st.warning("Please enter an image URL and click 'Load Image' before asking a question.")
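
# Entry point sketch: Streamlit runs this file top to bottom, so the Q&A page is
# only rendered if visual_qna() is called here (assumed to be the intended entry point).
visual_qna()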