# Text to Video
# import torch
# from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
# from diffusers.utils import export_to_video
# import streamlit as st
# import numpy as np
#
# # Title and User Input
# st.title("Text-to-Video with Streamlit")
# prompt = st.text_input("Enter your text prompt:", "Spiderman is surfing")
#
# # Button to trigger generation
# if st.button("Generate Video"):
#     # Ensure you have 'accelerate' version 0.17.0 or higher (needed for CPU offloading)
#     import accelerate
#     from packaging import version
#     if version.parse(accelerate.__version__) < version.parse("0.17.0"):
#         st.warning("Please upgrade 'accelerate' to version 0.17.0 or higher for CPU offloading.")
#     else:
#         with st.spinner("Generating video..."):
#             # Define the text-to-video pipeline
#             pipe = DiffusionPipeline.from_pretrained(
#                 "damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16"
#             )
#             pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
#             pipe.enable_model_cpu_offload()
#
#             # Generate video frames
#             video_frames = pipe(prompt, num_inference_steps=25).frames
#
#             # Create dummy frames for testing (replace with actual manipulation later)
#             dummy_frames = [np.ones((256, 256, 3), dtype=np.uint8) for _ in range(20)]
#
#             # Export to video
#             video_path = export_to_video(dummy_frames)
#
#             # Display the video in the Streamlit app
#             st.video(video_path)


# Text to 3D
# import streamlit as st
# import torch
# from diffusers import ShapEPipeline
# from diffusers.utils import export_to_gif
# from PIL import Image
# import numpy as np
#
# # Model loading (ideally done once at the start for efficiency)
# ckpt_id = "openai/shap-e"
#
# def process_image_for_pil(image):
#     """Converts pipeline outputs (tensor, array, or PIL image) to a PIL image."""
#     if isinstance(image, torch.Tensor):
#         # Assumes float tensor values in [0, 1]
#         image_array = (image.detach().cpu().numpy() * 255).astype("uint8")
#         return Image.fromarray(image_array)
#     elif isinstance(image, np.ndarray):
#         image_array = image.astype("uint8")  # Assuming 8-bit conversion is needed
#         return Image.fromarray(image_array)
#     elif isinstance(image, Image.Image):
#         return image
#     else:
#         raise TypeError("Unsupported image format. Please provide conversion logic.")
#
# test_image = np.random.randint(0, 256, size=(256, 256, 3), dtype=np.uint8)  # Placeholder image
# result = process_image_for_pil(test_image)
#
# def should_resize(image):
#     """Determines whether to resize an image (replace with your own logic)."""
#     return image.width > 512 or image.height > 512
#
# @st.cache_resource  # Caches the model for faster subsequent runs
# def load_model():
#     return ShapEPipeline.from_pretrained(ckpt_id).to("cuda")
#
# pipe = load_model()
#
# # App Title
# st.title("Shark 3D Image Generator")
#
# # User Inputs
# prompt = st.text_input("Enter your prompt:", "a shark")
# guidance_scale = st.slider("Guidance Scale", 0.0, 20.0, 15.0, step=0.5)
#
# # Generate and Display Images
# if st.button("Generate"):
#     with st.spinner("Generating images..."):
#         images = pipe(prompt, guidance_scale=guidance_scale, num_inference_steps=64).images
#
#         # Process images for PIL conversion
#         # Resize images (optional)
#         pil_images = []  # Stores resized images if needed
#         for image in images:
#             processed_image = process_image_for_pil(image)
#             if should_resize(processed_image):
#                 resized_image = processed_image.resize((256, 256))
#                 pil_images.append(resized_image)
#             else:
#                 pil_images.append(processed_image)  # Append without resizing
#
#         gif_path = export_to_gif(pil_images, "shark_3d.gif")
#         st.image(pil_images[0])
#         st.success("GIF saved as shark_3d.gif")


# visual QA
import requests
from PIL import Image
from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
import streamlit as st

# Sanity-check example: answer a fixed question about a demo diagram
image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)

model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-ai2d-base")
processor = Pix2StructProcessor.from_pretrained("google/pix2struct-ai2d-base")

question = "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud"

inputs = processor(images=image, text=question, return_tensors="pt")
predictions = model.generate(**inputs, max_new_tokens=1000)
# print(processor.decode(predictions[0], skip_special_tokens=True))


def load_image():
    """Lets the user enter an image URL (or pick a sample) in the sidebar."""
    with st.sidebar:
        img = st.text_input("Enter Image URL") or st.selectbox(
            "Select Image",
            (
                "https://images.unsplash.com/photo-1593466144596-8abd50ad2c52?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3434&q=80",
                "https://images.unsplash.com/photo-1566438480900-0609be27a4be?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3394&q=80",
            ),
        )
        if st.button("Load Image"):
            st.write("Image Uploaded!")
            st.image(img)
    return img


def visual_qna():
    st.title("Visual Q&A")
    img = load_image()
    if img:
        if query := st.chat_input("Enter your message"):
            # Fetch the selected image and run Pix2Struct on the user's question
            pil_image = Image.open(requests.get(img, stream=True).raw)
            qna_inputs = processor(images=pil_image, text=query, return_tensors="pt")
            prediction = model.generate(**qna_inputs, max_new_tokens=1000)
            response = processor.decode(prediction[0], skip_special_tokens=True)
            with st.chat_message("assistant"):
                st.write(response)
    else:
        st.warning("Please enter an image URL and click 'Load Image' before asking a question.")
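
# Entry point: a minimal sketch assuming the Visual Q&A app is the demo this script
# should launch (the text-to-video and text-to-3D sections above remain commented out).
# Run with `streamlit run <this_file>.py`.
visual_qna()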