import torch
from threading import Thread

import gradio as gr
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextIteratorStreamer

# Use the GPU if one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-trained model and tokenizer
model_path = "Blexus/Quble_Test_Model_v1_Pretrain"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path).to(device)

# Set the model to evaluation mode (disables dropout)
model.eval()


# Generator function: yields the running text so Gradio can stream it to the UI
def generate_text(prompt):
    # Tokenize and encode the input prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    # TextIteratorStreamer receives decoded tokens as generate() produces them,
    # which is what enables true token-by-token streaming. (Iterating over
    # generate(...).sequences would yield whole finished sequences, not steps.)
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    generation_kwargs = dict(
        input_ids=input_ids,
        max_length=50,  # Maximum total length (prompt + continuation) in tokens
        do_sample=True,  # Sample instead of greedy decoding
        top_k=50,  # Consider only the 50 most likely next tokens
        top_p=0.95,  # Nucleus sampling: keep tokens covering 95% of the probability mass
        pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no pad token; reuse EOS
        streamer=streamer,
    )

    # generate() blocks until decoding finishes (it already runs under
    # torch.no_grad() internally), so run it in a background thread and
    # consume the streamer in the foreground.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Accumulate the pieces and yield the full partial text at each step,
    # prompt included, so the UI always shows the complete output so far.
    generated = prompt
    for new_text in streamer:
        generated += new_text
        yield generated
    thread.join()


# Create a Gradio interface; a generator fn streams its yields automatically
interface = gr.Interface(
    fn=generate_text,  # Function called when the user submits a prompt
    inputs="text",  # Input type: single-line text
    outputs=gr.Markdown(),  # Render the streamed output as Markdown
    title="Quble Text Generation",  # Title of the UI
    description="Enter a prompt to generate text using Quble with live streaming.",
)

# Launch the Gradio app
interface.launch()
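
# Usage sketch (assumptions: the script is saved as app.py and the dependencies
# are installed; the filename is illustrative, not taken from the source):
#
#   pip install torch transformers gradio
#   python app.py
#
# launch() starts a local server (http://127.0.0.1:7860 by default); the output
# panel updates with each partial completion as tokens are generated.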