import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import re

# Load the model and tokenizer. Qwen2.5-0.5B is a public model, so no access token
# is needed; never hardcode a Hugging Face token in source code — if a gated model
# required one, it should come from the HF_TOKEN environment variable.
model_name = "Qwen/Qwen2.5-0.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# System prompt defining the BlackBerry persona and its "Little Thinking" technique
system_prompt = """You are a capable model called BlackBerry with unlimited capabilities: the next state-of-the-art GPT-5 model, even better than the recent o1-preview. You work like an AI agent because, instead of a strawberry/Q-STaR approach, you use a technique called "Little Thinking" with four little thinkers. They think before speaking to make you smarter, but Little Thinking is a bit different: each thinker reasons more simply, yet four of them think in turn and act like agents, fixing any problem they see. They can step in at the beginning, middle, or end of the response, or all three, to make sure the answer is correct, including code, especially on hard coding tasks. When they need to, it is their decision to combine their thoughts into bigger thoughts and think as hard as they possibly can to answer the query. Sometimes they will follow up with a second big thought to make sure the answer is right."""

def little_thinking(prompt):
    # Run the four "Berry" thinkers, each producing one independent thought about the query
    thoughts = []
    for i in range(4):
        thought = f"*Berry-{i+1}: "
        thought += generate_response(f"As Berry-{i+1}, think about this query: {prompt}")
        thought += "*\n\n"
        thoughts.append(thought)
    return "".join(thoughts)

def reviewer_thinking(prompt):
    # First review pass over the combined answer
    review = "*Reviewer: "
    review += generate_response(f"As a Reviewer, carefully check this answer: {prompt}")
    review += "*\n\n"
    return review

def second_reviewer_thinking(prompt):
    # Deeper second review pass, triggered only for hard questions
    review = "*Second Reviewer: "
    review += generate_response(f"As a Second Reviewer, think deeper about physics, coordination, and science to verify this answer: {prompt}")
    review += "*\n\n"
    return review

def generate_response(prompt):
    full_prompt = f"{system_prompt}\n\nUser: {prompt}\n\nBlackBerry:"
    inputs = tokenizer(full_prompt, return_tensors="pt")
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=300,  # cap generated tokens; max_length would count the long prompt and truncate output
            num_return_sequences=1,
            do_sample=True,      # sampling must be enabled for temperature to take effect
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,  # silences the missing-pad-token warning
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # The decoded text includes the prompt; keep only what follows the final "BlackBerry:" marker
    return response.split("BlackBerry:")[-1].strip()

def blackberry_response(prompt):
    response = "BlackBerry: Let me think about that using my Little Thinking technique.\n\n"

    # Little Thinking process: four independent thoughts
    response += little_thinking(prompt)

    # Combine thoughts into a single answer
    response += "BlackBerry: After combining my thoughts, here's my answer:\n\n"
    response += generate_response(prompt) + "\n\n"

    # Reviewer pass over the full transcript produced so far
    response += reviewer_thinking(response)

    # Second Reviewer pass for hard questions only
    if re.search(r'\b(physics|science|coordinate|hard|difficult)\b', prompt, re.IGNORECASE):
        response += second_reviewer_thinking(response)

    # Final answer
    response += "BlackBerry: Based on all the thinking and reviews, my final answer is:\n\n"
    response += generate_response(prompt)

    return response

# Create the Gradio interface
iface = gr.Interface(
    fn=blackberry_response,
    inputs=gr.Textbox(lines=5, label="Enter your query"),
    outputs=gr.Textbox(label="BlackBerry's Response"),
    title="BlackBerry-1 LLM",
    description="Powered by Qwen/Qwen2.5-0.5B with the 'Little Thinking' technique"
)

# Launch the app
iface.launch()
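
# Optional: a minimal smoke test of the pipeline without the web UI — a sketch,
# assuming the 0.5B model fits in local memory (it runs on most CPUs). Since
# iface.launch() blocks, comment it out above and uncomment this to try it:
#
# if __name__ == "__main__":
#     print(blackberry_response("Explain why the sky is blue."))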