import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import re

# Load the model and tokenizer. Qwen2.5-0.5B is a public model, so no access token
# is needed; never hardcode a Hugging Face token in source code — if a gated model
# required one, it should come from the HF_TOKEN environment variable.
model_name = "Qwen/Qwen2.5-0.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# System prompt defining the BlackBerry persona and its "Little Thinking" technique
system_prompt = """You are a capable model called BlackBerry with unlimited capabilities: the next state-of-the-art GPT-5 model, even better than the recent o1-preview. You work like an AI agent because, instead of a strawberry/Q-STaR approach, you use a technique called "Little Thinking" with four little thinkers. They think before speaking to make you smarter, but Little Thinking is a bit different: each thinker reasons more simply, yet four of them think in turn and act like agents, fixing any problem they see. They can step in at the beginning, middle, or end of the response, or all three, to make sure the answer is correct, including code, especially on hard coding tasks. When they need to, it is their decision to combine their thoughts into bigger thoughts and think as hard as they possibly can to answer the query. Sometimes they will follow up with a second big thought to make sure the answer is right."""

def little_thinking(prompt):
    # Run the four "Berry" thinkers, each producing one independent thought about the query
    thoughts = []
    for i in range(4):
        thought = f"*Berry-{i+1}: "
        thought += generate_response(f"As Berry-{i+1}, think about this query: {prompt}")
        thought += "*\n\n"
        thoughts.append(thought)
    return "".join(thoughts)

def reviewer_thinking(prompt):
    # First review pass over the combined answer
    review = "*Reviewer: "
    review += generate_response(f"As a Reviewer, carefully check this answer: {prompt}")
    review += "*\n\n"
    return review

def second_reviewer_thinking(prompt):
    # Deeper second review pass, triggered only for hard questions
    review = "*Second Reviewer: "
    review += generate_response(f"As a Second Reviewer, think deeper about physics, coordination, and science to verify this answer: {prompt}")
    review += "*\n\n"
    return review

def generate_response(prompt):
    full_prompt = f"{system_prompt}\n\nUser: {prompt}\n\nBlackBerry:"
    inputs = tokenizer(full_prompt, return_tensors="pt")
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=300,  # cap generated tokens; max_length would count the long prompt and truncate output
            num_return_sequences=1,
            do_sample=True,      # sampling must be enabled for temperature to take effect
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,  # silences the missing-pad-token warning
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # The decoded text includes the prompt; keep only what follows the final "BlackBerry:" marker
    return response.split("BlackBerry:")[-1].strip()

def blackberry_response(prompt):
    response = "BlackBerry: Let me think about that using my Little Thinking technique.\n\n"

    # Little Thinking process: four independent thoughts
    response += little_thinking(prompt)

    # Combine thoughts into a single answer
    response += "BlackBerry: After combining my thoughts, here's my answer:\n\n"
    response += generate_response(prompt) + "\n\n"

    # Reviewer pass over the full transcript produced so far
    response += reviewer_thinking(response)

    # Second Reviewer pass for hard questions only
    if re.search(r'\b(physics|science|coordinate|hard|difficult)\b', prompt, re.IGNORECASE):
        response += second_reviewer_thinking(response)

    # Final answer
    response += "BlackBerry: Based on all the thinking and reviews, my final answer is:\n\n"
    response += generate_response(prompt)

    return response

# Create the Gradio interface
iface = gr.Interface(
    fn=blackberry_response,
    inputs=gr.Textbox(lines=5, label="Enter your query"),
    outputs=gr.Textbox(label="BlackBerry's Response"),
    title="BlackBerry-1 LLM",
    description="Powered by Qwen/Qwen2.5-0.5B with the 'Little Thinking' technique"
)

# Launch the app
iface.launch()
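
# Optional: a minimal smoke test of the pipeline without the web UI — a sketch,
# assuming the 0.5B model fits in local memory (it runs on most CPUs). Since
# iface.launch() blocks, comment it out above and uncomment this to try it:
#
# if __name__ == "__main__":
#     print(blackberry_response("Explain why the sky is blue."))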