import os

import gradio as gr
import torch
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM

# Authenticate with the Hugging Face Hub (expects HF_TOKEN in the environment)
login(token=os.getenv("HF_TOKEN"))

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba2-7B")
model = AutoModelForCausalLM.from_pretrained(
    "Zyphra/Zamba2-7B",
    device_map="auto",  # Automatically handles device placement
    torch_dtype=torch.bfloat16,
)

def generate_response(input_text):
    # Tokenize the prompt and move the tensors to the model's device
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=500,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.9,
        repetition_penalty=1.2,
        num_beams=5,  # combined with do_sample=True this is beam-search multinomial sampling; slow for 500 tokens
        length_penalty=1.0,
        num_return_sequences=1,
    )
    # Note: the decoded text includes the original prompt as well as the completion
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Create the Gradio interface
demo = gr.Interface(
    fn=generate_response,
    # gr.inputs.* was removed in recent Gradio releases; use gr.Textbox directly
    inputs=gr.Textbox(lines=5, placeholder="Enter your question here..."),
    outputs="text",
    title="Zamba2-7B Model",
    description="Ask Zamba2 7B a question.",
)

if __name__ == "__main__":
    demo.launch()
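
# A minimal smoke test, sketched below: once the app is running, its endpoint
# can be queried from a separate process with gradio_client. The URL assumes
# Gradio's default local port (7860) and should be adjusted for your
# deployment (e.g., a Space URL); "/predict" is the api_name a gr.Interface
# registers by default.
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860/")
#   result = client.predict("What is a state-space model?", api_name="/predict")
#   print(result)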