"""Minimal Gradio demo: serve text generation from a fine-tuned Gemma-2 2B model."""

import torch
import transformers
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

# Fall back to CPU so the demo still starts on machines without a GPU
# (bfloat16 inference also works on CPU, just slower).
device = "cuda" if torch.cuda.is_available() else "cpu"

model_name = "thfname/test-gemma2-2b"
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.bfloat16
).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)


def gt(tx):
    """Generate a short continuation of the prompt *tx*.

    Args:
        tx: Prompt string coming from the Gradio text input.

    Returns:
        The decoded model output (prompt plus up to 32 new tokens) with
        special tokens such as <bos>/<eos> stripped so they do not leak
        into the UI.
    """
    inputs = tokenizer(tx, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=32)
    # skip_special_tokens=True keeps marker tokens out of the visible text.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


demo = gr.Interface(fn=gt, inputs="text", outputs="text")
demo.launch()