from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load tokenizer and model from the current directory
tokenizer = AutoTokenizer.from_pretrained(".", use_auth_token=None)
model = AutoModelForCausalLM.from_pretrained(".", use_auth_token=None)

# Set device and move the model onto it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Example text input
text_input = "How is QoS applied on routers?"

# Build the prompt using the model's chat markers
p = """
<|system|>
You are a helpful assistant.<|end|>
<|user|>""" + text_input + """<|end|>
<|assistant|>
"""

# Tokenize and move the input to the device
inputs = tokenizer(p, return_tensors="pt")
inputs = inputs.to(device)

print("User Query: " + text_input)

# Generate text on the device
outputs = model.generate(**inputs, max_length=2000, num_return_sequences=1)

print("Model Response: ")

# Decode the generated text
for output in outputs:
    generated_text = tokenizer.decode(output, skip_special_tokens=True)
    print(generated_text)
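
# Alternative prompt construction (a minimal sketch, assuming the local tokenizer
# ships a chat template matching the <|system|>/<|user|>/<|assistant|> markers
# hard-coded above): let the tokenizer build the prompt instead of concatenating strings.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": text_input},
]
chat_inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,  # append the assistant turn marker
    return_tensors="pt",
).to(device)
chat_outputs = model.generate(chat_inputs, max_new_tokens=500, num_return_sequences=1)
print(tokenizer.decode(chat_outputs[0], skip_special_tokens=True))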