---
base_model: unsloth/llama-3-8b-bnb-4bit
library_name: peft 0.13.2
license: mit
datasets:
- yahma/alpaca-cleaned
language:
- en
---

How to use:

Install the dependencies (in a notebook, prefix the command with `!`):

```bash
pip install --no-deps packaging ninja einops peft accelerate bitsandbytes
```

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
from peft import PeftModel, PeftConfig

# Hub repositories: the base checkpoint and the PEFT weights loaded on top of it
BASE_MODEL_ID = "unsloth/llama-3-8b-bnb-4bit"
ADAPTER_ID = "Vijayendra/llama3.0-8B-merged-4bit"

# Load model and tokenizer configurations
config = PeftConfig.from_pretrained(ADAPTER_ID)
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID)
model = PeftModel.from_pretrained(base_model, ADAPTER_ID)
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_ID)

# Ensure padding token is set for the tokenizer
# (the tokenizer is called with padding=True below, which needs a pad token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


# Define the inference function with TextStreamer
def generate_answer_with_stream(model, tokenizer, text, max_new_tokens=1024, temperature=0.5, top_k=40, top_p=0.9):
    """Sample an answer to `text` and stream it as it is generated.

    The generated token ids are not returned; the output is only emitted
    through the TextStreamer.

    Args:
        model: Model whose `generate` method is called; inputs are moved to `model.device`.
        tokenizer: Tokenizer used to encode the prompt and to decode the streamed tokens.
        text: The question to insert into the prompt.
        max_new_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature passed to `generate`.
        top_k: Top-k sampling cutoff passed to `generate`.
        top_p: Nucleus (top-p) sampling cutoff passed to `generate`.
    """
    prompt = f"Answer the following question\n\n{text}\n\nQuestion:"

    # Tokenize the input text
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)

    # Initialize the TextStreamer
    streamer = TextStreamer(tokenizer)

    # Generate answer using the model with streaming
    with torch.no_grad():
        model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=True,
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=1.2,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            streamer=streamer,  # Stream output as it's generated
        )


# Input Question
question = "What is quantum mechanics?"

# Generate and print answer
generate_answer_with_stream(model, tokenizer, question)
```