Llama-3.1-8B-african-aya
- Developed by: vutuka
- License: apache-2.0
- Finetuned from model: unsloth/meta-llama-3.1-8b-bnb-4bit
This Llama model was trained 2x faster with Unsloth and Hugging Face's TRL library.
Unsloth Inference (2x Faaaaster)
%%capture
# Install Unsloth from its GitHub repository, then the pinned xformers and trl
# releases together with peft, accelerate and bitsandbytes. The second command
# uses --no-deps so pip does not pull in those packages' own dependencies.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes
# Loading options passed to FastLanguageModel.from_pretrained below.
max_seq_length = 4096
dtype = None
load_in_4bit = True  # 4-bit quantization lowers memory usage.

# Alpaca-style template with three positional slots, in order:
# instruction, input, response (the response slot is left empty at inference).
# NOTE(review): the sections are separated by single newlines here; confirm
# this matches the template used when the model was fine-tuned.
alpaca_prompt = "\n".join(
    [
        "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.",
        "### Instruction:",
        "{}",
        "### Input:",
        "{}",
        "### Response:",
        "{}",
    ]
)
## Load the quantized model
from unsloth import FastLanguageModel

# Collect the loader arguments in one place before handing them to Unsloth.
load_options = dict(
    model_name="vutuka/Llama-3.1-8B-african-aya",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

# Put the loaded model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)
def llama_african_aya(input: str = "", instruction: str = "") -> str:
    """Generate a response from the fine-tuned model for one Alpaca-style prompt.

    Relies on the module-level ``alpaca_prompt``, ``tokenizer`` and ``model``.

    Args:
        input: Text placed in the "### Input:" slot of the prompt template.
        instruction: Text placed in the "### Instruction:" slot.

    Returns:
        The decoded text that follows the first "### Response:" marker, with
        surrounding whitespace removed. If the marker is missing from the
        decoded output, the whole decoded text is returned (stripped).
    """
    marker = "### Response:"

    # The response slot is left empty so the model continues from the marker.
    prompt = alpaca_prompt.format(instruction, input, "")
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    # The previous version built an unused TextStreamer here; that name was
    # never imported, so every call failed with NameError.
    output = model.generate(**inputs, max_new_tokens=1024)

    # The decoded sequence contains the prompt followed by the completion.
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # partition() keeps the "marker not found" case explicit; the old
    # find() + len() arithmetic silently chopped off the first 12 characters.
    _, found, tail = generated_text.partition(marker)
    if not found:
        return generated_text.strip()
    return tail.strip()
# Example call: the request is passed as the input text and the instruction
# slot is left empty.
llama_african_aya(
    input="Àwọn ajínigbé méjì ni wọ́n mú ní Supare Akoko, ṣàlàyé ìtàn náà.",
    instruction="",
)
LlamaCPP Code
# Install llama-cpp-python, passing CMake flags through CMAKE_ARGS so the build
# enables BLAS with OpenBLAS as the vendor.
CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" \
pip install llama-cpp-python
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
## Download the GGUF model
model_name = "vutuka/Llama-3.1-8B-african-aya"
model_file = "llama-3.1-8B-african-aya.Q8_0.gguf"
model_path = hf_hub_download(model_name, filename=model_file)

## Instantiate the model from the downloaded file
llm = Llama(
    model_path=model_path,
    n_ctx=4096,
    n_gpu_layers=-1,
    n_batch=512,
    verbose=False,
)

## Run inference
# Template slots, in order: instruction, input, response (left empty here).
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}
### Input:
{}
### Response:
{}"""
prompt = alpaca_prompt.format(
    "",
    "Àwọn ajínigbé méjì ni wọ́n mú ní Supare Akoko, ṣàlàyé ìtàn náà.",
    "",
)

# llama-cpp-python's completion call defaults to max_tokens=16, which cuts the
# answer off after a few words. Ask for up to 1024 new tokens instead, the same
# budget the Unsloth example uses (max_new_tokens=1024); it fits within n_ctx.
res = llm(prompt, max_tokens=1024)  # res (short for "result") is a dictionary

## Unpack the generated text from the response dictionary and print it
print(res["choices"][0]["text"])
- Downloads last month
- 401
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social
visibility and check back later, or deploy to Inference Endpoints (dedicated)
instead.