import time
from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles
import gradio as gr
from huggingface_hub import hf_hub_download

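# Module-level cache: the loaded llama.cpp model and the filename it was loaded
# from, so the model is only reloaded when the dropdown selection changes.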
llm = None
llm_model = None

# Download the GGUF model weights from the Hugging Face Hub into ./models at startup
hf_hub_download(
    repo_id="hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF",
    filename="llama-3.2-1b-instruct-q4_k_m.gguf",
    local_dir="./models"
)

def get_messages_formatter_type(model_name):
    # Every model offered in the dropdown is a Llama 3.x instruct build,
    # so the Llama 3 chat template is always the right formatter.
    return MessagesFormatterType.LLAMA_3

def respond(
    message,
    history: list[tuple[str, str]],
    model,
    system_message,
    max_tokens,
    temperature,
    top_p,
    top_k,
    repeat_penalty,
):
    global llm
    global llm_model
    
    chat_template = get_messages_formatter_type(model)
    
    if llm is None or llm_model != model:
        llm = Llama(
            model_path=f"models/{model}",
            n_gpu_layers=0,  # CPU-only; raise this to offload layers to a GPU
            n_batch=32398,   # prompt-processing batch size; adjust based on available RAM
            n_ctx=512,       # context window; adjust based on RAM and the context length you need
        )
        llm_model = model
    
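    # LlamaCppPythonProvider wraps the llama.cpp model so LlamaCppAgent can drive it.
    # A fresh agent is built on every request; conversation state is replayed from
    # the Gradio history below, so the agent itself stays stateless.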
    provider = LlamaCppPythonProvider(llm)

    agent = LlamaCppAgent(
        provider,
        system_prompt=f"{system_message}",
        predefined_messages_formatter_type=chat_template,
        debug_output=True
    )
    
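    # Per-request sampling settings, populated from the Gradio sliders. stream=True
    # asks the provider to stream tokens; combined with returns_streaming_generator=True
    # below, the response can be yielded to the UI incrementally.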
    settings = provider.get_provider_default_settings()
    settings.temperature = temperature
    settings.top_k = top_k
    settings.top_p = top_p
    settings.max_tokens = max_tokens
    settings.repeat_penalty = repeat_penalty
    settings.stream = True

    messages = BasicChatHistory()

    # Replay prior turns from the Gradio history into the agent's chat history
    for msg in history:
        user = {
            'role': Roles.user,
            'content': msg[0]
        }
        assistant = {
            'role': Roles.assistant,
            'content': msg[1]
        }
        messages.add_message(user)
        messages.add_message(assistant)
    
    start_time = time.time()
    token_count = 0

    stream = agent.get_chat_response(
        message,
        llm_sampling_settings=settings,
        chat_history=messages,
        returns_streaming_generator=True,
        print_output=False
    )
    
    outputs = ""
    for output in stream:
        outputs += output
        token_count += len(output.split()) 
        yield outputs

    end_time = time.time()
    latency = end_time - start_time
    speed = token_count / (end_time - start_time)
    print(f"Latency: {latency} seconds")
    print(f"Speed: {speed} tokens/second")

description = """<p><center>
<a href="https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF" target="_blank">[Meta Llama 3.2 (1B)]</a>

Meta Llama 3.2 (1B) is a multilingual large language model (LLM) optimized for conversational dialogue use cases, including agentic retrieval and summarization tasks. It outperforms many open-source and closed chat models on industry benchmarks, and is intended for commercial and research use in multiple languages.

</center></p>
"""

demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Dropdown([
                "llama-3.2-1b-instruct-q4_k_m.gguf"
            ],
            value="llama-3.2-1b-instruct-q4_k_m.gguf",
            label="Model"
        ),
        gr.TextArea(value="""You are Meta Llama 3.2 (1B), an advanced AI assistant created by Meta. Your capabilities include:

1. Complex reasoning and problem-solving
2. Multilingual understanding and generation
3. Creative and analytical writing
4. Code understanding and generation
5. Task decomposition and step-by-step guidance
6. Summarization and information extraction

Always strive for accuracy, clarity, and helpfulness in your responses. If you're unsure about something, express your uncertainty.""", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=2.0,
            value=0.9,
            step=0.05,
            label="Top-p",
        ),
        gr.Slider(
            minimum=0,
            maximum=100,
            value=1,
            step=1,
            label="Top-k",
        ),
        gr.Slider(
            minimum=0.0,
            maximum=2.0,
            value=1.1,
            step=0.1,
            label="Repetition penalty",
        ),
    ],
    theme=gr.themes.Soft(primary_hue="violet", secondary_hue="violet", neutral_hue="gray",font=[gr.themes.GoogleFont("Exo"), "ui-sans-serif", "system-ui", "sans-serif"]).set(
        body_background_fill_dark="#16141c",
        block_background_fill_dark="#16141c",
        block_border_width="1px",
        block_title_background_fill_dark="#1e1c26",
        input_background_fill_dark="#292733",
        button_secondary_background_fill_dark="#24212b",
        border_color_accent_dark="#343140",
        border_color_primary_dark="#343140",
        background_fill_secondary_dark="#16141c",
        color_accent_soft_dark="transparent",
        code_background_fill_dark="#292733",
    ),
    title="Meta Llama 3.2 (1B)",
    description=description,
    chatbot=gr.Chatbot(
        scale=1, 
        likeable=True,
        show_copy_button=True
    ),
    examples=[
        ["Hello! Can you introduce yourself?"],
        ["What's the capital of France?"],
        ["Can you explain the concept of photosynthesis?"],
        ["Write a short story about a robot learning to paint."],
        ["Explain the difference between machine learning and deep learning."],
        ["Summarize the key points of climate change and its global impact."],
        ["Explain quantum computing to a 10-year-old."],
        ["Design a step-by-step meal plan for someone trying to lose weight and build muscle."]
    ],
    cache_examples=False,
    autofocus=False,
    concurrency_limit=None
)

if __name__ == "__main__":
    demo.launch()
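
# A rough way to run this Space locally (assumes llama-cpp-python, llama-cpp-agent,
# gradio and huggingface_hub are installed, e.g. pinned in the Space's requirements.txt):
#
#   pip install llama-cpp-python llama-cpp-agent gradio huggingface_hub
#   python app.py
#
# Gradio serves the chat UI at http://127.0.0.1:7860 by default; the GGUF weights are
# downloaded into ./models at startup if they are not already present.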