File size: 5,310 Bytes
576474f
 
 
 
 
 
992ad15
2ae746c
 
576474f
 
 
 
 
2ae746c
a2232d8
576474f
a2232d8
bd16ace
 
a2232d8
e80da7c
 
 
 
 
 
2ae746c
576474f
 
 
 
10006aa
576474f
 
 
 
 
 
 
 
10006aa
2ae746c
 
10006aa
2fb3212
81c62e1
 
 
 
2ae746c
2fb3212
992ad15
493f720
5cb071c
493f720
992ad15
576474f
992ad15
 
ce6efbb
992ad15
576474f
992ad15
 
 
 
a2232d8
992ad15
 
 
 
 
 
 
 
 
 
 
 
 
 
3c5e66e
9f4ac5e
3c5e66e
f8a999d
9f4ac5e
 
f8a999d
 
 
 
 
 
 
 
 
 
 
 
992ad15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2ae746c
992ad15
a2232d8
992ad15
 
 
 
 
 
2ae746c
 
992ad15
 
ce6efbb
 
992ad15
 
 
 
a2232d8
992ad15
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#!/usr/bin/env python

import os
import requests
from threading import Thread
from typing import Iterator

import gradio as gr
import psutil
import spaces
import torch
from time import time
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from llama_cpp import Llama

# Model selection: use the `model_uri_hf` variable everywhere below.
#
# The URL is taken from the MODEL_URI_HF environment variable. When it is
# unset or empty we fall back to a known-good default so the demo still starts.
# (Previously the env value was read and then unconditionally overwritten by
# hard-coded debug URLs, so setting MODEL_URI_HF had no effect.)
#
# NOTE(review): the original kept a ".../blob/main/..." variant that was
# immediately replaced by the ".../resolve/main/..." one; presumably only the
# "resolve" form serves the raw file - confirm before documenting this to users.
#
# TODO: show a warning when the variable is empty, with a brief description of how to set it
# TODO: also add link to "how to search" with link to bloke by default + example search link + example full value (mistral base?)
# TODO: info about ram requirements
# TODO: maybe use git lfs to dl instead?
DEFAULT_MODEL_URI_HF = "https://huggingface.co/TheBloke/neural-chat-7B-v3-2-GGUF/resolve/main/neural-chat-7b-v3-2.Q2_K.gguf"

# `or` (rather than a getenv default) so that an empty string also falls back.
model_uri_hf = os.getenv("MODEL_URI_HF") or DEFAULT_MODEL_URI_HF


# Initing things
print(f"debug: init model: {model_uri_hf}")

# Download the model once; later starts reuse the local 'model.bin'.
if not os.path.isfile('model.bin'):
    print("debug: can't find model locally, downloading ...")
    # Stream the body to disk in chunks instead of buffering it all in memory
    # via `response.content`: model files can be very large, and the process
    # needs that RAM to load the model afterwards.
    partial_path = 'model.bin.part'
    with requests.get(model_uri_hf, stream=True, timeout=60) as response:
        # Fail loudly on 4xx/5xx instead of saving an error page as the model.
        response.raise_for_status()
        with open(partial_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                file.write(chunk)
    # Rename only after the download finished, so an interrupted download
    # never leaves a truncated 'model.bin' that would be picked up next start.
    os.replace(partial_path, 'model.bin')

llm = Llama(model_path="./model.bin")                             # LLaMa model
print("debug: model loaded and ready")

# Static page texts. `title` and `descr` are Markdown strings; the model URL
# is embedded in the title so visitors can see which model is being served.
title = f"# Demo for 7B Models - Quantized {model_uri_hf}"
descr = (
    "\n"
    "Quantized to run in the free tier hosting. \n"
    "Have a quick way to test models or share them with others without hassle.\n"
    "It runs slow, as it's on cpu. Usable for basic tests.\n"
    "It uses quantized models in gguf-Format and llama.cpp to run them.\n"
    "\n"
    "Powered by ..."
)

# Log the host's memory situation at startup (bytes converted to GiB).
_BYTES_PER_GIB = 1024.0 ** 3
print(f"DEBUG: Memory free: {psutil.virtual_memory().free / _BYTES_PER_GIB} GiB")
print(f"DEBUG: Memory available: {psutil.virtual_memory().available / _BYTES_PER_GIB} GiB")
print(f"DEBUG: Memory: {psutil.virtual_memory().total / _BYTES_PER_GIB} GiB")

DESCRIPTION = f"# Test model: {model_uri_hf}"

# NOTE(review): the CPU-only notice is appended when CUDA *is* available.
# Confirm whether that is intended or whether the condition should be negated.
if torch.cuda.is_available():
    DESCRIPTION = DESCRIPTION + "\n<p>This space is using CPU only. Use a different one if you want to go fast and use GPU. </p>"

# Generation limits.
# todo - probably lower. like 200 in and maybe 500 out? Should be ok for quick test
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.environ.get("MAX_INPUT_TOKEN_LENGTH", "4096"))

# we need to make sure we only run one thread or we probably run out of ram
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    max_new_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> Iterator[str]:
    """Stream the model's reply to ``message``.

    Args:
        message: The new user message.
        chat_history: Earlier ``(user, assistant)`` turns, oldest first.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        top_k: Number of highest-probability tokens considered per step.
        repetition_penalty: Penalty applied to repeated tokens.

    Yields:
        The reply accumulated so far, once per streamed chunk that carries
        text, so the last yielded value is the complete reply.
    """
    conversation = []
    for user, assistant in chat_history:
        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": message})

    # Forward the sampling settings; previously they were accepted but never
    # passed on, so the UI sliders had no effect on generation.
    chat_completion = llm.create_chat_completion(
        conversation,
        stream=True,
        max_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repeat_penalty=repetition_penalty,
    )

    reply = ""
    for completion in chat_completion:
        # Some chunks carry no text (e.g. a delta without "content"); skip them.
        piece = completion["choices"][0]["delta"].get("content")
        if piece is None:
            continue
        reply += piece
        yield reply


# One row per generation setting exposed in the UI, in the same order as the
# extra positional parameters of `generate`:
# (label, minimum, maximum, step, initial value)
_SLIDER_SPECS = (
    ("Max new tokens", 1, MAX_MAX_NEW_TOKENS, 1, DEFAULT_MAX_NEW_TOKENS),
    ("Temperature", 0.1, 4.0, 0.1, 0.6),
    ("Top-p (nucleus sampling)", 0.05, 1.0, 0.05, 0.9),
    ("Top-k", 1, 1000, 1, 50),
    ("Repetition penalty", 1.0, 2.0, 0.05, 1.2),
)

# Chat UI wired to `generate`, with one slider per entry in _SLIDER_SPECS.
chat_interface = gr.ChatInterface(
    fn=generate,
    additional_inputs=[
        gr.Slider(
            label=slider_label,
            minimum=lowest,
            maximum=highest,
            step=increment,
            value=initial,
        )
        for slider_label, lowest, highest, increment, initial in _SLIDER_SPECS
    ],
    stop_btn=None,
    # add more eval examples, like a long list taken from teknium and others maybe group by type
    examples=[
        ["Hello there! How are you doing?"],
        ["Can you explain briefly to me what is the Python programming language?"],
        ["Explain the plot of Cinderella in a sentence."],
        ["How many hours does it take a man to eat a Helicopter?"],
        ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
    ],
)

# Page layout: title, description, optional "duplicate" button, then the chat.
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(title)
    gr.Markdown(descr)
    # The button is only shown when SHOW_DUPLICATE_BUTTON is exactly "1".
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_id="duplicate-button",
        visible=(os.environ.get("SHOW_DUPLICATE_BUTTON") == "1"),
    )
    chat_interface.render()

if __name__ == "__main__":
    # Enable the request queue (at most 20 waiting requests), then start serving.
    demo.queue(max_size=20)
    demo.launch()