import gradio as gr
from gpt4all import GPT4All
from huggingface_hub import hf_hub_download
import subprocess

title = "Apollo-7B-GGUF Run On CPU"
 
description = """
🔎 [Apollo-7B](https://huggingface.co/FreedomIntelligence/Apollo-7B) in [GGUF format](https://huggingface.co/FreedomIntelligence/Apollo-7B-GGUF), the 8-bit quantized (balanced-quality) version, running on CPU with [llama.cpp](https://github.com/ggerganov/llama.cpp) and [gpt4all](https://github.com/nomic-ai/gpt4all).

🔨 Running on free CPU-Basic hardware. We suggest duplicating this Space to run without a queue.

Mistral does not currently support a system prompt token (such as ```<<SYS>>```); if you need one, put your system prompt in the first message. Learn more: [Guardrailing Mistral 7B](https://docs.mistral.ai/usage/guardrailing).
"""

"""
[Model From TheBloke/Mistral-6B-Instruct-v0.1-GGUF](https://huggingface.co/FreedomIntelligence/Apollo-6B-GGUF)
[Mistral-instruct-v0.1 System prompt](https://docs.mistral.ai/usage/guardrailing)
"""

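# Download the 8-bit quantized GGUF weights from the Hugging Face Hub into the
# local "models" directory so they can be loaded (and shelled out to) from disk.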
model_path = "models"
model_name = "Apollo-6B-q8_0.gguf"
hf_hub_download(repo_id="FreedomIntelligence/Apollo-6B-GGUF", filename=model_name, local_dir=model_path, local_dir_use_symlinks=False)

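# Load the GGUF file with GPT4All on CPU; allow_download=False ensures the file
# fetched above is used rather than being downloaded again by GPT4All.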
print("Start the model init process")
model = GPT4All(model_name, model_path, allow_download=False, device="cpu")
print("Finish the model init process")

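# Override the prompt template and system prompt, and disable GPT4All's built-in
# chat session so the full prompt can be assembled manually in generater() below.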
model.config["promptTemplate"] = "{0}"
model.config["systemPrompt"] = "You are a multiligual AI doctor, your name is Apollo."
model._is_chat_session_activated = False

max_new_tokens = 2048

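# The original GPT4All streaming implementation is kept below for reference;
# the active generater() instead shells out to llama.cpp's ./main binary.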
# def generater(message, history, temperature, top_p, top_k):
#     prompt = "<s>"
#     for user_message, assistant_message in history:
#         prompt += model.config["promptTemplate"].format(user_message)
#         prompt += assistant_message + "</s>"
#     prompt += model.config["promptTemplate"].format(message)
#     outputs = []    
#     for token in model.generate(prompt=prompt, temp=temperature, top_k = top_k, top_p = top_p, max_tokens = max_new_tokens, streaming=True):
#         outputs.append(token)
#         yield "".join(outputs)
def generater(message, history, temperature, top_p, top_k):
    # Build the prompt from the chat history plus the new message.
    prompt = ""
    for user_message, assistant_message in history:
        prompt += model.config["promptTemplate"].format(user_message)
        prompt += assistant_message
    prompt += model.config["promptTemplate"].format(message)

    # Debug: print the final prompt to verify it is built correctly.
    print(f"Final prompt: {prompt}")

    # Run the llama.cpp `main` binary; --temp/--top-k/--top-p/-n are its
    # standard sampling flags, so the UI sliders actually take effect.
    cmd = [
        "./main",
        "-m", model_path + "/" + model_name,
        "--prompt", prompt,
        "--temp", str(temperature),
        "--top-k", str(top_k),
        "--top-p", str(top_p),
        "-n", str(max_new_tokens),
    ]

    # Call ./main via subprocess.Popen and stream its stdout.
    process = subprocess.Popen(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
    )

    # Initial placeholder while the first tokens are produced.
    yield "Generating response..."
    # Read the output line by line, yielding the accumulated text so the
    # chat message grows instead of being replaced by each new line.
    outputs = []
    try:
        while True:
            line = process.stdout.readline()
            if not line:
                break  # No more output: generation is finished.
            print(f"Generated line: {line.strip()}")  # Debug: print each generated line
            outputs.append(line)
            yield "".join(outputs)
    except Exception as e:
        print(f"Error during generation: {e}")
        yield "Sorry, an error occurred while generating the response."
    finally:
        process.wait()


def vote(data: gr.LikeData):
    # Placeholder for like/dislike feedback; no action is taken yet.
    if data.liked:
        return
    else:
        return

chatbot = gr.Chatbot(avatar_images=('resourse/user-icon.png', 'resourse/chatbot-icon.png'), bubble_full_width=False)

additional_inputs=[
    gr.Slider(
        label="temperature",
        value=0.5,
        minimum=0.0,
        maximum=2.0,
        step=0.05,
        interactive=True,
        info="Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.",
    ),
    gr.Slider(
        label="top_p",
        value=1.0,
        minimum=0.0,
        maximum=1.0,
        step=0.01,
        interactive=True,
        info="0.1 means only the tokens comprising the top 10% probability mass are considered. Suggest set to 1 and use temperature. 1 means 100% and will disable it",
    ),
    gr.Slider(
        label="top_k",
        value=40,
        minimum=0,
        maximum=1000,
        step=1,
        interactive=True,
        info="limits candidate tokens to a fixed number after sorting by probability. Setting it higher than the vocabulary size deactivates this limit.",
    )
]

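# Wire the chat UI: generater() receives the message, the history, and the three
# sampling sliders (temperature, top_p, top_k) passed via additional_inputs.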
iface = gr.ChatInterface(
    fn=generater,
    title=title,
    description=description,
    chatbot=chatbot,
    additional_inputs=additional_inputs,
    examples=[
        ["枸杞有什么疗效"],
        ["I've taken several courses of antibiotics for recurring infections, and now they seem less effective. Am I developing antibiotic resistance?"],
    ]
)

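# Render the ChatInterface inside a Blocks wrapper so the custom CSS can be
# applied and the like/dislike callback can be attached to the chatbot.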
with gr.Blocks(css="resourse/style/custom.css") as demo:
    chatbot.like(vote, None, None)
    iface.render()

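# queue(max_size=3) caps the number of pending requests, which keeps the free
# CPU-Basic hardware from being overloaded.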
if __name__ == "__main__":
    demo.queue(max_size=3).launch()