import gradio as gr
from gpt4all import GPT4All
from huggingface_hub import hf_hub_download
import subprocess
import asyncio
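# Text shown in the header of the Gradio chat interface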
title = "Apollo-7B-GGUF Running on CPU"
description = """
🔎 [Apollo-7B](https://huggingface.co/FreedomIntelligence/Apollo-7B) in [GGUF format](https://huggingface.co/FreedomIntelligence/Apollo-7B-GGUF), 8-bit quantized for a balance of quality and size, running on CPU. Built with [llama.cpp](https://github.com/ggerganov/llama.cpp) and [gpt4all](https://github.com/nomic-ai/gpt4all).
🔨 Running on the free CPU-Basic hardware tier. Duplicate this Space to run it without a queue.
Mistral does not currently support a system prompt token (such as ```<<SYS>>```); if you need one, include your system prompt in the first message. Learn more: [Guardrailing Mistral 7B](https://docs.mistral.ai/usage/guardrailing).
"""
"""
[Model From TheBloke/Mistral-6B-Instruct-v0.1-GGUF](https://huggingface.co/FreedomIntelligence/Apollo-6B-GGUF)
[Mistral-instruct-v0.1 System prompt](https://docs.mistral.ai/usage/guardrailing)
"""
model_path = "models"
model_name = "Apollo-6B-q8_0.gguf"
hf_hub_download(repo_id="FreedomIntelligence/Apollo-6B-GGUF", filename=model_name, local_dir=model_path, local_dir_use_symlinks=False)
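# Load the GGUF model with GPT4All on CPU; allow_download=False because the weights were fetched above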
print("Start the model init process")
model = GPT4All(model_name, model_path, allow_download=False, device="cpu")
print("Finish the model init process")
model.config["promptTemplate"] = "{0}"
model.config["systemPrompt"] = "You are a multiligual AI doctor, your name is Apollo."
model._is_chat_session_activated = False
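# Generation length limit used by the in-process GPT4All generator below (currently commented out)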
max_new_tokens = 2048
# def generater(message, history, temperature, top_p, top_k):
#     prompt = "<s>"
#     for user_message, assistant_message in history:
#         prompt += model.config["promptTemplate"].format(user_message)
#         prompt += assistant_message + "</s>"
#     prompt += model.config["promptTemplate"].format(message)
#     outputs = []
#     for token in model.generate(prompt=prompt, temp=temperature, top_k=top_k, top_p=top_p, max_tokens=max_new_tokens, streaming=True):
#         outputs.append(token)
#         yield "".join(outputs)
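# Stream a response by shelling out to the llama.cpp `main` binary instead of using the in-process GPT4All generator above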
async def generater(message, history, temperature, top_p, top_k):
    # Build the prompt from the chat history plus the new user message
    prompt = ""
    for user_message, assistant_message in history:
        prompt += model.config["promptTemplate"].format(user_message)
        prompt += assistant_message
    prompt += model.config["promptTemplate"].format(message)
    # Debug: print the final prompt to verify it is correct
    print(f"Final prompt: {prompt}")
    # Note: temperature, top_p and top_k are not forwarded to the llama.cpp CLI here
    cmd = [
        "./main",
        "-m", model_path + "/" + model_name,
        "--prompt", prompt
    ]
    # Call ./main via subprocess.Popen and stream its stdout
    process = subprocess.Popen(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
    )
    # Initial placeholder shown while the first tokens are generated
    yield "Generating response..."
    # Read the output line by line; each yield replaces the displayed message
    try:
        output = ""
        while True:
            line = process.stdout.readline()
            if not line:
                break  # no more output, stop the loop
            print(f"Generated line: {line.strip()}")  # Debug: print each generated line
            output += line
            yield output
    except Exception as e:
        print(f"Error during generation: {e}")
        yield "Sorry, an error occurred while generating the response."
def vote(data: gr.LikeData):
    if data.liked:
        return
    else:
        return
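# Chat window with custom avatars, plus sampling-parameter sliders exposed as additional inputs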
chatbot = gr.Chatbot(avatar_images=('resourse/user-icon.png', 'resourse/chatbot-icon.png'), bubble_full_width=False)
additional_inputs = [
    gr.Slider(
        label="temperature",
        value=0.5,
        minimum=0.0,
        maximum=2.0,
        step=0.05,
        interactive=True,
        info="Higher values like 0.8 make the output more random, while lower values like 0.2 make it more focused and deterministic.",
    ),
    gr.Slider(
        label="top_p",
        value=1.0,
        minimum=0.0,
        maximum=1.0,
        step=0.01,
        interactive=True,
        info="0.1 means only the tokens comprising the top 10% probability mass are considered. A value of 1 (100%) disables top-p; suggested to leave it at 1 and control randomness with temperature.",
    ),
    gr.Slider(
        label="top_k",
        value=40,
        minimum=0,
        maximum=1000,
        step=1,
        interactive=True,
        info="Limits candidate tokens to a fixed number after sorting by probability. Setting it higher than the vocabulary size deactivates this limit.",
    )
]
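# Assemble the chat interface: streaming generator, UI copy, sliders, and example prompts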
iface = gr.ChatInterface(
    fn=generater,
    title=title,
    description=description,
    chatbot=chatbot,
    additional_inputs=additional_inputs,
    examples=[
        ["枸杞有什么疗效"],  # Chinese example: "What are the medicinal effects of goji berries?"
        ["I've taken several courses of antibiotics for recurring infections, and now they seem less effective. Am I developing antibiotic resistance?"],
    ]
)
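# Wrap the ChatInterface in a Blocks layout so custom CSS and the like/dislike callback can be attached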
with gr.Blocks(css="resourse/style/custom.css") as demo:
    chatbot.like(vote, None, None)
    iface.render()

if __name__ == "__main__":
    demo.queue(max_size=3).launch()