import gradio as gr
from gpt4all import GPT4All
from huggingface_hub import hf_hub_download
import subprocess
import asyncio
import os
import stat
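# Gradio chat demo for Apollo-6B (GGUF): downloads the quantized model from the
# Hugging Face Hub and streams responses by running the bundled llama.cpp `main`
# binary as an asynchronous subprocess, decoding its stdout on the fly.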
title = "Apollo-6B-GGUF Run On CPU"
description = """
🔎 [Apollo-6B](https://huggingface.co/FreedomIntelligence/Apollo-6B) in [GGUF format](https://huggingface.co/FreedomIntelligence/Apollo-6B-GGUF), the 8-bit quantized version (balanced quality), running on CPU. Built with [GitHub - llama.cpp](https://github.com/ggerganov/llama.cpp) and [GitHub - gpt4all](https://github.com/nomic-ai/gpt4all).
🔨 Running on the CPU-Basic free hardware tier. We suggest duplicating this Space to run it without a queue.
"""
"""
[Model From FreedomIntelligence/Apollo-6B-GGUF](https://huggingface.co/FreedomIntelligence/Apollo-6B-GGUF)
"""
model_path = "models"
model_name = "Apollo-6B-q8_0.gguf"
hf_hub_download(repo_id="FreedomIntelligence/Apollo-6B-GGUF", filename=model_name, local_dir=model_path, local_dir_use_symlinks=False)
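# `main` is assumed to be the prebuilt llama.cpp CLI shipped next to this script;
# mark it executable so it can be launched as a subprocess.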
current_dir = os.path.dirname(os.path.realpath(__file__))
main_path = os.path.join(current_dir, 'main')
os.chmod(main_path, os.stat(main_path).st_mode | stat.S_IEXEC)
print("Start the model init process")
model = GPT4All(model_name, model_path, allow_download=False, device="cpu")
print("Finish the model init process")
model.config["promptTemplate"] = "{0}"
model.config["systemPrompt"] = "You are a multilingual AI doctor, your name is Apollo."
model._is_chat_session_activated = False
max_new_tokens = 2048
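# Note: max_new_tokens is only used by the GPT4All-based generater kept below for
# reference; the active subprocess-based generater does not pass a token limit to `main`.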
# def generater(message, history, temperature, top_p, top_k):
# prompt = "<s>"
# for user_message, assistant_message in history:
# prompt += model.config["promptTemplate"].format(user_message)
# prompt += assistant_message + "</s>"
# prompt += model.config["promptTemplate"].format(message)
# outputs = []
# for token in model.generate(prompt=prompt, temp=temperature, top_k = top_k, top_p = top_p, max_tokens = max_new_tokens, streaming=True):
# outputs.append(token)
# yield "".join(outputs)
# async def generater(message, history, temperature, top_p, top_k):
#     # Build the prompt from the chat history
# prompt = ""
# for user_message, assistant_message in history:
# prompt += model.config["promptTemplate"].format(user_message)
# prompt += assistant_message
# prompt += model.config["promptTemplate"].format(message)
#     # Debug: print the final prompt to verify it is correct
# print(f"Final prompt: {prompt}")
# cmd = [
# main_path,
# "-m",os.path.join(model_path, model_name),
# "--prompt", prompt
# ]
#     # Call ./main with subprocess.Popen and stream its output
# process = subprocess.Popen(
# cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
# )
#     # Initial placeholder output
#     yield "Generating response..."
#     # Wait for and process the output
# try:
# while True:
# line = process.stdout.readline()
# if not line:
#                 break # no more output, exit the loop
#             print(f"Generated line: {line.strip()}") # Debug: print each generated line
# yield line
# except Exception as e:
# print(f"Error during generation: {e}")
# yield "Sorry, an error occurred while generating the response."
async def generater(message, history, temperature, top_p, top_k):
    # Build the prompt from the chat history
prompt = ""
for user_message, assistant_message in history:
prompt += model.config["promptTemplate"].format(user_message)
prompt += assistant_message
prompt += model.config["promptTemplate"].format(message)
    # Debug: print the final prompt to verify it is correct
    print(f"Final prompt: {prompt}\n")
    cmd = [
        main_path, # the llama.cpp binary made executable above
        "-m", os.path.join(model_path, model_name),
        "--prompt", prompt
    ]
    # Note: temperature, top_p and top_k are accepted by this function but are not
    # currently forwarded to the llama.cpp command line.
    # Spawn the subprocess asynchronously
process = await asyncio.create_subprocess_exec(
*cmd,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
        #text=True, # would return str output, but asyncio subprocess pipes are bytes-only, so bytes are decoded manually below
)
    # Initial placeholder output
    yield "Generating response..."
    # # Wait for output and process it character by character
# while True:
    #     char = await process.stdout.read(1) # read 1 byte
    #     if not char:
    #         break # no more output, exit the loop
    #     # Emit the character directly, assuming the output is text
    #     print(char, end='', flush=True) # print each character immediately
    #     yield char
    # while True:
    #     char = await process.stdout.read(1) # read 1 byte
    #     if not char:
    #         break # no more output, exit the loop
    #     # Decode the byte into a string
    #     char_decoded = char.decode('utf-8')
    #     print(char_decoded, end='') # print each character immediately
    #     yield "1"
    # # Wait for the subprocess to finish
# await process.wait()
    # Initialize an empty byte string to use as a read buffer
    buffer = b""
    # Initialize an empty string to accumulate the decoded output
accumulated_output = ""
while True:
        # Try to read more bytes from stdout
        more_bytes = await process.stdout.read(1)
        if not more_bytes:
            break # no more bytes to read, exit the loop
        buffer += more_bytes # append the newly read bytes to the buffer
        try:
            # Try to decode the whole buffer
            decoded = buffer.decode('utf-8')
            # Append the successfully decoded content to the accumulated output
            accumulated_output += decoded
            # Print the accumulated content to the screen
            print(f'\r{accumulated_output}', end='', flush=True)
            yield accumulated_output
            buffer = b"" # clear the buffer to accept new bytes
        except UnicodeDecodeError:
            # Decoding failed, most likely because the buffer ends mid multi-byte character;
            # keep looping and read more bytes
            continue
    # After the loop, handle any bytes remaining in the buffer
    if buffer:
        # Ignore decode errors here, since the trailing bytes may be incomplete
        remaining_output = buffer.decode('utf-8', errors='ignore')
        accumulated_output += remaining_output
        print(f'\r{accumulated_output}', end='', flush=True)
        yield accumulated_output
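# A minimal, untested sketch of an alternative to the manual byte-buffer loop above:
# codecs.getincrementaldecoder("utf-8") keeps incomplete multi-byte sequences in its
# internal state, so partial reads never raise UnicodeDecodeError. The helper name,
# chunk_size, and wiring are assumptions; it is illustrative only and not used by the app.
import codecs

async def stream_utf8(stream, chunk_size=64):
    """Yield decoded text chunks from an asyncio byte stream (illustrative helper)."""
    decoder = codecs.getincrementaldecoder("utf-8")()
    while True:
        chunk = await stream.read(chunk_size)
        if not chunk:
            break
        text = decoder.decode(chunk)
        if text:
            yield text
    # Flush any bytes still buffered inside the decoder.
    tail = decoder.decode(b"", final=True)
    if tail:
        yield tail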
def vote(data: gr.LikeData):
    # Placeholder like/dislike callback; feedback is not recorded yet.
    if data.liked:
        return
    else:
        return
chatbot = gr.Chatbot(avatar_images=('resourse/user-icon.png', 'resourse/chatbot-icon.png'), bubble_full_width=False)
additional_inputs=[
gr.Slider(
label="temperature",
value=0.5,
minimum=0.0,
maximum=2.0,
step=0.05,
interactive=True,
info="Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.",
),
gr.Slider(
label="top_p",
value=1.0,
minimum=0.0,
maximum=1.0,
step=0.01,
interactive=True,
        info="0.1 means only the tokens comprising the top 10% probability mass are considered. We suggest setting this to 1 and controlling randomness with temperature; 1 means 100% and disables this filter.",
),
gr.Slider(
label="top_k",
value=40,
minimum=0,
maximum=1000,
step=1,
interactive=True,
        info="Limits candidate tokens to a fixed number after sorting by probability. Setting it higher than the vocabulary size deactivates this limit.",
)
]
iface = gr.ChatInterface(
fn = generater,
title=title,
description = description,
chatbot=chatbot,
additional_inputs=additional_inputs,
examples=[
["枸杞有什么疗效"],
["I've taken several courses of antibiotics for recurring infections, and now they seem less effective. Am I developing antibiotic resistance?"],
]
)
with gr.Blocks(css="resourse/style/custom.css") as demo:
chatbot.like(vote, None, None)
iface.render()
if __name__ == "__main__":
demo.queue(max_size=3).launch()