import gradio as gr
from gpt4all import GPT4All
from huggingface_hub import hf_hub_download
import subprocess
import asyncio
import os
import stat
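
# Gradio demo that runs the Apollo-6B GGUF model on CPU, streaming output
# either through the gpt4all bindings or a spawned llama.cpp `main` binary.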
title = "Apollo-6B-GGUF Run On CPU" 
  
description = """
🔎 [Apollo-6B](https://huggingface.co/FreedomIntelligence/Apollo-6B) [GGUF format model](https://huggingface.co/FreedomIntelligence/Apollo-6B-GGUF) , 8-bit quantization balanced quality gguf version, running on CPU. Using [GitHub - llama.cpp](https://github.com/ggerganov/llama.cpp) [GitHub - gpt4all](https://github.com/nomic-ai/gpt4all). 

🔨 Running on CPU-Basic free hardware. Suggest duplicating this space to run without a queue. 

"""

"""
[Model From FreedomIntelligence/Apollo-6B-GGUF](https://huggingface.co/FreedomIntelligence/Apollo-6B-GGUF)
"""

# Download the 8-bit quantized GGUF weights from the Hugging Face Hub into ./models
model_path = "models"
model_name = "Apollo-6B-q8_0.gguf"
hf_hub_download(repo_id="FreedomIntelligence/Apollo-6B-GGUF", filename=model_name, local_dir=model_path, local_dir_use_symlinks=False)


# Mark the bundled llama.cpp `main` binary as executable so it can be spawned below
current_dir = os.path.dirname(os.path.realpath(__file__))
main_path = os.path.join(current_dir, 'main')
os.chmod(main_path, os.stat(main_path).st_mode | stat.S_IEXEC)


print("Start the model init process")
model = model = GPT4All(model_name, model_path, allow_download = False, device="cpu")
print("Finish the model init process")

# Use a pass-through prompt template and manage the chat history manually
model.config["promptTemplate"] = "{0}"
model.config["systemPrompt"] = "You are a multilingual AI doctor, your name is Apollo."
model._is_chat_session_activated = False

max_new_tokens = 2048
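
# Earlier streaming implementations, kept below for reference: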

# def generater(message, history, temperature, top_p, top_k):
#     prompt = "<s>"
#     for user_message, assistant_message in history:
#         prompt += model.config["promptTemplate"].format(user_message)
#         prompt += assistant_message + "</s>"
#     prompt += model.config["promptTemplate"].format(message)
#     outputs = []
#     for token in model.generate(prompt=prompt, temp=temperature, top_k=top_k, top_p=top_p, max_tokens=max_new_tokens, streaming=True):
#         outputs.append(token)
#         yield "".join(outputs)

# async def generater(message, history, temperature, top_p, top_k):
#     # Build the prompt
#     prompt = ""
#     for user_message, assistant_message in history:
#         prompt += model.config["promptTemplate"].format(user_message)
#         prompt += assistant_message
#     prompt += model.config["promptTemplate"].format(message)

#     # Debug: print the final prompt to verify it is correct
#     print(f"Final prompt: {prompt}")
#     cmd = [
#         main_path,
#         "-m", os.path.join(model_path, model_name),
#         "--prompt", prompt
#     ]

#     # Call ./main with subprocess.Popen and stream its output
#     process = subprocess.Popen(
#         cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
#     )

#     # Initial placeholder output
#     yield "Generating response..."
#     # Wait for the output and process it line by line
#     try:
#         while True:
#             line = process.stdout.readline()
#             if not line:
#                 break  # No more output; end the loop
#             print(f"Generated line: {line.strip()}")  # Debug: print each generated line
#             yield line
#     except Exception as e:
#         print(f"Error during generation: {e}")
#         yield "Sorry, an error occurred while generating the response."
async def generater(message, history, temperature, top_p, top_k):
    # Build the prompt from the chat history plus the new message
    prompt = ""
    for user_message, assistant_message in history:
        prompt += model.config["promptTemplate"].format(user_message)
        prompt += assistant_message
    prompt += model.config["promptTemplate"].format(message)

    # Debug: print the final prompt to verify it is correct
    print(f"Final prompt: {prompt}\n\n\n\n\n\n\n\n")
    cmd = [
        main_path,  # absolute path to the llama.cpp executable prepared above
        "-m", os.path.join(model_path, model_name),
        "--prompt", prompt
    ]

    # Spawn the llama.cpp binary as an asynchronous subprocess
    process = await asyncio.create_subprocess_exec(
        *cmd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
        # Note: text=True is not supported here; stdout yields raw bytes
    )

    # Initial placeholder output
    yield "Generating response..."
    
    # # Wait for the output and yield it character by character
    # while True:
    #     char = await process.stdout.read(1)  # Read 1 byte
    #     if not char:
    #         break  # No more output; end the loop
    #     # Emit the character directly, assuming the output is text
    #     print(char, end='', flush=True)  # Print each character immediately
    #     yield char

    # while True:
    #     char = await process.stdout.read(1)  # Read 1 byte
    #     if not char:
    #         break  # No more output; end the loop
    #     # Decode the byte into a string
    #     char_decoded = char.decode('utf-8')
    #     print(char_decoded, end='')  # Print each character immediately
    #     yield "1"

    # # Wait for the subprocess to finish
    # await process.wait()
    # Buffer for raw bytes that do not yet form a complete UTF-8 sequence
    buffer = b""
    # Accumulated decoded output shown to the user
    accumulated_output = ""

    while True:
        # Read one more byte from stdout
        more_bytes = await process.stdout.read(1)
        if not more_bytes:
            break  # No more bytes to read; end the loop
        buffer += more_bytes  # Append the newly read byte to the buffer

        try:
            # Try to decode the whole buffer
            decoded = buffer.decode('utf-8')
            # Append the successfully decoded content to the accumulated output
            accumulated_output += decoded
            # Echo the accumulated output to the console
            print(f'\r{accumulated_output}', end='', flush=True)
            yield accumulated_output
            buffer = b""  # Clear the buffer for new input
        except UnicodeDecodeError:
            # Decoding failed, likely because a multi-byte character is still
            # incomplete; keep looping and read more bytes
            continue

    # After the loop, handle any bytes left in the buffer
    if buffer:
        # Ignore decode errors here, since the trailing bytes may be incomplete
        remaining_output = buffer.decode('utf-8', errors='ignore')
        accumulated_output += remaining_output
        print(f'\r{accumulated_output}', end='', flush=True)
        yield accumulated_output  # Surface the final chunk to the UI as well
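
    # Reap the subprocess once streaming has finished; a small addition here,
    # mirroring the process.wait() call in the commented-out draft above
    await process.wait()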


def vote(data: gr.LikeData):
    # Placeholder like/dislike callback; extend it to log feedback if desired
    if data.liked:
        return
    else:
        return

chatbot = gr.Chatbot(avatar_images=('resourse/user-icon.png', 'resourse/chatbot-icon.png'), bubble_full_width=False)

additional_inputs=[
    gr.Slider(
        label="temperature",
        value=0.5,
        minimum=0.0,
        maximum=2.0,
        step=0.05,
        interactive=True,
        info="Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.",
    ),
    gr.Slider(
        label="top_p",
        value=1.0,
        minimum=0.0,
        maximum=1.0,
        step=0.01,
        interactive=True,
        info="0.1 means only the tokens comprising the top 10% probability mass are considered. Suggest set to 1 and use temperature. 1 means 100% and will disable it",
    ),
    gr.Slider(
        label="top_k",
        value=40,
        minimum=0,
        maximum=1000,
        step=1,
        interactive=True,
        info="limits candidate tokens to a fixed number after sorting by probability. Setting it higher than the vocabulary size deactivates this limit.",
    )
]

iface = gr.ChatInterface(
    fn=generater,
    title=title,
    description=description,
    chatbot=chatbot,
    additional_inputs=additional_inputs,
    examples=[
        ["枸杞有什么疗效"],
        ["I've taken several courses of antibiotics for recurring infections, and now they seem less effective. Am I developing antibiotic resistance?"],
    ]
)

with gr.Blocks(css="resourse/style/custom.css") as demo:
    chatbot.like(vote, None, None)
    iface.render()

if __name__ == "__main__":
    demo.queue(max_size=3).launch()