add bloomz
- Dockerfile +1 -0
- app/api.py +19 -7
- app/config.json +4 -1
Dockerfile
CHANGED
@@ -8,6 +8,7 @@ RUN apt-get update && apt-get install -y \
     libxmlsec1-dev libffi-dev liblzma-dev git-lfs ffmpeg libsm6 libxext6 cmake \
     libgl1-mesa-glx curl nginx espeak-ng openssl libssl-dev libbz2-dev \
     libncurses5-dev libreadline-dev \
+    vim lynx haproxy \
     && rm -rf /var/lib/apt/lists/* && git lfs install
 
 RUN wget https://github.com/tsl0922/ttyd/releases/download/1.7.3/ttyd.x86_64 -O /usr/local/bin/ttyd && \
app/api.py
CHANGED
@@ -1,7 +1,7 @@
 from fastapi import FastAPI, WebSocket
 from fastapi.responses import HTMLResponse
 from fastapi import Form, Depends, HTTPException, status
-from transformers import pipeline, set_seed, AutoConfig, AutoTokenizer
+from transformers import pipeline, set_seed, AutoConfig, AutoTokenizer, AutoModelForCausalLM
 import torch
 import os
 import time
@@ -68,7 +68,13 @@ async def websocket_endpoint(websocket: WebSocket):
 
 
 @app.post("/api/indochat/v1")
-async def indochat(
+async def indochat(**kwargs):
+    return text_generate("indochat-tiny", kwargs)
+
+
+@app.post("/api/text-generator/v1")
+async def text_generate(
+        model_name: str = Form(default="", description="The model name"),
         text: str = Form(default="", description="The Prompt"),
         decoding_method: str = Form(default="Sampling", description="Decoding method"),
         min_length: int = Form(default=50, description="Minimal length of the generated text"),
@@ -102,13 +108,13 @@ async def indochat(
     max_penalty = 1.5
     repetition_penalty = max(min_penalty + (1.0 - temperature) * (max_penalty - min_penalty), 0.8)
     prompt = f"User: {text}\nAssistant: "
-    input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)
-    model.eval()
+    input_ids = text_generator[model_name]["tokenizer"](prompt, return_tensors='pt').input_ids.to(device)
+    text_generator[model_name]["model"].eval()
     print("Generating text...")
     print(f"max_length: {max_length}, do_sample: {do_sample}, top_k: {top_k}, top_p: {top_p}, "
           f"temperature: {temperature}, repetition_penalty: {repetition_penalty}, penalty_alpha: {penalty_alpha}")
     time_start = time.time()
-    sample_outputs = model.generate(input_ids,
+    sample_outputs = text_generator[model_name]["model"].generate(input_ids,
                                     penalty_alpha=penalty_alpha,
                                     do_sample=do_sample,
                                     num_beams=num_beams,
@@ -134,7 +140,7 @@ def get_text_generator(model_name: str, device: str = "cpu"):
     print(f"hf_auth_token: {hf_auth_token}")
     print(f"Loading model with device: {device}...")
     tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=hf_auth_token)
-    model =
+    model = AutoModelForCausalLM.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id,
                                                  use_auth_token=hf_auth_token)
     model.to(device)
     print("Model loaded")
@@ -147,4 +153,10 @@ def get_config():
 
 config = get_config()
 device = "cuda" if torch.cuda.is_available() else "cpu"
-
+text_generator = {}
+for model_name in config["text-generator"]:
+    model, tokenizer = get_text_generator(model_name=config["text-generator"][model_name], device=device)
+    text_generator[model_name] = {
+        "model": model,
+        "tokenizer": tokenizer
+    }
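Review note: as committed, the new indochat handler has two likely problems. FastAPI does not populate **kwargs from form data, so kwargs will always be empty, and text_generate is a coroutine function, so returning it without await hands FastAPI an un-awaited coroutine. Below is a minimal, self-contained sketch of the intended delegation pattern, not the committed code: the signature is deliberately reduced, and in the real file every Form field of text_generate would need an explicit value when called directly, because Form(default=...) placeholders are only resolved by FastAPI's dependency injection.

    from fastapi import FastAPI, Form

    app = FastAPI()

    @app.post("/api/text-generator/v1")
    async def text_generate(model_name: str = Form(default="", description="The model name"),
                            text: str = Form(default="", description="The Prompt")):
        # Placeholder body: the committed version tokenizes the prompt and
        # calls model.generate() here.
        return {"model_name": model_name, "text": text}

    @app.post("/api/indochat/v1")
    async def indochat(text: str = Form(default="", description="The Prompt")):
        # Await the delegated coroutine and pass concrete keyword arguments;
        # Form(default=...) placeholders are only filled in by FastAPI's
        # injection, never in a direct Python call.
        return await text_generate(model_name="indochat-tiny", text=text)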
app/config.json
CHANGED
@@ -1,3 +1,6 @@
 {
-    "
+    "text-generator": {
+        "indochat-tiny": "cahya/indochat-tiny",
+        "bloomz-1b1-instruct": "cahya/bloomz-1b1-instruct"
+    }
 }
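Each key in the new "text-generator" map becomes a legal value for the model_name form field of /api/text-generator/v1, and each value is the Hugging Face repo id loaded eagerly at startup. A hypothetical client call against a local run of the Space (the host, port, and prompt are assumptions, and only a few of the endpoint's form fields are shown):

    import requests

    # Form-encoded POST; model_name must match a key in app/config.json.
    resp = requests.post(
        "http://localhost:7860/api/text-generator/v1",  # assumed host/port
        data={
            "model_name": "bloomz-1b1-instruct",
            "text": "Who invented the light bulb?",
            "decoding_method": "Sampling",
            "min_length": 50,
        },
    )
    print(resp.status_code, resp.text)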