|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
A model worker executes the model. |
|
""" |
|
import spaces |
|
import os |
|
import argparse |
|
import asyncio |
|
|
|
import json |
|
import math |
|
import threading |
|
import time |
|
import uuid |
|
import traceback |
|
from functools import partial |
|
from threading import Thread |
|
|
|
import requests |
|
import torch |
|
import torchvision.transforms as T |
|
import uvicorn |
|
from constants import IMAGENET_MEAN, IMAGENET_STD, WORKER_HEART_BEAT_INTERVAL |
|
from fastapi import BackgroundTasks, FastAPI, Request |
|
from fastapi.responses import StreamingResponse |
|
from PIL import Image |
|
from torchvision.transforms.functional import InterpolationMode |
|
from transformers import AutoModel, AutoTokenizer, TextIteratorStreamer |
|
from utils import ( |
|
build_logger, |
|
pretty_print_semaphore, |
|
server_error_msg, |
|
load_image_from_base64, |
|
) |
|
|
|
|
|
# Six-character random tag used in this worker's log file name and log lines.
worker_id = uuid.uuid4().hex[:6]
logger = build_logger("model_worker", f"model_worker_{worker_id}.log")

# Count of generation requests received (incremented by the streaming endpoint).
global_counter = 0
# Concurrency limiter; stays None until the streaming endpoint creates it lazily.
model_semaphore = None
|
|
|
|
|
def build_transform(input_size):
    """Return the torchvision pipeline applied to every image tile.

    The pipeline converts non-RGB images to RGB, resizes to a square of
    ``input_size`` pixels with bicubic interpolation, converts to a tensor
    and normalizes with ``IMAGENET_MEAN`` / ``IMAGENET_STD``.
    """

    def _ensure_rgb(img):
        # Leave RGB images untouched; convert every other mode.
        return img if img.mode == "RGB" else img.convert("RGB")

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)
|
|
|
|
|
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the ``(cols, rows)`` grid whose ratio is nearest to ``aspect_ratio``.

    Candidates are scanned in the order given. A strictly closer candidate
    always wins. A candidate that exactly ties the current best replaces it
    only when the source image area exceeds half the pixel area of that
    candidate grid (``image_size**2 * cols * rows``).

    Returns ``(1, 1)`` when ``target_ratios`` is empty.
    """
    smallest_gap = float("inf")
    chosen = (1, 1)
    pixel_area = width * height
    for candidate in target_ratios:
        gap = abs(aspect_ratio - candidate[0] / candidate[1])
        if gap < smallest_gap:
            smallest_gap, chosen = gap, candidate
            continue
        if gap != smallest_gap:
            continue
        # Exact tie: prefer this grid only if the image is large enough for it.
        if pixel_area > 0.5 * image_size * image_size * candidate[0] * candidate[1]:
            chosen = candidate
    return chosen
|
|
|
|
|
def dynamic_preprocess(
    image, min_num=1, max_num=6, image_size=448, use_thumbnail=False
):
    """Resize ``image`` onto a tile grid and cut it into square tiles.

    Every ``(cols, rows)`` grid whose tile count lies in
    ``[min_num, max_num]`` is a candidate; the one closest to the image's
    aspect ratio is chosen via ``find_closest_aspect_ratio``. The image is
    resized to exactly fill that grid and cropped row by row into
    ``image_size`` x ``image_size`` tiles.

    Returns the list of tiles. When ``use_thumbnail`` is true and more than
    one tile was produced, a whole-image thumbnail of tile size is appended.
    """
    src_width, src_height = image.size
    aspect_ratio = src_width / src_height

    # Enumerate the candidate grids, smallest tile count first.
    candidate_grids = {
        (cols, rows)
        for n in range(min_num, max_num + 1)
        for cols in range(1, n + 1)
        for rows in range(1, n + 1)
        if min_num <= cols * rows <= max_num
    }
    candidate_grids = sorted(candidate_grids, key=lambda grid: grid[0] * grid[1])

    best_grid = find_closest_aspect_ratio(
        aspect_ratio, candidate_grids, src_width, src_height, image_size
    )

    target_width = image_size * best_grid[0]
    target_height = image_size * best_grid[1]
    blocks = best_grid[0] * best_grid[1]

    resized_img = image.resize((target_width, target_height))
    tiles_per_row = target_width // image_size
    processed_images = []
    for index in range(blocks):
        # Walk the grid left-to-right, top-to-bottom.
        row, col = divmod(index, tiles_per_row)
        left = col * image_size
        top = row * image_size
        tile = resized_img.crop((left, top, left + image_size, top + image_size))
        processed_images.append(tile)
    assert len(processed_images) == blocks

    if use_thumbnail and len(processed_images) != 1:
        processed_images.append(image.resize((image_size, image_size)))
    return processed_images
|
|
|
|
|
def heart_beat_worker(controller):
    """Thread target that sends heartbeats forever; never returns.

    Each cycle sleeps ``WORKER_HEART_BEAT_INTERVAL`` seconds and then calls
    ``controller.send_heart_beat()``. Despite the parameter name, the object
    passed in by ``ModelWorker`` is the worker itself.
    """
    while True:
        time.sleep(WORKER_HEART_BEAT_INTERVAL)
        controller.send_heart_beat()
|
|
|
|
|
def split_model(model_name, world_size=None):
    """Build a ``device_map`` that spreads a model's decoder layers over GPUs.

    GPU 0 also hosts the vision tower, projector, embeddings, final norm and
    output head, so it is counted as half a GPU and receives roughly half as
    many decoder layers as each of the others. The last decoder layer is
    pinned back to GPU 0 next to the output head.

    Args:
        model_name: One of the model names in the layer-count table below.
        world_size: Number of GPUs to plan for. Defaults to
            ``torch.cuda.device_count()``.

    Returns:
        dict mapping module names to GPU indices.

    Raises:
        KeyError: ``model_name`` is not in the layer-count table.
        RuntimeError: fewer than one GPU is available or requested.
    """
    layers_by_model = {
        "InternVL2-8B": 32,
        "InternVL2-26B": 48,
        "InternVL2-40B": 60,
        "InternVL2-Llama3-76B": 80,
        "InternVL2-78B": 80,
        "InternVL2-Pro": 80,
    }
    if model_name not in layers_by_model:
        raise KeyError(
            f"Unknown model name {model_name!r}; expected one of {sorted(layers_by_model)}"
        )
    num_layers = layers_by_model[model_name]

    if world_size is None:
        world_size = torch.cuda.device_count()
    if world_size < 1:
        # Previously this surfaced as an opaque IndexError on an empty list.
        raise RuntimeError(
            f"split_model needs at least one CUDA device, got world_size={world_size}"
        )

    # GPU 0 counts as half a device because it also carries the non-decoder modules.
    per_gpu = math.ceil(num_layers / (world_size - 0.5))
    layers_per_gpu = [per_gpu] * world_size
    layers_per_gpu[0] = math.ceil(per_gpu * 0.5)

    device_map = {}
    layer_idx = 0
    for gpu, quota in enumerate(layers_per_gpu):
        for _ in range(quota):
            if layer_idx >= num_layers:
                # Rounding up can over-allocate; never emit entries for
                # decoder layers that do not exist.
                break
            device_map[f"language_model.model.layers.{layer_idx}"] = gpu
            layer_idx += 1

    # Everything outside the decoder stack lives on GPU 0. Both embedding
    # names and both head names are listed, as in the original table.
    for module_name in (
        "vision_model",
        "mlp1",
        "language_model.model.tok_embeddings",
        "language_model.model.embed_tokens",
        "language_model.output",
        "language_model.model.norm",
        "language_model.lm_head",
    ):
        device_map[module_name] = 0
    # Keep the final decoder layer on the same device as the output head.
    device_map[f"language_model.model.layers.{num_layers - 1}"] = 0

    return device_map
|
|
|
|
|
def multi_thread_infer(
    model, tokenizer, pixel_values, question, history, generation_config
):
    """Run ``model.chat`` on a background thread with autograd disabled.

    The call does not wait for generation to finish. Callers in this file
    pass a streamer inside ``generation_config`` and read the output from it.

    Args:
        model: Object exposing ``chat(tokenizer=..., pixel_values=...,
            question=..., history=..., return_history=...,
            generation_config=...)``.
        tokenizer, pixel_values, question, history, generation_config:
            Forwarded unchanged to ``model.chat``.

    Returns:
        The started ``threading.Thread``, so callers may ``join`` it.
        Existing callers that ignore the return value are unaffected.
    """
    chat_kwargs = dict(
        tokenizer=tokenizer,
        pixel_values=pixel_values,
        question=question,
        history=history,
        return_history=False,
        generation_config=generation_config,
    )

    def _chat_without_grad():
        # Grad mode is thread-local in PyTorch: a no_grad() block entered by
        # the caller does not apply to a newly started thread, so it has to
        # be entered here, inside the thread that actually runs the model.
        with torch.no_grad():
            model.chat(**chat_kwargs)

    thread = Thread(target=_chat_without_grad)
    thread.start()
    return thread
|
|
|
|
|
class ModelWorker:
    """Loads one chat model and serves streaming generations for it.

    Constructing a worker loads the tokenizer and model, registers the worker
    with the controller over HTTP, and starts a background heartbeat thread.
    """

    def __init__(
        self,
        controller_addr,
        worker_addr,
        worker_id,
        model_path,
        model_name,
        load_8bit,
        device,
        context_len=8192,
    ):
        """Load the model and announce this worker to the controller.

        Args:
            controller_addr: Base URL of the controller.
            worker_addr: URL under which this worker is reported to the controller.
            worker_id: Short identifier used in log messages.
            model_path: Checkpoint path or hub id; one trailing "/" is stripped.
            model_name: Served model name. When None it is derived from the
                last component of ``model_path`` (prefixed by the parent
                directory for "checkpoint-*" folders).
            load_8bit: Passed to ``from_pretrained`` as ``load_in_8bit``.
            device: "auto" shards the model with ``split_model``; any other
                value loads normally and moves the model to CUDA unless
                ``load_8bit`` is set.
            context_len: Used as ``max_length`` in the generation config.
        """
        self.controller_addr = controller_addr
        self.worker_addr = worker_addr
        self.worker_id = worker_id
        if model_path.endswith("/"):
            model_path = model_path[:-1]
        if model_name is None:
            path_parts = model_path.split("/")
            if path_parts[-1].startswith("checkpoint-"):
                self.model_name = path_parts[-2] + "_" + path_parts[-1]
            else:
                self.model_name = path_parts[-1]
        else:
            self.model_name = model_name

        # Stored before loading because _load_model() reads them.
        self.load_8bit = load_8bit
        self.device = device
        self.model_path = model_path
        self.context_len = context_len

        self.import_flash_attn()
        logger.info(f"Loading the model {self.model_name} on worker {worker_id} ...")
        tokenizer = AutoTokenizer.from_pretrained(
            model_path, trust_remote_code=True, use_fast=False
        )
        # These grounding tags are removed from the additional special tokens.
        # NOTE(review): presumably so that decoding with
        # skip_special_tokens=True keeps them in the output - confirm.
        grounding_tags = ["<box>", "</box>", "<ref>", "</ref>"]
        tokenizer.additional_special_tokens = [
            token
            for token in tokenizer.additional_special_tokens
            if token not in grounding_tags
        ]
        self.tokenizer = tokenizer

        self.model = self._load_model()
        self.image_size = self.model.config.force_image_size

        self.register_to_controller()
        # Daemon thread: the heartbeat loop never returns, so a non-daemon
        # thread would keep the process alive after the server has stopped.
        self.heart_beat_thread = threading.Thread(
            target=heart_beat_worker, args=(self,), daemon=True
        )
        self.heart_beat_thread.start()

    def _load_model(self):
        """Load the checkpoint at ``self.model_path`` and return it in eval mode.

        Shared by ``__init__`` and ``reload_model`` so both follow one loading
        recipe. Reads ``self.model_path``, ``self.model_name``,
        ``self.load_8bit`` and ``self.device``.
        """
        load_kwargs = dict(
            load_in_8bit=self.load_8bit,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
        )
        if self.device == "auto":
            load_kwargs["device_map"] = split_model(self.model_name)
        model = AutoModel.from_pretrained(self.model_path, **load_kwargs).eval()
        if not self.load_8bit and self.device != "auto":
            model = model.cuda()
        return model

    @spaces.GPU(duration=120)
    def import_flash_attn(self):
        """Ensure ``flash_attn`` is importable, pip-installing it if missing.

        The install command is a fixed string (no external input). A failed
        install is logged rather than raised, so start-up continues as before.
        """
        try:
            import flash_attn  # noqa: F401  (imported only to probe availability)
        except ImportError:
            exit_status = os.system(
                "FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install flash-attn==2.5.9.post1 --no-build-isolation"
            )
            if exit_status != 0:
                logger.error(
                    f"flash-attn installation failed with exit status {exit_status}"
                )

    def reload_model(self):
        """Drop the current model, free cached CUDA memory and load it again."""
        del self.model
        torch.cuda.empty_cache()
        self.model = self._load_model()

    def register_to_controller(self):
        """POST this worker's address and status to the controller.

        Raises:
            AssertionError: the controller did not answer with HTTP 200.
        """
        logger.info("Register to controller")

        url = self.controller_addr + "/register_worker"
        data = {
            "worker_name": self.worker_addr,
            "check_heart_beat": True,
            "worker_status": self.get_status(),
        }
        r = requests.post(url, json=data)
        assert r.status_code == 200, (
            f"controller registration failed with HTTP {r.status_code}"
        )

    def send_heart_beat(self):
        """Report the queue length to the controller, retrying until it answers.

        Request errors are logged and retried after a 5 second pause. If the
        controller replies that it does not know this worker, the worker
        registers itself again.
        """
        logger.info(
            f"Send heart beat. Models: {[self.model_name]}. "
            f"Semaphore: {pretty_print_semaphore(model_semaphore)}. "
            f"global_counter: {global_counter}"
        )

        url = self.controller_addr + "/receive_heart_beat"

        while True:
            try:
                ret = requests.post(
                    url,
                    json={
                        "worker_name": self.worker_addr,
                        "queue_length": self.get_queue_length(),
                    },
                    timeout=5,
                )
                exist = ret.json()["exist"]
                break
            except requests.exceptions.RequestException as e:
                logger.error(f"heart beat error: {e}")
            time.sleep(5)

        if not exist:
            self.register_to_controller()

    def get_queue_length(self):
        """Return running plus waiting requests, or 0 before the first request.

        Reads the module-level ``model_semaphore`` and ``args``; it relies on
        the semaphore's private ``_value`` and ``_waiters`` attributes.
        """
        if model_semaphore is None:
            return 0
        waiters = model_semaphore._waiters
        waiting = len(waiters) if waiters is not None else 0
        return args.limit_model_concurrency - model_semaphore._value + waiting

    def get_status(self):
        """Return the status dict sent to the controller."""
        return {
            "model_names": [self.model_name],
            "speed": 1,
            "queue_length": self.get_queue_length(),
        }

    def generate_stream(self, params):
        """Yield NUL-terminated JSON chunks holding the text generated so far.

        ``params`` must contain "prompt" (a list of messages whose first
        entry carries the system message), "max_input_tiles", "temperature",
        "top_p", "max_new_tokens" and "repetition_penalty". User messages may
        carry an "image" list of base64 strings.

        The model's ``system_message`` is overridden for the duration of the
        request and always restored, including when generation fails or the
        consumer stops iterating early.
        """
        system_message = params["prompt"][0]["content"]
        send_messages = params["prompt"][1:]
        max_input_tiles = params["max_input_tiles"]
        temperature = params["temperature"]
        top_p = params["top_p"]
        max_new_tokens = params["max_new_tokens"]
        repetition_penalty = params["repetition_penalty"]
        do_sample = temperature > 0.0

        # Rebuild the [question, answer] history and collect the images.
        global_image_cnt = 0
        history, pil_images, max_input_tile_list = [], [], []
        for message in send_messages:
            if message["role"] != "user":
                # Any non-user message is the reply to the latest question.
                history[-1].append(message["content"])
                continue
            prefix = ""
            if "image" in message:
                images = message["image"]
                if images:
                    # The tile budget is shared evenly by this message's images.
                    tiles_each = max(1, max_input_tiles // len(images))
                    for image_str in images:
                        pil_images.append(load_image_from_base64(image_str))
                        prefix += f"Image-{global_image_cnt + 1}: <image>\n\n"
                        global_image_cnt += 1
                    max_input_tile_list.append([tiles_each] * len(images))
            history.append([prefix + message["content"]])
        question, history = history[-1][0], history[:-1]

        if global_image_cnt == 1:
            # A single image uses the bare placeholder, without numbering.
            question = question.replace("Image-1: <image>\n\n", "<image>\n")
            history = [
                [item[0].replace("Image-1: <image>\n\n", "<image>\n"), item[1]]
                for item in history
            ]

        # Images from all but the last image-bearing message get one tile
        # each; only the last such message keeps its computed budget.
        flattened_list = [1 for group in max_input_tile_list[:-1] for _ in group]
        if max_input_tile_list:
            flattened_list.extend(max_input_tile_list[-1])
        max_input_tile_list = flattened_list
        assert len(max_input_tile_list) == len(
            pil_images
        ), "The number of max_input_tile_list and pil_images should be the same."

        old_system_message = self.model.system_message
        self.model.system_message = system_message
        try:
            image_tiles = []
            transform = build_transform(input_size=self.image_size)
            if len(pil_images) > 0:
                for current_max_input_tiles, pil_image in zip(
                    max_input_tile_list, pil_images
                ):
                    if self.model.config.dynamic_image_size:
                        tiles = dynamic_preprocess(
                            pil_image,
                            image_size=self.image_size,
                            max_num=current_max_input_tiles,
                            use_thumbnail=self.model.config.use_thumbnail,
                        )
                    else:
                        tiles = [pil_image]
                    image_tiles += tiles
                pixel_values = [transform(item) for item in image_tiles]
                pixel_values = torch.stack(pixel_values).to(
                    self.model.device, dtype=torch.bfloat16
                )
                logger.info(f"Split images to {pixel_values.shape}")
            else:
                pixel_values = None

            streamer = TextIteratorStreamer(
                self.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10
            )
            generation_config = dict(
                num_beams=1,
                max_new_tokens=max_new_tokens,
                do_sample=do_sample,
                temperature=temperature,
                repetition_penalty=repetition_penalty,
                max_length=self.context_len,
                top_p=top_p,
                streamer=streamer,
            )
            logger.info(f"Generation config: {generation_config}")
            multi_thread_infer(
                self.model,
                self.tokenizer,
                pixel_values,
                question,
                history,
                generation_config,
            )

            generated_text = ""
            for new_text in streamer:
                generated_text += new_text
                sep = self.model.conv_template.sep
                # Skip an empty separator: text[:-0] would erase the output.
                if sep and generated_text.endswith(sep):
                    generated_text = generated_text[: -len(sep)]
                yield json.dumps({"text": generated_text, "error_code": 0}).encode() + b"\0"
            logger.info(
                f"max_input_tile_list: {max_input_tile_list}, history: {history}, "
                f"question: {question}, answer: {generated_text}"
            )
        finally:
            # Restore even on errors or early generator close, so a failed
            # request cannot leave its system prompt installed on the model.
            self.model.system_message = old_system_message

    @staticmethod
    def _error_chunk():
        """Return the NUL-terminated JSON chunk that reports a server error."""
        ret = {
            "text": server_error_msg,
            "error_code": 1,
        }
        return json.dumps(ret).encode() + b"\0"

    def generate_stream_gate(self, params):
        """Wrap ``generate_stream`` and turn any exception into an error chunk.

        Failures are printed with a traceback and reported to the client as
        one final chunk with ``error_code`` 1; nothing is re-raised.
        """
        try:
            yield from self.generate_stream(params)
        except ValueError as e:
            print("Caught ValueError:", e)
            traceback.print_exc()
            yield self._error_chunk()
        except torch.cuda.CudaError as e:
            traceback.print_exc()
            print("Caught torch.cuda.CudaError:", e)
            yield self._error_chunk()
        except Exception as e:
            traceback.print_exc()
            print("Caught Unknown Error", e)
            yield self._error_chunk()
|
|
|
|
|
# ASGI application object; the worker's HTTP endpoints are registered on it
# and it is handed to uvicorn when the module runs as a script.
app = FastAPI()
|
|
|
|
|
def release_model_semaphore(fn=None):
    """Release one slot of the module-level semaphore, then run ``fn`` if given.

    Scheduled as a background task once a streaming response has finished.
    """
    model_semaphore.release()
    if fn is None:
        return
    fn()
|
|
|
|
|
@app.post("/worker_generate_stream")
async def generate_stream(request: Request):
    """Stream a generation for the JSON request body.

    A semaphore slot is acquired before generation starts (the semaphore is
    created on the first request). It is released by a background task after
    the response has been sent, followed by another heartbeat. If setting up
    the response fails, the slot is released immediately so that an error
    cannot permanently reduce the worker's capacity.
    """
    global model_semaphore, global_counter
    global_counter += 1
    params = await request.json()

    if model_semaphore is None:
        model_semaphore = asyncio.Semaphore(args.limit_model_concurrency)
    await model_semaphore.acquire()
    try:
        worker.send_heart_beat()
        generator = worker.generate_stream_gate(params)
        background_tasks = BackgroundTasks()
        background_tasks.add_task(
            partial(release_model_semaphore, fn=worker.send_heart_beat)
        )
        return StreamingResponse(generator, background=background_tasks)
    except Exception:
        # No response means no background task will ever run, so the slot
        # acquired above has to be given back here.
        model_semaphore.release()
        raise
|
|
|
|
|
@app.post("/worker_get_status")
async def get_status(request: Request):
    """Return the worker's current status dict; the request body is unused."""
    status = worker.get_status()
    return status
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: parse the CLI, build the worker, serve the app.
    # `args` and `worker` are deliberately module-level globals because the
    # endpoint handlers and ModelWorker.get_queue_length read them.
    parser = argparse.ArgumentParser()
    cli_options = (
        ("--host", dict(type=str, default="0.0.0.0")),
        ("--port", dict(type=int, default=21002)),
        ("--worker-url", dict(type=str, default="http://localhost")),
        ("--controller-url", dict(type=str, default="http://localhost:21001")),
        ("--model-path", dict(type=str, default="facebook/opt-350m")),
        ("--model-name", dict(type=str)),
        ("--device", dict(type=str, default="cuda")),
        ("--limit-model-concurrency", dict(type=int, default=5)),
        ("--stream-interval", dict(type=int, default=1)),
        ("--load-8bit", dict(action="store_true")),
    )
    for flag, options in cli_options:
        parser.add_argument(flag, **options)
    args = parser.parse_args()
    logger.info(f"args: {args}")

    worker = ModelWorker(
        controller_addr=args.controller_url,
        worker_addr=f"{args.worker_url}:{args.port}",
        worker_id=worker_id,
        model_path=args.model_path,
        model_name=args.model_name,
        load_8bit=args.load_8bit,
        device=args.device,
    )
    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
|
|