ffreemt committed
Commit 670ac68 · Parent(s): b153e87

Update device='cuda' if ... else 'cpu' in m3_server.py

Files changed:
- Dockerfile +1 -0
- m3_server.py +87 -37
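In short, the hard-coded CUDA device is replaced by an availability check, so the server also starts on CPU-only machines. The pattern the diff introduces (a module-level constant, shown here in isolation):

import torch

# pick the GPU when one is visible to torch, otherwise fall back to CPU
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"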
Dockerfile
CHANGED
@@ -23,3 +23,4 @@ RUN pip install --no-cache-dir --upgrade pip && \
 
 CMD ["sh", "start-m3-server.sh"]
 # CMD ["sh", "-c", "HF_HOME=/tmp/cache", "python", "m3_server.py"]
+# ["sh", "-c", "'FOO=BAR python app.py'"]
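As an aside, the commented-out exec-form CMD above would not do what it suggests: with ["sh", "-c", ...], only the first string after "-c" is executed as the script, and any further list items become positional parameters of the shell. Two working variants, sketched here for illustration (hypothetical, not part of this commit):

# shell form: the env assignment and the command run in one shell
CMD HF_HOME=/tmp/cache python m3_server.py

# exec form: the whole command line goes into a single -c argument
CMD ["sh", "-c", "HF_HOME=/tmp/cache python m3_server.py"]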
m3_server.py
CHANGED
@@ -1,11 +1,12 @@
 import asyncio
 import os
 import time
-from pathlib import Path
 from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
 from typing import List, Tuple, Union
 from uuid import uuid4
 
+import torch
 from fastapi import FastAPI, HTTPException, Request
 from fastapi.responses import JSONResponse
 from FlagEmbedding import BGEM3FlagModel
@@ -13,26 +14,34 @@ from pydantic import BaseModel
 from starlette.status import HTTP_504_GATEWAY_TIMEOUT
 
 Path("/tmp/cache").mkdir(exist_ok=True)
+os.environ[
+    "HF_HOME"
+] = "/tmp/cache"  # does not quite work, need Path("/tmp/cache").mkdir(exist_ok=True)?
+
+batch_size = 2  # gpu batch_size in order of your available vram
+max_request = 10  # max requests per flush; for future improvements on api calls / gpu batches (for now it is pretty basic)
+max_length = 5000  # max context length for embeddings and passages in re-ranker
+max_q_length = 256  # max context length for questions in re-ranker
+request_flush_timeout = 0.1  # flush timeout; for future improvements on api calls / gpu batches (for now it is pretty basic)
+rerank_weights = [0.4, 0.2, 0.4]  # re-rank score weights
 request_time_out = 30  # Timeout threshold
-gpu_time_out = 5
-port= 3000
-port= 7860
+gpu_time_out = 5  # gpu processing timeout threshold
+port = 3000
+port = 7860
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
 
 class m3Wrapper:
+    def __init__(self, model_name: str, device: str = DEVICE):
         """Init."""
+        self.model = BGEM3FlagModel(
+            model_name, device=device, use_fp16=True if device != "cpu" else False
+        )
 
     def embed(self, sentences: List[str]) -> List[List[float]]:
+        embeddings = self.model.encode(
+            sentences, batch_size=batch_size, max_length=max_length
+        )["dense_vecs"]
         embeddings = embeddings.tolist()
         return embeddings
 
@@ -42,24 +51,31 @@ class m3Wrapper:
             batch_size=batch_size,
             max_query_length=max_q_length,
             max_passage_length=max_length,
+            weights_for_different_modes=rerank_weights,
+        )["colbert+sparse+dense"]
         return scores
 
+
 class EmbedRequest(BaseModel):
     sentences: List[str]
 
+
 class RerankRequest(BaseModel):
     sentence_pairs: List[Tuple[str, str]]
 
+
 class EmbedResponse(BaseModel):
     embeddings: List[List[float]]
 
+
 class RerankResponse(BaseModel):
     scores: List[float]
 
+
 class RequestProcessor:
+    def __init__(
+        self, model: m3Wrapper, max_request_to_flush: int, accumulation_timeout: float
+    ):
         """Init."""
         self.model = model
         self.max_batch_size = max_request_to_flush
@@ -73,7 +89,7 @@ class RequestProcessor:
 
     async def ensure_processing_loop_started(self):
         if not self.processing_loop_started:
+            print("starting processing_loop")
             self.processing_loop_task = asyncio.create_task(self.processing_loop())
             self.processing_loop_started = True
 
@@ -83,12 +99,16 @@
         start_time = asyncio.get_event_loop().time()
 
         while len(requests) < self.max_batch_size:
+            timeout = self.accumulation_timeout - (
+                asyncio.get_event_loop().time() - start_time
+            )
             if timeout <= 0:
                 break
 
             try:
+                req_data, req_type, req_id = await asyncio.wait_for(
+                    self.queue.get(), timeout=timeout
+                )
                 requests.append(req_data)
                 request_types.append(req_type)
                 request_ids.append(req_id)
@@ -96,15 +116,27 @@
                 break
 
         if requests:
+            await self.process_requests_by_type(
+                requests, request_types, request_ids
+            )
 
     async def process_requests_by_type(self, requests, request_types, request_ids):
         tasks = []
+        for request_data, request_type, request_id in zip(
+            requests, request_types, request_ids
+        ):
+            if request_type == "embed":
+                task = asyncio.create_task(
+                    self.run_with_semaphore(
+                        self.model.embed, request_data.sentences, request_id
+                    )
+                )
             else:  # 'rerank'
+                task = asyncio.create_task(
+                    self.run_with_semaphore(
+                        self.model.rerank, request_data.sentence_pairs, request_id
+                    )
+                )
             tasks.append(task)
         await asyncio.gather(*tasks)
 
@@ -112,14 +144,20 @@
         async with self.gpu_lock:  # Wait for sem
             future = self.executor.submit(func, data)
             try:
+                result = await asyncio.wait_for(
+                    asyncio.wrap_future(future), timeout=gpu_time_out
+                )
                 self.response_futures[request_id].set_result(result)
             except asyncio.TimeoutError:
+                self.response_futures[request_id].set_exception(
+                    TimeoutError("GPU processing timeout")
+                )
             except Exception as e:
                 self.response_futures[request_id].set_exception(e)
 
+    async def process_request(
+        self, request_data: Union[EmbedRequest, RerankRequest], request_type: str
+    ):
         try:
             await self.ensure_processing_loop_started()
             request_id = str(uuid4())
@@ -129,6 +167,7 @@ class RequestProcessor:
         except Exception as e:
             raise HTTPException(status_code=500, detail=f"Internal Server Error {e}")
 
+
 app = FastAPI(
     title="baai m3, serving embed and rerank",
     description="Swagger UI at https://mikeee-baai-m3.hf.space/docs",
@@ -136,8 +175,10 @@ app = FastAPI(
 )
 
 # Initialize the model and request processor
+model = m3Wrapper("BAAI/bge-m3")
+processor = RequestProcessor(
+    model, accumulation_timeout=request_flush_timeout, max_request_to_flush=max_request
+)
 
 # Adding a middleware returning a 504 error if the request processing time is above a certain threshold
 @app.middleware("http")
@@ -148,25 +189,34 @@ async def timeout_middleware(request: Request, call_next):
 
     except asyncio.TimeoutError:
         process_time = time.time() - start_time
+        return JSONResponse(
+            {
+                "detail": "Request processing time exceeded limit",
+                "processing_time": process_time,
+            },
+            status_code=HTTP_504_GATEWAY_TIMEOUT,
+        )
+
 
 @app.get("/")
 async def landing():
     """Define landing page."""
     return "Swagger UI at https://mikeee-baai-m3.hf.space/docs"
 
+
 @app.post("/embeddings/", response_model=EmbedResponse)
 async def get_embeddings(request: EmbedRequest):
+    embeddings = await processor.process_request(request, "embed")
    return EmbedResponse(embeddings=embeddings)
 
+
 @app.post("/rerank/", response_model=RerankResponse)
 async def rerank(request: RerankRequest):
+    scores = await processor.process_request(request, "rerank")
    return RerankResponse(scores=scores)
 
+
 if __name__ == "__main__":
    import uvicorn
+
+    uvicorn.run(app, host="0.0.0.0", port=port)
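For a quick check of the two endpoints, a minimal client sketch (assumes the server is running locally on port 7860 and that the requests package is installed; not part of this commit):

import requests

BASE = "http://localhost:7860"

# embed a small batch of sentences
resp = requests.post(f"{BASE}/embeddings/", json={"sentences": ["hello world", "how are you"]})
print(resp.json()["embeddings"][0][:4])  # first few dims of the first vector

# score (query, passage) pairs with the re-ranker
resp = requests.post(
    f"{BASE}/rerank/",
    json={"sentence_pairs": [["what is bge-m3?", "BGE-M3 is a multilingual embedding model."]]},
)
print(resp.json()["scores"])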