# Author: Felix Marty — commit 590064e ("add timeout"), file size 4.04 kB
# (header recovered from scraped HuggingFace page chrome: "raw", "history blame")
import json

# Project-local configuration: endpoint addresses, request headers, and the
# number of requests fired in spam mode.
from datasets import load_dataset
from defaults import (
    ADDRESS_BETTERTRANSFORMER,
    ADDRESS_VANILLA,
    HEADERS,
    SPAM_N_REQUESTS,
)

# FuturesSession subclass that records per-response elapsed time (see utils).
from utils import ElapsedFuturesSession

# SST-2 validation split, used as the pool of input sentences for spam mode.
data = load_dataset("glue", "sst2", split="validation")
# Report template for a single request; slots {0}-{4} are filled positionally
# by get_message_single. The string bytes are user-facing — do not edit.
RETURN_MESSAGE_SINGLE = """
Inference statistics:
* Response status: {0}
* Prediction: {1}
* Inference latency (preprocessing/forward/postprocessing): {2} ms
* Peak GPU memory usage: {3} MB
* End-to-end latency (communication + pre/forward/post): {4} ms
* Padding ratio: 0.0 %
"""
# Report template for spam mode. SPAM_N_REQUESTS is baked in once at import
# time; the escaped {{0}}-{{4}} slots survive as literal {0}-{4} placeholders
# that get_message_spam fills via str.format.
RETURN_MESSAGE_SPAM = f"""
Processing {SPAM_N_REQUESTS} inputs sent asynchronously. Grab a coffee.
Inference statistics:
* Promise resolution time: {{0}} ms
* Mean inference latency (preprocessing/forward/postprocessing): {{1}} ms
* Mean peak GPU memory: {{2}} MB
* Mean padding ratio: {{3}} %
* Mean sequence length: {{4}} tokens
"""
def get_message_single(
    status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency, **kwargs
):
    """Render the single-request inference report.

    Extra keyword arguments are accepted and ignored so callers can splat a
    stats dict directly.
    """
    fields = (status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency)
    return RETURN_MESSAGE_SINGLE.format(*fields)
def get_message_spam(
    resolution_time,
    mean_inference_latency,
    mean_peak_gpu_memory,
    mean_padding_ratio,
    mean_sequence_length,
    **kwargs,
):
    """Render the spam-mode aggregate inference report.

    Extra keyword arguments are accepted and ignored so callers can splat a
    stats dict directly.
    """
    fields = (
        resolution_time,
        mean_inference_latency,
        mean_peak_gpu_memory,
        mean_padding_ratio,
        mean_sequence_length,
    )
    return RETURN_MESSAGE_SPAM.format(*fields)
# Shared async HTTP session; .post returns a future whose response carries
# an `elapsed` attribute (wall-clock ms, per utils.ElapsedFuturesSession).
SESSION = ElapsedFuturesSession()
def send_single(input_model_vanilla, address: str):
    """POST a single input to `address` and return a formatted stats report.

    Returns the stringified exception instead of a report if the request
    fails or times out (10 s budget).
    """
    assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]

    payload = input_model_vanilla.encode("utf-8")
    # should not take more than 10 s, so timeout if that's the case
    promise = SESSION.post(address, headers=HEADERS, data=payload, timeout=10)

    try:
        response = promise.result()  # resolve ASAP
    except Exception as e:
        return f"{e}"

    # Server replies with a JSON list: [prediction, inference latency, peak GPU memory]
    prediction, inf_latency, peak_gpu_memory = json.loads(response.text)[:3]

    return get_message_single(
        response.status_code,
        prediction,
        inf_latency,
        peak_gpu_memory,
        response.elapsed,
    )
def send_spam(address: str):
    """Fire SPAM_N_REQUESTS asynchronous POSTs at `address` and aggregate stats.

    Samples SPAM_N_REQUESTS sentences from the SST-2 validation split, sends
    them all concurrently, then resolves each future and accumulates latency,
    GPU memory, padding, and sequence-length statistics.

    Returns the formatted spam report string, or the stringified exception of
    the first request that fails/times out (15 s budget per request).
    """
    assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]
    # data = "this is positive lol" #TODO: use dynamic data with padding
    assert SPAM_N_REQUESTS <= len(data)

    inp = data.shuffle().select(range(SPAM_N_REQUESTS))

    resolution_time = 0
    mean_inference_latency = 0
    mean_peak_gpu_memory = 0
    n_pads = 0
    n_elems = 0
    sequence_length = 0

    # Fire all requests first so they run concurrently ...
    promises = []
    for i in range(SPAM_N_REQUESTS):
        input_data = inp[i]["sentence"].encode("utf-8")
        # should not take more than 15 s, so timeout if that's the case
        promises.append(
            SESSION.post(address, headers=HEADERS, data=input_data, timeout=15)
        )

    # ... then resolve them one by one and aggregate.
    for promise in promises:
        try:
            response = promise.result()  # resolve ASAP
        except Exception as e:
            return f"{e}"
        # BUGFIX: the original called promise.result() a second time here;
        # the call was redundant (futures cache their result) and removed.
        # Server replies with a JSON list; indices 1-5 are the stats below.
        response_text = json.loads(response.text)
        # Resolution time is the slowest end-to-end response in the batch.
        resolution_time = max(resolution_time, response.elapsed)
        mean_inference_latency += response_text[1]
        mean_peak_gpu_memory += response_text[2]
        n_pads += response_text[3]
        n_elems += response_text[4]
        sequence_length += response_text[5]

    mean_padding_ratio = f"{n_pads / n_elems * 100:.2f}"
    # Rounded to 2 decimals for consistency with the other reported means.
    mean_sequence_length = round(sequence_length / SPAM_N_REQUESTS, 2)

    resolution_time = round(resolution_time, 2)
    mean_inference_latency = round(mean_inference_latency / SPAM_N_REQUESTS, 2)
    mean_peak_gpu_memory = round(mean_peak_gpu_memory / SPAM_N_REQUESTS, 2)

    return get_message_spam(
        resolution_time,
        mean_inference_latency,
        mean_peak_gpu_memory,
        mean_padding_ratio,
        mean_sequence_length,
    )