Spaces:
Running
Running
import json | |
from datasets import load_dataset | |
from defaults import ( | |
ADDRESS_BETTERTRANSFORMER, | |
ADDRESS_VANILLA, | |
HEADERS, | |
SPAM_N_REQUESTS, | |
) | |
from utils import ElapsedFuturesSession | |
# Validation split of GLUE SST-2; `send_spam` draws the "sentence" field of
# randomly selected rows from it.
data = load_dataset("glue", "sst2", split="validation")

# Report template for one request. The positional placeholders are filled by
# `get_message_single` in this order: status, prediction, inference latency,
# peak GPU memory, end-to-end latency. The padding ratio line is a fixed text.
RETURN_MESSAGE_SINGLE = """
Inference statistics:
* Response status: {0}
* Prediction: {1}
* Inference latency (preprocessing/forward/postprocessing): {2} ms
* Peak GPU memory usage: {3} MB
* End-to-end latency (communication + pre/forward/post): {4} ms
* Padding ratio: 0.0 %
"""

# Report template for a batch of requests. SPAM_N_REQUESTS is interpolated
# once, here; the doubled braces survive as `{0}`..`{4}` placeholders that
# `get_message_spam` fills later through `str.format`.
RETURN_MESSAGE_SPAM = f"""
Processing {SPAM_N_REQUESTS} inputs sent asynchronously. Grab a coffee.
Inference statistics:
* Promise resolution time: {{0}} ms
* Mean inference latency (preprocessing/forward/postprocessing): {{1}} ms
* Mean peak GPU memory: {{2}} MB
* Mean padding ratio: {{3}} %
* Mean sequence length: {{4}} tokens
"""
def get_message_single(
    status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency, **kwargs
):
    """Render the statistics report for a single inference request.

    The five positional values are substituted, in order, into the
    placeholders of RETURN_MESSAGE_SINGLE. Any extra keyword arguments are
    accepted and ignored.

    Returns:
        The formatted report string.
    """
    fields = (status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency)
    return RETURN_MESSAGE_SINGLE.format(*fields)
def get_message_spam(
    resolution_time,
    mean_inference_latency,
    mean_peak_gpu_memory,
    mean_padding_ratio,
    mean_sequence_length,
    **kwargs,
):
    """Render the statistics report for a batch of asynchronous requests.

    The five positional values are substituted, in order, into the
    placeholders of RETURN_MESSAGE_SPAM. Any extra keyword arguments are
    accepted and ignored.

    Returns:
        The formatted report string.
    """
    fields = (
        resolution_time,
        mean_inference_latency,
        mean_peak_gpu_memory,
        mean_padding_ratio,
        mean_sequence_length,
    )
    return RETURN_MESSAGE_SPAM.format(*fields)
# Single module-level session reused by `send_single` and `send_spam`; its
# `post` hands back a promise that is resolved with `.result()`.
SESSION = ElapsedFuturesSession()
def send_single(input_model_vanilla, address: str):
    """Send one input to an inference endpoint and report its statistics.

    Args:
        input_model_vanilla: Text to run inference on; it is UTF-8 encoded
            before being posted.
        address: Endpoint to query. Must be ADDRESS_VANILLA or
            ADDRESS_BETTERTRANSFORMER.

    Returns:
        The report built by `get_message_single`, or a plain error message
        string when the request fails or the reply cannot be interpreted.

    Raises:
        AssertionError: If `address` is not one of the two known endpoints.
    """
    assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]

    # should not take more than 10 s, so timeout if that's the case
    promise = SESSION.post(
        address, headers=HEADERS, data=input_model_vanilla.encode("utf-8"), timeout=10
    )

    try:
        response = promise.result()  # resolve ASAP
    except Exception as e:
        # Network failures and timeouts are reported as text, not raised.
        return f"{e}"

    status = response.status_code

    # The body is expected to be a JSON sequence whose first three entries are
    # the prediction, the inference latency and the peak GPU memory. An error
    # reply (non-JSON body, or a JSON payload of another shape) is reported as
    # text like any other failure instead of escaping as an exception.
    try:
        payload = json.loads(response.text)
        prediction = payload[0]
        inf_latency = payload[1]
        peak_gpu_memory = payload[2]
    except (ValueError, LookupError, TypeError) as e:
        return f"Unexpected response (status {status}): {e!r}"

    # NOTE(review): `elapsed` is presumably a number of milliseconds set by
    # ElapsedFuturesSession (the report labels it "ms") - confirm in utils.
    end_to_end_latency = response.elapsed

    return get_message_single(
        status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency
    )
def send_spam(address: str):
    """Fire SPAM_N_REQUESTS requests at an endpoint and report aggregates.

    Sentences are drawn at random from the module-level `data` dataset, all
    requests are posted up front, and the replies are then collected to
    compute the batch statistics.

    Args:
        address: Endpoint to query. Must be ADDRESS_VANILLA or
            ADDRESS_BETTERTRANSFORMER.

    Returns:
        The report built by `get_message_spam`, or the text of the first
        exception raised while resolving a request.

    Raises:
        AssertionError: If `address` is not one of the two known endpoints,
            or if SPAM_N_REQUESTS exceeds the size of `data`.
    """
    assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]
    assert SPAM_N_REQUESTS <= len(data)

    inp = data.shuffle().select(range(SPAM_N_REQUESTS))

    # Post everything before resolving anything, so the requests are in
    # flight together.
    # should not take more than 15 s, so timeout if that's the case
    promises = [
        SESSION.post(
            address,
            headers=HEADERS,
            data=inp[i]["sentence"].encode("utf-8"),
            timeout=15,
        )
        for i in range(SPAM_N_REQUESTS)
    ]

    resolution_time = 0
    total_inference_latency = 0
    total_peak_gpu_memory = 0
    n_pads = 0
    n_elems = 0
    total_sequence_length = 0

    for promise in promises:
        try:
            response = promise.result()  # resolve ASAP
        except Exception as e:
            # The first failure aborts the batch and is reported as text.
            return f"{e}"

        # Reply layout used below: [1] inference latency, [2] peak GPU
        # memory, [3] padding count, [4] element count, [5] sequence length.
        response_text = json.loads(response.text)

        # The slowest request determines when the whole batch is resolved.
        # NOTE(review): `elapsed` is presumably a number of milliseconds set
        # by ElapsedFuturesSession - confirm in utils.
        resolution_time = max(resolution_time, response.elapsed)

        total_inference_latency += response_text[1]
        total_peak_gpu_memory += response_text[2]
        n_pads += response_text[3]
        n_elems += response_text[4]
        total_sequence_length += response_text[5]

    def _mean(total):
        # An empty batch yields 0.0 rather than a ZeroDivisionError.
        return total / SPAM_N_REQUESTS if SPAM_N_REQUESTS else 0.0

    padding_ratio = n_pads / n_elems * 100 if n_elems else 0.0
    mean_padding_ratio = f"{padding_ratio:.2f}"
    mean_sequence_length = _mean(total_sequence_length)

    resolution_time = round(resolution_time, 2)
    mean_inference_latency = round(_mean(total_inference_latency), 2)
    mean_peak_gpu_memory = round(_mean(total_peak_gpu_memory), 2)

    return get_message_spam(
        resolution_time,
        mean_inference_latency,
        mean_peak_gpu_memory,
        mean_padding_ratio,
        mean_sequence_length,
    )