Felix Marty committed on
Commit
64721de
1 Parent(s): 92dfc12
Files changed (3) hide show
  1. app.py +35 -23
  2. backend.py +64 -27
  3. utils.py +9 -8
app.py CHANGED
@@ -1,28 +1,28 @@
1
  import gradio as gr
2
 
3
- from defaults import defaults_vanilla_single, defaults_bt_spam, defaults_bt_single, defaults_vanilla_spam
4
- from defaults import ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER
5
- from backend import send_single, send_spam, get_message_single, get_message_spam
 
 
6
 
7
  with gr.Blocks() as demo:
8
- gr.Markdown("""
 
9
  Let's try out TorchServe + BetterTransformer! This is some longer description This is some longer description This is some longer description")
10
 
11
  ## Inference using...
12
  """
13
  )
14
-
15
  with gr.Row():
16
  with gr.Column(scale=50):
17
  gr.Markdown("### Vanilla Transformers + TorchServe")
18
 
19
  address_input_vanilla = gr.Textbox(
20
- max_lines=1,
21
- label="ip vanilla",
22
- value=ADDRESS_VANILLA,
23
- visible=False
24
  )
25
-
26
  input_model_vanilla = gr.Textbox(
27
  max_lines=1,
28
  label="Text",
@@ -30,11 +30,19 @@ with gr.Blocks() as demo:
30
  )
31
 
32
  btn_single_vanilla = gr.Button("Send single text request")
33
- output_single_vanilla = gr.Markdown(label="Output single vanilla", value=get_message_single(**defaults_vanilla_single))
34
-
35
- btn_spam_vanilla = gr.Button("Spam text requests (from sst2 validation set)")
36
- output_spam_vanilla = gr.Markdown(label="Output spam vanilla", value=get_message_spam(**defaults_vanilla_spam))
37
-
 
 
 
 
 
 
 
 
38
  btn_single_vanilla.click(
39
  fn=send_single,
40
  inputs=[input_model_vanilla, address_input_vanilla],
@@ -53,9 +61,9 @@ with gr.Blocks() as demo:
53
  max_lines=1,
54
  label="ip bettertransformer",
55
  value=ADDRESS_BETTERTRANSFORMER,
56
- visible=False
57
  )
58
-
59
  input_model_bettertransformer = gr.Textbox(
60
  max_lines=1,
61
  label="Text",
@@ -63,17 +71,21 @@ with gr.Blocks() as demo:
63
  )
64
 
65
  btn_single_bt = gr.Button("Send single text request")
66
- output_single_bt = gr.Markdown(label="Output single bt", value=get_message_single(**defaults_bt_single))
67
-
 
 
68
  btn_spam_bt = gr.Button("Spam text requests (from sst2 validation set)")
69
- output_spam_bt = gr.Markdown(label="Output spam bt", value=get_message_spam(**defaults_bt_spam))
70
-
 
 
71
  btn_single_bt.click(
72
  fn=send_single,
73
  inputs=[input_model_bettertransformer, address_input_bettertransformer],
74
  outputs=output_single_bt,
75
  )
76
-
77
  btn_spam_bt.click(
78
  fn=send_spam,
79
  inputs=[address_input_bettertransformer],
@@ -81,4 +93,4 @@ with gr.Blocks() as demo:
81
  )
82
 
83
  demo.queue(concurrency_count=1)
84
- demo.launch()
 
1
  import gradio as gr
2
 
3
+ from backend import (get_message_single, get_message_spam, send_single,
4
+ send_spam)
5
+ from defaults import (ADDRESS_BETTERTRANSFORMER, ADDRESS_VANILLA,
6
+ defaults_bt_single, defaults_bt_spam,
7
+ defaults_vanilla_single, defaults_vanilla_spam)
8
 
9
  with gr.Blocks() as demo:
10
+ gr.Markdown(
11
+ """
12
  Let's try out TorchServe + BetterTransformer! This is some longer description This is some longer description This is some longer description")
13
 
14
  ## Inference using...
15
  """
16
  )
17
+
18
  with gr.Row():
19
  with gr.Column(scale=50):
20
  gr.Markdown("### Vanilla Transformers + TorchServe")
21
 
22
  address_input_vanilla = gr.Textbox(
23
+ max_lines=1, label="ip vanilla", value=ADDRESS_VANILLA, visible=False
 
 
 
24
  )
25
+
26
  input_model_vanilla = gr.Textbox(
27
  max_lines=1,
28
  label="Text",
 
30
  )
31
 
32
  btn_single_vanilla = gr.Button("Send single text request")
33
+ output_single_vanilla = gr.Markdown(
34
+ label="Output single vanilla",
35
+ value=get_message_single(**defaults_vanilla_single),
36
+ )
37
+
38
+ btn_spam_vanilla = gr.Button(
39
+ "Spam text requests (from sst2 validation set)"
40
+ )
41
+ output_spam_vanilla = gr.Markdown(
42
+ label="Output spam vanilla",
43
+ value=get_message_spam(**defaults_vanilla_spam),
44
+ )
45
+
46
  btn_single_vanilla.click(
47
  fn=send_single,
48
  inputs=[input_model_vanilla, address_input_vanilla],
 
61
  max_lines=1,
62
  label="ip bettertransformer",
63
  value=ADDRESS_BETTERTRANSFORMER,
64
+ visible=False,
65
  )
66
+
67
  input_model_bettertransformer = gr.Textbox(
68
  max_lines=1,
69
  label="Text",
 
71
  )
72
 
73
  btn_single_bt = gr.Button("Send single text request")
74
+ output_single_bt = gr.Markdown(
75
+ label="Output single bt", value=get_message_single(**defaults_bt_single)
76
+ )
77
+
78
  btn_spam_bt = gr.Button("Spam text requests (from sst2 validation set)")
79
+ output_spam_bt = gr.Markdown(
80
+ label="Output spam bt", value=get_message_spam(**defaults_bt_spam)
81
+ )
82
+
83
  btn_single_bt.click(
84
  fn=send_single,
85
  inputs=[input_model_bettertransformer, address_input_bettertransformer],
86
  outputs=output_single_bt,
87
  )
88
+
89
  btn_spam_bt.click(
90
  fn=send_spam,
91
  inputs=[address_input_bettertransformer],
 
93
  )
94
 
95
  demo.queue(concurrency_count=1)
96
+ demo.launch()
backend.py CHANGED
@@ -1,10 +1,11 @@
1
  import json
2
 
3
- from .defaults import SPAM_N_REQUESTS, ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER, HEADERS
4
- from .utils import ElapsedFuturesSession
5
-
6
  from datasets import load_dataset
7
 
 
 
 
 
8
  data = load_dataset("glue", "sst2", split="validation")
9
 
10
  RETURN_MESSAGE_SINGLE = """
@@ -18,8 +19,11 @@ Inference statistics:
18
  * Padding ratio: 0.0 %
19
  """
20
 
21
- RETURN_MESSAGE_SPAM = """
22
- Processing """ + f"{SPAM_N_REQUESTS}" + """ inputs sent asynchronously. Grab a coffee.
 
 
 
23
 
24
  Inference statistics:
25
 
@@ -29,37 +33,64 @@ Inference statistics:
29
  * Mean padding ratio: {3} %
30
  * Mean sequence length: {4} tokens
31
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
- def get_message_single(status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency, **kwargs):
34
- return RETURN_MESSAGE_SINGLE.format(status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency)
35
-
36
- def get_message_spam(resolution_time, mean_inference_latency, mean_peak_gpu_memory, mean_padding_ratio, mean_sequence_length, **kwargs):
37
- return RETURN_MESSAGE_SPAM.format(resolution_time, mean_inference_latency, mean_peak_gpu_memory, mean_padding_ratio, mean_sequence_length)
38
 
39
  SESSION = ElapsedFuturesSession()
40
 
 
41
  def send_single(input_model_vanilla, address: str):
42
  assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]
43
-
44
- promise = SESSION.post(address, headers=HEADERS, data=input_model_vanilla.encode("utf-8"))
 
 
45
 
46
  response = promise.result() # resolve immediately
47
 
48
  status = response.status_code
49
-
50
  response_text = json.loads(response.text)
51
  prediction = response_text[0]
52
  inf_latency = response_text[1]
53
  peak_gpu_memory = response_text[2]
54
  end_to_end_latency = response.elapsed
55
-
56
- return get_message_single(status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency)
 
 
 
57
 
58
  def send_spam(address: str):
59
  assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]
60
-
61
  # data = "this is positive lol" #TODO: use dynamic data with padding
62
-
63
  assert SPAM_N_REQUESTS <= len(data)
64
 
65
  inp = data.shuffle().select(range(SPAM_N_REQUESTS))
@@ -67,35 +98,41 @@ def send_spam(address: str):
67
  resolution_time = 0
68
  mean_inference_latency = 0
69
  mean_peak_gpu_memory = 0
70
-
71
  n_pads = 0
72
  n_elems = 0
73
  sequence_length = 0
74
 
75
  promises = []
76
-
77
  for i in range(SPAM_N_REQUESTS):
78
  input_data = inp[i]["sentence"].encode("utf-8")
79
  promises.append(SESSION.post(address, headers=HEADERS, data=input_data))
80
-
81
  for promise in promises:
82
  response = promise.result()
83
-
84
  response_text = json.loads(response.text)
85
-
86
  resolution_time = max(resolution_time, response.elapsed)
87
-
88
  mean_inference_latency += response_text[1]
89
  mean_peak_gpu_memory += response_text[2]
90
  n_pads += response_text[3]
91
  n_elems += response_text[4]
92
  sequence_length += response_text[5]
93
-
94
  mean_padding_ratio = f"{n_pads / n_elems * 100:.2f}"
95
  mean_sequence_length = sequence_length / SPAM_N_REQUESTS
96
-
97
  resolution_time = round(resolution_time, 2)
98
  mean_inference_latency = round(mean_inference_latency / SPAM_N_REQUESTS, 2)
99
  mean_peak_gpu_memory = round(mean_peak_gpu_memory / SPAM_N_REQUESTS, 2)
100
-
101
- return get_message_spam(resolution_time, mean_inference_latency, mean_peak_gpu_memory, mean_padding_ratio, mean_sequence_length)
 
 
 
 
 
 
 
1
  import json
2
 
 
 
 
3
  from datasets import load_dataset
4
 
5
+ from .defaults import (ADDRESS_BETTERTRANSFORMER, ADDRESS_VANILLA, HEADERS,
6
+ SPAM_N_REQUESTS)
7
+ from .utils import ElapsedFuturesSession
8
+
9
  data = load_dataset("glue", "sst2", split="validation")
10
 
11
  RETURN_MESSAGE_SINGLE = """
 
19
  * Padding ratio: 0.0 %
20
  """
21
 
22
+ RETURN_MESSAGE_SPAM = (
23
+ """
24
+ Processing """
25
+ + f"{SPAM_N_REQUESTS}"
26
+ + """ inputs sent asynchronously. Grab a coffee.
27
 
28
  Inference statistics:
29
 
 
33
  * Mean padding ratio: {3} %
34
  * Mean sequence length: {4} tokens
35
  """
36
+ )
37
+
38
+
39
+ def get_message_single(
40
+ status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency, **kwargs
41
+ ):
42
+ return RETURN_MESSAGE_SINGLE.format(
43
+ status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency
44
+ )
45
+
46
+
47
+ def get_message_spam(
48
+ resolution_time,
49
+ mean_inference_latency,
50
+ mean_peak_gpu_memory,
51
+ mean_padding_ratio,
52
+ mean_sequence_length,
53
+ **kwargs,
54
+ ):
55
+ return RETURN_MESSAGE_SPAM.format(
56
+ resolution_time,
57
+ mean_inference_latency,
58
+ mean_peak_gpu_memory,
59
+ mean_padding_ratio,
60
+ mean_sequence_length,
61
+ )
62
 
 
 
 
 
 
63
 
64
  SESSION = ElapsedFuturesSession()
65
 
66
+
67
  def send_single(input_model_vanilla, address: str):
68
  assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]
69
+
70
+ promise = SESSION.post(
71
+ address, headers=HEADERS, data=input_model_vanilla.encode("utf-8")
72
+ )
73
 
74
  response = promise.result() # resolve immediately
75
 
76
  status = response.status_code
77
+
78
  response_text = json.loads(response.text)
79
  prediction = response_text[0]
80
  inf_latency = response_text[1]
81
  peak_gpu_memory = response_text[2]
82
  end_to_end_latency = response.elapsed
83
+
84
+ return get_message_single(
85
+ status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency
86
+ )
87
+
88
 
89
  def send_spam(address: str):
90
  assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]
91
+
92
  # data = "this is positive lol" #TODO: use dynamic data with padding
93
+
94
  assert SPAM_N_REQUESTS <= len(data)
95
 
96
  inp = data.shuffle().select(range(SPAM_N_REQUESTS))
 
98
  resolution_time = 0
99
  mean_inference_latency = 0
100
  mean_peak_gpu_memory = 0
101
+
102
  n_pads = 0
103
  n_elems = 0
104
  sequence_length = 0
105
 
106
  promises = []
107
+
108
  for i in range(SPAM_N_REQUESTS):
109
  input_data = inp[i]["sentence"].encode("utf-8")
110
  promises.append(SESSION.post(address, headers=HEADERS, data=input_data))
111
+
112
  for promise in promises:
113
  response = promise.result()
114
+
115
  response_text = json.loads(response.text)
116
+
117
  resolution_time = max(resolution_time, response.elapsed)
118
+
119
  mean_inference_latency += response_text[1]
120
  mean_peak_gpu_memory += response_text[2]
121
  n_pads += response_text[3]
122
  n_elems += response_text[4]
123
  sequence_length += response_text[5]
124
+
125
  mean_padding_ratio = f"{n_pads / n_elems * 100:.2f}"
126
  mean_sequence_length = sequence_length / SPAM_N_REQUESTS
127
+
128
  resolution_time = round(resolution_time, 2)
129
  mean_inference_latency = round(mean_inference_latency / SPAM_N_REQUESTS, 2)
130
  mean_peak_gpu_memory = round(mean_peak_gpu_memory / SPAM_N_REQUESTS, 2)
131
+
132
+ return get_message_spam(
133
+ resolution_time,
134
+ mean_inference_latency,
135
+ mean_peak_gpu_memory,
136
+ mean_padding_ratio,
137
+ mean_sequence_length,
138
+ )
utils.py CHANGED
@@ -1,9 +1,9 @@
 
 
1
  from requests_futures.sessions import FuturesSession
2
 
3
- import time
4
 
5
  class ElapsedFuturesSession(FuturesSession):
6
-
7
  def request(self, method, url, hooks=None, *args, **kwargs):
8
  start = time.time()
9
  if hooks is None:
@@ -13,13 +13,14 @@ class ElapsedFuturesSession(FuturesSession):
13
  r.elapsed = round((time.time() - start) * 1000, 2)
14
 
15
  try:
16
- if isinstance(hooks['response'], (list, tuple)):
17
  # needs to be first so we don't time other hooks execution
18
- hooks['response'].insert(0, timing)
19
  else:
20
- hooks['response'] = [timing, hooks['response']]
21
  except KeyError:
22
- hooks['response'] = timing
23
 
24
- return super(ElapsedFuturesSession, self) \
25
- .request(method, url, hooks=hooks, *args, **kwargs)
 
 
1
+ import time
2
+
3
  from requests_futures.sessions import FuturesSession
4
 
 
5
 
6
  class ElapsedFuturesSession(FuturesSession):
 
7
  def request(self, method, url, hooks=None, *args, **kwargs):
8
  start = time.time()
9
  if hooks is None:
 
13
  r.elapsed = round((time.time() - start) * 1000, 2)
14
 
15
  try:
16
+ if isinstance(hooks["response"], (list, tuple)):
17
  # needs to be first so we don't time other hooks execution
18
+ hooks["response"].insert(0, timing)
19
  else:
20
+ hooks["response"] = [timing, hooks["response"]]
21
  except KeyError:
22
+ hooks["response"] = timing
23
 
24
+ return super(ElapsedFuturesSession, self).request(
25
+ method, url, hooks=hooks, *args, **kwargs
26
+ )