Felix Marty committed on
Commit
bf38ec8
1 Parent(s): d10c2a9
Files changed (3) hide show
  1. app.py +111 -79
  2. backend.py +28 -24
  3. defaults.py +5 -3
app.py CHANGED
@@ -1,6 +1,8 @@
1
  import gradio as gr
2
 
3
- from backend import get_message_single, get_message_spam, send_single, send_spam
 
 
4
  from defaults import (
5
  ADDRESS_BETTERTRANSFORMER,
6
  ADDRESS_VANILLA,
@@ -8,8 +10,60 @@ from defaults import (
8
  defaults_bt_spam,
9
  defaults_vanilla_single,
10
  defaults_vanilla_spam,
 
11
  )
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  TTILE_IMAGE = """
14
  <div
15
  style="
@@ -34,7 +88,7 @@ TITLE = """
34
  font-size: 2.2rem;
35
  "
36
  >
37
- <h1 style="font-weight: 700; margin-bottom: 10px; margin-top: 10px;">
38
  Speed up your inference and support more workload with PyTorch's BetterTransformer 🤗
39
  </h1>
40
  </div>
@@ -67,98 +121,76 @@ with gr.Blocks() as demo:
67
  with gr.Row():
68
  with gr.Column(scale=50):
69
  gr.Markdown("### Vanilla Transformers + TorchServe")
 
 
 
 
 
 
70
 
71
- address_input_vanilla = gr.Textbox(
72
- max_lines=1, label="ip vanilla", value=ADDRESS_VANILLA, visible=False
73
- )
 
 
 
74
 
75
- input_model_vanilla = gr.Textbox(
76
- max_lines=1,
77
- label="Text",
78
- value="Expectations were low, enjoyment was high",
79
- )
80
 
81
- btn_single_vanilla = gr.Button("Send single text request")
 
 
82
  output_single_vanilla = gr.Markdown(
83
  label="Output single vanilla",
84
  value=get_message_single(**defaults_vanilla_single),
85
  )
86
- with gr.Column():
87
- with gr.Column(scale=40):
88
- input_n_inputs_vanilla = gr.Textbox(
89
- max_lines=1,
90
- label="Number of inputs",
91
- value=8,
92
- )
93
- with gr.Column(scale=60):
94
- gr.Markdown("")
95
- btn_spam_vanilla = gr.Button(
96
- "Spam text requests (from sst2 validation set)"
97
- )
98
-
99
- output_spam_vanilla = gr.Markdown(
100
- label="Output spam vanilla",
101
- value=get_message_spam(**defaults_vanilla_spam),
102
- )
103
-
104
- btn_single_vanilla.click(
105
- fn=send_single,
106
- inputs=[input_model_vanilla, address_input_vanilla],
107
- outputs=output_single_vanilla,
108
- )
109
- btn_spam_vanilla.click(
110
- fn=send_spam,
111
- inputs=[address_input_vanilla],
112
- outputs=output_spam_vanilla,
113
- )
114
-
115
  with gr.Column(scale=50):
116
- gr.Markdown("### BetterTransformer + TorchServe")
117
-
118
- address_input_bettertransformer = gr.Textbox(
119
- max_lines=1,
120
- label="ip bettertransformer",
121
- value=ADDRESS_BETTERTRANSFORMER,
122
- visible=False,
123
- )
124
-
125
- input_model_bettertransformer = gr.Textbox(
126
- max_lines=1,
127
- label="Text",
128
- value="Expectations were low, enjoyment was high",
129
- )
130
-
131
- btn_single_bt = gr.Button("Send single text request")
132
  output_single_bt = gr.Markdown(
133
  label="Output single bt", value=get_message_single(**defaults_bt_single)
134
  )
135
 
136
- with gr.Row():
137
- with gr.Column(scale=40):
138
- input_n_inputs_bt = gr.Textbox(
139
- max_lines=1,
140
- label="Number of inputs",
141
- value=8,
142
- )
143
- with gr.Column(scale=60):
144
- gr.Markdown("")
145
- btn_spam_bt = gr.Button("Spam text requests (from sst2 validation set)")
146
-
147
- output_spam_bt = gr.Markdown(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  label="Output spam bt", value=get_message_spam(**defaults_bt_spam)
149
  )
150
 
151
- btn_single_bt.click(
152
- fn=send_single,
153
- inputs=[input_model_bettertransformer, address_input_bettertransformer],
154
- outputs=output_single_bt,
155
- )
156
 
157
- btn_spam_bt.click(
158
- fn=send_spam,
159
- inputs=[address_input_bettertransformer],
160
- outputs=output_spam_bt,
161
- )
162
 
163
  demo.queue(concurrency_count=1)
164
- demo.launch()
 
1
  import gradio as gr
2
 
3
+ import json
4
+ import math
5
+ from backend import get_message_single, get_message_spam, send_single, send_spam, tokenizer
6
  from defaults import (
7
  ADDRESS_BETTERTRANSFORMER,
8
  ADDRESS_VANILLA,
 
10
  defaults_bt_spam,
11
  defaults_vanilla_single,
12
  defaults_vanilla_spam,
13
+ BATCH_SIZE,
14
  )
15
 
16
+ import datasets
17
+ import torch
18
+
19
def dispatch_single(input_model_single, address_input_vanilla, address_input_bettertransformer):
    """Send one text to both serving endpoints and return both reports.

    Args:
        input_model_single: Text forwarded unchanged to ``send_single``.
        address_input_vanilla: Address of the vanilla endpoint.
        address_input_bettertransformer: Address of the BetterTransformer
            endpoint.

    Returns:
        A 2-tuple ``(vanilla_report, bettertransformer_report)`` holding
        whatever ``send_single`` returned for each address, in that order.
    """
    # The vanilla endpoint is queried first, then BetterTransformer; the
    # requests are issued one after the other, not concurrently.
    endpoints = (address_input_vanilla, address_input_bettertransformer)
    return tuple(send_single(input_model_single, endpoint) for endpoint in endpoints)
24
+
25
def dispatch_spam(input_n_spam, address_input_vanilla, address_input_bettertransformer):
    """Send the same random sample of dataset sentences to both endpoints.

    Args:
        input_n_spam: Number of sentences to send; converted with ``int``.
        address_input_vanilla: Address of the vanilla endpoint.
        address_input_bettertransformer: Address of the BetterTransformer
            endpoint.

    Returns:
        A 2-tuple ``(vanilla_report, bettertransformer_report)`` holding
        whatever ``send_spam`` returned for each address, in that order.

    Raises:
        ValueError: If the requested count is below 1 or exceeds the number
            of rows in ``data``.
    """
    input_n_spam = int(input_n_spam)

    # NOTE(review): `data` must exist at module level. Presumably it is the
    # SST-2 validation split that backend.py used to load -- confirm that it
    # is defined or imported in this module.
    n_available = len(data)

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    # Zero is rejected as well: send_spam averages over the number of inputs
    # and would divide by zero on an empty selection.
    if not 1 <= input_n_spam <= n_available:
        raise ValueError(
            f"Number of inputs must be between 1 and {n_available}, got {input_n_spam}."
        )

    # Both endpoints receive the exact same shuffled subset so that their
    # statistics are comparable.
    inp = data.shuffle().select(range(input_n_spam))

    result_vanilla = send_spam(inp, address_input_vanilla)
    result_bettertransformer = send_spam(inp, address_input_bettertransformer)

    return result_vanilla, result_bettertransformer
35
+
36
def _make_padded_sample(sequence_length, padding_ratio, vocab_size):
    """Build one random token sequence, padded at the end, and its attention mask."""
    # Special-token ids; presumably [CLS] / [SEP] / [PAD] of the BERT uncased
    # vocabulary used by MODEL_NAME -- confirm against the tokenizer.
    cls_id, sep_id, pad_id = 101, 102, 0

    # At least one pad is always kept (unchanged behaviour). At most
    # sequence_length - 2, so that [CLS] and [SEP] never collide and the
    # [SEP] index never falls outside the tensor.
    n_pads = min(max(int(padding_ratio * sequence_length), 1), sequence_length - 2)
    n_real = sequence_length - n_pads

    # Random ids in [1, vocab_size - 1]; id 0 is reserved for the padding.
    input_ids = torch.randint(vocab_size - 1, (sequence_length,)) + 1
    input_ids[0] = cls_id
    input_ids[n_real - 1] = sep_id
    input_ids[n_real:] = pad_id

    attention_mask = torch.zeros((sequence_length,), dtype=torch.int64)
    attention_mask[:n_real] = 1

    return input_ids, attention_mask


def dispatch_spam_artif(
    input_n_spam_artif,
    sequence_length,
    padding_ratio,
    address_input_vanilla,
    address_input_bettertransformer,
):
    """Send identical artificial, pre-tokenized requests to both endpoints.

    One random token sequence is generated, padded at the end according to
    ``padding_ratio``, serialized to JSON and repeated ``input_n_spam_artif``
    times. The same batch is then sent to both endpoints.

    Args:
        input_n_spam_artif: Number of requests to send; converted with ``int``.
        sequence_length: Total length in tokens, padding included; converted
            with ``int``.
        padding_ratio: Fraction of the sequence to fill with padding, in
            ``[0, 1]``. The resulting pad count is clamped to
            ``[1, sequence_length - 2]``.
        address_input_vanilla: Address of the vanilla endpoint.
        address_input_bettertransformer: Address of the BetterTransformer
            endpoint.

    Returns:
        A 2-tuple ``(vanilla_report, bettertransformer_report)`` holding
        whatever ``send_spam`` returned for each address, in that order.

    Raises:
        ValueError: If the request count is below 1, the sequence length is
            below 3, or the padding ratio is outside ``[0, 1]``.
    """
    sequence_length = int(sequence_length)
    input_n_spam_artif = int(input_n_spam_artif)

    # send_spam averages over the number of inputs, so an empty batch would
    # end in a division by zero.
    if input_n_spam_artif < 1:
        raise ValueError(
            f"Number of inputs must be at least 1, got {input_n_spam_artif}."
        )
    # Room is needed for the two special tokens plus at least one pad.
    if sequence_length < 3:
        raise ValueError(
            f"Sequence length must be at least 3 tokens, got {sequence_length}."
        )
    # Written so that NaN is rejected too.
    if not 0 <= padding_ratio <= 1:
        raise ValueError(
            f"Padding ratio must be between 0 and 1, got {padding_ratio}."
        )

    inp_tokens, attention_mask = _make_padded_sample(
        sequence_length, padding_ratio, tokenizer.vocab_size
    )

    # The payload travels as a JSON string in the "sentence" column.
    str_input = json.dumps({
        "input_ids": inp_tokens.cpu().tolist(),
        "attention_mask": attention_mask.cpu().tolist(),
        "pre_tokenized": True,
    })

    # Every request carries the same sample.
    input_dataset = datasets.Dataset.from_dict(
        {"sentence": [str_input] * input_n_spam_artif}
    )

    result_vanilla = send_spam(input_dataset, address_input_vanilla)
    result_bettertransformer = send_spam(input_dataset, address_input_bettertransformer)

    return result_vanilla, result_bettertransformer
66
+
67
  TTILE_IMAGE = """
68
  <div
69
  style="
 
88
  font-size: 2.2rem;
89
  "
90
  >
91
+ <h1 style="font-weight: 500; margin-bottom: 10px; margin-top: 10px;">
92
  Speed up your inference and support more workload with PyTorch's BetterTransformer 🤗
93
  </h1>
94
  </div>
 
121
  with gr.Row():
122
  with gr.Column(scale=50):
123
  gr.Markdown("### Vanilla Transformers + TorchServe")
124
+ with gr.Column(scale=50):
125
+ gr.Markdown("### BetterTransformer + TorchServe")
126
+
127
+ address_input_vanilla = gr.Textbox(
128
+ max_lines=1, label="ip vanilla", value=ADDRESS_VANILLA, visible=False
129
+ )
130
 
131
+ address_input_bettertransformer = gr.Textbox(
132
+ max_lines=1,
133
+ label="ip bettertransformer",
134
+ value=ADDRESS_BETTERTRANSFORMER,
135
+ visible=False,
136
+ )
137
 
138
+ input_model_single = gr.Textbox(
139
+ max_lines=1,
140
+ label="Text",
141
+ value="Expectations were low, enjoyment was high",
142
+ )
143
 
144
+ btn_single = gr.Button("Send single text request")
145
+ with gr.Row():
146
+ with gr.Column(scale=50):
147
  output_single_vanilla = gr.Markdown(
148
  label="Output single vanilla",
149
  value=get_message_single(**defaults_vanilla_single),
150
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  with gr.Column(scale=50):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  output_single_bt = gr.Markdown(
153
  label="Output single bt", value=get_message_single(**defaults_bt_single)
154
  )
155
 
156
+ btn_single.click(
157
+ fn=dispatch_single,
158
+ inputs=[input_model_single, address_input_vanilla, address_input_bettertransformer],
159
+ outputs=[output_single_vanilla, output_single_bt],
160
+ )
161
+
162
+ input_n_spam_artif = gr.Number(
163
+ label="Number of inputs to send",
164
+ value=8,
165
+ )
166
+ sequence_length = gr.Number(
167
+ label="Sequence length (in tokens)",
168
+ value=128,
169
+ )
170
+ padding_ratio = gr.Number(
171
+ label="Padding ratio",
172
+ value=0.5,
173
+ )
174
+ btn_spam_artif = gr.Button(
175
+ "Spam text requests (using artificial data)"
176
+ )
177
+ with gr.Row():
178
+ with gr.Column(scale=50):
179
+ output_spam_vanilla_artif = gr.Markdown(
180
+ label="Output spam vanilla",
181
+ value=get_message_spam(**defaults_vanilla_spam),
182
+ )
183
+ with gr.Column(scale=50):
184
+ output_spam_bt_artif = gr.Markdown(
185
  label="Output spam bt", value=get_message_spam(**defaults_bt_spam)
186
  )
187
 
188
+ btn_spam_artif.click(
189
+ fn=dispatch_spam_artif,
190
+ inputs=[input_n_spam_artif, sequence_length, padding_ratio, address_input_vanilla, address_input_bettertransformer],
191
+ outputs=[output_spam_vanilla_artif, output_spam_bt_artif],
192
+ )
193
 
 
 
 
 
 
194
 
195
  demo.queue(concurrency_count=1)
196
+ demo.launch()
backend.py CHANGED
@@ -1,16 +1,16 @@
1
  import json
2
 
3
- from datasets import load_dataset
4
-
5
  from defaults import (
6
  ADDRESS_BETTERTRANSFORMER,
7
  ADDRESS_VANILLA,
8
  HEADERS,
9
- SPAM_N_REQUESTS,
10
  )
11
  from utils import ElapsedFuturesSession
12
 
13
- data = load_dataset("glue", "sst2", split="validation")
 
 
14
 
15
  RETURN_MESSAGE_SINGLE = """
16
  Inference statistics:
@@ -26,19 +26,20 @@ Inference statistics:
26
  RETURN_MESSAGE_SPAM = (
27
  """
28
  Processing """
29
- + f"{SPAM_N_REQUESTS}"
30
- + """ inputs sent asynchronously. Grab a coffee.
31
 
32
  Inference statistics:
33
 
34
- * Promise resolution time: {0} ms
35
  * Mean inference latency (preprocessing/forward/postprocessing): {1} ms
36
  * Mean peak GPU memory: {2} MB
37
  * Mean padding ratio: {3} %
38
  * Mean sequence length: {4} tokens
 
39
  """
40
  )
41
 
 
42
 
43
  def get_message_single(
44
  status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency, **kwargs
@@ -49,25 +50,26 @@ def get_message_single(
49
 
50
 
51
  def get_message_spam(
52
- resolution_time,
53
  mean_inference_latency,
54
  mean_peak_gpu_memory,
55
  mean_padding_ratio,
56
  mean_sequence_length,
 
57
  **kwargs,
58
  ):
59
  return RETURN_MESSAGE_SPAM.format(
60
- resolution_time,
61
  mean_inference_latency,
62
  mean_peak_gpu_memory,
63
  mean_padding_ratio,
64
  mean_sequence_length,
 
65
  )
66
 
67
 
68
  SESSION = ElapsedFuturesSession()
69
 
70
-
71
  def send_single(input_model_vanilla, address: str):
72
  assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]
73
 
@@ -94,26 +96,24 @@ def send_single(input_model_vanilla, address: str):
94
  )
95
 
96
 
97
- def send_spam(address: str):
98
  assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]
99
 
100
  # data = "this is positive lol" #TODO: use dynamic data with padding
101
-
102
- assert SPAM_N_REQUESTS <= len(data)
103
-
104
- inp = data.shuffle().select(range(SPAM_N_REQUESTS))
105
-
106
- resolution_time = 0
107
  mean_inference_latency = 0
108
  mean_peak_gpu_memory = 0
109
 
110
  n_pads = 0
111
  n_elems = 0
112
  sequence_length = 0
 
113
 
114
  promises = []
115
 
116
- for i in range(SPAM_N_REQUESTS):
 
 
117
  input_data = inp[i]["sentence"].encode("utf-8")
118
 
119
  # should not take more than 15 s, so timeout if that's the case
@@ -131,25 +131,29 @@ def send_spam(address: str):
131
 
132
  response_text = json.loads(response.text)
133
 
134
- resolution_time = max(resolution_time, response.elapsed)
135
 
136
  mean_inference_latency += response_text[1]
137
  mean_peak_gpu_memory += response_text[2]
138
  n_pads += response_text[3]
139
  n_elems += response_text[4]
140
  sequence_length += response_text[5]
 
141
 
 
142
  mean_padding_ratio = f"{n_pads / n_elems * 100:.2f}"
143
- mean_sequence_length = sequence_length / SPAM_N_REQUESTS
 
144
 
145
- resolution_time = round(resolution_time, 2)
146
- mean_inference_latency = round(mean_inference_latency / SPAM_N_REQUESTS, 2)
147
- mean_peak_gpu_memory = round(mean_peak_gpu_memory / SPAM_N_REQUESTS, 2)
148
 
149
  return get_message_spam(
150
- resolution_time,
151
  mean_inference_latency,
152
  mean_peak_gpu_memory,
153
  mean_padding_ratio,
154
  mean_sequence_length,
 
155
  )
 
1
  import json
2
 
 
 
3
  from defaults import (
4
  ADDRESS_BETTERTRANSFORMER,
5
  ADDRESS_VANILLA,
6
  HEADERS,
7
+ MODEL_NAME,
8
  )
9
  from utils import ElapsedFuturesSession
10
 
11
+ from transformers import AutoTokenizer
12
+
13
+ import numpy as np
14
 
15
  RETURN_MESSAGE_SINGLE = """
16
  Inference statistics:
 
26
  RETURN_MESSAGE_SPAM = (
27
  """
28
  Processing """
29
+ + "NUMBER REQ" + """ inputs sent asynchronously. Grab a coffee.
 
30
 
31
  Inference statistics:
32
 
33
+ * Throughput: {0} samples/s
34
  * Mean inference latency (preprocessing/forward/postprocessing): {1} ms
35
  * Mean peak GPU memory: {2} MB
36
  * Mean padding ratio: {3} %
37
  * Mean sequence length: {4} tokens
38
+ * Effective mean batch size: {5}
39
  """
40
  )
41
 
42
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
43
 
44
  def get_message_single(
45
  status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency, **kwargs
 
50
 
51
 
52
  def get_message_spam(
53
+ throughput,
54
  mean_inference_latency,
55
  mean_peak_gpu_memory,
56
  mean_padding_ratio,
57
  mean_sequence_length,
58
+ effective_batch_size,
59
  **kwargs,
60
  ):
61
  return RETURN_MESSAGE_SPAM.format(
62
+ throughput,
63
  mean_inference_latency,
64
  mean_peak_gpu_memory,
65
  mean_padding_ratio,
66
  mean_sequence_length,
67
+ effective_batch_size,
68
  )
69
 
70
 
71
  SESSION = ElapsedFuturesSession()
72
 
 
73
  def send_single(input_model_vanilla, address: str):
74
  assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]
75
 
 
96
  )
97
 
98
 
99
+ def send_spam(inp, address: str):
100
  assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]
101
 
102
  # data = "this is positive lol" #TODO: use dynamic data with padding
103
+ max_resolution_time = 0
 
 
 
 
 
104
  mean_inference_latency = 0
105
  mean_peak_gpu_memory = 0
106
 
107
  n_pads = 0
108
  n_elems = 0
109
  sequence_length = 0
110
+ effective_batch_size = 0
111
 
112
  promises = []
113
 
114
+ n_inputs = len(inp)
115
+
116
+ for i in range(n_inputs):
117
  input_data = inp[i]["sentence"].encode("utf-8")
118
 
119
  # should not take more than 15 s, so timeout if that's the case
 
131
 
132
  response_text = json.loads(response.text)
133
 
134
+ max_resolution_time = max(max_resolution_time, response.elapsed)
135
 
136
  mean_inference_latency += response_text[1]
137
  mean_peak_gpu_memory += response_text[2]
138
  n_pads += response_text[3]
139
  n_elems += response_text[4]
140
  sequence_length += response_text[5]
141
+ effective_batch_size += response_text[6]
142
 
143
+ throughput = n_inputs / (max_resolution_time * 1e-3)
144
  mean_padding_ratio = f"{n_pads / n_elems * 100:.2f}"
145
+ mean_sequence_length = sequence_length / n_inputs
146
+ effective_batch_size = effective_batch_size / n_inputs
147
 
148
+ throughput = round(throughput, 2)
149
+ mean_inference_latency = round(mean_inference_latency / n_inputs, 2)
150
+ mean_peak_gpu_memory = round(mean_peak_gpu_memory / n_inputs, 2)
151
 
152
  return get_message_spam(
153
+ throughput,
154
  mean_inference_latency,
155
  mean_peak_gpu_memory,
156
  mean_padding_ratio,
157
  mean_sequence_length,
158
+ effective_batch_size,
159
  )
defaults.py CHANGED
@@ -15,24 +15,26 @@ defaults_bt_single = {
15
  }
16
 
17
  defaults_vanilla_spam = {
18
- "resolution_time": 2996.35,
19
  "mean_inference_latency": 29.69,
20
  "mean_peak_gpu_memory": 3620.9,
21
  "mean_padding_ratio": 35.26,
22
  "mean_sequence_length": 39.395,
 
23
  }
24
 
25
  defaults_bt_spam = {
26
- "resolution_time": 2996.35,
27
  "mean_inference_latency": 29.69,
28
  "mean_peak_gpu_memory": 3620.9,
29
  "mean_padding_ratio": 35.26,
30
  "mean_sequence_length": 39.395,
 
31
  }
32
 
33
- SPAM_N_REQUESTS = 200
34
  BATCH_SIZE = 8 # fixed!
35
 
36
  HEADERS = {"Content-Type": "text/plain"}
37
  ADDRESS_VANILLA = "http://3.83.142.46:8080/predictions/my_tc"
38
  ADDRESS_BETTERTRANSFORMER = "http://3.95.136.2:8080/predictions/my_tc"
 
 
15
  }
16
 
17
  defaults_vanilla_spam = {
18
+ "throughput": 20,
19
  "mean_inference_latency": 29.69,
20
  "mean_peak_gpu_memory": 3620.9,
21
  "mean_padding_ratio": 35.26,
22
  "mean_sequence_length": 39.395,
23
+ "effective_batch_size": 8,
24
  }
25
 
26
  defaults_bt_spam = {
27
+ "throughput": 20,
28
  "mean_inference_latency": 29.69,
29
  "mean_peak_gpu_memory": 3620.9,
30
  "mean_padding_ratio": 35.26,
31
  "mean_sequence_length": 39.395,
32
+ "effective_batch_size": 8,
33
  }
34
 
 
35
  BATCH_SIZE = 8 # fixed!
36
 
37
  HEADERS = {"Content-Type": "text/plain"}
38
  ADDRESS_VANILLA = "http://3.83.142.46:8080/predictions/my_tc"
39
  ADDRESS_BETTERTRANSFORMER = "http://3.95.136.2:8080/predictions/my_tc"
40
+ MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"