import argparse
import json
import random
import sys
from time import perf_counter

import numpy as np

sys.path.append('../')
from lyra_baichuan import lyraBaichuan7B, lyraBaichuan13B
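
# Example invocation (illustrative only; the script name and paths are placeholders):
#   python demo_baichuan.py --model-path /path/to/Baichuan-13B \
#       --prompt_filepath prompts.json --data-type fp16 --memopt_mode 1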


def get_args():
    parser = argparse.ArgumentParser(description="Faster Baichuan Demo")

    parser.add_argument('--model-path', type=str, required=True,
                        help='Model path; the directory should include config.ini and the tokenizer files.')
    parser.add_argument('--tokenizer-path', type=str, default=None)
    parser.add_argument(
        '--data-type', type=str, metavar='TYPE', default='fp16',
        choices=[None, 'fp32', 'fp16', 'bf16', 'int8'],
        help='The data type used for inference. If None, the data type follows the '
             'checkpoint data type.')
    parser.add_argument(
        '--memopt_mode', type=int, default=0, choices=[0, 1],
        help='Use MEMOPT mode to increase speed and reduce VRAM usage.'
             ' 0: FP16 mode.'
             ' 1: MEMOPT mode.')
    parser.add_argument("--prompt_filepath", type=str, required=True,
                        help="JSON file providing the prompt template and the contents to sample from.")
    parser.add_argument("--max-output-length", type=int, default=512,
                        help="Maximum number of tokens to generate per prompt.")
    parser.add_argument("--warmups", type=int, default=10,
                        help="Number of untimed warmup runs per batch size.")
    parser.add_argument("--avgnums", type=int, default=10,
                        help="Number of timed runs to average over.")
    args = parser.parse_args()

    print('\n=================== Arguments ===================')
    for k, v in vars(args).items():
        print(f' - {k.ljust(25, ".")}: {v}')
    print('=================================================')

    return args


def main():
    args = get_args()

    model = lyraBaichuan13B(args.model_path, args.tokenizer_path,
                            args.data_type, args.memopt_mode)
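    # lyraBaichuan7B is imported above as well; assuming it takes the same constructor
    # arguments (an assumption, not verified here), a 7B checkpoint could be loaded with:
    #   model = lyraBaichuan7B(args.model_path, args.tokenizer_path,
    #                          args.data_type, args.memopt_mode)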

    with open(args.prompt_filepath, "rb") as f:
        input_datas = json.loads(f.read())

    used_input_data = input_datas[0]
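    # The prompt file is assumed to look roughly like this (illustrative only):
    #   [
    #     {
    #       "prompts": "a template with {} placeholders",
    #       "contents": [["field value", ...], ...]
    #     },
    #     ...
    #   ]
    # Each entry of 'contents' fills the placeholders in 'prompts' via str.format(*entry).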

    # prompt_template wraps every prompt; the identity template "{}" leaves prompts
    # unchanged and can be edited to add extra framing text.
    prompt_template = "{}"

    test_batch_size = [1, 2, 4]
    print("test_batch_size: ", test_batch_size)

    for i, bs in enumerate(test_batch_size):
        all_use_prompts = []
        all_output_texts = []

        # Untimed warmup runs.
        for _ in range(args.warmups):
            prompts = [prompt_template.format(used_input_data['prompts'].format(*x))
                       for x in random.choices(used_input_data['contents'], k=bs)]
            # do_sample=False requests deterministic decoding; the sampling knobs
            # (top_k/top_p/temperature) are presumably ignored in that case and are
            # kept only to mirror a typical sampling setup.
            output_texts = model.generate(
                prompts, output_length=args.max_output_length,
                top_k=30, top_p=0.85, temperature=1.0, repetition_penalty=1.0, do_sample=False)

        all_cost_s = 0.0

        # Timed runs: accumulate wall-clock time over args.avgnums batches.
        for _ in range(args.avgnums):
            prompts = [prompt_template.format(used_input_data['prompts'].format(*x))
                       for x in random.choices(used_input_data['contents'], k=bs)]
            all_use_prompts.extend(prompts)

            start = perf_counter()
            output_texts = model.generate(
                prompts, output_length=args.max_output_length,
                top_k=30, top_p=0.85, temperature=1.0, repetition_penalty=1.0, do_sample=False)
            all_cost_s += perf_counter() - start

            all_output_texts.extend(output_texts)

        # Average wall-clock time per timed batch.
        cost = all_cost_s / args.avgnums

        input_output_texts = [prompt + ' ' + gtext
                              for prompt, gtext in zip(all_use_prompts, all_output_texts)]

        avg_input_tokens = np.mean([len(model.tokenizer.encode(prompt)) for prompt in all_use_prompts])

        # Token and character totals are accumulated over all timed runs, so throughput
        # is computed against the total elapsed time, not the per-run average. Note that
        # len(text) counts characters and is used here as a rough proxy for "words".
        tokens = 0
        words = 0
        for text in input_output_texts:
            tokens += len(model.tokenizer.encode(text))
            words += len(text)
        print(
            f"\nFaster-Dtype: {args.data_type}, Batch Size: {bs}, All tokens: {tokens}. "
            f"Avg Input tokens: {avg_input_tokens:.1f}. Cost: {cost:.3f} seconds. "
            f"Speed: {tokens / all_cost_s:.2f} tokens/s."
        )
        print(
            f"Faster-Dtype: {args.data_type}, Batch Size: {bs}, All generated words: {words}. "
            f"Cost: {cost:.3f} seconds. Speed: {words / all_cost_s:.2f} words/s."
        )

        # For the first batch size only, print up to four sample prompt/output pairs.
        if i == 0:
            for k in range(min(bs, 4)):
                print(
                    f"Sample {k}, \n\t\tInputs: {prompts[k]} \n\t\tOutputs: {output_texts[k].lstrip()}")


if __name__ == "__main__":
    main()