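"""Benchmark demo for lyraBaichuan.

Sweeps a list of batch sizes and reports average generation latency and
throughput (tokens/s and words/s) for the lyraBaichuan13B model.
"""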
import argparse
import json
import random
import numpy as np
from time import perf_counter
import sys
sys.path.append('../')
from lyra_baichuan import lyraBaichuan7B, lyraBaichuan13B


def get_args():
    parser = argparse.ArgumentParser(description="Faster Baichuan Demo")
    parser.add_argument('--model-path', type=str, required=True,
                        help='Model path; must contain config.ini and the tokenizer files')
    parser.add_argument('--tokenizer-path', type=str, default=None)
    parser.add_argument(
        '--data-type', type=str, metavar='TYPE', default='fp16',
        choices=[None, 'fp32', 'fp16', 'bf16', 'int8'],
        help='The data type for inference. If None, the data type follows the '
             'checkpoint data type.')
    parser.add_argument(
        '--memopt_mode', type=int, default=0, choices=[0, 1],
        help='Use MEMOPT mode to increase speed and reduce VRAM usage.'
             ' 0: FP16 mode.'
             ' 1: MEMOPT mode.')
    parser.add_argument("--prompt_filepath", type=str, required=True,
                        help='JSON file providing prompt templates and their fill-in contents.')
    parser.add_argument("--max-output-length", type=int, default=512)
    parser.add_argument("--warmups", type=int, default=10,
                        help='Untimed warmup rounds per batch size.')
    parser.add_argument("--avgnums", type=int, default=10,
                        help='Timed rounds averaged per batch size.')
    args = parser.parse_args()

    print('\n=================== Arguments ===================')
    for k, v in vars(args).items():
        print(f' - {k.ljust(25, ".")}: {v}')
    print('=================================================')
    return args
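

# Example invocation (script name and paths are placeholders, not real checkpoints):
#   python demo_baichuan.py --model-path /path/to/lyra-baichuan-13b \
#       --prompt_filepath prompts.json --data-type fp16 --memopt_mode 0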


def main():
    args = get_args()

    # model = lyraBaichuan7B(args.model_path, args.tokenizer_path, args.data_type, args.memopt_mode)
    model = lyraBaichuan13B(args.model_path, args.tokenizer_path, args.data_type, args.memopt_mode)

    with open(args.prompt_filepath, "r", encoding="utf-8") as f:
        input_datas = json.load(f)
    # Only the first entry of the prompt file is used for benchmarking.
    used_input_data = input_datas[0]
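    # The prompt-file layout is assumed (inferred from the accesses below) to be:
    #   [
    #     {
    #       "prompts": "Some template with {} placeholders",
    #       "contents": [["fill-in a"], ["fill-in b"], ...]
    #     },
    #     ...
    #   ]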
    # prompt_template = "<reserved_106>{}\n<reserved_107>"  # baichuan chat
    prompt_template = "{}"  # baichuan (no chat template)

    test_batch_size = [1, 2, 4]  # 8, 16, 32, 64
    print("test_batch_size: ", test_batch_size)
    for i, bs in enumerate(test_batch_size):
        all_use_prompts = []
        all_output_texts = []

        # Warm up the GPU with untimed generations so the timed runs are stable.
        for _ in range(args.warmups):
            # random.choices takes the sample size as keyword `k`;
            # passing `bs` positionally would be interpreted as weights.
            prompts = [prompt_template.format(used_input_data['prompts'].format(*x))
                       for x in random.choices(used_input_data['contents'], k=bs)]
            output_texts = model.generate(
                prompts, output_length=args.max_output_length,
                top_k=30, top_p=0.85, temperature=1.0, repetition_penalty=1.0, do_sample=False)
        # Timed runs: average the generation latency over args.avgnums rounds.
        all_cost_s = 0.0
        for _ in range(args.avgnums):
            prompts = [prompt_template.format(used_input_data['prompts'].format(*x))
                       for x in random.choices(used_input_data['contents'], k=bs)]
            all_use_prompts.extend(prompts)

            start = perf_counter()
            output_texts = model.generate(
                prompts, output_length=args.max_output_length,
                top_k=30, top_p=0.85, temperature=1.0, repetition_penalty=1.0, do_sample=False)
            all_cost_s += perf_counter() - start

            all_output_texts.extend(output_texts)
        cost = all_cost_s / args.avgnums
        # Throughput statistics; token counts cover prompt + generated text.
        input_output_texts = [prompt + ' ' + gtext
                              for prompt, gtext in zip(all_use_prompts, all_output_texts)]
        avg_input_tokens = np.mean([len(model.tokenizer.encode(prompt)) for prompt in all_use_prompts])
        tokens = 0
        words = 0
        for text in input_output_texts:
            tokens += len(model.tokenizer.encode(text))
            words += len(text)  # character count, used as a proxy for words

        print(
            f"\nFaster-Dtype: {args.data_type}, Batch Size: {bs}, All tokens: {tokens}. "
            f"Avg Input tokens: {avg_input_tokens}. Cost: {cost} seconds. Speed: {tokens/cost} tokens/s."
        )
        print(
            f"Faster-Dtype: {args.data_type}, Batch Size: {bs}, All generated words: {words}. "
            f"Cost: {cost} seconds. Speed: {words/cost} words/s."
        )

        # Show up to four samples from the last timed round (first batch size only).
        if i == 0:
            for k in range(min(bs, 4)):
                print(
                    f"Sample {k}, \n\t\tInputs: {prompts[k]}. \n\t\tOutputs: {output_texts[k].lstrip()}")


if __name__ == "__main__":
    main()