from lyraChatGLM import LyraChatGLM6B
import numpy as np
model_path = "./models/1-gpu-fp16.bin"
tokenizer_path = "./models"
data_type = "fp16"
int8_mode = 0        # int8 quantization mode (0 = disabled)
max_output_length = 150
arch = "Ampere"      # GPU architecture: "Ampere" or "Volta"
cuda_version = 12    # CUDA major version; 11 and 12 are currently supported
model = LyraChatGLM6B(model_path, tokenizer_path, data_type, int8_mode, arch, cuda_version)
# Prompt (Chinese): "It's about 25°C today, with light rain and a breeze. I want to take
# a walk outdoors; what combination of clothes, trousers, and shoes should I wear?"
prompt = "今天天气大概 25度,有点小雨,吹着风,我想去户外散步,应该穿什么样的衣服裤子鞋子搭配。"
# test_batch_size = 256
prompts = [prompt]
# To get different outputs within the same batch, set do_sample=True.
output_texts = model.generate(
    prompts,
    output_length=max_output_length,
    top_k=30,
    top_p=0.85,
    temperature=0.35,
    repetition_penalty=1.2,
    do_sample=False,
)
print(output_texts)
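
# A minimal sketch, not part of the original example: as the comment above notes,
# do_sample=True makes repeated copies of the same prompt in one batch produce
# varied completions. The names batch_prompts/sampled_texts and the batch size of 8
# are illustrative, and this assumes generate() returns one output per input prompt.
batch_prompts = [prompt] * 8
sampled_texts = model.generate(
    batch_prompts,
    output_length=max_output_length,
    top_k=30,
    top_p=0.85,
    temperature=0.35,
    repetition_penalty=1.2,
    do_sample=True,
)
for i, text in enumerate(sampled_texts):
    print(f"sample {i}: {text}")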