from lyraChatGLM import LyraChatGLM6B
import numpy as np
model_path = "./models/1-gpu-fp16.bin"
tokenizer_path = "./models"
data_type = "fp16"
int8_mode = 0        # int8 quantization mode (0 = disabled)
max_output_length = 150
arch = "Ampere"      # GPU architecture: "Ampere" or "Volta"
cuda_version = 12    # CUDA major version; 11 and 12 are currently supported
model = LyraChatGLM6B(model_path, tokenizer_path, data_type, int8_mode, arch, cuda_version)
# Prompt (Chinese): "It's about 25°C today, with light rain and a breeze. I want to take
# a walk outdoors; what combination of clothes, trousers, and shoes should I wear?"
prompt = "今天天气大概 25度,有点小雨,吹着风,我想去户外散步,应该穿什么样的衣服裤子鞋子搭配。"
# test_batch_size = 256
prompts = [prompt]
# To get different outputs within the same batch, set do_sample=True.
output_texts = model.generate(
    prompts,
    output_length=max_output_length,
    top_k=30,
    top_p=0.85,
    temperature=0.35,
    repetition_penalty=1.2,
    do_sample=False,
)
print(output_texts)
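
# A minimal sketch, not part of the original example: as the comment above notes,
# do_sample=True makes repeated copies of the same prompt in one batch produce
# varied completions. The names batch_prompts/sampled_texts and the batch size of 8
# are illustrative, and this assumes generate() returns one output per input prompt.
batch_prompts = [prompt] * 8
sampled_texts = model.generate(
    batch_prompts,
    output_length=max_output_length,
    top_k=30,
    top_p=0.85,
    temperature=0.35,
    repetition_penalty=1.2,
    do_sample=True,
)
for i, text in enumerate(sampled_texts):
    print(f"sample {i}: {text}")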