# OpenELM-270M-Instruct-4bit / generate_openelm.py
# jenhantao's picture
# Upload folder using huggingface_hub (#3)
# 2c6c161 verified
#
# For licensing see accompanying LICENSE file.
# Copyright (C) 2024 Apple Inc. All Rights Reserved.
#
"""Module to generate OpenELM output given a model and an input prompt."""
import argparse
import logging
import os
import shutil
import time
from typing import Optional, Tuple, Union

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
def generate(
    prompt: str,
    model: Union[str, AutoModelForCausalLM],
    hf_access_token: Optional[str] = None,
    tokenizer: Union[str, AutoTokenizer] = 'meta-llama/Llama-2-7b-hf',
    device: Optional[str] = None,
    max_length: int = 1024,
    assistant_model: Optional[Union[str, AutoModelForCausalLM]] = None,
    generate_kwargs: Optional[dict] = None,
) -> Tuple[str, float]:
    """Generate text for a prompt and report how long generation took.

    Args:
        prompt: The string prompt.
        model: The LLM model. If a string is passed, it is handed to
            ``AutoModelForCausalLM.from_pretrained`` (with
            ``trust_remote_code=True``) to load the checkpoint.
        hf_access_token: Hugging Face access token. It is forwarded to
            ``AutoTokenizer.from_pretrained`` when ``tokenizer`` is a string.
        tokenizer: Tokenizer instance, or a string that is handed to
            ``AutoTokenizer.from_pretrained`` to load one.
        device: String representation of the device to run the model on.
            If not set, ``cuda:0`` is used when CUDA is available, else
            ``cpu``.
        max_length: Maximum length of tokens, input prompt + generated
            tokens.
        assistant_model: If set, this model is passed to ``model.generate``
            as ``assistant_model`` (speculative generation). If a string is
            passed, it is loaded the same way as ``model``.
        generate_kwargs: Extra kwargs passed to the hf generate function.
            Keys given here override the defaults this function sets
            (``max_length``, ``pad_token_id``, ``assistant_model``).

    Returns:
        A ``(output_text, generation_time)`` tuple: the decoded output
        (special tokens skipped) and the time spent in ``model.generate``
        in seconds.

    Raises:
        ValueError: If device is set to CUDA but no CUDA device is detected.
        ValueError: If tokenizer is not set.
        ValueError: If hf_access_token is not specified.
    """
    if not device:
        if torch.cuda.is_available() and torch.cuda.device_count():
            device = "cuda:0"
            logging.warning(
                'inference device is not set, using cuda:0, %s',
                torch.cuda.get_device_name(0)
            )
        else:
            device = 'cpu'
            logging.warning(
                (
                    'No CUDA device detected, using cpu, '
                    'expect slower speeds.'
                )
            )

    if 'cuda' in device and not torch.cuda.is_available():
        raise ValueError('CUDA device requested but no CUDA device detected.')

    if not tokenizer:
        raise ValueError('Tokenizer is not set in the generate function.')

    if not hf_access_token:
        raise ValueError((
            'Hugging face access token needs to be specified. '
            'Please refer to https://huggingface.co/docs/hub/security-tokens'
            ' to obtain one.'
            )
        )

    if isinstance(model, str):
        checkpoint_path = model
        model = AutoModelForCausalLM.from_pretrained(
            checkpoint_path,
            trust_remote_code=True
        )
    model.to(device).eval()

    if isinstance(tokenizer, str):
        tokenizer = AutoTokenizer.from_pretrained(
            tokenizer,
            token=hf_access_token,
        )

    # Speculative mode
    draft_model = None
    if assistant_model:
        draft_model = assistant_model
        if isinstance(assistant_model, str):
            draft_model = AutoModelForCausalLM.from_pretrained(
                assistant_model,
                trust_remote_code=True
            )
        draft_model.to(device).eval()

    # Prepare the prompt: a single sequence with a leading batch dimension.
    tokenized_prompt = tokenizer(prompt)
    tokenized_prompt = torch.tensor(
        tokenized_prompt['input_ids'],
        device=device
    )
    tokenized_prompt = tokenized_prompt.unsqueeze(0)

    # Merge the defaults with the caller's kwargs instead of passing both as
    # keywords: a duplicated key (e.g. max_length) would otherwise raise
    # "TypeError: got multiple values for keyword argument".
    call_kwargs = {
        'max_length': max_length,
        'pad_token_id': 0,
        'assistant_model': draft_model,
    }
    if generate_kwargs:
        call_kwargs.update(generate_kwargs)

    # Generate. perf_counter is monotonic, so the measured duration cannot
    # be skewed by system clock adjustments.
    stime = time.perf_counter()
    output_ids = model.generate(tokenized_prompt, **call_kwargs)
    generation_time = time.perf_counter() - stime

    output_text = tokenizer.decode(
        output_ids[0].tolist(),
        skip_special_tokens=True
    )

    return output_text, generation_time
def openelm_generate_parser():
    """Build the OpenELM command-line parser and parse the process arguments.

    Returns:
        The ``argparse.Namespace`` produced by ``parse_args()``, with the
        attributes ``model``, ``hf_access_token``, ``prompt``, ``device``,
        ``max_length``, ``assistant_model`` and ``generate_kwargs`` (a dict
        when ``--generate_kwargs`` is given, otherwise ``None``).

    Raises:
        ValueError: If a ``--generate_kwargs`` token is not of the form
            ``key=value`` with a non-empty key.
    """

    def _convert_value(text):
        """Convert a raw value to bool, int or float, else keep the string."""
        lowered = text.lower()
        # Without this, "false" would stay a non-empty (truthy) string.
        if lowered in ('true', 'false'):
            return lowered == 'true'
        for cast in (int, float):
            try:
                return cast(text)
            except ValueError:
                continue
        return text

    class KwargsParser(argparse.Action):
        """Parser action class to parse kwargs of form key=value"""

        def __call__(self, parser, namespace, values, option_string=None):
            parsed = {}
            setattr(namespace, self.dest, parsed)
            for val in values:
                # Split on the first '=' only so values may contain '='.
                kwarg_k, separator, kwarg_v = val.partition('=')
                if not separator or not kwarg_k:
                    raise ValueError(
                        (
                            'Argument parsing error, kwargs are expected in'
                            ' the form of key=value.'
                        )
                    )
                parsed[kwarg_k] = _convert_value(kwarg_v)

    parser = argparse.ArgumentParser('OpenELM Generate Module')
    parser.add_argument(
        '--model',
        dest='model',
        help='Path to the hf converted model.',
        required=True,
        type=str,
    )
    parser.add_argument(
        '--hf_access_token',
        dest='hf_access_token',
        help='Hugging face access token, starting with "hf_".',
        type=str,
    )
    parser.add_argument(
        '--prompt',
        dest='prompt',
        help='Prompt for LLM call.',
        default='',
        type=str,
    )
    parser.add_argument(
        '--device',
        dest='device',
        help='Device used for inference.',
        type=str,
    )
    parser.add_argument(
        '--max_length',
        dest='max_length',
        help='Maximum length of tokens.',
        default=256,
        type=int,
    )
    parser.add_argument(
        '--assistant_model',
        dest='assistant_model',
        help=(
            (
                'If set, this is used as a draft model '
                'for assisted speculative generation.'
            )
        ),
        type=str,
    )
    parser.add_argument(
        '--generate_kwargs',
        dest='generate_kwargs',
        help='Additional kwargs passed to the HF generate function.',
        type=str,
        nargs='*',
        action=KwargsParser,
    )
    return parser.parse_args()
if __name__ == '__main__':
    # Command-line entry point: parse the arguments, run generation, and
    # print the prompt + generated output together with the elapsed time.
    args = openelm_generate_parser()

    output_text, generation_time = generate(
        prompt=args.prompt,
        model=args.model,
        device=args.device,
        max_length=args.max_length,
        assistant_model=args.assistant_model,
        generate_kwargs=args.generate_kwargs,
        hf_access_token=args.hf_access_token,
    )

    # shutil.get_terminal_size falls back to 80 columns when stdout is not
    # attached to a terminal (pipe, redirect, CI), where
    # os.get_terminal_size would raise OSError after generation finished.
    columns = shutil.get_terminal_size().columns

    print_txt = (
        f'\r\n{"=" * columns}\r\n'
        '\033[1m Prompt + Generated Output\033[0m\r\n'
        f'{"-" * columns}\r\n'
        f'{output_text}\r\n'
        f'{"-" * columns}\r\n'
        '\r\nGeneration took'
        f'\033[1m\033[92m {round(generation_time, 2)} \033[0m'
        'seconds.\r\n'
    )
    print(print_txt)