ChatGLM-6B

Runtime error

App Files Files Community

ChatGLM-6B / app.py

jodh-intel

ui: Add system features

8716816 6 months ago

raw

history blame

6.12 kB

	from transformers import AutoModel, AutoTokenizer, LlamaTokenizer, LlamaForCausalLM
	import gradio as gr
	import torch
	import os
	import io
	import sys
	import platform
	import intel_extension_for_pytorch as ipex
	import intel_extension_for_pytorch._C as ipex_core
	from cpuinfo import get_cpu_info
	from contextlib import redirect_stdout


	# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
	if torch.cuda.is_available():
	    DEVICE = "cuda"
	else:
	    DEVICE = "cpu"

	# Paths whose inodes in_chroot() compares.
	ROOT = '/'
	SELF_ROOT = '/proc/self/root'

	# Tokenizer and model are loaded once, at import time, from the same
	# checkpoint name.
	tokenizer = LlamaTokenizer.from_pretrained(
	    "lmsys/vicuna-7b-v1.3",
	    trust_remote_code=True,
	)

	# The model is moved to DEVICE and switched to evaluation mode;
	# eval() returns the module itself, so the calls can be chained.
	model = (
	    LlamaForCausalLM.from_pretrained(
	        "lmsys/vicuna-7b-v1.3",
	        trust_remote_code=True,
	    )
	    .to(DEVICE)
	    .eval()
	)


	def in_chroot():
	    '''
	    Report whether this process appears to run in a chroot environment.

	    The inode of ROOT is compared with the inode of SELF_ROOT. If either
	    path does not exist, the interpreter exits with an error message.
	    '''
	    try:
	        # Stat ROOT first, then SELF_ROOT.
	        inodes = [os.stat(path).st_ino for path in (ROOT, SELF_ROOT)]
	    except FileNotFoundError as e:
	        sys.exit(f"ERROR: Failed to stat: {e}")

	    root_ino, self_ino = inodes

	    # Inode 2 is the root inode for most filesystems.
	    # However, XFS uses 128 for root.
	    if root_ino in (2, 128):
	        # A plausible root inode: a chroot is reported only when the
	        # two inodes differ.
	        return root_ino != self_ino

	    # Any other inode number is treated as "inside a chroot".
	    return True


	def get_features():
	    '''
	    Build the feature availability report.

	    Returns a dictionary with one entry per feature:

	    key: feature name.
	    value: Boolean showing if feature available.
	    '''

	    cpu_flags = get_cpu_info()["flags"]

	    # (display name, CPU flag) pairs; the order here is the order of the
	    # keys in the returned dictionary.
	    flag_backed = (
	        ('VM', 'hypervisor'),
	        ('TDX TD', 'tdx_guest'),
	        ('AMX available', 'amx_tile'),
	        ('AMX-BF16 available', 'amx_bf16'),
	        ('AMX-INT8 available', 'amx_int8'),
	        ('AVX-VNNI available', 'avx_vnni'),
	        ('AVX512-VNNI available', 'avx512_vnni'),
	        ('AVX512-FP16 available', 'avx512_fp16'),
	        ('AVX512-BF16 available', 'avx512_bf16'),
	    )

	    report = {}
	    for display_name, flag_name in flag_backed:
	        report[display_name] = flag_name in cpu_flags

	    # The last two entries come from IPEX rather than from the CPU flags:
	    # the highest ISA level it reports as supported, then the one
	    # currently in use.
	    report['AMX IPEX available'] = (
	        ipex_core._get_highest_cpu_support_isa_level() == 'AMX'
	    )
	    report['AMX IPEX enabled'] = ipex_core._get_current_isa_level() == 'AMX'

	    return report


	def get_debug_details():
	    '''
	    Return a block of markdown text that shows useful debug
	    information.

	    The text is a collapsible `<details>` element wrapping a two-column
	    (Feature / Value) table. Every value is rendered as an inline code
	    span. The table is assembled line by line so that no line carries
	    leading whitespace and the column delimiters are plain `|`
	    characters.

	    Reads `/proc/version`, so it raises OSError where that file is not
	    available. May exit the interpreter via in_chroot().
	    '''

	    def code(value):
	        # Render a value as an inline code span that is safe inside a
	        # table cell: a newline would end the row early and a bare `|`
	        # would be read as a column delimiter.
	        text = str(value).replace("\n", " ").replace("|", "\\|")
	        return f"`{text}`"

	    # ipex.version() prints to stdout, so redirect stdout to
	    # capture the output.
	    buffer = io.StringIO()

	    with redirect_stdout(buffer):
	        ipex.version()

	    ipex_version_details = buffer.getvalue().replace("\n", ", ")

	    with open('/proc/version', 'r') as f:
	        kernel_version = f.read().rstrip()

	    in_chroot_result = in_chroot()

	    cpu_info = get_cpu_info()

	    # This row is the only one with two code spans in its value cell.
	    python_version = (
	        f"{code(sys.version)} "
	        f"(implementation: {code(platform.python_implementation())})"
	    )

	    rows = [
	        ("Arch", code(cpu_info['arch'])),
	        ("CPU", code(cpu_info['brand_raw'])),
	        ("CPU flags", code(cpu_info["flags"])),
	        ("Kernel", code(kernel_version)),
	        ("Python version", python_version),
	        ("Python version details", code(sys.version_info)),
	        ("PyTorch version", code(torch.__version__)),
	        ("IPEX version", code(ipex.ipex_version)),
	        ("IPEX CPU detected", code(ipex_core._has_cpu())),
	        ("IPEX XPU detected", code(ipex_core._has_xpu())),
	        ("IPEX version details", code(ipex_version_details)),
	        (
	            "IPEX env var `ATEN_CPU_CAPABILITY`",
	            code(os.getenv('ATEN_CPU_CAPABILITY')),
	        ),
	        ("IPEX current ISA level", code(ipex_core._get_current_isa_level())),
	        (
	            "IPEX max ISA level",
	            code(ipex_core._get_highest_cpu_support_isa_level()),
	        ),
	        (
	            "oneDNN env var `ONEDNN_MAX_CPU_ISA`",
	            code(os.getenv('ONEDNN_MAX_CPU_ISA')),
	        ),
	        ("in chroot", code(in_chroot_result)),
	    ]

	    # Note that rather than using `<details>`, we could use
	    # gradio.Accordion(), but the markdown version is more visually
	    # compact.
	    #
	    # The blank lines around the table are deliberate: they separate the
	    # markdown table from the surrounding HTML tags.
	    lines = [
	        "",
	        "<details>",
	        "<summary>Click to show debug details</summary>",
	        "",
	        "| Feature | Value |",
	        "|-|-|",
	    ]
	    lines.extend(f"| {name} | {value} |" for name, value in rows)
	    lines.extend(["", "</details>", ""])

	    return "\n".join(lines)


	def predict(input, history=None):
	    '''
	    Generate a reply to `input` and return the updated conversation.

	    input: text of the new user message. (The name shadows the builtin
	        but is kept so existing callers are unaffected.)
	    history: token ids returned by the previous call, i.e. the nested
	        list produced by `model.generate(...).tolist()`; None or an
	        empty list starts a new conversation.

	    Returns a tuple `(response, history)`:

	    response: list of (first, second) text pairs obtained by splitting
	        the decoded token sequence on the tokenizer's EOS token.
	    history: the full generated token ids as a nested list, to be
	        passed back in on the next call.
	    '''
	    eos = tokenizer.eos_token

	    new_user_input_ids = tokenizer.encode(input + eos, return_tensors='pt')

	    # Only prepend earlier turns when there are any, instead of relying
	    # on concatenation with an empty tensor.
	    if history:
	        previous_ids = torch.tensor(history, dtype=torch.long)
	        bot_input_ids = torch.cat([previous_ids, new_user_input_ids], dim=-1)
	    else:
	        bot_input_ids = new_user_input_ids

	    # The ids are built on the CPU; move them to wherever the model's
	    # weights live so generation also works when the model is on a GPU.
	    bot_input_ids = bot_input_ids.to(model.device)

	    # NOTE(review): max_length=1000 caps prompt plus reply, so a long
	    # conversation leaves little or no room for a reply - confirm
	    # whether the history should be truncated.
	    history = model.generate(
	        bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id
	    ).tolist()

	    # Convert the tokens to text, then split on the EOS token that was
	    # appended to each user message above (rather than on a hard-coded
	    # marker that may not be this tokenizer's EOS).
	    turns = tokenizer.decode(history[0]).split(eos)

	    # Pair consecutive segments: (turns[0], turns[1]), (turns[2], ...).
	    # A trailing unpaired segment is dropped.
	    response = list(zip(turns[::2], turns[1::2]))
	    return response, history


	# Build the web UI: a chat area, a read-only summary of the detected
	# features, and a collapsible block of debug details.
	#
	# NOTE(review): the `.style(...)` calls below depend on the installed
	# Gradio version - confirm against the pinned release.
	with gr.Blocks() as demo:
	    gr.Markdown(
	        '''## Confidential HuggingFace Runner
	'''
	    )

	    # Per-session value passed to predict() and replaced by the history
	    # it returns.
	    state = gr.State([])
	    chatbot = gr.Chatbot([], elem_id="chatbot").style(height=400)

	    with gr.Row():
	        with gr.Column(scale=4):
	            txt = gr.Textbox(
	                show_label=False, placeholder="Enter text and press enter"
	            ).style(container=False)
	        with gr.Column(scale=1):
	            button = gr.Button("Generate")

	    # Pressing enter in the textbox and clicking the button do the same
	    # thing.
	    txt.submit(predict, [txt, state], [chatbot, state])
	    button.click(predict, [txt, state], [chatbot, state])

	    with gr.Row():
	        features_dict = get_features()

	        # Pass a real list rather than a dict view as the choices.
	        all_features = list(features_dict)

	        # Get a list of feature names that are actually set/available
	        set_features = [
	            name for name, available in features_dict.items() if available
	        ]

	        gr.CheckboxGroup(
	            all_features,
	            label="Features",
	            # Make the boxes read-only
	            interactive=False,
	            # Specify which features were detected
	            value=set_features,
	            info="Features detected from environment",
	        )

	    with gr.Row():
	        debug_details = get_debug_details()
	        gr.Markdown(debug_details)

	demo.queue().launch(share=True, server_name="0.0.0.0")