# ChatGLM-6B / app.py
from transformers import AutoModel, AutoTokenizer, LlamaTokenizer, LlamaForCausalLM
import gradio as gr
import torch
import os
import io
import sys
import platform
import intel_extension_for_pytorch as ipex
import intel_extension_for_pytorch._C as ipex_core
from cpuinfo import get_cpu_info
from contextlib import redirect_stdout
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
ROOT = '/'
SELF_ROOT = '/proc/self/root'
tokenizer = LlamaTokenizer.from_pretrained(
    "lmsys/vicuna-7b-v1.3", trust_remote_code=True
)
model = LlamaForCausalLM.from_pretrained(
    "lmsys/vicuna-7b-v1.3", trust_remote_code=True
).to(DEVICE)
model = model.eval()
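
# Note (assumption, not part of the original app): on CPUs where the AMX/BF16
# features reported below are available, the model could additionally be
# prepared for faster inference with IPEX, e.g.:
#   model = ipex.optimize(model, dtype=torch.bfloat16)
# This is only a sketch; the app as written runs the unmodified eager-mode model.
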
def in_chroot():
    '''
    Return True if running in a chroot environment.
    '''
    try:
        root_stat = os.stat(ROOT)
        self_stat = os.stat(SELF_ROOT)
    except FileNotFoundError as e:
        sys.exit(f"ERROR: Failed to stat: {e}")

    root_inode = root_stat.st_ino
    self_inode = self_stat.st_ino

    # Inode 2 is the root inode for most filesystems.
    # However, XFS uses 128 for root.
    if root_inode not in [2, 128]:
        return True

    return not (root_inode == self_inode)

def get_features():
    '''
    Return a dictionary of all features:

    key: feature name.
    value: Boolean showing whether the feature is available.
    '''
    cpu_info = get_cpu_info()
    flags = cpu_info["flags"]

    detect_ipex_amx_enabled = lambda: ipex_core._get_current_isa_level() == 'AMX'
    detect_ipex_amx_available = (
        lambda: ipex_core._get_highest_cpu_support_isa_level() == 'AMX'
    )

    features = {
        'VM': 'hypervisor' in flags,
        'TDX TD': 'tdx_guest' in flags,
        'AMX available': 'amx_tile' in flags,
        'AMX-BF16 available': 'amx_bf16' in flags,
        'AMX-INT8 available': 'amx_int8' in flags,
        'AVX-VNNI available': 'avx_vnni' in flags,
        'AVX512-VNNI available': 'avx512_vnni' in flags,
        'AVX512-FP16 available': 'avx512_fp16' in flags,
        'AVX512-BF16 available': 'avx512_bf16' in flags,
        'AMX IPEX available': detect_ipex_amx_available(),
        'AMX IPEX enabled': detect_ipex_amx_enabled(),
    }

    return features
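
# Illustrative example only (hypothetical values): on a TDX guest with AMX
# enabled, get_features() might return something like:
#   {'VM': True, 'TDX TD': True, 'AMX available': True,
#    'AMX-BF16 available': True, ..., 'AMX IPEX enabled': True}
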
def get_debug_details():
    '''
    Return a block of markdown text that shows useful debug
    information.
    '''
    # ipex.version() prints to stdout, so redirect stdout to
    # capture the output.
    buffer = io.StringIO()

    with redirect_stdout(buffer):
        ipex.version()

    ipex_version_details = buffer.getvalue().replace("\n", ", ")

    ipex_current_isa_level = ipex_core._get_current_isa_level()
    ipex_max_isa_level = ipex_core._get_highest_cpu_support_isa_level()

    ipex_env_var = os.getenv('ATEN_CPU_CAPABILITY')
    onednn_env_var = os.getenv('ONEDNN_MAX_CPU_ISA')

    with open('/proc/version', 'r') as f:
        kernel_version = f.read().rstrip()

    in_chroot_result = in_chroot()

    cpu_info = get_cpu_info()
    flags = cpu_info["flags"]

    # Note that rather than using `<details>`, we could use gradio.Accordion(),
    # but the markdown version is more visually compact.
    md = f"""
<details>
<summary>Click to show debug details</summary>

| Feature | Value |
|-|-|
| Arch | `{cpu_info['arch']}` |
| CPU | `{cpu_info['brand_raw']}` |
| CPU flags | `{flags}` |
| Kernel | `{kernel_version}` |
| Python version | `{sys.version}` (implementation: `{platform.python_implementation()}`) |
| Python version details | `{sys.version_info}` |
| PyTorch version | `{torch.__version__}` |
| IPEX version | `{ipex.ipex_version}` |
| IPEX CPU detected | `{ipex_core._has_cpu()}` |
| IPEX XPU detected | `{ipex_core._has_xpu()}` |
| IPEX version details | `{ipex_version_details}` |
| IPEX env var `ATEN_CPU_CAPABILITY` | `{ipex_env_var}` |
| IPEX current ISA level | `{ipex_current_isa_level}` |
| IPEX max ISA level | `{ipex_max_isa_level}` |
| oneDNN env var `ONEDNN_MAX_CPU_ISA` | `{onednn_env_var}` |
| in chroot | `{in_chroot_result}` |

</details>
"""

    return md

def predict(input, history=None):
    if history is None:
        history = []

    new_user_input_ids = tokenizer.encode(
        input + tokenizer.eos_token, return_tensors='pt'
    )

    # Append the new user input tokens to the chat history.
    bot_input_ids = torch.cat([torch.LongTensor(history), new_user_input_ids], dim=-1)

    history = model.generate(
        bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id
    ).tolist()

    # Convert the tokens to text, then split the conversation on the
    # tokenizer's EOS token (rather than a hard-coded "<|endoftext|>",
    # which is not the EOS token of the Llama/Vicuna tokenizer).
    response = tokenizer.decode(history[0]).split(tokenizer.eos_token)
    response = [
        (response[i], response[i + 1]) for i in range(0, len(response) - 1, 2)
    ]  # convert to a list of (user, bot) tuples

    return response, history
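
# Illustrative example only (hypothetical values): the first return value is the
# list of (user, bot) message pairs shown by the gr.Chatbot component, e.g.
#   [("Hello", "Hi, how can I help?")]
# while the second is the full token-id history passed back in via gr.State.
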
with gr.Blocks() as demo:
    gr.Markdown(
        '''## Confidential HuggingFace Runner
        '''
    )

    state = gr.State([])

    chatbot = gr.Chatbot([], elem_id="chatbot").style(height=400)

    with gr.Row():
        with gr.Column(scale=4):
            txt = gr.Textbox(
                show_label=False, placeholder="Enter text and press enter"
            ).style(container=False)
        with gr.Column(scale=1):
            button = gr.Button("Generate")

    txt.submit(predict, [txt, state], [chatbot, state])
    button.click(predict, [txt, state], [chatbot, state])
    with gr.Row():
        features_dict = get_features()
        all_features = features_dict.keys()

        # Get a list of feature names that are actually set/available
        set_features = [key for key in features_dict if features_dict[key]]

        gr.CheckboxGroup(
            all_features,
            label="Features",
            # Make the boxes read-only
            interactive=False,
            # Specify which features were detected
            value=set_features,
            info="Features detected from environment",
        )

    with gr.Row():
        debug_details = get_debug_details()
        gr.Markdown(debug_details)

demo.queue().launch(share=True, server_name="0.0.0.0")