Spaces:

leonardlin
/

shisa-ablations

Runtime error

App Files Community

leonardlin committed on May 17

Commit

554f3ed

•

1 Parent(s): 36badd8

Update app.py

Browse files

Files changed (1) hide show

app.py +107 -2

app.py CHANGED Viewed

@@ -4,7 +4,7 @@ from huggingface_hub import InferenceClient
 """
 For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
 """
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
 def respond(
@@ -60,4 +60,109 @@ demo = gr.ChatInterface(
 if __name__ == "__main__":
-    demo.launch()

 """
 For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
 """
+client = InferenceClient("shisa-ai/shisa-llama3-8b-v1")
 def respond(
 if __name__ == "__main__":
+    demo.launch()
+'''
+# https://www.gradio.app/guides/using-hugging-face-integrations
+import gradio as gr
+import logging
+import html
+from   pprint import pprint
+import time
+import torch
+from   threading import Thread
+from   transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
+# Model
+model_name = "augmxnt/shisa-7b-v1"
+# UI Settings
+title = "Shisa 7B"
+description = "Test out <a href='https://huggingface.co/augmxnt/shisa-7b-v1'>Shisa 7B</a> in either English or Japanese. If you aren't getting the right language outputs, you can try changing the system prompt to the appropriate language.\n\nNote: we are running this model quantized at `load_in_4bit` to fit in 16GB of VRAM."
+placeholder = "Type Here / ここに入力してください"
+examples = [
+    ["What are the best slices of pizza in New York City?"],
+    ["東京でおすすめのラーメン屋ってどこ？"],
+    ['How do I program a simple "hello world" in Python?'],
+    ["Pythonでシンプルな「ハローワールド」をプログラムするにはどうすればいいですか？"],
+]
+# LLM Settings
+# Initial
+system_prompt = 'You are a helpful, bilingual assistant. Reply in same language as the user.'
+default_prompt = system_prompt
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    # load_in_8bit=True,
+    load_in_4bit=True,
+    use_flash_attention_2=True,
+)
+def chat(message, history, system_prompt):
+    if not system_prompt:
+        system_prompt = default_prompt
+    print('---')
+    print('Prompt:', system_prompt)
+    pprint(history)
+    print(message)
+    # Let's just rebuild every time it's easier
+    chat_history = [{"role": "system", "content": system_prompt}]
+    for h in history:
+        chat_history.append({"role": "user", "content": h[0]})
+        chat_history.append({"role": "assistant", "content": h[1]})
+    chat_history.append({"role": "user", "content": message})
+    input_ids = tokenizer.apply_chat_template(chat_history, add_generation_prompt=True, return_tensors="pt")
+    # for multi-gpu, find the device of the first parameter of the model
+    first_param_device = next(model.parameters()).device
+    input_ids = input_ids.to(first_param_device)
+    generate_kwargs = dict(
+        inputs=input_ids,
+        max_new_tokens=200,
+        do_sample=True,
+        temperature=0.7,
+        repetition_penalty=1.15,
+        top_p=0.95,
+        eos_token_id=tokenizer.eos_token_id,
+        pad_token_id=tokenizer.eos_token_id,
+    )
+    output_ids = model.generate(**generate_kwargs)
+    new_tokens = output_ids[0, input_ids.size(1):]
+    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
+    return response
+chat_interface = gr.ChatInterface(
+    chat,
+    chatbot=gr.Chatbot(height=400),
+    textbox=gr.Textbox(placeholder=placeholder, container=False, scale=7),
+    title=title,
+    description=description,
+    theme="soft",
+    examples=examples,
+    cache_examples=False,
+    undo_btn="Delete Previous",
+    clear_btn="Clear",
+    additional_inputs=[
+        gr.Textbox(system_prompt, label="System Prompt (Change the language of the prompt for better replies)"),
+    ],
+)
+# https://huggingface.co/spaces/ysharma/Explore_llamav2_with_TGI/blob/main/app.py#L219 - we use this with construction b/c Gradio barfs on autoreload otherwise
+with gr.Blocks() as demo:
+    chat_interface.render()
+    gr.Markdown("You can try asking this question in Japanese or English. We limit output to 200 tokens.")
+demo.queue().launch()
+'''