Llama-3.1-8B-Instruct

Runtime error

App Files Files Community

vilarin committed on Jun 6

Commit

bd34f0b

•

1 Parent(s): 9784048

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -39

app.py CHANGED Viewed

@@ -2,64 +2,49 @@ import torch
 from PIL import Image
 import gradio as gr
 import spaces
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
 import os
 from threading import Thread
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
-MODEL_ID = "CohereForAI/aya-23-8B"
-MODEL_ID2 = "CohereForAI/aya-23-35B"
 MODELS = os.environ.get("MODELS")
 MODEL_NAME = MODELS.split("/")[-1]
-TITLE = "<h1><center>Aya-23-Chatbox</center></h1>"
-DESCRIPTION = f'<h3><center>MODEL: <a href="https://hf.co/{MODELS}">{MODEL_NAME}</a></center></h3>'
 CSS = """
 .duplicate-button {
-  margin: auto !important;
-  color: white !important;
-  background: black !important;
-  border-radius: 100vh !important;
 }
 """
-#QUANTIZE
-QUANTIZE_4BIT = True
-USE_GRAD_CHECKPOINTING = True
-TRAIN_BATCH_SIZE = 2
-TRAIN_MAX_SEQ_LENGTH = 512
-USE_FLASH_ATTENTION = False
-GRAD_ACC_STEPS = 16
-quantization_config = None
-if QUANTIZE_4BIT:
-    quantization_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_quant_type="nf4",
-        bnb_4bit_use_double_quant=True,
-        bnb_4bit_compute_dtype=torch.bfloat16,
-    )
-attn_implementation = None
-if USE_FLASH_ATTENTION:
-    attn_implementation="flash_attention_2"
 model = AutoModelForCausalLM.from_pretrained(
           MODELS,
-          quantization_config=quantization_config,
-          attn_implementation=attn_implementation,
-          torch_dtype=torch.bfloat16,
           device_map="auto",
         )
 tokenizer = AutoTokenizer.from_pretrained(MODELS)
 @spaces.GPU
-def stream_chat(message: str, history: list, temperature: float, max_new_tokens: int):
     print(f'message is - {message}')
     print(f'history is - {history}')
     conversation = []
@@ -69,16 +54,21 @@ def stream_chat(message: str, history: list, temperature: float, max_new_tokens:
     print(f"Conversation is -\n{conversation}")
-    input_ids = tokenizer.apply_chat_template(conversation, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(model.device)
-    streamer = TextIteratorStreamer(tokenizer, **{"skip_special_tokens": True, "skip_prompt": True, 'clean_up_tokenization_spaces':False,})
     generate_kwargs = dict(
-        input_ids=input_ids,
         streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=True,
         temperature=temperature,
     )
     thread = Thread(target=model.generate, kwargs=generate_kwargs)
@@ -119,6 +109,30 @@ with gr.Blocks(css=CSS) as demo:
                 label="Max new tokens",
                 render=False,
             ),
         ],
         examples=[
             ["Help me study vocabulary: write a sentence for me to fill in the blank, and I'll try to pick the correct option."],

 from PIL import Image
 import gradio as gr
 import spaces
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import os
 from threading import Thread
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
+MODEL_ID = "Qwen/Qwen1.5-7B-Chat"
 MODELS = os.environ.get("MODELS")
 MODEL_NAME = MODELS.split("/")[-1]
+TITLE = "<h1><center>Qwen2-Chatbox</center></h1>"
+DESCRIPTION = f"""
+<h3>MODEL: <a href="https://hf.co/{MODELS}">{MODEL_NAME}</a></h3>
+<center>
+<p>Qwen is the large language model built by Alibaba Cloud.
+<br>
+Feel free to test without log.
+</p>
+</center>
+"""
 CSS = """
 .duplicate-button {
+    margin: auto !important;
+    color: white !important;
+    background: black !important;
+    border-radius: 100vh !important;
+}
+h3 {
+    text-align: center;
 }
 """
 model = AutoModelForCausalLM.from_pretrained(
           MODELS,
+          torch_dtype=torch.float16,
           device_map="auto",
         )
 tokenizer = AutoTokenizer.from_pretrained(MODELS)
 @spaces.GPU
+def stream_chat(message: str, history: list, temperature: float, max_new_tokens: int, top_p: float, top_k: int, penalty: float):
     print(f'message is - {message}')
     print(f'history is - {history}')
     conversation = []
     print(f"Conversation is -\n{conversation}")
+    input_ids = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
+    inputs = tokenizer(input_ids, return_tensors="pt").to(0)
+    streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
+        inputs,
         streamer=streamer,
+        top_k=top_k,
+        top_p=top_p,
+        repetition_penalty=penalty,
         max_new_tokens=max_new_tokens,
         do_sample=True,
         temperature=temperature,
+        eos_token_id = [151645, 151643],
     )
     thread = Thread(target=model.generate, kwargs=generate_kwargs)
                 label="Max new tokens",
                 render=False,
             ),
+            gr.Slider(
+                minimum=0.0,
+                maximum=1.0,
+                step=0.1,
+                value=0.8,
+                label="top_p",
+                render=False,
+            ),
+            gr.Slider(
+                minimum=1,
+                maximum=20,
+                step=1,
+                value=20,
+                label="top_k",
+                render=False,
+            ),
+            gr.Slider(
+                minimum=0.0,
+                maximum=2.0,
+                step=0.1,
+                value=1.0,
+                label="Repetition penalty",
+                render=False,
+            ),
         ],
         examples=[
             ["Help me study vocabulary: write a sentence for me to fill in the blank, and I'll try to pick the correct option."],