hoduyquocbao committed
Commit b04fed6
1 Parent(s): 69c1d60

new version update

Files changed (1)
  1. app.py +31 -2
app.py CHANGED
@@ -6,6 +6,22 @@ For more information on `huggingface_hub` Inference API support, please check th
 """
 client = InferenceClient("meta-llama/Llama-3.2-3B-Instruct")
 
+import torch
+from transformers import pipeline
+
+model_id = "meta-llama/Llama-3.2-3B-Instruct"
+pipe = pipeline(
+    "text-generation",
+    model=model_id,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+)
+messages = [
+    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
+    {"role": "user", "content": "Who are you?"},
+]
+
+# print(outputs[0]["generated_text"][-1])
 
 def respond(
     message,
@@ -27,14 +43,27 @@ def respond(
 
     response = ""
 
-    for message in client.chat_completion(
+    # outputs = pipe(
+    #     messages,
+    #     max_new_tokens=256,
+    # )
+
+    # for message in client.chat_completion(
+    #     messages,
+    #     max_tokens=max_tokens,
+    #     stream=True,
+    #     temperature=temperature,
+    #     top_p=top_p,
+    # ):
+    for message in pipe(
         messages,
         max_tokens=max_tokens,
         stream=True,
         temperature=temperature,
         top_p=top_p,
     ):
-        token = message.choices[0].delta.content
+        # token = message.choices[0].delta.content
+        token = message[0]["generated_text"][-1]
 
         response += token
         yield response
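
As committed, the rewritten loop is unlikely to run: the `transformers` text-generation pipeline takes `max_new_tokens` rather than `max_tokens`, has no `stream=True` keyword, and returns `[{"generated_text": ...}]`, so `message[0]["generated_text"][-1]` does not index a token string. Below is a minimal sketch of a streaming variant built on transformers' `TextIteratorStreamer`, assuming the same model; the function name `respond_streaming` and the default sampling values are illustrative, not part of this commit.

from threading import Thread

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)

def respond_streaming(messages, max_tokens=256, temperature=0.7, top_p=0.95):
    # Render the chat history with the model's chat template.
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    # generate() blocks until generation finishes, so run it in a worker
    # thread and read decoded text pieces off the streamer as they arrive.
    thread = Thread(
        target=model.generate,
        kwargs=dict(
            inputs=input_ids,
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            streamer=streamer,
        ),
    )
    thread.start()
    response = ""
    for token in streamer:
        response += token
        yield response

This keeps the `response += token; yield response` generator contract of the original `respond`, which the now commented-out `client.chat_completion(..., stream=True)` call provided via the Inference API.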