hoduyquocbao committed
Commit b04fed6
1 Parent(s): 69c1d60

new version update

Files changed (1)
  1. app.py +31 -2
app.py CHANGED
@@ -6,6 +6,22 @@ For more information on `huggingface_hub` Inference API support, please check th
 """
 client = InferenceClient("meta-llama/Llama-3.2-3B-Instruct")
 
+import torch
+from transformers import pipeline
+
+model_id = "meta-llama/Llama-3.2-3B-Instruct"
+pipe = pipeline(
+    "text-generation",
+    model=model_id,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+)
+messages = [
+    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
+    {"role": "user", "content": "Who are you?"},
+]
+
+# print(outputs[0]["generated_text"][-1])
 
 def respond(
     message,
@@ -27,14 +43,27 @@ def respond(
 
     response = ""
 
-    for message in client.chat_completion(
+    # outputs = pipe(
+    #     messages,
+    #     max_new_tokens=256,
+    # )
+
+    # for message in client.chat_completion(
+    #     messages,
+    #     max_tokens=max_tokens,
+    #     stream=True,
+    #     temperature=temperature,
+    #     top_p=top_p,
+    # ):
+    for message in pipe(
         messages,
         max_tokens=max_tokens,
         stream=True,
         temperature=temperature,
         top_p=top_p,
     ):
-        token = message.choices[0].delta.content
+        # token = message.choices[0].delta.content
+        token = message[0]["generated_text"][-1]
 
         response += token
         yield response
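
As committed, the rewritten loop is unlikely to run: the `transformers` text-generation pipeline takes `max_new_tokens` rather than `max_tokens`, has no `stream=True` keyword, and returns `[{"generated_text": ...}]`, so `message[0]["generated_text"][-1]` does not index a token string. Below is a minimal sketch of a streaming variant built on transformers' `TextIteratorStreamer`, assuming the same model; the function name `respond_streaming` and the default sampling values are illustrative, not part of this commit.

from threading import Thread

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)

def respond_streaming(messages, max_tokens=256, temperature=0.7, top_p=0.95):
    # Render the chat history with the model's chat template.
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    # generate() blocks until generation finishes, so run it in a worker
    # thread and read decoded text pieces off the streamer as they arrive.
    thread = Thread(
        target=model.generate,
        kwargs=dict(
            inputs=input_ids,
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            streamer=streamer,
        ),
    )
    thread.start()
    response = ""
    for token in streamer:
        response += token
        yield response

This keeps the `response += token; yield response` generator contract of the original `respond`, which the now commented-out `client.chat_completion(..., stream=True)` call provided via the Inference API.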