Paulus Michael Leang committed
Commit
c8057e0
1 Parent(s): cda1776
Files changed (2)
  1. app.py +40 -1
  2. requirements.txt +4 -1
app.py CHANGED
@@ -1,7 +1,46 @@
 from fastapi import FastAPI
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from pydantic import BaseModel
+
+device = "cuda"  # the device to move the tokenized inputs onto
+
+model = AutoModelForCausalLM.from_pretrained(
+    "Qwen/Qwen2-72B-Instruct",
+    torch_dtype="auto",
+    device_map="auto"  # let accelerate place the model weights automatically
+)
+tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-72B-Instruct")
+
+class ChatRequest(BaseModel):
+    prompt: str
 
 app = FastAPI()
 
 @app.get("/")
 def greet_json():
-    return {"Hello": "World!"}
+    return {"Hello": "World!"}
+
+@app.post("/generate_chat")
+def generate_chat(request: ChatRequest):
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": request.prompt}
+    ]
+    text = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,  # return the formatted prompt as a string
+        add_generation_prompt=True
+    )
+    model_inputs = tokenizer([text], return_tensors="pt").to(device)
+
+    generated_ids = model.generate(
+        model_inputs.input_ids,
+        max_new_tokens=512  # cap on the length of the generated reply
+    )
+    generated_ids = [
+        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)  # drop the echoed prompt tokens
+    ]
+
+    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+    return {"answer": response}
requirements.txt CHANGED
@@ -1,2 +1,5 @@
 fastapi
-uvicorn[standard]
+uvicorn[standard]
+transformers
+pydantic
+accelerate
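Note that accelerate is the package backing device_map="auto", and torch must also be importable for AutoModelForCausalLM even though it is not pinned here. A minimal sketch of launching the app programmatically, assuming the file keeps the name app.py and the conventional Hugging Face Spaces port 7860 (both assumptions):

import uvicorn

# Hypothetical launcher; assumes the FastAPI instance is `app` in app.py.
if __name__ == "__main__":
    uvicorn.run("app:app", host="0.0.0.0", port=7860)  # 7860: default Spaces port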