# cpu_inf/main.py: serve a quantized TinyLlama chat model on CPU with ctransformers and FastAPI.
from fastapi import FastAPI
from pydantic import BaseModel
from ctransformers import AutoModelForCausalLM

# Load the quantized chat model once at startup; ctransformers runs
# GGUF/GGML checkpoints on CPU, so no GPU is required.
llm = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v0.6",
    model_file="ggml-model-q4_0.gguf",
)
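
# Generation can be tuned per call (a sketch, not part of the original file;
# the parameter values are illustrative, and `stream=True` yields text pieces
# incrementally instead of one string):
#
#   text = llm("Hello", max_new_tokens=256, temperature=0.8)
#   for piece in llm("Hello", stream=True):
#       print(piece, end="", flush=True)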

# Pydantic model describing the request body: a single prompt string.
class Validation(BaseModel):
    prompt: str

# FastAPI app that exposes the model over HTTP.
app = FastAPI()
@app.post("/llm_on_cpu")
async def stream(item: validation):
prefix="""<|user|>
"""
suffix="""<|endoftext|><|assistant|>"""
user="""
{prompt}"""
prompt = f"{prefix}{user.replace('{prompt}', item.prompt)}{suffix}"
return llm(prompt)
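
# Usage sketch (an assumption, not part of the original file): serve the app
# with `uvicorn main:app --host 0.0.0.0 --port 8000`, then POST a JSON body
# containing a "prompt" field. The URL and prompt below are illustrative.
#
#   import requests
#   resp = requests.post(
#       "http://localhost:8000/llm_on_cpu",
#       json={"prompt": "Explain what a GGUF file is in one sentence."},
#   )
#   print(resp.json())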