# Source: Hugging Face Space "daniellefranca96/cpu_inf" (Space status when captured: Sleeping).
# File size: 1,878 bytes.
# Commit hashes captured alongside the file (presumably the page's per-line history column):
# d20d20b, c2af308, d899b2a, c51a031, b7ebcbb, 15d43aa, f2886e5, 726fee7.

from fastapi import FastAPI
from pydantic import BaseModel
import requests
from llama_cpp import Llama 

# Registry of the models this service can run, keyed by the name a client
# sends in the request's `llm` field. Each entry carries:
#   nctx   - context size configured for the model
#   file   - GGUF file name, resolved relative to the working directory
#   prefix - text placed before the user prompt
#   suffix - text placed after the user prompt
llms = {
    "TinyLLama 1b 4_K_M 2048": {
        "nctx": 2048,
        "file": "tinyllama-1.1b-chat-v0.3.Q4_K_M.gguf",
        "prefix": "### Human:",
        "suffix": "### Assistant:",
    },
    "TinyLLama 1b OpenOrca 4_K_M 2048": {
        "nctx": 2048,
        "file": "tinyllama-1.1b-1t-openorca.Q4_K_M.gguf",
        "prefix": (
            "<|im_start|>system You are a helpfull assistant<|im_end|>"
            "<|im_start|>user"
        ),
        "suffix": "<|im_end|><|im_start|>assistant",
    },
    "OpenLLama 3b 4_K_M 196k": {
        "nctx": 80000,
        "file": "open-llama-3b-v2-wizard-evol-instuct-v2-196k.Q4_K_M.gguf",
        "prefix": "### HUMAN:",
        "suffix": "### RESPONSE:",
    },
    "Phi-2 2.7b 4_K_M 2048": {
        "nctx": 2048,
        "file": "phi-2.Q4_K_M.gguf",
        "prefix": "Instruct:",
        "suffix": "Output:",
    },
    "Mixtral MOE 7bx2 4_K_M 32K": {
        "nctx": 32000,
        "file": "mixtral_7bx2_moe.Q4_K_M.gguf",
        # This model is prompted with the raw user text, no template.
        "prefix": "",
        "suffix": "",
    },
    "Stable Zephyr 3b 4_K_M 4096": {
        "nctx": 4096,
        "file": "stablelm-zephyr-3b.Q4_K_M.gguf",
        "prefix": "<|user|>",
        "suffix": "<|endoftext|><|assistant|>",
    },
}

# Pydantic request model for the POST /llm_on_cpu endpoint.
class validation(BaseModel):
    # User text that gets wrapped in the selected model's prefix/suffix.
    prompt: str
    # Name of the model to run; looked up as a key in the `llms` registry.
    llm: str
    # Token budget forwarded as `max_tokens` to the completion call.
    max_tokens: int = 512
    # Requested context window size.
    nctx: int = 2048
    
    
# FastAPI application object; the route handler below registers on it
# through the `@app.post` decorator.
app = FastAPI()

@app.post("/llm_on_cpu")
async def stream(item: validation):
    """Run one completion on a registered CPU model.

    Looks up ``item.llm`` in the ``llms`` registry, wraps ``item.prompt`` in
    that model's prefix/suffix, loads the model file with ``Llama`` and
    returns the result of calling the loaded model on the wrapped prompt.

    Args:
        item: Validated request body (prompt, model name, token budget and
            context size).

    Returns:
        Whatever the ``Llama`` instance returns for the prompt
        (presumably llama-cpp's completion dict -- confirm against the
        installed llama-cpp-python version).

    Raises:
        HTTPException: 404 when ``item.llm`` is not a key of ``llms``.
    """
    # Imported locally because the file's import block only pulls in
    # ``FastAPI`` from the fastapi package.
    from fastapi import HTTPException

    config = llms.get(item.llm)
    if config is None:
        # Previously an unknown name surfaced as a bare KeyError (HTTP 500).
        raise HTTPException(
            status_code=404,
            detail=f"Unknown llm {item.llm!r}; available: {sorted(llms)}",
        )

    # Request values take precedence; the fallbacks only apply if a field
    # arrives as None. The context size computed here is now actually passed
    # to the model (it used to be computed and then ignored).
    n_ctx = item.nctx if item.nctx is not None else config["nctx"]
    max_tokens = item.max_tokens if item.max_tokens is not None else 512

    # Same layout as before: prefix, a newline plus four spaces, the user
    # text, then the suffix directly after it.
    prefix = config["prefix"]
    suffix = config["suffix"]
    prompt = f"{prefix}\n    {item.prompt}{suffix}"

    # NOTE(review): the model is loaded from disk on every request and the
    # inference call blocks the event loop; consider caching the instance
    # and/or running it in a worker thread if latency matters.
    engine = Llama(
        model_path="./" + config["file"],
        n_ctx=n_ctx,
        verbose=False,
        n_threads=8,
    )
    # Call the loaded instance; the original referenced an undefined name
    # here and failed with NameError on every request.
    return engine(prompt, max_tokens=max_tokens)