import json
from typing import Any, Generator, List

import fastapi
import markdown
import uvicorn
from ctransformers import AutoModelForCausalLM
from fastapi import HTTPException
from fastapi.responses import HTMLResponse
from fastapi.middleware.cors import CORSMiddleware
from sse_starlette.sse import EventSourceResponse
from pydantic import BaseModel, Field
from typing_extensions import Literal

from dialogue import DialogueTemplate

# Alternative model, kept for reference:
# llm = AutoModelForCausalLM.from_pretrained("gsaivinay/airoboros-13B-gpt4-1.3-GGML",
#                                            model_file="airoboros-13b-gpt4-1.3.ggmlv3.q4_1.bin",
#                                            model_type="llama")

# Load the quantized GGML model once at startup; all endpoints share this instance.
llm = AutoModelForCausalLM.from_pretrained("TheBloke/Llama-2-13B-chat-GGML",
                                           model_file="llama-2-13b-chat.ggmlv3.q2_K.bin",
                                           model_type="llama")

app = fastapi.FastAPI(title="Starchat Beta")

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.get("/")
async def index():
    """Render README.md as HTML for the landing page."""
    with open("README.md", "r", encoding="utf-8") as readme_file:
        md_template_string = readme_file.read()
    html_content = markdown.markdown(md_template_string)
    return HTMLResponse(content=html_content, status_code=200)


@app.get("/stream")
async def chat(prompt: str = "<|user|> Write an express server with server sent events. <|assistant|>"):
    """Stream raw generated text for a single prompt as server-sent events."""
    tokens = llm.tokenize(prompt)

    async def server_sent_events(chat_chunks, llm):
        yield prompt
        for chat_chunk in llm.generate(chat_chunks):
            yield llm.detokenize(chat_chunk)
        yield ""

    return EventSourceResponse(server_sent_events(tokens, llm))


class ChatCompletionRequestMessage(BaseModel):
    role: Literal["system", "user", "assistant"] = Field(
        default="user", description="The role of the message."
    )
    content: str = Field(default="", description="The content of the message.")


class ChatCompletionRequest(BaseModel):
    messages: List[ChatCompletionRequestMessage] = Field(
        default=[], description="A list of messages to generate completions for."
    )


system_message = "Below is a conversation between a human user and a helpful AI coding assistant."


@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    """Stream OpenAI-style chat completion chunks as server-sent events."""
    kwargs = request.dict()

    dialogue_template = DialogueTemplate(
        system=system_message, messages=kwargs["messages"]
    )
    prompt = dialogue_template.get_inference_prompt()

    tokens = llm.tokenize(prompt)

    try:
        chat_chunks = llm.generate(tokens)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

    async def format_response(chat_chunks: Generator) -> Any:
        for chat_chunk in chat_chunks:
            content = llm.detokenize(chat_chunk)
            response = {
                "choices": [
                    {
                        "message": {
                            "role": "system",
                            "content": content,
                        },
                        "finish_reason": "stop" if content == "[DONE]" else "unknown",
                    }
                ]
            }
            yield f"data: {json.dumps(response)}\n\n"
        yield "event: done\ndata: {}\n\n"

    return EventSourceResponse(format_response(chat_chunks), media_type="text/event-stream")


@app.post("/v0/chat/completions")
async def chatV0(request: ChatCompletionRequest, response_mode=None):
    """Stream plain detokenized text chunks, terminated by a [DONE] event."""
    kwargs = request.dict()

    dialogue_template = DialogueTemplate(
        system=system_message, messages=kwargs["messages"]
    )
    prompt = dialogue_template.get_inference_prompt()

    tokens = llm.tokenize(prompt)

    async def server_sent_events(chat_chunks, llm):
        for token in llm.generate(chat_chunks):
            yield dict(data=llm.detokenize(token))
        yield dict(data="[DONE]")

    return EventSourceResponse(server_sent_events(tokens, llm))


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
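
# Example client (a minimal sketch, not part of the server): consumes the
# /v0/chat/completions SSE stream and prints tokens as they arrive. It assumes
# the server above is running locally on port 8000 and that the `requests`
# package is installed; adjust the host, port, and payload as needed.
#
#   import requests
#
#   payload = {"messages": [{"role": "user", "content": "Write a hello world in Python."}]}
#   with requests.post("http://localhost:8000/v0/chat/completions",
#                      json=payload, stream=True) as resp:
#       for line in resp.iter_lines(decode_unicode=True):
#           if line and line.startswith("data: "):
#               data = line[len("data: "):]
#               if data == "[DONE]":
#                   break
#               print(data, end="", flush=True)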