import fastapi
import json
import markdown
import uvicorn
from fastapi.responses import HTMLResponse
from fastapi.middleware.cors import CORSMiddleware
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from ctransformers import AutoModelForCausalLM
from pydantic import BaseModel
from sse_starlette.sse import EventSourceResponse
# Load the GGML-quantized MPT-7B StoryWriter model via ctransformers.
llm = AutoModelForCausalLM.from_pretrained('TheBloke/MPT-7B-Storywriter-GGML',
                                           model_file='mpt-7b-storywriter.ggmlv3.q4_0.bin',
                                           model_type='mpt')
app = fastapi.FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Route assumed: serve the rendered README at the root of the Space.
@app.get("/")
async def index():
    with open("README.md", "r", encoding="utf-8") as readme_file:
        md_template_string = readme_file.read()
    html_content = markdown.markdown(md_template_string)
    return HTMLResponse(content=html_content, status_code=200)
class ChatCompletionRequest(BaseModel):
    prompt: str
# Route assumed: a minimal demo page that renders tokens from /stream.
@app.get("/demo")
async def demo():
    html_content = """
    <!DOCTYPE html>
    <html>
      <head>
        <style>
          body {
            align-items: center;
            background-color: #d9b99b;
            display: flex;
            height: 100vh;
            justify-content: center;
          }
          #content {
            align-items: center;
            background-color: #fff0db;
            box-shadow:
              12px 12px 16px 0 rgba(0, 0, 0, 0.25),
              -8px -8px 12px 0 rgba(255, 255, 255, 0.3);
            border-radius: 50px;
            display: flex;
            padding: 50px;
            justify-content: center;
            margin-right: 4rem;
            font-size: 16px;
          }
        </style>
      </head>
      <body>
        <div id="content"></div>
        <script>
          var source = new EventSource("https://matthoffner-storywriter.hf.space/stream");
          source.onmessage = function(event) {
            document.getElementById("content").innerHTML += event.data;
          };
        </script>
      </body>
    </html>
    """
    return HTMLResponse(content=html_content, status_code=200)
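
# With the server running, the demo UI is at http://localhost:8000/demo
# (or the Space URL); it appends each SSE token from /stream to the page.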
# Route assumed. Passing stream=True makes llm() return a generator of tokens
# rather than blocking and returning one finished string, so tokens can be
# forwarded over SSE as they are produced.
@app.get("/flow")
async def flow(prompt=""):
    completion = llm(prompt, stream=True)

    async def server_sent_events(chat_chunks):
        yield prompt
        for chat_chunk in chat_chunks:
            yield chat_chunk
        yield ""

    return EventSourceResponse(server_sent_events(completion))
# The demo page above points its EventSource at /stream, so that route is
# used here. Tokenizes the prompt, then detokenizes each generated token id.
@app.get("/stream")
async def chat(prompt="Once upon a time there was a "):
    tokens = llm.tokenize(prompt)

    async def server_sent_events(chat_chunks, llm):
        yield prompt
        for chat_chunk in llm.generate(chat_chunks):
            yield llm.detokenize(chat_chunk)
        yield ""

    return EventSourceResponse(server_sent_events(tokens, llm))
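
# Quick check from a shell (curl -N disables buffering so tokens print live):
#   curl -N "http://localhost:8000/stream?prompt=Once%20upon%20a%20time"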
# Route and name assumed: an OpenAI-style completions endpoint taking a JSON
# body; renamed so it does not shadow the /stream handler above.
@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest, response_mode=None):
    completion = llm(request.prompt, stream=True)

    async def server_sent_events(chat_chunks):
        for chat_chunk in chat_chunks:
            yield dict(data=json.dumps(chat_chunk))
        yield dict(data="[DONE]")

    return EventSourceResponse(server_sent_events(completion))
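
# Example request against the assumed route (JSON body per ChatCompletionRequest):
#   curl -N -X POST http://localhost:8000/v1/chat/completions \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Once upon a time there was a "}'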
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
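
# A minimal client sketch for consuming the SSE stream, assuming the server
# above is running locally and httpx is installed (both are assumptions, not
# part of the app itself):
#
#   import httpx
#
#   with httpx.stream("GET", "http://localhost:8000/stream", timeout=None) as r:
#       for line in r.iter_lines():
#           if line.startswith("data:"):
#               print(line[len("data:"):].strip(), end="", flush=True)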