#!/usr/bin/env python
import os
import requests
from threading import Thread
from typing import Iterator
import gradio as gr
import psutil
import spaces
import torch
from time import time
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from llama_cpp import Llama
# Load the model URI like this - use the variable everywhere.
model_uri_hf = os.getenv("MODEL_URI_HF")
# TODO: show a warning when it is empty, with a brief description of how to set it.
# TODO: also add a "how to search" link (TheBloke's profile by default), an example search link,
#       and an example full value (Mistral base?).
# TODO: add info about RAM requirements.
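# Minimal sketch of the warning described above (assumption: a console hint is enough here;
# the links are illustrative pointers, not a fixed policy).
if not model_uri_hf:
    print(
        "WARNING: MODEL_URI_HF is not set. Point it at a direct GGUF download link, i.e. a "
        "'.../resolve/main/<file>.gguf' URL, e.g. from https://huggingface.co/TheBloke "
        "(browse https://huggingface.co/models?search=gguf for candidates)."
    )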
# DEBUG! Hardcoded override for testing; note that the /resolve/ URL points at the raw file,
# while the /blob/ variant only points at the web page.
model_uri_hf = "https://huggingface.co/TheBloke/neural-chat-7B-v3-2-GGUF/resolve/main/neural-chat-7b-v3-2.Q2_K.gguf"
# Maybe use git lfs to download instead?
# Initializing things
print(f"debug: init model: {model_uri_hf}")
# Check if the model file already exists
if not os.path.isfile('model.bin'):
    # Download the model
    response = requests.get(model_uri_hf)
    # Save the model to a local file
    with open('model.bin', 'wb') as file:
        file.write(response.content)
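
# Note: requests.get() above buffers the entire GGUF file in memory before writing it out.
# A streamed download keeps memory usage flat; this helper is only a sketch and is not
# wired in (the name download_to_file is illustrative).
def download_to_file(url: str, dest: str, chunk_size: int = 1 << 20) -> None:
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(dest, 'wb') as out:
            for chunk in response.iter_content(chunk_size=chunk_size):
                out.write(chunk)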
llm = Llama(model_path="./model.bin") # LLaMa model
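# Note: `llm` is the llama.cpp handle for the downloaded GGUF model; the generate()
# function below currently only drives the transformers model from the GPU path.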
print("! INITING DONE !")
# Preparing things for the UI
title = "# Demo for 7B Models - Quantized"
descr = '''
Models are quantized so they can run within the free-tier hosting limits.
A quick way to test models or share them with others without hassle.
It runs slowly, as it is CPU-only, but it is usable for basic tests.
It uses quantized models in GGUF format and llama.cpp to run them.
Powered by ...'''
print(f"DEBUG: Memory free: {psutil.virtual_memory().free / (1024.0 ** 3)} GiB")
print(f"DEBUG: Memory available: {psutil.virtual_memory().available / (1024.0 ** 3)} GiB")
print(f"DEBUG: Memory: {psutil.virtual_memory().total / (1024.0 ** 3)} GiB")
DESCRIPTION = f"# Test model: {model_uri_hf}"
if not torch.cuda.is_available():
    DESCRIPTION += "\n<p>This Space is using CPU only. Use a different one if you want to go fast and use a GPU.</p>"
# TODO: probably lower these - maybe 200 in and 500 out? Should be enough for a quick test.
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
if torch.cuda.is_available():
    model_id = "mistralai/Mistral-7B-Instruct-v0.1"
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
# We need to make sure we only run one generation thread at a time, or we will probably run out of RAM.
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    max_new_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> Iterator[str]:
    conversation = []
    for user, assistant in chat_history:
        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
    input_ids = input_ids.to(model.device)
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        {"input_ids": input_ids},
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        num_beams=1,
        repetition_penalty=repetition_penalty,
    )
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)
chat_interface = gr.ChatInterface(
    fn=generate,
    additional_inputs=[
        gr.Slider(
            label="Max new tokens",
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            step=1,
            value=DEFAULT_MAX_NEW_TOKENS,
        ),
        gr.Slider(
            label="Temperature",
            minimum=0.1,
            maximum=4.0,
            step=0.1,
            value=0.6,
        ),
        gr.Slider(
            label="Top-p (nucleus sampling)",
            minimum=0.05,
            maximum=1.0,
            step=0.05,
            value=0.9,
        ),
        gr.Slider(
            label="Top-k",
            minimum=1,
            maximum=1000,
            step=1,
            value=50,
        ),
        gr.Slider(
            label="Repetition penalty",
            minimum=1.0,
            maximum=2.0,
            step=0.05,
            value=1.2,
        ),
    ],
    stop_btn=None,
    # TODO: add more eval examples, e.g. a longer list taken from teknium and others, maybe grouped by type.
    examples=[
        ["Hello there! How are you doing?"],
        ["Can you explain briefly to me what is the Python programming language?"],
        ["Explain the plot of Cinderella in a sentence."],
        ["How many hours does it take a man to eat a Helicopter?"],
        ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
    ],
)
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(title)
    gr.Markdown(descr)
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_id="duplicate-button",
        visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
        # add
    )
    chat_interface.render()
if __name__ == "__main__":
    demo.queue(max_size=20).launch()