Spaces:

KwabsHug
/

GameConfigIdea

Running on Zero

GameConfigIdea / leveraging_machine_learning.py

kwabs22

Some changes and file splitting

a69d738 about 2 months ago

12.4 kB

	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
	import gc
	import sys
	from diffusers import FluxPipeline
	import time
	from sentence_transformers import SentenceTransformer
	import psutil
	import json
	import spaces
	from threading import Thread
	#-----------------
	from relatively_constant_variables import knowledge_base

	# Initialize the zero tensor on CUDA.
	# This module-level tensor is created once at import time; generate_response()
	# later reads `zero.device` to decide where the tokenized inputs are moved.
	zero = torch.Tensor([0]).cuda()
	# NOTE(review): the 'cpu' vs 'cuda:0' behaviour described here and in
	# generate_response() comes from the `spaces` package — confirm against its docs.
	print(zero.device) # This will print 'cpu' outside the @spaces.GPU decorated function

	# Repository ids of the text-generation checkpoints that can be selected.
	modelnames = [
	    "stvlynn/Gemma-2-2b-Chinese-it",
	    "nbeerbower/mistral-nemo-wissenschaft-12B",
	    "princeton-nlp/gemma-2-9b-it-SimPO",
	    "cognitivecomputations/dolphin-2.9.3-mistral-7B-32k",
	    "01-ai/Yi-Coder-9B-Chat",
	    "ArliAI/Llama-3.1-8B-ArliAI-RPMax-v1.1",
	    "ArliAI/Phi-3.5-mini-3.8B-ArliAI-RPMax-v1.1",
	    "Qwen/Qwen2.5-7B-Instruct",
	    "Qwen/Qwen2-0.5B-Instruct",
	    "Qwen/Qwen2-1.5B-Instruct",
	    "Qwen/Qwen2-7B-Instruct",
	    "Qwen/Qwen1.5-MoE-A2.7B-Chat",
	    "HuggingFaceTB/SmolLM-135M-Instruct",
	    "microsoft/Phi-3-mini-4k-instruct",
	    "Groq/Llama-3-Groq-8B-Tool-Use",
	    "hugging-quants/Meta-Llama-3.1-8B-Instruct-BNB-NF4",
	    "SpectraSuite/TriLM_3.9B_Unpacked",
	    "h2oai/h2o-danube3-500m-chat",
	    "OuteAI/Lite-Mistral-150M-v2-Instruct",
	    "Zyphra/Zamba2-1.2B",
	    "anthracite-org/magnum-v2-4b",
	]

	# Repository ids of the image-generation checkpoints.
	imagemodelnames = ["black-forest-labs/FLUX.1-schnell"]

	# Both selections start at the first entry of their list.
	current_model_index = current_image_model_index = 0
	modelname = modelnames[current_model_index]
	imagemodelname = imagemodelnames[current_image_model_index]

	# Filled in by load_model() / load_image_model() after a successful load.
	lastmodelnameinloadfunction = lastimagemodelnameinloadfunction = None

	# Sentence-embedding model shared by the knowledge-base indexing and retrieve().
	embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

	# Handles to the currently active text model, its tokenizer and the image
	# pipeline; all start empty and are rebound by the load_* functions.
	model = tokenizer = flux_pipe = None

	# Per-model bookkeeping written by the load_* functions, keyed by model name.
	loaded_models = dict()

	def get_size_str(bytes):
	    """Format a byte count as a human-readable string with two decimals.

	    The value is divided by 1024 until its magnitude drops below 1024 and is
	    then suffixed with the matching unit (B, KB, MB, GB, TB). Values of
	    1024 TB or more are reported in PB instead of falling through and
	    returning ``None``. Negative values (e.g. a negative memory delta) are
	    scaled by magnitude and keep their sign.

	    Args:
	        bytes: Size in bytes (int or float). The parameter name is kept for
	            backward compatibility even though it shadows the builtin.

	    Returns:
	        A string such as ``"1.50 KB"``.
	    """
	    size = bytes
	    for unit in ('B', 'KB', 'MB', 'GB', 'TB'):
	        # Compare on magnitude so that negative deltas are scaled as well.
	        if abs(size) < 1024:
	            return f"{size:.2f} {unit}"
	        size /= 1024
	    # Anything left over is at least 1024 TB; report it in the next unit up.
	    return f"{size:.2f} PB"

	def load_model(model_name):
	    """Load a causal language model and its tokenizer, replacing the current ones.

	    The previously held model/tokenizer references are dropped and collected
	    *before* the memory baseline is taken, so the reported "GPU memory used"
	    reflects the newly loaded model rather than the difference between the
	    old and the new one.

	    Side effects:
	        - rebinds the module globals ``model`` and ``tokenizer``;
	        - stores ``[timestamp string, memory_used]`` in
	          ``loaded_models[model_name]``;
	        - sets ``lastmodelnameinloadfunction`` to
	          ``(model_name, model_size, tokenizer_size)``.

	    Args:
	        model_name: Model identifier passed to ``from_pretrained``.

	    Returns:
	        A status string with the model size, tokenizer size and GPU memory used.
	    """
	    global model, tokenizer, lastmodelnameinloadfunction, loaded_models

	    print(f"Loading model and tokenizer: {model_name}")

	    # Drop the old objects first, collect them, and only then release the
	    # cached CUDA blocks; emptying the cache before collection cannot free
	    # memory that is still referenced.
	    model = None
	    tokenizer = None
	    gc.collect()
	    torch.cuda.empty_cache()

	    # Baseline taken after cleanup so the delta below belongs to the new model.
	    initial_memory = torch.cuda.memory_allocated()

	    model = AutoModelForCausalLM.from_pretrained(
	        model_name,
	        torch_dtype="auto",
	        device_map="auto"
	    )
	    tokenizer = AutoTokenizer.from_pretrained(model_name)

	    model_size = sum(p.numel() * p.element_size() for p in model.parameters())
	    # sys.getsizeof is shallow, so this is only a rough lower bound.
	    tokenizer_size = sum(sys.getsizeof(v) for v in tokenizer.__dict__.values())

	    memory_used = torch.cuda.memory_allocated() - initial_memory

	    # Only bookkeeping metadata is kept here; the live objects are held in
	    # the ``model`` / ``tokenizer`` globals.
	    loaded_models[model_name] = [str(time.time()), memory_used]
	    lastmodelnameinloadfunction = (model_name, model_size, tokenizer_size)

	    print(f"Model and tokenizer {model_name} loaded successfully")
	    print(f"Model size: {get_size_str(model_size)}")
	    print(f"Tokenizer size: {get_size_str(tokenizer_size)}")
	    print(f"GPU memory used: {get_size_str(memory_used)}")

	    return (f"Model and tokenizer {model_name} loaded successfully. "
	            f"Model size: {get_size_str(model_size)}, "
	            f"Tokenizer size: {get_size_str(tokenizer_size)}, "
	            f"GPU memory used: {get_size_str(memory_used)}")

	def load_image_model(imagemodelname):
	    """Load the FLUX image pipeline, replacing any pipeline currently held.

	    The previous pipeline reference is dropped and collected before the
	    memory baseline is taken, so the reported delta belongs to the new
	    pipeline only.

	    Side effects:
	        - rebinds the module global ``flux_pipe`` and enables model CPU
	          offload on it;
	        - stores ``[timestamp string, memory_used]`` in
	          ``loaded_models[imagemodelname]``;
	        - sets ``lastimagemodelnameinloadfunction`` to
	          ``(imagemodelname, model_size)``.

	    Args:
	        imagemodelname: Model identifier passed to
	            ``FluxPipeline.from_pretrained``.

	    Returns:
	        A status string with the transformer size and GPU memory used.
	    """
	    global flux_pipe, lastimagemodelnameinloadfunction, loaded_models

	    print(f"Loading image model: {imagemodelname}")

	    # Release the old pipeline before measuring: collect first, then empty
	    # the CUDA cache so the freed blocks are actually returned.
	    flux_pipe = None
	    gc.collect()
	    torch.cuda.empty_cache()

	    initial_memory = torch.cuda.memory_allocated()

	    flux_pipe = FluxPipeline.from_pretrained(imagemodelname, torch_dtype=torch.bfloat16)
	    flux_pipe.enable_model_cpu_offload()

	    # Only the transformer's parameters are counted; no tokenizer size is reported.
	    model_size = sum(p.numel() * p.element_size() for p in flux_pipe.transformer.parameters())

	    # NOTE(review): with CPU offload enabled this delta is presumably small
	    # right after loading — confirm against the diffusers documentation.
	    memory_used = torch.cuda.memory_allocated() - initial_memory

	    # Only bookkeeping metadata is kept here; the live pipeline is held in
	    # the ``flux_pipe`` global.
	    loaded_models[imagemodelname] = [str(time.time()), memory_used]
	    lastimagemodelnameinloadfunction = (imagemodelname, model_size)

	    print(f"Model and tokenizer {imagemodelname} loaded successfully")
	    print(f"Model size: {get_size_str(model_size)}")
	    print(f"GPU memory used: {get_size_str(memory_used)}")

	    return (f"Model and tokenizer {imagemodelname} loaded successfully. "
	            f"Model size: {get_size_str(model_size)}, "
	            f"GPU memory used: {get_size_str(memory_used)}")


	def clear_all_models():
	    """Release every loaded model and empty the ``loaded_models`` registry.

	    The registry entries written by the load functions are plain
	    ``[timestamp, memory_used]`` lists, not model objects, so nothing in the
	    registry needs to be moved between devices. The live objects are held
	    in the ``model``, ``tokenizer`` and ``flux_pipe`` globals; dropping those
	    references and collecting garbage is what releases their memory.

	    Returns:
	        The fixed confirmation string ``"All models cleared from memory."``.
	    """
	    global model, tokenizer, flux_pipe, loaded_models

	    model = None
	    tokenizer = None
	    flux_pipe = None
	    loaded_models.clear()

	    # Collect first so the cached CUDA blocks of the dropped objects are
	    # actually free by the time the cache is emptied.
	    gc.collect()
	    torch.cuda.empty_cache()
	    return "All models cleared from memory."

	def load_model_list(model_list):
	    """Load each named model in order and return their status messages.

	    Every name is passed to load_model(); the returned messages are joined
	    with newlines in the same order.
	    """
	    return "\n".join([load_model(name) for name in model_list])

	def loaded_model_list():
	    """Return the module-level ``loaded_models`` registry (the same object, not a copy)."""
	    registry = loaded_models
	    return registry


	# Initial model load.
	# Runs at import time: loads the default text model (modelnames[0]) and the
	# default image model so the globals `model`, `tokenizer` and `flux_pipe`
	# are populated before any generate_* function is called.
	load_model(modelname)
	load_image_model(imagemodelname)

	# Create embeddings for the knowledge base.
	# Each entry of `knowledge_base` is read through its "content" key; the
	# result is computed once here and reused by retrieve() on every query.
	knowledge_base_embeddings = embedding_model.encode([doc["content"] for doc in knowledge_base])

	def retrieve(query, k=2):
	    """Return the ``k`` knowledge-base entries most similar to ``query``.

	    The query is embedded with ``embedding_model`` and compared to the
	    precomputed ``knowledge_base_embeddings`` by cosine similarity.

	    Args:
	        query: Text to look up.
	        k: Maximum number of entries to return.

	    Returns:
	        A list of ``(content, id)`` tuples, most similar first.
	    """
	    query_vectors = torch.tensor(embedding_model.encode([query]))
	    corpus_vectors = torch.tensor(knowledge_base_embeddings)
	    scores = torch.nn.functional.cosine_similarity(query_vectors, corpus_vectors)
	    ranking = scores.argsort(descending=True)

	    matches = []
	    for position in ranking[:k]:
	        entry = knowledge_base[position]
	        matches.append((entry["content"], entry["id"]))
	    return matches

	def get_ram_usage():
	    """Return a one-line summary of system RAM usage.

	    Reads ``psutil.virtual_memory()`` and reports the used percentage plus
	    the available and total amounts converted from bytes to GB
	    (divided by ``1024 ** 3``).

	    Returns:
	        A string of the form
	        ``"RAM Usage: 42.00%, Available: 10.00GB, Total: 16.00GB"``.
	    """
	    ram = psutil.virtual_memory()
	    bytes_per_gb = 1024 ** 3
	    return (f"RAM Usage: {ram.percent:.2f}%, "
	            f"Available: {ram.available / bytes_per_gb:.2f}GB, "
	            f"Total: {ram.total / bytes_per_gb:.2f}GB")

	# Module-wide store of generation records, keyed "output_1", "output_2", ...
	output_dict = dict()

	def empty_output_dict():
	    """Rebind the global ``output_dict`` to a fresh empty dict and log that it was emptied."""
	    global output_dict
	    output_dict = dict()
	    print("Output dictionary has been emptied.")

	def get_model_details(model):
	    """Summarise a model as a dict with its name, architecture and parameter count.

	    ``architecture`` is the first entry of ``model.config.architectures``,
	    or ``"Unknown"`` when that attribute is empty or ``None``.
	    """
	    config = model.config
	    name = config.name_or_path

	    architectures = config.architectures
	    if architectures:
	        architecture = architectures[0]
	    else:
	        architecture = "Unknown"

	    parameter_count = 0
	    for param in model.parameters():
	        parameter_count += param.numel()

	    return {
	        "name": name,
	        "architecture": architecture,
	        "num_parameters": parameter_count,
	    }

	def get_tokenizer_details(tokenizer):
	    """Summarise a tokenizer as a dict with its class name, vocab size and max length."""
	    details = {}
	    details["name"] = tokenizer.__class__.__name__
	    details["vocab_size"] = tokenizer.vocab_size
	    details["model_max_length"] = tokenizer.model_max_length
	    return details

	@spaces.GPU
	def generate_response(prompt, use_rag, stream=False):
	    """Generate a chat reply with the currently loaded model.

	    This function is a generator in both modes: with ``stream=True`` it
	    yields after every streamed text chunk, with ``stream=False`` it yields
	    exactly once with the complete reply. A record of the call is stored in
	    the global ``output_dict`` under the key ``"output_<n>"``.

	    Args:
	        prompt: The user's question.
	        use_rag: When true, the top documents from retrieve() are prepended
	            to the prompt as context and their ids are reported.
	        stream: When true, stream partial text as it is produced.

	    Yields:
	        Tuples ``(generated_text, tokens_per_second, ram_usage, doc_ids)``
	        where ``tokens_per_second`` is a string with two decimals and
	        ``doc_ids`` is a list of ids or ``"N/A"``.
	    """
	    global output_dict, model, tokenizer

	    print(zero.device) # This will print 'cuda:0' inside the @spaces.GPU decorated function
	    torch.cuda.empty_cache()

	    if use_rag:
	        retrieved_docs = retrieve(prompt)
	        context = " ".join([doc for doc, _ in retrieved_docs])
	        doc_ids = [doc_id for _, doc_id in retrieved_docs]
	        full_prompt = f"Context: {context}\nQuestion: {prompt}\nAnswer:"
	    else:
	        full_prompt = prompt
	        doc_ids = None

	    # NOTE(review): some chat templates reject a "system" role; verify that
	    # every entry in `modelnames` accepts this message layout.
	    messages = [
	        {"role": "system", "content": "You are a helpful assistant."},
	        {"role": "user", "content": full_prompt}
	    ]
	    text = tokenizer.apply_chat_template(
	        messages,
	        tokenize=False,
	        add_generation_prompt=True
	    )
	    model_inputs = tokenizer([text], return_tensors="pt").to(zero.device)
	    start_time = time.time()
	    total_tokens = 0

	    print(output_dict)
	    output_key = f"output_{len(output_dict) + 1}"
	    print(output_key)
	    output_dict[output_key] = {
	        "input_prompt": prompt,
	        "full_prompt": full_prompt,
	        "use_rag": use_rag,
	        "generated_text": "",
	        "tokens_per_second": 0,
	        "ram_usage": "",
	        "doc_ids": doc_ids if doc_ids else "N/A",
	        "model_details": get_model_details(model),
	        "tokenizer_details": get_tokenizer_details(tokenizer),
	        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_time))
	    }
	    print(output_dict)
	    record = output_dict[output_key]

	    if stream:
	        # skip_prompt=True keeps the echoed prompt out of the streamed text,
	        # matching the non-streaming branch which strips the input ids.
	        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
	        generation_kwargs = dict(
	            model_inputs,
	            streamer=streamer,
	            max_new_tokens=512,
	            temperature=0.7,
	        )
	        # generate() blocks until done, so it runs in a worker thread while
	        # this generator consumes the streamer.
	        thread = Thread(target=model.generate, kwargs=generation_kwargs)
	        thread.start()
	        for new_text in streamer:
	            record["generated_text"] += new_text
	            # Counts streamed chunks, which only approximates the token count.
	            total_tokens += 1
	            current_time = time.time()
	            tokens_per_second = total_tokens / (current_time - start_time)
	            record["tokens_per_second"] = f"{tokens_per_second:.2f}"
	            record["ram_usage"] = get_ram_usage()
	            yield (record["generated_text"],
	                   record["tokens_per_second"],
	                   record["ram_usage"],
	                   record["doc_ids"])
	        # The streamer is exhausted, so generation has finished; reap the worker.
	        thread.join()
	    else:
	        # Pass the full encoding so the attention mask reaches generate()
	        # together with the input ids.
	        generated_ids = model.generate(
	            **model_inputs,
	            max_new_tokens=512
	        )
	        # Keep only the newly generated tokens of each sequence.
	        generated_ids = [
	            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
	        ]
	        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
	        total_tokens = len(generated_ids[0])
	        end_time = time.time()
	        tokens_per_second = total_tokens / (end_time - start_time)

	        record["generated_text"] = response
	        record["tokens_per_second"] = f"{tokens_per_second:.2f}"
	        record["ram_usage"] = get_ram_usage()
	        print(output_dict)

	        yield (record["generated_text"],
	               record["tokens_per_second"],
	               record["ram_usage"],
	               record["doc_ids"])

	@spaces.GPU
	def generate_image(prompt):
	    """Render ``prompt`` with the loaded FLUX pipeline and save the result as a PNG.

	    The random generator is seeded with 0 on every call. The file name is
	    built from the current timestamp.

	    Returns:
	        A tuple ``(image_path, ram_usage, image_path)``; the path is
	        intentionally returned twice.
	    """
	    print(dir(flux_pipe))

	    # Generate image using FLUX
	    seeded_generator = torch.Generator("cpu").manual_seed(0)
	    pipeline_output = flux_pipe(
	        prompt,
	        guidance_scale=0.0,
	        num_inference_steps=4,
	        max_sequence_length=256,
	        generator=seeded_generator,
	    )
	    picture = pipeline_output.images[0]

	    image_path = f"flux_output_{time.time()}.png"
	    print(image_path)
	    picture.save(image_path)

	    return image_path, get_ram_usage(), image_path

	def get_output_details(output_key):
	    """Return the stored record for ``output_key``, or a not-found message string."""
	    if output_key not in output_dict:
	        return f"No output found for key: {output_key}"
	    return output_dict[output_key]

	def switch_model(choice):
	    """Make ``choice`` the active model name and load it.

	    Returns:
	        A tuple of the message returned by load_model() and a
	        ``"Current model: ..."`` label.
	    """
	    global modelname
	    modelname = choice
	    status = load_model(modelname)
	    label = f"Current model: {modelname}"
	    return status, label

	def model_change_handler(choice):
	    """Switch to ``choice`` and return ``(message, current_model, message)``.

	    The load message is deliberately returned twice so it can feed two outputs.
	    """
	    outcome = switch_model(choice)
	    load_message, current_label = outcome
	    return load_message, current_label, load_message

	def format_output_dict():
	    """Render every record in ``output_dict`` as text, print it and return it.

	    Each record becomes a ``Key: <key>`` line followed by the value as
	    indented JSON and a blank line.
	    """
	    sections = [
	        f"Key: {key}\n" + json.dumps(value, indent=2) + "\n\n"
	        for key, value in output_dict.items()
	    ]
	    formatted_output = "".join(sections)
	    print(formatted_output)
	    return formatted_output