Spaces:

gregH
/

gradient_cuff

Running on Zero

App Files Files Community

gradient_cuff / app.py

gregH

Update app.py

04094cf verified 9 months ago

raw

history blame

2.77 kB

	import gradio as gr
	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
	import time
	import numpy as np
	from torch.nn import functional as F
	import os
	from threading import Thread

	# ---------------------------------------------------------------------
	# One-time setup, executed at import: load tokenizer and model, and
	# derive the chat-template text that surrounds a user message.
	# ---------------------------------------------------------------------
	print("Starting to load the model to memory")
	tok = AutoTokenizer.from_pretrained("stabilityai/stablelm-2-zephyr-1_6b", trust_remote_code=True)
	tok.padding_side = "left"
	# Reuse the EOS id as the pad id. The tokenizer object is bound to `tok`;
	# there is no `tokenizer` name in this module.
	tok.pad_token_id = tok.eos_token_id

	# Render the chat template around a unique placeholder, then split on the
	# placeholder to recover the literal text the template puts before
	# (`prefix`) and after (`suffix`) a single user message.
	slot = "<slot_for_user_input_design_by_xm>"
	chat = [{"role": "user", "content": slot}]
	sample_input = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
	# NOTE(review): `find` returns -1 if the template does not reproduce the
	# placeholder verbatim; this is assumed never to happen - confirm.
	input_start_id = sample_input.find(slot)
	prefix = sample_input[:input_start_id]
	suffix = sample_input[input_start_id + len(slot):]
	# Debug output: token ids of the template prefix.
	print(tok.encode(prefix, return_tensors="pt")[0])

	# Use CUDA with fp16 weights when available; otherwise CPU with fp32.
	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	m = AutoModelForCausalLM.from_pretrained(
	    "stabilityai/stablelm-2-zephyr-1_6b",
	    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
	    trust_remote_code=True,
	)
	# Keep a handle on the input-embedding layer and disable gradient
	# tracking on its weights.
	# NOTE(review): `embedding_func`, `prefix` and `suffix` are not used in the
	# visible code; presumably reserved for the planned defense - confirm.
	embedding_func = m.get_input_embeddings()
	embedding_func.weight.requires_grad = False
	m = m.to(device)
	print("Sucessfully loaded the model to the memory")

	start_message = ""

	def user(message, history):
	    """Append the user's message to the conversation history.

	    Args:
	        message: Text of the new user turn.
	        history: List of ``[user_text, assistant_text]`` pairs, or ``None``
	            for an empty conversation. It is not modified.

	    Returns:
	        A ``("", new_history)`` tuple, where ``new_history`` is a new list
	        equal to ``history`` followed by ``[message, ""]`` (the empty
	        string is the placeholder for the reply).
	    """
	    previous_turns = [] if history is None else history
	    # Concatenation builds a new list, so the caller's history is untouched.
	    return "", previous_turns + [[message, ""]]

	# TODO: implement `defense(message)` to determine whether the query is
	# malicious.

	def chat(message, history):
	    """Stream the model's reply to ``message`` given the prior ``history``.

	    Args:
	        message: Text of the new user turn.
	        history: Iterable of prior turns. Each item is read as ``item[0]``
	            (user text) and ``item[1]`` (assistant text); the assistant
	            entry is skipped when ``item[1]`` is ``None``.

	    Yields:
	        The reply text accumulated so far, once for every chunk produced
	        by the streamer.
	    """
	    # Named `conversation` so the local does not shadow this function.
	    conversation = []
	    for item in history:
	        conversation.append({"role": "user", "content": item[0]})
	        if item[1] is not None:
	            conversation.append({"role": "assistant", "content": item[1]})
	    conversation.append({"role": "user", "content": message})

	    # Render the conversation with the chat template, then tokenize the
	    # resulting prompt string and move the tensors to the model's device.
	    prompt = tok.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
	    model_inputs = tok([prompt], return_tensors="pt").to(device)

	    streamer = TextIteratorStreamer(
	        tok, timeout=10., skip_prompt=True, skip_special_tokens=True)
	    generate_kwargs = dict(
	        model_inputs,
	        streamer=streamer,
	        max_new_tokens=1024,
	        do_sample=True,
	        top_p=0.90,
	        temperature=0.6,
	        num_beams=1
	    )
	    # Generation runs in a worker thread so this generator can consume the
	    # streamer while tokens are still being produced.
	    t = Thread(target=m.generate, kwargs=generate_kwargs)
	    t.start()

	    partial_text = ""
	    for new_text in streamer:
	        print(new_text)
	        partial_text += new_text
	        # Yield the reply accumulated so far.
	        yield partial_text

	    # The streamer is exhausted, so generation should be finishing; wait
	    # for the worker instead of leaving it un-joined.
	    t.join()

	# Build the chat UI around the streaming `chat` generator and serve it.
	# NOTE(review): the title names Vicuna-7B-V1.5, while the visible code
	# loads "stabilityai/stablelm-2-zephyr-1_6b" - confirm which is intended.
	demo = gr.ChatInterface(
	    fn=chat,
	    examples=["hello", "hola", "merhaba"],
	    title="Gradient Cuff Vicuna-7B-V1.5",
	)
	demo.launch()