Spaces:

BridgeEight
/

internlm-20B-chat-w4-turbomind

Runtime error

App Files Files Community

internlm-20B-chat-w4-turbomind / app.py

BridgeEight

Update app.py

d0d202a verified 11 months ago

raw

history blame

5.23 kB

	from lmdeploy.serve.gradio.turbomind_coupled import *
	from lmdeploy.messages import TurbomindEngineConfig
	from lmdeploy import ChatTemplateConfig

	# Chat template: start from the 'internlm2-chat-7b' template and blank out
	# its system marker, end-of-system marker and meta instruction.
	chat_template = ChatTemplateConfig(
	    model_name='internlm2-chat-7b',
	    system='',
	    eosys='',
	    meta_instruction='',
	)

	# NOTE: `model_format='awq'` is not passed; it was commented out of this call.
	backend_config = TurbomindEngineConfig(
	    model_name='internlm2-chat-7b',
	    max_batch_size=1,
	    cache_max_entry_count=0.05,
	)

	model_path = 'internlm/internlm2-math-7b'

	# Store the engine on InterFace; the Gradio callbacks in this file reach it
	# through InterFace.async_engine.
	InterFace.async_engine = AsyncEngine(
	    model_path=model_path,
	    backend='turbomind',
	    backend_config=backend_config,
	    chat_template_config=chat_template,
	    tp=1,
	)

	async def reset_local_func(instruction_txtbox: gr.Textbox,
	                           state_chatbot: Sequence, session_id: int):
	    """Drop the chat history and switch to a freshly allocated session id.

	    Args:
	        instruction_txtbox (str): user's prompt (not read by this function)
	        state_chatbot (Sequence): the chatting history (replaced by an
	            empty list)
	        session_id (int): the session id (replaced by a new one)

	    Returns:
	        tuple: the empty history twice (the same list object), a Textbox
	            update that sets the value to '', and the new session id.
	    """
	    # Bump the shared counter under the lock and take the new value as
	    # this conversation's id.
	    with InterFace.lock:
	        InterFace.global_session_id += 1
	        fresh_id = InterFace.global_session_id

	    cleared = []
	    return (cleared, cleared, gr.Textbox.update(value=''), fresh_id)

	async def cancel_local_func(state_chatbot: Sequence, cancel_btn: gr.Button,
	                            reset_btn: gr.Button, session_id: int):
	    """Stop the running session and, except on pytorch, replay the history.

	    Args:
	        state_chatbot (Sequence): the chatting history
	        cancel_btn (gr.Button): the cancel button
	        reset_btn (gr.Button): the reset button
	        session_id (int): the session id

	    Yields:
	        tuple: (chat history, cancel button update, reset button update,
	            session id). The first yield disables both buttons; the last
	            one re-enables the reset button.
	    """
	    # Disable both buttons before asking the engine to stop.
	    yield (state_chatbot, disable_btn, disable_btn, session_id)
	    InterFace.async_engine.stop_session(session_id)

	    # pytorch backend does not support resume chat history now
	    if InterFace.async_engine.backend == 'pytorch':
	        yield (state_chatbot, disable_btn, enable_btn, session_id)
	        return

	    # Other backends: continue under a newly allocated session id.
	    with InterFace.lock:
	        InterFace.global_session_id += 1
	        session_id = InterFace.global_session_id

	    # Flatten the (question, answer) pairs into role-tagged messages; a
	    # turn whose answer is None contributes only the user message.
	    history = []
	    for turn in state_chatbot:
	        history.append({'role': 'user', 'content': turn[0]})
	        if turn[1] is None:
	            continue
	        history.append({'role': 'assistant', 'content': turn[1]})

	    # NOTE(review): max_new_tokens=0 presumably feeds the history into the
	    # new session without producing new text - confirm against lmdeploy.
	    replay_config = GenerationConfig(max_new_tokens=0)
	    replay = InterFace.async_engine.generate(
	        history,
	        session_id,
	        gen_config=replay_config,
	        stream_response=True,
	        sequence_start=True,
	        sequence_end=False,
	    )
	    # Drain the stream; its outputs are not used.
	    async for _ in replay:
	        pass

	    yield (state_chatbot, disable_btn, enable_btn, session_id)

	with gr.Blocks(css=CSS, theme=THEME) as demo:
	    # State holders for the chat history and the engine session id.
	    state_chatbot = gr.State([])
	    state_session_id = gr.State(0)

	    with gr.Column(elem_id='container'):
	        gr.Markdown('## LMDeploy Playground')

	        chatbot = gr.Chatbot(
	            label=InterFace.async_engine.engine.model_name,
	            elem_id='chatbot',
	        )
	        instruction_txtbox = gr.Textbox(
	            label='Instruction',
	            placeholder='Please input the instruction',
	        )

	        with gr.Row():
	            cancel_btn = gr.Button(value='Cancel', interactive=False)
	            reset_btn = gr.Button(value='Reset')

	        # Generation parameters exposed to the user.
	        with gr.Row():
	            request_output_len = gr.Slider(
	                minimum=1,
	                maximum=2048,
	                value=1024,
	                step=1,
	                label='Maximum new tokens',
	            )
	            top_p = gr.Slider(
	                minimum=0.01,
	                maximum=1,
	                value=1.0,
	                step=0.01,
	                label='Top_p',
	            )
	            temperature = gr.Slider(
	                minimum=0.01,
	                maximum=1.5,
	                value=0.01,
	                step=0.01,
	                label='Temperature',
	            )

	    # Submitting the textbox starts the chat stream; the event handle is
	    # kept so that Cancel and Reset can cancel it.
	    send_event = instruction_txtbox.submit(
	        fn=chat_stream_local,
	        inputs=[
	            instruction_txtbox, state_chatbot, cancel_btn, reset_btn,
	            state_session_id, top_p, temperature, request_output_len,
	        ],
	        outputs=[state_chatbot, chatbot, cancel_btn, reset_btn],
	    )
	    # A second submit listener blanks the textbox.
	    instruction_txtbox.submit(
	        fn=lambda: gr.Textbox.update(value=''),
	        inputs=[],
	        outputs=[instruction_txtbox],
	    )

	    cancel_btn.click(
	        fn=cancel_local_func,
	        inputs=[state_chatbot, cancel_btn, reset_btn, state_session_id],
	        outputs=[state_chatbot, cancel_btn, reset_btn, state_session_id],
	        cancels=[send_event],
	    )

	    reset_btn.click(
	        fn=reset_local_func,
	        inputs=[instruction_txtbox, state_chatbot, state_session_id],
	        outputs=[
	            state_chatbot, chatbot, instruction_txtbox, state_session_id,
	        ],
	        cancels=[send_event],
	    )

	    def init():
	        """Allocate and return the next global session id."""
	        with InterFace.lock:
	            InterFace.global_session_id += 1
	            return InterFace.global_session_id

	    # Run init on page load and store its result as the session id.
	    demo.load(init, inputs=None, outputs=[state_session_id])

	# Queue requests (at most 100 waiting) and start the app.
	demo.queue(
	    concurrency_count=InterFace.async_engine.instance_num,
	    max_size=100,
	).launch()