Spaces:

hoduyquocbao
/

LLAMA3.2-GRop

Running on Zero

App Files Files Community

LLAMA3.2-GRop / app.py

hoduyquocbao

new version update

5839892 about 21 hours ago

raw

history blame

8.57 kB

	# Import các thư viện cần thiết
	import os
	import json
	from threading import Thread
	from typing import Iterator, List, Tuple

	# Import thư viện Gradio và các mô-đun khác
	import gradio as gr
	import spaces
	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

	# Markdown blurb describing the model; it is rendered at the top of the page
	# via gr.Markdown(DESCRIPTION) when the Blocks layout is built.
	# NOTE(review): the text links to `meta-llama/Llama-3.2-3B-Instruct`, while
	# `model_id` further down loads `nltpt/Llama-3.2-3B-Instruct` -- confirm
	# which repository is intended.
	DESCRIPTION = """\
	# Llama 3.2 3B Instruct với Gọi Công Cụ Tiên Tiến

	Llama 3.2 3B là phiên bản mới nhất của LLM từ Meta, được tinh chỉnh để theo dõi hướng dẫn và hỗ trợ gọi công cụ.
	Đây là bản demo của [`meta-llama/Llama-3.2-3B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct).
	Để biết thêm chi tiết, hãy xem [bài đăng của chúng tôi](https://huggingface.co/blog/llama32).
	"""

	# Generation limits used by the UI sliders and by prompt truncation.
	# Upper bound offered by the "max new tokens" slider.
	MAX_MAX_NEW_TOKENS = 2048
	# Initial value of the "max new tokens" slider.
	DEFAULT_MAX_NEW_TOKENS = 1024
	# Longest prompt (in tokens) fed to the model; overridable through the
	# MAX_INPUT_TOKEN_LENGTH environment variable.
	MAX_INPUT_TOKEN_LENGTH = int(os.environ.get("MAX_INPUT_TOKEN_LENGTH", "4096"))

	# Prefer the first CUDA device when one is available, otherwise use the CPU.
	if torch.cuda.is_available():
	    device = torch.device("cuda:0")
	else:
	    device = torch.device("cpu")

	# Repository id shared by the tokenizer and the model weights.
	model_id = "nltpt/Llama-3.2-3B-Instruct"
	tokenizer = AutoTokenizer.from_pretrained(model_id)
	# Load the weights in bfloat16 and let the library place them on devices.
	model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")
	# Inference only: switch the model to evaluation mode.
	model.eval()

	# Functions the model is allowed to call.
	def get_weather(city: str, metric: str = "celsius") -> str:
	    """Return the temperature for *city* as short display text.

	    The data is a static demo table; swap the lookup for a real weather API
	    when one is available.

	    Args:
	        city: City name; it must match a table entry exactly.
	        metric: ``"celsius"`` (default) or ``"fahrenheit"``/``"f"``, compared
	            case-insensitively. Any other value falls back to Celsius, which
	            is what this function reported before it honoured the argument.

	    Returns:
	        Text such as ``"25 C"`` or ``"77 F"``, or ``"Không có dữ liệu"`` when
	        the city is not in the table.
	    """
	    # Whole degrees Celsius per city.
	    celsius_by_city = {
	        "San Francisco": 25,
	        "Seattle": 18,
	    }
	    celsius = celsius_by_city.get(city)
	    if celsius is None:
	        return "Không có dữ liệu"
	    if str(metric).strip().lower() in ("fahrenheit", "f"):
	        return f"{round(celsius * 9 / 5 + 32)} F"
	    return f"{celsius} C"

	def get_user_info(user_id: int, special: str = "none") -> str:
	    """Return a one-line summary of a user.

	    The lookup is a static demo table; replace it with a database query when
	    one is available.

	    Args:
	        user_id: Identifier to look up.
	        special: Special request echoed back for a known user. It is ignored
	            (reported as ``"none"``) when the user is unknown.

	    Returns:
	        A string with the user's name and special request.
	    """
	    known_names = {7890: "Nguyễn Văn A"}
	    if user_id in known_names:
	        name = known_names[user_id]
	        request = special
	    else:
	        name = "Không xác định"
	        request = "none"
	    return f"Tên người dùng: {name}, Yêu cầu đặc biệt: {request}"

	# Registry of callable tools, keyed by the name the model uses to call them.
	AVAILABLE_FUNCTIONS = dict(
	    get_weather=get_weather,
	    get_user_info=get_user_info,
	)

	def _parse_function_calls(text: str):
	    """Find and parse the first complete tool-call list in *text*.

	    The expected form is a bracketed list of Python-style calls, for example
	    ``[get_weather(city='Seattle', metric='celsius')]``. The list must open
	    with "[" directly followed by a name registered in ``AVAILABLE_FUNCTIONS``.

	    Returns:
	        ``None`` when no such list is present or it does not parse yet (for
	        example because it is still being streamed); otherwise
	        ``(start, end, calls)`` where ``text[start:end]`` is the list and
	        ``calls`` holds one ``(name, args, kwargs)`` tuple per call.

	    Raises:
	        ValueError: A complete list was found, but it is not made of plain
	            calls whose arguments are all literals.
	    """
	    import ast  # function-scope import keeps this block self-contained

	    positions = [text.find("[" + name) for name in AVAILABLE_FUNCTIONS]
	    positions = [pos for pos in positions if pos >= 0]
	    if not positions:
	        return None
	    start = min(positions)

	    # Try each "]" in turn: one that sits inside an argument (or a list that
	    # is not finished yet) does not parse, so move on to the next candidate.
	    closing = text.find("]", start)
	    while closing != -1:
	        try:
	            tree = ast.parse(text[start:closing + 1], mode="eval")
	        except (SyntaxError, ValueError):
	            closing = text.find("]", closing + 1)
	        else:
	            break
	    else:
	        return None

	    node = tree.body
	    if not isinstance(node, ast.List):
	        raise ValueError("cú pháp gọi hàm không hợp lệ")

	    calls = []
	    for element in node.elts:
	        if not (isinstance(element, ast.Call) and isinstance(element.func, ast.Name)):
	            raise ValueError("cú pháp gọi hàm không hợp lệ")
	        if any(keyword.arg is None for keyword in element.keywords):
	            raise ValueError("cú pháp gọi hàm không hợp lệ")
	        # literal_eval only accepts literals, so nothing from the model's
	        # output is ever executed as code.
	        args = [ast.literal_eval(arg) for arg in element.args]
	        kwargs = {keyword.arg: ast.literal_eval(keyword.value) for keyword in element.keywords}
	        calls.append((element.func.id, args, kwargs))
	    return start, closing + 1, calls


	def _run_function_calls(calls) -> str:
	    """Run each parsed call against ``AVAILABLE_FUNCTIONS`` and join the results with spaces."""
	    results = []
	    for name, args, kwargs in calls:
	        if name not in AVAILABLE_FUNCTIONS:
	            raise ValueError(f"hàm không được hỗ trợ: {name}")
	        results.append(str(AVAILABLE_FUNCTIONS[name](*args, **kwargs)))
	    return " ".join(results)


	# Request a GPU for each call with `duration=10`.
	# NOTE(review): the previous comment claimed a 90-second budget while the code
	# passes 10 -- confirm which value is intended.
	@spaces.GPU(duration=10)
	def generate(
	    message: str,
	    chat_history: List[Tuple[str, str]],
	    max_new_tokens: int = 1024,
	    temperature: float = 0.6,
	    top_p: float = 0.9,
	    top_k: int = 50,
	    repetition_penalty: float = 1.2,
	) -> Iterator[str]:
	    """Stream the assistant's reply to *message*, resolving tool calls.

	    The conversation is rebuilt from *chat_history*, rendered with the
	    tokenizer's chat template, cut to the last ``MAX_INPUT_TOKEN_LENGTH``
	    tokens if needed, and passed to ``model.generate`` on a background thread.

	    When the reply contains a complete tool-call list such as
	    ``[get_weather(city='Seattle')]``, the list is replaced by the space-joined
	    results of the called functions. A list that is still being streamed is
	    shown verbatim until it is complete.

	    Args:
	        message: The new user message.
	        chat_history: Earlier ``(user, assistant)`` message pairs.
	        max_new_tokens: Forwarded to ``model.generate``.
	        temperature: Forwarded to ``model.generate``.
	        top_p: Forwarded to ``model.generate``.
	        top_k: Forwarded to ``model.generate``.
	        repetition_penalty: Forwarded to ``model.generate``.

	    Yields:
	        The reply accumulated so far, or an error message if a tool call
	        could not be parsed or executed.
	    """
	    conversation = []
	    for user, assistant in chat_history:
	        conversation.append({"role": "user", "content": user})
	        conversation.append({"role": "assistant", "content": assistant})
	    conversation.append({"role": "user", "content": message})

	    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")

	    # Keep only the most recent tokens when the prompt is too long.
	    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
	        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
	        gr.Warning(f"Đã cắt bớt đầu vào từ cuộc hội thoại vì vượt quá {MAX_INPUT_TOKEN_LENGTH} tokens.")

	    input_ids = input_ids.to(model.device)

	    streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)

	    generate_kwargs = dict(
	        input_ids=input_ids,
	        streamer=streamer,
	        max_new_tokens=max_new_tokens,
	        do_sample=True,
	        top_p=top_p,
	        top_k=top_k,
	        temperature=temperature,
	        num_beams=1,
	        repetition_penalty=repetition_penalty,
	    )

	    # Generation runs on its own thread so this generator can consume the
	    # streamer while tokens are still being produced.
	    t = Thread(target=model.generate, kwargs=generate_kwargs)
	    t.start()

	    pieces = []
	    # Call-list text -> joined results, so each list is executed only once
	    # even though the accumulated reply is re-scanned on every chunk.
	    call_results = {}
	    for text in streamer:
	        pieces.append(text)
	        assistant_response = "".join(pieces)
	        try:
	            found = _parse_function_calls(assistant_response)
	            if found is not None:
	                start, end, calls = found
	                call_text = assistant_response[start:end]
	                if call_text not in call_results:
	                    call_results[call_text] = _run_function_calls(calls)
	                assistant_response = (
	                    assistant_response[:start] + call_results[call_text] + assistant_response[end:]
	                )
	        except Exception as e:
	            # Boundary handler: a failing tool call must not abort the chat
	            # stream, so the failure is reported to the user instead.
	            assistant_response = f"Đã xảy ra lỗi khi xử lý cuộc gọi chức năng: {str(e)}"
	        yield assistant_response

	# Sampling controls for the chat UI. Their order matches the extra
	# parameters of `generate` (max_new_tokens, temperature, top_p, top_k,
	# repetition_penalty).
	_max_new_tokens_slider = gr.Slider(
	    label="Số token mới tối đa",
	    minimum=1,
	    maximum=MAX_MAX_NEW_TOKENS,
	    step=1,
	    value=DEFAULT_MAX_NEW_TOKENS,
	)
	_temperature_slider = gr.Slider(
	    label="Nhiệt độ (Temperature)",
	    minimum=0.1,
	    maximum=4.0,
	    step=0.1,
	    value=0.6,
	)
	_top_p_slider = gr.Slider(
	    label="Top-p (nucleus sampling)",
	    minimum=0.05,
	    maximum=1.0,
	    step=0.05,
	    value=0.9,
	)
	_top_k_slider = gr.Slider(
	    label="Top-k",
	    minimum=1,
	    maximum=1000,
	    step=1,
	    value=50,
	)
	_repetition_penalty_slider = gr.Slider(
	    label="Hình phạt lặp lại (Repetition penalty)",
	    minimum=1.0,
	    maximum=2.0,
	    step=0.05,
	    value=1.2,
	)

	# Example prompts offered in the UI; each example is a one-element row.
	_example_prompts = [
	    ["Xin chào! Bạn có khỏe không?"],
	    ["Bạn có thể giải thích ngắn gọn về ngôn ngữ lập trình Python không?"],
	    ["Giải thích cốt truyện của Cô bé Lọ Lem trong một câu."],
	    ["Mất bao nhiêu giờ để một người ăn một chiếc trực thăng?"],
	    ["Viết một bài báo 100 từ về 'Lợi ích của mã nguồn mở trong nghiên cứu AI'"],
	    ["Thời tiết ở San Francisco thế nào"],
	]

	# Chat interface backed by `generate`; no stop button, examples not cached.
	chat_interface = gr.ChatInterface(
	    fn=generate,
	    additional_inputs=[
	        _max_new_tokens_slider,
	        _temperature_slider,
	        _top_p_slider,
	        _top_k_slider,
	        _repetition_penalty_slider,
	    ],
	    stop_btn=None,
	    examples=_example_prompts,
	    cache_examples=False,
	)

	# Page layout: description, "duplicate this Space" button, then the chat UI.
	with gr.Blocks(fill_height=True, css="style.css") as demo:
	    gr.Markdown(DESCRIPTION)
	    gr.DuplicateButton(elem_id="duplicate-button", value="Tạo bản sao cho sử dụng cá nhân")
	    chat_interface.render()

	# Start the app only when this file is executed directly; the request queue
	# is capped at 20 entries.
	if __name__ == "__main__":
	    queued_demo = demo.queue(max_size=20)
	    queued_demo.launch()