""" | |
FileName: mcts_Gumbel_Alphazero.py | |
Author: Jiaxin Li | |
Create Date: 2023/11/21 | |
Description: The implement of Gumbel MCST | |
Edit History: | |
Debug: the dim of output: probs | |
""" | |
import numpy as np | |
import copy | |
import time | |
from .config.options import * | |
import sys | |
from .config.utils import * | |


def softmax(x):
    probs = np.exp(x - np.max(x))
    probs /= np.sum(probs)
    return probs


def _sigma_mano(y, Nb):
    return (50 + Nb) * 1.0 * y
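# The transform above is a monotonically increasing rescaling of (completed)
# Q-values before they are added to the prior logits, roughly
# sigma(q) = (c_visit + max_b N(b)) * c_scale * q with c_visit = 50 and
# c_scale = 1.0 (the constant names are an interpretation, not from this file).
# A quick numeric sketch with made-up values:
#     max_b N(b) = 30, q = 0.2  ->  _sigma_mano(0.2, 30) = (50 + 30) * 0.2 = 16.0
# so as visit counts grow, the Q term increasingly dominates the raw prior.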


class TreeNode(object):
    """A node in the MCTS tree.

    Each node keeps track of its own value Q, prior probability P, and
    its visit-count-adjusted prior score u.
    """

    def __init__(self, parent, prior_p):
        self._parent = parent
        self._children = {}  # a map from action to TreeNode
        self._n_visits = 0
        self._Q = 0
        self._u = 0
        self._v = 0
        self._p = prior_p

    def expand(self, action_priors):
        """Expand tree by creating new children.

        action_priors: a list of (action, prior probability) tuples according
            to the policy function.
        """
        for action, prob in action_priors:
            if action not in self._children:
                self._children[action] = TreeNode(self, prob)

    def select(self, v_pi):
        """Select the child action that maximizes
            pi'(a) - N(a) / (1 + sum_b N(b)),
        where pi' = softmax(logits + sigma(completed Q)).

        Return: a tuple of (action, next_node).
        """
        children = list(self._children.items())
        max_N_b = np.max(np.array([node._n_visits for _, node in children]))
        # Improved policy pi'(a) built from the prior and the completed Q-values.
        pi_ = softmax(np.array([node.get_pi(v_pi, max_N_b) for _, node in children])).reshape(
            len(children), -1)
        # Normalized visit counts N(a) / (1 + sum_b N(b)).
        N_a = np.array([node._n_visits / (1 + self._n_visits) for _, node in children]).reshape(
            pi_.shape[0], -1)
        max_index = np.argmax(pi_ - N_a)
        return children[max_index]
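    # A small numeric sketch of the non-root selection rule above (made-up
    # numbers): with two children, pi' = [0.7, 0.3], child visits N = [3, 0]
    # and 1 + sum_b N(b) = 5,
    #     pi'(a) - N(a) / 5 = [0.7 - 0.6, 0.3 - 0.0] = [0.10, 0.30]
    # so the second child is visited next: selection tracks the improved policy
    # instead of repeatedly descending into the already most-visited child.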

    def update(self, leaf_value):
        """Update node values from leaf evaluation.

        leaf_value: the value of subtree evaluation from the current player's
            perspective.
        """
        # Count visit.
        self._n_visits += 1
        # Update Q, a running average of values for all visits.
        self._Q += 1.0 * (leaf_value - self._Q) / self._n_visits

    def update_recursive(self, leaf_value):
        """Like a call to update(), but applied recursively for all ancestors."""
        # If it is not root, this node's parent should be updated first.
        if self._parent:
            self._parent.update_recursive(-leaf_value)
        self.update(leaf_value)

    def get_pi(self, v_pi, max_N_b):
        """Return this child's completed score: the prior logit plus sigma(Q),
        where an unvisited child falls back to the parent's value estimate v_pi
        as its completed Q."""
        if self._n_visits == 0:
            Q_completed = v_pi
        else:
            Q_completed = self._Q
        return self._p + _sigma_mano(Q_completed, max_N_b)

    def get_value(self, c_puct):
        """Calculate and return the value for this node.

        It is a combination of leaf evaluations Q, and this node's prior
        adjusted for its visit count, u.

        c_puct: a number in (0, inf) controlling the relative impact of
            value Q, and prior probability P, on this node's score.
        """
        self._u = (c_puct * self._p *
                   np.sqrt(self._parent._n_visits) / (1 + self._n_visits))
        return self._Q + self._u

    def is_leaf(self):
        """Check if leaf node (i.e. no nodes below this have been expanded)."""
        return self._children == {}

    def is_root(self):
        return self._parent is None


class Gumbel_MCTS(object):
    """An implementation of Monte Carlo Tree Search with Gumbel root sampling."""

    def __init__(self, policy_value_fn, c_puct=5, n_playout=10000):
        """
        policy_value_fn: a function that takes in a board state and outputs
            a list of (action, probability) tuples and also a score in [-1, 1]
            (i.e. the expected value of the end game score from the current
            player's perspective) for the current player.
        c_puct: a number in (0, inf) that controls how quickly exploration
            converges to the maximum-value policy. A higher value means
            relying on the prior more.
        """
        self._root = TreeNode(None, 1.0)
        self._policy = policy_value_fn
        self._c_puct = c_puct
        self._n_playout = n_playout

    def Gumbel_playout(self, child_node, child_state):
        """Run a single playout from a child of the root down to a leaf, get a
        value at the leaf, and propagate it back through the leaf's parents.

        The state is modified in-place, so a copy must be provided.
        Selection below the root uses the non-root (deterministic) rule.
        """
        node = child_node
        state = child_state
        while True:
            if node.is_leaf():
                break
            # Greedily select the next move.
            action, node = node.select(node._v)
            state.do_move(action)
        # Evaluate the leaf using a network which outputs a list of
        # (action, probability) tuples p and also a score v in [-1, 1]
        # for the current player.
        action_probs, leaf_value = self._policy(state)
        leaf_value = leaf_value.detach().numpy()
        node._v = leaf_value
        # Check for end of game.
        end, winner = state.game_end()
        if not end:
            node.expand(action_probs)
        else:
            # For an end state, return the "true" leaf value.
            if winner == -1:  # tie
                leaf_value = 0.0
            else:
                leaf_value = (
                    1.0 if winner == state.get_current_player() else -1.0
                )
        # Update value and visit count of nodes in this traversal.
        node.update_recursive(-leaf_value)

    def top_k(self, x, k):
        """Return the indices of the k largest entries of x (in no particular order)."""
        return np.argpartition(x, -k)[..., -k:]

    def sample_k(self, logits, k):
        """Sample k distinct indices without replacement via the Gumbel-top-k
        trick: perturb the logits with Gumbel noise g and keep the top k."""
        u = np.random.uniform(size=np.shape(logits))
        z = -np.log(-np.log(u))
        return self.top_k(logits + z, k), z
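    # A minimal sketch of the Gumbel-top-k trick used by sample_k (illustrative
    # only): adding independent Gumbel(0, 1) noise g = -log(-log(U)) to each
    # logit and keeping the k largest perturbed values draws k distinct indices
    # without replacement, with inclusion driven by softmax(logits).
    #
    #     logits = np.log(np.array([0.5, 0.3, 0.2]))
    #     g = -np.log(-np.log(np.random.uniform(size=3)))
    #     top2 = np.argpartition(logits + g, -2)[-2:]   # two distinct indices
    #
    # Note that get_move_probs below passes raw prior probabilities (not
    # log-probabilities) as `logits`, which changes the effective sampling
    # distribution.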

    def get_move_probs(self, state, temp=1e-3, m_action=16):
        """Run all playouts sequentially and return the available actions and
        their corresponding probabilities.

        state: the current game state
        temp: temperature parameter in (0, 1] that controls the level of exploration
        """
        # TODO: this part still needs revision; the logits are tentatively taken to be p.
        start_time = time.time()
        # Expand the root node.
        act_probs, leaf_value = self._policy(state)
        act_probs = list(act_probs)
        leaf_value = leaf_value.detach().numpy()
        probs = [prob for act, prob in act_probs]
        self._root.expand(act_probs)
        n = self._n_playout
        m = min(m_action, len(probs) // 2)
        # Sample the top-m actions without replacement from the Gumbel
        # distribution, i.e. select according to logits + g.
        A_topm, g = self.sample_k(probs, m)
        # Apply each sampled action to a copy of the state and keep the
        # resulting child states in a list.
        root_childs = list(self._root._children.items())
        child_state_m = []
        for i in range(m):
            state_copy = copy.deepcopy(state)
            action, node = root_childs[A_topm[i]]
            state_copy.do_move(action)
            child_state_m.append(state_copy)
        print(probs)
        print("depend on:", np.array(probs)[A_topm])
        print(f"A_topm_{m}", A_topm)
        print("m ", m)
        if m > 1:
            # Number of simulations per selected action in each halving round.
            N = int(n / (np.log(m) * m))
        else:
            N = n
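        # Rough shape of the resulting Sequential Halving schedule (for
        # illustration only; Sequential Halving is usually described with a
        # log2(m)-round budget rather than the natural log used above).
        # With n = 200 total simulations and m = 16 sampled actions:
        #     round 1: 16 actions, N simulations each
        #     round 2:  8 actions, 2N simulations each
        #     round 3:  4 actions, 4N simulations each
        #     ...
        # until a single action remains, which receives the leftover budget.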
        # Sequential Halving with Gumbel.
        while m >= 1:
            # Simulate each of the currently selected actions.
            for i in range(m):
                action_state = child_state_m[i]
                action, node = root_childs[A_topm[i]]
                for j in range(N):
                    action_state_copy = copy.deepcopy(action_state)
                    # Simulating an action means descending to a leaf of its
                    # subtree, predicting v with the network, and backing the
                    # value up the tree.
                    self.Gumbel_playout(node, action_state_copy)
            # Halve the number of actions sampled without replacement each round.
            m = m // 2
            if m != 1:
                # Not the last round: double the per-action simulation count.
                n = n - N
                N *= 2
            else:
                # Last round: only one action remains, spend the rest of the budget.
                N = n
            # Re-sample without replacement, keeping the better half of the
            # previous actions according to g + logits + sigma(q_hat).
            q_hat = np.array([action_node[1]._Q for action_node in self._root._children.items()])
            # Sanity check: every surviving action should have been visited.
            assert np.sum(q_hat[A_topm] == 0) == 0
            print("depend on:", np.array(probs)[A_topm] + np.array(g)[A_topm] + q_hat[A_topm])
            print(f"A_topm_{m}", A_topm)
            A_index = self.top_k(np.array(probs)[A_topm] + np.array(g)[A_topm] + q_hat[A_topm], m)
            A_topm = np.array(A_topm)[A_index]
            child_state_m = np.array(child_state_m)[A_index]
        # Finally, return the improved policy pi' = softmax(logits + sigma(completed Q)).
        max_N_b = np.max(np.array([act_node[1]._n_visits for act_node in self._root._children.items()]))
        final_act_probs = softmax(
            np.array([act_node[1].get_pi(leaf_value, max_N_b) for act_node in self._root._children.items()]))
        action = np.array([act_node[0] for act_node in self._root._children.items()])
        print("final_act_probs", final_act_probs)
        print("move :", action)
        print("final_action", np.array(list(self._root._children.items()))[A_topm][0][0])
        print("argmax_prob", np.argmax(final_act_probs))
        need_time = time.time() - start_time
        print(f" Gumbel Alphazero sum_time: {need_time}, total_simulation: {self._n_playout}")
        return np.array(list(self._root._children.items()))[A_topm][0][0], action, final_act_probs, need_time
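    # Numeric sketch of the improved policy returned above (made-up values):
    # with prior logits p = [0.6, 0.4], completed Q = [0.1, -0.2] and
    # max_b N(b) = 10, sigma(Q) = (50 + 10) * 1.0 * Q = [6.0, -12.0], so
    #     pi' = softmax([0.6 + 6.0, 0.4 - 12.0]) ~= [1.0, 0.0],
    # i.e. once actions have been simulated, the completed Q-values quickly
    # dominate the raw network prior in the returned distribution.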

    def update_with_move(self, last_move):
        """Step forward in the tree, keeping everything we already know
        about the subtree."""
        if last_move in self._root._children:
            self._root = self._root._children[last_move]
            self._root._parent = None
        else:
            self._root = TreeNode(None, 1.0)

    def __str__(self):
        return "MCTS"


class Gumbel_MCTSPlayer(object):
    """AI player based on Gumbel MCTS."""

    def __init__(self, policy_value_function,
                 c_puct=5, n_playout=2000, is_selfplay=0, m_action=16):
        self.mcts = Gumbel_MCTS(policy_value_function, c_puct, n_playout)
        self._is_selfplay = is_selfplay
        self.m_action = m_action

    def set_player_ind(self, p):
        self.player = p

    def reset_player(self):
        self.mcts.update_with_move(-1)

    def get_action(self, board, temp=1e-3, return_prob=0, return_time=False):
        sensible_moves = board.availables
        # The pi vector returned by MCTS, as in the AlphaGo Zero paper.
        move_probs = np.zeros(board.width * board.height)
        if len(sensible_moves) > 0:
            # Select an action with Sequential Halving with Gumbel in the search
            # tree and get back the corresponding improved policy.
            move, acts, probs, simul_mean_time = self.mcts.get_move_probs(board, temp, self.m_action)
            # Reset the search tree.
            self.mcts.update_with_move(-1)
            move_probs[list(acts)] = probs
            # Overwrite the distribution with a one-hot vector on the chosen move.
            move_probs = np.zeros(move_probs.shape[0])
            move_probs[move] = 1
            print("final prob:", move_probs)
            print("arg_max:", np.argmax(move_probs))
            print("max", np.max(move_probs))
            print("move", move)
            # Training should eventually push one entry of move_probs toward 1,
            # i.e. yield a near-deterministic policy.  The issue is that this
            # policy can disagree with the move returned by MCTS; the computation
            # of the returned distribution is the suspected cause.
            if return_time:
                if return_prob:
                    return move, move_probs, simul_mean_time
                else:
                    return move, simul_mean_time
            else:
                if return_prob:
                    return move, move_probs
                else:
                    return move
        else:
            print("WARNING: the board is full")

    def __str__(self):
        return "MCTS {}".format(self.player)