# Demo/Gomoku_MCTS/mcts_pure.py
# -*- coding: utf-8 -*-
import numpy as np
import copy
from operator import itemgetter
import time


def softmax(x):
probs = np.exp(x - np.max(x))
probs /= np.sum(probs)
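    # e.g. softmax(np.array([1.0, 2.0, 3.0])) is approximately
    # [0.090, 0.245, 0.665]; subtracting np.max(x) only guards against
    # overflow and does not change the result.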
return probs


def rollout_policy_fn(board):
"""a coarse, fast version of policy_fn used in the rollout phase."""
# rollout randomly
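    # Each available move gets an i.i.d. uniform score, so the argmax taken
    # in _evaluate_rollout amounts to a uniformly random move choice.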
action_probs = np.random.rand(len(board.availables))
return zip(board.availables, action_probs)


# Policy-value function.
def policy_value_fn(board):
"""a function that takes in a state and outputs a list of (action, probability)
tuples and a score for the state"""
# return uniform probabilities and 0 score for pure MCTS
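    # e.g. with board.availables == [3, 7, 8] this yields the priors
    # ((3, 1/3), (7, 1/3), (8, 1/3)) and a state value estimate of 0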
action_probs = np.ones(len(board.availables))/len(board.availables)
return zip(board.availables, action_probs), 0


class TreeNode(object):
"""A node in the MCTS tree. Each node keeps track of its own value Q,
prior probability P, and its visit-count-adjusted prior score u.
"""
def __init__(self, parent, prior_p):
self._parent = parent
self._children = {} # a map from action to TreeNode
self._n_visits = 0
self._Q = 0
self._u = 0
self._P = prior_p
def expand(self, action_priors):
"""Expand tree by creating new children.
action_priors: a list of tuples of actions and their prior probability
according to the policy function.
"""
for action, prob in action_priors:
if action not in self._children:
self._children[action] = TreeNode(self, prob)
def select(self, c_puct):
"""Select action among children that gives maximum action value Q
plus bonus u(P).
Return: A tuple of (action, next_node)
"""
return max(self._children.items(),
key=lambda act_node: act_node[1].get_value(c_puct))
def update(self, leaf_value):
"""Update node values from leaf evaluation.
leaf_value: the value of subtree evaluation from the current player's
perspective.
"""
# Count visit.
self._n_visits += 1
# Update Q, a running average of values for all visits.
# print("=====================================")
# print("Before, Q: {}, visits: {}, leaf_value: {}".format(self._Q, self._n_visits,leaf_value))
self._Q += 1.0*(leaf_value - self._Q) / self._n_visits
# print("After, Q: {}, visits: {}, leaf_value: {}".format(self._Q, self._n_visits,leaf_value))
def update_recursive(self, leaf_value):
"""Like a call to update(), but applied recursively for all ancestors.
"""
# If it is not root, this node's parent should be updated first.
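        # The value is negated at each level: in this two-player zero-sum
        # setting, parent and child evaluate the position from opposing
        # players' perspectives.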
if self._parent:
self._parent.update_recursive(-leaf_value)
self.update(leaf_value)
def get_value(self, c_puct):
"""Calculate and return the value for this node.
It is a combination of leaf evaluations Q, and this node's prior
adjusted for its visit count, u.
c_puct: a number in (0, inf) controlling the relative impact of
value Q, and prior probability P, on this node's score.
"""
self._u = (c_puct * self._P *
np.sqrt(self._parent._n_visits) / (1 + self._n_visits))
return self._Q + self._u
def is_leaf(self):
"""Check if leaf node (i.e. no nodes below this have been expanded).
"""
return self._children == {}
def is_root(self):
return self._parent is None


class MCTS(object):
"""A simple implementation of Monte Carlo Tree Search."""
def __init__(self, policy_value_fn, c_puct=5, n_playout=2000):
"""
policy_value_fn: a function that takes in a board state and outputs
a list of (action, probability) tuples and also a score in [-1, 1]
(i.e. the expected value of the end game score from the current
player's perspective) for the current player.
        c_puct: a number in (0, inf) that controls how quickly exploration
            converges to the maximum-value policy. A higher value means
            relying more on the prior.
        n_playout: the number of playouts to run for each move decision.
"""
self._root = TreeNode(None, 1.0)
self._policy = policy_value_fn
self._c_puct = c_puct
self._n_playout = n_playout
def _playout(self, state):
"""Run a single playout from the root to the leaf, getting a value at
the leaf and propagating it back through its parents.
State is modified in-place, so a copy must be provided.
"""
node = self._root
        while True:
if node.is_leaf():
break
# Greedily select next move.
action, node = node.select(self._c_puct)
state.do_move(action)
action_probs, _ = self._policy(state)
# Check for end of game
end, winner = state.game_end()
if not end:
node.expand(action_probs)
# Evaluate the leaf node by random rollout
leaf_value = self._evaluate_rollout(state)
# Update value and visit count of nodes in this traversal.
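        # leaf_value is from the perspective of the player to move in the
        # evaluated state, while this node's Q belongs to the player who
        # just moved into it, hence the negation.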
node.update_recursive(-leaf_value)
def _evaluate_rollout(self, state, limit=1000):
"""Use the rollout policy to play until the end of the game,
returning +1 if the current player wins, -1 if the opponent wins,
and 0 if it is a tie.
"""
player = state.get_current_player()
for i in range(limit):
end, winner = state.game_end()
if end:
break
action_probs = rollout_policy_fn(state)
max_action = max(action_probs, key=itemgetter(1))[0]
state.do_move(max_action)
else:
# If no break from the loop, issue a warning.
print("WARNING: rollout reached move limit")
if winner == -1: # tie
return 0
else:
return 1 if winner == player else -1
    def get_move(self, state):
        """Run all playouts sequentially and return the most visited action.
        state: the current game state
        Return: (move, mean simulation time per playout)
        """
        start_time = time.time()
        # n_playout is the number of playouts (search iterations) to run
for n in range(self._n_playout):
state_copy = copy.deepcopy(state)
self._playout(state_copy)
need_time = time.time() - start_time
print(f" PureMCTS sum_time: {need_time / self._n_playout }, total_simulation: {self._n_playout}")
return max(self._root._children.items(),key=lambda act_node: act_node[1]._n_visits)[0], need_time / self._n_playout
def update_with_move(self, last_move):
"""Step forward in the tree, keeping everything we already know
about the subtree.
"""
if last_move in self._root._children:
self._root = self._root._children[last_move]
self._root._parent = None
else:
self._root = TreeNode(None, 1.0)
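        # Passing any move not among the root's children (e.g. the -1 used
        # by reset_player) discards the whole tree and starts from scratch.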
def get_move_probs(self, state, temp=1e-3):
"""Run all playouts sequentially and return the available actions and
their corresponding probabilities.
state: the current game state
        temp: temperature parameter in (0, 1] that controls the level of exploration
"""
        start_time_average = 0
        ### test multi-thread (would need: import threading and
        ### from concurrent.futures import ThreadPoolExecutor)
        # lock = threading.Lock()
        # with ThreadPoolExecutor(max_workers=4) as executor:
        #     for n in range(self._n_playout):
        #         start_time = time.time()
        #         state_copy = copy.deepcopy(state)
        #         executor.submit(self._playout, state_copy, lock)
        #         start_time_average += (time.time() - start_time)
        ### end test multi-thread
t = time.time()
for n in range(self._n_playout):
start_time = time.time()
state_copy = copy.deepcopy(state)
self._playout(state_copy)
            start_time_average += (time.time() - start_time)
total_time = time.time() - t
        print(f" My MCTS total_time: {total_time}, total_simulation: {self._n_playout}")
# calc the move probabilities based on visit counts at the root node
act_visits = [(act, node._n_visits)
for act, node in self._root._children.items()]
acts, visits = zip(*act_visits)
act_probs = softmax(1.0 / temp * np.log(np.array(visits) + 1e-10))
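        # softmax(log(visits) / temp) is equivalent to normalizing
        # visits ** (1 / temp); at the default temp=1e-3 this is close to a
        # deterministic argmax over visit counts.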
        # The first element of the return tuple is a constant 0 (apparently
        # a placeholder).
        return 0, acts, act_probs, total_time
def __str__(self):
return "MCTS"


class MCTSPlayer(object):
"""AI player based on MCTS"""
def __init__(self, c_puct=5, n_playout=2000):
self.mcts = MCTS(policy_value_fn, c_puct, n_playout)
def set_player_ind(self, p):
self.player = p
def reset_player(self):
self.mcts.update_with_move(-1)
    def get_action(self, board, return_time=False):
        sensible_moves = board.availables
        if len(sensible_moves) > 0:
            move, simul_mean_time = self.mcts.get_move(board)
            self.mcts.update_with_move(-1)
            print("MCTS move:", move)
            if return_time:
                return move, simul_mean_time
            return move
        else:
            print("WARNING: the board is full")
def __str__(self):
return "MCTS {}".format(self.player)


# The block below is an addition on top of the original implementation.
class Human_Player(object):
def __init__(self):
pass
def set_player_ind(self, p):
self.player = p
    def get_action(self, board):
        sensible_moves = board.availables
        if len(sensible_moves) > 0:
            # print(sensible_moves)
            try:
                move = int(input("Input the move:"))
            except ValueError:
                move = -1
            while move not in sensible_moves:
                print(sensible_moves)
                try:
                    move = int(input("Input the move again:"))
                except ValueError:
                    move = -1
            return move
        else:
            print("WARNING: the board is full")
def __str__(self):
return "Human {}".format(self.player)