- Gomoku_Bot/eval.py +0 -39
- Gomoku_Bot/gomoku_bot.py +3 -2
- Gomoku_MCTS/__init__.py +4 -2
- Gomoku_MCTS/checkpoint/2023-12-14-18-16-09_test_teaching_learning_collect_epochs=1000_size=9_model=duel/best_policy.model +3 -0
- Gomoku_MCTS/checkpoint/2023-12-14-18-16-09_test_teaching_learning_collect_epochs=1000_size=9_model=duel/current_policy.model +3 -0
- Gomoku_MCTS/checkpoint/2023-12-14-18-17-07_test_teaching_learning_collect_epochs=1000_size=9_model=normal/best_policy.model +3 -0
- Gomoku_MCTS/checkpoint/2023-12-14-18-17-07_test_teaching_learning_collect_epochs=1000_size=9_model=normal/current_policy.model +3 -0
- Gomoku_MCTS/checkpoint/2023-12-14-21-19-40_selfplay_epochs=1000_size=9_model=gumbel/best_policy.model +3 -0
- Gomoku_MCTS/checkpoint/2023-12-14-21-19-40_selfplay_epochs=1000_size=9_model=gumbel/current_policy.model +3 -0
- Gomoku_MCTS/config/utils.py +12 -12
- Gomoku_MCTS/dueling_net.py +13 -10
- Gomoku_MCTS/mcts_Gumbel_Alphazero.py +96 -103
- Gomoku_MCTS/policy_value_net_pytorch_new.py +234 -0
- const.py +8 -1
- pages/Player_VS_AI.py +96 -24
Gomoku_Bot/eval.py
CHANGED
@@ -460,11 +460,9 @@ class Evaluate:
         for point in fives:
             x = point // self.size
             y = point % self.size
-            model_train_matrix[x][y] = max(FIVE, model_train_matrix[x][y])
         for point in block_fives:
             x = point // self.size
             y = point % self.size
-            model_train_matrix[x][y] = max(BLOCK_FIVE, model_train_matrix[x][y])

         return set(list(fives) + list(block_fives)), model_train_matrix

@@ -474,12 +472,10 @@ class Evaluate:
         for point in fours:
             x = point // self.size
             y = point % self.size
-            model_train_matrix[x][y] = max(FOUR, model_train_matrix[x][y])

         for point in block_fours:
             x = point // self.size
             y = point % self.size
-            model_train_matrix[x][y] = max(BLOCK_FOUR, model_train_matrix[x][y])

         return set(list(fours) + list(block_fours)), model_train_matrix

@@ -488,12 +484,10 @@ class Evaluate:
         for point in four_fours:
             x = point // self.size
             y = point % self.size
-            model_train_matrix[x][y] = max(FOUR_FOUR, model_train_matrix[x][y])

         for point in block_fours:
             x = point // self.size
             y = point % self.size
-            model_train_matrix[x][y] = max(BLOCK_FOUR, model_train_matrix[x][y])

         return set(list(four_fours) + list(block_fours)), model_train_matrix

@@ -504,17 +498,14 @@ class Evaluate:
         for point in four_threes:
             x = point // self.size
             y = point % self.size
-            model_train_matrix[x][y] = max(FOUR_THREE, model_train_matrix[x][y])

         for point in block_fours:
             x = point // self.size
             y = point % self.size
-            model_train_matrix[x][y] = max(BLOCK_FOUR, model_train_matrix[x][y])

         for point in threes:
             x = point // self.size
             y = point % self.size
-            model_train_matrix[x][y] = max(THREE, model_train_matrix[x][y])

         return set(list(four_threes) + list(block_fours) + list(threes)), model_train_matrix

@@ -524,17 +515,14 @@ class Evaluate:
         for point in three_threes:
             x = point // self.size
             y = point % self.size
-            model_train_matrix[x][y] = max(THREE_THREE, model_train_matrix[x][y])

         for point in block_fours:
             x = point // self.size
             y = point % self.size
-            model_train_matrix[x][y] = max(BLOCK_FOUR, model_train_matrix[x][y])

         for point in threes:
             x = point // self.size
             y = point % self.size
-            model_train_matrix[x][y] = max(THREE, model_train_matrix[x][y])

         return set(list(three_threes) + list(block_fours) + list(threes)), model_train_matrix

@@ -542,43 +530,16 @@ class Evaluate:
         for point in threes:
             x = point // self.size
             y = point % self.size
-            model_train_matrix[x][y] = max(THREE, model_train_matrix[x][y])

         for point in block_fours:
             x = point // self.size
             y = point % self.size
-            model_train_matrix[x][y] = max(BLOCK_FOUR, model_train_matrix[x][y])
             return set(list(block_fours) + list(threes)), model_train_matrix

         block_threes = points[shapes['BLOCK_THREE']]
         two_twos = points[shapes['TWO_TWO']]
         twos = points[shapes['TWO']]

-        for point in block_threes:
-            x = point // self.size
-            y = point % self.size
-            model_train_matrix[x][y] = max(BLOCK_THREE, model_train_matrix[x][y])
-
-        for point in two_twos:
-            x = point // self.size
-            y = point % self.size
-            model_train_matrix[x][y] = max(TWO_TWO, model_train_matrix[x][y])
-
-        for point in twos:
-            x = point // self.size
-            y = point % self.size
-            model_train_matrix[x][y] = max(TWO, model_train_matrix[x][y])
-
-        for point in block_fours:
-            x = point // self.size
-            y = point % self.size
-            model_train_matrix[x][y] = max(BLOCK_FOUR, model_train_matrix[x][y])
-
-        for point in threes:
-            x = point // self.size
-            y = point % self.size
-            model_train_matrix[x][y] = max(THREE, model_train_matrix[x][y])
-
         mid = list(block_fours) + list(threes) + list(block_threes) + list(two_twos) + list(twos)
         res = set(mid[:5])
         for i in range(len(model_train_matrix)):
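Throughout these loops a board point is stored as a single flat integer and decoded with x = point // self.size, y = point % self.size; after this change only that decoding and the returned point sets remain. A tiny standalone illustration of the index scheme (the size and sample point below are arbitrary, not taken from the commit):

# Flat-index <-> (row, col) decoding used by Evaluate above; values are illustrative.
size = 15
point = 47
x, y = point // size, point % size      # decode: row 3, column 2
assert point == x * size + y            # encoding round-trips
print(x, y)                             # -> 3 2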
Gomoku_Bot/gomoku_bot.py
CHANGED
@@ -3,7 +3,7 @@ import time


 class Gomoku_bot:
-    def __init__(self, board, role, depth=4, enableVCT=
+    def __init__(self, board, role, depth=4, enableVCT=False):
         self.board = board
         self.role = role
         self.depth = depth
@@ -14,7 +14,8 @@
         score = minmax(self.board, self.role, self.depth, self.enableVCT)
         end = time.time()
         sim_time = end - start
-        move = score[1]
+        move = score[1]  # this move is indexed from the top-left corner (0, 0), while moves in the game are indexed from the bottom-left corner (0, 0)
+        move = (self.board.size - 1 - move[0], move[1])  # convert the move to the game's coordinate system
         # turn tuple into an int
         move = move[0] * self.board.size + move[1]
         if return_time:
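The two added lines flip the row index (bot coordinates use a top-left origin, the game uses a bottom-left origin) before flattening the tuple to a single integer. A small sketch of just that conversion (board size and sample move are illustrative values, not from the commit):

# Row-flip + flatten, mirroring the change above.
size = 9
move = (2, 5)                          # bot coordinates, origin at the top-left
move = (size - 1 - move[0], move[1])   # game coordinates, origin at the bottom-left -> (6, 5)
flat = move[0] * size + move[1]        # single integer used by the game board -> 59
print(move, flat)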
Gomoku_MCTS/__init__.py
CHANGED
@@ -1,7 +1,9 @@
 from .mcts_pure import MCTSPlayer as MCTSpure
 from .mcts_alphaZero import MCTSPlayer as alphazero
-
-from .
+from .policy_value_net_pytorch import PolicyValueNet as PolicyValueNet_old
+from .policy_value_net_pytorch_new import PolicyValueNet as PolicyValueNet_new
+from .dueling_net import PolicyValueNet as duel_PolicyValueNet
+from .mcts_Gumbel_Alphazero import Gumbel_MCTSPlayer
 import numpy as np

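For context, these re-exports are what the UI code later in this commit consumes; a minimal usage sketch (the constructor arguments mirror pages/Player_VS_AI.py below and are otherwise assumptions, not committed code):

# Assumed usage of the re-exports above.
from Gomoku_MCTS import MCTSpure, alphazero, Board, PolicyValueNet_new, Gumbel_MCTSPlayer

board = Board(width=9, height=9, n_in_row=5)                       # game-state container
net = PolicyValueNet_new(9, 9, model_file=None)                    # untrained net; pass a checkpoint path to load weights
pure_player = MCTSpure(c_puct=5, n_playout=1000)                   # baseline pure MCTS player
az_player = alphazero(net.policy_value_fn, c_puct=5, n_playout=100)
gumbel_player = Gumbel_MCTSPlayer(net.policy_value_fn, c_puct=5, n_playout=100, m_action=8)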
Gomoku_MCTS/checkpoint/2023-12-14-18-16-09_test_teaching_learning_collect_epochs=1000_size=9_model=duel/best_policy.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6278cd8f69e66f42a927df96e1fe3952a6e1e5a41e37f99f315b9bc3febd6d7a
+size 529974

Gomoku_MCTS/checkpoint/2023-12-14-18-16-09_test_teaching_learning_collect_epochs=1000_size=9_model=duel/current_policy.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ab24465e4c52e038fda2bafae71550cb89c3f42f59d2ebf12b7a45c2c353eb33
+size 530034

Gomoku_MCTS/checkpoint/2023-12-14-18-17-07_test_teaching_learning_collect_epochs=1000_size=9_model=normal/best_policy.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d43bc56e0bc86c5548d7857b6777b1971d2d14da6f344280b6f81be8595ac710
+size 555837

Gomoku_MCTS/checkpoint/2023-12-14-18-17-07_test_teaching_learning_collect_epochs=1000_size=9_model=normal/current_policy.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9ab8cee4afa72bf29d73d0707ad1b92e1d0a24a009721c110c99b2ff5d2f866f
+size 556110

Gomoku_MCTS/checkpoint/2023-12-14-21-19-40_selfplay_epochs=1000_size=9_model=gumbel/best_policy.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b8a225a330990d289278d8fa2cbd8bda0a7ea541c3ffd7aac6327d4553ef8683
+size 555837

Gomoku_MCTS/checkpoint/2023-12-14-21-19-40_selfplay_epochs=1000_size=9_model=gumbel/current_policy.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0dfdc692e55ba43f6ae714da93ee6182b9adcf8824c292bb597ef0d003c6d10b
+size 556110
Gomoku_MCTS/config/utils.py
CHANGED
@@ -1,7 +1,7 @@
 import os, shutil
 import torch
-from tensorboardX import SummaryWriter
-from
+# from tensorboardX import SummaryWriter
+from .options import *
 import torch.distributed as dist
 import time

@@ -42,13 +42,13 @@ def makedir(path):
     os.makedirs(path, 0o777)


-def visualizer():
-    if get_rank() == 0:
-        # filewriter_path = config['visual_base']+opts.savepath+'/'
-        save_path = make_path()
-        filewriter_path = os.path.join(config['visual_base'], save_path)
-        if opts.clear_visualizer and os.path.exists(filewriter_path):  # delete the old summaries so they do not overlap
-            shutil.rmtree(filewriter_path)
-        makedir(filewriter_path)
-        writer = SummaryWriter(filewriter_path, comment='visualizer')
-        return writer
+# def visualizer():
+#     if get_rank() == 0:
+#         # filewriter_path = config['visual_base']+opts.savepath+'/'
+#         save_path = make_path()
+#         filewriter_path = os.path.join(config['visual_base'], save_path)
+#         if opts.clear_visualizer and os.path.exists(filewriter_path):  # delete the old summaries so they do not overlap
+#             shutil.rmtree(filewriter_path)
+#         makedir(filewriter_path)
+#         writer = SummaryWriter(filewriter_path, comment='visualizer')
+#         return writer
Gomoku_MCTS/dueling_net.py
CHANGED
@@ -52,16 +52,16 @@ class DuelingDQNNet(nn.Module):
         return F.log_softmax(q_values, dim=1), val

 class PolicyValueNet():
-    """
+    """policy-value network """
     def __init__(self, board_width, board_height,
-                 model_file=None, use_gpu=False):
+                 model_file=None, use_gpu=False, device=None):
         self.use_gpu = use_gpu
         self.board_width = board_width
         self.board_height = board_height
         self.l2_const = 1e-4  # coef of l2 penalty
         # the policy value net module
         if self.use_gpu:
-            self.policy_value_net = DuelingDQNNet(board_width, board_height).
+            self.policy_value_net = DuelingDQNNet(board_width, board_height).to(device)
         else:
             self.policy_value_net = DuelingDQNNet(board_width, board_height)
         self.optimizer = optim.Adam(self.policy_value_net.parameters(),
@@ -70,7 +70,6 @@ class PolicyValueNet():
         if model_file:
             net_params = torch.load(model_file)
             self.policy_value_net.load_state_dict(net_params, strict=False)
-            print('loaded dueling model file')

     def policy_value(self, state_batch):
         """
@@ -78,7 +77,7 @@ class PolicyValueNet():
         output: a batch of action probabilities and state values
         """
         if self.use_gpu:
-            state_batch = Variable(torch.FloatTensor(state_batch).
+            state_batch = Variable(torch.FloatTensor(state_batch).to(device))
             log_act_probs, value = self.policy_value_net(state_batch)
             act_probs = np.exp(log_act_probs.data.cpu().numpy())
             return act_probs, value.data.cpu().numpy()
@@ -97,16 +96,20 @@ class PolicyValueNet():
         legal_positions = board.availables
         current_state = np.ascontiguousarray(board.current_state().reshape(
                 -1, 4, self.board_width, self.board_height))
+
         if self.use_gpu:
             log_act_probs, value = self.policy_value_net(
-                    Variable(torch.from_numpy(current_state)).
+                    Variable(torch.from_numpy(current_state)).to(device).float())
             act_probs = np.exp(log_act_probs.data.cpu().numpy().flatten())
         else:
             log_act_probs, value = self.policy_value_net(
                     Variable(torch.from_numpy(current_state)).float())
             act_probs = np.exp(log_act_probs.data.numpy().flatten())
+
         act_probs = zip(legal_positions, act_probs[legal_positions])
-
+
+
+
         return act_probs, value

     def train_step(self, state_batch, mcts_probs, winner_batch, lr):
@@ -115,9 +118,9 @@ class PolicyValueNet():
         # self.use_gpu = True
         # wrap in Variable
         if self.use_gpu:
-            state_batch = Variable(torch.FloatTensor(state_batch).
-            mcts_probs = Variable(torch.FloatTensor(mcts_probs).
-            winner_batch = Variable(torch.FloatTensor(winner_batch).
+            state_batch = Variable(torch.FloatTensor(state_batch).to(device))
+            mcts_probs = Variable(torch.FloatTensor(mcts_probs).to(device))
+            winner_batch = Variable(torch.FloatTensor(winner_batch).to(device))
         else:
             state_batch = Variable(torch.FloatTensor(state_batch))
             mcts_probs = Variable(torch.FloatTensor(mcts_probs))
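One caveat when reusing this pattern: `device` is only a constructor argument here, while `.to(device)` is also used inside `policy_value`, `policy_value_fn`, and `train_step`. The new policy_value_net_pytorch_new.py later in this commit stores the device on the instance instead; a minimal sketch of that approach (illustrative only, not the committed code):

import torch

class PolicyValueNetSketch:
    """Minimal sketch: keep the torch.device on the instance so every method can reach it."""

    def __init__(self, board_width, board_height, use_gpu=False, device=None):
        # fall back to CPU when no device is given
        if device is None:
            device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
        self.device = device
        self.board_width = board_width
        self.board_height = board_height

    def to_tensor(self, batch):
        # every tensor goes through self.device, so no free 'device' name is needed
        return torch.as_tensor(batch, dtype=torch.float32, device=self.device)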
Gomoku_MCTS/mcts_Gumbel_Alphazero.py
CHANGED
@@ -1,5 +1,5 @@
 """
-FileName:
+FileName: mcts_Gumbel_Alphazero.py
 Author: Jiaxin Li
 Create Date: 2023/11/21
 Description: The implement of Gumbel MCST
@@ -9,11 +9,11 @@ Debug: the dim of output: probs

 import numpy as np
 import copy
-import time
+import time

-from config.options import *
+from .config.options import *
 import sys
-from config.utils import *
+from .config.utils import *


 def softmax(x):
@@ -22,8 +22,8 @@ def softmax(x):
     return probs


-def _sigma_mano(y
-    return (50 + Nb) * 1.0 * y
+def _sigma_mano(y, Nb):
+    return (50 + Nb) * 1.0 * y


 class TreeNode(object):
@@ -42,8 +42,6 @@ class TreeNode(object):
         self._v = 0
         self._p = prior_p

-
-
     def expand(self, action_priors):
         """Expand tree by creating new children.
        action_priors: a list of tuples of actions and their prior probability
@@ -52,7 +50,6 @@ class TreeNode(object):
         for action, prob in action_priors:
             if action not in self._children:
                 self._children[action] = TreeNode(self, prob)
-

     def select(self, v_pi):
         """Select action among children that gives maximum
@@ -62,29 +59,25 @@ class TreeNode(object):
         # if opts.split == "train":
         #     v_pi = v_pi.detach().numpy()
         # print(v_pi)
-
-
-

-        max_N_b = np.max(np.array(
+        max_N_b = np.max(np.array([act_node[1]._n_visits for act_node in self._children.items()]))

         if opts.split == "train":
-            pi_ = softmax(
+            pi_ = softmax(np.array([act_node[1].get_pi(v_pi, max_N_b) for act_node in self._children.items()])).reshape(
+                len(list(self._children.items())), -1)
         else:
-            pi_ = softmax(
+            pi_ = softmax(np.array([act_node[1].get_pi(v_pi, max_N_b) for act_node in self._children.items()])).reshape(
+                len(list(self._children.items())), -1)
         # print(pi_.shape)
-

-        N_a = np.array(
+        N_a = np.array([act_node[1]._n_visits / (1 + self._n_visits) for act_node in self._children.items()]).reshape(
+            pi_.shape[0], -1)
         # print(N_a.shape)

-        max_index=
+        max_index = np.argmax(pi_ - N_a)
         # print((pi_ - N_a).shape)
-
-
-
-        return list(self._children.items())[max_index]
+
+        return list(self._children.items())[max_index]

     def update(self, leaf_value):
         """Update node values from leaf evaluation.
@@ -95,11 +88,9 @@ class TreeNode(object):
         self._n_visits += 1
         # Update Q, a running average of values for all visits.
         if opts.split == "train":
-            self._Q = self._Q +
-
-
-        else:
-            self._Q += (1.0*(leaf_value - self._Q) / self._n_visits)
+            self._Q = self._Q + (1.0 * (leaf_value - self._Q) / self._n_visits)
+        else:
+            self._Q += (1.0 * (leaf_value - self._Q) / self._n_visits)

     def update_recursive(self, leaf_value):
         """Like a call to update(), but applied recursively for all ancestors.
@@ -109,14 +100,13 @@ class TreeNode(object):
             self._parent.update_recursive(-leaf_value)
         self.update(leaf_value)

-    def get_pi(self,v_pi,max_N_b):
+    def get_pi(self, v_pi, max_N_b):
         if self._n_visits == 0:
             Q_completed = v_pi
         else:
             Q_completed = self._Q
-
-        return self._p + _sigma_mano(Q_completed,max_N_b)
+
+        return self._p + _sigma_mano(Q_completed, max_N_b)

     def get_value(self, c_puct):
         """Calculate and return the value for this node.
@@ -155,9 +145,6 @@ class Gumbel_MCTS(object):
         self._c_puct = c_puct
         self._n_playout = n_playout

-
-
-
     def Gumbel_playout(self, child_node, child_state):
         """Run a single playout from the child of the root to the leaf, getting a value at
         the leaf and propagating it back through its parents.
@@ -166,28 +153,26 @@ class Gumbel_MCTS(object):
         """
         node = child_node
         state = child_state
-
-        while(1):
+
+        while (1):
             if node.is_leaf():
                 break
             # Greedily select next move.

             action, node = node.select(node._v)
-
-            state.do_move(action)
-

+            state.do_move(action)

         # Evaluate the leaf using a network which outputs a list of
         # (action, probability) tuples p and also a score v in [-1, 1]
         # for the current player.
+
         action_probs, leaf_value = self._policy(state)
-
-        leaf_value = leaf_value.detach().numpy()[0][0]

-
-
+        # leaf_value = leaf_value.detach().numpy()[0][0]
+        leaf_value = leaf_value.detach().numpy()

+        node._v = leaf_value

         # Check for end of game.
         end, winner = state.game_end()
@@ -204,24 +189,19 @@ class Gumbel_MCTS(object):

         # Update value and visit count of nodes in this traversal.
         node.update_recursive(-leaf_value)
-

-    def top_k(self,x, k):
+    def top_k(self, x, k):
         # print("x",x.shape)
         # print("k ", k)

         return np.argpartition(x, k)[..., -k:]

-    def sample_k(self,logits, k):
+    def sample_k(self, logits, k):
         u = np.random.uniform(size=np.shape(logits))
         z = -np.log(-np.log(u))
-
-
-        return self.top_k(logits + z, k),z
-
-
-    def get_move_probs(self, state, temp=1e-3,m_action = 16):
+        return self.top_k(logits + z, k), z

+    def get_move_probs(self, state, temp=1e-3, m_action=16):
         """Run all playouts sequentially and return the available actions and
         their corresponding probabilities.
         state: the current game state
@@ -231,92 +211,102 @@ class Gumbel_MCTS(object):
         # logits are tentatively taken to be p

         start_time = time.time()
-
-
         # expand the root node
+
         act_probs, leaf_value = self._policy(state)
-        act_probs = list(act_probs)

-
-
+        act_probs = list(act_probs)
+
+        # leaf_value = leaf_value.detach().numpy()[0][0]
+        leaf_value = leaf_value.detach().numpy()
         # print(list(act_probs))
-        porbs = [prob
-        self._root.expand(act_probs)
+        porbs = [prob for act, prob in (act_probs)]
+
+        self._root.expand(act_probs)

         n = self._n_playout
-        m = min(
-
+        m = min(m_action, int(len(porbs) / 2))

         # first draw Gumbel samples: pick the top-m actions without replacement, following the selection rule logits + g
-        A_topm
-
+        A_topm, g = self.sample_k(porbs, m)
+
         # for each sampled action, store the state reached after playing it in a list
         root_childs = list(self._root._children.items())
-

         child_state_m = []
         for i in range(m):
             state_copy = copy.deepcopy(state)
-            action,node = root_childs[A_topm[i]]
+            action, node = root_childs[A_topm[i]]
             state_copy.do_move(action)
             child_state_m.append(state_copy)
-
-
-
+        print(porbs)
+
+        print("depend on:", np.array(porbs)[A_topm])
+        print(f"A_topm_{m}", A_topm)

+        print("m ", m)
+
+        if m > 1:
+            # number of simulations per selected action in each round
+            N = int(n / (np.log(m) * m))
+        else:
+            N = n

         # run sequential halving with Gumbel
         while m >= 1:
-
+
             # simulate each selected action
             for i in range(m):
                 action_state = child_state_m[i]
-
-                action,node = root_childs[A_topm[i]]
-
+
+                action, node = root_childs[A_topm[i]]
+
                 for j in range(N):
                     action_state_copy = copy.deepcopy(action_state)
-
+
                     # simulate the chosen action: descend to a leaf of this subtree, predict v with the network, then back the value up
                     self.Gumbel_playout(node, action_state_copy)

             # halve the number of sampled actions each round
-            m = m //2
+            m = m // 2

             # if this is not the last round, double the per-action simulation budget
-            if(m != 1):
+            if (m != 1):
                 n = n - N
                 N *= 2
             # in the last round only one action remains, so spend all remaining simulations
             else:
                 N = n
-
+
             # resample without replacement among the better half of the previous actions, following the rule g + logits + \sigma( \hat{q} )
             # print([action_node[1]._Q for action_node in self._root._children.items() ])
-
-
-            q_hat = np.array([action_node[1]._Q for action_node in self._root._children.items() ])
-
+
+            q_hat = np.array([action_node[1]._Q for action_node in self._root._children.items()])
+            assert (np.sum(q_hat[A_topm] == 0) == 0)
+
+            print("depend on:", np.array(porbs)[A_topm] + np.array(g)[A_topm] + q_hat[A_topm])
+            print(f"A_topm_{m}", A_topm)

-            A_index = self.top_k(
+            A_index = self.top_k(np.array(porbs)[A_topm] + np.array(g)[A_topm] + q_hat[A_topm], m)
             A_topm = np.array(A_topm)[A_index]
             child_state_m = np.array(child_state_m)[A_index]
-
-
+
         # finally return the corresponding policy, i.e. pi' = softmax(logits + sigma(completed Q))

-        max_N_b = np.max(np.array(
+        max_N_b = np.max(np.array([act_node[1]._n_visits for act_node in self._root._children.items()]))

-        final_act_probs=
-
+        final_act_probs = softmax(
+            np.array([act_node[1].get_pi(leaf_value, max_N_b) for act_node in self._root._children.items()]))

+        action = (np.array([act_node[0] for act_node in self._root._children.items()]))
+        print("final_act_prbs", final_act_probs)
+        print("move :", action)
+        print("final_action", np.array(list(self._root._children.items()))[A_topm][0][0])
+        print("argmax_prob", np.argmax(final_act_probs))
         need_time = time.time() - start_time
-        print(f" Gumbel Alphazero sum_time: {need_time
+        print(f" Gumbel Alphazero sum_time: {need_time}, total_simulation: {self._n_playout}")

-        return
+        return np.array(list(self._root._children.items()))[A_topm][0][0], action, final_act_probs, need_time

     def update_with_move(self, last_move):
         """Step forward in the tree, keeping everything we already know
@@ -336,50 +326,53 @@ class Gumbel_MCTSPlayer(object):
     """AI player based on MCTS"""

     def __init__(self, policy_value_function,
-                 c_puct=5, n_playout=2000, is_selfplay=0,m_action
+                 c_puct=5, n_playout=2000, is_selfplay=0, m_action=16):
         self.mcts = Gumbel_MCTS(policy_value_function, c_puct, n_playout)
         self._is_selfplay = is_selfplay
         self.m_action = m_action

-
     def set_player_ind(self, p):
         self.player = p

     def reset_player(self):
         self.mcts.update_with_move(-1)

-
-    def get_action(self, board, temp=1e-3, return_prob=0,return_time = False):
+    def get_action(self, board, temp=1e-3, return_prob=0, return_time=False):
         sensible_moves = board.availables
         # the pi vector returned by MCTS as in the alphaGo Zero paper
-        move_probs = np.zeros(board.width*board.height)
-
-
-
+        move_probs = np.zeros(board.width * board.height)
+
         if len(sensible_moves) > 0:

             # select the action in the search tree via sequential halving with Gumbel and return the corresponding policy
-            move, acts, probs,simul_mean_time
-
-
+            move, acts, probs, simul_mean_time = self.mcts.get_move_probs(board, temp, self.m_action)

             # reset the search tree
             self.mcts.update_with_move(-1)

             move_probs[list(acts)] = probs
+            move_probs = np.zeros(move_probs.shape[0])
+            move_probs[move] = 1
+
+            print("final prob:", move_probs)
+            print("arg_max:", np.argmax(move_probs))
+            print("max", np.max(move_probs))
+            print("move", move)

+            # training drives one entry of move_probs towards 1, i.e. it yields a policy
+            # the issue is that this policy can disagree with the move returned by MCTS; likely a problem in how the policy distribution is computed

             if return_time:

                 if return_prob:
-
-                    return move, move_probs,simul_mean_time
+
+                    return move, move_probs, simul_mean_time
                 else:
-                    return move,simul_mean_time
+                    return move, simul_mean_time
             else:

                 if return_prob:
-
+
                     return move, move_probs
                 else:
                     return move
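The loop in get_move_probs implements a Sequential Halving style schedule: sample m actions, give each a per-round budget N, halve the candidate set after every round and grow the per-action budget, then spend whatever remains on the final survivor. A standalone sketch that mirrors the committed schedule so it can be inspected in isolation (the function name and yielded tuple are illustrative placeholders; np.log follows the committed code rather than log2, and no claim is made that the budgets sum exactly to n_playout):

import numpy as np

def halving_schedule(n_playout, m_actions):
    """Yield (number of surviving actions m, per-action budget N) per round, as in get_move_probs above."""
    n = n_playout
    m = m_actions
    N = int(n / (np.log(m) * m)) if m > 1 else n   # first-round per-action budget, as in the diff
    while m >= 1:
        yield m, N
        m = m // 2            # keep the better half of the candidates
        if m != 1:
            n = n - N         # account for what this round used ...
            N *= 2            # ... and double the per-action budget
        else:
            N = n             # last round: one action, spend everything left

# Example: print the schedule for 100 playouts and 8 initially sampled actions.
for m, per_action in halving_schedule(100, 8):
    print(f"round with m={m}: {per_action} simulations per action")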
Gomoku_MCTS/policy_value_net_pytorch_new.py
ADDED
@@ -0,0 +1,234 @@
+# -*- coding: utf-8 -*-
+"""
+An implementation of the policyValueNet in PyTorch
+Tested in PyTorch 0.2.0 and 0.3.0
+
+@author: Junxiao Song
+"""
+
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torch.nn.functional as F
+from torch.autograd import Variable
+import numpy as np
+
+
+def set_learning_rate(optimizer, lr):
+    """Sets the learning rate to the given value"""
+    for param_group in optimizer.param_groups:
+        param_group['lr'] = lr
+
+
+class ResidualBlock(nn.Module):
+    def __init__(self, channels):
+        super(ResidualBlock, self).__init__()
+        self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
+        self.bn1 = nn.BatchNorm2d(channels)
+        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
+        self.bn2 = nn.BatchNorm2d(channels)
+
+    def forward(self, x):
+        residual = x
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = self.bn2(self.conv2(out))
+        out += residual
+        return F.relu(out)
+
+
+class Net(nn.Module):
+    """Policy-Value network module for AlphaZero Gomoku."""
+    def __init__(self, board_width, board_height, num_residual_blocks=5):
+        super(Net, self).__init__()
+        self.board_width = board_width
+        self.board_height = board_height
+        self.conv1 = nn.Conv2d(4, 32, kernel_size=3, padding=1)
+        self.bn1 = nn.BatchNorm2d(32)
+        self.res_layers = nn.Sequential(*[ResidualBlock(32) for _ in range(num_residual_blocks)])
+
+        # Action Policy layers
+        self.act_conv1 = nn.Conv2d(32, 4, kernel_size=1)
+        self.act_fc1 = nn.Linear(4 * board_width * board_height, board_width * board_height)
+
+        # State Value layers
+        self.val_conv1 = nn.Conv2d(32, 2, kernel_size=1)
+        self.val_fc1 = nn.Linear(2 * board_width * board_height, 64)
+        self.val_fc2 = nn.Linear(64, 1)
+
+    def forward(self, x):
+        x = F.relu(self.bn1(self.conv1(x)))
+        x = self.res_layers(x)
+
+        # Action Policy head
+        x_act = F.relu(self.act_conv1(x))
+        x_act = x_act.view(-1, 4 * self.board_width * self.board_height)
+        x_act = F.log_softmax(self.act_fc1(x_act), dim=1)
+
+        # State Value head
+        x_val = F.relu(self.val_conv1(x))
+        x_val = x_val.view(-1, 2 * self.board_width * self.board_height)
+        x_val = F.relu(self.val_fc1(x_val))
+        x_val = torch.tanh(self.val_fc2(x_val))
+
+        return x_act, x_val
+
+
+class PolicyValueNet():
+    """policy-value network """
+
+    def __init__(self, board_width, board_height,
+                 model_file=None, use_gpu=False, bias=False):
+        self.device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
+        self.use_gpu = use_gpu
+        self.l2_const = 1e-4  # coef of l2 penalty
+        self.board_width = board_width
+        self.board_height = board_height
+        self.bias = bias
+
+        if model_file:
+            net_params = torch.load(model_file, map_location='cpu' if not use_gpu else None)
+
+            # Infer board dimensions from the loaded model
+            inferred_width, inferred_height = self.infer_board_size_from_model(net_params)
+            if inferred_width and inferred_height:
+                self.policy_value_net = Net(inferred_width, inferred_height).to(self.device) if use_gpu else Net(
+                    inferred_width, inferred_height)
+                self.policy_value_net.load_state_dict(net_params)
+                print("Use model file to initialize the policy value net")
+            else:
+                raise Exception("The model file does not contain the board dimensions")
+
+            if inferred_width < board_width:
+                self.use_conv = True
+            elif inferred_width > board_width:
+                raise Exception("The model file has a larger board size than the current board size!!")
+        else:
+            # the policy value net module
+            if self.use_gpu:
+                self.policy_value_net = Net(board_width, board_height).to(self.device)
+            else:
+                self.policy_value_net = Net(board_width, board_height)
+
+        self.optimizer = optim.Adam(self.policy_value_net.parameters(),
+                                    weight_decay=self.l2_const)
+
+    def infer_board_size_from_model(self, model):
+        # Use the size of the act_fc1 layer to infer board dimensions
+        for name in model.keys():
+            if name == 'act_fc1.weight':
+                # Assuming the weight shape is [board_width * board_height, 4 * board_width * board_height]
+                c, _ = model[name].shape
+                print(f"act_fc1.weight shape: {model[name].shape}")
+                board_size = int(c ** 0.5)  # Extracting board_width/height assuming they are the same
+                print(f"Board size inferred from model: {board_size}x{board_size}")
+                return board_size, board_size
+        return None
+
+    def apply_normal_bias(self, tensor, mean=0, std=1):
+        bsize = tensor.shape[0]
+        x, y = np.meshgrid(np.linspace(-1, 1, bsize), np.linspace(-1, 1, bsize))
+        d = np.sqrt(x * x + y * y)
+        sigma, mu = 1.0, 0.0
+        gauss = np.exp(-((d - mu) ** 2 / (2.0 * sigma ** 2)))
+        # Applying the bias only to non-zero elements
+        biased_tensor = tensor - (tensor != 0) * gauss
+        return biased_tensor
+
+    def policy_value(self, state_batch):
+        """
+        input: a batch of states
+        output: a batch of action probabilities and state values
+        """
+        if self.use_gpu:
+            state_batch = Variable(torch.FloatTensor(state_batch).to(self.device))
+            log_act_probs, value = self.policy_value_net(state_batch)
+            act_probs = np.exp(log_act_probs.data.cpu().numpy())
+            return act_probs, value.data.cpu().numpy()
+        else:
+            state_batch = Variable(torch.FloatTensor(state_batch))
+            log_act_probs, value = self.policy_value_net(state_batch)
+            act_probs = np.exp(log_act_probs.data.numpy())
+            return act_probs, value.data.numpy()
+
+    def policy_value_fn(self, board):
+        """
+        input: board
+        output: a list of (action, probability) tuples for each available
+        action and the score of the board state
+        """
+        legal_positions = board.availables
+        current_state = np.ascontiguousarray(board.current_state().reshape(
+                -1, 4, self.board_width, self.board_height))
+        if self.bias:
+            current_state[0][1] = self.apply_normal_bias(current_state[0][1])
+
+        if self.use_gpu:
+            log_act_probs, value = self.policy_value_net(
+                Variable(torch.from_numpy(current_state)).to(self.device).float())
+            act_probs = np.exp(log_act_probs.data.cpu().numpy().flatten())
+        else:
+            log_act_probs, value = self.policy_value_net(
+                Variable(torch.from_numpy(current_state)).float())
+            act_probs = np.exp(log_act_probs.data.numpy().flatten())
+        act_probs = zip(legal_positions, act_probs[legal_positions])
+        value = value.data[0][0]
+        return act_probs, value
+
+    def train_step(self, state_batch, mcts_probs, winner_batch, lr):
+        """perform a training step"""
+
+        # self.use_gpu = True
+        # wrap in Variable
+        if self.use_gpu:
+            state_batch = Variable(torch.FloatTensor(state_batch).to(self.device))
+            mcts_probs = Variable(torch.FloatTensor(mcts_probs).to(self.device))
+            winner_batch = Variable(torch.FloatTensor(winner_batch).to(self.device))
+        else:
+            state_batch = Variable(torch.FloatTensor(state_batch))
+            mcts_probs = Variable(torch.FloatTensor(mcts_probs))
+            winner_batch = Variable(torch.FloatTensor(winner_batch))
+
+        # zero the parameter gradients
+        self.optimizer.zero_grad()
+        # set learning rate
+        set_learning_rate(self.optimizer, lr)
+
+        # forward
+        log_act_probs, value = self.policy_value_net(state_batch)
+        # define the loss = (z - v)^2 - pi^T * log(p) + c||theta||^2
+        # Note: the L2 penalty is incorporated in optimizer
+        value_loss = F.mse_loss(value.view(-1), winner_batch)
+        policy_loss = -torch.mean(torch.sum(mcts_probs * log_act_probs, 1))
+        loss = value_loss + policy_loss
+        # backward and optimize
+        loss.backward()
+        self.optimizer.step()
+        # calc policy entropy, for monitoring only
+        entropy = -torch.mean(
+            torch.sum(torch.exp(log_act_probs) * log_act_probs, 1)
+        )
+
+        # for pytorch version >= 0.5 please use the following line instead.
+        return loss.item(), entropy.item()
+
+    def get_policy_param(self):
+        net_params = self.policy_value_net.state_dict()
+        return net_params
+
+    def save_model(self, model_file):
+        """ save model params to file """
+        net_params = self.get_policy_param()  # get model params
+        torch.save(net_params, model_file)
+
+
+if __name__ == "__main__":
+    import torch
+    import torch.onnx
+
+    # assumes the Net model above has been defined
+    model = Net(board_width=9, board_height=9)  # initialize the model with appropriate arguments
+    dummy_input = torch.randn(1, 4, 9, 9)  # create an example input
+
+    # export the model to ONNX format
+    torch.onnx.export(model, dummy_input, "model.onnx", verbose=True)
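infer_board_size_from_model works because act_fc1 maps 4 * W * H features to W * H logits, so the first dimension of act_fc1.weight is W * H and its square root recovers a square board size. A small illustrative check (the import path is an assumption about how this commit's package is laid out; everything else follows from the Net definition above):

from Gomoku_MCTS.policy_value_net_pytorch_new import Net  # assumed import path

# For a square board of size s, act_fc1.weight has shape [s * s, 4 * s * s].
state_dict = Net(board_width=9, board_height=9).state_dict()
out_features, in_features = state_dict['act_fc1.weight'].shape
board_size = int(out_features ** 0.5)
assert board_size == 9 and in_features == 4 * board_size * board_size
print(f"inferred board size: {board_size}x{board_size}")  # -> 9x9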
const.py
CHANGED
@@ -9,7 +9,7 @@ import numpy as np

 _AI_AID_INFO = ["Use AI Aid", "Close AI Aid"]

-_BOARD_SIZE =
+_BOARD_SIZE = 9
 _BOARD_SIZE_1D = _BOARD_SIZE * _BOARD_SIZE
 _BLANK = 0
 _BLACK = 1
@@ -68,3 +68,10 @@ _ROOM_COLOR = {
     True: _BLACK,
     False: _WHITE,
 }
+
+
+_MODEL_PATH = {
+    "AlphaZero": "/Users/husky/GomokuDemo/Gomoku_MCTS/checkpoint/2023-12-14-18-17-07_test_teaching_learning_collect_epochs=1000_size=9_model=normal/best_policy.model",
+    "duel": "/Users/husky/GomokuDemo/Gomoku_MCTS/checkpoint/2023-12-14-18-16-09_test_teaching_learning_collect_epochs=1000_size=9_model=duel/best_policy.model",
+    "Gumbel AlphaZero": "/Users/husky/GomokuDemo/Gomoku_MCTS/checkpoint/2023-12-14-21-19-40_selfplay_epochs=1000_size=9_model=gumbel/best_policy.model",
+}
pages/Player_VS_AI.py
CHANGED
@@ -15,13 +15,14 @@ import numpy as np
|
|
15 |
import streamlit as st
|
16 |
from scipy.signal import convolve # this is used to check if any player wins
|
17 |
from streamlit import session_state
|
|
|
18 |
from streamlit_server_state import server_state, server_state_lock
|
19 |
-
from Gomoku_MCTS import MCTSpure, alphazero, Board,
|
|
|
20 |
from Gomoku_Bot import Gomoku_bot
|
21 |
from Gomoku_Bot import Board as Gomoku_bot_board
|
22 |
-
import matplotlib.pyplot as plt
|
23 |
-
|
24 |
|
|
|
25 |
|
26 |
from const import (
|
27 |
_BLACK, # 1, for human
|
@@ -37,10 +38,10 @@ from const import (
|
|
37 |
_DIAGONAL_UP_RIGHT,
|
38 |
_BOARD_SIZE,
|
39 |
_BOARD_SIZE_1D,
|
40 |
-
_AI_AID_INFO
|
|
|
41 |
)
|
42 |
|
43 |
-
|
44 |
from ai import (
|
45 |
BOS_TOKEN_ID,
|
46 |
generate_gpt2,
|
@@ -63,14 +64,23 @@ class Room:
|
|
63 |
self.TIME = time.time()
|
64 |
self.gomoku_bot_board = Gomoku_bot_board(_BOARD_SIZE, 1)
|
65 |
self.MCTS_dict = {'Pure MCTS': MCTSpure(c_puct=5, n_playout=1000),
|
66 |
-
'AlphaZero': alphazero(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
'Gomoku Bot': Gomoku_bot(self.gomoku_bot_board, -1)}
|
68 |
self.MCTS = self.MCTS_dict['AlphaZero']
|
69 |
self.last_mcts = self.MCTS
|
70 |
self.AID_MCTS = self.MCTS_dict['AlphaZero']
|
71 |
self.COORDINATE_1D = [BOS_TOKEN_ID]
|
72 |
self.current_move = -1
|
73 |
-
self.
|
|
|
74 |
|
75 |
|
76 |
def change_turn(cur):
|
@@ -90,9 +100,9 @@ if "ROOMS" not in server_state:
|
|
90 |
with server_state_lock["ROOMS"]:
|
91 |
server_state.ROOMS = {}
|
92 |
|
|
|
93 |
def handle_oppo_model_selection():
|
94 |
if st.session_state['selected_oppo_model'] == 'Gomoku Bot':
|
95 |
-
session_state.ROOM.last_mcts = session_state.ROOM.MCTS # since use different mechanism, store previous mcts first
|
96 |
session_state.ROOM.MCTS = session_state.ROOM.MCTS_dict['Gomoku Bot']
|
97 |
return
|
98 |
else:
|
@@ -100,20 +110,22 @@ def handle_oppo_model_selection():
|
|
100 |
new_mct = session_state.ROOM.MCTS_dict[st.session_state['selected_oppo_model']]
|
101 |
new_mct.mcts._root = deepcopy(TreeNode)
|
102 |
session_state.ROOM.MCTS = new_mct
|
103 |
-
session_state.ROOM.last_mcts
|
104 |
return
|
105 |
|
|
|
106 |
def handle_aid_model_selection():
|
107 |
if st.session_state['selected_aid_model'] == 'None':
|
108 |
session_state.USE_AIAID = False
|
109 |
return
|
110 |
session_state.USE_AIAID = True
|
111 |
-
TreeNode = session_state.ROOM.MCTS.mcts._root
|
112 |
new_mct = session_state.ROOM.MCTS_dict[st.session_state['selected_aid_model']]
|
113 |
new_mct.mcts._root = deepcopy(TreeNode)
|
114 |
session_state.ROOM.AID_MCTS = new_mct
|
115 |
return
|
116 |
|
|
|
117 |
if 'selected_oppo_model' not in st.session_state:
|
118 |
st.session_state['selected_oppo_model'] = 'AlphaZero' # 默认值
|
119 |
|
@@ -125,7 +137,9 @@ TITLE = st.empty()
|
|
125 |
Model_Switch = st.empty()
|
126 |
|
127 |
TITLE.header("🤖 AI 3603 Gomoku")
|
128 |
-
selected_oppo_option = Model_Switch.selectbox('Select Opponent Model',
|
|
|
|
|
129 |
|
130 |
if st.session_state['selected_oppo_model'] != selected_oppo_option:
|
131 |
st.session_state['selected_oppo_model'] = selected_oppo_option
|
@@ -149,9 +163,11 @@ MULTIPLAYER_TAG = st.sidebar.empty()
|
|
149 |
with st.sidebar.container():
|
150 |
ANOTHER_ROUND = st.empty()
|
151 |
RESTART = st.empty()
|
|
|
152 |
AIAID = st.empty()
|
153 |
EXIT = st.empty()
|
154 |
-
selected_aid_option = AIAID.selectbox('Select Assistant Model', ['None', 'Pure MCTS', 'AlphaZero'], index=0,
|
|
|
155 |
if st.session_state['selected_aid_model'] != selected_aid_option:
|
156 |
st.session_state['selected_aid_model'] = selected_aid_option
|
157 |
handle_aid_model_selection()
|
@@ -174,7 +190,6 @@ GAME_INFO.markdown(
|
|
174 |
)
|
175 |
|
176 |
|
177 |
-
|
178 |
def restart() -> None:
|
179 |
"""
|
180 |
Restart the game.
|
@@ -182,12 +197,56 @@ def restart() -> None:
|
|
182 |
session_state.ROOM = Room(session_state.ROOM.ROOM_ID)
|
183 |
st.session_state['selected_oppo_model'] = 'AlphaZero'
|
184 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
185 |
RESTART.button(
|
186 |
"Reset",
|
187 |
on_click=restart,
|
188 |
help="Clear the board as well as the scores",
|
189 |
)
|
190 |
|
|
|
|
|
|
|
|
|
|
|
|
|
191 |
|
192 |
# Draw the board
|
193 |
def gomoku():
|
@@ -207,13 +266,24 @@ def gomoku():
|
|
207 |
session_state.ROOM.BOARD = Board(width=_BOARD_SIZE, height=_BOARD_SIZE, n_in_row=5)
|
208 |
session_state.ROOM.gomoku_bot_board = Gomoku_bot_board(_BOARD_SIZE, 1)
|
209 |
session_state.ROOM.MCTS_dict = {'Pure MCTS': MCTSpure(c_puct=5, n_playout=1000),
|
210 |
-
|
211 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
212 |
session_state.ROOM.MCTS = session_state.ROOM.MCTS_dict[st.session_state['selected_oppo_model']]
|
213 |
session_state.ROOM.last_mcts = session_state.ROOM.MCTS
|
214 |
session_state.ROOM.PLAYER = session_state.ROOM.PLAYER
|
215 |
session_state.ROOM.TURN = session_state.ROOM.PLAYER
|
216 |
session_state.ROOM.WINNER = _BLANK # 0
|
|
|
|
|
217 |
session_state.ROOM.COORDINATE_1D = [BOS_TOKEN_ID]
|
218 |
|
219 |
# Room status sync
|
@@ -310,7 +380,8 @@ def gomoku():
|
|
310 |
session_state.ROOM.current_move = move
|
311 |
session_state.ROOM.BOARD.do_move(move)
|
312 |
# Gomoku Bot BOARD
|
313 |
-
session_state.ROOM.MCTS_dict["Gomoku Bot"].board.put(move // _BOARD_SIZE
|
|
|
314 |
session_state.ROOM.BOARD.board_map[x][y] = session_state.ROOM.TURN
|
315 |
session_state.ROOM.COORDINATE_1D.append(x * _BOARD_SIZE + y)
|
316 |
|
@@ -356,7 +427,7 @@ def gomoku():
|
|
356 |
_PLAYER_SYMBOL[_NEW],
|
357 |
key=f"{i}:{j}",
|
358 |
args=(i, j),
|
359 |
-
on_click=
|
360 |
)
|
361 |
else:
|
362 |
# disable click for GPT choices
|
@@ -424,7 +495,7 @@ def gomoku():
|
|
424 |
move, simul_time = session_state.ROOM.MCTS.get_action(session_state.ROOM.BOARD, return_time=True)
|
425 |
else:
|
426 |
move, simul_time = session_state.ROOM.MCTS.get_action(return_time=True)
|
427 |
-
session_state.ROOM.
|
428 |
print("AI takes move: ", move)
|
429 |
session_state.ROOM.current_move = move
|
430 |
gpt_response = move
|
@@ -436,7 +507,8 @@ def gomoku():
|
|
436 |
# MCTS BOARD
|
437 |
session_state.ROOM.BOARD.do_move(move)
|
438 |
# Gomoku Bot BOARD
|
439 |
-
session_state.ROOM.MCTS_dict["Gomoku Bot"].board.put(
|
|
|
440 |
# session_state.ROOM.BOARD[gpt_i][gpt_j] = session_state.ROOM.TURN
|
441 |
session_state.ROOM.COORDINATE_1D.append(gpt_i * _BOARD_SIZE + gpt_j)
|
442 |
|
@@ -475,7 +547,8 @@ def gomoku():
|
|
475 |
on_click=forbid_click
|
476 |
)
|
477 |
else:
|
478 |
-
if session_state.USE_AIAID and i * _BOARD_SIZE + j in top_five_acts and not
|
|
|
479 |
# enable click for other cells available for human choices
|
480 |
prob = top_five_probs[top_five_acts.index(i * _BOARD_SIZE + j)]
|
481 |
BOARD_PLATE[i][j].button(
|
@@ -493,7 +566,6 @@ def gomoku():
|
|
493 |
args=(i, j),
|
494 |
)
|
495 |
|
496 |
-
|
497 |
message.markdown(
|
498 |
'AI agent has calculated its strategy, which takes <span style="color: blue; font-size: 20px;">{:.3e}</span>s per simulation.'.format(
|
499 |
simul_time),
|
@@ -533,6 +605,7 @@ def gomoku():
|
|
533 |
else:
|
534 |
draw_board(True)
|
535 |
if session_state.ROOM.WINNER != _BLANK or 0 not in session_state.ROOM.BOARD.board_map:
|
|
|
536 |
ANOTHER_ROUND.button(
|
537 |
"Play Next round!",
|
538 |
on_click=another_round,
|
@@ -560,13 +633,12 @@ def gomoku():
|
|
560 |
# draw the plot for simulation time
|
561 |
# 创建一个 DataFrame
|
562 |
|
563 |
-
# print(session_state.ROOM.
|
564 |
st.markdown("<br>", unsafe_allow_html=True)
|
565 |
st.markdown("<br>", unsafe_allow_html=True)
|
566 |
-
chart_data = pd.DataFrame(session_state.ROOM.
|
567 |
st.line_chart(chart_data)
|
568 |
|
569 |
-
|
570 |
game_control()
|
571 |
update_info()
|
572 |
|
|
|
15 |
import streamlit as st
|
16 |
from scipy.signal import convolve # this is used to check if any player wins
|
17 |
from streamlit import session_state
|
18 |
+from streamlit.delta_generator import DeltaGenerator
|
19 |
from streamlit_server_state import server_state, server_state_lock
|
20 |
+from Gomoku_MCTS import MCTSpure, alphazero, Board, PolicyValueNet_old, PolicyValueNet_new, duel_PolicyValueNet, \
|
21 |
+Gumbel_MCTSPlayer
|
22 |
from Gomoku_Bot import Gomoku_bot
|
23 |
from Gomoku_Bot import Board as Gomoku_bot_board
|
|
|
|
|
24 |
|
25 |
+import matplotlib.pyplot as plt
|
26 |
|
27 |
from const import (
|
28 |
_BLACK, # 1, for human
|
|
|
38 |
_DIAGONAL_UP_RIGHT,
|
39 |
_BOARD_SIZE,
|
40 |
_BOARD_SIZE_1D,
|
41 |
+_AI_AID_INFO,
|
42 |
+_MODEL_PATH
|
43 |
)
|
44 |
|
|
|
45 |
from ai import (
|
46 |
BOS_TOKEN_ID,
|
47 |
generate_gpt2,
|
|
|
64 |
self.TIME = time.time()
|
65 |
self.gomoku_bot_board = Gomoku_bot_board(_BOARD_SIZE, 1)
|
66 |
self.MCTS_dict = {'Pure MCTS': MCTSpure(c_puct=5, n_playout=1000),
|
67 |
+'AlphaZero': alphazero(PolicyValueNet_new(_BOARD_SIZE, _BOARD_SIZE,
|
68 |
+_MODEL_PATH["AlphaZero"]).policy_value_fn,
|
69 |
+c_puct=5, n_playout=100),
|
70 |
+'duel': alphazero(duel_PolicyValueNet(_BOARD_SIZE, _BOARD_SIZE,
|
71 |
+_MODEL_PATH["duel"]).policy_value_fn,
|
72 |
+c_puct=5, n_playout=100),
|
73 |
+'Gumbel AlphaZero': Gumbel_MCTSPlayer(PolicyValueNet_new(_BOARD_SIZE, _BOARD_SIZE,
|
74 |
+_MODEL_PATH["Gumbel AlphaZero"]).policy_value_fn,
|
75 |
+c_puct=5, n_playout=100, m_action=8),
|
76 |
'Gomoku Bot': Gomoku_bot(self.gomoku_bot_board, -1)}
|
77 |
self.MCTS = self.MCTS_dict['AlphaZero']
|
78 |
self.last_mcts = self.MCTS
|
79 |
self.AID_MCTS = self.MCTS_dict['AlphaZero']
|
80 |
self.COORDINATE_1D = [BOS_TOKEN_ID]
|
81 |
self.current_move = -1
|
82 |
+self.ai_simula_time_list = []
|
83 |
+self.human_simula_time_list = []
|
84 |
|
85 |
|
86 |
def change_turn(cur):
|
|
|
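Note on the Room constructor shown above: every selectable opponent is registered once in MCTS_dict, keyed by the exact label offered in the opponent selectbox, so switching models later is a plain dictionary lookup rather than re-building players. A minimal sketch of that registry pattern with hypothetical stand-in players (RandomPlayer and FirstFreePlayer are not part of the app):

import random

class RandomPlayer:
    """Hypothetical stand-in for the MCTS-based players."""
    def get_action(self, availables):
        return random.choice(availables)

class FirstFreePlayer:
    """Hypothetical stand-in: always plays the lowest free cell."""
    def get_action(self, availables):
        return min(availables)

# label -> player object, mirroring session_state.ROOM.MCTS_dict
PLAYER_REGISTRY = {
    'Random': RandomPlayer(),
    'First Free': FirstFreePlayer(),
}

selected_label = 'Random'                 # in the app this comes from st.selectbox
opponent = PLAYER_REGISTRY[selected_label]
print(opponent.get_action([3, 7, 11]))    # e.g. 7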
100 |
with server_state_lock["ROOMS"]:
|
101 |
server_state.ROOMS = {}
|
102 |
|
103 |
+
|
104 |
def handle_oppo_model_selection():
|
105 |
if st.session_state['selected_oppo_model'] == 'Gomoku Bot':
|
|
|
106 |
session_state.ROOM.MCTS = session_state.ROOM.MCTS_dict['Gomoku Bot']
|
107 |
return
|
108 |
else:
|
|
|
110 |
new_mct = session_state.ROOM.MCTS_dict[st.session_state['selected_oppo_model']]
|
111 |
new_mct.mcts._root = deepcopy(TreeNode)
|
112 |
session_state.ROOM.MCTS = new_mct
|
113 |
+session_state.ROOM.last_mcts = new_mct
|
114 |
return
|
115 |
|
116 |
+
|
117 |
def handle_aid_model_selection():
|
118 |
if st.session_state['selected_aid_model'] == 'None':
|
119 |
session_state.USE_AIAID = False
|
120 |
return
|
121 |
session_state.USE_AIAID = True
|
122 |
+TreeNode = session_state.ROOM.MCTS.mcts._root # use the same tree node
|
123 |
new_mct = session_state.ROOM.MCTS_dict[st.session_state['selected_aid_model']]
|
124 |
new_mct.mcts._root = deepcopy(TreeNode)
|
125 |
session_state.ROOM.AID_MCTS = new_mct
|
126 |
return
|
127 |
|
128 |
+
|
129 |
if 'selected_oppo_model' not in st.session_state:
|
130 |
st.session_state['selected_oppo_model'] = 'AlphaZero' # default value
|
131 |
|
|
|
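Note on handle_aid_model_selection above: the assistant player receives a deep copy of the current opponent's MCTS root, so the visit statistics gathered so far are reused rather than discarded. A toy sketch of that hand-off under assumed names (Node and Searcher are illustrative, not the real TreeNode/MCTS classes):

from copy import deepcopy

class Node:
    """Toy stand-in for an MCTS tree node."""
    def __init__(self, n_visits=0):
        self.n_visits = n_visits
        self.children = {}

class Searcher:
    """Toy stand-in for the .mcts object carried by each player."""
    def __init__(self):
        self._root = Node()

old_player = Searcher()
old_player._root.n_visits = 120                    # statistics accumulated so far

new_player = Searcher()
new_player._root = deepcopy(old_player._root)      # same hand-off as new_mct.mcts._root = deepcopy(TreeNode)

assert new_player._root.n_visits == 120
new_player._root.n_visits += 1                     # further search no longer touches old_player's tree
assert old_player._root.n_visits == 120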
137 |
Model_Switch = st.empty()
|
138 |
|
139 |
TITLE.header("🤖 AI 3603 Gomoku")
|
140 |
+selected_oppo_option = Model_Switch.selectbox('Select Opponent Model',
|
141 |
+['Pure MCTS', 'AlphaZero', 'Gomoku Bot', 'duel', 'Gumbel AlphaZero'],
|
142 |
+index=1, key='oppo_model')
|
143 |
|
144 |
if st.session_state['selected_oppo_model'] != selected_oppo_option:
|
145 |
st.session_state['selected_oppo_model'] = selected_oppo_option
|
|
|
163 |
with st.sidebar.container():
|
164 |
ANOTHER_ROUND = st.empty()
|
165 |
RESTART = st.empty()
|
166 |
+GIVEIN = st.empty()
|
167 |
AIAID = st.empty()
|
168 |
EXIT = st.empty()
|
169 |
+selected_aid_option = AIAID.selectbox('Select Assistant Model', ['None', 'Pure MCTS', 'AlphaZero'], index=0,
|
170 |
+key='aid_model')
|
171 |
if st.session_state['selected_aid_model'] != selected_aid_option:
|
172 |
st.session_state['selected_aid_model'] = selected_aid_option
|
173 |
handle_aid_model_selection()
|
|
|
190 |
)
|
191 |
|
192 |
|
|
|
193 |
def restart() -> None:
|
194 |
"""
|
195 |
Restart the game.
|
|
|
197 |
session_state.ROOM = Room(session_state.ROOM.ROOM_ID)
|
198 |
st.session_state['selected_oppo_model'] = 'AlphaZero'
|
199 |
|
200 |
+def givein() -> None:
|
201 |
+"""
|
202 |
+Give in to AI.
|
203 |
+"""
|
204 |
+session_state.ROOM = deepcopy(session_state.ROOM)
|
205 |
+session_state.ROOM.WINNER = _WHITE
|
206 |
+# add 1 score to AI
|
207 |
+session_state.ROOM.HISTORY = (
|
208 |
+session_state.ROOM.HISTORY[0]
|
209 |
++ int(session_state.ROOM.WINNER == _WHITE),
|
210 |
+session_state.ROOM.HISTORY[1]
|
211 |
++ int(session_state.ROOM.WINNER == _BLACK),
|
212 |
+)
|
213 |
+session_state.ROOM.BOARD = Board(width=_BOARD_SIZE, height=_BOARD_SIZE, n_in_row=5)
|
214 |
+session_state.ROOM.gomoku_bot_board = Gomoku_bot_board(_BOARD_SIZE, 1)
|
215 |
+session_state.ROOM.MCTS_dict = {'Pure MCTS': MCTSpure(c_puct=5, n_playout=1000),
|
216 |
+'AlphaZero': alphazero(PolicyValueNet_new(_BOARD_SIZE, _BOARD_SIZE,
|
217 |
+_MODEL_PATH["AlphaZero"]).policy_value_fn,
|
218 |
+c_puct=5, n_playout=100),
|
219 |
+'duel': alphazero(duel_PolicyValueNet(_BOARD_SIZE, _BOARD_SIZE,
|
220 |
+_MODEL_PATH["duel"]).policy_value_fn,
|
221 |
+c_puct=5, n_playout=100),
|
222 |
+'Gumbel AlphaZero': Gumbel_MCTSPlayer(PolicyValueNet_new(_BOARD_SIZE, _BOARD_SIZE,
|
223 |
+_MODEL_PATH[
|
224 |
+"Gumbel AlphaZero"]).policy_value_fn,
|
225 |
+c_puct=5, n_playout=100, m_action=8),
|
226 |
+'Gomoku Bot': Gomoku_bot(session_state.ROOM.gomoku_bot_board, -1)}
|
227 |
+
|
228 |
+session_state.ROOM.MCTS = session_state.ROOM.MCTS_dict[st.session_state['selected_oppo_model']]
|
229 |
+session_state.ROOM.last_mcts = session_state.ROOM.MCTS
|
230 |
+session_state.ROOM.PLAYER = session_state.ROOM.PLAYER
|
231 |
+session_state.ROOM.TURN = session_state.ROOM.PLAYER
|
232 |
+session_state.ROOM.WINNER = _BLANK # 0
|
233 |
+session_state.ROOM.ai_simula_time_list = []
|
234 |
+session_state.ROOM.human_simula_time_list = []
|
235 |
+session_state.ROOM.COORDINATE_1D = [BOS_TOKEN_ID]
|
236 |
+
|
237 |
+
|
238 |
RESTART.button(
|
239 |
"Reset",
|
240 |
on_click=restart,
|
241 |
help="Clear the board as well as the scores",
|
242 |
)
|
243 |
|
244 |
+GIVEIN.button(
|
245 |
+"Give in",
|
246 |
+on_click = givein,
|
247 |
+help="Give in to AI",
|
248 |
+)
|
249 |
+
|
250 |
|
251 |
# Draw the board
|
252 |
def gomoku():
|
|
|
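Note on the new givein() callback above: it awards the game to the AI (white) and then bumps the running score, where exactly one of the two indicator terms evaluates to 1. A small worked sketch of that tuple update with made-up numbers (the value of _WHITE is assumed here; only _BLACK = 1, the human player, is stated in the imports):

_BLACK, _WHITE = 1, 2          # assumed encoding for this sketch

HISTORY = (3, 5)               # illustrative running score
WINNER = _WHITE                # givein() always awards the game to the AI

HISTORY = (
    HISTORY[0] + int(WINNER == _WHITE),   # 3 + 1 -> 4
    HISTORY[1] + int(WINNER == _BLACK),   # 5 + 0 -> 5
)
print(HISTORY)  # (4, 5)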
266 |
session_state.ROOM.BOARD = Board(width=_BOARD_SIZE, height=_BOARD_SIZE, n_in_row=5)
|
267 |
session_state.ROOM.gomoku_bot_board = Gomoku_bot_board(_BOARD_SIZE, 1)
|
268 |
session_state.ROOM.MCTS_dict = {'Pure MCTS': MCTSpure(c_puct=5, n_playout=1000),
|
269 |
+'AlphaZero': alphazero(PolicyValueNet_new(_BOARD_SIZE, _BOARD_SIZE,
|
270 |
+_MODEL_PATH["AlphaZero"]).policy_value_fn,
|
271 |
+c_puct=5, n_playout=100),
|
272 |
+'duel': alphazero(duel_PolicyValueNet(_BOARD_SIZE, _BOARD_SIZE,
|
273 |
+_MODEL_PATH["duel"]).policy_value_fn,
|
274 |
+c_puct=5, n_playout=100),
|
275 |
+'Gumbel AlphaZero': Gumbel_MCTSPlayer(PolicyValueNet_new(_BOARD_SIZE, _BOARD_SIZE,
|
276 |
+_MODEL_PATH["Gumbel AlphaZero"]).policy_value_fn,
|
277 |
+c_puct=5, n_playout=100, m_action=8),
|
278 |
+'Gomoku Bot': Gomoku_bot(session_state.ROOM.gomoku_bot_board, -1)}
|
279 |
+
|
280 |
session_state.ROOM.MCTS = session_state.ROOM.MCTS_dict[st.session_state['selected_oppo_model']]
|
281 |
session_state.ROOM.last_mcts = session_state.ROOM.MCTS
|
282 |
session_state.ROOM.PLAYER = session_state.ROOM.PLAYER
|
283 |
session_state.ROOM.TURN = session_state.ROOM.PLAYER
|
284 |
session_state.ROOM.WINNER = _BLANK # 0
|
285 |
+session_state.ROOM.ai_simula_time_list = []
|
286 |
+session_state.ROOM.human_simula_time_list = []
|
287 |
session_state.ROOM.COORDINATE_1D = [BOS_TOKEN_ID]
|
288 |
|
289 |
# Room status sync
|
|
|
380 |
session_state.ROOM.current_move = move
|
381 |
session_state.ROOM.BOARD.do_move(move)
|
382 |
# Gomoku Bot BOARD
|
383 |
+session_state.ROOM.MCTS_dict["Gomoku Bot"].board.put(_BOARD_SIZE - move // _BOARD_SIZE - 1,
|
384 |
+move % _BOARD_SIZE)  # the bot board is indexed from the top-left corner (0,0), while the game's moves are indexed from the bottom-left corner (0,0)
|
385 |
session_state.ROOM.BOARD.board_map[x][y] = session_state.ROOM.TURN
|
386 |
session_state.ROOM.COORDINATE_1D.append(x * _BOARD_SIZE + y)
|
387 |
|
|
|
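Note on the two added board.put(...) arguments above: they convert the MCTS board's flat move index into the Gomoku bot board's (row, col), flipping the row because the bot board is indexed from the top-left corner while the game's moves are indexed from the bottom-left. A minimal sketch of that conversion (board size 9 is only an example):

_BOARD_SIZE = 9   # example size; the app reads it from const

def to_bot_coords(move, size=_BOARD_SIZE):
    # flat MCTS move (rows counted from the bottom) -> bot board (row, col) counted from the top
    row_from_bottom, col = divmod(move, size)
    return size - 1 - row_from_bottom, col

assert to_bot_coords(0) == (8, 0)          # bottom-left cell lands on the bot board's last row
assert to_bot_coords(9 * 9 - 1) == (0, 8)  # top-right cell lands on the bot board's first row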
427 |
_PLAYER_SYMBOL[_NEW],
|
428 |
key=f"{i}:{j}",
|
429 |
args=(i, j),
|
430 |
+on_click=forbid_click,
|
431 |
)
|
432 |
else:
|
433 |
# disable click for GPT choices
|
|
|
495 |
move, simul_time = session_state.ROOM.MCTS.get_action(session_state.ROOM.BOARD, return_time=True)
|
496 |
else:
|
497 |
move, simul_time = session_state.ROOM.MCTS.get_action(return_time=True)
|
498 |
+session_state.ROOM.ai_simula_time_list.append(simul_time)
|
499 |
print("AI takes move: ", move)
|
500 |
session_state.ROOM.current_move = move
|
501 |
gpt_response = move
|
|
|
507 |
# MCTS BOARD
|
508 |
session_state.ROOM.BOARD.do_move(move)
|
509 |
# Gomoku Bot BOARD
|
510 |
+session_state.ROOM.MCTS_dict["Gomoku Bot"].board.put(_BOARD_SIZE - 1 - move // _BOARD_SIZE,
|
511 |
+move % _BOARD_SIZE)
|
512 |
# session_state.ROOM.BOARD[gpt_i][gpt_j] = session_state.ROOM.TURN
|
513 |
session_state.ROOM.COORDINATE_1D.append(gpt_i * _BOARD_SIZE + gpt_j)
|
514 |
|
|
|
547 |
on_click=forbid_click
|
548 |
)
|
549 |
else:
|
550 |
+if session_state.USE_AIAID and i * _BOARD_SIZE + j in top_five_acts and not \
|
551 |
+session_state.ROOM.BOARD.game_end()[0]:
|
552 |
# enable click for other cells available for human choices
|
553 |
prob = top_five_probs[top_five_acts.index(i * _BOARD_SIZE + j)]
|
554 |
BOARD_PLATE[i][j].button(
|
|
|
566 |
args=(i, j),
|
567 |
)
|
568 |
|
|
|
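Note on the AI-aid branch above: a cell is highlighted only if its flat index appears in top_five_acts, and the matching entry of top_five_probs is shown on the button. How those lists are computed is outside this hunk; the following is a hypothetical numpy sketch of picking the five most probable moves from a policy vector (names, shapes, and the random policy are assumptions, not the app's code):

import numpy as np

_BOARD_SIZE = 9                                    # example size
rng = np.random.default_rng(0)

act_probs = rng.random(_BOARD_SIZE * _BOARD_SIZE)  # hypothetical policy output, one value per cell
act_probs /= act_probs.sum()

order = np.argsort(act_probs)[::-1][:5]            # indices of the five largest probabilities
top_five_acts = order.tolist()
top_five_probs = act_probs[order].tolist()

cell = top_five_acts[0]
prob = top_five_probs[top_five_acts.index(cell)]   # same lookup as in the button label above
print(cell, round(prob, 3))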
569 |
message.markdown(
|
570 |
'AI agent has calculated its strategy, which takes <span style="color: blue; font-size: 20px;">{:.3e}</span>s per simulation.'.format(
|
571 |
simul_time),
|
|
|
605 |
else:
|
606 |
draw_board(True)
|
607 |
if session_state.ROOM.WINNER != _BLANK or 0 not in session_state.ROOM.BOARD.board_map:
|
608 |
+GIVEIN.empty()
|
609 |
ANOTHER_ROUND.button(
|
610 |
"Play Next round!",
|
611 |
on_click=another_round,
|
|
|
633 |
# draw the plot for simulation time
|
634 |
# create a DataFrame
|
635 |
|
636 |
+# print(session_state.ROOM.ai_simula_time_list)
|
637 |
st.markdown("<br>", unsafe_allow_html=True)
|
638 |
st.markdown("<br>", unsafe_allow_html=True)
|
639 |
+chart_data = pd.DataFrame(session_state.ROOM.ai_simula_time_list, columns=["Simulation Time"])
|
640 |
st.line_chart(chart_data)
|
641 |
|
|
|
642 |
game_control()
|
643 |
update_info()
|
644 |
|
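Note on the closing lines above: the per-move timings collected in ai_simula_time_list are wrapped in a pandas DataFrame and rendered with st.line_chart. A standalone sketch with synthetic timings (the numbers are invented; in the app the list is appended to after each get_action call):

import pandas as pd
import streamlit as st

ai_simula_time_list = [0.012, 0.011, 0.015, 0.013, 0.018]   # made-up per-move timings in seconds

chart_data = pd.DataFrame(ai_simula_time_list, columns=["Simulation Time"])
st.line_chart(chart_data)   # one point per AI move, as on the page above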