rikirolly / expert-iteration

Modular reimplementation of the expert iteration method described in the AlphaGo Zero paper.

expert-iteration

Library for implementing learning game AIs based on the expert iteration method described in the AlphaGo Zero paper, with planned generalizations beyond competitive, two-player, perfect-information games. It works with any ML library and game engine that has Python bindings.
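
At a high level, expert iteration alternates an expert step, in which tree search guided by the current network generates self-play data, with an apprentice step, in which the network is trained to imitate the search's move probabilities and to predict the game outcome. The loop below is only a conceptual sketch of that cycle, not the library's actual entry point; it assumes a model object with the interface of the Model class shown in the example further down, and self_play_with_mcts and beats_current_best are hypothetical helpers.

# Conceptual sketch of the expert iteration loop; self_play_with_mcts and
# beats_current_best are hypothetical helpers, not part of the library's API.
def expert_iteration(m, n_iterations):
    for _ in range(n_iterations):
        # Expert step: MCTS guided by m.eval_states() plays games against
        # itself and records (state, search probabilities, reward) tuples.
        games = self_play_with_mcts(m)
        m.data, _ = m._parse_data(games)
        # Apprentice step: fit the training network to the search output.
        m.train()
        # Gate: keep the new weights only if they beat the current best player.
        if beats_current_best(m):
            m.new_checkpoint()
        else:
            m.restore_checkpoint()

The Model class in the example below supplies this interface: eval_states for the search expert, train for the apprentice, and the checkpoint methods for gating which network generates the next round of self-play data.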

For example, here is a client implementation, using Keras and the python-chess library, of an architecture similar to the one described in the paper:

import os
from typing import Any, Dict, List, Tuple

import chess
import keras
import numpy as np
from keras import layers, regularizers

# `game`, `model`, `State`, `softmax` and `unzip` come from the
# expert-iteration library and the client's game definition (imports omitted).
_num_piece_types = 6
_board_dim = 8
_input_filters = 2 * _num_piece_types + 1
_input_shape = (_board_dim, _board_dim, _input_filters)
_regularizer = regularizers.l2(0.001)
_depth = 64
_conv_layer = lambda: layers.Conv2D(_depth, 3, padding='same', kernel_regularizer=_regularizer)
_norm_layer = lambda: layers.BatchNormalization()
_activation_layer = lambda: layers.Activation('relu')
_residual_tower_height = 5

def _keras_model():
    board = layers.Input(shape=_input_shape)
    conv = _conv_layer()(board)
    norm = _norm_layer()(conv)
    bottom = _activation_layer()(norm)
    # Residual tower: each block is conv -> norm -> relu -> conv -> norm,
    # with a skip connection around the block.
    for i in range(_residual_tower_height):
        conv = _conv_layer()(bottom)
        norm = _norm_layer()(conv)
        mid = _activation_layer()(norm)
        conv = _conv_layer()(mid)
        norm = _norm_layer()(conv)
        add = layers.Add()([bottom, norm])
        bottom = _activation_layer()(add)

    # Policy head: logits over all encoded actions.
    policy = layers.Conv2D(2, 1, padding='same', kernel_regularizer=_regularizer)(bottom)
    policy = _norm_layer()(policy)
    policy = _activation_layer()(policy)
    policy = layers.Flatten()(policy)
    policy = layers.Dense(game.num_actions, kernel_regularizer=_regularizer)(policy)

    # Value head: scalar evaluation of the position in [-1, 1].
    value = layers.Conv2D(1, 1, padding='same', kernel_regularizer=_regularizer)(bottom)
    value = _norm_layer()(value)
    value = _activation_layer()(value)
    value = layers.Flatten()(value)
    value = layers.Dense(_depth, kernel_regularizer=_regularizer)(value)
    value = _activation_layer()(value)
    value = layers.Dense(1, activation='tanh')(value)

    model = keras.models.Model(inputs=board, outputs=[policy, value])
    # Joint loss: cross-entropy on the policy head, MSE on the value head.
    model.compile(optimizer='adagrad',
                  loss=['categorical_crossentropy', 'mean_squared_error'],
                  loss_weights=[1.0, 1.0])

    return model

# Keys into the self-play data buffers.
_boards = 'boards'
_probs = 'probs'
_rewards = 'rewards'


class Model(model.Supervised[np.ndarray]):
    def __init__(self, opts: model.SupervisedOpts = model._default_supervised_opts) -> None:
        super().__init__(game, opts=opts)
        checkpoint_folder = os.path.join('.', 'checkpoints')
        if not os.path.exists(checkpoint_folder):
            os.makedirs(checkpoint_folder)
        self.checkpoint_file = os.path.join(checkpoint_folder, 'chess-weights')

        # Buffers of self-play training data, keyed by the constants above.
        self.data = {
            _boards: np.empty((0,) + _input_shape),
            _probs: np.empty((0, game.num_actions)),
            _rewards: np.empty((0,))
        }

        # Two copies of the network: the current best, which drives self-play,
        # and the one currently being trained.
        self.best_model = _keras_model()
        self.train_model = _keras_model()
        self.train_model.save_weights(self.checkpoint_file)
        self.best_model.load_weights(self.checkpoint_file)

    def eval_states(self,
                    states: List[State[np.ndarray]],
                    using=model.Model.PARAMS_BEST) -> Tuple[np.ndarray, np.ndarray]:
        # Evaluate a batch of positions with the requested network and return
        # move probabilities (masked to the legal actions) and value estimates.
        net = self.best_model if using == Model.PARAMS_BEST else self.train_model
        actions, values = net.predict(self._parse_states(states), batch_size=self.opts.batch_size)
        poss_actions = self.game.valid_actionses(states)
        actions = softmax(actions, poss_actions)

        return (actions, values)

    def train(self):
        # Fit the training network to the accumulated search probabilities and rewards.
        self.train_model.fit(x=self.data[_boards],
                             y=[self.data[_probs], self.data[_rewards]],
                             batch_size=self.opts.batch_size)

    def new_checkpoint(self):
        # Promote the trained weights: save them and load them into the best model.
        self.train_model.save_weights(self.checkpoint_file)
        self.best_model.load_weights(self.checkpoint_file)

    def restore_checkpoint(self):
        # Discard the latest training and revert to the last saved weights.
        self.train_model.load_weights(self.checkpoint_file)

    def _parse_states(self, states: List[State[np.ndarray]]) -> np.ndarray:
        # Encode each position as an 8x8x13 tensor: one plane per
        # (piece type, colour) pair, plus a final plane set when white is to move.
        boards = np.zeros((len(states),) + _input_shape, dtype=float)
        for i, state in enumerate(states):
            for square, piece in state.board_state.piece_map().items():
                x, y = np.unravel_index(square, (_board_dim, _board_dim))
                val = piece.piece_type - 1 + _num_piece_types * int(piece.color)
                boards[i, x, y, val] = 1.0
            if state.board_state.turn == chess.WHITE:
                boards[i, :, :, -1] = 1.0

        return boards

    def _parse_data(self, data: List[List[Tuple[State[np.ndarray], np.ndarray, np.ndarray]]]) -> Tuple[Dict[Any, np.ndarray], int]:
        # Flatten the per-game trajectories into a single list of
        # (state, search probabilities, reward) tuples.
        states, probs, rewards = unzip(sum(data, []))
        parsed = {
            _boards: self._parse_states(states),
            _probs: np.array(probs),
            # Rewards arrive as per-player vectors; a single player's entry is stored.
            _rewards: np.array([reward[1] for reward in rewards])
        }
        return (parsed, len(states))
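
As a quick sanity check of the network's shapes (a sketch only, assuming the client's game object with its num_actions attribute is available so that _keras_model() can be built), one can push a zero-filled board tensor through it:

# Hypothetical smoke test for the network defined above.
net = _keras_model()
dummy_batch = np.zeros((1,) + _input_shape)  # a single all-zero 8x8x13 board
policy_logits, value = net.predict(dummy_batch)
# policy_logits.shape == (1, game.num_actions); value.shape == (1, 1),
# with value in [-1, 1] from the tanh output activation.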

License: GNU General Public License v3.0