I managed to secure a little free time, so I'm finally trying the reinforcement learning I've wanted to do for ages. My long-term goal is to use reinforcement learning to build the strongest possible Super Smash Bros. 64 AI and have it as a practice partner, but jumping straight into that would be far too hard, so I'll start by building an AI for a simpler game.
This time the game is gravity four-in-a-row, apparently sold in the US under the name Connect Four.
It's a simple head-to-head game: on a board of around 5x7 or 6x7 cells, players drop their pieces in turn, and the first to line up four of them vertically, horizontally, or diagonally wins. Think of it as an extension of tic-tac-toe. The difference is the notion of "gravity": pieces stack up from the bottom, so you cannot float a piece in mid-air.

There are versions you can play on the web, so a quick game will make the rules clear.
http://www.gamedesign.jp/flash/balls/balls_jp.html
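To make the gravity rule concrete before diving into the implementation, here is a minimal standalone sketch (not code from the repo) of which cells of a flattened 5x5 board are playable: a cell must be empty, and must either be on the bottom row or sit directly above an occupied cell.

COLS = 5

def legal_cells(board):
    # board: flat list of 25 cells, 0 = empty.
    return [i for i, cell in enumerate(board)
            if cell == 0 and (i + COLS >= len(board) or board[i + COLS] != 0)]

print(legal_cells([0] * 25))  # empty board -> only the bottom row: [20, 21, 22, 23, 24]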
Implementing Connect Four
To keep the training time manageable, I implemented it on a 5x5 board this time. The implementation is largely based on the tic-tac-toe code from this article:
ChainerでDQN。強化学習を三目並べでいろいろ試してみた。(Deep Q Network、Q-Learning、モンテカルロ)
The source code I wrote for this post is here:
https://github.com/harada4atsushi/connect_for
board.py (the board implementation)
from more_itertools import chunked

EMPTY = 0
PLAYER_X = 1
PLAYER_O = -1
RED = '\033[91m'
ENDC = '\033[0m'
MARKS = {PLAYER_X: RED + "X" + ENDC, PLAYER_O: "O", EMPTY: " "}
DRAW = 2
COL_NUM = 5
ROW_NUM = 5
ALL_POS_COUNT = 25
class Board:
    def __init__(self, board=None):
        if board is None:
            self.board = []
            for i in range(ALL_POS_COUNT):
                self.board.append(EMPTY)
        else:
            self.board = board
        self.winner = None

    def get_possible_pos(self):
        pos = []
        append = pos.append
        for i in range(ALL_POS_COUNT):
            if self.board[i] == EMPTY:
                # Gravity: a cell is playable only if the cell below it is
                # occupied (or the cell is on the bottom row).
                below_i = i + COL_NUM
                if below_i >= ALL_POS_COUNT or self.board[below_i] != EMPTY:
                    append(i)
        return pos
    def print_board(self):
        tempboard = []
        for i in self.board:
            tempboard.append(MARKS[i])
        row = ' {} | {} | {} | {} | {} '
        hr = '\n-------------------\n'
        matrix_str = ''
        for i in range(ROW_NUM):
            matrix_str += row
            if i != ROW_NUM - 1:
                matrix_str += hr
        print(matrix_str.format(*tempboard))
    def check_winner(self, pos):
        self.check_winner_horizontal(pos)
        self.check_winner_vertical()
        self.check_winner_skew()
        return None

    def check_winner_horizontal(self, pos):
        # Only the row containing the last move can complete a horizontal four.
        start_pos = pos - (pos % COL_NUM)
        end_pos = start_pos + COL_NUM - 1
        for i in range(start_pos, end_pos):
            if i + 3 > end_pos:  # no room left in this row for a window of four
                break
            if self.board[i] == self.board[i + 1] == self.board[i + 2] == self.board[i + 3]:
                if self.board[i] != EMPTY:
                    self.winner = self.board[i]
                    return self.winner
        # Alternative ②:
        # for i in range(ALL_POS_COUNT - 4):
        #     if i % COL_NUM < (i + 3) % COL_NUM:
        #         if self.board[i] == self.board[i + 1] == self.board[i + 2] == self.board[i + 3]:
        #             if self.board[i] != EMPTY:
        #                 self.winner = self.board[i]
        #                 return self.winner
        # Alternative ①:
        # rows = list(chunked(self.board, COL_NUM))
        # self.check_connected(rows)
    def check_winner_vertical(self):
        rows = list(chunked(self.board, COL_NUM))
        cols = list(zip(*rows))
        self.check_connected(cols)

    def check_winner_skew(self):
        # Check every precomputed diagonal index list for four identical,
        # non-empty cells.
        for index_list in Board.__get_check_indices():
            if self.board[index_list[0]] == self.board[index_list[1]] == self.board[index_list[2]] == self.board[index_list[3]]:
                if self.board[index_list[0]] != EMPTY:
                    self.winner = self.board[index_list[0]]
                    return self.winner
    def check_connected(self, lines):
        # Count runs of consecutive identical pieces along each line.
        for line in lines:
            pre = EMPTY
            count = 0
            for player in line:
                if player == EMPTY:
                    count = 0
                elif pre == player:
                    count += 1
                    if count == 4:
                        self.winner = player
                else:
                    count = 1
                pre = player
    def check_draw(self):
        if len(self.get_possible_pos()) == 0 and self.winner is None:
            self.winner = DRAW
            return DRAW
        return None

    def move(self, pos, player):
        if self.board[pos] == EMPTY:
            self.board[pos] = player
        else:
            # An illegal move immediately loses the game.
            self.winner = -1 * player
        self.check_winner(pos)
        self.check_draw()

    def clone(self):
        return Board(self.board.copy())
    @classmethod
    def __get_check_indices(cls):
        # The diagonal index lists are computed once and cached on the class.
        if hasattr(cls, 'check_indices'):
            return cls.check_indices
        indices = []
        # Build index lists for the diagonals running down-right.
        for i in range(ALL_POS_COUNT):
            diag = []
            next_i = i
            while next_i < ALL_POS_COUNT:
                if next_i % COL_NUM >= i % COL_NUM:
                    diag.append(next_i)
                next_i += COL_NUM + 1
            if len(diag) >= 4:
                indices.append(diag)
        # Build index lists for the diagonals running down-left.
        for i in range(ALL_POS_COUNT):
            diag = []
            next_i = i
            while next_i < ALL_POS_COUNT:
                if (next_i % COL_NUM < i % COL_NUM) or i == next_i:
                    diag.append(next_i)
                next_i += COL_NUM - 1
            if len(diag) >= 4:
                indices.append(diag)
        cls.check_indices = indices
        return cls.check_indices
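As a quick sanity check, Board can be exercised directly like this (a usage sketch, not code from the repo):

from board import Board, PLAYER_X, PLAYER_O

b = Board()
print(b.get_possible_pos())  # empty board -> bottom row only: [20, 21, 22, 23, 24]
b.move(20, PLAYER_X)
b.move(21, PLAYER_O)
b.print_board()
print(b.winner)              # None; nobody has four in a row yet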
game_organizer.py (runs the games)
import random

from board import DRAW, Board


class GameOrganizer:
    act_turn = 0
    winner = None

    def __init__(self, px, po, nplay=1, showBoard=True, showResult=True, stat=100):
        self.player_x = px
        self.player_o = po
        self.nwon = {px.myturn: 0, po.myturn: 0, DRAW: 0}
        self.nplay = nplay
        self.players = (self.player_x, self.player_o)
        self.board = None
        self.disp = showBoard
        self.showResult = showResult
        self.player_turn = self.players[random.randrange(2)]
        self.nplayed = 0
        self.stat = stat

    def progress(self):
        while self.nplayed < self.nplay:
            self.board = Board()
            while self.board.winner is None:
                if self.disp:
                    print("Turn is " + self.player_turn.name)
                act = self.player_turn.act(self.board)
                self.board.move(act, self.player_turn.myturn)
                if self.disp:
                    self.board.print_board()
                if self.board.winner is not None:
                    # Notify every player that the game has ended.
                    for i in self.players:
                        i.getGameResult(self.board)
                    if self.board.winner == DRAW:
                        if self.showResult:
                            print("Draw Game")
                    elif self.board.winner == self.player_turn.myturn:
                        out = "Winner : " + self.player_turn.name
                        if self.showResult:
                            print(out)
                    else:
                        print("Invalid Move!")
                    self.nwon[self.board.winner] += 1
                else:
                    self.switch_player()
                    # Notify the other player that the game is still in progress.
                    self.player_turn.getGameResult(self.board)
            self.nplayed += 1
            if self.nplayed % self.stat == 0 or self.nplayed == self.nplay:
                print(self.player_x.name + ":" + str(self.nwon[self.player_x.myturn])
                      + "," + self.player_o.name + ":" + str(self.nwon[self.player_o.myturn])
                      + ",DRAW:" + str(self.nwon[DRAW]))

    def switch_player(self):
        if self.player_turn == self.player_x:
            self.player_turn = self.player_o
        else:
            self.player_turn = self.player_x
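Note that GameOrganizer only assumes each player object exposes name, myturn, act(board), and getGameResult(board). Here is a hypothetical minimal player, just to make that contract explicit (the real players follow below):

import random

from board import PLAYER_X, PLAYER_O
from game_organizer import GameOrganizer

class DummyRandom:
    def __init__(self, turn, name="Dummy"):
        self.name = name
        self.myturn = turn

    def act(self, board):
        return random.choice(board.get_possible_pos())  # any legal cell

    def getGameResult(self, board):
        pass  # a learning player would update itself here

# Ten games without board display, results printed after game 10:
GameOrganizer(DummyRandom(PLAYER_X), DummyRandom(PLAYER_O), 10, False, True, 10).progress()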
Implementing the players
These are probably unchanged from the original code, but I'll include them for completeness.
Random
A weakling that plays a winning move whenever one is available, but otherwise moves completely at random.
player_alpha_random.py
import random


class PlayerAlphaRandom:
    def __init__(self, turn, name="AlphaRandom"):
        self.name = name
        self.myturn = turn

    def getGameResult(self, board):
        pass

    def act(self, board):
        acts = board.get_possible_pos()
        # Look one move ahead and take an immediate win if one exists.
        for act in acts:
            tempboard = board.clone()
            tempboard.move(act, self.myturn)
            if tempboard.winner == self.myturn:
                # print("Check mate")
                return act
        i = random.randrange(len(acts))
        return acts[i]
Human
A player that lets you enter moves yourself and play against the others.
player_human.py
from board import ALL_POS_COUNT, DRAW


class PlayerHuman:
    def __init__(self, turn):
        self.name = "Human"
        self.myturn = turn

    def act(self, board):
        valid = False
        while not valid:
            try:
                act = input("Where would you like to place " + str(self.myturn)
                            + " (1-" + str(ALL_POS_COUNT) + ")? ")
                act = int(act)
                if 1 <= act <= ALL_POS_COUNT:
                    valid = True
                    return act - 1
                else:
                    print("That is not a valid move! Please try again.")
            except Exception:
                print(str(act) + " is not a valid move! Please try again.")

    def getGameResult(self, board):
        if board.winner is not None and board.winner != self.myturn and board.winner != DRAW:
            print("I lost...")
Dragon King Monte Carlo
A reasonably clever player that picks good moves by simulating the game forward from each candidate move, more or less by brute force. Its drawback is that it is slow to think. For now the number of simulated playouts is set to 50 per candidate move.
player_mc.py
import random

from board import DRAW


class PlayerMC:
    def __init__(self, turn, name="MC"):
        self.name = name
        self.myturn = turn

    def getGameResult(self, board):
        pass

    def win_or_rand(self, board, turn):
        acts = board.get_possible_pos()
        # Take an immediate win if one exists, otherwise play randomly.
        for act in acts:
            tempboard = board.clone()
            tempboard.move(act, turn)
            if tempboard.winner == turn:
                return act
        i = random.randrange(len(acts))
        return acts[i]

    def trial(self, score, board, act):
        # Play one random game to the end, starting with the given move,
        # and record +1 for a win and -1 for a loss.
        tempboard = board.clone()
        tempboard.move(act, self.myturn)
        tempturn = self.myturn
        while tempboard.winner is None:
            tempturn = tempturn * -1
            tempboard.move(self.win_or_rand(tempboard, tempturn), tempturn)
        if tempboard.winner == self.myturn:
            score[act] += 1
        elif tempboard.winner == DRAW:
            pass
        else:
            score[act] -= 1

    def act(self, board):
        acts = board.get_possible_pos()
        scores = {}
        n = 50
        for act in acts:
            scores[act] = 0
            for i in range(n):
                self.trial(scores, board, act)
            scores[act] /= n
        max_score = max(scores.values())
        for act, v in scores.items():
            if v == max_score:
                return act
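Each candidate move therefore ends up with a score in [-1, 1]: the mean of +1 per simulated win and -1 per simulated loss over the 50 playouts, with draws counting as 0. A quick strength check might look like this (hypothetical snippet; MC should beat the random player most of the time):

from board import PLAYER_X, PLAYER_O
from game_organizer import GameOrganizer
from player_alpha_random import PlayerAlphaRandom
from player_mc import PlayerMC

# 100 silent games, stats printed every 50.
GameOrganizer(PlayerMC(PLAYER_X, 'M1'), PlayerAlphaRandom(PLAYER_O), 100, False, False, 50).progress()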
Q-Learning
The Q-learning player we're pinning our hopes on. It updates its Q-values as it plays and gradually learns good moves. Every so often it plays a random move so it doesn't converge on a local optimum.
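For reference, the update applied in learn() below is the standard tabular Q-learning rule, where alpha = 0.3 is the learning rate, gamma = 0.9 the discount factor, and s' the board after the opponent's reply:

Q(s, a) <- Q(s, a) + alpha * ((r + gamma * max_a' Q(s', a')) - Q(s, a))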
player_ql.py
import random

from board import DRAW


class PlayerQL:
    def __init__(self, turn, name="QL", e=0.2, alpha=0.3):
        self.name = name
        self.myturn = turn
        self.q = {}  # Q-table keyed by (state, action)
        self.e = e
        self.alpha = alpha
        self.gamma = 0.9
        self.last_move = None
        self.last_board = None
        self.totalgamecount = 0

    def policy(self, board):
        self.last_board = board.clone()
        acts = board.get_possible_pos()
        # Explore sometimes: while few games have been played, pick a
        # random move with a certain probability.
        if random.random() < (self.e / (self.totalgamecount // 10000 + 1)):
            i = random.randrange(len(acts))
            self.last_move = acts[i]  # remember it so learn() updates the right entry
            return acts[i]
        qs = [self.getQ(tuple(self.last_board.board), act) for act in acts]
        maxQ = max(qs)
        if qs.count(maxQ) > 1:
            # More than one best option; choose among them randomly.
            best_options = [i for i in range(len(acts)) if qs[i] == maxQ]
            i = random.choice(best_options)
        else:
            i = qs.index(maxQ)
        self.last_move = acts[i]
        return acts[i]

    def getQ(self, state, act):
        # Encourage exploration; "optimistic" 1.0 initial values.
        if self.q.get((state, act)) is None:
            self.q[(state, act)] = 1
        return self.q.get((state, act))

    def getGameResult(self, board):
        if self.last_move is not None:
            if board.winner is None:
                self.learn(self.last_board, self.last_move, 0, board)
            else:
                if board.winner == self.myturn:
                    self.learn(self.last_board, self.last_move, 1, board)
                elif board.winner != DRAW:
                    self.learn(self.last_board, self.last_move, -1, board)
                else:
                    self.learn(self.last_board, self.last_move, 0, board)
                self.totalgamecount += 1
                self.last_move = None
                self.last_board = None

    def learn(self, s, a, r, fs):
        pQ = self.getQ(tuple(s.board), a)
        if fs.winner is not None:
            maxQnew = 0
        else:
            maxQnew = max([self.getQ(tuple(fs.board), act) for act in fs.get_possible_pos()])
        self.q[(tuple(s.board), a)] = pQ + self.alpha * ((r + self.gamma * maxQnew) - pQ)

    def act(self, board):
        return self.policy(board)
Let the matches begin
Human vs Dragon King Monte Carlo
At beginner level, Connect Four is mostly a game of carelessness: you usually lose because you fail to notice the opponent's three-in-a-row threat. Dragon King Monte Carlo, by construction, shouldn't have that kind of lapse, so a human playing normally will probably lose to it.
Let's try.
versus.py
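The versus.py snippets below all come from one file and assume an import header along these lines (inferred from the module filenames above; sdate() is a small timestamp helper used for the dump filenames, treated as given here):

import pickle

from board import PLAYER_X, PLAYER_O
from game_organizer import GameOrganizer
from player_alpha_random import PlayerAlphaRandom
from player_human import PlayerHuman
from player_mc import PlayerMC
from player_ql import PlayerQL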
def human_vs_mc():
    p1 = PlayerHuman(PLAYER_X)
    p2 = PlayerMC(PLAYER_O, 'M2')
    game = GameOrganizer(p1, p2)
    game.progress()
main.py
human_vs_mc()
Turn is M2
   |   |   |   |
-------------------
   |   |   |   |
-------------------
   |   |   |   |
-------------------
   |   |   |   |
-------------------
   |   |   | O |
Turn is Human
Where would you like to place 1 (1-35)? 21
   |   |   |   |
-------------------
   |   |   |   |
-------------------
   |   |   |   |
-------------------
   |   |   |   |
-------------------
 X |   |   | O |
Turn is M2
   |   |   |   |
-------------------
   |   |   |   |
-------------------
   |   |   |   |
-------------------
   |   |   | O |
-------------------
 X |   |   | O |
Turn is Human
Where would you like to place 1 (1-35)? 16
   |   |   |   |
-------------------
   |   |   |   |
-------------------
   |   |   |   |
-------------------
 X |   |   | O |
-------------------
 X |   |   | O |
Turn is M2
   |   |   |   |
-------------------
   |   |   |   |
-------------------
   |   |   | O |
-------------------
 X |   |   | O |
-------------------
 X |   |   | O |
Turn is Human
Where would you like to place 1 (1-35)? 9
   |   |   |   |
-------------------
   |   |   | X |
-------------------
   |   |   | O |
-------------------
 X |   |   | O |
-------------------
 X |   |   | O |
Turn is M2
   |   |   |   |
-------------------
   |   |   | X |
-------------------
 O |   |   | O |
-------------------
 X |   |   | O |
-------------------
 X |   |   | O |
Turn is Human
Where would you like to place 1 (1-35)? 6
   |   |   |   |
-------------------
 X |   |   | X |
-------------------
 O |   |   | O |
-------------------
 X |   |   | O |
-------------------
 X |   |   | O |
Turn is M2
   |   |   |   |
-------------------
 X |   |   | X |
-------------------
 O |   |   | O |
-------------------
 X |   |   | O |
-------------------
 X |   |   | O | O
Turn is Human
Where would you like to place 1 (1-35)? 20
   |   |   |   |
-------------------
 X |   |   | X |
-------------------
 O |   |   | O |
-------------------
 X |   |   | O | X
-------------------
 X |   |   | O | O
Turn is M2
   |   |   |   |
-------------------
 X |   |   | X |
-------------------
 O |   |   | O |
-------------------
 X |   |   | O | X
-------------------
 X | O |   | O | O
Turn is Human
Where would you like to place 1 (1-35)? 17
   |   |   |   |
-------------------
 X |   |   | X |
-------------------
 O |   |   | O |
-------------------
 X | X |   | O | X
-------------------
 X | O |   | O | O
Turn is M2
   |   |   |   |
-------------------
 X |   |   | X |
-------------------
 O |   |   | O |
-------------------
 X | X |   | O | X
-------------------
 X | O | O | O | O
I lost...
Winner : M2
Human:0,M2:1,DRAW:0
I can't win after all... Monte Carlo is strong.
Now I want a Q-Learning player that can beat Monte Carlo.
Q-Learning vs Monte Carlo
Training ①
Train Q-Learning against the random player for 1,000,000 games. Training takes over 30 minutes, so the resulting player is pickled to a file.
versus.py
def ql_vs_alpha_random():
    p1 = PlayerQL(PLAYER_X, 'Q1')
    p2 = PlayerAlphaRandom(PLAYER_O)
    game = GameOrganizer(p1, p2, 1000000, False, False, 10000)
    game.progress()
    with open('dump/ql_vs_alpha_random_%s.pkl' % sdate(), mode='wb') as f:
        pickle.dump(p1, f)
main.py
ql_vs_alpha_random()
Unfortunately I forgot to save the training log. Roughly speaking, after about 300,000 games Q-Learning's win count pulls ahead, so it is at least clearly stronger than the random player.
Match ①
Let's look at the result of a 10-game match.
versus.py
def mc_vs_dumped():
    p1 = PlayerMC(PLAYER_X, 'M1')
    with open('dump/ql_vs_alpha_random_2017_05_19_17_42_46.pkl', mode='rb') as f:
        p2 = pickle.load(f)
    p2.e = 0  # disable exploration for evaluation
    game = GameOrganizer(p1, p2, 10, False)
    game.progress()
main.py
mc_vs_dumped()
Log
Winner : M1
Winner : M1
Winner : M1
Winner : M1
Winner : Q1
Winner : M1
Winner : M1
Winner : M1
Winner : M1
Winner : M1
M1:9,Q1:1,DRAW:0
9 to 1, a crushing win for Dragon King Monte Carlo... even after a million training games.
Training ②
That stings, so this time let's train Q-Learning against Q-Learning for 1,000,000 games.
versus.py
def ql_vs_ql():
    p1 = PlayerQL(PLAYER_X, 'Q1')
    p2 = PlayerQL(PLAYER_O, 'Q2')
    game = GameOrganizer(p1, p2, 1000000, False, False, 10000)
    game.progress()
    with open('dump/ql_vs_ql_%s.pkl' % sdate(), mode='wb') as f:
        pickle.dump(p1, f)
main.py
ql_vs_ql()
This takes a staggering amount of time.
Here's an excerpt from the log. Since both players run the same algorithm, the match is almost perfectly even, as expected. And the dumped file weighs in at 1.21 GB...
...
Q1:285772,Q2:285673,DRAW:68555
Q1:290402,Q2:289947,DRAW:69651
Q1:294845,Q2:294393,DRAW:70762
Q1:299172,Q2:298911,DRAW:71917
Q1:303581,Q2:303374,DRAW:73045
Q1:308033,Q2:307850,DRAW:74117
Q1:312373,Q2:312432,DRAW:75195
Q1:316757,Q2:316871,DRAW:76372
Q1:321205,Q2:321323,DRAW:77472
Q1:325535,Q2:325913,DRAW:78552
Q1:329798,Q2:330599,DRAW:79603
Q1:334360,Q2:334965,DRAW:80675
Q1:338741,Q2:339487,DRAW:81772
Q1:343038,Q2:344011,DRAW:82951
Q1:347604,Q2:348288,DRAW:84108
...
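Incidentally, I suspect (though I haven't verified it) that much of that 1.21 GB comes from getQ() itself: it writes the optimistic default of 1 into the table for every (state, action) pair it merely evaluates, so the Q-table also stores entries that were never actually learned. A minimal sketch of a non-materializing lookup that keeps the optimistic initial value:

def getQ(self, state, act):
    # Return an optimistic default of 1 without inserting it into the
    # table; only learn() writes to self.q, so the pickle stays smaller.
    return self.q.get((state, act), 1)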
Match ②
Now, just as before, let's throw it at Dragon King Monte Carlo! The dumped file is so large that even loading it takes a while.
versus.py
def mc_vs_dumped():
    p1 = PlayerMC(PLAYER_O, 'M1')
    # with open('dump/ql_vs_alpha_random_2017_05_19_17_42_46.pkl', mode='rb') as f:
    with open('dump/ql_vs_ql_2017_05_19_18_35_32.pkl', mode='rb') as f:
        p2 = pickle.load(f)
    p2.e = 0
    game = GameOrganizer(p1, p2, 10, False)
    game.progress()
main.py
mc_vs_dumped()
What will happen?
Log
Winner : M1
Winner : M1
Winner : M1
Winner : M1
Winner : M1
Winner : M1
Winner : M1
Winner : M1
Winner : M1
Winner : M1
M1:10,Q1:0,DRAW:0
The result: a total rout by Dragon King Monte Carlo. And Q-Learning tried so hard...
I did confirm that Q-Learning can beat Dragon King Monte Carlo at 3x3 tic-tac-toe, but these rules were apparently too much for it. Presumably the state space of the 5x5 board (up to 3^25 states, versus 3^9 for tic-tac-toe) is simply too large for a tabular method to cover in a million games. I'd like to revisit the logic at some point.
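If I do revisit it, one untested idea is to exploit the board's left-right mirror symmetry so that two mirrored positions share one Q-table entry, roughly halving the state space. A hypothetical sketch of a canonical state key (note that the chosen action would also have to be mirrored whenever the mirrored form is selected, which is omitted here):

from board import COL_NUM, ROW_NUM

def canonical(state):
    # state: tuple of ROW_NUM * COL_NUM cells. Mirror every row
    # left-to-right and keep whichever tuple compares smaller.
    mirrored = tuple(state[r * COL_NUM + (COL_NUM - 1 - c)]
                     for r in range(ROW_NUM)
                     for c in range(COL_NUM))
    return min(state, mirrored)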