From cbfac9b44e5cbcc600aa9e76010e9db2596ae857 Mon Sep 17 00:00:00 2001
From: Joe Moudrik
Date: Fri, 24 May 2013 17:45:18 +0200
Subject: [PATCH] clanek: features, ml

---
 clanek_go_congress/clanek.bib |  17 ++++
 clanek_go_congress/clanek.tex | 208 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 223 insertions(+), 2 deletions(-)

diff --git a/clanek_go_congress/clanek.bib b/clanek_go_congress/clanek.bib
index 94c9ef8..d296b22 100644
--- a/clanek_go_congress/clanek.bib
+++ b/clanek_go_congress/clanek.bib
@@ -9,6 +9,23 @@
   owner = "jm"
 }
 
+@MastersThesis{Moudrik13,
+  author = "Josef Moud{\v{r}\'{i}}k",
+  title = "Meta-learning methods for analyzing Go playing trends",
+  school = "Charles University, Faculty of Mathematics and Physics",
+  address = "Prague, Czech Republic",
+  year = "2013",
+}
+
+@ONLINE{GoAncientChina,
+  author = "John Fairbairn",
+  year = "1995",
+  title = "Go in Ancient China",
+  url = "http://www.pandanet.co.jp/English/essay/goancientchina.html",
+  cited = "15.\,1.\,2013",
+  owner = "jm"
+}
+
 @ONLINE{ wiki:Go,
   author = "Wikipedia",
   title = "Go (game) --- Wikipedia{,} The Free Encyclopedia",
diff --git a/clanek_go_congress/clanek.tex b/clanek_go_congress/clanek.tex
index 2bdd5ee..a92c8e5 100644
--- a/clanek_go_congress/clanek.tex
+++ b/clanek_go_congress/clanek.tex
@@ -1,5 +1,7 @@
 \documentclass[12pt,a4paper,notitlepage]{article}
 
+\usepackage[a4paper,vmargin={20mm,20mm},hmargin={20mm,20mm}]{geometry}
+
 %% Character encoding used: usually latin2, cp1250 or utf8:
 \usepackage[utf8]{inputenc}
@@ -118,17 +120,216 @@ Section~\ref{sec:disc} discusses applications and future work.
 
 \section{Feature Extraction}
 \label{sec:feat}
+This section presents the methods for extracting the evaluation
+vector (call it $ev$) from a set of games. Because we must
+distinguish between the two players in any particular game, each game
+in the set is accompanied by the color which specifies our player of
+interest. The sample is therefore regarded as a \emph{set of colored
+games}, $GC = \{ (game_1, color_1), \ldots\}$; for example, $color_1$
+specifies the player of interest in $game_1$.
+
+The evaluation vector $ev$ is composed by concatenating several
+sub-vectors we call \emph{features} -- examples include the
+aforementioned local patterns or statistics of sente and gote
+sequences, detailed in the rest of this section. Some of the
+explanations are simplified to fit the size of the paper; please see
+\citep{Moudrik13} for precise details.
+
+\subsection{Raw Game Processing}
+First, we specify how we process the games.\footnote{
+We use the standard \emph{.sgf} file format as input \cite{SGF}.
+}
+We have used the Pachi Go Engine~\citep{Pachi}, which -- apart from
+being a strong Go-playing program -- allows us to extract raw
+information from each game on a per-move basis.
+For each move,
+Pachi outputs a list of key-value pairs regarding the current move:
 
-\subsection{Game Processing}
+\begin{itemize}
+  \item \textbf{atari flag} --- whether the move put enemy stones in atari,
+  \item \textbf{atari escape flag} --- whether the move saved the player's own stones from atari,
+  \item \textbf{capture} --- the number of enemy stones the move captured,
+  \item \textbf{contiguity to last move} --- the gridcular distance (presented
+        below) from the last move,
+  \item \textbf{board edge distance} --- the distance from the nearest edge of the board,
+  \item \textbf{spatial pattern} --- configuration of stones around the played move.
+\end{itemize}
+
+We use this information to compute the higher-level features described
+below. The spatial pattern records the positions of stones around the
+current move up to a certain distance.\footnote{
+  The distance is given by the {\em gridcular} metric
+$d(x,y) = |\delta x| + |\delta y| + \max(|\delta x|, |\delta y|)$, which produces
+a circle-like structure on the Go board square grid \cite{SpatPat}.
+Spatial patterns of sizes 2 to 6 are considered.
+}
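+
+To illustrate, the following is a minimal sketch (in Python) of how
+the gridcular distance and the enumeration of a pattern neighborhood
+might look; the helper names \texttt{gridcular} and
+\texttt{neighborhood} are our illustrative choices, not part of
+Pachi's interface:
+\begin{verbatim}
+def gridcular(dx, dy):
+    """Gridcular distance: d = |dx| + |dy| + max(|dx|, |dy|)."""
+    return abs(dx) + abs(dy) + max(abs(dx), abs(dy))
+
+def neighborhood(x, y, d_max, board_size=19):
+    """Yield board points within gridcular distance d_max of (x, y)."""
+    for nx in range(board_size):
+        for ny in range(board_size):
+            if 0 < gridcular(nx - x, ny - y) <= d_max:
+                yield (nx, ny)
+\end{verbatim}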
 
 \subsection{Patterns}
-\subsection{Local Sente and Gote Sequences}
+The first feature collects statistics of the $N = 400$ most
+frequently occurring spatial patterns (together with both atari
+flags). The list of the $N$ most frequently played patterns is
+computed beforehand from the whole database of games.
+
+Given a set of colored games $GC$, we then count how many times each
+of the $N$ patterns was played -- thus obtaining a vector $c$,
+$|c| = 400$, of counts. Defined this way, however, the counts $c_i$
+grow proportionally to the number of games in $GC$. To maintain
+invariance with respect to the number of games in the sample, a
+normalization is needed. We do this by dividing $c$ by $|GC|$, though
+other schemes are possible, see \citep{Moudrik13}.
+
+\subsection{$\omega$-local Sente and Gote Sequences}
+Because the concept of sente and gote is very important in real
+games, we devised a statistic which tries to capture the distribution
+of sente and gote plays in the games from the sample. Because
+deciding which moves are sente or gote can be hard even for human
+players, we restricted ourselves to what we call $\omega$-local
+(sente and gote) sequences. This simplification rests on a clear
+assumption -- that the responses to a sente move are always local. We
+say that a move is $\omega$-local (with respect to the previous move)
+if its gridcular distance from the previous move is smaller than a
+fixed number $\omega$. Of course, this assumption might not always
+hold, but the feature proves to be useful nonetheless.
+
+We then partition each game into $\omega$-local sequences (that is,
+each move in the sequence is $\omega$-local with respect to the move
+directly preceding it) and observe whether the player who started the
+sequence differs from the player who ended it. If so, the
+$\omega$-local sequence is said to be sente for the player who
+started it, because he gets to play somewhere else first (tenuki).
+Similarly, if the player who started the sequence also had to respond
+last, we say that the sequence is gote for him. Based on this
+partitioning, we count the average number of sente and gote sequences
+per game in the sample $GC$; a sketch of the procedure is given
+below. These two numbers, along with their difference, form the
+second feature.
+
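+A minimal sketch of the partitioning (again Python, reusing the
+\texttt{gridcular} helper above; it assumes moves are given as board
+coordinates of strictly alternating players, with no passes):
+\begin{verbatim}
+def partition_omega_local(moves, omega):
+    """Split a game (a list of (x, y) coordinates) into
+    omega-local sequences of move indices."""
+    sequences, current = [], [0]
+    for i in range(1, len(moves)):
+        (px, py), (qx, qy) = moves[i - 1], moves[i]
+        if gridcular(qx - px, qy - py) < omega:
+            current.append(i)   # still local: same sequence
+        else:
+            sequences.append(current)
+            current = [i]       # non-local move: new sequence
+    sequences.append(current)
+    return sequences
+
+def sente_gote_counts(moves, omega):
+    """A sequence is sente for its starting player if the opponent
+    ended it (the starter got to tenuki first), gote otherwise."""
+    sente = gote = 0
+    for seq in partition_omega_local(moves, omega):
+        if seq[0] % 2 != seq[-1] % 2:   # parity = which player moved
+            sente += 1
+        else:
+            gote += 1
+    return sente, gote
+\end{verbatim}
+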
 \subsection{Border Distance}
+The third feature is a two-dimensional histogram counting the average
+number of moves in the sample played low or high in different game
+stages. The original idea was to help distinguish between territorial
+and influence-oriented moves in the opening.
+
+The first dimension is specified by the move's border distance, the
+second one by the number of the current move. The size of each
+dimension is given by intervals dividing the domains. We use
+$$ByMoves = \{ \langle 1, 10\rangle, \langle 11, 64\rangle, \langle 65, 200\rangle, \langle 201, \infty)\}$$
+for the move coordinate -- the motivation is to (very roughly)
+distinguish between the opening (say the first ten moves), the early
+middle game (moves 11--64), the middle game, and the endgame.
+The border distance dimension is given by
+$$ByDist = \{ \langle 1, 2\rangle, \langle 3, 3\rangle, \langle 4, 4\rangle, \langle 5, \infty)\}$$
+(distinguishing between the first two lines, the third line of
+territory, the fourth line of influence, and higher plays for the
+rest).
+
+Using the $ByMoves$ and $ByDist$ intervals to divide the domains, we
+obtain a histogram of $|ByMoves| \times |ByDist| = 16$ fields in
+total. For each move, we increase the count in the appropriate
+histogram field. In the end, the whole histogram is normalized to
+establish invariance with respect to the number of games scanned by
+dividing its elements by $|GC|$. These 16 numbers form the third
+feature.
+
 \subsection{Captured Stones}
+Alongside the border distance feature, we implemented a
+two-dimensional histogram which counts the numbers of captured stones
+in different game stages. The motivation is simple -- beginners
+especially tend to capture stones because ``they could'' instead of
+because it is the ``best move''. In the opening, for example, such a
+capture might be a grave mistake.
+
+As before, one of the dimensions is given by intervals
+$$ByMoves = \{ \langle 1, 60\rangle, \langle 61, 240\rangle, \langle 241, \infty)\}$$
+which try to capture the game stages (opening, middle game, endgame).
+The second dimension has a fixed size of three bins. Along with the
+number of captives of the player of interest (the first bin), we also
+count the number of his opponent's captives (the second bin) and the
+difference between the two numbers (the third bin). Together, we
+obtain a histogram of $|ByMoves| \times 3 = 9$ elements. These 9
+numbers (again normalized by dividing by $|GC|$) are the output of
+the fourth feature.
+
 \subsection{Win/Loss Statistics}
+Finally, we devised a simple feature which gathers statistics of wins
+and losses and whether they were by points or by resignation.\footnote{
+  We disregard forfeited, unfinished or jigo games in this feature
+  because the frequency of these events is so small that it would
+  require a very large dataset to utilize them reliably.
+}
+For example, quite a lot of weak players continue playing games that
+are already lost until the very end, mainly because their counting is
+not very good (they do not know there is no way to win), while
+professionals do not hesitate to resign if they think that nothing
+can be done.
+
+For the colored games of $GC$, we count how many times the player of
+interest:
+\begin{itemize}
+  \item won by points,
+  \item won by resignation,
+  \item lost by points,
+  \item and lost by resignation.
+\end{itemize}
+Again, we divide these four numbers by $|GC|$ to maintain invariance
+with respect to the number of games in $GC$. Furthermore, for the
+games won or lost by points, we count:
+\begin{itemize}
+  \item the average number of points the player won by in won games,
+  \item the average number of points he lost by in lost games.
+\end{itemize}
+These six numbers form the last feature.
 
 \section{Machine Learning}
 \label{sec:mach}
+So far, we have described how to turn a set of colored games $GC$
+into an evaluation. Now, we study how to utilize this evaluation. If
+we are to predict various player attributes, we need some input data
+to learn from. Suppose we have a dataset $D$ consisting of pairs
+$D = \{ (GC_i, y_i), \ldots\}$, where $GC_i$ corresponds to a set of
+colored games of the $i$-th player and $y_i$ is the target attribute.
+The $y_i$ might be fairly arbitrary, as long as it has \emph{some}
+relation to $GC_i$. For example, $y_i$ might be the $i$-th player's
+strength.
+
+Let us denote the evaluation process presented above as $eval$, and
+let $ev_i$ be the evaluation of the $i$-th player,
+$ev_i = eval(GC_i)$. We can then transform $D$ into
+$D_{ev} = \{(ev_i, y_i), \ldots\}$, which forms our training data.
+The task of our machine learning algorithm is to generalize the
+knowledge from the dataset $D_{ev}$ to predict the correct $y_X$ even
+for previously unseen $GC_X$. In the case of strength, we might
+therefore be able to predict the strength $y_X$ of an unknown player
+$X$ given a set of his games $GC_X$ (from which we can compute the
+evaluation $ev_X$).
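+
+As an illustration, the evaluation and the construction of $D_{ev}$
+might be assembled as follows (Python with NumPy; the extractor
+functions stand in for the features of Section~\ref{sec:feat} and are
+hypothetical names, not a fixed interface):
+\begin{verbatim}
+import numpy as np
+
+def eval_player(colored_games, extractors):
+    """eval: concatenate the feature sub-vectors (patterns,
+    omega-local sente/gote, border distance, captures, win/loss)
+    into a single evaluation vector ev."""
+    return np.concatenate([f(colored_games) for f in extractors])
+
+def make_training_data(D, extractors):
+    """Turn D = [(GC_i, y_i), ...] into D_ev as arrays (X, y)."""
+    X = np.array([eval_player(GC_i, extractors) for GC_i, _ in D])
+    y = np.array([y_i for _, y_i in D])
+    return X, y
+\end{verbatim}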
+
+In this work, we have used a bagged artificial neural network to
+learn this dependency. Neural networks are a standard technique in
+machine learning. The network is composed of simple computational
+units organized in a layered topology. Please see the monograph by
+\citet{haykin_nn} to learn more. We have used a simple feedforward
+neural network with 20 hidden units, trained using the RPROP
+algorithm \citep{Riedmiller1993}.
+
+Bagging \citep{breimanbag96} is a method which combines an ensemble
+of $N$ models (trained on differently sampled data) to improve their
+performance and robustness. In this work, we used $N = 20$. Please
+refer to the original paper to learn more about bagging.
+
+\subsection{Measuring the Performance}
+Given a dataset $D_{ev}$, we need to estimate the performance of a
+given machine learning algorithm (in our case, the bagged neural
+network). A performance measure allows us to compare different
+algorithms and gives an estimate of the method's precision on unseen
+inputs. A standard way to do this is to divide $D_{ev}$ into training
+and testing parts and compute the error of the method on the testing
+part.
+
+A commonly used measure is the mean square error ($MSE$), which
+estimates the variance of the error distribution. We use its square
+root, $RMSE = \sqrt{MSE}$, which estimates the standard deviation of
+the prediction error:
+$$ MSE = \frac{1}{|Ts|} \sum_{(ev, y) \in Ts}{ (predict(ev) - y)^2},$$
+where the machine learning model $predict$ is trained on the training
+data $Tr$, and $Ts$ denotes the testing data. We now describe how we
+split the data into the testing and training parts so that the error
+estimate is robust.
+
+\subsubsection*{Cross-Validation}
+
+Cross-validation is a standard statistical technique for the
+estimation of parameters. The idea is to split the data into $k$
+disjoint subsets (called \emph{folds}) and then iteratively compose
+the training and testing sets and measure errors. In the $i$-th of
+the $k$ iterations, the $i$-th fold is chosen as the testing data,
+and all the remaining $k-1$ folds form the training data. The
+division into folds is done randomly, so that the folds have
+approximately the same size. Refer to~\citep{crossval} to learn more.
+In this work, we have used 10-fold cross-validation.
 
 \section{Experiments and Results}
 \label{sec:expe}
@@ -139,6 +340,9 @@ Section~\ref{sec:disc} discusses applications and future work.
 
 \section{Conclusion}
 \label{sec:conc}
 
+\section{Implementation}
+\label{sec:impl}
+
 \bibliographystyle{abbrvnat}
 \bibliography{clanek}
-- 
2.11.4.GIT