From 394797d64ffbc75c3af3f0e0ac851036675507d9 Mon Sep 17 00:00:00 2001 From: Petr Baudis Date: Sat, 13 Feb 2010 02:54:03 +0100 Subject: [PATCH] Add support+docs for easy dataset generation --- README | 35 +++++++++++++++++++++++++++++++++++ data_gogod.sh | 19 +++++++++++++++++++ data_summarize.sh | 24 ++++++++++++++++++++++++ 3 files changed, 78 insertions(+) create mode 100755 data_gogod.sh create mode 100755 data_summarize.sh diff --git a/README b/README index 2d27a10..13d187c 100644 --- a/README +++ b/README @@ -43,3 +43,38 @@ What we do is that we try to determine players' strategies and characteristics b has patterns in it :-) A pattern is a set of features (such as a distance from a border, a shape of stones on the goban, ..) for each move. TODO + +---------- +PREPARING THE DATASET: + +1. Assemble a list of players you are interested in, one name per line, + in some file, e.g. players.list. + +2. Get a large SGF collection with an index file listing games by player. + + We use GoGoD 2008 Winter as our reference database. + +3. Get the Pachi source tree and build the binary yourself. Copy over these + files: zzgo sgf2gtp.pl patterns.spat pattern_byplayer.sh + +4. Create a directory where you will stash raw lists of patterns for each + player encountered - e.g. rawpats/. + +5. For each game in the SGF collection where at least one of the players + is interesting, run ./pattern_byplayer.sh rawpats/ $SGFFILE. You can + ignore the warnings/errors sometimes printed for individual files. + + If importing from GoGoD, you can use: + + ./data_gogod.sh $PATH_TO_GOGOD players.list rawpats/ + + See its header for usage details. + +6. Create a directory where the pattern summaries by frequency for + interesting players will go - e.g. pats/. + +7. Fill up pats/ from rawpats/ based on players.list. You can use a pre-made + script: run ./data_summarize.sh players.list rawpats/ pats/. + +8. pats/ now contains the dataset that can be analyzed by other tools + in the suite. 
diff --git a/data_gogod.sh b/data_gogod.sh
new file mode 100755
index 0000000..f43b31f
--- /dev/null
+++ b/data_gogod.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+# data_gogod.sh - Generate patterns list for some players in the GoGoD collection
+#
+# Usage: data_gogod.sh GOGOD_ROOT PLAYERLIST PATDIR
+
+[ $# -ge 3 ] || { echo "Usage: $0 GOGOD_ROOT PLAYERLIST PATDIR" >&2; exit 1; }
+gogod=$1; players=$2; patdir=$3
+
+# pattern_byplayer.sh will _append_ to existing state, so clean it up first.
+# ${patdir:?} aborts on an empty PATDIR instead of expanding to "rm /*".
+rm -f -- "${patdir:?}"/*
+
+# List SGF files where at least one of the players is interesting.
+grep -Ff "$players" "$gogod"/GoGoD/GameList.txt | cut -d ' ' -f 1 |
+	while read -r sgf; do
+		printf '%s\n' "$sgf"
+		./pattern_byplayer.sh "$patdir" "$gogod"/Database/*/"$sgf".sgf
+	done
diff --git a/data_summarize.sh b/data_summarize.sh
new file mode 100755
index 0000000..2d1b9b4
--- /dev/null
+++ b/data_summarize.sh
@@ -0,0 +1,24 @@
+#!/bin/sh
+#
+# data_summarize.sh - Summarize raw per-player pattern data by pattern frequency
+#
+# Usage: data_summarize.sh PLAYERLIST RAWPATDIR PATDIR
+
+[ $# -ge 3 ] || { echo "Usage: $0 PLAYERLIST RAWPATDIR PATDIR" >&2; exit 1; }
+players=$1; rawpatdir=$2; patdir=$3
+
+# Reads lines on stdin; prints "COUNT LINE" pairs, most frequent first.
+summarize() {
+	sort | uniq -c | sort -rn
+}
+
+# per-player pattern data (one summary file per name in PLAYERLIST)
+while read -r player; do
+	printf '%s\n' "$player"
+	cat "$rawpatdir/$player" | summarize >"$patdir/$player"
+done <"$players"
+
+# summary pattern data (all listed players pooled together)
+echo "all.pat"
+while read -r player; do cat "$rawpatdir/$player"; done <"$players" |
+	summarize >"$patdir/all.pat"
-- 
2.11.4.GIT