elo_get_probdist(): Iterate only over empty positions
[pachi.git] / pattern_gather.sh
blobf9d229ec7dc813fa6d17d7c12a962a02623945f9
1 #!/bin/sh
# pattern_gather: Gather patterns from an SGF collection

# We will first gather all spatial features from the SGF collection
# (we take files as arguments), keep only the 5000 most frequently occurring
# in the dictionary, and then do full pattern-matching again with this
# dictionary.

# DO NOT RUN THIS CONCURRENTLY!
# You really want to run this on a fast filesystem, not NFS or anything.
# During the conversion, you will need about 100M per ~100 games; after
# it's over, it will take much less.
# Number of most frequent spatial patterns kept in the final dictionary.
spatials=5000

# Drop any previous dictionary. Later steps read spatial.dict again, so it is
# presumably regenerated by the patternscan run below -- confirm against zzgo.
rm -f spatial.dict

echo "Gathering patterns (1st pass)..."
# Convert every SGF file given on the command line to GTP and feed the
# combined stream to the pattern scanner. "$i" is quoted so that paths
# containing spaces or glob characters reach sgf2gtp.pl intact.
(for i in "$@"; do ./sgf2gtp.pl "$i"; done) |
  ./zzgo -e patternscan >/tmp/patterns
22 echo "Filtering population of $spatials most popular spatials..."
23 cat /tmp/patterns | sed 's/ /\n/g' |
24 sed -ne 's/)//; s/^s:/0x/p; ' | # pick out spatial payloads
25 perl -nle 'print (((1<<24)-1) & hex $_)' | # convert to ids
26 sort -n | uniq -c | sort -rn | # sort by frequency
27 head -n $spatials | awk '{print$2}' | # take 5000 top ids
28 cat >/tmp/pattern.pop
30 echo "Composing new spatial.dict..."
31 # Preserve top comments
32 sed -e '/^[^#]/Q' spatial.dict >/tmp/spatial.dict
33 # join needs lexicographic order
34 sort /tmp/pattern.pop >/tmp/pattern.filter
35 grep -v '^#' spatial.dict | sort | join - /tmp/pattern.filter | # patterns with id in pattern.filter
36 sort -n | cut -d ' ' -f 2- | perl -pe '$_="$. $_"' | # re-number patterns
37 cat >>/tmp/spatial.dict
39 echo -n "Counting hash collisions... "
40 perl -lne 'chomp; my ($id, $d, $p, @h) = split(/ /, $_); foreach (@h) { next if $h{$_} = $id; print "collision $id - $h{$_} ($_)" if $h{$_}; $h{$_}=$id; }' /tmp/spatial.dict | wc -l
# Install the reduced dictionary over the working copy and remove the
# scratch files produced by the earlier steps.
echo "Deploying spatial.dict in final position..."
mv /tmp/spatial.dict spatial.dict
rm /tmp/patterns /tmp/pattern.pop /tmp/pattern.filter


# Now, re-scan patterns with limited dictionary!
# ("fixed_dict" is passed through to zzgo; presumably it keeps the scanner
# from extending spatial.dict -- confirm against zzgo.)
echo "Gathering patterns (2nd pass)..."
# "$i" is quoted so that paths containing spaces or glob characters reach
# sgf2gtp.pl intact.
(for i in "$@"; do ./sgf2gtp.pl "$i"; done) |
  ./zzgo -e patternscan fixed_dict >patterns

echo "Gathered pattern data in .:"
ls -l patterns spatial.dict