UCT: Some commented-out debug prints
[pachi/peepo.git] / uct / policy / ucb1.c
blob5baa250c4625673b4c4f7f9cb37ec329d9341a3a
1 #include <assert.h>
2 #include <math.h>
3 #include <stdio.h>
4 #include <stdlib.h>
5 #include <string.h>
7 #include "board.h"
8 #include "debug.h"
9 #include "move.h"
10 #include "uct/internal.h"
11 #include "uct/tree.h"
13 /* This implements the basic UCB1 policy. */
15 struct ucb1_policy {
16 /* This is what the Modification of UCT with Patterns in Monte Carlo Go
17 * paper calls 'p'. Original UCB has this on 2, but this seems to
18 * produce way too wide searches; reduce this to get deeper and
19 * narrower readouts - try 0.2. */
20 float explore_p;
21 /* First Play Urgency - if set to less than infinity (the MoGo paper
22 * above reports 1.0 as the best), new branches are explored only
23 * if none of the existing ones has higher urgency than fpu. */
24 float fpu;
25 /* Equivalent experience for prior knowledge. MoGo paper recommends
26 * 50 playouts per source. */
27 int eqex;
31 struct tree_node *
32 ucb1_choose(struct uct_policy *p, struct tree_node *node, struct board *b, enum stone color)
34 struct tree_node *nbest = NULL;
35 for (struct tree_node *ni = node->children; ni; ni = ni->sibling)
36 // we compare playouts and choose the best-explored
37 // child; comparing values is more brittle
38 if (!nbest || ni->u.playouts > nbest->u.playouts) {
39 /* Play pass only if we can afford scoring */
40 if (is_pass(ni->coord)) {
41 float score = board_official_score(b);
42 if (color == S_BLACK)
43 score = -score;
44 //fprintf(stderr, "%d score %f\n", color, score);
45 if (score <= 0)
46 continue;
48 nbest = ni;
50 return nbest;
54 struct tree_node *
55 ucb1_descend(struct uct_policy *p, struct tree *tree, struct tree_node *node, int parity, bool allow_pass)
57 struct ucb1_policy *b = p->data;
58 float xpl = log(node->u.playouts) * b->explore_p;
60 struct tree_node *nbest = node->children;
61 float best_urgency = -9999;
62 for (struct tree_node *ni = node->children; ni; ni = ni->sibling) {
63 /* Do not consider passing early. */
64 if (likely(!allow_pass) && unlikely(is_pass(ni->coord)))
65 continue;
66 ni->prior.value = (float)ni->prior.wins / ni->prior.playouts;
67 float priorp = (parity > 0 ? ni->prior.value : 1- ni->prior.value);
68 float urgency = ni->u.playouts ? (parity > 0 ? ni->u.value : 1 - ni->u.value) + sqrt(xpl / ni->u.playouts) : ni->prior.playouts ? priorp : b->fpu;
69 #if 0
71 struct board b2; b2.size = 9;
72 fprintf(stderr, "[%s -> %s] UCB1 urgency %f (%f + %f : %f : %f)\n", coord2sstr(node->coord, &b2), coord2sstr(ni->coord, &b2), urgency, ni->u.value, sqrt(xpl / ni->u.playouts), priorp, b->fpu);
74 #endif
75 if (urgency > best_urgency) {
76 best_urgency = urgency;
77 nbest = ni;
80 return nbest;
83 void
84 ucb1_prior(struct uct_policy *p, struct tree *tree, struct tree_node *node, struct board *b, enum stone color, int parity)
86 /* Initialization of UCT values based on prior knowledge */
87 struct ucb1_policy *pp = p->data;
89 #if 0
90 /* Q_{even} */
91 /* This somehow does not work at all. */
92 node->prior.playouts += p->eqex;
93 node->prior.wins += p->eqex / 2;
94 #endif
96 /* Q_{grandparent} */
97 if (node->parent && node->parent->parent && node->parent->parent->parent) {
98 struct tree_node *gpp = node->parent->parent->parent;
99 for (struct tree_node *ni = gpp->children; ni; ni = ni->sibling) {
100 /* Be careful not to emphasize too random results. */
101 if (ni->coord == node->coord && ni->u.playouts > pp->eqex) {
102 node->prior.playouts += pp->eqex;
103 node->prior.wins += pp->eqex * ni->u.wins / ni->u.playouts;
104 node->hints |= 1;
109 /* Q_{playout-policy} */
110 float assess = NAN;
111 struct playout_policy *playout = p->uct->playout;
112 if (playout->assess) {
113 struct move m = { node->coord, color };
114 assess = playout->assess(playout, b, &m);
116 if (!isnan(assess)) {
117 if (parity < 0)
118 assess = 1 - assess;
119 node->prior.playouts += pp->eqex;
120 node->prior.wins += pp->eqex * assess;
121 node->hints |= 2;
124 if (node->prior.playouts) {
125 node->prior.value = (float) node->prior.wins / node->prior.playouts;
126 tree_update_node_value(node, true);
129 //fprintf(stderr, "%s,%s prior: %d/%d = %f (%f)\n", coord2sstr(node->parent->coord, b), coord2sstr(node->coord, b), node->prior.wins, node->prior.playouts, node->prior.value, assess);
132 void
133 ucb1_update(struct uct_policy *p, struct tree_node *node, enum stone color, struct playout_amafmap *map, int result)
135 /* It is enough to iterate by a single chain; we will
136 * update all the preceding positions properly since
137 * they had to all occur in all branches, only in
138 * different order. */
139 for (; node; node = node->parent) {
140 node->u.playouts++;
141 node->u.wins += result;
142 tree_update_node_value(node, true);
147 struct uct_policy *
148 policy_ucb1_init(struct uct *u, char *arg)
150 struct uct_policy *p = calloc(1, sizeof(*p));
151 struct ucb1_policy *b = calloc(1, sizeof(*b));
152 p->uct = u;
153 p->data = b;
154 p->descend = ucb1_descend;
155 p->choose = ucb1_choose;
156 p->update = ucb1_update;
158 b->explore_p = 0.2;
159 b->fpu = INFINITY;
161 if (arg) {
162 char *optspec, *next = arg;
163 while (*next) {
164 optspec = next;
165 next += strcspn(next, ":");
166 if (*next) { *next++ = 0; } else { *next = 0; }
168 char *optname = optspec;
169 char *optval = strchr(optspec, '=');
170 if (optval) *optval++ = 0;
172 if (!strcasecmp(optname, "explore_p") && optval) {
173 b->explore_p = atof(optval);
174 } else if (!strcasecmp(optname, "prior")) {
175 b->eqex = optval ? atoi(optval) : 50;
176 if (b->eqex)
177 p->prior = ucb1_prior;
178 } else if (!strcasecmp(optname, "fpu") && optval) {
179 b->fpu = atof(optval);
180 } else {
181 fprintf(stderr, "ucb1: Invalid policy argument %s or missing value\n", optname);
186 return p;