tree_update_node_rvalue(): Introduce; honor amaf_prior setting in _value()
[pachi.git] / uct / policy / ucb1.c
blob69f1c20c8faddc7825b195d13889b2455ceaa6ce
1 #include <assert.h>
2 #include <math.h>
3 #include <stdio.h>
4 #include <stdlib.h>
5 #include <string.h>
7 #include "board.h"
8 #include "debug.h"
9 #include "move.h"
10 #include "random.h"
11 #include "uct/internal.h"
12 #include "uct/tree.h"
14 /* This implements the basic UCB1 policy. */
16 struct ucb1_policy {
17 /* This is what the Modification of UCT with Patterns in Monte Carlo Go
18 * paper calls 'p'. Original UCB has this on 2, but this seems to
19 * produce way too wide searches; reduce this to get deeper and
20 * narrower readouts - try 0.2. */
21 float explore_p;
22 /* First Play Urgency - if set to less than infinity (the MoGo paper
23 * above reports 1.0 as the best), new branches are explored only
24 * if none of the existing ones has higher urgency than fpu. */
25 float fpu;
26 int urg_randoma, urg_randomm;
30 struct tree_node *
31 ucb1_choose(struct uct_policy *p, struct tree_node *node, struct board *b, enum stone color)
33 struct tree_node *nbest = NULL;
34 for (struct tree_node *ni = node->children; ni; ni = ni->sibling)
35 // we compare playouts and choose the best-explored
36 // child; comparing values is more brittle
37 if (!nbest || ni->u.playouts > nbest->u.playouts) {
38 /* Play pass only if we can afford scoring */
39 if (is_pass(ni->coord)) {
40 float score = board_official_score(b);
41 if (color == S_BLACK)
42 score = -score;
43 //fprintf(stderr, "%d score %f\n", color, score);
44 if (score <= 0)
45 continue;
47 nbest = ni;
49 return nbest;
53 struct tree_node *
54 ucb1_descend(struct uct_policy *p, struct tree *tree, struct tree_node *node, int parity, bool allow_pass)
56 /* We want to count in the prior stats here after all. Otherwise,
57 * nodes with positive prior will get explored _LESS_ since the
58 * urgency will be always higher; even with normal FPU because
59 * of the explore coefficient. */
61 struct ucb1_policy *b = p->data;
62 float xpl = log(node->u.playouts + node->prior.playouts) * b->explore_p;
64 // XXX: Stack overflow danger on big boards?
65 struct tree_node *nbest[512] = { node->children }; int nbests = 1;
66 float best_urgency = -9999;
68 for (struct tree_node *ni = node->children; ni; ni = ni->sibling) {
69 /* Do not consider passing early. */
70 if (likely(!allow_pass) && unlikely(is_pass(ni->coord)))
71 continue;
72 int uct_playouts = ni->u.playouts + ni->prior.playouts;
73 ni->prior.value = (float)ni->prior.wins / ni->prior.playouts;
75 /* XXX: We later compare urgency with best_urgency; this can
76 * be difficult given that urgency can be in register with
77 * higher precision than best_urgency, thus even though
78 * the numbers are in fact the same, urgency will be
79 * slightly higher (or lower). Thus, we declare urgency
80 * as volatile, attempting to force the compiler to keep
81 * everything as a float. Ideally, we should do some random
82 * __FLT_EPSILON__ magic instead. */
83 volatile float urgency = uct_playouts ? tree_node_get_value(tree, ni, u, parity) + sqrt(xpl / uct_playouts) : b->fpu;
85 #if 0
87 struct board b2; b2.size = 9+2;
88 fprintf(stderr, "[%s -> %s] UCB1 urgency %f (%f + %f : %f)\n", coord2sstr(node->coord, &b2), coord2sstr(ni->coord, &b2), urgency, ni->u.value, sqrt(xpl / ni->u.playouts), b->fpu);
90 #endif
91 if (b->urg_randoma)
92 urgency += (float)(fast_random(b->urg_randoma) - b->urg_randoma / 2) / 1000;
93 if (b->urg_randomm)
94 urgency *= (float)(fast_random(b->urg_randomm) + 5) / b->urg_randomm;
95 if (urgency > best_urgency) {
96 best_urgency = urgency; nbests = 0;
98 if (urgency >= best_urgency) {
99 /* We want to always choose something else than a pass
100 * in case of a tie. pass causes degenerative behaviour. */
101 if (nbests == 1 && is_pass(nbest[0]->coord)) {
102 nbests--;
104 nbest[nbests++] = ni;
107 return nbest[fast_random(nbests)];
110 void
111 ucb1_update(struct uct_policy *p, struct tree *tree, struct tree_node *node, enum stone node_color, enum stone player_color, struct playout_amafmap *map, int result)
113 /* It is enough to iterate by a single chain; we will
114 * update all the preceding positions properly since
115 * they had to all occur in all branches, only in
116 * different order. */
117 for (; node; node = node->parent) {
118 node->u.playouts++;
119 node->u.wins += result;
120 tree_update_node_value(node, false);
125 struct uct_policy *
126 policy_ucb1_init(struct uct *u, char *arg)
128 struct uct_policy *p = calloc(1, sizeof(*p));
129 struct ucb1_policy *b = calloc(1, sizeof(*b));
130 p->uct = u;
131 p->data = b;
132 p->descend = ucb1_descend;
133 p->choose = ucb1_choose;
134 p->update = ucb1_update;
136 b->explore_p = 0.2;
137 b->fpu = 1.1; //INFINITY;
139 if (arg) {
140 char *optspec, *next = arg;
141 while (*next) {
142 optspec = next;
143 next += strcspn(next, ":");
144 if (*next) { *next++ = 0; } else { *next = 0; }
146 char *optname = optspec;
147 char *optval = strchr(optspec, '=');
148 if (optval) *optval++ = 0;
150 if (!strcasecmp(optname, "explore_p") && optval) {
151 b->explore_p = atof(optval);
152 } else if (!strcasecmp(optname, "fpu") && optval) {
153 b->fpu = atof(optval);
154 } else if (!strcasecmp(optname, "urg_randoma") && optval) {
155 b->urg_randoma = atoi(optval);
156 } else if (!strcasecmp(optname, "urg_randomm") && optval) {
157 b->urg_randomm = atoi(optval);
158 } else {
159 fprintf(stderr, "ucb1: Invalid policy argument %s or missing value\n", optname);
164 return p;