11 #include "uct/internal.h"
14 /* This implements the basic UCB1 policy. */
17 /* This is what the Modification of UCT with Patterns in Monte Carlo Go
18 * paper calls 'p'. Original UCB has this on 2, but this seems to
19 * produce way too wide searches; reduce this to get deeper and
20 * narrower readouts - try 0.2. */
22 /* First Play Urgency - if set to less than infinity (the MoGo paper
23 * above reports 1.0 as the best), new branches are explored only
24 * if none of the existing ones has higher urgency than fpu. */
26 int urg_randoma
, urg_randomm
;
31 ucb1_choose(struct uct_policy
*p
, struct tree_node
*node
, struct board
*b
, enum stone color
)
33 struct tree_node
*nbest
= NULL
;
34 for (struct tree_node
*ni
= node
->children
; ni
; ni
= ni
->sibling
)
35 // we compare playouts and choose the best-explored
36 // child; comparing values is more brittle
37 if (!nbest
|| ni
->u
.playouts
> nbest
->u
.playouts
) {
38 /* Play pass only if we can afford scoring */
39 if (is_pass(ni
->coord
)) {
40 float score
= board_official_score(b
);
43 //fprintf(stderr, "%d score %f\n", color, score);
54 ucb1_descend(struct uct_policy
*p
, struct tree
*tree
, struct tree_node
*node
, int parity
, bool allow_pass
)
56 /* We want to count in the prior stats here after all. Otherwise,
57 * nodes with positive prior will get explored _LESS_ since the
58 * urgency will be always higher; even with normal FPU because
59 * of the explore coefficient. */
61 struct ucb1_policy
*b
= p
->data
;
62 float xpl
= log(node
->u
.playouts
+ node
->prior
.playouts
) * b
->explore_p
;
64 // XXX: Stack overflow danger on big boards?
65 struct tree_node
*nbest
[512] = { node
->children
}; int nbests
= 1;
66 float best_urgency
= -9999;
68 for (struct tree_node
*ni
= node
->children
; ni
; ni
= ni
->sibling
) {
69 /* Do not consider passing early. */
70 if (likely(!allow_pass
) && unlikely(is_pass(ni
->coord
)))
72 int uct_playouts
= ni
->u
.playouts
+ ni
->prior
.playouts
;
73 ni
->prior
.value
= (float)ni
->prior
.wins
/ ni
->prior
.playouts
;
75 /* XXX: We later compare urgency with best_urgency; this can
76 * be difficult given that urgency can be in register with
77 * higher precision than best_urgency, thus even though
78 * the numbers are in fact the same, urgency will be
79 * slightly higher (or lower). Thus, we declare urgency
80 * as volatile, attempting to force the compiler to keep
81 * everything as a float. Ideally, we should do some random
82 * __FLT_EPSILON__ magic instead. */
83 volatile float urgency
= uct_playouts
? tree_node_get_value(tree
, ni
, u
, parity
) + sqrt(xpl
/ uct_playouts
) : b
->fpu
;
87 struct board b2
; b2
.size
= 9+2;
88 fprintf(stderr
, "[%s -> %s] UCB1 urgency %f (%f + %f : %f)\n", coord2sstr(node
->coord
, &b2
), coord2sstr(ni
->coord
, &b2
), urgency
, ni
->u
.value
, sqrt(xpl
/ ni
->u
.playouts
), b
->fpu
);
92 urgency
+= (float)(fast_random(b
->urg_randoma
) - b
->urg_randoma
/ 2) / 1000;
94 urgency
*= (float)(fast_random(b
->urg_randomm
) + 5) / b
->urg_randomm
;
95 if (urgency
> best_urgency
) {
96 best_urgency
= urgency
; nbests
= 0;
98 if (urgency
>= best_urgency
) {
99 /* We want to always choose something else than a pass
100 * in case of a tie. pass causes degenerative behaviour. */
101 if (nbests
== 1 && is_pass(nbest
[0]->coord
)) {
104 nbest
[nbests
++] = ni
;
107 return nbest
[fast_random(nbests
)];
111 ucb1_update(struct uct_policy
*p
, struct tree
*tree
, struct tree_node
*node
, enum stone node_color
, enum stone player_color
, struct playout_amafmap
*map
, int result
)
113 /* It is enough to iterate by a single chain; we will
114 * update all the preceding positions properly since
115 * they had to all occur in all branches, only in
116 * different order. */
117 for (; node
; node
= node
->parent
) {
119 node
->u
.wins
+= result
;
120 tree_update_node_value(node
, false);
126 policy_ucb1_init(struct uct
*u
, char *arg
)
128 struct uct_policy
*p
= calloc(1, sizeof(*p
));
129 struct ucb1_policy
*b
= calloc(1, sizeof(*b
));
132 p
->descend
= ucb1_descend
;
133 p
->choose
= ucb1_choose
;
134 p
->update
= ucb1_update
;
137 b
->fpu
= 1.1; //INFINITY;
140 char *optspec
, *next
= arg
;
143 next
+= strcspn(next
, ":");
144 if (*next
) { *next
++ = 0; } else { *next
= 0; }
146 char *optname
= optspec
;
147 char *optval
= strchr(optspec
, '=');
148 if (optval
) *optval
++ = 0;
150 if (!strcasecmp(optname
, "explore_p") && optval
) {
151 b
->explore_p
= atof(optval
);
152 } else if (!strcasecmp(optname
, "fpu") && optval
) {
153 b
->fpu
= atof(optval
);
154 } else if (!strcasecmp(optname
, "urg_randoma") && optval
) {
155 b
->urg_randoma
= atoi(optval
);
156 } else if (!strcasecmp(optname
, "urg_randomm") && optval
) {
157 b
->urg_randomm
= atoi(optval
);
159 fprintf(stderr
, "ucb1: Invalid policy argument %s or missing value\n", optname
);