11 #include "uct/internal.h"
14 /* This implements the basic UCB1 policy. */
17 /* This is what the Modification of UCT with Patterns in Monte Carlo Go
18 * paper calls 'p'. Original UCB has this on 2, but this seems to
19 * produce way too wide searches; reduce this to get deeper and
20 * narrower readouts - try 0.2. */
22 /* First Play Urgency - if set to less than infinity (the MoGo paper
23 * above reports 1.0 as the best), new branches are explored only
24 * if none of the existing ones has higher urgency than fpu. */
26 int urg_randoma
, urg_randomm
;
31 ucb1_choose(struct uct_policy
*p
, struct tree_node
*node
, struct board
*b
, enum stone color
)
33 struct tree_node
*nbest
= NULL
;
34 for (struct tree_node
*ni
= node
->children
; ni
; ni
= ni
->sibling
)
35 // we compare playouts and choose the best-explored
36 // child; comparing values is more brittle
37 if (!nbest
|| ni
->u
.playouts
> nbest
->u
.playouts
) {
38 /* Play pass only if we can afford scoring */
39 if (is_pass(ni
->coord
)) {
40 float score
= board_official_score(b
);
43 //fprintf(stderr, "%d score %f\n", color, score);
54 ucb1_descend(struct uct_policy
*p
, struct tree
*tree
, struct tree_node
*node
, int parity
, bool allow_pass
)
56 /* We want to count in the prior stats here after all. Otherwise,
57 * nodes with positive prior will get explored _LESS_ since the
58 * urgency will be always higher; even with normal FPU because
59 * of the explore coefficient. */
61 struct ucb1_policy
*b
= p
->data
;
62 float xpl
= log(node
->u
.playouts
+ node
->prior
.playouts
) * b
->explore_p
;
64 // XXX: Stack overflow danger on big boards?
65 struct tree_node
*nbest
[512] = { node
->children
}; int nbests
= 1;
66 float best_urgency
= -9999;
68 for (struct tree_node
*ni
= node
->children
; ni
; ni
= ni
->sibling
) {
69 /* Do not consider passing early. */
70 if (likely(!allow_pass
) && unlikely(is_pass(ni
->coord
)))
72 int uct_playouts
= ni
->u
.playouts
+ ni
->prior
.playouts
;
74 float urgency
= uct_playouts
? tree_node_get_value(tree
, ni
, u
, parity
) + sqrt(xpl
/ uct_playouts
) : b
->fpu
;
78 struct board b2
; b2
.size
= 9+2;
79 fprintf(stderr
, "[%s -> %s] UCB1 urgency %f (%f + %f : %f)\n", coord2sstr(node
->coord
, &b2
), coord2sstr(ni
->coord
, &b2
), urgency
, ni
->u
.value
, sqrt(xpl
/ ni
->u
.playouts
), b
->fpu
);
83 urgency
+= (float)(fast_random(b
->urg_randoma
) - b
->urg_randoma
/ 2) / 1000;
85 urgency
*= (float)(fast_random(b
->urg_randomm
) + 5) / b
->urg_randomm
;
86 if (urgency
- best_urgency
> __FLT_EPSILON__
) { // urgency > best_urgency
87 best_urgency
= urgency
; nbests
= 0;
89 if (urgency
- best_urgency
> -__FLT_EPSILON__
) { // urgency >= best_urgency
90 /* We want to always choose something else than a pass
91 * in case of a tie. pass causes degenerative behaviour. */
92 if (nbests
== 1 && is_pass(nbest
[0]->coord
)) {
98 return nbest
[fast_random(nbests
)];
102 ucb1_update(struct uct_policy
*p
, struct tree
*tree
, struct tree_node
*node
, enum stone node_color
, enum stone player_color
, struct playout_amafmap
*map
, int result
)
104 /* It is enough to iterate by a single chain; we will
105 * update all the preceding positions properly since
106 * they had to all occur in all branches, only in
107 * different order. */
108 for (; node
; node
= node
->parent
) {
110 node
->u
.wins
+= result
;
111 tree_update_node_value(node
, false);
117 policy_ucb1_init(struct uct
*u
, char *arg
)
119 struct uct_policy
*p
= calloc(1, sizeof(*p
));
120 struct ucb1_policy
*b
= calloc(1, sizeof(*b
));
123 p
->descend
= ucb1_descend
;
124 p
->choose
= ucb1_choose
;
125 p
->update
= ucb1_update
;
128 b
->fpu
= 1.1; //INFINITY;
131 char *optspec
, *next
= arg
;
134 next
+= strcspn(next
, ":");
135 if (*next
) { *next
++ = 0; } else { *next
= 0; }
137 char *optname
= optspec
;
138 char *optval
= strchr(optspec
, '=');
139 if (optval
) *optval
++ = 0;
141 if (!strcasecmp(optname
, "explore_p") && optval
) {
142 b
->explore_p
= atof(optval
);
143 } else if (!strcasecmp(optname
, "fpu") && optval
) {
144 b
->fpu
= atof(optval
);
145 } else if (!strcasecmp(optname
, "urg_randoma") && optval
) {
146 b
->urg_randoma
= atoi(optval
);
147 } else if (!strcasecmp(optname
, "urg_randomm") && optval
) {
148 b
->urg_randomm
= atoi(optval
);
150 fprintf(stderr
, "ucb1: Invalid policy argument %s or missing value\n", optname
);