11 #include "uct/internal.h"
14 /* This implements the basic UCB1 policy. */
17 /* This is what the Modification of UCT with Patterns in Monte Carlo Go
18 * paper calls 'p'. Original UCB has this on 2, but this seems to
19 * produce way too wide searches; reduce this to get deeper and
20 * narrower readouts - try 0.2. */
22 /* First Play Urgency - if set to less than infinity (the MoGo paper
23 * above reports 1.0 as the best), new branches are explored only
24 * if none of the existing ones has higher urgency than fpu. */
26 /* Equivalent experience for prior knowledge. MoGo paper recommends
27 * 50 playouts per source. */
28 int eqex
, even_eqex
, gp_eqex
, policy_eqex
;
29 int urg_randoma
, urg_randomm
;
34 ucb1_choose(struct uct_policy
*p
, struct tree_node
*node
, struct board
*b
, enum stone color
)
36 struct tree_node
*nbest
= NULL
;
37 for (struct tree_node
*ni
= node
->children
; ni
; ni
= ni
->sibling
)
38 // we compare playouts and choose the best-explored
39 // child; comparing values is more brittle
40 if (!nbest
|| ni
->u
.playouts
> nbest
->u
.playouts
) {
41 /* Play pass only if we can afford scoring */
42 if (is_pass(ni
->coord
)) {
43 float score
= board_official_score(b
);
46 //fprintf(stderr, "%d score %f\n", color, score);
57 ucb1_descend(struct uct_policy
*p
, struct tree
*tree
, struct tree_node
*node
, int parity
, bool allow_pass
)
59 /* We want to count in the prior stats here after all. Otherwise,
60 * nodes with positive prior will get explored _LESS_ since the
61 * urgency will be always higher; even with normal FPU because
62 * of the explore coefficient. */
64 struct ucb1_policy
*b
= p
->data
;
65 float xpl
= log(node
->u
.playouts
+ node
->prior
.playouts
) * b
->explore_p
;
67 struct tree_node
*nbest
= node
->children
;
68 float best_urgency
= -9999;
69 for (struct tree_node
*ni
= node
->children
; ni
; ni
= ni
->sibling
) {
70 /* Do not consider passing early. */
71 if (likely(!allow_pass
) && unlikely(is_pass(ni
->coord
)))
73 int uct_playouts
= ni
->u
.playouts
+ ni
->prior
.playouts
;
74 ni
->prior
.value
= (float)ni
->prior
.wins
/ ni
->prior
.playouts
;
75 float urgency
= uct_playouts
? (parity
> 0 ? ni
->u
.value
: 1 - ni
->u
.value
) + sqrt(xpl
/ uct_playouts
) : b
->fpu
;
79 struct board b2
; b2
.size
= 9+2;
80 fprintf(stderr
, "[%s -> %s] UCB1 urgency %f (%f + %f : %f)\n", coord2sstr(node
->coord
, &b2
), coord2sstr(ni
->coord
, &b2
), urgency
, ni
->u
.value
, sqrt(xpl
/ ni
->u
.playouts
), b
->fpu
);
84 urgency
+= (float)(fast_random(b
->urg_randoma
) - b
->urg_randoma
/ 2) / 1000;
86 urgency
*= (float)(fast_random(b
->urg_randomm
) + 5) / b
->urg_randomm
;
87 if (urgency
> best_urgency
) {
88 best_urgency
= urgency
;
96 ucb1_prior(struct uct_policy
*p
, struct tree
*tree
, struct tree_node
*node
, struct board
*b
, enum stone color
, int parity
)
98 /* Initialization of UCT values based on prior knowledge */
99 struct ucb1_policy
*pp
= p
->data
;
102 /* This may be dubious for normal UCB1 but is essential for
103 * reading stability of RAVE, it appears. */
105 node
->prior
.playouts
+= pp
->even_eqex
;
106 node
->prior
.wins
+= pp
->even_eqex
/ 2;
109 /* Q_{grandparent} */
110 if (pp
->gp_eqex
&& node
->parent
&& node
->parent
->parent
&& node
->parent
->parent
->parent
) {
111 struct tree_node
*gpp
= node
->parent
->parent
->parent
;
112 for (struct tree_node
*ni
= gpp
->children
; ni
; ni
= ni
->sibling
) {
113 /* Be careful not to emphasize too random results. */
114 if (ni
->coord
== node
->coord
&& ni
->u
.playouts
> pp
->gp_eqex
) {
115 node
->prior
.playouts
+= pp
->gp_eqex
;
116 node
->prior
.wins
+= pp
->gp_eqex
* ni
->u
.wins
/ ni
->u
.playouts
;
122 /* Q_{playout-policy} */
123 if (pp
->policy_eqex
) {
125 struct playout_policy
*playout
= p
->uct
->playout
;
126 if (playout
->assess
) {
127 struct move m
= { node
->coord
, color
};
128 assess
= playout
->assess(playout
, b
, &m
);
130 if (!isnan(assess
)) {
132 /* Good moves for enemy are losses for us.
133 * We will properly maximize this in the UCB1
137 node
->prior
.playouts
+= pp
->policy_eqex
;
138 node
->prior
.wins
+= pp
->policy_eqex
* assess
;
143 if (node
->prior
.playouts
) {
144 node
->prior
.value
= (float) node
->prior
.wins
/ node
->prior
.playouts
;
145 tree_update_node_value(node
);
148 //fprintf(stderr, "%s,%s prior: %d/%d = %f (%f)\n", coord2sstr(node->parent->coord, b), coord2sstr(node->coord, b), node->prior.wins, node->prior.playouts, node->prior.value, assess);
152 ucb1_update(struct uct_policy
*p
, struct tree
*tree
, struct tree_node
*node
, enum stone node_color
, enum stone player_color
, struct playout_amafmap
*map
, int result
)
154 /* It is enough to iterate by a single chain; we will
155 * update all the preceding positions properly since
156 * they had to all occur in all branches, only in
157 * different order. */
158 for (; node
; node
= node
->parent
) {
160 node
->u
.wins
+= result
;
161 tree_update_node_value(node
);
167 policy_ucb1_init(struct uct
*u
, char *arg
)
169 struct uct_policy
*p
= calloc(1, sizeof(*p
));
170 struct ucb1_policy
*b
= calloc(1, sizeof(*b
));
173 p
->descend
= ucb1_descend
;
174 p
->choose
= ucb1_choose
;
175 p
->update
= ucb1_update
;
180 b
->gp_eqex
= b
->policy_eqex
= -1;
184 char *optspec
, *next
= arg
;
187 next
+= strcspn(next
, ":");
188 if (*next
) { *next
++ = 0; } else { *next
= 0; }
190 char *optname
= optspec
;
191 char *optval
= strchr(optspec
, '=');
192 if (optval
) *optval
++ = 0;
194 if (!strcasecmp(optname
, "explore_p") && optval
) {
195 b
->explore_p
= atof(optval
);
196 } else if (!strcasecmp(optname
, "prior")) {
198 b
->eqex
= atoi(optval
);
199 } else if (!strcasecmp(optname
, "prior_even") && optval
) {
200 b
->even_eqex
= atoi(optval
);
201 } else if (!strcasecmp(optname
, "prior_gp") && optval
) {
202 b
->gp_eqex
= atoi(optval
);
203 } else if (!strcasecmp(optname
, "prior_policy") && optval
) {
204 b
->policy_eqex
= atoi(optval
);
205 } else if (!strcasecmp(optname
, "fpu") && optval
) {
206 b
->fpu
= atof(optval
);
207 } else if (!strcasecmp(optname
, "urg_randoma") && optval
) {
208 b
->urg_randoma
= atoi(optval
);
209 } else if (!strcasecmp(optname
, "urg_randomm") && optval
) {
210 b
->urg_randomm
= atoi(optval
);
212 fprintf(stderr
, "ucb1: Invalid policy argument %s or missing value\n", optname
);
217 if (b
->eqex
) p
->prior
= ucb1_prior
;
218 if (b
->even_eqex
< 0) b
->even_eqex
= b
->eqex
;
219 if (b
->gp_eqex
< 0) b
->gp_eqex
= b
->eqex
;
220 if (b
->policy_eqex
< 0) b
->policy_eqex
= b
->eqex
;