10 #include "uct/internal.h"
13 /* This implements the basic UCB1 policy. */
16 /* This is what the Modification of UCT with Patterns in Monte Carlo Go
17 * paper calls 'p'. Original UCB has this on 2, but this seems to
18 * produce way too wide searches; reduce this to get deeper and
19 * narrower readouts - try 0.2. */
21 /* First Play Urgency - if set to less than infinity (the MoGo paper
22 * above reports 1.0 as the best), new branches are explored only
23 * if none of the existing ones has higher urgency than fpu. */
25 /* Equivalent experience for prior knowledge. MoGo paper recommends
26 * 50 playouts per source. */
32 ucb1_choose(struct uct_policy
*p
, struct tree_node
*node
, struct board
*b
, enum stone color
)
34 struct tree_node
*nbest
= NULL
;
35 for (struct tree_node
*ni
= node
->children
; ni
; ni
= ni
->sibling
)
36 // we compare playouts and choose the best-explored
37 // child; comparing values is more brittle
38 if (!nbest
|| ni
->u
.playouts
> nbest
->u
.playouts
) {
39 /* Play pass only if we can afford scoring */
40 if (is_pass(ni
->coord
)) {
41 float score
= board_official_score(b
);
44 //fprintf(stderr, "%d score %f\n", color, score);
55 ucb1_descend(struct uct_policy
*p
, struct tree
*tree
, struct tree_node
*node
, int parity
, bool allow_pass
)
57 struct ucb1_policy
*b
= p
->data
;
58 float xpl
= log(node
->u
.playouts
) * b
->explore_p
;
60 struct tree_node
*nbest
= node
->children
;
61 float best_urgency
= -9999;
62 for (struct tree_node
*ni
= node
->children
; ni
; ni
= ni
->sibling
) {
63 /* Do not consider passing early. */
64 if (likely(!allow_pass
) && unlikely(is_pass(ni
->coord
)))
66 ni
->prior
.value
= (float)ni
->prior
.wins
/ ni
->prior
.playouts
;
67 float priorp
= (parity
> 0 ? ni
->prior
.value
: 1- ni
->prior
.value
);
68 float urgency
= ni
->u
.playouts
? (parity
> 0 ? ni
->u
.value
: 1 - ni
->u
.value
) + sqrt(xpl
/ ni
->u
.playouts
) : ni
->prior
.playouts
? priorp
: b
->fpu
;
71 struct board b2
; b2
.size
= 9;
72 fprintf(stderr
, "[%s -> %s] UCB1 urgency %f (%f + %f : %f : %f)\n", coord2sstr(node
->coord
, &b2
), coord2sstr(ni
->coord
, &b2
), urgency
, ni
->u
.value
, sqrt(xpl
/ ni
->u
.playouts
), priorp
, b
->fpu
);
75 if (urgency
> best_urgency
) {
76 best_urgency
= urgency
;
84 ucb1_prior(struct uct_policy
*p
, struct tree
*tree
, struct tree_node
*node
, struct board
*b
, enum stone color
, int parity
)
86 /* Initialization of UCT values based on prior knowledge */
87 struct ucb1_policy
*pp
= p
->data
;
91 /* This somehow does not work at all. */
92 node
->prior
.playouts
+= p
->eqex
;
93 node
->prior
.wins
+= p
->eqex
/ 2;
97 if (node
->parent
&& node
->parent
->parent
&& node
->parent
->parent
->parent
) {
98 struct tree_node
*gpp
= node
->parent
->parent
->parent
;
99 for (struct tree_node
*ni
= gpp
->children
; ni
; ni
= ni
->sibling
) {
100 /* Be careful not to emphasize too random results. */
101 if (ni
->coord
== node
->coord
&& ni
->u
.playouts
> pp
->eqex
) {
102 node
->prior
.playouts
+= pp
->eqex
;
103 node
->prior
.wins
+= pp
->eqex
* ni
->u
.wins
/ ni
->u
.playouts
;
109 /* Q_{playout-policy} */
111 struct playout_policy
*playout
= p
->uct
->playout
;
112 if (playout
->assess
) {
113 struct move m
= { node
->coord
, color
};
114 assess
= playout
->assess(playout
, b
, &m
);
116 if (!isnan(assess
)) {
119 node
->prior
.playouts
+= pp
->eqex
;
120 node
->prior
.wins
+= pp
->eqex
* assess
;
124 if (node
->prior
.playouts
) {
125 node
->prior
.value
= (float) node
->prior
.wins
/ node
->prior
.playouts
;
126 tree_update_node_value(node
, true);
129 //fprintf(stderr, "%s,%s prior: %d/%d = %f (%f)\n", coord2sstr(node->parent->coord, b), coord2sstr(node->coord, b), node->prior.wins, node->prior.playouts, node->prior.value, assess);
133 ucb1_update(struct uct_policy
*p
, struct tree_node
*node
, enum stone color
, struct playout_amafmap
*map
, int result
)
135 /* It is enough to iterate by a single chain; we will
136 * update all the preceding positions properly since
137 * they had to all occur in all branches, only in
138 * different order. */
139 for (; node
; node
= node
->parent
) {
141 node
->u
.wins
+= result
;
142 tree_update_node_value(node
, true);
148 policy_ucb1_init(struct uct
*u
, char *arg
)
150 struct uct_policy
*p
= calloc(1, sizeof(*p
));
151 struct ucb1_policy
*b
= calloc(1, sizeof(*b
));
154 p
->descend
= ucb1_descend
;
155 p
->choose
= ucb1_choose
;
156 p
->update
= ucb1_update
;
162 char *optspec
, *next
= arg
;
165 next
+= strcspn(next
, ":");
166 if (*next
) { *next
++ = 0; } else { *next
= 0; }
168 char *optname
= optspec
;
169 char *optval
= strchr(optspec
, '=');
170 if (optval
) *optval
++ = 0;
172 if (!strcasecmp(optname
, "explore_p") && optval
) {
173 b
->explore_p
= atof(optval
);
174 } else if (!strcasecmp(optname
, "prior")) {
175 b
->eqex
= optval
? atoi(optval
) : 50;
177 p
->prior
= ucb1_prior
;
178 } else if (!strcasecmp(optname
, "fpu") && optval
) {
179 b
->fpu
= atof(optval
);
181 fprintf(stderr
, "ucb1: Invalid policy argument %s or missing value\n", optname
);