11 #include "uct/internal.h"
13 #include "uct/policy/generic.h"
15 /* This implements the basic UCB1 policy. */
18 /* This is what the Modification of UCT with Patterns in Monte Carlo Go
19 * paper calls 'p'. Original UCB has this on 2, but this seems to
20 * produce way too wide searches; reduce this to get deeper and
21 * narrower readouts - try 0.2. */
23 /* First Play Urgency - if set to less than infinity (the MoGo paper
24 * above reports 1.0 as the best), new branches are explored only
25 * if none of the existing ones has higher urgency than fpu. */
31 ucb1_descend(struct uct_policy
*p
, struct tree
*tree
, struct uct_descent
*descent
, int parity
, bool allow_pass
)
33 /* We want to count in the prior stats here after all. Otherwise,
34 * nodes with positive prior will get explored _LESS_ since the
35 * urgency will be always higher; even with normal FPU because
36 * of the explore coefficient. */
38 struct ucb1_policy
*b
= p
->data
;
39 floating_t xpl
= log(descent
->node
->u
.playouts
+ descent
->node
->prior
.playouts
);
41 uctd_try_node_children(tree
, descent
, allow_pass
, parity
, p
->uct
->tenuki_d
, di
, urgency
) {
42 struct tree_node
*ni
= di
.node
;
43 int uct_playouts
= ni
->u
.playouts
+ ni
->prior
.playouts
+ ni
->descents
;
45 /* XXX: We don't take local-tree information into account. */
48 urgency
= (ni
->u
.playouts
* tree_node_get_value(tree
, parity
, ni
->u
.value
)
49 + ni
->prior
.playouts
* tree_node_get_value(tree
, parity
, ni
->prior
.value
))
50 + (parity
> 0 ? 0 : ni
->descents
)
52 urgency
+= b
->explore_p
* sqrt(xpl
/ uct_playouts
);
56 } uctd_set_best_child(di
, urgency
);
58 uctd_get_best_child(descent
);
62 ucb1_update(struct uct_policy
*p
, struct tree
*tree
, struct tree_node
*node
, enum stone node_color
, enum stone player_color
, struct playout_amafmap
*map
, struct board
*final_board
, floating_t result
)
64 /* It is enough to iterate by a single chain; we will
65 * update all the preceding positions properly since
66 * they had to all occur in all branches, only in
68 enum stone winner_color
= result
> 0.5 ? S_BLACK
: S_WHITE
;
70 for (; node
; node
= node
->parent
) {
71 stats_add_result(&node
->u
, result
, 1);
73 if (!is_pass(node_coord(node
))) {
74 stats_add_result(&node
->winner_owner
, board_at(final_board
, node_coord(node
)) == winner_color
? 1.0 : 0.0, 1);
75 stats_add_result(&node
->black_owner
, board_at(final_board
, node_coord(node
)) == S_BLACK
? 1.0 : 0.0, 1);
81 ucb1_done(struct uct_policy
*p
)
88 policy_ucb1_init(struct uct
*u
, char *arg
)
90 struct uct_policy
*p
= calloc2(1, sizeof(*p
));
91 struct ucb1_policy
*b
= calloc2(1, sizeof(*b
));
95 p
->descend
= ucb1_descend
;
96 p
->choose
= uctp_generic_choose
;
97 p
->update
= ucb1_update
;
100 b
->fpu
= 1.1; //INFINITY;
103 char *optspec
, *next
= arg
;
106 next
+= strcspn(next
, ":");
107 if (*next
) { *next
++ = 0; } else { *next
= 0; }
109 char *optname
= optspec
;
110 char *optval
= strchr(optspec
, '=');
111 if (optval
) *optval
++ = 0;
113 if (!strcasecmp(optname
, "explore_p") && optval
) {
114 b
->explore_p
= atof(optval
);
115 } else if (!strcasecmp(optname
, "fpu") && optval
) {
116 b
->fpu
= atof(optval
);
118 fprintf(stderr
, "ucb1: Invalid policy argument %s or missing value\n",