Allow double for all floating point values in large configurations.
[pachi/ann.git] / uct / policy / ucb1.c
blob679ec233dcd42c043eed3e60a3a0a3226f1aaf19
1 #include <assert.h>
2 #include <math.h>
3 #include <stdio.h>
4 #include <stdlib.h>
5 #include <string.h>
7 #include "board.h"
8 #include "debug.h"
9 #include "move.h"
10 #include "random.h"
11 #include "uct/internal.h"
12 #include "uct/tree.h"
13 #include "uct/policy/generic.h"
15 /* This implements the basic UCB1 policy. */
17 struct ucb1_policy {
18 /* This is what the Modification of UCT with Patterns in Monte Carlo Go
19 * paper calls 'p'. Original UCB has this on 2, but this seems to
20 * produce way too wide searches; reduce this to get deeper and
21 * narrower readouts - try 0.2. */
22 floating_t explore_p;
23 /* First Play Urgency - if set to less than infinity (the MoGo paper
24 * above reports 1.0 as the best), new branches are explored only
25 * if none of the existing ones has higher urgency than fpu. */
26 floating_t fpu;
30 void
31 ucb1_descend(struct uct_policy *p, struct tree *tree, struct uct_descent *descent, int parity, bool allow_pass)
33 /* We want to count in the prior stats here after all. Otherwise,
34 * nodes with positive prior will get explored _LESS_ since the
35 * urgency will be always higher; even with normal FPU because
36 * of the explore coefficient. */
38 struct ucb1_policy *b = p->data;
39 floating_t xpl = log(descent->node->u.playouts + descent->node->prior.playouts);
41 uctd_try_node_children(tree, descent, allow_pass, parity, p->uct->tenuki_d, di, urgency) {
42 struct tree_node *ni = di.node;
43 int uct_playouts = ni->u.playouts + ni->prior.playouts;
45 /* XXX: We don't take local-tree information into account. */
47 if (uct_playouts) {
48 urgency = (ni->u.playouts * tree_node_get_value(tree, parity, ni->u.value)
49 + ni->prior.playouts * tree_node_get_value(tree, parity, ni->prior.value))
50 / uct_playouts;
51 urgency += b->explore_p * sqrt(xpl / uct_playouts);
52 } else {
53 urgency = b->fpu;
55 } uctd_set_best_child(di, urgency);
57 uctd_get_best_child(descent);
60 void
61 ucb1_update(struct uct_policy *p, struct tree *tree, struct tree_node *node, enum stone node_color, enum stone player_color, struct playout_amafmap *map, floating_t result)
63 /* It is enough to iterate by a single chain; we will
64 * update all the preceding positions properly since
65 * they had to all occur in all branches, only in
66 * different order. */
67 for (; node; node = node->parent) {
68 stats_add_result(&node->u, result, 1);
73 struct uct_policy *
74 policy_ucb1_init(struct uct *u, char *arg)
76 struct uct_policy *p = calloc2(1, sizeof(*p));
77 struct ucb1_policy *b = calloc2(1, sizeof(*b));
78 p->uct = u;
79 p->data = b;
80 p->descend = ucb1_descend;
81 p->choose = uctp_generic_choose;
82 p->update = ucb1_update;
84 b->explore_p = 0.2;
85 b->fpu = 1.1; //INFINITY;
87 if (arg) {
88 char *optspec, *next = arg;
89 while (*next) {
90 optspec = next;
91 next += strcspn(next, ":");
92 if (*next) { *next++ = 0; } else { *next = 0; }
94 char *optname = optspec;
95 char *optval = strchr(optspec, '=');
96 if (optval) *optval++ = 0;
98 if (!strcasecmp(optname, "explore_p") && optval) {
99 b->explore_p = atof(optval);
100 } else if (!strcasecmp(optname, "fpu") && optval) {
101 b->fpu = atof(optval);
102 } else {
103 fprintf(stderr, "ucb1: Invalid policy argument %s or missing value\n",
104 optname);
105 exit(1);
110 return p;