tree_update_node_rvalue(): Introduce; honor amaf_prior setting in _value()
[pachi.git] / uct / policy / ucb1tuned.c
blob7a8c91f1d606315015173cdcf021ce2020cbb234
1 #include <assert.h>
2 #include <math.h>
3 #include <stdio.h>
4 #include <stdlib.h>
5 #include <string.h>
7 #include "board.h"
8 #include "debug.h"
9 #include "move.h"
10 #include "uct/internal.h"
11 #include "uct/tree.h"
13 /* This implements the UCB1-TUNED policy. */
/* Tunable parameters of the UCB1-TUNED policy. An instance is allocated by
 * policy_ucb1tuned_init() and stored in the policy's data pointer. */
struct ucb1_policy_tuned {
	/* This is what the "Modification of UCT with Patterns in Monte-Carlo
	 * Go" paper calls 'p'. Original UCB sets this to 2, but that seems to
	 * produce way too wide searches; reduce this to get deeper and
	 * narrower readouts - try 0.2. */
	float explore_p;
	/* Set to INFINITY by policy_ucb1tuned_init().
	 * NOTE(review): presumably the "first play urgency", i.e. the urgency
	 * assigned to children that have no playouts yet - confirm. */
	float fpu;
};
/* Not defined in this file (presumably provided by the plain UCB1 policy -
 * confirm); policy_ucb1tuned_init() installs it as the 'choose' hook. */
extern struct tree_node *
ucb1_choose(struct uct_policy *p, struct tree_node *node,
            struct board *b, enum stone color);
27 struct tree_node *
28 ucb1tuned_descend(struct uct_policy *p, struct tree *tree, struct tree_node *node, int parity, bool allow_pass)
30 struct ucb1_policy_tuned *b = p->data;
31 float xpl = log(node->u.playouts) * b->explore_p;
33 struct tree_node *nbest = node->children;
34 float best_urgency = -9999;
35 for (struct tree_node *ni = node->children; ni; ni = ni->sibling) {
36 /* Do not consider passing early. */
37 if (likely(!allow_pass) && unlikely(is_pass(ni->coord)))
38 continue;
39 float xpl_loc = (ni->u.value - ni->u.value * ni->u.value);
40 if (tree_parity(tree, parity) < 0) xpl_loc = 1 - xpl_loc;
41 xpl_loc += sqrt(xpl / ni->u.playouts);
42 if (xpl_loc > 1.0/4) xpl_loc = 1.0/4;
43 float urgency = tree_node_get_value(tree, ni, u, parity) + sqrt(xpl * xpl_loc / ni->u.playouts);
44 if (urgency > best_urgency) {
45 best_urgency = urgency;
46 nbest = ni;
49 return nbest;
/* Not defined in this file (presumably provided by the plain UCB1 policy -
 * confirm); policy_ucb1tuned_init() installs it as the 'update' hook. */
extern void
ucb1_update(struct uct_policy *p, struct tree *tree, struct tree_node *node,
            enum stone node_color, enum stone player_color,
            struct playout_amafmap *map, int result);
55 struct uct_policy *
56 policy_ucb1tuned_init(struct uct *u, char *arg)
58 struct uct_policy *p = calloc(1, sizeof(*p));
59 struct ucb1_policy_tuned *b = calloc(1, sizeof(*b));
60 p->uct = u;
61 p->data = b;
62 p->descend = ucb1tuned_descend;
63 p->choose = ucb1_choose;
64 p->update = ucb1_update;
66 b->explore_p = 0.2;
67 b->fpu = INFINITY;
69 if (arg) {
70 char *optspec, *next = arg;
71 while (*next) {
72 optspec = next;
73 next += strcspn(next, ":");
74 if (*next) { *next++ = 0; } else { *next = 0; }
76 char *optname = optspec;
77 char *optval = strchr(optspec, '=');
78 if (optval) *optval++ = 0;
80 if (!strcasecmp(optname, "explore_p")) {
81 b->explore_p = atof(optval);
82 } else {
83 fprintf(stderr, "ucb1tuned: Invalid policy argument %s or missing value\n", optname);
88 return p;