UCT: Add ucb1tuned policy
[pachi/peepo.git] / uct / policy / ucb1tuned.c
blob1ca8b26d6c7e152f60bb6d5208e71a2acfedb32e
1 #include <assert.h>
2 #include <math.h>
3 #include <stdio.h>
4 #include <stdlib.h>
5 #include <string.h>
7 #include "board.h"
8 #include "debug.h"
9 #include "move.h"
10 #include "uct/internal.h"
11 #include "uct/tree.h"
13 /* This implements the UCB1-TUNED policy. */
15 struct ucb1_policy {
16 /* This is what the Modification of UCT with Patterns in Monte Carlo Go
17 * paper calls 'p'. Original UCB has this on 2, but this seems to
18 * produce way too wide searches; reduce this to get deeper and
19 * narrower readouts - try 0.2. */
20 float explore_p;
24 static struct tree_node *
25 ucb1_choose(struct uct_policy *p, struct tree_node *node, struct board *b, enum stone color)
27 struct tree_node *nbest = NULL;
28 for (struct tree_node *ni = node->children; ni; ni = ni->sibling)
29 // we compare playouts and choose the best-explored
30 // child; comparing values is more brittle
31 if (!nbest || ni->playouts > nbest->playouts) {
32 /* Play pass only if we can afford scoring */
33 if (is_pass(ni->coord)) {
34 float score = board_official_score(b);
35 if (color == S_BLACK)
36 score = -score;
37 //fprintf(stderr, "%d score %f\n", color, score);
38 if (score <= 0)
39 continue;
41 nbest = ni;
43 return nbest;
47 static struct tree_node *
48 ucb1_descend(struct uct_policy *p, struct tree *tree, struct tree_node *node, int parity, bool allow_pass)
50 struct ucb1_policy *b = p->data;
51 float xpl = log(node->playouts) * b->explore_p;
53 struct tree_node *nbest = node->children;
54 float best_urgency = -9999;
55 for (struct tree_node *ni = node->children; ni; ni = ni->sibling) {
56 /* Do not consider passing early. */
57 if (likely(!allow_pass) && unlikely(is_pass(ni->coord)))
58 continue;
59 float xpl_loc = (ni->value - ni->value * ni->value);
60 if (parity < 0) xpl_loc = 1 - xpl_loc;
61 xpl_loc += sqrt(xpl / ni->playouts);
62 if (xpl_loc > 1.0/4) xpl_loc = 1.0/4;
63 float urgency = ni->value * parity + sqrt(xpl * xpl_loc / ni->playouts);
64 if (urgency > best_urgency) {
65 best_urgency = urgency;
66 nbest = ni;
69 return nbest;
73 struct uct_policy *
74 policy_ucb1tuned_init(struct uct *u, char *arg)
76 struct uct_policy *p = calloc(1, sizeof(*p));
77 struct ucb1_policy *b = calloc(1, sizeof(*b));
78 p->uct = u;
79 p->data = b;
80 p->descend = ucb1_descend;
81 p->choose = ucb1_choose;
83 b->explore_p = 0.2;
85 if (arg) {
86 char *optspec, *next = arg;
87 while (*next) {
88 optspec = next;
89 next += strcspn(next, "+");
90 if (*next) { *next++ = 0; } else { *next = 0; }
92 char *optname = optspec;
93 char *optval = strchr(optspec, '=');
94 if (optval) *optval++ = 0;
96 if (!strcasecmp(optname, "explore_p")) {
97 b->explore_p = atof(optval);
98 } else {
99 fprintf(stderr, "ucb1tuned: Invalid policy argument %s or missing value\n", optname);
104 return p;