uct/policy/ucb1.c

   1 #include <assert.h>
   2 #include <math.h>
   3 #include <stdio.h>
   4 #include <stdlib.h>
   5 #include <string.h>
   6
   7 #include "board.h"
   8 #include "debug.h"
   9 #include "move.h"
  10 #include "random.h"
  11 #include "uct/internal.h"
  12 #include "uct/tree.h"
  13 #include "uct/policy/generic.h"
  14
  15 /* This implements the basic UCB1 policy. */
  16
  17 struct ucb1_policy {
  18         /* This is what the Modification of UCT with Patterns in Monte Carlo Go
  19          * paper calls 'p'. Original UCB has this on 2, but this seems to
  20          * produce way too wide searches; reduce this to get deeper and
  21          * narrower readouts - try 0.2. */
  22         floating_t explore_p;
  23         /* First Play Urgency - if set to less than infinity (the MoGo paper
  24          * above reports 1.0 as the best), new branches are explored only
  25          * if none of the existing ones has higher urgency than fpu. */
  26         floating_t fpu;
  27 };
  28
  29
  30 void
  31 ucb1_descend(struct uct_policy *p, struct tree *tree, struct uct_descent *descent, int parity, bool allow_pass)
  32 {
  33         /* We want to count in the prior stats here after all. Otherwise,
  34          * nodes with positive prior will get explored _LESS_ since the
  35          * urgency will be always higher; even with normal FPU because
  36          * of the explore coefficient. */
  37
  38         struct ucb1_policy *b = p->data;
  39         floating_t xpl = log(descent->node->u.playouts + descent->node->prior.playouts);
  40
  41         uctd_try_node_children(tree, descent, allow_pass, parity, p->uct->tenuki_d, di, urgency) {
  42                 struct tree_node *ni = di.node;
  43                 int uct_playouts = ni->u.playouts + ni->prior.playouts + ni->descents;
  44
  45                 /* XXX: We don't take local-tree information into account. */
  46
  47                 if (uct_playouts) {
  48                         urgency = (ni->u.playouts * tree_node_get_value(tree, parity, ni->u.value)
  49                                    + ni->prior.playouts * tree_node_get_value(tree, parity, ni->prior.value))
  50                                    + (parity > 0 ? 0 : ni->descents)
  51                                   / uct_playouts;
  52                         urgency += b->explore_p * sqrt(xpl / uct_playouts);
  53                 } else {
  54                         urgency = b->fpu;
  55                 }
  56         } uctd_set_best_child(di, urgency);
  57
  58         uctd_get_best_child(descent);
  59 }
  60
  61 void
  62 ucb1_update(struct uct_policy *p, struct tree *tree, struct tree_node *node, enum stone node_color, enum stone player_color, struct playout_amafmap *map, struct board *final_board, floating_t result)
  63 {
  64         /* It is enough to iterate by a single chain; we will
  65          * update all the preceding positions properly since
  66          * they had to all occur in all branches, only in
  67          * different order. */
  68         enum stone winner_color = result > 0.5 ? S_BLACK : S_WHITE;
  69
  70         for (; node; node = node->parent) {
  71                 stats_add_result(&node->u, result, 1);
  72
  73                 if (!is_pass(node_coord(node))) {
  74                         stats_add_result(&node->winner_owner, board_at(final_board, node_coord(node)) == winner_color ? 1.0 : 0.0, 1);
  75                         stats_add_result(&node->black_owner, board_at(final_board, node_coord(node)) == S_BLACK ? 1.0 : 0.0, 1);
  76                 }
  77         }
  78 }
  79
  80 void
  81 ucb1_done(struct uct_policy *p)
  82 {
  83         free(p->data);
  84         free(p);
  85 }
  86
  87 struct uct_policy *
  88 policy_ucb1_init(struct uct *u, char *arg)
  89 {
  90         struct uct_policy *p = calloc2(1, sizeof(*p));
  91         struct ucb1_policy *b = calloc2(1, sizeof(*b));
  92         p->uct = u;
  93         p->data = b;
  94         p->done = ucb1_done;
  95         p->descend = ucb1_descend;
  96         p->choose = uctp_generic_choose;
  97         p->update = ucb1_update;
  98
  99         b->explore_p = 0.2;
 100         b->fpu = 1.1; //INFINITY;
 101
 102         if (arg) {
 103                 char *optspec, *next = arg;
 104                 while (*next) {
 105                         optspec = next;
 106                         next += strcspn(next, ":");
 107                         if (*next) { *next++ = 0; } else { *next = 0; }
 108
 109                         char *optname = optspec;
 110                         char *optval = strchr(optspec, '=');
 111                         if (optval) *optval++ = 0;
 112
 113                         if (!strcasecmp(optname, "explore_p") && optval) {
 114                                 b->explore_p = atof(optval);
 115                         } else if (!strcasecmp(optname, "fpu") && optval) {
 116                                 b->fpu = atof(optval);
 117                         } else {
 118                                 fprintf(stderr, "ucb1: Invalid policy argument %s or missing value\n",
 119                                         optname);
 120                                 exit(1);
 121                         }
 122                 }
 123         }
 124
 125         return p;
 126 }