uct/policy/ucb1.c

   1 #include <assert.h>
   2 #include <math.h>
   3 #include <stdio.h>
   4 #include <stdlib.h>
   5 #include <string.h>
   6
   7 #include "board.h"
   8 #include "debug.h"
   9 #include "move.h"
  10 #include "random.h"
  11 #include "uct/internal.h"
  12 #include "uct/tree.h"
  13
  14 /* This implements the basic UCB1 policy. */
  15
  16 struct ucb1_policy {
  17         /* This is what the Modification of UCT with Patterns in Monte Carlo Go
  18          * paper calls 'p'. Original UCB has this on 2, but this seems to
  19          * produce way too wide searches; reduce this to get deeper and
  20          * narrower readouts - try 0.2. */
  21         float explore_p;
  22         /* First Play Urgency - if set to less than infinity (the MoGo paper
  23          * above reports 1.0 as the best), new branches are explored only
  24          * if none of the existing ones has higher urgency than fpu. */
  25         float fpu;
  26         int urg_randoma, urg_randomm;
  27 };
  28
  29
  30 struct tree_node *
  31 ucb1_choose(struct uct_policy *p, struct tree_node *node, struct board *b, enum stone color)
  32 {
  33         struct tree_node *nbest = NULL;
  34         for (struct tree_node *ni = node->children; ni; ni = ni->sibling)
  35                 // we compare playouts and choose the best-explored
  36                 // child; comparing values is more brittle
  37                 if (!nbest || ni->u.playouts > nbest->u.playouts) {
  38                         /* Play pass only if we can afford scoring */
  39                         if (is_pass(ni->coord)) {
  40                                 float score = board_official_score(b);
  41                                 if (color == S_BLACK)
  42                                         score = -score;
  43                                 //fprintf(stderr, "%d score %f\n", color, score);
  44                                 if (score <= 0)
  45                                         continue;
  46                         }
  47                         nbest = ni;
  48                 }
  49         return nbest;
  50 }
  51
  52
  53 struct tree_node *
  54 ucb1_descend(struct uct_policy *p, struct tree *tree, struct tree_node *node, int parity, bool allow_pass)
  55 {
  56         /* We want to count in the prior stats here after all. Otherwise,
  57          * nodes with positive prior will get explored _LESS_ since the
  58          * urgency will be always higher; even with normal FPU because
  59          * of the explore coefficient. */
  60
  61         struct ucb1_policy *b = p->data;
  62         float xpl = log(node->u.playouts + node->prior.playouts) * b->explore_p;
  63
  64         // XXX: Stack overflow danger on big boards?
  65         struct tree_node *nbest[512] = { node->children }; int nbests = 1;
  66         float best_urgency = -9999;
  67
  68         for (struct tree_node *ni = node->children; ni; ni = ni->sibling) {
  69                 /* Do not consider passing early. */
  70                 if (likely(!allow_pass) && unlikely(is_pass(ni->coord)))
  71                         continue;
  72                 int uct_playouts = ni->u.playouts + ni->prior.playouts;
  73                 ni->prior.value = (float)ni->prior.wins / ni->prior.playouts;
  74
  75                 /* XXX: We later compare urgency with best_urgency; this can
  76                  * be difficult given that urgency can be in register with
  77                  * higher precision than best_urgency, thus even though
  78                  * the numbers are in fact the same, urgency will be
  79                  * slightly higher (or lower). Thus, we declare urgency
  80                  * as volatile, attempting to force the compiler to keep
  81                  * everything as a float. Ideally, we should do some random
  82                  * __FLT_EPSILON__ magic instead. */
  83                 volatile float urgency = uct_playouts ? tree_node_get_value(tree, ni, u, parity) + sqrt(xpl / uct_playouts) : b->fpu;
  84
  85 #if 0
  86                 {
  87                         struct board b2; b2.size = 9+2;
  88                         fprintf(stderr, "[%s -> %s] UCB1 urgency %f (%f + %f : %f)\n", coord2sstr(node->coord, &b2), coord2sstr(ni->coord, &b2), urgency, ni->u.value, sqrt(xpl / ni->u.playouts), b->fpu);
  89                 }
  90 #endif
  91                 if (b->urg_randoma)
  92                         urgency += (float)(fast_random(b->urg_randoma) - b->urg_randoma / 2) / 1000;
  93                 if (b->urg_randomm)
  94                         urgency *= (float)(fast_random(b->urg_randomm) + 5) / b->urg_randomm;
  95                 if (urgency > best_urgency) {
  96                         best_urgency = urgency; nbests = 0;
  97                 }
  98                 if (urgency >= best_urgency) {
  99                         /* We want to always choose something else than a pass
 100                          * in case of a tie. pass causes degenerative behaviour. */
 101                         if (nbests == 1 && is_pass(nbest[0]->coord)) {
 102                                 nbests--;
 103                         }
 104                         nbest[nbests++] = ni;
 105                 }
 106         }
 107         return nbest[fast_random(nbests)];
 108 }
 109
 110 void
 111 ucb1_update(struct uct_policy *p, struct tree *tree, struct tree_node *node, enum stone node_color, enum stone player_color, struct playout_amafmap *map, int result)
 112 {
 113         /* It is enough to iterate by a single chain; we will
 114          * update all the preceding positions properly since
 115          * they had to all occur in all branches, only in
 116          * different order. */
 117         for (; node; node = node->parent) {
 118                 node->u.playouts++;
 119                 node->u.wins += result;
 120                 tree_update_node_value(node, false);
 121         }
 122 }
 123
 124
 125 struct uct_policy *
 126 policy_ucb1_init(struct uct *u, char *arg)
 127 {
 128         struct uct_policy *p = calloc(1, sizeof(*p));
 129         struct ucb1_policy *b = calloc(1, sizeof(*b));
 130         p->uct = u;
 131         p->data = b;
 132         p->descend = ucb1_descend;
 133         p->choose = ucb1_choose;
 134         p->update = ucb1_update;
 135
 136         b->explore_p = 0.2;
 137         b->fpu = 1.1; //INFINITY;
 138
 139         if (arg) {
 140                 char *optspec, *next = arg;
 141                 while (*next) {
 142                         optspec = next;
 143                         next += strcspn(next, ":");
 144                         if (*next) { *next++ = 0; } else { *next = 0; }
 145
 146                         char *optname = optspec;
 147                         char *optval = strchr(optspec, '=');
 148                         if (optval) *optval++ = 0;
 149
 150                         if (!strcasecmp(optname, "explore_p") && optval) {
 151                                 b->explore_p = atof(optval);
 152                         } else if (!strcasecmp(optname, "fpu") && optval) {
 153                                 b->fpu = atof(optval);
 154                         } else if (!strcasecmp(optname, "urg_randoma") && optval) {
 155                                 b->urg_randoma = atoi(optval);
 156                         } else if (!strcasecmp(optname, "urg_randomm") && optval) {
 157                                 b->urg_randomm = atoi(optval);
 158                         } else {
 159                                 fprintf(stderr, "ucb1: Invalid policy argument %s or missing value\n", optname);
 160                         }
 161                 }
 162         }
 163
 164         return p;
 165 }