uct/policy/ucb1.c

   1 #include <assert.h>
   2 #include <math.h>
   3 #include <stdio.h>
   4 #include <stdlib.h>
   5 #include <string.h>
   6
   7 #include "board.h"
   8 #include "debug.h"
   9 #include "move.h"
  10 #include "uct/internal.h"
  11 #include "uct/tree.h"
  12
  13 /* This implements the basic UCB1 policy. */
  14
  15 struct ucb1_policy {
  16         /* This is what the Modification of UCT with Patterns in Monte Carlo Go
  17          * paper calls 'p'. Original UCB has this on 2, but this seems to
  18          * produce way too wide searches; reduce this to get deeper and
  19          * narrower readouts - try 0.2. */
  20         float explore_p;
  21         /* First Play Urgency - if set to less than infinity (the MoGo paper
  22          * above reports 1.0 as the best), new branches are explored only
  23          * if none of the existing ones has higher urgency than fpu. */
  24         float fpu;
  25         /* Equivalent experience for prior knowledge. MoGo paper recommends
  26          * 50 playouts per source. */
  27         int eqex;
  28 };
  29
  30
  31 struct tree_node *
  32 ucb1_choose(struct uct_policy *p, struct tree_node *node, struct board *b, enum stone color)
  33 {
  34         struct tree_node *nbest = NULL;
  35         for (struct tree_node *ni = node->children; ni; ni = ni->sibling)
  36                 // we compare playouts and choose the best-explored
  37                 // child; comparing values is more brittle
  38                 if (!nbest || ni->u.playouts > nbest->u.playouts) {
  39                         /* Play pass only if we can afford scoring */
  40                         if (is_pass(ni->coord)) {
  41                                 float score = board_official_score(b);
  42                                 if (color == S_BLACK)
  43                                         score = -score;
  44                                 //fprintf(stderr, "%d score %f\n", color, score);
  45                                 if (score <= 0)
  46                                         continue;
  47                         }
  48                         nbest = ni;
  49                 }
  50         return nbest;
  51 }
  52
  53
  54 struct tree_node *
  55 ucb1_descend(struct uct_policy *p, struct tree *tree, struct tree_node *node, int parity, bool allow_pass)
  56 {
  57         struct ucb1_policy *b = p->data;
  58         float xpl = log(node->u.playouts) * b->explore_p;
  59
  60         struct tree_node *nbest = node->children;
  61         float best_urgency = -9999;
  62         for (struct tree_node *ni = node->children; ni; ni = ni->sibling) {
  63                 /* Do not consider passing early. */
  64                 if (likely(!allow_pass) && unlikely(is_pass(ni->coord)))
  65                         continue;
  66                 ni->prior.value = (float)ni->prior.wins / ni->prior.playouts;
  67                 float priorp = (parity > 0 ? ni->prior.value : 1- ni->prior.value);
  68                 float urgency = ni->u.playouts ? (parity > 0 ? ni->u.value : 1 - ni->u.value) + sqrt(xpl / ni->u.playouts) : ni->prior.playouts ? priorp : b->fpu;
  69 #if 0
  70                 {
  71                         struct board b2; b2.size = 9;
  72                         fprintf(stderr, "[%s -> %s] UCB1 urgency %f (%f + %f : %f : %f)\n", coord2sstr(node->coord, &b2), coord2sstr(ni->coord, &b2), urgency, ni->u.value, sqrt(xpl / ni->u.playouts), priorp, b->fpu);
  73                 }
  74 #endif
  75                 if (urgency > best_urgency) {
  76                         best_urgency = urgency;
  77                         nbest = ni;
  78                 }
  79         }
  80         return nbest;
  81 }
  82
  83 void
  84 ucb1_prior(struct uct_policy *p, struct tree *tree, struct tree_node *node, struct board *b, enum stone color, int parity)
  85 {
  86         /* Initialization of UCT values based on prior knowledge */
  87         struct ucb1_policy *pp = p->data;
  88
  89 #if 0
  90         /* Q_{even} */
  91         /* This somehow does not work at all. */
  92         node->prior.playouts += p->eqex;
  93         node->prior.wins += p->eqex / 2;
  94 #endif
  95
  96         /* Q_{grandparent} */
  97         if (node->parent && node->parent->parent && node->parent->parent->parent) {
  98                 struct tree_node *gpp = node->parent->parent->parent;
  99                 for (struct tree_node *ni = gpp->children; ni; ni = ni->sibling) {
 100                         /* Be careful not to emphasize too random results. */
 101                         if (ni->coord == node->coord && ni->u.playouts > pp->eqex) {
 102                                 node->prior.playouts += pp->eqex;
 103                                 node->prior.wins += pp->eqex * ni->u.wins / ni->u.playouts;
 104                                 node->hints |= 1;
 105                         }
 106                 }
 107         }
 108
 109         /* Q_{playout-policy} */
 110         float assess = NAN;
 111         struct playout_policy *playout = p->uct->playout;
 112         if (playout->assess) {
 113                 struct move m = { node->coord, color };
 114                 assess = playout->assess(playout, b, &m);
 115         }
 116         if (!isnan(assess)) {
 117                 if (parity < 0)
 118                         assess = 1 - assess;
 119                 node->prior.playouts += pp->eqex;
 120                 node->prior.wins += pp->eqex * assess;
 121                 node->hints |= 2;
 122         }
 123
 124         if (node->prior.playouts) {
 125                 node->prior.value = (float) node->prior.wins / node->prior.playouts;
 126                 tree_update_node_value(node, true);
 127         }
 128
 129         //fprintf(stderr, "%s,%s prior: %d/%d = %f (%f)\n", coord2sstr(node->parent->coord, b), coord2sstr(node->coord, b), node->prior.wins, node->prior.playouts, node->prior.value, assess);
 130 }
 131
 132 void
 133 ucb1_update(struct uct_policy *p, struct tree_node *node, enum stone color, struct playout_amafmap *map, int result)
 134 {
 135         /* It is enough to iterate by a single chain; we will
 136          * update all the preceding positions properly since
 137          * they had to all occur in all branches, only in
 138          * different order. */
 139         for (; node; node = node->parent) {
 140                 node->u.playouts++;
 141                 node->u.wins += result;
 142                 tree_update_node_value(node, true);
 143         }
 144 }
 145
 146
 147 struct uct_policy *
 148 policy_ucb1_init(struct uct *u, char *arg)
 149 {
 150         struct uct_policy *p = calloc(1, sizeof(*p));
 151         struct ucb1_policy *b = calloc(1, sizeof(*b));
 152         p->uct = u;
 153         p->data = b;
 154         p->descend = ucb1_descend;
 155         p->choose = ucb1_choose;
 156         p->update = ucb1_update;
 157
 158         b->explore_p = 0.2;
 159         b->fpu = INFINITY;
 160
 161         if (arg) {
 162                 char *optspec, *next = arg;
 163                 while (*next) {
 164                         optspec = next;
 165                         next += strcspn(next, ":");
 166                         if (*next) { *next++ = 0; } else { *next = 0; }
 167
 168                         char *optname = optspec;
 169                         char *optval = strchr(optspec, '=');
 170                         if (optval) *optval++ = 0;
 171
 172                         if (!strcasecmp(optname, "explore_p") && optval) {
 173                                 b->explore_p = atof(optval);
 174                         } else if (!strcasecmp(optname, "prior")) {
 175                                 b->eqex = optval ? atoi(optval) : 50;
 176                                 if (b->eqex)
 177                                         p->prior = ucb1_prior;
 178                         } else if (!strcasecmp(optname, "fpu") && optval) {
 179                                 b->fpu = atof(optval);
 180                         } else {
 181                                 fprintf(stderr, "ucb1: Invalid policy argument %s or missing value\n", optname);
 182                         }
 183                 }
 184         }
 185
 186         return p;
 187 }