uct/policy/ucb1tuned.c

   1 #include <assert.h>
   2 #include <math.h>
   3 #include <stdio.h>
   4 #include <stdlib.h>
   5 #include <string.h>
   6
   7 #include "board.h"
   8 #include "debug.h"
   9 #include "move.h"
  10 #include "uct/internal.h"
  11 #include "uct/tree.h"
  12
  13 /* This implements the UCB1-TUNED policy. */
  14
  15 struct ucb1_policy {
  16         /* This is what the Modification of UCT with Patterns in Monte Carlo Go
  17          * paper calls 'p'. Original UCB has this on 2, but this seems to
  18          * produce way too wide searches; reduce this to get deeper and
  19          * narrower readouts - try 0.2. */
  20         float explore_p;
  21 };
  22
  23
  24 static struct tree_node *
  25 ucb1_choose(struct uct_policy *p, struct tree_node *node, struct board *b, enum stone color)
  26 {
  27         struct tree_node *nbest = NULL;
  28         for (struct tree_node *ni = node->children; ni; ni = ni->sibling)
  29                 // we compare playouts and choose the best-explored
  30                 // child; comparing values is more brittle
  31                 if (!nbest || ni->playouts > nbest->playouts) {
  32                         /* Play pass only if we can afford scoring */
  33                         if (is_pass(ni->coord)) {
  34                                 float score = board_official_score(b);
  35                                 if (color == S_BLACK)
  36                                         score = -score;
  37                                 //fprintf(stderr, "%d score %f\n", color, score);
  38                                 if (score <= 0)
  39                                         continue;
  40                         }
  41                         nbest = ni;
  42                 }
  43         return nbest;
  44 }
  45
  46
  47 static struct tree_node *
  48 ucb1_descend(struct uct_policy *p, struct tree *tree, struct tree_node *node, int parity, bool allow_pass)
  49 {
  50         struct ucb1_policy *b = p->data;
  51         float xpl = log(node->playouts) * b->explore_p;
  52
  53         struct tree_node *nbest = node->children;
  54         float best_urgency = -9999;
  55         for (struct tree_node *ni = node->children; ni; ni = ni->sibling) {
  56                 /* Do not consider passing early. */
  57                 if (likely(!allow_pass) && unlikely(is_pass(ni->coord)))
  58                         continue;
  59                 float xpl_loc = (ni->value - ni->value * ni->value);
  60                 if (parity < 0) xpl_loc = 1 - xpl_loc;
  61                 xpl_loc += sqrt(xpl / ni->playouts);
  62                 if (xpl_loc > 1.0/4) xpl_loc = 1.0/4;
  63                 float urgency = ni->value * parity + sqrt(xpl * xpl_loc / ni->playouts);
  64                 if (urgency > best_urgency) {
  65                         best_urgency = urgency;
  66                         nbest = ni;
  67                 }
  68         }
  69         return nbest;
  70 }
  71
  72
  73 struct uct_policy *
  74 policy_ucb1tuned_init(struct uct *u, char *arg)
  75 {
  76         struct uct_policy *p = calloc(1, sizeof(*p));
  77         struct ucb1_policy *b = calloc(1, sizeof(*b));
  78         p->uct = u;
  79         p->data = b;
  80         p->descend = ucb1_descend;
  81         p->choose = ucb1_choose;
  82
  83         b->explore_p = 0.2;
  84
  85         if (arg) {
  86                 char *optspec, *next = arg;
  87                 while (*next) {
  88                         optspec = next;
  89                         next += strcspn(next, "+");
  90                         if (*next) { *next++ = 0; } else { *next = 0; }
  91
  92                         char *optname = optspec;
  93                         char *optval = strchr(optspec, '=');
  94                         if (optval) *optval++ = 0;
  95
  96                         if (!strcasecmp(optname, "explore_p")) {
  97                                 b->explore_p = atof(optval);
  98                         } else {
  99                                 fprintf(stderr, "ucb1tuned: Invalid policy argument %s or missing value\n", optname);
 100                         }
 101                 }
 102         }
 103
 104         return p;
 105 }