knn_strength.py

   1 #!/usr/bin/python
   2 import sys
   3 from gostyle import *
   4 from math import sqrt
   5 from itertools import izip
   6 import numpy
   7
   8 from data_about_players import Data
   9
  10 class KNNOutputVectorGenerator(VectorGenerator):
  11         """ k-NearestNeighbour output vector generator."""
  12         def __init__(self, ref_dict, k=5, weight_param=0.8, dist_mult=10):
  13                 """
  14                         ref_dict is a dictionary of refence input/output vectors.
  15                         e.g. ref_dict= { (1.0,2.0):(9.0,16.0,21.0)
  16                 """
  17                 self.ref_dict = ref_dict
  18                 self.k = k
  19                 self.weigth_param = weight_param
  20                 self.dist_mult = dist_mult
  21         def __call__(self, player_vector):
  22                 distance=[]
  23                 for ref_vec in self.ref_dict.keys():
  24                         distance.append((self.distance(ref_vec, player_vector), ref_vec))
  25                 distance.sort()
  26
  27                 #for p,v in distance:
  28                 #       print "%2.3f"%(float(p),),
  29                 #print
  30                 ref_output_vecs = [ self.ref_dict[b] for a,b in distance[:self.k] ]
  31                 coefs = [ self.weight_fc(a) for a,b in distance[:self.k] ]
  32
  33                 return linear_combination(ref_output_vecs, coefs)
  34         def weight_fc(self, distance):
  35                 return self.weigth_param ** (distance)
  36         def distance(self, vec1, vec2):
  37                 if len(vec1) != len(vec2):
  38                         raise RuntimeError("Dimensions of vectors mismatch.")
  39                 ### the 10* multiplicative constant is empirically determined for correct scaling
  40                 return self.dist_mult * numpy.sqrt(sum( (a - b)**2 for a,b in izip(vec1,vec2)))
  41
  42
  43 if __name__ == '__main__':
  44         root_dir = '../pdb-gtl/'
  45         main_pat_filename = root_dir + 'all.pat'
  46         player_vector = Data.strength_linear_vector
  47         num_features = 400
  48
  49         ### Object creating input vector when called
  50         print >>sys.stderr, "Creating input vector generator from main pat file:", main_pat_filename
  51         i = InputVectorGenerator(main_pat_filename, num_features)#, rescale=LogRescale)
  52
  53         #raw = root_dir + 'testpat_files'
  54         def list_dir(raw):
  55                 import os, random, shutil
  56                 ranks = os.listdir(raw)
  57                 tot={}
  58                 for rank in ranks:
  59                         plays = os.listdir(raw + '/'+ rank)
  60                         for play in plays:
  61                                 tot[raw + '/' + rank + '/' + play] = rank
  62                 return tot
  63
  64         train_set_dir = root_dir + 'train_set'
  65         test_set_dir = root_dir + 'rawpat_files_merged_test'
  66         train_dict = list_dir(train_set_dir)
  67         test_dict = list_dir(test_set_dir)
  68
  69         train_pl = []
  70         input_vectors_train = []
  71         for f, rank in train_dict.items():
  72                 try:
  73                         input_vectors_train += [i(f)]
  74                 except:
  75                         continue
  76                 train_pl += [rank]
  77
  78         input_vectors_test = []
  79         test_pl = []
  80         test_files = []
  81         for f, rank in test_dict.items():
  82                 try:
  83                         input_vectors_test += [i(f)]
  84                 except:
  85                         continue
  86
  87                 test_pl += [rank]
  88                 test_files += [f]
  89
  90
  91         #if len(input_vectors_train) == 0:
  92         #       print >>sys.stderr, "No reference vectors."
  93         #       sys.exit()
  94         if len(input_vectors_test) == 0:
  95                 print >>sys.stderr, "No vectors to process."
  96                 sys.exit()
  97
  98         ### PCA example usage
  99         # Change this to False, if you do not want to use PCA
 100         use_pca = True
 101         if use_pca:
 102                 # Create PCA object, trained on input_vectors
 103                 print >>sys.stderr, "Running PCA."
 104                 pca = PCA(input_vectors_train + input_vectors_test, reduce=True)
 105                 # Perform a PCA on input vectors
 106                 if input_vectors_train:
 107                         input_vectors_train = pca.process_list_of_vectors(input_vectors_train)
 108                 if input_vectors_test:
 109                         input_vectors_test = pca.process_list_of_vectors(input_vectors_test)
 110                 # Creates a Composed object that first generates an input vector
 111                 # and then performs a PCA analysis on it.
 112                 i = Compose(i, pca)
 113
 114         ### Object creating output vector when called;
 115         ref_dict = {}
 116         for name, input_vector in zip(train_pl, input_vectors_train):
 117                 ref_dict[tuple(input_vector)] = player_vector[name]
 118
 119         print "creating the knn"
 120         #print ref_dict
 121         oknn = KNNOutputVectorGenerator(ref_dict, k=4, weight_param=0.9, dist_mult=26.4)
 122
 123         #oknn = KNNOutputVectorGenerator(ref_dict, k=5, weight_param=0.2, dist_mult=10)
 124
 125         def revnorm(vec):
 126                 return [ (1-x) * 16.5 - 3.0 for x in vec ]
 127
 128         def rand_vect(k):
 129                 return list(2.0*numpy.random.random(k)-1.0)
 130
 131         print "running"
 132         # Create list of output vectors using weighted kNN algorithm approximating output_vector
 133         output_vectors= [ revnorm(oknn(input_vector)) for input_vector in input_vectors_test ]
 134         #output_vectors= [ revnorm(rand_vect(1)) for _ in input_vectors_test ]
 135         desired_vectors= [ revnorm(player_vector[rank]) for rank in test_pl ]
 136
 137         if True:
 138                 for f, out, des in zip(test_files, output_vectors, desired_vectors):
 139                         assert len(out) == 1
 140                         assert len(des) == 1
 141                         print f, "%2.3f ; %2.3f"%(out[0], des[0])
 142
 143                 print
 144                 diff = [ abs(x[0] - y[0]) for x,y in zip(output_vectors,desired_vectors) ]
 145                 zips = zip(diff, test_files)
 146                 zips.sort()
 147                 for diff,a in zips:
 148                         print a, " %2.3f"%(diff,)
 149
 150         errs =[]
 151         for o,d in zip(output_vectors, desired_vectors):
 152                 err = 0.0
 153                 for x,y in zip(o,d):
 154                         e = (1.0*x-1.0*y)**2
 155                         err += e
 156                 errs += [err]
 157
 158         mean = numpy.array(errs).mean()
 159         print "Mean square err:  " + "%2.3f ( = sd %2.3f)  "%(mean, sqrt(mean))