knn_strength.py

   1 #!/usr/bin/python
   2 import sys
   3 from gostyle import *
   4 from math import sqrt
   5 import numpy
   6
   7 from data_about_players import Data
   8
   9 class KNNOutputVectorGenerator(VectorGenerator):
  10         """ k-NearestNeighbour output vector generator."""
  11         def __init__(self, ref_dict, k=5, weight_param=0.8, dist_mult=10):
  12                 """
  13                         ref_dict is a dictionary of refence input/output vectors.
  14                         e.g. ref_dict= { (1.0,2.0):(9.0,16.0,21.0)
  15                 """
  16                 self.ref_dict = ref_dict
  17                 self.k = k
  18                 self.weigth_param = weight_param
  19                 self.dist_mult = dist_mult
  20         def __call__(self, player_vector):
  21                 distance=[]
  22                 for ref_vec in self.ref_dict.keys():
  23                         distance.append((self.distance(ref_vec, player_vector), ref_vec))
  24                 distance.sort()
  25
  26                 #for p,v in distance:
  27                 #       print "%2.3f"%(float(p),),
  28                 #print
  29                 ref_output_vecs = [ self.ref_dict[b] for a,b in distance[:self.k] ]
  30                 coefs = [ self.weight_fc(a) for a,b in distance[:self.k] ]
  31
  32                 return linear_combination(ref_output_vecs, coefs)
  33         def weight_fc(self, distance):
  34                 return self.weigth_param ** (distance)
  35         def distance(self, vec1, vec2):
  36                 if len(vec1) != len(vec2):
  37                         raise RuntimeError("Dimensions of vectors mismatch.")
  38                 ### the 10* multiplicative constant is empirically determined for correct scaling
  39                 return self.dist_mult * sqrt(sum([ (float(a) - float(b))**2 for a,b in zip(vec1,vec2)]))
  40
  41
  42 if __name__ == '__main__':
  43         root_dir = '../pdb-gtl/'
  44         main_pat_filename = root_dir + 'all.pat'
  45         player_vector = Data.strength_linear_vector
  46         num_features = 400
  47         k = 5
  48
  49         ### Object creating input vector when called
  50         print >>sys.stderr, "Creating input vector generator from main pat file:", main_pat_filename
  51         i = InputVectorGenerator(main_pat_filename, num_features)#, rescale=LogRescale)
  52
  53         #raw = root_dir + 'testpat_files'
  54         def list_dir(raw):
  55                 import os, random, shutil
  56                 ranks = os.listdir(raw)
  57                 tot={}
  58                 for rank in ranks:
  59                         plays = os.listdir(raw + '/'+ rank)
  60                         for play in plays:
  61                                 tot[raw + '/' + rank + '/' + play] = rank
  62                 return tot
  63
  64         train_set_dir = root_dir + 'rawpat_files_merged'
  65         test_set_dir = root_dir + 'rawpat_files_merged_test'
  66         train_dict = list_dir(train_set_dir)
  67         test_dict = list_dir(test_set_dir)
  68
  69         train_pl = []
  70         input_vectors_train = []
  71         for f, rank in train_dict.items():
  72                 try:
  73                         input_vectors_train += [i(f)]
  74                 except:
  75                         continue
  76                 train_pl += [rank]
  77
  78         input_vectors_test = []
  79         test_pl = []
  80         test_files = []
  81         for f, rank in test_dict.items():
  82                 try:
  83                         input_vectors_test += [i(f)]
  84                 except:
  85                         continue
  86
  87                 test_pl += [rank]
  88                 test_files += [f]
  89
  90
  91         if len(input_vectors_train) == 0:
  92                 print >>sys.stderr, "No reference vectors."
  93                 sys.exit()
  94         if len(input_vectors_test) == 0:
  95                 print >>sys.stderr, "No vectors to process."
  96                 sys.exit()
  97
  98         ### PCA example usage
  99         # Change this to False, if you do not want to use PCA
 100         use_pca = True
 101         if use_pca:
 102                 # Create PCA object, trained on input_vectors
 103                 print >>sys.stderr, "Running PCA."
 104                 pca = PCA(input_vectors_train + input_vectors_test, reduce=True)
 105                 # Perform a PCA on input vectors
 106                 input_vectors_train = pca.process_list_of_vectors(input_vectors_train)
 107                 input_vectors_test = pca.process_list_of_vectors(input_vectors_test)
 108                 # Creates a Composed object that first generates an input vector
 109                 # and then performs a PCA analysis on it.
 110                 i = Compose(i, pca)
 111
 112         ### Object creating output vector when called;
 113         ref_dict = {}
 114         for name, input_vector in zip(train_pl, input_vectors_train):
 115                 ref_dict[tuple(input_vector)] = player_vector[name]
 116
 117         #print ref_dict
 118         oknn = KNNOutputVectorGenerator(ref_dict, k=4, weight_param=0.9, dist_mult=10)
 119         #oknn = KNNOutputVectorGenerator(ref_dict, k=5, weight_param=0.2, dist_mult=10)
 120
 121         def rand_vect(k):
 122                 return list(numpy.random.random(k))
 123         # Create list of output vectors using weighted kNN algorithm approximating output_vector
 124         output_vectors= [ oknn(input_vector) for input_vector in input_vectors_test ]
 125         #output_vectors= [ rand_vect(1) for _ in input_vectors_test ]
 126         desired_vectors= [ player_vector[rank] for rank in test_pl ]
 127
 128         if True:
 129                 for f, out, des in zip(test_files, output_vectors, desired_vectors):
 130                         assert len(out) == 1
 131                         assert len(des) == 1
 132                         print f, "%2.3f ; %2.3f"%(out[0], des[0])
 133
 134                 print
 135                 diff = [ abs(x[0] - y[0]) for x,y in zip(output_vectors,desired_vectors) ]
 136                 zips = zip(diff, test_files)
 137                 zips.sort()
 138                 for diff,a in zips:
 139                         print a, " %2.3f"%(diff,)
 140
 141         errs =[]
 142         for o,d in zip(output_vectors, desired_vectors):
 143                 err = 0.0
 144                 for x,y in zip(o,d):
 145                         e = (1.0*x-1.0*y)**2
 146                         err += e
 147                 errs += [err]
 148
 149         mean = numpy.array(errs).mean()
 150         print "Mean square err:  " + "%2.3f ( = sd %2.3f)  "%(mean, sqrt(mean))