gnet: bugfix
[gostyle.git] / knn_strength.py
blob504e71523b441190ed56c09876f9582466d3b37f
1 #!/usr/bin/python
2 import sys
3 from gostyle import *
4 from math import sqrt
5 from itertools import izip
6 import numpy
8 from data_about_players import Data
class KNNOutputVectorGenerator(VectorGenerator):
    """Weighted k-nearest-neighbour output vector generator.

    Calling an instance with an input vector looks up the `k` reference
    input vectors closest to it and combines their associated output
    vectors via `linear_combination`, using coefficients that decay
    with distance (see `weight_fc`).
    """

    def __init__(self, ref_dict, k=5, weight_param=0.8, dist_mult=10):
        """
        ref_dict is a dictionary mapping reference input vectors to their
            output vectors, e.g. ref_dict = {(1.0, 2.0): (9.0, 16.0, 21.0)}
        k is the number of nearest reference vectors used per query.
        weight_param is the base of the exponential weight, see `weight_fc`.
        dist_mult is the factor every distance is multiplied by, see `distance`.
        """
        self.dist_mult = dist_mult
        # NOTE: the attribute name is misspelled ("weigth"); it is kept as is
        # because it is instance state that other code may read.
        self.weigth_param = weight_param
        self.k = k
        self.ref_dict = ref_dict

    def __call__(self, player_vector):
        """Return the combined output vector for the input `player_vector`.

        Raises RuntimeError (from `distance`) if a reference vector and
        `player_vector` differ in length.
        """
        # (distance, reference vector) pairs, closest first; equal distances
        # fall back to comparing the reference vectors themselves.
        ranked = sorted(
            (self.distance(candidate, player_vector), candidate)
            for candidate in self.ref_dict
        )
        nearest = ranked[:self.k]

        neighbour_outputs = [self.ref_dict[candidate] for _, candidate in nearest]
        neighbour_weights = [self.weight_fc(dist) for dist, _ in nearest]

        # linear_combination comes from gostyle; presumably it sums the
        # vectors scaled by the coefficients -- confirm against its definition.
        return linear_combination(neighbour_outputs, neighbour_weights)

    def weight_fc(self, distance):
        """Return the coefficient for a neighbour: weight_param ** distance."""
        return self.weigth_param ** distance

    def distance(self, vec1, vec2):
        """Return the Euclidean distance of vec1 and vec2 times `dist_mult`.

        Raises RuntimeError if the vectors have different lengths.
        """
        if len(vec1) != len(vec2):
            raise RuntimeError("Dimensions of vectors mismatch.")

        squared_sum = sum((a - b) ** 2 for a, b in zip(vec1, vec2))
        # The multiplicative constant (10 by default) was determined
        # empirically to give correct scaling.
        return self.dist_mult * numpy.sqrt(squared_sum)
if __name__ == '__main__':
    # Evaluation script: builds a weighted kNN estimator from the files in the
    # training directory, applies it to the files in the test directory and
    # prints per-file predictions, absolute differences and the mean square
    # error.
    root_dir = '../pdb-gtl/'
    main_pat_filename = root_dir + 'all.pat'
    # Mapping from rank name (sub-directory name) to the desired output vector.
    player_vector = Data.strength_linear_vector
    num_features = 400

    ### Object creating input vector when called
    sys.stderr.write("Creating input vector generator from main pat file: %s\n"
                     % (main_pat_filename,))
    input_gen = InputVectorGenerator(main_pat_filename, num_features)  # , rescale=LogRescale)

    def list_dir(raw):
        """Map each file path '<raw>/<rank>/<file>' to its '<rank>' directory name.

        `raw` must contain only directories; every entry of every such
        directory becomes one key of the returned dictionary.
        """
        import os
        tot = {}
        for rank in os.listdir(raw):
            for play in os.listdir(raw + '/' + rank):
                tot[raw + '/' + rank + '/' + play] = rank
        return tot

    def load_vectors(file_to_rank):
        """Build input vectors for all files in `file_to_rank`.

        Returns three parallel lists (vectors, ranks, files).  A file whose
        vector cannot be built is reported on stderr and left out of all
        three lists, so the lists always stay aligned.
        """
        vectors, ranks, files = [], [], []
        for path, rank in file_to_rank.items():
            try:
                vector = input_gen(path)
            except Exception as exc:
                # Best effort: skip unusable files, but say so.  Catching
                # Exception (not everything) keeps Ctrl-C and sys.exit working.
                sys.stderr.write("Skipping %s: %s\n" % (path, exc))
                continue
            vectors.append(vector)
            ranks.append(rank)
            files.append(path)
        return vectors, ranks, files

    train_set_dir = root_dir + 'train_set'
    test_set_dir = root_dir + 'rawpat_files_merged_test'
    train_dict = list_dir(train_set_dir)
    test_dict = list_dir(test_set_dir)

    input_vectors_train, train_pl, _ = load_vectors(train_dict)
    input_vectors_test, test_pl, test_files = load_vectors(test_dict)

    # An empty training set is deliberately not treated as fatal here:
    #if len(input_vectors_train) == 0:
    #    print >>sys.stderr, "No reference vectors."
    #    sys.exit()
    if not input_vectors_test:
        sys.stderr.write("No vectors to process.\n")
        sys.exit()

    ### PCA example usage
    # Change this to False, if you do not want to use PCA
    use_pca = True
    if use_pca:
        # Create PCA object; note it is built from the train AND test vectors.
        sys.stderr.write("Running PCA.\n")
        pca = PCA(input_vectors_train + input_vectors_test, reduce=True)
        # Perform a PCA on input vectors
        if input_vectors_train:
            input_vectors_train = pca.process_list_of_vectors(input_vectors_train)
        if input_vectors_test:
            input_vectors_test = pca.process_list_of_vectors(input_vectors_test)
        # Creates a Composed object that first generates an input vector
        # and then performs a PCA analysis on it.
        input_gen = Compose(input_gen, pca)

    ### Object creating output vector when called;
    # Reference dictionary: processed training vector -> desired output vector
    # of the rank the training file belongs to.
    ref_dict = {}
    for name, input_vector in zip(train_pl, input_vectors_train):
        ref_dict[tuple(input_vector)] = player_vector[name]

    print("creating the knn")
    oknn = KNNOutputVectorGenerator(ref_dict, k=4, weight_param=0.9, dist_mult=26.4)
    # Alternative parameters:
    #oknn = KNNOutputVectorGenerator(ref_dict, k=5, weight_param=0.2, dist_mult=10)

    def revnorm(vec):
        """Apply x -> (1 - x) * 16.5 - 3.0 to every component of `vec`.

        Presumably undoes the normalization of the strength vectors in
        `Data` -- confirm against data_about_players.
        """
        return [(1 - x) * 16.5 - 3.0 for x in vec]

    def rand_vect(k):
        """Return a list of `k` random floats drawn uniformly from [-1.0, 1.0)."""
        return list(2.0 * numpy.random.random(k) - 1.0)

    print("running")
    # Create list of output vectors using weighted kNN algorithm approximating output_vector
    output_vectors = [revnorm(oknn(input_vector)) for input_vector in input_vectors_test]
    # Random baseline for comparison:
    #output_vectors = [revnorm(rand_vect(1)) for _ in input_vectors_test]
    desired_vectors = [revnorm(player_vector[rank]) for rank in test_pl]

    # Per-file report: predicted value ; desired value.
    for path, out, des in zip(test_files, output_vectors, desired_vectors):
        assert len(out) == 1
        assert len(des) == 1
        print("%s %2.3f ; %2.3f" % (path, out[0], des[0]))

    print("")
    # Files ordered by absolute prediction error, smallest first.
    abs_diffs = [abs(out[0] - des[0])
                 for out, des in zip(output_vectors, desired_vectors)]
    for abs_diff, path in sorted(zip(abs_diffs, test_files)):
        print("%s  %2.3f" % (path, abs_diff))

    # Squared error of every test file, summed over the vector components.
    errs = [sum(((1.0 * x - 1.0 * y) ** 2 for x, y in zip(out, des)), 0.0)
            for out, des in zip(output_vectors, desired_vectors)]

    mean = numpy.array(errs).mean()
    print("Mean square err: " + "%2.3f ( = sd %2.3f) " % (mean, sqrt(mean)))