clanek_go_congress_initial
[gostyle.git] / cross_validation_strength.py
blob258e4d58f837e4e418effcd8045c10f4e2988a50
1 #!/usr/bin/python
2 import sys
3 import subprocess
4 import os
5 from gostyle import *
6 from math import sqrt
7 import numpy
9 from data_about_players import Data
10 from cross_val import Shuffled, CrossValidation
def list_dir(raw):
    """Map every file found one level below the rank directories to its rank.

    ``raw`` is expected to contain one sub-directory per rank; each entry
    inside such a sub-directory becomes a key of the result.

    Args:
        raw: path of the directory holding the per-rank sub-directories.

    Returns:
        dict mapping ``raw + '/' + rank + '/' + entry`` to ``rank`` (the
        name of the sub-directory the entry was found in).

    Raises:
        OSError: if ``raw`` itself cannot be listed.
    """
    tot = {}
    for rank in os.listdir(raw):
        rank_dir = raw + '/' + rank
        # Stray plain files directly under ``raw`` are not ranks; listing
        # them would raise, so they are skipped.
        if not os.path.isdir(rank_dir):
            continue
        for play in os.listdir(rank_dir):
            tot[rank_dir + '/' + play] = rank
    return tot
def param_f_for_knn(x, slope=0.0426735, intercept=6.82347):
    """Evaluate the linear function ``slope * x + intercept`` and log it.

    The script passes the result as the ``dist_mult`` argument of
    KNNOutputVectorGenerator.

    Args:
        x: point at which the line is evaluated.
        slope: linear coefficient; the default is the value previously
            hard-coded in this function.
        intercept: constant term; the default is the value previously
            hard-coded in this function.

    Returns:
        The computed value ``slope * x + intercept``.
    """
    r = slope * x + intercept
    # Single-argument form: prints the same text as the old print statement
    # under Python 2 and is also valid syntax under Python 3.
    print("knn par : %s" % (r,))
    return r
27 if __name__ == '__main__':
28 num_features = 400
30 # Neural net
31 typ = 'nn'
32 #typ = 'knn'
33 # random
34 #typ = 'rnd'
35 #typ = 'joint_nn_knn'
37 player_vector = Data.strength_linear_vector
38 # players_ignore = [ "Yi Ch'ang-ho 2004-" ]#, "Fujisawa Hideyuki","Yuki Satoshi", "Otake Hideo", "Yi Ch'ang-ho 2005+","Takao Shinji","Hane Naoki","Kobayashi Koichi" ]
39 #players_ignore = [ ]#,"Takao Shinji","Hane Naoki","Kobayashi Koichi" ]
40 #players_all = [ p for p in player_vector.keys() if p not in players_ignore ]
42 root_dir = '../pdb-gtl/'
44 input_dir = root_dir + 'rawpat_files_merged_test'
45 train_dict = list_dir(input_dir)
47 main_pat_filename = root_dir + 'all.pat'
48 ### Object creating input vector when called
49 print "Creating input vector generator from main pat file:", main_pat_filename
50 i = InputVectorGenerator(main_pat_filename, num_features)
53 # Create list of input vectors
54 players_all = []
55 input_vectors = []
56 for f, rank in train_dict.items():
57 try:
58 input_vectors += [i(f)]
59 except:
60 continue
61 players_all += [rank]
62 assert rank in player_vector
64 if len(input_vectors) == 0:
65 print >>sys.stderr, "No vectors."
66 sys.exit()
68 ### PCA example usage
69 # Change this to False, if you do not want to use PCA
70 use_pca = True
71 if use_pca:
72 # Create PCA object, trained on input_vectors
73 print >>sys.stderr, "Running PCA."
74 pca = PCA(input_vectors, reduce=True)
75 # Perform a PCA on input vectors
76 input_vectors = pca.process_list_of_vectors(input_vectors)
77 # Creates a Composed object that first generates an input vector
78 # and then performs a PCA analysis on it.
79 i = Compose(i, pca)
81 def rand_vect(k):
82 return list(2.0*numpy.random.random(k)-1.0)
84 def revnorm(vec):
85 return [ (1-x) * 16.5 - 3.0 for x in vec ]
87 print >>sys.stderr, "Running Cross-validation."
89 for xxx in xrange(1):
90 errs=[ [] for _ in xrange(len(players_all)) ]
91 es=[]
93 number_runs = 1
94 for _ in xrange(number_runs):
95 num_play = len(players_all)
96 num_fold = num_play/10
97 #print len(players_all)/4, "-fold validation from:", len(players_all)
98 print num_fold, "-fold validation from population of:", num_play
99 cnt = 0
100 for reference_set, validation_set in Shuffled(CrossValidation)(range(num_play), num_fold):# len(players_all)/4):
101 cnt += 1
102 if cnt >= 50:
103 break
104 print len(reference_set), len(validation_set)
105 if typ == 'nn':
106 data =[]
107 for index in reference_set:
108 data.append( (input_vectors[index], player_vector[players_all[index]]) )
110 # print [x for y,x in data]
112 ### We can enlarge the data set by adding linear combinations of input and output vectors
113 use_lin_combinations = False
114 if use_lin_combinations:
115 data += Combinator().combine(data)
117 filename = 'nn_cross.data'+str(os.getpid())
118 print_set_to_file(data,filename)
120 nn = NeuralNet(filename, neurons=35, desired_error=sys.argv[1] if len(sys.argv) ==2 else 0.0005 )
121 # Create list of output vectors using weighted kNN algorithm approximating output_vector
122 output_vectors = [ nn(input_vectors[index]) for index in validation_set ]
123 nn.close()
124 elif typ == 'knn':
125 ### Object creating output vector when called;
126 ref_dict = {}
127 for index in reference_set:
128 ref_dict[tuple(input_vectors[index])] = player_vector[players_all[index]]
131 # best pro InputVectorGenerator rescale=Rescale
132 oknn = KNNOutputVectorGenerator(ref_dict, k=4, weight_param=0.9, dist_mult=param_f_for_knn(124))
133 #oknn = KNNOutputVectorGenerator(ref_dict, k=3, weight_param=0.99, dist_mult=1400)
135 # Create list of output vectors using weighted kNN algorithm approximating output_vector
136 output_vectors = [ oknn(input_vectors[index]) for index in validation_set ]
137 elif typ == 'rnd':
138 output_vectors = [ rand_vect(4) for index in validation_set ]
140 output_vectors = [ revnorm(ov) for ov in output_vectors ]
141 desired_vectors = [ revnorm(player_vector[players_all[index]]) for index in validation_set ]
143 if True:
144 for vec_set,text in [(output_vectors, "Output: "), (desired_vectors, "Desired:")]:
145 print text,
146 for o in vec_set:
147 for x in o:
148 print "%02.3f"%(x,),
149 print "; ",
150 print
152 for num1, (o,d) in zip(validation_set, zip(output_vectors, desired_vectors)):
153 err = 0.0
154 for x,y in zip(o,d):
155 e = (1.0*x-1.0*y)**2
156 es += [e]
157 err += e
158 errs[num1] += [err]
160 if typ == 'joint_nn_knn':
161 print "Joint classifier:"
162 elif typ == 'knn':
163 print "k-NN classifier:"
164 elif typ == 'nn':
165 print "Neural network classifier:"
166 elif typ == 'rnd':
167 print "Random classifier:"
169 #print "Total square err: %2.3f"%( sum(errs) / number_runs,)
170 # mar = numpy.array(errs)
171 # mean = mar.mean()
172 # print "Mean square err per player: " + "%2.3f ( = sd %2.3f) "%(mean, sqrt(mean))
174 mean = numpy.array(es).mean()
175 print "Mean square err: " + "%2.3f ( = sd %2.3f) "%(mean, sqrt(mean))
177 #mean = numpy.array(es).mean()
178 #print "%2.3f &"%(mean),
179 #print "%2.3f \\\\\\hline"%(11.776 / mean)
181 #print
182 #print "Players sorted by mean square error:"
183 #p = zip([numpy.array(errs[p]).mean() for p in xrange(len(players_all)) ], players_all)
184 #p.sort()
185 #for err, name in p:
186 # print "%2.3f %s"%(err,name)
187 # #print "%s"%(name,)
188 # sys.exit()