Merge branch 'master' of ssh://repo.or.cz/srv/git/gostyle
[gostyle.git] / pca.py
blob76fc58faa3a014577b5c4ce1347da97eb7bd02e4
1 #!/usr/bin/python
2 """
3 This code creates input vectors and performs PCA on them. Each PCA-transformed vector is then
4 printed along with the player name, suitable e.g. for plotting with gnuplot.
5 """
6 # not true currently
7 """
8 OUTPUT FORMAT
9 player_name first_principal_component_of_player's_input_vector second_principal_component ...
10 second_player_name ...
11 ...
12 """
13 import sys
14 from gostyle import * #print_vector, OccurenceVectorGenerator, Rescale, PlayerStrategyIdentificator, PCA, InputVectorGenerator
15 from itertools import izip, count
16 from data_about_players import Data
17 from math import sqrt
19 if __name__ == '__main__':
20 main_pat_filename = Data.main_pat_filename
21 num_features = 500
22 pickle_filename = 'input_gen.pickle'
23 filename_raw = 'raw.data'
24 filename_pca = 'pca.data'
25 filename_proj = 'pca.dimdata'
26 players_ignore = ['Go Seigen', 'Ishida Yoshio', 'Yamashita Keigo']# 'Cho Tae-hyeon', 'Shao Zhenzhong', 'Wu Songsheng', 'Honinbo Shusaku', 'Kuwahara Shusaku', 'Yasuda Shusaku', 'Go Seigen', 'Suzuki Goro', 'Jie Li' ] #, 'Cho Chikun', 'Takemiya Masaki']
27 players_all = Data.players_all
28 #players_all = Data.strength_all
29 players = [ p for p in players_all if p not in players_ignore ]
30 #players = Data.player_vector.keys()
32 ### Objects creating input and output vectors when called
33 print >>sys.stderr, "Creating input vector generator from main pat file:", main_pat_filename
34 ivg = InputVectorGenerator(main_pat_filename, num_features)
36 # Create pairs of (input vector, player name)
37 input_vectors = []
38 for name in players:
39 #input_vectors += [ivg( Data.pat_files_folder + name)]
40 input_vectors += [[float(occ) for occ in ivg(Data.pat_files_folder + name)]]
42 if len(input_vectors) == 0:
43 print >>sys.stderr, "No input vectors.", main_pat_filename
44 sys.exit()
46 # Create PCA object, trained on input_vectors
47 pca = PCA(input_vectors, output_dim=10)
48 #pca = PCA(input_vectors, reduce=True)
50 # Dump the input generator to file
51 print "Saving the pca input generator object to file:", pickle_filename
52 dump_object_to_file(Compose(ivg, pca), pickle_filename)
54 print >> sys.stderr, "Writing raw vectors to file: ", filename_raw
55 f=open(filename_raw, 'w')
56 for name,vector in izip(players, input_vectors):
57 print >> f, '"'+name+'"', vector
58 f.close()
60 #print >> sys.stderr, "Eigenvalues (top 5): ",
61 #for val in pca.get_eigenvalues()[:5]:
62 # print >> sys.stderr, "%2.5f"%(val,),
63 #print >> sys.stderr
65 # Perform a PCA on input vectors
66 input_vectors = pca.process_list_of_vectors(input_vectors)
68 ### Now we rescale vectors, so that each component fits on -1.0 to 1.0
69 ### this makes a very nice plot!!
70 r = Rescale(-1.0,1.0)
72 ### Normalize each component separately
73 # We need to transpose input_vectors - a list of per-player-vector-of-pca-component
74 # to get list of vectors of per-component-vector-of-player-data
75 def transpose(list_of_vectors):
76 return zip(*list_of_vectors)
77 input_vectors = transpose([ r(vector) for vector in transpose(input_vectors)])
80 print >> sys.stderr, "Writing output to file: ", filename_pca
81 # prints vectors along with player names
82 f=open(filename_pca, 'w')
83 for name,vector in izip(players, input_vectors):
84 # Substitute ' ' by '_' to allow for gnuplot plotting (recognizing columns correctly)
85 name_to_print = '_'.join(name.split())
86 for p, i in izip(vector, count()):
87 print >> f, name_to_print, i+1, p
88 f.close()
90 print >> sys.stderr, "Writing projection info to file: ", filename_proj
91 f = open(filename_proj, 'w')
92 P = pca.get_projection_info()
93 for y in xrange(1, P.shape[0]):
94 for x in xrange(1, P.shape[1]):
95 print >> f, y, x, P[y,x], ivg.ovg.stringof(x)
96 f.close()
98 print >> sys.stderr, "\nNow print that by:"
99 print >> sys.stderr, 'gnuplot> set xrange[1:%d]'%(pca.pca.output_dim+1)
100 print >> sys.stderr, 'plot "%s" using 2:3:1 with labels font "arial,10" left point pt 4 offset 1,0'%(filename_pca,)
def pearson_coef(vec1, vec2):
    """Return the Pearson correlation coefficient of two vectors.

    Both vectors are centered on their arithmetic mean; the result is the
    dot product of the centered vectors divided by the product of their
    Euclidean norms.

    vec1, vec2 -- non-empty sequences of numbers of the same length.

    Returns a float.  When the product of the norms is exactly zero (at
    least one vector is constant, so the coefficient is undefined), NaN is
    returned instead of raising ZeroDivisionError.

    Raises AssertionError if the vectors are empty or differ in length.
    """
    assert len(vec1), "vectors must not be empty"
    assert len(vec1) == len(vec2), "vectors must have the same length"

    def center(vec):
        # Subtract the mean; float() keeps integer input from truncating.
        mean = sum(vec) / float(len(vec))
        return [x - mean for x in vec]

    def norm(vec):
        return sqrt(sum([x * x for x in vec]))

    vec1, vec2 = center(vec1), center(vec2)
    denominator = norm(vec1) * norm(vec2)
    if denominator == 0:
        # Zero variance in one of the inputs: correlation is undefined.
        return float('nan')
    return sum([x * y for x, y in zip(vec1, vec2)]) / denominator
116 # [ first_component , second_component
117 # first_component = [ 1 player, 2 player, ...
118 pca_components = transpose(input_vectors)
119 styles = [[],[],[],[]]
120 for s in xrange(4):
121 for p in players:
122 styles[s] += [float(Data.questionare_total[p][s])]
123 styles += [[ float(Data.player_year[p]) for p in players ]]
125 eiv = pca.get_eigenvalues()
126 colh = '\033[93m'
127 coll = '\033[94m'
128 colstop = '\033[0m'
129 print
130 print "Eigenvalue \ Style:",
131 for s in xrange(4):
132 print "%d. "%(s+1,) + (' ' if s != 3 else ''),
133 print "Year"
134 for num, c in enumerate(pca_components):
135 print "%1.7f %2d.: "%(eiv[num],num+1,),
136 for s in styles:
137 pc = pearson_coef(c,s)
138 if pc > 0:
139 pad = ' '
140 else:
141 pad = ''
142 if abs(pc) > 0.5:
143 c1, c2 = colh, colstop
144 elif abs(pc) < 0.2:
145 c1, c2 = coll, colstop
146 else:
147 c1, c2 = '',''
148 print pad + c1 + "%1.3f"%(pc,) + c2,
149 print
151 print
152 print "S\S: ",
153 for s in xrange(4):
154 print "%d. "%(s+1,),
155 print "Year"
156 for num, c in enumerate(styles):
157 print "%2d.:"%(num+1,) if num <= 3 else ' Y: ',
158 for num2, s in enumerate(styles):
159 if num > num2:
160 print ' ',
161 continue
162 pc = pearson_coef(c,s)
163 if pc > 0:
164 pad = ' '
165 else:
166 pad = ''
167 if abs(pc) > 0.5:
168 c1, c2 = colh, colstop
169 elif abs(pc) < 0.2:
170 c1, c2 = coll, colstop
171 else:
172 c1, c2 = '',''
173 print pad + c1 + "%1.3f"%(pc,) + c2,
174 print