DRAFT1
[gostyle.git] / pca.py
blob018b05b777532a3419ade6c933db8ba2b69cba80
1 #!/usr/bin/python
2 """
3 This code creates input vectors and performs PCA on it. Each pca'd vector is then printed along with
4 the player name, suitable e.g. to plot using gnuplot.
5 """
6 # not true currently
7 """
8 OUTPUT FORMAT
9 player_name first_principal_component_of_player's_input_vector second_principal_component ...
10 second_player_name ...
11 ...
12 """
13 import sys
14 from gostyle import * #print_vector, OccurenceVectorGenerator, Rescale, PlayerStrategyIdentificator, PCA, InputVectorGenerator
15 from itertools import izip, count
16 from data_about_players import Data
17 from math import sqrt
19 if __name__ == '__main__':
20 main_pat_filename = Data.main_pat_filename
21 num_features = 500
22 pickle_filename = 'input_gen.pickle'
23 filename_pca = 'pca.data'
24 filename_proj = 'pca.dimdata'
25 players_ignore = ['Go Seigen', 'Ishida Yoshio', 'Yamashita Keigo']# 'Cho Tae-hyeon', 'Shao Zhenzhong', 'Wu Songsheng', 'Honinbo Shusaku', 'Kuwahara Shusaku', 'Yasuda Shusaku', 'Go Seigen', 'Suzuki Goro', 'Jie Li' ] #, 'Cho Chikun', 'Takemiya Masaki']
26 players_all = Data.players_all
27 #players_all = Data.strength_all
28 players = [ p for p in players_all if p not in players_ignore ]
29 #players = Data.player_vector.keys()
31 ### Objects creating input and output vectors when called
32 print >>sys.stderr, "Creating input vector generator from main pat file:", main_pat_filename
33 ivg = InputVectorGenerator(main_pat_filename, num_features)
35 # Create pairs of (input vector, player name)
36 input_vectors = []
37 for name in players:
38 #input_vectors += [ivg( Data.pat_files_folder + name)]
39 input_vectors += [[float(occ) for occ in ivg(Data.pat_files_folder + name)]]
41 if len(input_vectors) == 0:
42 print >>sys.stderr, "No input vectors.", main_pat_filename
43 sys.exit()
45 # Create PCA object, trained on input_vectors
46 pca = PCA(input_vectors, output_dim=10)
47 #pca = PCA(input_vectors, reduce=True)
49 # Dump the input generator to file
50 print "Saving the pca input generator object to file:", pickle_filename
51 dump_object_to_file(Compose(ivg, pca), pickle_filename)
53 #print >> sys.stderr, "Eigenvalues (top 5): ",
54 #for val in pca.get_eigenvalues()[:5]:
55 # print >> sys.stderr, "%2.5f"%(val,),
56 #print >> sys.stderr
58 # Perform a PCA on input vectors
59 input_vectors = pca.process_list_of_vectors(input_vectors)
61 ### Now we rescale vectors, so that each component fits on -1.0 to 1.0
62 ### this makes a very nice plot!!
63 r = Rescale(-1.0,1.0)
65 ### Normalize each component separately
66 # We need to transpose input_vectors - a list of per-player-vector-of-pca-component
67 # to get list of vectors of per-component-vector-of-player-data
def transpose(list_of_vectors):
    """Swap rows and columns of a list of equally long vectors.

    The i-th item of the result groups the i-th elements of every input
    vector (as produced by the builtin ``zip``; rows of unequal length are
    truncated to the shortest one).
    """
    columns = zip(*list_of_vectors)
    return columns
if __name__ == '__main__':
    # Script entry point, part 2: rescale every PCA component separately,
    # then write the per-player data and the projection info for gnuplot.

    # Transpose to per-component vectors, rescale each of them with `r`,
    # and transpose back to per-player vectors.
    input_vectors = transpose([r(vector) for vector in transpose(input_vectors)])

    print >> sys.stderr, "Writing output to file: ", filename_pca
    # One line per (player, component): "<name> <component number> <value>"
    f = open(filename_pca, 'w')
    try:
        for name, vector in izip(players, input_vectors):
            # Substitute ' ' by '_' to allow for gnuplot plotting (recognizing columns correctly)
            name_to_print = '_'.join(name.split())
            for i, p in enumerate(vector):
                # Components are numbered from 1 in the output.
                print >> f, name_to_print, i+1, p
    finally:
        # Close the file even if a write fails half-way.
        f.close()

    print >> sys.stderr, "Writing projection info to file: ", filename_proj
    # Fetch the data before truncating the output file, so a failure here
    # does not leave an empty file behind.
    P = pca.get_projection_info()
    f = open(filename_proj, 'w')
    try:
        # NOTE(review): both loops start at index 1, so row 0 and column 0
        # of P are never written -- confirm this is intentional and not an
        # off-by-one.
        for y in xrange(1, P.shape[0]):
            for x in xrange(1, P.shape[1]):
                print >> f, y, x, P[y,x], ivg.ovg.stringof(x)
    finally:
        f.close()

    print >> sys.stderr, "\nNow print that by:"
    print >> sys.stderr, 'gnuplot> set xrange[1:%d]'%(pca.pca.output_dim+1)
    print >> sys.stderr, 'plot "%s" using 2:3:1 with labels font "arial,10" left point pt 4 offset 1,0'%(filename_pca,)
def pearson_coef(vec1, vec2):
    """Return the Pearson correlation coefficient of two sequences.

    Both sequences are centred on their mean; the result is the dot product
    of the centred sequences divided by the product of their Euclidean
    norms.

    vec1, vec2 -- non-empty sequences of numbers of equal length.

    Returns a float in [-1.0, 1.0] (up to rounding), or NaN when either
    sequence is constant: its centred norm is then zero and the coefficient
    is undefined.

    Raises ValueError if the sequences are empty or differ in length.
    """
    # Explicit checks instead of `assert`, which is stripped under -O.
    if not len(vec1):
        raise ValueError("pearson_coef: vectors must not be empty")
    if len(vec1) != len(vec2):
        raise ValueError("pearson_coef: vectors must have the same length")

    def center(vec):
        mean = sum(vec) / float(len(vec))
        return [x - mean for x in vec]

    def norm(vec):
        return sqrt(sum(x * x for x in vec))

    centered1, centered2 = center(vec1), center(vec2)
    denominator = norm(centered1) * norm(centered2)
    if denominator == 0:
        # At least one input has zero variance; avoid ZeroDivisionError.
        return float('nan')
    return sum(x * y for x, y in zip(centered1, centered2)) / denominator
109 # [ first_component , second_component
110 # first_component = [ 1 player, 2 player, ...
111 pca_components = transpose(input_vectors)
112 styles = [[],[],[],[]]
113 for s in xrange(4):
114 for p in players:
115 styles[s] += [float(Data.questionare_total[p][s])]
116 styles += [[ float(Data.player_year[p]) for p in players ]]
118 eiv = pca.get_eigenvalues()
119 colh = '\033[93m'
120 coll = '\033[94m'
121 colstop = '\033[0m'
122 print
123 print "Eigenvalue \ Style:",
124 for s in xrange(4):
125 print "%d. "%(s+1,) + (' ' if s != 3 else ''),
126 print "Year"
127 for num, c in enumerate(pca_components):
128 print "%1.7f %2d.: "%(eiv[num],num+1,),
129 for s in styles:
130 pc = pearson_coef(c,s)
131 if pc > 0:
132 pad = ' '
133 else:
134 pad = ''
135 if abs(pc) > 0.5:
136 c1, c2 = colh, colstop
137 elif abs(pc) < 0.2:
138 c1, c2 = coll, colstop
139 else:
140 c1, c2 = '',''
141 print pad + c1 + "%1.3f"%(pc,) + c2,
142 print
144 print
145 print "S\S: ",
146 for s in xrange(4):
147 print "%d. "%(s+1,),
148 print "Year"
149 for num, c in enumerate(styles):
150 print "%2d.:"%(num+1,) if num <= 3 else ' Y: ',
151 for num2, s in enumerate(styles):
152 if num > num2:
153 print ' ',
154 continue
155 pc = pearson_coef(c,s)
156 if pc > 0:
157 pad = ' '
158 else:
159 pad = ''
160 if abs(pc) > 0.5:
161 c1, c2 = colh, colstop
162 elif abs(pc) < 0.2:
163 c1, c2 = coll, colstop
164 else:
165 c1, c2 = '',''
166 print pad + c1 + "%1.3f"%(pc,) + c2,
167 print