pca.py

   1 #!/usr/bin/python
   2 """
   3 This code creates input vectors and performs PCA on them. Each PCA-transformed vector is then
   4 printed along with the player name, in a form suitable e.g. for plotting with gnuplot.
   5 """
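   6 # NOTE: the OUTPUT FORMAT described below is not true currently; the script writes one "player_name component_number value" line per component.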
   7 """
   8 OUTPUT FORMAT
   9 player_name first_principal_component_of_player's_input_vector second_principal_component ...
  10 second_player_name ...
  11 ...
  12 """
  13 import sys
  14 from gostyle import * #print_vector, OccurenceVectorGenerator, Rescale, PlayerStrategyIdentificator, PCA, InputVectorGenerator
  15 from itertools import izip, count
  16 from data_about_players import Data
  17 from math import sqrt
  18
  19 if __name__ == '__main__':
  20         main_pat_filename = Data.main_pat_filename
  21         num_features = 500
  22         pickle_filename = 'input_gen.pickle'
  23         filename_pca = 'pca.data'
  24         filename_proj = 'pca.dimdata'
  25         players_ignore = ['Go Seigen', 'Ishida Yoshio', 'Yamashita Keigo']# 'Cho Tae-hyeon', 'Shao Zhenzhong', 'Wu Songsheng', 'Honinbo Shusaku',  'Kuwahara Shusaku', 'Yasuda Shusaku', 'Go Seigen', 'Suzuki Goro', 'Jie Li' ] #, 'Cho Chikun', 'Takemiya Masaki']
  26         players_all = Data.players_all
  27         #players_all = Data.strength_all
  28         players = [ p for p in players_all if p not in players_ignore ]
  29         #players = Data.player_vector.keys()
  30
  31         ### Objects creating input and output vectors when called
  32         print >>sys.stderr, "Creating input vector generator from main pat file:", main_pat_filename
  33         ivg = InputVectorGenerator(main_pat_filename, num_features)
  34
  35         # Create pairs of (input vector, player name)
  36         input_vectors = []
  37         for name in players:
  38                 #input_vectors += [ivg( Data.pat_files_folder + name)]
  39                 input_vectors += [[float(occ) for occ in ivg(Data.pat_files_folder + name)]]
  40
  41         if len(input_vectors) == 0:
  42                 print >>sys.stderr, "No input vectors.", main_pat_filename
  43                 sys.exit()
  44
  45         # Create PCA object, trained on input_vectors
  46         pca = PCA(input_vectors, output_dim=10)
  47         #pca = PCA(input_vectors, reduce=True)
  48
  49         # Dump the input generator to file
  50         print "Saving the pca input generator object to file:", pickle_filename
  51         dump_object_to_file(Compose(ivg, pca), pickle_filename)
  52
  53         #print >> sys.stderr, "Eigenvalues (top 5): ",
  54         #for val in pca.get_eigenvalues()[:5]:
  55         #       print >> sys.stderr, "%2.5f"%(val,),
  56         #print >> sys.stderr
  57
  58         # Perform a PCA on input vectors
  59         input_vectors = pca.process_list_of_vectors(input_vectors)
  60
  61         ### Now we rescale vectors, so that each component fits on -1.0 to 1.0
  62         ### this makes a very nice plot!!
  63         r = Rescale(-1.0,1.0)
  64
  65         ### Normalize each component separately
  66         # We need to transpose input_vectors - a list of per-player-vector-of-pca-component
  67         # to get list of vectors of per-component-vector-of-player-data
  68         def transpose(list_of_vectors):
  69                 return zip(*list_of_vectors)
  70         input_vectors = transpose([ r(vector) for vector in transpose(input_vectors)])
  71
  72
  73         print >> sys.stderr, "Writing output to file: ", filename_pca
  74         # prints vectors along with player names
  75         f=open(filename_pca, 'w')
  76         for name,vector in izip(players, input_vectors):
  77                 # Substitute ' ' by '_' to allow for gnuplot plotting (recognizing columns correctly)
  78                 name_to_print = '_'.join(name.split())
  79                 for p, i in izip(vector, count()):
  80                         print >> f, name_to_print, i+1, p
  81         f.close()
  82
  83         print >> sys.stderr, "Writing projection info to file: ", filename_proj
  84         f = open(filename_proj, 'w')
  85         P = pca.get_projection_info()
  86         for y in xrange(1, P.shape[0]):
  87                 for x in xrange(1, P.shape[1]):
  88                         print >> f, y, x, P[y,x], ivg.ovg.stringof(x)
  89         f.close()
  90
  91         print >> sys.stderr, "\nNow print that by:"
  92         print >> sys.stderr, 'gnuplot> set xrange[1:%d]'%(pca.pca.output_dim+1)
  93         print >> sys.stderr, 'plot "%s" using 2:3:1 with labels font "arial,10" left point pt 4 offset 1,0'%(filename_pca,)
  94
  95         def pearson_coef(vec1, vec2):
  96                 assert len(vec1)
  97                 assert len(vec1) == len(vec2)
  98
  99                 def norm(vec):
 100                         return sqrt(sum([ x*x for x in vec]))
 101                 def center(vec):
 102                         def avg(vec):
 103                                 return sum(vec) / float(len(vec))
 104                         c = avg(vec)
 105                         return [ x - c for x in vec ]
 106                 vec1, vec2 = center(vec1), center(vec2)
 107                 return sum([x * y for x,y in zip(vec1,vec2)])/(norm(vec1) * norm(vec2))
 108
 109         # [ first_component , second_component
 110         # first_component = [ 1 player, 2 player, ...
 111         pca_components = transpose(input_vectors)
 112         styles = [[],[],[],[]]
 113         for s in xrange(4):
 114                 for p in players:
 115                         styles[s] += [float(Data.questionare_total[p][s])]
 116         styles += [[ float(Data.player_year[p]) for p in players ]]
 117
 118         eiv = pca.get_eigenvalues()
 119         colh = '\033[93m'
 120         coll = '\033[94m'
 121         colstop = '\033[0m'
 122         print
 123         print "Eigenvalue \ Style:",
 124         for s in xrange(4):
 125                 print "%d.   "%(s+1,) + (' ' if s != 3 else ''),
 126         print "Year"
 127         for num, c in enumerate(pca_components):
 128                 print "%1.7f  %2d.: "%(eiv[num],num+1,),
 129                 for s in styles:
 130                         pc = pearson_coef(c,s)
 131                         if pc > 0:
 132                                 pad = ' '
 133                         else:
 134                                 pad = ''
 135                         if abs(pc) > 0.5:
 136                                 c1, c2 = colh, colstop
 137                         elif abs(pc) < 0.2:
 138                                 c1, c2 = coll, colstop
 139                         else:
 140                                 c1, c2 = '',''
 141                         print pad + c1 + "%1.3f"%(pc,) + c2,
 142                 print
 143
 144         print
 145         print "S\S:  ",
 146         for s in xrange(4):
 147                 print "%d.    "%(s+1,),
 148         print "Year"
 149         for num, c in enumerate(styles):
 150                 print "%2d.:"%(num+1,) if num <= 3 else ' Y: ',
 151                 for num2, s in enumerate(styles):
 152                         if num > num2:
 153                                 print '      ',
 154                                 continue
 155                         pc = pearson_coef(c,s)
 156                         if pc > 0:
 157                                 pad = ' '
 158                         else:
 159                                 pad = ''
 160                         if abs(pc) > 0.5:
 161                                 c1, c2 = colh, colstop
 162                         elif abs(pc) < 0.2:
 163                                 c1, c2 = coll, colstop
 164                         else:
 165                                 c1, c2 = '',''
 166                         print pad + c1 + "%1.3f"%(pc,) + c2,
 167                 print
 168