pca.py

   1 #!/usr/bin/python
   2 """
This script creates input vectors and performs PCA on them. Each PCA-transformed vector is then
written to a file along with the player name, in a form suitable e.g. for plotting with gnuplot.
   5 """
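# NOTE: the OUTPUT FORMAT described below is currently not accurate; the PCA
# output file holds one "player_name component_index value" triple per line.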
   7 """
   8 OUTPUT FORMAT
   9 player_name first_principal_component_of_player's_input_vector second_principal_component ...
  10 second_player_name ...
  11 ...
  12 """
  13 import sys
  14 from gostyle import * #print_vector, OccurenceVectorGenerator, Rescale, PlayerStrategyIdentificator, PCA, InputVectorGenerator
  15 from itertools import izip, count
  16 from data_about_players import Data
  17 from math import sqrt
  18
  19 if __name__ == '__main__':
  20         main_pat_filename = Data.main_pat_filename
  21         num_features = 500
  22         pickle_filename = 'input_gen.pickle'
  23         filename_raw = 'raw.data'
  24         filename_pca = 'pca.data'
  25         filename_proj = 'pca.dimdata'
  26         players_ignore = ['Go Seigen', 'Ishida Yoshio', 'Yamashita Keigo']# 'Cho Tae-hyeon', 'Shao Zhenzhong', 'Wu Songsheng', 'Honinbo Shusaku',  'Kuwahara Shusaku', 'Yasuda Shusaku', 'Go Seigen', 'Suzuki Goro', 'Jie Li' ] #, 'Cho Chikun', 'Takemiya Masaki']
  27         players_all = Data.players_all
  28         #players_all = Data.strength_all
  29         players = [ p for p in players_all if p not in players_ignore ]
  30         #players = Data.player_vector.keys()
  31
  32         ### Objects creating input and output vectors when called
  33         print >>sys.stderr, "Creating input vector generator from main pat file:", main_pat_filename
  34         ivg = InputVectorGenerator(main_pat_filename, num_features)
  35
  36         # Create pairs of (input vector, player name)
  37         input_vectors = []
  38         for name in players:
  39                 #input_vectors += [ivg( Data.pat_files_folder + name)]
  40                 input_vectors += [[float(occ) for occ in ivg(Data.pat_files_folder + name)]]
  41
  42         if len(input_vectors) == 0:
  43                 print >>sys.stderr, "No input vectors.", main_pat_filename
  44                 sys.exit()
  45
  46         # Create PCA object, trained on input_vectors
  47         pca = PCA(input_vectors, output_dim=10)
  48         #pca = PCA(input_vectors, reduce=True)
  49
  50         # Dump the input generator to file
  51         print "Saving the pca input generator object to file:", pickle_filename
  52         dump_object_to_file(Compose(ivg, pca), pickle_filename)
  53
  54         print >> sys.stderr, "Writing raw vectors to file: ", filename_raw
  55         f=open(filename_raw, 'w')
  56         for name,vector in izip(players, input_vectors):
  57                 print >> f, '"'+name+'"', vector
  58         f.close()
  59
  60         #print >> sys.stderr, "Eigenvalues (top 5): ",
  61         #for val in pca.get_eigenvalues()[:5]:
  62         #       print >> sys.stderr, "%2.5f"%(val,),
  63         #print >> sys.stderr
  64
  65         # Perform a PCA on input vectors
  66         input_vectors = pca.process_list_of_vectors(input_vectors)
  67
  68         ### Now we rescale vectors, so that each component fits on -1.0 to 1.0
  69         ### this makes a very nice plot!!
  70         r = Rescale(-1.0,1.0)
  71
  72         ### Normalize each component separately
  73         # We need to transpose input_vectors - a list of per-player-vector-of-pca-component
  74         # to get list of vectors of per-component-vector-of-player-data
  75         def transpose(list_of_vectors):
  76                 return zip(*list_of_vectors)
  77         input_vectors = transpose([ r(vector) for vector in transpose(input_vectors)])
  78
  79
  80         print >> sys.stderr, "Writing output to file: ", filename_pca
  81         # prints vectors along with player names
  82         f=open(filename_pca, 'w')
  83         for name,vector in izip(players, input_vectors):
  84                 # Substitute ' ' by '_' to allow for gnuplot plotting (recognizing columns correctly)
  85                 name_to_print = '_'.join(name.split())
  86                 for p, i in izip(vector, count()):
  87                         print >> f, name_to_print, i+1, p
  88         f.close()
  89
  90         print >> sys.stderr, "Writing projection info to file: ", filename_proj
  91         f = open(filename_proj, 'w')
  92         P = pca.get_projection_info()
  93         for y in xrange(1, P.shape[0]):
  94                 for x in xrange(1, P.shape[1]):
  95                         print >> f, y, x, P[y,x], ivg.ovg.stringof(x)
  96         f.close()
  97
  98         print >> sys.stderr, "\nNow print that by:"
  99         print >> sys.stderr, 'gnuplot> set xrange[1:%d]'%(pca.pca.output_dim+1)
 100         print >> sys.stderr, 'plot "%s" using 2:3:1 with labels font "arial,10" left point pt 4 offset 1,0'%(filename_pca,)
 101
 102         def pearson_coef(vec1, vec2):
 103                 assert len(vec1)
 104                 assert len(vec1) == len(vec2)
 105
 106                 def norm(vec):
 107                         return sqrt(sum([ x*x for x in vec]))
 108                 def center(vec):
 109                         def avg(vec):
 110                                 return sum(vec) / float(len(vec))
 111                         c = avg(vec)
 112                         return [ x - c for x in vec ]
 113                 vec1, vec2 = center(vec1), center(vec2)
 114                 return sum([x * y for x,y in zip(vec1,vec2)])/(norm(vec1) * norm(vec2))
 115
 116         # [ first_component , second_component
 117         # first_component = [ 1 player, 2 player, ...
 118         pca_components = transpose(input_vectors)
 119         styles = [[],[],[],[]]
 120         for s in xrange(4):
 121                 for p in players:
 122                         styles[s] += [float(Data.questionare_total[p][s])]
 123         styles += [[ float(Data.player_year[p]) for p in players ]]
 124
 125         eiv = pca.get_eigenvalues()
 126         colh = '\033[93m'
 127         coll = '\033[94m'
 128         colstop = '\033[0m'
 129         print
 130         print "Eigenvalue \ Style:",
 131         for s in xrange(4):
 132                 print "%d.   "%(s+1,) + (' ' if s != 3 else ''),
 133         print "Year"
 134         for num, c in enumerate(pca_components):
 135                 print "%1.7f  %2d.: "%(eiv[num],num+1,),
 136                 for s in styles:
 137                         pc = pearson_coef(c,s)
 138                         if pc > 0:
 139                                 pad = ' '
 140                         else:
 141                                 pad = ''
 142                         if abs(pc) > 0.5:
 143                                 c1, c2 = colh, colstop
 144                         elif abs(pc) < 0.2:
 145                                 c1, c2 = coll, colstop
 146                         else:
 147                                 c1, c2 = '',''
 148                         print pad + c1 + "%1.3f"%(pc,) + c2,
 149                 print
 150
 151         print
 152         print "S\S:  ",
 153         for s in xrange(4):
 154                 print "%d.    "%(s+1,),
 155         print "Year"
 156         for num, c in enumerate(styles):
 157                 print "%2d.:"%(num+1,) if num <= 3 else ' Y: ',
 158                 for num2, s in enumerate(styles):
 159                         if num > num2:
 160                                 print '      ',
 161                                 continue
 162                         pc = pearson_coef(c,s)
 163                         if pc > 0:
 164                                 pad = ' '
 165                         else:
 166                                 pad = ''
 167                         if abs(pc) > 0.5:
 168                                 c1, c2 = colh, colstop
 169                         elif abs(pc) < 0.2:
 170                                 c1, c2 = coll, colstop
 171                         else:
 172                                 c1, c2 = '',''
 173                         print pad + c1 + "%1.3f"%(pc,) + c2,
 174                 print
 175