pca.py

   1 #!/usr/bin/python
   2 """
   3 This code creates input vectors and performs PCA on it. Each pca'd vector is then printed along with
   4 the player name, suitable e.g. to plot using gnuplot.
   5 """
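   6 # NOTE: the OUTPUT FORMAT described below is out of date. The script currently writes one line per component to the output file: player_name component_index component_value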
   7 """
   8 OUTPUT FORMAT
   9 player_name first_principal_component_of_player's_input_vector second_principal_component ...
  10 second_player_name ...
  11 ...
  12 """
  13 import sys
  14 from gostyle import * #print_vector, OccurenceVectorGenerator, Rescale, PlayerStrategyIdentificator, PCA, InputVectorGenerator
  15 from itertools import izip, count
  16 from data_about_players import Data
  17
  18 if __name__ == '__main__':
  19         main_pat_filename = Data.main_pat_filename
  20         num_features = 500
  21         pickle_filename = 'input_gen.pickle'
  22         filename_pca = 'pca.data'
  23         filename_proj = 'pca.dimdata'
  24         players_ignore = [ 'Cho Tae-hyeon', 'Shao Zhenzhong', 'Wu Songsheng', 'Honinbo Shusaku',  'Kuwahara Shusaku', 'Yasuda Shusaku', 'Go Seigen', 'Suzuki Goro', 'Jie Li' ] #, 'Cho Chikun', 'Takemiya Masaki']
  25         players_all = Data.players_all
  26         players = [ p for p in players_all if p not in players_ignore ]
  27         #players = Data.player_vector.keys()
  28
  29         ### Objects creating input and output vectors when called
  30         print >>sys.stderr, "Creating input vector generator from main pat file:", main_pat_filename
  31         ivg = InputVectorGenerator(main_pat_filename, num_features)
  32
  33         # Create pairs of (input vector, player name)
  34         input_vectors = []
  35         for name in players:
  36                 #input_vectors += [ivg( Data.pat_files_folder + name)]
  37                 input_vectors += [[float(occ) for occ in ivg(Data.pat_files_folder + name)]]
  38
  39         if len(input_vectors) == 0:
  40                 print >>sys.stderr, "No input vectors.", main_pat_filename
  41                 sys.exit()
  42
  43         # Create PCA object, trained on input_vectors
  44         pca = PCA(input_vectors, output_dim=10)
  45
  46         # Dump the input generator to file
  47         print "Saving the pca input generator object to file:", pickle_filename
  48         dump_object_to_file(Compose(ivg, pca), pickle_filename)
  49
  50         #pca = PCA(input_vectors, reduce=True)
  51         print >> sys.stderr, "Eigenvalues (top 5): ",
  52         for val in pca.get_eigenvalues()[:5]:
  53                 print >> sys.stderr, "%2.5f"%(val,),
  54
  55         # Perform a PCA on input vectors
  56         input_vectors = pca.process_list_of_vectors(input_vectors)
  57
  58         ### Now we rescale vectors, so that each component fits on -1.0 to 1.0
  59         ### this makes a very nice plot!!
  60         r = Rescale(-1.0,1.0)
  61
  62         ### Normalize each component separately
  63         # We need to transpose input_vectors - a list of per-player-vector-of-pca-component
  64         # to get list of vectors of per-component-vector-of-player-data
  65         def transpose(list_of_vectors):
  66                 return zip(*list_of_vectors)
  67         input_vectors = transpose([ r(vector) for vector in transpose(input_vectors)])
  68
  69
  70         print >> sys.stderr, "Writing output to file: ", filename_pca
  71         # prints vectors along with player names
  72         f=open(filename_pca, 'w')
  73         for name,vector in izip(players, input_vectors):
  74                 # Substitute ' ' by '_' to allow for gnuplot plotting (recognizing columns correctly)
  75                 name_to_print = '_'.join(name.split())
  76                 for p, i in izip(vector, count()):
  77                         print >> f, name_to_print, i+1, p
  78         f.close()
  79
  80         print >> sys.stderr, "Writing projection info to file: ", filename_proj
  81         f = open(filename_proj, 'w')
  82         P = pca.get_projection_info()
  83         for y in xrange(1, P.shape[0]):
  84                 for x in xrange(1, P.shape[1]):
  85                         print >> f, y, x, P[y,x], ivg.ovg.stringof(x)
  86         f.close()
  87
  88         print >> sys.stderr, "\nNow print that by:"
  89         print >> sys.stderr, 'gnuplot> set xrange[1:%d]'%(pca.pca.output_dim+1)
  90         print >> sys.stderr, 'plot "%s" using 2:3:1 with labels font "arial,10" left point pt 4 offset 1,0'%(filename_pca,)