Add support+docs for easy dataset generation
[gostyle.git] / pca.py
blob9cd06818fbd0a9e58c679501cb805c48f5411ae8
1 #!/usr/bin/python
2 """
3 This code creates input vectors and performs PCA on them. Each PCA-transformed vector is then printed along with
4 the player name, in a form suitable e.g. for plotting with gnuplot.
5 """
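6 # NOTE: the OUTPUT FORMAT described below is not what the script currently prints; it emits one "player_name component_index value" line per component.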
7 """
8 OUTPUT FORMAT
9 player_name first_principal_component_of_player's_input_vector second_principal_component ...
10 second_player_name ...
11 ...
12 """
13 import sys
14 from gostyle import print_vector, OccurenceVectorGenerator, Rescale, PlayerStrategyIdentificator, PCA, InputVectorGenerator
15 from itertools import izip, count
16 from data_about_players import Data
def transpose(list_of_vectors):
    """Return the transpose of a rectangular list of vectors.

    Element ``[j][i]`` of the input becomes element ``[i][j]`` of the result.
    Every row of the result is a new list.  An empty input yields an empty
    list instead of raising IndexError.
    """
    return [list(column) for column in zip(*list_of_vectors)]


def _emit(stream, *fields):
    """Write the fields to `stream`, space-separated, followed by a newline.

    This mirrors what the Python 2 ``print`` statement produces (``str()`` of
    each item joined by single spaces) while remaining valid syntax under
    Python 3 as well.
    """
    stream.write(' '.join(str(field) for field in fields) + '\n')


def main():
    """Build one input vector per player, run PCA on them and print the result.

    Data rows of the form "player_name component_index value" are written to
    stdout; progress messages and the gnuplot hints are written to stderr.
    Exits with a non-zero status when there are no input vectors.
    """
    main_pat_filename = Data.main_pat_filename
    num_features = 500
    # Players left out of the analysis.
    # Previously also considered for exclusion: 'Cho Chikun', 'Takemiya Masaki'.
    players_ignore = [
        'Honinbo Shusaku',
        'Kuwahara Shusaku',
        'Yasuda Shusaku',
        'Go Seigen',
        'Suzuki Goro',
        'Jie Li',
    ]
    players_all = PlayerStrategyIdentificator(Data.strategy_players).all_players
    # Alternative player source used earlier: Data.player_vector.keys()
    players = [p for p in players_all if p not in players_ignore]

    ### Object creating input vectors when called
    _emit(sys.stderr, "Creating input vector generator from main pat file:", main_pat_filename)
    vector_generator = InputVectorGenerator(main_pat_filename, num_features)

    # One vector of floats per player, in the same order as `players`.
    input_vectors = [
        [float(occ) for occ in vector_generator(Data.pat_files_folder + name)]
        for name in players
    ]

    if not input_vectors:
        _emit(sys.stderr, "No input vectors.", main_pat_filename)
        # Non-zero status so that callers and shell pipelines see the failure.
        sys.exit(1)

    # Create PCA object, trained on input_vectors
    # (alternative tried earlier: PCA(input_vectors, reduce=True))
    pca = PCA(input_vectors, output_dim=10)

    # Perform a PCA on input vectors
    input_vectors = pca.process_list_of_vectors(input_vectors)

    ### Now we rescale vectors, so that each component fits on -1.0 to 1.0
    ### this makes a very nice plot!!
    rescale = Rescale(-1.0, 1.0)
    # Normalize each component separately: transpose so that every row holds
    # one component across all players, rescale the rows, then transpose back.
    input_vectors = transpose([rescale(component) for component in transpose(input_vectors)])

    # Print vectors along with player names
    for name, vector in zip(players, input_vectors):
        # Substitute ' ' by '_' to allow for gnuplot plotting (recognizing columns correctly)
        name_to_print = '_'.join(name.split())

        # Component indices are printed 1-based.
        for index, value in enumerate(vector, 1):
            _emit(sys.stdout, name_to_print, index, value)

    _emit(sys.stderr, "\nNow print that by:")
    _emit(sys.stderr, 'gnuplot> set xrange [1:%d]' % (pca.pca.output_dim + 1))
    _emit(sys.stderr, 'gnuplot> plot "./pca.data" using 2:3:1 with labels font "arial,10" left point pt 4 offset 1,0')


if __name__ == '__main__':
    main()