# gostyle.git / orange_hacks / fann_neural.py
1 """
2 Wrapper for the Fast Artificial Neural Network library:
3 http://leenissen.dk/fann/wp/
5 This module mainly contains FannNeuralLearner and FannNeuralClassifier,
6 the classifier supports both classification (both normal and multilabel)
7 and reggresion.
9 The size of domains for Continuous classes is
10 limited by the range of activation functions of the neurons.
12 """

import Orange
import Orange.core

import numpy
import tempfile
import itertools
import logging

from pyfann import libfann

__author__ = "Josef Moudrik"
__credits__ = ['Authors of the Fann library, http://leenissen.dk/fann/wp/']
__license__ = "GPL"
__version__ = "3.0"
__maintainer__ = "Josef Moudrik"
__email__ = "J.Moudrik@gmail.com"

class FannNeuralNetPickable:
    """Wrapper around libfann.neural_net that can be pickled."""

    def __init__(self, filename=None):
        self.ann = libfann.neural_net()
        if filename is not None:
            self.ann.create_from_file(filename)

    def __getstate__(self):
        odict = self.__dict__.copy()
        del odict['ann']
        odict['fann_save'] = fake_file_call_f2s(self.ann.save)

        return odict

    def __setstate__(self, odict):
        ann = libfann.neural_net()
        fake_file_call_s2f(ann.create_from_file,
                           odict.pop('fann_save'))

        self.__dict__.update(odict)
        self.ann = ann

    def __getattr__(self, key):
        return self.ann.__getattribute__(key)
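
# The wrapper delegates all other attribute lookups to the underlying
# network via __getattr__, so it can be used wherever a libfann.neural_net
# is expected. A minimal round-trip sketch (the network state travels
# through FANN's own save format):
#
#   import pickle
#   net = FannNeuralNetPickable()
#   net.create_standard_array((2, 3, 1))
#   restored = pickle.loads(pickle.dumps(net))
#   assert restored.get_num_input() == 2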

class FannNeuralLearner(Orange.classification.Learner):
    """Orange learner wrapping the FANN artificial neural network library.

    See __call__ for a description of the learning procedure and the
    supported keyword parameters.
    """

    def __new__(cls, examples=None, name='Fann neural', **kwargs):
        self = Orange.classification.Learner.__new__(cls, **kwargs)
        if examples:
            self.__init__(**kwargs)
            return self.__call__(examples, **kwargs)
        else:
            return self

    def __init__(self, name='Fann neural', **kwargs):
        """
        See
        http://leenissen.dk/fann/html/files/fann-h.html
        for parameters and methods of the ANN, and
        http://leenissen.dk/fann/html/files/fann_train-h.html
        for parameters and methods of the train data.
        """
        self.name = name
        # default parameters for the learner
        self.def_params = {
            "nn_type": 'standard',
            # disable the check for data to be in <-1,1>
            "allow_out_of_range": False,
            "autorescale_output": False,
            # dicts for setting properties of the ann and the train data
            "ann_prop": {},
            "train_prop": {},
            # custom postprocessing functions for more complicated
            # modifications, see the __call__ below
            "ann_postprocess": None,    # will be called: ann_postprocess(ann)
            "train_postprocess": None,  # will be called: train_postprocess(train_data)
            # parameters:
            # CREATION
            "hidden_layers": [],  # number of neurons in each of the hidden layers
            # sparse
            "connection_rate": 0.5,
            # TRAINING
            "desired_error": 0.0001,
            # normal training
            "max_epochs": 2000,
            "iterations_between_reports": 0,  # 0 turns it off
            # cascade training
            "max_neurons": 20,
            "neurons_between_reports": 0,  # 0 turns it off
        }
        self.def_params.update(kwargs)

    def __call__(self, data,
                 weight=None,
                 **kwargs):
        """
        Learn from the given table of data instances.

        The learning proceeds as follows:

        1. The data are transformed into pairs of input and output vectors,
        the sizes of these vectors corresponding to the numbers of neurons
        in the input/output layers. The number of input neurons is
        determined by the number of columns in Table.to_numpy; the number
        of output neurons is as follows:
            1 output neuron for each Continuous class attribute (regression),
            N output neurons for each Discrete class, where N is the number
            of possible class values.

        Because the domain of the neurons' output function is usually
        <-1,1>, the regression task only works if the data is scaled to
        this interval. If you want to use the NN and your output variables
        have a different range, you should do some scaling. This wrapper
        has one canonical scaling available: if the autorescale_output
        option is set to True, the output is linearly scaled onto <-1,1>
        (min of the values to -1, max to 1, values in the middle linearly
        in between). The min and max are learned from the training set, so
        if larger values are present later when testing, this will not work
        optimally. Still, it usually works well. (If this option is used,
        the output values from running the actual regression are rescaled
        back, so this is transparent to the user.)

        2. A FANN training data struct (call it train_data) is made from
        these input/output pairs. The train_data is then postprocessed by:
            (a) for each (key, value) pair from params.train_prop,
            train_data.key(value) is called. This is used to set up FANN
            properties of the train data, as specified in

            http://leenissen.dk/fann/html/files/fann_train-h.html

            (b) if a params.train_postprocess function is given, then
            params.train_postprocess(train_data) is called. This param may
            be used to set up a hook for some complicated FANN train_data
            transformations.

        3. The Neural Network (ANN) is then created. FANN offers 3 network
        types, 'standard', 'sparse' and 'shortcut', as described in

        http://leenissen.dk/fann/html/files/fann-h.html

        Along with the 'cascade' type (which I have added; discussed in
        point 4 below), these can be specified in params.nn_type.

        The network is postprocessed (similarly to the train_data
        postprocessing):
            (a) for each (key, value) pair from params.ann_prop,
            ANN.key(value) is called. This is used to set up FANN
            properties of the network, as specified in the FANN reference.

            For example, setting the kw parameter

            ann_prop = {
                'set_activation_function_hidden': libfann.SIGMOID_STEPWISE,
                'set_activation_function_output': libfann.SIGMOID_STEPWISE,
                'set_training_algorithm': libfann.TRAIN_QUICKPROP
            }

            will override the default activation function
            libfann.SIGMOID_SYMMETRIC with its linear stepwise
            approximation, and will change the default gradient learning
            algorithm RPROP to QUICKPROP.

            (b) if a params.ann_postprocess function is given, then
            params.ann_postprocess(ANN) is called.

        4. The network is then trained. There are two different approaches
        to training in FANN:
            (a) fixed topology training: this is the "usual" way of
            training; the numbers of neurons and connections in the network
            are fixed, and we only choose the learning algorithm which
            iteratively changes the weights.

            (b) cascade training (training with evolving topology): this
            approach starts with an empty network and adds promising
            neurons into the network. See

            http://leenissen.dk/fann/html/files/fann_cascade-h.html

            for details. When using cascade training, the network type can
            only be the shortcut type with no hidden layers at the start.
            Here, in the FannNeuralLearner, you can specify that you want
            cascade learning by setting params.nn_type to 'cascade'. This
            triggers the shortcut topology and trains using the FANN
            cascade algorithm. Use params.nn_type = 'shortcut' if you want
            the standard fixed topology training.

        5. The classifier is returned. Surprisingly, huh? See its __doc__
        for more stuff.
        """
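
        # A hypothetical invocation sketch combining the options described
        # above (the parameter values are illustrative, not recommendations;
        # set_learning_rate and shuffle_train_data are standard FANN calls):
        #
        #   learner = FannNeuralLearner(
        #       nn_type='sparse', connection_rate=0.25,
        #       hidden_layers=[10, 5],
        #       ann_prop={'set_learning_rate': 0.7},
        #       train_prop={'shuffle_train_data': ()},
        #       max_epochs=1000, desired_error=0.001)
        #   classifier = learner(data)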

        # params for this run of __call__ are the default Learner's params
        # overridden by the __call__ kwargs
        class Params(object):
            pass
        params = Params()
        params.__dict__.update(self.def_params)
        params.__dict__.update(kwargs)

        if params.nn_type not in ['standard', 'sparse', 'shortcut', 'cascade']:
            raise ValueError('Unknown network type "%s"' % params.nn_type)

        ## Create the training input/output pairs
        # Step 1 in the __call__.__doc__
        X, Y = table_to_XY(data)

        def wrong_range(array):
            return not ((array >= -1.0) & (array <= 1.0)).all()

        # no scaling by default
        autoscaler = None
        if wrong_range(Y):
            if params.autorescale_output:
                lower = params.__dict__.get('autorescale_lower_bound', -1.0)
                upper = params.__dict__.get('autorescale_upper_bound', 1.0)

                autoscaler = AutoScaler(Y, lower, upper)
                Y = autoscaler(Y)
            elif not params.allow_out_of_range:
                raise RuntimeError("The training data for the neural net are not scaled"
                                   " to <-1,1>. This will probably result in poor performance"
                                   " of the regression."
                                   " Set allow_out_of_range to True to disable the check, or"
                                   " set autorescale_output to True to perform the automatic scaling"
                                   " (and descaling of output), or do some scaling yourself.")

        ## Create and postprocess the training data
        # Step 2 in the __doc__
        train_data = XY_to_fann_train_data(X, Y)

        # set properties
        fann_setter(train_data, params.train_prop)

        # postprocess if relevant
        if params.train_postprocess:
            params.train_postprocess(train_data)

        ## Create the ANN
        # Step 3 in the __doc__

        ann = FannNeuralNetPickable()
        # this could be used instead, but we use the wrapper, so that the
        # classifier is picklable
        #ann = libfann.neural_net()

        # topology = [ number of input neurons,
        #              number of neurons in 1st hidden layer,
        #              number of neurons in 2nd hidden layer,
        #              etc.,
        #              number of output neurons ]
        topology = (len(X[0]), ) + tuple(params.hidden_layers) + (len(Y[0]), )
        if params.nn_type == 'standard':
            ann.create_standard_array(topology)
        elif params.nn_type == 'sparse':
            ann.create_sparse_array(params.connection_rate, topology)
        elif params.nn_type == 'shortcut':
            ann.create_shortcut_array(topology)
        elif params.nn_type == 'cascade':
            if params.hidden_layers:
                raise ValueError("The cascade-trained network must not have any hidden layers at startup.")
            ann.create_shortcut_array(topology)
        else:
            assert False

        # set the properties
        # some defaults
        ann.set_activation_function_hidden(libfann.SIGMOID_SYMMETRIC)
        ann.set_activation_function_output(libfann.SIGMOID_SYMMETRIC)

        # override by
        fann_setter(ann, params.ann_prop)

        # postprocess if relevant
        if params.ann_postprocess:
            params.ann_postprocess(ann)

        ## Train the ANN
        # Step 4 in the __doc__

        if params.nn_type == 'cascade':
            ann.cascadetrain_on_data(train_data,
                                     params.max_neurons,
                                     params.neurons_between_reports,
                                     params.desired_error)
        else:
            ann.train_on_data(train_data,
                              params.max_epochs,
                              params.iterations_between_reports,
                              params.desired_error)

        return FannNeuralClassifier(ann, data.domain, autoscaler)

def table_to_XY(data):
    """Converts the Orange.Table data to pairs of input and output vectors
    (represented row-wise in two numpy.arrays X, Y)
    suitable to be used as a training/testing set for an artificial neural
    network.

    The attributes are created by the Table.to_numpy method. The class
    attribute(s) are transformed as follows:
    - each Continuous class attribute (regression) is assigned one output
      neuron (no scaling is performed in this step)
    - each Discrete class attribute (classification) is assigned one output
      neuron for each discrete value of this class. E.g. in the iris dataset
      (one discrete class attribute noting the name of the flower), we have
      3 neurons.
    """
    if not len(data):
        return numpy.array([]), numpy.array([])

    ## prepare the training data
    # classes

    cls_descriptors = filter(lambda desc: desc, [data.domain.class_var] + list(data.domain.class_vars))

    def get_unfolder(descriptor):
        """Unfolds a class variable into a number of output neurons' outputs."""
        if isinstance(descriptor, Orange.feature.Continuous):
            def unfold(value):
                return [float(value)]

        elif isinstance(descriptor, Orange.feature.Discrete):
            def unfold(value):
                l = [-1.0] * len(descriptor.values)
                l[int(value)] = 1.0
                return l

        else:
            raise ValueError("Unsupported class variable type '%s'. Must be either Discrete or Continuous." % descriptor.var_type)

        return unfold
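
    # For example, with the iris class variable (3 values), the value
    # 'Iris-versicolor' (index 1) unfolds to [-1.0, 1.0, -1.0]; a Continuous
    # class value 0.44 unfolds to [0.44].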

    unfolders = map(get_unfolder, cls_descriptors)

    def get_class_values(instance):
        l = []
        if data.domain.class_var:
            l = [instance.get_class()]
        return l + instance.get_classes()

    y = []

    # flatten([[0,0,0,1], [0.44], [1,0]]) =
    # [ 0, 0, 0, 1, 0.44, 1, 0 ]
    flatten = lambda it: list(itertools.chain.from_iterable(it))

    # multi_map([lambda x: x + 1, lambda x: x * 2], [0, 10]) =
    # [1, 20]
    multi_map = lambda Fs, Args: [f(arg) for f, arg in zip(Fs, Args)]

    for instance in data:
        values = get_class_values(instance)
        y.append(flatten(multi_map(unfolders, values)))

    # attributes
    X = data.to_numpy()[0]
    # classes
    Y = numpy.array(y)

    # debug output, disabled by default:
    #print "X"
    #for instance in data:
    #    print len(instance)
    #    print instance
    #print "Y"
    #print Y

    return X, Y
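
# A quick sanity sketch for table_to_XY (assuming the standard iris dataset
# shipped with Orange: 150 instances, 4 attributes, 3 class values):
#
#   data = Orange.data.Table("iris")
#   X, Y = table_to_XY(data)
#   # X.shape == (150, 4), Y.shape == (150, 3)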

def XY_to_fann_train_data(X, Y):
    if len(X) != len(Y):
        raise ValueError("X and Y must have the same number of rows.")

    train_data = libfann.training_data()

    if len(X):
        dim_X, dim_Y = len(X[0]), len(Y[0])

        tmp = tempfile.NamedTemporaryFile(delete=False)
        with tmp:
            tmp.write("%d %d %d\n" % (len(X), dim_X, dim_Y))
            for i in xrange(len(X)):
                for line in [X[i], Y[i]]:
                    tmp.write("%s\n" % ' '.join(str(float(val)) for val in line))

        train_data.read_train_from_file(tmp.name)
        tmp.unlink(tmp.name)

    return train_data
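
# The temporary file above uses FANN's plain-text training data format:
# a header line "num_pairs num_inputs num_outputs" followed by alternating
# input and output lines. E.g. for the four XOR pairs with 2 inputs and
# 1 output:
#
#   4 2 1
#   0.0 0.0
#   0.0
#   0.0 1.0
#   1.0
#   ...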

class RawScaler:
    """Linearly maps the interval <MIN, MAX> onto <a, b>."""

    def __init__(self, MIN, MAX, a, b):
        self.MIN = MIN
        self.MAX = MAX
        self.a = a
        self.b = b

    def __call__(self, number):
        assert self.a <= self.b

        if number < self.MIN or number > self.MAX:
            logging.warn("The MIN and MAX estimated from the train set"
                         " do not reflect the real MIN and MAX of the test set."
                         " (%.2f < %.2f) or (%.2f > %.2f)" % (number, self.MIN,
                                                              number, self.MAX))

        if self.MIN == self.MAX:
            # return the average value of the target interval
            return float(self.a + self.b) / 2

        return self.a + (number - self.MIN) * (float(self.b - self.a) / (self.MAX - self.MIN))
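
# Worked example: RawScaler(0, 10, -1, 1) maps x to -1 + x * 0.2,
# so 0 -> -1.0, 5 -> 0.0 and 10 -> 1.0. The inverse mapping is obtained by
# swapping the argument pairs, which is exactly what AutoScaler does below.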

class AutoScaler:
    """Column-wise linear scaler: learns per-column MIN/MAX from a training
    array and maps each column onto <a, b> (and back)."""

    def __init__(self, train_array, a=-1, b=1):
        assert a <= b
        self.a = a
        self.b = b
        self.train(train_array)

    def train(self, array):
        rows, cols = array.shape

        self.trans = []
        self.trans_back = []

        for col in xrange(cols):
            column = array[:, col]
            mi, ma = column.min(), column.max()
            self.trans.append(RawScaler(mi, ma, self.a, self.b))
            self.trans_back.append(RawScaler(self.a, self.b, mi, ma))

    def scale(self, vector):
        return self._scale(vector, self.trans)

    def scale_back(self, vector):
        return self._scale(vector, self.trans_back)

    def _scale(self, vector, fcs):
        vector = numpy.array(vector)
        cols, = vector.shape
        assert cols == len(fcs)
        return numpy.array([fcs[i](vector[i]) for i in xrange(cols)])

    def scale_array(self, array):
        return self._scale_array(array, self.trans)

    def scale_array_back(self, array):
        return self._scale_array(array, self.trans_back)

    def _scale_array(self, array, fcs):
        by_rows = [self._scale(vector, fcs) for vector in array]
        return numpy.hstack(by_rows).reshape(array.shape)

    def __call__(self, array):
        return self.scale_array(array)

## FIXME Orange.classification.Classifier (which should be the base class)
## is commented out because when it is not, pickling does not work...
class FannNeuralClassifier: #(Orange.classification.Classifier):

    def __init__(self, ann, domain, autoscaler=None):
        assert isinstance(ann, FannNeuralNetPickable)

        self.ann = ann
        self.domain = domain
        self.autoscaler = autoscaler

    def raw_response(self, instance):
        instance = list(instance)
        if self.domain.class_var:
            instance = instance[:len(self.domain) - 1]

        if len(instance) != self.ann.get_num_input():
            raise ValueError("Instance '%s' has wrong length (%d instead of %d)." % (str(instance),
                                                                                     len(instance),
                                                                                     self.ann.get_num_input()))

        input_vector = map(float, instance)

        ## run the input through the ANN
        output_vector = self.ann.run(input_vector)

        if self.autoscaler:
            output_vector = self.autoscaler.scale_back(output_vector)

        return output_vector

    def _get_responses(self, instance):
        # basically the opposite of the unfolding in table_to_XY

        output_vector = self.raw_response(instance)

        cls_descriptors = filter(lambda desc: desc, [self.domain.class_var] + list(self.domain.class_vars))

        def get_folder(descriptor):
            """Folds neurons' outputs into a target value.

            Returns a tuple (F, num), where F is a function that takes a
            list of length num (num is the number of neuron outputs
            consumed).
            """
            if isinstance(descriptor, Orange.feature.Continuous):
                def fold(outputs):
                    value = descriptor(outputs[0])
                    dist = Orange.statistics.distribution.Continuous(descriptor)
                    dist[value] = 1.
                    return value, dist
                return fold, 1

            elif isinstance(descriptor, Orange.feature.Discrete):
                def fold(outputs):
                    # the output neurons' range is <-1, 1>, where
                    #   -1 says this class is not likely
                    #    1 says this class is likely
                    # so we transform it to <0,2>, so that we do not have
                    # "negative" probabilities after the normalization
                    outputs = [o + 1 for o in outputs]
                    cprob = Orange.statistics.distribution.Discrete(outputs)
                    cprob.normalize()

                    mt_prob = cprob
                    mt_value = Orange.data.Value(descriptor, cprob.values().index(max(cprob)))
                    return mt_value, mt_prob
                return fold, len(descriptor.values)

            else:
                raise ValueError("Unsupported class variable type '%s'. Must be either Discrete or Continuous." % descriptor.var_type)

        responses = []
        for folder, input_size in map(get_folder, cls_descriptors):
            responses.append(folder(output_vector[:input_size]))
            output_vector = output_vector[input_size:]

        return responses
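
    # Folding sketch for a 3-valued discrete class: raw outputs
    # [-0.8, 0.9, -0.1] are shifted to [0.2, 1.9, 0.9] and normalized to
    # approximately [0.07, 0.63, 0.30], so the predicted value is the one
    # at index 1 (the largest "probability").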

    def __call__(self, instance,
                 result_type=Orange.classification.Classifier.GetValue):
        """Classify a new instance."""
        ## Handles the ugly result_type discussion;
        ## see self._get_responses for the real work

        responses = self._get_responses(instance)

        values, probs = [], []
        for value, prob in responses:
            values.append(value)
            probs.append(prob)

        # multilabel
        if self.domain.class_vars:
            if result_type == Orange.classification.Classifier.GetValue:
                return values
            #if any( prob == None for prob in probs):
                #raise ValueError("Wrong result_type for regression task")
            if result_type == Orange.classification.Classifier.GetProbabilities:
                return probs
            if result_type == Orange.classification.Classifier.GetBoth:
                return (tuple(values), tuple(probs))
            assert False

        assert len(values) == 1
        value, prob = values[0], probs[0]

        if result_type == Orange.classification.Classifier.GetValue:
            return value
        #if prob == None:
            #raise ValueError("Wrong result_type for regression task")
        if result_type == Orange.classification.Classifier.GetProbabilities:
            return prob
        if result_type == Orange.classification.Classifier.GetBoth:
            return (value, prob)

        assert False

## Utility functions

def fann_setter(obj, set_dict):
    """Small utility function for calling setters of FANN objects."""
    for key, val in set_dict.iteritems():
        setter = obj.__getattribute__(key)
        if not isinstance(val, tuple):
            val = (val, )

        setter(*val)
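
# Usage sketch: single values are wrapped into a 1-tuple, so both of these
# work (assuming pyfann exposes the multi-argument FANN setter
# set_activation_steepness(steepness, layer, neuron)):
#
#   fann_setter(ann, {'set_learning_rate': 0.7})
#   fann_setter(ann, {'set_activation_steepness': (0.5, 1, 0)})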

def fake_file_call_s2f(func, string):
    """Saves the string into a temporary file, calls

    func(filename)

    and deletes the file afterwards.
    """
    f = tempfile.NamedTemporaryFile(delete=False)
    f.write(string)
    f.close()

    func(f.name)
    f.unlink(f.name)

def fake_file_call_f2s(func):
    """Lets the function save something into a temporary file and then
    returns the file content:

    func(filename)
    return filecontent

    The file is deleted afterwards.
    """
    f = tempfile.NamedTemporaryFile()
    func(f.name)
    ret = f.read()
    f.close()  # and delete
    return ret

## tests and examples

def test_xor():
    """
    Test simple regression by learning the XOR function, a famous problem
    that is impossible for a 1-layer network (without hidden layers).
    """
    ## the data
    attrs = [Orange.feature.Continuous(name) for name in ['X', 'Y', 'X^Y']]
    insts = [[x, y, x ^ y] for x, y in itertools.product([0, 1], [0, 1])]
    data = Orange.data.Table(Orange.data.Domain(attrs), insts)

    ## the NeuralNetwork
    print "\n Test: Xor Function\n" + test_xor.__doc__

    classifier = FannNeuralLearner(data,
                                   # one hidden layer with 3 neurons...
                                   # XOR cannot be done without a hidden layer
                                   hidden_layers=[3],
                                   desired_error=0.0001,
                                   iterations_between_reports=500,
                                   max_epochs=5000)

    for inst in data:
        print "%d xor %d = %d, nn(%d, %d) = %.2f " % (
            inst[0], inst[1], inst[2],
            inst[0], inst[1], classifier(inst))

def test_iris():
    """
    Test simple classification by learning to classify the iris dataset.
    """
    data = Orange.data.Table("iris.tab")

    print "\n Test: Iris Dataset\n" + test_iris.__doc__
    classifier = FannNeuralLearner(data,
                                   hidden_layers=[5],
                                   max_epochs=2000,
                                   desired_error=0.005,
                                   iterations_between_reports=200)

    show_predictions(classifier, data, probs=True)

def show_predictions(classifier, data, top=5, probs=False):
    print
    if probs:
        print "Probability key:\n", data.domain.class_var.values
        print
    print "Random %d classifications%s:" % (top, ' and probabilities' if probs else '')
    print
    cnt = 0
    data.shuffle()

    for num, inst in enumerate(data):
        pred, prob = classifier(inst, Orange.classification.Classifier.GetBoth)
        cls = inst.get_class()

        if num < top:
            if probs:
                print prob
            print "%d: Instance %s predicted as %s" % (num + 1, cls, pred)
            print

        if cls != pred:
            cnt += 1

    print "\nMissed: %d out of %d examples = %.1f%%" % (cnt, len(data), 100.0 * cnt / len(data))

def test_cascade():
    """
    Test classification on the voting dataset and the GetProbabilities
    output. Also, the learning method used to train the neural net is
    cascade learning:
    See: http://leenissen.dk/fann/html/files/fann_cascade-h.html
    """
    data = Orange.data.Table("voting.tab")
    # Impute
    data = Orange.data.imputation.ImputeTable(data, method=Orange.feature.imputation.AverageConstructor())
    # take half as train data
    selection = Orange.data.sample.SubsetIndices2(data, 0.5)
    train_data = data.select(selection, 0)
    test_data = data.select(selection, 1)

    print "\n Test: Cascade Train, Voting Dataset and GetProbabilities\n" + test_cascade.__doc__
    classifier = FannNeuralLearner(train_data,
                                   nn_type='cascade',
                                   max_neurons=5,
                                   neurons_between_reports=2,  # 0 turns it off
                                   desired_error=0.005)

    print
    print "Possible classes:", data.domain.classVar.values
    print "Probabilities for democrats:"
    print """
    (Note that these are not really 'probabilities';
    they are more like a measure of the network's sureness.
    They are basically the normalized neurons' outputs.)"""
    print

    test_data.shuffle()
    show_predictions(classifier, test_data, probs=True)

def test_compare():
    iris = Orange.data.Table("iris")
    learners = [
        Orange.classification.knn.kNNLearner(),
        Orange.classification.bayes.NaiveLearner(),
        Orange.classification.majority.MajorityLearner(),
        FannNeuralLearner()
    ]

    cv = Orange.evaluation.testing.cross_validation(learners, iris, folds=5)
    print ["%.4f" % score for score in Orange.evaluation.scoring.CA(cv)]

def test_housing():
    """
    Test regression together with automatic scaling -- when the output
    domain is out of the range <-1,1>.
    """
    data = Orange.data.Table("housing")

    # rescale the output domain to <-X, X>
    # the default would be X = 1
    X = 1.2

    learner = FannNeuralLearner(
        hidden_layers=[50],
        max_epochs=2000,
        desired_error=0.005,
        iterations_between_reports=0,
        allow_out_of_range=False,
        autorescale_output=True,
        autorescale_lower_bound=-X,
        autorescale_upper_bound=X)

    cv = Orange.evaluation.testing.cross_validation([
        learner,
        Orange.regression.linear.LinearRegressionLearner()
    ], data, folds=5)

    print '\n'.join("%s : %.4f" % (text, score)
                    for score, text in zip(Orange.evaluation.scoring.RMSE(cv),
                                           ["ann", "linear"]))

def equal_within_epsilon(a, b, epsilon=1e-10):
    if a.shape != b.shape:
        return False
    return (numpy.abs(a - b) <= epsilon).all()

def test_autoscale():
    data = 10 * numpy.random.random((40, 4))
    test, train = data[:5], data[5:]

    # we could also specify a smaller domain
    #at = AutoScaler(train, -0.8, 0.8)
    at = AutoScaler(train)  # (-1,1) by default

    print "train scaled"
    print at(train)
    print "test scaled"
    print at(test)

    to_list = lambda arr: map(list, list(arr))

    #print "test - to and fro"
    #print at.scale_array_back(at(test))
    #print "test normal"
    #print test

    assert equal_within_epsilon(test, at.scale_array_back(at(test)))

def test_pickle():
    """
    Test pickling on the xor network.
    """
    ## the data
    attrs = [Orange.feature.Continuous(name) for name in ['X', 'Y', 'X^Y']]
    insts = [[x, y, x ^ y] for x, y in itertools.product([0, 1], [0, 1])]
    data = Orange.data.Table(Orange.data.Domain(attrs), insts)

    ## the NeuralNetwork
    print "\n Test: Pickle (Xor network)\n" + test_pickle.__doc__

    classifier = FannNeuralLearner(data,
                                   # one hidden layer with 3 neurons...
                                   # XOR cannot be done without a hidden layer
                                   hidden_layers=[3],
                                   desired_error=0.0001,
                                   iterations_between_reports=500,
                                   max_epochs=5000)
    import pickle
    with open("OUT.pkl", 'wb') as fout:
        pickle.dump(classifier, fout)

    print 'saved'
    with open("OUT.pkl", 'rb') as fin:
        print pickle.load(fin)

if __name__ == "__main__":
    test_xor()
    #test_iris()
    #test_cascade()
    #test_compare()
    #test_housing()
    #test_autoscale()
    #test_pickle()