pyx/data.py

   1 #!/usr/bin/env python
   2 # -*- coding: ISO-8859-1 -*-
   3 #
   4 #
   5 # Copyright (C) 2002-2004 Jörg Lehmann <joergl@users.sourceforge.net>
   6 # Copyright (C) 2003-2004 Michael Schindler <m-schindler@users.sourceforge.net>
   7 # Copyright (C) 2002-2004 André Wobst <wobsta@users.sourceforge.net>
   8 #
   9 # This file is part of PyX (http://pyx.sourceforge.net/).
  10 #
  11 # PyX is free software; you can redistribute it and/or modify
  12 # it under the terms of the GNU General Public License as published by
  13 # the Free Software Foundation; either version 2 of the License, or
  14 # (at your option) any later version.
  15 #
  16 # PyX is distributed in the hope that it will be useful,
  17 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 # GNU General Public License for more details.
  20 #
  21 # You should have received a copy of the GNU General Public License
  22 # along with PyX; if not, write to the Free Software
  23 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  24
  25
  26 import re, ConfigParser
  27 import helper, mathtree
  28
  29
  30 class ColumnError(Exception): pass
  31
  32 # XXX: for new mathtree parser
  33 class MathTreeFuncCol(mathtree.MathTreeValVar):
  34
  35     def __init__(self, *args):
  36         self.name = "_col_"
  37         self.VarName = None
  38         mathtree.MathTreeValVar.__init__(self, *args)
  39
  40     def VarList(self):
  41         return [self]
  42
  43     def ColNo(HIDDEN_self, **args):
  44         i = int(HIDDEN_self.Args[0].Calc(**args))
  45         HIDDEN_self.VarName = "_col_%d" % (i)
  46         return i
  47
  48     def Calc(HIDDEN_self, **args):
  49         return mathtree.MathTreeValVar(HIDDEN_self.VarName).Calc(**args)
  50
# function list for the new mathtree parser: the default functions of
# mathtree extended by MathTreeFuncCol
MathTreeFuncsWithCol = list(mathtree.DefaultMathTreeFuncs) + [MathTreeFuncCol]
# XXX: end of snip for new mathtree-parser
# XXX: begin of snip for old mathtree-parser
# matches a column reference: "$" followed by an integer (a leading minus
# sign is allowed), which might be enclosed in parentheses, e.g. "$1",
# "$-1", or "$(1)"
ColPattern = re.compile(r"\$(\(-?[0-9]+\)|-?[0-9]+)")
  55
  56 class MathTreeValCol(mathtree.MathTreeValVar):
  57     """column id pattern like "$1" or "$(1)"
  58     defines a new value pattern to identify columns by its number"""
  59
  60     # __implements__ = ...    # TODO: mathtree interfaces
  61
  62     def InitByParser(self, arg):
  63         Match = arg.MatchPattern(ColPattern)
  64         if Match:
  65             # just store the matched string -> handle this variable name later on
  66             self.AddArg(Match)
  67             return 1
  68
  69
# extend the list of possible values by MathTreeValCol
MathTreeValsWithCol = tuple(list(mathtree.DefaultMathTreeVals) + [MathTreeValCol])
  72 # XXX: end of snip for old mathtree-parser
  73
  74
  75 class _Idata:
  76     """interface definition of a data object
  77     data objects store data arranged in rows and columns"""
  78
  79     titles = []
  80     """column titles
  81     - a list of strings storing the column titles
  82     - the length of the list must match the number of columns
  83     - any titles entry might be None, thus explicitly not providing a column title"""
  84
  85     data = []
  86     """column/row data
  87     - a list of rows where each row represents a data point
  88     - each row contains a list, where each entry of the list represents a value for a column
  89     - the number of columns for each data point must match the number of columns
  90     - any column enty of any data point might be a float, a string, or None"""
  91
  92     def getcolumnno(self, column):
  93         """returns a column number
  94         - the column parameter might be an integer to be used as a column number
  95         - a column number must be a valid list index (negative values are allowed)
  96         - the column parameter might be a string contained in the titles list;
  97           to be valid, the string must be unique within the titles list
  98         - the method raises ColumnError when the value of the column parameter is invalid"""
  99
 100     def getcolumn(self, column):
 101         """returns a column
 102         - extracts a column out of self.data and returns it as a list
 103         - the column is identified by the parameter column as in getcolumnno"""
 104
 105     def addcolumn(self, expression, context={}):
 106         """adds a column defined by a mathematical expression
 107         - evaluates the expression for each data row and adds a new column at
 108           the end of each data row
 109         - the expression must be a valid mathtree expression (see module mathtree)
 110           with an extended variable name syntax: strings like "$i" and "$(i)" are
 111           allowed where i is an integer
 112         - a variable of the mathematical expression might either be a column title
 113           or, by the extended variable name syntax, it defines an integer to be used
 114           as a list index within the column list for each row
 115         - context is a dictionary, where external variables and functions can be
 116           given; those are used in the evaluation of the expression
 117         - when the expression contains the character "=", everything after the last
 118           "=" is interpreted as the mathematical expression while everything before
 119           this character will be used as a column title for the new column; when no
 120           "=" is contained in the expression, the hole expression is taken as the
 121           mathematical expression and the column title is set to None"""
 122
 123
 124 class _data:
 125
 126     """an (minimal) implementor of _Idata
 127     other classes providing _Idata might be based on is class"""
 128
 129     __implements__ = _Idata
 130
 131     def __init__(self, data, titles, parser=None):
 132         """initializes an instance
 133         - data and titles are just set as instance variables without further checks ---
 134           they must be valid in terms of _Idata (expecially their sizes must fit)
 135         - parser is used in addcolumn and thus must implement the expression parsing as
 136           defined in _Idata"""
 137         if parser == None:
 138             if mathtree.__useparser__ == mathtree.__oldparser__:
 139                 parser=mathtree.parser(MathTreeVals=mathtree.DefaultMathTreeVals+MathTreeValsWithCol)
 140             if mathtree.__useparser__ == mathtree.__newparser__:
 141                 parser=mathtree.parser(MathTreeFuncs=MathTreeFuncsWithCol)
 142         self.data = data
 143         self.titles = titles
 144         self.parser = parser
 145
 146     def getcolumnno(self, column):
 147         if helper.isstring(column) and self.titles.count(column) == 1:
 148             return self.titles.index(column)
 149         try:
 150             self.titles[column]
 151         except (TypeError, IndexError, ValueError):
 152             raise ColumnError
 153         return column
 154
 155     def getcolumn(self, column):
 156         columnno = self.getcolumnno(column)
 157         return [x[columnno] for x in self.data]
 158
 159     def addcolumn(self, expression, context={}):
 160         try:
 161             split = expression.rindex("=")
 162         except ValueError:
 163             self.titles.append(None)
 164         else:
 165             self.titles.append(expression[:split])
 166             expression = expression[split+1:]
 167         tree = self.parser.parse(expression)
 168         columnlist = {}
 169         varlist = context.copy() # do not modify context
 170         if mathtree.__useparser__ == mathtree.__newparser__: # XXX: switch between mathtree-parsers
 171             for key in tree.VarList():
 172                 if isinstance(key, MathTreeFuncCol):
 173                     column = int(key.ColNo(**varlist))
 174                     try:
 175                         self.titles[column]
 176                     except:
 177                         raise ColumnError
 178                     columnlist["_col_%d" % (column)] = column
 179                 elif key[0:5] == "_col_":
 180                     column = int(key[5:])
 181                     try:
 182                         self.titles[column]
 183                     except:
 184                         raise ColumnError
 185                     columnlist[key] = column
 186                 else:
 187                     try:
 188                         columnlist[key] = self.getcolumnno(key)
 189                     except ColumnError, e:
 190                         if key not in context.keys():
 191                             raise e
 192         else:
 193             for key in tree.VarList():
 194                 if key[0] == "$":
 195                     if key[1] == "(":
 196                         column = int(key[2:-1])
 197                     else:
 198                         column = int(key[1:])
 199                     try:
 200                         self.titles[column]
 201                     except:
 202                         raise ColumnError
 203                     columnlist[key] = column
 204                 else:
 205                     try:
 206                         columnlist[key] = self.getcolumnno(key)
 207                     except ColumnError, e:
 208                         if key not in context.keys():
 209                             raise e
 210
 211         for data in self.data:
 212             try:
 213                 for key in columnlist.keys():
 214                     varlist[key] = float(data[columnlist[key]])
 215             except (TypeError, ValueError):
 216                 data.append(None)
 217             else:
 218                 data.append(tree.Calc(**varlist))
 219
 220
 221 class data(_data):
 222
 223     "an implementation of _Idata with an easy to use constructor"
 224
 225     __implements__ = _Idata
 226
 227     def __init__(self, data=[], titles=[], maxcolumns=helper.nodefault, **kwargs):
 228         """initializes an instance
 229         - data titles must be valid in terms of _Idata except for the number of
 230           columns for each row, especially titles might be the default, e.g. []
 231         - instead of lists for data, each row in data, and titles, tuples or
 232           any other data structure with sequence like behavior might be used,
 233           but they are converted to lists
 234         - maxcolumns is an integer; when not set, maxcolumns is evaluated out of
 235           the maximum column number in each row of data (not taking into account
 236           the titles list)
 237         - titles and each row in data is extended (or cutted) to fit maxcolumns;
 238           when extending those lists, None entries are appended
 239         - parser is used in addcolumn and thus must implement the expression parsing as
 240           defined in _Idata
 241         - further keyword arguments are passed to the constructor of _data"""
 242         if len(data):
 243             if maxcolumns is helper.nodefault:
 244                 maxcolumns = len(data[0])
 245                 for line in data[1:]:
 246                     if len(line) > maxcolumns:
 247                         maxcolumns = len(line)
 248             titles = list(titles[:maxcolumns])
 249             titles += [None] * (maxcolumns - len(titles))
 250             data = list(data)
 251             for i in range(len(data)):
 252                 data[i] = list(data[i]) + [None] * (maxcolumns - len(data[i]))
 253         else:
 254             titles = []
 255         _data.__init__(self, data, titles, **kwargs)
 256
 257
 258 class datafile(data):
 259
 260     "an implementation of _Idata reading data from a file"
 261
 262     __implements__ = _Idata
 263
 264     defaultcommentpattern = re.compile(r"(#+|!+|%+)\s*")
 265     defaultstringpattern = re.compile(r"\"(.*?)\"(\s+|$)")
 266     defaultcolumnpattern = re.compile(r"(.*?)(\s+|$)")
 267
 268     def splitline(self, line, stringpattern, columnpattern, tofloat=1):
 269         """returns a tuple created out of the string line
 270         - matches stringpattern and columnpattern, adds the first group of that
 271           match to the result and and removes those matches until the line is empty
 272         - when stringpattern matched, the result is always kept as a string
 273         - when columnpattern matched and tofloat is true, a conversion to a float
 274           is tried; when this conversion fails, the string is kept"""
 275         result = []
 276         # try to gain speed by skip matching regular expressions
 277         if line.find('"')!=-1 or \
 278            stringpattern is not self.defaultstringpattern or \
 279            columnpattern is not self.defaultcolumnpattern:
 280             while len(line):
 281                 match = stringpattern.match(line)
 282                 if match:
 283                     result.append(match.groups()[0])
 284                     line = line[match.end():]
 285                 else:
 286                     match = columnpattern.match(line)
 287                     if tofloat:
 288                         try:
 289                             result.append(float(match.groups()[0]))
 290                         except (TypeError, ValueError):
 291                             result.append(match.groups()[0])
 292                     else:
 293                         result.append(match.groups()[0])
 294                     line = line[match.end():]
 295         else:
 296             if tofloat:
 297                 try:
 298                     return map(float, line.split())
 299                 except (TypeError, ValueError):
 300                     result = []
 301                     for r in line.split():
 302                         try:
 303                             result.append(float(r))
 304                         except (TypeError, ValueError):
 305                             result.append(r)
 306             else:
 307                 return line.split()
 308
 309         return result
 310
 311     def __init__(self, file, commentpattern=defaultcommentpattern,
 312                              stringpattern=defaultstringpattern,
 313                              columnpattern=defaultcolumnpattern,
 314                              skiphead=0, skiptail=0, every=1, **kwargs):
 315         """read data from a file
 316         - file might either be a string or a file instance (something, that
 317           provides readlines())
 318         - each non-empty line, which does not match the commentpattern, is
 319           considered to be a data row; columns are extracted by the splitline
 320           method using tofloat=1
 321         - the last line before a data line matching the commentpattern and
 322           containing further characters is considered as the title line;
 323           the title list is extracted by the splitline method using tofloat=0
 324         - the first skiphead data lines are skiped
 325         - the last skiptail data lines are skiped
 326         - only every "every" data line is used (starting at the skiphead + 1 line)
 327         - the number of columns is equalized between data and titles like
 328           in the data constructor without setting maxcolumns
 329         - further keyword arguments are passed to the constructor of data,
 330           keyword arguments data, titles, and maxcolumns excluded"""
 331         if helper.isstring(file):
 332             file = open(file, "r")
 333         usetitles = []
 334         usedata = []
 335         linenumber = 0
 336         maxcolumns = 0
 337         for line in file.readlines():
 338             line = line.strip()
 339             match = commentpattern.match(line)
 340             if match:
 341                 if not len(usedata):
 342                     newtitles = self.splitline(line[match.end():], stringpattern, columnpattern, tofloat=0)
 343                     if len(newtitles):
 344                         usetitles = newtitles
 345             else:
 346                 linedata = []
 347                 for value in self.splitline(line, stringpattern, columnpattern, tofloat=1):
 348                     linedata.append(value)
 349                 if len(linedata):
 350                     if linenumber >= skiphead and not ((linenumber - skiphead) % every):
 351                         linedata = [linenumber + 1] + linedata
 352                         if len(linedata) > maxcolumns:
 353                             maxcolumns = len(linedata)
 354                         usedata.append(linedata)
 355                     linenumber += 1
 356         if skiptail:
 357             del usedata[-skiptail:]
 358         data.__init__(self, data=usedata, titles=[None] + usetitles, maxcolumns=maxcolumns, **kwargs)
 359
 360
 361
 362 class sectionfile(_data):
 363
 364     def __init__(self, file, sectionstr = "section", **kwargs):
 365         """read data from a config-like file
 366         - file might either be a string or a file instance (something, that
 367           is valid in config.readfp())
 368         - each row is defined by a section in the config-like file (see
 369           config module description)
 370         - the columns for each row are defined by lines in the section file;
 371           the title entries are used to identify the columns
 372         - further keyword arguments are passed to the constructor of _data,
 373           keyword arguments data and titles excluded"""
 374         config = ConfigParser.ConfigParser()
 375         config.optionxform = str
 376         if helper.isstring(file):
 377             config.readfp(open(file, "r"))
 378         else:
 379             config.readfp(file)
 380         usedata = []
 381         usetitles = [sectionstr]
 382         sections = config.sections()
 383         sections.sort()
 384         for section in sections:
 385             usedata.append([section] + [None for x in range(len(usetitles) - 1)])
 386             for option in config.options(section):
 387                 if option == sectionstr:
 388                     raise ValueError("'%s' is already used as the section identifier" % sectionstr)
 389                 try:
 390                     index = usetitles.index(option)
 391                 except ValueError:
 392                     index = len(usetitles)
 393                     usetitles.append(option)
 394                     for line in usedata:
 395                         line.append(None)
 396                 value = config.get(section, option)
 397                 try:
 398                     usedata[-1][index] = float(value)
 399                 except (TypeError, ValueError):
 400                     usedata[-1][index] = value
 401         _data.__init__(self, usedata, usetitles, **kwargs)
 402