pyx/data.py

   1 #!/usr/bin/env python
   2 #
   3 #
   4 # Copyright (C) 2002 Jörg Lehmann <joergl@users.sourceforge.net>
   5 # Copyright (C) 2002 André Wobst <wobsta@users.sourceforge.net>
   6 #
   7 # This file is part of PyX (http://pyx.sourceforge.net/).
   8 #
   9 # PyX is free software; you can redistribute it and/or modify
  10 # it under the terms of the GNU General Public License as published by
  11 # the Free Software Foundation; either version 2 of the License, or
  12 # (at your option) any later version.
  13 #
  14 # PyX is distributed in the hope that it will be useful,
  15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 # GNU General Public License for more details.
  18 #
  19 # You should have received a copy of the GNU General Public License
  20 # along with PyX; if not, write to the Free Software
  21 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  22
  23
  24 import re, ConfigParser
  25 import helper, mathtree
  26
  27
  28 class ColumnError(Exception): pass
  29
  30
  31
# matches a column reference: "$" followed by an optionally signed integer,
# either bare ("$1", "$-2") or enclosed in parentheses ("$(1)", "$(-2)")
ColPattern = re.compile(r"\$(\(-?[0-9]+\)|-?[0-9]+)")
  33
  34 class MathTreeValCol(mathtree.MathTreeValVar):
  35     """column id pattern like "$1" or "$(1)"
  36     defines a new value pattern to identify columns by its number"""
  37
  38     # __implements__ = ...    # TODO: mathtree interfaces
  39
  40     def InitByParser(self, arg):
  41         Match = arg.MatchPattern(ColPattern)
  42         if Match:
  43             # just store the matched string -> handle this variable name later on
  44             self.AddArg(Match)
  45             return 1
  46
  47
  48 # extent the list of possible values by MathTreeValCol
  49 MathTreeValsWithCol = tuple(list(mathtree.DefaultMathTreeVals) + [MathTreeValCol])
  50
  51
class _Idata:
    """interface definition of a data object
    - data objects store data arranged in rows and columns
    - this class only documents the interface; its methods have no
      implementation"""

    # column titles, one entry per column (the string below describes the details)
    titles = []
    """column titles
    - a list of strings storing the column titles
    - the length of the list must match the number of columns
    - any titles entry might be None, thus explicitly not providing a column title"""

    # the data rows, one list per data point (the string below describes the details)
    data = []
    """column/row data
    - a list of rows where each row represents a data point
    - each row contains a list, where each entry of the list represents a value for a column
    - the number of columns for each data point must match the number of columns
    - any column enty of any data point might be a float, a string, or None"""

    def getcolumnno(self, column):
        """returns a column number
        - the column parameter might be an integer to be used as a column number
        - a column number must be a valid list index (negative values are allowed)
        - the column parameter might be a string contained in the titles list;
          to be valid, the string must be unique within the titles list
        - the method raises ColumnError when the value of the column parameter
          is invalid"""

    def getcolumn(self, column):
        """returns a column
        - extracts a column out of self.data and returns it as a list
        - the column is identified by the parameter column as in getcolumnno"""

    def addcolumn(self, expression, context={}):
        """adds a column defined by a mathematical expression
        - evaluates the expression for each data row and adds a new column at
          the end of each data row
        - the expression must be a valid mathtree expression (see module mathtree)
          with an extended variable name syntax: strings like "$i" and "$(i)" are
          allowed where i is an integer
        - a variable of the mathematical expression might either be a column title
          or, by the extended variable name syntax, it defines an integer to be used
          as a list index within the column list for each row
        - context is a dictionary, where external variables and functions can be
          given; those are used in the evaluation of the expression
        - when the expression contains the character "=", everything after the last
          "=" is interpreted as the mathematical expression while everything before
          this character will be used as a column title for the new column; when no
          "=" is contained in the expression, the whole expression is taken as the
          mathematical expression and the column title is set to None"""
  99
 100
 101 class _data:
 102
 103     """an (minimal) implementor of _Idata
 104     other classes providing _Idata might be based on is class"""
 105
 106     __implements__ = _Idata
 107
 108     def __init__(self, data, titles, parser=mathtree.parser(MathTreeVals=MathTreeValsWithCol)):
 109         """initializes an instance
 110         - data and titles are just set as instance variables without further checks ---
 111           they must be valid in terms of _Idata (expecially their sizes must fit)
 112         - parser is used in addcolumn and thus must implement the expression parsing as
 113           defined in _Idata"""
 114         self.data = data
 115         self.titles = titles
 116         self.parser = parser
 117
 118     def getcolumnno(self, column):
 119         if helper.isstring(column) and self.titles.count(column) == 1:
 120             return self.titles.index(column)
 121         try:
 122             self.titles[column]
 123         except (TypeError, IndexError, ValueError):
 124             raise ColumnError
 125         return column
 126
 127     def getcolumn(self, column):
 128         columnno = self.getcolumnno(column)
 129         return [x[columnno] for x in self.data]
 130
 131     def addcolumn(self, expression, context={}):
 132         try:
 133             split = expression.rindex("=")
 134         except ValueError:
 135             self.titles.append(None)
 136         else:
 137             self.titles.append(expression[:split])
 138             expression = expression[split+1:]
 139         tree = self.parser.parse(expression)
 140         columnlist = {}
 141         for key in tree.VarList():
 142             if key[0] == "$":
 143                 if key[1] == "(":
 144                     column = int(key[2:-1])
 145                 else:
 146                     column = int(key[1:])
 147                 try:
 148                     self.titles[column]
 149                 except:
 150                     raise ColumnError
 151                 columnlist[key] = column
 152             else:
 153                 try:
 154                     columnlist[key] = self.getcolumnno(key)
 155                 except ColumnError, e:
 156                     if key not in context.keys():
 157                         raise e
 158
 159         varlist = context.copy() # do not modify context
 160         for data in self.data:
 161             try:
 162                 for key in columnlist.keys():
 163                     varlist[key] = float(data[columnlist[key]])
 164             except (TypeError, ValueError):
 165                 data.append(None)
 166             else:
 167                 data.append(tree.Calc(**varlist))
 168
 169
 170 class data(_data):
 171
 172     "an implementation of _Idata with an easy to use constructor"
 173
 174     __implements__ = _Idata
 175
 176     def __init__(self, data=[], titles=[], maxcolumns=helper.nodefault, **kwargs):
 177         """initializes an instance
 178         - data titles must be valid in terms of _Idata except for the number of
 179           columns for each row, especially titles might be the default, e.g. []
 180         - instead of lists for data, each row in data, and titles, tuples or
 181           any other data structure with sequence like behavior might be used,
 182           but they are converted to lists
 183         - maxcolumns is an integer; when not set, maxcolumns is evaluated out of
 184           the maximum column number in each row of data (not taking into account
 185           the titles list)
 186         - titles and each row in data is extended (or cutted) to fit maxcolumns;
 187           when extending those lists, None entries are appended
 188         - parser is used in addcolumn and thus must implement the expression parsing as
 189           defined in _Idata
 190         - further keyword arguments are passed to the constructor of _data"""
 191         if len(data):
 192             if maxcolumns is helper.nodefault:
 193                 maxcolumns = len(data[0])
 194                 for line in data[1:]:
 195                     if len(line) > maxcolumns:
 196                         maxcolumns = len(line)
 197             titles = list(titles[:maxcolumns])
 198             titles += [None] * (maxcolumns - len(titles))
 199             data = list(data)
 200             for i in range(len(data)):
 201                 data[i] = list(data[i]) + [None] * (maxcolumns - len(data[i]))
 202         else:
 203             titles = []
 204         _data.__init__(self, data, titles, **kwargs)
 205
 206
 207 class datafile(data):
 208
 209     "an implementation of _Idata reading data from a file"
 210
 211     __implements__ = _Idata
 212
 213     defaultcommentpattern = re.compile(r"(#+|!+|%+)\s*")
 214     defaultstringpattern = re.compile(r"\"(.*?)\"(\s+|$)")
 215     defaultcolumnpattern = re.compile(r"(.*?)(\s+|$)")
 216
 217     def splitline(self, line, stringpattern, columnpattern, tofloat=1):
 218         """returns a tuple created out of the string line
 219         - matches stringpattern and columnpattern, adds the first group of that
 220           match to the result and and removes those matches until the line is empty
 221         - when stringpattern matched, the result is always kept as a string
 222         - when columnpattern matched and tofloat is true, a conversion to a float
 223           is tried; when this conversion fails, the string is kept"""
 224         result = []
 225         # try to gain speed by skip matching regular expressions
 226         if line.find('"')!=-1 or \
 227            stringpattern is not self.defaultstringpattern or \
 228            columnpattern is not self.defaultcolumnpattern:
 229             while len(line):
 230                 match = stringpattern.match(line)
 231                 if match:
 232                     result.append(match.groups()[0])
 233                     line = line[match.end():]
 234                 else:
 235                     match = columnpattern.match(line)
 236                     if tofloat:
 237                         try:
 238                             result.append(float(match.groups()[0]))
 239                         except (TypeError, ValueError):
 240                             result.append(match.groups()[0])
 241                     else:
 242                         result.append(match.groups()[0])
 243                     line = line[match.end():]
 244         else:
 245             if tofloat:
 246                 try:
 247                     return map(float, line.split())
 248                 except (TypeError, ValueError):
 249                     result = []
 250                     for r in line.split():
 251                         try:
 252                             result.append(float(r))
 253                         except (TypeError, ValueError):
 254                             result.append(r)
 255             else:
 256                 return line.split()
 257
 258         return result
 259
 260     def __init__(self, file, commentpattern=defaultcommentpattern,
 261                              stringpattern=defaultstringpattern,
 262                              columnpattern=defaultcolumnpattern,
 263                              skiphead=0, skiptail=0, every=1, **kwargs):
 264         """read data from a file
 265         - file might either be a string or a file instance (something, that
 266           provides readlines())
 267         - each non-empty line, which does not match the commentpattern, is
 268           considered to be a data row; columns are extracted by the splitline
 269           method using tofloat=1
 270         - the last line before a data line matching the commentpattern and
 271           containing further characters is considered as the title line;
 272           the title list is extracted by the splitline method using tofloat=0
 273         - the first skiphead data lines are skiped
 274         - the last skiptail data lines are skiped
 275         - only every "every" data line is used (starting at the skiphead + 1 line)
 276         - the number of columns is equalized between data and titles like
 277           in the data constructor without setting maxcolumns
 278         - further keyword arguments are passed to the constructor of data,
 279           keyword arguments data, titles, and maxcolumns excluded"""
 280         if helper.isstring(file):
 281             file = open(file, "r")
 282         usetitles = []
 283         usedata = []
 284         linenumber = 0
 285         maxcolumns = 0
 286         for line in file.readlines():
 287             line = line.strip()
 288             match = commentpattern.match(line)
 289             if match:
 290                 if not len(usedata):
 291                     newtitles = self.splitline(line[match.end():], stringpattern, columnpattern, tofloat=0)
 292                     if len(newtitles):
 293                         usetitles = newtitles
 294             else:
 295                 linedata = []
 296                 for value in self.splitline(line, stringpattern, columnpattern, tofloat=1):
 297                     linedata.append(value)
 298                 if len(linedata):
 299                     if linenumber >= skiphead and not ((linenumber - skiphead) % every):
 300                         linedata = [linenumber + 1] + linedata
 301                         if len(linedata) > maxcolumns:
 302                             maxcolumns = len(linedata)
 303                         usedata.append(linedata)
 304                     linenumber += 1
 305         if skiptail:
 306             del usedata[-skiptail:]
 307         data.__init__(self, data=usedata, titles=[None] + usetitles, maxcolumns=maxcolumns, **kwargs)
 308
 309
 310
 311 class sectionfile(_data):
 312
 313     def __init__(self, file, sectionstr = "section", **kwargs):
 314         """read data from a config-like file
 315         - file might either be a string or a file instance (something, that
 316           is valid in config.readfp())
 317         - each row is defined by a section in the config-like file (see
 318           config module description)
 319         - the columns for each row are defined by lines in the section file;
 320           the title entries are used to identify the columns
 321         - further keyword arguments are passed to the constructor of _data,
 322           keyword arguments data and titles excluded"""
 323         config = ConfigParser.ConfigParser()
 324         config.optionxform = str
 325         if helper.isstring(file):
 326             config.readfp(open(file, "r"))
 327         else:
 328             config.readfp(file)
 329         usedata = []
 330         usetitles = [sectionstr]
 331         sections = config.sections()
 332         sections.sort()
 333         for section in sections:
 334             usedata.append([section] + [None for x in range(len(usetitles) - 1)])
 335             for option in config.options(section):
 336                 if option == sectionstr:
 337                     raise ValueError("'%s' is already used as the section identifier" % sectionstr)
 338                 try:
 339                     index = usetitles.index(option)
 340                 except ValueError:
 341                     index = len(usetitles)
 342                     usetitles.append(option)
 343                     for line in usedata:
 344                         line.append(None)
 345                 value = config.get(section, option)
 346                 try:
 347                     usedata[-1][index] = float(value)
 348                 except (TypeError, ValueError):
 349                     usedata[-1][index] = value
 350         _data.__init__(self, usedata, usetitles, **kwargs)
 351