pyx/data.py

   1 #!/usr/bin/env python
   2 # -*- coding: ISO-8859-1 -*-
   3 #
   4 #
   5 # Copyright (C) 2002 Jörg Lehmann <joergl@users.sourceforge.net>
   6 # Copyright (C) 2002 André Wobst <wobsta@users.sourceforge.net>
   7 #
   8 # This file is part of PyX (http://pyx.sourceforge.net/).
   9 #
  10 # PyX is free software; you can redistribute it and/or modify
  11 # it under the terms of the GNU General Public License as published by
  12 # the Free Software Foundation; either version 2 of the License, or
  13 # (at your option) any later version.
  14 #
  15 # PyX is distributed in the hope that it will be useful,
  16 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18 # GNU General Public License for more details.
  19 #
  20 # You should have received a copy of the GNU General Public License
  21 # along with PyX; if not, write to the Free Software
  22 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  23
  24
  25 import re, ConfigParser
  26 import helper, mathtree
  27
  28
  29 class ColumnError(Exception): pass
  30
  31
  32
  33 ColPattern = re.compile(r"\$(\(-?[0-9]+\)|-?[0-9]+)")
  34
  35 class MathTreeValCol(mathtree.MathTreeValVar):
  36     """column id pattern like "$1" or "$(1)"
  37     defines a new value pattern to identify columns by its number"""
  38
  39     # __implements__ = ...    # TODO: mathtree interfaces
  40
  41     def InitByParser(self, arg):
  42         Match = arg.MatchPattern(ColPattern)
  43         if Match:
  44             # just store the matched string -> handle this variable name later on
  45             self.AddArg(Match)
  46             return 1
  47
  48
  49 # extent the list of possible values by MathTreeValCol
  50 MathTreeValsWithCol = tuple(list(mathtree.DefaultMathTreeVals) + [MathTreeValCol])
  51
  52
  53 class _Idata:
  54     """interface definition of a data object
  55     data objects store data arranged in rows and columns"""
  56
  57     titles = []
  58     """column titles
  59     - a list of strings storing the column titles
  60     - the length of the list must match the number of columns
  61     - any titles entry might be None, thus explicitly not providing a column title"""
  62
  63     data = []
  64     """column/row data
  65     - a list of rows where each row represents a data point
  66     - each row contains a list, where each entry of the list represents a value for a column
  67     - the number of columns for each data point must match the number of columns
  68     - any column enty of any data point might be a float, a string, or None"""
  69
  70     def getcolumnno(self, column):
  71         """returns a column number
  72         - the column parameter might be an integer to be used as a column number
  73         - a column number must be a valid list index (negative values are allowed)
  74         - the column parameter might be a string contained in the titles list;
  75           to be valid, the string must be unique within the titles list
  76         - the method raises ColumnError when the value of the column parameter is invalid"""
  77
  78     def getcolumn(self, column):
  79         """returns a column
  80         - extracts a column out of self.data and returns it as a list
  81         - the column is identified by the parameter column as in getcolumnno"""
  82
  83     def addcolumn(self, expression, context={}):
  84         """adds a column defined by a mathematical expression
  85         - evaluates the expression for each data row and adds a new column at
  86           the end of each data row
  87         - the expression must be a valid mathtree expression (see module mathtree)
  88           with an extended variable name syntax: strings like "$i" and "$(i)" are
  89           allowed where i is an integer
  90         - a variable of the mathematical expression might either be a column title
  91           or, by the extended variable name syntax, it defines an integer to be used
  92           as a list index within the column list for each row
  93         - context is a dictionary, where external variables and functions can be
  94           given; those are used in the evaluation of the expression
  95         - when the expression contains the character "=", everything after the last
  96           "=" is interpreted as the mathematical expression while everything before
  97           this character will be used as a column title for the new column; when no
  98           "=" is contained in the expression, the hole expression is taken as the
  99           mathematical expression and the column title is set to None"""
 100
 101
 102 class _data:
 103
 104     """an (minimal) implementor of _Idata
 105     other classes providing _Idata might be based on is class"""
 106
 107     __implements__ = _Idata
 108
 109     def __init__(self, data, titles, parser=mathtree.parser(MathTreeVals=MathTreeValsWithCol)):
 110         """initializes an instance
 111         - data and titles are just set as instance variables without further checks ---
 112           they must be valid in terms of _Idata (expecially their sizes must fit)
 113         - parser is used in addcolumn and thus must implement the expression parsing as
 114           defined in _Idata"""
 115         self.data = data
 116         self.titles = titles
 117         self.parser = parser
 118
 119     def getcolumnno(self, column):
 120         if helper.isstring(column) and self.titles.count(column) == 1:
 121             return self.titles.index(column)
 122         try:
 123             self.titles[column]
 124         except (TypeError, IndexError, ValueError):
 125             raise ColumnError
 126         return column
 127
 128     def getcolumn(self, column):
 129         columnno = self.getcolumnno(column)
 130         return [x[columnno] for x in self.data]
 131
 132     def addcolumn(self, expression, context={}):
 133         try:
 134             split = expression.rindex("=")
 135         except ValueError:
 136             self.titles.append(None)
 137         else:
 138             self.titles.append(expression[:split])
 139             expression = expression[split+1:]
 140         tree = self.parser.parse(expression)
 141         columnlist = {}
 142         for key in tree.VarList():
 143             if key[0] == "$":
 144                 if key[1] == "(":
 145                     column = int(key[2:-1])
 146                 else:
 147                     column = int(key[1:])
 148                 try:
 149                     self.titles[column]
 150                 except:
 151                     raise ColumnError
 152                 columnlist[key] = column
 153             else:
 154                 try:
 155                     columnlist[key] = self.getcolumnno(key)
 156                 except ColumnError, e:
 157                     if key not in context.keys():
 158                         raise e
 159
 160         varlist = context.copy() # do not modify context
 161         for data in self.data:
 162             try:
 163                 for key in columnlist.keys():
 164                     varlist[key] = float(data[columnlist[key]])
 165             except (TypeError, ValueError):
 166                 data.append(None)
 167             else:
 168                 data.append(tree.Calc(**varlist))
 169
 170
 171 class data(_data):
 172
 173     "an implementation of _Idata with an easy to use constructor"
 174
 175     __implements__ = _Idata
 176
 177     def __init__(self, data=[], titles=[], maxcolumns=helper.nodefault, **kwargs):
 178         """initializes an instance
 179         - data titles must be valid in terms of _Idata except for the number of
 180           columns for each row, especially titles might be the default, e.g. []
 181         - instead of lists for data, each row in data, and titles, tuples or
 182           any other data structure with sequence like behavior might be used,
 183           but they are converted to lists
 184         - maxcolumns is an integer; when not set, maxcolumns is evaluated out of
 185           the maximum column number in each row of data (not taking into account
 186           the titles list)
 187         - titles and each row in data is extended (or cutted) to fit maxcolumns;
 188           when extending those lists, None entries are appended
 189         - parser is used in addcolumn and thus must implement the expression parsing as
 190           defined in _Idata
 191         - further keyword arguments are passed to the constructor of _data"""
 192         if len(data):
 193             if maxcolumns is helper.nodefault:
 194                 maxcolumns = len(data[0])
 195                 for line in data[1:]:
 196                     if len(line) > maxcolumns:
 197                         maxcolumns = len(line)
 198             titles = list(titles[:maxcolumns])
 199             titles += [None] * (maxcolumns - len(titles))
 200             data = list(data)
 201             for i in range(len(data)):
 202                 data[i] = list(data[i]) + [None] * (maxcolumns - len(data[i]))
 203         else:
 204             titles = []
 205         _data.__init__(self, data, titles, **kwargs)
 206
 207
 208 class datafile(data):
 209
 210     "an implementation of _Idata reading data from a file"
 211
 212     __implements__ = _Idata
 213
 214     defaultcommentpattern = re.compile(r"(#+|!+|%+)\s*")
 215     defaultstringpattern = re.compile(r"\"(.*?)\"(\s+|$)")
 216     defaultcolumnpattern = re.compile(r"(.*?)(\s+|$)")
 217
 218     def splitline(self, line, stringpattern, columnpattern, tofloat=1):
 219         """returns a tuple created out of the string line
 220         - matches stringpattern and columnpattern, adds the first group of that
 221           match to the result and and removes those matches until the line is empty
 222         - when stringpattern matched, the result is always kept as a string
 223         - when columnpattern matched and tofloat is true, a conversion to a float
 224           is tried; when this conversion fails, the string is kept"""
 225         result = []
 226         # try to gain speed by skip matching regular expressions
 227         if line.find('"')!=-1 or \
 228            stringpattern is not self.defaultstringpattern or \
 229            columnpattern is not self.defaultcolumnpattern:
 230             while len(line):
 231                 match = stringpattern.match(line)
 232                 if match:
 233                     result.append(match.groups()[0])
 234                     line = line[match.end():]
 235                 else:
 236                     match = columnpattern.match(line)
 237                     if tofloat:
 238                         try:
 239                             result.append(float(match.groups()[0]))
 240                         except (TypeError, ValueError):
 241                             result.append(match.groups()[0])
 242                     else:
 243                         result.append(match.groups()[0])
 244                     line = line[match.end():]
 245         else:
 246             if tofloat:
 247                 try:
 248                     return map(float, line.split())
 249                 except (TypeError, ValueError):
 250                     result = []
 251                     for r in line.split():
 252                         try:
 253                             result.append(float(r))
 254                         except (TypeError, ValueError):
 255                             result.append(r)
 256             else:
 257                 return line.split()
 258
 259         return result
 260
 261     def __init__(self, file, commentpattern=defaultcommentpattern,
 262                              stringpattern=defaultstringpattern,
 263                              columnpattern=defaultcolumnpattern,
 264                              skiphead=0, skiptail=0, every=1, **kwargs):
 265         """read data from a file
 266         - file might either be a string or a file instance (something, that
 267           provides readlines())
 268         - each non-empty line, which does not match the commentpattern, is
 269           considered to be a data row; columns are extracted by the splitline
 270           method using tofloat=1
 271         - the last line before a data line matching the commentpattern and
 272           containing further characters is considered as the title line;
 273           the title list is extracted by the splitline method using tofloat=0
 274         - the first skiphead data lines are skiped
 275         - the last skiptail data lines are skiped
 276         - only every "every" data line is used (starting at the skiphead + 1 line)
 277         - the number of columns is equalized between data and titles like
 278           in the data constructor without setting maxcolumns
 279         - further keyword arguments are passed to the constructor of data,
 280           keyword arguments data, titles, and maxcolumns excluded"""
 281         if helper.isstring(file):
 282             file = open(file, "r")
 283         usetitles = []
 284         usedata = []
 285         linenumber = 0
 286         maxcolumns = 0
 287         for line in file.readlines():
 288             line = line.strip()
 289             match = commentpattern.match(line)
 290             if match:
 291                 if not len(usedata):
 292                     newtitles = self.splitline(line[match.end():], stringpattern, columnpattern, tofloat=0)
 293                     if len(newtitles):
 294                         usetitles = newtitles
 295             else:
 296                 linedata = []
 297                 for value in self.splitline(line, stringpattern, columnpattern, tofloat=1):
 298                     linedata.append(value)
 299                 if len(linedata):
 300                     if linenumber >= skiphead and not ((linenumber - skiphead) % every):
 301                         linedata = [linenumber + 1] + linedata
 302                         if len(linedata) > maxcolumns:
 303                             maxcolumns = len(linedata)
 304                         usedata.append(linedata)
 305                     linenumber += 1
 306         if skiptail:
 307             del usedata[-skiptail:]
 308         data.__init__(self, data=usedata, titles=[None] + usetitles, maxcolumns=maxcolumns, **kwargs)
 309
 310
 311
 312 class sectionfile(_data):
 313
 314     def __init__(self, file, sectionstr = "section", **kwargs):
 315         """read data from a config-like file
 316         - file might either be a string or a file instance (something, that
 317           is valid in config.readfp())
 318         - each row is defined by a section in the config-like file (see
 319           config module description)
 320         - the columns for each row are defined by lines in the section file;
 321           the title entries are used to identify the columns
 322         - further keyword arguments are passed to the constructor of _data,
 323           keyword arguments data and titles excluded"""
 324         config = ConfigParser.ConfigParser()
 325         config.optionxform = str
 326         if helper.isstring(file):
 327             config.readfp(open(file, "r"))
 328         else:
 329             config.readfp(file)
 330         usedata = []
 331         usetitles = [sectionstr]
 332         sections = config.sections()
 333         sections.sort()
 334         for section in sections:
 335             usedata.append([section] + [None for x in range(len(usetitles) - 1)])
 336             for option in config.options(section):
 337                 if option == sectionstr:
 338                     raise ValueError("'%s' is already used as the section identifier" % sectionstr)
 339                 try:
 340                     index = usetitles.index(option)
 341                 except ValueError:
 342                     index = len(usetitles)
 343                     usetitles.append(option)
 344                     for line in usedata:
 345                         line.append(None)
 346                 value = config.get(section, option)
 347                 try:
 348                     usedata[-1][index] = float(value)
 349                 except (TypeError, ValueError):
 350                     usedata[-1][index] = value
 351         _data.__init__(self, usedata, usetitles, **kwargs)
 352