sympy/statistics/distributions.py

   1 from sympy.core import *
   2 from sympy.core import sympify
   3 from sympy.functions import sqrt, exp, erf
   4 from sympy.printing import _StrPrinter as StrPrinter
   5 import random
   6
   7
   8 class Sample(tuple):
   9     """
  10     Sample([x1, x2, x3, ...]) represents a collection of samples.
  11     Sample parameters like mean, variance and stddev can be accessed as
  12     properties.
  13     """
  14     def __new__(cls, sample):
  15         s = tuple.__new__(cls, sample)
  16         s.mean = mean = sum(s) / Integer(len(s))
  17         s.variance = sum([(x-mean)**2 for x in s]) / Integer(len(s))
  18         s.stddev = sqrt(s.variance)
  19         return s
  20
  21     @property
  22     def median(self):
  23         raise NotImplementedError
  24
  25     def __repr__(self):
  26         return StrPrinter.doprint(self)
  27
  28
  29 class ContinuousProbability(object):
  30     """Base class for continuous probability distributions"""
  31
  32     def probability(s, a, b):
  33         """Calculate the probability that a random number x generated
  34         from the distribution satisfies a <= x <= b """
  35         return s.cdf(b) - s.cdf(a)
  36
  37     def random(s, n=None):
  38         """
  39         random() -- generate a random number from the distribution.
  40         random(n) -- generate a Sample of n random numbers.
  41         """
  42         if n is None:
  43             return s._random()
  44         else:
  45             return Sample([s._random() for i in xrange(n)])
  46
  47     def __repr__(self):
  48         return StrPrinter.doprint(self)
  49
  50
  51 class Normal(ContinuousProbability):
  52     """
  53     Normal(mu, sigma) represents the normal or Gaussian distribution
  54     with mean value mu and standard deviation sigma.
  55
  56     Example usage:
  57
  58         >>> N = Normal(1, 2)
  59         >>> N.mean
  60         1
  61         >>> N.variance
  62         4
  63         >>> N.probability(-oo, 1)   # probability on an interval
  64         1/2
  65         >>> N.probability(1, oo)
  66         1/2
  67         >>> N.probability(-oo, oo)
  68         1
  69         >>> N.probability(-1, 3)
  70         erf(1/2*2**(1/2))
  71         >>> _.evalf()
  72         0.682689492137086
  73
  74     """
  75     def __init__(self, mu, sigma):
  76         self.mu = sympify(mu)
  77         self.sigma = sympify(sigma)
  78
  79     mean = property(lambda s: s.mu)
  80     median = property(lambda s: s.mu)
  81     mode = property(lambda s: s.mu)
  82     stddev = property(lambda s: s.sigma)
  83     variance = property(lambda s: s.sigma**2)
  84
  85     def pdf(s, x):
  86         """Return the probability density function as an expression in x"""
  87         x = sympify(x)
  88         return 1/(s.sigma*sqrt(2*pi)) * exp(-(x-s.mu)**2 / (2*s.sigma**2))
  89
  90     def cdf(s, x):
  91         """Return the cumulative density function as an expression in x"""
  92         x = sympify(x)
  93         return (1+erf((x-s.mu)/(s.sigma*sqrt(2))))/2
  94
  95     def _random(s):
  96         return random.gauss(float(s.mu), float(s.sigma))
  97
  98     def confidence(s, p):
  99         """Return a symmetric (p*100)% confidence interval. For example,
 100         p=0.95 gives a 95% confidence interval. Currently this function
 101         only handles numerical values except in the trivial case p=1.
 102
 103         Examples usage:
 104             # One standard deviation
 105             >>> N = Normal(0, 1)
 106             >>> N.confidence(0.68)
 107             (-0.994457883209753, 0.994457883209753)
 108             >>> N.probability(*_).evalf()
 109             0.680000000000000
 110
 111             # Two standard deviations
 112             >>> N = Normal(0, 1)
 113             >>> N.confidence(0.95)
 114             (-1.95996398454005, 1.95996398454005)
 115             >>> N.probability(*_).evalf()
 116             0.950000000000000
 117         """
 118
 119         if p == 1:
 120             return (-oo, oo)
 121
 122         assert p <= 1
 123
 124         # In terms of n*sigma, we have n = sqrt(2)*ierf(p). The inverse
 125         # error function is not yet implemented in SymPy but can easily be
 126         # computed numerically
 127
 128         from sympy.mpmath import mpf, secant, erf
 129
 130         p = mpf(p)
 131         # calculate y = ierf(p) by solving erf(y) - p = 0
 132         y = secant(lambda y: erf(y) - p, 0)
 133         t = Real(str(mpf(float(s.sigma)) * mpf(2)**0.5 * y))
 134         mu = s.mu.evalf()
 135         return (mu-t, mu+t)
 136
 137     @staticmethod
 138     def fit(sample):
 139         """Create a normal distribution fit to the mean and standard
 140         deviation of the given distribution or sample."""
 141         if not hasattr(sample, "stddev"):
 142             sample = Sample(sample)
 143         return Normal(sample.mean, sample.stddev)
 144
 145
 146 class Uniform(ContinuousProbability):
 147     """
 148     Uniform(a, b) represents a probability distribution with uniform
 149     probability density on the interval [a, b] and zero density
 150     everywhere else.
 151     """
 152     def __init__(self, a, b):
 153         self.a = sympify(a)
 154         self.b = sympify(b)
 155
 156     mean = property(lambda s: (s.a+s.b)/2)
 157     median = property(lambda s: (s.a+s.b)/2)
 158     mode = property(lambda s: (s.a+s.b)/2)  # arbitrary
 159     variance = property(lambda s: (s.b-s.a)**2 / 12)
 160     stddev = property(lambda s: sqrt(s.variance))
 161
 162     def pdf(s, x):
 163         """Return the probability density function as an expression in x"""
 164         x = sympify(x)
 165         if not x.is_Number:
 166             raise NotImplementedError("SymPy does not yet support"
 167                 "piecewise functions")
 168         if x < s.a or x > s.b:
 169             return Rational(0)
 170         return 1/(s.b-s.a)
 171
 172     def cdf(s, x):
 173         """Return the cumulative density function as an expression in x"""
 174         x = sympify(x)
 175         if not x.is_Number:
 176             raise NotImplementedError("SymPy does not yet support"
 177                 "piecewise functions")
 178         if x <= s.a:
 179             return Rational(0)
 180         if x >= s.b:
 181             return Rational(1)
 182         return (x-s.a)/(s.b-s.a)
 183
 184     def _random(s):
 185         return Real(random.uniform(float(s.a), float(s.b)))
 186
 187     def confidence(s, p):
 188         """Generate a symmetric (p*100)% confidence interval.
 189
 190         >>> U = Uniform(1, 2)
 191         >>> U.confidence(1)
 192         (1, 2)
 193         >>> U.confidence(Rational(1,2))
 194         (5/4, 7/4)
 195         """
 196         p = sympify(p)
 197         assert p <= 1
 198
 199         d = (s.b-s.a)*p / 2
 200         return (s.mean - d, s.mean + d)
 201
 202     @staticmethod
 203     def fit(sample):
 204         """Create a uniform distribution fit to the mean and standard
 205         deviation of the given distribution or sample."""
 206         if not hasattr(sample, "stddev"):
 207             sample = Sample(sample)
 208         m = sample.mean
 209         d = sqrt(12*sample.variance)/2
 210         return Uniform(m-d, m+d)
 211
 212 class PDF(ContinuousProbability):
 213     """
 214     PDF(func, (x, a, b)) represents continuous probability distribution
 215     with probability distribution function func(x) on interval (a, b)
 216
 217     If func is not normalized so that integrate(func, (x, a, b)) == 1,
 218     it can be normalized using PDF.normalize() method
 219
 220     Example usage:
 221
 222         >>> from sympy import Symbol
 223         >>> x = Symbol('x')
 224         >>> a = Symbol('a', positive=True)
 225
 226         >>> exponential = PDF(exp(-x/a)/a, (x,0,oo))
 227         >>> exponential.pdf(x)
 228         1/a*exp(-x/a)
 229         >>> exponential.cdf(x)
 230         1 - exp(-x/a)
 231         >>> exponential.mean
 232         a
 233         >>> exponential.variance
 234         a**2
 235     """
 236
 237     def __init__(self, pdf, var):
 238         #XXX maybe add some checking of parameters
 239         if  isinstance(var, (tuple, list)):
 240             self.pdf = Lambda(var[0], pdf)
 241             self.domain = tuple(var[1:])
 242         else:
 243             self.pdf = Lambda(var, pdf)
 244             self.domain = (-oo, oo)
 245         self._cdf = None
 246         self._mean = None
 247         self._variance = None
 248         self._stddev = None
 249
 250
 251     def normalize(self):
 252         """
 253         Normalize the probability distribution function so that
 254         integrate(self.pdf(x), (x, a, b)) == 1
 255
 256         Example usage:
 257
 258             >>> from sympy import Symbol
 259             >>> x = Symbol('x')
 260             >>> a = Symbol('a', positive=True)
 261
 262             >>> exponential = PDF(exp(-x/a), (x,0,oo))
 263             >>> exponential.normalize().pdf(x)
 264             1/a*exp(-x/a)
 265         """
 266
 267         norm = self.probability(*self.domain)
 268         if norm != 1:
 269             w = Symbol('w', real=True, dummy=True)
 270             return self.__class__(self.pdf(w)/norm, (w, self.domain[0], self.domain[1]))
 271             #self._cdf = Lambda(w, (self.cdf(w) - self.cdf(self.domain[0]))/norm)
 272             #if self._mean is not None:
 273             #    self._mean /= norm
 274             #if self._variance is not None:
 275             #    self._variance = (self._variance + (self._mean*norm)**2)/norm - self.mean**2
 276             #if self._stddev is not None:
 277             #    self._stddev = sqrt(self._variance)
 278         else:
 279             return self
 280
 281
 282     def cdf(self, x):
 283         x = sympify(x)
 284         if self._cdf is not None:
 285             return self._cdf(x)
 286         else:
 287             from sympy import integrate
 288             w = Symbol('w', real=True, dummy=True)
 289             self._cdf = integrate(self.pdf(w), w)
 290             self._cdf = Lambda(w, self._cdf - self._cdf.subs(w, self.domain[0]))
 291             return self._cdf(x)
 292
 293     def _get_mean(self):
 294         if self._mean is not None:
 295             return self._mean
 296         else:
 297             from sympy import integrate
 298             w = Symbol('w', real=True, dummy=True)
 299             self._mean = integrate(self.pdf(w)*w,(w,self.domain[0],self.domain[1]))
 300             return self._mean
 301
 302     def _get_variance(self):
 303         if self._variance is not None:
 304             return self._variance
 305         else:
 306             from sympy import integrate, trim, together
 307             w = Symbol('w', real=True, dummy=True)
 308             self._variance = integrate(self.pdf(w)*w**2,(w,self.domain[0],self.domain[1])) - self.mean**2
 309             self._variance = trim(self._variance)
 310             return self._variance
 311
 312     def _get_stddev(self):
 313         if self._stddev is not None:
 314             return self._stddev
 315         else:
 316             self._stddev = sqrt(self.variance)
 317             return self._stddev
 318
 319     mean = property(_get_mean)
 320     variance = property(_get_variance)
 321     stddev = property(_get_stddev)
 322
 323
 324     def _random(s):
 325         raise NotImplementedError
 326
 327     def transform(self,func,var):
 328         """Return a probability distribution of random variable func(x)
 329         currently only some simple injective functions are supported"""
 330
 331         w = Symbol('w', real=True, dummy=True)
 332
 333         from sympy import solve
 334         inverse = solve(func-w, var)
 335         newPdf = S.Zero
 336         funcdiff = func.diff(var)
 337         #TODO check if x is in domain
 338         for x in inverse:
 339             # this assignment holds only for x in domain
 340             # in general it would require implementing
 341             # piecewise defined functions in sympy
 342             newPdf += (self.pdf(var)/abs(funcdiff)).subs(var,x)
 343
 344         return PDF(newPdf, (w, func.subs(var, self.domain[0]), func.subs(var, self.domain[1])))