From 7ec1689240351e99042feea022099d02163ce447 Mon Sep 17 00:00:00 2001 From: Robert Dodier Date: Sun, 15 Jan 2023 09:20:04 -0800 Subject: [PATCH] Implement functions to calculate maximum likelihood estimates for parameters for some statistical distributions. So far, mle_normal, mle_lognormal, and mle_weibull are implemented. The goal is to have a function mle_foo for every foo in the distrib package. This commit includes test cases for the new functions. --- share/distrib/distrib.mac | 82 ++++++++++++++++++++++++++++++++++++----- share/distrib/rtest_distrib.mac | 36 ++++++++++++++++++ 2 files changed, 109 insertions(+), 9 deletions(-) diff --git a/share/distrib/distrib.mac b/share/distrib/distrib.mac index e54252c01..b0baaa0f1 100644 --- a/share/distrib/distrib.mac +++ b/share/distrib/distrib.mac @@ -51,15 +51,16 @@ Continuous distributions: Discrete distributions: Noncentral Student (*noncentral_student_t) Functions: - Density function (pdf_*) - Distribution function (cdf_*) - Quantile (quantile_*) - Mean (mean_*) - Variance (var_*) - Standard deviation (std_*) - Skewness coefficient (skewness_*) - Kurtosis coefficient (kurtosis_*) - Random variate (random_*) + Density function (pdf_*) + Distribution function (cdf_*) + Quantile (quantile_*) + Mean (mean_*) + Variance (var_*) + Standard deviation (std_*) + Skewness coefficient (skewness_*) + Kurtosis coefficient (kurtosis_*) + Random variate (random_*) + Maximum likelihood estimates (mle_*) For example, pdf_student_t(x,n) is the density function of the Student distribution @@ -80,6 +81,8 @@ riotorto @@@ yahoo DOT com put('distrib, 2, 'version) $ +if get ('descriptive, 'version) = false then load ("descriptive"); + /* Sets the random state according to the computer clock time */ set_random_state(make_random_state(true))$ @@ -143,6 +146,13 @@ random_normal(m,s,[num]) := then m + s * ?rndnormal(no) else error("random_normal: check sample size")) $ +mle_normal (x, [w]) := + if x = [] + then ['location = und, 'scale = und] + else + if w = [] + then ['location = mean (x), 'scale = std (x)] + else ['location = mean (x, w[1]), 'scale = std (x, w[1])]; /* STUDENT DISTRIBUTION */ @@ -542,6 +552,14 @@ random_lognormal(m,s,[num]) := then exp(m + s * ?rndnormal(no)) else error("random_lognormal: check sample size") ) $ +mle_lognormal (x, [w]) := + if x = [] + then ['location = und, 'scale = und] + else + if w = [] + then ['location = mean (log (x)), 'scale = std (log (x))] + else ['location = mean (log (x), w[1]), 'scale = std (log (x), w[1])]; + /* GAMMA DISTRIBUTION */ @@ -921,6 +939,52 @@ random_weibull(a,b,[num]) := else b * (-map('log,makelist(random(1.0),k,1,no)))^(1.0/a) else error("random_weibull: check sample size")) $ +mle_weibull (x, [w]) := + if x = [] + then ['shape = und, 'scale = und] + else block ([shape, scale], + w: if w = [] then 1 else w[1], + shape: mle_weibull_shape (x, w), + scale: mle_weibull_scale (x, shape, w), + ['shape = shape, 'scale = scale]); + +mle_weibull_shape (x, [w]) := + block ([listarith: true, shape_eq, eq1], + w: if w = [] then 1 else w[1], + + /* The shape parameter is unchanged by rescaling x, + * so we could try to avoid overflow for large x and large k + * by rescaling; however, it's not clear what's a strategy + * for rescaling which works for very large and very small + * values at the same time. Just let it be for now. + */ + + shape_eq: lambda ([k], mean (x^k*log(x), w) / mean (x^k, w) - 1/k - mean (log(x), w)), + + /* Shape equation is monotonically increasing in k; see: + * N. Balakrishnan and M. Kateri. "On the maximum likelihood estimation of parameters + of Weibull distribution based on complete and censored data," + * Statistics and Probability Letters, vol. 78 (2008), pp. 2971--2975. + * + * Shape equation is negative for sufficiently small k, + * evaluate at k = 1 and then step down (if positive there) + * or step up (if negative) to find the other end point for search interval. + */ + + eq1: shape_eq(1), + if eq1 = 0 then 1 + elseif eq1 > 0 + then block ([k1: 1, k0: 1], + while shape_eq(k0) >= 0 do k0: k0/2, + find_root (shape_eq, k0, k1)) + else block ([k0: 1, k1: 1], + while shape_eq(k1) <= 0 do k1: k1*2, + find_root (shape_eq, k0, k1))); + +mle_weibull_scale (x, shape, [w]) := + if w = [] + then (noncentral_moment (x, shape))^(1/shape) + else (noncentral_moment (x, shape, w[1]))^(1/shape); /* RAYLEIGH DISTRIBUTION */ diff --git a/share/distrib/rtest_distrib.mac b/share/distrib/rtest_distrib.mac index c131a7868..a5b6b3e94 100644 --- a/share/distrib/rtest_distrib.mac +++ b/share/distrib/rtest_distrib.mac @@ -45,6 +45,15 @@ inf; ev (foo, q = 1/7); sqrt(2)*inverse_erf(-5/7)+2; +mle_normal ([11, 13, 17, 19, 23, 29]); +[location = 56/3, scale = sqrt(329)/3]; + +mle_normal ([11, 13, 13, 17, 17, 17, 19, 23, 23, 29, 29, 29]); +[location = 20, scale = sqrt(39)]; + +mle_normal ([11, 13, 17, 19, 23, 29], [1, 2, 3, 1, 2, 3]); +[location = 20, scale = sqrt(39)]; + killcontext (mycontext); done; @@ -166,6 +175,15 @@ inf; ev (foo, q = 2/5); %e^(2/10*sqrt(2)*inverse_erf(-1/5)+22/10); +mle_lognormal ([exp(17), exp(13), exp(11), exp(7), exp(5)]); +[location = 53/5, scale = 2*sqrt(114)/5]; + +mle_lognormal ([exp(17), exp(17), exp(17), exp(13), exp(11), exp(11), exp(7), exp(7), exp(7), exp(7), exp(5)]); +[location = 119/11, scale = 2*sqrt(582)/11]; + +mle_lognormal ([exp(17), exp(13), exp(11), exp(7), exp(5)], [3, 1, 2, 4, 1]/11); +[location = 119/11, scale = 2*sqrt(582)/11]; + /* gamma distribution */ @@ -298,6 +316,24 @@ if equal(q, 0) then 0 elseif equal(q, 1) then inf else ((-log(1 - q))^(1/11))/7; ev (foo, q = 3/17); (-log(14/17))^(1/11)/7; +/* I checked this result by applying lbfgs to the negative log likelihood */ +mle_weibull ([1, 19, 17, 23, 11, 43]); +[shape = 1.305349878443785, scale = 20.35298540894298]; + +/* Rescaling data should yield same shape and rescaled scale */ +mle_weibull ([1, 19, 17, 23, 11, 43]/10); +[shape = 1.305349878443785, scale = 2.035298540894298]; + +/* I checked this result by applying lbfgs to the negative log likelihood */ +mle_weibull ([1, 1, 1, 19, 19, 17, 23, 23, 23, 23, 11, 43]); +[shape = 1.147500774671715, scale = 17.68225583330855]; + +mle_weibull ([1, 19, 17, 23, 11, 43], [3, 2, 1, 4, 1, 1]/12); +[shape = 1.147500774671715, scale = 17.68225583330855]; + +mle_weibull (10*[1, 19, 17, 23, 11, 43], [3, 2, 1, 4, 1, 1]/12); +[shape = 1.147500774671715, scale = 176.8225583330855]; + /* rayleigh distribution is based on weibull */ -- 2.11.4.GIT