2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2012,2014, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
37 #include "statistics.h"
42 #include "gromacs/math/vec.h"
43 #include "gromacs/utility/real.h"
44 #include "gromacs/utility/smalloc.h"
46 static int gmx_dnint(double x
)
51 typedef struct gmx_stats
{
52 double aa
, a
, b
, sigma_aa
, sigma_a
, sigma_b
, aver
, sigma_aver
, error
;
53 double rmsd
, Rdata
, Rfit
, Rfitaa
, chi2
, chi2aa
;
54 double *x
, *y
, *dx
, *dy
;
59 gmx_stats_t
gmx_stats_init()
65 return (gmx_stats_t
) stats
;
68 int gmx_stats_get_npoints(gmx_stats_t gstats
, int *N
)
70 gmx_stats
*stats
= (gmx_stats
*) gstats
;
77 int gmx_stats_done(gmx_stats_t gstats
)
79 gmx_stats
*stats
= (gmx_stats
*) gstats
;
93 int gmx_stats_add_point(gmx_stats_t gstats
, double x
, double y
,
96 gmx_stats
*stats
= (gmx_stats
*) gstats
;
99 if (stats
->np
+1 >= stats
->nalloc
)
101 if (stats
->nalloc
== 0)
103 stats
->nalloc
= 1024;
109 srenew(stats
->x
, stats
->nalloc
);
110 srenew(stats
->y
, stats
->nalloc
);
111 srenew(stats
->dx
, stats
->nalloc
);
112 srenew(stats
->dy
, stats
->nalloc
);
113 for (i
= stats
->np
; (i
< stats
->nalloc
); i
++)
121 stats
->x
[stats
->np
] = x
;
122 stats
->y
[stats
->np
] = y
;
123 stats
->dx
[stats
->np
] = dx
;
124 stats
->dy
[stats
->np
] = dy
;
131 int gmx_stats_get_point(gmx_stats_t gstats
, real
*x
, real
*y
,
132 real
*dx
, real
*dy
, real level
)
134 gmx_stats
*stats
= (gmx_stats
*) gstats
;
138 if ((ok
= gmx_stats_get_rmsd(gstats
, &rmsd
)) != estatsOK
)
143 while ((outlier
== 0) && (stats
->np_c
< stats
->np
))
145 r
= fabs(stats
->x
[stats
->np_c
] - stats
->y
[stats
->np_c
]);
146 outlier
= (r
> rmsd
*level
);
151 *x
= stats
->x
[stats
->np_c
];
155 *y
= stats
->y
[stats
->np_c
];
159 *dx
= stats
->dx
[stats
->np_c
];
163 *dy
= stats
->dy
[stats
->np_c
];
176 return estatsNO_POINTS
;
179 int gmx_stats_add_points(gmx_stats_t gstats
, int n
, real
*x
, real
*y
,
184 for (i
= 0; (i
< n
); i
++)
186 if ((ok
= gmx_stats_add_point(gstats
, x
[i
], y
[i
],
187 (NULL
!= dx
) ? dx
[i
] : 0,
188 (NULL
!= dy
) ? dy
[i
] : 0)) != estatsOK
)
196 static int gmx_stats_compute(gmx_stats
*stats
, int weight
)
198 double yy
, yx
, xx
, sx
, sy
, dy
, chi2
, chi2aa
, d2
;
199 double ssxx
, ssyy
, ssxy
;
200 double w
, wtot
, yx_nw
, sy_nw
, sx_nw
, yy_nw
, xx_nw
, dx2
, dy2
;
204 if (stats
->computed
== 0)
208 return estatsNO_POINTS
;
218 for (i
= 0; (i
< N
); i
++)
220 d2
+= dsqr(stats
->x
[i
]-stats
->y
[i
]);
221 if ((stats
->dy
[i
]) && (weight
== elsqWEIGHT_Y
))
223 w
= 1/dsqr(stats
->dy
[i
]);
232 xx
+= w
*dsqr(stats
->x
[i
]);
233 xx_nw
+= dsqr(stats
->x
[i
]);
235 yy
+= w
*dsqr(stats
->y
[i
]);
236 yy_nw
+= dsqr(stats
->y
[i
]);
238 yx
+= w
*stats
->y
[i
]*stats
->x
[i
];
239 yx_nw
+= stats
->y
[i
]*stats
->x
[i
];
242 sx_nw
+= stats
->x
[i
];
245 sy_nw
+= stats
->y
[i
];
248 /* Compute average, sigma and error */
249 stats
->aver
= sy_nw
/N
;
250 stats
->sigma_aver
= sqrt(yy_nw
/N
- dsqr(sy_nw
/N
));
251 stats
->error
= stats
->sigma_aver
/sqrt(N
);
253 /* Compute RMSD between x and y */
254 stats
->rmsd
= sqrt(d2
/N
);
256 /* Correlation coefficient for data */
262 ssxx
= N
*(xx_nw
- dsqr(sx_nw
));
263 ssyy
= N
*(yy_nw
- dsqr(sy_nw
));
264 ssxy
= N
*(yx_nw
- (sx_nw
*sy_nw
));
265 stats
->Rdata
= sqrt(dsqr(ssxy
)/(ssxx
*ssyy
));
267 /* Compute straight line through datapoints, either with intercept
268 zero (result in aa) or with intercept variable (results in a
276 stats
->a
= (yx
-sx
*sy
)/(xx
-sx
*sx
);
277 stats
->b
= (sy
)-(stats
->a
)*(sx
);
279 /* Compute chi2, deviation from a line y = ax+b. Also compute
280 chi2aa which returns the deviation from a line y = ax. */
283 for (i
= 0; (i
< N
); i
++)
285 if (stats
->dy
[i
] > 0)
293 chi2aa
+= dsqr((stats
->y
[i
]-(stats
->aa
*stats
->x
[i
]))/dy
);
294 chi2
+= dsqr((stats
->y
[i
]-(stats
->a
*stats
->x
[i
]+stats
->b
))/dy
);
298 stats
->chi2
= sqrt(chi2
/(N
-2));
299 stats
->chi2aa
= sqrt(chi2aa
/(N
-2));
301 /* Look up equations! */
304 stats
->sigma_a
= sqrt(stats
->chi2
/((N
-2)*dx2
));
305 stats
->sigma_b
= stats
->sigma_a
*sqrt(xx
);
306 stats
->Rfit
= fabs(ssxy
)/sqrt(ssxx
*ssyy
);
307 /*stats->a*sqrt(dx2/dy2);*/
308 stats
->Rfitaa
= stats
->aa
*sqrt(dx2
/dy2
);
326 int gmx_stats_get_ab(gmx_stats_t gstats
, int weight
,
327 real
*a
, real
*b
, real
*da
, real
*db
,
328 real
*chi2
, real
*Rfit
)
330 gmx_stats
*stats
= (gmx_stats
*) gstats
;
333 if ((ok
= gmx_stats_compute(stats
, weight
)) != estatsOK
)
347 *da
= stats
->sigma_a
;
351 *db
= stats
->sigma_b
;
365 int gmx_stats_get_a(gmx_stats_t gstats
, int weight
, real
*a
, real
*da
,
366 real
*chi2
, real
*Rfit
)
368 gmx_stats
*stats
= (gmx_stats
*) gstats
;
371 if ((ok
= gmx_stats_compute(stats
, weight
)) != estatsOK
)
381 *da
= stats
->sigma_aa
;
385 *chi2
= stats
->chi2aa
;
389 *Rfit
= stats
->Rfitaa
;
395 int gmx_stats_get_average(gmx_stats_t gstats
, real
*aver
)
397 gmx_stats
*stats
= (gmx_stats
*) gstats
;
400 if ((ok
= gmx_stats_compute(stats
, elsqWEIGHT_NONE
)) != estatsOK
)
410 int gmx_stats_get_ase(gmx_stats_t gstats
, real
*aver
, real
*sigma
, real
*error
)
412 gmx_stats
*stats
= (gmx_stats
*) gstats
;
415 if ((ok
= gmx_stats_compute(stats
, elsqWEIGHT_NONE
)) != estatsOK
)
426 *sigma
= stats
->sigma_aver
;
430 *error
= stats
->error
;
436 int gmx_stats_get_sigma(gmx_stats_t gstats
, real
*sigma
)
438 gmx_stats
*stats
= (gmx_stats
*) gstats
;
441 if ((ok
= gmx_stats_compute(stats
, elsqWEIGHT_NONE
)) != estatsOK
)
446 *sigma
= stats
->sigma_aver
;
451 int gmx_stats_get_error(gmx_stats_t gstats
, real
*error
)
453 gmx_stats
*stats
= (gmx_stats
*) gstats
;
456 if ((ok
= gmx_stats_compute(stats
, elsqWEIGHT_NONE
)) != estatsOK
)
461 *error
= stats
->error
;
466 int gmx_stats_get_corr_coeff(gmx_stats_t gstats
, real
*R
)
468 gmx_stats
*stats
= (gmx_stats
*) gstats
;
471 if ((ok
= gmx_stats_compute(stats
, elsqWEIGHT_NONE
)) != estatsOK
)
481 int gmx_stats_get_rmsd(gmx_stats_t gstats
, real
*rmsd
)
483 gmx_stats
*stats
= (gmx_stats
*) gstats
;
486 if ((ok
= gmx_stats_compute(stats
, elsqWEIGHT_NONE
)) != estatsOK
)
496 int gmx_stats_dump_xy(gmx_stats_t gstats
, FILE *fp
)
498 gmx_stats
*stats
= (gmx_stats
*) gstats
;
501 for (i
= 0; (i
< stats
->np
); i
++)
503 fprintf(fp
, "%12g %12g %12g %12g\n", stats
->x
[i
], stats
->y
[i
],
504 stats
->dx
[i
], stats
->dy
[i
]);
510 int gmx_stats_remove_outliers(gmx_stats_t gstats
, double level
)
512 gmx_stats
*stats
= (gmx_stats
*) gstats
;
513 int i
, iter
= 1, done
= 0, ok
;
516 while ((stats
->np
>= 10) && !done
)
518 if ((ok
= gmx_stats_get_rmsd(gstats
, &rmsd
)) != estatsOK
)
523 for (i
= 0; (i
< stats
->np
); )
525 r
= fabs(stats
->x
[i
]-stats
->y
[i
]);
528 fprintf(stderr
, "Removing outlier, iter = %d, rmsd = %g, x = %g, y = %g\n",
529 iter
, rmsd
, stats
->x
[i
], stats
->y
[i
]);
532 stats
->x
[i
] = stats
->x
[stats
->np
-1];
533 stats
->y
[i
] = stats
->y
[stats
->np
-1];
534 stats
->dx
[i
] = stats
->dx
[stats
->np
-1];
535 stats
->dy
[i
] = stats
->dy
[stats
->np
-1];
551 int gmx_stats_make_histogram(gmx_stats_t gstats
, real binwidth
, int *nb
,
552 int ehisto
, int normalized
, real
**x
, real
**y
)
554 gmx_stats
*stats
= (gmx_stats
*) gstats
;
555 int i
, ok
, index
= 0, nbins
= *nb
, *nindex
;
556 double minx
, maxx
, maxy
, miny
, delta
, dd
, minh
;
558 if (((binwidth
<= 0) && (nbins
<= 0)) ||
559 ((binwidth
> 0) && (nbins
> 0)))
561 return estatsINVALID_INPUT
;
565 return estatsNO_POINTS
;
567 minx
= maxx
= stats
->x
[0];
568 miny
= maxy
= stats
->y
[0];
569 for (i
= 1; (i
< stats
->np
); i
++)
571 miny
= (stats
->y
[i
] < miny
) ? stats
->y
[i
] : miny
;
572 maxy
= (stats
->y
[i
] > maxy
) ? stats
->y
[i
] : maxy
;
573 minx
= (stats
->x
[i
] < minx
) ? stats
->x
[i
] : minx
;
574 maxx
= (stats
->x
[i
] > maxx
) ? stats
->x
[i
] : maxx
;
576 if (ehisto
== ehistoX
)
581 else if (ehisto
== ehistoY
)
588 return estatsINVALID_INPUT
;
593 binwidth
= (delta
)/nbins
;
597 nbins
= gmx_dnint((delta
)/binwidth
+ 0.5);
601 for (i
= 0; (i
< nbins
); i
++)
603 (*x
)[i
] = minh
+ binwidth
*(i
+0.5);
611 dd
= 1.0/(binwidth
*stats
->np
);
615 for (i
= 0; (i
< stats
->np
); i
++)
617 if (ehisto
== ehistoY
)
619 index
= (stats
->y
[i
]-miny
)/binwidth
;
621 else if (ehisto
== ehistoX
)
623 index
= (stats
->x
[i
]-minx
)/binwidth
;
640 for (i
= 0; (i
< nbins
); i
++)
644 (*y
)[i
] /= nindex
[i
];
653 static const char *stats_error
[estatsNR
] =
655 "All well in STATS land",
658 "Invalid histogram input",
660 "Not implemented yet"
663 const char *gmx_stats_message(int estats
)
665 if ((estats
>= 0) && (estats
< estatsNR
))
667 return stats_error
[estats
];
671 return stats_error
[estatsERROR
];
675 /* Old convenience functions, should be merged with the core
677 int lsq_y_ax(int n
, real x
[], real y
[], real
*a
)
679 gmx_stats_t lsq
= gmx_stats_init();
683 gmx_stats_add_points(lsq
, n
, x
, y
, 0, 0);
684 if ((ok
= gmx_stats_get_a(lsq
, elsqWEIGHT_NONE
, a
, &da
, &chi2
, &Rfit
)) != estatsOK
)
693 for (i=0; i<n; i++) {
702 static int low_lsq_y_ax_b(int n
, real
*xr
, double *xd
, real yr
[],
703 real
*a
, real
*b
, real
*r
, real
*chi2
)
708 lsq
= gmx_stats_init();
709 for (i
= 0; (i
< n
); i
++)
711 if ((ok
= gmx_stats_add_point(lsq
, (NULL
!= xd
) ? xd
[i
] : xr
[i
], yr
[i
], 0, 0))
717 if ((ok
= gmx_stats_get_ab(lsq
, elsqWEIGHT_NONE
, a
, b
, NULL
, NULL
, chi2
, r
)) != estatsOK
)
724 double x,y,yx,xx,yy,sx,sy,chi2;
727 for (i=0; i<n; i++) {
741 * a = (n*yx-sy*sx)/(n*xx-sx*sx);
742 * b = (sy-(*a)*sx)/n;
743 * r = sqrt((xx-sx*sx)/(yy-sy*sy));
748 chi2 += dsqr(yr[i] - ((*a)*xd[i] + (*b)));
751 chi2 += dsqr(yr[i] - ((*a)*xr[i] + (*b)));
755 return sqrt(chi2/(n-2));
761 int lsq_y_ax_b(int n
, real x
[], real y
[], real
*a
, real
*b
, real
*r
, real
*chi2
)
763 return low_lsq_y_ax_b(n
, x
, NULL
, y
, a
, b
, r
, chi2
);
766 int lsq_y_ax_b_xdouble(int n
, double x
[], real y
[], real
*a
, real
*b
,
769 return low_lsq_y_ax_b(n
, NULL
, x
, y
, a
, b
, r
, chi2
);
772 int lsq_y_ax_b_error(int n
, real x
[], real y
[], real dy
[],
773 real
*a
, real
*b
, real
*da
, real
*db
,
779 lsq
= gmx_stats_init();
780 for (i
= 0; (i
< n
); i
++)
782 if ((ok
= gmx_stats_add_point(lsq
, x
[i
], y
[i
], 0, dy
[i
])) != estatsOK
)
787 if ((ok
= gmx_stats_get_ab(lsq
, elsqWEIGHT_Y
, a
, b
, da
, db
, chi2
, r
)) != estatsOK
)
791 if ((ok
= gmx_stats_done(lsq
)) != estatsOK
)
799 double sxy,sxx,syy,sx,sy,w,s_2,dx2,dy2,mins;
801 sxy=sxx=syy=sx=sy=w=0.0;
804 mins = min(mins,dy[i]);
806 gmx_fatal(FARGS,"Zero or negative weigths in linear regression analysis");
808 for (i=0; i<n; i++) {
809 s_2 = dsqr(1.0/dy[i]);
810 sxx += s_2*dsqr(x[i]);
811 sxy += s_2*y[i]*x[i];
812 syy += s_2*dsqr(y[i]);
829 * chi2+=dsqr((y[i]-((*a)*x[i]+(*b)))/dy[i]);
832 * da = sqrt(*chi2/((n-2)*dx2));
833 * db = *da*sqrt(sxx);
834 * r = *a*sqrt(dx2/dy2);
837 fprintf(debug,"sx = %g, sy = %g, sxy = %g, sxx = %g, w = %g\n"
838 "chi2 = %g, dx2 = %g\n",
839 sx,sy,sxy,sxx,w,*chi2,dx2);
842 * chi2 = sqrt(*chi2/(n-2));