/* kfunc.c -- special functions (log-gamma, erfc, regularized incomplete
 * gamma and beta) and Fisher's exact test for 2x2 tables. */
  1. #include <math.h>
  2. #include <stdlib.h>
  3. #include "htslib/kfunc.h"
  4. /* Log gamma function
  5. * \log{\Gamma(z)}
  6. * AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245
  7. */
  8. double kf_lgamma(double z)
  9. {
  10. double x = 0;
  11. x += 0.1659470187408462e-06 / (z+7);
  12. x += 0.9934937113930748e-05 / (z+6);
  13. x -= 0.1385710331296526 / (z+5);
  14. x += 12.50734324009056 / (z+4);
  15. x -= 176.6150291498386 / (z+3);
  16. x += 771.3234287757674 / (z+2);
  17. x -= 1259.139216722289 / (z+1);
  18. x += 676.5203681218835 / z;
  19. x += 0.9999999999995183;
  20. return log(x) - 5.58106146679532777 - z + (z-0.5) * log(z+6.5);
  21. }
  22. /* complementary error function
  23. * \frac{2}{\sqrt{\pi}} \int_x^{\infty} e^{-t^2} dt
  24. * AS66, 2nd algorithm, http://lib.stat.cmu.edu/apstat/66
  25. */
  26. double kf_erfc(double x)
  27. {
  28. const double p0 = 220.2068679123761;
  29. const double p1 = 221.2135961699311;
  30. const double p2 = 112.0792914978709;
  31. const double p3 = 33.912866078383;
  32. const double p4 = 6.37396220353165;
  33. const double p5 = .7003830644436881;
  34. const double p6 = .03526249659989109;
  35. const double q0 = 440.4137358247522;
  36. const double q1 = 793.8265125199484;
  37. const double q2 = 637.3336333788311;
  38. const double q3 = 296.5642487796737;
  39. const double q4 = 86.78073220294608;
  40. const double q5 = 16.06417757920695;
  41. const double q6 = 1.755667163182642;
  42. const double q7 = .08838834764831844;
  43. double expntl, z, p;
  44. z = fabs(x) * M_SQRT2;
  45. if (z > 37.) return x > 0.? 0. : 2.;
  46. expntl = exp(z * z * - .5);
  47. if (z < 10. / M_SQRT2) // for small z
  48. p = expntl * ((((((p6 * z + p5) * z + p4) * z + p3) * z + p2) * z + p1) * z + p0)
  49. / (((((((q7 * z + q6) * z + q5) * z + q4) * z + q3) * z + q2) * z + q1) * z + q0);
  50. else p = expntl / 2.506628274631001 / (z + 1. / (z + 2. / (z + 3. / (z + 4. / (z + .65)))));
  51. return x > 0.? 2. * p : 2. * (1. - p);
  52. }
  /* Regularized incomplete gamma functions.
   *
   * Formulas are taken from Wiki, with additional input from Numerical
   * Recipes in C (for the modified Lentz's algorithm) and AS245
   * (http://lib.stat.cmu.edu/apstat/245).
   *
   * A good online calculator is available at:
   *
   *   http://www.danielsoper.com/statcalc/calc23.aspx
   *
   * It calculates the upper incomplete gamma function, which equals
   * kf_gammaq(s,z)*tgamma(s).
   */

  /* Convergence threshold used by the series and continued-fraction loops. */
  #define KF_GAMMA_EPS (1e-14)
  /* Floor applied to the C and D terms of the continued-fraction loops so
   * that they are never divided by a value below it. */
  #define KF_TINY (1e-290)
  67. // regularized lower incomplete gamma function, by series expansion
  68. static double _kf_gammap(double s, double z)
  69. {
  70. double sum, x;
  71. int k;
  72. for (k = 1, sum = x = 1.; k < 100; ++k) {
  73. sum += (x *= z / (s + k));
  74. if (x / sum < KF_GAMMA_EPS) break;
  75. }
  76. return exp(s * log(z) - z - kf_lgamma(s + 1.) + log(sum));
  77. }
  78. // regularized upper incomplete gamma function, by continued fraction
  79. static double _kf_gammaq(double s, double z)
  80. {
  81. int j;
  82. double C, D, f;
  83. f = 1. + z - s; C = f; D = 0.;
  84. // Modified Lentz's algorithm for computing continued fraction
  85. // See Numerical Recipes in C, 2nd edition, section 5.2
  86. for (j = 1; j < 100; ++j) {
  87. double a = j * (s - j), b = (j<<1) + 1 + z - s, d;
  88. D = b + a * D;
  89. if (D < KF_TINY) D = KF_TINY;
  90. C = b + a / C;
  91. if (C < KF_TINY) C = KF_TINY;
  92. D = 1. / D;
  93. d = C * D;
  94. f *= d;
  95. if (fabs(d - 1.) < KF_GAMMA_EPS) break;
  96. }
  97. return exp(s * log(z) - z - kf_lgamma(s) - log(f));
  98. }
  /* Regularized lower incomplete gamma function.
   *
   * Uses the series expansion when z <= 1 or z < s; otherwise takes the
   * complement of the continued-fraction value.
   *
   * s: shape parameter; z: upper integration limit.
   */
  double kf_gammap(double s, double z)
  {
      if (z <= 1. || z < s) {
          return _kf_gammap(s, z);
      }
      return 1. - _kf_gammaq(s, z);
  }
  /* Regularized upper incomplete gamma function.
   *
   * Takes the complement of the series expansion when z <= 1 or z < s;
   * otherwise evaluates the continued fraction directly.
   *
   * s: shape parameter; z: lower integration limit.
   */
  double kf_gammaq(double s, double z)
  {
      if (z <= 1. || z < s) {
          return 1. - _kf_gammap(s, z);
      }
      return _kf_gammaq(s, z);
  }
  107. /* Regularized incomplete beta function. The method is taken from
  108. * Numerical Recipe in C, 2nd edition, section 6.4. The following web
  109. * page calculates the incomplete beta function, which equals
  110. * kf_betai(a,b,x) * gamma(a) * gamma(b) / gamma(a+b):
  111. *
  112. * http://www.danielsoper.com/statcalc/calc36.aspx
  113. */
  114. static double kf_betai_aux(double a, double b, double x)
  115. {
  116. double C, D, f;
  117. int j;
  118. if (x == 0.) return 0.;
  119. if (x == 1.) return 1.;
  120. f = 1.; C = f; D = 0.;
  121. // Modified Lentz's algorithm for computing continued fraction
  122. for (j = 1; j < 200; ++j) {
  123. double aa, d;
  124. int m = j>>1;
  125. aa = (j&1)? -(a + m) * (a + b + m) * x / ((a + 2*m) * (a + 2*m + 1))
  126. : m * (b - m) * x / ((a + 2*m - 1) * (a + 2*m));
  127. D = 1. + aa * D;
  128. if (D < KF_TINY) D = KF_TINY;
  129. C = 1. + aa / C;
  130. if (C < KF_TINY) C = KF_TINY;
  131. D = 1. / D;
  132. d = C * D;
  133. f *= d;
  134. if (fabs(d - 1.) < KF_GAMMA_EPS) break;
  135. }
  136. return exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b) + a * log(x) + b * log(1.-x)) / a / f;
  137. }
  /* Regularized incomplete beta function for parameters a, b at point x.
   *
   * For x below (a+1)/(a+b+2) the continued fraction is evaluated directly;
   * otherwise it is evaluated with a and b swapped at 1-x and the result is
   * subtracted from 1.
   */
  double kf_betai(double a, double b, double x)
  {
      const double pivot = (a + 1.) / (a + b + 2.);

      if (x < pivot) {
          return kf_betai_aux(a, b, x);
      }
      return 1. - kf_betai_aux(b, a, 1. - x);
  }
  #ifdef KF_MAIN
  #include <stdio.h>
  /* Demo driver, built only when KF_MAIN is defined. Prints kf_erfc()
   * next to the C library erfc(), an upper incomplete gamma value, and an
   * incomplete beta value, each for one fixed set of arguments. */
  int main(int argc, char *argv[])
  {
      double x = 5.5, y = 3;
      double a, b;
      double lib_erfc, own_erfc, upper_gamma, inc_beta;

      lib_erfc = erfc(x);
      own_erfc = kf_erfc(x);
      printf("erfc(%lg): %lg, %lg\n", x, lib_erfc, own_erfc);

      /* un-regularize: multiply by Gamma(y) */
      upper_gamma = kf_gammaq(y, x)*tgamma(y);
      printf("upper-gamma(%lg,%lg): %lg\n", x, y, upper_gamma);

      a = 2;
      b = 2;
      x = 0.5;
      /* un-regularize: divide by Gamma(a+b) / (Gamma(a) Gamma(b)) */
      inc_beta = kf_betai(a, b, x) / exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b));
      printf("incomplete-beta(%lg,%lg,%lg): %lg\n", a, b, x, inc_beta);

      return 0;
  }
  #endif
  155. // log\binom{n}{k}
  156. static double lbinom(int n, int k)
  157. {
  158. if (k == 0 || n == k) return 0;
  159. return lgamma(n+1) - lgamma(k+1) - lgamma(n-k+1);
  160. }
  /* Hypergeometric distribution for a 2x2 table laid out as
   *
   *   n11  n12  | n1_
   *   n21  n22  | n2_
   *  -----------+----
   *   n_1  n_2  | n
   *
   * Returns C(n1_, n11) * C(n - n1_, n_1 - n11) / C(n, n_1), evaluated in
   * log space through lbinom().
   */
  static double hypergeo(int n11, int n1_, int n_1, int n)
  {
      const double log_row1 = lbinom(n1_, n11);
      const double log_row2 = lbinom(n-n1_, n_1-n11);
      const double log_total = lbinom(n, n_1);

      return exp(log_row1 + log_row2 - log_total);
  }
  /* State carried between successive hypergeo_acc() calls. */
  typedef struct {
      int n11;    /* top-left cell of the table last evaluated */
      int n1_;    /* first-row total */
      int n_1;    /* first-column total */
      int n;      /* grand total */
      double p;   /* probability computed for the table above */
  } hgacc_t;
  174. // incremental version of hypergenometric distribution
  175. static double hypergeo_acc(int n11, int n1_, int n_1, int n, hgacc_t *aux)
  176. {
  177. if (n1_ || n_1 || n) {
  178. aux->n11 = n11; aux->n1_ = n1_; aux->n_1 = n_1; aux->n = n;
  179. } else { // then only n11 changed; the rest fixed
  180. if (n11%11 && n11 + aux->n - aux->n1_ - aux->n_1) {
  181. if (n11 == aux->n11 + 1) { // incremental
  182. aux->p *= (double)(aux->n1_ - aux->n11) / n11
  183. * (aux->n_1 - aux->n11) / (n11 + aux->n - aux->n1_ - aux->n_1);
  184. aux->n11 = n11;
  185. return aux->p;
  186. }
  187. if (n11 == aux->n11 - 1) { // incremental
  188. aux->p *= (double)aux->n11 / (aux->n1_ - n11)
  189. * (aux->n11 + aux->n - aux->n1_ - aux->n_1) / (aux->n_1 - n11);
  190. aux->n11 = n11;
  191. return aux->p;
  192. }
  193. }
  194. aux->n11 = n11;
  195. }
  196. aux->p = hypergeo(aux->n11, aux->n1_, aux->n_1, aux->n);
  197. return aux->p;
  198. }
  199. double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two)
  200. {
  201. int i, j, max, min;
  202. double p, q, left, right;
  203. hgacc_t aux;
  204. int n1_, n_1, n;
  205. n1_ = n11 + n12; n_1 = n11 + n21; n = n11 + n12 + n21 + n22; // calculate n1_, n_1 and n
  206. max = (n_1 < n1_) ? n_1 : n1_; // max n11, for right tail
  207. min = n1_ + n_1 - n; // not sure why n11-n22 is used instead of min(n_1,n1_)
  208. if (min < 0) min = 0; // min n11, for left tail
  209. *two = *_left = *_right = 1.;
  210. if (min == max) return 1.; // no need to do test
  211. q = hypergeo_acc(n11, n1_, n_1, n, &aux); // the probability of the current table
  212. // left tail
  213. p = hypergeo_acc(min, 0, 0, 0, &aux);
  214. for (left = 0., i = min + 1; p < 0.99999999 * q && i<=max; ++i) // loop until underflow
  215. left += p, p = hypergeo_acc(i, 0, 0, 0, &aux);
  216. --i;
  217. if (p < 1.00000001 * q) left += p;
  218. else --i;
  219. // right tail
  220. p = hypergeo_acc(max, 0, 0, 0, &aux);
  221. for (right = 0., j = max - 1; p < 0.99999999 * q && j>=0; --j) // loop until underflow
  222. right += p, p = hypergeo_acc(j, 0, 0, 0, &aux);
  223. ++j;
  224. if (p < 1.00000001 * q) right += p;
  225. else ++j;
  226. // two-tail
  227. *two = left + right;
  228. if (*two > 1.) *two = 1.;
  229. // adjust left and right
  230. if (abs(i - n11) < abs(j - n11)) right = 1. - left + q;
  231. else left = 1.0 - right + q;
  232. *_left = left; *_right = right;
  233. return q;
  234. }