1"""
2Multiple Testing and P-Value Correction
3Author: Josef Perktold
4License: BSD-3
5Statsmodels
6https://www.statsmodels.org/
9Copyright (C) 2006, Jonathan E. Taylor
10All rights reserved.
12Copyright (c) 2006-2008 Scipy Developers.
13All rights reserved.
15Copyright (c) 2009-2018 statsmodels Developers.
16All rights reserved.
19Redistribution and use in source and binary forms, with or without
20modification, are permitted provided that the following conditions are met:
22 a. Redistributions of source code must retain the above copyright notice,
23 this list of conditions and the following disclaimer.
24 b. Redistributions in binary form must reproduce the above copyright
25 notice, this list of conditions and the following disclaimer in the
26 documentation and/or other materials provided with the distribution.
27 c. Neither the name of statsmodels nor the names of its contributors
28 may be used to endorse or promote products derived from this software
29 without specific prior written permission.
32THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
33AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
34IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
35ARE DISCLAIMED. IN NO EVENT SHALL STATSMODELS OR CONTRIBUTORS BE LIABLE FOR
36ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
37DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
38SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
39CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
40LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
41OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
42DAMAGE.
43"""

from collections import OrderedDict

import numpy as np

from ._knockoff import RegressionFDR

__all__ = ['fdrcorrection', 'fdrcorrection_twostage', 'local_fdr',
           'multipletests', 'NullDistribution', 'RegressionFDR']

# ==============================================
#
# Part 1: Multiple Tests and P-Value Correction
#
# ==============================================


def _ecdf(x):
    '''No-frills empirical CDF used in fdrcorrection.
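
    A minimal doctest sketch (illustrative; not in the original docstring):

    >>> _ecdf(np.array([0.1, 0.2, 0.7]))
    array([0.33333333, 0.66666667, 1.        ])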
    '''
    nobs = len(x)
    return np.arange(1, nobs + 1) / float(nobs)


multitest_methods_names = {'b': 'Bonferroni',
                           's': 'Sidak',
                           'h': 'Holm',
                           'hs': 'Holm-Sidak',
                           'sh': 'Simes-Hochberg',
                           'ho': 'Hommel',
                           'fdr_bh': 'FDR Benjamini-Hochberg',
                           'fdr_by': 'FDR Benjamini-Yekutieli',
                           'fdr_tsbh': 'FDR 2-stage Benjamini-Hochberg',
                           'fdr_tsbky': 'FDR 2-stage Benjamini-Krieger-Yekutieli',
                           'fdr_gbs': 'FDR adaptive Gavrilov-Benjamini-Sarkar',
                           }

_alias_list = [['b', 'bonf', 'bonferroni'],
               ['s', 'sidak'],
               ['h', 'holm'],
               ['hs', 'holm-sidak'],
               ['sh', 'simes-hochberg'],
               ['ho', 'hommel'],
               ['fdr_bh', 'fdr_i', 'fdr_p', 'fdri', 'fdrp'],
               ['fdr_by', 'fdr_n', 'fdr_c', 'fdrn', 'fdrcorr'],
               ['fdr_tsbh', 'fdr_2sbh'],
               ['fdr_tsbky', 'fdr_2sbky', 'fdr_twostage'],
               ['fdr_gbs'],
               ]


multitest_alias = OrderedDict()
for m in _alias_list:
    multitest_alias[m[0]] = m[0]
    for a in m[1:]:
        multitest_alias[a] = m[0]
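
# Illustrative note (not in the original source): the loop above maps every
# alias, and each canonical code itself, to its canonical method code, e.g.
# multitest_alias['bonf'] == 'b' and multitest_alias['fdr_twostage'] == 'fdr_tsbky'.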


def multipletests(pvals, alpha=0.05, method='hs', is_sorted=False,
                  returnsorted=False):
    """
    Test results and p-value correction for multiple tests.

    Parameters
    ----------
    pvals : array_like, 1-d
        Uncorrected p-values. Must be 1-dimensional.
    alpha : float
        FWER, family-wise error rate, e.g. 0.1.
    method : str
        Method used for testing and adjustment of p-values. Can be either the
        full name or initial letters. Available methods are:

        - `bonferroni` : one-step correction
        - `sidak` : one-step correction
        - `holm-sidak` : step-down method using Sidak adjustments
        - `holm` : step-down method using Bonferroni adjustments
        - `simes-hochberg` : step-up method (independent)
        - `hommel` : closed method based on Simes tests (non-negative)
        - `fdr_bh` : Benjamini/Hochberg (non-negative)
        - `fdr_by` : Benjamini/Yekutieli (negative)
        - `fdr_tsbh` : two-stage fdr correction (non-negative)
        - `fdr_tsbky` : two-stage fdr correction (non-negative)
        - `fdr_gbs` : adaptive step-down fdr correction
          (Gavrilov-Benjamini-Sarkar)
    is_sorted : bool
        If False (default), the p-values will be sorted, but the corrected
        p-values are in the original order. If True, it is assumed that the
        p-values are already sorted in ascending order.
    returnsorted : bool
        Not tested; return sorted p-values instead of the original sequence.

    Returns
    -------
    reject : ndarray, bool
        True for hypotheses that can be rejected for the given alpha.
    pvals_corrected : ndarray
        P-values corrected for multiple tests.
    alphacSidak : float
        Corrected alpha for the Sidak method.
    alphacBonf : float
        Corrected alpha for the Bonferroni method.

    Notes
    -----
    There may be API changes for this function in the future.

    Except for 'fdr_twostage', the p-value correction is independent of the
    alpha specified as argument. In these cases the corrected p-values can
    also be compared with a different alpha. In the case of 'fdr_twostage',
    the corrected p-values are specific to the given alpha; see
    ``fdrcorrection_twostage``.

    The 'fdr_gbs' procedure is not verified against another package; its
    p-values are derived from scratch and are not derived in the reference.
    In Monte Carlo experiments the method worked correctly and maintained the
    false discovery rate.

    All procedures that are included control FWER or FDR in the independent
    case, and most are robust in the positively correlated case. 'fdr_gbs'
    has high power and controls the fdr in the independent case, with only a
    small violation in the positively correlated case.

    **Timing**: Most of the time with large arrays is spent in `argsort`.
    When the p-values have to be corrected with several methods, it is more
    efficient to presort them and put the results back into the original
    order outside of this function.

    ``method='hommel'`` is very slow for large arrays, since it requires the
    evaluation of n partitions, where n is the number of p-values.
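
    Examples
    --------
    A minimal illustrative sketch (arbitrary p-values; Holm's step-down
    method):

    >>> import numpy as np
    >>> reject, pvals_corr, _, _ = multipletests(
    ...     np.array([0.001, 0.01, 0.04, 0.3]), alpha=0.05, method='holm')
    >>> reject
    array([ True,  True, False, False])
    >>> pvals_corr
    array([0.004, 0.03 , 0.08 , 0.3  ])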
165 """
    import gc

    pvals = np.asarray(pvals)
    alphaf = alpha  # Notation?

    if not is_sorted:
        sortind = np.argsort(pvals)
        pvals = np.take(pvals, sortind)

    ntests = len(pvals)
    alphacSidak = 1 - np.power((1. - alphaf), 1. / ntests)
    alphacBonf = alphaf / float(ntests)
    if method.lower() in ['b', 'bonf', 'bonferroni']:
        reject = pvals <= alphacBonf
        pvals_corrected = pvals * float(ntests)

    elif method.lower() in ['s', 'sidak']:
        reject = pvals <= alphacSidak
        pvals_corrected = 1 - np.power((1. - pvals), ntests)

    elif method.lower() in ['hs', 'holm-sidak']:
        alphacSidak_all = 1 - np.power((1. - alphaf),
                                       1. / np.arange(ntests, 0, -1))
        notreject = pvals > alphacSidak_all
        del alphacSidak_all

        nr_index = np.nonzero(notreject)[0]
        if nr_index.size == 0:
            # nonreject is empty, all rejected
            notrejectmin = len(pvals)
        else:
            notrejectmin = np.min(nr_index)
        notreject[notrejectmin:] = True
        reject = ~notreject
        del notreject

        pvals_corrected_raw = 1 - np.power((1. - pvals),
                                           np.arange(ntests, 0, -1))
        pvals_corrected = np.maximum.accumulate(pvals_corrected_raw)
        del pvals_corrected_raw

    elif method.lower() in ['h', 'holm']:
        notreject = pvals > alphaf / np.arange(ntests, 0, -1)
        nr_index = np.nonzero(notreject)[0]
        if nr_index.size == 0:
            # nonreject is empty, all rejected
            notrejectmin = len(pvals)
        else:
            notrejectmin = np.min(nr_index)
        notreject[notrejectmin:] = True
        reject = ~notreject
        pvals_corrected_raw = pvals * np.arange(ntests, 0, -1)
        pvals_corrected = np.maximum.accumulate(pvals_corrected_raw)
        del pvals_corrected_raw
        gc.collect()

    elif method.lower() in ['sh', 'simes-hochberg']:
        alphash = alphaf / np.arange(ntests, 0, -1)
        reject = pvals <= alphash
        rejind = np.nonzero(reject)
        if rejind[0].size > 0:
            rejectmax = np.max(np.nonzero(reject))
            reject[:rejectmax] = True
        pvals_corrected_raw = np.arange(ntests, 0, -1) * pvals
        pvals_corrected = np.minimum.accumulate(pvals_corrected_raw[::-1])[::-1]
        del pvals_corrected_raw

    elif method.lower() in ['ho', 'hommel']:
        # we need a copy because we overwrite it in a loop
        a = pvals.copy()
        for m in range(ntests, 1, -1):
            cim = np.min(m * pvals[-m:] / np.arange(1, m + 1.))
            a[-m:] = np.maximum(a[-m:], cim)
            a[:-m] = np.maximum(a[:-m], np.minimum(m * pvals[:-m], cim))
        pvals_corrected = a
        reject = a <= alphaf

    elif method.lower() in ['fdr_bh', 'fdr_i', 'fdr_p', 'fdri', 'fdrp']:
        # delegate, call with sorted pvals
        reject, pvals_corrected = fdrcorrection(pvals, alpha=alpha,
                                                method='indep',
                                                is_sorted=True)
    elif method.lower() in ['fdr_by', 'fdr_n', 'fdr_c', 'fdrn', 'fdrcorr']:
        # delegate, call with sorted pvals
        reject, pvals_corrected = fdrcorrection(pvals, alpha=alpha,
                                                method='n',
                                                is_sorted=True)
    elif method.lower() in ['fdr_tsbky', 'fdr_2sbky', 'fdr_twostage']:
        # delegate, call with sorted pvals
        reject, pvals_corrected = fdrcorrection_twostage(pvals, alpha=alpha,
                                                         method='bky',
                                                         is_sorted=True)[:2]
    elif method.lower() in ['fdr_tsbh', 'fdr_2sbh']:
        # delegate, call with sorted pvals
        reject, pvals_corrected = fdrcorrection_twostage(pvals, alpha=alpha,
                                                         method='bh',
                                                         is_sorted=True)[:2]

    elif method.lower() in ['fdr_gbs']:
        # adaptive step-down procedure of Gavrilov, Benjamini and Sarkar,
        # Annals of Statistics 2009
        ii = np.arange(1, ntests + 1)
        q = (ntests + 1. - ii) / ii * pvals / (1. - pvals)
        pvals_corrected_raw = np.maximum.accumulate(q)  # monotone ('up') requirement

        pvals_corrected = np.minimum.accumulate(pvals_corrected_raw[::-1])[::-1]
        del pvals_corrected_raw
        reject = pvals_corrected <= alpha

    else:
        raise ValueError('method not recognized: %s' % method)

    if pvals_corrected is not None:  # not necessary anymore
        pvals_corrected[pvals_corrected > 1] = 1
    if is_sorted or returnsorted:
        return reject, pvals_corrected, alphacSidak, alphacBonf
    else:
        pvals_corrected_ = np.empty_like(pvals_corrected)
        pvals_corrected_[sortind] = pvals_corrected
        del pvals_corrected
        reject_ = np.empty_like(reject)
        reject_[sortind] = reject
        return reject_, pvals_corrected_, alphacSidak, alphacBonf


def fdrcorrection(pvals, alpha=0.05, method='indep', is_sorted=False):
    '''P-value correction for false discovery rate.

    This covers Benjamini/Hochberg for independent or positively correlated
    tests and Benjamini/Yekutieli for general or negatively correlated tests.
    Both are available in the function `multipletests`, as ``method='fdr_bh'``
    and ``method='fdr_by'``, respectively.

    Parameters
    ----------
    pvals : array_like
        Set of p-values of the individual tests.
    alpha : float
        Error rate.
    method : {'indep', 'negcorr'}

    Returns
    -------
    rejected : ndarray, bool
        True if a hypothesis is rejected, False if not.
    pvalue-corrected : ndarray
        P-values adjusted for multiple hypothesis testing to limit FDR.

    Notes
    -----
    If there is prior information on the fraction of true null hypotheses,
    then alpha should be set to ``alpha * m / m_0``, where m is the number of
    tests, given by the p-values, and m_0 is an estimate of the number of
    true null hypotheses. (See Benjamini, Krieger and Yekutieli.)

    The two-step method of Benjamini, Krieger and Yekutieli that estimates
    the number of false hypotheses will be available (soon).

    Method names can be abbreviated to the first letter, 'i' or 'p' for
    fdr_bh and 'n' for fdr_by.
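
    Examples
    --------
    A small illustrative sketch (arbitrary p-values):

    >>> import numpy as np
    >>> rejected, pvals_corr = fdrcorrection(
    ...     np.array([0.001, 0.01, 0.04, 0.3]), alpha=0.05)
    >>> rejected
    array([ True,  True, False, False])
    >>> pvals_corr
    array([0.004     , 0.02      , 0.05333333, 0.3       ])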
    '''
    pvals = np.asarray(pvals)

    if not is_sorted:
        pvals_sortind = np.argsort(pvals)
        pvals_sorted = np.take(pvals, pvals_sortind)
    else:
        pvals_sorted = pvals  # alias

    if method in ['i', 'indep', 'p', 'poscorr']:
        ecdffactor = _ecdf(pvals_sorted)
    elif method in ['n', 'negcorr']:
        cm = np.sum(1. / np.arange(1, len(pvals_sorted) + 1))  # corrected this
        ecdffactor = _ecdf(pvals_sorted) / cm
    else:
        raise ValueError('only indep and negcorr implemented')
    reject = pvals_sorted <= ecdffactor * alpha
    if reject.any():
        rejectmax = max(np.nonzero(reject)[0])
        reject[:rejectmax] = True

    pvals_corrected_raw = pvals_sorted / ecdffactor
    pvals_corrected = np.minimum.accumulate(pvals_corrected_raw[::-1])[::-1]
    del pvals_corrected_raw
    pvals_corrected[pvals_corrected > 1] = 1
    if not is_sorted:
        pvals_corrected_ = np.empty_like(pvals_corrected)
        pvals_corrected_[pvals_sortind] = pvals_corrected
        del pvals_corrected
        reject_ = np.empty_like(reject)
        reject_[pvals_sortind] = reject
        return reject_, pvals_corrected_
    else:
        return reject, pvals_corrected


def fdrcorrection_twostage(pvals, alpha=0.05, method='bky', iter=False,
                           is_sorted=False):
    '''(Iterated) two-stage linear step-up procedure with estimation of the
    number of true hypotheses.

    Benjamini, Krieger and Yekutieli, procedure in Definition 6.

    Parameters
    ----------
    pvals : array_like
        Set of p-values of the individual tests.
    alpha : float
        Error rate.
    method : {'bky', 'bh'}
        See Notes for details.

        * 'bky' - implements the procedure in Definition 6 of Benjamini,
          Krieger and Yekutieli 2006
        * 'bh' - the two-stage method of Benjamini and Hochberg
    iter : bool
        If True, iterate the procedure until the number of rejections no
        longer changes; with the default ``iter=False`` only the simple
        two-stage procedure is applied.

    Returns
    -------
    rejected : ndarray, bool
        True if a hypothesis is rejected, False if not.
    pvalue-corrected : ndarray
        P-values adjusted for multiple hypothesis testing to limit FDR.
    m0 : int
        ntest - rej, estimated number of true hypotheses.
    alpha_stages : list of floats
        A list of the alphas that have been used at each stage.

    Notes
    -----
    The returned corrected p-values are specific to the given alpha; they
    cannot be used for a different alpha.

    The returned corrected p-values are from the last stage of the fdr_bh
    linear step-up procedure (fdrcorrection0 with method='indep') corrected
    for the estimated fraction of true hypotheses. This means that the
    rejection decision can be obtained with ``pval_corrected <= alpha``,
    where ``alpha`` is the original significance level. (Note: This has
    changed from earlier versions (<0.5.0) of statsmodels.)

    BKY described several other multi-stage methods, which would be easy to
    implement. However, in their simulation the simple two-stage method
    (with ``iter=False``) was the most robust to the presence of positive
    correlation.

    TODO: What should be returned?
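
    Examples
    --------
    A minimal illustrative sketch (arbitrary p-values; default 'bky'
    procedure):

    >>> import numpy as np
    >>> rej, pvalscorr, m0, stages = fdrcorrection_twostage(
    ...     np.array([0.001, 0.01, 0.04, 0.3]), alpha=0.05)
    >>> rej
    array([ True,  True,  True, False])
    >>> int(m0)  # estimated number of true null hypotheses
    1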
    '''
    pvals = np.asarray(pvals)

    if not is_sorted:
        pvals_sortind = np.argsort(pvals)
        pvals = np.take(pvals, pvals_sortind)

    ntests = len(pvals)
    if method == 'bky':
        fact = (1. + alpha)
        alpha_prime = alpha / fact
    elif method == 'bh':
        fact = 1.
        alpha_prime = alpha
    else:
        raise ValueError("only 'bky' and 'bh' are available as method")

    alpha_stages = [alpha_prime]
    rej, pvalscorr = fdrcorrection(pvals, alpha=alpha_prime, method='indep',
                                   is_sorted=True)
    r1 = rej.sum()
    if (r1 == 0) or (r1 == ntests):
        return rej, pvalscorr * fact, ntests - r1, alpha_stages
    ri_old = r1

    while True:
        ntests0 = 1.0 * ntests - ri_old
        alpha_star = alpha_prime * ntests / ntests0
        alpha_stages.append(alpha_star)
        # print ntests0, alpha_star
        rej, pvalscorr = fdrcorrection(pvals, alpha=alpha_star, method='indep',
                                       is_sorted=True)
        ri = rej.sum()
        if (not iter) or ri == ri_old:
            break
        elif ri < ri_old:
            # prevent cycles and endless loops
            raise RuntimeError("oops - should not be here")
        ri_old = ri

    # make adjustment to pvalscorr to reflect estimated number of non-null cases
    # decision is then pvalscorr < alpha (or <=)
    pvalscorr *= ntests0 * 1.0 / ntests
    if method == 'bky':
        pvalscorr *= (1. + alpha)

    if not is_sorted:
        pvalscorr_ = np.empty_like(pvalscorr)
        pvalscorr_[pvals_sortind] = pvalscorr
        del pvalscorr
        reject = np.empty_like(rej)
        reject[pvals_sortind] = rej
        return reject, pvalscorr_, ntests - ri, alpha_stages
    else:
        return rej, pvalscorr, ntests - ri, alpha_stages


def local_fdr(zscores, null_proportion=1.0, null_pdf=None, deg=7,
              nbins=30):
    """
    Calculate local FDR values for a list of Z-scores.

    Parameters
    ----------
    zscores : array_like
        A vector of Z-scores.
    null_proportion : float
        The assumed proportion of true null hypotheses.
    null_pdf : function mapping reals to positive reals
        The density of null Z-scores; if None, use the standard normal.
    deg : int
        The maximum exponent in the polynomial expansion of the
        density of non-null Z-scores.
    nbins : int
        The number of bins for estimating the marginal density
        of Z-scores.

    Returns
    -------
    fdr : array_like
        A vector of FDR values.

    References
    ----------
    B Efron (2008). Microarrays, Empirical Bayes, and the Two-Groups
    Model. Statistical Science 23:1, 1-22.

    Examples
    --------
    Basic use (the null Z-scores are taken to be standard normal):

    >>> from statsmodels.stats.multitest import local_fdr
    >>> import numpy as np
    >>> zscores = np.random.randn(30)
    >>> fdr = local_fdr(zscores)

    Use a Gaussian null distribution estimated from the data:

    >>> null = NullDistribution(zscores)
    >>> fdr = local_fdr(zscores, null_pdf=null.pdf)
    """

    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod.generalized_linear_model import families
    from statsmodels.regression.linear_model import OLS

    # Bins for Poisson modeling of the marginal Z-score density
    minz = min(zscores)
    maxz = max(zscores)
    bins = np.linspace(minz, maxz, nbins)

    # Bin counts
    zhist = np.histogram(zscores, bins)[0]

    # Bin centers
    zbins = (bins[:-1] + bins[1:]) / 2

    # The design matrix at bin centers
    dmat = np.vander(zbins, deg + 1)

    # Use this to get starting values for Poisson regression
    md = OLS(np.log(1 + zhist), dmat).fit()

    # Poisson regression
    md = GLM(zhist, dmat, family=families.Poisson()).fit(start_params=md.params)
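
    # (Illustrative note, not in the original source: fitting a Poisson GLM
    # to histogram counts to obtain a smooth density estimate is commonly
    # known as Lindsey's method; see the Efron (2008) reference above.)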

    # The design matrix for all Z-scores
    dmat_full = np.vander(zscores, deg + 1)

    # The height of the estimated marginal density of Z-scores,
    # evaluated at every observed Z-score.
    fz = md.predict(dmat_full) / (len(zscores) * (bins[1] - bins[0]))

    # The null density.
    if null_pdf is None:
        f0 = np.exp(-0.5 * zscores**2) / np.sqrt(2 * np.pi)
    else:
        f0 = null_pdf(zscores)

    # The local FDR values
    fdr = null_proportion * f0 / fz

    fdr = np.clip(fdr, 0, 1)

    return fdr


class NullDistribution(object):
    """
    Estimate a Gaussian distribution for the null Z-scores.

    The observed Z-scores consist of both null and non-null values.
    The fitted distribution of null Z-scores is Gaussian, but may have
    a non-zero mean and/or non-unit scale.

    Parameters
    ----------
    zscores : array_like
        The observed Z-scores.
    null_lb : float
        Z-scores between `null_lb` and `null_ub` are all considered to be
        true null hypotheses.
    null_ub : float
        See `null_lb`.
    estimate_mean : bool
        If True, estimate the mean of the distribution. If False, the
        mean is fixed at zero.
    estimate_scale : bool
        If True, estimate the scale of the distribution. If False, the
        scale parameter is fixed at 1.
    estimate_null_proportion : bool
        If True, estimate the proportion of true null hypotheses (i.e.
        the proportion of Z-scores with expected value zero). If False,
        this parameter is fixed at 1.

    Attributes
    ----------
    mean : float
        The estimated mean of the empirical null distribution.
    sd : float
        The estimated standard deviation of the empirical null distribution.
    null_proportion : float
        The estimated proportion of true null hypotheses among all hypotheses.

    References
    ----------
    B Efron (2008). Microarrays, Empirical Bayes, and the Two-Groups
    Model. Statistical Science 23:1, 1-22.

    Notes
    -----
    See also:
    http://nipy.org/nipy/labs/enn.html#nipy.algorithms.statistics.empirical_pvalue.NormalEmpiricalNull.fdr
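
    Examples
    --------
    A hedged usage sketch (random Z-scores for illustration only):

    >>> import numpy as np
    >>> zs = np.random.randn(500)
    >>> null = NullDistribution(zs)
    >>> fdr = local_fdr(zs, null_pdf=null.pdf)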
577 """

    def __init__(self, zscores, null_lb=-1, null_ub=1, estimate_mean=True,
                 estimate_scale=True, estimate_null_proportion=False):

        # Extract the null Z-scores
        ii = np.flatnonzero((zscores >= null_lb) & (zscores <= null_ub))
        if len(ii) == 0:
            raise RuntimeError("No Z-scores fall between null_lb and null_ub")
        zscores0 = zscores[ii]

        # Number of Z-scores, and null Z-scores
        n_zs, n_zs0 = len(zscores), len(zscores0)

        # Unpack and transform the parameters to the natural scale, hold
        # parameters fixed as specified.
        def xform(params):

            mean = 0.
            sd = 1.
            prob = 1.

            ii = 0
            if estimate_mean:
                mean = params[ii]
                ii += 1
            if estimate_scale:
                sd = np.exp(params[ii])
                ii += 1
            if estimate_null_proportion:
                prob = 1 / (1 + np.exp(-params[ii]))

            return mean, sd, prob

        from scipy.stats.distributions import norm

        def fun(params):
            """
            Negative log-likelihood of Z-scores.

            The function has three arguments, packed into a vector:

            mean : location parameter
            logscale : log of the scale parameter
            logitprop : logit of the proportion of true nulls

            The implementation follows section 4 from Efron 2008.
            """

            d, s, p = xform(params)

            # Mass within the central region
            central_mass = (norm.cdf((null_ub - d) / s) -
                            norm.cdf((null_lb - d) / s))

            # Probability that a Z-score is null and is in the central region
            cp = p * central_mass

            # Binomial term
            rval = n_zs0 * np.log(cp) + (n_zs - n_zs0) * np.log(1 - cp)

            # Truncated Gaussian term for null Z-scores
            zv = (zscores0 - d) / s
            rval += np.sum(-zv**2 / 2) - n_zs0 * np.log(s)
            rval -= n_zs0 * np.log(central_mass)

            return -rval

        # Estimate the parameters
        from scipy.optimize import minimize
        # starting values are mean = 0, scale = 1, p0 ~ 1
        mz = minimize(fun, np.r_[0., 0, 3], method="Nelder-Mead")
        mean, sd, prob = xform(mz['x'])

        self.mean = mean
        self.sd = sd
        self.null_proportion = prob

    # The fitted null density function
    def pdf(self, zscores):
        """
        Evaluates the fitted empirical null Z-score density.

        Parameters
        ----------
        zscores : scalar or array_like
            The point or points at which the density is to be
            evaluated.

        Returns
        -------
        The empirical null Z-score density evaluated at the given
        points.
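
        Examples
        --------
        A brief sketch, assuming ``null`` is an already fitted
        NullDistribution (illustrative; not in the original docstring):

        >>> density = null.pdf(np.array([-1.0, 0.0, 1.0]))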
665 """

        zval = (zscores - self.mean) / self.sd
        return np.exp(-0.5 * zval**2 - np.log(self.sd) - 0.5 * np.log(2 * np.pi))