# brainbox/task/_statsmodels.py

1""" 

2Multiple Testing and P-Value Correction 

3Author: Josef Perktold 

4License: BSD-3 

5Statsmodels 

6https://www.statsmodels.org/ 

7 

8 

9Copyright (C) 2006, Jonathan E. Taylor 

10All rights reserved. 

11 

12Copyright (c) 2006-2008 Scipy Developers. 

13All rights reserved. 

14 

15Copyright (c) 2009-2018 statsmodels Developers. 

16All rights reserved. 

17 

18 

19Redistribution and use in source and binary forms, with or without 

20modification, are permitted provided that the following conditions are met: 

21 

22 a. Redistributions of source code must retain the above copyright notice, 

23 this list of conditions and the following disclaimer. 

24 b. Redistributions in binary form must reproduce the above copyright 

25 notice, this list of conditions and the following disclaimer in the 

26 documentation and/or other materials provided with the distribution. 

27 c. Neither the name of statsmodels nor the names of its contributors 

28 may be used to endorse or promote products derived from this software 

29 without specific prior written permission. 

30 

31 

32THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

33AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

34IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

35ARE DISCLAIMED. IN NO EVENT SHALL STATSMODELS OR CONTRIBUTORS BE LIABLE FOR 

36ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 

37DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 

38SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 

39CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 

40LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 

41OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 

42DAMAGE. 

43""" 

44 

45 

46from collections import OrderedDict 

47 

48import numpy as np 

49 

50from ._knockoff import RegressionFDR 

51 

52__all__ = ['fdrcorrection', 'fdrcorrection_twostage', 'local_fdr', 

53 'multipletests', 'NullDistribution', 'RegressionFDR'] 

54 

55# ============================================== 

56# 

57# Part 1: Multiple Tests and P-Value Correction 

58# 

59# ============================================== 

60 

61 

62def _ecdf(x): 

63 '''no frills empirical cdf used in fdrcorrection 
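
    For an input of length n this returns the factors i/n, which for sorted
    p-values are the Benjamini-Hochberg scaling factors. For example:

    >>> _ecdf(np.array([0.1, 0.2, 0.3, 0.4]))
    array([0.25, 0.5 , 0.75, 1.  ])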

    '''
    nobs = len(x)
    return np.arange(1, nobs + 1) / float(nobs)


multitest_methods_names = {'b': 'Bonferroni',
                           's': 'Sidak',
                           'h': 'Holm',
                           'hs': 'Holm-Sidak',
                           'sh': 'Simes-Hochberg',
                           'ho': 'Hommel',
                           'fdr_bh': 'FDR Benjamini-Hochberg',
                           'fdr_by': 'FDR Benjamini-Yekutieli',
                           'fdr_tsbh': 'FDR 2-stage Benjamini-Hochberg',
                           'fdr_tsbky': 'FDR 2-stage Benjamini-Krieger-Yekutieli',
                           'fdr_gbs': 'FDR adaptive Gavrilov-Benjamini-Sarkar'
                           }

_alias_list = [['b', 'bonf', 'bonferroni'],
               ['s', 'sidak'],
               ['h', 'holm'],
               ['hs', 'holm-sidak'],
               ['sh', 'simes-hochberg'],
               ['ho', 'hommel'],
               ['fdr_bh', 'fdr_i', 'fdr_p', 'fdri', 'fdrp'],
               ['fdr_by', 'fdr_n', 'fdr_c', 'fdrn', 'fdrcorr'],
               ['fdr_tsbh', 'fdr_2sbh'],
               ['fdr_tsbky', 'fdr_2sbky', 'fdr_twostage'],
               ['fdr_gbs']
               ]


multitest_alias = OrderedDict()
for m in _alias_list:
    multitest_alias[m[0]] = m[0]
    for a in m[1:]:
        multitest_alias[a] = m[0]
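# Each alias maps to the canonical method key (the first entry of its list),
# e.g. multitest_alias['bonf'] == 'b' and
# multitest_alias['fdr_twostage'] == 'fdr_tsbky'.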



def multipletests(pvals, alpha=0.05, method='hs', is_sorted=False,
                  returnsorted=False):
    """
    Test results and p-value correction for multiple tests

    Parameters
    ----------
    pvals : array_like, 1-d
        uncorrected p-values. Must be 1-dimensional.
    alpha : float
        FWER, family-wise error rate, e.g. 0.1
    method : str
        Method used for testing and adjustment of pvalues. Can be either the
        full name or initial letters. Available methods are:

        - `bonferroni` : one-step correction
        - `sidak` : one-step correction
        - `holm-sidak` : step-down method using Sidak adjustments
        - `holm` : step-down method using Bonferroni adjustments
        - `simes-hochberg` : step-up method (independent)
        - `hommel` : closed method based on Simes tests (non-negative)
        - `fdr_bh` : Benjamini/Hochberg (non-negative)
        - `fdr_by` : Benjamini/Yekutieli (negative)
        - `fdr_tsbh` : two stage fdr correction (non-negative)
        - `fdr_tsbky` : two stage fdr correction (non-negative)
    is_sorted : bool
        If False (default), the p-values will be sorted, but the corrected
        pvalues are in the original order. If True, then it is assumed that
        the pvalues are already sorted in ascending order.
    returnsorted : bool
        not tested, return sorted p-values instead of original sequence

    Returns
    -------
    reject : ndarray, boolean
        true for hypothesis that can be rejected for given alpha
    pvals_corrected : ndarray
        p-values corrected for multiple tests
    alphacSidak : float
        corrected alpha for Sidak method
    alphacBonf : float
        corrected alpha for Bonferroni method

    Notes
    -----
    There may be API changes for this function in the future.

    Except for 'fdr_twostage', the p-value correction is independent of the
    alpha specified as argument. In these cases the corrected p-values
    can also be compared with a different alpha. In the case of 'fdr_twostage',
    the corrected p-values are specific to the given alpha, see
    ``fdrcorrection_twostage``.

    The 'fdr_gbs' procedure is not verified against another package, p-values
    are derived from scratch and are not derived in the reference. In Monte
    Carlo experiments the method worked correctly and maintained the false
    discovery rate.

    All procedures that are included control FWER or FDR in the independent
    case, and most are robust in the positively correlated case.

    `fdr_gbs`: high power, fdr control for independent case and only small
    violation in positively correlated case

    **Timing**:

    Most of the time with large arrays is spent in `argsort`. When
    we want to calculate the p-values for several methods, then it is more
    efficient to presort the pvalues, and put the results back into the
    original order outside of the function.

    Method='hommel' is very slow for large arrays, since it requires the
    evaluation of n partitions, where n is the number of p-values.
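
    Examples
    --------
    Illustrative example; the p-values below are made up:

    >>> import numpy as np
    >>> pvals = np.array([0.001, 0.008, 0.039, 0.041, 0.2])
    >>> reject, pvals_corr, alphacSidak, alphacBonf = multipletests(
    ...     pvals, alpha=0.05, method='fdr_bh')
    >>> print(bool(reject[0]), bool(reject[-1]))
    True False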

165 """ 

166 import gc 

167 pvals = np.asarray(pvals) 

168 alphaf = alpha # Notation ? 

169 

170 if not is_sorted: 

171 sortind = np.argsort(pvals) 

172 pvals = np.take(pvals, sortind) 

173 

174 ntests = len(pvals) 

175 alphacSidak = 1 - np.power((1. - alphaf), 1. / ntests) 

176 alphacBonf = alphaf / float(ntests) 

177 if method.lower() in ['b', 'bonf', 'bonferroni']: 

178 reject = pvals <= alphacBonf 

179 pvals_corrected = pvals * float(ntests) 

180 

181 elif method.lower() in ['s', 'sidak']: 

182 reject = pvals <= alphacSidak 

183 pvals_corrected = 1 - np.power((1. - pvals), ntests) 

184 

185 elif method.lower() in ['hs', 'holm-sidak']: 

186 alphacSidak_all = 1 - np.power((1. - alphaf), 

187 1. / np.arange(ntests, 0, -1)) 

188 notreject = pvals > alphacSidak_all 

189 del alphacSidak_all 

190 

191 nr_index = np.nonzero(notreject)[0] 

192 if nr_index.size == 0: 

193 # nonreject is empty, all rejected 

194 notrejectmin = len(pvals) 

195 else: 

196 notrejectmin = np.min(nr_index) 

197 notreject[notrejectmin:] = True 

198 reject = ~notreject 

199 del notreject 

200 

201 pvals_corrected_raw = 1 - np.power((1. - pvals), 

202 np.arange(ntests, 0, -1)) 

203 pvals_corrected = np.maximum.accumulate(pvals_corrected_raw) 

204 del pvals_corrected_raw 

205 

206 elif method.lower() in ['h', 'holm']: 

207 notreject = pvals > alphaf / np.arange(ntests, 0, -1) 

208 nr_index = np.nonzero(notreject)[0] 

209 if nr_index.size == 0: 

210 # nonreject is empty, all rejected 

211 notrejectmin = len(pvals) 

212 else: 

213 notrejectmin = np.min(nr_index) 

214 notreject[notrejectmin:] = True 

215 reject = ~notreject 

216 pvals_corrected_raw = pvals * np.arange(ntests, 0, -1) 

217 pvals_corrected = np.maximum.accumulate(pvals_corrected_raw) 

218 del pvals_corrected_raw 

219 gc.collect() 

220 

221 elif method.lower() in ['sh', 'simes-hochberg']: 

222 alphash = alphaf / np.arange(ntests, 0, -1) 

223 reject = pvals <= alphash 

224 rejind = np.nonzero(reject) 

225 if rejind[0].size > 0: 

226 rejectmax = np.max(np.nonzero(reject)) 

227 reject[:rejectmax] = True 

228 pvals_corrected_raw = np.arange(ntests, 0, -1) * pvals 

229 pvals_corrected = np.minimum.accumulate(pvals_corrected_raw[::-1])[::-1] 

230 del pvals_corrected_raw 

231 

232 elif method.lower() in ['ho', 'hommel']: 

233 # we need a copy because we overwrite it in a loop 

234 a = pvals.copy() 

235 for m in range(ntests, 1, -1): 

236 cim = np.min(m * pvals[-m:] / np.arange(1, m + 1.)) 

237 a[-m:] = np.maximum(a[-m:], cim) 

238 a[:-m] = np.maximum(a[:-m], np.minimum(m * pvals[:-m], cim)) 

239 pvals_corrected = a 

240 reject = a <= alphaf 

241 

242 elif method.lower() in ['fdr_bh', 'fdr_i', 'fdr_p', 'fdri', 'fdrp']: 

243 # delegate, call with sorted pvals 

244 reject, pvals_corrected = fdrcorrection(pvals, alpha=alpha, 

245 method='indep', 

246 is_sorted=True) 

247 elif method.lower() in ['fdr_by', 'fdr_n', 'fdr_c', 'fdrn', 'fdrcorr']: 

248 # delegate, call with sorted pvals 

249 reject, pvals_corrected = fdrcorrection(pvals, alpha=alpha, 

250 method='n', 

251 is_sorted=True) 

252 elif method.lower() in ['fdr_tsbky', 'fdr_2sbky', 'fdr_twostage']: 

253 # delegate, call with sorted pvals 

254 reject, pvals_corrected = fdrcorrection_twostage(pvals, alpha=alpha, 

255 method='bky', 

256 is_sorted=True)[:2] 

257 elif method.lower() in ['fdr_tsbh', 'fdr_2sbh']: 

258 # delegate, call with sorted pvals 

259 reject, pvals_corrected = fdrcorrection_twostage(pvals, alpha=alpha, 

260 method='bh', 

261 is_sorted=True)[:2] 

262 

263 elif method.lower() in ['fdr_gbs']: 

264 # adaptive stepdown in Gavrilov, Benjamini, Sarkar, Annals of Statistics 2009 

265 

266 ii = np.arange(1, ntests + 1) 

267 q = (ntests + 1. - ii) / ii * pvals / (1. - pvals) 

268 pvals_corrected_raw = np.maximum.accumulate(q) # up requirementd 

269 

270 pvals_corrected = np.minimum.accumulate(pvals_corrected_raw[::-1])[::-1] 

271 del pvals_corrected_raw 

272 reject = pvals_corrected <= alpha 

273 

274 else: 

275 raise ValueError('method not recognized') 

276 

277 if pvals_corrected is not None: # not necessary anymore 

278 pvals_corrected[pvals_corrected > 1] = 1 

279 if is_sorted or returnsorted: 

280 return reject, pvals_corrected, alphacSidak, alphacBonf 

281 else: 

282 pvals_corrected_ = np.empty_like(pvals_corrected) 

283 pvals_corrected_[sortind] = pvals_corrected 

284 del pvals_corrected 

285 reject_ = np.empty_like(reject) 

286 reject_[sortind] = reject 

287 return reject_, pvals_corrected_, alphacSidak, alphacBonf 



def fdrcorrection(pvals, alpha=0.05, method='indep', is_sorted=False):
    '''pvalue correction for false discovery rate

    This covers Benjamini/Hochberg for independent or positively correlated
    and Benjamini/Yekutieli for general or negatively correlated tests. Both
    are available in the function `multipletests`, as method=`fdr_bh`,
    resp. `fdr_by`.

    Parameters
    ----------
    pvals : array_like
        set of p-values of the individual tests.
    alpha : float
        error rate
    method : {'indep', 'negcorr'}
        'indep' corresponds to Benjamini/Hochberg, 'negcorr' to
        Benjamini/Yekutieli.
    is_sorted : bool
        If False (default), the p-values are sorted internally and the
        results are returned in the original order.

    Returns
    -------
    rejected : ndarray, bool
        True if a hypothesis is rejected, False if not
    pvalue-corrected : ndarray
        pvalues adjusted for multiple hypothesis testing to limit FDR

    Notes
    -----
    If there is prior information on the fraction of true hypotheses, then
    alpha should be set to ``alpha * m / m_0`` where m is the number of tests,
    given by the p-values, and m_0 is an estimate of the number of true
    hypotheses. (see Benjamini, Krieger and Yekutieli)

    The two-stage method of Benjamini, Krieger and Yekutieli that estimates
    the number of false hypotheses is implemented in
    ``fdrcorrection_twostage``.

    Method names can be abbreviated to the first letter, 'i' or 'p' for
    fdr_bh and 'n' for fdr_by.
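
    Examples
    --------
    Illustrative example with made-up p-values:

    >>> import numpy as np
    >>> rejected, pvals_corr = fdrcorrection(
    ...     np.array([0.001, 0.01, 0.02, 0.3]), alpha=0.05, method='indep')
    >>> print(int(rejected.sum()))
    3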

    '''
    pvals = np.asarray(pvals)

    if not is_sorted:
        pvals_sortind = np.argsort(pvals)
        pvals_sorted = np.take(pvals, pvals_sortind)
    else:
        pvals_sorted = pvals  # alias

    if method in ['i', 'indep', 'p', 'poscorr']:
        ecdffactor = _ecdf(pvals_sorted)
    elif method in ['n', 'negcorr']:
        # harmonic number c(m), the Benjamini/Yekutieli correction factor
        cm = np.sum(1. / np.arange(1, len(pvals_sorted) + 1))
        ecdffactor = _ecdf(pvals_sorted) / cm
    else:
        raise ValueError('only indep and negcorr implemented')
    reject = pvals_sorted <= ecdffactor * alpha
    if reject.any():
        rejectmax = max(np.nonzero(reject)[0])
        reject[:rejectmax] = True

    pvals_corrected_raw = pvals_sorted / ecdffactor
    pvals_corrected = np.minimum.accumulate(pvals_corrected_raw[::-1])[::-1]
    del pvals_corrected_raw
    pvals_corrected[pvals_corrected > 1] = 1
    if not is_sorted:
        pvals_corrected_ = np.empty_like(pvals_corrected)
        pvals_corrected_[pvals_sortind] = pvals_corrected
        del pvals_corrected
        reject_ = np.empty_like(reject)
        reject_[pvals_sortind] = reject
        return reject_, pvals_corrected_
    else:
        return reject, pvals_corrected



def fdrcorrection_twostage(pvals, alpha=0.05, method='bky', iter=False,
                           is_sorted=False):
    '''(iterated) two stage linear step-up procedure with estimation of
    number of true hypotheses

    Benjamini, Krieger and Yekutieli, procedure in Definition 6

    Parameters
    ----------
    pvals : array_like
        set of p-values of the individual tests.
    alpha : float
        error rate
    method : {'bky', 'bh'}
        see Notes for details

        * 'bky' - implements the procedure in Definition 6 of Benjamini,
          Krieger and Yekutieli 2006
        * 'bh' - the two stage method of Benjamini and Hochberg
    iter : bool
        If True, iterate the second stage until the number of rejections no
        longer changes; otherwise stop after a single second stage.

    Returns
    -------
    rejected : ndarray, bool
        True if a hypothesis is rejected, False if not
    pvalue-corrected : ndarray
        pvalues adjusted for multiple hypotheses testing to limit FDR
    m0 : int
        ntest - rej, estimated number of true hypotheses
    alpha_stages : list of floats
        A list of alphas that have been used at each stage

    Notes
    -----
    The returned corrected p-values are specific to the given alpha, they
    cannot be used for a different alpha.

    The returned corrected p-values are from the last stage of the fdr_bh
    linear step-up procedure (``fdrcorrection`` with method='indep')
    corrected for the estimated fraction of true hypotheses.
    This means that the rejection decision can be obtained with
    ``pval_corrected <= alpha``, where ``alpha`` is the original significance
    level.
    (Note: This has changed from earlier versions (<0.5.0) of statsmodels.)

    BKY described several other multi-stage methods, which would be easy to
    implement. However, in their simulation the simple two-stage method (with
    iter=False) was the most robust to the presence of positive correlation.

    TODO: What should be returned?
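
    Examples
    --------
    Illustrative example with made-up p-values:

    >>> import numpy as np
    >>> rej, pvalscorr, m0, alpha_stages = fdrcorrection_twostage(
    ...     np.array([0.001, 0.01, 0.02, 0.3]), alpha=0.05, method='bky')
    >>> print(int(rej.sum()), int(m0))
    3 1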

    '''
    pvals = np.asarray(pvals)

    if not is_sorted:
        pvals_sortind = np.argsort(pvals)
        pvals = np.take(pvals, pvals_sortind)

    ntests = len(pvals)
    if method == 'bky':
        fact = (1. + alpha)
        alpha_prime = alpha / fact
    elif method == 'bh':
        fact = 1.
        alpha_prime = alpha
    else:
        raise ValueError("only 'bky' and 'bh' are available as method")

    alpha_stages = [alpha_prime]
    rej, pvalscorr = fdrcorrection(pvals, alpha=alpha_prime, method='indep',
                                   is_sorted=True)
    r1 = rej.sum()
    if (r1 == 0) or (r1 == ntests):
        # nothing or everything was rejected in the first stage, so no
        # adaptive second stage is possible; only rescale the corrected
        # p-values (and still restore the original order below)
        pvalscorr *= fact
        ri = r1
    else:
        ri_old = r1

        while True:
            ntests0 = 1.0 * ntests - ri_old
            alpha_star = alpha_prime * ntests / ntests0
            alpha_stages.append(alpha_star)
            rej, pvalscorr = fdrcorrection(pvals, alpha=alpha_star,
                                           method='indep', is_sorted=True)
            ri = rej.sum()
            if (not iter) or ri == ri_old:
                break
            elif ri < ri_old:
                # prevent cycles and endless loops
                raise RuntimeError("should not be here: the number of "
                                   "rejections decreased between iterations")
            ri_old = ri

        # make adjustment to pvalscorr to reflect estimated number of
        # non-null cases; decision is then pvalscorr <= alpha
        pvalscorr *= ntests0 * 1.0 / ntests
        if method == 'bky':
            pvalscorr *= (1. + alpha)

    if not is_sorted:
        pvalscorr_ = np.empty_like(pvalscorr)
        pvalscorr_[pvals_sortind] = pvalscorr
        del pvalscorr
        reject = np.empty_like(rej)
        reject[pvals_sortind] = rej
        return reject, pvalscorr_, ntests - ri, alpha_stages
    else:
        return rej, pvalscorr, ntests - ri, alpha_stages



def local_fdr(zscores, null_proportion=1.0, null_pdf=None, deg=7,
              nbins=30):
    """
    Calculate local FDR values for a list of Z-scores.

    Parameters
    ----------
    zscores : array_like
        A vector of Z-scores
    null_proportion : float
        The assumed proportion of true null hypotheses
    null_pdf : function mapping reals to positive reals
        The density of null Z-scores; if None, use standard normal
    deg : int
        The maximum exponent in the polynomial expansion of the
        density of non-null Z-scores
    nbins : int
        The number of bins for estimating the marginal density
        of Z-scores.

    Returns
    -------
    fdr : array_like
        A vector of FDR values

    References
    ----------
    B Efron (2008). Microarrays, Empirical Bayes, and the Two-Groups
    Model. Statistical Science 23:1, 1-22.

    Examples
    --------
    Basic use (the null Z-scores are taken to be standard normal):

    >>> from statsmodels.stats.multitest import local_fdr
    >>> import numpy as np
    >>> zscores = np.random.randn(30)
    >>> fdr = local_fdr(zscores)

    Use a Gaussian null distribution estimated from the data:

    >>> null = NullDistribution(zscores)
    >>> fdr = local_fdr(zscores, null_pdf=null.pdf)
    """


    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod.generalized_linear_model import families
    from statsmodels.regression.linear_model import OLS

    # Bins for Poisson modeling of the marginal Z-score density
    minz = min(zscores)
    maxz = max(zscores)
    bins = np.linspace(minz, maxz, nbins)

    # Bin counts
    zhist = np.histogram(zscores, bins)[0]

    # Bin centers
    zbins = (bins[:-1] + bins[1:]) / 2

    # The design matrix at bin centers
    dmat = np.vander(zbins, deg + 1)

    # Use this to get starting values for Poisson regression
    md = OLS(np.log(1 + zhist), dmat).fit()

    # Poisson regression
    md = GLM(zhist, dmat, family=families.Poisson()).fit(start_params=md.params)

    # The design matrix for all Z-scores
    dmat_full = np.vander(zscores, deg + 1)

    # The height of the estimated marginal density of Z-scores,
    # evaluated at every observed Z-score.
    fz = md.predict(dmat_full) / (len(zscores) * (bins[1] - bins[0]))

    # The null density.
    if null_pdf is None:
        f0 = np.exp(-0.5 * zscores**2) / np.sqrt(2 * np.pi)
    else:
        f0 = null_pdf(zscores)

    # The local FDR values
    fdr = null_proportion * f0 / fz

    fdr = np.clip(fdr, 0, 1)

    return fdr



class NullDistribution(object):
    """
    Estimate a Gaussian distribution for the null Z-scores.

    The observed Z-scores consist of both null and non-null values.
    The fitted distribution of null Z-scores is Gaussian, but may have
    non-zero mean and/or non-unit scale.

    Parameters
    ----------
    zscores : array_like
        The observed Z-scores.
    null_lb : float
        Z-scores between `null_lb` and `null_ub` are all considered to be
        true null hypotheses.
    null_ub : float
        See `null_lb`.
    estimate_mean : bool
        If True, estimate the mean of the distribution. If False, the
        mean is fixed at zero.
    estimate_scale : bool
        If True, estimate the scale of the distribution. If False, the
        scale parameter is fixed at 1.
    estimate_null_proportion : bool
        If True, estimate the proportion of true null hypotheses (i.e.
        the proportion of z-scores with expected value zero). If False,
        this parameter is fixed at 1.

    Attributes
    ----------
    mean : float
        The estimated mean of the empirical null distribution
    sd : float
        The estimated standard deviation of the empirical null distribution
    null_proportion : float
        The estimated proportion of true null hypotheses among all hypotheses

    References
    ----------
    B Efron (2008). Microarrays, Empirical Bayes, and the Two-Groups
    Model. Statistical Science 23:1, 1-22.

    Notes
    -----
    See also:
    http://nipy.org/nipy/labs/enn.html#nipy.algorithms.statistics.empirical_pvalue.NormalEmpiricalNull.fdr
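
    Examples
    --------
    Illustrative sketch; the fitted values depend on the random draw, so the
    output is not shown:

    >>> import numpy as np
    >>> zs = np.concatenate([np.random.randn(900), 3 + np.random.randn(100)])
    >>> null = NullDistribution(zs)
    >>> null.mean, null.sd, null.null_proportion  # doctest: +SKIP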

577 """ 

578 

579 def __init__(self, zscores, null_lb=-1, null_ub=1, estimate_mean=True, 

580 estimate_scale=True, estimate_null_proportion=False): 

581 

582 # Extract the null z-scores 

583 ii = np.flatnonzero((zscores >= null_lb) & (zscores <= null_ub)) 

584 if len(ii) == 0: 

585 raise RuntimeError("No Z-scores fall between null_lb and null_ub") 

586 zscores0 = zscores[ii] 

587 

588 # Number of Z-scores, and null Z-scores 

589 n_zs, n_zs0 = len(zscores), len(zscores0) 

590 

591 # Unpack and transform the parameters to the natural scale, hold 

592 # parameters fixed as specified. 

593 def xform(params): 

594 

595 mean = 0. 

596 sd = 1. 

597 prob = 1. 

598 

599 ii = 0 

600 if estimate_mean: 

601 mean = params[ii] 

602 ii += 1 

603 if estimate_scale: 

604 sd = np.exp(params[ii]) 

605 ii += 1 

606 if estimate_null_proportion: 

607 prob = 1 / (1 + np.exp(-params[ii])) 

608 

609 return mean, sd, prob 

610 

611 from scipy.stats.distributions import norm 

612 

613 def fun(params): 

614 """ 

615 Negative log-likelihood of z-scores. 

616 The function has three arguments, packed into a vector: 

617 mean : location parameter 

618 logscale : log of the scale parameter 

619 logitprop : logit of the proportion of true nulls 

620 The implementation follows section 4 from Efron 2008. 

621 """ 

622 

623 d, s, p = xform(params) 

624 

625 # Mass within the central region 

626 central_mass = (norm.cdf((null_ub - d) / s) - 

627 norm.cdf((null_lb - d) / s)) 

628 

629 # Probability that a Z-score is null and is in the central region 

630 cp = p * central_mass 

631 

632 # Binomial term 

633 rval = n_zs0 * np.log(cp) + (n_zs - n_zs0) * np.log(1 - cp) 

634 

635 # Truncated Gaussian term for null Z-scores 

636 zv = (zscores0 - d) / s 

637 rval += np.sum(-zv**2 / 2) - n_zs0 * np.log(s) 

638 rval -= n_zs0 * np.log(central_mass) 

639 

640 return -rval 

641 

642 # Estimate the parameters 

643 from scipy.optimize import minimize 

644 # starting values are mean = 0, scale = 1, p0 ~ 1 

645 mz = minimize(fun, np.r_[0., 0, 3], method="Nelder-Mead") 

646 mean, sd, prob = xform(mz['x']) 

647 

648 self.mean = mean 

649 self.sd = sd 

650 self.null_proportion = prob 

651 

652 # The fitted null density function 

653 def pdf(self, zscores): 

654 """ 

655 Evaluates the fitted empirical null Z-score density. 

656 Parameters 

657 ---------- 

658 zscores : scalar or array_like 

659 The point or points at which the density is to be 

660 evaluated. 

661 Returns 

662 ------- 

663 The empirical null Z-score density evaluated at the given 

664 points. 

665 """ 

666 

667 zval = (zscores - self.mean) / self.sd 

668 return np.exp(-0.5 * zval**2 - np.log(self.sd) - 0.5 * np.log(2 * np.pi))