# brainbox/task/_statsmodels.py

1""" 

2Multiple Testing and P-Value Correction 

3Author: Josef Perktold 

4License: BSD-3 

5Statsmodels 

6https://www.statsmodels.org/ 

7 

8 

9Copyright (C) 2006, Jonathan E. Taylor 

10All rights reserved. 

11 

12Copyright (c) 2006-2008 Scipy Developers. 

13All rights reserved. 

14 

15Copyright (c) 2009-2018 statsmodels Developers. 

16All rights reserved. 

17 

18 

19Redistribution and use in source and binary forms, with or without 

20modification, are permitted provided that the following conditions are met: 

21 

22 a. Redistributions of source code must retain the above copyright notice, 

23 this list of conditions and the following disclaimer. 

24 b. Redistributions in binary form must reproduce the above copyright 

25 notice, this list of conditions and the following disclaimer in the 

26 documentation and/or other materials provided with the distribution. 

27 c. Neither the name of statsmodels nor the names of its contributors 

28 may be used to endorse or promote products derived from this software 

29 without specific prior written permission. 

30 

31 

32THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

33AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

34IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

35ARE DISCLAIMED. IN NO EVENT SHALL STATSMODELS OR CONTRIBUTORS BE LIABLE FOR 

36ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 

37DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 

38SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 

39CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 

40LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 

41OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 

42DAMAGE. 

43""" 

44 

45 

46from collections import OrderedDict 

47 

48import numpy as np 

49 

50from ._knockoff import RegressionFDR 

51 

52__all__ = ['fdrcorrection', 'fdrcorrection_twostage', 'local_fdr', 

53 'multipletests', 'NullDistribution', 'RegressionFDR'] 

54 

55# ============================================== 

56# 

57# Part 1: Multiple Tests and P-Value Correction 

58# 

59# ============================================== 

60 

61 

62def _ecdf(x): 

63 '''no frills empirical cdf used in fdrcorrection 
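
    For an input of length n this returns the factors i/n, which for sorted
    p-values are the Benjamini-Hochberg scaling factors. For example:

    >>> _ecdf(np.array([0.1, 0.2, 0.3, 0.4]))
    array([0.25, 0.5 , 0.75, 1.  ])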

    '''
    nobs = len(x)
    return np.arange(1, nobs + 1) / float(nobs)


multitest_methods_names = {'b': 'Bonferroni',
                           's': 'Sidak',
                           'h': 'Holm',
                           'hs': 'Holm-Sidak',
                           'sh': 'Simes-Hochberg',
                           'ho': 'Hommel',
                           'fdr_bh': 'FDR Benjamini-Hochberg',
                           'fdr_by': 'FDR Benjamini-Yekutieli',
                           'fdr_tsbh': 'FDR 2-stage Benjamini-Hochberg',
                           'fdr_tsbky': 'FDR 2-stage Benjamini-Krieger-Yekutieli',
                           'fdr_gbs': 'FDR adaptive Gavrilov-Benjamini-Sarkar'
                           }

_alias_list = [['b', 'bonf', 'bonferroni'],
               ['s', 'sidak'],
               ['h', 'holm'],
               ['hs', 'holm-sidak'],
               ['sh', 'simes-hochberg'],
               ['ho', 'hommel'],
               ['fdr_bh', 'fdr_i', 'fdr_p', 'fdri', 'fdrp'],
               ['fdr_by', 'fdr_n', 'fdr_c', 'fdrn', 'fdrcorr'],
               ['fdr_tsbh', 'fdr_2sbh'],
               ['fdr_tsbky', 'fdr_2sbky', 'fdr_twostage'],
               ['fdr_gbs']
               ]


multitest_alias = OrderedDict()
for m in _alias_list:
    multitest_alias[m[0]] = m[0]
    for a in m[1:]:
        multitest_alias[a] = m[0]
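# Each alias maps to the canonical method key (the first entry of its list),
# e.g. multitest_alias['bonf'] == 'b' and
# multitest_alias['fdr_twostage'] == 'fdr_tsbky'.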



def multipletests(pvals, alpha=0.05, method='hs', is_sorted=False,
                  returnsorted=False):
    """
    Test results and p-value correction for multiple tests

    Parameters
    ----------
    pvals : array_like, 1-d
        uncorrected p-values. Must be 1-dimensional.
    alpha : float
        FWER, family-wise error rate, e.g. 0.1
    method : str
        Method used for testing and adjustment of pvalues. Can be either the
        full name or initial letters. Available methods are:

        - `bonferroni` : one-step correction
        - `sidak` : one-step correction
        - `holm-sidak` : step-down method using Sidak adjustments
        - `holm` : step-down method using Bonferroni adjustments
        - `simes-hochberg` : step-up method (independent)
        - `hommel` : closed method based on Simes tests (non-negative)
        - `fdr_bh` : Benjamini/Hochberg (non-negative)
        - `fdr_by` : Benjamini/Yekutieli (negative)
        - `fdr_tsbh` : two stage fdr correction (non-negative)
        - `fdr_tsbky` : two stage fdr correction (non-negative)
    is_sorted : bool
        If False (default), the p-values will be sorted, but the corrected
        pvalues are in the original order. If True, then it is assumed that
        the pvalues are already sorted in ascending order.
    returnsorted : bool
        not tested, return sorted p-values instead of original sequence

    Returns
    -------
    reject : ndarray, boolean
        true for hypothesis that can be rejected for given alpha
    pvals_corrected : ndarray
        p-values corrected for multiple tests
    alphacSidak : float
        corrected alpha for Sidak method
    alphacBonf : float
        corrected alpha for Bonferroni method

    Notes
    -----
    There may be API changes for this function in the future.

    Except for 'fdr_twostage', the p-value correction is independent of the
    alpha specified as argument. In these cases the corrected p-values
    can also be compared with a different alpha. In the case of 'fdr_twostage',
    the corrected p-values are specific to the given alpha, see
    ``fdrcorrection_twostage``.

    The 'fdr_gbs' procedure is not verified against another package, p-values
    are derived from scratch and are not derived in the reference. In Monte
    Carlo experiments the method worked correctly and maintained the false
    discovery rate.

    All procedures that are included control FWER or FDR in the independent
    case, and most are robust in the positively correlated case.

    `fdr_gbs`: high power, fdr control for independent case and only small
    violation in positively correlated case

    **Timing**:

    Most of the time with large arrays is spent in `argsort`. When
    we want to calculate the p-values for several methods, then it is more
    efficient to presort the pvalues, and put the results back into the
    original order outside of the function.

    Method='hommel' is very slow for large arrays, since it requires the
    evaluation of n partitions, where n is the number of p-values.
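
    Examples
    --------
    Illustrative example; the p-values below are made up:

    >>> import numpy as np
    >>> pvals = np.array([0.001, 0.008, 0.039, 0.041, 0.2])
    >>> reject, pvals_corr, alphacSidak, alphacBonf = multipletests(
    ...     pvals, alpha=0.05, method='fdr_bh')
    >>> print(bool(reject[0]), bool(reject[-1]))
    True False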

165 """ 

166 import gc 

167 pvals = np.asarray(pvals) 

168 alphaf = alpha # Notation ? 

169 

170 if not is_sorted: 

171 sortind = np.argsort(pvals) 

172 pvals = np.take(pvals, sortind) 

173 

174 ntests = len(pvals) 

175 alphacSidak = 1 - np.power((1. - alphaf), 1. / ntests) 

176 alphacBonf = alphaf / float(ntests) 

177 if method.lower() in ['b', 'bonf', 'bonferroni']: 

178 reject = pvals <= alphacBonf 

179 pvals_corrected = pvals * float(ntests) 

180 

181 elif method.lower() in ['s', 'sidak']: 

182 reject = pvals <= alphacSidak 

183 pvals_corrected = 1 - np.power((1. - pvals), ntests) 

184 

185 elif method.lower() in ['hs', 'holm-sidak']: 

186 alphacSidak_all = 1 - np.power((1. - alphaf), 

187 1. / np.arange(ntests, 0, -1)) 

188 notreject = pvals > alphacSidak_all 

189 del alphacSidak_all 

190 

191 nr_index = np.nonzero(notreject)[0] 

192 if nr_index.size == 0: 

193 # nonreject is empty, all rejected 

194 notrejectmin = len(pvals) 

195 else: 

196 notrejectmin = np.min(nr_index) 

197 notreject[notrejectmin:] = True 

198 reject = ~notreject 

199 del notreject 

200 

201 pvals_corrected_raw = 1 - np.power((1. - pvals), 

202 np.arange(ntests, 0, -1)) 

203 pvals_corrected = np.maximum.accumulate(pvals_corrected_raw) 

204 del pvals_corrected_raw 

205 

206 elif method.lower() in ['h', 'holm']: 

207 notreject = pvals > alphaf / np.arange(ntests, 0, -1) 

208 nr_index = np.nonzero(notreject)[0] 

209 if nr_index.size == 0: 

210 # nonreject is empty, all rejected 

211 notrejectmin = len(pvals) 

212 else: 

213 notrejectmin = np.min(nr_index) 

214 notreject[notrejectmin:] = True 

215 reject = ~notreject 

216 pvals_corrected_raw = pvals * np.arange(ntests, 0, -1) 

217 pvals_corrected = np.maximum.accumulate(pvals_corrected_raw) 

218 del pvals_corrected_raw 

219 gc.collect() 

220 

221 elif method.lower() in ['sh', 'simes-hochberg']: 

222 alphash = alphaf / np.arange(ntests, 0, -1) 

223 reject = pvals <= alphash 

224 rejind = np.nonzero(reject) 

225 if rejind[0].size > 0: 

226 rejectmax = np.max(np.nonzero(reject)) 

227 reject[:rejectmax] = True 

228 pvals_corrected_raw = np.arange(ntests, 0, -1) * pvals 

229 pvals_corrected = np.minimum.accumulate(pvals_corrected_raw[::-1])[::-1] 

230 del pvals_corrected_raw 

231 

232 elif method.lower() in ['ho', 'hommel']: 

233 # we need a copy because we overwrite it in a loop 

234 a = pvals.copy() 

235 for m in range(ntests, 1, -1): 

236 cim = np.min(m * pvals[-m:] / np.arange(1, m + 1.)) 

237 a[-m:] = np.maximum(a[-m:], cim) 

238 a[:-m] = np.maximum(a[:-m], np.minimum(m * pvals[:-m], cim)) 

239 pvals_corrected = a 

240 reject = a <= alphaf 

241 

242 elif method.lower() in ['fdr_bh', 'fdr_i', 'fdr_p', 'fdri', 'fdrp']: 

243 # delegate, call with sorted pvals 

244 reject, pvals_corrected = fdrcorrection(pvals, alpha=alpha, 

245 method='indep', 

246 is_sorted=True) 

247 elif method.lower() in ['fdr_by', 'fdr_n', 'fdr_c', 'fdrn', 'fdrcorr']: 

248 # delegate, call with sorted pvals 

249 reject, pvals_corrected = fdrcorrection(pvals, alpha=alpha, 

250 method='n', 

251 is_sorted=True) 

252 elif method.lower() in ['fdr_tsbky', 'fdr_2sbky', 'fdr_twostage']: 

253 # delegate, call with sorted pvals 

254 reject, pvals_corrected = fdrcorrection_twostage(pvals, alpha=alpha, 

255 method='bky', 

256 is_sorted=True)[:2] 

257 elif method.lower() in ['fdr_tsbh', 'fdr_2sbh']: 

258 # delegate, call with sorted pvals 

259 reject, pvals_corrected = fdrcorrection_twostage(pvals, alpha=alpha, 

260 method='bh', 

261 is_sorted=True)[:2] 

262 

263 elif method.lower() in ['fdr_gbs']: 

264 # adaptive stepdown in Gavrilov, Benjamini, Sarkar, Annals of Statistics 2009 

265 

266 ii = np.arange(1, ntests + 1) 

267 q = (ntests + 1. - ii) / ii * pvals / (1. - pvals) 

268 pvals_corrected_raw = np.maximum.accumulate(q) # up requirementd 

269 

270 pvals_corrected = np.minimum.accumulate(pvals_corrected_raw[::-1])[::-1] 

271 del pvals_corrected_raw 

272 reject = pvals_corrected <= alpha 

273 

274 else: 

275 raise ValueError('method not recognized') 

276 

277 if pvals_corrected is not None: # not necessary anymore 

278 pvals_corrected[pvals_corrected > 1] = 1 

279 if is_sorted or returnsorted: 

280 return reject, pvals_corrected, alphacSidak, alphacBonf 

281 else: 

282 pvals_corrected_ = np.empty_like(pvals_corrected) 

283 pvals_corrected_[sortind] = pvals_corrected 

284 del pvals_corrected 

285 reject_ = np.empty_like(reject) 

286 reject_[sortind] = reject 

287 return reject_, pvals_corrected_, alphacSidak, alphacBonf 



def fdrcorrection(pvals, alpha=0.05, method='indep', is_sorted=False):
    '''pvalue correction for false discovery rate

    This covers Benjamini/Hochberg for independent or positively correlated
    and Benjamini/Yekutieli for general or negatively correlated tests. Both
    are available in the function `multipletests`, as method=`fdr_bh`,
    resp. `fdr_by`.

    Parameters
    ----------
    pvals : array_like
        set of p-values of the individual tests.
    alpha : float
        error rate
    method : {'indep', 'negcorr'}
        'indep' corresponds to Benjamini/Hochberg, 'negcorr' to
        Benjamini/Yekutieli.
    is_sorted : bool
        If False (default), the p-values are sorted internally and the
        results are returned in the original order.

    Returns
    -------
    rejected : ndarray, bool
        True if a hypothesis is rejected, False if not
    pvalue-corrected : ndarray
        pvalues adjusted for multiple hypothesis testing to limit FDR

    Notes
    -----
    If there is prior information on the fraction of true hypotheses, then
    alpha should be set to ``alpha * m / m_0`` where m is the number of tests,
    given by the p-values, and m_0 is an estimate of the number of true
    hypotheses. (see Benjamini, Krieger and Yekutieli)

    The two-stage method of Benjamini, Krieger and Yekutieli that estimates
    the number of false hypotheses is implemented in
    ``fdrcorrection_twostage``.

    Method names can be abbreviated to the first letter, 'i' or 'p' for
    fdr_bh and 'n' for fdr_by.
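
    Examples
    --------
    Illustrative example with made-up p-values:

    >>> import numpy as np
    >>> rejected, pvals_corr = fdrcorrection(
    ...     np.array([0.001, 0.01, 0.02, 0.3]), alpha=0.05, method='indep')
    >>> print(int(rejected.sum()))
    3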

    '''
    pvals = np.asarray(pvals)

    if not is_sorted:
        pvals_sortind = np.argsort(pvals)
        pvals_sorted = np.take(pvals, pvals_sortind)
    else:
        pvals_sorted = pvals  # alias

    if method in ['i', 'indep', 'p', 'poscorr']:
        ecdffactor = _ecdf(pvals_sorted)
    elif method in ['n', 'negcorr']:
        # harmonic number c(m), the Benjamini/Yekutieli correction factor
        cm = np.sum(1. / np.arange(1, len(pvals_sorted) + 1))
        ecdffactor = _ecdf(pvals_sorted) / cm
    else:
        raise ValueError('only indep and negcorr implemented')
    reject = pvals_sorted <= ecdffactor * alpha
    if reject.any():
        rejectmax = max(np.nonzero(reject)[0])
        reject[:rejectmax] = True

    pvals_corrected_raw = pvals_sorted / ecdffactor
    pvals_corrected = np.minimum.accumulate(pvals_corrected_raw[::-1])[::-1]
    del pvals_corrected_raw
    pvals_corrected[pvals_corrected > 1] = 1
    if not is_sorted:
        pvals_corrected_ = np.empty_like(pvals_corrected)
        pvals_corrected_[pvals_sortind] = pvals_corrected
        del pvals_corrected
        reject_ = np.empty_like(reject)
        reject_[pvals_sortind] = reject
        return reject_, pvals_corrected_
    else:
        return reject, pvals_corrected



def fdrcorrection_twostage(pvals, alpha=0.05, method='bky', iter=False,
                           is_sorted=False):
    '''(iterated) two stage linear step-up procedure with estimation of
    number of true hypotheses

    Benjamini, Krieger and Yekutieli, procedure in Definition 6

    Parameters
    ----------
    pvals : array_like
        set of p-values of the individual tests.
    alpha : float
        error rate
    method : {'bky', 'bh'}
        see Notes for details

        * 'bky' - implements the procedure in Definition 6 of Benjamini,
          Krieger and Yekutieli 2006
        * 'bh' - the two stage method of Benjamini and Hochberg
    iter : bool
        If True, iterate the second stage until the number of rejections no
        longer changes; otherwise stop after a single second stage.

    Returns
    -------
    rejected : ndarray, bool
        True if a hypothesis is rejected, False if not
    pvalue-corrected : ndarray
        pvalues adjusted for multiple hypotheses testing to limit FDR
    m0 : int
        ntest - rej, estimated number of true hypotheses
    alpha_stages : list of floats
        A list of alphas that have been used at each stage

    Notes
    -----
    The returned corrected p-values are specific to the given alpha, they
    cannot be used for a different alpha.

    The returned corrected p-values are from the last stage of the fdr_bh
    linear step-up procedure (``fdrcorrection`` with method='indep')
    corrected for the estimated fraction of true hypotheses.
    This means that the rejection decision can be obtained with
    ``pval_corrected <= alpha``, where ``alpha`` is the original significance
    level.
    (Note: This has changed from earlier versions (<0.5.0) of statsmodels.)

    BKY described several other multi-stage methods, which would be easy to
    implement. However, in their simulation the simple two-stage method (with
    iter=False) was the most robust to the presence of positive correlation.

    TODO: What should be returned?
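
    Examples
    --------
    Illustrative example with made-up p-values:

    >>> import numpy as np
    >>> rej, pvalscorr, m0, alpha_stages = fdrcorrection_twostage(
    ...     np.array([0.001, 0.01, 0.02, 0.3]), alpha=0.05, method='bky')
    >>> print(int(rej.sum()), int(m0))
    3 1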

    '''
    pvals = np.asarray(pvals)

    if not is_sorted:
        pvals_sortind = np.argsort(pvals)
        pvals = np.take(pvals, pvals_sortind)

    ntests = len(pvals)
    if method == 'bky':
        fact = (1. + alpha)
        alpha_prime = alpha / fact
    elif method == 'bh':
        fact = 1.
        alpha_prime = alpha
    else:
        raise ValueError("only 'bky' and 'bh' are available as method")

    alpha_stages = [alpha_prime]
    rej, pvalscorr = fdrcorrection(pvals, alpha=alpha_prime, method='indep',
                                   is_sorted=True)
    r1 = rej.sum()
    if (r1 == 0) or (r1 == ntests):
        # nothing or everything was rejected in the first stage, so no
        # adaptive second stage is possible; only rescale the corrected
        # p-values (and still restore the original order below)
        pvalscorr *= fact
        ri = r1
    else:
        ri_old = r1

        while True:
            ntests0 = 1.0 * ntests - ri_old
            alpha_star = alpha_prime * ntests / ntests0
            alpha_stages.append(alpha_star)
            rej, pvalscorr = fdrcorrection(pvals, alpha=alpha_star,
                                           method='indep', is_sorted=True)
            ri = rej.sum()
            if (not iter) or ri == ri_old:
                break
            elif ri < ri_old:
                # prevent cycles and endless loops
                raise RuntimeError("should not be here: the number of "
                                   "rejections decreased between iterations")
            ri_old = ri

        # make adjustment to pvalscorr to reflect estimated number of
        # non-null cases; decision is then pvalscorr <= alpha
        pvalscorr *= ntests0 * 1.0 / ntests
        if method == 'bky':
            pvalscorr *= (1. + alpha)

    if not is_sorted:
        pvalscorr_ = np.empty_like(pvalscorr)
        pvalscorr_[pvals_sortind] = pvalscorr
        del pvalscorr
        reject = np.empty_like(rej)
        reject[pvals_sortind] = rej
        return reject, pvalscorr_, ntests - ri, alpha_stages
    else:
        return rej, pvalscorr, ntests - ri, alpha_stages



def local_fdr(zscores, null_proportion=1.0, null_pdf=None, deg=7,
              nbins=30):
    """
    Calculate local FDR values for a list of Z-scores.

    Parameters
    ----------
    zscores : array_like
        A vector of Z-scores
    null_proportion : float
        The assumed proportion of true null hypotheses
    null_pdf : function mapping reals to positive reals
        The density of null Z-scores; if None, use standard normal
    deg : int
        The maximum exponent in the polynomial expansion of the
        density of non-null Z-scores
    nbins : int
        The number of bins for estimating the marginal density
        of Z-scores.

    Returns
    -------
    fdr : array_like
        A vector of FDR values

    References
    ----------
    B Efron (2008). Microarrays, Empirical Bayes, and the Two-Groups
    Model. Statistical Science 23:1, 1-22.

    Examples
    --------
    Basic use (the null Z-scores are taken to be standard normal):

    >>> from statsmodels.stats.multitest import local_fdr
    >>> import numpy as np
    >>> zscores = np.random.randn(30)
    >>> fdr = local_fdr(zscores)

    Use a Gaussian null distribution estimated from the data:

    >>> null = NullDistribution(zscores)
    >>> fdr = local_fdr(zscores, null_pdf=null.pdf)
    """


    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod.generalized_linear_model import families
    from statsmodels.regression.linear_model import OLS

    # Bins for Poisson modeling of the marginal Z-score density
    minz = min(zscores)
    maxz = max(zscores)
    bins = np.linspace(minz, maxz, nbins)

    # Bin counts
    zhist = np.histogram(zscores, bins)[0]

    # Bin centers
    zbins = (bins[:-1] + bins[1:]) / 2

    # The design matrix at bin centers
    dmat = np.vander(zbins, deg + 1)

    # Use this to get starting values for Poisson regression
    md = OLS(np.log(1 + zhist), dmat).fit()

    # Poisson regression
    md = GLM(zhist, dmat, family=families.Poisson()).fit(start_params=md.params)

    # The design matrix for all Z-scores
    dmat_full = np.vander(zscores, deg + 1)

    # The height of the estimated marginal density of Z-scores,
    # evaluated at every observed Z-score.
    fz = md.predict(dmat_full) / (len(zscores) * (bins[1] - bins[0]))

    # The null density.
    if null_pdf is None:
        f0 = np.exp(-0.5 * zscores**2) / np.sqrt(2 * np.pi)
    else:
        f0 = null_pdf(zscores)

    # The local FDR values
    fdr = null_proportion * f0 / fz

    fdr = np.clip(fdr, 0, 1)

    return fdr



class NullDistribution(object):
    """
    Estimate a Gaussian distribution for the null Z-scores.

    The observed Z-scores consist of both null and non-null values.
    The fitted distribution of null Z-scores is Gaussian, but may have
    non-zero mean and/or non-unit scale.

    Parameters
    ----------
    zscores : array_like
        The observed Z-scores.
    null_lb : float
        Z-scores between `null_lb` and `null_ub` are all considered to be
        true null hypotheses.
    null_ub : float
        See `null_lb`.
    estimate_mean : bool
        If True, estimate the mean of the distribution. If False, the
        mean is fixed at zero.
    estimate_scale : bool
        If True, estimate the scale of the distribution. If False, the
        scale parameter is fixed at 1.
    estimate_null_proportion : bool
        If True, estimate the proportion of true null hypotheses (i.e.
        the proportion of z-scores with expected value zero). If False,
        this parameter is fixed at 1.

    Attributes
    ----------
    mean : float
        The estimated mean of the empirical null distribution
    sd : float
        The estimated standard deviation of the empirical null distribution
    null_proportion : float
        The estimated proportion of true null hypotheses among all hypotheses

    References
    ----------
    B Efron (2008). Microarrays, Empirical Bayes, and the Two-Groups
    Model. Statistical Science 23:1, 1-22.

    Notes
    -----
    See also:
    http://nipy.org/nipy/labs/enn.html#nipy.algorithms.statistics.empirical_pvalue.NormalEmpiricalNull.fdr
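
    Examples
    --------
    Illustrative sketch; the fitted values depend on the random draw, so the
    output is not shown:

    >>> import numpy as np
    >>> zs = np.concatenate([np.random.randn(900), 3 + np.random.randn(100)])
    >>> null = NullDistribution(zs)
    >>> null.mean, null.sd, null.null_proportion  # doctest: +SKIP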

577 """ 

578 

579 def __init__(self, zscores, null_lb=-1, null_ub=1, estimate_mean=True, 

580 estimate_scale=True, estimate_null_proportion=False): 

581 

582 # Extract the null z-scores 

583 ii = np.flatnonzero((zscores >= null_lb) & (zscores <= null_ub)) 

584 if len(ii) == 0: 

585 raise RuntimeError("No Z-scores fall between null_lb and null_ub") 

586 zscores0 = zscores[ii] 

587 

588 # Number of Z-scores, and null Z-scores 

589 n_zs, n_zs0 = len(zscores), len(zscores0) 

590 

591 # Unpack and transform the parameters to the natural scale, hold 

592 # parameters fixed as specified. 

593 def xform(params): 

594 

595 mean = 0. 

596 sd = 1. 

597 prob = 1. 

598 

599 ii = 0 

600 if estimate_mean: 

601 mean = params[ii] 

602 ii += 1 

603 if estimate_scale: 

604 sd = np.exp(params[ii]) 

605 ii += 1 

606 if estimate_null_proportion: 

607 prob = 1 / (1 + np.exp(-params[ii])) 

608 

609 return mean, sd, prob 

610 

611 from scipy.stats.distributions import norm 

612 

613 def fun(params): 

614 """ 

615 Negative log-likelihood of z-scores. 

616 The function has three arguments, packed into a vector: 

617 mean : location parameter 

618 logscale : log of the scale parameter 

619 logitprop : logit of the proportion of true nulls 

620 The implementation follows section 4 from Efron 2008. 

621 """ 

622 

623 d, s, p = xform(params) 

624 

625 # Mass within the central region 

626 central_mass = (norm.cdf((null_ub - d) / s) - 

627 norm.cdf((null_lb - d) / s)) 

628 

629 # Probability that a Z-score is null and is in the central region 

630 cp = p * central_mass 

631 

632 # Binomial term 

633 rval = n_zs0 * np.log(cp) + (n_zs - n_zs0) * np.log(1 - cp) 

634 

635 # Truncated Gaussian term for null Z-scores 

636 zv = (zscores0 - d) / s 

637 rval += np.sum(-zv**2 / 2) - n_zs0 * np.log(s) 

638 rval -= n_zs0 * np.log(central_mass) 

639 

640 return -rval 

641 

642 # Estimate the parameters 

643 from scipy.optimize import minimize 

644 # starting values are mean = 0, scale = 1, p0 ~ 1 

645 mz = minimize(fun, np.r_[0., 0, 3], method="Nelder-Mead") 

646 mean, sd, prob = xform(mz['x']) 

647 

648 self.mean = mean 

649 self.sd = sd 

650 self.null_proportion = prob 

651 

652 # The fitted null density function 

653 def pdf(self, zscores): 

654 """ 

655 Evaluates the fitted empirical null Z-score density. 

656 Parameters 

657 ---------- 

658 zscores : scalar or array_like 

659 The point or points at which the density is to be 

660 evaluated. 

661 Returns 

662 ------- 

663 The empirical null Z-score density evaluated at the given 

664 points. 

665 """ 

666 

667 zval = (zscores - self.mean) / self.sd 

668 return np.exp(-0.5 * zval**2 - np.log(self.sd) - 0.5 * np.log(2 * np.pi))