Coverage for brainbox/task/_knockoff.py: 8%

102 statements  

« prev     ^ index     » next       coverage.py v7.5.4, created at 2024-07-08 17:16 +0100

1""" 

2Taken from Statsmodels, https://statsmodels.org, copyright notice at bottom 

3 

4The RegressionFDR class implements the 'Knockoff' approach for 

5controlling false discovery rates (FDR) in regression analysis. 

6The knockoff approach does not require standard errors. Thus one 

7application is to provide inference for parameter estimates that are 

8not smooth functions of the data. For example, the knockoff approach 

9can be used to do inference for parameter estimates obtained from the 

10LASSO, of from stepwise variable selection. 

11The knockoff approach controls FDR for parameter estimates that may be 

12dependent, such as coefficient estimates in a multiple regression 

13model. 

14The knockoff approach is applicable whenever the test statistic can be 

15computed entirely from x'y and x'x, where x is the design matrix and y 

16is the vector of responses. 

17Reference 

18--------- 

19Rina Foygel Barber, Emmanuel Candes (2015). Controlling the False 

20Discovery Rate via Knockoffs. Annals of Statistics 43:5. 

21http://statweb.stanford.edu/~candes/papers/FDR_regression.pdf 

22 

23 

24Copyright (C) 2006, Jonathan E. Taylor 

25All rights reserved. 

26 

27Copyright (c) 2006-2008 Scipy Developers. 

28All rights reserved. 

29 

30Copyright (c) 2009-2018 statsmodels Developers. 

31All rights reserved. 

32 

33 

34Redistribution and use in source and binary forms, with or without 

35modification, are permitted provided that the following conditions are met: 

36 

37 a. Redistributions of source code must retain the above copyright notice, 

38 this list of conditions and the following disclaimer. 

39 b. Redistributions in binary form must reproduce the above copyright 

40 notice, this list of conditions and the following disclaimer in the 

41 documentation and/or other materials provided with the distribution. 

42 c. Neither the name of statsmodels nor the names of its contributors 

43 may be used to endorse or promote products derived from this software 

44 without specific prior written permission. 

45 

46 

47THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 

48AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 

49IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 

50ARE DISCLAIMED. IN NO EVENT SHALL STATSMODELS OR CONTRIBUTORS BE LIABLE FOR 

51ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 

52DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 

53SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 

54CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 

55LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 

56OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 

57DAMAGE. 

58""" 

59 

60import numpy as np 

61import pandas as pd 

62 

63 

64class RegressionFDR(object): 

65 """ 

66 Control FDR in a regression procedure. 

67 Parameters 

68 ---------- 

69 endog : array_like 

70 The dependent variable of the regression 

71 exog : array_like 

72 The independent variables of the regression 

73 regeffects : RegressionEffects instance 

74 An instance of a RegressionEffects class that can compute 

75 effect sizes for the regression coefficients. 

76 method : str 

77 The approach used to assess and control FDR, currently 

78 must be 'knockoff'. 

79 Returns 

80 ------- 

81 Returns an instance of the RegressionFDR class. The `fdr` attribute 

82 holds the estimated false discovery rates. 

83 Notes 

84 ----- 

85 This class Implements the knockoff method of Barber and Candes. 

86 This is an approach for controlling the FDR of a variety of 

87 regression estimation procedures, including correlation 

88 coefficients, OLS regression, OLS with forward selection, and 

89 LASSO regression. 

90 For other approaches to FDR control in regression, see the 

91 statsmodels.stats.multitest module. Methods provided in that 

92 module use Z-scores or p-values, and therefore require standard 

93 errors for the coefficient estimates to be available. 

94 The default method for constructing the augmented design matrix is 

95 the 'equivariant' approach, set `design_method='sdp'` to use an 

96 alternative approach involving semidefinite programming. See 

97 Barber and Candes for more information about both approaches. The 

98 sdp approach requires that the cvxopt package be installed. 

99 """ 

100 

101 def __init__(self, endog, exog, regeffects, method="knockoff", 

102 **kwargs): 

103 

104 if hasattr(exog, "columns"): 

105 self.xnames = exog.columns 

106 else: 

107 self.xnames = ["x%d" % j for j in range(exog.shape[1])] 

108 

109 exog = np.asarray(exog) 

110 endog = np.asarray(endog) 

111 

112 if "design_method" not in kwargs: 

113 kwargs["design_method"] = "equi" 

114 

115 nobs, nvar = exog.shape 

116 

117 if kwargs["design_method"] == "equi": 

118 exog1, exog2, _ = _design_knockoff_equi(exog) 

119 elif kwargs["design_method"] == "sdp": 

120 exog1, exog2, _ = _design_knockoff_sdp(exog) 

121 endog = endog - np.mean(endog) 

122 

123 self.endog = endog 

124 self.exog = np.concatenate((exog1, exog2), axis=1) 

125 self.exog1 = exog1 

126 self.exog2 = exog2 

127 

128 self.stats = regeffects.stats(self) 

129 

130 unq, inv, cnt = np.unique(self.stats, return_inverse=True, 

131 return_counts=True) 

132 

133 # The denominator of the FDR 

134 cc = np.cumsum(cnt) 

135 denom = len(self.stats) - cc + cnt 

136 denom[denom < 1] = 1 

137 

138 # The numerator of the FDR 

139 ii = np.searchsorted(unq, -unq, side='right') - 1 

140 numer = cc[ii] 

141 numer[ii < 0] = 0 

142 

143 # The knockoff+ estimated FDR 

144 fdrp = (1 + numer) / denom 

145 

146 # The knockoff estimated FDR 

147 fdr = numer / denom 

148 

149 self.fdr = fdr[inv] 

150 self.fdrp = fdrp[inv] 

151 self._ufdr = fdr 

152 self._unq = unq 

153 

154 df = pd.DataFrame(index=self.xnames) 

155 df["Stat"] = self.stats 

156 df["FDR+"] = self.fdrp 

157 df["FDR"] = self.fdr 

158 self.fdr_df = df 

159 

160 def threshold(self, tfdr): 

161 """ 

162 Returns the threshold statistic for a given target FDR. 

163 """ 

164 

165 if np.min(self._ufdr) <= tfdr: 

166 return self._unq[self._ufdr <= tfdr][0] 

167 else: 

168 return np.inf 

169 

170 

171def _design_knockoff_sdp(exog): 

172 """ 

173 Use semidefinite programming to construct a knockoff design 

174 matrix. 

175 Requires cvxopt to be installed. 

176 """ 

177 

178 try: 

179 from cvxopt import solvers, matrix 

180 except ImportError: 

181 raise ValueError("SDP knockoff designs require installation of cvxopt") 

182 

183 nobs, nvar = exog.shape 

184 

185 # Standardize exog 

186 xnm = np.sum(exog**2, 0) 

187 xnm = np.sqrt(xnm) 

188 exog = exog / xnm 

189 

190 Sigma = np.dot(exog.T, exog) 

191 

192 c = matrix(-np.ones(nvar)) 

193 

194 h0 = np.concatenate((np.zeros(nvar), np.ones(nvar))) 

195 h0 = matrix(h0) 

196 G0 = np.concatenate((-np.eye(nvar), np.eye(nvar)), axis=0) 

197 G0 = matrix(G0) 

198 

199 h1 = 2 * Sigma 

200 h1 = matrix(h1) 

201 i, j = np.diag_indices(nvar) 

202 G1 = np.zeros((nvar * nvar, nvar)) 

203 G1[i * nvar + j, i] = 1 

204 G1 = matrix(G1) 

205 

206 solvers.options['show_progress'] = False 

207 sol = solvers.sdp(c, G0, h0, [G1], [h1]) 

208 sl = np.asarray(sol['x']).ravel() 

209 

210 xcov = np.dot(exog.T, exog) 

211 exogn = _get_knmat(exog, xcov, sl) 

212 

213 return exog, exogn, sl 

214 

215 

216def _design_knockoff_equi(exog): 

217 """ 

218 Construct an equivariant design matrix for knockoff analysis. 

219 Follows the 'equi-correlated knockoff approach of equation 2.4 in 

220 Barber and Candes. 

221 Constructs a pair of design matrices exogs, exogn such that exogs 

222 is a scaled/centered version of the input matrix exog, exogn is 

223 another matrix of the same shape with cov(exogn) = cov(exogs), and 

224 the covariances between corresponding columns of exogn and exogs 

225 are as small as possible. 

226 """ 

227 

228 nobs, nvar = exog.shape 

229 

230 if nobs < 2 * nvar: 

231 msg = "The equivariant knockoff can ony be used when n >= 2*p" 

232 raise ValueError(msg) 

233 

234 # Standardize exog 

235 xnm = np.sum(exog**2, 0) 

236 xnm = np.sqrt(xnm) 

237 exog = exog / xnm 

238 

239 xcov = np.dot(exog.T, exog) 

240 ev, _ = np.linalg.eig(xcov) 

241 evmin = np.min(ev) 

242 

243 sl = min(2 * evmin, 1) 

244 sl = sl * np.ones(nvar) 

245 

246 exogn = _get_knmat(exog, xcov, sl) 

247 

248 return exog, exogn, sl 

249 

250 

251def _get_knmat(exog, xcov, sl): 

252 # Utility function, see equation 2.2 of Barber & Candes. 

253 

254 nobs, nvar = exog.shape 

255 

256 ash = np.linalg.inv(xcov) 

257 ash *= -np.outer(sl, sl) 

258 i, j = np.diag_indices(nvar) 

259 ash[i, j] += 2 * sl 

260 

261 umat = np.random.normal(size=(nobs, nvar)) 

262 u, _ = np.linalg.qr(exog) 

263 umat -= np.dot(u, np.dot(u.T, umat)) 

264 umat, _ = np.linalg.qr(umat) 

265 

266 ashr, xc, _ = np.linalg.svd(ash, 0) 

267 ashr *= np.sqrt(xc) 

268 ashr = ashr.T 

269 

270 ex = (sl[:, None] * np.linalg.solve(xcov, exog.T)).T 

271 exogn = exog - ex + np.dot(umat, ashr) 

272 

273 return exogn