Coverage for brainbox/task/

1"""

2Taken from Statsmodels, https://statsmodels.org, copyright notice at bottom

4The RegressionFDR class implements the 'Knockoff' approach for

5controlling false discovery rates (FDR) in regression analysis.

6The knockoff approach does not require standard errors. Thus one

7application is to provide inference for parameter estimates that are

8not smooth functions of the data. For example, the knockoff approach

9can be used to do inference for parameter estimates obtained from the

10LASSO, of from stepwise variable selection.

11The knockoff approach controls FDR for parameter estimates that may be

12dependent, such as coefficient estimates in a multiple regression

13model.

14The knockoff approach is applicable whenever the test statistic can be

15computed entirely from x'y and x'x, where x is the design matrix and y

16is the vector of responses.

17Reference

18---------

19Rina Foygel Barber, Emmanuel Candes (2015). Controlling the False

20Discovery Rate via Knockoffs. Annals of Statistics 43:5.

21http://statweb.stanford.edu/~candes/papers/FDR_regression.pdf

25All rights reserved.

28All rights reserved.

31All rights reserved.

34Redistribution and use in source and binary forms, with or without

35modification, are permitted provided that the following conditions are met:

37 a. Redistributions of source code must retain the above copyright notice,

38 this list of conditions and the following disclaimer.

39 b. Redistributions in binary form must reproduce the above copyright

40 notice, this list of conditions and the following disclaimer in the

41 documentation and/or other materials provided with the distribution.

42 c. Neither the name of statsmodels nor the names of its contributors

43 may be used to endorse or promote products derived from this software

44 without specific prior written permission.

47THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

48AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

49IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

50ARE DISCLAIMED. IN NO EVENT SHALL STATSMODELS OR CONTRIBUTORS BE LIABLE FOR

51ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL

52DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR

53SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER

54CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT

55LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY

56OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH

57DAMAGE.

58"""

60import numpy as np (empty)

61import pandas as pd (empty)

64class RegressionFDR(object): (empty)

65 """

66 Control FDR in a regression procedure.

67 Parameters

68 ----------

69 endog : array_like

70 The dependent variable of the regression

71 exog : array_like

72 The independent variables of the regression

73 regeffects : RegressionEffects instance

74 An instance of a RegressionEffects class that can compute

75 effect sizes for the regression coefficients.

76 method : str

77 The approach used to assess and control FDR, currently

78 must be 'knockoff'.

79 Returns

80 -------

81 Returns an instance of the RegressionFDR class. The `fdr` attribute

82 holds the estimated false discovery rates.

83 Notes

84 -----

85 This class Implements the knockoff method of Barber and Candes.

86 This is an approach for controlling the FDR of a variety of

87 regression estimation procedures, including correlation

88 coefficients, OLS regression, OLS with forward selection, and

89 LASSO regression.

90 For other approaches to FDR control in regression, see the

91 statsmodels.stats.multitest module. Methods provided in that

92 module use Z-scores or p-values, and therefore require standard

93 errors for the coefficient estimates to be available.

94 The default method for constructing the augmented design matrix is

95 the 'equivariant' approach, set `design_method='sdp'` to use an

96 alternative approach involving semidefinite programming. See

97 Barber and Candes for more information about both approaches. The

98 sdp approach requires that the cvxopt package be installed.

99 """

100

101 def __init__(self, endog, exog, regeffects, method="knockoff", (empty)

102 **kwargs):

103

104 if hasattr(exog, "columns"):

105 self.xnames = exog.columns

106 else:

107 self.xnames = ["x%d" % j for j in range(exog.shape[1])]

108

109 exog = np.asarray(exog)

110 endog = np.asarray(endog)

111

112 if "design_method" not in kwargs:

113 kwargs["design_method"] = "equi"

114

115 nobs, nvar = exog.shape

116

117 if kwargs["design_method"] == "equi":

118 exog1, exog2, _ = _design_knockoff_equi(exog)

119 elif kwargs["design_method"] == "sdp":

120 exog1, exog2, _ = _design_knockoff_sdp(exog)

121 endog = endog - np.mean(endog)

122

123 self.endog = endog

124 self.exog = np.concatenate((exog1, exog2), axis=1)

125 self.exog1 = exog1

126 self.exog2 = exog2

127

128 self.stats = regeffects.stats(self)

129

130 unq, inv, cnt = np.unique(self.stats, return_inverse=True,

131 return_counts=True)

132

133 # The denominator of the FDR

134 cc = np.cumsum(cnt)

135 denom = len(self.stats) - cc + cnt

136 denom[denom < 1] = 1

137

138 # The numerator of the FDR

139 ii = np.searchsorted(unq, -unq, side='right') - 1

140 numer = cc[ii]

141 numer[ii < 0] = 0

142

143 # The knockoff+ estimated FDR

144 fdrp = (1 + numer) / denom

145

146 # The knockoff estimated FDR

147 fdr = numer / denom

148

149 self.fdr = fdr[inv]

150 self.fdrp = fdrp[inv]

151 self._ufdr = fdr

152 self._unq = unq

153

154 df = pd.DataFrame(index=self.xnames)

155 df["Stat"] = self.stats

156 df["FDR+"] = self.fdrp

157 df["FDR"] = self.fdr

158 self.fdr_df = df

159

160 def threshold(self, tfdr): (empty)

161 """

162 Returns the threshold statistic for a given target FDR.

163 """

164

165 if np.min(self._ufdr) <= tfdr:

166 return self._unq[self._ufdr <= tfdr][0]

167 else:

168 return np.inf

169

170

171def _design_knockoff_sdp(exog): (empty)

172 """

173 Use semidefinite programming to construct a knockoff design

174 matrix.

175 Requires cvxopt to be installed.

176 """

177

178 try:

179 from cvxopt import solvers, matrix

180 except ImportError:

181 raise ValueError("SDP knockoff designs require installation of cvxopt")

182

183 nobs, nvar = exog.shape

184

185 # Standardize exog

186 xnm = np.sum(exog**2, 0)

187 xnm = np.sqrt(xnm)

188 exog = exog / xnm

189

190 Sigma = np.dot(exog.T, exog)

191

192 c = matrix(-np.ones(nvar))

193

194 h0 = np.concatenate((np.zeros(nvar), np.ones(nvar)))

195 h0 = matrix(h0)

196 G0 = np.concatenate((-np.eye(nvar), np.eye(nvar)), axis=0)

197 G0 = matrix(G0)

198

199 h1 = 2 * Sigma

200 h1 = matrix(h1)

201 i, j = np.diag_indices(nvar)

202 G1 = np.zeros((nvar * nvar, nvar))

203 G1[i * nvar + j, i] = 1

204 G1 = matrix(G1)

205

206 solvers.options['show_progress'] = False

207 sol = solvers.sdp(c, G0, h0, [G1], [h1])

208 sl = np.asarray(sol['x']).ravel()

209

210 xcov = np.dot(exog.T, exog)

211 exogn = _get_knmat(exog, xcov, sl)

212

213 return exog, exogn, sl

214

215

216def _design_knockoff_equi(exog): (empty)

217 """

218 Construct an equivariant design matrix for knockoff analysis.

219 Follows the 'equi-correlated knockoff approach of equation 2.4 in

220 Barber and Candes.

221 Constructs a pair of design matrices exogs, exogn such that exogs

222 is a scaled/centered version of the input matrix exog, exogn is

223 another matrix of the same shape with cov(exogn) = cov(exogs), and

224 the covariances between corresponding columns of exogn and exogs

225 are as small as possible.

226 """

227

228 nobs, nvar = exog.shape

229

230 if nobs < 2 * nvar:

231 msg = "The equivariant knockoff can ony be used when n >= 2*p"

232 raise ValueError(msg)

233

234 # Standardize exog

235 xnm = np.sum(exog**2, 0)

236 xnm = np.sqrt(xnm)

237 exog = exog / xnm

238

239 xcov = np.dot(exog.T, exog)

240 ev, _ = np.linalg.eig(xcov)

241 evmin = np.min(ev)

242

243 sl = min(2 * evmin, 1)

244 sl = sl * np.ones(nvar)

245

246 exogn = _get_knmat(exog, xcov, sl)

247

248 return exog, exogn, sl

249

250

251def _get_knmat(exog, xcov, sl): (empty)

252 # Utility function, see equation 2.2 of Barber & Candes.

253

254 nobs, nvar = exog.shape

255

256 ash = np.linalg.inv(xcov)

257 ash *= -np.outer(sl, sl)

258 i, j = np.diag_indices(nvar)

259 ash[i, j] += 2 * sl

260

261 umat = np.random.normal(size=(nobs, nvar))

262 u, _ = np.linalg.qr(exog)

263 umat -= np.dot(u, np.dot(u.T, umat))

264 umat, _ = np.linalg.qr(umat)

265

266 ashr, xc, _ = np.linalg.svd(ash, 0)

267 ashr *= np.sqrt(xc)

268 ashr = ashr.T

269

270 ex = (sl[:, None] * np.linalg.solve(xcov, exog.T)).T

271 exogn = exog - ex + np.dot(umat, ashr)

272

273 return exogn

Coverage for brainbox/task/_knockoff.py: 8%

102 statements