Coverage for brainbox/task/_knockoff.py: 8%
102 statements
« prev ^ index » next coverage.py v7.7.0, created at 2025-03-17 09:55 +0000
« prev ^ index » next coverage.py v7.7.0, created at 2025-03-17 09:55 +0000
1"""
2Taken from Statsmodels, https://statsmodels.org, copyright notice at bottom
4The RegressionFDR class implements the 'Knockoff' approach for
5controlling false discovery rates (FDR) in regression analysis.
6The knockoff approach does not require standard errors. Thus one
7application is to provide inference for parameter estimates that are
8not smooth functions of the data. For example, the knockoff approach
9can be used to do inference for parameter estimates obtained from the
10LASSO, of from stepwise variable selection.
11The knockoff approach controls FDR for parameter estimates that may be
12dependent, such as coefficient estimates in a multiple regression
13model.
14The knockoff approach is applicable whenever the test statistic can be
15computed entirely from x'y and x'x, where x is the design matrix and y
16is the vector of responses.
17Reference
18---------
19Rina Foygel Barber, Emmanuel Candes (2015). Controlling the False
20Discovery Rate via Knockoffs. Annals of Statistics 43:5.
21http://statweb.stanford.edu/~candes/papers/FDR_regression.pdf
24Copyright (C) 2006, Jonathan E. Taylor
25All rights reserved.
27Copyright (c) 2006-2008 Scipy Developers.
28All rights reserved.
30Copyright (c) 2009-2018 statsmodels Developers.
31All rights reserved.
34Redistribution and use in source and binary forms, with or without
35modification, are permitted provided that the following conditions are met:
37 a. Redistributions of source code must retain the above copyright notice,
38 this list of conditions and the following disclaimer.
39 b. Redistributions in binary form must reproduce the above copyright
40 notice, this list of conditions and the following disclaimer in the
41 documentation and/or other materials provided with the distribution.
42 c. Neither the name of statsmodels nor the names of its contributors
43 may be used to endorse or promote products derived from this software
44 without specific prior written permission.
47THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
48AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50ARE DISCLAIMED. IN NO EVENT SHALL STATSMODELS OR CONTRIBUTORS BE LIABLE FOR
51ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
53SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
57DAMAGE.
58"""
60import numpy as np
61import pandas as pd
64class RegressionFDR(object):
65 """
66 Control FDR in a regression procedure.
67 Parameters
68 ----------
69 endog : array_like
70 The dependent variable of the regression
71 exog : array_like
72 The independent variables of the regression
73 regeffects : RegressionEffects instance
74 An instance of a RegressionEffects class that can compute
75 effect sizes for the regression coefficients.
76 method : str
77 The approach used to assess and control FDR, currently
78 must be 'knockoff'.
79 Returns
80 -------
81 Returns an instance of the RegressionFDR class. The `fdr` attribute
82 holds the estimated false discovery rates.
83 Notes
84 -----
85 This class Implements the knockoff method of Barber and Candes.
86 This is an approach for controlling the FDR of a variety of
87 regression estimation procedures, including correlation
88 coefficients, OLS regression, OLS with forward selection, and
89 LASSO regression.
90 For other approaches to FDR control in regression, see the
91 statsmodels.stats.multitest module. Methods provided in that
92 module use Z-scores or p-values, and therefore require standard
93 errors for the coefficient estimates to be available.
94 The default method for constructing the augmented design matrix is
95 the 'equivariant' approach, set `design_method='sdp'` to use an
96 alternative approach involving semidefinite programming. See
97 Barber and Candes for more information about both approaches. The
98 sdp approach requires that the cvxopt package be installed.
99 """
101 def __init__(self, endog, exog, regeffects, method="knockoff",
102 **kwargs):
104 if hasattr(exog, "columns"):
105 self.xnames = exog.columns
106 else:
107 self.xnames = ["x%d" % j for j in range(exog.shape[1])]
109 exog = np.asarray(exog)
110 endog = np.asarray(endog)
112 if "design_method" not in kwargs:
113 kwargs["design_method"] = "equi"
115 nobs, nvar = exog.shape
117 if kwargs["design_method"] == "equi":
118 exog1, exog2, _ = _design_knockoff_equi(exog)
119 elif kwargs["design_method"] == "sdp":
120 exog1, exog2, _ = _design_knockoff_sdp(exog)
121 endog = endog - np.mean(endog)
123 self.endog = endog
124 self.exog = np.concatenate((exog1, exog2), axis=1)
125 self.exog1 = exog1
126 self.exog2 = exog2
128 self.stats = regeffects.stats(self)
130 unq, inv, cnt = np.unique(self.stats, return_inverse=True,
131 return_counts=True)
133 # The denominator of the FDR
134 cc = np.cumsum(cnt)
135 denom = len(self.stats) - cc + cnt
136 denom[denom < 1] = 1
138 # The numerator of the FDR
139 ii = np.searchsorted(unq, -unq, side='right') - 1
140 numer = cc[ii]
141 numer[ii < 0] = 0
143 # The knockoff+ estimated FDR
144 fdrp = (1 + numer) / denom
146 # The knockoff estimated FDR
147 fdr = numer / denom
149 self.fdr = fdr[inv]
150 self.fdrp = fdrp[inv]
151 self._ufdr = fdr
152 self._unq = unq
154 df = pd.DataFrame(index=self.xnames)
155 df["Stat"] = self.stats
156 df["FDR+"] = self.fdrp
157 df["FDR"] = self.fdr
158 self.fdr_df = df
160 def threshold(self, tfdr):
161 """
162 Returns the threshold statistic for a given target FDR.
163 """
165 if np.min(self._ufdr) <= tfdr:
166 return self._unq[self._ufdr <= tfdr][0]
167 else:
168 return np.inf
171def _design_knockoff_sdp(exog):
172 """
173 Use semidefinite programming to construct a knockoff design
174 matrix.
175 Requires cvxopt to be installed.
176 """
178 try:
179 from cvxopt import solvers, matrix
180 except ImportError:
181 raise ValueError("SDP knockoff designs require installation of cvxopt")
183 nobs, nvar = exog.shape
185 # Standardize exog
186 xnm = np.sum(exog**2, 0)
187 xnm = np.sqrt(xnm)
188 exog = exog / xnm
190 Sigma = np.dot(exog.T, exog)
192 c = matrix(-np.ones(nvar))
194 h0 = np.concatenate((np.zeros(nvar), np.ones(nvar)))
195 h0 = matrix(h0)
196 G0 = np.concatenate((-np.eye(nvar), np.eye(nvar)), axis=0)
197 G0 = matrix(G0)
199 h1 = 2 * Sigma
200 h1 = matrix(h1)
201 i, j = np.diag_indices(nvar)
202 G1 = np.zeros((nvar * nvar, nvar))
203 G1[i * nvar + j, i] = 1
204 G1 = matrix(G1)
206 solvers.options['show_progress'] = False
207 sol = solvers.sdp(c, G0, h0, [G1], [h1])
208 sl = np.asarray(sol['x']).ravel()
210 xcov = np.dot(exog.T, exog)
211 exogn = _get_knmat(exog, xcov, sl)
213 return exog, exogn, sl
216def _design_knockoff_equi(exog):
217 """
218 Construct an equivariant design matrix for knockoff analysis.
219 Follows the 'equi-correlated knockoff approach of equation 2.4 in
220 Barber and Candes.
221 Constructs a pair of design matrices exogs, exogn such that exogs
222 is a scaled/centered version of the input matrix exog, exogn is
223 another matrix of the same shape with cov(exogn) = cov(exogs), and
224 the covariances between corresponding columns of exogn and exogs
225 are as small as possible.
226 """
228 nobs, nvar = exog.shape
230 if nobs < 2 * nvar:
231 msg = "The equivariant knockoff can ony be used when n >= 2*p"
232 raise ValueError(msg)
234 # Standardize exog
235 xnm = np.sum(exog**2, 0)
236 xnm = np.sqrt(xnm)
237 exog = exog / xnm
239 xcov = np.dot(exog.T, exog)
240 ev, _ = np.linalg.eig(xcov)
241 evmin = np.min(ev)
243 sl = min(2 * evmin, 1)
244 sl = sl * np.ones(nvar)
246 exogn = _get_knmat(exog, xcov, sl)
248 return exog, exogn, sl
251def _get_knmat(exog, xcov, sl):
252 # Utility function, see equation 2.2 of Barber & Candes.
254 nobs, nvar = exog.shape
256 ash = np.linalg.inv(xcov)
257 ash *= -np.outer(sl, sl)
258 i, j = np.diag_indices(nvar)
259 ash[i, j] += 2 * sl
261 umat = np.random.normal(size=(nobs, nvar))
262 u, _ = np.linalg.qr(exog)
263 umat -= np.dot(u, np.dot(u.T, umat))
264 umat, _ = np.linalg.qr(umat)
266 ashr, xc, _ = np.linalg.svd(ash, 0)
267 ashr *= np.sqrt(xc)
268 ashr = ashr.T
270 ex = (sl[:, None] * np.linalg.solve(xcov, exog.T)).T
271 exogn = exog - ex + np.dot(umat, ashr)
273 return exogn