Coverage for qdscreen/sklearn.py: 81%
32 statements
« prev ^ index » next coverage.py v7.2.2, created at 2023-03-17 11:02 +0000
« prev ^ index » next coverage.py v7.2.2, created at 2023-03-17 11:02 +0000
1import sys
2import numpy as np
# True only when running under a Python 2 interpreter.
PY2 = sys.version_info < (3,)

if not PY2:
    # Python 3: a regular absolute import works.
    from sklearn.exceptions import NotFittedError
else:
    # Python 2 is not happy with our package having the same name as
    # scikit-learn's, so the module has to be resolved dynamically.
    import importlib
    sklearn_exc = importlib.import_module("sklearn.exceptions")
    NotFittedError = sklearn_exc.NotFittedError
14from .compat import BaseEstimator, SelectorMixin
15from .main import qd_screen
class QDScreen(SelectorMixin, BaseEstimator):
    """Feature selector that removes all features that are (quasi-)deterministically predicted from others.

    This feature selection algorithm looks only at the features (X), not the
    desired outputs (y), and can thus be used for unsupervised learning.

    Read more in the User Guide.

    Parameters
    ----------
    absolute_eps : float, optional
        Absolute entropy threshold. Any feature Y that can be predicted from
        another feature X in a quasi-deterministic way, that is, where
        conditional entropy H(Y|X) <= absolute_eps, will be removed. The
        constructor default is ``None``, which is forwarded as-is to
        ``qd_screen``; the documented effective default is 0 and corresponds
        to removing deterministic relationships only.
    relative_eps : float, optional
        Relative entropy threshold. Any feature Y that can be predicted from
        another feature X in a quasi-deterministic way, that is, where relative
        conditional entropy H(Y|X)/H(Y) <= relative_eps (between 0 and 1), will
        be removed. Only one of absolute_eps and relative_eps should be
        provided.

    Attributes
    ----------
    model_ : object
        Selector model learned by :meth:`fit`: the result of calling
        ``fit_selector_model(X)`` on the forest returned by ``qd_screen``.
        The forest itself is available as ``model_.forest``. Only present
        once the estimator has been fitted.

    Notes
    -----
    Allows NaN in the input.

    Examples
    --------
    The following dataset has integer features, two of which are constant,
    and all of which are 'predictable' from the third one::

        >>> X = [[0, 2, 0, 3],
        ...      [0, 1, 4, 3],
        ...      [0, 1, 1, 3]]
        >>> selector = QDScreen()
        >>> Xsel = selector.fit_transform(X)
        >>> Xsel
        array([[0],
               [4],
               [1]])
        >>> selector.inverse_transform(Xsel)
        array([[0, 2, 0, 3],
               [0, 1, 4, 3],
               [0, 1, 1, 3]])
    """
    # TODO make the docstring example better ? see test_readme.py

    def __init__(self,
                 absolute_eps=None, relative_eps=None):
        # scikit-learn convention: store constructor arguments untouched,
        # they are only interpreted in `fit`.
        self.absolute_eps = absolute_eps
        self.relative_eps = relative_eps

    def fit(self, X, y=None):
        """Learn the (quasi-)deterministic relationships between the features of X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Samples from which the relationships between features are
            learned. Sparse matrices are rejected (``accept_sparse=False``);
            NaN values are allowed.

        y : any
            Ignored. This parameter exists only for compatibility with
            sklearn.pipeline.Pipeline.

        Returns
        -------
        self
        """
        # Validate as an object array so that features of any type are kept
        # as they are; NaN is explicitly tolerated.
        X = self._validate_data(X, accept_sparse=False,
                                dtype=object,
                                force_all_finite='allow-nan')

        # First find the forest structure
        forest = qd_screen(X, absolute_eps=self.absolute_eps, relative_eps=self.relative_eps)

        # Then learn the parameter maps
        self.model_ = forest.fit_selector_model(X)

        return self

    def check_is_fitted(self):
        """Raise an error if :meth:`fit` has not been called on this instance yet.

        Raises
        ------
        NotFittedError
            If the ``model_`` attribute, which is set by :meth:`fit`, is missing.
        """
        if not hasattr(self, "model_"):
            msg = ("This %(name)s instance is not fitted yet. Call 'fit' with "
                   "appropriate arguments before using this estimator.")
            raise NotFittedError(msg % {'name': type(self).__name__})

    def _get_support_mask(self):
        """Return the mask of the features that are kept by the selector.

        The mask is the roots mask of the fitted forest
        (``model_.forest.roots_mask_ar``).

        Raises
        ------
        NotFittedError
            If the estimator has not been fitted.
        """
        self.check_is_fitted()
        return self.model_.forest.roots_mask_ar

    def inverse_transform(self, X):
        """
        Reverse the transformation operation

        Parameters
        ----------
        X : array of shape [n_samples, n_selected_features]
            The input samples.

        Returns
        -------
        X_r : array of shape [n_samples, n_original_features]
            `X` with the columns removed by :meth:`transform` re-inserted by
            the parent implementation and then filled in place by the fitted
            model (``model_.predict_qd``).
        """
        Xt = super(QDScreen, self).inverse_transform(X)
        # use inplace = True because Xt is already prepared
        self.model_.predict_qd(Xt, inplace=True)
        return Xt

    def _more_tags(self):
        """Return the extra estimator tags: this selector accepts NaN in its input."""
        return {'allow_nan': True}