Coverage for qdscreen/sklearn.py: 81%

1import sys

2import numpy as np

4PY2 = sys.version_info < (3,)

5if PY2: 5 ↛ 7line 5 didn't jump to line 7, because the condition on line 5 was never true

6 # Python 2 is not happy with our package having the same name, we need dynamic import

7 import importlib

8 sklearn_exc = importlib.import_module("sklearn.exceptions")

9 NotFittedError = sklearn_exc.NotFittedError

10else:

11 # normal imports on python 3

12 from sklearn.exceptions import NotFittedError

14from .compat import BaseEstimator, SelectorMixin

15from .main import qd_screen

class QDScreen(SelectorMixin, BaseEstimator):
    """Feature selector that removes all features that are (quasi-)deterministically predicted from others.

    This feature selection algorithm looks only at the features (X), not the
    desired outputs (y), and can thus be used for unsupervised learning.

    Parameters
    ----------
    absolute_eps : float, optional
        Absolute entropy threshold. Any feature Y that can be predicted from
        another feature X in a quasi-deterministic way, that is, where
        conditional entropy H(Y|X) <= absolute_eps, will be removed.
        Defaults to None; the value is forwarded unchanged to ``qd_screen``.
        NOTE(review): the previous documentation stated that the default
        corresponds to a threshold of 0 (deterministic relationships only);
        confirm against ``qd_screen``.
    relative_eps : float, optional
        Relative entropy threshold. Any feature Y that can be predicted from
        another feature X in a quasi-deterministic way, that is, where relative
        conditional entropy H(Y|X)/H(Y) <= relative_eps (between 0 and 1), will
        be removed. Only one of absolute_eps and relative_eps should be
        provided. Defaults to None; forwarded unchanged to ``qd_screen``.

    Attributes
    ----------
    model_ : object
        Selector model returned by ``fit_selector_model`` of the forest that
        ``qd_screen`` found on the training data. Only available after
        :meth:`fit`. Its ``forest.roots_mask_ar`` attribute provides the
        support mask, and its ``predict_qd`` method is used by
        :meth:`inverse_transform` to fill the removed features back in.

    Notes
    -----
    Allows NaN in the input.

    Examples
    --------
    TODO make this better ? see test_readme.py
    The following dataset has integer features, two of which are constant, and
    all of which are 'predictable' from the third one::

        >>> X = [[0, 2, 0, 3],
        ...      [0, 1, 4, 3],
        ...      [0, 1, 1, 3]]
        >>> selector = QDScreen()
        >>> Xsel = selector.fit_transform(X)
        >>> Xsel
        array([[0],
               [4],
               [1]])
        >>> selector.inverse_transform(Xsel)
        array([[0, 2, 0, 3],
               [0, 1, 4, 3],
               [0, 1, 1, 3]])
    """

    def __init__(self,
                 absolute_eps=None, relative_eps=None):
        # scikit-learn convention: __init__ only stores the hyper-parameters,
        # validation and learning happen in fit().
        self.absolute_eps = absolute_eps
        self.relative_eps = relative_eps

    def fit(self, X, y=None):
        """Learn the (quasi-)deterministic relationships between the features of X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training samples. Sparse matrices are rejected
            (``accept_sparse=False``), NaN values are allowed, and the data is
            converted to ``dtype=object`` before being analysed.

        y : any
            Ignored. This parameter exists only for compatibility with
            sklearn.pipeline.Pipeline.

        Returns
        -------
        self
        """
        X = self._validate_data(X, accept_sparse=False,
                                dtype=object,
                                force_all_finite='allow-nan')

        # First find the forest structure
        forest_ = qd_screen(X, absolute_eps=self.absolute_eps, relative_eps=self.relative_eps)

        # Then learn the parameter maps
        self.model_ = forest_.fit_selector_model(X)

        return self

    def check_is_fitted(self):
        """Raise ``NotFittedError`` if :meth:`fit` has not set ``model_`` yet."""
        if not hasattr(self, "model_"):
            msg = ("This %(name)s instance is not fitted yet. Call 'fit' with "
                   "appropriate arguments before using this estimator.")
            raise NotFittedError(msg % {'name': type(self).__name__})

    def _get_support_mask(self):
        """Return the mask of selected features, as required by ``SelectorMixin``.

        The mask is read from ``self.model_.forest.roots_mask_ar``
        (presumably True for the root features of the forest, i.e. the ones
        that are kept - confirm against the forest implementation).

        Raises
        ------
        NotFittedError
            If the estimator has not been fitted.
        """
        self.check_is_fitted()
        return self.model_.forest.roots_mask_ar

    def inverse_transform(self, X):
        """
        Reverse the transformation operation

        Parameters
        ----------
        X : array of shape [n_samples, n_selected_features]
            The input samples.

        Returns
        -------
        X_r : array of shape [n_samples, n_original_features]
            `X` with the columns removed by :meth:`transform` put back. The
            parent ``inverse_transform`` first re-creates the full-width
            array, then the removed columns are filled in place by
            ``self.model_.predict_qd``.
        """
        Xt = super(QDScreen, self).inverse_transform(X)
        # use inplace = True because Xt is already prepared
        self.model_.predict_qd(Xt, inplace=True)
        return Xt

    def _more_tags(self):
        """Return the estimator tags overridden by this class: NaN input is allowed."""
        return {'allow_nan': True}