Coverage for qdscreen/sklearn.py: 81%

32 statements  

« prev     ^ index     » next       coverage.py v7.2.2, created at 2023-03-17 11:02 +0000

1import sys 

2import numpy as np 

3 

4PY2 = sys.version_info < (3,) 

5if PY2: 5 ↛ 7line 5 didn't jump to line 7, because the condition on line 5 was never true

6 # Python 2 is not happy with our package having the same name, we need dynamic import 

7 import importlib 

8 sklearn_exc = importlib.import_module("sklearn.exceptions") 

9 NotFittedError = sklearn_exc.NotFittedError 

10else: 

11 # normal imports on python 3 

12 from sklearn.exceptions import NotFittedError 

13 

14from .compat import BaseEstimator, SelectorMixin 

15from .main import qd_screen 

16 

17 

class QDScreen(SelectorMixin, BaseEstimator):
    """Feature selector dropping features that are (quasi-)deterministically predicted from others.

    Only the features (X) are inspected, never the desired outputs (y), so
    this selector can be used for unsupervised learning.

    Read more in the User Guide.

    Parameters
    ----------
    absolute_eps : float, optional
        Absolute entropy threshold. A feature Y is removed when another
        feature X predicts it in a quasi-deterministic way, i.e. when the
        conditional entropy H(Y|X) <= absolute_eps. The default value is 0
        and corresponds to removing deterministic relationships only.
    relative_eps : float, optional
        Relative entropy threshold. A feature Y is removed when another
        feature X predicts it in a quasi-deterministic way, i.e. when the
        relative conditional entropy H(Y|X)/H(Y) <= relative_eps (between 0
        and 1). Only one of absolute_eps and relative_eps should be provided.

    Attributes
    ----------
    model_ : selector model
        Object returned by ``fit_selector_model`` on the forest found by
        ``qd_screen`` during :meth:`fit`. Its ``forest.roots_mask_ar`` is used
        as the support mask, and its ``predict_qd`` method is called by
        :meth:`inverse_transform`.

    Notes
    -----
    Allows NaN in the input.

    Examples
    --------
    TODO: make this example better (see test_readme.py).

    The following dataset has integer features, two of which are constant,
    and all of which are 'predictable' from the third one::

        >>> X = [[0, 2, 0, 3],
        ...      [0, 1, 4, 3],
        ...      [0, 1, 1, 3]]
        >>> selector = QDScreen()
        >>> Xsel = selector.fit_transform(X)
        >>> Xsel
        array([[0],
               [4],
               [1]])
        >>> selector.inverse_transform(Xsel)
        array([[0, 2, 0, 3],
               [0, 1, 4, 3],
               [0, 1, 1, 3]])
    """

    def __init__(self, absolute_eps=None, relative_eps=None):
        # Parameters are stored untouched; they are only read in `fit`.
        self.relative_eps = relative_eps
        self.absolute_eps = absolute_eps

    def fit(self, X, y=None):
        """Learn the (quasi-)deterministic relationships between the columns of X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Samples from which the relationships are learned. Sparse input is
            not accepted; NaN values are allowed.

        y : any
            Ignored. This parameter exists only for compatibility with
            sklearn.pipeline.Pipeline.

        Returns
        -------
        self
        """
        # Validate as an object array: dense only, NaN tolerated.
        checked = self._validate_data(
            X,
            accept_sparse=False,
            dtype=object,
            force_all_finite='allow-nan',
        )

        # Step 1: discover the forest structure with the configured thresholds.
        structure = qd_screen(
            checked,
            absolute_eps=self.absolute_eps,
            relative_eps=self.relative_eps,
        )

        # Step 2: learn the parameter maps on the same validated data.
        self.model_ = structure.fit_selector_model(checked)
        return self

    def check_is_fitted(self):
        """Raise ``NotFittedError`` unless ``model_`` has been set by :meth:`fit`."""
        if hasattr(self, "model_"):
            return

        template = ("This %(name)s instance is not fitted yet. Call 'fit' with "
                    "appropriate arguments before using this estimator.")
        raise NotFittedError(template % {'name': type(self).__name__})

    def _get_support_mask(self):
        """Return the ``roots_mask_ar`` of the fitted model's forest as the support mask."""
        self.check_is_fitted()
        fitted_forest = self.model_.forest
        return fitted_forest.roots_mask_ar

    def inverse_transform(self, X):
        """
        Reverse the transformation operation

        Parameters
        ----------
        X : array of shape [n_samples, n_selected_features]
            The input samples.

        Returns
        -------
        X_r : array of shape [n_samples, n_original_features]
            The array built by the parent class's ``inverse_transform``, on
            which ``model_.predict_qd(..., inplace=True)`` has then been
            called. Presumably this fills the columns removed by
            :meth:`transform` with values predicted from the kept ones, as in
            the class-level example - confirm against ``predict_qd``.
        """
        restored = super(QDScreen, self).inverse_transform(X)
        # `restored` is already the full-width array, so it is updated in place.
        self.model_.predict_qd(restored, inplace=True)
        return restored

    def _more_tags(self):
        """Return the extra estimator tags: NaN input is allowed."""
        tags = {'allow_nan': True}
        return tags