Coverage for qdscreen/selector.py: 80%

112 statements  

« prev     ^ index     » next       coverage.py v7.2.2, created at 2023-03-17 11:02 +0000

1import numpy as np 

2import pandas as pd 

3from scipy.stats import mode as scipy_mode 

4 

5try: 

6 from typing import Union, Optional, Dict, Any 

7except: # noqa 

8 pass 

9 

10from .main import QDForest 

11 

12 

class InvalidDataInputError(ValueError):
    """Error raised when the data passed to the model does not match expectations."""

15 

16 

17def _get_most_common_value(x): 

18 # From https://stackoverflow.com/a/47778607/7262247 

19 # `scipy_mode` is the most robust to the various pitfalls (nans, ...) 

20 # but they will deprecate it 

21 # return scipy_mode(x, nan_policy=None)[0][0] 

22 res = x.mode(dropna=True) 

23 if len(res) == 0: 

24 return np.nan 

25 else: 

26 return res 

27 

28 

class ParentChildMapping:
    """Learned mapping from each level of a parent feature to the most common
    level of a child feature.

    Offers vectorized application of the mapping for both numpy arrays and
    pandas Series.
    """
    __slots__ = ('_mapping_dct',  # the {parent_level: child_level} dict
                 '_otypes'        # output dtype(s) for np.vectorize
                 )

    def __init__(
        self,
        mapping_dct  # type: Dict
    ):
        self._mapping_dct = mapping_dct
        # Find the correct otype to use in the vectorized operation.
        # BUG FIX: `dict.values()` is a view in python 3, and `np.array(<view>)`
        # wraps it in a 0-d object array, so the detected dtype was always
        # `object`. Converting to a list first detects the actual values dtype.
        self._otypes = [np.array(list(mapping_dct.values())).dtype]

    def predict_child_from_parent_ar(
        self,
        parent_values  # type: np.ndarray
    ):
        """For numpy: map each entry of `parent_values` through the learned dict."""
        # apply the learned map efficiently https://stackoverflow.com/q/16992713/7262247
        return np.vectorize(self._mapping_dct.__getitem__, otypes=self._otypes)(parent_values)

    def predict_child_from_parent(
        self,
        parent_values  # type: pd.DataFrame
    ):
        """For pandas: map each entry of `parent_values` through the learned dict."""
        # See https://stackoverflow.com/questions/47930052/pandas-vectorized-lookup-of-dictionary
        return parent_values.map(self._mapping_dct)

55 

56 

class QDSelectorModel(object):
    """
    A quasi-determinism feature selection model that can be

     - fit from a dataset using <model>.fit(X)
     - used to select only the relevant (root) features using <model>.remove_qd(X)
     - used to predict the other columns from the relevant (root) ones using <model>.predict_qd(X)

    """
    __slots__ = ('forest',  # the QDForest
                 '_maps'    # a nested dict {parent: {child: mapping_dct with index in the order of self.varnames}}
                            # note: scipy.sparse now raises an error with dtype=object
                 )

    def __init__(self,
                 qd_forest  # type: QDForest
                 ):
        # The forest of (quasi-)deterministic parent -> child arcs, computed beforehand.
        self.forest = qd_forest
        # Filled by `fit`: {parent: {child: ParentChildMapping}}; None until then.
        self._maps = None  # type: Optional[Dict[Any, Dict[Any, Dict]]]

    def assert_valid_input(self,
                           X,  # type: Union[np.ndarray, pd.DataFrame]
                           df_extras_allowed=False  # type: bool
                           ):
        """Raises an InvalidDataInputError if X does not match the expectation

        If the model's forest was built from a numpy array, X must be a numpy
        array with `forest.nb_vars` columns. Otherwise X must be a pandas
        DataFrame whose column names match the forest's variable names; extra
        columns are tolerated only when `df_extras_allowed=True`, missing
        columns never are.

        :param X: the input dataset to validate
        :param df_extras_allowed: if True, a DataFrame X may contain extra columns
        """

        if self.forest.is_nparray:
            if not isinstance(X, np.ndarray):
                raise InvalidDataInputError(
                    "Input data must be an numpy array. Found: %s" % type(X))

            if X.shape[1] != self.forest.nb_vars:  # or X.shape[0] != X.shape[1]:
                raise InvalidDataInputError(
                    "Input numpy array must have %s columns. Found %s columns" % (self.forest.nb_vars, X.shape[1]))
        else:
            if not isinstance(X, pd.DataFrame):
                raise InvalidDataInputError(
                    "Input data must be a pandas DataFrame. Found: %s" % type(X))

            actual = set(X.columns)
            expected = set(self.forest.varnames)
            if actual != expected:
                missing = expected - actual
                if missing or not df_extras_allowed:
                    extra = actual - expected
                    raise InvalidDataInputError(
                        "Input pandas DataFrame must have column names matching the ones in the model. "
                        "Missing: %s. Extra: %s " % (missing, extra)
                    )

    def fit(
            self,
            X  # type: Union[np.ndarray, pd.DataFrame]
    ):
        """Fits the maps able to predict determined features from others

        For each (parent, child) arc of the forest, learns a mapping from each
        observed parent level to the most frequent associated child level, and
        stores it in `self._maps[parent][child]` as a `ParentChildMapping`.

        :param X: the training dataset; must match the forest (see `assert_valid_input`)
        """
        forest = self.forest

        # Validate the input
        self.assert_valid_input(X, df_extras_allowed=False)

        # we will create a sparse coordinate representation of maps
        n = forest.nb_vars

        if forest.is_nparray:
            assert isinstance(X, np.ndarray)

            # detect numpy structured arrays
            is_struct_array = X.dtype.names is not None
            if is_struct_array:
                # names = X.dtype.names
                # assert len(names) == n
                raise NotImplementedError("structured numpy arrays are not supported. Please convert your array to an "
                                          "unstructured array")
            else:
                assert X.shape[1] == n

            self._maps = maps = dict()

            for parent, child in forest.get_arcs():
                # assert (parent, child) not in maps, "Error: edge already exists"

                # create a dictionary mapping each parent level to most frequent child level
                #
                # -- seems suboptimal with numpy...
                # map_dct = dict()
                # for parent_lev in np.unique(X[:, parent]):
                #     values, counts = np.unique(X[X[:, parent] == parent_lev, child], return_counts=True)
                #     map_dct[parent_lev] = values[np.argmax(counts)]
                #
                # -- same with pandas groupby
                # if is_struct_array:
                #     pc_df = pd.DataFrame(X[[names[parent], names[child]]])
                #     pc_df.columns = [0, 1]  # forget the names
                # else:
                pc_df = pd.DataFrame(X[:, (parent, child)], columns=["parent", "child"])
                levels_mapping_df = pc_df.groupby(by="parent").agg(_get_most_common_value)

                # Init the dict for parent if it does not exit
                maps.setdefault(parent, dict())

                # Fill the parent-child item with the mapping object
                maps[parent][child] = ParentChildMapping(levels_mapping_df.iloc[:, 0].to_dict())

        else:
            assert isinstance(X, pd.DataFrame)

            # unfortunately pandas dataframe sparse do not allow item assignment :( so we need to work on numpy array
            # first get the numpy array in correct order
            varnames = forest.varnames
            X_ar = X.loc[:, varnames].values

            self._maps = maps = dict()

            for parent, child in forest.get_arcs(names=False):
                # assert (parent, child) not in maps, "Error: edge already exists"

                # levels_mapping_df = X.loc[:, (parent, child)].groupby(parent).agg(lambda x: x.value_counts().index[0])
                # maps[parent, child] = levels_mapping_df[child].to_dict()
                pc_df = pd.DataFrame(X_ar[:, (parent, child)], columns=["parent", "child"])
                levels_mapping_df = pc_df.groupby("parent").agg(_get_most_common_value)

                # Init the dict for parent if it does not exit
                maps.setdefault(parent, dict())

                # Fill the parent-child item with the mapping object
                maps[parent][child] = ParentChildMapping(levels_mapping_df.iloc[:, 0].to_dict())

    def remove_qd(self,
                  X,  # type: Union[np.ndarray, pd.DataFrame]
                  inplace=False  # type: bool
                  ):
        # type: (...) -> Optional[Union[np.ndarray, pd.DataFrame]]
        """
        Removes from X the features that can be (quasi-)determined from the others
        This returns a copy by default, except if `inplace=True`

        :param X: the dataset, of the same kind (numpy/pandas) as the one used for fitting
        :param inplace: if this is set to True, X is modified and nothing is returned;
            otherwise (default) a reduced copy containing only the root features is returned
        :return: the dataset restricted to the root features, or None if `inplace=True`
        """
        forest = self.forest

        self.assert_valid_input(X, df_extras_allowed=True)

        is_x_nparray = isinstance(X, np.ndarray)
        assert is_x_nparray == forest.is_nparray

        if is_x_nparray:
            is_structured = X.dtype.names is not None
            if is_structured:
                raise NotImplementedError("structured numpy arrays are not supported. Please convert your array to an "
                                          "unstructured array")
            if inplace:
                # NOTE(review): `np.delete` returns a NEW array and does not modify X,
                # so this branch is currently a no-op; moreover `roots_mask_ar` selects
                # the columns to KEEP in the branch below, while `np.delete` removes the
                # indicated columns. Coverage shows this branch never ran — TODO fix.
                np.delete(X, forest.roots_mask_ar, axis=1)
            else:
                # for structured: return X[np.array(X.dtype.names)[forest.roots_mask_ar]]
                return X[:, forest.roots_mask_ar]
        else:
            # pandas dataframe
            if inplace:
                # NOTE(review): this deletes the ROOT columns, whereas the non-inplace
                # branch below keeps them — looks inverted; also `del df[<list>]` may not
                # be supported by pandas. Coverage shows this branch never ran — TODO confirm.
                del X[forest.roots]
            else:
                return X.loc[:, forest.roots_mask]

    def predict_qd(self,
                   X,  # type: Union[np.ndarray, pd.DataFrame]
                   inplace=False  # type: bool
                   ):
        # type: (...) -> Optional[Union[np.ndarray, pd.DataFrame]]
        """
        Adds columns to X corresponding to the features that can be determined from the roots.
        By default a new array/dataframe is returned (`inplace=False`).

        :param X: for `inplace=False` with numpy, the array of root columns only; otherwise
            a dataset with all the columns used during fitting
        :param inplace: if `True` and X is a dataframe, predicted columns will be added inplace. Note that the order
            may differ from the initial training data
        :return: the completed dataset, or None if `inplace=True`
        """
        forest = self.forest

        # if inplace is None:
        #     inplace = not self.is_nparray

        is_x_nparray = isinstance(X, np.ndarray)
        assert is_x_nparray == forest.is_nparray

        if is_x_nparray:
            is_structured = X.dtype.names is not None
            if is_structured:
                raise NotImplementedError("structured numpy arrays are not supported. Please convert your array to an "
                                          "unstructured array")
            if not inplace:
                # Same as in sklearn inverse_transform: create the missing columns in X first
                X_in = X
                support = forest.roots_mask_ar
                # X = check_array(X, dtype=None)
                nbcols_received = X_in.shape[1]
                if support.sum() != nbcols_received:
                    raise ValueError("X has a different nb columns than the number of roots found during fitting.")
                # if a single column, make sure this is 2d
                if X_in.ndim == 1:
                    X_in = X_in[None, :]
                # create a copy with the extra columns
                X = np.zeros((X_in.shape[0], support.size), dtype=X_in.dtype)
                X[:, support] = X_in
            else:
                if X.shape[1] != forest.nb_vars:
                    raise ValueError("If `inplace=True`, `predict` expects an X input with the correct number of "
                                     "columns. Use `inplace=False` to pass only the array of roots. Note that this"
                                     "is the default behaviour of inplace.")

            # walk the tree from the roots: each child is filled from its (already filled) parent
            for _, parent, child in forest.walk_arcs():
                X[:, child] = self._maps[parent][child].predict_child_from_parent_ar(X[:, parent])
        else:
            if not inplace:
                X = X.copy()

            # walk the tree from the roots: each child is filled from its (already filled) parent
            varnames = forest.varnames
            for _, parent, child in forest.walk_arcs(names=False):
                X.loc[:, varnames[child]] = self._maps[parent][child].predict_child_from_parent(X.loc[:, varnames[parent]])

        if not inplace:
            return X