Coverage for qdscreen/selector.py: 80%

112 statements  

« prev     ^ index     » next       coverage.py v7.2.2, created at 2023-03-17 11:02 +0000

1import numpy as np 

2import pandas as pd 

3from scipy.stats import mode as scipy_mode 

4 

5try: 

6 from typing import Union, Optional, Dict, Any 

7except: # noqa 

8 pass 

9 

10from .main import QDForest 

11 

12 

class InvalidDataInputError(ValueError):
    """Error raised when the data passed to the model does not match expectations."""

15 

16 

17def _get_most_common_value(x): 

18 # From https://stackoverflow.com/a/47778607/7262247 

19 # `scipy_mode` is the most robust to the various pitfalls (nans, ...) 

20 # but they will deprecate it 

21 # return scipy_mode(x, nan_policy=None)[0][0] 

22 res = x.mode(dropna=True) 

23 if len(res) == 0: 

24 return np.nan 

25 else: 

26 return res 

27 

28 

class ParentChildMapping:
    """Learned mapping from each level of a parent feature to the most common
    level of a child feature.

    Offers vectorized application of the mapping for both numpy arrays and
    pandas Series.
    """
    __slots__ = ('_mapping_dct',  # the {parent_level: child_level} dict
                 '_otypes'        # output dtype(s) for np.vectorize
                 )

    def __init__(
        self,
        mapping_dct  # type: Dict
    ):
        self._mapping_dct = mapping_dct
        # Find the correct otype to use in the vectorized operation.
        # BUG FIX: `dict.values()` is a view in python 3, and `np.array(<view>)`
        # wraps it in a 0-d object array, so the detected dtype was always
        # `object`. Converting to a list first detects the actual values dtype.
        self._otypes = [np.array(list(mapping_dct.values())).dtype]

    def predict_child_from_parent_ar(
        self,
        parent_values  # type: np.ndarray
    ):
        """For numpy: map each entry of `parent_values` through the learned dict."""
        # apply the learned map efficiently https://stackoverflow.com/q/16992713/7262247
        return np.vectorize(self._mapping_dct.__getitem__, otypes=self._otypes)(parent_values)

    def predict_child_from_parent(
        self,
        parent_values  # type: pd.DataFrame
    ):
        """For pandas: map each entry of `parent_values` through the learned dict."""
        # See https://stackoverflow.com/questions/47930052/pandas-vectorized-lookup-of-dictionary
        return parent_values.map(self._mapping_dct)

55 

56 

class QDSelectorModel(object):
    """
    A quasi-determinism feature selection model that can be

     - fit from a dataset using <model>.fit(X)
     - used to select only the relevant (root) features using <model>.remove_qd(X)
     - used to predict the other columns from the relevant (root) ones using <model>.predict_qd(X)

    """
    __slots__ = ('forest',  # the QDForest
                 '_maps'    # a nested dict {parent: {child: mapping_dct with index in the order of self.varnames}}
                            # note: scipy.sparse now raises an error with dtype=object
                 )

    def __init__(self,
                 qd_forest  # type: QDForest
                 ):
        # The forest of (quasi-)deterministic parent -> child arcs, computed beforehand.
        self.forest = qd_forest
        # Filled by `fit`: {parent: {child: ParentChildMapping}}; None until then.
        self._maps = None  # type: Optional[Dict[Any, Dict[Any, Dict]]]

    def assert_valid_input(self,
                           X,  # type: Union[np.ndarray, pd.DataFrame]
                           df_extras_allowed=False  # type: bool
                           ):
        """Raises an InvalidDataInputError if X does not match the expectation

        If the model's forest was built from a numpy array, X must be a numpy
        array with `forest.nb_vars` columns. Otherwise X must be a pandas
        DataFrame whose column names match the forest's variable names; extra
        columns are tolerated only when `df_extras_allowed=True`, missing
        columns never are.

        :param X: the input dataset to validate
        :param df_extras_allowed: if True, a DataFrame X may contain extra columns
        """

        if self.forest.is_nparray:
            if not isinstance(X, np.ndarray):
                raise InvalidDataInputError(
                    "Input data must be an numpy array. Found: %s" % type(X))

            if X.shape[1] != self.forest.nb_vars:  # or X.shape[0] != X.shape[1]:
                raise InvalidDataInputError(
                    "Input numpy array must have %s columns. Found %s columns" % (self.forest.nb_vars, X.shape[1]))
        else:
            if not isinstance(X, pd.DataFrame):
                raise InvalidDataInputError(
                    "Input data must be a pandas DataFrame. Found: %s" % type(X))

            actual = set(X.columns)
            expected = set(self.forest.varnames)
            if actual != expected:
                missing = expected - actual
                if missing or not df_extras_allowed:
                    extra = actual - expected
                    raise InvalidDataInputError(
                        "Input pandas DataFrame must have column names matching the ones in the model. "
                        "Missing: %s. Extra: %s " % (missing, extra)
                    )

    def fit(
            self,
            X  # type: Union[np.ndarray, pd.DataFrame]
    ):
        """Fits the maps able to predict determined features from others

        For each (parent, child) arc of the forest, learns a mapping from each
        observed parent level to the most frequent associated child level, and
        stores it in `self._maps[parent][child]` as a `ParentChildMapping`.

        :param X: the training dataset; must match the forest (see `assert_valid_input`)
        """
        forest = self.forest

        # Validate the input
        self.assert_valid_input(X, df_extras_allowed=False)

        # we will create a sparse coordinate representation of maps
        n = forest.nb_vars

        if forest.is_nparray:
            assert isinstance(X, np.ndarray)

            # detect numpy structured arrays
            is_struct_array = X.dtype.names is not None
            if is_struct_array:
                # names = X.dtype.names
                # assert len(names) == n
                raise NotImplementedError("structured numpy arrays are not supported. Please convert your array to an "
                                          "unstructured array")
            else:
                assert X.shape[1] == n

            self._maps = maps = dict()

            for parent, child in forest.get_arcs():
                # assert (parent, child) not in maps, "Error: edge already exists"

                # create a dictionary mapping each parent level to most frequent child level
                #
                # -- seems suboptimal with numpy...
                # map_dct = dict()
                # for parent_lev in np.unique(X[:, parent]):
                #     values, counts = np.unique(X[X[:, parent] == parent_lev, child], return_counts=True)
                #     map_dct[parent_lev] = values[np.argmax(counts)]
                #
                # -- same with pandas groupby
                # if is_struct_array:
                #     pc_df = pd.DataFrame(X[[names[parent], names[child]]])
                #     pc_df.columns = [0, 1]  # forget the names
                # else:
                pc_df = pd.DataFrame(X[:, (parent, child)], columns=["parent", "child"])
                levels_mapping_df = pc_df.groupby(by="parent").agg(_get_most_common_value)

                # Init the dict for parent if it does not exit
                maps.setdefault(parent, dict())

                # Fill the parent-child item with the mapping object
                maps[parent][child] = ParentChildMapping(levels_mapping_df.iloc[:, 0].to_dict())

        else:
            assert isinstance(X, pd.DataFrame)

            # unfortunately pandas dataframe sparse do not allow item assignment :( so we need to work on numpy array
            # first get the numpy array in correct order
            varnames = forest.varnames
            X_ar = X.loc[:, varnames].values

            self._maps = maps = dict()

            for parent, child in forest.get_arcs(names=False):
                # assert (parent, child) not in maps, "Error: edge already exists"

                # levels_mapping_df = X.loc[:, (parent, child)].groupby(parent).agg(lambda x: x.value_counts().index[0])
                # maps[parent, child] = levels_mapping_df[child].to_dict()
                pc_df = pd.DataFrame(X_ar[:, (parent, child)], columns=["parent", "child"])
                levels_mapping_df = pc_df.groupby("parent").agg(_get_most_common_value)

                # Init the dict for parent if it does not exit
                maps.setdefault(parent, dict())

                # Fill the parent-child item with the mapping object
                maps[parent][child] = ParentChildMapping(levels_mapping_df.iloc[:, 0].to_dict())

    def remove_qd(self,
                  X,  # type: Union[np.ndarray, pd.DataFrame]
                  inplace=False  # type: bool
                  ):
        # type: (...) -> Optional[Union[np.ndarray, pd.DataFrame]]
        """
        Removes from X the features that can be (quasi-)determined from the others
        This returns a copy by default, except if `inplace=True`

        :param X: the dataset, of the same kind (numpy/pandas) as the one used for fitting
        :param inplace: if this is set to True, X is modified and nothing is returned;
            otherwise (default) a reduced copy containing only the root features is returned
        :return: the dataset restricted to the root features, or None if `inplace=True`
        """
        forest = self.forest

        self.assert_valid_input(X, df_extras_allowed=True)

        is_x_nparray = isinstance(X, np.ndarray)
        assert is_x_nparray == forest.is_nparray

        if is_x_nparray:
            is_structured = X.dtype.names is not None
            if is_structured:
                raise NotImplementedError("structured numpy arrays are not supported. Please convert your array to an "
                                          "unstructured array")
            if inplace:
                # NOTE(review): `np.delete` returns a NEW array and does not modify X,
                # so this branch is currently a no-op; moreover `roots_mask_ar` selects
                # the columns to KEEP in the branch below, while `np.delete` removes the
                # indicated columns. Coverage shows this branch never ran — TODO fix.
                np.delete(X, forest.roots_mask_ar, axis=1)
            else:
                # for structured: return X[np.array(X.dtype.names)[forest.roots_mask_ar]]
                return X[:, forest.roots_mask_ar]
        else:
            # pandas dataframe
            if inplace:
                # NOTE(review): this deletes the ROOT columns, whereas the non-inplace
                # branch below keeps them — looks inverted; also `del df[<list>]` may not
                # be supported by pandas. Coverage shows this branch never ran — TODO confirm.
                del X[forest.roots]
            else:
                return X.loc[:, forest.roots_mask]

    def predict_qd(self,
                   X,  # type: Union[np.ndarray, pd.DataFrame]
                   inplace=False  # type: bool
                   ):
        # type: (...) -> Optional[Union[np.ndarray, pd.DataFrame]]
        """
        Adds columns to X corresponding to the features that can be determined from the roots.
        By default a new array/dataframe is returned (`inplace=False`).

        :param X: for `inplace=False` with numpy, the array of root columns only; otherwise
            a dataset with all the columns used during fitting
        :param inplace: if `True` and X is a dataframe, predicted columns will be added inplace. Note that the order
            may differ from the initial training data
        :return: the completed dataset, or None if `inplace=True`
        """
        forest = self.forest

        # if inplace is None:
        #     inplace = not self.is_nparray

        is_x_nparray = isinstance(X, np.ndarray)
        assert is_x_nparray == forest.is_nparray

        if is_x_nparray:
            is_structured = X.dtype.names is not None
            if is_structured:
                raise NotImplementedError("structured numpy arrays are not supported. Please convert your array to an "
                                          "unstructured array")
            if not inplace:
                # Same as in sklearn inverse_transform: create the missing columns in X first
                X_in = X
                support = forest.roots_mask_ar
                # X = check_array(X, dtype=None)
                nbcols_received = X_in.shape[1]
                if support.sum() != nbcols_received:
                    raise ValueError("X has a different nb columns than the number of roots found during fitting.")
                # if a single column, make sure this is 2d
                if X_in.ndim == 1:
                    X_in = X_in[None, :]
                # create a copy with the extra columns
                X = np.zeros((X_in.shape[0], support.size), dtype=X_in.dtype)
                X[:, support] = X_in
            else:
                if X.shape[1] != forest.nb_vars:
                    raise ValueError("If `inplace=True`, `predict` expects an X input with the correct number of "
                                     "columns. Use `inplace=False` to pass only the array of roots. Note that this"
                                     "is the default behaviour of inplace.")

            # walk the tree from the roots: each child is filled from its (already filled) parent
            for _, parent, child in forest.walk_arcs():
                X[:, child] = self._maps[parent][child].predict_child_from_parent_ar(X[:, parent])
        else:
            if not inplace:
                X = X.copy()

            # walk the tree from the roots: each child is filled from its (already filled) parent
            varnames = forest.varnames
            for _, parent, child in forest.walk_arcs(names=False):
                X.loc[:, varnames[child]] = self._maps[parent][child].predict_child_from_parent(X.loc[:, varnames[parent]])

        if not inplace:
            return X