Coverage for qdscreen/selector.py: 80%
112 statements
« prev ^ index » next — coverage.py v7.2.2, created at 2023-03-17 11:02 +0000
1import numpy as np
2import pandas as pd
3from scipy.stats import mode as scipy_mode
5try:
6 from typing import Union, Optional, Dict, Any
7except: # noqa
8 pass
10from .main import QDForest
class InvalidDataInputError(ValueError):
    """Error raised when the input data passed to the selector model is invalid."""
17def _get_most_common_value(x):
18 # From https://stackoverflow.com/a/47778607/7262247
19 # `scipy_mode` is the most robust to the various pitfalls (nans, ...)
20 # but they will deprecate it
21 # return scipy_mode(x, nan_policy=None)[0][0]
22 res = x.mode(dropna=True)
23 if len(res) == 0:
24 return np.nan
25 else:
26 return res
class ParentChildMapping:
    """
    A learned mapping from each level of a "parent" feature to the most common
    associated level of a "child" feature. It can be applied to numpy arrays
    (`predict_child_from_parent_ar`) or to pandas Series
    (`predict_child_from_parent`).
    """
    __slots__ = ('_mapping_dct',  # the {parent_level: child_level} dict
                 '_otypes'        # the output dtype used by np.vectorize
                 )

    def __init__(
        self,
        mapping_dct  # type: Dict
    ):
        self._mapping_dct = mapping_dct
        # Find the correct otype to use in the vectorized operation.
        # Note: materialize the values view into a list first. On python 3,
        # `np.array(dict.values())` wraps the view into a 0-d object array,
        # so the dtype would always be `object` instead of the values' dtype.
        self._otypes = [np.array(list(mapping_dct.values())).dtype]

    def predict_child_from_parent_ar(
        self,
        parent_values  # type: np.ndarray
    ):
        """For numpy: map each entry of `parent_values` through the learned dict."""
        # apply the learned map efficiently https://stackoverflow.com/q/16992713/7262247
        return np.vectorize(self._mapping_dct.__getitem__, otypes=self._otypes)(parent_values)

    def predict_child_from_parent(
        self,
        parent_values  # type: pd.Series
    ):
        """For pandas: map each entry of `parent_values` through the learned dict."""
        # See https://stackoverflow.com/questions/47930052/pandas-vectorized-lookup-of-dictionary
        return parent_values.map(self._mapping_dct)
class QDSelectorModel(object):
    """
    A quasi-determinism feature selection model that can be

     - fit from a dataset using <model>.fit(X)
     - used to select only the relevant (root) features using <model>.remove_qd(X)
     - used to predict the other columns from the relevant (root) ones using <model>.predict_qd(X)

    """
    __slots__ = ('forest',  # the QDForest
                 '_maps'    # a nested dict {parent: {child: mapping_dct with index in the order of self.varnames}}
                            # note: scipy.sparse now raises an error with dtype=object
                 )

    def __init__(self,
                 qd_forest  # type: QDForest
                 ):
        self.forest = qd_forest
        self._maps = None  # type: Optional[Dict[Any, Dict[Any, Dict]]]

    def assert_valid_input(
        self,
        X,  # type: Union[np.ndarray, pd.DataFrame]
        df_extras_allowed=False  # type: bool
    ):
        """Raises an InvalidDataInputError if X does not match the expectation

        :param X: the input array/dataframe to validate against the fitted forest.
        :param df_extras_allowed: (pandas only) when True, X may contain extra
            columns in addition to the model's columns; missing columns are
            never allowed.
        """
        if self.forest.is_nparray:
            if not isinstance(X, np.ndarray):
                raise InvalidDataInputError(
                    "Input data must be an numpy array. Found: %s" % type(X))

            if X.shape[1] != self.forest.nb_vars:  # or X.shape[0] != X.shape[1]:
                raise InvalidDataInputError(
                    "Input numpy array must have %s columns. Found %s columns" % (self.forest.nb_vars, X.shape[1]))
        else:
            if not isinstance(X, pd.DataFrame):
                raise InvalidDataInputError(
                    "Input data must be a pandas DataFrame. Found: %s" % type(X))

            actual = set(X.columns)
            expected = set(self.forest.varnames)
            if actual != expected:
                missing = expected - actual
                # extra columns are only tolerated when explicitly allowed
                # and nothing is missing
                if missing or not df_extras_allowed:
                    extra = actual - expected
                    raise InvalidDataInputError(
                        "Input pandas DataFrame must have column names matching the ones in the model. "
                        "Missing: %s. Extra: %s " % (missing, extra)
                    )

    def fit(
        self,
        X  # type: Union[np.ndarray, pd.DataFrame]
    ):
        """Fits the maps able to predict determined features from others

        For every (parent, child) arc in the quasi-determinism forest, learns a
        {parent level -> most common child level} mapping, stored in the nested
        `self._maps` dict as a `ParentChildMapping`.
        """
        forest = self.forest

        # Validate the input
        self.assert_valid_input(X, df_extras_allowed=False)

        # we will create a sparse coordinate representation of maps
        n = forest.nb_vars

        if forest.is_nparray:
            assert isinstance(X, np.ndarray)

            # detect numpy structured arrays
            is_struct_array = X.dtype.names is not None
            if is_struct_array:
                raise NotImplementedError("structured numpy arrays are not supported. Please convert your array to an "
                                          "unstructured array")
            else:
                assert X.shape[1] == n

            self._maps = maps = dict()

            for parent, child in forest.get_arcs():
                # create a dictionary mapping each parent level to the most
                # frequent child level, using a pandas groupby (simpler and
                # faster than iterating over np.unique levels)
                pc_df = pd.DataFrame(X[:, (parent, child)], columns=["parent", "child"])
                levels_mapping_df = pc_df.groupby(by="parent").agg(_get_most_common_value)

                # Init the dict for parent if it does not exist
                maps.setdefault(parent, dict())

                # Fill the parent-child item with the mapping object
                maps[parent][child] = ParentChildMapping(levels_mapping_df.iloc[:, 0].to_dict())

        else:
            assert isinstance(X, pd.DataFrame)

            # unfortunately pandas dataframe sparse do not allow item assignment :( so we need to work on numpy array
            # first get the numpy array in correct order
            varnames = forest.varnames
            X_ar = X.loc[:, varnames].values

            self._maps = maps = dict()

            for parent, child in forest.get_arcs(names=False):
                # same groupby-based learning as in the numpy branch above
                pc_df = pd.DataFrame(X_ar[:, (parent, child)], columns=["parent", "child"])
                levels_mapping_df = pc_df.groupby("parent").agg(_get_most_common_value)

                # Init the dict for parent if it does not exist
                maps.setdefault(parent, dict())

                # Fill the parent-child item with the mapping object
                maps[parent][child] = ParentChildMapping(levels_mapping_df.iloc[:, 0].to_dict())

    def remove_qd(self,
                  X,  # type: Union[np.ndarray, pd.DataFrame]
                  inplace=False  # type: bool
                  ):
        # type: (...) -> Optional[Union[np.ndarray, pd.DataFrame]]
        """
        Removes from X the features that can be (quasi-)determined from the others
        This returns a copy by default, except if `inplace=True`

        :param X: a numpy array or pandas DataFrame consistent with the fitted model.
        :param inplace: if this is set to True the columns are removed from X in
            place and None is returned. Only supported for pandas inputs: numpy
            arrays cannot have columns removed in place.
        :return: X restricted to the root features, or None if `inplace=True`.
        """
        forest = self.forest

        self.assert_valid_input(X, df_extras_allowed=True)

        is_x_nparray = isinstance(X, np.ndarray)
        assert is_x_nparray == forest.is_nparray

        if is_x_nparray:
            is_structured = X.dtype.names is not None
            if is_structured:
                raise NotImplementedError("structured numpy arrays are not supported. Please convert your array to an "
                                          "unstructured array")
            if inplace:
                # `np.delete` always returns a copy: numpy arrays cannot have
                # columns removed in place (the previous call here was a no-op)
                raise ValueError("inplace removal is not supported for numpy arrays, use inplace=False")
            else:
                # for structured: return X[np.array(X.dtype.names)[forest.roots_mask_ar]]
                return X[:, forest.roots_mask_ar]
        else:
            # pandas dataframe
            if inplace:
                # drop the determined (non-root) columns, keeping the roots -
                # mirror of the non-inplace branch below.
                # assumes `forest.roots` iterates over root column names, as
                # its previous use as a DataFrame column key implies
                non_roots = [c for c in X.columns if c not in set(forest.roots)]
                X.drop(non_roots, axis=1, inplace=True)
            else:
                return X.loc[:, forest.roots_mask]

    def predict_qd(self,
                   X,  # type: Union[np.ndarray, pd.DataFrame]
                   inplace=False  # type: bool
                   ):
        # type: (...) -> Optional[Union[np.ndarray, pd.DataFrame]]
        """
        Adds columns to X corresponding to the features that can be determined from the roots.
        By default a copy of X with the additional columns is returned.

        :param X: the array/dataframe of root features (or, if `inplace=True`
            with a numpy input, an array with all the model's columns).
        :param inplace: if `True` and X is a dataframe, predicted columns will be added inplace. Note that the order
            may differ from the initial training data.
        :return: X with the predicted columns filled in, or None if `inplace=True`.
        """
        forest = self.forest

        # if inplace is None:
        #     inplace = not self.is_nparray

        is_x_nparray = isinstance(X, np.ndarray)
        assert is_x_nparray == forest.is_nparray

        if is_x_nparray:
            is_structured = X.dtype.names is not None
            if is_structured:
                raise NotImplementedError("structured numpy arrays are not supported. Please convert your array to an "
                                          "unstructured array")
            if not inplace:
                # Same as in sklearn inverse_transform: create the missing columns in X first
                X_in = X
                support = forest.roots_mask_ar
                # X = check_array(X, dtype=None)
                nbcols_received = X_in.shape[1]
                if support.sum() != nbcols_received:
                    raise ValueError("X has a different nb columns than the number of roots found during fitting.")
                # if a single column, make sure this is 2d
                if X_in.ndim == 1:
                    X_in = X_in[None, :]
                # create a copy with the extra columns
                X = np.zeros((X_in.shape[0], support.size), dtype=X_in.dtype)
                X[:, support] = X_in
            else:
                if X.shape[1] != forest.nb_vars:
                    raise ValueError("If `inplace=True`, `predict` expects an X input with the correct number of "
                                     "columns. Use `inplace=False` to pass only the array of roots. Note that this "
                                     "is the default behaviour of inplace.")

            # walk the tree from the roots, filling each child column from its parent
            for _, parent, child in forest.walk_arcs():
                X[:, child] = self._maps[parent][child].predict_child_from_parent_ar(X[:, parent])
        else:
            if not inplace:
                X = X.copy()

            # walk the tree from the roots, filling each child column from its parent
            varnames = forest.varnames
            for _, parent, child in forest.walk_arcs(names=False):
                X.loc[:, varnames[child]] = self._maps[parent][child].predict_child_from_parent(X.loc[:, varnames[parent]])

        if not inplace:
            return X