qdscreen/selector.py

import numpy as np
import pandas as pd

try:
    from typing import Union, Optional, Dict, Any
except ImportError:
    pass

from .main import QDForest


class InvalidDataInputError(ValueError):
    """Raised when input data is invalid"""


def _get_most_common_value(x):
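    """
    Return the most frequent value of the pandas Series `x`, ignoring NaNs.

    Returns `np.nan` if `x` contains no non-NaN value; when several values are tied,
    the first value returned by `Series.mode` is used.
    """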
    # From https://stackoverflow.com/a/47778607/7262247
    # `scipy.stats.mode` is the most robust to the various pitfalls (nans, ...)
    # but it is being deprecated for this usage, so we rely on `pandas.Series.mode` instead
    # return scipy_mode(x, nan_policy=None)[0][0]
    res = x.mode(dropna=True)
    if len(res) == 0:
        return np.nan
    else:
        return res[0]


class ParentChildMapping:
    __slots__ = ('_mapping_dct', '_otypes')

    def __init__(
        self,
        mapping_dct  # type: Dict
    ):
        self._mapping_dct = mapping_dct
        # Find the correct otype to use in the vectorized operation
        # (convert the dict values view to a list so that numpy detects the actual dtype)
        self._otypes = [np.array(list(mapping_dct.values())).dtype]

    def predict_child_from_parent_ar(
        self,
        parent_values  # type: np.ndarray
    ):
        """For numpy arrays"""
        # apply the learned map efficiently, see https://stackoverflow.com/q/16992713/7262247
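        # e.g. with mapping {1: 10, 2: 20}, an input np.array([1, 2, 2]) yields np.array([10, 20, 20])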
        return np.vectorize(self._mapping_dct.__getitem__, otypes=self._otypes)(parent_values)

    def predict_child_from_parent(
        self,
        parent_values  # type: pd.Series
    ):
        """For pandas Series"""
        # See https://stackoverflow.com/questions/47930052/pandas-vectorized-lookup-of-dictionary
        return parent_values.map(self._mapping_dct)


class QDSelectorModel(object):
    """
    A quasi-determinism feature selection model that can be

    - fit from a dataset using <model>.fit(X)
    - used to select only the relevant (root) features using <model>.remove_qd(X)
    - used to predict the other columns from the relevant (root) ones using <model>.predict_qd(X)

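    Illustrative usage sketch (assuming a `QDForest` instance `qd_forest` has already been
    built from the data by qdscreen's screening step; variable names below are illustrative):

        model = QDSelectorModel(qd_forest)
        model.fit(X)                             # learn the parent -> child value maps
        X_roots = model.remove_qd(X)             # keep only the root (non-determined) features
        X_restored = model.predict_qd(X_roots)   # re-create the determined features from the roots
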
65 """
  • E261 At least two spaces before inline comment
66 __slots__ = ('forest', # the QDForest
67 '_maps' # a nested dict {parent: {child: mapping_dct with index in the order of self.varnames}}
68 # note: scipy.sparse now raises an error with dtype=object
69 )
70  
71 def __init__(self,
72 qd_forest # type: QDForest
73 ):
74 self.forest = qd_forest
75 self._maps = None # type: Optional[Dict[Any, Dict[Any, Dict]]]

    def assert_valid_input(
        self,
        X,  # type: Union[np.ndarray, pd.DataFrame]
        df_extras_allowed=False  # type: bool
    ):
        """Raises an InvalidDataInputError if X does not match the expectation"""

        if self.forest.is_nparray:
            if not isinstance(X, np.ndarray):
                raise InvalidDataInputError(
                    "Input data must be a numpy array. Found: %s" % type(X))

            if X.shape[1] != self.forest.nb_vars:  # or X.shape[0] != X.shape[1]:
                raise InvalidDataInputError(
                    "Input numpy array must have %s columns. Found %s columns" % (self.forest.nb_vars, X.shape[1]))
        else:
            if not isinstance(X, pd.DataFrame):
                raise InvalidDataInputError(
                    "Input data must be a pandas DataFrame. Found: %s" % type(X))

            actual = set(X.columns)
            expected = set(self.forest.varnames)
            if actual != expected:
                missing = expected - actual
                if missing or not df_extras_allowed:
                    extra = actual - expected
                    raise InvalidDataInputError(
                        "Input pandas DataFrame must have column names matching the ones in the model. "
                        "Missing: %s. Extra: %s " % (missing, extra)
                    )

    def fit(
        self,
        X  # type: Union[np.ndarray, pd.DataFrame]
    ):
        """Fits the maps able to predict determined features from others"""
        forest = self.forest

        # Validate the input
        self.assert_valid_input(X, df_extras_allowed=False)

        # we will create a sparse coordinate representation of maps
        n = forest.nb_vars

        if forest.is_nparray:
            assert isinstance(X, np.ndarray)

            # detect numpy structured arrays
            is_struct_array = X.dtype.names is not None
            if is_struct_array:
                # names = X.dtype.names
                # assert len(names) == n
                raise NotImplementedError("structured numpy arrays are not supported. Please convert your array to an "
                                          "unstructured array")
            else:
                assert X.shape[1] == n

            self._maps = maps = dict()
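            # maps[parent][child] will hold the ParentChildMapping used to predict column `child`
            # from column `parent` (in this numpy branch, parent and child are column indices)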

            for parent, child in forest.get_arcs():
                # assert (parent, child) not in maps, "Error: edge already exists"

                # create a dictionary mapping each parent level to the most frequent child level
                #
                # -- seems suboptimal with numpy...
                # map_dct = dict()
                # for parent_lev in np.unique(X[:, parent]):
                #     values, counts = np.unique(X[X[:, parent] == parent_lev, child], return_counts=True)
                #     map_dct[parent_lev] = values[np.argmax(counts)]
                #
                # -- same with pandas groupby
                # if is_struct_array:
                #     pc_df = pd.DataFrame(X[[names[parent], names[child]]])
                #     pc_df.columns = [0, 1]  # forget the names
                # else:
                pc_df = pd.DataFrame(X[:, (parent, child)], columns=["parent", "child"])
                levels_mapping_df = pc_df.groupby(by="parent").agg(_get_most_common_value)
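                # levels_mapping_df has one row per observed parent level; its single column holds
                # the most frequent child level seen together with that parent level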

                # Init the dict for parent if it does not exist
                maps.setdefault(parent, dict())

                # Fill the parent-child item with the mapping object
                maps[parent][child] = ParentChildMapping(levels_mapping_df.iloc[:, 0].to_dict())

        else:
            assert isinstance(X, pd.DataFrame)

            # unfortunately sparse pandas dataframes do not allow item assignment :( so we need to work on a numpy
            # array. First get the numpy array in the correct column order
            varnames = forest.varnames
            X_ar = X.loc[:, varnames].values

            self._maps = maps = dict()

            for parent, child in forest.get_arcs(names=False):
                # assert (parent, child) not in maps, "Error: edge already exists"

                # levels_mapping_df = X.loc[:, (parent, child)].groupby(parent).agg(lambda x: x.value_counts().index[0])
                # maps[parent, child] = levels_mapping_df[child].to_dict()
                pc_df = pd.DataFrame(X_ar[:, (parent, child)], columns=["parent", "child"])
                levels_mapping_df = pc_df.groupby("parent").agg(_get_most_common_value)

                # Init the dict for parent if it does not exist
                maps.setdefault(parent, dict())

                # Fill the parent-child item with the mapping object
                maps[parent][child] = ParentChildMapping(levels_mapping_df.iloc[:, 0].to_dict())

    def remove_qd(self,
                  X,  # type: Union[np.ndarray, pd.DataFrame]
                  inplace=False  # type: bool
                  ):
        # type: (...) -> Optional[Union[np.ndarray, pd.DataFrame]]
        """
        Removes from X the features that can be (quasi-)determined from the others.
        This returns a copy by default, except if `inplace=True`.

        :param X: the dataset, a numpy array or pandas DataFrame consistent with the fitted forest
        :param inplace: if this is set to True, the removal is done in place on X and nothing is returned
        :return: the dataset restricted to the root features, or None if `inplace=True`
        """
        forest = self.forest

        self.assert_valid_input(X, df_extras_allowed=True)

        is_x_nparray = isinstance(X, np.ndarray)
        assert is_x_nparray == forest.is_nparray

        if is_x_nparray:
            is_structured = X.dtype.names is not None
            if is_structured:
                raise NotImplementedError("structured numpy arrays are not supported. Please convert your array to an "
                                          "unstructured array")
            if inplace:
                np.delete(X, forest.roots_mask_ar, axis=1)
            else:
                # for structured: return X[np.array(X.dtype.names)[forest.roots_mask_ar]]
                return X[:, forest.roots_mask_ar]
        else:
            # pandas dataframe
            if inplace:
                del X[forest.roots]
            else:
                return X.loc[:, forest.roots_mask]

    def predict_qd(self,
                   X,  # type: Union[np.ndarray, pd.DataFrame]
                   inplace=False  # type: bool
                   ):
        # type: (...) -> Optional[Union[np.ndarray, pd.DataFrame]]
        """
        Adds columns to X corresponding to the features that can be determined from the roots.
        By default (`inplace=False`) a new dataset containing all columns is returned.

        :param X: the input dataset (numpy array or pandas DataFrame, consistent with the fitted forest)
        :param inplace: if `True` and X is a dataframe, predicted columns will be added inplace. Note that the column
            order may differ from the one of the initial training data. If `False` (default), a copy containing the
            predicted columns is returned.
        :return: the dataset with the determined columns added, or None if `inplace=True`
        """
        forest = self.forest

        # if inplace is None:
        #     inplace = not self.is_nparray

        is_x_nparray = isinstance(X, np.ndarray)
        assert is_x_nparray == forest.is_nparray

        if is_x_nparray:
            is_structured = X.dtype.names is not None
            if is_structured:
                raise NotImplementedError("structured numpy arrays are not supported. Please convert your array to an "
                                          "unstructured array")
            if not inplace:
                # Same as in sklearn inverse_transform: create the missing columns in X first
                X_in = X
                support = forest.roots_mask_ar
                # X = check_array(X, dtype=None)
                nbcols_received = X_in.shape[1]
                if support.sum() != nbcols_received:
                    raise ValueError("X has a different number of columns than the number of roots found during "
                                     "fitting.")
                # if a single column, make sure this is 2d
                if X_in.ndim == 1:
                    X_in = X_in[None, :]
                # create a copy with the extra columns
                X = np.zeros((X_in.shape[0], support.size), dtype=X_in.dtype)
                X[:, support] = X_in
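                # the non-root columns are still empty (zeros) here; they are filled below by
                # walking the forest arcs from the roots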
            else:
                if X.shape[1] != forest.nb_vars:
                    raise ValueError("If `inplace=True`, `predict` expects an X input with the correct number of "
                                     "columns. Use `inplace=False` to pass only the array of roots. Note that this "
                                     "is the default behaviour of inplace.")

            # walk the tree from the roots
            for _, parent, child in forest.walk_arcs():
                X[:, child] = self._maps[parent][child].predict_child_from_parent_ar(X[:, parent])
        else:
            if not inplace:
                X = X.copy()

            # walk the tree from the roots
            varnames = forest.varnames
            for _, parent, child in forest.walk_arcs(names=False):
                X.loc[:, varnames[child]] = self._maps[parent][child].predict_child_from_parent(
                    X.loc[:, varnames[parent]]
                )

        if not inplace:
            return X