# -*- coding: utf-8 -*-
# The encoding declaration above is needed to allow non-ascii characters in this file (anywhere, even in comments).
# from __future__ import unicode_literals  # no, since we want to match the return type of str() which is bytes in py2
import sys

import numpy as np
import pandas as pd
import pytest

from qdscreen import qd_screen


try:
    from pathlib import Path
except ImportError:
    # Fallback for interpreters that do not ship the stdlib ``pathlib`` module.
    from pathlib2 import Path  # noqa: F401


# True when running under a Python 2 interpreter.
PY2 = sys.version_info < (3,)
20
21
def df_mix1():
    """Build the small categorical dataset reused by the tests of this module.

    :return: a new 10-row ``pandas.DataFrame`` with six string columns ``U, V, W, X, Y, Z``.
    """
    df = pd.DataFrame({
        'U': ["a", "b", "d", "a", "b", "c", "a", "b", "d", "c"],
        'V': ["a", "b", "c", "a", "b", "c", "a", "b", "c", "c"],
        'W': ["a", "b", "b", "a", "b", "b", "a", "b", "b", "b"],
        'X': ["a", "a", "b", "b", "a", "c", "c", "a", "b", "c"],
        'Y': ["b", "b", "c", "c", "b", "a", "a", "b", "c", "a"],
        'Z': ["a", "a", "b", "a", "a", "b", "b", "a", "a", "b"]  # if the 3d element were an "a", X->Z and Y->Z
    })

    # In this data the following relationships hold:
    #   H(W|V) = 0   H(W|U) = 0   H(V|U) = 0      # U -> V -> W
    #   H(Y|X) = H(X|Y) = 0                       # Y <-> X but X is selected as representative as it appears first
    #   H(Z|Y) = H(Z|X) = 0.275  AND  H(Z|Y) / H(Z) = 0.284
    #       # X -> Z only once the quasi-determinism threshold is above these values
    #       # (the tests in this module use absolute_eps=0.28 and relative_eps=0.29)
    return df
38
39
def test_encoding(capsys):
    """A small test to be sure we can compare the tree text characters correctly"""
    # NOTE(review): this print was restored; without it nothing is captured and the first
    # assertion below cannot hold. Confirm it matches the statement used upstream.
    print("└─")
    captured = capsys.readouterr()
    with capsys.disabled():
        # here this is a unicode match
        assert captured.out == u"└─\n"

        # here this is a bytes match
        from .encoding_ref_help import Foo
        assert "└─ab\n" == str(Foo())
        assert "└─ab\n" == repr(Foo())
52
53
def test_readme_simple(capsys):
    """The exact scenario shown in the readme"""

    df = df_mix1()

    # detect strict deterministic relationships
    qd_forest = qd_screen(df)
    # NOTE(review): this print was restored; the assertion below compares the captured
    # output with the textual description of the forest, so something has to print it.
    print(qd_forest)
    captured = capsys.readouterr()
    with capsys.disabled():
        # note the u for unicode string in py2, see test_encoding
        # NOTE(review): the leading whitespace inside the expected text (" - " bullets and the
        # 3-space indent of nested nodes) was reconstructed - confirm against the real output.
        assert "\n" + captured.out == u"""
QDForest (6 vars):
 - 3 roots (1+2*): U*, X*, Z
 - 3 other nodes: V, W, Y

U
└─ V
   └─ W

X
└─ Y

"""
    print("Columns in df: %s" % list(df.columns))

    # fit a feature selection model
    feat_selector = qd_forest.fit_selector_model(df)

    # use it to filter...
    only_important_features = feat_selector.remove_qd(df)
    print("Columns in only_important_features: %s" % list(only_important_features.columns))

    # or to restore/predict
    restored_full_df = feat_selector.predict_qd(only_important_features)
    print("Columns in restored_full_df: %s" % list(restored_full_df.columns))

    # note that the order of columns differs from the original one, hence the re-indexing
    pd.testing.assert_frame_equal(df, restored_full_df[df.columns])

    captured = capsys.readouterr()
    with capsys.disabled():
        assert "\n" + captured.out == """
Columns in df: ['U', 'V', 'W', 'X', 'Y', 'Z']
Columns in only_important_features: ['U', 'X', 'Z']
Columns in restored_full_df: ['U', 'X', 'Z', 'V', 'W', 'Y']
"""
101
102
def test_readme_quasi(capsys):
    """Check the printed statistics, the quasi-deterministic forest and the entropies table."""
    df = df_mix1()

    # keep stats at the end of the screening
    qd_forest = qd_screen(df, keep_stats=True)
    print(qd_forest.stats)

    captured = capsys.readouterr()
    with capsys.disabled():
        # print(captured.out)
        # NOTE(review): column alignment inside the expected text was reconstructed following
        # the usual pandas fixed-width layout - confirm against the real output.
        assert "\n" + captured.out == """
Statistics computed for dataset:
   U  V  W  X  Y  Z
0  a  a  a  a  b  a
1  b  b  b  a  b  a
...(10 rows)

Entropies (H):
U    1.970951
V    1.570951
W    0.881291
X    1.570951
Y    1.570951
Z    0.970951
dtype: float64

Conditional entropies (Hcond = H(row|col)):
          U         V         W         X         Y         Z
U  0.000000  0.400000  1.089660  0.875489  0.875489  1.475489
V  0.000000  0.000000  0.689660  0.875489  0.875489  1.200000
W  0.000000  0.000000  0.000000  0.875489  0.875489  0.875489
X  0.475489  0.875489  1.565148  0.000000  0.000000  0.875489
Y  0.475489  0.875489  1.565148  0.000000  0.000000  0.875489
Z  0.475489  0.600000  0.965148  0.275489  0.275489  0.000000

Relative conditional entropies (Hcond_rel = H(row|col)/H(row)):
          U         V         W         X         Y         Z
U  0.000000  0.202948  0.552860  0.444196  0.444196  0.748618
V  0.000000  0.000000  0.439008  0.557299  0.557299  0.763869
W  0.000000  0.000000  0.000000  0.993416  0.993416  0.993416
X  0.302676  0.557299  0.996307  0.000000  0.000000  0.557299
Y  0.302676  0.557299  0.996307  0.000000  0.000000  0.557299
Z  0.489715  0.617951  0.994024  0.283731  0.283731  0.000000

"""
    qd_forest2 = qd_screen(df, relative_eps=0.29)
    # NOTE(review): this print was restored; the assertion below compares the captured
    # output with the textual description of the second forest.
    print(qd_forest2)
    captured = capsys.readouterr()
    with capsys.disabled():
        # print(captured.out)
        # note the u for unicode string in py2, see test_encoding
        # NOTE(review): leading whitespace of the bullets / nested nodes was reconstructed.
        assert "\n" + captured.out == u"""
QDForest (6 vars):
 - 2 roots (0+2*): U*, X*
 - 4 other nodes: V, W, Y, Z

U
└─ V
   └─ W

X
└─ Y
└─ Z

"""
    ce_df = qd_forest.get_entropies_table(from_to=False, sort_by="rel_cond_entropy")
    print(ce_df.head(10))
    captured = capsys.readouterr()
    with capsys.disabled():
        # print(captured.out)
        # The expected table is built line by line so that the trailing spaces of the
        # index-name row cannot be lost by an editor stripping trailing whitespace.
        # NOTE(review): the index-name row ("arc", padded to the table width) is presumed:
        # one row is missing between the header and the first data row - confirm.
        expected_rows = [
            "      cond_entropy  rel_cond_entropy",
            "arc".ljust(36),
            "U->V      0.000000          0.000000",
            "U->W      0.000000          0.000000",
            "V->W      0.000000          0.000000",
            "Y->X      0.000000          0.000000",
            "X->Y      0.000000          0.000000",
            "V->U      0.400000          0.202948",
            "Y->Z      0.275489          0.283731",
            "X->Z      0.275489          0.283731",
            "U->X      0.475489          0.302676",
            "U->Y      0.475489          0.302676",
        ]
        assert captured.out == "\n".join(expected_rows) + "\n"
    if not PY2:
        # we have issues on travis CI with matplotlib on PY2: skip
        import matplotlib.pyplot as plt
        qd_forest.plot_increasing_entropies()
        # fig = plt.gcf()
        # fig.savefig(str(IMGS_FOLDER / "increasing_entropies.png"))
        plt.close("all")
194
195
@pytest.mark.parametrize("typ", ['int', 'str', 'mixed'])
def test_readme_skl(typ):
    """Check ``QDScreen.fit_transform`` on a tiny 3x4 array, for integer and string contents.

    :param typ: kind of array contents to test; the 'mixed' case is skipped.
    """
    from qdscreen.sklearn import QDScreen

    if typ in ('int', 'str'):
        X = [[0, 2, 0, 3],
             [0, 1, 4, 3],
             [0, 1, 1, 3]]
        if typ == 'str':
            X = np.array(X).astype(str)

    elif typ == 'mixed':
        # X = [['0', 2, '0', 3],
        #      ['0', 1, '4', 3],
        #      ['0', 1, '1', 3]]
        # X = np.array(X)
        # assert X.dtype == '<U1'  # str
        pytest.skip("Mixed dtypes do not exist in numpy unstructured arrays")
    else:
        # report the offending value, as test_readme does for its own parameter
        raise ValueError(typ)

    selector = QDScreen()
    X2 = selector.fit_transform(X)
    # only the third column is expected to be kept
    expected_res = [[0], [4], [1]] if typ == 'int' else [['0'], ['4'], ['1']]
    np.testing.assert_array_equal(X2, np.array(expected_res))
221
222
@pytest.mark.parametrize("input_type", ["pandas",
                                        "numpy_unstructured",
                                        # "numpy_structured", "numpy_recarray"
                                        ])
def test_readme(input_type):
    """All variants that exist around the nominal scenario from the readme

    :param input_type: container type used to feed the dataset to the library under test.
    """

    if input_type == "pandas":
        df_orig = data = df_mix1()
        var_names = list(data.columns)
        data_ar = data.values
    elif input_type == "numpy_unstructured":
        df_orig = df_mix1()
        var_names = list(df_orig.columns)
        data = df_orig.to_numpy(copy=True)  # to_numpy(df_orig, copy=True)
        data_ar = data
    # elif input_type == "numpy_structured":
    #     df_orig = df_mix1()
    #     var_names = list(df_orig.columns)
    #     data = df_orig.to_records()
    #     data = data.view(data.dtype.fields or data.dtype, np.ndarray)[var_names].copy()
    #     data_ar = data
    # elif input_type == "numpy_recarray":
    #     df_orig = df_mix1()
    #     var_names = list(df_orig.columns)
    #     data = df_orig.to_records()[var_names].copy()
    #     data_ar = data
    #     data.dtype.names = tuple(range(len(var_names)))
    else:
        raise ValueError(input_type)

    # ref roots arrays
    data_roots_df = df_orig[['U', 'X', 'Z']]
    data_roots_ar = data_roots_df.values
    data_roots_quasi_df = df_orig[['U', 'X']]
    data_roots_quasi_ar = data_roots_quasi_df.values

    nb_vars = len(var_names)

    def _adjmat_as_frame(forest):
        """Return the adjacency matrix of `forest` as a DataFrame labelled with the variable names."""
        adj = forest.adjmat
        if input_type != "pandas":
            adj = pd.DataFrame(adj, columns=var_names, index=var_names)
        return adj

    # check the stats, for debug
    # df_stats = Entropies(data)

    # Sklearn use case
    from qdscreen.sklearn import QDScreen
    if input_type not in ("numpy_structured", "numpy_recarray"):
        # --strict
        sel = QDScreen()
        sel_data = sel.fit_transform(data_ar)
        np.testing.assert_array_equal(sel_data, data_roots_ar)
        data2 = sel.inverse_transform(sel_data)
        np.testing.assert_array_equal(data2, data_ar)

        # --quasi, first with an absolute then with a relative threshold - Z should become a child of X
        for eps_kwargs in ({"absolute_eps": 0.28}, {"relative_eps": 0.29}):
            sel = QDScreen(**eps_kwargs)
            sel_data = sel.fit_transform(data_ar)
            np.testing.assert_array_equal(sel_data, data_roots_quasi_ar)
            data2 = sel.inverse_transform(sel_data)
            # the prediction is incorrect since this is "quasi". Fix the single element that differs from input
            assert data2[2, 5] == "a"
            data2[2, 5] = "b"
            np.testing.assert_array_equal(data2, data_ar)

    # qdscreen full use case
    # (1) strict
    # -- (a) forest
    ref = pd.DataFrame(data=np.zeros((nb_vars, nb_vars), dtype=bool), index=var_names, columns=var_names)
    ref.loc['U', 'V'] = True
    ref.loc['V', 'W'] = True
    ref.loc['X', 'Y'] = True
    qd_forest = qd_screen(data)
    pd.testing.assert_frame_equal(_adjmat_as_frame(qd_forest), ref)
    # -- (b) feature selector
    if input_type == "pandas":
        # Note: this should be also tested with numpy structured & records input type
        # we can test that randomized column order still works
        new_cols_order = np.array(data.columns)
        np.random.shuffle(new_cols_order)
        new_roots_order = [c for c in new_cols_order if c in data_roots_df.columns]
        data_randcol = data[new_cols_order].copy()
    else:
        # numpy: index has meaning, no randomization possible
        data_randcol = data
    selector_model = qd_forest.fit_selector_model(data_randcol)
    sel2 = selector_model.remove_qd(data_randcol)
    if input_type == "pandas":
        pd.testing.assert_frame_equal(sel2, data_roots_df[new_roots_order])
    # elif input_type == "numpy_structured":
    #     sel2u = np.array(sel2.tolist(), dtype=object)
    #     np.testing.assert_array_equal(sel2u, data_roots_ar)
    else:
        np.testing.assert_array_equal(sel2, data_roots_ar)
    data2 = selector_model.predict_qd(sel2)
    if input_type == "pandas":
        pd.testing.assert_frame_equal(data2[data.columns], data)
    # elif input_type == "numpy_structured":
    #     np.testing.assert_array_equal(data2, data_ar)
    else:
        np.testing.assert_array_equal(data2, data_ar)

    # (2) quasi
    # -- (a) forest
    ref.loc['X', 'Z'] = True
    # Z if qd threshold > 0.28 (absolute) or 0.29 (relative)
    qd_forest = qd_screen(data, absolute_eps=0.28)
    pd.testing.assert_frame_equal(_adjmat_as_frame(qd_forest), ref)

    qd_forest = qd_screen(data, relative_eps=0.29)
    adj_mat = qd_forest.adjmat
    if input_type != "pandas":
        assert isinstance(qd_forest.parents, np.ndarray)
        assert isinstance(adj_mat, np.ndarray)
        adj_mat = pd.DataFrame(adj_mat, columns=var_names, index=var_names)
    else:
        assert isinstance(qd_forest.parents, pd.DataFrame)
        assert list(qd_forest.parents.index) == var_names
    pd.testing.assert_frame_equal(adj_mat, ref)

    # -- (b) feature selector

    selector_model = qd_forest.fit_selector_model(data_randcol)
    sel2 = selector_model.remove_qd(data_randcol)
    if input_type == "pandas":
        new_roots_order_quasi = [r for r in new_roots_order if r != 'Z']
        pd.testing.assert_frame_equal(sel2, data_roots_df[new_roots_order_quasi])
    # elif input_type == "numpy_structured":
    #     sel2u = np.array(sel2.tolist(), dtype=object)
    #     np.testing.assert_array_equal(sel2u, data_roots_ar)
    else:
        np.testing.assert_array_equal(sel2, data_roots_quasi_ar)
    data2 = selector_model.predict_qd(sel2)
    if input_type == "pandas":
        # the prediction is incorrect since this is "quasi". Fix the single element that differs from input
        assert data2.loc[2, "Z"] == "a"
        data2.loc[2, "Z"] = "b"
        pd.testing.assert_frame_equal(data2[data.columns], data)
    # elif input_type == "numpy_structured":
    #     np.testing.assert_array_equal(data2, data_ar)
    else:
        # the prediction is incorrect since this is "quasi". Fix the single element that differs from input
        assert data2[2, 5] == "a"
        data2[2, 5] = "b"
        np.testing.assert_array_equal(data2, data_ar)
379 np.testing.assert_array_equal(data2, data_ar)