Coverage for qdscreen/tests/test_readme.py: 97%
162 statements
« prev ^ index » next coverage.py v7.2.2, created at 2023-03-17 11:02 +0000
« prev ^ index » next coverage.py v7.2.2, created at 2023-03-17 11:02 +0000
1# -*- coding: utf-8 -*-
2# the above encoding declaration is needed to have non-ascii characters in this file (anywhere even in comments)
3# from __future__ import unicode_literals # no, since we want to match the return type of str() which is bytes in py2
4import sys
6import numpy as np
7import pandas as pd
8import pytest
10from qdscreen import qd_screen
13try:
14 from pathlib import Path
15except ImportError:
16 from pathlib2 import Path
19PY2 = sys.version_info < (3,)
def df_mix1():
    """Build the small categorical dataset shared by the tests of this module.

    A fresh ``pandas.DataFrame`` is created on every call: 10 rows, six
    string-valued columns named ``U``, ``V``, ``W``, ``X``, ``Y`` and ``Z``.
    """
    # One string per column, one character per row.
    column_values = {
        'U': "abdabcabdc",
        'V': "abcabcabcc",
        'W': "abbabbabbb",
        'X': "aabbaccabc",
        'Y': "bbccbaabca",
        'Z': "aabaabbaab",  # if the 3rd element were an "a", we would have X->Z and Y->Z
    }
    frame = pd.DataFrame({name: list(chars) for name, chars in column_values.items()})

    # Relationships holding in this data:
    #   H(W|V) = 0, H(W|U) = 0, H(V|U) = 0     # U -> V -> W
    #   H(Y|X) = H(X|Y) = 0                    # Y <-> X, X is kept as representative since it appears first
    #   H(Z|Y) = H(Z|X) = 0.275 and H(Z|Y) / H(Z) = 0.284
    #       # X -> Z only shows up with a quasi-determinism tolerance above these values
    #       # (the tests in this module use absolute_eps=0.28 / relative_eps=0.29)
    return frame
def test_encoding(capsys):
    """Sanity check that the tree-drawing characters can be compared reliably.

    The same characters are checked twice: once as captured standard output
    (compared against a unicode literal) and once as the ``str()``/``repr()``
    of a helper object (compared against a native string literal).
    """
    print("└─")
    printed = capsys.readouterr().out
    with capsys.disabled():
        # unicode comparison: captured output against a u"" literal
        assert printed == u"└─\n"

        # native-string comparison (str() returns bytes on python 2)
        from .encoding_ref_help import Foo
        for render in (str, repr):
            assert "└─ab\n" == render(Foo())
54def test_readme_simple(capsys):
55 """The exact scenario showed in the readme"""
 # Flow: screen the shared df_mix1() dataset, compare the printed forest with the expected text,
 # then fit a selector model and check that it can drop and restore the dependent columns.
57 df = df_mix1()
59 # detect strict deterministic relationships
60 qd_forest = qd_screen(df)
61 print(qd_forest)
 # a "\n" is prepended to the captured output below so that the expected literal can start on its own line
62 captured = capsys.readouterr()
63 with capsys.disabled():
64 # note the u for unicode string in py2, see test_encoding
65 assert "\n" + captured.out == u"""
66QDForest (6 vars):
67 - 3 roots (1+2*): U*, X*, Z
68 - 3 other nodes: V, W, Y
70U
71└─ V
72 └─ W
74X
75└─ Y
77"""
78 print("Columns in df: %s" % list(df.columns))
80 # fit a feature selection model
81 feat_selector = qd_forest.fit_selector_model(df)
83 # use it to filter...
84 only_important_features = feat_selector.remove_qd(df)
85 print("Columns in only_important_features: %s" % list(only_important_features.columns))
87 # or to restore/predict
88 restored_full_df = feat_selector.predict_qd(only_important_features)
89 print("Columns in restored_full_df: %s" % list(restored_full_df.columns))
91 # note that the order of columns differs frmo origin
 # hence the restored frame is re-indexed with df.columns before the comparison
92 pd.testing.assert_frame_equal(df, restored_full_df[df.columns])
 # the three print() calls above are checked together against the expected text
94 captured = capsys.readouterr()
95 with capsys.disabled():
96 assert "\n" + captured.out == """
97Columns in df: ['U', 'V', 'W', 'X', 'Y', 'Z']
98Columns in only_important_features: ['U', 'X', 'Z']
99Columns in restored_full_df: ['U', 'X', 'Z', 'V', 'W', 'Y']
100"""
103def test_readme_quasi(capsys):
 # Quasi-deterministic scenario on the shared df_mix1() dataset: print and check the screening
 # statistics, screen again with a relative tolerance, print and check the entropies table,
 # and finally call the entropy plot (python 3 only).
104 df = df_mix1()
106 # keep stats at the end of the screening
107 qd_forest = qd_screen(df, keep_stats=True)
108 print(qd_forest.stats)
 # a "\n" is prepended to each captured output so that the expected literals can start on their own line
110 captured = capsys.readouterr()
111 with capsys.disabled():
112 # print(captured.out)
113 assert "\n" + captured.out == """
114Statistics computed for dataset:
115 U V W X Y Z
1160 a a a a b a
1171 b b b a b a
118...(10 rows)
120Entropies (H):
121U 1.970951
122V 1.570951
123W 0.881291
124X 1.570951
125Y 1.570951
126Z 0.970951
127dtype: float64
129Conditional entropies (Hcond = H(row|col)):
130 U V W X Y Z
131U 0.000000 0.400000 1.089660 0.875489 0.875489 1.475489
132V 0.000000 0.000000 0.689660 0.875489 0.875489 1.200000
133W 0.000000 0.000000 0.000000 0.875489 0.875489 0.875489
134X 0.475489 0.875489 1.565148 0.000000 0.000000 0.875489
135Y 0.475489 0.875489 1.565148 0.000000 0.000000 0.875489
136Z 0.475489 0.600000 0.965148 0.275489 0.275489 0.000000
138Relative conditional entropies (Hcond_rel = H(row|col)/H(row)):
139 U V W X Y Z
140U 0.000000 0.202948 0.552860 0.444196 0.444196 0.748618
141V 0.000000 0.000000 0.439008 0.557299 0.557299 0.763869
142W 0.000000 0.000000 0.000000 0.993416 0.993416 0.993416
143X 0.302676 0.557299 0.996307 0.000000 0.000000 0.557299
144Y 0.302676 0.557299 0.996307 0.000000 0.000000 0.557299
145Z 0.489715 0.617951 0.994024 0.283731 0.283731 0.000000
147"""
 # relative_eps=0.29 is just above the relative conditional entropy of Z given X (0.283731 in the
 # table above), so Z is expected to be listed under X in the forest printed below
148 qd_forest2 = qd_screen(df, relative_eps=0.29)
149 print(qd_forest2)
150 captured = capsys.readouterr()
151 with capsys.disabled():
152 # print(captured.out)
153 # note the u for unicode string in py2, see test_encoding
154 assert "\n" + captured.out == u"""
155QDForest (6 vars):
156 - 2 roots (0+2*): U*, X*
157 - 4 other nodes: V, W, Y, Z
159U
160└─ V
161 └─ W
163X
164└─ Y
165└─ Z
167"""
 # the entropies table is requested from the first forest (the one built with keep_stats=True), not qd_forest2
168 ce_df = qd_forest.get_entropies_table(from_to=False, sort_by="rel_cond_entropy")
169 print(ce_df.head(10))
170 captured = capsys.readouterr()
171 with capsys.disabled():
172 # print(captured.out)
173 assert "\n" + captured.out == """
174 cond_entropy rel_cond_entropy
175arc
176U->V 0.000000 0.000000
177U->W 0.000000 0.000000
178V->W 0.000000 0.000000
179Y->X 0.000000 0.000000
180X->Y 0.000000 0.000000
181V->U 0.400000 0.202948
182Y->Z 0.275489 0.283731
183X->Z 0.275489 0.283731
184U->X 0.475489 0.302676
185U->Y 0.475489 0.302676
186"""
 # NOTE(review): the text following "if not PY2:" on the next line ("187 ↛ exitline ...") looks like a
 # coverage.py branch annotation fused into the line by the report extraction, not source code - confirm
187 if not PY2: 187 ↛ exitline 187 didn't return from function 'test_readme_quasi', because the condition on line 187 was never false
188 # we have issues on travis CI with matplotlib on PY2: skip
189 import matplotlib.pyplot as plt
 # the plot is only produced and then closed; nothing about the figure content is asserted
190 qd_forest.plot_increasing_entropies()
191 # fig = plt.gcf()
192 # fig.savefig(str(IMGS_FOLDER / "increasing_entropies.png"))
193 plt.close("all")
@pytest.mark.parametrize("typ", ['int', 'str', 'mixed'])
def test_readme_skl(typ):
    """Check the scikit-learn style selector on a tiny 3x4 table.

    The same table is fed as integers (``'int'``) or as strings (``'str'``);
    in both cases a single column is expected to remain after
    ``fit_transform``. The ``'mixed'`` variant is skipped.
    """
    from qdscreen.sklearn import QDScreen

    if typ == 'mixed':
        # An unstructured numpy array built from mixed int/str values would end up
        # with a single string dtype, so there is nothing specific to test here.
        pytest.skip("Mixed dtypes do not exist in numpy unstructured arrays")
    if typ not in ('int', 'str'):
        raise ValueError()

    X = [[0, 2, 0, 3],
         [0, 1, 4, 3],
         [0, 1, 1, 3]]
    if typ == 'int':
        expected_res = [[0], [4], [1]]
    else:
        X = np.array(X).astype(str)
        expected_res = [['0'], ['4'], ['1']]

    X2 = QDScreen().fit_transform(X)
    np.testing.assert_array_equal(X2, np.array(expected_res))
@pytest.mark.parametrize("input_type", ["pandas",
                                        "numpy_unstructured",
                                        # "numpy_structured", "numpy_recarray"
                                        ])
def test_readme(input_type):
    """Run every variant of the nominal readme scenario for one input container type.

    Covers the scikit-learn selector and the ``qd_screen`` forest / selector
    model, each in strict mode and in quasi mode (absolute and relative
    tolerance). The structured-array and recarray variants are currently
    disabled in the parametrization.
    """
    is_pandas = input_type == "pandas"
    if not is_pandas and input_type != "numpy_unstructured":
        raise ValueError(input_type)

    df_orig = df_mix1()
    var_names = list(df_orig.columns)
    if is_pandas:
        data = df_orig
        data_ar = data.values
    else:
        data = df_orig.to_numpy(copy=True)
        data_ar = data

    # reference arrays holding only the root columns (strict and quasi)
    data_roots_df = df_orig[['U', 'X', 'Z']]
    data_roots_ar = data_roots_df.values
    data_roots_quasi_ar = df_orig[['U', 'X']].values

    nb_vars = len(var_names)

    def _as_named_frame(mat):
        """Wrap a raw adjacency matrix in a labelled frame (pandas results pass through)."""
        if is_pandas:
            return mat
        return pd.DataFrame(mat, columns=var_names, index=var_names)

    def _check_quasi_restored_array(restored):
        """Patch the single expected misprediction of quasi mode, then compare with the input."""
        # the prediction is incorrect since this is "quasi": exactly one element differs
        assert restored[2, 5] == "a"
        restored[2, 5] = "b"
        np.testing.assert_array_equal(restored, data_ar)

    # ---- scikit-learn use case
    from qdscreen.sklearn import QDScreen
    if input_type not in ("numpy_structured", "numpy_recarray"):
        # strict
        sel = QDScreen()
        sel_data = sel.fit_transform(data_ar)
        np.testing.assert_array_equal(sel_data, data_roots_ar)
        np.testing.assert_array_equal(sel.inverse_transform(sel_data), data_ar)

        # quasi, absolute then relative tolerance - Z should become a child of X in both cases
        for eps_kwargs in ({'absolute_eps': 0.28}, {'relative_eps': 0.29}):
            sel = QDScreen(**eps_kwargs)
            sel_data = sel.fit_transform(data_ar)
            np.testing.assert_array_equal(sel_data, data_roots_quasi_ar)
            _check_quasi_restored_array(sel.inverse_transform(sel_data))

    # ---- qdscreen full use case
    # (1) strict
    # -- (a) forest
    ref = pd.DataFrame(data=np.zeros((nb_vars, nb_vars), dtype=bool), index=var_names, columns=var_names)
    for parent, child in (('U', 'V'), ('V', 'W'), ('X', 'Y')):
        ref.loc[parent, child] = True
    qd_forest = qd_screen(data)
    pd.testing.assert_frame_equal(_as_named_frame(qd_forest.adjmat), ref)

    # -- (b) feature selector
    if is_pandas:
        # Note: this should also be tested with numpy structured & records input types.
        # Columns are named, so a randomized column order must still work.
        new_cols_order = np.array(data.columns)
        np.random.shuffle(new_cols_order)
        new_roots_order = [c for c in new_cols_order if c in data_roots_df.columns]
        data_randcol = data[new_cols_order].copy()
    else:
        # numpy: the column index carries the meaning, no randomization possible
        data_randcol = data

    selector_model = qd_forest.fit_selector_model(data_randcol)
    sel2 = selector_model.remove_qd(data_randcol)
    if is_pandas:
        pd.testing.assert_frame_equal(sel2, data_roots_df[new_roots_order])
    else:
        np.testing.assert_array_equal(sel2, data_roots_ar)

    data2 = selector_model.predict_qd(sel2)
    if is_pandas:
        pd.testing.assert_frame_equal(data2[data.columns], data)
    else:
        np.testing.assert_array_equal(data2, data_ar)

    # (2) quasi
    # -- (a) forest
    # Z joins X when the qd threshold is 0.28 (absolute) or 0.29 (relative)
    ref.loc['X', 'Z'] = True
    qd_forest = qd_screen(data, absolute_eps=0.28)
    pd.testing.assert_frame_equal(_as_named_frame(qd_forest.adjmat), ref)

    qd_forest = qd_screen(data, relative_eps=0.29)
    adj_mat = qd_forest.adjmat
    if is_pandas:
        assert isinstance(qd_forest.parents, pd.DataFrame)
        assert list(qd_forest.parents.index) == var_names
    else:
        assert isinstance(qd_forest.parents, np.ndarray)
        assert isinstance(adj_mat, np.ndarray)
    pd.testing.assert_frame_equal(_as_named_frame(adj_mat), ref)

    # -- (b) feature selector
    selector_model = qd_forest.fit_selector_model(data_randcol)
    sel2 = selector_model.remove_qd(data_randcol)
    if is_pandas:
        new_roots_order_quasi = [r for r in new_roots_order if r != 'Z']
        pd.testing.assert_frame_equal(sel2, data_roots_df[new_roots_order_quasi])
    else:
        np.testing.assert_array_equal(sel2, data_roots_quasi_ar)

    data2 = selector_model.predict_qd(sel2)
    if is_pandas:
        # the prediction is incorrect since this is "quasi". Fix the single element that differs from input
        assert data2.loc[2, "Z"] == "a"
        data2.loc[2, "Z"] = "b"
        pd.testing.assert_frame_equal(data2[data.columns], data)
    else:
        _check_quasi_restored_array(data2)