Coverage for qdscreen/tests/test_core.py: 99%
141 statements
« prev ^ index » next coverage.py v7.2.2, created at 2023-03-17 11:02 +0000
« prev ^ index » next coverage.py v7.2.2, created at 2023-03-17 11:02 +0000
1# -*- coding: utf-8 -*-
2# the above encoding declaration is needed to have non-ascii characters in this file (anywhere even in comments)
3# from __future__ import unicode_literals # no, since we want to match the return type of str() which is bytes in py2
4import sys
6import numpy as np
7import pandas as pd
8import pytest
10from qdscreen import QDForest, qd_screen
11from qdscreen.compat import PY2
12from qdscreen.main import get_adjacency_matrix, remove_redundancies
13from qdscreen.sklearn import QDScreen
def df_strict1():
    """Build a dataframe with two equivalent (mutually redundant) variables X and Y.

    Y is obtained from X by relabelling its values: a -> b, b -> c, c -> a.

    :return: a pandas dataframe with the two columns 'X' and 'Y' (10 rows)
    """
    x_values = ["a", "a", "b", "b", "a", "c", "c", "a", "b", "c"]
    relabelling = {"a": "b", "b": "c", "c": "a"}

    df = pd.DataFrame({'X': x_values})
    # resulting 'Y': ["b", "b", "c", "c", "b", "a", "a", "b", "c", "a"]
    df['Y'] = df['X'].map(relabelling)
    return df
def test_adjacency_strict():
    """ Tests that `get_adjacency_matrix` works as expected in front of a dataset with redundancy """

    # Both arcs X -> Y and Y -> X are expected
    expected_adj_df = pd.DataFrame(
        data=[
            [False, True],
            [True, False],
        ],
        index=['X', 'Y'],
        columns=['X', 'Y'],
    )

    # Compute the adjacency matrix (the returned statistics are not checked by this test)
    adj_df, _ = get_adjacency_matrix(df_strict1())
    pd.testing.assert_frame_equal(adj_df, expected_adj_df)
def quasi_df1():
    """Build a dataframe with two variables X and Y that are almost, but not strictly, equivalent.

    In that matrix the H(X|Y) = H(Y|X) = 0.324511 and the relative H(X|Y)/H(X) = H(Y|X)/H(Y) = 0.20657

    :return: a pandas dataframe with the two columns 'X' and 'Y' (10 rows)
    """
    return pd.DataFrame({
        'X': list("aabbaccabc"),
        'Y': list("abccbaabca"),
    })
def test_adjacency_invalid_thresholds():
    """ Tests that `get_adjacency_matrix` rejects a call where both thresholds are provided """

    empty_df = pd.DataFrame()

    # an error is raised if both are provided, even if both are zero
    with pytest.raises(ValueError):
        get_adjacency_matrix(empty_df, eps_absolute=0., eps_relative=0.)
@pytest.mark.parametrize("eps_absolute, eps_relative", [
    (None, None),  # strict mode
    (0.2, None),  # too low absolute
    (None, 0.15)  # too low relative
])
def test_adjacency_quasi_low_threshold(eps_absolute, eps_relative):
    """ Tests that `get_adjacency_matrix` detects no arc in strict mode or when the threshold is too low """

    # no arc at all is expected
    expected_adj_df = pd.DataFrame(
        data=[
            [False, False],
            [False, False],
        ],
        index=['X', 'Y'],
        columns=['X', 'Y'],
    )

    # in that matrix the H(X|Y) = H(Y|X) = 0.324511 and the relative H(X|Y)/H(X) = H(Y|X)/H(Y) = 0.20657
    # strict mode or with too low threshold: no arc is detected
    adj_df, _ = get_adjacency_matrix(quasi_df1(), eps_absolute=eps_absolute, eps_relative=eps_relative)
    pd.testing.assert_frame_equal(adj_df, expected_adj_df)
@pytest.mark.parametrize("eps_absolute, eps_relative", [
    (0.33, None),  # high enough absolute
    (None, 0.21)  # high enough relative
])
def test_adjacency_quasi_high_threshold(eps_absolute, eps_relative):
    """ Tests that `get_adjacency_matrix` detects both arcs in quasi mode when the threshold is high enough """

    # both arcs X -> Y and Y -> X are expected
    expected_adj_df = pd.DataFrame(
        data=[
            [False, True],
            [True, False],
        ],
        index=['X', 'Y'],
        columns=['X', 'Y'],
    )

    # in that matrix the H(X|Y) = H(Y|X) = 0.324511 and the relative H(X|Y)/H(X) = H(Y|X)/H(Y) = 0.20657
    # the thresholds used here are just above those values: both arcs are detected
    adj_df, _ = get_adjacency_matrix(quasi_df1(), eps_absolute=eps_absolute, eps_relative=eps_relative)
    pd.testing.assert_frame_equal(adj_df, expected_adj_df)
def test_remove_redundancies():
    """ Tests that the redundancies removal routine works as expected """

    def _adjacency(rows):
        # small helper: a 2x2 adjacency dataframe over the variables X and Y
        return pd.DataFrame(data=rows, index=['X', 'Y'], columns=['X', 'Y'])

    # an adjacency matrix with two redundant nodes X and Y
    adj_df = _adjacency([
        [False, True],
        [True, False]
    ])

    # clean the redundancies with natural order: X is the representative of the class
    adj_df_clean1 = remove_redundancies(adj_df)
    pd.testing.assert_frame_equal(adj_df_clean1, _adjacency([
        [False, True],
        [False, False]
    ]))

    # clean the redundancies with reverse order: Y is the representative of the class
    adj_df_clean2 = remove_redundancies(adj_df, selection_order=[1, 0])
    pd.testing.assert_frame_equal(adj_df_clean2, _adjacency([
        [False, False],
        [True, False]
    ]))
108# def test_identify_redundancy_quasi():
109# df = pd.DataFrame({
110# 'U': ["a", "b", "d", "a", "b", "c", "a", "b", "d", "c"],
111# })
112# df['V'] = df['U'].replace("d", "c") # d -> c
113# df['W'] = df['V'].replace("c", "b") # c -> b
114#
115# adj_df = get_adjacency_matrix(df)
116# df2 = identify_redundancy(adj_df)
def get_qd_forest1(is_np):
    """
    Creates the description of a forest with 10 nodes and two trees: 3->5 and 9->(1->8, 7).
    The other nodes (0, 2, 4, 6) are roots without children.

    :param is_np: if True, only numpy arrays are returned and each item without the `_ar` suffix is the very same
        object as its `_ar` counterpart. If False, the items without the `_ar` suffix use the variable names
        'a' to 'j': `adjmat` and `parents` are pandas dataframes, `roots` and `roots_wc` are arrays of names.
    :return: a tuple `(adjmat, adjmat_ar, parents, parents_ar, roots, roots_ar, roots_wc, roots_wc_ar)`
    """
    nb_vars = 10
    # (parent, child) arcs: declared once so that the adjacency matrix and the parents array always agree
    arcs = [(1, 8), (3, 5), (9, 1), (9, 7)]

    adjmat_ar = adjmat = np.zeros((nb_vars, nb_vars), dtype=bool)
    # int64 indeed: computing parents from adjmat with np.where returns this dtype
    parents_ar = parents = -np.ones((nb_vars,), dtype=np.int64)
    for parent_idx, child_idx in arcs:
        adjmat_ar[parent_idx, child_idx] = True
        parents_ar[child_idx] = parent_idx

    # all roots, and the subset of roots that have at least one child
    roots_ar = roots = np.array([0, 2, 3, 4, 6, 9])
    roots_wc_ar = roots_wc = np.array([3, 9])

    if not is_np:
        varnames = list("abcdefghij")
        adjmat = pd.DataFrame(adjmat_ar, columns=varnames, index=varnames)
        parents = pd.DataFrame(parents_ar, index=varnames, columns=('idx',))
        # roots have index -1, which would select the last name: mask those entries out afterwards
        parents['name'] = parents.index[parents['idx'].values].where(parents['idx'] >= 0, None)
        roots = np.array(varnames)[roots_ar]
        roots_wc = np.array(varnames)[roots_wc_ar]

    return adjmat, adjmat_ar, parents, parents_ar, roots, roots_ar, roots_wc, roots_wc_ar
@pytest.mark.parametrize("from_adjmat", [True, False], ids="from_adjmat={}".format)
@pytest.mark.parametrize("is_np", [True, False], ids="is_np={}".format)
def test_qd_forest(is_np, from_adjmat):
    """Tests that QDForest works correctly whether created from adj matrix or parents list"""

    (adjmat, adjmat_ar, parents, parents_ar,
     roots, roots_ar, roots_wc, roots_wc_ar) = get_qd_forest1(is_np)

    if from_adjmat:
        qd1 = QDForest(adjmat=adjmat)  # a forest created from the adj matrix
    else:
        qd1 = QDForest(parents=parents)  # a forest created from the parents coordinates

    # names are only available when the forest was created from pandas objects
    has_names = not is_np

    # roots
    assert qd1.get_roots(names=False) == list(roots_ar)
    np.testing.assert_array_equal(qd1.indices_to_mask(roots_ar), qd1.roots_mask_ar)
    assert qd1.get_roots_with_children(names=False) == list(roots_wc_ar)
    if has_names:
        pd.testing.assert_series_equal(qd1.indices_to_mask(roots), qd1.roots_mask)
        assert qd1.get_roots(names=True) == list(roots)
        assert qd1.get_roots_with_children(names=True) == list(roots_wc)
    else:
        with pytest.raises(ValueError):
            qd1.get_roots(names=True)
        with pytest.raises(ValueError):
            qd1.get_roots_with_children(names=True)
    # default
    assert qd1.get_roots() == qd1.get_roots(names=has_names)
    assert qd1.get_roots_with_children() == qd1.get_roots_with_children(names=has_names)

    # string representation of arcs
    # -- indices
    if from_adjmat:
        assert qd1.get_arcs_str_list(names=False) == ['1 -> 8', '3 -> 5', '9 -> 1', '9 -> 7']
    else:
        # make sure the adj matrix was not computed automatically, to test the right method
        assert qd1._adjmat is None
        # TODO order is not the same as above, see https://github.com/python-qds/qdscreen/issues/9
        assert qd1.get_arcs_str_list(names=False) == ['9 -> 1', '3 -> 5', '9 -> 7', '1 -> 8']
    # -- names
    if has_names:
        if from_adjmat:
            assert qd1.get_arcs_str_list(names=True) == ['b -> i', 'd -> f', 'j -> b', 'j -> h']
        else:
            # make sure the adj matrix was not computed automatically, to test the right method
            assert qd1._adjmat is None
            # TODO order is not the same as above, see https://github.com/python-qds/qdscreen/issues/9
            assert qd1.get_arcs_str_list(names=True) == ['j -> b', 'd -> f', 'j -> h', 'b -> i']
    else:
        with pytest.raises(ValueError):
            qd1.get_arcs_str_list(names=True)
    # check the default value of `names`
    assert qd1.get_arcs_str_list() == qd1.get_arcs_str_list(names=has_names)

    # equivalent adjmat and parents computation, to be sure
    np.testing.assert_array_equal(qd1.parents_indices_ar, parents_ar)
    if has_names:
        pd.testing.assert_frame_equal(qd1.parents, parents)

    # equivalent adjmat computation, to be sure
    # (kept last: the `_adjmat is None` checks above must run before the adjacency matrix is accessed)
    np.testing.assert_array_equal(qd1.adjmat_ar, adjmat_ar)
    if has_names:
        pd.testing.assert_frame_equal(qd1.adjmat, adjmat)
217@pytest.mark.parametrize("from_adjmat", [True, False])
218@pytest.mark.parametrize("is_np", [True, False])
219def test_qd_forest_str(is_np, from_adjmat):
220 """Tests the "compact", "headers" and "full" string representations of a QDForest, and the `str()` default """
 # NOTE(review): this listing comes from a coverage report where indentation and blank lines appear to be
 # flattened, including inside the expected multi-line strings below (gutter numbers 248 and 259 are missing).
 # Confirm the exact whitespace of those literals against the real test file before relying on them.
222 adjmat, adjmat_ar, parents, parents_ar, roots, roots_ar, roots_wc, roots_wc_ar = get_qd_forest1(is_np)
224 if from_adjmat:
225 qd1 = QDForest(adjmat=adjmat) # a forest created from the adj matrix
226 else:
227 qd1 = QDForest(parents=parents) # a forest created from the parents coordinates
229 # string representation
230 compact_str = qd1.to_str(mode="compact")
231 assert compact_str == "QDForest (10 vars = 6 roots + 4 determined by 2 of the roots)"
233 roots_str = "0, 2, 3*, 4, 6, 9*" if is_np else "a, c, d*, e, g, j*"
234 others_str = "1, 5, 7, 8" if is_np else "b, f, h, i"
235 headers_str = qd1.to_str(mode="headers")
236 assert headers_str == """QDForest (10 vars):
237 - 6 roots (4+2*): %s
238 - 4 other nodes: %s""" % (roots_str, others_str)
239 # this should be the default
240 assert qd1.to_str() == headers_str
242 trees_str = "\n" + "\n".join(qd1.get_trees_str_list())
243 if is_np:
244 # note the u for python 2 as in main.py we use unicode literals to cope with those non-base chars
245 assert trees_str == u"""
2463
247└─ 5
2499
250└─ 1
251 └─ 8
252└─ 7
253"""
254 else:
255 # note the u for python 2 as in main.py we use unicode literals to cope with those non-base chars
256 assert trees_str == u"""
257d
258└─ f
260j
261└─ b
262 └─ i
263└─ h
264"""
265 full_str = qd1.to_str(mode="full")
266 assert full_str == headers_str + u"\n" + trees_str
267 # this should be the default string representation if the nb vars is small enough
268 if PY2: 268 ↛ 269line 268 didn't jump to line 269, because the condition on line 268 was never true
269 assert full_str.encode('utf-8') == str(qd1)
270 else:
271 assert full_str == str(qd1)
274# def test_sklearn_compat():
275# """Trying to make sure that this compatibility code works: it does NOT with scikit-learn 0.22.2.post1 :("""
276# from qdscreen.compat import BaseEstimator
277# assert BaseEstimator()._more_tags()['requires_y'] is False
278# assert BaseEstimator()._get_tags()['requires_y'] is False
def test_nans_in_data():
    """Tests that a dataframe containing a NaN can be screened and reduced to column 'a'.

    See https://github.com/python-qds/qdscreen/issues/28
    """

    df = pd.DataFrame([
        ["A", "B"],
        ["A", "B"],
        ["N", np.nan],
    ], columns=["a", "b"])

    # only column 'a' is expected to remain after removal
    expected_df = pd.DataFrame([
        ["A"],
        ["A"],
        ["N"],
    ], columns=["a"])

    qd_forest = qd_screen(df)
    feat_selector = qd_forest.fit_selector_model(df)
    df_sel = feat_selector.remove_qd(df)
    pd.testing.assert_frame_equal(df_sel, expected_df)
def test_nans_in_data_sklearn():
    """Same scenario as `test_nans_in_data`, through the scikit-learn style `QDScreen` selector and a numpy array.

    See https://github.com/python-qds/qdscreen/issues/28
    """

    df = pd.DataFrame([
        ["A", "B"],
        ["A", "B"],
        ["N", np.nan],
    ])
    X = df.to_numpy()

    selector = QDScreen()
    Xsel = selector.fit_transform(X)

    # only the first column is expected to remain
    assert Xsel.tolist() == [['A'], ['A'], ['N']]
def test_issue_37_non_categorical():
    """Tests that `qd_screen` raises a ValueError on a dataframe mixing an integer and a string column"""

    df = pd.DataFrame({
        "nb": [1, 2],
        "name": ["A", "B"]
    })

    with pytest.raises(ValueError, match="Provided dataframe columns contains non-categorical"):
        qd_screen(df)
@pytest.mark.skipif(sys.version_info < (3, 6),
                    reason="This test is known to fail for 3.5 and 2.7, see GH#43")
def test_issue_40_nan_then_str():
    """Tests screening, removal and prediction on a column that starts with a NaN followed by a string"""

    df = pd.DataFrame({
        "foo": ["1", "2"],
        "bar": [np.nan, "B"]
    })

    # screening: "foo" is the only root
    qd_forest = qd_screen(df)
    assert list(qd_forest.roots) == ["foo"]

    # removal: only "foo" remains
    feat_selector = qd_forest.fit_selector_model(df)
    only_important_features_df = feat_selector.remove_qd(df)
    assert list(only_important_features_df.columns) == ["foo"]

    # prediction: the initial dataframe is recovered from the reduced one
    result = feat_selector.predict_qd(only_important_features_df)
    pd.testing.assert_frame_equal(df, result)