Coverage for qdscreen/tests/test_core.py: 99%

141 statements  

« prev     ^ index     » next       coverage.py v7.2.2, created at 2023-03-17 11:02 +0000

1# -*- coding: utf-8 -*- 

2# the above encoding declaration is needed to have non-ascii characters in this file (anywhere even in comments) 

3# from __future__ import unicode_literals # no, since we want to match the return type of str() which is bytes in py2 

4import sys 

5 

6import numpy as np 

7import pandas as pd 

8import pytest 

9 

10from qdscreen import QDForest, qd_screen 

11from qdscreen.compat import PY2 

12from qdscreen.main import get_adjacency_matrix, remove_redundancies 

13from qdscreen.sklearn import QDScreen 

14 

15 

def df_strict1():
    """ A dataframe with two equivalent redundant variables X and Y

    Y is a one-to-one relabelling of X (a -> b, b -> c, c -> a).

    :return: a 10-row pandas DataFrame with two columns 'X' and 'Y'
    """
    df = pd.DataFrame({
        'X': ["a", "a", "b", "b", "a", "c", "c", "a", "b", "c"],
    })
    # 'Y': ["b", "b", "c", "c", "b", "a", "a", "b", "c", "a"],
    # a single mapping applies the whole permutation at once: no temporary placeholder label is needed
    df['Y'] = df['X'].map({"a": "b", "b": "c", "c": "a"})
    return df

24 

25 

def test_adjacency_strict():
    """ Tests that `get_adjacency_matrix` works as expected in front of a dataset with redundancy """

    # an arc is expected in both directions between X and Y, and none on the diagonal
    expected = pd.DataFrame(data=[
        [False, True],
        [True, False]
    ], index=['X', 'Y'], columns=['X', 'Y'])

    # Compute the adjacency matrix (the second returned item is not checked by this test)
    adj_df, _ = get_adjacency_matrix(df_strict1())
    pd.testing.assert_frame_equal(adj_df, expected)

35 

36 

def quasi_df1():
    """ A dataframe with two columns X and Y used to test the quasi-determinism thresholds

    :return: a 10-row pandas DataFrame with two columns 'X' and 'Y'
    """
    # in that matrix the H(X|Y) = H(Y|X) = 0.324511 and the relative H(X|Y)/H(X) = H(Y|X)/H(Y) = 0.20657
    return pd.DataFrame({
        'X': ["a", "a", "b", "b", "a", "c", "c", "a", "b", "c"],
        'Y': ["a", "b", "c", "c", "b", "a", "a", "b", "c", "a"],
    })

43 

44 

def test_adjacency_invalid_thresholds():
    """ Tests that `get_adjacency_matrix` rejects `eps_absolute` and `eps_relative` being provided together """

    # an error is raised if both are provided, even if both are zero
    with pytest.raises(ValueError):
        get_adjacency_matrix(pd.DataFrame(), eps_absolute=0., eps_relative=0.)

49 

50 

@pytest.mark.parametrize("eps_absolute, eps_relative", [
    (None, None),  # strict mode
    (0.2, None),   # too low absolute
    (None, 0.15)   # too low relative
])
def test_adjacency_quasi_low_threshold(eps_absolute, eps_relative):
    """ Tests that `get_adjacency_matrix` detects no arc in strict mode or when the quasi threshold is too low """

    # in that matrix the H(X|Y) = H(Y|X) = 0.324511 and the relative H(X|Y)/H(X) = H(Y|X)/H(Y) = 0.20657
    # strict mode or with too low threshold: no arc is detected
    expected = pd.DataFrame(data=[
        [False, False],
        [False, False]
    ], index=['X', 'Y'], columns=['X', 'Y'])

    # the second returned item is not checked by this test
    adj_df, _ = get_adjacency_matrix(quasi_df1(), eps_absolute=eps_absolute, eps_relative=eps_relative)
    pd.testing.assert_frame_equal(adj_df, expected)

66 

67 

@pytest.mark.parametrize("eps_absolute, eps_relative", [
    (0.33, None),  # high enough absolute
    (None, 0.21)   # high enough relative
])
def test_adjacency_quasi_high_threshold(eps_absolute, eps_relative):
    """ Tests that `get_adjacency_matrix` detects arcs in quasi mode when the threshold is high enough """

    # in that matrix the H(X|Y) = H(Y|X) = 0.324511 and the relative H(X|Y)/H(X) = H(Y|X)/H(Y) = 0.20657
    # with a threshold just above those values: an arc is detected in both directions
    expected = pd.DataFrame(data=[
        [False, True],
        [True, False]
    ], index=['X', 'Y'], columns=['X', 'Y'])

    # the second returned item is not checked by this test
    adj_df, _ = get_adjacency_matrix(quasi_df1(), eps_absolute=eps_absolute, eps_relative=eps_relative)
    pd.testing.assert_frame_equal(adj_df, expected)

82 

83 

def test_remove_redundancies():
    """ Tests that the redundancies removal routine works as expected """

    labels = ['X', 'Y']

    # an adjacency matrix with two redundant nodes X and Y
    adj_df = pd.DataFrame(data=[
        [False, True],
        [True, False]
    ], index=labels, columns=labels)

    # clean the redundancies with natural order: X is the representative of the class
    # so only the arc on the X row remains
    adj_df_clean1 = remove_redundancies(adj_df)
    pd.testing.assert_frame_equal(adj_df_clean1, pd.DataFrame(data=[
        [False, True],
        [False, False]
    ], index=labels, columns=labels))

    # clean the redundancies with reverse order: Y is the representative of the class
    # so only the arc on the Y row remains
    adj_df_clean2 = remove_redundancies(adj_df, selection_order=[1, 0])
    pd.testing.assert_frame_equal(adj_df_clean2, pd.DataFrame(data=[
        [False, False],
        [True, False]
    ], index=labels, columns=labels))

106 

107 

108# def test_identify_redundancy_quasi(): 

109# df = pd.DataFrame({ 

110# 'U': ["a", "b", "d", "a", "b", "c", "a", "b", "d", "c"], 

111# }) 

112# df['V'] = df['U'].replace("d", "c") # d -> c 

113# df['W'] = df['V'].replace("c", "b") # c -> b 

114# 

115# adj_df = get_adjacency_matrix(df) 

116# df2 = identify_redundancy(adj_df) 

117 

118 

def get_qd_forest1(is_np):
    """
    Creates the reference data describing a 10-node forest with two trees: 3->5 and 9->(1->8, 7).
    All other nodes (0, 2, 4, 6) are roots without children.

    :param is_np: if true, the returned `adjmat`, `parents`, `roots` and `roots_wc` are the very same numpy
        objects as their `*_ar` counterparts. If false, they are the named equivalents (pandas dataframes for
        `adjmat` and `parents`, arrays of names for `roots` and `roots_wc`), the nodes being named 'a' to 'j'.
    :return: a tuple `(adjmat, adjmat_ar, parents, parents_ar, roots, roots_ar, roots_wc, roots_wc_ar)` where
        the `*_ar` items are always the index-based numpy arrays.
    """
    # adjacency matrix: one row per parent, one column per child
    adjmat_ar = np.zeros((10, 10), dtype=bool)
    adjmat_ar[1, 8] = True
    adjmat_ar[3, 5] = True
    adjmat_ar[9, 1] = True
    adjmat_ar[9, 7] = True

    # index of the parent of each node, -1 meaning "no parent"
    # dtype is int64: indeed computing parents from adjmat with np.where returns this dtype
    parents_ar = np.full((10,), -1, dtype=np.int64)
    parents_ar[5] = 3
    parents_ar[1] = 9
    parents_ar[7] = 9
    parents_ar[8] = 1

    roots_ar = np.array([0, 2, 3, 4, 6, 9])  # all nodes without a parent
    roots_wc_ar = np.array([3, 9])           # the roots that have children

    if is_np:
        # numpy mode: the "main" items are the arrays themselves (same objects, not copies)
        adjmat, parents, roots, roots_wc = adjmat_ar, parents_ar, roots_ar, roots_wc_ar
    else:
        varnames = list("abcdefghij")
        adjmat = pd.DataFrame(adjmat_ar, columns=varnames, index=varnames)
        parents = pd.DataFrame(parents_ar, index=varnames, columns=('idx',))
        # look up the parent name from its index; the condition blanks out the entries of nodes without parent
        parents['name'] = parents.index[parents['idx']].where(parents['idx'] >= 0, None)
        roots = np.array(varnames)[roots_ar]
        roots_wc = np.array(varnames)[roots_wc_ar]

    return adjmat, adjmat_ar, parents, parents_ar, roots, roots_ar, roots_wc, roots_wc_ar

151 

152 

@pytest.mark.parametrize("from_adjmat", [True, False], ids="from_adjmat={}".format)
@pytest.mark.parametrize("is_np", [True, False], ids="is_np={}".format)
def test_qd_forest(is_np, from_adjmat):
    """Tests that QDForest works correctly whether created from adj matrix or parents list

    :param is_np: if true the forest is created from numpy arrays (no names), otherwise from pandas objects
    :param from_adjmat: if true the forest is created from the adjacency matrix, otherwise from the parents
    """

    adjmat, adjmat_ar, parents, parents_ar, roots, roots_ar, roots_wc, roots_wc_ar = get_qd_forest1(is_np)

    if from_adjmat:
        qd1 = QDForest(adjmat=adjmat)  # a forest created from the adj matrix
    else:
        qd1 = QDForest(parents=parents)  # a forest created from the parents coordinates

    # roots
    # -- indices
    assert qd1.get_roots(names=False) == list(roots_ar)
    np.testing.assert_array_equal(qd1.indices_to_mask(roots_ar), qd1.roots_mask_ar)
    assert qd1.get_roots_with_children(names=False) == list(roots_wc_ar)
    # -- names
    if is_np:
        # names are not available on a forest created from numpy arrays
        with pytest.raises(ValueError):
            qd1.get_roots(names=True)
        with pytest.raises(ValueError):
            qd1.get_roots_with_children(names=True)
    else:
        pd.testing.assert_series_equal(qd1.indices_to_mask(roots), qd1.roots_mask)
        assert qd1.get_roots(names=True) == list(roots)
        assert qd1.get_roots_with_children(names=True) == list(roots_wc)
    # -- default value of `names`
    assert qd1.get_roots() == qd1.get_roots(names=not is_np)
    assert qd1.get_roots_with_children() == qd1.get_roots_with_children(names=not is_np)

    # string representation of arcs
    # -- indices
    if from_adjmat:
        assert qd1.get_arcs_str_list(names=False) == ['1 -> 8', '3 -> 5', '9 -> 1', '9 -> 7']
    else:
        # make sure the adj matrix was not computed automatically, to test the right method
        assert qd1._adjmat is None
        # TODO order is not the same as above, see https://github.com/python-qds/qdscreen/issues/9
        assert qd1.get_arcs_str_list(names=False) == ['9 -> 1', '3 -> 5', '9 -> 7', '1 -> 8']
    # -- names
    if is_np:
        # names are not available on a forest created from numpy arrays
        with pytest.raises(ValueError):
            qd1.get_arcs_str_list(names=True)
    elif from_adjmat:
        assert qd1.get_arcs_str_list(names=True) == ['b -> i', 'd -> f', 'j -> b', 'j -> h']
    else:
        # make sure the adj matrix was not computed automatically, to test the right method
        assert qd1._adjmat is None
        # TODO order is not the same as above, see https://github.com/python-qds/qdscreen/issues/9
        assert qd1.get_arcs_str_list(names=True) == ['j -> b', 'd -> f', 'j -> h', 'b -> i']
    # -- check the default value of `names`
    assert qd1.get_arcs_str_list() == qd1.get_arcs_str_list(names=not is_np)

    # equivalent parents computation, to be sure
    np.testing.assert_array_equal(qd1.parents_indices_ar, parents_ar)
    if not is_np:
        pd.testing.assert_frame_equal(qd1.parents, parents)

    # equivalent adjmat computation, to be sure
    # (done last: the `_adjmat is None` checks above need the adj matrix not to have been accessed yet)
    np.testing.assert_array_equal(qd1.adjmat_ar, adjmat_ar)
    if not is_np:
        pd.testing.assert_frame_equal(qd1.adjmat, adjmat)

215 

216 

@pytest.mark.parametrize("from_adjmat", [True, False])
@pytest.mark.parametrize("is_np", [True, False])
def test_qd_forest_str(is_np, from_adjmat):
    """Tests the string representation of a QDForest: "compact", "headers" and "full" modes, and `str()`

    :param is_np: if true the forest is created from numpy arrays (no names), otherwise from pandas objects
    :param from_adjmat: if true the forest is created from the adjacency matrix, otherwise from the parents
    """

    adjmat, adjmat_ar, parents, parents_ar, roots, roots_ar, roots_wc, roots_wc_ar = get_qd_forest1(is_np)

    if from_adjmat:
        qd1 = QDForest(adjmat=adjmat)  # a forest created from the adj matrix
    else:
        qd1 = QDForest(parents=parents)  # a forest created from the parents coordinates

    # string representation
    # -- compact mode
    compact_str = qd1.to_str(mode="compact")
    assert compact_str == "QDForest (10 vars = 6 roots + 4 determined by 2 of the roots)"

    # -- headers mode
    # NOTE(review): the leading whitespace inside the multi-line expected strings of this test (one space
    # before each "- " bullet, three spaces before the nested tree node) was reconstructed - please confirm
    # against the actual output of `QDForest.to_str`.
    roots_str = "0, 2, 3*, 4, 6, 9*" if is_np else "a, c, d*, e, g, j*"
    others_str = "1, 5, 7, 8" if is_np else "b, f, h, i"
    headers_str = qd1.to_str(mode="headers")
    assert headers_str == """QDForest (10 vars):
 - 6 roots (4+2*): %s
 - 4 other nodes: %s""" % (roots_str, others_str)
    # this should be the default
    assert qd1.to_str() == headers_str

    # -- trees
    trees_str = "\n" + "\n".join(qd1.get_trees_str_list())
    if is_np:
        # note the u for python 2 as in main.py we use unicode literals to cope with those non-base chars
        assert trees_str == u"""
3
└─ 5

9
└─ 1
   └─ 8
└─ 7
"""
    else:
        # note the u for python 2 as in main.py we use unicode literals to cope with those non-base chars
        assert trees_str == u"""
d
└─ f

j
└─ b
   └─ i
└─ h
"""

    # -- full mode: the headers followed by the trees
    full_str = qd1.to_str(mode="full")
    assert full_str == headers_str + u"\n" + trees_str
    # this should be the default string representation if the nb vars is small enough
    if PY2:
        # in python 2 `str()` returns bytes: encode the expected unicode string before comparing
        assert full_str.encode('utf-8') == str(qd1)
    else:
        assert full_str == str(qd1)

272 

273 

274# def test_sklearn_compat(): 

275# """Trying to make sure that this compatibility code works: it does NOT with scikit-learn 0.22.2.post1 :(""" 

276# from qdscreen.compat import BaseEstimator 

277# assert BaseEstimator()._more_tags()['requires_y'] is False 

278# assert BaseEstimator()._get_tags()['requires_y'] is False 

279 

280 

def test_nans_in_data():
    """Tests that a dataframe containing NaNs can be screened and reduced.

    See https://github.com/python-qds/qdscreen/issues/28
    """

    df = pd.DataFrame([
        ["A", "B"],
        ["A", "B"],
        ["N", np.nan],
    ], columns=["a", "b"])

    # only column "a" is expected to remain once the selector model has been applied
    expected = pd.DataFrame([
        ["A"],
        ["A"],
        ["N"],
    ], columns=["a"])

    qd_forest = qd_screen(df)
    feat_selector = qd_forest.fit_selector_model(df)
    df_sel = feat_selector.remove_qd(df)
    pd.testing.assert_frame_equal(df_sel, expected)

298 

299 

def test_nans_in_data_sklearn():
    """Tests that the `QDScreen` selector can be fit on, and transform, a numpy array containing NaNs.

    See https://github.com/python-qds/qdscreen/issues/28
    """

    df = pd.DataFrame([
        ["A", "B"],
        ["A", "B"],
        ["N", np.nan],
    ])

    selector = QDScreen()
    Xsel = selector.fit_transform(df.to_numpy())

    # only the first column is expected to remain
    assert Xsel.tolist() == [['A'], ['A'], ['N']]

313 

314 

def test_issue_37_non_categorical():
    """Tests that `qd_screen` raises an explicit `ValueError` when the dataframe has a non-categorical column"""

    # "nb" contains integers, "name" contains strings
    df = pd.DataFrame({
        "nb": [1, 2],
        "name": ["A", "B"]
    })
    with pytest.raises(ValueError, match="Provided dataframe columns contains non-categorical"):
        qd_screen(df)

322 

323 

@pytest.mark.skipif(sys.version_info < (3, 6),
                    reason="This test is known to fail for 3.5 and 2.7, see GH#43")
def test_issue_40_nan_then_str():
    """Tests the screen / remove / predict round trip on a column starting with a NaN followed by a string"""

    df = pd.DataFrame({
        "foo": ["1", "2"],
        "bar": [np.nan, "B"]
    })

    # screening: "foo" is expected to be the only root
    qd_forest = qd_screen(df)
    assert list(qd_forest.roots) == ["foo"]

    # removal: only "foo" is expected to be kept
    feat_selector = qd_forest.fit_selector_model(df)
    only_important_features_df = feat_selector.remove_qd(df)
    assert list(only_important_features_df.columns) == ["foo"]

    # prediction: the initial dataframe is expected to be rebuilt from the kept column
    result = feat_selector.predict_qd(only_important_features_df)
    pd.testing.assert_frame_equal(df, result)