Coverage for qdscreen/tests/test_readme.py: 97%

162 statements  

« prev     ^ index     » next       coverage.py v7.2.2, created at 2023-03-17 11:02 +0000

1# -*- coding: utf-8 -*- 

2# the above encoding declaration is needed to have non-ascii characters in this file (anywhere even in comments) 

3# from __future__ import unicode_literals # no, since we want to match the return type of str() which is bytes in py2 

4import sys 

5 

6import numpy as np 

7import pandas as pd 

8import pytest 

9 

10from qdscreen import qd_screen 

11 

12 

13try: 

14 from pathlib import Path 

15except ImportError: 

16 from pathlib2 import Path 

17 

18 

# True when the interpreter is Python 2; used to skip steps that are known to be problematic there.
PY2 = sys.version_info[0] < 3

20 

21 

def df_mix1():
    """Build the small categorical dataset reused by the tests in this module.

    Returns:
        pandas.DataFrame: 10 rows and 6 string-valued columns named
        ``U``, ``V``, ``W``, ``X``, ``Y`` and ``Z``. A fresh frame is created on every call.
    """
    df = pd.DataFrame({
        'U': ["a", "b", "d", "a", "b", "c", "a", "b", "d", "c"],
        'V': ["a", "b", "c", "a", "b", "c", "a", "b", "c", "c"],
        'W': ["a", "b", "b", "a", "b", "b", "a", "b", "b", "b"],
        'X': ["a", "a", "b", "b", "a", "c", "c", "a", "b", "c"],
        'Y': ["b", "b", "c", "c", "b", "a", "a", "b", "c", "a"],
        # if the 3rd element of Z were an "a", X -> Z and Y -> Z would be strictly deterministic
        'Z': ["a", "a", "b", "a", "a", "b", "b", "a", "a", "b"],
    })

    # In this data the following relationships hold:
    #   H(W|V) = 0, H(W|U) = 0, H(V|U) = 0   # U -> V -> W
    #   H(Y|X) = H(X|Y) = 0                  # Y <-> X, X is kept as the representative because it appears first
    #   H(Z|Y) = H(Z|X) = 0.275 and H(Z|Y) / H(Z) = 0.284
    #     # X -> Z only appears when the quasi-determinism threshold exceeds 0.275 (absolute)
    #     # or 0.284 (relative); the tests in this module use 0.28 and 0.29 respectively.
    return df

38 

39 

40def test_encoding(capsys): 

41 """Check that the tree-drawing characters compare correctly, both as unicode and as native str""" 

42 print("└─") 

43 captured = capsys.readouterr() 

44 with capsys.disabled(): 

45 # unicode comparison: the captured stdout against a unicode literal 

46 assert captured.out == u"└─\n" 

47 

48 # native str comparison (bytes in py2): str() and repr() of the helper object 

49 from .encoding_ref_help import Foo 

50 assert "└─ab\n" == str(Foo()) 

51 assert "└─ab\n" == repr(Foo()) 

52 

53 

54def test_readme_simple(capsys): 

55 """The exact scenario shown in the readme""" 

56 

57 df = df_mix1() 

58 

59 # detect strict deterministic relationships 

60 qd_forest = qd_screen(df) 

61 print(qd_forest) 

62 captured = capsys.readouterr() 

63 with capsys.disabled(): 

64 # note the u for unicode string in py2, see test_encoding 

65 assert "\n" + captured.out == u""" 

66QDForest (6 vars): 

67 - 3 roots (1+2*): U*, X*, Z 

68 - 3 other nodes: V, W, Y 

69 

70U 

71└─ V 

72 └─ W 

73 

74X 

75└─ Y 

76 

77""" 

78 print("Columns in df: %s" % list(df.columns)) 

79 

80 # fit a feature selection model 

81 feat_selector = qd_forest.fit_selector_model(df) 

82 

83 # use it to filter... 

84 only_important_features = feat_selector.remove_qd(df) 

85 print("Columns in only_important_features: %s" % list(only_important_features.columns)) 

86 

87 # or to restore/predict 

88 restored_full_df = feat_selector.predict_qd(only_important_features) 

89 print("Columns in restored_full_df: %s" % list(restored_full_df.columns)) 

90 

91 # note that the order of columns differs from the original, hence the re-indexing with df.columns 

92 pd.testing.assert_frame_equal(df, restored_full_df[df.columns]) 

93 

94 captured = capsys.readouterr() 

95 with capsys.disabled(): 

96 assert "\n" + captured.out == """ 

97Columns in df: ['U', 'V', 'W', 'X', 'Y', 'Z'] 

98Columns in only_important_features: ['U', 'X', 'Z'] 

99Columns in restored_full_df: ['U', 'X', 'Z', 'V', 'W', 'Y'] 

100""" 

101 

102 

103def test_readme_quasi(capsys): 
 # Quasi-determinism scenario: prints the screening statistics, the forest obtained with relative_eps=0.29 
 # and the sorted entropies table, comparing each printed text with its expected value. 

104 df = df_mix1() 

105 

106 # keep stats at the end of the screening so that they can be printed and compared below 

107 qd_forest = qd_screen(df, keep_stats=True) 

108 print(qd_forest.stats) 

109 

110 captured = capsys.readouterr() 

111 with capsys.disabled(): 

112 # print(captured.out) 

113 assert "\n" + captured.out == """ 

114Statistics computed for dataset: 

115 U V W X Y Z 

1160 a a a a b a 

1171 b b b a b a 

118...(10 rows) 

119 

120Entropies (H): 

121U 1.970951 

122V 1.570951 

123W 0.881291 

124X 1.570951 

125Y 1.570951 

126Z 0.970951 

127dtype: float64 

128 

129Conditional entropies (Hcond = H(row|col)): 

130 U V W X Y Z 

131U 0.000000 0.400000 1.089660 0.875489 0.875489 1.475489 

132V 0.000000 0.000000 0.689660 0.875489 0.875489 1.200000 

133W 0.000000 0.000000 0.000000 0.875489 0.875489 0.875489 

134X 0.475489 0.875489 1.565148 0.000000 0.000000 0.875489 

135Y 0.475489 0.875489 1.565148 0.000000 0.000000 0.875489 

136Z 0.475489 0.600000 0.965148 0.275489 0.275489 0.000000 

137 

138Relative conditional entropies (Hcond_rel = H(row|col)/H(row)): 

139 U V W X Y Z 

140U 0.000000 0.202948 0.552860 0.444196 0.444196 0.748618 

141V 0.000000 0.000000 0.439008 0.557299 0.557299 0.763869 

142W 0.000000 0.000000 0.000000 0.993416 0.993416 0.993416 

143X 0.302676 0.557299 0.996307 0.000000 0.000000 0.557299 

144Y 0.302676 0.557299 0.996307 0.000000 0.000000 0.557299 

145Z 0.489715 0.617951 0.994024 0.283731 0.283731 0.000000 

146 

147""" 

148 qd_forest2 = qd_screen(df, relative_eps=0.29) 

149 print(qd_forest2) 

150 captured = capsys.readouterr() 

151 with capsys.disabled(): 

152 # print(captured.out) 

153 # note the u for unicode string in py2, see test_encoding 

154 assert "\n" + captured.out == u""" 

155QDForest (6 vars): 

156 - 2 roots (0+2*): U*, X* 

157 - 4 other nodes: V, W, Y, Z 

158 

159U 

160└─ V 

161 └─ W 

162 

163X 

164└─ Y 

165└─ Z 

166 

167""" 

168 ce_df = qd_forest.get_entropies_table(from_to=False, sort_by="rel_cond_entropy") 

169 print(ce_df.head(10)) 

170 captured = capsys.readouterr() 

171 with capsys.disabled(): 

172 # print(captured.out) 

173 assert "\n" + captured.out == """ 

174 cond_entropy rel_cond_entropy 

175arc  

176U->V 0.000000 0.000000 

177U->W 0.000000 0.000000 

178V->W 0.000000 0.000000 

179Y->X 0.000000 0.000000 

180X->Y 0.000000 0.000000 

181V->U 0.400000 0.202948 

182Y->Z 0.275489 0.283731 

183X->Z 0.275489 0.283731 

184U->X 0.475489 0.302676 

185U->Y 0.475489 0.302676 

186""" 

187 if not PY2: 187 ↛ exitline 187 didn't return from function 'test_readme_quasi', because the condition on line 187 was never false

188 # matplotlib causes issues on travis CI under PY2, so the plot is only exercised on PY3 

189 import matplotlib.pyplot as plt 

190 qd_forest.plot_increasing_entropies() 

191 # fig = plt.gcf() 

192 # fig.savefig(str(IMGS_FOLDER / "increasing_entropies.png")) 

193 plt.close("all") 

194 

195 

@pytest.mark.parametrize("typ", ['int', 'str', 'mixed'])
def test_readme_skl(typ):
    """Check the scikit-learn style ``QDScreen`` selector on the small readme matrix.

    Args:
        typ: kind of cells in the input matrix. ``'int'`` passes nested lists of ints,
            ``'str'`` passes the same matrix as a numpy array of strings, ``'mixed'`` is skipped.

    Raises:
        ValueError: if ``typ`` is none of the three values above.
    """
    from qdscreen.sklearn import QDScreen

    if typ == 'mixed':
        # A matrix such as [['0', 2, '0', 3], ['0', 1, '4', 3], ['0', 1, '1', 3]] cannot be tested:
        # per the original note, np.array() turns it into a plain string array (dtype '<U1').
        pytest.skip("Mixed dtypes do not exist in numpy unstructured arrays")
    if typ not in ('int', 'str'):
        # name the offending value, like test_readme does for its own parameter
        raise ValueError("Unsupported `typ`: %r" % (typ,))

    X = [[0, 2, 0, 3],
         [0, 1, 4, 3],
         [0, 1, 1, 3]]
    if typ == 'str':
        X = np.array(X).astype(str)

    selector = QDScreen()
    X2 = selector.fit_transform(X)

    # the expected output holds the values of the third input column only
    expected_res = [[0], [4], [1]] if typ == 'int' else [['0'], ['4'], ['1']]
    np.testing.assert_array_equal(X2, np.array(expected_res))

221 

222 

@pytest.mark.parametrize("input_type", [
    "pandas",
    "numpy_unstructured",
    # "numpy_structured", "numpy_recarray"  (variants not exercised yet)
])
def test_readme(input_type):
    """All variants that exist around the nominal scenario from the readme.

    The same dataset is fed either as a pandas DataFrame or as an unstructured numpy array, and
    goes through the scikit-learn selector, then the forest / feature-selector API, first in
    strict mode and then in quasi mode (absolute and relative thresholds).
    """
    use_pandas = input_type == "pandas"

    if use_pandas:
        df_orig = data = df_mix1()
        var_names = list(data.columns)
        data_ar = data.values
    elif input_type == "numpy_unstructured":
        df_orig = df_mix1()
        var_names = list(df_orig.columns)
        data = data_ar = df_orig.to_numpy(copy=True)
    else:
        # numpy structured arrays and recarrays are not handled by this test yet
        raise ValueError(input_type)

    # reference "roots only" views of the data, for strict and quasi screening
    strict_roots_df = df_orig[['U', 'X', 'Z']]
    strict_roots_ar = strict_roots_df.values
    quasi_roots_ar = df_orig[['U', 'X']].values

    n_vars = len(var_names)

    def _labelled(adj):
        """Return the adjacency matrix as a labelled DataFrame (numpy inputs yield a bare array)."""
        if use_pandas:
            return adj
        return pd.DataFrame(adj, columns=var_names, index=var_names)

    def _patch_quasi_cell(arr):
        """Check and fix the single cell that a "quasi" prediction gets wrong (row 2, column Z)."""
        assert arr[2, 5] == "a"
        arr[2, 5] = "b"

    # ---- Sklearn use case
    from qdscreen.sklearn import QDScreen
    if input_type not in ("numpy_structured", "numpy_recarray"):
        # --strict
        sel = QDScreen()
        kept = sel.fit_transform(data_ar)
        np.testing.assert_array_equal(kept, strict_roots_ar)
        rebuilt = sel.inverse_transform(kept)
        np.testing.assert_array_equal(rebuilt, data_ar)

        # --quasi, absolute then relative threshold - Z should become a child of X in both cases
        for eps_kwargs in ({"absolute_eps": 0.28}, {"relative_eps": 0.29}):
            sel = QDScreen(**eps_kwargs)
            kept = sel.fit_transform(data_ar)
            np.testing.assert_array_equal(kept, quasi_roots_ar)
            rebuilt = sel.inverse_transform(kept)
            # the prediction is incorrect since this is "quasi": fix the single differing element
            _patch_quasi_cell(rebuilt)
            np.testing.assert_array_equal(rebuilt, data_ar)

    # ---- qdscreen full use case
    # (1) strict
    # -- (a) forest
    ref = pd.DataFrame(data=np.zeros((n_vars, n_vars), dtype=bool), index=var_names, columns=var_names)
    for parent, child in (('U', 'V'), ('V', 'W'), ('X', 'Y')):
        ref.loc[parent, child] = True
    qd_forest = qd_screen(data)
    pd.testing.assert_frame_equal(_labelled(qd_forest.adjmat), ref)

    # -- (b) feature selector
    if use_pandas:
        # Note: this should also be tested with numpy structured & records input types.
        # With named columns we can check that a randomized column order still works.
        new_cols_order = np.array(data.columns)
        np.random.shuffle(new_cols_order)
        new_roots_order = [c for c in new_cols_order if c in strict_roots_df.columns]
        data_randcol = data[new_cols_order].copy()
    else:
        # numpy: the column index has meaning, no randomization possible
        data_randcol = data

    selector_model = qd_forest.fit_selector_model(data_randcol)
    sel2 = selector_model.remove_qd(data_randcol)
    if use_pandas:
        pd.testing.assert_frame_equal(sel2, strict_roots_df[new_roots_order])
    else:
        np.testing.assert_array_equal(sel2, strict_roots_ar)

    data2 = selector_model.predict_qd(sel2)
    if use_pandas:
        pd.testing.assert_frame_equal(data2[data.columns], data)
    else:
        np.testing.assert_array_equal(data2, data_ar)

    # (2) quasi
    # -- (a) forest
    # Z is attached to X when the qd threshold is > 0.28 (absolute) or 0.29 (relative)
    ref.loc['X', 'Z'] = True
    qd_forest = qd_screen(data, absolute_eps=0.28)
    pd.testing.assert_frame_equal(_labelled(qd_forest.adjmat), ref)

    qd_forest = qd_screen(data, relative_eps=0.29)
    adj_mat = qd_forest.adjmat
    if use_pandas:
        assert isinstance(qd_forest.parents, pd.DataFrame)
        assert list(qd_forest.parents.index) == var_names
    else:
        assert isinstance(qd_forest.parents, np.ndarray)
        assert isinstance(adj_mat, np.ndarray)
    pd.testing.assert_frame_equal(_labelled(adj_mat), ref)

    # -- (b) feature selector
    selector_model = qd_forest.fit_selector_model(data_randcol)
    sel2 = selector_model.remove_qd(data_randcol)
    if use_pandas:
        new_roots_order_quasi = [r for r in new_roots_order if r != 'Z']
        pd.testing.assert_frame_equal(sel2, strict_roots_df[new_roots_order_quasi])
    else:
        np.testing.assert_array_equal(sel2, quasi_roots_ar)

    data2 = selector_model.predict_qd(sel2)
    if use_pandas:
        # the prediction is incorrect since this is "quasi": fix the single differing element
        assert data2.loc[2, "Z"] == "a"
        data2.loc[2, "Z"] = "b"
        pd.testing.assert_frame_equal(data2[data.columns], data)
    else:
        # same single wrong cell, addressed by position for numpy
        _patch_quasi_cell(data2)
        np.testing.assert_array_equal(data2, data_ar)