Coverage for qdscreen/tests/test

1# -*- coding: utf-8 -*-

2# the above encoding declaration is needed to have non-ascii characters in this file (anywhere even in comments)

3# from __future__ import unicode_literals # no, since we want to match the return type of str() which is bytes in py2

4import sys

6import numpy as np

7import pandas as pd

8import pytest

10from qdscreen import qd_screen

try:
    from pathlib import Path
except ImportError:
    # fallback on the pathlib2 package when the stdlib module is not available
    from pathlib2 import Path


# True when the interpreter running the tests is older than Python 3
PY2 = sys.version_info < (3,)

def df_mix1():
    """Return the small categorical dataset reused by the tests of this module.

    The frame has 10 rows and six string columns named 'U', 'V', 'W', 'X', 'Y' and 'Z'.
    A new DataFrame is built on every call.
    """
    df = pd.DataFrame({
        'U': ["a", "b", "d", "a", "b", "c", "a", "b", "d", "c"],
        'V': ["a", "b", "c", "a", "b", "c", "a", "b", "c", "c"],
        'W': ["a", "b", "b", "a", "b", "b", "a", "b", "b", "b"],
        'X': ["a", "a", "b", "b", "a", "c", "c", "a", "b", "c"],
        'Y': ["b", "b", "c", "c", "b", "a", "a", "b", "c", "a"],
        'Z': ["a", "a", "b", "a", "a", "b", "b", "a", "a", "b"]  # if the 3d element were an "a", X->Z and Y->Z
    })

    # In this data: the following relationships hold:
    # H(W|V) = 0    H(W|U) = 0    H(V|U) = 0          # U -> V -> W
    # H(Y|X) = H(X|Y) = 0                             # Y <-> X but X selected as representative as 1st appearing
    # H(Z|Y) = H(Z|X) = 0.275 AND H(Z|Y) / H(Z) = 0.284
    #   -> X -> Z in quasi mode; the tests below use absolute_eps=0.28 or relative_eps=0.29 for this
    return df

def test_encoding(capsys):
    """A small test to be sure we can compare the tree text characters correctly.

    It prints the tree characters, compares the captured output with a unicode literal, then
    compares plain string literals with ``str()`` and ``repr()`` of the ``Foo`` helper.
    """
    print("└─")
    captured = capsys.readouterr()
    with capsys.disabled():
        # here this is unicode match
        assert captured.out == u"└─\n"

        # here this is bytes match
        from .encoding_ref_help import Foo
        assert "└─ab\n" == str(Foo())
        assert "└─ab\n" == repr(Foo())

def test_readme_simple(capsys):
    """The exact scenario shown in the readme.

    Screens the reference dataset in strict mode, checks the printed forest, then checks the
    column lists printed after fitting a selector model, removing the dependent columns and
    restoring them.
    """
    df = df_mix1()

    # detect strict deterministic relationships
    qd_forest = qd_screen(df)
    print(qd_forest)
    captured = capsys.readouterr()
    with capsys.disabled():
        # note the u for unicode string in py2, see test_encoding
        # NOTE(review): the indentation of the nested "└─ W" line was collapsed in the report this
        # block was recovered from; 3 spaces are assumed here - confirm against the printed forest.
        assert "\n" + captured.out == u"""
QDForest (6 vars):
 - 3 roots (1+2*): U*, X*, Z
 - 3 other nodes: V, W, Y

U
└─ V
   └─ W

X
└─ Y

"""
    # these prints must stay outside of `capsys.disabled()` so that they are captured below
    print("Columns in df: %s" % list(df.columns))

    # fit a feature selection model
    feat_selector = qd_forest.fit_selector_model(df)

    # use it to filter...
    only_important_features = feat_selector.remove_qd(df)
    print("Columns in only_important_features: %s" % list(only_important_features.columns))

    # or to restore/predict
    restored_full_df = feat_selector.predict_qd(only_important_features)
    print("Columns in restored_full_df: %s" % list(restored_full_df.columns))

    # note that the order of columns differs from origin
    pd.testing.assert_frame_equal(df, restored_full_df[df.columns])

    captured = capsys.readouterr()
    with capsys.disabled():
        assert "\n" + captured.out == """
Columns in df: ['U', 'V', 'W', 'X', 'Y', 'Z']
Columns in only_important_features: ['U', 'X', 'Z']
Columns in restored_full_df: ['U', 'X', 'Z', 'V', 'W', 'Y']
"""

# test_readme_quasi: compares the printed form of (1) the statistics kept by
# qd_screen(df, keep_stats=True), (2) the forest screened with relative_eps=0.29 and
# (3) the first 10 rows of get_entropies_table(from_to=False, sort_by="rel_cond_entropy")
# with reference texts; when PY2 is false it also calls plot_increasing_entropies()
# and closes all matplotlib figures.
# NOTE(review): this block is a coverage-report dump - the source line number is fused
# to the start of every line and runs of whitespace appear collapsed, so the column
# alignment inside the expected-output literals below is presumably lost; restore it
# from the real test file before relying on these literals - TODO confirm.
103def test_readme_quasi(capsys):

104 df = df_mix1()

105

106 # keep stats at the end of the screening

107 qd_forest = qd_screen(df, keep_stats=True)

108 print(qd_forest.stats)

109

110 captured = capsys.readouterr()

111 with capsys.disabled():

112 # print(captured.out)

113 assert "\n" + captured.out == """

114Statistics computed for dataset:

115 U V W X Y Z

1160 a a a a b a

1171 b b b a b a

118...(10 rows)

119

120Entropies (H):

121U 1.970951

122V 1.570951

123W 0.881291

124X 1.570951

125Y 1.570951

126Z 0.970951

127dtype: float64

128

129Conditional entropies (Hcond = H(row|col)):

130 U V W X Y Z

131U 0.000000 0.400000 1.089660 0.875489 0.875489 1.475489

132V 0.000000 0.000000 0.689660 0.875489 0.875489 1.200000

133W 0.000000 0.000000 0.000000 0.875489 0.875489 0.875489

134X 0.475489 0.875489 1.565148 0.000000 0.000000 0.875489

135Y 0.475489 0.875489 1.565148 0.000000 0.000000 0.875489

136Z 0.475489 0.600000 0.965148 0.275489 0.275489 0.000000

137

138Relative conditional entropies (Hcond_rel = H(row|col)/H(row)):

139 U V W X Y Z

140U 0.000000 0.202948 0.552860 0.444196 0.444196 0.748618

141V 0.000000 0.000000 0.439008 0.557299 0.557299 0.763869

142W 0.000000 0.000000 0.000000 0.993416 0.993416 0.993416

143X 0.302676 0.557299 0.996307 0.000000 0.000000 0.557299

144Y 0.302676 0.557299 0.996307 0.000000 0.000000 0.557299

145Z 0.489715 0.617951 0.994024 0.283731 0.283731 0.000000

146

147"""

148 qd_forest2 = qd_screen(df, relative_eps=0.29)

149 print(qd_forest2)

150 captured = capsys.readouterr()

151 with capsys.disabled():

152 # print(captured.out)

153 # note the u for unicode string in py2, see test_encoding

154 assert "\n" + captured.out == u"""

155QDForest (6 vars):

156 - 2 roots (0+2*): U*, X*

157 - 4 other nodes: V, W, Y, Z

158

159U

160└─ V

161 └─ W

162

163X

164└─ Y

165└─ Z

166

167"""

168 ce_df = qd_forest.get_entropies_table(from_to=False, sort_by="rel_cond_entropy")

169 print(ce_df.head(10))

170 captured = capsys.readouterr()

171 with capsys.disabled():

172 # print(captured.out)

173 assert "\n" + captured.out == """

174 cond_entropy rel_cond_entropy

175arc

176U->V 0.000000 0.000000

177U->W 0.000000 0.000000

178V->W 0.000000 0.000000

179Y->X 0.000000 0.000000

180X->Y 0.000000 0.000000

181V->U 0.400000 0.202948

182Y->Z 0.275489 0.283731

183X->Z 0.275489 0.283731

184U->X 0.475489 0.302676

185U->Y 0.475489 0.302676

186"""

# NOTE(review): on the next line, the text following `if not PY2:` looks like a
# coverage-report branch annotation rather than source code - TODO confirm and strip.
187 if not PY2: 187 ↛ exitline 187 didn't return from function 'test_readme_quasi', because the condition on line 187 was never false

188 # we have issues on travis CI with matplotlib on PY2: skip

189 import matplotlib.pyplot as plt

190 qd_forest.plot_increasing_entropies()

191 # fig = plt.gcf()

192 # fig.savefig(str(IMGS_FOLDER / "increasing_entropies.png"))

193 plt.close("all")

194

195

@pytest.mark.parametrize("typ", ['int', 'str', 'mixed'])
def test_readme_skl(typ):
    """Check the scikit-learn style `QDScreen` selector on a tiny 3x4 table.

    The 'int' variant passes a list of lists of integers, the 'str' variant passes the same table
    converted to a numpy array of strings, and the 'mixed' variant is skipped. In both executed
    variants the transformed data is expected to contain only the third column.
    """
    from qdscreen.sklearn import QDScreen

    if typ in ('int', 'str'):
        X = [[0, 2, 0, 3],
             [0, 1, 4, 3],
             [0, 1, 1, 3]]
        if typ == 'str':
            X = np.array(X).astype(str)

    elif typ == 'mixed':
        # X = [['0', 2, '0', 3],
        #      ['0', 1, '4', 3],
        #      ['0', 1, '1', 3]]
        # X = np.array(X)
        # assert X.dtype == '<U1'  #str
        pytest.skip("Mixed dtypes do not exist in numpy unstructured arrays")
    else:
        # report the offending parameter, as test_readme does for its own parameter
        raise ValueError(typ)

    selector = QDScreen()
    X2 = selector.fit_transform(X)
    expected_res = [[0], [4], [1]] if typ == 'int' else [['0'], ['4'], ['1']]
    np.testing.assert_array_equal(X2, np.array(expected_res))

@pytest.mark.parametrize("input_type", ["pandas",
                                        "numpy_unstructured",
                                        # "numpy_structured", "numpy_recarray"
                                        ])
def test_readme(input_type):
    """All variants that exist around the nominal scenario from the readme.

    The reference dataset is used either as a pandas DataFrame or as an unstructured numpy array.
    It goes through the scikit-learn style `QDScreen` transformer, then through `qd_screen` and the
    selector model it fits, first in strict mode and then in quasi mode (`absolute_eps=0.28` and
    `relative_eps=0.29`).

    :param input_type: "pandas" or "numpy_unstructured"; any other value raises a `ValueError`.
    """
    if input_type == "pandas":
        df_orig = data = df_mix1()
        var_names = list(data.columns)
        data_ar = data.values
    elif input_type == "numpy_unstructured":
        df_orig = df_mix1()
        var_names = list(df_orig.columns)
        data = df_orig.to_numpy(copy=True)  # to_numpy(df_orig, copy=True)
        data_ar = data
    # elif input_type == "numpy_structured":
    #     df_orig = df_mix1()
    #     var_names = list(df_orig.columns)
    #     data = df_orig.to_records()
    #     data = data.view(data.dtype.fields or data.dtype, np.ndarray)[var_names].copy()
    #     data_ar = data
    # elif input_type == "numpy_recarray":
    #     df_orig = df_mix1()
    #     var_names = list(df_orig.columns)
    #     data = df_orig.to_records()[var_names].copy()
    #     data_ar = data
    #     data.dtype.names = tuple(range(len(var_names)))
    else:
        raise ValueError(input_type)

    # ref roots arrays
    data_roots_df = df_orig[['U', 'X', 'Z']]
    data_roots_ar = data_roots_df.values
    data_roots_quasi_df = df_orig[['U', 'X']]
    data_roots_quasi_ar = data_roots_quasi_df.values

    nb_vars = len(var_names)

    # check the stats, for debug
    # df_stats = Entropies(data)

    # Sklearn use case
    from qdscreen.sklearn import QDScreen
    if input_type not in ("numpy_structured", "numpy_recarray"):
        # --strict
        sel = QDScreen()
        sel_data = sel.fit_transform(data_ar)
        np.testing.assert_array_equal(sel_data, data_roots_ar)
        data2 = sel.inverse_transform(sel_data)
        np.testing.assert_array_equal(data2, data_ar)

        # --quasi absolute, then quasi relative - in both cases Z should become a child of X
        for eps_kwargs in ({"absolute_eps": 0.28}, {"relative_eps": 0.29}):
            sel = QDScreen(**eps_kwargs)
            sel_data = sel.fit_transform(data_ar)
            np.testing.assert_array_equal(sel_data, data_roots_quasi_ar)
            data2 = sel.inverse_transform(sel_data)
            # the prediction is incorrect since this is "quasi". Fix the single element that differs from input
            assert data2[2, 5] == "a"
            data2[2, 5] = "b"
            np.testing.assert_array_equal(data2, data_ar)

    # qdscreen full use case
    # (1) strict
    # -- (a) forest
    ref = pd.DataFrame(data=np.zeros((nb_vars, nb_vars), dtype=bool), index=var_names, columns=var_names)
    ref.loc['U', 'V'] = True
    ref.loc['V', 'W'] = True
    ref.loc['X', 'Y'] = True
    qd_forest = qd_screen(data)
    adj_mat = qd_forest.adjmat
    if input_type != "pandas":
        adj_mat = pd.DataFrame(adj_mat, columns=var_names, index=var_names)
    pd.testing.assert_frame_equal(adj_mat, ref)

    # -- (b) feature selector
    if input_type == "pandas":
        # Note: this should be also tested with numpy structured & records input type
        # we can test that randomized column order still works
        new_cols_order = np.array(data.columns)
        np.random.shuffle(new_cols_order)
        new_roots_order = [c for c in new_cols_order if c in data_roots_df.columns]
        data_randcol = data[new_cols_order].copy()
    else:
        # numpy: index has meaning, no randomization possible
        data_randcol = data
    selector_model = qd_forest.fit_selector_model(data_randcol)
    sel2 = selector_model.remove_qd(data_randcol)
    if input_type == "pandas":
        pd.testing.assert_frame_equal(sel2, data_roots_df[new_roots_order])
    # elif input_type == "numpy_structured":
    #     sel2u = np.array(sel2.tolist(), dtype=object)
    #     np.testing.assert_array_equal(sel2u, data_roots_ar)
    else:
        np.testing.assert_array_equal(sel2, data_roots_ar)
    data2 = selector_model.predict_qd(sel2)
    if input_type == "pandas":
        pd.testing.assert_frame_equal(data2[data.columns], data)
    # elif input_type == "numpy_structured":
    #     np.testing.assert_array_equal(data2, data_ar)
    else:
        np.testing.assert_array_equal(data2, data_ar)

    # (2) quasi
    # -- (a) forest
    ref.loc['X', 'Z'] = True
    # Z is attached when the qd threshold is 0.28 (absolute) or 0.29 (relative)
    qd_forest = qd_screen(data, absolute_eps=0.28)
    adj_mat = qd_forest.adjmat
    if input_type != "pandas":
        adj_mat = pd.DataFrame(adj_mat, columns=var_names, index=var_names)
    pd.testing.assert_frame_equal(adj_mat, ref)

    qd_forest = qd_screen(data, relative_eps=0.29)
    adj_mat = qd_forest.adjmat
    if input_type != "pandas":
        assert isinstance(qd_forest.parents, np.ndarray)
        assert isinstance(adj_mat, np.ndarray)
        adj_mat = pd.DataFrame(adj_mat, columns=var_names, index=var_names)
    else:
        assert isinstance(qd_forest.parents, pd.DataFrame)
        assert list(qd_forest.parents.index) == var_names
    pd.testing.assert_frame_equal(adj_mat, ref)

    # -- (b) feature selector

    selector_model = qd_forest.fit_selector_model(data_randcol)
    sel2 = selector_model.remove_qd(data_randcol)
    if input_type == "pandas":
        new_roots_order_quasi = [r for r in new_roots_order if r != 'Z']
        pd.testing.assert_frame_equal(sel2, data_roots_df[new_roots_order_quasi])
    # elif input_type == "numpy_structured":
    #     sel2u = np.array(sel2.tolist(), dtype=object)
    #     np.testing.assert_array_equal(sel2u, data_roots_ar)
    else:
        np.testing.assert_array_equal(sel2, data_roots_quasi_ar)
    data2 = selector_model.predict_qd(sel2)
    if input_type == "pandas":
        # the prediction is incorrect since this is "quasi". Fix the single element that differs from input
        assert data2.loc[2, "Z"] == "a"
        data2.loc[2, "Z"] = "b"
        pd.testing.assert_frame_equal(data2[data.columns], data)
    # elif input_type == "numpy_structured":
    #     np.testing.assert_array_equal(data2, data_ar)
    else:
        # the prediction is incorrect since this is "quasi". Fix the single element that differs from input
        assert data2[2, 5] == "a"
        data2[2, 5] = "b"
        np.testing.assert_array_equal(data2, data_ar)

Coverage for qdscreen/tests/test_readme.py: 97%

162 statements