qdscreen/tests/test_readme.py - flake8 annotated source

1 # -*- coding: utf-8 -*-

2 # the above encoding declaration is needed to have non-ascii characters in this file (anywhere even in comments)

3 # from __future__ import unicode_literals # no, since we want to match the return type of str() which is bytes in py2

4 import sys

5

6 import numpy as np

7 import pandas as pd

8 import pytest

9

10 from qdscreen import qd_screen

11

12

13 try:

14 from pathlib import Path

15 except ImportError:

F401 'pathlib2.Path' imported but unused

16 from pathlib2 import Path

17

18

# True when running on Python 2; test_readme_quasi uses it to skip the matplotlib plotting step.
19 PY2 = sys.version_info < (3,)

20

21

def df_mix1():
    """Build the small categorical dataset that the tests of this module share.

    Returns a 10-row ``pandas.DataFrame`` with the six string columns U, V, W, X, Y and Z.
    """
    # Each column is spelled as one string: one character per row.
    df = pd.DataFrame({
        'U': list("abdabcabdc"),
        'V': list("abcabcabcc"),
        'W': list("abbabbabbb"),
        'X': list("aabbaccabc"),
        'Y': list("bbccbaabca"),
        # if the 3rd element of Z were an "a", X -> Z and Y -> Z would hold strictly
        'Z': list("aabaabbaab"),
    })

    # Relationships that hold in this data:
    #  - H(W|V) = 0, H(W|U) = 0, H(V|U) = 0         : U -> V -> W
    #  - H(Y|X) = H(X|Y) = 0                         : Y <-> X, X is kept as the representative (it appears first)
    #  - H(Z|Y) = H(Z|X) = 0.275, H(Z|Y)/H(Z) = 0.284 : X -> Z only in quasi-deterministic mode; the tests of this
    #    module use absolute_eps=0.28 and relative_eps=0.29 to obtain it
    return df

38

39

def test_encoding(capsys):
    """Make sure the characters used to draw trees can be compared reliably in tests."""
    print("└─")
    printed = capsys.readouterr().out
    with capsys.disabled():
        # the captured output is compared against a unicode literal
        assert printed == u"└─\n"

        # str() and repr() results are compared against a native `str` literal (bytes on python 2)
        from .encoding_ref_help import Foo
        expected = "└─ab\n"
        assert expected == str(Foo())
        assert expected == repr(Foo())

52

53

54 def test_readme_simple(capsys):

55 """The exact scenario shown in the readme"""

# Prints the forest and the column lists, then compares the captured stdout with the expected text.

56

57 df = df_mix1()

58

59 # detect strict deterministic relationships

60 qd_forest = qd_screen(df)

T001 Print found.

61 print(qd_forest)

62 captured = capsys.readouterr()

63 with capsys.disabled():

64 # note the u for unicode string in py2, see test_encoding

S101 Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

65 assert "\n" + captured.out == u"""

66 QDForest (6 vars):

67 - 3 roots (1+2*): U*, X*, Z

68 - 3 other nodes: V, W, Y

69

70 U

71 └─ V

72 └─ W

73

74 X

75 └─ Y

76

77 """

T001 Print found.

78 print("Columns in df: %s" % list(df.columns))

79

80 # fit a feature selection model

81 feat_selector = qd_forest.fit_selector_model(df)

82

83 # use it to filter...

84 only_important_features = feat_selector.remove_qd(df)

T001 Print found.

85 print("Columns in only_important_features: %s" % list(only_important_features.columns))

86

87 # or to restore/predict

88 restored_full_df = feat_selector.predict_qd(only_important_features)

T001 Print found.

89 print("Columns in restored_full_df: %s" % list(restored_full_df.columns))

90

91 # note that the order of columns differs from the original one, hence the reindexing with df.columns

92 pd.testing.assert_frame_equal(df, restored_full_df[df.columns])

93

94 captured = capsys.readouterr()

95 with capsys.disabled():

S101 Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

96 assert "\n" + captured.out == """

97 Columns in df: ['U', 'V', 'W', 'X', 'Y', 'Z']

98 Columns in only_important_features: ['U', 'X', 'Z']

99 Columns in restored_full_df: ['U', 'X', 'Z', 'V', 'W', 'Y']

100 """

101

102

103 def test_readme_quasi(capsys):

# Quasi-deterministic scenario: checks the printed statistics, the forest obtained with relative_eps=0.29,

# the first 10 rows of the entropies table, and (python 3 only) that the entropies plot can be drawn.

104 df = df_mix1()

105

106 # keep stats at the end of the screening

107 qd_forest = qd_screen(df, keep_stats=True)

T001 Print found.

108 print(qd_forest.stats)

109

110 captured = capsys.readouterr()

111 with capsys.disabled():

112 # print(captured.out)

S101 Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

113 assert "\n" + captured.out == """

114 Statistics computed for dataset:

115 U V W X Y Z

116 0 a a a a b a

117 1 b b b a b a

118 ...(10 rows)

119

120 Entropies (H):

121 U 1.970951

122 V 1.570951

123 W 0.881291

124 X 1.570951

125 Y 1.570951

126 Z 0.970951

127 dtype: float64

128

129 Conditional entropies (Hcond = H(row|col)):

130 U V W X Y Z

131 U 0.000000 0.400000 1.089660 0.875489 0.875489 1.475489

132 V 0.000000 0.000000 0.689660 0.875489 0.875489 1.200000

133 W 0.000000 0.000000 0.000000 0.875489 0.875489 0.875489

134 X 0.475489 0.875489 1.565148 0.000000 0.000000 0.875489

135 Y 0.475489 0.875489 1.565148 0.000000 0.000000 0.875489

136 Z 0.475489 0.600000 0.965148 0.275489 0.275489 0.000000

137

138 Relative conditional entropies (Hcond_rel = H(row|col)/H(row)):

139 U V W X Y Z

140 U 0.000000 0.202948 0.552860 0.444196 0.444196 0.748618

141 V 0.000000 0.000000 0.439008 0.557299 0.557299 0.763869

142 W 0.000000 0.000000 0.000000 0.993416 0.993416 0.993416

143 X 0.302676 0.557299 0.996307 0.000000 0.000000 0.557299

144 Y 0.302676 0.557299 0.996307 0.000000 0.000000 0.557299

145 Z 0.489715 0.617951 0.994024 0.283731 0.283731 0.000000

146

147 """

148 qd_forest2 = qd_screen(df, relative_eps=0.29)

T001 Print found.

149 print(qd_forest2)

150 captured = capsys.readouterr()

151 with capsys.disabled():

152 # print(captured.out)

153 # note the u for unicode string in py2, see test_encoding

S101 Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

154 assert "\n" + captured.out == u"""

155 QDForest (6 vars):

156 - 2 roots (0+2*): U*, X*

157 - 4 other nodes: V, W, Y, Z

158

159 U

160 └─ V

161 └─ W

162

163 X

164 └─ Y

165 └─ Z

166

167 """

168 ce_df = qd_forest.get_entropies_table(from_to=False, sort_by="rel_cond_entropy")

T001 Print found.

169 print(ce_df.head(10))

170 captured = capsys.readouterr()

171 with capsys.disabled():

172 # print(captured.out)

S101 Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

173 assert "\n" + captured.out == """

174 cond_entropy rel_cond_entropy

W291 Trailing whitespace

175 arc

176 U->V 0.000000 0.000000

177 U->W 0.000000 0.000000

178 V->W 0.000000 0.000000

179 Y->X 0.000000 0.000000

180 X->Y 0.000000 0.000000

181 V->U 0.400000 0.202948

182 Y->Z 0.275489 0.283731

183 X->Z 0.275489 0.283731

184 U->X 0.475489 0.302676

185 U->Y 0.475489 0.302676

186 """

187 if not PY2:

188 # we have issues on travis CI with matplotlib on PY2: skip

189 import matplotlib.pyplot as plt

190 qd_forest.plot_increasing_entropies()

191 # fig = plt.gcf()

192 # fig.savefig(str(IMGS_FOLDER / "increasing_entropies.png"))

193 plt.close("all")

194

195

@pytest.mark.parametrize("typ", ['int', 'str', 'mixed'])
def test_readme_skl(typ):
    """Check the scikit-learn style ``QDScreen`` selector on the small array of the readme.

    :param typ: kind of input to feed the selector with: 'int' (a plain list of lists of ints),
        'str' (the same values as a numpy array of strings) or 'mixed' (skipped).
    :raises ValueError: if ``typ`` is none of the supported values.
    """
    from qdscreen.sklearn import QDScreen

    if typ == 'mixed':
        # An unstructured numpy array has a single dtype, so a "mixed" input cannot be built.
        pytest.skip("Mixed dtypes do not exist in numpy unstructured arrays")
    elif typ not in ('int', 'str'):
        # Name the offending value so that a wrong parametrization is easy to diagnose.
        raise ValueError("Unsupported typ: %r" % (typ,))

    X = [[0, 2, 0, 3],
         [0, 1, 4, 3],
         [0, 1, 1, 3]]
    if typ == 'str':
        X = np.array(X).astype(str)

    selector = QDScreen()
    X2 = selector.fit_transform(X)

    # The expected result contains the values of the third column only.
    expected_res = [[0], [4], [1]] if typ == 'int' else [['0'], ['4'], ['1']]
    np.testing.assert_array_equal(X2, np.array(expected_res))

221

222

@pytest.mark.parametrize(
    "input_type",
    [
        "pandas",
        "numpy_unstructured",
        # "numpy_structured", "numpy_recarray"  -> not exercised yet
    ]
)
def test_readme(input_type):
    """Run all the variants around the nominal readme scenario for one kind of input container.

    :param input_type: "pandas" to work on the dataframe itself, "numpy_unstructured" to work on a
        numpy copy of it. Any other value raises a ``ValueError``.
    """
    if input_type not in ("pandas", "numpy_unstructured"):
        raise ValueError(input_type)

    # ---- inputs
    df_orig = df_mix1()
    var_names = list(df_orig.columns)
    nb_vars = len(var_names)
    is_pandas = input_type == "pandas"
    if is_pandas:
        data = df_orig
        data_ar = data.values
    else:
        data = df_orig.to_numpy(copy=True)
        data_ar = data
    # Structured arrays and recarrays (e.g. built from `df_orig.to_records()`) are not covered here yet.

    # ---- reference arrays containing only the roots (strict, then quasi)
    data_roots_df = df_orig[['U', 'X', 'Z']]
    data_roots_ar = data_roots_df.values
    data_roots_quasi_df = df_orig[['U', 'X']]
    data_roots_quasi_ar = data_roots_quasi_df.values

    # ---- scikit-learn use case
    from qdscreen.sklearn import QDScreen
    if input_type not in ("numpy_structured", "numpy_recarray"):
        # strict: the three roots are kept and the full array can be rebuilt from them
        sel = QDScreen()
        sel_data = sel.fit_transform(data_ar)
        np.testing.assert_array_equal(sel_data, data_roots_ar)
        np.testing.assert_array_equal(sel.inverse_transform(sel_data), data_ar)

        # quasi, with an absolute then a relative threshold: Z should become a child of X
        for quasi_kwargs in ({'absolute_eps': 0.28}, {'relative_eps': 0.29}):
            sel = QDScreen(**quasi_kwargs)
            sel_data = sel.fit_transform(data_ar)
            np.testing.assert_array_equal(sel_data, data_roots_quasi_ar)
            data2 = sel.inverse_transform(sel_data)
            # the prediction is not exact since this is "quasi": patch the single element that differs
            assert data2[2, 5] == "a"
            data2[2, 5] = "b"
            np.testing.assert_array_equal(data2, data_ar)

    # ---- qdscreen full use case
    # (1) strict
    # -- (a) forest: expected adjacency matrix, rows are parents and columns are children
    ref = pd.DataFrame(data=np.zeros((nb_vars, nb_vars), dtype=bool), index=var_names, columns=var_names)
    for parent, child in (('U', 'V'), ('V', 'W'), ('X', 'Y')):
        ref.loc[parent, child] = True

    qd_forest = qd_screen(data)
    adj_mat = qd_forest.adjmat
    if not is_pandas:
        adj_mat = pd.DataFrame(adj_mat, columns=var_names, index=var_names)
    pd.testing.assert_frame_equal(adj_mat, ref)

    # -- (b) feature selector
    if is_pandas:
        # Note: this should also be tested with numpy structured & records input types.
        # With named columns we can check that a randomized column order still works.
        new_cols_order = np.array(data.columns)
        np.random.shuffle(new_cols_order)
        new_roots_order = [c for c in new_cols_order if c in data_roots_df.columns]
        data_randcol = data[new_cols_order].copy()
    else:
        # numpy: the column index carries the meaning, so no randomization is possible
        data_randcol = data

    selector_model = qd_forest.fit_selector_model(data_randcol)
    sel2 = selector_model.remove_qd(data_randcol)
    if is_pandas:
        pd.testing.assert_frame_equal(sel2, data_roots_df[new_roots_order])
    else:
        np.testing.assert_array_equal(sel2, data_roots_ar)

    data2 = selector_model.predict_qd(sel2)
    if is_pandas:
        pd.testing.assert_frame_equal(data2[data.columns], data)
    else:
        np.testing.assert_array_equal(data2, data_ar)

    # (2) quasi
    # -- (a) forest
    # Z becomes a child of X if the qd threshold is > 0.28 (absolute) or 0.29 (relative)
    ref.loc['X', 'Z'] = True

    qd_forest = qd_screen(data, absolute_eps=0.28)
    adj_mat = qd_forest.adjmat
    if not is_pandas:
        adj_mat = pd.DataFrame(adj_mat, columns=var_names, index=var_names)
    pd.testing.assert_frame_equal(adj_mat, ref)

    qd_forest = qd_screen(data, relative_eps=0.29)
    adj_mat = qd_forest.adjmat
    if is_pandas:
        assert isinstance(qd_forest.parents, pd.DataFrame)
        assert list(qd_forest.parents.index) == var_names
    else:
        assert isinstance(qd_forest.parents, np.ndarray)
        assert isinstance(adj_mat, np.ndarray)
        adj_mat = pd.DataFrame(adj_mat, columns=var_names, index=var_names)
    pd.testing.assert_frame_equal(adj_mat, ref)

    # -- (b) feature selector
    selector_model = qd_forest.fit_selector_model(data_randcol)
    sel2 = selector_model.remove_qd(data_randcol)
    if is_pandas:
        new_roots_order_quasi = [r for r in new_roots_order if r != 'Z']
        pd.testing.assert_frame_equal(sel2, data_roots_df[new_roots_order_quasi])
    else:
        np.testing.assert_array_equal(sel2, data_roots_quasi_ar)

    data2 = selector_model.predict_qd(sel2)
    # The prediction is not exact since this is "quasi": patch the single element that differs from the input.
    if is_pandas:
        assert data2.loc[2, "Z"] == "a"
        data2.loc[2, "Z"] = "b"
        pd.testing.assert_frame_equal(data2[data.columns], data)
    else:
        assert data2[2, 5] == "a"
        data2[2, 5] = "b"
        np.testing.assert_array_equal(data2, data_ar)

qdscreen/tests/test_readme.py source