qdscreen/tests/test_core.py - flake8 annotated source

1 # -*- coding: utf-8 -*-

2 # the above encoding declaration is needed to have non-ascii characters in this file (anywhere even in comments)

3 # from __future__ import unicode_literals # no, since we want to match the return type of str() which is bytes in py2

4 import sys

5

6 import numpy as np

7 import pandas as pd

8 import pytest

9

10 from qdscreen import QDForest, qd_screen

11 from qdscreen.compat import PY2

12 from qdscreen.main import get_adjacency_matrix, remove_redundancies

13 from qdscreen.sklearn import QDScreen

14

15

16 def df_strict1():

17 """ A dataframe with two equivalent redundant variables X and Y """

18 df = pd.DataFrame({

19 'X': ["a", "a", "b", "b", "a", "c", "c", "a", "b", "c"],

20 })

21 # 'Y': ["b", "b", "c", "c", "b", "a", "a", "b", "c", "a"],

22 df['Y'] = df['X'].replace("c", "d").replace("b", "c").replace("a", "b").replace("d", "a")

23 return df

24

25

26 def test_adjacency_strict():

27 """ Tests that `get_adjacency_matrix` works as expected in front of a dataset with redundancy """

28

29 # Compute the adjacency matrix

30 adj_df, df_stats = get_adjacency_matrix(df_strict1())

31 pd.testing.assert_frame_equal(adj_df, pd.DataFrame(data=[

32 [False, True],

33 [True, False]

34 ], index=['X', 'Y'], columns=['X', 'Y']))

35

36

37 def quasi_df1():

38 # in that matrix the H(X|Y) = H(Y|X) = 0.324511 and the relative H(X|Y)/H(X) = H(Y|X)/H(Y) = 0.20657

39 return pd.DataFrame({

40 'X': ["a", "a", "b", "b", "a", "c", "c", "a", "b", "c"],

41 'Y': ["a", "b", "c", "c", "b", "a", "a", "b", "c", "a"],

42 })

43

44

45 def test_adjacency_invalid_thresholds():

46 # an error is raised if both are provided, even if both are zero

47 with pytest.raises(ValueError):

48 get_adjacency_matrix(pd.DataFrame(), eps_absolute=0., eps_relative=0.)

49

50

51 @pytest.mark.parametrize("eps_absolute, eps_relative", [

52 (None, None), # strict mode

53 (0.2, None), # too low absolute

54 (None, 0.15) # too low relative

55 ])

56 def test_adjacency_quasi_low_threshold(eps_absolute, eps_relative):

57 """ Tests that `get_adjacency_matrix` works as expected in quasi mode """

58

59 # in that matrix the H(X|Y) = H(Y|X) = 0.324511 and the relative H(X|Y)/H(X) = H(Y|X)/H(Y) = 0.20657

60 # strict mode or with too low threshold: no arc is detected

61 adj_df, df_stats = get_adjacency_matrix(quasi_df1(), eps_absolute=eps_absolute, eps_relative=eps_relative)

62 pd.testing.assert_frame_equal(adj_df, pd.DataFrame(data=[

63 [False, False],

64 [False, False]

65 ], index=['X', 'Y'], columns=['X', 'Y']))

66

67

68 @pytest.mark.parametrize("eps_absolute, eps_relative", [

69 (0.33, None), # high enough absolute

70 (None, 0.21) # high enough relative

71 ])

72 def test_adjacency_quasi_high_threshold(eps_absolute, eps_relative):

73 """ Tests that `get_adjacency_matrix` works as expected in quasi mode """

74

75 # in that matrix the H(X|Y) = H(Y|X) = 0.324511 and the relative H(X|Y)/H(X) = H(Y|X)/H(Y) = 0.20657

76 # strict mode or with too low threshold: no arc is detected

77 adj_df, df_stats = get_adjacency_matrix(quasi_df1(), eps_absolute=eps_absolute, eps_relative=eps_relative)

78 pd.testing.assert_frame_equal(adj_df, pd.DataFrame(data=[

79 [False, True],

80 [True, False]

81 ], index=['X', 'Y'], columns=['X', 'Y']))

82

83

84 def test_remove_redundancies():

85 """ Tests that the redundancies removal routine works as expected """

86

87 # an adjacency matrix with two redundant nodes X and Y

88 adj_df = pd.DataFrame(data=[

89 [False, True],

90 [True, False]

91 ], index=['X', 'Y'], columns=['X', 'Y'])

92

93 # clean the redundancies with natural order: X is the representative of the class

94 adj_df_clean1 = remove_redundancies(adj_df)

95 pd.testing.assert_frame_equal(adj_df_clean1, pd.DataFrame(data=[

96 [False, True],

97 [False, False]

98 ], index=['X', 'Y'], columns=['X', 'Y']))

99

100 # clean the redundancies with reverse order: Y is the representative of the class

101 adj_df_clean2 = remove_redundancies(adj_df, selection_order=[1, 0])

102 pd.testing.assert_frame_equal(adj_df_clean2, pd.DataFrame(data=[

103 [False, False],

104 [True, False]

105 ], index=['X', 'Y'], columns=['X', 'Y']))

106

107

108 # def test_identify_redundancy_quasi():

109 # df = pd.DataFrame({

110 # 'U': ["a", "b", "d", "a", "b", "c", "a", "b", "d", "c"],

111 # })

112 # df['V'] = df['U'].replace("d", "c") # d -> c

113 # df['W'] = df['V'].replace("c", "b") # c -> b

114 #

115 # adj_df = get_adjacency_matrix(df)

116 # df2 = identify_redundancy(adj_df)

117

118

119 def get_qd_forest1(is_np):

120 """

121 Created a forest with two trees 3->5 and 9->(1,7)

122

123 :param is_np: if true

124 :return:

125 """

126 adjmat_ar = adjmat = np.zeros((10, 10), dtype=bool)

127 adjmat[1, 8] = True

128 adjmat[3, 5] = True

129 adjmat[9, 1] = True

130 adjmat[9, 7] = True

131

132 parents_ar = parents = -np.ones((10,),

E501 Line too long (124 > 120 characters)

133 dtype=np.int64) # indeed computing parents from adjmat with np.where returns this dtype

134 parents[5] = 3

135 parents[1] = 9

136 parents[7] = 9

137 parents[8] = 1

138

139 roots_ar = roots = np.array([0, 2, 3, 4, 6, 9])

140 roots_wc_ar = roots_wc = np.array([3, 9])

141

142 if not is_np:

143 varnames = list("abcdefghij")

144 adjmat = pd.DataFrame(adjmat_ar, columns=varnames, index=varnames)

145 parents = pd.DataFrame(parents_ar, index=varnames, columns=('idx',))

146 parents['name'] = parents.index[parents['idx']].where(parents['idx'] >= 0, None)

147 roots = np.array(varnames)[roots_ar]

148 roots_wc = np.array(varnames)[roots_wc_ar]

149

150 return adjmat, adjmat_ar, parents, parents_ar, roots, roots_ar, roots_wc, roots_wc_ar

151

152

153 @pytest.mark.parametrize("from_adjmat", [True, False], ids="from_adjmat={}".format)

154 @pytest.mark.parametrize("is_np", [True, False], ids="is_np={}".format)

155 def test_qd_forest(is_np, from_adjmat):

156 """Tests that QDForest works correctly whether created from adj matrix or parents list"""

157

158 adjmat, adjmat_ar, parents, parents_ar, roots, roots_ar, roots_wc, roots_wc_ar = get_qd_forest1(is_np)

159

160 if from_adjmat:

161 qd1 = QDForest(adjmat=adjmat) # a forest created from the adj matrix

162 else:

163 qd1 = QDForest(parents=parents) # a forest created from the parents coordinates

164

165 # roots

S101 Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

166 assert qd1.get_roots(names=False) == list(roots_ar)

167 np.testing.assert_array_equal(qd1.indices_to_mask(roots_ar), qd1.roots_mask_ar)

S101 Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

168 assert qd1.get_roots_with_children(names=False) == list(roots_wc_ar)

169 if not is_np:

170 pd.testing.assert_series_equal(qd1.indices_to_mask(roots), qd1.roots_mask)

S101 Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

171 assert qd1.get_roots(names=True) == list(roots)

S101 Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

172 assert qd1.get_roots_with_children(names=True) == list(roots_wc)

173 else:

174 with pytest.raises(ValueError):

175 qd1.get_roots(names=True)

176 with pytest.raises(ValueError):

177 qd1.get_roots_with_children(names=True)

178 # default

S101 Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

179 assert qd1.get_roots() == qd1.get_roots(names=not is_np)

S101 Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

180 assert qd1.get_roots_with_children() == qd1.get_roots_with_children(names=not is_np)

181

182 # string representation of arcs

183 # -- indices

184 if from_adjmat:

S101 Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

185 assert qd1.get_arcs_str_list(names=False) == ['1 -> 8', '3 -> 5', '9 -> 1', '9 -> 7']

186 else:

187 # make sure the adj matrix was not computed automatically, to test the right method

S101 Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

188 assert qd1._adjmat is None

189 # TODO order is not the same as above, see https://github.com/python-qds/qdscreen/issues/9

S101 Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

190 assert qd1.get_arcs_str_list(names=False) == ['9 -> 1', '3 -> 5', '9 -> 7', '1 -> 8']

191 # -- names

192 if not is_np:

193 if from_adjmat:

S101 Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

194 assert qd1.get_arcs_str_list(names=True) == ['b -> i', 'd -> f', 'j -> b', 'j -> h']

195 else:

196 # make sure the adj matrix was not computed automatically, to test the right method

S101 Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

197 assert qd1._adjmat is None

198 # TODO order is not the same as above, see https://github.com/python-qds/qdscreen/issues/9

S101 Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

199 assert qd1.get_arcs_str_list(names=True) == ['j -> b', 'd -> f', 'j -> h', 'b -> i']

200 else:

201 with pytest.raises(ValueError):

202 qd1.get_arcs_str_list(names=True)

203 # check the default value of `names`

S101 Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

204 assert qd1.get_arcs_str_list() == qd1.get_arcs_str_list(names=not is_np)

205

206 # equivalent adjmat and parents computation, to be sure

207 np.testing.assert_array_equal(qd1.parents_indices_ar, parents_ar)

208 if not is_np:

209 pd.testing.assert_frame_equal(qd1.parents, parents)

210

211 # equivalent adjmat computation, to be sure

212 np.testing.assert_array_equal(qd1.adjmat_ar, adjmat_ar)

213 if not is_np:

214 pd.testing.assert_frame_equal(qd1.adjmat, adjmat)

215

216

217 @pytest.mark.parametrize("from_adjmat", [True, False])

218 @pytest.mark.parametrize("is_np", [True, False])

219 def test_qd_forest_str(is_np, from_adjmat):

220 """Tests the string representation """

221

222 adjmat, adjmat_ar, parents, parents_ar, roots, roots_ar, roots_wc, roots_wc_ar = get_qd_forest1(is_np)

223

224 if from_adjmat:

225 qd1 = QDForest(adjmat=adjmat) # a forest created from the adj matrix

226 else:

227 qd1 = QDForest(parents=parents) # a forest created from the parents coordinates

228

229 # string representation

230 compact_str = qd1.to_str(mode="compact")

S101 Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

231 assert compact_str == "QDForest (10 vars = 6 roots + 4 determined by 2 of the roots)"

232

233 roots_str = "0, 2, 3*, 4, 6, 9*" if is_np else "a, c, d*, e, g, j*"

234 others_str = "1, 5, 7, 8" if is_np else "b, f, h, i"

235 headers_str = qd1.to_str(mode="headers")

S101 Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

236 assert headers_str == """QDForest (10 vars):

237 - 6 roots (4+2*): %s

238 - 4 other nodes: %s""" % (roots_str, others_str)

239 # this should be the default

S101 Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

240 assert qd1.to_str() == headers_str

241

242 trees_str = "\n" + "\n".join(qd1.get_trees_str_list())

243 if is_np:

244 # note the u for python 2 as in main.py we use unicode literals to cope with those non-base chars

S101 Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

245 assert trees_str == u"""

246 3

247 └─ 5

248

249 9

250 └─ 1

251 └─ 8

252 └─ 7

253 """

254 else:

255 # note the u for python 2 as in main.py we use unicode literals to cope with those non-base chars

S101 Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

256 assert trees_str == u"""

257 d

258 └─ f

259

260 j

261 └─ b

262 └─ i

263 └─ h

264 """

265 full_str = qd1.to_str(mode="full")

S101 Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

266 assert full_str == headers_str + u"\n" + trees_str

267 # this should be the default string representation if the nb vars is small enough

268 if PY2:

S101 Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

269 assert full_str.encode('utf-8') == str(qd1)

270 else:

S101 Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

271 assert full_str == str(qd1)

272

273

274 # def test_sklearn_compat():

275 # """Trying to make sure that this compatibility code works: it does NOT with scikit-learn 0.22.2.post1 :("""

276 # from qdscreen.compat import BaseEstimator

277 # assert BaseEstimator()._more_tags()['requires_y'] is False

278 # assert BaseEstimator()._get_tags()['requires_y'] is False

279

280

281 def test_nans_in_data():

282 """See https://github.com/python-qds/qdscreen/issues/28"""

283

284 df = pd.DataFrame([

285 ["A", "B"],

286 ["A", "B"],

287 ["N", np.nan],

288 ], columns=["a", "b"])

289

290 qd_forest = qd_screen(df)

291 feat_selector = qd_forest.fit_selector_model(df)

292 df_sel = feat_selector.remove_qd(df)

293 pd.testing.assert_frame_equal(df_sel, pd.DataFrame([

294 ["A"],

295 ["A"],

296 ["N"],

297 ], columns=["a"]))

298

299

300 def test_nans_in_data_sklearn():

301 """See https://github.com/python-qds/qdscreen/issues/28"""

302

303 df = pd.DataFrame([

304 ["A", "B"],

305 ["A", "B"],

306 ["N", np.nan],

307 ])

308

309 selector = QDScreen()

310 Xsel = selector.fit_transform(df.to_numpy())

311

S101 Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

312 assert Xsel.tolist() == [['A'], ['A'], ['N']]

313

314

315 def test_issue_37_non_categorical():

316 df = pd.DataFrame({

317 "nb": [1, 2],

318 "name": ["A", "B"]

319 })

320 with pytest.raises(ValueError, match="Provided dataframe columns contains non-categorical"):

321 qd_screen(df)

322

323

324 @pytest.mark.skipif(sys.version_info < (3, 6),

325 reason="This test is known to fail for 3.5 and 2.7, see GH#43")

326 def test_issue_40_nan_then_str():

327 df = pd.DataFrame({

328 "foo": ["1", "2"],

329 "bar": [np.nan, "B"]

330 })

331 qd_forest = qd_screen(df)

S101 Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

332 assert list(qd_forest.roots) == ["foo"]

333

334 feat_selector = qd_forest.fit_selector_model(df)

335 only_important_features_df = feat_selector.remove_qd(df)

S101 Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

336 assert list(only_important_features_df.columns) == ["foo"]

337

338 result = feat_selector.predict_qd(only_important_features_df)

339 pd.testing.assert_frame_equal(df, result)

qdscreen/tests/test_core.py source