Coverage for qdscreen/tests/test

# -*- coding: utf-8 -*-
# the above encoding declaration is needed to have non-ascii characters in this file (anywhere, even in comments)
# from __future__ import unicode_literals  # no, since we want to match the return type of str() which is bytes in py2

import sys

import numpy as np
import pandas as pd
import pytest

from qdscreen import QDForest, qd_screen
from qdscreen.compat import PY2
from qdscreen.main import get_adjacency_matrix, remove_redundancies
from qdscreen.sklearn import QDScreen

def df_strict1():
    """Build a dataframe with two equivalent (mutually redundant) variables X and Y.

    ``Y`` is obtained from ``X`` by the cyclic relabelling a -> b, b -> c, c -> a,
    so each column is a one-to-one function of the other.

    :return: a ``pd.DataFrame`` with 10 rows and the two columns ``'X'`` and ``'Y'``
    """
    df = pd.DataFrame({
        'X': ["a", "a", "b", "b", "a", "c", "c", "a", "b", "c"],
    })
    # Resulting 'Y': ["b", "b", "c", "c", "b", "a", "a", "b", "c", "a"]
    # "d" is a temporary placeholder so that the chained replacements do not collide with each other.
    df['Y'] = df['X'].replace("c", "d").replace("b", "c").replace("a", "b").replace("d", "a")
    return df

def test_adjacency_strict():
    """Tests that `get_adjacency_matrix` works as expected in front of a dataset with redundancy"""

    # Compute the adjacency matrix; the second returned item is not needed here
    adj_df, _ = get_adjacency_matrix(df_strict1())

    # X and Y are equivalent, so an arc is expected in both directions (and none on the diagonal)
    pd.testing.assert_frame_equal(adj_df, pd.DataFrame(data=[
        [False, True],
        [True, False]
    ], index=['X', 'Y'], columns=['X', 'Y']))

def quasi_df1():
    """Build a two-column dataframe where X and Y are only quasi-redundant.

    It is the same data as a strict X <-> Y relabelling except for the first row, where ``Y`` is "a".

    :return: a ``pd.DataFrame`` with 10 rows and the two columns ``'X'`` and ``'Y'``
    """
    # in that matrix the H(X|Y) = H(Y|X) = 0.324511 and the relative H(X|Y)/H(X) = H(Y|X)/H(Y) = 0.20657
    return pd.DataFrame({
        'X': ["a", "a", "b", "b", "a", "c", "c", "a", "b", "c"],
        'Y': ["a", "b", "c", "c", "b", "a", "a", "b", "c", "a"],
    })

def test_adjacency_invalid_thresholds():
    """Tests that `get_adjacency_matrix` rejects `eps_absolute` and `eps_relative` being provided together"""

    # an error is raised if both are provided, even if both are zero
    with pytest.raises(ValueError):
        get_adjacency_matrix(pd.DataFrame(), eps_absolute=0., eps_relative=0.)

@pytest.mark.parametrize("eps_absolute, eps_relative", [
    (None, None),  # strict mode
    (0.2, None),   # too low absolute
    (None, 0.15)   # too low relative
])
def test_adjacency_quasi_low_threshold(eps_absolute, eps_relative):
    """Tests that `get_adjacency_matrix` detects no arc in strict mode or when the quasi threshold is too low"""

    # in that matrix the H(X|Y) = H(Y|X) = 0.324511 and the relative H(X|Y)/H(X) = H(Y|X)/H(Y) = 0.20657
    # strict mode or with too low threshold: no arc is detected
    adj_df, _ = get_adjacency_matrix(quasi_df1(), eps_absolute=eps_absolute, eps_relative=eps_relative)
    pd.testing.assert_frame_equal(adj_df, pd.DataFrame(data=[
        [False, False],
        [False, False]
    ], index=['X', 'Y'], columns=['X', 'Y']))

@pytest.mark.parametrize("eps_absolute, eps_relative", [
    (0.33, None),  # high enough absolute
    (None, 0.21)   # high enough relative
])
def test_adjacency_quasi_high_threshold(eps_absolute, eps_relative):
    """Tests that `get_adjacency_matrix` detects the arcs in quasi mode when the threshold is high enough"""

    # in that matrix the H(X|Y) = H(Y|X) = 0.324511 and the relative H(X|Y)/H(X) = H(Y|X)/H(Y) = 0.20657
    # the threshold is just above those values: an arc is detected in both directions
    adj_df, _ = get_adjacency_matrix(quasi_df1(), eps_absolute=eps_absolute, eps_relative=eps_relative)
    pd.testing.assert_frame_equal(adj_df, pd.DataFrame(data=[
        [False, True],
        [True, False]
    ], index=['X', 'Y'], columns=['X', 'Y']))

def test_remove_redundancies():
    """Tests that the redundancies removal routine works as expected"""

    # an adjacency matrix with two redundant nodes X and Y
    adj_df = pd.DataFrame(data=[
        [False, True],
        [True, False]
    ], index=['X', 'Y'], columns=['X', 'Y'])

    # clean the redundancies with natural order: X is the representative of the class
    adj_df_clean1 = remove_redundancies(adj_df)
    pd.testing.assert_frame_equal(adj_df_clean1, pd.DataFrame(data=[
        [False, True],
        [False, False]
    ], index=['X', 'Y'], columns=['X', 'Y']))

    # clean the redundancies with reverse order: Y is the representative of the class
    adj_df_clean2 = remove_redundancies(adj_df, selection_order=[1, 0])
    pd.testing.assert_frame_equal(adj_df_clean2, pd.DataFrame(data=[
        [False, False],
        [True, False]
    ], index=['X', 'Y'], columns=['X', 'Y']))

# def test_identify_redundancy_quasi():
#     df = pd.DataFrame({
#         'U': ["a", "b", "d", "a", "b", "c", "a", "b", "d", "c"],
#     })
#     df['V'] = df['U'].replace("d", "c")  # d -> c
#     df['W'] = df['V'].replace("c", "b")  # c -> b
#
#     adj_df = get_adjacency_matrix(df)
#     df2 = identify_redundancy(adj_df)

def get_qd_forest1(is_np):
    """
    Create the reference data for a 10-variable forest with two trees: 3 -> 5 and 9 -> (1 -> 8, 7).

    :param is_np: if True, everything is returned as numpy arrays working with indices only. If False, the
        adjacency matrix and parents are returned as pandas dataframes labelled with the variable names
        "a" to "j", and the roots are returned as arrays of names. The ``*_ar`` items are numpy index-based
        arrays in both cases.
    :return: a tuple ``(adjmat, adjmat_ar, parents, parents_ar, roots, roots_ar, roots_wc, roots_wc_ar)``
        where ``roots`` are all nodes without parent and ``roots_wc`` are the roots with at least one child.
    """
    # adjacency matrix: adjmat[i, j] is True when there is an arc i -> j
    adjmat_ar = np.zeros((10, 10), dtype=bool)
    adjmat_ar[1, 8] = True
    adjmat_ar[3, 5] = True
    adjmat_ar[9, 1] = True
    adjmat_ar[9, 7] = True

    # parents[j] is the index of the parent of j, or -1 when j has no parent
    # int64: indeed computing parents from adjmat with np.where returns this dtype
    parents_ar = -np.ones((10,), dtype=np.int64)
    parents_ar[5] = 3
    parents_ar[1] = 9
    parents_ar[7] = 9
    parents_ar[8] = 1

    roots_ar = np.array([0, 2, 3, 4, 6, 9])
    roots_wc_ar = np.array([3, 9])

    # in numpy mode the "main" items are the very same objects as the `*_ar` ones
    adjmat, parents, roots, roots_wc = adjmat_ar, parents_ar, roots_ar, roots_wc_ar

    if not is_np:
        varnames = list("abcdefghij")
        adjmat = pd.DataFrame(adjmat_ar, columns=varnames, index=varnames)
        parents = pd.DataFrame(parents_ar, index=varnames, columns=('idx',))
        # an index of -1 selects the last label, so entries without parent are then masked with None
        parents['name'] = parents.index[parents['idx']].where(parents['idx'] >= 0, None)
        roots = np.array(varnames)[roots_ar]
        roots_wc = np.array(varnames)[roots_wc_ar]

    return adjmat, adjmat_ar, parents, parents_ar, roots, roots_ar, roots_wc, roots_wc_ar

@pytest.mark.parametrize("from_adjmat", [True, False], ids="from_adjmat={}".format)
@pytest.mark.parametrize("is_np", [True, False], ids="is_np={}".format)
def test_qd_forest(is_np, from_adjmat):
    """Tests that QDForest works correctly whether created from adj matrix or parents list

    :param is_np: if True the forest is built from numpy inputs (indices only), otherwise from pandas inputs
        (with variable names)
    :param from_adjmat: if True the forest is built from the adjacency matrix, otherwise from the parents
    """

    adjmat, adjmat_ar, parents, parents_ar, roots, roots_ar, roots_wc, roots_wc_ar = get_qd_forest1(is_np)

    if from_adjmat:
        qd1 = QDForest(adjmat=adjmat)  # a forest created from the adj matrix
    else:
        qd1 = QDForest(parents=parents)  # a forest created from the parents coordinates

    # roots
    assert qd1.get_roots(names=False) == list(roots_ar)
    np.testing.assert_array_equal(qd1.indices_to_mask(roots_ar), qd1.roots_mask_ar)
    assert qd1.get_roots_with_children(names=False) == list(roots_wc_ar)
    if not is_np:
        pd.testing.assert_series_equal(qd1.indices_to_mask(roots), qd1.roots_mask)
        assert qd1.get_roots(names=True) == list(roots)
        assert qd1.get_roots_with_children(names=True) == list(roots_wc)
    else:
        # names are not available when the forest was created from numpy inputs
        with pytest.raises(ValueError):
            qd1.get_roots(names=True)
        with pytest.raises(ValueError):
            qd1.get_roots_with_children(names=True)
    # default
    assert qd1.get_roots() == qd1.get_roots(names=not is_np)
    assert qd1.get_roots_with_children() == qd1.get_roots_with_children(names=not is_np)

    # string representation of arcs
    # -- indices
    if from_adjmat:
        assert qd1.get_arcs_str_list(names=False) == ['1 -> 8', '3 -> 5', '9 -> 1', '9 -> 7']
    else:
        # make sure the adj matrix was not computed automatically, to test the right method
        assert qd1._adjmat is None
        # TODO order is not the same as above, see https://github.com/python-qds/qdscreen/issues/9
        assert qd1.get_arcs_str_list(names=False) == ['9 -> 1', '3 -> 5', '9 -> 7', '1 -> 8']
    # -- names
    if not is_np:
        if from_adjmat:
            assert qd1.get_arcs_str_list(names=True) == ['b -> i', 'd -> f', 'j -> b', 'j -> h']
        else:
            # make sure the adj matrix was not computed automatically, to test the right method
            assert qd1._adjmat is None
            # TODO order is not the same as above, see https://github.com/python-qds/qdscreen/issues/9
            assert qd1.get_arcs_str_list(names=True) == ['j -> b', 'd -> f', 'j -> h', 'b -> i']
    else:
        with pytest.raises(ValueError):
            qd1.get_arcs_str_list(names=True)
    # check the default value of `names`
    assert qd1.get_arcs_str_list() == qd1.get_arcs_str_list(names=not is_np)

    # equivalent adjmat and parents computation, to be sure
    np.testing.assert_array_equal(qd1.parents_indices_ar, parents_ar)
    if not is_np:
        pd.testing.assert_frame_equal(qd1.parents, parents)

    # equivalent adjmat computation, to be sure
    np.testing.assert_array_equal(qd1.adjmat_ar, adjmat_ar)
    if not is_np:
        pd.testing.assert_frame_equal(qd1.adjmat, adjmat)

@pytest.mark.parametrize("from_adjmat", [True, False])
@pytest.mark.parametrize("is_np", [True, False])
def test_qd_forest_str(is_np, from_adjmat):
    """Tests the string representations of a QDForest (compact, headers, trees list, full and `str()`)

    :param is_np: if True the forest is built from numpy inputs (indices only), otherwise from pandas inputs
        (with variable names)
    :param from_adjmat: if True the forest is built from the adjacency matrix, otherwise from the parents
    """

    # only the adjacency matrix and the parents are needed to build the forest
    forest_data = get_qd_forest1(is_np)
    adjmat, parents = forest_data[0], forest_data[2]

    if from_adjmat:
        qd1 = QDForest(adjmat=adjmat)  # a forest created from the adj matrix
    else:
        qd1 = QDForest(parents=parents)  # a forest created from the parents coordinates

    # string representation
    compact_str = qd1.to_str(mode="compact")
    assert compact_str == "QDForest (10 vars = 6 roots + 4 determined by 2 of the roots)"

    # NOTE(review): the leading whitespace inside the expected multi-line literals below was restored from a
    # whitespace-collapsed listing; confirm it against the actual `to_str` / `get_trees_str_list` output.
    roots_str = "0, 2, 3*, 4, 6, 9*" if is_np else "a, c, d*, e, g, j*"
    others_str = "1, 5, 7, 8" if is_np else "b, f, h, i"
    headers_str = qd1.to_str(mode="headers")
    assert headers_str == """QDForest (10 vars):
 - 6 roots (4+2*): %s
 - 4 other nodes: %s""" % (roots_str, others_str)
    # this should be the default
    assert qd1.to_str() == headers_str

    trees_str = "\n" + "\n".join(qd1.get_trees_str_list())
    if is_np:
        # note the u for python 2 as in main.py we use unicode literals to cope with those non-base chars
        assert trees_str == u"""
3
└─ 5

9
└─ 1
   └─ 8
└─ 7
"""
    else:
        # note the u for python 2 as in main.py we use unicode literals to cope with those non-base chars
        assert trees_str == u"""
d
└─ f

j
└─ b
   └─ i
└─ h
"""
    full_str = qd1.to_str(mode="full")
    assert full_str == headers_str + u"\n" + trees_str
    # this should be the default string representation if the nb vars is small enough
    if PY2:
        # in python 2 `str()` returns bytes, so the unicode expectation is encoded before comparing
        assert full_str.encode('utf-8') == str(qd1)
    else:
        assert full_str == str(qd1)

# def test_sklearn_compat():
#     """Trying to make sure that this compatibility code works: it does NOT with scikit-learn 0.22.2.post1 :("""
#     from qdscreen.compat import BaseEstimator
#     assert BaseEstimator()._more_tags()['requires_y'] is False
#     assert BaseEstimator()._get_tags()['requires_y'] is False

def test_nans_in_data():
    """Tests that a dataframe containing NaN can be screened and filtered.

    See https://github.com/python-qds/qdscreen/issues/28
    """

    df = pd.DataFrame([
        ["A", "B"],
        ["A", "B"],
        ["N", np.nan],
    ], columns=["a", "b"])

    qd_forest = qd_screen(df)
    feat_selector = qd_forest.fit_selector_model(df)
    df_sel = feat_selector.remove_qd(df)

    # only column "a" is expected to remain after removal
    pd.testing.assert_frame_equal(df_sel, pd.DataFrame([
        ["A"],
        ["A"],
        ["N"],
    ], columns=["a"]))

def test_nans_in_data_sklearn():
    """Tests that the `QDScreen` selector can fit and transform a numpy array containing NaN.

    See https://github.com/python-qds/qdscreen/issues/28
    """

    df = pd.DataFrame([
        ["A", "B"],
        ["A", "B"],
        ["N", np.nan],
    ])

    selector = QDScreen()
    x_sel = selector.fit_transform(df.to_numpy())

    # only the first column is expected to remain
    assert x_sel.tolist() == [['A'], ['A'], ['N']]

def test_issue_37_non_categorical():
    """Tests that `qd_screen` raises a `ValueError` on a dataframe mixing an integer and a string column"""

    df = pd.DataFrame({
        "nb": [1, 2],
        "name": ["A", "B"]
    })
    with pytest.raises(ValueError, match="Provided dataframe columns contains non-categorical"):
        qd_screen(df)

@pytest.mark.skipif(sys.version_info < (3, 6),
                    reason="This test is known to fail for 3.5 and 2.7, see GH#43")
def test_issue_40_nan_then_str():
    """Tests screening, removal and prediction on a column that starts with NaN followed by a string"""

    df = pd.DataFrame({
        "foo": ["1", "2"],
        "bar": [np.nan, "B"]
    })
    qd_forest = qd_screen(df)
    assert list(qd_forest.roots) == ["foo"]

    feat_selector = qd_forest.fit_selector_model(df)
    only_important_features_df = feat_selector.remove_qd(df)
    assert list(only_important_features_df.columns) == ["foo"]

    # predicting the removed column back from the kept one should restore the original dataframe
    result = feat_selector.predict_qd(only_important_features_df)
    pd.testing.assert_frame_equal(df, result)

Coverage for qdscreen/tests/test_core.py: 99%

141 statements