Coverage for qdscreen/main.py: 84%

516 statements  

« prev     ^ index     » next       coverage.py v7.2.2, created at 2023-03-17 11:02 +0000

1# -*- coding: utf-8 -*- 

2# The above encoding declaration is needed because we use non-ASCII characters in `get_trees_str_list`. 

3# The below import transforms automatically all strings in this file in unicode in python 2. 

4from __future__ import unicode_literals # See tests/encoding_ref_help.py for a detailed explanation 

5import numpy as np 

6import pandas as pd 

7 

8from pyitlib import discrete_random_variable as drv 

9 

10from .compat import encode_if_py2 

11 

12try: 

13 from typing import Union, Iterable, Tuple 

14except: # noqa 

15 pass 

16 

17 

18# TODO 

19# Cramer's V + Theil's U: 

20# https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9 

21# https://stackoverflow.com/questions/46498455/categorical-features-correlation 

22# https://stackoverflow.com/questions/20892799/using-pandas-calculate-cram%C3%A9rs-coefficient-matrix (+ corrected version) 

23# https://stackoverflow.com/questions/61236715/correlation-between-categorical-variables-within-a-dataset 

24# 

25# https://stackoverflow.com/questions/48035381/correlation-among-multiple-categorical-variables-pandas 

26# https://stackoverflow.com/questions/44694228/how-to-check-for-correlation-among-continuous-and-categorical-variables-in-pytho 

27# 

28 

29def _add_names_to_parents_idx_series(parents): 

30 parents = pd.DataFrame(parents, columns=('idx',)) 

31 parents['name'] = parents.index[parents['idx']].where(parents['idx'] >= 0, None) 

32 return parents 

33 

34 

class QDForest(object):
    """A quasi-deterministic forest returned by `qd_screen`"""
    __slots__ = (
        '_adjmat',         # a square np array or pd DataFrame containing the adjacency matrix (parent->child)
        '_parents',        # a 1d np array or a pandas Series relating each child to its parent index or -1 if a root
        'is_nparray',      # a boolean indicating if this was built from numpy array (and not pandas dataframe)
        '_roots_mask',     # a 1d np array or pd Series containing a boolean mask for root variables
        '_roots_wc_mask',  # a 1d np array or pd Series containing a boolean mask for root with children
        'stats'            # an optional `Entropies` object stored for debug
    )

    def __init__(self,
                 adjmat=None,   # type: Union[np.ndarray, pd.DataFrame]
                 parents=None,  # type: Union[np.ndarray, pd.Series]
                 stats=None     # type: Entropies
                 ):
        """
        Creates a forest from either an adjacency matrix or a parents series (one of the two must be provided).

        :param adjmat: a square numpy array or pandas DataFrame with the parent->child adjacency matrix
        :param parents: a 1D numpy array or pandas Series mapping each variable to its parent index (-1 for roots)
        :param stats: an optional `Entropies` object kept for further analysis/debug
        """
        self.stats = stats
        self._adjmat = adjmat
        # the type of the first non-None input decides the numpy/pandas flavour of the whole forest
        self.is_nparray = isinstance((adjmat if adjmat is not None else parents), np.ndarray)
        if parents is not None:
            if not self.is_nparray:
                # create a df with two columns: indices and names
                parents = _add_names_to_parents_idx_series(parents)
            self._parents = parents
        else:
            self._parents = None

        # lazily-computed caches
        self._roots_mask = None
        self._roots_wc_mask = None

    def assert_stats_kept(self):
        """Raises a ValueError if this forest was created without `keep_stats=True`."""
        if self.stats is None:
            raise ValueError("`stats` are not available in this QDForest instance. If you created it using `qd_screen`,"
                             " you should use `keep_stats=True`.")

    def assert_pandas_capable(self):
        """Utility method to assert that "not self.is_nparray" """
        if self.is_nparray:
            raise ValueError("This QDForest instance was built with numpy arrays, it is not pandas compliant")

    def assert_names_available(self):
        """Raises a ValueError if variable names are not available (numpy-based forest)."""
        if self.is_nparray:
            raise ValueError("Variable names are not available")

    def _validate_names_arg(self,
                            names):
        """Utility method to validate and set default value of the `names` argument used in most methods"""
        if names is None:
            names = not self.is_nparray
        if names:
            self.assert_names_available()
        return names

    @property
    def nb_vars(self):
        # number of variables, whichever representation is available
        return self._parents.shape[0] if self._parents is not None else self._adjmat.shape[0]

    @property
    def varnames(self):
        self.assert_names_available()
        return np.array(self._adjmat.columns) if self._adjmat is not None else np.array(self._parents.index)

    def indices_to_mask(self,
                        indices,
                        in_names=None,
                        out_names=None,
                        ):
        """Utility to convert a list of indices to a numpy or pandas mask"""
        in_names = self._validate_names_arg(in_names)
        out_names = self._validate_names_arg(out_names)
        mask = np.zeros((self.nb_vars,), dtype=bool)
        if out_names:
            mask = pd.Series(mask, index=self.varnames, dtype=bool)
        if in_names and not out_names:
            # TODO
            # NOTE(review): this branch looks unfinished — it ignores `indices` and sets all names; confirm intent
            mask[self.varnames] = True
        elif not in_names and out_names:
            mask.iloc[indices] = True
        else:
            mask[indices] = True
        return mask

    def mask_to_indices(self, mask):
        """Utility to convert a boolean mask into indices (numpy) or index labels (pandas)."""
        if isinstance(mask, np.ndarray):
            # NOTE(review): np.where returns a 1-tuple of arrays here, not a bare array — confirm callers expect this
            return np.where(mask)
        else:
            return mask[mask].index

    @property
    def adjmat_ar(self):
        """The adjacency matrix as a 2D numpy array"""
        return self.adjmat if self.is_nparray else self.adjmat.values

    @property
    def adjmat(self):
        """The adjacency matrix as a pandas DataFrame or a 2D numpy array"""
        if self._adjmat is None:
            # compute adjmat from parents and cache it
            n = self.nb_vars
            adjmat = np.zeros((n, n), dtype=bool)
            # from https://stackoverflow.com/a/46018613/7262247
            index_array = get_arcs_from_parents_indices(self.parents_indices_ar, multiindex=True)
            flat_index_array = np.ravel_multi_index(index_array, adjmat.shape)
            np.ravel(adjmat)[flat_index_array] = True
            if self.is_nparray:
                self._adjmat = adjmat
            else:
                self._adjmat = pd.DataFrame(adjmat, index=self.varnames, columns=self.varnames)
        return self._adjmat

    @property
    def parents_indices_ar(self):
        """A numpy array containing the indices of all parent nodes"""
        return self.parents if self.is_nparray else self.parents['idx'].values

    @property
    def parents(self):
        """A numpy array containing the indices of all parent nodes, or a pandas DataFrame containing 'idx' and 'name'"""
        if self._parents is None:
            # compute parents from adjmat, whether a dataframe or an array
            # TODO maybe use a sparse array here?
            parents = to_forest_parents_indices(self._adjmat)
            if not self.is_nparray:
                # create a df with two columns: indices and names
                parents = _add_names_to_parents_idx_series(parents)
            self._parents = parents
        return self._parents

    @property
    def roots_mask_ar(self):
        """A 1D numpy mask array indicating if a node is a root node"""
        return self.roots_mask if self.is_nparray else self.roots_mask.values

    @property
    def roots_mask(self):
        """A pandas Series or a 1D numpy mask array indicating if a node is a root node"""
        if self._roots_mask is None:
            if self._adjmat is not None:
                # a root is a node with no incoming arc (no parent)
                self._roots_mask = ~self.adjmat.any(axis=0)
            else:
                self._roots_mask = (self.parents if self.is_nparray else self.parents['idx']) < 0
                if not self.is_nparray:
                    # remove the name of the series
                    self._roots_mask.name = None
        return self._roots_mask

    @property
    def roots_ar(self):
        """A 1D numpy array of root indices"""
        return np.where(self.roots_mask_ar)[0]

    @property
    def roots(self):
        """A pandas Series of root names or 1D numpy array of root indices"""
        if self.is_nparray:
            return self.roots_ar
        else:
            return self.roots_mask.index[self.roots_mask]

    def get_roots(self,
                  names=None  # type: bool
                  ):
        """
        Returns the list of root indices or root names

        :param names: an optional boolean indicating if this method should return names instead of indices. By
            default `names` is set to `not self.is_np`
        :return:
        """
        names = self._validate_names_arg(names)
        return list(self.roots) if names else list(self.roots_ar)

    @property
    def non_roots_ar(self):
        """A 1D numpy array of non-root indices"""
        return np.where(~self.roots_mask_ar)[0]

    @property
    def non_roots(self):
        """A pandas Series of non-root names or 1D numpy array of non-root indices"""
        if self.is_nparray:
            return self.non_roots_ar
        else:
            return self.roots_mask.index[~self.roots_mask]

    def get_non_roots(self,
                      names=None  # type: bool
                      ):
        """
        Returns the list of non-root indices or names

        :param names: an optional boolean indicating if this method should return names instead of indices. By
            default `names` is set to `not self.is_np`
        :return:
        """
        names = self._validate_names_arg(names)
        return list(self.non_roots) if names else list(self.non_roots_ar)

    @property
    def nodes_with_children_mask_ar(self):
        """A 1D numpy mask array indicating nodes that have at least 1 child"""
        return self.nodes_with_children_mask if self.is_nparray else self.nodes_with_children_mask.values

    @property
    def nodes_with_children_mask(self):
        """A pandas Series or 1D numpy array indicating nodes that have at least 1 child"""
        if self._adjmat is not None:
            return self._adjmat.any(axis=1)
        else:
            # nodes that appear at least once as someone's parent
            nwc_idx = np.unique(self.parents_indices_ar[self.parents_indices_ar >= 0])
            return self.indices_to_mask(nwc_idx)

    @property
    def roots_with_children_mask_ar(self):
        """A 1D numpy mask array indicating root nodes that have at least 1 child"""
        return self.roots_with_children_mask if self.is_nparray else self.roots_with_children_mask.values

    @property
    def roots_with_children_mask(self):
        """A pandas Series or 1D numpy mask array indicating root nodes that have at least 1 child"""
        if self._roots_wc_mask is None:
            if self._adjmat is not None:
                # a root with children = a root AND a node with children
                # note: go through the `roots_mask` property (not `_roots_mask`) so that it is computed if needed
                self._roots_wc_mask = self.roots_mask & self.nodes_with_children_mask
            else:
                # a root with children = a parent of a level 1 child node
                level_1_nodes_mask = ((self.parents_indices_ar >= 0)
                                      & (self.parents_indices_ar[self.parents_indices_ar] == -1))
                rwc_indices = np.unique(self.parents_indices_ar[level_1_nodes_mask])
                self._roots_wc_mask = self.indices_to_mask(rwc_indices, in_names=False, out_names=not self.is_nparray)
        return self._roots_wc_mask

    @property
    def roots_with_children_ar(self):
        """A 1D numpy array with the indices of root nodes that have at least 1 child"""
        return np.where(self.roots_with_children_mask_ar)[0]

    @property
    def roots_with_children(self):
        """A pandas Series or 1D numpy array with names/indices of root nodes that have at least 1 child"""
        if self.is_nparray:
            return self.roots_with_children_ar
        else:
            return self.roots_with_children_mask.index[self.roots_with_children_mask]

    def get_roots_with_children(self,
                                names=None  # type: bool
                                ):
        """
        Returns the list of root with children

        :param names: an optional boolean indicating if this method should return names instead of indices. By
            default `names` is set to `not self.is_np`
        :return:
        """
        names = self._validate_names_arg(names)
        return list(self.roots_with_children) if names else list(self.roots_with_children_ar)

    def get_children(self,
                     parent_idx=None,   # type: int
                     parent_name=None,  # type: str
                     names=None         # type: bool
                     ):
        """
        Returns the list of children of a node

        :param parent_idx: the index of the node to query for
        :param parent_name: the name of the node to query for
        :param names: a boolean to indicate if the returned children should be identified by their names (`True`) or
            indices (`False`). By default if the parent is identified with `parent_idx` indices will be returned
            (`False`), and if the parent is identified with parent_name names will be returned (`True`)
        :return: an array containing indices of the child nodes
        """
        if parent_idx is not None:
            if parent_name is not None:
                raise ValueError("Only one of `parent_idx` and `parent_name` should be provided")
            node = parent_idx
            if names is None:
                names = False
        elif parent_name is not None:
            node = parent_name
            if names is None:
                names = True
            self.assert_names_available()
        else:
            raise ValueError("You should provide a non-None `parent_idx` or `parent_name`")

        if self.is_nparray:
            return np.where(self.parents == node)[0]
        else:
            # query the 'idx' or 'name' column depending on how the parent was identified
            qcol = 'idx' if parent_idx is not None else 'name'
            if not names:
                return np.where(self.parents[qcol] == node)[0]
            else:
                return self.parents[self.parents[qcol] == node].index.values

    def get_arcs(self,
                 multiindex=False,  # type: bool
                 names=None         # type: bool
                 ):
        # type: (...) -> Union[Iterable[Tuple[int, int]], Iterable[Tuple[str, str]], Tuple[Iterable[int], Iterable[int]], Tuple[Iterable[str], Iterable[str]]]
        """
        Return the arcs of an adjacency matrix, an iterable of (parent, child) indices or names

        If 'multiindex' is True instead of returning an iterable of (parent, child), it returns a tuple of iterables
        (all the parents, all the childs).

        :param multiindex: if this is True, a 2-tuple of iterable is returned instead of an iterable of 2-tuples
        :param names: an optional boolean indicating if this method should return names instead of indices. By
            default `names` is set to `not self.is_np`
        :return:
        """
        names = self._validate_names_arg(names)
        # delegate to whichever representation is already available, to avoid a useless conversion
        if self._adjmat is not None:
            return get_arcs_from_adjmat(self._adjmat, multiindex=multiindex, names=names)
        else:
            return get_arcs_from_parents_indices(self._parents, multiindex=multiindex, names=names)

    def walk_arcs(self,
                  parent_idx=None,   # type: int
                  parent_name=None,  # type: str
                  names=None         # type: bool
                  ):
        """
        Yields a sequence of (parent, child) indices or names. As opposed to `get_arcs` the sequence follows the tree
        order: starting from the list of root nodes, for every node, it is returned first and then all of its children.
        (depth first, not breadth first)

        :param parent_idx: the optional index of a node. If provided, only the subtree behind this node will be walked
        :param parent_name: the optional name of a node. If provided, only the subtree behind this node will be walked
        :param names: an optional boolean indicating if this method should return names instead of indices. By
            default `names` is set to `not self.is_np`
        :return: yields a sequence of (level, i, j)
        """
        names = self._validate_names_arg(names)
        if names:
            get_children = lambda node: self.get_children(parent_name=node)
        else:
            get_children = lambda node: self.get_children(parent_idx=node)

        def _walk(from_node, level):
            # recursive depth-first walk: each child is yielded, then its own subtree
            for child in get_children(from_node):
                yield level, from_node, child
                for l, i, j in _walk(child, level + 1):
                    yield l, i, j

        if parent_idx is not None:
            if parent_name is not None:
                raise ValueError("Only one of `parent_idx` and `parent_name` should be provided")
            root_nodes = (parent_idx,) if not names else (self.parents.index[parent_idx],)
        elif parent_name is not None:
            root_nodes = (parent_name,)
        else:
            root_nodes = (self.roots if names else self.roots_ar)

        for n in root_nodes:
            # walk the subtree
            for level, i, j in _walk(n, level=0):
                yield level, i, j

    # ------- display methods

    def to_str(self,
               names=None,     # type: bool
               mode="headers"  # type: str
               ):
        """
        Provides a string representation of this forest.

        :param names: an optional boolean indicating if this method should return names instead of indices. By
            default `names` is set to `not self.is_np`
        :param mode: one of "compact", "headers", and "full" (displays the trees)
        :return:
        """
        names = self._validate_names_arg(names)
        nb_vars = self.nb_vars
        roots = self.get_roots(names)
        non_roots = self.get_non_roots(names)
        nb_roots = len(roots)
        nb_arcs = nb_vars - nb_roots

        roots_with_children = self.get_roots_with_children(names)
        nb_roots_with_children = len(roots_with_children)

        nb_sole_roots = (nb_roots - nb_roots_with_children)

        if mode == "compact":
            return "QDForest (%s vars = %s roots + %s determined by %s of the roots)" \
                   % (nb_vars, nb_roots, nb_arcs, nb_roots_with_children)

        # list of root node indice/name with a star when they have children
        roots_str = [("%s*" if r in roots_with_children else "%s") % r for r in roots]
        # list of non-root node indices/names
        non_roots_str = ["%s" % r for r in non_roots]

        headers = "\n".join((
            "QDForest (%s vars):" % nb_vars,
            " - %s roots (%s+%s*): %s" % (nb_roots, nb_sole_roots, nb_roots_with_children, ", ".join(roots_str)),
            " - %s other nodes: %s" % (nb_arcs, ", ".join(non_roots_str)),
        ))
        if mode == "headers":
            return headers
        elif mode == "full":
            tree_str = "\n".join(self.get_trees_str_list())
            return "%s\n\n%s" % (headers, tree_str)
        else:
            raise ValueError("Unknown mode: %r" % mode)

    @encode_if_py2
    def __str__(self):
        """ String representation, listing all useful information when available """
        # for large forests, do not display the trees
        if self.nb_vars > 30:
            return self.to_str(mode="headers")
        else:
            return self.to_str(mode="full")

    @encode_if_py2
    def __repr__(self):
        # return str(self)  # note if we use this then we'll have to comment the decorator
        return self.to_str(mode="compact")

    def print_arcs(self,
                   names=None  # type: bool
                   ):
        """ Prints the various deterministic arcs in this forest """
        print("\n".join(self.get_arcs_str_list(names=names)))

    def get_arcs_str_list(self,
                          names=None  # type: bool
                          ):
        """ Returns a list of string representation of the various deterministic arcs in this forest """
        res_str_list = []
        for parent, child in self.get_arcs(names=names):
            res_str_list.append("%s -> %s" % (parent, child))
        return res_str_list

    def get_trees_str_list(self,
                           all=False,
                           names=None  # type: bool
                           ):
        """
        Returns a list of string representation of the various trees in this forest
        TODO maybe use https://github.com/caesar0301/treelib ? Or networkX ?

        :param all: if True, this will return also the trees that are constituted of one node
        :param names:
        :return:
        """
        names = self._validate_names_arg(names)
        res_str_list = []
        if all:
            roots = self.get_roots(names)
        else:
            roots = self.get_roots_with_children(names)

        if names:
            walk_arcs = lambda n: self.walk_arcs(parent_name=n)
        else:
            walk_arcs = lambda n: self.walk_arcs(parent_idx=n)

        for r in roots:
            subtree_str = "%s" % r
            for level, _, j in walk_arcs(r):
                subtree_str += "\n%s└─ %s" % ("   " * level, j)
            res_str_list.append(subtree_str + "\n")
        return res_str_list

    # -----------

    def fit_selector_model(self,
                           X  # type: Union[np.ndarray, pd.DataFrame]
                           ):
        # type: (...) -> QDSelectorModel
        """
        Returns a new instance of `QDSelectorModel` fit with the data in `X`

        :param X: the dataset to fit the selector model on
        :return: a fitted `QDSelectorModel`
        """
        from .selector import QDSelectorModel
        model = QDSelectorModel(self)
        model.fit(X)
        return model

    # --------- tools for entropies analysis to find interesting thresholds

    def get_entropies_table(self,
                            from_to=True,         # type: bool
                            sort_by="from",       # type: str
                            drop_self_link=True,  # type: bool
                            ):
        """ See `Entropies.get_entropies_table` """
        self.assert_stats_kept()
        return self.stats.get_entropies_table(from_to=from_to, sort_by=sort_by, drop_self_link=drop_self_link)

    def plot_increasing_entropies(self):
        """ See `Entropies.plot_increasing_entropies` """
        self.assert_stats_kept()
        self.stats.plot_increasing_entropies()

545 

546 

def assert_df_or_2D_array(df_or_array  # type: Union[pd.DataFrame, np.ndarray]
                          ):
    """
    Validates that `df_or_array` is a pandas DataFrame or a 2D numpy array.

    :param df_or_array: the object to validate
    :raises ValueError: if `df_or_array` is a numpy array that is not 2-dimensional
    :raises TypeError: if `df_or_array` is neither a `pd.DataFrame` nor a `np.ndarray`
    :return: None
    """
    if isinstance(df_or_array, pd.DataFrame):
        pass
    elif isinstance(df_or_array, np.ndarray):
        # see https://numpy.org/doc/stable/user/basics.rec.html#manipulating-and-displaying-structured-datatypes
        if len(df_or_array.shape) != 2:
            raise ValueError("Provided data is not a 2D array, the number of dimensions is %s" % len(df_or_array.shape))
    else:
        # Raise error
        raise TypeError("Provided data is neither a `pd.DataFrame` nor a `np.ndarray`")

564 

565 

def qd_screen(X,                  # type: Union[pd.DataFrame, np.ndarray]
              absolute_eps=None,  # type: float
              relative_eps=None,  # type: float
              keep_stats=False,   # type: bool
              non_categorical_mode='strict',
              ):
    # type: (...) -> QDForest
    """
    Finds the (quasi-)deterministic relationships (functional dependencies) between the variables in `X`, and returns
    a `QDForest` object representing the forest of (quasi-)deterministic trees. This object can then be used to fit a
    feature selection model or to learn a Bayesian Network structure.

    By default only deterministic relationships are detected. Quasi-determinism can be enabled by setting either
    an threshold on conditional entropies (`absolute_eps`) or on relative conditional entropies (`relative_eps`). Only
    one of them should be set.

    By default (`keep_stats=False`) the entropies tables are not preserved once the forest has been created. If you wish
    to keep them available, set `keep_stats=True`. The entropies tables are then available in the `<QDForest>.stats`
    attribute, and threshold analysis methods such as `<QDForest>.get_entropies_table(...)` and
    `<QDForest>.plot_increasing_entropies()` become available.

    :param X: the dataset as a pandas DataFrame or a numpy array. Columns represent the features to compare.
    :param absolute_eps: Absolute entropy threshold. Any feature `Y` that can be predicted from another feature `X` in
        a quasi-deterministic way, that is, where conditional entropy `H(Y|X) <= absolute_eps`, will be removed. The
        default value is `0` and corresponds to removing deterministic relationships only.
    :param relative_eps: Relative entropy threshold. Any feature `Y` that can be predicted from another feature `X` in
        a quasi-deterministic way, that is, where relative conditional entropy `H(Y|X)/H(Y) <= relative_eps` (a value
        between `0` and `1`), will be removed. Only one of `absolute_eps` or `relative_eps` should be provided.
    :param keep_stats: a boolean indicating if the various entropies tables computed in the process should be kept in
        memory in the resulting forest object (`<QDForest>.stats`), for further analysis. By default this is `False`.
    :return:
    """
    # Make sure this is a 2D table
    assert_df_or_2D_array(X)

    # Sanity check: are there rows in here ?
    if len(X) == 0:
        raise ValueError("Provided dataset does not contain any row")

    # Only work on the categorical features
    X = get_categorical_features(X, non_categorical_mode=non_categorical_mode)

    # Sanity check concerning the number of columns
    assert X.shape[1] > 0, "Internal error: no columns remain in dataset after preprocessing."

    # parameters check and defaults
    if absolute_eps is None:
        if relative_eps is None:
            # nothing is provided, use absolute threshold 0
            absolute_eps = 0
            is_absolute = True
        else:
            # relative threshold provided
            is_absolute = False
    else:
        if relative_eps is not None:
            raise ValueError("only one of absolute and relative should be passed")
        # absolute threshold provided
        is_absolute = True

    # strict determinism means a zero threshold, whichever flavour is active
    is_strict = (absolute_eps == 0.) if is_absolute else (relative_eps == 0.)

    # sanity check
    if is_absolute and absolute_eps < 0:
        raise ValueError("epsilon_absolute should be positive")
    elif not is_absolute and (relative_eps < 0 or relative_eps > 1):
        raise ValueError("epsilon_relative should be 0=<eps=<1")

    # (0) compute conditional entropies or relative conditional entropies
    A_orig, X_stats = get_adjacency_matrix(X, eps_absolute=absolute_eps, eps_relative=relative_eps)

    # (2) identify redundancy
    A_noredundancy = remove_redundancies(A_orig, selection_order=None if is_strict
                                         else X_stats.list_vars_by_entropy_order(desc=True))

    # (3) transform into forest: remove extra parents by keeping only the parent with lowest entropy / nb levels ?
    #   if X -> Z and X -> Y and Y -> Z then Z has two parents X and Y but only Y should be kept
    #   Determinist case: minimize the number of parameters: take the minimal nb of levels
    #   Quasi-determinist case: take the lowest entropy
    entropy_order_inc = (X_stats.H_ar).argsort()
    # parent_order = compute_nb_levels(df) if is_strict else entropy_based_order
    # A_forest = to_forest_adjmat(A_noredundancy, entropy_order_inc)
    # qd_forest = QDForest(adjmat=A_forest)
    parents = to_forest_parents_indices(A_noredundancy, entropy_order_inc)
    # if not X_stats.is_nparray:
    #     parents = pd.Series(parents, index=A_noredundancy.columns)
    qd_forest = QDForest(parents=parents, stats=X_stats if keep_stats else None)

    # forestAdjMatList <- FromHToDeterForestAdjMat(H = H, criterion = criterionNlevels, removePerfectMatchingCol = removePerfectMatchingCol)

    # (4) convert to pgmpy format
    # deterForest <- FromAdjMatToBnLearn(adjMat = forestAdjMat)
    # rootsF <- Roots(forestAdjMat)

    # NOTE: the commented block below is kept from the original R implementation as porting notes
    # if(length(rootsFNames) == 1){
    #     print("Only one tree in the forest !!! No root graph computed")
    #     print("****************")
    #     print("Training - Phase 2 and Phase 3 obsolete")
    #     print("****************")
    #     gstar <- deterForest
    #     G_R <- NULL
    #     print("Final graph computed")
    # }else{
    #     print(paste("Multiple trees in the forest (", length(rootsFNames),") Root graph will be computed.", sep = "", collapse = ""))
    #     rootsOnly <- ReduceDataAndH(variablesToKeep = rootsFNames, dataFrame = data, H)
    #     dataRoots <- rootsOnly[[1]]
    #     HRoots <- rootsOnly[[2]]
    #     rm(rootsOnly)
    #     print("****************")
    #     print("Training - Phase 2: computation of the root graph")
    #     print("****************")
    #     # removePerfectlyMatchingCol useless here because either pmc are in the same trees (but only one is the root), either they were deleted earlier on
    #     G_R <- SotABNsl(data = dataRoots, method = method, score = score, hyperparamList, removePerfectMatchingCol = removePerfectMatchingCol, ...)
    #     edgeListG_R <- FromBnToEdgeList(bn = G_R)
    #     colnames(edgeListG_R) <- c('from', 'to')
    #     print("****************")
    #     print("Training - Phase 3: fusion of the deter forest with the root graph")
    #     print("****************")
    #     gstarAdjMat <- AddEdgesFromEdgeList(originalAdjMatrix = forestAdjMat, edgeListToAdd = edgeListG_R)
    #     gstar <- FromAdjMatToBnLearn(gstarAdjMat)
    #     print("Final graph computed")
    # }

    return qd_forest

689 

690 

class Entropies(object):
    """Lazy holder for the entropy statistics of a dataset.

    Wraps a pandas DataFrame or a 2D numpy array where the variables are the
    columns, and exposes — computed once on first access, then cached:

     - ``H``: the entropy H(X) of each variable,
     - ``Hcond``: the conditional entropy matrix (i, j) = H(Xi|Xj),
     - ``Hcond_rel``: the relative conditional entropy matrix H(Xi|Xj)/H(Xi).

    Entropy computations are delegated to ``pyitlib.discrete_random_variable``.

    TODO this could be easier to read with pyfields and default value factories
    """

    __slots__ = ('dataset', 'is_nparray', '_H', '_Hcond', '_Hcond_rel', '_dataset_df')

    def __init__(self, df_or_array):
        """
        :param df_or_array: a pandas dataframe or numpy array where the variables are the columns
        """
        self.dataset = df_or_array
        # remember the input flavor once: most properties branch on it
        self.is_nparray = isinstance(df_or_array, np.ndarray)
        # lazy caches, filled on first access of the corresponding property
        self._H = None  # type: Union[np.ndarray, pd.Series]
        self._Hcond = None  # type: Union[np.ndarray, pd.DataFrame]
        self._Hcond_rel = None  # type: Union[np.ndarray, pd.DataFrame]
        self._dataset_df = None  # type: pd.DataFrame

    def __str__(self):
        # delegate to __repr__ so both render the same summary
        return repr(self)

    def __repr__(self):
        # Multi-line summary of the dataset and all three entropy statistics.
        # NOTE(review): `self.dataset.head(2)` assumes a pandas object; for a numpy
        # array input this presumably raises AttributeError — confirm before relying
        # on repr() with array inputs.
        res = """Statistics computed for dataset:
%s
...(%s rows)

Entropies (H):
%s

Conditional entropies (Hcond = H(row|col)):
%s

Relative conditional entropies (Hcond_rel = H(row|col)/H(row)):
%s
""" % (self.dataset.head(2), len(self.dataset), self.H.T, self.Hcond, self.Hcond_rel)
        return res

    @property
    def dataset_df(self):
        """The dataset as a pandas DataFrame, if pandas is available"""
        if self.is_nparray:
            # see https://numpy.org/doc/stable/user/basics.rec.html#manipulating-and-displaying-structured-datatypes
            if self.dataset.dtype.names is not None:
                # structured array
                self._dataset_df = pd.DataFrame(self.dataset)
            else:
                # unstructured array... same ?
                self._dataset_df = pd.DataFrame(self.dataset)
            return self._dataset_df
        else:
            # already a DataFrame: return it directly, no conversion needed
            return self.dataset

    @property
    def nb_vars(self):
        """Number of variables (columns) in the dataset."""
        return self.dataset.shape[1]

    @property
    def varnames(self):
        """The list of variable (column) names; only available for DataFrame inputs."""
        if self.is_nparray:
            raise ValueError("Variable names not available")
        return list(self.dataset.columns)

    @property
    def H_ar(self):
        """ The entropies of all variables, always as a numpy array """
        return self.H if self.is_nparray else self.H.values

    entropies_ar = H_ar
    """An alias for H_ar"""

    @property
    def H(self):
        """The entropies of all variables. a pandas Series if df was a pandas dataframe, else a 1D numpy array"""
        if self._H is None:
            # Using pyitlib to compute H (hopefully efficiently)
            # Unfortunately this does not work with numpy arrays, convert to pandas TODO report
            # note: we convert to string type to avoid a bug with ints. TODO...
            self._H = drv.entropy(self.dataset_df.T.astype(str))
            if not self.is_nparray:
                # restore the variable names lost in the pyitlib call
                self._H = pd.Series(self._H, index=self.varnames)

        # basic sanity check: should all be positive
        assert np.all(self._H >= 0)
        return self._H

    entropies = H
    """An alias for H"""

    @property
    def Hcond_ar(self):
        """ The conditional entropy matrix (i, j) = H(Xi | Xj) as a numpy array """
        return self.Hcond if self.is_nparray else self.Hcond.values

    conditional_entropies_ar = Hcond_ar
    """An alias for Hcond_ar"""

    @property
    def Hcond(self):
        """
        The conditional entropy matrix (i, j) = H(Xi | Xj).
        A pandas Dataframe or a 2D numpy array depending on dataset type
        """
        if self._Hcond is None:
            # Old attempt to do it ourselves
            # (0) init H
            # nb_vars = len(df.columns)
            # H = np.empty((nb_vars, nb_vars), dtype=float)
            # (1) for each column compute the counts per value
            # (2) for each (i, j) pair compute the counts

            # Using pyitlib to compute H (hopefully efficiently)
            # Unfortunately this does not work with numpy arrays, convert to pandas TODO report
            # note: we convert to string type to avoid a bug with ints. TODO...
            self._Hcond = drv.entropy_conditional(self.dataset_df.T.astype(str))

            # add the row/column headers
            if not self.is_nparray:
                self._Hcond = pd.DataFrame(self._Hcond, index=self.varnames, columns=self.varnames)

        # basic sanity check: should all be positive
        assert np.all(self._Hcond >= 0)
        return self._Hcond

    conditional_entropies = Hcond
    """An alias for Hcond"""

    @property
    def Hcond_rel_ar(self):
        """ The relative conditional entropy matrix (i, j) = H(Xi | Xj) / H(Xi) as a numpy array """
        return self.Hcond_rel if self.is_nparray else self.Hcond_rel.values

    relative_conditional_entropies_ar = Hcond_rel_ar
    """An alias for Hcond_rel_ar"""

    @property
    def Hcond_rel(self):
        """
        The relative conditional entropy matrix (i, j) = H(Xi | Xj) / H(Xi).
        """
        if self._Hcond_rel is None:
            # compute relative entropy: in each cell (X, Y) we want H(X|Y)/H(X)
            # the transposes make the division broadcast row-wise (each row i divided by H(Xi))
            if self.is_nparray:
                self._Hcond_rel = (self.Hcond.T / self.H).T
            else:
                Hrel_array = (self.Hcond.values.T / self.H.values).T
                self._Hcond_rel = pd.DataFrame(Hrel_array, index=list(self.Hcond.columns), columns=list(self.Hcond.columns))

        # basic sanity check: H(X|Y)/H(X) is a ratio in [0, 1] since conditioning cannot increase entropy
        assert np.all(self._Hcond_rel >= 0.)
        assert np.all(self._Hcond_rel <= 1.)
        return self._Hcond_rel

    relative_conditional_entropies = Hcond_rel
    """An alias for Hcond_rel"""

    def list_vars_by_entropy_order(self, desc=False):
        """
        Returns the indices or names of variables in ascending (resp. descending) order of entropy

        :param desc: if True the order is descending, else ascending
        :return: a numpy array of variable indices sorted by entropy
        """
        if self.is_nparray:
            return (-self.H).argsort() if desc else self.H.argsort()
        else:
            return (-self.H.values).argsort() if desc else self.H.values.argsort()

    # --------- tools for entropies analysis to find interesting thresholds

    def get_entropies_table(self,
                            from_to=True,  # type: bool
                            sort_by="from",  # type: str
                            drop_self_link=True,  # type: bool
                            ):
        """
        Returns a pandas series or numpy array with four columns: from, to, cond_entropy, rel_cond_entropy.
        The index is 'arc', a string representing the arc e.g. "X->Y".

        :param from_to: a boolean flag indicating if 'from' and 'to' columns should remain in the returned dataframe
            (True, default) or be dropped (False)
        :param sort_by: a string indicating if the arcs should be sorted by origin ("from", default) or destination
            ("to"), or by value "rel_cond_entropy" or "cond_entropy", in the resulting table.
        :param drop_self_link: by default node-to-self arcs are not returned in the list. Turn this to False to include
            them.
        :return: a pandas DataFrame indexed by arc name
        """
        if self.is_nparray:
            raise NotImplementedError("TODO")
        else:
            # 1. gather the data and unpivot it so that there is one row per arc
            res_df = pd.DataFrame({
                'cond_entropy': self.Hcond.unstack(),
                'rel_cond_entropy': self.Hcond_rel.unstack(),
            })
            res_df.index.names = ['from', 'to']
            res_df = res_df.reset_index()

            # 2. filter out node-to-self if needed
            if drop_self_link:
                res_df = res_df.loc[res_df['from'] != res_df['to']].copy()

            # 3. create the arc names column and make it the index
            def make_arc_str(row):
                return "%s->%s" % (row['from'], row.to)  # note that .from is a reserved python symbol !
            res_df['arc'] = res_df.apply(make_arc_str, axis=1)
            res_df.set_index('arc', inplace=True)

            # 4. Optionally sort differently
            all_possibilities = ("from", "to", "cond_entropy", "rel_cond_entropy")
            if sort_by == all_possibilities[0]:
                pass  # already done
            elif sort_by in all_possibilities[1:]:
                res_df.sort_values(by=sort_by, inplace=True)
            else:
                raise ValueError("Invalid value for `sort_by`: %r. Possible values: %r" % (sort_by, all_possibilities))

            # 5. Optionally drop the from and to columns
            if not from_to:
                res_df.drop(['from', 'to'], axis=1, inplace=True)

            return res_df

    def plot_increasing_entropies(self):
        """
        Plots the relative conditional entropy of every arc X->Y, sorted ascending.
        Requires matplotlib.

        :return: None (draws on the current matplotlib figure)
        """
        # local import so matplotlib remains an optional dependency
        import matplotlib.pyplot as plt

        # get the dataframe
        df = self.get_entropies_table(sort_by="rel_cond_entropy")

        # plot with all ticks
        df_sorted = df["rel_cond_entropy"]  # .sort_values()
        df_sorted.plot(title="Relative conditional entropy H(X|Y)/H(X) for each arc X->Y, by increasing order",
                       figsize=(15, 10))
        plt.xticks(np.arange(len(df_sorted)), df_sorted.index, rotation=90)

927 

928 

def get_adjacency_matrix(df,  # type: Union[np.ndarray, pd.DataFrame]
                         eps_absolute=None,  # type: float
                         eps_relative=None  # type: float
                         ):
    """Builds the boolean adjacency matrix of quasi-deterministic arcs in `df`.

    An arc X -> Y is created when the conditional entropy is below a threshold:
    H(Y|X) <= `eps_absolute` (default 0) or, if `eps_relative` is provided,
    H(Y|X)/H(Y) <= `eps_relative`. The two thresholds are mutually exclusive.

    :param df: a pandas DataFrame or 2D numpy array where variables are the columns
    :param eps_absolute: absolute threshold on H(Y|X)
    :param eps_relative: relative threshold on H(Y|X)/H(Y); exclusive with `eps_absolute`
    :return: a tuple (A, df_stats) where A is the adjacency matrix (same flavor as
        `df`) and df_stats is the `Entropies` object computed along the way
    :raises ValueError: if both `eps_absolute` and `eps_relative` are given
    """
    stats = Entropies(df)

    # (1) create initial adjacency matrix by thresholding either H(X|Y) (absolute)
    # or H(X|Y)/H(X) (relative). Hcond is (i, j) = H(Xi|Xj): the transpose turns
    # "H(row|col) small" into "arc col -> row", i.e. arc_mask[i, j] == arc i -> j.
    if eps_relative is not None:
        if eps_absolute is not None:
            raise ValueError("Only one of `eps_absolute` and `eps_relative` should be provided")
        arc_mask = (stats.Hcond_rel_ar <= eps_relative).T
    else:
        # default to strict determinism when no threshold is provided
        arc_mask = (stats.Hcond_ar <= (0 if eps_absolute is None else eps_absolute)).T

    # no self-loops
    np.fill_diagonal(arc_mask, False)

    # finally wrap in the same flavor as the input
    if stats.is_nparray:
        return arc_mask, stats
    return pd.DataFrame(arc_mask, index=stats.varnames, columns=stats.varnames), stats

967 

968 

def remove_redundancies(A,  # type: Union[np.ndarray, pd.DataFrame]
                        selection_order=None  # type: np.ndarray
                        ):
    """Cleans the arcs in A between redundant variables.

    A should be a dataframe or numpy array where index = columns and indicate the node name.
    It should contain boolean values where True at (i,j) means there is a directed arc i -> j

    When there are redundant variables (arcs between i->j and j->i), only a single node is kept as the parent
    of all other redundant nodes.

    :param A: an adjacency matrix where A(i, j) indicates that there is an arc i -> j
    :param selection_order: if none (default) the first node in the list will be kept as representant for each
        redundancy class. Otherwise an alternate order (array of indices) can be provided.
    :return: a cleaned copy of A, of the same type (numpy array or DataFrame) as the input
    """
    assert_adjacency_matrix(A)

    # work on a copy, preserving the input flavor
    is_nparray = isinstance(A, np.ndarray)
    if is_nparray:
        A = A.copy()
        A_df = None
    else:
        A_df = A.copy(deep=True)
        A = A_df.values  # view on the copy: edits below also hit A_df

    # init
    n_vars = A.shape[0]
    if selection_order is None:
        selection_order = range(n_vars)

    # I[i] stays True while node i has not yet been assigned to a redundancy class
    I = np.ones((n_vars, ), dtype=bool)

    # for each node, in the requested priority order
    for i in selection_order:
        # if we did not yet take it into account
        if I[i]:
            # find redundant nodes with this node (= there is an arc in both directions)
            mask_redundant_with_i = A[i, :] * A[:, i]

            # the whole redundancy class is handled now: never revisit its members
            I[mask_redundant_with_i] = False

            # we do not care about i-to-i
            mask_redundant_with_i[i] = False

            # if there is at least one redundant node
            if any(mask_redundant_with_i):
                redundant_nodes_idx = np.where(mask_redundant_with_i)[0]

                # let only the first variable in the list (i) be the parent:
                # i becomes the representant of the redundancy class
                A[redundant_nodes_idx, i] = False

                # BUGFIX: remove ALL arcs between inter-redundant variables.
                # The previous `A[redundant_nodes_idx, redundant_nodes_idx] = False`
                # is pairwise fancy indexing and only cleared the (k, k) diagonal
                # cells, so arcs among non-representant redundant nodes survived.
                # `np.ix_` selects the full cross product, as the docstring intends.
                A[np.ix_(redundant_nodes_idx, redundant_nodes_idx)] = False

    return A if is_nparray else A_df

1035 

1036 

def to_forest_parents_indices(A,  # type: Union[np.ndarray, pd.DataFrame]
                              selection_order=None  # type: np.ndarray
                              ):
    # type: (...) -> Union[np.ndarray, pd.Series]
    """ Removes extra arcs in the adjacency matrix A by only keeping the first parent in the given order

    Returns a 1D array of parent index or -1 if root

    :param A: an adjacency matrix, as a dataframe or numpy array
    :param selection_order: an optional order for parent selection. If not provided, the first in the list of columns
        of A will be used
    :return: for each variable, the index of its selected parent (-1 for roots); a
        pandas Series indexed by column name if A is a DataFrame, else a numpy array
    """
    assert_adjacency_matrix(A)
    is_np_array = isinstance(A, np.ndarray)

    # boolean parent->child matrix, rows re-ordered by parent priority if requested
    # (see https://stackoverflow.com/a/47269413/7262247)
    values = A if is_np_array else A.values
    mask = values if selection_order is None else values[selection_order, :]

    # position (in `mask` row order) of the first candidate parent of each column
    first_pos = mask.argmax(axis=0)
    # translate positions back to original node indices when a custom order was used
    if selection_order is not None:
        first_pos = selection_order[first_pos]

    # keep that first parent for columns that have one; -1 marks a root
    indices = np.where(mask.any(axis=0), first_pos, -1)

    if not is_np_array:
        indices = pd.Series(indices, index=A.columns)
    return indices

1066 

1067 

def to_forest_adjmat(A,  # type: Union[np.ndarray, pd.DataFrame]
                     selection_order,  # type: np.ndarray
                     inplace=False  # type: bool
                     ):
    # type: (...) -> Union[np.ndarray, pd.DataFrame]
    """ Removes extra arcs in the adjacency matrix A by only keeping the first parent in the given order

    :param A: an adjacency matrix, as a dataframe or numpy array
    :param selection_order: priority order (array of row indices) used to decide which parent survives
    :param inplace: if True, A is modified in place and nothing is returned;
        otherwise (default) a cleaned copy is returned
    :return: the cleaned copy when `inplace` is False, None otherwise
    """
    is_ndarray = isinstance(A, np.ndarray)
    assert_adjacency_matrix(A)

    if not inplace:
        A = A.copy()

    # A parent is "extra" when, walking the rows in priority order, the cumulative
    # parent count of its column already exceeds 1. All such cells are cleared.
    if is_ndarray:
        extra_parents = A[selection_order, :].cumsum(axis=0) > 1
        # undo the row permutation before applying the mask to A
        A[extra_parents[selection_order.argsort(), :]] = False
    else:
        # From https://stackoverflow.com/a/47269413/7262247
        # pandas boolean masking aligns on index labels, so no un-permutation needed
        extra_parents = A.iloc[selection_order, :].cumsum(axis=0) > 1
        A[extra_parents] = False

    if not inplace:
        return A

1105 

1106 

def get_arcs_from_parents_indices(parents,  # type: Union[np.ndarray, pd.DataFrame]
                                  multiindex=False,  # type: bool
                                  names=False  # type: bool
                                  ):
    # type: (...) -> Union[Iterable[Tuple[int, int]], Iterable[Tuple[str, str]], Tuple[Iterable[int], Iterable[int]], Tuple[Iterable[str], Iterable[str]]]
    """
    if multiindex = False ; returns a sequence of pairs : (9, 1), (3, 5), (9, 7)
    if multiindex = True ; returns two sequences of indices: (9, 3, 9), (1, 5, 7)

    :param parents: a 1D numpy array of parent indices (-1 for roots), or a DataFrame
        with 'idx' and 'name' columns as built by `QDForest`
    :param multiindex: if True, a (parents, childs) tuple of sequences is returned
        instead of a sequence of (parent, child) pairs
    :param names: if True, feature names are returned instead of indices (DataFrame input only)
    :return:
    """
    is_np_array = isinstance(parents, np.ndarray)

    if names:
        if is_np_array:
            raise ValueError("Names are not available, this is a numpy array")
        # keep only actual children: rows where a parent name exists
        child_to_parent = parents.loc[parents['name'].notna(), 'name']
        seqs = (child_to_parent.values, child_to_parent.index)
        return seqs if multiindex else zip(*seqs)

    if not is_np_array:
        # assume a dataframe with an 'idx' column as in QDForest class
        parents = parents['idx'].values
    child_mask = parents >= 0
    seqs = (parents[child_mask], np.arange(len(parents))[child_mask])
    return seqs if multiindex else zip(*seqs)

1139 

1140 

def get_arcs_from_adjmat(A,  # type: Union[np.ndarray, pd.DataFrame]
                         multiindex=False,  # type: bool
                         names=False  # type: bool
                         ):
    # type: (...) -> Union[Iterable[Tuple[int, int]], Iterable[Tuple[str, str]], Tuple[Iterable[int], Iterable[int]], Tuple[Iterable[str], Iterable[str]]]
    """
    Return the arcs of an adjacency matrix, an iterable of (parent, child) indices or names

    If 'multiindex' is True instead of returning an iterable of (parent, child), it returns a tuple of iterables
    (all the parents, all the childs).

    :param A: the adjacency matrix, as a numpy array or pandas DataFrame
    :param multiindex: if this is True, a 2-tuple of iterable is returned instead of an iterable of 2-tuples
    :param names: if False, indices are returned. Otherwise feature names are returned if any
    :return:
    """
    if not names:
        # index mode works for both flavors: np.where yields (parent_idxs, child_idxs)
        parent_child = np.where(A)
        return parent_child if multiindex else zip(*parent_child)

    if isinstance(A, np.ndarray):
        raise ValueError("Names are not available, this is a numpy array")

    cols = A.columns
    parent_child = np.where(A)
    if multiindex:
        # two lazy sequences: all parent names, then all child names
        return ((cols[i] for i in l) for l in parent_child)  # noqa
    # lazy sequence of (parent_name, child_name) pairs
    return ((cols[i], cols[j]) for i, j in zip(*parent_child))

1171 

1172 

def get_categorical_features(df_or_array,  # type: Union[np.ndarray, pd.DataFrame]
                             non_categorical_mode="strict"  # type: str
                             ):
    # type: (...) -> Union[np.ndarray, pd.DataFrame]
    """
    Returns the categorical features (columns) of the given dataset.

    :param df_or_array: a pandas DataFrame or a numpy array (structured or not) where
        the variables are the columns
    :param non_categorical_mode: "strict" (default) raises a ValueError when any
        non-categorical column is present; "remove" silently drops such columns
    :return: a dataframe or array with the categorical features
    :raises ValueError: on invalid `non_categorical_mode`, or on non-categorical
        columns in strict mode, or when no categorical column exists
    :raises TypeError: for an unstructured numpy array whose dtype is not object
    """
    assert_df_or_2D_array(df_or_array)

    if non_categorical_mode == "strict":
        strict_mode = True
    elif non_categorical_mode == "remove":
        strict_mode = False
    else:
        raise ValueError("Unsupported value for `non_categorical_mode`: %r" % non_categorical_mode)

    if isinstance(df_or_array, pd.DataFrame):
        # BUGFIX: pandas categorical columns stringify as "category", not "categorical",
        # so they were wrongly rejected in strict mode. Accept both spellings.
        is_categorical_dtype = df_or_array.dtypes.astype(str).isin(["object", "category", "categorical"])
        if strict_mode and not is_categorical_dtype.all():
            raise ValueError("Provided dataframe columns contains non-categorical datatypes (dtype in 'object' or "
                             "'categorical'): found dtypes %r. This is not supported when `non_categorical_mode` is set to "
                             "`'strict'`" % df_or_array.dtypes[~is_categorical_dtype].to_dict())
        elif not is_categorical_dtype.any():
            raise ValueError("Provided dataframe columns do not contain any categorical datatype (dtype in 'object' or "
                             "'categorical'): found dtypes %r" % df_or_array.dtypes[~is_categorical_dtype].to_dict())
        return df_or_array.loc[:, is_categorical_dtype]

    elif isinstance(df_or_array, np.ndarray):
        # see https://numpy.org/doc/stable/user/basics.rec.html#manipulating-and-displaying-structured-datatypes
        if df_or_array.dtype.names is not None:
            # structured array: one dtype per named field
            is_categorical_dtype = np.array([str(df_or_array.dtype.fields[n][0]) == "object"
                                             for n in df_or_array.dtype.names])
            if strict_mode and not is_categorical_dtype.all():
                # BUGFIX: np.dtype supports neither boolean-mask indexing nor `.asdict()`,
                # so the previous `dtype[~mask].asdict()` raised instead of reporting.
                # Build the {field_name: dtype} mapping of offending fields explicitly.
                invalid_dtypes = {n: df_or_array.dtype.fields[n][0]
                                  for n, is_cat in zip(df_or_array.dtype.names, is_categorical_dtype)
                                  if not is_cat}
                raise ValueError("Provided numpy array columns contains non-categorical datatypes ('object' dtype): "
                                 "found dtypes %r. This is not supported when `non_categorical_mode` is set to "
                                 "`'strict'`" % invalid_dtypes)
            elif not is_categorical_dtype.any():
                raise ValueError(
                    "Provided dataframe columns do not contain any categorical datatype (dtype in 'object' or "
                    "'categorical'): found dtypes %r" % df_or_array.dtype.fields)
            categorical_names = np.array(df_or_array.dtype.names)[is_categorical_dtype]
            return df_or_array[categorical_names]
        else:
            # non-structured array: a single dtype for the whole array
            if df_or_array.dtype != np.dtype('O'):
                raise TypeError("Provided array columns are not object nor categorical: found dtype %r"
                                % df_or_array.dtype)
            return df_or_array
    else:
        # Should not happen since `assert_df_or_2D_array` is called upfront now.
        raise TypeError("Provided data is neither a pd.DataFrame nor a np.ndarray")

1229 

1230 

def assert_adjacency_matrix(A  # type: Union[np.ndarray, pd.DataFrame]
                            ):
    """Routine to check that A is a proper adjacency matrix (2D and square).

    :param A: candidate adjacency matrix (numpy array or pandas DataFrame)
    :raises ValueError: if A is not 2-dimensional or not square
    """
    if len(A.shape) != 2:
        raise ValueError("A is not a 2D adjacency matrix, its shape is %sD" % len(A.shape))

    if A.shape[0] != A.shape[1]:
        # BUGFIX: wrap the shape tuple in a 1-tuple so `%` does not unpack it as
        # multiple format arguments (the original raised TypeError "not all
        # arguments converted" here instead of the intended ValueError)
        raise ValueError("A is not a 2D adjacency matrix: it is not square: %r" % (A.shape,))