1 # -*- coding: utf-8 -*-
# The above encoding declaration is needed because we use non-ascii characters in `get_trees_str_list`.
3 # The below import transforms automatically all strings in this file in unicode in python 2.
4 from __future__ import unicode_literals # See tests/encoding_ref_help.py for a detailed explanation
5 import numpy as np
6 import pandas as pd
7
8 from pyitlib import discrete_random_variable as drv
9
10 from .compat import encode_if_py2
11
12 try:
13 from typing import Union, Iterable, Tuple
14 except: # noqa
15 pass
16
17
18 # TODO
19 # Cramer's V + Theil's U:
20 # https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
21 # https://stackoverflow.com/questions/46498455/categorical-features-correlation
-
E501
Line too long (126 > 120 characters)
22 # https://stackoverflow.com/questions/20892799/using-pandas-calculate-cram%C3%A9rs-coefficient-matrix (+ corrected version)
23 # https://stackoverflow.com/questions/61236715/correlation-between-categorical-variables-within-a-dataset
24 #
25 # https://stackoverflow.com/questions/48035381/correlation-among-multiple-categorical-variables-pandas
26 # https://stackoverflow.com/questions/44694228/how-to-check-for-correlation-among-continuous-and-categorical-variables-in-pytho
27 #
28
29 def _add_names_to_parents_idx_series(parents):
30 parents = pd.DataFrame(parents, columns=('idx',))
31 parents['name'] = parents.index[parents['idx']].where(parents['idx'] >= 0, None)
32 return parents
33
34
class QDForest(object):
    """A quasi-deterministic forest returned by `qd_screen`"""
    __slots__ = (
        '_adjmat',         # a square np array or pd DataFrame containing the adjacency matrix (parent->child)
        '_parents',        # a 1d np array or a pandas Series relating each child to its parent index or -1 if a root
        'is_nparray',      # a boolean indicating if this was built from numpy array (and not pandas dataframe)
        '_roots_mask',     # a 1d np array or pd Series containing a boolean mask for root variables (lazily computed)
        '_roots_wc_mask',  # a 1d np array or pd Series containing a boolean mask for roots with children (lazy)
        'stats'            # an optional `Entropies` object stored for debug
    )
45
    def __init__(self,
                 adjmat=None,   # type: Union[np.ndarray, pd.DataFrame]
                 parents=None,  # type: Union[np.ndarray, pd.Series]
                 stats=None     # type: Entropies
                 ):
        """
        Creates a QDForest from either an adjacency matrix or a parents index array.

        :param adjmat: the adjacency matrix (parent -> child), a 2D numpy array or pandas DataFrame
        :param parents: a 1D numpy array or pandas Series mapping each node to its parent index (-1 for roots)
        :param stats: an optional `Entropies` object kept for debug/threshold analysis
        """
        self.stats = stats
        self._adjmat = adjmat
        # numpy mode is detected from whichever of the two structures was provided
        self.is_nparray = isinstance((adjmat if adjmat is not None else parents), np.ndarray)
        if parents is not None:
            if not self.is_nparray:
                # create a df with two columns: indices and names
                parents = _add_names_to_parents_idx_series(parents)
            self._parents = parents
        else:
            self._parents = None

        # lazily-computed caches (see `roots_mask` and `roots_with_children_mask`)
        self._roots_mask = None
        self._roots_wc_mask = None
69
70 def assert_stats_kept(self):
71 if self.stats is None:
72 raise ValueError("`stats` are not available in this QDForest instance. If you created it using `qd_screen`,"
73 " you should use `keep_stats=True`.")
74
75 def assert_pandas_capable(self):
76 """Utility method to assert that "not self.is_nparray" """
77 if self.is_nparray:
78 raise ValueError("This QDForest instance was built with numpy arrays, it is not pandas compliant")
79
80 def assert_names_available(self):
81 """"""
82 if self.is_nparray:
83 raise ValueError("Variable names are not available")
84
85 def _validate_names_arg(self,
86 names):
87 """Utility method to validate and set default value of the `names` argument used in most methods"""
88 if names is None:
89 names = not self.is_nparray
90 if names:
91 self.assert_names_available()
92 return names
93
94 @property
95 def nb_vars(self):
96 return self._parents.shape[0] if self._parents is not None else self._adjmat.shape[0]
97
98 @property
99 def varnames(self):
100 self.assert_names_available()
101 return np.array(self._adjmat.columns) if self._adjmat is not None else np.array(self._parents.index)
102
103 def indices_to_mask(self,
104 indices,
105 in_names=None,
106 out_names=None,
107 ):
108 """Utility to convert a list of indices to a numpy or pandas mask"""
109 in_names = self._validate_names_arg(in_names)
110 out_names = self._validate_names_arg(out_names)
111 mask = np.zeros((self.nb_vars,), dtype=bool)
112 if out_names:
113 mask = pd.Series(mask, index=self.varnames, dtype=bool)
114 if in_names and not out_names:
115 # TODO
116 mask[self.varnames] = True
117 elif not in_names and out_names:
118 mask.iloc[indices] = True
119 else:
120 mask[indices] = True
121 return mask
122
123 def mask_to_indices(self, mask):
124 """Utili"""
125 if isinstance(mask, np.ndarray):
126 return np.where(mask)
127 else:
128 return mask[mask].index
129
130 @property
131 def adjmat_ar(self):
132 """The adjacency matrix as a 2D numpy array"""
133 return self.adjmat if self.is_nparray else self.adjmat.values
134
    @property
    def adjmat(self):
        """The adjacency matrix as a pandas DataFrame or a 2D numpy array.

        Lazily computed from the parents array on first access, then cached in `self._adjmat`.
        Cell (i, j) is True when variable i is the (quasi-deterministic) parent of variable j.
        """
        if self._adjmat is None:
            # compute adjmat from parents and cache it
            n = self.nb_vars
            adjmat = np.zeros((n, n), dtype=bool)
            # set all (parent, child) cells at once through flat indexing,
            # from https://stackoverflow.com/a/46018613/7262247
            index_array = get_arcs_from_parents_indices(self.parents_indices_ar, multiindex=True)
            flat_index_array = np.ravel_multi_index(index_array, adjmat.shape)
            np.ravel(adjmat)[flat_index_array] = True
            if self.is_nparray:
                self._adjmat = adjmat
            else:
                # preserve the variable names as row and column headers
                self._adjmat = pd.DataFrame(adjmat, index=self.varnames, columns=self.varnames)
        return self._adjmat
151
152 @property
153 def parents_indices_ar(self):
154 """A numpy array containing the indices of all parent nodes"""
155 return self.parents if self.is_nparray else self.parents['idx'].values
156
    @property
    def parents(self):
        """A numpy array containing the indices of all parent nodes, or a pandas DataFrame
        containing an 'idx' and a 'name' column (one row per variable, -1/None for roots).

        Lazily computed from the adjacency matrix on first access, then cached.
        """
        if self._parents is None:
            # compute parents from adjmat, whether a dataframe or an array
            # TODO maybe use a sparse array here?
            parents = to_forest_parents_indices(self._adjmat)
            if not self.is_nparray:
                # create a df with two columns: indices and names
                parents = _add_names_to_parents_idx_series(parents)
            self._parents = parents
        return self._parents
169
170 @property
171 def roots_mask_ar(self):
172 """A 1D numpy mask array indicating if a node is a root node"""
173 return self.roots_mask if self.is_nparray else self.roots_mask.values
174
    @property
    def roots_mask(self):
        """A pandas Series or a 1D numpy mask array indicating if a node is a root node.

        Lazily computed on first access then cached: a root is a node with no parent, i.e.
        a column with no True cell in the adjacency matrix, or a parent index < 0.
        """
        if self._roots_mask is None:
            if self._adjmat is not None:
                self._roots_mask = ~self.adjmat.any(axis=0)
            else:
                self._roots_mask = (self.parents if self.is_nparray else self.parents['idx']) < 0
                if not self.is_nparray:
                    # remove the name of the series
                    self._roots_mask.name = None
        return self._roots_mask
187
188 @property
189 def roots_ar(self):
190 """A 1D numpy array of root indices"""
191 return np.where(self.roots_mask_ar)[0]
192
193 @property
194 def roots(self):
195 """A pandas Series of root names or 1D numpy array of root indices"""
196 if self.is_nparray:
197 return self.roots_ar
198 else:
199 return self.roots_mask.index[self.roots_mask]
200
201 def get_roots(self,
202 names=None # type: bool
203 ):
204 """
205 Returns the list of root indices or root names
206
207 :param names: an optional boolean indicating if this method should return names instead of indices. By
208 default `names` is set to `not self.is_np`
209 :return:
210 """
211 names = self._validate_names_arg(names)
212 return list(self.roots) if names else list(self.roots_ar)
213
214 @property
215 def non_roots_ar(self):
216 """A 1D numpy array of non-root indices"""
217 return np.where(~self.roots_mask_ar)[0]
218
219 @property
220 def non_roots(self):
221 """A pandas Series of non-root names or 1D numpy array of non-root indices"""
222 if self.is_nparray:
223 return self.non_roots_ar
224 else:
225 return self.roots_mask.index[~self.roots_mask]
226
227 def get_non_roots(self,
228 names=None # type: bool
229 ):
230 """
231 Returns the list of non-root indices or names
232
233 :param names: an optional boolean indicating if this method should return names instead of indices. By
234 default `names` is set to `not self.is_np`
235 :return:
236 """
237 names = self._validate_names_arg(names)
238 return list(self.non_roots) if names else list(self.non_roots_ar)
239
240 @property
241 def nodes_with_children_mask_ar(self):
242 """A 1D numpy mask array indicating nodes that have at least 1 child"""
243 return self.nodes_with_children_mask if self.is_nparray else self.nodes_with_children_mask.values
244
    @property
    def nodes_with_children_mask(self):
        """A pandas Series or 1D numpy array indicating nodes that have at least 1 child.

        Note: contrary to `roots_mask` this result is recomputed on every access (not cached).
        """
        if self._adjmat is not None:
            # a node has children when its row contains at least one True cell
            return self._adjmat.any(axis=1)
        else:
            # the distinct non-negative parent indices are exactly the nodes that have children
            nwc_idx = np.unique(self.parents_indices_ar[self.parents_indices_ar >= 0])
            return self.indices_to_mask(nwc_idx)
253
254 @property
255 def roots_with_children_mask_ar(self):
256 """A 1D numpy mask array indicating root nodes that have at least 1 child"""
257 return self.roots_with_children_mask if self.is_nparray else self.roots_with_children_mask.values
258
259 @property
260 def roots_with_children_mask(self):
261 """A pandas Series or 1D numpy mask array indicating root nodes that have at least 1 child"""
262 if self._roots_wc_mask is None:
263 if self._adjmat is not None:
264 # a root with children = a root AND a node with children
265 self._roots_wc_mask = self._roots_mask & self.nodes_with_children_mask
266 else:
267 # a root with children = a parent of a level 1 child node
268 level_1_nodes_mask = ((self.parents_indices_ar >= 0)
269 & (self.parents_indices_ar[self.parents_indices_ar] == -1))
270 rwc_indices = np.unique(self.parents_indices_ar[level_1_nodes_mask])
271 self._roots_wc_mask = self.indices_to_mask(rwc_indices, in_names=False, out_names=not self.is_nparray)
272 return self._roots_wc_mask
273
274 @property
275 def roots_with_children_ar(self):
276 """A 1D numpy array with the indices of root nodes that have at least 1 child"""
277 return np.where(self.roots_with_children_mask_ar)[0]
278
279 @property
280 def roots_with_children(self):
281 """A pandas Series or 1D numpy array with names/indices of root nodes that have at least 1 child"""
282 if self.is_nparray:
283 return self.roots_with_children_ar
284 else:
285 return self.roots_with_children_mask.index[self.roots_with_children_mask]
286
287 def get_roots_with_children(self,
288 names=None # type: bool
289 ):
290 """
291 Returns the list of root with children
292
293 :param names: an optional boolean indicating if this method should return names instead of indices. By
294 default `names` is set to `not self.is_np`
295 :return:
296 """
297 names = self._validate_names_arg(names)
298 return list(self.roots_with_children) if names else list(self.roots_with_children_ar)
299
    def get_children(self,
                     parent_idx=None,  # type: int
                     parent_name=None,  # type: str
                     names=None  # type: bool
                     ):
        """
        Returns the list of children of a node

        :param parent_idx: the index of the node to query for
        :param parent_name: the name of the node to query for
        :param names: a boolean to indicate if the returned children should be identified by their names (`True`)
            or indices (`False`). By default if the parent is identified with `parent_idx` indices will be
            returned (`False`), and if the parent is identified with `parent_name` names will be returned (`True`)
        :return: an array containing indices of the child nodes
        """

        # resolve which identifier was provided and default `names` accordingly
        if parent_idx is not None:
            if parent_name is not None:
                raise ValueError("Only one of `parent_idx` and `parent_name` should be provided")
            node = parent_idx
            if names is None:
                names = False
        elif parent_name is not None:
            node = parent_name
            if names is None:
                names = True
            self.assert_names_available()
        else:
            raise ValueError("You should provide a non-None `parent_idx` or `parent_name`")

        if self.is_nparray:
            return np.where(self.parents == node)[0]
        else:
            # query the column matching the kind of identifier that was provided
            qcol = 'idx' if parent_idx is not None else 'name'
            if not names:
                return np.where(self.parents[qcol] == node)[0]
            else:
                return self.parents[self.parents[qcol] == node].index.values
338
    def get_arcs(self,
                 multiindex=False,  # type: bool
                 names=None  # type: bool
                 ):
        # type: (...) -> Union[Iterable[Tuple[int, int]], Iterable[Tuple[str, str]], Tuple[Iterable[int], Iterable[int]], Tuple[Iterable[str], Iterable[str]]]
        """
        Return the arcs of an adjacency matrix, an iterable of (parent, child) indices or names

        If 'multiindex' is True instead of returning an iterable of (parent, child), it returns a tuple of iterables
        (all the parents, all the children).

        :param multiindex: if this is True, a 2-tuple of iterable is returned instead of an iterable of 2-tuples
        :param names: an optional boolean indicating if this method should return names instead of indices. By
            default `names` is set to `not self.is_np`
        :return: the arcs, as an iterable of pairs or a pair of iterables depending on `multiindex`
        """
        names = self._validate_names_arg(names)
        if self._adjmat is not None:
            return get_arcs_from_adjmat(self._adjmat, multiindex=multiindex, names=names)
        else:
            # NOTE(review): the full `_parents` structure (a DataFrame in pandas mode) is passed here,
            # while `adjmat` passes `parents_indices_ar` — confirm the helper handles both forms
            return get_arcs_from_parents_indices(self._parents, multiindex=multiindex, names=names)
361
362 def walk_arcs(self,
363 parent_idx=None, # type: int
364 parent_name=None, # type: str
365 names=None # type: bool
366 ):
367 """
368 Yields a sequence of (parent, child) indices or names. As opposed to `get_arcs` the sequence follows the tree
369 order: starting from the list of root nodes, for every node, it is returned first and then all of its children.
370 (depth first, not breadth first)
371
372 :param parent_idx: the optional index of a node. If provided, only the subtree behind this node will be walked
373 :param parent_name: the optional name of a node. If provided, only the subtree behind this node will be walked
374 :param names: an optional boolean indicating if this method should return names instead of indices. By
375 default `names` is set to `not self.is_np`
376 :return: yields a sequence of (level, i, j)
377 """
378 names = self._validate_names_arg(names)
379 if names:
-
E731
Do not assign a lambda expression, use a def
380 get_children = lambda node: self.get_children(parent_name=node)
381 else:
-
E731
Do not assign a lambda expression, use a def
382 get_children = lambda node: self.get_children(parent_idx=node)
383
384 def _walk(from_node, level):
385 for child in get_children(from_node):
386 yield level, from_node, child
-
E741
Ambiguous variable name 'l'
387 for l, i, j in _walk(child, level+1):
388 yield l, i, j
389
390 if parent_idx is not None:
391 if parent_name is not None:
392 raise ValueError("Only one of `parent_idx` and `parent_name` should be provided")
393 root_nodes = (parent_idx,) if not names else (self.parents.index[parent_idx],)
394 elif parent_name is not None:
395 root_nodes = (parent_name,)
396 else:
397 root_nodes = (self.roots if names else self.roots_ar)
398
399 for n in root_nodes:
400 # walk the subtree
401 for level, i, j in _walk(n, level=0):
402 yield level, i, j
403
404 # ------- display methods
405
    def to_str(self,
               names=None,  # type: bool
               mode="headers"  # type: str
               ):
        """
        Provides a string representation of this forest.

        :param names: an optional boolean indicating if this method should return names instead of indices. By
            default `names` is set to `not self.is_np`
        :param mode: one of "compact", "headers", and "full" (displays the trees)
        :return: a string representation of this forest
        """
        names = self._validate_names_arg(names)
        nb_vars = self.nb_vars
        roots = self.get_roots(names)
        non_roots = self.get_non_roots(names)
        nb_roots = len(roots)
        nb_arcs = nb_vars - nb_roots

        roots_with_children = self.get_roots_with_children(names)
        nb_roots_with_children = len(roots_with_children)

        # number of roots that have no children at all
        nb_sole_roots = (nb_roots - nb_roots_with_children)

        if mode == "compact":
            return "QDForest (%s vars = %s roots + %s determined by %s of the roots)" \
                   % (nb_vars, nb_roots, nb_arcs, nb_roots_with_children)

        # list of root node indices/names with a star when they have children
        roots_str = [("%s*" if r in roots_with_children else "%s") % r for r in roots]
        # list of non-root node indices/names
        non_roots_str = ["%s" % r for r in non_roots]

        headers = "\n".join((
            "QDForest (%s vars):" % nb_vars,
            " - %s roots (%s+%s*): %s" % (nb_roots, nb_sole_roots, nb_roots_with_children, ", ".join(roots_str)),
            " - %s other nodes: %s" % (nb_arcs, ", ".join(non_roots_str)),
        ))
        if mode == "headers":
            return headers
        elif mode == "full":
            tree_str = "\n".join(self.get_trees_str_list())
            return "%s\n\n%s" % (headers, tree_str)
        else:
            raise ValueError("Unknown mode: %r" % mode)
451
452 @encode_if_py2
453 def __str__(self):
454 """ String representation, listing all useful information when available """
455 if self.nb_vars > 30:
456 return self.to_str(mode="headers")
457 else:
458 return self.to_str(mode="full")
459
    @encode_if_py2
    def __repr__(self):
        """ Compact single-line representation (see `to_str` with mode="compact") """
        # return str(self)  # note if we use this then we'll have to comment the decorator
        return self.to_str(mode="compact")
464
465 def print_arcs(self,
466 names=None # type: bool
467 ):
468 """ Prints the various deterministic arcs in this forest """
469 print("\n".join(self.get_arcs_str_list(names=names)))
470
471 def get_arcs_str_list(self,
472 names=None # type: bool
473 ):
474 """ Returns a list of string representation of the various deterministic arcs in this forest """
475 res_str_list = []
476 for parent, child in self.get_arcs(names=names):
477 res_str_list.append("%s -> %s" % (parent, child))
478 return res_str_list
479
480 def get_trees_str_list(self,
481 all=False,
482 names=None # type: bool
483 ):
484 """
485 Returns a list of string representation of the various trees in this forest
486 TODO maybe use https://github.com/caesar0301/treelib ? Or networkX ?
487
488 :param all: if True, this will return also the trees that are consistuted of one node
489 :param names:
490 :return:
491 """
492 names = self._validate_names_arg(names)
493 res_str_list = []
494 if all:
495 roots = self.get_roots(names)
496 else:
497 roots = self.get_roots_with_children(names)
498
499 if names:
-
E731
Do not assign a lambda expression, use a def
500 walk_arcs = lambda n: self.walk_arcs(parent_name=n)
501 else:
-
E731
Do not assign a lambda expression, use a def
502 walk_arcs = lambda n: self.walk_arcs(parent_idx=n)
503
504 for r in roots:
505 subtree_str = "%s" % r
506 for level, _, j in walk_arcs(r):
507 subtree_str += "\n%s└─ %s" % (" " * level, j)
508 res_str_list.append(subtree_str + "\n")
509 return res_str_list
510
511 # -----------
512
    def fit_selector_model(self,
                           X  # type: Union[np.ndarray, pd.DataFrame]
                           ):
        # type: (...) -> QDSelectorModel
        """
        Returns a new instance of `QDSelectorModel` fit with the data in `X`

        :param X: the dataset (numpy array or pandas DataFrame) to fit the selector model on
        :return: a fitted `QDSelectorModel` built around this forest
        """
        # local import — presumably to avoid a cyclic import with `.selector`; TODO confirm
        from .selector import QDSelectorModel
        model = QDSelectorModel(self)
        model.fit(X)
        return model
527
528 # --------- tools for entropies analysis to find interesting thresholds
529
    def get_entropies_table(self,
                            from_to=True,  # type: bool
                            sort_by="from",  # type: str
                            drop_self_link=True,  # type: bool
                            ):
        """ See `Entropies.get_entropies_table`. Requires `keep_stats=True` at creation time. """

        self.assert_stats_kept()
        return self.stats.get_entropies_table(from_to=from_to, sort_by=sort_by, drop_self_link=drop_self_link)
539
    def plot_increasing_entropies(self):
        """ See `Entropies.plot_increasing_entropies`. Requires `keep_stats=True` at creation time. """

        self.assert_stats_kept()
        self.stats.plot_increasing_entropies()
545
546
def assert_df_or_2D_array(df_or_array  # type: Union[pd.DataFrame, np.ndarray]
                          ):
    """
    Validates that `df_or_array` is a pandas DataFrame or a 2D numpy array.

    :param df_or_array: the object to validate
    :return: None
    :raises ValueError: if a numpy array is provided but it is not 2-dimensional
    :raises TypeError: if the object is neither a `pd.DataFrame` nor a `np.ndarray`
    """
    if isinstance(df_or_array, pd.DataFrame):
        return
    if isinstance(df_or_array, np.ndarray):
        # see https://numpy.org/doc/stable/user/basics.rec.html#manipulating-and-displaying-structured-datatypes
        nb_dims = len(df_or_array.shape)
        if nb_dims != 2:
            raise ValueError("Provided data is not a 2D array, the number of dimensions is %s" % nb_dims)
        return
    # neither of the two supported types
    raise TypeError("Provided data is neither a `pd.DataFrame` nor a `np.ndarray`")
564
565
def qd_screen(X,  # type: Union[pd.DataFrame, np.ndarray]
              absolute_eps=None,  # type: float
              relative_eps=None,  # type: float
              keep_stats=False,  # type: bool
              non_categorical_mode='strict',
              ):
    # type: (...) -> QDForest
    """
    Finds the (quasi-)deterministic relationships (functional dependencies) between the variables in `X`, and returns
    a `QDForest` object representing the forest of (quasi-)deterministic trees. This object can then be used to fit a
    feature selection model or to learn a Bayesian Network structure.

    By default only deterministic relationships are detected. Quasi-determinism can be enabled by setting either
    a threshold on conditional entropies (`absolute_eps`) or on relative conditional entropies (`relative_eps`). Only
    one of them should be set.

    By default (`keep_stats=False`) the entropies tables are not preserved once the forest has been created. If you wish
    to keep them available, set `keep_stats=True`. The entropies tables are then available in the `<QDForest>.stats`
    attribute, and threshold analysis methods such as `<QDForest>.get_entropies_table(...)` and
    `<QDForest>.plot_increasing_entropies()` become available.

    :param X: the dataset as a pandas DataFrame or a numpy array. Columns represent the features to compare.
    :param absolute_eps: Absolute entropy threshold. Any feature `Y` that can be predicted from another feature `X` in
        a quasi-deterministic way, that is, where conditional entropy `H(Y|X) <= absolute_eps`, will be removed. The
        default value is `0` and corresponds to removing deterministic relationships only.
    :param relative_eps: Relative entropy threshold. Any feature `Y` that can be predicted from another feature `X` in
        a quasi-deterministic way, that is, where relative conditional entropy `H(Y|X)/H(Y) <= relative_eps` (a value
        between `0` and `1`), will be removed. Only one of `absolute_eps` or `relative_eps` should be provided.
    :param keep_stats: a boolean indicating if the various entropies tables computed in the process should be kept in
        memory in the resulting forest object (`<QDForest>.stats`), for further analysis. By default this is `False`.
    :param non_categorical_mode: how non-categorical features are handled; forwarded to `get_categorical_features`.
    :return: a `QDForest`
    """
    # Make sure this is a 2D table
    assert_df_or_2D_array(X)

    # Sanity check: are there rows in here ?
    if len(X) == 0:
        raise ValueError("Provided dataset does not contain any row")

    # Only work on the categorical features
    X = get_categorical_features(X, non_categorical_mode=non_categorical_mode)

    # Sanity check concerning the number of columns
    # (S101 fix: an explicit raise survives `python -O`, contrary to an `assert` statement)
    if X.shape[1] == 0:
        raise AssertionError("Internal error: no columns remain in dataset after preprocessing.")

    # parameters check and defaults
    if absolute_eps is None:
        if relative_eps is None:
            # nothing is provided, use absolute threshold 0
            absolute_eps = 0
            is_absolute = True
        else:
            # relative threshold provided
            is_absolute = False
    else:
        if relative_eps is not None:
            raise ValueError("only one of absolute and relative should be passed")
        # absolute threshold provided
        is_absolute = True

    is_strict = (absolute_eps == 0.) if is_absolute else (relative_eps == 0.)

    # sanity check
    if is_absolute and absolute_eps < 0:
        raise ValueError("epsilon_absolute should be positive")
    elif not is_absolute and (relative_eps < 0 or relative_eps > 1):
        raise ValueError("epsilon_relative should be 0=<eps=<1")

    # (0) compute conditional entropies or relative conditional entropies
    A_orig, X_stats = get_adjacency_matrix(X, eps_absolute=absolute_eps, eps_relative=relative_eps)

    # (2) identify redundancy
    # (E501 fix: the selection order is computed on its own line)
    selection_order = None if is_strict else X_stats.list_vars_by_entropy_order(desc=True)
    A_noredundancy = remove_redundancies(A_orig, selection_order=selection_order)

    # (3) transform into forest: remove extra parents by keeping only the parent with lowest entropy / nb levels ?
    # if X -> Z and X -> Y and Y -> Z then Z has two parents X and Y but only Y should be kept
    # Determinist case: minimize the number of parameters: take the minimal nb of levels
    # Quasi-determinist case: take the lowest entropy
    entropy_order_inc = (X_stats.H_ar).argsort()
    # parent_order = compute_nb_levels(df) if is_strict else entropy_based_order
    # A_forest = to_forest_adjmat(A_noredundancy, entropy_order_inc)
    # qd_forest = QDForest(adjmat=A_forest)
    parents = to_forest_parents_indices(A_noredundancy, entropy_order_inc)
    # if not X_stats.is_nparray:
    #     parents = pd.Series(parents, index=A_noredundancy.columns)
    qd_forest = QDForest(parents=parents, stats=X_stats if keep_stats else None)

    # (ported R code kept for reference; long lines wrapped to satisfy E501)
    # forestAdjMatList <- FromHToDeterForestAdjMat(H = H, criterion = criterionNlevels,
    #                                              removePerfectMatchingCol = removePerfectMatchingCol)

    # (4) convert to pgmpy format
    # deterForest <- FromAdjMatToBnLearn(adjMat = forestAdjMat)
    # rootsF <- Roots(forestAdjMat)

    # if(length(rootsFNames) == 1){
    #     print("Only one tree in the forest !!! No root graph computed")
    #     print("****************")
    #     print("Training - Phase 2 and Phase 3 obsolete")
    #     print("****************")
    #     gstar <- deterForest
    #     G_R <- NULL
    #     print("Final graph computed")
    # }else{
    #     print(paste("Multiple trees in the forest (", length(rootsFNames),
    #                 ") Root graph will be computed.", sep = "", collapse = ""))
    #     rootsOnly <- ReduceDataAndH(variablesToKeep = rootsFNames, dataFrame = data, H)
    #     dataRoots <- rootsOnly[[1]]
    #     HRoots <- rootsOnly[[2]]
    #     rm(rootsOnly)
    #     print("****************")
    #     print("Training - Phase 2: computation of the root graph")
    #     print("****************")
    #     # removePerfectlyMatchingCol useless here because either pmc are in the same trees
    #     # (but only one is the root), either they were deleted earlier on
    #     G_R <- SotABNsl(data = dataRoots, method = method, score = score, hyperparamList,
    #                     removePerfectMatchingCol = removePerfectMatchingCol, ...)
    #     edgeListG_R <- FromBnToEdgeList(bn = G_R)
    #     colnames(edgeListG_R) <- c('from', 'to')
    #     print("****************")
    #     print("Training - Phase 3: fusion of the deter forest with the root graph")
    #     print("****************")
    #     gstarAdjMat <- AddEdgesFromEdgeList(originalAdjMatrix = forestAdjMat, edgeListToAdd = edgeListG_R)
    #     gstar <- FromAdjMatToBnLearn(gstarAdjMat)
    #     print("Final graph computed")
    # }

    return qd_forest
689
690
class Entropies(object):
    """ Lazily computes and caches the entropies and (relative) conditional entropies of a dataset.

    TODO this could be easier to read with pyfields and default value factories
    """

    # dataset: the raw input; is_nparray: numpy vs pandas mode; _H/_Hcond/_Hcond_rel: lazy caches;
    # _dataset_df: cached pandas view of a numpy dataset
    __slots__ = ('dataset', 'is_nparray', '_H', '_Hcond', '_Hcond_rel', '_dataset_df')
695
    def __init__(self, df_or_array):
        """
        Creates a lazy statistics holder for the given dataset.

        :param df_or_array: a pandas dataframe or numpy array where the variables are the columns
        """
        self.dataset = df_or_array
        self.is_nparray = isinstance(df_or_array, np.ndarray)
        # lazily-computed caches, filled on first access of the corresponding properties
        self._H = None          # type: Union[np.ndarray, pd.Series]
        self._Hcond = None      # type: Union[np.ndarray, pd.DataFrame]
        self._Hcond_rel = None  # type: Union[np.ndarray, pd.DataFrame]
        self._dataset_df = None  # type: pd.DataFrame
707
708 def __str__(self):
709 return repr(self)
710
    def __repr__(self):
        """Detailed multi-line representation showing the dataset head and all entropy tables."""
        # NOTE(review): `self.dataset.head(2)` assumes a pandas dataset — confirm numpy mode is handled by callers
        res = """Statistics computed for dataset:
%s
...(%s rows)

Entropies (H):
%s

Conditional entropies (Hcond = H(row|col)):
%s

Relative conditional entropies (Hcond_rel = H(row|col)/H(row)):
%s
""" % (self.dataset.head(2), len(self.dataset), self.H.T, self.Hcond, self.Hcond_rel)
        return res
726
727 @property
728 def dataset_df(self):
729 """The dataset as a pandas DataFrame, if pandas is available"""
730 if self.is_nparray:
731 # see https://numpy.org/doc/stable/user/basics.rec.html#manipulating-and-displaying-structured-datatypes
732 if self.dataset.dtype.names is not None:
733 # structured array
734 self._dataset_df = pd.DataFrame(self.dataset)
735 else:
736 # unstructured array... same ?
737 self._dataset_df = pd.DataFrame(self.dataset)
738 return self._dataset_df
739 else:
740 return self.dataset
741
742 @property
743 def nb_vars(self):
744 return self.dataset.shape[1]
745
746 @property
747 def varnames(self):
748 if self.is_nparray:
749 raise ValueError("Variable names not available")
750 return list(self.dataset.columns)
751
752 @property
753 def H_ar(self):
754 """ The entropy matrix (i, j) = H(Xi | Xj) as a numpy array """
755 return self.H if self.is_nparray else self.H.values
756
757 entropies_ar = H_ar
758 """An alias for H_ar"""
759
760 @property
761 def H(self):
762 """The entropies of all variables. a pandas Series if df was a pandas dataframe, else a 1D numpy array"""
763 if self._H is None:
764 # Using pyitlib to compute H (hopefully efficiently)
765 # Unfortunately this does not work with numpy arrays, convert to pandas TODO report
766 # note: we convert to string type to avoid a bug with ints. TODO...
767 self._H = drv.entropy(self.dataset_df.T.astype(str))
768 if not self.is_nparray:
769 self._H = pd.Series(self._H, index=self.varnames)
770
771 # basic sanity check: should all be positive
-
S101
Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.
772 assert np.all(self._H >= 0)
773 return self._H
774
775 entropies = H
776 """An alias for H"""
777
778 @property
779 def Hcond_ar(self):
780 """ The conditional entropy matrix (i, j) = H(Xi | Xj) as a numpy array """
781 return self.Hcond if self.is_nparray else self.Hcond.values
782
783 conditional_entropies_ar = Hcond_ar
784 """An alias for Hcond_ar"""
785
786 @property
787 def Hcond(self):
788 """
789 The conditional entropy matrix (i, j) = H(Xi | Xj).
790 A pandas Dataframe or a 2D numpy array depending on dataset type
791 """
792 if self._Hcond is None:
793 # Old attempt to do it ourselves
794 # (0) init H
795 # nb_vars = len(df.columns)
796 # H = np.empty((nb_vars, nb_vars), dtype=float)
797 # (1) for each column compute the counts per value
798 # (2) for each (i, j) pair compute the counts
799
800 # Using pyitlib to compute H (hopefully efficiently)
801 # Unfortunately this does not work with numpy arrays, convert to pandas TODO report
802 # note: we convert to string type to avoid a bug with ints. TODO...
803 self._Hcond = drv.entropy_conditional(self.dataset_df.T.astype(str))
804
805 # add the row/column headers
806 if not self.is_nparray:
807 self._Hcond = pd.DataFrame(self._Hcond, index=self.varnames, columns=self.varnames)
808
809 # basic sanity check: should all be positive
-
S101
Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.
810 assert np.all(self._Hcond >= 0)
811 return self._Hcond
812
813 conditional_entropies = Hcond
814 """An alias for Hcond"""
815
816 @property
817 def Hcond_rel_ar(self):
818 """ The relative conditional entropy matrix (i, j) = H(Xi | Xj) / H(Xi) as a numpy array """
819 return self.Hcond_rel if self.is_nparray else self.Hcond_rel.values
820
821 relative_conditional_entropies_ar = Hcond_rel_ar
822 """An alias for Hcond_rel_ar"""
823
824 @property
825 def Hcond_rel(self):
826 """
827 The relative conditional entropy matrix (i, j) = H(Xi | Xj) / H(Xi).
828 """
829 if self._Hcond_rel is None:
830 # compute relative entropy: in each cell (X, Y) we want H(X|Y)/H(X)
831 if self.is_nparray:
832 self._Hcond_rel = (self.Hcond.T / self.H).T
833 else:
834 Hrel_array = (self.Hcond.values.T / self.H.values).T
-
E501
Line too long (124 > 120 characters)
835 self._Hcond_rel = pd.DataFrame(Hrel_array, index=list(self.Hcond.columns), columns=list(self.Hcond.columns))
836
837 # basic sanity check
-
S101
Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.
838 assert np.all(self._Hcond_rel >= 0.)
-
S101
Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.
839 assert np.all(self._Hcond_rel <= 1.)
840 return self._Hcond_rel
841
842 relative_conditional_entropies = Hcond_rel
843 """An alias for Hcond_rel"""
844
845 def list_vars_by_entropy_order(self, desc=False):
846 """
847 Returns the indices or names of variables in ascending (resp. descending) order of entropy
848
849 :param desc: if True the order is descending, else ascending
850 :return:
851 """
852 if self.is_nparray:
853 return (-self.H).argsort() if desc else self.H.argsort()
854 else:
855 return (-self.H.values).argsort() if desc else self.H.values.argsort()
856
857 # --------- tools for entropies analysis to find interesting thresholds
858
859 def get_entropies_table(self,
860 from_to=True, # type: bool
861 sort_by="from", # type: str
862 drop_self_link=True, # type: bool
863 ):
864 """
865 Returns a pandas series or numpy array with four columns: from, to, cond_entropy, rel_cond_entropy.
866 The index is 'arc', a string representing the arc e.g. "X->Y".
867
868 :param from_to: a boolean flag indicating if 'from' and 'to' columns should remain in the returned dataframe
869 (True, default) or be dropped (False)
870 :param sort_by: a string indicating if the arcs should be sorted by origin ("from", default) or destination
871 ("to"), or by value "rel_cond_entropy" or "cond_entropy", in the resulting table.
872 :param drop_self_link: by default node-to-self arcs are not returned in the list. Turn this to False to include
873 them.
874 :return:
875 """
876 if self.is_nparray:
877 raise NotImplementedError("TODO")
878 else:
879 # 1. gather the data and unpivot it so that there is one row per arc
880 res_df = pd.DataFrame({
881 'cond_entropy': self.Hcond.unstack(),
882 'rel_cond_entropy': self.Hcond_rel.unstack(),
883 })
884 res_df.index.names = ['from', 'to']
885 res_df = res_df.reset_index()
886
887 # 2. filter out node-to-self if needed
888 if drop_self_link:
889 res_df = res_df.loc[res_df['from'] != res_df['to']].copy()
890
891 # 3. create the arc names column and make it the index
892 def make_arc_str(row):
893 return "%s->%s" % (row['from'], row.to) # note that .from is a reserved python symbol !
894 res_df['arc'] = res_df.apply(make_arc_str, axis=1)
895 res_df.set_index('arc', inplace=True)
896
897 # 4. Optionally sort differently
898 all_possibilities = ("from", "to", "cond_entropy", "rel_cond_entropy")
899 if sort_by == all_possibilities[0]:
900 pass # already done
901 elif sort_by in all_possibilities[1:]:
902 res_df.sort_values(by=sort_by, inplace=True)
903 else:
904 raise ValueError("Invalid value for `sort_by`: %r. Possible values: %r" % (sort_by, all_possibilities))
905
906 # 5. Optionally drop the from and to columns
907 if not from_to:
908 res_df.drop(['from', 'to'], axis=1, inplace=True)
909
910 return res_df
911
912 def plot_increasing_entropies(self):
913 """
914
915 :return:
916 """
917 import matplotlib.pyplot as plt
918
919 # get the dataframe
920 df = self.get_entropies_table(sort_by="rel_cond_entropy")
921
922 # plot with all ticks
923 df_sorted = df["rel_cond_entropy"] # .sort_values()
924 df_sorted.plot(title="Relative conditional entropy H(X|Y)/H(X) for each arc X->Y, by increasing order",
925 figsize=(15, 10))
926 plt.xticks(np.arange(len(df_sorted)), df_sorted.index, rotation=90)
927
928
def get_adjacency_matrix(df,                 # type: Union[np.ndarray, pd.DataFrame]
                         eps_absolute=None,  # type: float
                         eps_relative=None   # type: float
                         ):
    """
    Computes the quasi-determinism adjacency matrix of `df` by thresholding conditional entropies.

    :param df: the dataset, a pandas DataFrame or a numpy array
    :param eps_absolute: absolute threshold on the conditional entropy H(X|Y). Used when `eps_relative` is not
        provided; defaults to 0 in that case. Mutually exclusive with `eps_relative`.
    :param eps_relative: relative threshold on H(X|Y)/H(X). Mutually exclusive with `eps_absolute`.
    :return: a tuple (A, df_stats) where A is the boolean adjacency matrix (same container type as the dataset)
        and df_stats is the entropies object computed from `df`
    """
    df_stats = Entropies(df)

    # (1) threshold the (transposed) conditional entropy matrix, absolute or relative flavor
    if eps_relative is not None:
        if eps_absolute is not None:
            raise ValueError("Only one of `eps_absolute` and `eps_relative` should be provided")
        # threshold on H(X|Y)/H(X)
        edges = (df_stats.Hcond_rel_ar <= eps_relative).T
    else:
        # default value for eps absolute
        if eps_absolute is None:
            eps_absolute = 0
        # threshold on H(X|Y)
        edges = (df_stats.Hcond_ar <= eps_absolute).T

    # (2) no self-loops: the diagonal should be false
    np.fill_diagonal(edges, False)

    # (3) wrap with variable names when the dataset was a pandas one
    if df_stats.is_nparray:
        A = edges
    else:
        A = pd.DataFrame(edges, index=df_stats.varnames, columns=df_stats.varnames)

    return A, df_stats
967
968
def remove_redundancies(A,                    # type: Union[np.ndarray, pd.DataFrame]
                        selection_order=None  # type: np.ndarray
                        ):
    """Cleans the arcs in A between redundant variables.

    A should be a dataframe or numpy array where index = columns and indicate the node name.
    It should contain boolean values where True at (i,j) means there is a directed arc i -> j

    When there are redundant variables (arcs between i->j and j->i), only a single node is kept as the parent
    of all other redundant nodes.

    :param A: an adjacency matrix where A(i, j) indicates that there is an arc i -> j
    :param selection_order: if none (default) the first node in the list will be kept as representant for each
        redundancy class. Otherwise an alternate order (array of indices) can be provided.
    :return: a cleaned copy of A (same container type as the input)
    """
    assert_adjacency_matrix(A)

    # work on a copy so that the caller's matrix is never modified
    is_nparray = isinstance(A, np.ndarray)
    if is_nparray:
        A = A.copy()
        A_df = None
    else:
        A_df = A.copy(deep=True)
        A = A_df.values

    # init
    n_vars = A.shape[0]
    if selection_order is None:
        selection_order = range(n_vars)

    # `pending` flags the variables not yet assigned to a redundancy class
    # (renamed from `I`, which was ambiguous - lint E741)
    pending = np.ones((n_vars,), dtype=bool)

    # for each node, in the requested priority order
    for i in selection_order:
        # if we did not yet take it into account
        if pending[i]:
            # find redundant nodes with this node (= there is an arc in both directions)
            mask_redundant_with_i = A[i, :] * A[:, i]

            # we will stop caring about this redundancy class
            pending[mask_redundant_with_i] = False

            # we do not care about i-to-i
            mask_redundant_with_i[i] = False

            # if there is at least one redundant node
            if any(mask_redundant_with_i):
                redundant_nodes_idx = np.where(mask_redundant_with_i)[0]

                # let only the first variable in the order (i) be the parent:
                # i becomes the representant of the redundancy class
                A[redundant_nodes_idx, i] = False

                # remove ALL arcs between the (non-representant) redundant variables themselves.
                # BUGFIX: the previous `A[idx, idx] = False` paired fancy indexing only cleared the
                # diagonal cells (k, k), so for redundancy classes of size >= 3 the arcs between
                # non-representant members were left in place. `np.ix_` clears the whole sub-matrix.
                # Note that this destroys the diagonal of that sub-matrix, as before.
                A[np.ix_(redundant_nodes_idx, redundant_nodes_idx)] = False

    # restore the diagonal: no
    # np.fill_diagonal(A, True)
    if is_nparray:
        return A
    else:
        return A_df
1035
1036
def to_forest_parents_indices(A,                    # type: Union[np.ndarray, pd.DataFrame]
                              selection_order=None  # type: np.ndarray
                              ):
    # type: (...) -> Union[np.ndarray, pd.DataFrame]
    """ Removes extra arcs in the adjacency matrix A by only keeping the first parent in the given order

    Returns a 1D array of parent index or -1 if root

    :param A: an adjacency matrix, as a dataframe or numpy array
    :param selection_order: an optional order for parent selection. If not provided, the first in the list of columns
        of A will be used
    :return:
    """
    assert_adjacency_matrix(A)
    is_np_array = isinstance(A, np.ndarray)

    # reorder the rows (candidate parents) according to the selection order, if any
    # From https://stackoverflow.com/a/47269413/7262247
    if is_np_array:
        mask = A if selection_order is None else A[selection_order, :]
    else:
        mask = A.values if selection_order is None else A.iloc[selection_order, :].values

    # for each column (feature): the first True row is its parent
    first_parent = mask.argmax(axis=0)
    if selection_order is not None:
        # translate back into original row indices
        first_parent = selection_order[first_parent]

    # features whose column has no True at all are roots: use -1 there
    indices = np.where(mask.any(axis=0), first_parent, -1)

    if not is_np_array:
        indices = pd.Series(indices, index=A.columns)
    return indices
1066
1067
def to_forest_adjmat(A,                # type: Union[np.ndarray, pd.DataFrame]
                     selection_order,  # type: np.ndarray
                     inplace=False     # type: bool
                     ):
    # type: (...) -> Union[np.ndarray, pd.DataFrame]
    """ Removes extra arcs in the adjacency matrix A by only keeping the first parent in the given order

    :param A: an adjacency matrix, as a dataframe or numpy array
    :param selection_order: the order (array of row indices) in which candidate parents are considered
    :param inplace: if False (default) a cleaned copy is returned; if True, A is modified and nothing is returned
    :return:
    """
    is_ndarray = isinstance(A, np.ndarray)
    assert_adjacency_matrix(A)

    if not inplace:
        A = A.copy()

    # Scanning the rows in `selection_order`: once the cumulative number of parents of a column
    # exceeds 1, every further parent of that column has to be removed.
    if is_ndarray:
        mask_parents_over_the_first = A[selection_order, :].cumsum(axis=0) > 1
        # put the mask rows back in the original row order before applying
        mask_parents_over_the_first = mask_parents_over_the_first[selection_order.argsort(), :]
        A[mask_parents_over_the_first] = False
    else:
        # From https://stackoverflow.com/a/47269413/7262247
        mask_parents_over_the_first = A.iloc[selection_order, :].cumsum(axis=0) > 1
        A[mask_parents_over_the_first] = False

    if not inplace:
        return A
1105
1106
def get_arcs_from_parents_indices(parents,           # type: Union[np.ndarray, pd.DataFrame]
                                  multiindex=False,  # type: bool
                                  names=False        # type: bool
                                  ):
    # type: (...) -> Union[Iterable[Tuple[int, int]], Iterable[Tuple[str, str]], Tuple[Iterable[int], Iterable[int]], Tuple[Iterable[str], Iterable[str]]]
    """
    Returns the arcs (parent, child) described by a parents structure.

    if multiindex = False ; returns a sequence of pairs : (9, 1), (3, 5), (9, 7)
    if multiindex = True ; returns two sequences of indices: (9, 3, 9), (1, 5, 7)

    :param parents: a 1D numpy array of parent indices (-1 for roots), or a dataframe with 'idx' and 'name'
        columns as in the QDForest class
    :param multiindex: if True, a 2-tuple of sequences (parents, children) is returned instead of pairs
    :param names: if True, variable names are returned instead of indices (pandas input only)
    :return:
    """
    is_np_array = isinstance(parents, np.ndarray)
    if names:
        if is_np_array:
            raise ValueError("Names are not available, this is a numpy array")
        # keep only the non-root rows; their 'name' column holds the parent's name
        parent_names = parents.loc[parents['name'].notna(), 'name']
        seqs = parent_names.values, parent_names.index
    else:
        if not is_np_array:
            # assume a dataframe with an 'idx' column as in QDForest class
            parents = parents['idx'].values
        # children are the entries with a valid (>= 0) parent index
        has_parent = parents >= 0
        seqs = parents[has_parent], np.arange(len(parents))[has_parent]

    return seqs if multiindex else zip(*seqs)
1139
1140
def get_arcs_from_adjmat(A,                 # type: Union[np.ndarray, pd.DataFrame]
                         multiindex=False,  # type: bool
                         names=False        # type: bool
                         ):
    # type: (...) -> Union[Iterable[Tuple[int, int]], Iterable[Tuple[str, str]], Tuple[Iterable[int], Iterable[int]], Tuple[Iterable[str], Iterable[str]]]
    """
    Return the arcs of an adjacency matrix, an iterable of (parent, child) indices or names

    If 'multiindex' is True instead of returning an iterable of (parent, child), it returns a tuple of iterables
    (all the parents, all the childs).

    :param A:
    :param multiindex: if this is True, a 2-tuple of iterable is returned instead of an iterable of 2-tuples
    :param names: if False, indices are returned. Otherwise feature names are returned if any
    :return:
    """
    if not names:
        arc_indices = np.where(A)
        return arc_indices if multiindex else zip(*arc_indices)

    if isinstance(A, np.ndarray):
        raise ValueError("Names are not available, this is a numpy array")

    # translate row/column indices into column names
    cols = A.columns
    arc_indices = np.where(A)
    if multiindex:
        return ((cols[i] for i in l) for l in arc_indices)  # noqa
    return ((cols[i], cols[j]) for i, j in zip(*arc_indices))
1171
1172
def get_categorical_features(df_or_array,                   # type: Union[np.ndarray, pd.DataFrame]
                             non_categorical_mode="strict"  # type: str
                             ):
    # type: (...) -> Union[np.ndarray, pd.DataFrame]
    """
    Returns the categorical features of `df_or_array`.

    :param df_or_array: a pandas DataFrame, or a numpy array (structured or not)
    :param non_categorical_mode: "strict" (default) raises when non-categorical columns are present; "remove"
        silently drops them instead
    :return: a dataframe or array with the categorical features
    :raises ValueError: for an invalid `non_categorical_mode`, for non-categorical columns in "strict" mode, or
        when no categorical column at all is found
    :raises TypeError: for a non-structured numpy array whose dtype is not object, or for unsupported input types
    """
    assert_df_or_2D_array(df_or_array)

    if non_categorical_mode == "strict":
        strict_mode = True
    elif non_categorical_mode == "remove":
        strict_mode = False
    else:
        raise ValueError("Unsupported value for `non_categorical_mode`: %r" % non_categorical_mode)

    if isinstance(df_or_array, pd.DataFrame):
        # BUGFIX: the string form of the pandas categorical dtype is "category", not "categorical", so
        # categorical columns were previously never detected. "categorical" is kept for compatibility.
        is_categorical_dtype = df_or_array.dtypes.astype(str).isin(["object", "categorical", "category"])
        if strict_mode and not is_categorical_dtype.all():
            raise ValueError("Provided dataframe columns contains non-categorical datatypes (dtype in 'object' or "
                             "'categorical'): found dtypes %r. This is not supported when "
                             "`non_categorical_mode` is set to `'strict'`"
                             % df_or_array.dtypes[~is_categorical_dtype].to_dict())
        elif not is_categorical_dtype.any():
            raise ValueError("Provided dataframe columns do not contain any categorical datatype (dtype in "
                             "'object' or 'categorical'): found dtypes %r"
                             % df_or_array.dtypes[~is_categorical_dtype].to_dict())
        return df_or_array.loc[:, is_categorical_dtype]

    elif isinstance(df_or_array, np.ndarray):
        # see https://numpy.org/doc/stable/user/basics.rec.html#manipulating-and-displaying-structured-datatypes
        if df_or_array.dtype.names is not None:
            # structured array: a field is categorical when its dtype is `object`
            is_categorical_dtype = np.array([str(df_or_array.dtype.fields[n][0]) == "object"
                                             for n in df_or_array.dtype.names])
            if strict_mode and not is_categorical_dtype.all():
                # BUGFIX: `dtype[mask].asdict()` raised AttributeError (np.dtype has no `asdict` method and does
                # not support boolean-mask indexing); build the {field: dtype} mapping explicitly instead.
                invalid_dtypes = {n: df_or_array.dtype.fields[n][0]
                                  for n, is_cat in zip(df_or_array.dtype.names, is_categorical_dtype)
                                  if not is_cat}
                raise ValueError("Provided numpy array columns contains non-categorical datatypes ('object' dtype): "
                                 "found dtypes %r. This is not supported when `non_categorical_mode` is set to "
                                 "`'strict'`" % invalid_dtypes)
            elif not is_categorical_dtype.any():
                raise ValueError(
                    "Provided dataframe columns do not contain any categorical datatype (dtype in 'object' or "
                    "'categorical'): found dtypes %r" % df_or_array.dtype.fields)
            categorical_names = np.array(df_or_array.dtype.names)[is_categorical_dtype]
            return df_or_array[categorical_names]
        else:
            # non-structured array: the whole array must be an object array
            if df_or_array.dtype != np.dtype('O'):
                raise TypeError("Provided array columns are not object nor categorical: found dtype %r"
                                % df_or_array.dtype)
            return df_or_array
    else:
        # Should not happen since `assert_df_or_2D_array` is called upfront now.
        raise TypeError("Provided data is neither a pd.DataFrame nor a np.ndarray")
1229
1230
1231 def assert_adjacency_matrix(A # type: Union[np.ndarray, pd.DataFrame]
1232 ):
1233 """Routine to check that A is a proper adjacency matrix"""
1234
1235 if len(A.shape) != 2:
1236 raise ValueError("A is not a 2D adjacency matrix, its shape is %sD" % len(A.shape))
1237
1238 if A.shape[0] != A.shape[1]:
1239 raise ValueError("A is not a 2D adjacency matrix: it is not square: %r" % A.shape)