1 # -*- coding: utf-8 -*-
# The above encoding declaration is needed because we use non-ascii characters in `get_trees_str_list`.
3 # The below import transforms automatically all strings in this file in unicode in python 2.
4 from __future__ import unicode_literals # See tests/encoding_ref_help.py for a detailed explanation
5 import numpy as np
6 import pandas as pd
7
8 from pyitlib import discrete_random_variable as drv
9
10 from .compat import encode_if_py2
11
12 try:
13 from typing import Union, Iterable, Tuple
14 except: # noqa
15 pass
16
17
18 # TODO
19 # Cramer's V + Theil's U:
20 # https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
21 # https://stackoverflow.com/questions/46498455/categorical-features-correlation
-
E501
Line too long (126 > 120 characters)
22 # https://stackoverflow.com/questions/20892799/using-pandas-calculate-cram%C3%A9rs-coefficient-matrix (+ corrected version)
23 # https://stackoverflow.com/questions/61236715/correlation-between-categorical-variables-within-a-dataset
24 #
25 # https://stackoverflow.com/questions/48035381/correlation-among-multiple-categorical-variables-pandas
26 # https://stackoverflow.com/questions/44694228/how-to-check-for-correlation-among-continuous-and-categorical-variables-in-pytho
27 #
28
29 def _add_names_to_parents_idx_series(parents):
30 parents = pd.DataFrame(parents, columns=('idx',))
31 parents['name'] = parents.index[parents['idx']].where(parents['idx'] >= 0, None)
32 return parents
33
34
class QDForest(object):
    """A quasi-deterministic forest returned by `qd_screen`"""
    __slots__ = (
        '_adjmat',         # a square np array or pd DataFrame containing the adjacency matrix (parent->child)
        '_parents',        # a 1d np array or a pandas Series relating each child to its parent index or -1 if a root
        'is_nparray',      # a boolean indicating if this was built from numpy array (and not pandas dataframe)
        '_roots_mask',     # a 1d np array or pd Series containing a boolean mask for root variables (lazily computed)
        '_roots_wc_mask',  # a 1d np array or pd Series containing a boolean mask for roots with children (lazy)
        'stats'            # an optional `Entropies` object stored for debug
    )
45
    def __init__(self,
                 adjmat=None,   # type: Union[np.ndarray, pd.DataFrame]
                 parents=None,  # type: Union[np.ndarray, pd.Series]
                 stats=None     # type: Entropies
                 ):
        """
        Creates a QDForest from either an adjacency matrix or a parents index array.

        :param adjmat: the adjacency matrix (parent -> child), a 2D numpy array or pandas DataFrame
        :param parents: a 1D numpy array or pandas Series mapping each node to its parent index (-1 for roots)
        :param stats: an optional `Entropies` object kept for debug/threshold analysis
        """
        self.stats = stats
        self._adjmat = adjmat
        # numpy mode is detected from whichever of the two structures was provided
        self.is_nparray = isinstance((adjmat if adjmat is not None else parents), np.ndarray)
        if parents is not None:
            if not self.is_nparray:
                # create a df with two columns: indices and names
                parents = _add_names_to_parents_idx_series(parents)
            self._parents = parents
        else:
            self._parents = None

        # lazily-computed caches (see `roots_mask` and `roots_with_children_mask`)
        self._roots_mask = None
        self._roots_wc_mask = None
69
70 def assert_stats_kept(self):
71 if self.stats is None:
72 raise ValueError("`stats` are not available in this QDForest instance. If you created it using `qd_screen`,"
73 " you should use `keep_stats=True`.")
74
75 def assert_pandas_capable(self):
76 """Utility method to assert that "not self.is_nparray" """
77 if self.is_nparray:
78 raise ValueError("This QDForest instance was built with numpy arrays, it is not pandas compliant")
79
80 def assert_names_available(self):
81 """"""
82 if self.is_nparray:
83 raise ValueError("Variable names are not available")
84
85 def _validate_names_arg(self,
86 names):
87 """Utility method to validate and set default value of the `names` argument used in most methods"""
88 if names is None:
89 names = not self.is_nparray
90 if names:
91 self.assert_names_available()
92 return names
93
94 @property
95 def nb_vars(self):
96 return self._parents.shape[0] if self._parents is not None else self._adjmat.shape[0]
97
98 @property
99 def varnames(self):
100 self.assert_names_available()
101 return np.array(self._adjmat.columns) if self._adjmat is not None else np.array(self._parents.index)
102
103 def indices_to_mask(self,
104 indices,
105 in_names=None,
106 out_names=None,
107 ):
108 """Utility to convert a list of indices to a numpy or pandas mask"""
109 in_names = self._validate_names_arg(in_names)
110 out_names = self._validate_names_arg(out_names)
111 mask = np.zeros((self.nb_vars,), dtype=bool)
112 if out_names:
113 mask = pd.Series(mask, index=self.varnames, dtype=bool)
114 if in_names and not out_names:
115 # TODO
116 mask[self.varnames] = True
117 elif not in_names and out_names:
118 mask.iloc[indices] = True
119 else:
120 mask[indices] = True
121 return mask
122
123 def mask_to_indices(self, mask):
124 """Utili"""
125 if isinstance(mask, np.ndarray):
126 return np.where(mask)
127 else:
128 return mask[mask].index
129
130 @property
131 def adjmat_ar(self):
132 """The adjacency matrix as a 2D numpy array"""
133 return self.adjmat if self.is_nparray else self.adjmat.values
134
    @property
    def adjmat(self):
        """The adjacency matrix as a pandas DataFrame or a 2D numpy array.

        Lazily computed from the parents array on first access, then cached in `self._adjmat`.
        Cell (i, j) is True when variable i is the (quasi-deterministic) parent of variable j.
        """
        if self._adjmat is None:
            # compute adjmat from parents and cache it
            n = self.nb_vars
            adjmat = np.zeros((n, n), dtype=bool)
            # set all (parent, child) cells at once through flat indexing,
            # from https://stackoverflow.com/a/46018613/7262247
            index_array = get_arcs_from_parents_indices(self.parents_indices_ar, multiindex=True)
            flat_index_array = np.ravel_multi_index(index_array, adjmat.shape)
            np.ravel(adjmat)[flat_index_array] = True
            if self.is_nparray:
                self._adjmat = adjmat
            else:
                # preserve the variable names as row and column headers
                self._adjmat = pd.DataFrame(adjmat, index=self.varnames, columns=self.varnames)
        return self._adjmat
151
152 @property
153 def parents_indices_ar(self):
154 """A numpy array containing the indices of all parent nodes"""
155 return self.parents if self.is_nparray else self.parents['idx'].values
156
    @property
    def parents(self):
        """A numpy array containing the indices of all parent nodes, or a pandas DataFrame
        containing an 'idx' and a 'name' column (one row per variable, -1/None for roots).

        Lazily computed from the adjacency matrix on first access, then cached.
        """
        if self._parents is None:
            # compute parents from adjmat, whether a dataframe or an array
            # TODO maybe use a sparse array here?
            parents = to_forest_parents_indices(self._adjmat)
            if not self.is_nparray:
                # create a df with two columns: indices and names
                parents = _add_names_to_parents_idx_series(parents)
            self._parents = parents
        return self._parents
169
170 @property
171 def roots_mask_ar(self):
172 """A 1D numpy mask array indicating if a node is a root node"""
173 return self.roots_mask if self.is_nparray else self.roots_mask.values
174
    @property
    def roots_mask(self):
        """A pandas Series or a 1D numpy mask array indicating if a node is a root node.

        Lazily computed on first access then cached: a root is a node with no parent, i.e.
        a column with no True cell in the adjacency matrix, or a parent index < 0.
        """
        if self._roots_mask is None:
            if self._adjmat is not None:
                self._roots_mask = ~self.adjmat.any(axis=0)
            else:
                self._roots_mask = (self.parents if self.is_nparray else self.parents['idx']) < 0
                if not self.is_nparray:
                    # remove the name of the series
                    self._roots_mask.name = None
        return self._roots_mask
187
188 @property
189 def roots_ar(self):
190 """A 1D numpy array of root indices"""
191 return np.where(self.roots_mask_ar)[0]
192
193 @property
194 def roots(self):
195 """A pandas Series of root names or 1D numpy array of root indices"""
196 if self.is_nparray:
197 return self.roots_ar
198 else:
199 return self.roots_mask.index[self.roots_mask]
200
201 def get_roots(self,
202 names=None # type: bool
203 ):
204 """
205 Returns the list of root indices or root names
206
207 :param names: an optional boolean indicating if this method should return names instead of indices. By
208 default `names` is set to `not self.is_np`
209 :return:
210 """
211 names = self._validate_names_arg(names)
212 return list(self.roots) if names else list(self.roots_ar)
213
214 @property
215 def non_roots_ar(self):
216 """A 1D numpy array of non-root indices"""
217 return np.where(~self.roots_mask_ar)[0]
218
219 @property
220 def non_roots(self):
221 """A pandas Series of non-root names or 1D numpy array of non-root indices"""
222 if self.is_nparray:
223 return self.non_roots_ar
224 else:
225 return self.roots_mask.index[~self.roots_mask]
226
227 def get_non_roots(self,
228 names=None # type: bool
229 ):
230 """
231 Returns the list of non-root indices or names
232
233 :param names: an optional boolean indicating if this method should return names instead of indices. By
234 default `names` is set to `not self.is_np`
235 :return:
236 """
237 names = self._validate_names_arg(names)
238 return list(self.non_roots) if names else list(self.non_roots_ar)
239
240 @property
241 def nodes_with_children_mask_ar(self):
242 """A 1D numpy mask array indicating nodes that have at least 1 child"""
243 return self.nodes_with_children_mask if self.is_nparray else self.nodes_with_children_mask.values
244
    @property
    def nodes_with_children_mask(self):
        """A pandas Series or 1D numpy array indicating nodes that have at least 1 child.

        Note: contrary to `roots_mask` this result is recomputed on every access (not cached).
        """
        if self._adjmat is not None:
            # a node has children when its row contains at least one True cell
            return self._adjmat.any(axis=1)
        else:
            # the distinct non-negative parent indices are exactly the nodes that have children
            nwc_idx = np.unique(self.parents_indices_ar[self.parents_indices_ar >= 0])
            return self.indices_to_mask(nwc_idx)
253
254 @property
255 def roots_with_children_mask_ar(self):
256 """A 1D numpy mask array indicating root nodes that have at least 1 child"""
257 return self.roots_with_children_mask if self.is_nparray else self.roots_with_children_mask.values
258
259 @property
260 def roots_with_children_mask(self):
261 """A pandas Series or 1D numpy mask array indicating root nodes that have at least 1 child"""
262 if self._roots_wc_mask is None:
263 if self._adjmat is not None:
264 # a root with children = a root AND a node with children
265 self._roots_wc_mask = self._roots_mask & self.nodes_with_children_mask
266 else:
267 # a root with children = a parent of a level 1 child node
268 level_1_nodes_mask = ((self.parents_indices_ar >= 0)
269 & (self.parents_indices_ar[self.parents_indices_ar] == -1))
270 rwc_indices = np.unique(self.parents_indices_ar[level_1_nodes_mask])
271 self._roots_wc_mask = self.indices_to_mask(rwc_indices, in_names=False, out_names=not self.is_nparray)
272 return self._roots_wc_mask
273
274 @property
275 def roots_with_children_ar(self):
276 """A 1D numpy array with the indices of root nodes that have at least 1 child"""
277 return np.where(self.roots_with_children_mask_ar)[0]
278
279 @property
280 def roots_with_children(self):
281 """A pandas Series or 1D numpy array with names/indices of root nodes that have at least 1 child"""
282 if self.is_nparray:
283 return self.roots_with_children_ar
284 else:
285 return self.roots_with_children_mask.index[self.roots_with_children_mask]
286
287 def get_roots_with_children(self,
288 names=None # type: bool
289 ):
290 """
291 Returns the list of root with children
292
293 :param names: an optional boolean indicating if this method should return names instead of indices. By
294 default `names` is set to `not self.is_np`
295 :return:
296 """
297 names = self._validate_names_arg(names)
298 return list(self.roots_with_children) if names else list(self.roots_with_children_ar)
299
    def get_children(self,
                     parent_idx=None,  # type: int
                     parent_name=None,  # type: str
                     names=None  # type: bool
                     ):
        """
        Returns the list of children of a node

        :param parent_idx: the index of the node to query for
        :param parent_name: the name of the node to query for
        :param names: a boolean to indicate if the returned children should be identified by their names (`True`)
            or indices (`False`). By default if the parent is identified with `parent_idx` indices will be
            returned (`False`), and if the parent is identified with `parent_name` names will be returned (`True`)
        :return: an array containing indices of the child nodes
        """

        # resolve which identifier was provided and default `names` accordingly
        if parent_idx is not None:
            if parent_name is not None:
                raise ValueError("Only one of `parent_idx` and `parent_name` should be provided")
            node = parent_idx
            if names is None:
                names = False
        elif parent_name is not None:
            node = parent_name
            if names is None:
                names = True
            self.assert_names_available()
        else:
            raise ValueError("You should provide a non-None `parent_idx` or `parent_name`")

        if self.is_nparray:
            return np.where(self.parents == node)[0]
        else:
            # query the column matching the kind of identifier that was provided
            qcol = 'idx' if parent_idx is not None else 'name'
            if not names:
                return np.where(self.parents[qcol] == node)[0]
            else:
                return self.parents[self.parents[qcol] == node].index.values
338
    def get_arcs(self,
                 multiindex=False,  # type: bool
                 names=None  # type: bool
                 ):
        # type: (...) -> Union[Iterable[Tuple[int, int]], Iterable[Tuple[str, str]], Tuple[Iterable[int], Iterable[int]], Tuple[Iterable[str], Iterable[str]]]
        """
        Return the arcs of an adjacency matrix, an iterable of (parent, child) indices or names

        If 'multiindex' is True instead of returning an iterable of (parent, child), it returns a tuple of iterables
        (all the parents, all the children).

        :param multiindex: if this is True, a 2-tuple of iterable is returned instead of an iterable of 2-tuples
        :param names: an optional boolean indicating if this method should return names instead of indices. By
            default `names` is set to `not self.is_np`
        :return: the arcs, as an iterable of pairs or a pair of iterables depending on `multiindex`
        """
        names = self._validate_names_arg(names)
        if self._adjmat is not None:
            return get_arcs_from_adjmat(self._adjmat, multiindex=multiindex, names=names)
        else:
            # NOTE(review): the full `_parents` structure (a DataFrame in pandas mode) is passed here,
            # while `adjmat` passes `parents_indices_ar` — confirm the helper handles both forms
            return get_arcs_from_parents_indices(self._parents, multiindex=multiindex, names=names)
361
362 def walk_arcs(self,
363 parent_idx=None, # type: int
364 parent_name=None, # type: str
365 names=None # type: bool
366 ):
367 """
368 Yields a sequence of (parent, child) indices or names. As opposed to `get_arcs` the sequence follows the tree
369 order: starting from the list of root nodes, for every node, it is returned first and then all of its children.
370 (depth first, not breadth first)
371
372 :param parent_idx: the optional index of a node. If provided, only the subtree behind this node will be walked
373 :param parent_name: the optional name of a node. If provided, only the subtree behind this node will be walked
374 :param names: an optional boolean indicating if this method should return names instead of indices. By
375 default `names` is set to `not self.is_np`
376 :return: yields a sequence of (level, i, j)
377 """
378 names = self._validate_names_arg(names)
379 if names:
-
E731
Do not assign a lambda expression, use a def
380 get_children = lambda node: self.get_children(parent_name=node)
381 else:
-
E731
Do not assign a lambda expression, use a def
382 get_children = lambda node: self.get_children(parent_idx=node)
383
384 def _walk(from_node, level):
385 for child in get_children(from_node):
386 yield level, from_node, child
-
E741
Ambiguous variable name 'l'
387 for l, i, j in _walk(child, level+1):
388 yield l, i, j
389
390 if parent_idx is not None:
391 if parent_name is not None:
392 raise ValueError("Only one of `parent_idx` and `parent_name` should be provided")
393 root_nodes = (parent_idx,) if not names else (self.parents.index[parent_idx],)
394 elif parent_name is not None:
395 root_nodes = (parent_name,)
396 else:
397 root_nodes = (self.roots if names else self.roots_ar)
398
399 for n in root_nodes:
400 # walk the subtree
401 for level, i, j in _walk(n, level=0):
402 yield level, i, j
403
404 # ------- display methods
405
    def to_str(self,
               names=None,  # type: bool
               mode="headers"  # type: str
               ):
        """
        Provides a string representation of this forest.

        :param names: an optional boolean indicating if this method should return names instead of indices. By
            default `names` is set to `not self.is_np`
        :param mode: one of "compact", "headers", and "full" (displays the trees)
        :return: a string representation of this forest
        """
        names = self._validate_names_arg(names)
        nb_vars = self.nb_vars
        roots = self.get_roots(names)
        non_roots = self.get_non_roots(names)
        nb_roots = len(roots)
        nb_arcs = nb_vars - nb_roots

        roots_with_children = self.get_roots_with_children(names)
        nb_roots_with_children = len(roots_with_children)

        # number of roots that have no children at all
        nb_sole_roots = (nb_roots - nb_roots_with_children)

        if mode == "compact":
            return "QDForest (%s vars = %s roots + %s determined by %s of the roots)" \
                   % (nb_vars, nb_roots, nb_arcs, nb_roots_with_children)

        # list of root node indices/names with a star when they have children
        roots_str = [("%s*" if r in roots_with_children else "%s") % r for r in roots]
        # list of non-root node indices/names
        non_roots_str = ["%s" % r for r in non_roots]

        headers = "\n".join((
            "QDForest (%s vars):" % nb_vars,
            " - %s roots (%s+%s*): %s" % (nb_roots, nb_sole_roots, nb_roots_with_children, ", ".join(roots_str)),
            " - %s other nodes: %s" % (nb_arcs, ", ".join(non_roots_str)),
        ))
        if mode == "headers":
            return headers
        elif mode == "full":
            tree_str = "\n".join(self.get_trees_str_list())
            return "%s\n\n%s" % (headers, tree_str)
        else:
            raise ValueError("Unknown mode: %r" % mode)
451
452 @encode_if_py2
453 def __str__(self):
454 """ String representation, listing all useful information when available """
455 if self.nb_vars > 30:
456 return self.to_str(mode="headers")
457 else:
458 return self.to_str(mode="full")
459
    @encode_if_py2
    def __repr__(self):
        """ Compact single-line representation (see `to_str` with mode="compact") """
        # return str(self)  # note if we use this then we'll have to comment the decorator
        return self.to_str(mode="compact")
464
465 def print_arcs(self,
466 names=None # type: bool
467 ):
468 """ Prints the various deterministic arcs in this forest """
469 print("\n".join(self.get_arcs_str_list(names=names)))
470
471 def get_arcs_str_list(self,
472 names=None # type: bool
473 ):
474 """ Returns a list of string representation of the various deterministic arcs in this forest """
475 res_str_list = []
476 for parent, child in self.get_arcs(names=names):
477 res_str_list.append("%s -> %s" % (parent, child))
478 return res_str_list
479
480 def get_trees_str_list(self,
481 all=False,
482 names=None # type: bool
483 ):
484 """
485 Returns a list of string representation of the various trees in this forest
486 TODO maybe use https://github.com/caesar0301/treelib ? Or networkX ?
487
488 :param all: if True, this will return also the trees that are consistuted of one node
489 :param names:
490 :return:
491 """
492 names = self._validate_names_arg(names)
493 res_str_list = []
494 if all:
495 roots = self.get_roots(names)
496 else:
497 roots = self.get_roots_with_children(names)
498
499 if names:
-
E731
Do not assign a lambda expression, use a def
500 walk_arcs = lambda n: self.walk_arcs(parent_name=n)
501 else:
-
E731
Do not assign a lambda expression, use a def
502 walk_arcs = lambda n: self.walk_arcs(parent_idx=n)
503
504 for r in roots:
505 subtree_str = "%s" % r
506 for level, _, j in walk_arcs(r):
507 subtree_str += "\n%s└─ %s" % (" " * level, j)
508 res_str_list.append(subtree_str + "\n")
509 return res_str_list
510
511 # -----------
512
    def fit_selector_model(self,
                           X  # type: Union[np.ndarray, pd.DataFrame]
                           ):
        # type: (...) -> QDSelectorModel
        """
        Returns a new instance of `QDSelectorModel` fit with the data in `X`

        :param X: the dataset (numpy array or pandas DataFrame) to fit the selector model on
        :return: a fitted `QDSelectorModel` built around this forest
        """
        # local import — presumably to avoid a cyclic import with `.selector`; TODO confirm
        from .selector import QDSelectorModel
        model = QDSelectorModel(self)
        model.fit(X)
        return model
527
528 # --------- tools for entropies analysis to find interesting thresholds
529
    def get_entropies_table(self,
                            from_to=True,  # type: bool
                            sort_by="from",  # type: str
                            drop_self_link=True,  # type: bool
                            ):
        """ See `Entropies.get_entropies_table`. Requires `keep_stats=True` at creation time. """

        self.assert_stats_kept()
        return self.stats.get_entropies_table(from_to=from_to, sort_by=sort_by, drop_self_link=drop_self_link)
539
    def plot_increasing_entropies(self):
        """ See `Entropies.plot_increasing_entropies`. Requires `keep_stats=True` at creation time. """

        self.assert_stats_kept()
        self.stats.plot_increasing_entropies()
545
546
def assert_df_or_2D_array(df_or_array  # type: Union[pd.DataFrame, np.ndarray]
                          ):
    """
    Validates that `df_or_array` is a pandas DataFrame or a 2D numpy array.

    :param df_or_array: the object to validate
    :return: None
    :raises ValueError: if a numpy array is provided but it is not 2-dimensional
    :raises TypeError: if the object is neither a `pd.DataFrame` nor a `np.ndarray`
    """
    if isinstance(df_or_array, pd.DataFrame):
        return
    if isinstance(df_or_array, np.ndarray):
        # see https://numpy.org/doc/stable/user/basics.rec.html#manipulating-and-displaying-structured-datatypes
        nb_dims = len(df_or_array.shape)
        if nb_dims != 2:
            raise ValueError("Provided data is not a 2D array, the number of dimensions is %s" % nb_dims)
        return
    # neither of the two supported types
    raise TypeError("Provided data is neither a `pd.DataFrame` nor a `np.ndarray`")
564
565
def qd_screen(X,  # type: Union[pd.DataFrame, np.ndarray]
              absolute_eps=None,  # type: float
              relative_eps=None,  # type: float
              keep_stats=False,  # type: bool
              non_categorical_mode='strict',
              ):
    # type: (...) -> QDForest
    """
    Finds the (quasi-)deterministic relationships (functional dependencies) between the variables in `X`, and returns
    a `QDForest` object representing the forest of (quasi-)deterministic trees. This object can then be used to fit a
    feature selection model or to learn a Bayesian Network structure.

    By default only deterministic relationships are detected. Quasi-determinism can be enabled by setting either
    a threshold on conditional entropies (`absolute_eps`) or on relative conditional entropies (`relative_eps`). Only
    one of them should be set.

    By default (`keep_stats=False`) the entropies tables are not preserved once the forest has been created. If you wish
    to keep them available, set `keep_stats=True`. The entropies tables are then available in the `<QDForest>.stats`
    attribute, and threshold analysis methods such as `<QDForest>.get_entropies_table(...)` and
    `<QDForest>.plot_increasing_entropies()` become available.

    :param X: the dataset as a pandas DataFrame or a numpy array. Columns represent the features to compare.
    :param absolute_eps: Absolute entropy threshold. Any feature `Y` that can be predicted from another feature `X` in
        a quasi-deterministic way, that is, where conditional entropy `H(Y|X) <= absolute_eps`, will be removed. The
        default value is `0` and corresponds to removing deterministic relationships only.
    :param relative_eps: Relative entropy threshold. Any feature `Y` that can be predicted from another feature `X` in
        a quasi-deterministic way, that is, where relative conditional entropy `H(Y|X)/H(Y) <= relative_eps` (a value
        between `0` and `1`), will be removed. Only one of `absolute_eps` or `relative_eps` should be provided.
    :param keep_stats: a boolean indicating if the various entropies tables computed in the process should be kept in
        memory in the resulting forest object (`<QDForest>.stats`), for further analysis. By default this is `False`.
    :param non_categorical_mode: how non-categorical features are handled; forwarded to `get_categorical_features`.
    :return: a `QDForest`
    """
    # Make sure this is a 2D table
    assert_df_or_2D_array(X)

    # Sanity check: are there rows in here ?
    if len(X) == 0:
        raise ValueError("Provided dataset does not contain any row")

    # Only work on the categorical features
    X = get_categorical_features(X, non_categorical_mode=non_categorical_mode)

    # Sanity check concerning the number of columns
    # (S101 fix: an explicit raise survives `python -O`, contrary to an `assert` statement)
    if X.shape[1] == 0:
        raise AssertionError("Internal error: no columns remain in dataset after preprocessing.")

    # parameters check and defaults
    if absolute_eps is None:
        if relative_eps is None:
            # nothing is provided, use absolute threshold 0
            absolute_eps = 0
            is_absolute = True
        else:
            # relative threshold provided
            is_absolute = False
    else:
        if relative_eps is not None:
            raise ValueError("only one of absolute and relative should be passed")
        # absolute threshold provided
        is_absolute = True

    is_strict = (absolute_eps == 0.) if is_absolute else (relative_eps == 0.)

    # sanity check
    if is_absolute and absolute_eps < 0:
        raise ValueError("epsilon_absolute should be positive")
    elif not is_absolute and (relative_eps < 0 or relative_eps > 1):
        raise ValueError("epsilon_relative should be 0=<eps=<1")

    # (0) compute conditional entropies or relative conditional entropies
    A_orig, X_stats = get_adjacency_matrix(X, eps_absolute=absolute_eps, eps_relative=relative_eps)

    # (2) identify redundancy
    # (E501 fix: the selection order is computed on its own line)
    selection_order = None if is_strict else X_stats.list_vars_by_entropy_order(desc=True)
    A_noredundancy = remove_redundancies(A_orig, selection_order=selection_order)

    # (3) transform into forest: remove extra parents by keeping only the parent with lowest entropy / nb levels ?
    # if X -> Z and X -> Y and Y -> Z then Z has two parents X and Y but only Y should be kept
    # Determinist case: minimize the number of parameters: take the minimal nb of levels
    # Quasi-determinist case: take the lowest entropy
    entropy_order_inc = (X_stats.H_ar).argsort()
    # parent_order = compute_nb_levels(df) if is_strict else entropy_based_order
    # A_forest = to_forest_adjmat(A_noredundancy, entropy_order_inc)
    # qd_forest = QDForest(adjmat=A_forest)
    parents = to_forest_parents_indices(A_noredundancy, entropy_order_inc)
    # if not X_stats.is_nparray:
    #     parents = pd.Series(parents, index=A_noredundancy.columns)
    qd_forest = QDForest(parents=parents, stats=X_stats if keep_stats else None)

    # (ported R code kept for reference; long lines wrapped to satisfy E501)
    # forestAdjMatList <- FromHToDeterForestAdjMat(H = H, criterion = criterionNlevels,
    #                                              removePerfectMatchingCol = removePerfectMatchingCol)

    # (4) convert to pgmpy format
    # deterForest <- FromAdjMatToBnLearn(adjMat = forestAdjMat)
    # rootsF <- Roots(forestAdjMat)

    # if(length(rootsFNames) == 1){
    #     print("Only one tree in the forest !!! No root graph computed")
    #     print("****************")
    #     print("Training - Phase 2 and Phase 3 obsolete")
    #     print("****************")
    #     gstar <- deterForest
    #     G_R <- NULL
    #     print("Final graph computed")
    # }else{
    #     print(paste("Multiple trees in the forest (", length(rootsFNames),
    #                 ") Root graph will be computed.", sep = "", collapse = ""))
    #     rootsOnly <- ReduceDataAndH(variablesToKeep = rootsFNames, dataFrame = data, H)
    #     dataRoots <- rootsOnly[[1]]
    #     HRoots <- rootsOnly[[2]]
    #     rm(rootsOnly)
    #     print("****************")
    #     print("Training - Phase 2: computation of the root graph")
    #     print("****************")
    #     # removePerfectlyMatchingCol useless here because either pmc are in the same trees
    #     # (but only one is the root), either they were deleted earlier on
    #     G_R <- SotABNsl(data = dataRoots, method = method, score = score, hyperparamList,
    #                     removePerfectMatchingCol = removePerfectMatchingCol, ...)
    #     edgeListG_R <- FromBnToEdgeList(bn = G_R)
    #     colnames(edgeListG_R) <- c('from', 'to')
    #     print("****************")
    #     print("Training - Phase 3: fusion of the deter forest with the root graph")
    #     print("****************")
    #     gstarAdjMat <- AddEdgesFromEdgeList(originalAdjMatrix = forestAdjMat, edgeListToAdd = edgeListG_R)
    #     gstar <- FromAdjMatToBnLearn(gstarAdjMat)
    #     print("Final graph computed")
    # }

    return qd_forest
689
690
class Entropies(object):
    """ Lazily computes and caches the entropies and (relative) conditional entropies of a dataset.

    TODO this could be easier to read with pyfields and default value factories
    """

    # dataset: the raw input; is_nparray: numpy vs pandas mode; _H/_Hcond/_Hcond_rel: lazy caches;
    # _dataset_df: cached pandas view of a numpy dataset
    __slots__ = ('dataset', 'is_nparray', '_H', '_Hcond', '_Hcond_rel', '_dataset_df')
695
    def __init__(self, df_or_array):
        """
        Creates a lazy statistics holder for the given dataset.

        :param df_or_array: a pandas dataframe or numpy array where the variables are the columns
        """
        self.dataset = df_or_array
        self.is_nparray = isinstance(df_or_array, np.ndarray)
        # lazily-computed caches, filled on first access of the corresponding properties
        self._H = None          # type: Union[np.ndarray, pd.Series]
        self._Hcond = None      # type: Union[np.ndarray, pd.DataFrame]
        self._Hcond_rel = None  # type: Union[np.ndarray, pd.DataFrame]
        self._dataset_df = None  # type: pd.DataFrame
707
708 def __str__(self):
709 return repr(self)
710
    def __repr__(self):
        """Detailed multi-line representation showing the dataset head and all entropy tables."""
        # NOTE(review): `self.dataset.head(2)` assumes a pandas dataset — confirm numpy mode is handled by callers
        res = """Statistics computed for dataset:
%s
...(%s rows)

Entropies (H):
%s

Conditional entropies (Hcond = H(row|col)):
%s

Relative conditional entropies (Hcond_rel = H(row|col)/H(row)):
%s
""" % (self.dataset.head(2), len(self.dataset), self.H.T, self.Hcond, self.Hcond_rel)
        return res
726
727 @property
728 def dataset_df(self):
729 """The dataset as a pandas DataFrame, if pandas is available"""
730 if self.is_nparray:
731 # see https://numpy.org/doc/stable/user/basics.rec.html#manipulating-and-displaying-structured-datatypes
732 if self.dataset.dtype.names is not None:
733 # structured array
734 self._dataset_df = pd.DataFrame(self.dataset)
735 else:
736 # unstructured array... same ?
737 self._dataset_df = pd.DataFrame(self.dataset)
738 return self._dataset_df
739 else:
740 return self.dataset
741
742 @property
743 def nb_vars(self):
744 return self.dataset.shape[1]
745
746 @property
747 def varnames(self):
748 if self.is_nparray:
749 raise ValueError("Variable names not available")
750 return list(self.dataset.columns)
751
752 @property
753 def H_ar(self):
754 """ The entropy matrix (i, j) = H(Xi | Xj) as a numpy array """
755 return self.H if self.is_nparray else self.H.values
756
757 entropies_ar = H_ar
758 """An alias for H_ar"""
759
760 @property
761 def H(self):
762 """The entropies of all variables. a pandas Series if df was a pandas dataframe, else a 1D numpy array"""
763 if self._H is None:
764 # Using pyitlib to compute H (hopefully efficiently)
765 # Unfortunately this does not work with numpy arrays, convert to pandas TODO report
766 # note: we convert to string type to avoid a bug with ints. TODO...
767 self._H = drv.entropy(self.dataset_df.T.astype(str))
768 if not self.is_nparray:
769 self._H = pd.Series(self._H, index=self.varnames)
770
771 # basic sanity check: should all be positive
-
S101
Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.
772 assert np.all(self._H >= 0)
773 return self._H
774
775 entropies = H
776 """An alias for H"""
777
778 @property
779 def Hcond_ar(self):
780 """ The conditional entropy matrix (i, j) = H(Xi | Xj) as a numpy array """
781 return self.Hcond if self.is_nparray else self.Hcond.values
782
783 conditional_entropies_ar = Hcond_ar
784 """An alias for Hcond_ar"""
785
786 @property
787 def Hcond(self):
788 """
789 The conditional entropy matrix (i, j) = H(Xi | Xj).
790 A pandas Dataframe or a 2D numpy array depending on dataset type
791 """
792 if self._Hcond is None:
793 # Old attempt to do it ourselves
794 # (0) init H
795 # nb_vars = len(df.columns)
796 # H = np.empty((nb_vars, nb_vars), dtype=float)
797 # (1) for each column compute the counts per value
798 # (2) for each (i, j) pair compute the counts
799
800 # Using pyitlib to compute H (hopefully efficiently)
801 # Unfortunately this does not work with numpy arrays, convert to pandas TODO report
802 # note: we convert to string type to avoid a bug with ints. TODO...
803 self._Hcond = drv.entropy_conditional(self.dataset_df.T.astype(str))
804
805 # add the row/column headers
806 if not self.is_nparray:
807 self._Hcond = pd.DataFrame(self._Hcond, index=self.varnames, columns=self.varnames)
808
809 # basic sanity check: should all be positive
-
S101
Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.
810 assert np.all(self._Hcond >= 0)
811 return self._Hcond
812
813 conditional_entropies = Hcond
814 """An alias for Hcond"""
815
816 @property
817 def Hcond_rel_ar(self):
818 """ The relative conditional entropy matrix (i, j) = H(Xi | Xj) / H(Xi) as a numpy array """
819 return self.Hcond_rel if self.is_nparray else self.Hcond_rel.values
820
821 relative_conditional_entropies_ar = Hcond_rel_ar
822 """An alias for Hcond_rel_ar"""
823
824 @property
825 def Hcond_rel(self):
826 """
827 The relative conditional entropy matrix (i, j) = H(Xi | Xj) / H(Xi).
828 """
829 if self._Hcond_rel is None:
830 # compute relative entropy: in each cell (X, Y) we want H(X|Y)/H(X)
831 if self.is_nparray:
832 self._Hcond_rel = (self.Hcond.T / self.H).T
833 else:
834 Hrel_array = (self.Hcond.values.T / self.H.values).T
-
E501
Line too long (124 > 120 characters)
835 self._Hcond_rel = pd.DataFrame(Hrel_array, index=list(self.Hcond.columns), columns=list(self.Hcond.columns))
836
837 # basic sanity check
-
S101
Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.
838 assert np.all(self._Hcond_rel >= 0.)
-
S101
Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.
839 assert np.all(self._Hcond_rel <= 1.)
840 return self._Hcond_rel
841
842 relative_conditional_entropies = Hcond_rel
843 """An alias for Hcond_rel"""
844
845 def list_vars_by_entropy_order(self, desc=False):
846 """
847 Returns the indices or names of variables in ascending (resp. descending) order of entropy
848
849 :param desc: if True the order is descending, else ascending
850 :return:
851 """
852 if self.is_nparray:
853 return (-self.H).argsort() if desc else self.H.argsort()
854 else:
855 return (-self.H.values).argsort() if desc else self.H.values.argsort()
856
857 # --------- tools for entropies analysis to find interesting thresholds
858
859 def get_entropies_table(self,
860 from_to=True, # type: bool
861 sort_by="from", # type: str
862 drop_self_link=True, # type: bool
863 ):
864 """
865 Returns a pandas series or numpy array with four columns: from, to, cond_entropy, rel_cond_entropy.
866 The index is 'arc', a string representing the arc e.g. "X->Y".
867
868 :param from_to: a boolean flag indicating if 'from' and 'to' columns should remain in the returned dataframe
869 (True, default) or be dropped (False)
870 :param sort_by: a string indicating if the arcs should be sorted by origin ("from", default) or destination
871 ("to"), or by value "rel_cond_entropy" or "cond_entropy", in the resulting table.
872 :param drop_self_link: by default node-to-self arcs are not returned in the list. Turn this to False to include
873 them.
874 :return:
875 """
876 if self.is_nparray:
877 raise NotImplementedError("TODO")
878 else:
879 # 1. gather the data and unpivot it so that there is one row per arc
880 res_df = pd.DataFrame({
881 'cond_entropy': self.Hcond.unstack(),
882 'rel_cond_entropy': self.Hcond_rel.unstack(),
883 })
884 res_df.index.names = ['from', 'to']
885 res_df = res_df.reset_index()
886
887 # 2. filter out node-to-self if needed
888 if drop_self_link:
889 res_df = res_df.loc[res_df['from'] != res_df['to']].copy()
890
891 # 3. create the arc names column and make it the index
892 def make_arc_str(row):
893 return "%s->%s" % (row['from'], row.to) # note that .from is a reserved python symbol !
894 res_df['arc'] = res_df.apply(make_arc_str, axis=1)
895 res_df.set_index('arc', inplace=True)
896
897 # 4. Optionally sort differently
898 all_possibilities = ("from", "to", "cond_entropy", "rel_cond_entropy")
899 if sort_by == all_possibilities[0]:
900 pass # already done
901 elif sort_by in all_possibilities[1:]:
902 res_df.sort_values(by=sort_by, inplace=True)
903 else:
904 raise ValueError("Invalid value for `sort_by`: %r. Possible values: %r" % (sort_by, all_possibilities))
905
906 # 5. Optionally drop the from and to columns
907 if not from_to:
908 res_df.drop(['from', 'to'], axis=1, inplace=True)
909
910 return res_df
911
912 def plot_increasing_entropies(self):
913 """
914
915 :return:
916 """
917 import matplotlib.pyplot as plt
918
919 # get the dataframe
920 df = self.get_entropies_table(sort_by="rel_cond_entropy")
921
922 # plot with all ticks
923 df_sorted = df["rel_cond_entropy"] # .sort_values()
924 df_sorted.plot(title="Relative conditional entropy H(X|Y)/H(X) for each arc X->Y, by increasing order",
925 figsize=(15, 10))
926 plt.xticks(np.arange(len(df_sorted)), df_sorted.index, rotation=90)
927
928
def get_adjacency_matrix(df,                 # type: Union[np.ndarray, pd.DataFrame]
                         eps_absolute=None,  # type: float
                         eps_relative=None   # type: float
                         ):
    """
    Computes the quasi-determinism adjacency matrix of `df` by thresholding conditional entropies.

    :param df: the dataset, a pandas DataFrame or a numpy array
    :param eps_absolute: absolute threshold on the conditional entropy H(X|Y). Used when `eps_relative` is not
        provided; defaults to 0 in that case. Mutually exclusive with `eps_relative`.
    :param eps_relative: relative threshold on H(X|Y)/H(X). Mutually exclusive with `eps_absolute`.
    :return: a tuple (A, df_stats) where A is the boolean adjacency matrix (same container type as the dataset)
        and df_stats is the entropies object computed from `df`
    """
    df_stats = Entropies(df)

    # (1) threshold the (transposed) conditional entropy matrix, absolute or relative flavor
    if eps_relative is not None:
        if eps_absolute is not None:
            raise ValueError("Only one of `eps_absolute` and `eps_relative` should be provided")
        # threshold on H(X|Y)/H(X)
        edges = (df_stats.Hcond_rel_ar <= eps_relative).T
    else:
        # default value for eps absolute
        if eps_absolute is None:
            eps_absolute = 0
        # threshold on H(X|Y)
        edges = (df_stats.Hcond_ar <= eps_absolute).T

    # (2) no self-loops: the diagonal should be false
    np.fill_diagonal(edges, False)

    # (3) wrap with variable names when the dataset was a pandas one
    if df_stats.is_nparray:
        A = edges
    else:
        A = pd.DataFrame(edges, index=df_stats.varnames, columns=df_stats.varnames)

    return A, df_stats
967
968
def remove_redundancies(A,                    # type: Union[np.ndarray, pd.DataFrame]
                        selection_order=None  # type: np.ndarray
                        ):
    """Cleans the arcs in A between redundant variables.

    A should be a dataframe or numpy array where index = columns and indicate the node name.
    It should contain boolean values where True at (i,j) means there is a directed arc i -> j

    When there are redundant variables (arcs between i->j and j->i), only a single node is kept as the parent
    of all other redundant nodes.

    :param A: an adjacency matrix where A(i, j) indicates that there is an arc i -> j
    :param selection_order: if none (default) the first node in the list will be kept as representant for each
        redundancy class. Otherwise an alternate order (array of indices) can be provided.
    :return: a cleaned copy of A (same container type as the input)
    """
    assert_adjacency_matrix(A)

    # work on a copy so that the caller's matrix is never modified
    is_nparray = isinstance(A, np.ndarray)
    if is_nparray:
        A = A.copy()
        A_df = None
    else:
        A_df = A.copy(deep=True)
        A = A_df.values

    # init
    n_vars = A.shape[0]
    if selection_order is None:
        selection_order = range(n_vars)

    # `pending` flags the variables not yet assigned to a redundancy class
    # (renamed from `I`, which was ambiguous - lint E741)
    pending = np.ones((n_vars,), dtype=bool)

    # for each node, in the requested priority order
    for i in selection_order:
        # if we did not yet take it into account
        if pending[i]:
            # find redundant nodes with this node (= there is an arc in both directions)
            mask_redundant_with_i = A[i, :] * A[:, i]

            # we will stop caring about this redundancy class
            pending[mask_redundant_with_i] = False

            # we do not care about i-to-i
            mask_redundant_with_i[i] = False

            # if there is at least one redundant node
            if any(mask_redundant_with_i):
                redundant_nodes_idx = np.where(mask_redundant_with_i)[0]

                # let only the first variable in the order (i) be the parent:
                # i becomes the representant of the redundancy class
                A[redundant_nodes_idx, i] = False

                # remove ALL arcs between the (non-representant) redundant variables themselves.
                # BUGFIX: the previous `A[idx, idx] = False` paired fancy indexing only cleared the
                # diagonal cells (k, k), so for redundancy classes of size >= 3 the arcs between
                # non-representant members were left in place. `np.ix_` clears the whole sub-matrix.
                # Note that this destroys the diagonal of that sub-matrix, as before.
                A[np.ix_(redundant_nodes_idx, redundant_nodes_idx)] = False

    # restore the diagonal: no
    # np.fill_diagonal(A, True)
    if is_nparray:
        return A
    else:
        return A_df
1035
1036
def to_forest_parents_indices(A,                    # type: Union[np.ndarray, pd.DataFrame]
                              selection_order=None  # type: np.ndarray
                              ):
    # type: (...) -> Union[np.ndarray, pd.DataFrame]
    """ Removes extra arcs in the adjacency matrix A by only keeping the first parent in the given order

    Returns a 1D array of parent index or -1 if root

    :param A: an adjacency matrix, as a dataframe or numpy array
    :param selection_order: an optional order for parent selection. If not provided, the first in the list of columns
        of A will be used
    :return:
    """
    assert_adjacency_matrix(A)
    is_np_array = isinstance(A, np.ndarray)

    # reorder the rows (candidate parents) according to the selection order, if any
    # From https://stackoverflow.com/a/47269413/7262247
    if is_np_array:
        mask = A if selection_order is None else A[selection_order, :]
    else:
        mask = A.values if selection_order is None else A.iloc[selection_order, :].values

    # for each column (feature): the first True row is its parent
    first_parent = mask.argmax(axis=0)
    if selection_order is not None:
        # translate back into original row indices
        first_parent = selection_order[first_parent]

    # features whose column has no True at all are roots: use -1 there
    indices = np.where(mask.any(axis=0), first_parent, -1)

    if not is_np_array:
        indices = pd.Series(indices, index=A.columns)
    return indices
1066
1067
def to_forest_adjmat(A,                # type: Union[np.ndarray, pd.DataFrame]
                     selection_order,  # type: np.ndarray
                     inplace=False     # type: bool
                     ):
    # type: (...) -> Union[np.ndarray, pd.DataFrame]
    """ Removes extra arcs in the adjacency matrix A by only keeping the first parent in the given order

    :param A: an adjacency matrix, as a dataframe or numpy array
    :param selection_order: the order (array of row indices) in which candidate parents are considered
    :param inplace: if False (default) a cleaned copy is returned; if True, A is modified and nothing is returned
    :return:
    """
    is_ndarray = isinstance(A, np.ndarray)
    assert_adjacency_matrix(A)

    if not inplace:
        A = A.copy()

    # Scanning the rows in `selection_order`: once the cumulative number of parents of a column
    # exceeds 1, every further parent of that column has to be removed.
    if is_ndarray:
        mask_parents_over_the_first = A[selection_order, :].cumsum(axis=0) > 1
        # put the mask rows back in the original row order before applying
        mask_parents_over_the_first = mask_parents_over_the_first[selection_order.argsort(), :]
        A[mask_parents_over_the_first] = False
    else:
        # From https://stackoverflow.com/a/47269413/7262247
        mask_parents_over_the_first = A.iloc[selection_order, :].cumsum(axis=0) > 1
        A[mask_parents_over_the_first] = False

    if not inplace:
        return A
1105
1106
def get_arcs_from_parents_indices(parents,           # type: Union[np.ndarray, pd.DataFrame]
                                  multiindex=False,  # type: bool
                                  names=False        # type: bool
                                  ):
    # type: (...) -> Union[Iterable[Tuple[int, int]], Iterable[Tuple[str, str]], Tuple[Iterable[int], Iterable[int]], Tuple[Iterable[str], Iterable[str]]]
    """
    Returns the arcs (parent, child) described by a parents structure.

    if multiindex = False ; returns a sequence of pairs : (9, 1), (3, 5), (9, 7)
    if multiindex = True ; returns two sequences of indices: (9, 3, 9), (1, 5, 7)

    :param parents: a 1D numpy array of parent indices (-1 for roots), or a dataframe with 'idx' and 'name'
        columns as in the QDForest class
    :param multiindex: if True, a 2-tuple of sequences (parents, children) is returned instead of pairs
    :param names: if True, variable names are returned instead of indices (pandas input only)
    :return:
    """
    is_np_array = isinstance(parents, np.ndarray)
    if names:
        if is_np_array:
            raise ValueError("Names are not available, this is a numpy array")
        # keep only the non-root rows; their 'name' column holds the parent's name
        parent_names = parents.loc[parents['name'].notna(), 'name']
        seqs = parent_names.values, parent_names.index
    else:
        if not is_np_array:
            # assume a dataframe with an 'idx' column as in QDForest class
            parents = parents['idx'].values
        # children are the entries with a valid (>= 0) parent index
        has_parent = parents >= 0
        seqs = parents[has_parent], np.arange(len(parents))[has_parent]

    return seqs if multiindex else zip(*seqs)
1139
1140
def get_arcs_from_adjmat(A,                 # type: Union[np.ndarray, pd.DataFrame]
                         multiindex=False,  # type: bool
                         names=False        # type: bool
                         ):
    # type: (...) -> Union[Iterable[Tuple[int, int]], Iterable[Tuple[str, str]], Tuple[Iterable[int], Iterable[int]], Tuple[Iterable[str], Iterable[str]]]
    """
    Return the arcs of an adjacency matrix, an iterable of (parent, child) indices or names

    If 'multiindex' is True instead of returning an iterable of (parent, child), it returns a tuple of iterables
    (all the parents, all the childs).

    :param A:
    :param multiindex: if this is True, a 2-tuple of iterable is returned instead of an iterable of 2-tuples
    :param names: if False, indices are returned. Otherwise feature names are returned if any
    :return:
    """
    if not names:
        arc_indices = np.where(A)
        return arc_indices if multiindex else zip(*arc_indices)

    if isinstance(A, np.ndarray):
        raise ValueError("Names are not available, this is a numpy array")

    # translate row/column indices into column names
    cols = A.columns
    arc_indices = np.where(A)
    if multiindex:
        return ((cols[i] for i in l) for l in arc_indices)  # noqa
    return ((cols[i], cols[j]) for i, j in zip(*arc_indices))
1171
1172
def get_categorical_features(df_or_array,                   # type: Union[np.ndarray, pd.DataFrame]
                             non_categorical_mode="strict"  # type: str
                             ):
    # type: (...) -> Union[np.ndarray, pd.DataFrame]
    """
    Returns the categorical features of `df_or_array`.

    :param df_or_array: a pandas DataFrame, or a numpy array (structured or not)
    :param non_categorical_mode: "strict" (default) raises when non-categorical columns are present; "remove"
        silently drops them instead
    :return: a dataframe or array with the categorical features
    :raises ValueError: for an invalid `non_categorical_mode`, for non-categorical columns in "strict" mode, or
        when no categorical column at all is found
    :raises TypeError: for a non-structured numpy array whose dtype is not object, or for unsupported input types
    """
    assert_df_or_2D_array(df_or_array)

    if non_categorical_mode == "strict":
        strict_mode = True
    elif non_categorical_mode == "remove":
        strict_mode = False
    else:
        raise ValueError("Unsupported value for `non_categorical_mode`: %r" % non_categorical_mode)

    if isinstance(df_or_array, pd.DataFrame):
        # BUGFIX: the string form of the pandas categorical dtype is "category", not "categorical", so
        # categorical columns were previously never detected. "categorical" is kept for compatibility.
        is_categorical_dtype = df_or_array.dtypes.astype(str).isin(["object", "categorical", "category"])
        if strict_mode and not is_categorical_dtype.all():
            raise ValueError("Provided dataframe columns contains non-categorical datatypes (dtype in 'object' or "
                             "'categorical'): found dtypes %r. This is not supported when "
                             "`non_categorical_mode` is set to `'strict'`"
                             % df_or_array.dtypes[~is_categorical_dtype].to_dict())
        elif not is_categorical_dtype.any():
            raise ValueError("Provided dataframe columns do not contain any categorical datatype (dtype in "
                             "'object' or 'categorical'): found dtypes %r"
                             % df_or_array.dtypes[~is_categorical_dtype].to_dict())
        return df_or_array.loc[:, is_categorical_dtype]

    elif isinstance(df_or_array, np.ndarray):
        # see https://numpy.org/doc/stable/user/basics.rec.html#manipulating-and-displaying-structured-datatypes
        if df_or_array.dtype.names is not None:
            # structured array: a field is categorical when its dtype is `object`
            is_categorical_dtype = np.array([str(df_or_array.dtype.fields[n][0]) == "object"
                                             for n in df_or_array.dtype.names])
            if strict_mode and not is_categorical_dtype.all():
                # BUGFIX: `dtype[mask].asdict()` raised AttributeError (np.dtype has no `asdict` method and does
                # not support boolean-mask indexing); build the {field: dtype} mapping explicitly instead.
                invalid_dtypes = {n: df_or_array.dtype.fields[n][0]
                                  for n, is_cat in zip(df_or_array.dtype.names, is_categorical_dtype)
                                  if not is_cat}
                raise ValueError("Provided numpy array columns contains non-categorical datatypes ('object' dtype): "
                                 "found dtypes %r. This is not supported when `non_categorical_mode` is set to "
                                 "`'strict'`" % invalid_dtypes)
            elif not is_categorical_dtype.any():
                raise ValueError(
                    "Provided dataframe columns do not contain any categorical datatype (dtype in 'object' or "
                    "'categorical'): found dtypes %r" % df_or_array.dtype.fields)
            categorical_names = np.array(df_or_array.dtype.names)[is_categorical_dtype]
            return df_or_array[categorical_names]
        else:
            # non-structured array: the whole array must be an object array
            if df_or_array.dtype != np.dtype('O'):
                raise TypeError("Provided array columns are not object nor categorical: found dtype %r"
                                % df_or_array.dtype)
            return df_or_array
    else:
        # Should not happen since `assert_df_or_2D_array` is called upfront now.
        raise TypeError("Provided data is neither a pd.DataFrame nor a np.ndarray")
1229
1230
1231 def assert_adjacency_matrix(A # type: Union[np.ndarray, pd.DataFrame]
1232 ):
1233 """Routine to check that A is a proper adjacency matrix"""
1234
1235 if len(A.shape) != 2:
1236 raise ValueError("A is not a 2D adjacency matrix, its shape is %sD" % len(A.shape))
1237
1238 if A.shape[0] != A.shape[1]:
1239 raise ValueError("A is not a 2D adjacency matrix: it is not square: %r" % A.shape)