Coverage for qdscreen/main.py: 84%
516 statements
« prev ^ index » next coverage.py v7.2.2, created at 2023-03-17 11:02 +0000
« prev ^ index » next coverage.py v7.2.2, created at 2023-03-17 11:02 +0000
1# -*- coding: utf-8 -*-
2# The above encoding declaration is needed because we use non-scii characters in `get_trees_str_list`.
3# The below import transforms automatically all strings in this file in unicode in python 2.
4from __future__ import unicode_literals # See tests/encoding_ref_help.py for a detailed explanation
5import numpy as np
6import pandas as pd
8from pyitlib import discrete_random_variable as drv
10from .compat import encode_if_py2
12try:
13 from typing import Union, Iterable, Tuple
14except: # noqa
15 pass
18# TODO
19# Cramer's V + Theil's U:
20# https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
21# https://stackoverflow.com/questions/46498455/categorical-features-correlation
22# https://stackoverflow.com/questions/20892799/using-pandas-calculate-cram%C3%A9rs-coefficient-matrix (+ corrected version)
23# https://stackoverflow.com/questions/61236715/correlation-between-categorical-variables-within-a-dataset
24#
25# https://stackoverflow.com/questions/48035381/correlation-among-multiple-categorical-variables-pandas
26# https://stackoverflow.com/questions/44694228/how-to-check-for-correlation-among-continuous-and-categorical-variables-in-pytho
27#
29def _add_names_to_parents_idx_series(parents):
30 parents = pd.DataFrame(parents, columns=('idx',))
31 parents['name'] = parents.index[parents['idx']].where(parents['idx'] >= 0, None)
32 return parents
class QDForest(object):
    """A quasi-deterministic forest returned by `qd_screen`.

    The forest can be backed either by an adjacency matrix (`_adjmat`) or by a
    parent-index structure (`_parents`); whichever one was not provided at
    construction time is computed lazily and cached on first access. All public
    accessors exist in two flavours: `<name>` (pandas when the forest was built
    from a DataFrame) and `<name>_ar` (always numpy).
    """
    __slots__ = (
        '_adjmat',         # a square np array or pd DataFrame containing the adjacency matrix (parent->child)
        '_parents',        # a 1d np array or a pandas Series relating each child to its parent index or -1 if a root
        'is_nparray',      # a boolean indicating if this was built from numpy array (and not pandas dataframe)
        '_roots_mask',     # a 1d np array or pd Series containing a boolean mask for root variables
        '_roots_wc_mask',  # a 1d np array or pd Series containing a boolean mask for root with children
        'stats'            # an optional `Entropies` object stored for debug
    )

    def __init__(self,
                 adjmat=None,   # type: Union[np.ndarray, pd.DataFrame]
                 parents=None,  # type: Union[np.ndarray, pd.Series]
                 stats=None     # type: Entropies
                 ):
        """
        Creates the forest from an adjacency matrix and/or a parent-index
        series. At least one of the two should be provided; the missing one is
        computed lazily when first accessed.

        :param adjmat: a square boolean adjacency matrix (parent row -> child column),
            as a 2D numpy array or a pandas DataFrame
        :param parents: for each variable, the index of its parent, or -1 for roots,
            as a 1D numpy array or a pandas Series
        :param stats: an optional `Entropies` object kept for later analysis/debug
        """
        self.stats = stats
        self._adjmat = adjmat
        # the numpy/pandas flavour is inferred from whichever argument was provided
        self.is_nparray = isinstance((adjmat if adjmat is not None else parents), np.ndarray)
        if parents is not None:
            if not self.is_nparray:
                # create a df with two columns: indices and names
                parents = _add_names_to_parents_idx_series(parents)
            self._parents = parents
        else:
            self._parents = None

        self._roots_mask = None
        self._roots_wc_mask = None

    def assert_stats_kept(self):
        """Raises a ValueError if entropy stats were not kept in this instance."""
        if self.stats is None:
            raise ValueError("`stats` are not available in this QDForest instance. If you created it using `qd_screen`,"
                             " you should use `keep_stats=True`.")

    def assert_pandas_capable(self):
        """Utility method to assert that "not self.is_nparray" """
        if self.is_nparray:
            raise ValueError("This QDForest instance was built with numpy arrays, it is not pandas compliant")

    def assert_names_available(self):
        """Raises a ValueError if variable names are not available (numpy-backed forest)."""
        if self.is_nparray:
            raise ValueError("Variable names are not available")

    def _validate_names_arg(self,
                            names):
        """Utility method to validate and set default value of the `names` argument used in most methods"""
        if names is None:
            # default: use names whenever the forest is pandas-backed
            names = not self.is_nparray
        if names:
            self.assert_names_available()
        return names

    @property
    def nb_vars(self):
        # number of variables, from whichever representation is available
        return self._parents.shape[0] if self._parents is not None else self._adjmat.shape[0]

    @property
    def varnames(self):
        """The variable names as a numpy array (pandas-backed forests only)."""
        self.assert_names_available()
        return np.array(self._adjmat.columns) if self._adjmat is not None else np.array(self._parents.index)

    def indices_to_mask(self,
                        indices,
                        in_names=None,
                        out_names=None,
                        ):
        """Utility to convert a list of indices to a numpy or pandas mask

        :param indices: the indices (or names, if `in_names`) to set to True in the mask
        :param in_names: whether `indices` contains names rather than positional indices
        :param out_names: whether the returned mask should be a pandas Series indexed by names
        :return: a boolean mask (1D numpy array, or pandas Series if `out_names`)
        """
        in_names = self._validate_names_arg(in_names)
        out_names = self._validate_names_arg(out_names)
        mask = np.zeros((self.nb_vars,), dtype=bool)
        if out_names:
            mask = pd.Series(mask, index=self.varnames, dtype=bool)
        if in_names and not out_names:
            # TODO
            # NOTE(review): this branch sets ALL names to True and ignores `indices`;
            # it is never exercised according to coverage — confirm intent before relying on it
            mask[self.varnames] = True
        elif not in_names and out_names:
            # positional indices into a name-indexed Series
            mask.iloc[indices] = True
        else:
            # same kind on both sides: direct assignment works for both numpy and pandas
            mask[indices] = True
        return mask

    def mask_to_indices(self, mask):
        """Utility to convert a boolean mask back into indices (numpy) or names (pandas)."""
        if isinstance(mask, np.ndarray):
            return np.where(mask)
        else:
            return mask[mask].index

    @property
    def adjmat_ar(self):
        """The adjacency matrix as a 2D numpy array"""
        return self.adjmat if self.is_nparray else self.adjmat.values

    @property
    def adjmat(self):
        """The adjacency matrix as a pandas DataFrame or a 2D numpy array"""
        if self._adjmat is None:
            # compute adjmat from parents and cache it
            n = self.nb_vars
            adjmat = np.zeros((n, n), dtype=bool)
            # from https://stackoverflow.com/a/46018613/7262247
            index_array = get_arcs_from_parents_indices(self.parents_indices_ar, multiindex=True)
            flat_index_array = np.ravel_multi_index(index_array, adjmat.shape)
            np.ravel(adjmat)[flat_index_array] = True
            if self.is_nparray:
                self._adjmat = adjmat
            else:
                self._adjmat = pd.DataFrame(adjmat, index=self.varnames, columns=self.varnames)
        return self._adjmat

    @property
    def parents_indices_ar(self):
        """A numpy array containing the indices of all parent nodes"""
        return self.parents if self.is_nparray else self.parents['idx'].values

    @property
    def parents(self):
        """A numpy array containing the indices of all parent nodes, or a pandas DataFrame containing 'idx' and 'name'"""
        if self._parents is None:
            # compute parents from adjmat, whether a dataframe or an array
            # TODO maybe use a sparse array here?
            parents = to_forest_parents_indices(self._adjmat)
            if not self.is_nparray:
                # create a df with two columns: indices and names
                parents = _add_names_to_parents_idx_series(parents)
            self._parents = parents
        return self._parents

    @property
    def roots_mask_ar(self):
        """A 1D numpy mask array indicating if a node is a root node"""
        return self.roots_mask if self.is_nparray else self.roots_mask.values

    @property
    def roots_mask(self):
        """A pandas Series or a 1D numpy mask array indicating if a node is a root node"""
        if self._roots_mask is None:
            if self._adjmat is not None:
                # a root is a node with no incoming arc (no parent column set)
                self._roots_mask = ~self.adjmat.any(axis=0)
            else:
                # a root is a node whose parent index is negative
                self._roots_mask = (self.parents if self.is_nparray else self.parents['idx']) < 0
            if not self.is_nparray:
                # remove the name of the series
                self._roots_mask.name = None
        return self._roots_mask

    @property
    def roots_ar(self):
        """A 1D numpy array of root indices"""
        return np.where(self.roots_mask_ar)[0]

    @property
    def roots(self):
        """A pandas Series of root names or 1D numpy array of root indices"""
        if self.is_nparray:
            return self.roots_ar
        else:
            return self.roots_mask.index[self.roots_mask]

    def get_roots(self,
                  names=None  # type: bool
                  ):
        """
        Returns the list of root indices or root names

        :param names: an optional boolean indicating if this method should return names instead of indices. By
            default `names` is set to `not self.is_np`
        :return: a list of root names (if `names`) or indices
        """
        names = self._validate_names_arg(names)
        return list(self.roots) if names else list(self.roots_ar)

    @property
    def non_roots_ar(self):
        """A 1D numpy array of non-root indices"""
        return np.where(~self.roots_mask_ar)[0]

    @property
    def non_roots(self):
        """A pandas Series of non-root names or 1D numpy array of non-root indices"""
        if self.is_nparray:
            return self.non_roots_ar
        else:
            return self.roots_mask.index[~self.roots_mask]

    def get_non_roots(self,
                      names=None  # type: bool
                      ):
        """
        Returns the list of non-root indices or names

        :param names: an optional boolean indicating if this method should return names instead of indices. By
            default `names` is set to `not self.is_np`
        :return: a list of non-root names (if `names`) or indices
        """
        names = self._validate_names_arg(names)
        return list(self.non_roots) if names else list(self.non_roots_ar)

    @property
    def nodes_with_children_mask_ar(self):
        """A 1D numpy mask array indicating nodes that have at least 1 child"""
        return self.nodes_with_children_mask if self.is_nparray else self.nodes_with_children_mask.values

    @property
    def nodes_with_children_mask(self):
        """A pandas Series or 1D numpy array indicating nodes that have at least 1 child"""
        if self._adjmat is not None:
            # a node with children has at least one outgoing arc (row)
            return self._adjmat.any(axis=1)
        else:
            # nodes that appear as someone's parent
            nwc_idx = np.unique(self.parents_indices_ar[self.parents_indices_ar >= 0])
            return self.indices_to_mask(nwc_idx)

    @property
    def roots_with_children_mask_ar(self):
        """A 1D numpy mask array indicating root nodes that have at least 1 child"""
        return self.roots_with_children_mask if self.is_nparray else self.roots_with_children_mask.values

    @property
    def roots_with_children_mask(self):
        """A pandas Series or 1D numpy mask array indicating root nodes that have at least 1 child"""
        if self._roots_wc_mask is None:
            if self._adjmat is not None:
                # a root with children = a root AND a node with children
                # NOTE(review): this reads the `_roots_mask` cache directly — it seems to
                # assume `roots_mask` was accessed before; confirm, otherwise this is None here
                self._roots_wc_mask = self._roots_mask & self.nodes_with_children_mask
            else:
                # a root with children = a parent of a level 1 child node
                level_1_nodes_mask = ((self.parents_indices_ar >= 0)
                                      & (self.parents_indices_ar[self.parents_indices_ar] == -1))
                rwc_indices = np.unique(self.parents_indices_ar[level_1_nodes_mask])
                self._roots_wc_mask = self.indices_to_mask(rwc_indices, in_names=False, out_names=not self.is_nparray)
        return self._roots_wc_mask

    @property
    def roots_with_children_ar(self):
        """A 1D numpy array with the indices of root nodes that have at least 1 child"""
        return np.where(self.roots_with_children_mask_ar)[0]

    @property
    def roots_with_children(self):
        """A pandas Series or 1D numpy array with names/indices of root nodes that have at least 1 child"""
        if self.is_nparray:
            return self.roots_with_children_ar
        else:
            return self.roots_with_children_mask.index[self.roots_with_children_mask]

    def get_roots_with_children(self,
                                names=None  # type: bool
                                ):
        """
        Returns the list of root with children

        :param names: an optional boolean indicating if this method should return names instead of indices. By
            default `names` is set to `not self.is_np`
        :return: a list of names (if `names`) or indices of roots having at least one child
        """
        names = self._validate_names_arg(names)
        return list(self.roots_with_children) if names else list(self.roots_with_children_ar)

    def get_children(self,
                     parent_idx=None,   # type: int
                     parent_name=None,  # type: str
                     names=None         # type: bool
                     ):
        """
        Returns the list of children of a node

        :param parent_idx: the index of the node to query for
        :param parent_name: the name of the node to query for
        :param names: a boolean to indicate if the returned children should be identified by their names (`True`) or
            indices (`False`). By default if the parent is identified with `parent_idx` indices will be returned
            (`False`), and if the parent is identified with parent_name names will be returned (`True`)
        :return: an array containing indices of the child nodes
        """
        # validate the mutually-exclusive parent identifiers and resolve defaults
        if parent_idx is not None:
            if parent_name is not None:
                raise ValueError("Only one of `parent_idx` and `parent_name` should be provided")
            node = parent_idx
            if names is None:
                names = False
        elif parent_name is not None:
            node = parent_name
            if names is None:
                names = True
            self.assert_names_available()
        else:
            raise ValueError("You should provide a non-None `parent_idx` or `parent_name`")

        if self.is_nparray:
            return np.where(self.parents == node)[0]
        else:
            # match against the column corresponding to how the parent was identified
            qcol = 'idx' if parent_idx is not None else 'name'
            if not names:
                return np.where(self.parents[qcol] == node)[0]
            else:
                return self.parents[self.parents[qcol] == node].index.values

    def get_arcs(self,
                 multiindex=False,  # type: bool
                 names=None         # type: bool
                 ):
        # type: (...) -> Union[Iterable[Tuple[int, int]], Iterable[Tuple[str, str]], Tuple[Iterable[int], Iterable[int]], Tuple[Iterable[str], Iterable[str]]]
        """
        Return the arcs of an adjacency matrix, an iterable of (parent, child) indices or names

        If 'multiindex' is True instead of returning an iterable of (parent, child), it returns a tuple of iterables
        (all the parents, all the childs).

        :param multiindex: if this is True, a 2-tuple of iterable is returned instead of an iterable of 2-tuples
        :param names: an optional boolean indicating if this method should return names instead of indices. By
            default `names` is set to `not self.is_np`
        :return: the arcs, in one of the four shapes described above
        """
        names = self._validate_names_arg(names)
        # delegate to whichever representation is already materialized
        if self._adjmat is not None:
            return get_arcs_from_adjmat(self._adjmat, multiindex=multiindex, names=names)
        else:
            return get_arcs_from_parents_indices(self._parents, multiindex=multiindex, names=names)

    def walk_arcs(self,
                  parent_idx=None,   # type: int
                  parent_name=None,  # type: str
                  names=None         # type: bool
                  ):
        """
        Yields a sequence of (parent, child) indices or names. As opposed to `get_arcs` the sequence follows the tree
        order: starting from the list of root nodes, for every node, it is returned first and then all of its children.
        (depth first, not breadth first)

        :param parent_idx: the optional index of a node. If provided, only the subtree behind this node will be walked
        :param parent_name: the optional name of a node. If provided, only the subtree behind this node will be walked
        :param names: an optional boolean indicating if this method should return names instead of indices. By
            default `names` is set to `not self.is_np`
        :return: yields a sequence of (level, i, j)
        """
        names = self._validate_names_arg(names)
        if names:
            get_children = lambda node: self.get_children(parent_name=node)
        else:
            get_children = lambda node: self.get_children(parent_idx=node)

        def _walk(from_node, level):
            # depth-first recursive walk: yield each arc, then recurse into the child
            for child in get_children(from_node):
                yield level, from_node, child
                for l, i, j in _walk(child, level+1):
                    yield l, i, j

        # determine the starting node(s): a single subtree root, or all forest roots
        if parent_idx is not None:
            if parent_name is not None:
                raise ValueError("Only one of `parent_idx` and `parent_name` should be provided")
            root_nodes = (parent_idx,) if not names else (self.parents.index[parent_idx],)
        elif parent_name is not None:
            root_nodes = (parent_name,)
        else:
            root_nodes = (self.roots if names else self.roots_ar)

        for n in root_nodes:
            # walk the subtree
            for level, i, j in _walk(n, level=0):
                yield level, i, j

    # ------- display methods

    def to_str(self,
               names=None,     # type: bool
               mode="headers"  # type: str
               ):
        """
        Provides a string representation of this forest.

        :param names: an optional boolean indicating if this method should return names instead of indices. By
            default `names` is set to `not self.is_np`
        :param mode: one of "compact", "headers", and "full" (displays the trees)
        :return: a string representation, whose verbosity depends on `mode`
        """
        names = self._validate_names_arg(names)
        nb_vars = self.nb_vars
        roots = self.get_roots(names)
        non_roots = self.get_non_roots(names)
        nb_roots = len(roots)
        nb_arcs = nb_vars - nb_roots

        roots_with_children = self.get_roots_with_children(names)
        nb_roots_with_children = len(roots_with_children)

        nb_sole_roots = (nb_roots - nb_roots_with_children)

        if mode == "compact":
            return "QDForest (%s vars = %s roots + %s determined by %s of the roots)" \
                   % (nb_vars, nb_roots, nb_arcs, nb_roots_with_children)

        # list of root node indice/name with a star when they have children
        roots_str = [("%s*" if r in roots_with_children else "%s") % r for r in roots]
        # list of non-root node indices/names
        non_roots_str = ["%s" % r for r in non_roots]

        headers = "\n".join((
            "QDForest (%s vars):" % nb_vars,
            " - %s roots (%s+%s*): %s" % (nb_roots, nb_sole_roots, nb_roots_with_children, ", ".join(roots_str)),
            " - %s other nodes: %s" % (nb_arcs, ", ".join(non_roots_str)),
        ))
        if mode == "headers":
            return headers
        elif mode == "full":
            tree_str = "\n".join(self.get_trees_str_list())
            return "%s\n\n%s" % (headers, tree_str)
        else:
            raise ValueError("Unknown mode: %r" % mode)

    @encode_if_py2
    def __str__(self):
        """ String representation, listing all useful information when available """
        # avoid printing the full trees for large forests
        if self.nb_vars > 30:
            return self.to_str(mode="headers")
        else:
            return self.to_str(mode="full")

    @encode_if_py2
    def __repr__(self):
        # return str(self)  # note if we use this then we'll have to comment the decorator
        return self.to_str(mode="compact")

    def print_arcs(self,
                   names=None  # type: bool
                   ):
        """ Prints the various deterministic arcs in this forest """
        print("\n".join(self.get_arcs_str_list(names=names)))

    def get_arcs_str_list(self,
                          names=None  # type: bool
                          ):
        """ Returns a list of string representation of the various deterministic arcs in this forest """
        res_str_list = []
        for parent, child in self.get_arcs(names=names):
            res_str_list.append("%s -> %s" % (parent, child))
        return res_str_list

    def get_trees_str_list(self,
                           all=False,
                           names=None  # type: bool
                           ):
        """
        Returns a list of string representation of the various trees in this forest
        TODO maybe use https://github.com/caesar0301/treelib ? Or networkX ?

        :param all: if True, this will return also the trees that are consistuted of one node
        :param names: an optional boolean indicating if this method should return names instead of indices. By
            default `names` is set to `not self.is_np`
        :return: a list with one multi-line string per tree
        """
        names = self._validate_names_arg(names)
        res_str_list = []
        if all:
            roots = self.get_roots(names)
        else:
            roots = self.get_roots_with_children(names)

        if names:
            walk_arcs = lambda n: self.walk_arcs(parent_name=n)
        else:
            walk_arcs = lambda n: self.walk_arcs(parent_idx=n)

        for r in roots:
            # render the subtree rooted at r, one child per line, indented by depth
            subtree_str = "%s" % r
            for level, _, j in walk_arcs(r):
                subtree_str += "\n%s└─ %s" % ("   " * level, j)
            res_str_list.append(subtree_str + "\n")
        return res_str_list

    # -----------

    def fit_selector_model(self,
                           X  # type: Union[np.ndarray, pd.DataFrame]
                           ):
        # type: (...) -> QDSelectorModel
        """
        Returns a new instance of `QDSelectorModel` fit with the data in `X`

        :param X: the dataset to fit the selector model on
        :return: a fitted `QDSelectorModel`
        """
        # local import to avoid a circular dependency at module load time
        from .selector import QDSelectorModel
        model = QDSelectorModel(self)
        model.fit(X)
        return model

    # --------- tools for entropies analysis to find interesting thresholds

    def get_entropies_table(self,
                            from_to=True,         # type: bool
                            sort_by="from",       # type: str
                            drop_self_link=True,  # type: bool
                            ):
        """ See `Entropies.get_entropies_table` """

        self.assert_stats_kept()
        return self.stats.get_entropies_table(from_to=from_to, sort_by=sort_by, drop_self_link=drop_self_link)

    def plot_increasing_entropies(self):
        """ See `Entropies.plot_increasing_entropies` """

        self.assert_stats_kept()
        self.stats.plot_increasing_entropies()
def assert_df_or_2D_array(df_or_array  # type: Union[pd.DataFrame, np.ndarray]
                          ):
    """
    Validates that `df_or_array` is a pandas DataFrame or a 2D numpy array.

    :param df_or_array: the object to validate
    :return: None
    :raises TypeError: if `df_or_array` is neither a `pd.DataFrame` nor a `np.ndarray`
    :raises ValueError: if `df_or_array` is a `np.ndarray` whose number of dimensions is not 2
    """
    if isinstance(df_or_array, pd.DataFrame):
        pass
    elif isinstance(df_or_array, np.ndarray):
        # see https://numpy.org/doc/stable/user/basics.rec.html#manipulating-and-displaying-structured-datatypes
        if df_or_array.ndim != 2:
            raise ValueError("Provided data is not a 2D array, the number of dimensions is %s" % df_or_array.ndim)
    else:
        # Raise error
        raise TypeError("Provided data is neither a `pd.DataFrame` nor a `np.ndarray`")
def qd_screen(X,                  # type: Union[pd.DataFrame, np.ndarray]
              absolute_eps=None,  # type: float
              relative_eps=None,  # type: float
              keep_stats=False,   # type: bool
              non_categorical_mode='strict',
              ):
    # type: (...) -> QDForest
    """
    Finds the (quasi-)deterministic relationships (functional dependencies) between the variables in `X`, and returns
    a `QDForest` object representing the forest of (quasi-)deterministic trees. This object can then be used to fit a
    feature selection model or to learn a Bayesian Network structure.

    By default only deterministic relationships are detected. Quasi-determinism can be enabled by setting either
    an threshold on conditional entropies (`absolute_eps`) or on relative conditional entropies (`relative_eps`). Only
    one of them should be set.

    By default (`keep_stats=False`) the entropies tables are not preserved once the forest has been created. If you wish
    to keep them available, set `keep_stats=True`. The entropies tables are then available in the `<QDForest>.stats`
    attribute, and threshold analysis methods such as `<QDForest>.get_entropies_table(...)` and
    `<QDForest>.plot_increasing_entropies()` become available.

    :param X: the dataset as a pandas DataFrame or a numpy array. Columns represent the features to compare.
    :param absolute_eps: Absolute entropy threshold. Any feature `Y` that can be predicted from another feature `X` in
        a quasi-deterministic way, that is, where conditional entropy `H(Y|X) <= absolute_eps`, will be removed. The
        default value is `0` and corresponds to removing deterministic relationships only.
    :param relative_eps: Relative entropy threshold. Any feature `Y` that can be predicted from another feature `X` in
        a quasi-deterministic way, that is, where relative conditional entropy `H(Y|X)/H(Y) <= relative_eps` (a value
        between `0` and `1`), will be removed. Only one of `absolute_eps` or `relative_eps` should be provided.
    :param keep_stats: a boolean indicating if the various entropies tables computed in the process should be kept in
        memory in the resulting forest object (`<QDForest>.stats`), for further analysis. By default this is `False`.
    :param non_categorical_mode: passed to `get_categorical_features`; 'strict' by default
        (see that helper for the accepted values)
    :return: a `QDForest` object representing the forest of (quasi-)deterministic trees found in `X`
    """
    # Make sure this is a 2D table
    assert_df_or_2D_array(X)

    # Sanity check: are there rows in here ?
    if len(X) == 0:
        raise ValueError("Provided dataset does not contain any row")

    # Only work on the categorical features
    X = get_categorical_features(X, non_categorical_mode=non_categorical_mode)

    # Sanity check concerning the number of columns
    assert X.shape[1] > 0, "Internal error: no columns remain in dataset after preprocessing."

    # parameters check and defaults
    if absolute_eps is None:
        if relative_eps is None:
            # nothing is provided, use absolute threshold 0
            absolute_eps = 0
            is_absolute = True
        else:
            # relative threshold provided
            is_absolute = False
    else:
        if relative_eps is not None:
            raise ValueError("only one of absolute and relative should be passed")
        # absolute threshold provided
        is_absolute = True

    # strict (purely deterministic) mode when the active threshold is exactly 0
    is_strict = (absolute_eps == 0.) if is_absolute else (relative_eps == 0.)

    # sanity check
    if is_absolute and absolute_eps < 0:
        raise ValueError("epsilon_absolute should be positive")
    elif not is_absolute and (relative_eps < 0 or relative_eps > 1):
        raise ValueError("epsilon_relative should be 0=<eps=<1")

    # (0) compute conditional entropies or relative conditional entropies
    A_orig, X_stats = get_adjacency_matrix(X, eps_absolute=absolute_eps, eps_relative=relative_eps)

    # (2) identify redundancy
    A_noredundancy = remove_redundancies(A_orig, selection_order=None if is_strict else X_stats.list_vars_by_entropy_order(desc=True))

    # (3) transform into forest: remove extra parents by keeping only the parent with lowest entropy / nb levels ?
    # if X -> Z and X -> Y and Y -> Z then Z has two parents X and Y but only Y should be kept
    # Determinist case: minimize the number of parameters: take the minimal nb of levels
    # Quasi-determinist case: take the lowest entropy
    entropy_order_inc = (X_stats.H_ar).argsort()
    # parent_order = compute_nb_levels(df) if is_strict else entropy_based_order
    # A_forest = to_forest_adjmat(A_noredundancy, entropy_order_inc)
    # qd_forest = QDForest(adjmat=A_forest)
    parents = to_forest_parents_indices(A_noredundancy, entropy_order_inc)
    # if not X_stats.is_nparray:
    #     parents = pd.Series(parents, index=A_noredundancy.columns)
    qd_forest = QDForest(parents=parents, stats=X_stats if keep_stats else None)

    # NOTE: the commented R code below is kept as a reference to the original algorithm
    # (phases 2 and 3: root-graph learning and fusion), not yet ported to python.
    # forestAdjMatList <- FromHToDeterForestAdjMat(H = H, criterion = criterionNlevels, removePerfectMatchingCol = removePerfectMatchingCol)

    # (4) convert to pgmpy format
    # deterForest <- FromAdjMatToBnLearn(adjMat = forestAdjMat)
    # rootsF <- Roots(forestAdjMat)

    # if(length(rootsFNames) == 1){
    #     print("Only one tree in the forest !!! No root graph computed")
    #     print("****************")
    #     print("Training - Phase 2 and Phase 3 obsolete")
    #     print("****************")
    #     gstar <- deterForest
    #     G_R <- NULL
    #     print("Final graph computed")
    # }else{
    #     print(paste("Multiple trees in the forest (", length(rootsFNames),") Root graph will be computed.", sep = "", collapse = ""))
    #     rootsOnly <- ReduceDataAndH(variablesToKeep = rootsFNames, dataFrame = data, H)
    #     dataRoots <- rootsOnly[[1]]
    #     HRoots <- rootsOnly[[2]]
    #     rm(rootsOnly)
    #     print("****************")
    #     print("Training - Phase 2: computation of the root graph")
    #     print("****************")
    #     # removePerfectlyMatchingCol useless here because either pmc are in the same trees (but only one is the root), either they were deleted earlier on
    #     G_R <- SotABNsl(data = dataRoots, method = method, score = score, hyperparamList, removePerfectMatchingCol = removePerfectMatchingCol, ...)
    #     edgeListG_R <- FromBnToEdgeList(bn = G_R)
    #     colnames(edgeListG_R) <- c('from', 'to')
    #     print("****************")
    #     print("Training - Phase 3: fusion of the deter forest with the root graph")
    #     print("****************")
    #     gstarAdjMat <- AddEdgesFromEdgeList(originalAdjMatrix = forestAdjMat, edgeListToAdd = edgeListG_R)
    #     gstar <- FromAdjMatToBnLearn(gstarAdjMat)
    #     print("Final graph computed")
    # }

    return qd_forest
class Entropies(object):
    """Lazy container for the entropy statistics of a dataset.

    Computes and caches, on first access, the entropies `H`, the conditional
    entropies `Hcond` and the relative conditional entropies `Hcond_rel` of the
    dataset's variables (columns). Works with a pandas DataFrame or a 2D numpy
    array.

    TODO: this could be easier to read with pyfields and default value factories.
    """

    __slots__ = ('dataset', 'is_nparray', '_H', '_Hcond', '_Hcond_rel', '_dataset_df')

    def __init__(self, df_or_array):
        """
        :param df_or_array: a pandas dataframe or numpy array where the variables are the columns
        """
        self.dataset = df_or_array
        self.is_nparray = isinstance(df_or_array, np.ndarray)
        # lazily-computed caches
        self._H = None          # type: Union[np.ndarray, pd.Series]
        self._Hcond = None      # type: Union[np.ndarray, pd.DataFrame]
        self._Hcond_rel = None  # type: Union[np.ndarray, pd.DataFrame]
        self._dataset_df = None # type: pd.DataFrame

    def __str__(self):
        return repr(self)

    def __repr__(self):
        # NOTE(review): `self.dataset.head(2)` assumes a pandas dataset — this looks
        # like it would fail for a numpy-backed instance; confirm before relying on it
        res = """Statistics computed for dataset:
%s
...(%s rows)

Entropies (H):
%s

Conditional entropies (Hcond = H(row|col)):
%s

Relative conditional entropies (Hcond_rel = H(row|col)/H(row)):
%s
""" % (self.dataset.head(2), len(self.dataset), self.H.T, self.Hcond, self.Hcond_rel)
        return res

    @property
    def dataset_df(self):
        """The dataset as a pandas DataFrame, if pandas is available"""
        if self.is_nparray:
            # see https://numpy.org/doc/stable/user/basics.rec.html#manipulating-and-displaying-structured-datatypes
            if self.dataset.dtype.names is not None:
                # structured array
                self._dataset_df = pd.DataFrame(self.dataset)
            else:
                # unstructured array... same ?
                self._dataset_df = pd.DataFrame(self.dataset)
            return self._dataset_df
        else:
            return self.dataset

    @property
    def nb_vars(self):
        # number of variables = number of columns
        return self.dataset.shape[1]

    @property
    def varnames(self):
        """The variable names as a list (pandas datasets only)."""
        if self.is_nparray:
            raise ValueError("Variable names not available")
        return list(self.dataset.columns)

    @property
    def H_ar(self):
        """ The entropies H(Xi) of all variables, as a 1D numpy array """
        return self.H if self.is_nparray else self.H.values

    entropies_ar = H_ar
    """An alias for H_ar"""

    @property
    def H(self):
        """The entropies of all variables. a pandas Series if df was a pandas dataframe, else a 1D numpy array"""
        if self._H is None:
            # Using pyitlib to compute H (hopefully efficiently)
            # Unfortunately this does not work with numpy arrays, convert to pandas TODO report
            # note: we convert to string type to avoid a bug with ints. TODO...
            self._H = drv.entropy(self.dataset_df.T.astype(str))
            if not self.is_nparray:
                self._H = pd.Series(self._H, index=self.varnames)

            # basic sanity check: should all be positive
            assert np.all(self._H >= 0)
        return self._H

    entropies = H
    """An alias for H"""

    @property
    def Hcond_ar(self):
        """ The conditional entropy matrix (i, j) = H(Xi | Xj) as a numpy array """
        return self.Hcond if self.is_nparray else self.Hcond.values

    conditional_entropies_ar = Hcond_ar
    """An alias for Hcond_ar"""

    @property
    def Hcond(self):
        """
        The conditional entropy matrix (i, j) = H(Xi | Xj).
        A pandas Dataframe or a 2D numpy array depending on dataset type
        """
        if self._Hcond is None:
            # Old attempt to do it ourselves
            # (0) init H
            # nb_vars = len(df.columns)
            # H = np.empty((nb_vars, nb_vars), dtype=float)
            # (1) for each column compute the counts per value
            # (2) for each (i, j) pair compute the counts

            # Using pyitlib to compute H (hopefully efficiently)
            # Unfortunately this does not work with numpy arrays, convert to pandas TODO report
            # note: we convert to string type to avoid a bug with ints. TODO...
            self._Hcond = drv.entropy_conditional(self.dataset_df.T.astype(str))

            # add the row/column headers
            if not self.is_nparray:
                self._Hcond = pd.DataFrame(self._Hcond, index=self.varnames, columns=self.varnames)

            # basic sanity check: should all be positive
            assert np.all(self._Hcond >= 0)
        return self._Hcond

    conditional_entropies = Hcond
    """An alias for Hcond"""

    @property
    def Hcond_rel_ar(self):
        """ The relative conditional entropy matrix (i, j) = H(Xi | Xj) / H(Xi) as a numpy array """
        return self.Hcond_rel if self.is_nparray else self.Hcond_rel.values

    relative_conditional_entropies_ar = Hcond_rel_ar
    """An alias for Hcond_rel_ar"""

    @property
    def Hcond_rel(self):
        """
        The relative conditional entropy matrix (i, j) = H(Xi | Xj) / H(Xi).
        """
        if self._Hcond_rel is None:
            # compute relative entropy: in each cell (X, Y) we want H(X|Y)/H(X)
            if self.is_nparray:
                self._Hcond_rel = (self.Hcond.T / self.H).T
            else:
                Hrel_array = (self.Hcond.values.T / self.H.values).T
                self._Hcond_rel = pd.DataFrame(Hrel_array, index=list(self.Hcond.columns), columns=list(self.Hcond.columns))

            # basic sanity check: a relative conditional entropy lies in [0, 1]
            assert np.all(self._Hcond_rel >= 0.)
            assert np.all(self._Hcond_rel <= 1.)
        return self._Hcond_rel

    relative_conditional_entropies = Hcond_rel
    """An alias for Hcond_rel"""

    def list_vars_by_entropy_order(self, desc=False):
        """
        Returns the indices or names of variables in ascending (resp. descending) order of entropy

        :param desc: if True the order is descending, else ascending
        :return: a 1D numpy array of variable positional indices, sorted by entropy
        """
        if self.is_nparray:
            return (-self.H).argsort() if desc else self.H.argsort()
        else:
            return (-self.H.values).argsort() if desc else self.H.values.argsort()
857 # --------- tools for entropies analysis to find interesting thresholds
859 def get_entropies_table(self,
860 from_to=True, # type: bool
861 sort_by="from", # type: str
862 drop_self_link=True, # type: bool
863 ):
864 """
865 Returns a pandas series or numpy array with four columns: from, to, cond_entropy, rel_cond_entropy.
866 The index is 'arc', a string representing the arc e.g. "X->Y".
868 :param from_to: a boolean flag indicating if 'from' and 'to' columns should remain in the returned dataframe
869 (True, default) or be dropped (False)
870 :param sort_by: a string indicating if the arcs should be sorted by origin ("from", default) or destination
871 ("to"), or by value "rel_cond_entropy" or "cond_entropy", in the resulting table.
872 :param drop_self_link: by default node-to-self arcs are not returned in the list. Turn this to False to include
873 them.
874 :return:
875 """
876 if self.is_nparray: 876 ↛ 877line 876 didn't jump to line 877, because the condition on line 876 was never true
877 raise NotImplementedError("TODO")
878 else:
879 # 1. gather the data and unpivot it so that there is one row per arc
880 res_df = pd.DataFrame({
881 'cond_entropy': self.Hcond.unstack(),
882 'rel_cond_entropy': self.Hcond_rel.unstack(),
883 })
884 res_df.index.names = ['from', 'to']
885 res_df = res_df.reset_index()
887 # 2. filter out node-to-self if needed
888 if drop_self_link: 888 ↛ 892line 888 didn't jump to line 892, because the condition on line 888 was never false
889 res_df = res_df.loc[res_df['from'] != res_df['to']].copy()
891 # 3. create the arc names column and make it the index
892 def make_arc_str(row):
893 return "%s->%s" % (row['from'], row.to) # note that .from is a reserved python symbol !
894 res_df['arc'] = res_df.apply(make_arc_str, axis=1)
895 res_df.set_index('arc', inplace=True)
897 # 4. Optionally sort differently
898 all_possibilities = ("from", "to", "cond_entropy", "rel_cond_entropy")
899 if sort_by == all_possibilities[0]: 899 ↛ 900line 899 didn't jump to line 900, because the condition on line 899 was never true
900 pass # already done
901 elif sort_by in all_possibilities[1:]: 901 ↛ 904line 901 didn't jump to line 904, because the condition on line 901 was never false
902 res_df.sort_values(by=sort_by, inplace=True)
903 else:
904 raise ValueError("Invalid value for `sort_by`: %r. Possible values: %r" % (sort_by, all_possibilities))
906 # 5. Optionally drop the from and to columns
907 if not from_to:
908 res_df.drop(['from', 'to'], axis=1, inplace=True)
910 return res_df
912 def plot_increasing_entropies(self):
913 """
915 :return:
916 """
917 import matplotlib.pyplot as plt
919 # get the dataframe
920 df = self.get_entropies_table(sort_by="rel_cond_entropy")
922 # plot with all ticks
923 df_sorted = df["rel_cond_entropy"] # .sort_values()
924 df_sorted.plot(title="Relative conditional entropy H(X|Y)/H(X) for each arc X->Y, by increasing order",
925 figsize=(15, 10))
926 plt.xticks(np.arange(len(df_sorted)), df_sorted.index, rotation=90)
def get_adjacency_matrix(df,                 # type: Union[np.ndarray, pd.DataFrame]
                         eps_absolute=None,  # type: float
                         eps_relative=None   # type: float
                         ):
    """
    Computes the boolean adjacency matrix of the quasi-determinism graph of `df`:
    cell (i, j) is True when there is an arc i -> j, i.e. when the conditional
    entropy H(Xj | Xi) falls under the threshold.

    :param df: the dataset, a pandas DataFrame or 2D numpy array
    :param eps_absolute: absolute threshold on H(X|Y); defaults to 0 (strict determinism)
    :param eps_relative: relative threshold on H(X|Y)/H(X); exclusive with `eps_absolute`
    :return: a tuple (A, df_stats) with the adjacency matrix and the computed entropy stats
    """
    df_stats = Entropies(df)

    # (1) create initial adjacency matrix by thresholding either H(X|Y) (absolute)
    # or H(X|Y)/H(X) (relative). Transposed so that (i, j) reads "i is a parent of j".
    if eps_relative is not None:
        if eps_absolute is not None:
            raise ValueError("Only one of `eps_absolute` and `eps_relative` should be provided")
        edges = (df_stats.Hcond_rel_ar <= eps_relative).T
    else:
        if eps_absolute is None:
            eps_absolute = 0  # default: only strictly deterministic relationships
        edges = (df_stats.Hcond_ar <= eps_absolute).T

    # no self-loops: the diagonal should be false
    np.fill_diagonal(edges, False)

    # finally wrap the matrix in the same container type as the dataset
    if df_stats.is_nparray:
        return edges, df_stats
    A = pd.DataFrame(edges, index=df_stats.varnames, columns=df_stats.varnames)
    return A, df_stats
def remove_redundancies(A,                    # type: Union[np.ndarray, pd.DataFrame]
                        selection_order=None  # type: np.ndarray
                        ):
    """Cleans the arcs in A between redundant variables.

    A should be a dataframe or numpy array where index = columns and indicate the node name.
    It should contain boolean values where True at (i,j) means there is a directed arc i -> j

    When there are redundant variables (arcs between i->j and j->i), only a single node is kept as the parent
    of all other redundant nodes.

    :param A: an adjacency matrix where A(i, j) indicates that there is an arc i -> j
    :param selection_order: if none (default) the first node in the list will be kept as representant for each
        redundancy class. Otherwise an alternate order (array of indices) can be provided.
    :return: a cleaned copy of A, same container type as the input
    """
    assert_adjacency_matrix(A)

    # work on a copy; keep a handle on the dataframe wrapper when A is pandas
    is_nparray = isinstance(A, np.ndarray)
    if is_nparray:
        A = A.copy()
        A_df = None
    else:
        A_df = A.copy(deep=True)
        A = A_df.values

    # init
    n_vars = A.shape[0]
    if selection_order is None:
        selection_order = range(n_vars)

    # I[i] stays True while node i's redundancy class has not been processed yet
    I = np.ones((n_vars, ), dtype=bool)

    # for each node, in selection order
    for i in selection_order:
        if I[i]:
            # find redundant nodes with this node (= there is an arc in both directions)
            mask_redundant_with_i = A[i, :] * A[:, i]

            # we will stop caring about this redundancy class
            I[mask_redundant_with_i] = False

            # we do not care about i-to-i
            mask_redundant_with_i[i] = False

            # if there is at least one redundant node
            if any(mask_redundant_with_i):
                redundant_nodes_idx = np.where(mask_redundant_with_i)[0]

                # let only the first variable in the list (i) be the parent:
                # i becomes the representant of the redundancy class
                A[redundant_nodes_idx, i] = False

                # remove ALL arcs between inter-redundant variables.
                # BUGFIX: the previous `A[idx, idx] = False` pairwise fancy indexing
                # only touched the (already False) diagonal cells (j, j); np.ix_
                # builds the cross-product so the whole sub-matrix is cleared.
                A[np.ix_(redundant_nodes_idx, redundant_nodes_idx)] = False

    return A if is_nparray else A_df
def to_forest_parents_indices(A,                    # type: Union[np.ndarray, pd.DataFrame]
                              selection_order=None  # type: np.ndarray
                              ):
    # type: (...) -> Union[np.ndarray, pd.DataFrame]
    """ Removes extra arcs in the adjacency matrix A by only keeping the first parent in the given order

    Returns a 1D array of parent index or -1 if root

    :param A: an adjacency matrix, as a dataframe or numpy array
    :param selection_order: an optional order for parent selection. If not provided, the first in the list of columns
        of A will be used
    :return: a numpy array (or pandas Series when A is a dataframe) mapping each node to its parent index, -1 for roots
    """
    assert_adjacency_matrix(A)
    is_np_array = isinstance(A, np.ndarray)

    # reorder the candidate-parent rows by priority
    # From https://stackoverflow.com/a/47269413/7262247
    values = A if is_np_array else A.values
    mask = values if selection_order is None else values[selection_order, :]

    # first True per column = highest-priority parent; -1 where the column has no parent
    first_parent = mask.argmax(axis=0)
    if selection_order is not None:
        first_parent = selection_order[first_parent]
    indices = np.where(mask.any(axis=0), first_parent, -1)

    if not is_np_array:
        indices = pd.Series(indices, index=A.columns)
    return indices
def to_forest_adjmat(A,                # type: Union[np.ndarray, pd.DataFrame]
                     selection_order,  # type: np.ndarray
                     inplace=False     # type: bool
                     ):
    # type: (...) -> Union[np.ndarray, pd.DataFrame]
    """ Removes extra arcs in the adjacency matrix A by only keeping the first parent in the given order

    :param A: an adjacency matrix, as a dataframe or numpy array
    :param selection_order: the parent-priority order, a numpy array of row indices: for each
        column (child), the first True row in this order is kept and all later ones removed
    :param inplace: if False (default) work on a copy and return it; if True modify A in place
    :return: the pruned matrix when inplace=False, otherwise None
    """
    is_ndarray = isinstance(A, np.ndarray)
    assert_adjacency_matrix(A)

    if not inplace:
        A = A.copy()

    # nb_parents = A_df.sum(axis=0)
    # nodes_with_many_parents = np.where(nb_parents.values > 1)[0]

    # Probably the fastest: rows reordered by priority; once the cumulative nb of parents
    # in a column is above 1, every further True is an extra parent to remove
    if is_ndarray:
        mask_parents_over_the_first = A[selection_order, :].cumsum(axis=0) > 1
        # un-shuffle the mask rows back to the original row order before applying it
        mask_parents_over_the_first = mask_parents_over_the_first[selection_order.argsort(), :]
        A[mask_parents_over_the_first] = False
    else:
        # From https://stackoverflow.com/a/47269413/7262247
        # NOTE(review): no un-shuffling here — this appears to rely on pandas aligning the
        # boolean-mask assignment on index labels rather than positions; confirm if changed
        mask_parents_over_the_first = A.iloc[selection_order, :].cumsum(axis=0) > 1
        A[mask_parents_over_the_first] = False

    # # Probably slower but easier to understand
    # # for each column we want to only keep one (or zero) row (parent) with True
    # for i in range(A_df.shape[1]):
    #     first_parent_idx = A_df.values[selection_order, i].argmax(axis=0)
    #     A_df.values[selection_order[first_parent_idx+1:], i] = False

    if not inplace:
        return A
def get_arcs_from_parents_indices(parents,           # type: Union[np.ndarray, pd.DataFrame]
                                  multiindex=False,  # type: bool
                                  names=False        # type: bool
                                  ):
    # type: (...) -> Union[Iterable[Tuple[int, int]], Iterable[Tuple[str, str]], Tuple[Iterable[int], Iterable[int]], Tuple[Iterable[str], Iterable[str]]]
    """
    Returns the arcs (parent, child) described by a "parents" structure.

    if multiindex = False ; returns a sequence of pairs : (9, 1), (3, 5), (9, 7)
    if multiindex = True ; returns two sequences of indices: (9, 3, 9), (1, 5, 7)

    :param parents: a 1D numpy array of parent indices (-1 for roots), or a dataframe with
        'idx' and 'name' columns as in the QDForest class
    :param multiindex: if True, a 2-tuple of sequences is returned instead of a sequence of 2-tuples
    :param names: if True, feature names are returned (dataframe input only) instead of indices
    :return:
    """
    from_df = not isinstance(parents, np.ndarray)
    if names:
        if not from_df:
            raise ValueError("Names are not available, this is a numpy array")
        # keep only the children that actually have a (named) parent
        named = parents.loc[parents['name'].notna(), 'name']
        pair = named.values, named.index
    else:
        # assume a dataframe with an 'idx' column as in QDForest class
        idx = parents['idx'].values if from_df else parents
        has_parent = idx >= 0
        pair = idx[has_parent], np.arange(len(idx))[has_parent]
    return pair if multiindex else zip(*pair)
def get_arcs_from_adjmat(A,                 # type: Union[np.ndarray, pd.DataFrame]
                         multiindex=False,  # type: bool
                         names=False        # type: bool
                         ):
    # type: (...) -> Union[Iterable[Tuple[int, int]], Iterable[Tuple[str, str]], Tuple[Iterable[int], Iterable[int]], Tuple[Iterable[str], Iterable[str]]]
    """
    Return the arcs of an adjacency matrix, an iterable of (parent, child) indices or names

    If 'multiindex' is True instead of returning an iterable of (parent, child), it returns a tuple of iterables
    (all the parents, all the childs).

    :param A:
    :param multiindex: if this is True, a 2-tuple of iterable is returned instead of an iterable of 2-tuples
    :param names: if False, indices are returned. Otherwise feature names are returned if any
    :return:
    """
    if not names:
        coords = np.where(A)
        return coords if multiindex else zip(*coords)

    if isinstance(A, np.ndarray):
        raise ValueError("Names are not available, this is a numpy array")

    # map positional coordinates back to the dataframe's column names
    cols = A.columns
    rows_idx, cols_idx = np.where(A)
    if multiindex:
        return ((cols[i] for i in l) for l in (rows_idx, cols_idx))  # noqa
    return ((cols[i], cols[j]) for i, j in zip(rows_idx, cols_idx))
def get_categorical_features(df_or_array,                   # type: Union[np.ndarray, pd.DataFrame]
                             non_categorical_mode="strict"  # type: str
                             ):
    # type: (...) -> Union[np.ndarray, pd.DataFrame]
    """
    Returns the subset of categorical features in `df_or_array`.

    :param df_or_array: a pandas DataFrame, or a (possibly structured) numpy array
    :param non_categorical_mode: "strict" (default) raises if any non-categorical column is
        present; "remove" silently drops them
    :return: a dataframe or array with the categorical features
    """
    assert_df_or_2D_array(df_or_array)

    if non_categorical_mode == "strict":
        strict_mode = True
    elif non_categorical_mode == "remove":
        strict_mode = False
    else:
        raise ValueError("Unsupported value for `non_categorical_mode`: %r" % non_categorical_mode)

    if isinstance(df_or_array, pd.DataFrame):
        # BUGFIX: pandas categorical dtype stringifies as "category", not "categorical";
        # the old list silently rejected真 categorical columns. "categorical" kept for safety.
        is_categorical_dtype = df_or_array.dtypes.astype(str).isin(["object", "category", "categorical"])
        if strict_mode and not is_categorical_dtype.all():
            raise ValueError("Provided dataframe columns contains non-categorical datatypes (dtype in 'object' or "
                             "'categorical'): found dtypes %r. This is not supported when `non_categorical_mode` is set to "
                             "`'strict'`" % df_or_array.dtypes[~is_categorical_dtype].to_dict())
        elif not is_categorical_dtype.any():
            raise ValueError("Provided dataframe columns do not contain any categorical datatype (dtype in 'object' or "
                             "'categorical'): found dtypes %r" % df_or_array.dtypes[~is_categorical_dtype].to_dict())
        return df_or_array.loc[:, is_categorical_dtype]

    elif isinstance(df_or_array, np.ndarray):
        # see https://numpy.org/doc/stable/user/basics.rec.html#manipulating-and-displaying-structured-datatypes
        if df_or_array.dtype.names is not None:
            # structured array: a field is categorical when its dtype is object
            is_categorical_dtype = np.array([str(df_or_array.dtype.fields[n][0]) == "object"
                                             for n in df_or_array.dtype.names])
            if strict_mode and not is_categorical_dtype.all():
                # BUGFIX: np.dtype has no `asdict()` method (the old code raised AttributeError
                # here); build the {name: dtype} mapping of offending fields by hand
                invalid_dtypes = {n: df_or_array.dtype.fields[n][0]
                                  for n, ok in zip(df_or_array.dtype.names, is_categorical_dtype)
                                  if not ok}
                raise ValueError("Provided numpy array columns contains non-categorical datatypes ('object' dtype): "
                                 "found dtypes %r. This is not supported when `non_categorical_mode` is set to "
                                 "`'strict'`" % invalid_dtypes)
            elif not is_categorical_dtype.any():
                raise ValueError(
                    "Provided dataframe columns do not contain any categorical datatype (dtype in 'object' or "
                    "'categorical'): found dtypes %r" % df_or_array.dtype.fields)
            categorical_names = np.array(df_or_array.dtype.names)[is_categorical_dtype]
            return df_or_array[categorical_names]
        else:
            # non-structured array: the whole array must be object-typed
            if df_or_array.dtype != np.dtype('O'):
                raise TypeError("Provided array columns are not object nor categorical: found dtype %r"
                                % df_or_array.dtype)
            return df_or_array
    else:
        # Should not happen since `assert_df_or_2D_array` is called upfront now.
        raise TypeError("Provided data is neither a pd.DataFrame nor a np.ndarray")
1231def assert_adjacency_matrix(A # type: Union[np.ndarray, pd.DataFrame]
1232 ):
1233 """Routine to check that A is a proper adjacency matrix"""
1235 if len(A.shape) != 2: 1235 ↛ 1236line 1235 didn't jump to line 1236, because the condition on line 1235 was never true
1236 raise ValueError("A is not a 2D adjacency matrix, its shape is %sD" % len(A.shape))
1238 if A.shape[0] != A.shape[1]: 1238 ↛ 1239line 1238 didn't jump to line 1239, because the condition on line 1238 was never true
1239 raise ValueError("A is not a 2D adjacency matrix: it is not square: %r" % A.shape)