Commit 9de529c

Browse files
Fix query that hangs after do-intervention (#173)
* trying to reproduce the hanging error
* first iteration to handle a graph split by do-intervention; tests needed
* reverted to develop as only commented
* added function docstrings and typing
* first attempt at tests
* fixing tests
* flake
* Speed up _create_node_functions by taking the first element using next()
* Use next(iter(x)) to get the first element
* first iteration to address PR comments and discussion: adds default marginals and returns upstream marginals from the default ones rather than NaNs
* removing nan import
* setting default marginals with query()
* lint changes
* removed Jupyter notebook file from git
* lint changes
* latest modifications
* added my info and fix updates
* first attempt at PR comment to avoid duplicate call to obtain parents of node
* PR comment: avoid duplicate call to get node parents
* fixing lint
* Refactor _remove_disconnected_nodes() and tidy up code and docstrings
* Add edges one by one (instead of constructing an edge list) to make graph construction faster
* Linting
* last PR comments
* Shift add_node() inside the loop for _remove_disconnected_nodes

Co-authored-by: oentaryorj <[email protected]>
1 parent 3efb04a commit 9de529c
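Two of the log entries above swap `list(x)[0]` for `next(iter(x))` when only the first element of a dict view is needed. A minimal sketch (not part of the commit itself) of the idiom:

```python
# next(iter(...)) fetches the first element of a dict view lazily,
# without materializing the whole view into a throwaway list.
states = {"on": 0.2, "off": 0.8}

first_slow = list(states.values())[0]     # O(n): builds a full list first
first_fast = next(iter(states.values()))  # O(1): reads a single element

assert first_slow == first_fast == 0.2
```

Both expressions return the same value; the iterator form simply avoids the intermediate list, which matters when `_create_node_functions` runs over many large CPDs.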

File tree

4 files changed: +193 −29 lines

RELEASE.md (5 additions, 1 deletion)

@@ -5,6 +5,8 @@
 * Fix broken URLs in FAQ documentation, as per #113 and #125
 * Add a link to `PyGraphviz` installation guide under the installation prerequisites
 * Fix integer index type checking for timeseries data, as per #74 and #86
+* Fix infinite loop at `.query()` after a `.do_intervention()` that splits
+  the graph into two or more subgraphs, as per #100, #45
 
 # Release 0.10.0
 * Add supervised discretisation strategies using Decision Tree and MDLP algorithms.
@@ -104,6 +106,8 @@ The initial release of CausalNex.
 
 ## Thanks for supporting contributions
 CausalNex was originally designed by [Paul Beaumont](https://www.linkedin.com/in/pbeaumont/) and [Ben Horsburgh](https://www.linkedin.com/in/benhorsburgh/) to solve challenges they faced in inferencing causality in their project work. This work was later turned into a product thanks to the following contributors:
-[Yetunde Dada](https://github.com/yetudada), [Wesley Leong](https://www.linkedin.com/in/wesleyleong/), [Steve Ler](https://www.linkedin.com/in/song-lim-steve-ler-380366106/), [Viktoriia Oliinyk](https://www.linkedin.com/in/victoria-oleynik/), [Roxana Pamfil](https://www.linkedin.com/in/roxana-pamfil-1192053b/), [Nisara Sriwattanaworachai](https://www.linkedin.com/in/nisara-sriwattanaworachai-795b357/), [Nikolaos Tsaousis](https://www.linkedin.com/in/ntsaousis/), [Angel Droth](https://www.linkedin.com/in/angeldroth/), [Zain Patel](https://www.linkedin.com/in/zain-patel/), [Richard Oentaryo](https://www.linkedin.com/in/oentaryo/), and [Shuhei Ishida](https://www.linkedin.com/in/shuhei-i/).
+[Yetunde Dada](https://github.com/yetudada), [Wesley Leong](https://www.linkedin.com/in/wesleyleong/), [Steve Ler](https://www.linkedin.com/in/song-lim-steve-ler-380366106/), [Viktoriia Oliinyk](https://www.linkedin.com/in/victoria-oleynik/), [Roxana Pamfil](https://www.linkedin.com/in/roxana-pamfil-1192053b/), [Nisara Sriwattanaworachai](https://www.linkedin.com/in/nisara-sriwattanaworachai-795b357/), [Nikolaos Tsaousis](https://www.linkedin.com/in/ntsaousis/), [Angel Droth](https://www.linkedin.com/in/angeldroth/), [Zain Patel](https://www.linkedin.com/in/zain-patel/), [Richard Oentaryo](https://www.linkedin.com/in/oentaryo/),
+[Shuhei Ishida](https://www.linkedin.com/in/shuhei-i/), and [Francesca
+Sogaro](https://www.linkedin.com/in/francesca-sogaro/).
 
 CausalNex would also not be possible without the generous sharing from leading researches in the field of causal inference and we are grateful to everyone who advised and supported us, filed issues or helped resolve them, asked and answered questions or simply be part of inspiring discussions.

causalnex/inference/inference.py (85 additions, 28 deletions)

@@ -30,13 +30,13 @@
 
 ``InferenceEngine`` provides tools to make inferences based on interventions and observations.
 """
-
 import copy
 import inspect
 import re
 import types
 from typing import Callable, Dict, Hashable, List, Tuple, Union
 
+import networkx as nx
 import pandas as pd
 from pathos import multiprocessing
 
@@ -91,7 +91,7 @@ class InferenceEngine:
 
     def __init__(self, bn: BayesianNetwork):
         """
-        Create a new ``InferenceEngine`` from an existing ``BayesianNetwork``.
+        Creates a new ``InferenceEngine`` from an existing ``BayesianNetwork``.
 
         It is expected that structure and probability distribution has already been learned
         for the ``BayesianNetwork`` that is to be used for inference.
@@ -104,8 +104,8 @@ def __init__(self, bn: BayesianNetwork):
            ValueError: if the Bayesian Network contains isolates, or if a variable name is invalid,
                or if the CPDs have not been learned yet.
        """
-
        bad_nodes = [node for node in bn.nodes if not re.match("^[0-9a-zA-Z_]+$", node)]
+
        if bad_nodes:
            raise ValueError(
                "Variable names must match ^[0-9a-zA-Z_]+$ - please fix the "
@@ -119,16 +119,20 @@ def __init__(self, bn: BayesianNetwork):
            )
 
        self._cpds = None
+        self._upstream_cpds = {}
 
        self._create_cpds_dict_bn(bn)
        self._generate_domains_bn(bn)
        self._generate_bbn()
 
+        # TODO: can we do it without a query() call?  # pylint: disable=fixme
+        self._default_marginals = self.query()
+
    def _single_query(
        self, observations: Dict[str, Hashable] = None
    ) -> Dict[str, Dict[Hashable, float]]:
        """
-        Query the ``BayesianNetwork`` for marginals given some observations.
+        Queries the ``BayesianNetwork`` for marginals given some observations.
 
        Args:
            observations: observed states of nodes in the Bayesian Network.
@@ -139,15 +143,19 @@ def _single_query(
            A dictionary of marginal probabilities of the network.
            For instance, :math:`P(a=1) = 0.3, P(a=2) = 0.7` -> {a: {1: 0.3, 2: 0.7}}
        """
-
        bbn_results = (
            self._bbn.query(**observations) if observations else self._bbn.query()
        )
-
        results = {node: dict() for node in self._cpds}
+
        for (node, state), prob in bbn_results.items():
            results[node][state] = prob
 
+        # the upstream nodes are set to the default marginals based on the
+        # original cpds of the bn
+        for detached_node in self._upstream_cpds:
+            results[detached_node] = self._default_marginals[detached_node]
+
        return results
 
    def query(
@@ -159,7 +167,7 @@ def query(
        Dict[str, Dict[Hashable, float]], List[Dict[str, Dict[Hashable, float]]]
    ]:
        """
-        Query the ``BayesianNetwork`` for marginals given one or more observations.
+        Queries the ``BayesianNetwork`` for marginals given one or more observations.
 
        Args:
            observations: one or more observations of states of nodes in the Bayesian Network.
@@ -170,21 +178,21 @@ def query(
        Returns:
            A dictionary or a list of dictionaries of marginal probabilities of the network.
        """
-
        if isinstance(observations, dict) or observations is None:
            return self._single_query(observations)
+
        result = []
+
        if parallel:
            with multiprocessing.Pool(num_cores) as p:
                result = p.map(self._single_query, observations)
-
        else:
            for obs in observations:
                result.append(self._single_query(obs))
 
        return result
 
-    def _do(self, observation: str, state: Dict[Hashable, float]) -> None:
+    def _do(self, observation: str, state: Dict[Hashable, float]):
        """
        Makes an intervention on the Bayesian Network.
 
@@ -215,10 +223,12 @@ def _do(self, observation: str, state: Dict[Hashable, float]) -> None:
        self._cpds[observation] = {s: {(): p} for s, p in state.items()}
 
    def do_intervention(
-        self, node: str, state: Union[Hashable, Dict[Hashable, float]] = None
-    ) -> None:
+        self,
+        node: str,
+        state: Union[Hashable, Dict[Hashable, float]] = None,
+    ):
        """
-        Make an intervention on the Bayesian Network.
+        Makes an intervention on the Bayesian Network.
 
        For instance,
        `do_intervention('X', 'x')` will set :math:`P(X=x)` to 1, and :math:`P(X=y)` to 0
@@ -245,36 +255,45 @@ def do_intervention(
            state = {s: float(s == state) for s in self._cpds[node]}
 
        self._do(node, state)
+
+        # check for presence of separate subgraph after do-intervention
+        self._remove_disconnected_nodes(node)
        self._generate_bbn()
 
-    def reset_do(self, observation: str) -> None:
+    def reset_do(self, observation: str):
        """
        Resets any do_interventions that have been applied to the observation.
 
        Args:
            observation: observation that will be reset.
        """
-
        self._cpds[observation] = self._cpds_original[observation]
+
+        for upstream_node, original_cpds in self._upstream_cpds.items():
+            self._cpds[upstream_node] = original_cpds
+
+        self._upstream_cpds = {}
        self._generate_bbn()
 
    def _generate_bbn(self):
-        """Re-create the _bbn."""
+        """Re-creates the _bbn."""
        self._node_functions = self._create_node_functions()
-
        self._bbn = build_bbn(
            list(self._node_functions.values()), domains=self._domains
        )
 
-    def _generate_domains_bn(self, bn):
-
+    def _generate_domains_bn(self, bn: BayesianNetwork):
+        """Generates domains from Bayesian network"""
        self._domains = {
            variable: list(cpd.index.values) for variable, cpd in bn.cpds.items()
        }
 
-    def _create_cpds_dict_bn(self, bn: BayesianNetwork) -> None:
+    def _create_cpds_dict_bn(self, bn: BayesianNetwork):
        """
-        Map CPDs in the ``BayesianNetwork`` to required format:
+        Maps CPDs in the ``BayesianNetwork`` to required format:
+
+        Args:
+            bn: Bayesian network
 
        >>> {"observation":
        >>>     {"state":
@@ -292,7 +311,6 @@ def _create_cpds_dict_bn(self, bn: BayesianNetwork) -> None:
        >>>     }
        >>> }
        """
-
        lookup = {
            variable: {
                state: {
@@ -305,7 +323,6 @@ def _create_cpds_dict_bn(self, bn: BayesianNetwork) -> None:
            }
            for variable, cpd in bn.cpds.items()
        }
-
        self._cpds = lookup
        self._cpds_original = copy.deepcopy(self._cpds)
 
@@ -349,26 +366,66 @@ def template() -> float:
            code.co_cellvars,
        )
        template.__name__ = name
-
        return template
 
    def _create_node_functions(self) -> Dict[str, Callable]:
-        """Creates all functions required to create a ``BayesianNetwork``."""
+        """
+        Creates all functions required to create a ``BayesianNetwork``.
 
+        Returns:
+            Dictionary of node functions
+        """
        node_functions = dict()
 
        for node, states in self._cpds.items():
            # since we only need condition names, which are consistent across all states,
            # then we can inspect the 0th element
-            states_conditions = list(states.values())[0]
+            states_conditions = next(iter(states.values()))
 
            # take any state, and get its conditions
-            state_conditions = list(states_conditions.items())[0]
-            condition_nodes = [n for n, v in state_conditions[0]]
+            state_conditions = next(iter(states_conditions.keys()))
+            condition_nodes = [n for n, v in state_conditions]
 
            node_args = tuple([node] + condition_nodes)  # type: Tuple[str]
            function_name = "f_{node}".format(node=node)
            node_function = self._create_node_function(function_name, node_args)
            node_functions[node] = node_function
 
        return node_functions
+
+    def _remove_disconnected_nodes(self, var: str):
+        """
+        Identifies and removes from the _cpds the nodes of the bbn which are
+        part of one or more upstream subgraphs that could have been formed
+        after a do-intervention.
+
+        Uses the attribute _cpds to determine the parents of each node.
+        Leverages the networkx `weakly_connected_components` method to
+        identify the subgraphs.
+
+        For instance, the network A -> B -> C -> D -> E would be split into
+        two sub networks (A -> B) and (C -> D -> E) if we intervene on
+        node C.
+
+        Args:
+            var: variable we have intervened on
+        """
+        # construct graph from CPDs
+        g = nx.DiGraph()
+
+        # add nodes as there could be isolates (e.g. A->B->C intervening on B
+        # makes A an isolate)
+        for node, states in self._cpds.items():
+            sample_state = next(iter(states.values()))
+            parents = next(iter(sample_state.keys()))
+            g.add_node(node)
+
+            for parent, _ in parents:
+                g.add_edge(parent, node)
+
+        # remove nodes in subgraphs which do not contain the intervention node
+        for sub_graph in nx.weakly_connected_components(g):
+            if var not in sub_graph:
+                for node in sub_graph:
+                    self._upstream_cpds[node] = self._cpds[node]
+                    self._cpds.pop(node)
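The subgraph detection that `_remove_disconnected_nodes` performs can be exercised standalone. A minimal sketch (assuming only `networkx`, not the CausalNex internals) of how a do-intervention on `c` splits the chain `a -> b -> c -> d -> e`:

```python
import networkx as nx

# A do-intervention on "c" makes its CPD unconditional, which severs the
# edge b -> c and leaves two weakly connected components: (a -> b) and
# (c -> d -> e).
g = nx.DiGraph()
for node in "abcde":
    g.add_node(node)  # added explicitly, since an intervention can create isolates
for parent, child in [("a", "b"), ("c", "d"), ("d", "e")]:  # b -> c is gone
    g.add_edge(parent, child)

intervention_node = "c"
upstream = set()
for component in nx.weakly_connected_components(g):
    if intervention_node not in component:
        upstream |= component  # these nodes' CPDs get stashed in _upstream_cpds

print(sorted(upstream))  # prints ['a', 'b']
```

Weak connectivity (ignoring edge direction) is the right notion here: the upstream component `(a -> b)` carries no information to the intervened subgraph in either direction, which is why its nodes can be dropped from `_cpds` and answered from the default marginals instead.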

tests/conftest.py (34 additions, 0 deletions)

@@ -1070,3 +1070,37 @@ def iris_edge_list():
     ]
 
     return edge_list
+
+
+@pytest.fixture
+def chain_network() -> BayesianNetwork:
+    """
+    This Bayesian Model structure to test do interventions that split graph
+    into subgraphs.
+
+    a → b → c → d → e
+
+    """
+
+    n = 50
+    nodes_names = list("abcde")
+    random_binary_matrix = (
+        np.random.randint(10, size=(n, len(nodes_names))) > 6
+    ).astype(int)
+    df = pd.DataFrame(data=random_binary_matrix, columns=nodes_names)
+
+    model = StructureModel()
+    model.add_edges_from(
+        [
+            ("a", "b"),
+            ("b", "c"),
+            ("c", "d"),
+            ("d", "e"),
+        ]
+    )
+
+    chain_bn = BayesianNetwork(model)
+    chain_bn = chain_bn.fit_node_states(df)
+    chain_bn = chain_bn.fit_cpds(df, method="BayesianEstimator", bayes_prior="K2")
+
+    return chain_bn

tests/test_inference.py (69 additions, 0 deletions)

@@ -408,3 +408,72 @@ def test_multi_query(self, bn):
         assert single_0 == results_parallel[0]
         assert single_1 == results_parallel[1]
         assert single_2 == results_parallel[2]
+
+    def test_query_after_do_intervention_has_split_graph(self, chain_network):
+        """
+        chain network: a → b → c → d → e
+
+        test 1.
+        - do intervention on node c generates 2 graphs (a → b) and (c → d → e)
+        - assert the query can be run (it used to hang before)
+        - assert reset_do works
+        """
+        ie = InferenceEngine(chain_network)
+        original_margs = ie.query()
+
+        var = "c"
+        state_dict = {0: 1.0, 1: 0.0}
+        ie.do_intervention(var, state_dict)
+        # assert the intervention node has indeed the right state
+        assert ie.query()[var][0] == state_dict[0]
+        assert ie.query()[var][1] == state_dict[1]
+
+        # assert the upstream nodes have the default marginals (no info
+        # propagates in the upstream graph)
+        assert ie.query()["a"][0] == original_margs["a"][0]
+        assert ie.query()["a"][1] == original_margs["a"][1]
+        assert ie.query()["b"][0] == original_margs["b"][0]
+        assert ie.query()["b"][1] == original_margs["b"][1]
+
+        # assert the _cpds of the upstream nodes are stored correctly
+        orig_cpds = ie._cpds_original  # pylint: disable=protected-access
+        upstream_cpds = ie._upstream_cpds  # pylint: disable=protected-access
+        assert orig_cpds["a"] == upstream_cpds["a"]
+        assert orig_cpds["b"] == upstream_cpds["b"]
+
+        ie.reset_do(var)
+        reset_margs = ie.query()
+
+        for node in original_margs.keys():
+            dict_left = original_margs[node]
+            dict_right = reset_margs[node]
+            for (kl, kr) in zip(dict_left.keys(), dict_right.keys()):
+                assert math.isclose(dict_left[kl], dict_right[kr])
+
+        # repeating above tests intervening on b, so that there is one single
+        # isolate
+        var_b = "b"
+        state_dict_b = {0: 1.0, 1: 0.0}
+        ie.do_intervention(var_b, state_dict_b)
+        # assert the intervention node has indeed the right state
+        assert ie.query()[var_b][0] == state_dict[0]
+        assert ie.query()[var_b][1] == state_dict[1]
+
+        # assert the upstream nodes have the default marginals (no info
+        # propagates in the upstream graph)
+        assert ie.query()["a"][0] == original_margs["a"][0]
+        assert ie.query()["a"][1] == original_margs["a"][1]
+
+        # assert the _cpds of the upstream nodes are stored correctly
+        orig_cpds = ie._cpds_original  # pylint: disable=protected-access
+        upstream_cpds = ie._upstream_cpds  # pylint: disable=protected-access
+        assert orig_cpds["a"] == upstream_cpds["a"]
+
+        ie.reset_do(var_b)
+        reset_margs = ie.query()
+
+        for node in original_margs.keys():
+            dict_left = original_margs[node]
+            dict_right = reset_margs[node]
+            for (kl, kr) in zip(dict_left.keys(), dict_right.keys()):
+                assert math.isclose(dict_left[kl], dict_right[kr])
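The marginal-comparison loop the test repeats twice can be condensed into a small helper. A sketch with hypothetical data (the helper and the sample dicts are illustrative, not part of the commit):

```python
import math

def marginals_close(left, right, rel_tol=1e-9):
    """Compare two {node: {state: probability}} marginal dicts entry by entry."""
    if left.keys() != right.keys():
        return False
    return all(
        math.isclose(left[node][state], right[node][state], rel_tol=rel_tol)
        for node in left
        for state in left[node]
    )

before = {"a": {0: 0.3, 1: 0.7}, "b": {0: 0.5, 1: 0.5}}
after_reset = {"a": {0: 0.3, 1: 0.7}, "b": {0: 0.5, 1: 0.5}}
assert marginals_close(before, after_reset)
```

Indexing by key is slightly stricter than the test's `zip` over the two key views, which pairs states positionally and would silently compare mismatched states if the dicts ever disagreed on ordering.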
