This repository was archived by the owner on Oct 13, 2021. It is now read-only.

Commit ee2945c

Fix LSTM layer conversion in tf 2.x (#412)
* Fix the sequential-model and tf 2.2 issues.
* More adjustments.
* Fix the import issue.
* Remove the keras/tf2.0 combination.
* Experimental changes.
* Revert the experimental change.
* More reverts.
1 parent bab7030 commit ee2945c

File tree: 6 files changed, +82 / −40 lines changed
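
For context, a minimal sketch (not part of the commit) of the scenario this fix targets: converting a tf.keras Sequential LSTM model under TF 2.x and checking the result with ONNX Runtime. Layer sizes, input shapes, and tolerances here are illustrative.

import numpy as np
import tensorflow as tf
import keras2onnx
import onnxruntime

# In TF 2.x, tf.keras.layers.LSTM resolves to the recurrent_v2.LSTM class that
# this commit registers with the converter (see keras2onnx/ke2onnx/main.py below).
model = tf.keras.Sequential([
    tf.keras.layers.LSTM(8, input_shape=(4, 3)),
    tf.keras.layers.Dense(2),
])
onnx_model = keras2onnx.convert_keras(model, model.name)

# Compare Keras and ONNX Runtime outputs on a random input.
x = np.random.rand(1, 4, 3).astype(np.float32)
sess = onnxruntime.InferenceSession(onnx_model.SerializeToString())
onnx_out = sess.run(None, {sess.get_inputs()[0].name: x})[0]
np.testing.assert_allclose(model.predict(x), onnx_out, rtol=1e-2, atol=1e-4)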

.azure-pipelines/win32-conda-CI.yml

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ jobs:
       python.version: '3.6'
       ONNX_PATH: onnx==1.5.0
       KERAS: keras==2.2.5
-      TENSORFLOW_PATH: tensorflow==1.14.0
+      TENSORFLOW_PATH: tensorflow==1.15.0
       INSTALL_ORT: pip install onnxruntime==1.1.1
 
     Python37-tf200:

applications/nightly_build/test_transformers.py

Lines changed: 32 additions & 7 deletions
@@ -9,6 +9,7 @@
 import keras2onnx
 import json
 from os.path import dirname, abspath
+
 sys.path.insert(0, os.path.join(dirname(abspath(__file__)), '../../tests/'))
 from test_utils import run_onnx_runtime
 from keras2onnx.proto import is_tensorflow_older_than
@@ -18,7 +19,7 @@
 enable_transformer_test = True
 
 
-@unittest.skipIf(is_tensorflow_older_than('2.1.0') or not enable_transformer_test,
+@unittest.skipIf(not enable_transformer_test,
                  "Need enable transformer test before Transformers conversion.")
 class TestTransformers(unittest.TestCase):
 
@@ -38,6 +39,18 @@ def _prepare_inputs(self, tokenizer):
         inputs_onnx = {k_: v_.numpy() for k_, v_ in inputs.items()}
         return text, inputs, inputs_onnx
 
+    def test_3layer_gpt2(self):
+        from transformers import GPT2Config, TFGPT2Model, BertTokenizer
+        keras2onnx.proto.keras.backend.set_learning_phase(0)
+        config = GPT2Config(n_layer=3)
+        model = TFGPT2Model(config)
+        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+        text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
+        inputs = tokenizer.encode_plus(text, add_special_tokens=True, return_tensors='tf')
+        predictions = model.predict(inputs)
+        onnx_model = keras2onnx.convert_keras(model, model.name)
+        self.assertTrue(run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx, predictions, self.model_files))
+
     def test_TFBertModel(self):
         from transformers import BertTokenizer, TFBertModel
         pretrained_weights = 'bert-base-uncased'
@@ -56,7 +69,9 @@ def test_TFBertForPreTraining(self):
         model = TFBertForPreTraining.from_pretrained(pretrained_weights)
         predictions = model.predict(inputs)
         onnx_model = keras2onnx.convert_keras(model, model.name)
-        self.assertTrue(run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx, predictions, self.model_files, rtol=1.e-2, atol=1.e-4))
+        self.assertTrue(
+            run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx, predictions, self.model_files, rtol=1.e-2,
+                             atol=1.e-4))
 
     def test_TFBertForMaskedLM(self):
         from transformers import BertTokenizer, TFBertForMaskedLM
@@ -66,7 +81,9 @@ def test_TFBertForMaskedLM(self):
         model = TFBertForMaskedLM.from_pretrained(pretrained_weights)
         predictions = model.predict(inputs)
         onnx_model = keras2onnx.convert_keras(model, model.name)
-        self.assertTrue(run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx, predictions, self.model_files, rtol=1.e-2, atol=1.e-4))
+        self.assertTrue(
+            run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx, predictions, self.model_files, rtol=1.e-2,
+                             atol=1.e-4))
 
     def test_TFBertForNextSentencePrediction(self):
         from transformers import BertTokenizer, TFBertForNextSentencePrediction
@@ -146,7 +163,9 @@ def test_TFXLMModel(self):
         model = TFXLMModel.from_pretrained(pretrained_weights)
         predictions = model.predict(inputs)
         onnx_model = keras2onnx.convert_keras(model, model.name)
-        self.assertTrue(run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx, predictions, self.model_files, rtol=1.e-2, atol=1.e-4))
+        self.assertTrue(
+            run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx, predictions, self.model_files, rtol=1.e-2,
+                             atol=1.e-4))
 
     def test_TFXLMWithLMHeadModel(self):
         from transformers import XLMTokenizer, TFXLMWithLMHeadModel
@@ -156,7 +175,9 @@ def test_TFXLMWithLMHeadModel(self):
         model = TFXLMWithLMHeadModel.from_pretrained(pretrained_weights)
         predictions = model.predict(inputs)
         onnx_model = keras2onnx.convert_keras(model, model.name)
-        self.assertTrue(run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx, predictions, self.model_files, rtol=1.e-2, atol=1.e-4))
+        self.assertTrue(
+            run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx, predictions, self.model_files, rtol=1.e-2,
+                             atol=1.e-4))
 
     def test_TFXLMForSequenceClassification(self):
         from transformers import XLMTokenizer, TFXLMForSequenceClassification
@@ -196,7 +217,9 @@ def test_TFDistilBertForMaskedLM(self):
         model = TFDistilBertForMaskedLM.from_pretrained(pretrained_weights)
         predictions = model.predict(inputs)
         onnx_model = keras2onnx.convert_keras(model, model.name)
-        self.assertTrue(run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx, predictions, self.model_files, rtol=1.e-2, atol=1.e-4))
+        self.assertTrue(
+            run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx, predictions, self.model_files, rtol=1.e-2,
+                             atol=1.e-4))
 
     def test_TFDistilBertForSequenceClassification(self):
         from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
@@ -246,7 +269,9 @@ def test_TFRobertaForMaskedLM(self):
         model = TFRobertaForMaskedLM.from_pretrained(pretrained_weights)
         predictions = model.predict(inputs)
         onnx_model = keras2onnx.convert_keras(model, model.name)
-        self.assertTrue(run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx, predictions, self.model_files, rtol=1.e-2, atol=1.e-4))
+        self.assertTrue(
+            run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx, predictions, self.model_files, rtol=1.e-2,
+                             atol=1.e-4))
 
     def test_TFRobertaForSequenceClassification(self):
         from transformers import RobertaTokenizer, TFRobertaForSequenceClassification

keras2onnx/_graph_cvt.py

Lines changed: 17 additions & 9 deletions
@@ -494,7 +494,7 @@ def _save_placeholder(node_name, dtype):
       # Get dtype and data for non-variable Placeholders (ex. values for 1.X
       # Const ops that are loaded as Placeholders in 2.0)
       _save_placeholder(node.name, node.attr["dtype"])
-    elif node.op in ["ReadVariableOp", "ResourceGather", "AssignSubVariableOp"]:
+    elif node.op in ["ReadVariableOp", "ResourceGather", "ResourceGatherNd", "AssignSubVariableOp"]:
       # Get dtype and data for Placeholder ops associated with ReadVariableOp
       # and ResourceGather ops. There can be an Identity in between the
       # resource op and Placeholder. Store the dtype for the Identity ops.
@@ -532,12 +532,12 @@ def _save_placeholder(node_name, dtype):
       _populate_identity_op(output_node, input_node)
     # Convert ResourceGather to Gather ops with a Const axis feeding into it.
     elif input_node.op == "AssignSubVariableOp":
-        output_node.op = "Sub"
-        output_node.name = input_node.name
-        output_node.input.extend(input_node.input)
-        output_node.attr["T"].CopyFrom(input_node.attr["dtype"])
-        if "_class" in input_node.attr:
-          output_node.attr["_class"].CopyFrom(input_node.attr["_class"])
+      output_node.op = "Sub"
+      output_node.name = input_node.name
+      output_node.input.extend(input_node.input)
+      output_node.attr["T"].CopyFrom(input_node.attr["dtype"])
+      if "_class" in input_node.attr:
+        output_node.attr["_class"].CopyFrom(input_node.attr["_class"])
     elif input_node.op == "ResourceGather":
       if input_node.attr["batch_dims"].i != 0:
         raise ValueError("batch_dims != 0 is not supported by freeze_graph.")
@@ -557,6 +557,15 @@ def _save_placeholder(node_name, dtype):
         output_node.attr["Taxis"].CopyFrom(axis_dtype)
       if "_class" in input_node.attr:
         output_node.attr["_class"].CopyFrom(input_node.attr["_class"])
+    elif input_node.op == "ResourceGatherNd":
+      output_node.op = "GatherNd"
+      output_node.name = input_node.name
+      output_node.input.extend(
+          [input_node.input[0], input_node.input[1]])
+      output_node.attr["Tparams"].CopyFrom(input_node.attr["dtype"])
+      output_node.attr["Tindices"].CopyFrom(input_node.attr["Tindices"])
+      if "_class" in input_node.attr:
+        output_node.attr["_class"].CopyFrom(input_node.attr["_class"])
     # Update the function names and argument types for the conditional ops.
     elif input_node.op in _CONDITIONAL_OPS:
       _populate_if_op(output_node, input_node, function_data)
@@ -625,5 +634,4 @@ def _save_placeholder(node_name, dtype):
       output_node.input[idx] = input_name
 
   output_graph_def.versions.CopyFrom(graph_def.versions)
-  return _construct_concrete_function(func, output_graph_def,
-                                      converted_input_indices)
+  return output_graph_def, converted_input_indices
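
ResourceGatherNd is the resource-variable form of GatherNd, so once the backing variable has been frozen to a constant the op reduces to a plain tf.gather_nd, which is what the rewrite above relies on. A small standalone illustration (values arbitrary):

import tensorflow as tf

params = tf.constant([[1., 2.], [3., 4.]])
indices = tf.constant([[1, 0], [0, 1]])
# gather_nd selects params[1][0] and params[0][1].
print(tf.gather_nd(params, indices).numpy())  # [3. 2.]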

keras2onnx/_parse_tf.py

Lines changed: 3 additions & 22 deletions
@@ -204,37 +204,18 @@ def build_layer_outputs(model, graph, outputs):
     return output_dict
 
 
-TF_GRAPH_OPTIMIZATION = False
-
-
 def extract_outputs_from_subclassing_model(model, output_dict, output_names):
-    from tensorflow.core.protobuf import config_pb2
     from tensorflow.python.keras.saving import saving_utils as _saving_utils
-    from tensorflow.lite.python.util import run_graph_optimizations as _run_graph_optimizations
+    from tensorflow.python.util import object_identity
     from ._graph_cvt import convert_variables_to_constants_v2 as _convert_to_constants
 
     function = _saving_utils.trace_model_call(model)
     concrete_func = function.get_concrete_function()
     output_names.extend([ts_.name for ts_ in concrete_func.outputs])
     output_dict.update(build_layer_outputs(model, concrete_func.graph, concrete_func.outputs))
-    frozen_func = _convert_to_constants(
+    graph_def, converted_input_indices = _convert_to_constants(
         concrete_func, lower_control_flow=True)
-    graph_def = frozen_func.graph.as_graph_def()
-    if TF_GRAPH_OPTIMIZATION:
-        input_tensors = [
-            tensor for tensor in frozen_func.inputs
-            if tensor.dtype != tf.dtypes.resource
-        ]
-        output_tensors = frozen_func.outputs
-        config = config_pb2.ConfigProto()
-        rewrite_options = config.graph_options.rewrite_options
-        rewrite_options.constant_folding = rewrite_options.ON
-        graph_def = _run_graph_optimizations(
-            graph_def,
-            input_tensors,
-            output_tensors,
-            config=config,
-            graph=frozen_func.graph)
+
     with tf.Graph().as_default() as tf_graph:
         tf.import_graph_def(graph_def, name='')

keras2onnx/ke2onnx/main.py

Lines changed: 2 additions & 0 deletions
@@ -216,6 +216,8 @@ def convert_keras_training_only_layer(scope, operator, container):
 
 if is_tf_keras and is_tf2:
     keras_layer_to_operator.update({
+        _layer.recurrent_v2.GRU: convert_keras_gru,
+        _layer.recurrent_v2.LSTM: convert_keras_lstm,
         _layer.normalization_v2.BatchNormalization: convert_keras_batch_normalization,
     })
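
On TF 2.x under tf.keras, the LSTM and GRU layer classes come from recurrent_v2, not the classic keras.layers recurrent module, so without these two entries the layer-to-converter lookup presumably missed them; this registration is the core of the fix named in the commit title. A quick check of which class a layer resolves to (a sketch):

import tensorflow as tf
from tensorflow.python.keras.layers import recurrent_v2

layer = tf.keras.layers.LSTM(4)
print(isinstance(layer, recurrent_v2.LSTM))  # True on TF 2.x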

keras2onnx/parser.py

Lines changed: 27 additions & 1 deletion
@@ -605,11 +605,15 @@ def _parse_graph_core(graph, keras_node_dict, topology, top_scope, output_names):
 
 def _sorted_inputs(nodelist, outputs, inputs_set):
     inputs = []
-    node_set = set(nodelist)
+    node_set = frozenset(nodelist)
+    visited = set()
 
     def travel(node):
         for in_ts_ in node.inputs:
             op_node = in_ts_.op
+            if op_node in visited:
+                continue
+            visited.add(op_node)
             if (op_node in inputs_set) and (op_node not in inputs):
                 inputs.append(op_node)
             elif op_node in node_set:
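
The new visited set memoizes the traversal: ops in a TF graph commonly feed several consumers, and without memoization the recursive travel can re-expand shared subgraphs over and over (exponentially in the worst case), while with it each node is expanded once. A generic illustration of the pattern with a hypothetical Node type, not keras2onnx code:

class Node:
    """Toy stand-in for a TF op with upstream inputs."""
    def __init__(self, name, inputs=()):
        self.name, self.inputs = name, list(inputs)

def travel(node, visited=None, order=None):
    """Depth-first walk that expands each upstream node exactly once."""
    visited = set() if visited is None else visited
    order = [] if order is None else order
    for parent in node.inputs:
        if parent in visited:   # memoized: skip nodes already expanded
            continue
        visited.add(parent)
        order.append(parent.name)
        travel(parent, visited, order)
    return order

shared = Node('shared')
root = Node('root', [Node('a', [shared]), Node('b', [shared])])
print(travel(root))  # ['a', 'shared', 'b'] -- 'shared' expanded once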
@@ -715,6 +719,28 @@ def _parse_graph_core_v2(graph, keras_node_dict, topology, top_scope, output_names):
     return topology
 
 
+def parse_graph_modeless(topo, graph, target_opset, input_names, output_names, keras_node_dict):
+    top_level = topo.declare_scope('__root')
+    input_tensors = [graph.get_tensor_by_name(n_) for n_ in input_names]
+    output_tensors = [graph.get_tensor_by_name(n_) for n_ in output_names]
+
+    for ts_i_ in input_tensors:
+        var_type = _adjust_input_batch_size(infer_variable_type(ts_i_, target_opset))
+        str_value = ts_i_.name
+        top_level.get_local_variable_or_declare_one(str_value, var_type)
+        topo.raw_model.add_input_name(str_value)
+
+    for ts_o_ in output_tensors:
+        var_type = _adjust_input_batch_size(infer_variable_type(ts_o_, target_opset))
+        str_value = ts_o_.name
+        top_level.get_local_variable_or_declare_one(str_value, var_type)
+        topo.raw_model.add_output_name(str_value)
+
+    return _parse_graph_core_v2(
+        graph, keras_node_dict, topo, top_level, output_names
+    )
+
+
 def parse_graph(topo, graph, target_opset, output_names, keras_node_dict):
     # type: (Topology, tf.Graph, int, [], []) -> Topology
     """
