Skip to content

Commit 04d6f58

Browse files
Modified files to build for Databricks Runtime 11.3 LTS-compliant versions
1 parent 7f40a79 commit 04d6f58

9 files changed

+44
-28
lines changed

dbldatagen/column_generation_spec.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ class ColumnGenerationSpec(SerializableToDict):
9595
# restrict spurious messages from java gateway
9696
logging.getLogger("py4j").setLevel(logging.WARNING)
9797

98-
def __init__(self, name, colType=None, minValue=0, maxValue=None, step=1, prefix='', random=False,
98+
def __init__(self, name, colType=None, *, minValue=0, maxValue=None, step=1, prefix='', random=False,
9999
distribution=None, baseColumn=None, randomSeed=None, randomSeedMethod=None,
100100
implicit=False, omit=False, nullable=True, debug=False, verbose=False,
101101
seedColumnName=DEFAULT_SEED_COLUMN,
@@ -529,18 +529,22 @@ def _setup_logger(self):
529529
else:
530530
self.logger.setLevel(logging.WARNING)
531531

532-
def _computeAdjustedRangeForColumn(self, colType, c_min, c_max, c_step, c_begin, c_end, c_interval, c_range,
532+
def _computeAdjustedRangeForColumn(self, colType, c_min, c_max, c_step, *, c_begin, c_end, c_interval, c_range,
533533
c_unique):
534534
"""Determine adjusted range for data column
535535
"""
536536
assert colType is not None, "`colType` must be non-None instance"
537537

538538
if type(colType) is DateType or type(colType) is TimestampType:
539-
return self._computeAdjustedDateTimeRangeForColumn(colType, c_begin, c_end, c_interval, c_range, c_unique)
539+
return self._computeAdjustedDateTimeRangeForColumn(colType, c_begin, c_end, c_interval,
540+
c_range=c_range,
541+
c_unique=c_unique)
540542
else:
541-
return self._computeAdjustedNumericRangeForColumn(colType, c_min, c_max, c_step, c_range, c_unique)
543+
return self._computeAdjustedNumericRangeForColumn(colType, c_min, c_max, c_step,
544+
c_range=c_range,
545+
c_unique=c_unique)
542546

543-
def _computeAdjustedNumericRangeForColumn(self, colType, c_min, c_max, c_step, c_range, c_unique):
547+
def _computeAdjustedNumericRangeForColumn(self, colType, c_min, c_max, c_step, *, c_range, c_unique):
544548
"""Determine adjusted range for data column
545549
546550
Rules:
@@ -589,7 +593,7 @@ def _computeAdjustedNumericRangeForColumn(self, colType, c_min, c_max, c_step, c
589593

590594
return result
591595

592-
def _computeAdjustedDateTimeRangeForColumn(self, colType, c_begin, c_end, c_interval, c_range, c_unique):
596+
def _computeAdjustedDateTimeRangeForColumn(self, colType, c_begin, c_end, c_interval, *, c_range, c_unique):
593597
"""Determine adjusted range for Date or Timestamp data column
594598
"""
595599
effective_begin, effective_end, effective_interval = None, None, None
@@ -656,7 +660,7 @@ def _getUniformRandomSQLExpression(self, col_name):
656660
else:
657661
return "rand()"
658662

659-
def _getScaledIntSQLExpression(self, col_name, scale, base_columns, base_datatypes=None, compute_method=None,
663+
def _getScaledIntSQLExpression(self, col_name, scale, base_columns, *, base_datatypes=None, compute_method=None,
660664
normalize=False):
661665
""" Get scaled numeric expression
662666

dbldatagen/data_analyzer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ def _displayRow(self, row):
9292

9393
return ", ".join(results)
9494

95-
def _addMeasureToSummary(self, measureName, summaryExpr="''", fieldExprs=None, dfData=None, rowLimit=1,
95+
def _addMeasureToSummary(self, measureName, *, summaryExpr="''", fieldExprs=None, dfData=None, rowLimit=1,
9696
dfSummary=None):
9797
""" Add a measure to the summary dataframe
9898
@@ -340,7 +340,7 @@ def _generatorDefaultAttributesFromType(cls, sqlType, colName=None, dataSummary=
340340
return result
341341

342342
@classmethod
343-
def _scriptDataGeneratorCode(cls, schema, dataSummary=None, sourceDf=None, suppressOutput=False, name=None):
343+
def _scriptDataGeneratorCode(cls, schema, *, dataSummary=None, sourceDf=None, suppressOutput=False, name=None):
344344
"""
345345
Generate outline data generator code from an existing dataframe
346346

dbldatagen/data_generator.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ class DataGenerator(SerializableToDict):
7676

7777
# logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.NOTSET)
7878

79-
def __init__(self, sparkSession=None, name=None, randomSeedMethod=None,
79+
def __init__(self, sparkSession=None, name=None, *, randomSeedMethod=None,
8080
rows=1000000, startingId=0, randomSeed=None, partitions=None, verbose=False,
8181
batchSize=None, debug=False, seedColumnName=DEFAULT_SEED_COLUMN,
8282
random=False,
@@ -782,7 +782,7 @@ def _checkColumnOrColumnList(self, columns, allowId=False):
782782
f" column `{columns}` must refer to defined column")
783783
return True
784784

785-
def withColumnSpec(self, colName, minValue=None, maxValue=None, step=1, prefix=None,
785+
def withColumnSpec(self, colName, *, minValue=None, maxValue=None, step=1, prefix=None,
786786
random=None, distribution=None,
787787
implicit=False, dataRange=None, omit=False, baseColumn=None, **kwargs):
788788
""" add a column specification for an existing column
@@ -842,7 +842,7 @@ def hasColumnSpec(self, colName):
842842
"""
843843
return colName in self._columnSpecsByName
844844

845-
def withColumn(self, colName, colType=StringType(), minValue=None, maxValue=None, step=1,
845+
def withColumn(self, colName, colType=StringType(), *, minValue=None, maxValue=None, step=1,
846846
dataRange=None, prefix=None, random=None, distribution=None,
847847
baseColumn=None, nullable=True,
848848
omit=False, implicit=False, noWarn=False,
@@ -1058,7 +1058,7 @@ def withStructColumn(self, colName, fields=None, asJson=False, **kwargs):
10581058

10591059
return newDf
10601060

1061-
def _generateColumnDefinition(self, colName, colType=None, baseColumn=None,
1061+
def _generateColumnDefinition(self, colName, colType=None, baseColumn=None, *,
10621062
implicit=False, omit=False, nullable=True, **kwargs):
10631063
""" generate field definition and column spec
10641064
@@ -1591,7 +1591,7 @@ def scriptTable(self, name=None, location=None, tableFormat="delta", asHtml=Fals
15911591

15921592
return results
15931593

1594-
def scriptMerge(self, tgtName=None, srcName=None, updateExpr=None, delExpr=None, joinExpr=None, timeExpr=None,
1594+
def scriptMerge(self, tgtName=None, srcName=None, *, updateExpr=None, delExpr=None, joinExpr=None, timeExpr=None,
15951595
insertExpr=None,
15961596
useExplicitNames=True,
15971597
updateColumns=None, updateColumnExprs=None,

dbldatagen/text_generator_plugins.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ class _FnCallContext:
6969
def __init__(self, txtGen):
7070
self.textGenerator = txtGen
7171

72-
def __init__(self, fn, init=None, initPerBatch=False, name=None, rootProperty=None):
72+
def __init__(self, fn, *, init=None, initPerBatch=False, name=None, rootProperty=None):
7373
super().__init__()
7474
assert fn is not None or callable(fn), "Function must be provided wiith signature fn(context, oldValue)"
7575
assert init is None or callable(init), "Init function must be a callable function or lambda if passed"
@@ -284,7 +284,7 @@ class FakerTextFactory(PyfuncTextFactory):
284284

285285
_defaultFakerTextFactory = None
286286

287-
def __init__(self, locale=None, providers=None, name="FakerText", lib=None,
287+
def __init__(self, *, locale=None, providers=None, name="FakerText", lib=None,
288288
rootClass=None):
289289

290290
super().__init__(name)

dbldatagen/text_generators.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -429,7 +429,8 @@ def _prepareTemplateStrings(self, genTemplate, escapeSpecialMeaning=False):
429429

430430
return num_placeholders, retval
431431

432-
def _applyTemplateStringsForTemplate(self, baseValue, genTemplate, placeholders, rnds, escapeSpecialMeaning=False):
432+
def _applyTemplateStringsForTemplate(self, baseValue, genTemplate, placeholders, rnds, *,
433+
escapeSpecialMeaning=False):
433434
""" Vectorized implementation of template driven text substitution
434435
435436
Apply substitutions to placeholders using random numbers

tests/test_basic_test.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import logging
2-
import pytest
32

3+
import pytest
44
from pyspark.sql import functions as F
55
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
66

@@ -146,7 +146,9 @@ def test_alt_seed_column(self, caplog):
146146
IntegerType(),
147147
{'uniqueValues': 5000, 'random': True})
148148
])
149-
def test_seed_column_nocollision(self, caseName, withIdOutput, idType, additionalOptions, caplog):
149+
def test_seed_column_nocollision(self, caseName, withIdOutput, idType, additionalOptions, caplog): \
150+
# pylint: disable=too-many-positional-arguments
151+
150152
logging.info(f"case: {caseName}")
151153

152154
# caplog fixture captures log content
@@ -189,7 +191,9 @@ def test_seed_column_nocollision(self, caseName, withIdOutput, idType, additiona
189191
("with no Id output _id float", False, FloatType(), "_id"),
190192
("with no Id output _id int", False, IntegerType(), "_id"),
191193
])
192-
def test_seed_column_expected_collision1(self, caseName, withIdOutput, idType, idName, caplog):
194+
def test_seed_column_expected_collision1(self, caseName, withIdOutput, idType, idName, caplog): \
195+
# pylint: disable=too-many-positional-arguments
196+
193197
logging.info(f"case: {caseName}")
194198

195199
# caplog fixture captures log content

tests/test_complex_columns.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def test_unitialized_complex_fields2(self, complexFieldType, expectedType, inval
9191
invalid_data_count = df.where(invalidValueCondition).count()
9292
assert invalid_data_count == 0, "Not expecting invalid values"
9393

94-
@pytest.mark.parametrize("complexFieldType, expectedType, valueInitializer, validValueCondition",
94+
@pytest.mark.parametrize("complexFieldType, expectedType, valueInit, validCond",
9595
[("array<int>", ArrayType(IntegerType()), "array(1,2,3)",
9696
"complex_field[1] = 2"),
9797
("array<array<string>>", ArrayType(ArrayType(StringType())), "array(array('one','two'))",
@@ -111,8 +111,9 @@ def test_unitialized_complex_fields2(self, complexFieldType, expectedType, inval
111111
"complex_field is not Null and complex_field.c = code2"
112112
)
113113
])
114-
def test_initialized_complex_fields(self, complexFieldType, expectedType, valueInitializer, validValueCondition,
115-
setupLogging):
114+
def test_initialized_complex_fields(self, complexFieldType, expectedType, valueInit, validCond, setupLogging): \
115+
# pylint: disable=too-many-positional-arguments
116+
116117
data_rows = 1000
117118
df_spec = (dg.DataGenerator(spark, name="test_data_set1", rows=data_rows,
118119
partitions=spark.sparkContext.defaultParallelism)
@@ -122,7 +123,7 @@ def test_initialized_complex_fields(self, complexFieldType, expectedType, valueI
122123
.withColumn("code3", StringType(), values=['a', 'b', 'c'])
123124
.withColumn("code4", StringType(), values=['a', 'b', 'c'], random=True)
124125
.withColumn("code5", StringType(), values=['a', 'b', 'c'], random=True, weights=[9, 1, 1])
125-
.withColumn("complex_field", complexFieldType, expr=valueInitializer,
126+
.withColumn("complex_field", complexFieldType, expr=valueInit,
126127
baseColumn=['code1', 'code2', 'code3', 'code4', 'code5'])
127128
)
128129

@@ -132,7 +133,7 @@ def test_initialized_complex_fields(self, complexFieldType, expectedType, valueI
132133
complex_type = df.schema["complex_field"].dataType
133134
assert complex_type == expectedType
134135

135-
valid_data_count = df.where(validValueCondition).count()
136+
valid_data_count = df.where(validCond).count()
136137
assert valid_data_count == data_rows, "Not expecting invalid values"
137138

138139
def test_basic_arrays_with_columns(self, setupLogging):

tests/test_constraints.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,9 @@ def test_constraint_filter_expression_cache(self):
109109
("id", "==", 50, 1),
110110
("id", "!=", 50, 98),
111111
])
112-
def test_scalar_relation(self, generationSpec1, column, operation, literalValue, expectedRows):
112+
def test_scalar_relation(self, column, operation, literalValue, expectedRows, generationSpec1): \
113+
# pylint: disable=too-many-positional-arguments
114+
113115
testDataSpec = (generationSpec1
114116
.withConstraints([SqlExpr("id < 100"),
115117
SqlExpr("id > 0")])
@@ -275,7 +277,9 @@ def test_unique_combinations2(self, generationSpec3):
275277
("id", 10, 20, True, 9),
276278
("id", 10, 20, False, 11),
277279
])
278-
def test_literal_range(self, generationSpec2, column, minValue, maxValue, strictFlag, expectedRows):
280+
def test_literal_range(self, column, minValue, maxValue, strictFlag, expectedRows, generationSpec2): \
281+
# pylint: disable=too-many-positional-arguments
282+
279283
testDataSpec = (generationSpec2
280284
.withConstraints([SqlExpr("id < 100"),
281285
SqlExpr("id > 0")])

tests/test_text_generation.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,9 @@ def test_text_generator_basics(self):
6969
(r'\n.\n.\n.\n', False, 15, None, True),
7070
(r'\n.\n.\n.\n', False, 15, -1, True),
7171
])
72-
def test_random_number_generator(self, template, escapeSpecial, low, high, useSystemLib):
72+
def test_random_number_generator(self, template, escapeSpecial, low, high, useSystemLib): \
73+
# pylint: disable=too-many-positional-arguments
74+
7375
""" As the test coverage tools dont detect code only used in UDFs,
7476
lets add some explicit tests for the underlying code"""
7577
test_template = TemplateGenerator(template, escapeSpecialChars=escapeSpecial)

0 commit comments

Comments
 (0)