Solve Excel loading for xlrd/openpyxl/pandas issues

berland · berland · commit 2b1518e34414 · 2020-12-16T08:33:33.000+01:00
Matrix tested manually for xlrd in [1.2, 2.0] and for pandas
in [0.25.2, 1.1.5].

Openpyxl is now a direct dependency due to exception capture.
diff --git a/.github/workflows/pyscal.yml b/.github/workflows/pyscal.yml
@@ -37,7 +37,6 @@ jobs:
           python-version: ${{ matrix.python-version }}
 
       - name: Check code style
-        if: matrix.python-version != '2.7'
         run: |
           pip install black
           black --check pyscal/*py tests/test_*py setup.py docs/conf.py
@@ -79,7 +78,7 @@ jobs:
             git config --local user.name "pyscal-github-action"
             git fetch origin gh-pages
             git checkout --track origin/gh-pages
-            git clean -f -f -d -x
+            git clean -f -f -d -x  # Double -f is intentional
             git rm -r *
 
             cp -R ../html/* .
diff --git a/pyscal/factory.py b/pyscal/factory.py
@@ -6,6 +6,7 @@
 import pandas as pd
 import numpy as np
 import xlrd
+import openpyxl
 
 from pyscal.utils import capillarypressure
 from .wateroil import WaterOil
@@ -597,43 +598,53 @@ def load_relperm_df(inputfile, sheet_name=None):
 
         Returns:
             pd.DataFrame. To be handed over to pyscal list factory methods.
+            Empty dataframe in case of errors (messages will be logged).
         """
         if isinstance(inputfile, (str, Path)) and Path(inputfile).is_file():
-            if str(inputfile).lower().endswith("csv") and sheet_name is not None:
+            tabular_file_format = infer_tabular_file_format(inputfile)
+            if not tabular_file_format:
+                # Error message emitted by infer_tabular_file_format()
+                return pd.DataFrame()
+
+            if tabular_file_format == "csv" and sheet_name is not None:
                 logger.warning(
                     "Sheet name only relevant for XLSX files, ignoring %s", sheet_name
                 )
-            try:
-                if sheet_name:
-                    input_df = pd.read_excel(inputfile, sheet_name=sheet_name)
-                    logger.info("Parsed XLSX file %s, sheet %s", inputfile, sheet_name)
-                else:
-                    input_df = pd.read_excel(inputfile)
-                    logger.info("Parsed XLSX file %s", inputfile)
-            except xlrd.XLRDError as xlserror:
-                if inputfile.lower().endswith("xlsx") or inputfile.lower().endswith(
-                    "xls"
-                ):
-                    logger.error(xlserror)
+            excel_engines = {"xls": "xlrd", "xlsx": "openpyxl"}
+            if sheet_name:
                 try:
-                    input_df = pd.read_csv(inputfile, skipinitialspace=True)
-                    logger.info("Parsed CSV file %s", inputfile)
-                except pd.errors.ParserError as csverror:
-                    logger.error("Could not parse %s as XLSX or CSV", inputfile)
-                    logger.error("Error message from csv-parser: %s", str(csverror))
-                    input_df = pd.DataFrame()
-                except ValueError:
-                    # We end here when we use csv reader on xls files, that
-                    # means that xls parsing failed in the first place. Other
-                    # error messages have been, and will be printed.
-                    input_df = pd.DataFrame()
+                    input_df = pd.read_excel(
+                        inputfile,
+                        sheet_name=sheet_name,
+                        engine=excel_engines[tabular_file_format],
+                    )
+                    logger.info(
+                        "Parsed %s file %s, sheet %s",
+                        tabular_file_format.upper(),
+                        inputfile,
+                        sheet_name,
+                    )
+                except KeyError as error:
+                    logger.error("Non-existing sheet-name %s provided?", sheet_name)
+                    logger.error(str(error))
+                    return pd.DataFrame()
+            elif tabular_file_format.startswith("xls"):
+                input_df = pd.read_excel(
+                    inputfile, engine=excel_engines[tabular_file_format]
+                )
+                logger.info("Parsed %s file %s", tabular_file_format.upper(), inputfile)
+            else:
+                input_df = pd.read_csv(inputfile, skipinitialspace=True)
+                logger.info("Parsed CSV file %s", inputfile)
+
         elif isinstance(inputfile, pd.DataFrame):
             input_df = inputfile
         else:
             if isinstance(inputfile, str) and not Path(inputfile).is_file():
                 raise IOError("File not found " + str(inputfile))
             raise ValueError("Unsupported argument " + str(inputfile))
         assert isinstance(input_df, pd.DataFrame)
+
         if input_df.empty:
             logger.error("Relperm input dataframe is empty!")
 
@@ -1044,6 +1055,46 @@ def filter_nan_from_dict(params):
     return cleaned_params
 
 
+def infer_tabular_file_format(filename):
+    """Determine the file format of a file containing tabular data.
+
+    Tries to parse the file as xlsx (openpyxl), then xls (xlrd), then csv.
+
+    Args:
+        filename (str): Path to file, possibly a pathlib Path object.
+
+    Returns:
+        str: One of "csv", "xlsx" or "xls". Empty string if nothing found out.
+    """
+    try:
+        pd.read_excel(filename, engine="openpyxl")
+        return "xlsx"
+    except openpyxl.utils.exceptions.InvalidFileException:
+        # We get here for both CSV and XLS files.
+        pass
+    try:
+        pd.read_excel(filename, engine="xlrd")
+        return "xls"
+    except xlrd.biffh.XLRDError:
+        # We get here for both CSV and XLSX files.
+        pass
+    try:
+        dframe = pd.read_csv(filename)
+        if not dframe.empty:
+            return "csv"
+    except UnicodeDecodeError:
+        # (xls and xlsx files)
+        pass
+    except (pd.errors.ParserError, pd.errors.EmptyDataError) as csverror:
+        # Non-CSV text or a file without content (pd.errors, not pd.parser)
+        logger.error("Message from CSV parser: %s", str(csverror))
+
+    logger.error(
+        "Impossible to infer file format for %s, not CSV/XLS/XLSX", str(filename)
+    )
+    return ""
+
+
 def check_deprecated(params):
     """Check for deprecated parameter names
 
diff --git a/setup.py b/setup.py
@@ -19,6 +19,7 @@
 REQUIREMENTS = [
     "matplotlib",
     "numpy",
+    "openpyxl",
     "pandas",
     "scipy",
     "xlrd",
diff --git a/tests/data/scal-pc-input-example.xls b/tests/data/scal-pc-input-example.xls
diff --git a/tests/test_factory.py b/tests/test_factory.py
@@ -538,7 +538,7 @@ def test_xls_factory():
 
     xlsxfile = testdir / "data/scal-pc-input-example.xlsx"
 
-    scalinput = pd.read_excel(xlsxfile).set_index(["SATNUM", "CASE"])
+    scalinput = pd.read_excel(xlsxfile, engine="openpyxl").set_index(["SATNUM", "CASE"])
 
     for ((satnum, _), params) in scalinput.iterrows():
         assert satnum
@@ -642,7 +642,7 @@ def test_xls_scalrecommendation():
     testdir = Path(__file__).absolute().parent
 
     xlsxfile = testdir / "data/scal-pc-input-example.xlsx"
-    scalinput = pd.read_excel(xlsxfile).set_index(["SATNUM", "CASE"])
+    scalinput = pd.read_excel(xlsxfile, engine="openpyxl").set_index(["SATNUM", "CASE"])
     print(scalinput)
     for satnum in scalinput.index.levels[0].values:
         dictofdict = scalinput.loc[satnum, :].to_dict(orient="index")
diff --git a/tests/test_pyscallist.py b/tests/test_pyscallist.py
@@ -60,13 +60,20 @@ def test_pyscallist_basic():
     assert len(p_list) == 2
 
 
-def test_load_scalrec():
+def test_load_scalrec(tmpdir):
     """Load a SATNUM range from xlsx"""
     testdir = Path(__file__).absolute().parent
 
     scalrec_data = PyscalFactory.load_relperm_df(
         testdir / "data/scal-pc-input-example.xlsx"
     )
+
+    # Also check that we can read the old excel format
+    scalrec_data_legacy_xls = PyscalFactory.load_relperm_df(
+        testdir / "data/scal-pc-input-example.xls"
+    )
+    pd.testing.assert_frame_equal(scalrec_data, scalrec_data_legacy_xls)
+
     scalrec_list = PyscalFactory.create_scal_recommendation_list(scalrec_data)
     wog_list = scalrec_list.interpolate(-0.3)