Skip to content

Commit 2b1518e

Browse files
committed
Solve Excel loading for xlrd/openpyxl/pandas issues
Matrix tested manually for xlrd in [1.2, 2.0] and for pandas in [0.25.2, 1.1.5]. Openpyxl is now a direct dependency due to exception capture.
1 parent b46f720 commit 2b1518e

File tree

6 files changed

+87
-29
lines changed

6 files changed

+87
-29
lines changed

.github/workflows/pyscal.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@ jobs:
3737
python-version: ${{ matrix.python-version }}
3838

3939
- name: Check code style
40-
if: matrix.python-version != '2.7'
4140
run: |
4241
pip install black
4342
black --check pyscal/*py tests/test_*py setup.py docs/conf.py
@@ -79,7 +78,7 @@ jobs:
7978
git config --local user.name "pyscal-github-action"
8079
git fetch origin gh-pages
8180
git checkout --track origin/gh-pages
82-
git clean -f -f -d -x
81+
git clean -f -f -d -x # Double -f is intentional
8382
git rm -r *
8483
8584
cp -R ../html/* .

pyscal/factory.py

Lines changed: 75 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import pandas as pd
77
import numpy as np
88
import xlrd
9+
import openpyxl
910

1011
from pyscal.utils import capillarypressure
1112
from .wateroil import WaterOil
@@ -597,43 +598,53 @@ def load_relperm_df(inputfile, sheet_name=None):
597598
598599
Returns:
599600
pd.DataFrame. To be handed over to pyscal list factory methods.
601+
Empty dataframe in case of errors (messages will be logged).
600602
"""
601603
if isinstance(inputfile, (str, Path)) and Path(inputfile).is_file():
602-
if str(inputfile).lower().endswith("csv") and sheet_name is not None:
604+
tabular_file_format = infer_tabular_file_format(inputfile)
605+
if not tabular_file_format:
606+
# Error message emitted by infer_tabular_file_format()
607+
return pd.DataFrame()
608+
609+
if tabular_file_format == "csv" and sheet_name is not None:
603610
logger.warning(
604611
"Sheet name only relevant for XLSX files, ignoring %s", sheet_name
605612
)
606-
try:
607-
if sheet_name:
608-
input_df = pd.read_excel(inputfile, sheet_name=sheet_name)
609-
logger.info("Parsed XLSX file %s, sheet %s", inputfile, sheet_name)
610-
else:
611-
input_df = pd.read_excel(inputfile)
612-
logger.info("Parsed XLSX file %s", inputfile)
613-
except xlrd.XLRDError as xlserror:
614-
if inputfile.lower().endswith("xlsx") or inputfile.lower().endswith(
615-
"xls"
616-
):
617-
logger.error(xlserror)
613+
excel_engines = {"xls": "xlrd", "xlsx": "openpyxl"}
614+
if sheet_name:
618615
try:
619-
input_df = pd.read_csv(inputfile, skipinitialspace=True)
620-
logger.info("Parsed CSV file %s", inputfile)
621-
except pd.errors.ParserError as csverror:
622-
logger.error("Could not parse %s as XLSX or CSV", inputfile)
623-
logger.error("Error message from csv-parser: %s", str(csverror))
624-
input_df = pd.DataFrame()
625-
except ValueError:
626-
# We end here when we use csv reader on xls files, that
627-
# means that xls parsing failed in the first place. Other
628-
# error messages have been, and will be printed.
629-
input_df = pd.DataFrame()
616+
input_df = pd.read_excel(
617+
inputfile,
618+
sheet_name=sheet_name,
619+
engine=excel_engines[tabular_file_format],
620+
)
621+
logger.info(
622+
"Parsed %s file %s, sheet %s",
623+
tabular_file_format.upper(),
624+
inputfile,
625+
sheet_name,
626+
)
627+
except KeyError as error:
628+
logger.error("Non-existing sheet-name %s provided?", sheet_name)
629+
logger.error(str(error))
630+
return pd.DataFrame()
631+
elif tabular_file_format.startswith("xls"):
632+
input_df = pd.read_excel(
633+
inputfile, engine=excel_engines[tabular_file_format]
634+
)
635+
logger.info("Parsed %s file %s", tabular_file_format.upper(), inputfile)
636+
else:
637+
input_df = pd.read_csv(inputfile, skipinitialspace=True)
638+
logger.info("Parsed CSV file %s", inputfile)
639+
630640
elif isinstance(inputfile, pd.DataFrame):
631641
input_df = inputfile
632642
else:
633643
if isinstance(inputfile, str) and not Path(inputfile).is_file():
634644
raise IOError("File not found " + str(inputfile))
635645
raise ValueError("Unsupported argument " + str(inputfile))
636646
assert isinstance(input_df, pd.DataFrame)
647+
637648
if input_df.empty:
638649
logger.error("Relperm input dataframe is empty!")
639650

@@ -1044,6 +1055,46 @@ def filter_nan_from_dict(params):
10441055
return cleaned_params
10451056

10461057

1058+
def infer_tabular_file_format(filename):
1059+
"""Determine the file format of a file containing tabular data,
1060+
distinguishing between csv, xls and xlsx.
1061+
1062+
Args:
1063+
filename (str): Path to file, possibly a pathlib Path object.
1064+
1065+
Returns:
1066+
str: One of "csv", "xlsx" or "xls". Empty string if the format could not be determined.
1067+
"""
1068+
try:
1069+
pd.read_excel(filename, engine="openpyxl")
1070+
return "xlsx"
1071+
except openpyxl.utils.exceptions.InvalidFileException:
1072+
# We get here for both CSV and XLS files.
1073+
pass
1074+
try:
1075+
pd.read_excel(filename, engine="xlrd")
1076+
return "xls"
1077+
except xlrd.biffh.XLRDError:
1078+
# We get here for both CSV and XLSX files.
1079+
pass
1080+
try:
1081+
dframe = pd.read_csv(filename)
1082+
if not dframe.empty:
1083+
return "csv"
1084+
except UnicodeDecodeError:
1085+
# (xls and xlsx files)
1086+
pass
1087+
except pd.parser.ParserError as csverror:
1088+
logger.error("Message from CSV parser: %s", str(csverror))
1089+
# Some text file that is not CSV
1090+
pass
1091+
1092+
logger.error(
1093+
"Impossible to infer file format for %s, not CSV/XLS/XLSX", str(filename)
1094+
)
1095+
return ""
1096+
1097+
10471098
def check_deprecated(params):
10481099
"""Check for deprecated parameter names
10491100

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
REQUIREMENTS = [
2020
"matplotlib",
2121
"numpy",
22+
"openpyxl",
2223
"pandas",
2324
"scipy",
2425
"xlrd",

tests/data/scal-pc-input-example.xls

35 KB
Binary file not shown.

tests/test_factory.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -538,7 +538,7 @@ def test_xls_factory():
538538

539539
xlsxfile = testdir / "data/scal-pc-input-example.xlsx"
540540

541-
scalinput = pd.read_excel(xlsxfile).set_index(["SATNUM", "CASE"])
541+
scalinput = pd.read_excel(xlsxfile, engine="openpyxl").set_index(["SATNUM", "CASE"])
542542

543543
for ((satnum, _), params) in scalinput.iterrows():
544544
assert satnum
@@ -642,7 +642,7 @@ def test_xls_scalrecommendation():
642642
testdir = Path(__file__).absolute().parent
643643

644644
xlsxfile = testdir / "data/scal-pc-input-example.xlsx"
645-
scalinput = pd.read_excel(xlsxfile).set_index(["SATNUM", "CASE"])
645+
scalinput = pd.read_excel(xlsxfile, engine="openpyxl").set_index(["SATNUM", "CASE"])
646646
print(scalinput)
647647
for satnum in scalinput.index.levels[0].values:
648648
dictofdict = scalinput.loc[satnum, :].to_dict(orient="index")

tests/test_pyscallist.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,13 +60,20 @@ def test_pyscallist_basic():
6060
assert len(p_list) == 2
6161

6262

63-
def test_load_scalrec():
63+
def test_load_scalrec(tmpdir):
6464
"""Load a SATNUM range from xlsx"""
6565
testdir = Path(__file__).absolute().parent
6666

6767
scalrec_data = PyscalFactory.load_relperm_df(
6868
testdir / "data/scal-pc-input-example.xlsx"
6969
)
70+
71+
# Also check that we can read the old excel format
72+
scalrec_data_legacy_xls = PyscalFactory.load_relperm_df(
73+
testdir / "data/scal-pc-input-example.xls"
74+
)
75+
pd.testing.assert_frame_equal(scalrec_data, scalrec_data_legacy_xls)
76+
7077
scalrec_list = PyscalFactory.create_scal_recommendation_list(scalrec_data)
7178
wog_list = scalrec_list.interpolate(-0.3)
7279

0 commit comments

Comments
 (0)