
Commit b4ef1dd

Handle ENOSPC more cleanly (#3513)
PBENCH-1239

In Pbench ops review, after staging the latest `main` with the intent of testing the new `Tarball.extract` on a large dataset, we pushed a >7 GB `uperf` tarball. This failed with an internal error, leaving a partial tarball copy in the `ARCHIVE` controller directory, and revealed several related problems:

1. The cache manager was using `shutil.copy2`, which copies the tarball from the staging area into the archive tree. Because `nginx` also caches the entire data stream, this *triples* the storage required to upload a tarball.
2. On copy failure, the cache manager did not delete the partial file.
3. While the initial data-stream save code handled `ENOSPC` specially, after trouble mapping the status through Werkzeug it was reported as a generic "server internal error", which is not ideal.
4. The MD5 file write was not similarly protected; while this is a small file and `ENOSPC` is unlikely there, we should be prepared to handle it gracefully.

This PR changes the cache manager to use `shutil.move` (which was the original intent) to avoid a third copy of the tarball, and to unlink the partial file on failure. Both the initial tarball write and the MD5 write now handle `ENOSPC` and return HTTP status 413 (request entity too large): not a perfect mapping, but a standard error code that Werkzeug can handle.
1 parent 9b071de commit b4ef1dd
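
The pattern this commit applies at each write site can be sketched in isolation. The sketch below is illustrative only: `write_guarded` and the stub `APIAbort` are hypothetical stand-ins for pbench's real classes, not code from this PR.

```python
import errno
from http import HTTPStatus
from pathlib import Path


class APIAbort(Exception):
    """Stand-in for pbench's APIAbort, defined here only to keep the
    sketch self-contained."""

    def __init__(self, status: HTTPStatus, message: str):
        super().__init__(message)
        self.status = status


def write_guarded(target: Path, data: bytes) -> None:
    """Sketch of the commit's pattern: never leave a partial file behind,
    surface ENOSPC as a client-visible 413, and let any other OSError
    propagate to be reported as an internal server error."""
    try:
        target.write_bytes(data)
    except OSError as exc:
        target.unlink(missing_ok=True)  # remove the partial file
        if exc.errno == errno.ENOSPC:
            raise APIAbort(
                HTTPStatus.REQUEST_ENTITY_TOO_LARGE,
                f"Out of space on {target.parent}",
            ) from exc
        raise
```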

File tree

4 files changed: +259 -48 lines

lib/pbench/server/api/resources/intake_base.py

Lines changed: 28 additions & 10 deletions

@@ -356,17 +356,17 @@ def _intake(
                     ofp.write(chunk)
                     hash_md5.update(chunk)
         except OSError as exc:
-            # NOTE: Werkzeug doesn't support status 509, so the abort call
-            # in _dispatch will fail. Rather than figure out how to fix
-            # that, just report as an internal error.
             if exc.errno == errno.ENOSPC:
-                msg = f"Out of space on {tar_full_path.root}"
-            else:
-                msg = f"Unexpected error {exc.errno} encountered during file upload"
-            raise APIInternalError(msg) from exc
+                raise APIAbort(
+                    HTTPStatus.REQUEST_ENTITY_TOO_LARGE,
+                    f"Out of space on {tar_full_path.parent}",
+                )
+            raise APIInternalError(
+                f"Unexpected error encountered during file upload: {str(exc)!r} "
+            ) from exc
         except Exception as e:
             raise APIInternalError(
-                "Unexpected error encountered during file upload"
+                "Unexpected error encountered during file upload: {str(e)!r}"
             ) from e

         if bytes_received != stream.length:

@@ -381,12 +381,21 @@ def _intake(
             )

         # From this point attempt to remove the MD5 file on error exit
-        recovery.add(md5_full_path.unlink)
+        recovery.add(lambda: md5_full_path.unlink(missing_ok=True))
         try:
             md5_full_path.write_text(f"{intake.md5} {filename}\n")
+        except OSError as exc:
+            if exc.errno == errno.ENOSPC:
+                raise APIAbort(
+                    HTTPStatus.REQUEST_ENTITY_TOO_LARGE,
+                    f"Out of space on {md5_full_path.parent}",
+                )
+            raise APIInternalError(
+                f"Unexpected error encountered during MD5 creation: {str(exc)!r}"
+            ) from exc
         except Exception as e:
             raise APIInternalError(
-                f"Failed to write .md5 file '{md5_full_path}'",
+                f"Failed to write .md5 file '{md5_full_path}': {str(e)!r}",
             ) from e

         # Create a cache manager object

@@ -408,6 +417,15 @@ def _intake(
                 HTTPStatus.BAD_REQUEST,
                 f"Tarball {dataset.name!r} is invalid or missing required metadata.log: {exc}",
             ) from exc
+        except OSError as exc:
+            if exc.errno == errno.ENOSPC:
+                raise APIAbort(
+                    HTTPStatus.REQUEST_ENTITY_TOO_LARGE,
+                    f"Out of space on {tar_full_path.parent}",
+                )
+            raise APIInternalError(
+                f"Unexpected error encountered during archive: {str(exc)!r}"
+            ) from exc
         except Exception as exc:
             raise APIInternalError(
                 f"Unable to create dataset in file system for {tar_full_path}: {exc}"

lib/pbench/server/cache_manager.py

Lines changed: 24 additions & 14 deletions

@@ -378,35 +378,40 @@ def create(cls, tarball: Path, controller: "Controller") -> "Tarball":
         # standard .tar.xz
         md5_source = tarball.with_suffix(".xz.md5")

+        destination = controller.path / tarball.name
+        md5_destination = controller.path / md5_source.name
+
         # If either expected destination file exists, something is wrong
-        if (controller.path / tarball.name).exists():
-            raise DuplicateTarball(name)
-        if (controller.path / md5_source.name).exists():
+        if destination.exists() or md5_destination.exists():
             raise DuplicateTarball(name)

-        # Copy the MD5 file first; only if that succeeds, copy the tarball
-        # itself.
+        # Move the MD5 file first; only if that succeeds, move the tarball
+        # itself. Note that we expect the source to be on the same
+        # filesystem as the ARCHIVE tree, and we want to avoid using double
+        # the space by copying large tarballs if the file can be moved.
         try:
-            md5_destination = Path(shutil.copy2(md5_source, controller.path))
+            shutil.move(md5_source, md5_destination)
         except Exception as e:
+            md5_destination.unlink(missing_ok=True)
             controller.logger.error(
-                "ERROR copying dataset {} ({}) MD5: {}", name, tarball, e
+                "ERROR moving dataset {} ({}) MD5: {}", name, tarball, e
             )
             raise

         try:
-            destination = Path(shutil.copy2(tarball, controller.path))
+            shutil.move(tarball, destination)
         except Exception as e:
             try:
-                md5_destination.unlink()
+                md5_destination.unlink(missing_ok=True)
             except Exception as md5_e:
                 controller.logger.error(
                     "Unable to recover by removing {} MD5 after tarball copy failure: {}",
                     name,
                     md5_e,
                 )
+            destination.unlink(missing_ok=True)
             controller.logger.error(
-                "ERROR copying dataset {} tarball {}: {}", name, tarball, e
+                "ERROR moving dataset {} tarball {}: {}", name, tarball, e
             )
             raise

@@ -419,12 +424,17 @@ def create(cls, tarball: Path, controller: "Controller") -> "Tarball":
             # log it but do not abort
             controller.logger.error("Unable to set SELINUX context for {}: {}", name, e)

-        # If we were able to copy both files, remove the originals
+        # If we were able to copy both files, remove the originals. If we moved
+        # the files above, instead of copying them, these will no longer exist
+        # and we'll ignore that condition silently.
+        try:
+            tarball.unlink(missing_ok=True)
+        except Exception as e:
+            controller.logger.error("Error removing staged tarball {}: {}", name, e)
         try:
-            tarball.unlink()
-            md5_source.unlink()
+            md5_source.unlink(missing_ok=True)
         except Exception as e:
-            controller.logger.error("Error removing incoming dataset {}: {}", name, e)
+            controller.logger.error("Error removing staged MD5 {}: {}", name, e)

         return cls(destination, controller)
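
The comment added above relies on `shutil.move` semantics worth spelling out: within a single filesystem the move is an `os.rename`, so the data is never duplicated; only across filesystems does it fall back to a copy followed by an unlink. A standalone illustration (the paths are invented for the example):

```python
import shutil
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    staging = Path(tmp) / "staging"
    archive = Path(tmp) / "ARCHIVE"
    staging.mkdir()
    archive.mkdir()

    src = staging / "example.tar.xz"
    src.write_bytes(b"pretend this is a 7 GB uperf tarball")

    # Same filesystem here, so this is effectively an os.rename(): the
    # bytes are never written twice, which is the point of dropping copy2().
    shutil.move(src, archive / src.name)

    print(src.exists())                   # False
    print((archive / src.name).exists())  # True
```
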
lib/pbench/test/unit/server/test_cache_manager.py

Lines changed: 70 additions & 4 deletions

@@ -1,3 +1,4 @@
+import errno
 import hashlib
 import io
 from logging import Logger

@@ -244,17 +245,82 @@ def test_duplicate(
         monkeypatch.setattr(Tarball, "_get_metadata", fake_get_metadata)
         cm = CacheManager(server_config, make_logger)

-        # Create a tarball file in the expected destination directory
+        # Create a tarball file in the expected destination directory: the
+        # subsequent create should report a duplicate.
         controller = cm.archive_root / "ABC"
         controller.mkdir()
         (controller / source_tarball.name).write_text("Send in the clones")
-
-        # Attempting to create a dataset from the md5 file should result in
-        # a duplicate dataset error
         with pytest.raises(DuplicateTarball) as exc:
             cm.create(source_tarball)
         assert exc.value.tarball == Dataset.stem(source_tarball)

+    @pytest.mark.parametrize(
+        "allow,errno",
+        (
+            (".md5", errno.ENOSPC),
+            (".md5", errno.EEXIST),
+            (".md5", None),
+            ("", errno.ENOSPC),
+            ("", errno.EACCES),
+            ("", None),
+        ),
+    )
+    def test_move_fails(
+        self,
+        monkeypatch,
+        selinux_disabled,
+        server_config,
+        make_logger,
+        tarball,
+        allow,
+        errno,
+    ):
+        src: list[Path] = []
+        dest: list[Path] = []
+        real_move = shutil.move
+
+        def mymove(source: Path, destination: Path, *args, **kwargs) -> Path:
+            src.append(source)
+            if destination.is_dir():
+                d = destination / source.name
+            else:
+                d = destination
+            dest.append(d)
+            if source.suffix == allow:
+                return real_move(source, destination, *args, **kwargs)
+            if errno:
+                e = OSError(errno, "something went badly")
+            else:
+                e = Exception("I broke")
+            raise e
+
+        ulink: list[Path] = []
+        ok: list[bool] = []
+
+        def unlink(self, missing_ok: bool = False):
+            ulink.append(self)
+            ok.append(missing_ok)
+
+        source_tarball, source_md5, md5 = tarball
+        monkeypatch.setattr(Tarball, "_get_metadata", fake_get_metadata)
+        cm = CacheManager(server_config, make_logger)
+        monkeypatch.setattr("pbench.server.cache_manager.shutil.move", mymove)
+        monkeypatch.setattr(Path, "unlink", unlink)
+        with pytest.raises(Exception) as e:
+            cm.create(source_tarball)
+        if errno:
+            assert isinstance(e.value, OSError), f"Wanted OSError, got {type(e.value)}"
+            assert e.value.errno == errno
+        else:
+            assert str(e.value) == "I broke"
+        assert src == [source_md5] + ([source_tarball] if allow else [])
+        assert len(src) == len(dest) == len(ulink) == len(ok) == (2 if allow else 1)
+        for i, s in enumerate(src):
+            assert dest[i].name == s.name
+            assert ulink[i].name == s.name
+            assert dest[i] == ulink[i]
+        assert all(ok), f"Cleanup unlinks don't ignore missing: {ok}, {ulink}"
+
     def test_tarball_subprocess_run_with_exception(self, monkeypatch):
         """Test to check the subprocess_run functionality of the Tarball when
         an Exception occurred"""

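The interposition pattern in `test_move_fails` above, wrapping `shutil.move` and `Path.unlink` to record calls and inject failures, generalizes beyond this PR. Here is a minimal standalone version of the idea; the function under test is just `shutil.move` itself, purely for illustration:

```python
import errno
import shutil

import pytest


def test_enospc_injection(monkeypatch, tmp_path):
    """Sketch of the failure-injection technique: replace shutil.move with
    a wrapper that raises ENOSPC, then assert the caller sees that errno."""
    calls = []

    def failing_move(src, dst, *args, **kwargs):
        calls.append((src, dst))
        raise OSError(errno.ENOSPC, "No space left on device")

    monkeypatch.setattr(shutil, "move", failing_move)

    src = tmp_path / "example.tar.xz"
    src.write_text("payload")

    with pytest.raises(OSError) as exc:
        shutil.move(src, tmp_path / "moved.tar.xz")

    assert exc.value.errno == errno.ENOSPC
    assert len(calls) == 1
```
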
0 commit comments