Merge pull request #3537 from Zac-HD/example-xfail

Zac-HD · web-flow · commit 329ba0463204 · 2023-01-09T04:23:33.000+11:00
New method: `@example(...).xfail()`
diff --git a/hypothesis-python/RELEASE.rst b/hypothesis-python/RELEASE.rst
@@ -0,0 +1,17 @@
+RELEASE_TYPE: minor
+
+A classic error when testing is to write a test function that can never fail,
+even on inputs that aren't allowed or that you provide manually.  By analogy
+to the design pattern of::
+
+    @pytest.mark.parametrize("arg", [
+        ...,  # passing examples
+        pytest.param(..., marks=[pytest.mark.xfail])  # expected-failing input
+    ])
+
+we now support :obj:`@example(...).xfail() <hypothesis.example.xfail>`, with
+the same (optional) ``condition``, ``reason``, and ``raises`` arguments as
+``pytest.mark.xfail()``.
+
+Naturally you can also write ``.via(...).xfail(...)``, or ``.xfail(...).via(...)``,
+if you wish to note the provenance of expected-failing examples.
diff --git a/hypothesis-python/docs/reproducing.rst b/hypothesis-python/docs/reproducing.rst
@@ -76,6 +76,8 @@ Either are fine, and you can use one in one example and the other in another
 example if for some reason you really want to, but a single example must be
 consistent.
 
+.. automethod:: hypothesis.example.xfail
+
 .. automethod:: hypothesis.example.via
 
 .. _reproducing-with-seed:
diff --git a/hypothesis-python/src/hypothesis/core.py b/hypothesis-python/src/hypothesis/core.py
@@ -34,6 +34,8 @@
     Hashable,
     List,
     Optional,
+    Tuple,
+    Type,
     TypeVar,
     Union,
     overload,
@@ -92,10 +94,12 @@
     get_signature,
     impersonate,
     is_mock,
+    nicerepr,
     proxies,
     repr_call,
 )
 from hypothesis.internal.scrutineer import Tracer, explanatory_lines
+from hypothesis.internal.validation import check_type
 from hypothesis.reporting import (
     current_verbosity,
     report,
@@ -134,6 +138,9 @@
 class Example:
     args = attr.ib()
     kwargs = attr.ib()
+    # Plus two optional arguments for .xfail()
+    raises = attr.ib(default=None)
+    reason = attr.ib(default=None)
 
 
 class example:
@@ -156,6 +163,51 @@ def __call__(self, test: TestFunc) -> TestFunc:
         test.hypothesis_explicit_examples.append(self._this_example)  # type: ignore
         return test
 
+    def xfail(
+        self,
+        condition: bool = True,
+        *,
+        reason: str = "",
+        raises: Union[
+            Type[BaseException], Tuple[Type[BaseException], ...]
+        ] = BaseException,
+    ) -> "example":
+        """Mark this example as an expected failure, like pytest.mark.xfail().
+
+        Expected-failing examples allow you to check that your test does fail on
+        some examples, and therefore build confidence that tests *pass* because
+        your code is working, not because the test is missing something.
+
+        .. code-block:: python
+
+            @example(...).xfail()
+            @example(...).xfail(reason="Prices must be non-negative")
+            @example(...).xfail(raises=(KeyError, ValueError))
+            @example(...).xfail(sys.version_info[:2] < (3, 9), reason="needs py39+")
+            @example(...).xfail(condition=sys.platform != "linux", raises=OSError)
+            def test(x):
+                pass
+        """
+        check_type(bool, condition, "condition")
+        check_type(str, reason, "reason")
+        if not (
+            isinstance(raises, type) and issubclass(raises, BaseException)
+        ) and not (
+            isinstance(raises, tuple)
+            and raises  # () -> expected to fail with no error, which is impossible
+            and all(
+                isinstance(r, type) and issubclass(r, BaseException) for r in raises
+            )
+        ):
+            raise InvalidArgument(
+                f"raises={raises!r} must be an exception type or tuple of exception types"
+            )
+        if condition:
+            self._this_example = attr.evolve(
+                self._this_example, raises=raises, reason=reason
+            )
+        return self
+
     def via(self, *whence: str) -> "example":
         """Attach a machine-readable label noting whence this example came.
 
@@ -400,9 +452,7 @@ def draw(self, strategy):
         assert self.__draws == 0
         self.__draws += 1
         # The main strategy for given is always a tuples strategy that returns
-        # first positional arguments then keyword arguments. When building this
-        # object already converted all positional arguments to keyword arguments,
-        # so this is the correct format to return.
+        # first positional arguments then keyword arguments.
         return self.__args, self.__kwargs
 
 
@@ -414,6 +464,7 @@ def execute_explicit_examples(state, wrapped_test, arguments, kwargs, original_s
     ]
 
     for example in reversed(getattr(wrapped_test, "hypothesis_explicit_examples", ())):
+        assert isinstance(example, Example)
         # All of this validation is to check that @example() got "the same" arguments
         # as @given, i.e. corresponding to the same parameters, even though they might
         # be any mixture of positional and keyword arguments.
@@ -455,12 +506,47 @@ def execute_explicit_examples(state, wrapped_test, arguments, kwargs, original_s
         with local_settings(state.settings):
             fragments_reported = []
             try:
+                adata = ArtificialDataForExample(arguments, example_kwargs)
+                bits = ", ".join(nicerepr(x) for x in arguments) + ", ".join(
+                    f"{k}={nicerepr(v)}" for k, v in example_kwargs.items()
+                )
                 with with_reporter(fragments_reported.append):
-                    state.execute_once(
-                        ArtificialDataForExample(arguments, example_kwargs),
-                        is_final=True,
-                        print_example=True,
-                    )
+                    if example.raises is None:
+                        state.execute_once(adata, is_final=True, print_example=True)
+                    else:
+                        # @example(...).xfail(...)
+                        try:
+                            state.execute_once(adata, is_final=True, print_example=True)
+                        except failure_exceptions_to_catch() as err:
+                            if not isinstance(err, example.raises):
+                                raise
+                        except example.raises as err:
+                            # We'd usually check this as early as possible, but it's
+                            # possible for failure_exceptions_to_catch() to grow when
+                            # e.g. pytest is imported between import- and test-time.
+                            raise InvalidArgument(
+                                f"@example({bits}) raised an expected {err!r}, "
+                                "but Hypothesis does not treat this as a test failure"
+                            ) from err
+                        else:
+                            # Unexpectedly passing; always raise an error in this case.
+                            reason = f" because {example.reason}" * bool(example.reason)
+                            if example.raises is BaseException:
+                                name = "exception"  # special-case no raises= arg
+                            elif not isinstance(example.raises, tuple):
+                                name = example.raises.__name__
+                            elif len(example.raises) == 1:
+                                name = example.raises[0].__name__
+                            else:
+                                name = (
+                                    ", ".join(ex.__name__ for ex in example.raises[:-1])
+                                    + f", or {example.raises[-1].__name__}"
+                                )
+                            vowel = name.upper()[0] in "AEIOU"
+                            raise AssertionError(
+                                f"Expected a{'n' * vowel} {name} from @example({bits})"
+                                f"{reason}, but no exception was raised."
+                            )
             except UnsatisfiedAssumption:
                 # Odd though it seems, we deliberately support explicit examples that
                 # are then rejected by a call to `assume()`.  As well as iterative
@@ -478,7 +564,7 @@ def execute_explicit_examples(state, wrapped_test, arguments, kwargs, original_s
                 # One user error - whether misunderstanding or typo - we've seen a few
                 # times is to pass strategies to @example() where values are expected.
                 # Checking is easy, and false-positives not much of a problem, so:
-                if any(
+                if isinstance(err, failure_exceptions_to_catch()) and any(
                     isinstance(arg, SearchStrategy)
                     for arg in example.args + tuple(example.kwargs.values())
                 ):
@@ -494,6 +580,7 @@ def execute_explicit_examples(state, wrapped_test, arguments, kwargs, original_s
                 if (
                     state.settings.report_multiple_bugs
                     and pytest_shows_exceptiongroups
+                    and isinstance(err, failure_exceptions_to_catch())
                     and not isinstance(err, skip_exceptions_to_reraise())
                 ):
                     continue
diff --git a/hypothesis-python/tests/common/utils.py b/hypothesis-python/tests/common/utils.py
@@ -89,7 +89,7 @@ class ExcInfo:
     pass
 
 
-def fails_with(e):
+def fails_with(e, *, match=None):
     def accepts(f):
         @proxies(f)
         def inverted_test(*arguments, **kwargs):
@@ -98,7 +98,7 @@ def inverted_test(*arguments, **kwargs):
             # the `raises` context manager so that any problems in rigging the
             # PRNG don't accidentally count as the expected failure.
             with deterministic_PRNG():
-                with raises(e):
+                with raises(e, match=match):
                     f(*arguments, **kwargs)
 
         return inverted_test
diff --git a/hypothesis-python/tests/cover/test_example.py b/hypothesis-python/tests/cover/test_example.py
@@ -87,6 +87,14 @@ def test_interactive_example_does_not_emit_warning():
     child.sendline("quit(code=0)")
 
 
+@fails_with(KeyboardInterrupt)
+@example(1)
+@example(2)
+@given(st.none())
+def test_raises_keyboardinterrupt_immediately(_):
+    raise KeyboardInterrupt
+
+
 def identity(decorator):
     # The "identity function hack" from https://peps.python.org/pep-0614/
     # Method-chaining decorators are otherwise a syntax error in Python <= 3.8
@@ -104,3 +112,93 @@ def test_invalid_example_via():
         example(x=False).via(100)  # not a string!
     with pytest.raises(TypeError):
         example(x=False).via("abc", "def")  # too many args
+
+
+@pytest.mark.parametrize(
+    "kw",
+    [
+        {"condition": None},  # must be a bool
+        {"reason": None},  # must be a string
+        {"raises": None},  # not a BaseException (or even a type)
+        {"raises": int},  # not a BaseException
+        {"raises": [Exception]},  # not a tuple
+        {"raises": (None,)},  # tuple containing a non-BaseException
+        {"raises": ()},  # empty tuple doesn't make sense here
+        # raising non-failure exceptions, eg KeyboardInterrupt, is tested below
+    ],
+    ids=repr,
+)
+def test_invalid_example_xfail_arguments(kw):
+    with pytest.raises(InvalidArgument):
+        example(x=False).xfail(**kw)
+
+
+@identity(example(True).xfail())
+@identity(example(True).xfail(reason="ignored for passing tests"))
+@identity(example(True).xfail(raises=KeyError))
+@identity(example(True).xfail(raises=(KeyError, ValueError)))
+@identity(example(True).xfail(True, reason="..."))
+@identity(example(False).xfail(condition=False))
+@given(st.none())
+def test_many_xfail_example_decorators(fails):
+    if fails:
+        raise KeyError
+
+
+@fails_with(AssertionError)
+@identity(example(x=True).xfail(raises=KeyError))
+@given(st.none())
+def test_xfail_reraises_non_specified_exception(x):
+    assert not x
+
+
+@fails_with(
+    InvalidArgument,
+    match=r"@example\(x=True\) raised an expected BaseException\('msg'\), "
+    r"but Hypothesis does not treat this as a test failure",
+)
+@identity(example(True).xfail())
+@given(st.none())
+def test_must_raise_a_failure_exception(x):
+    if x:
+        raise BaseException("msg")
+
+
+@fails_with(
+    AssertionError,
+    match=r"Expected an exception from @example\(x=None\), but no exception was raised.",
+)
+@identity(example(None).xfail())
+@given(st.none())
+def test_error_on_unexpected_pass_base(x):
+    pass
+
+
+@fails_with(
+    AssertionError,
+    match=r"Expected an AssertionError from @example\(x=None\), but no exception was raised.",
+)
+@identity(example(None).xfail(raises=AssertionError))
+@given(st.none())
+def test_error_on_unexpected_pass_single(x):
+    pass
+
+
+@fails_with(
+    AssertionError,
+    match=r"Expected an AssertionError from @example\(x=None\), but no exception was raised.",
+)
+@identity(example(None).xfail(raises=(AssertionError,)))
+@given(st.none())
+def test_error_on_unexpected_pass_single_elem_tuple(x):
+    pass
+
+
+@fails_with(
+    AssertionError,
+    match=r"Expected a KeyError, or ValueError from @example\(x=None\), but no exception was raised.",
+)
+@identity(example(None).xfail(raises=(KeyError, ValueError)))
+@given(st.none())
+def test_error_on_unexpected_pass_multi(x):
+    pass
diff --git a/hypothesis-python/tests/nocover/test_integer_ranges.py b/hypothesis-python/tests/nocover/test_integer_ranges.py
@@ -10,7 +10,7 @@
 
 import pytest
 
-from hypothesis import given
+from hypothesis import given, settings
 from hypothesis.internal.conjecture.utils import integer_range
 from hypothesis.strategies import integers
 from hypothesis.strategies._internal.strategies import SearchStrategy
@@ -47,14 +47,16 @@ def test_intervals_shrink_to_center(lower_center_upper):
 def test_bounded_integers_distribution_of_bit_width_issue_1387_regression():
     values = []
 
+    @settings(database=None, max_examples=1000)
     @given(integers(0, 1e100))
     def test(x):
-        values.append(x)
+        if 2 <= x <= int(1e100) - 2:  # skip forced-endpoints
+            values.append(x)
 
     test()
 
     # We draw from a shaped distribution up to 128bit ~7/8 of the time, and
     # uniformly the rest.  So we should get some very large but not too many.
     huge = sum(x > 1e97 for x in values)
-    assert huge != 0
+    assert huge != 0 or len(values) < 800
     assert huge <= 0.3 * len(values)  # expected ~1/8