Commit 79da54b

Merge branch 'main' into xpu_nondeterministic_roi_align
2 parents 332e2be + d462da2

File tree

13 files changed: +135 -32 lines changed

setup.py

Lines changed: 2 additions & 1 deletion
@@ -2,6 +2,7 @@
 import distutils.spawn
 import glob
 import os
+import shlex
 import shutil
 import subprocess
 import sys
@@ -123,7 +124,7 @@ def get_macros_and_flags():
         if NVCC_FLAGS is None:
             nvcc_flags = []
         else:
-            nvcc_flags = NVCC_FLAGS.split(" ")
+            nvcc_flags = shlex.split(NVCC_FLAGS)
         extra_compile_args["nvcc"] = nvcc_flags

    if sys.platform == "win32":
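
The point of this change: str.split(" ") tears quoted compiler flags apart, while shlex.split respects shell quoting. A minimal sketch of the difference (the flags string below is an invented example, not taken from the repo):

import shlex

# Hypothetical NVCC_FLAGS value containing a quoted argument with a space.
flags = '-O2 --compiler-options "-fPIC -pthread"'

print(flags.split(" "))
# ['-O2', '--compiler-options', '"-fPIC', '-pthread"']  -- quoting broken apart
print(shlex.split(flags))
# ['-O2', '--compiler-options', '-fPIC -pthread']       -- quoting respected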

test/datasets_utils.py

Lines changed: 18 additions & 5 deletions
@@ -611,6 +611,7 @@ class ImageDatasetTestCase(DatasetTestCase):
     """

     FEATURE_TYPES = (PIL.Image.Image, int)
+    SUPPORT_TV_IMAGE_DECODE: bool = False

     @contextlib.contextmanager
     def create_dataset(
@@ -632,22 +633,34 @@ def create_dataset(
         # This problem only occurs during testing since some tests, e.g. DatasetTestCase.test_feature_types open an
         # image, but never use the underlying data. During normal operation it is reasonable to assume that the
         # user wants to work with the image he just opened rather than deleting the underlying file.
-        with self._force_load_images():
+        with self._force_load_images(loader=(config or {}).get("loader", None)):
             yield dataset, info

     @contextlib.contextmanager
-    def _force_load_images(self):
-        open = PIL.Image.open
+    def _force_load_images(self, loader: Optional[Callable[[str], Any]] = None):
+        open = loader or PIL.Image.open

         def new(fp, *args, **kwargs):
             image = open(fp, *args, **kwargs)
-            if isinstance(fp, (str, pathlib.Path)):
+            if isinstance(fp, (str, pathlib.Path)) and isinstance(image, PIL.Image.Image):
                 image.load()
             return image

-        with unittest.mock.patch("PIL.Image.open", new=new):
+        with unittest.mock.patch(open.__module__ + "." + open.__qualname__, new=new):
             yield

+    def test_tv_decode_image_support(self):
+        if not self.SUPPORT_TV_IMAGE_DECODE:
+            pytest.skip(f"{self.DATASET_CLASS.__name__} does not support torchvision.io.decode_image.")
+
+        with self.create_dataset(
+            config=dict(
+                loader=torchvision.io.decode_image,
+            )
+        ) as (dataset, _):
+            image = dataset[0][0]
+            assert isinstance(image, torch.Tensor)
+

 class VideoDatasetTestCase(DatasetTestCase):
     """Abstract base class for video dataset testcases.

test/test_datasets.py

Lines changed: 5 additions & 0 deletions
@@ -405,6 +405,8 @@ class ImageNetTestCase(datasets_utils.ImageDatasetTestCase):
     REQUIRED_PACKAGES = ("scipy",)
     ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val"))

+    SUPPORT_TV_IMAGE_DECODE = True
+
     def inject_fake_data(self, tmpdir, config):
         tmpdir = pathlib.Path(tmpdir)

@@ -2308,6 +2310,7 @@ def inject_fake_data(self, tmpdir, config):
 class EuroSATTestCase(datasets_utils.ImageDatasetTestCase):
     DATASET_CLASS = datasets.EuroSAT
     FEATURE_TYPES = (PIL.Image.Image, int)
+    SUPPORT_TV_IMAGE_DECODE = True

     def inject_fake_data(self, tmpdir, config):
         data_folder = os.path.join(tmpdir, "eurosat", "2750")
@@ -2749,6 +2752,8 @@ class Country211TestCase(datasets_utils.ImageDatasetTestCase):

     ADDITIONAL_CONFIGS = combinations_grid(split=("train", "valid", "test"))

+    SUPPORT_TV_IMAGE_DECODE = True
+
     def inject_fake_data(self, tmpdir: str, config):
         split_folder = pathlib.Path(tmpdir) / "country211" / config["split"]
         split_folder.mkdir(parents=True, exist_ok=True)

test/test_image.py

Lines changed: 36 additions & 0 deletions
@@ -623,6 +623,42 @@ def test_encode_jpeg_cuda(img_path, scripted, contiguous):
     assert abs_mean_diff < 3


+@needs_cuda
+def test_encode_jpeg_cuda_sync():
+    """
+    Non-regression test for https://github.com/pytorch/vision/issues/8587.
+    Attempts to reproduce an intermittent CUDA stream synchronization bug
+    by randomly creating images and round-tripping them via encode_jpeg
+    and decode_jpeg on the GPU. Fails if the mean difference in uint8 range
+    exceeds 5.
+    """
+    torch.manual_seed(42)
+
+    # manual testing shows this bug appearing often in iterations between 50 and 100
+    # as a synchronization bug, this can't be reliably reproduced
+    max_iterations = 100
+    threshold = 5.0  # in [0..255]
+
+    device = torch.device("cuda")
+
+    for iteration in range(max_iterations):
+        height, width = torch.randint(4000, 5000, size=(2,))
+
+        image = torch.linspace(0, 1, steps=height * width, device=device)
+        image = image.view(1, height, width).expand(3, -1, -1)
+
+        image = (image * 255).clamp(0, 255).to(torch.uint8)
+        jpeg_bytes = encode_jpeg(image, quality=100)
+
+        decoded_image = decode_jpeg(jpeg_bytes.cpu(), device=device)
+        mean_difference = (image.float() - decoded_image.float()).abs().mean().item()
+
+        assert mean_difference <= threshold, (
+            f"Encode/decode mismatch at iteration={iteration}, "
+            f"size={height}x{width}, mean diff={mean_difference:.2f}"
+        )
+
+
 @pytest.mark.parametrize("device", cpu_and_cuda())
 @pytest.mark.parametrize("scripted", (True, False))
 @pytest.mark.parametrize("contiguous", (True, False))

torchvision/csrc/io/image/cuda/encode_jpegs_cuda.cpp

Lines changed: 16 additions & 10 deletions
@@ -94,12 +94,12 @@ std::vector<torch::Tensor> encode_jpegs_cuda(

   cudaJpegEncoder->set_quality(quality);
   std::vector<torch::Tensor> encoded_images;
-  at::cuda::CUDAEvent event;
-  event.record(cudaJpegEncoder->stream);
   for (const auto& image : contig_images) {
     auto encoded_image = cudaJpegEncoder->encode_jpeg(image);
     encoded_images.push_back(encoded_image);
   }
+  at::cuda::CUDAEvent event;
+  event.record(cudaJpegEncoder->stream);

   // We use a dedicated stream to do the encoding and even though the results
   // may be ready on that stream we cannot assume that they are also available
@@ -108,10 +108,7 @@
   // do not want to block the host at this particular point
   // (which is what cudaStreamSynchronize would do.) Events allow us to
   // synchronize the streams without blocking the host.
-  event.block(at::cuda::getCurrentCUDAStream(
-      cudaJpegEncoder->original_device.has_index()
-          ? cudaJpegEncoder->original_device.index()
-          : 0));
+  event.block(cudaJpegEncoder->current_stream);
   return encoded_images;
 }

@@ -121,7 +118,11 @@ CUDAJpegEncoder::CUDAJpegEncoder(const torch::Device& target_device)
       stream{
           target_device.has_index()
              ? at::cuda::getStreamFromPool(false, target_device.index())
-              : at::cuda::getStreamFromPool(false)} {
+              : at::cuda::getStreamFromPool(false)},
+      current_stream{
+          original_device.has_index()
+              ? at::cuda::getCurrentCUDAStream(original_device.index())
+              : at::cuda::getCurrentCUDAStream()} {
   nvjpegStatus_t status;
   status = nvjpegCreateSimple(&nvjpeg_handle);
   TORCH_CHECK(
@@ -186,12 +187,17 @@ CUDAJpegEncoder::~CUDAJpegEncoder() {
 }

 torch::Tensor CUDAJpegEncoder::encode_jpeg(const torch::Tensor& src_image) {
+  nvjpegStatus_t status;
+  cudaError_t cudaStatus;
+
+  // Ensure that the incoming src_image is safe to use
+  cudaStatus = cudaStreamSynchronize(current_stream);
+  TORCH_CHECK(cudaStatus == cudaSuccess, "CUDA ERROR: ", cudaStatus);
+
   int channels = src_image.size(0);
   int height = src_image.size(1);
   int width = src_image.size(2);

-  nvjpegStatus_t status;
-  cudaError_t cudaStatus;
   status = nvjpegEncoderParamsSetSamplingFactors(
       nv_enc_params, NVJPEG_CSS_444, stream);
   TORCH_CHECK(
@@ -251,7 +257,7 @@ torch::Tensor CUDAJpegEncoder::encode_jpeg(const torch::Tensor& src_image) {
       nv_enc_state,
       encoded_image.data_ptr<uint8_t>(),
       &length,
-      0);
+      stream);
   TORCH_CHECK(
       status == NVJPEG_STATUS_SUCCESS,
       "Failed to retrieve encoded image: ",

torchvision/csrc/io/image/cuda/encode_jpegs_cuda.h

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@ class CUDAJpegEncoder {
   const torch::Device original_device;
   const torch::Device target_device;
   const c10::cuda::CUDAStream stream;
+  const c10::cuda::CUDAStream current_stream;

  protected:
   nvjpegEncoderState_t nv_enc_state;

torchvision/datasets/coco.py

Lines changed: 4 additions & 2 deletions
@@ -10,7 +10,8 @@
 class CocoDetection(VisionDataset):
     """`MS Coco Detection <https://cocodataset.org/#detection-2016>`_ Dataset.

-    It requires the `COCO API to be installed <https://github.com/pdollar/coco/tree/master/PythonAPI>`_.
+    It requires `pycocotools <https://github.com/ppwwyyxx/cocoapi>`_ to be installed,
+    which could be installed via ``pip install pycocotools`` or ``conda install conda-forge::pycocotools``.

     Args:
         root (str or ``pathlib.Path``): Root directory where images are downloaded to.
@@ -65,7 +66,8 @@ def __len__(self) -> int:
 class CocoCaptions(CocoDetection):
     """`MS Coco Captions <https://cocodataset.org/#captions-2015>`_ Dataset.

-    It requires the `COCO API to be installed <https://github.com/pdollar/coco/tree/master/PythonAPI>`_.
+    It requires `pycocotools <https://github.com/ppwwyyxx/cocoapi>`_ to be installed,
+    which could be installed via ``pip install pycocotools`` or ``conda install conda-forge::pycocotools``.

     Args:
         root (str or ``pathlib.Path``): Root directory where images are downloaded to.
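
For reference, a minimal usage sketch of the class whose docstring changed, assuming pycocotools is installed as described above (paths are placeholders):

from torchvision.datasets import CocoDetection

dataset = CocoDetection(
    root="path/to/val2017",                                # image directory (placeholder)
    annFile="path/to/annotations/instances_val2017.json",  # annotation file (placeholder)
)
image, target = dataset[0]  # PIL image and a list of annotation dicts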

torchvision/datasets/country211.py

Lines changed: 12 additions & 3 deletions
@@ -1,7 +1,7 @@
 from pathlib import Path
-from typing import Callable, Optional, Union
+from typing import Any, Callable, Optional, Union

-from .folder import ImageFolder
+from .folder import default_loader, ImageFolder
 from .utils import download_and_extract_archive, verify_str_arg


@@ -21,6 +21,9 @@ class Country211(ImageFolder):
         target_transform (callable, optional): A function/transform that takes in the target and transforms it.
         download (bool, optional): If True, downloads the dataset from the internet and puts it into
             ``root/country211/``. If dataset is already downloaded, it is not downloaded again.
+        loader (callable, optional): A function to load an image given its path.
+            By default, it uses PIL as its image loader, but users could also pass in
+            ``torchvision.io.decode_image`` for decoding image data into tensors directly.
     """

     _URL = "https://openaipublic.azureedge.net/clip/data/country211.tgz"
@@ -33,6 +36,7 @@ def __init__(
         transform: Optional[Callable] = None,
         target_transform: Optional[Callable] = None,
         download: bool = False,
+        loader: Callable[[str], Any] = default_loader,
     ) -> None:
         self._split = verify_str_arg(split, "split", ("train", "valid", "test"))

@@ -46,7 +50,12 @@ def __init__(
         if not self._check_exists():
             raise RuntimeError("Dataset not found. You can use download=True to download it")

-        super().__init__(str(self._base_folder / self._split), transform=transform, target_transform=target_transform)
+        super().__init__(
+            str(self._base_folder / self._split),
+            transform=transform,
+            target_transform=target_transform,
+            loader=loader,
+        )
         self.root = str(root)

     def _check_exists(self) -> bool:
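
A minimal sketch of the new loader hook documented above (the root path is a placeholder): with decode_image as the loader, samples come back as uint8 CHW tensors instead of PIL images. The same pattern applies to the EuroSAT and ImageNet changes below.

from torchvision.datasets import Country211
from torchvision.io import decode_image

dataset = Country211(root="data", split="valid", download=True, loader=decode_image)
image, label = dataset[0]
print(type(image), image.dtype, image.shape)  # torch.Tensor, torch.uint8, CHW layout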

torchvision/datasets/eurosat.py

Lines changed: 12 additions & 3 deletions
@@ -1,8 +1,8 @@
 import os
 from pathlib import Path
-from typing import Callable, Optional, Union
+from typing import Any, Callable, Optional, Union

-from .folder import ImageFolder
+from .folder import default_loader, ImageFolder
 from .utils import download_and_extract_archive


@@ -21,6 +21,9 @@ class EuroSAT(ImageFolder):
         download (bool, optional): If True, downloads the dataset from the internet and
             puts it in root directory. If dataset is already downloaded, it is not
             downloaded again. Default is False.
+        loader (callable, optional): A function to load an image given its path.
+            By default, it uses PIL as its image loader, but users could also pass in
+            ``torchvision.io.decode_image`` for decoding image data into tensors directly.
     """

     def __init__(
@@ -29,6 +32,7 @@ def __init__(
         transform: Optional[Callable] = None,
         target_transform: Optional[Callable] = None,
         download: bool = False,
+        loader: Callable[[str], Any] = default_loader,
     ) -> None:
         self.root = os.path.expanduser(root)
         self._base_folder = os.path.join(self.root, "eurosat")
@@ -40,7 +44,12 @@ def __init__(
         if not self._check_exists():
             raise RuntimeError("Dataset not found. You can use download=True to download it")

-        super().__init__(self._data_folder, transform=transform, target_transform=target_transform)
+        super().__init__(
+            self._data_folder,
+            transform=transform,
+            target_transform=target_transform,
+            loader=loader,
+        )
         self.root = os.path.expanduser(root)

     def __len__(self) -> int:

torchvision/datasets/imagenet.py

Lines changed: 2 additions & 0 deletions
@@ -36,6 +36,8 @@ class ImageNet(ImageFolder):
         target_transform (callable, optional): A function/transform that takes in the
             target and transforms it.
         loader (callable, optional): A function to load an image given its path.
+            By default, it uses PIL as its image loader, but users could also pass in
+            ``torchvision.io.decode_image`` for decoding image data into tensors directly.

     Attributes:
         classes (list): List of the class name tuples.

torchvision/datasets/places365.py

Lines changed: 15 additions & 7 deletions
@@ -1,7 +1,7 @@
 import os
 from os import path
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Union
 from urllib.parse import urljoin

 from .folder import default_loader
@@ -15,7 +15,7 @@ class Places365(VisionDataset):
     Args:
         root (str or ``pathlib.Path``): Root directory of the Places365 dataset.
         split (string, optional): The dataset split. Can be one of ``train-standard`` (default), ``train-challenge``,
-            ``val``.
+            ``val``, ``test``.
         small (bool, optional): If ``True``, uses the small images, i.e. resized to 256 x 256 pixels, instead of the
             high resolution ones.
         download (bool, optional): If ``True``, downloads the dataset components and places them in ``root``. Already
@@ -36,7 +36,8 @@ class Places365(VisionDataset):
         RuntimeError: If ``download is False`` and the meta files, i.e. the devkit, are not present or corrupted.
         RuntimeError: If ``download is True`` and the image archive is already extracted.
     """
-    _SPLITS = ("train-standard", "train-challenge", "val")
+
+    _SPLITS = ("train-standard", "train-challenge", "val", "test")
     _BASE_URL = "http://data.csail.mit.edu/places/places365/"
     # {variant: (archive, md5)}
     _DEVKIT_META = {
@@ -50,15 +51,18 @@
         "train-standard": ("places365_train_standard.txt", "30f37515461640559006b8329efbed1a"),
         "train-challenge": ("places365_train_challenge.txt", "b2931dc997b8c33c27e7329c073a6b57"),
         "val": ("places365_val.txt", "e9f2fd57bfd9d07630173f4e8708e4b1"),
+        "test": ("places365_test.txt", "2fce8233fe493576d724142e45d93653"),
     }
     # {(split, small): (file, md5)}
     _IMAGES_META = {
         ("train-standard", False): ("train_large_places365standard.tar", "67e186b496a84c929568076ed01a8aa1"),
         ("train-challenge", False): ("train_large_places365challenge.tar", "605f18e68e510c82b958664ea134545f"),
         ("val", False): ("val_large.tar", "9b71c4993ad89d2d8bcbdc4aef38042f"),
+        ("test", False): ("test_large.tar", "41a4b6b724b1d2cd862fb3871ed59913"),
         ("train-standard", True): ("train_256_places365standard.tar", "53ca1c756c3d1e7809517cc47c5561c5"),
         ("train-challenge", True): ("train_256_places365challenge.tar", "741915038a5e3471ec7332404dfb64ef"),
         ("val", True): ("val_256.tar", "e27b17d8d44f4af9a78502beb927f808"),
+        ("test", True): ("test_256.tar", "f532f6ad7b582262a2ec8009075e186b"),
     }

     def __init__(
@@ -123,10 +127,14 @@ def process(line: str) -> Tuple[str, int]:

         return sorted(class_to_idx.keys()), class_to_idx

-    def load_file_list(self, download: bool = True) -> Tuple[List[Tuple[str, int]], List[int]]:
-        def process(line: str, sep="/") -> Tuple[str, int]:
-            image, idx = line.split()
-            return path.join(self.images_dir, image.lstrip(sep).replace(sep, os.sep)), int(idx)
+    def load_file_list(
+        self, download: bool = True
+    ) -> Tuple[List[Tuple[str, Union[int, None]]], List[Union[int, None]]]:
+        def process(line: str, sep="/") -> Tuple[str, Union[int, None]]:
+            image, idx = (line.split() + [None])[:2]
+            image = cast(str, image)
+            idx = int(idx) if idx is not None else None
+            return path.join(self.images_dir, image.lstrip(sep).replace(sep, os.sep)), idx

         file, md5 = self._FILE_LIST_META[self.split]
         file = path.join(self.root, file)
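
The reworked process helper is what lets the unannotated ``test`` file list parse: lines without a class index yield ``None``. A simplified standalone sketch of that behaviour (dropping the images_dir join; the example lines are illustrative):

def process(line, sep="/"):
    image, idx = (line.split() + [None])[:2]
    return image.lstrip(sep), (int(idx) if idx is not None else None)

print(process("/a/abbey/00000001.jpg 0"))      # ('a/abbey/00000001.jpg', 0)
print(process("Places365_test_00000001.jpg"))  # ('Places365_test_00000001.jpg', None)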
