diff --git a/README.md b/README.md
index 47db06c4d..a675f7d00 100644
--- a/README.md
+++ b/README.md
@@ -8,11 +8,16 @@
 # TorchX

-TorchX is a library containing standard DSLs for authoring and running PyTorch
-related components for an E2E production ML pipeline.
+TorchX is a universal job launcher for PyTorch applications.
+TorchX is designed for fast iteration during training and research, with
+support for E2E production ML pipelines when you're ready.

 For the latest documentation, please refer to our [website](https://pytorch.org/torchx).

+## Quickstart
+
+See the [quickstart guide](https://pytorch.org/torchx/latest/quickstart.html).
+
 ## Requirements

 TorchX SDK (torchx):
@@ -58,10 +63,6 @@ $ pip install -e git+https://github.com/pytorch/torchx.git#egg=torchx
 $ pip install -e git+https://github.com/pytorch/torchx.git#egg=torchx[kubernetes]
 ```

-## Quickstart
-
-See the [quickstart guide](https://pytorch.org/torchx/latest/quickstart.html).
-
 ## Contributing

 We welcome PRs! See the [CONTRIBUTING](CONTRIBUTING.md) file.
diff --git a/docs/source/.gitignore b/docs/source/.gitignore
new file mode 100644
index 000000000..ff326926a
--- /dev/null
+++ b/docs/source/.gitignore
@@ -0,0 +1,3 @@
+.torchxconfig
+Dockerfile*
+*.py
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 0e297fd6a..0505c4112 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -361,4 +361,6 @@ def handle_item(fieldarg, content):
""" -# nbsphinx_execute = 'never' + +if os.environ.get("SKIP_NB"): + nbsphinx_execute = "never" diff --git a/docs/source/custom_components.md b/docs/source/custom_components.md new file mode 100644 index 000000000..040dd32aa --- /dev/null +++ b/docs/source/custom_components.md @@ -0,0 +1,149 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .md + format_name: markdown + format_version: '1.1' + jupytext_version: 1.1.0 + kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +# Custom Components + +This is a guide on how to build a simple app and custom component spec +and launch it via two different schedulers. + +See the [Quickstart Guide](quickstart.md) for installation and basic usage. + +## Hello World + +Lets start off with writing a simple "Hello World" python app. This is just a +normal python program and can contain anything you'd like. + +
+<div class="admonition note">
+<div class="admonition-title">Note</div>
+
+This example uses Jupyter Notebook `%%writefile` to create local files for
+example purposes. Under normal usage you would have these as standalone files.
+</div>
+
+```python
+%%writefile my_app.py
+
+import sys
+import argparse
+
+def main(user: str) -> None:
+    print(f"Hello, {user}!")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Hello world app"
+    )
+    parser.add_argument(
+        "--user",
+        type=str,
+        help="the person to greet",
+        required=True,
+    )
+    args = parser.parse_args(sys.argv[1:])
+
+    main(args.user)
+```
+
+Now that we have an app, we can write the component file for it. This
+function allows us to reuse and share our app in a user-friendly way.
+
+We can use this component from the `torchx` CLI or programmatically as part of
+a pipeline (see the sketch below).
+
+```python
+%%writefile my_component.py
+
+import torchx.specs as specs
+
+def greet(user: str, image: str = "my_app:latest") -> specs.AppDef:
+    return specs.AppDef(
+        name="hello_world",
+        roles=[
+            specs.Role(
+                name="greeter",
+                image=image,
+                entrypoint="python",
+                args=[
+                    "-m", "my_app",
+                    "--user", user,
+                ],
+            )
+        ],
+    )
+```
+
+We can execute our component via `torchx run`. The
+`local_cwd` scheduler executes the component relative to the current directory.
+
+```sh
+torchx run --scheduler local_cwd my_component.py:greet --user "your name"
+```
+
+If we want to run in other environments, we can build a Docker container so we
+can run our component in Docker-enabled environments such as Kubernetes or via
+the local Docker scheduler.
+
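+Before we do that, here is the programmatic route mentioned earlier. This is a
+minimal sketch, assuming the `torchx.runner` API and the `my_component.py`
+file above:
+
+```python
+# Sketch: launch the greet component via the TorchX runner instead of the CLI.
+from torchx.runner import get_runner
+
+from my_component import greet
+
+runner = get_runner()
+app_handle = runner.run(greet(user="your name"), scheduler="local_cwd")
+print(runner.status(app_handle))
+```
+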
+<div class="admonition note">
+<div class="admonition-title">Note</div>
+
+This requires Docker installed and won't work in environments such as Google
+Colab. If you have not done so already follow the install instructions on:
+[https://docs.docker.com/get-docker/](https://docs.docker.com/get-docker/)
+</div>
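+If you're unsure whether Docker is usable in your environment, a quick check
+from the notebook (a sketch, assuming the `docker` CLI is on your `PATH`):
+
+```python
+# Sketch: verify that a Docker daemon is reachable before building images.
+import subprocess
+
+result = subprocess.run(["docker", "info"], capture_output=True)
+print("ready" if result.returncode == 0 else "Docker daemon not reachable")
+```
+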
+
+```python
+%%writefile Dockerfile.custom
+
+FROM ghcr.io/pytorch/torchx:0.1.0rc1
+
+ADD my_app.py .
+```
+
+Once we have the Dockerfile created, we can build our Docker image.
+
+```sh
+docker build -t my_app:latest -f Dockerfile.custom .
+```
+
+We can then launch it on the local scheduler.
+
+```sh
+torchx run --scheduler local_docker my_component.py:greet --image "my_app:latest" --user "your name"
+```
+
+If you have a Kubernetes cluster you can use the [Kubernetes scheduler](schedulers/kubernetes.rst) to launch
+this on the cluster instead.
+
+```sh
+$ docker push my_app:latest
+$ torchx run --scheduler kubernetes my_component.py:greet --image "my_app:latest" --user "your name"
+```
+
+## Builtins
+
+TorchX also provides a number of builtin components with premade images. You can discover
+them via:
+
+```sh
+torchx builtins
+```
+
+You can use these either from the CLI, from a pipeline, or programmatically like
+you would any other component.
+
+```sh
+torchx run utils.echo --msg "Hello :)"
+```
diff --git a/docs/source/index.rst b/docs/source/index.rst
index d8e5de945..bab04ad9f 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -3,37 +3,33 @@
 TorchX
 ==================

-TorchX is an SDK for quickly building and deploying ML applications from R&D to production.
-It offers various builtin components that encode MLOps best practices and make advanced
-features like distributed training and hyperparameter optimization accessible to all.
-Users can get started with TorchX with no added setup cost since it supports popular
-ML schedulers and pipeline orchestrators that are already widely adopted and deployed
-in production.
+TorchX is a universal job launcher for PyTorch applications.
+TorchX is designed for fast iteration during training and research, with
+support for E2E production ML pipelines when you're ready.

-No two production environments are the same. To comply with various use cases, TorchX's
-core APIs allow tons of customization at well-defined extension points so that even the
-most unique applications can be serviced without customizing the whole vertical stack.
+**GETTING STARTED?** Follow the :ref:`quickstart guide`.

-**GETTING STARTED?** First learn the :ref:`basic concepts` and
-follow the :ref:`quickstart guide`.
-
-.. image:: torchx_index_diag.png
-
 In 1-2-3
 -----------------

-**01 DEFINE OR CHOOSE** Start by :ref:`writing a component` -- a python
-function that returns an AppDef object for your application. Or you can choose one of the
-:ref:`builtin components`.
+Step 1. Install
+
+.. code-block:: shell
+
+   pip install torchx[dev]
+
+Step 2. Run Locally
+
+.. code-block:: shell

-**02 RUN AS A JOB** Once you've defined or chosen a component, you can :ref:`run it`
-by submitting it as a job in one of the supported :ref:`Schedulers`. TorchX supports several
-popular ones, such as Kubernetes and SLURM out of the box.
+   torchx run --scheduler local_cwd utils.python --script my_app.py "Hello, localhost!"

-**03 CONVERT TO PIPELINE** In production, components are often run as a workflow (aka pipeline).
-TorchX components can be converted to pipeline stages by passing them through the :py:mod:`torchx.pipelines`
-adapter. :ref:`Pipelines` lists the pipeline orchestrators supported out of the box.
+Step 3. Run Remotely
+
+.. code-block:: shell
+
+   torchx run --scheduler kubernetes utils.python --script my_app.py "Hello, Kubernetes!"
 Documentation
@@ -43,13 +39,12 @@ Documentation
    :maxdepth: 1
    :caption: Usage

-   basics
    quickstart.md
    cli
-
+   basics
    runner.config
-   advanced
+   custom_components.md


 Works With
diff --git a/docs/source/pipelines.rst b/docs/source/pipelines.rst
index 033d1d9d5..5569a2d37 100644
--- a/docs/source/pipelines.rst
+++ b/docs/source/pipelines.rst
@@ -4,19 +4,12 @@ torchx.pipelines
 .. automodule:: torchx.pipelines
 .. currentmodule:: torchx.pipelines

-torchx.pipelines.kfp
-#####################
+All Pipelines
+~~~~~~~~~~~~~~~~

-.. image:: pipeline_kfp_diagram.png
+.. toctree::
+   :maxdepth: 1
+   :glob:

-.. automodule:: torchx.pipelines.kfp
-.. currentmodule:: torchx.pipelines.kfp
+   pipelines/*

-.. currentmodule:: torchx.pipelines.kfp.adapter
-
-.. autofunction:: container_from_app
-.. autofunction:: resource_from_app
-.. autofunction:: component_from_app
-.. autofunction:: component_spec_from_app
-
-.. autoclass:: ContainerFactory
diff --git a/docs/source/pipelines/kfp.rst b/docs/source/pipelines/kfp.rst
index c33162185..cabb04493 100644
--- a/docs/source/pipelines/kfp.rst
+++ b/docs/source/pipelines/kfp.rst
@@ -2,7 +2,23 @@ Kubeflow Pipelines
 ======================

 TorchX provides an adapter to run TorchX components as part of Kubeflow
-Pipelines. See :ref:`examples_pipelines/index:KubeFlow Pipelines Examples` and
-the :mod:`torchx.pipelines.kfp` for API reference.
+Pipelines. See :ref:`examples_pipelines/index:KubeFlow Pipelines Examples`.

 .. image:: kfp_diagram.jpg
+
+torchx.pipelines.kfp
+#####################
+
+.. image:: pipeline_kfp_diagram.png
+
+.. automodule:: torchx.pipelines.kfp
+.. currentmodule:: torchx.pipelines.kfp
+
+.. currentmodule:: torchx.pipelines.kfp.adapter
+
+.. autofunction:: container_from_app
+.. autofunction:: resource_from_app
+.. autofunction:: component_from_app
+.. autofunction:: component_spec_from_app
+
+.. autoclass:: ContainerFactory
diff --git a/docs/source/pipeline_kfp_diagram.png b/docs/source/pipelines/pipeline_kfp_diagram.png
similarity index 100%
rename from docs/source/pipeline_kfp_diagram.png
rename to docs/source/pipelines/pipeline_kfp_diagram.png
diff --git a/docs/source/quickstart.md b/docs/source/quickstart.md
index cf3a45501..1908dec2c 100644
--- a/docs/source/quickstart.md
+++ b/docs/source/quickstart.md
@@ -12,10 +12,10 @@ jupyter:
     name: python3
 ---

-# Quickstart - Custom Components
+# Quickstart

-This is a self contained guide on how to build a simple app and component spec
-and launch it via two different schedulers.
+This is a self-contained guide on how to write a simple app and start launching
+distributed jobs on local and remote clusters.

 ## Installation

@@ -51,126 +51,190 @@ example purposes. Under normal usage you would have these as standalone files.
 %%writefile my_app.py

 import sys
-import argparse
-
-def main(user: str) -> None:
-    print(f"Hello, {user}!")
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Hello world app"
-    )
-    parser.add_argument(
-        "--user",
-        type=str,
-        help="the person to greet",
-        required=True,
-    )
-    args = parser.parse_args(sys.argv[1:])
-
-    main(args.user)
+
+print(f"Hello, {sys.argv[1]}!")
 ```

-Now that we have an app we can write the component file for it. This
-function allows us to reuse and share our app in a user friendly way.
+## Launching

-We can use this component from the `torchx` cli or programmatically as part of a
-pipeline.
+We can execute our app via `torchx run`. The
+`local_cwd` scheduler executes the app relative to the current directory.
-```python
-%%writefile my_component.py
-
-import torchx.specs as specs
-
-def greet(user: str, image: str = "my_app:latest") -> specs.AppDef:
-    return specs.AppDef(
-        name="hello_world",
-        roles=[
-            specs.Role(
-                name="greeter",
-                image=image,
-                entrypoint="python",
-                args=[
-                    "-m", "my_app",
-                    "--user", user,
-                ],
-            )
-        ],
-    )
+For this we'll use the `utils.python` component:
+
+```sh
+torchx run --scheduler local_cwd utils.python --help
 ```

-We can execute our component via `torchx run`. The
-`local_cwd` scheduler executes the component relative to the current directory.
+The component takes in the script name, and any extra arguments are passed to
+the script itself.

 ```sh
-torchx run --scheduler local_cwd my_component.py:greet --user "your name"
+torchx run --scheduler local_cwd utils.python --script my_app.py "your name"
 ```

-If we want to run in other environments, we can build a Docker container so we
-can run our component in Docker enabled environments such as Kubernetes or via
-the local Docker scheduler.
+We can run the exact same app via the `local_docker` scheduler. This scheduler
+will package up the local workspace as a layer on top of the specified image.
+This provides a very similar environment to the container-based remote
+schedulers.
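+
+Under the hood, the patched image is just a thin layer on top of your base
+image. A sketch of what `DockerWorkspace` (in
+`torchx.workspace.docker_workspace`) generates when your workspace has no
+custom Dockerfile:
+
+```python
+# Sketch: the generated build context boils down to a two-line Dockerfile.
+img = "ghcr.io/pytorch/torchx:0.1.0rc1"  # whatever base image the role uses
+dockerfile = f"FROM {img}\nCOPY . .\n"
+print(dockerfile)
+```
+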
 <div class="admonition note">
 <div class="admonition-title">Note</div>

 This requires Docker installed and won't work in environments such as Google
-Colab. If you have not done so already follow the install instructions on:
+Colab. See the Docker install instructions:
 [https://docs.docker.com/get-docker/](https://docs.docker.com/get-docker/)
 </div>
+
+```sh
+torchx run --scheduler local_docker utils.python --script my_app.py "your name"
+```
+
+TorchX defaults to using the
+[ghcr.io/pytorch/torchx](https://ghcr.io/pytorch/torchx) Docker container image
+which contains the PyTorch libraries, TorchX and related dependencies.
+
+## Distributed
+
+TorchX's `dist.ddp` component uses
+[TorchElastic](https://pytorch.org/docs/stable/distributed.elastic.html)
+to manage the workers. This means you can launch multi-worker and multi-host
+jobs out of the box on all of the schedulers we support.
+
+```sh
+torchx run --scheduler local_docker dist.ddp --help
+```
+
+Let's create a slightly more interesting app to leverage the TorchX distributed
+support.
+
 ```python
-%%writefile Dockerfile
+%%writefile dist_app.py

-FROM ghcr.io/pytorch/torchx:0.1.0rc1
+import torch
+import torch.distributed as dist

-ADD my_app.py .
+dist.init_process_group(backend="gloo")
+print(f"I am worker {dist.get_rank()} of {dist.get_world_size()}!")
+
+a = torch.tensor([dist.get_rank()])
+dist.all_reduce(a)
+print(f"all_reduce output = {a}")
 ```

-Once we have the Dockerfile created we can create our docker image.
+Let's launch a small job with 2 nodes and 2 worker processes per node:

 ```sh
-docker build -t my_app:latest -f Dockerfile .
+torchx run --scheduler local_docker dist.ddp -j 2x2 --script dist_app.py
 ```

-We can then launch it on the local scheduler.
+## Workspaces / Patching

-```sh
-torchx run --scheduler local_docker my_component.py:greet --image "my_app:latest" --user "your name"
+For each scheduler there's a concept of an `image`. For `local_cwd` and `slurm`
+it uses the current working directory. For container-based schedulers such as
+`local_docker`, `kubernetes` and `aws_batch` it uses a Docker container.
+
+To provide the same environment between local and remote jobs, the TorchX CLI
+uses workspaces to automatically patch images for remote jobs on a
+per-scheduler basis.
+
+When you launch a job via `torchx run` it'll overlay the current directory on
+top of the provided image so your code is available in the launched job.
+
+For `docker`-based schedulers you'll need a local Docker daemon to build and
+push the image to your remote Docker repository.
+
+## `.torchxconfig`
+
+Arguments to schedulers can be specified either via a command line flag to
+`torchx run -s <scheduler> -c <args>` or on a per-scheduler basis via a
+`.torchxconfig` file.
+
+```python
+%%writefile .torchxconfig
+
+[kubernetes]
+queue=torchx
+image_repo=<your docker image repo>
+
+[slurm]
+partition=torchx
 ```

-If you have a Kubernetes cluster you can use the [Kubernetes scheduler](schedulers/kubernetes.rst) to launch
-this on the cluster instead.
+## Remote Schedulers

+TorchX supports a large number of schedulers.
+Don't see yours?
+[Request it!](https://github.com/pytorch/torchx/issues/new?assignees=&labels=&template=feature-request.md)
+
+Remote schedulers operate the exact same way the local schedulers do. The same
+run command for local works out of the box on remote.

 ```sh
-$ docker push my_app:latest
-$ torchx run --scheduler kubernetes my_component.py:greet --image "my_app:latest" --user "your name"
+$ torchx run --scheduler slurm dist.ddp -j 2x2 --script dist_app.py
+$ torchx run --scheduler kubernetes dist.ddp -j 2x2 --script dist_app.py
+$ torchx run --scheduler aws_batch dist.ddp -j 2x2 --script dist_app.py
+$ torchx run --scheduler ray dist.ddp -j 2x2 --script dist_app.py
 ```

+Depending on the scheduler there may be a few extra configuration parameters so
+TorchX knows where to run the job and upload built images. These can either be
+set via `-c` or in the `.torchxconfig` file.

-## Builtins
-
-TorchX also provides a number of builtin components with premade images. You can discover
-them via:
+
+All config options:

 ```sh
-torchx builtins
+torchx runopts
 ```

+
+## Custom Images
+
+### Docker-based Schedulers
+
+If you want more than the standard PyTorch libraries you can add a custom
+Dockerfile or build your own Docker container and use it as the base image for
+your TorchX jobs.
+
+```python
+%%writefile timm_app.py
+
+import timm
+
+print(timm.models.resnet18())
+```
+
+```python
+%%writefile Dockerfile.torchx
+
+FROM pytorch/pytorch:1.10.0-cuda11.3-cudnn8-runtime
+
+RUN pip install timm
+
+COPY . .
+```
+
+Once we have the Dockerfile created, we can launch as normal and TorchX will
+automatically build the image with the newly provided Dockerfile instead of the
+default one.

 ```sh
-torchx run utils.echo --msg "Hello :)"
+torchx run --scheduler local_docker utils.python --script timm_app.py
 ```

+### Slurm
+
+The `slurm` and `local_cwd` schedulers use the current environment so you can
+use `pip` and `conda` as normal.
+
 ## Next Steps

 1. Check out other features of the [torchx CLI](cli.rst)
-2. Learn how to author more complex app specs by referencing [specs](specs.rst)
+2. Take a look at the [list of schedulers](schedulers.rst) supported by the runner
 3. Browse through the collection of [builtin components](components/overview.rst)
-4. Take a look at the [list of schedulers](schedulers.rst) supported by the runner
-5. See which [ML pipeline platforms](pipelines.rst) you can run components on
-6. See a [training app example](examples_apps/index.rst)
+4. See which [ML pipeline platforms](pipelines.rst) you can run components on
+5. See a [training app example](examples_apps/index.rst)
diff --git a/docs/source/schedulers.rst b/docs/source/schedulers.rst
index 8d27326a9..3d3ec66b4 100644
--- a/docs/source/schedulers.rst
+++ b/docs/source/schedulers.rst
@@ -9,6 +9,15 @@ and :ref:`registering` it in the entrypo

 .. image:: scheduler_diagram.png

+All Schedulers
+~~~~~~~~~~~~~~~~
+
+.. toctree::
+   :maxdepth: 1
+   :glob:
+
+   schedulers/*
+
 Scheduler Functions
 ~~~~~~~~~~~~~~~~~~~~

diff --git a/torchx/components/utils.py b/torchx/components/utils.py
index a8267981b..e3728c44e 100644
--- a/torchx/components/utils.py
+++ b/torchx/components/utils.py
@@ -99,6 +99,7 @@ def python(
     *args: str,
     m: Optional[str] = None,
     c: Optional[str] = None,
+    script: Optional[str] = None,
     image: str = torchx.IMAGE,
     name: str = "torchx_utils_python",
     cpu: int = 2,
@@ -108,7 +109,7 @@ def python(
     num_replicas: int = 1,
 ) -> specs.AppDef:
     """
-    Runs ``python -c CMD`` or ``python -m MODULE`` on the specified
+    Runs ``python`` with the specified module, command or script on the specified
     image and host. Use ``--`` to separate component args and program args
     (e.g. ``torchx run utils.python --m foo.main -- --args to --main``)
@@ -120,6 +121,7 @@
         args: arguments passed to the program in sys.argv[1:] (ignored with `--c`)
         m: run library module as a script
         c: program passed as string (may error if scheduler has a length limit on args)
+        script: .py script to run
         image: image to run on
         name: name of the job
         cpu: number of cpus per replica
@@ -129,12 +131,19 @@
         num_replicas: number of copies to run (each on its own container)
     :return:
     """
-    if m and c:
-        raise ValueError("only one of `--m` or `--c` can be specified")
-    if not m and not c:
-        raise ValueError("only one of `--m` or `--c` must be specified")
-
-    prog_args = args if m else []
+    if sum([m is not None, c is not None, script is not None]) != 1:
+        raise ValueError(
+            "exactly one of `-m`, `-c` and `--script` needs to be specified"
+        )
+
+    if script:
+        cmd = [script]
+    elif m:
+        cmd = ["-m", m]
+    elif c:
+        cmd = ["-c", c]
+    else:
+        raise ValueError("no program specified")

     return specs.AppDef(
         name=name,
@@ -145,12 +154,7 @@
             entrypoint="python",
             num_replicas=num_replicas,
             resource=specs.resource(cpu=cpu, gpu=gpu, memMB=memMB, h=h),
-            # pyre-ignore[6]: one of (only one of) m or c HAS to be not null
-            args=[
-                "-m" if m else "-c",
-                m if m else c,
-                *prog_args,
-            ],
+            args=[*cmd, *args],
             env={"HYDRA_MAIN_MODULE": m} if m else {},
         )
     ],
diff --git a/torchx/schedulers/docker_scheduler.py b/torchx/schedulers/docker_scheduler.py
index 28ade8363..9240d4189 100644
--- a/torchx/schedulers/docker_scheduler.py
+++ b/torchx/schedulers/docker_scheduler.py
@@ -148,6 +148,8 @@ def schedule(self, dryrun_info: AppDryRunInfo[DockerJob]) -> str:
         for container in req.containers:
             images.add(container.image)
         for image in images:
+            if image.startswith("sha256:"):
+                continue
             log.info(f"Pulling container image: {image} (this may take a while)")
             try:
                 client.images.pull(image)
diff --git a/torchx/workspace/docker_workspace.py b/torchx/workspace/docker_workspace.py
index 252abf6f2..f92a62f87 100644
--- a/torchx/workspace/docker_workspace.py
+++ b/torchx/workspace/docker_workspace.py
@@ -23,6 +23,9 @@
 log: logging.Logger = logging.getLogger(__name__)

+TORCHX_DOCKERFILE = "Dockerfile.torchx"
+
+
 class DockerWorkspace(Workspace):
     """
     DockerWorkspace will build patched docker images from the workspace. These
@@ -33,6 +36,12 @@ class DockerWorkspace(Workspace):
     This requires a running docker daemon locally and for remote pushing
     requires being authenticated to those repositories via ``docker login``.

+    If there is a ``Dockerfile.torchx`` file present in the workspace, it will
+    be used instead to build the container.
+
+    To exclude files from the build context you can use the standard
+    `.dockerignore` file.
+
     See more:

     * https://docs.docker.com/engine/reference/commandline/login/
@@ -67,11 +76,17 @@ def build_workspace_and_update_role(self, role: Role, workspace: str) -> None:
         context = _build_context(role.image, workspace)

         try:
-
+            try:
+                self._docker_client.images.pull(role.image)
+            except Exception as e:
+                log.warning(
+                    f"failed to pull image {role.image}, falling back to local: {e}"
+                )
             image, _ = self._docker_client.images.build(
                 fileobj=context,
                 custom_context=True,
-                pull=True,
+                dockerfile=TORCHX_DOCKERFILE,
+                pull=False,
                 rm=True,
                 labels={
                     self.LABEL_VERSION: torchx.__version__,
@@ -151,7 +166,7 @@ def _build_context(img: str, workspace: str) -> IO[bytes]:
     )
     dockerfile = bytes(f"FROM {img}\nCOPY . .\n", encoding="utf-8")
     with tarfile.open(fileobj=f, mode="w") as tf:
-        info = tarfile.TarInfo("Dockerfile")
+        info = tarfile.TarInfo(TORCHX_DOCKERFILE)
         info.size = len(dockerfile)
         tf.addfile(info, io.BytesIO(dockerfile))

diff --git a/torchx/workspace/test/docker_workspace_test.py b/torchx/workspace/test/docker_workspace_test.py
index 97cbd2e34..791bd4c08 100644
--- a/torchx/workspace/test/docker_workspace_test.py
+++ b/torchx/workspace/test/docker_workspace_test.py
@@ -163,7 +163,7 @@ def test_dockerignore(self) -> None:
         self.assertCountEqual(
             tf.getnames(),
             {
-                "Dockerfile",
+                "Dockerfile.torchx",
                 "foo.sh",
                 ".dockerignore",
                 "dir/ignorefile",