
Commit 9b96efa

jingxu10, 1pikachu, and ZhaoqiongZ authored
update dependency version (#4454)
* update dependency version
* update LLM script:
  1. remove deprecated models
  2. remove useless environment
* update version number and runtime dependency to llm examples
* update version in docs (#4461)
* update version in script and installation command for ipex and ccl (#4462)
* update torch/torchvision/torchaudio version
* update installation commands in README

---------

Co-authored-by: dujun <[email protected]>
Co-authored-by: ZhaoqiongZ <[email protected]>
1 parent 00f9449 commit 9b96efa

24 files changed: +73 −66 lines

README.md

Lines changed: 3 additions & 2 deletions

@@ -60,9 +60,9 @@ Compilation instruction of the latest CPU code base `main` branch can be found i
 You can install Intel® Extension for PyTorch\* for GPU via command below.

 ```bash
-python -m pip install torch==2.1.0.post2 torchvision==0.16.0.post2 torchaudio==2.1.0.post2 intel-extension-for-pytorch==2.1.30+xpu oneccl_bind_pt==2.1.300+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+python -m pip install torch==2.1.0.post3 torchvision==0.16.0.post3 torchaudio==2.1.0.post3 intel-extension-for-pytorch==2.1.40+xpu oneccl_bind_pt==2.1.400+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
 # for PRC user, you can check with the following link
-python -m pip install torch==2.1.0.post2 torchvision==0.16.0.post2 torchaudio==2.1.0.post2 intel-extension-for-pytorch==2.1.30+xpu oneccl_bind_pt==2.1.300+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
+python -m pip install torch==2.1.0.post3 torchvision==0.16.0.post3 torchaudio==2.1.0.post3 intel-extension-for-pytorch==2.1.40+xpu oneccl_bind_pt==2.1.400+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/

 ```
@@ -127,3 +127,4 @@ See also: [Security Policy](SECURITY.md)



+
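For a quick sanity check after installing the wheels pinned in this hunk, something like the minimal Python sketch below can be used; it is not part of the repository, the expected version strings simply follow the new pins, and the device-query calls mirror the verification command shown in docker/README.md further down.

```python
# Minimal post-install sanity check (illustrative sketch, not shipped by the repo).
import torch
import intel_extension_for_pytorch as ipex

print(torch.__version__)   # expected: 2.1.0.post3+cxx11.abi
print(ipex.__version__)    # expected: 2.1.40+xpu

# List the XPU devices visible to the extension.
for i in range(torch.xpu.device_count()):
    print(f"[{i}]: {torch.xpu.get_device_properties(i)}")
```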

dependency_version.yml

Lines changed: 11 additions & 6 deletions

@@ -4,31 +4,36 @@ gcc:
 llvm:
   version: 16.0.6
 pytorch:
-  version: 2.1.0.post2+cxx11.abi
+  version: 2.1.0.post3+cxx11.abi
   commit: v2.1.0
 torchaudio:
-  version: 2.1.0.post2+cxx11.abi
+  version: 2.1.0.post3+cxx11.abi
   commit: v2.1.0
 torchvision:
-  version: 0.16.0.post2+cxx11.abi
+  version: 0.16.0.post3+cxx11.abi
   commit: v0.16.0
 torch-ccl:
   repo: https://github.com/intel/torch-ccl.git
   commit: 1053f1354f6293abc11e93af085524fe3664219f
-  version: 2.1.300+xpu
+  version: 2.1.400+xpu
 deepspeed:
-  version: 0.14.0
+  version: 0.14.2
 intel-extension-for-deepspeed:
-  version: 2.1.30
+  version: 2.1.40
 transformers:
   version: 4.31.0
   commit: v4.31.0
 protobuf:
   version: 3.20.3
 lm_eval:
   version: 0.3.0
+numpy:
+  version: 1.26.4
+setuptools:
+  version: 69.5.1
 basekit:
   dpcpp-cpp-rt:
     version: 2024.0.0
   mkl-dpcpp:
     version: 2024.0.0
+
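For reference, a pin file like this can be read programmatically; a minimal illustrative sketch (assumes PyYAML is available; this is not a tool shipped by the repository):

```python
# Illustrative only: read pinned versions out of dependency_version.yml.
import yaml

with open("dependency_version.yml") as f:
    deps = yaml.safe_load(f)

print(deps["pytorch"]["version"])     # 2.1.0.post3+cxx11.abi
print(deps["torch-ccl"]["version"])   # 2.1.400+xpu
print(deps["numpy"]["version"])       # 1.26.4
```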

docker/README.md

Lines changed: 6 additions & 5 deletions

@@ -19,10 +19,10 @@ Run the following commands to build a docker image by compiling from source.
 ```
 git clone https://github.com/intel/intel-extension-for-pytorch.git
 cd intel-extension-for-pytorch
-git checkout release/xpu/2.1.30
+git checkout release/xpu/2.1.40
 git submodule sync
 git submodule update --init --recursive
-docker build -f docker/Dockerfile.compile --build-arg GID_RENDER=$(getent group render | sed -E 's,^render:[^:]*:([^:]*):.*$,\1,') -t intel/intel-extension-for-pytorch:2.1.30-xpu .
+docker build -f docker/Dockerfile.compile --build-arg GID_RENDER=$(getent group render | sed -E 's,^render:[^:]*:([^:]*):.*$,\1,') -t intel/intel-extension-for-pytorch:2.1.40-xpu .
 ```

 Alternatively, `./build.sh` script has docker build command to install prebuilt wheel files, update all the relevant build arguments and execute the script. Run the command below in current directory.
@@ -34,7 +34,7 @@ export IMAGE_TYPE="xpu"
 To pull docker images use the following command:

 ```bash
-docker pull intel/intel-extension-for-pytorch:2.1.30-xpu
+docker pull intel/intel-extension-for-pytorch:2.1.40-xpu
 ```
 ### Running container:

@@ -43,7 +43,7 @@ local directory into the container. The `-v` argument can be omitted if you do n
 access to a local directory in the container.

 ```
-IMAGE_NAME=intel/intel-extension-for-pytorch:2.1.30-xpu
+IMAGE_NAME=intel/intel-extension-for-pytorch:2.1.40-xpu
 ```
 ```bash
 docker run --rm \
@@ -89,7 +89,7 @@ python -c "import torch; import intel_extension_for_pytorch as ipex; print(torch
 Sample output looks like below:
 ```bash
 2.1.0.post2+cxx11.abi
-2.1.30+xpu
+2.1.40+xpu
 [0]: _DeviceProperties(name='Intel(R) Data Center GPU Max 1550', platform_name='Intel(R) Level-Zero', dev_type='gpu', driver_version='1.3.27642', has_fp64=1, total_memory=65536MB, max_compute_units=448, gpu_eu_count=448)
 [1]: _DeviceProperties(name='Intel(R) Data Center GPU Max 1550', platform_name='Intel(R) Level-Zero', dev_type='gpu', driver_version='1.3.27642', has_fp64=1, total_memory=65536MB, max_compute_units=448, gpu_eu_count=448)
 ```
@@ -99,3 +99,4 @@ Sample output looks like below:
 Now you are inside container with Python 3.10, PyTorch, and Intel® Extension for PyTorch\* preinstalled. You can run your own script
 to run on Intel GPU.

+

docker/build.sh

File mode changed from 100755 to 100644

Lines changed: 4 additions & 3 deletions

@@ -1,7 +1,7 @@
 #!/bin/bash

 if [[ ${IMAGE_TYPE} = "xpu" ]]; then
-    IMAGE_NAME=intel/intel-extension-for-pytorch:2.1.30-$IMAGE_TYPE
+    IMAGE_NAME=intel/intel-extension-for-pytorch:2.1.40-$IMAGE_TYPE
     docker build --build-arg http_proxy=$http_proxy \
         --build-arg https_proxy=$https_proxy \
         --build-arg no_proxy=" " \
@@ -16,10 +16,10 @@ if [[ ${IMAGE_TYPE} = "xpu" ]]; then
         --build-arg MKL_VER=2024.1.0-691 \
         --build-arg CCL_VER=2021.12.0-309 \
         --build-arg TORCH_VERSION=2.1.0.post2+cxx11.abi \
-        --build-arg IPEX_VERSION=2.1.30+xpu \
+        --build-arg IPEX_VERSION=2.1.40+xpu \
         --build-arg TORCHVISION_VERSION=0.16.0.post2+cxx11.abi \
         --build-arg TORCHAUDIO_VERSION=2.1.0.post2+cxx11.abi \
-        --build-arg ONECCL_BIND_PT_VERSION=2.1.300+xpu \
+        --build-arg ONECCL_BIND_PT_VERSION=2.1.400+xpu \
         --build-arg TORCH_WHL_URL=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
         --build-arg IPEX_WHL_URL=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
         --build-arg TORCHVISION_WHL_URL=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
@@ -28,3 +28,4 @@ if [[ ${IMAGE_TYPE} = "xpu" ]]; then
         -t ${IMAGE_NAME} \
         -f Dockerfile.prebuilt .
 fi
+

docs/tutorials/contribution.md

Lines changed: 2 additions & 1 deletion

@@ -16,7 +16,7 @@ Once you implement and test your feature or bug-fix, submit a Pull Request to ht

 ## Developing Intel® Extension for PyTorch\* on XPU

-A full set of instructions on installing Intel® Extension for PyTorch\* from source is in the [Installation document](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu&version=v2.1.30%2bxpu).
+A full set of instructions on installing Intel® Extension for PyTorch\* from source is in the [Installation document](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu&version=v2.1.40%2bxpu).

 To develop on your machine, here are some tips:

@@ -127,3 +127,4 @@ To build the documentation:

 The `.rst` source files live in `docs/tutorials` folder. Some of the `.rst` files pull in docstrings from Intel® Extension for PyTorch\* Python code (for example, via the `autofunction` or `autoclass` directives). To shorten doc build times, it is helpful to remove the files you are not working on, only keeping the base `index.rst` file and the files you are editing. The Sphinx build will produce missing file warnings but will still complete.

+

docs/tutorials/features/DDP.md

Lines changed: 2 additions & 1 deletion

@@ -50,7 +50,7 @@ python -m pip install oneccl_bind_pt --extra-index-url <REPO_URL>

 #### Install from source

-Refer to [Installation Guide](https://github.com/intel/torch-ccl/tree/ccl_torch2.1.300+xpu?tab=readme-ov-file#install-from-source) to install Intel® oneCCL Bindings for Pytorch\* from source.
+Refer to [Installation Guide](https://github.com/intel/torch-ccl/tree/ccl_torch2.1.400+xpu?tab=readme-ov-file#install-from-source) to install Intel® oneCCL Bindings for Pytorch\* from source.

 ### Runtime Dynamic Linking

@@ -247,3 +247,4 @@ train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_
 ```
 Then you can start your model training on multiple GPU devices of one card.

+
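For context on what "model training on multiple GPU devices" relies on in the page touched here: importing the oneCCL bindings registers the `ccl` backend used by `torch.distributed`. A minimal, hedged sketch under that assumption (the rendezvous environment values and model are illustrative; the linked Installation Guide covers the supported setup):

```python
# Minimal DDP-on-XPU sketch; environment defaults and model are illustrative only.
import os
import torch
import torch.distributed as dist
import intel_extension_for_pytorch as ipex          # noqa: F401
import oneccl_bindings_for_pytorch                  # noqa: F401  (registers the "ccl" backend)

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
rank = int(os.environ.get("RANK", 0))
world_size = int(os.environ.get("WORLD_SIZE", 1))

# Initialize the process group on the oneCCL backend.
dist.init_process_group(backend="ccl", rank=rank, world_size=world_size)

device = f"xpu:{rank}"
model = torch.nn.Linear(128, 64).to(device)
ddp_model = torch.nn.parallel.DistributedDataParallel(model)
```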

docs/tutorials/features/torch_compile_gpu.md

Lines changed: 2 additions & 1 deletion

@@ -14,7 +14,7 @@ Intel® Extension for PyTorch\* now empowers users to seamlessly harness graph c
 - `intel_extension_for_pytorch` : > v2.1.10
 - `triton` : [v2.1.0](https://github.com/intel/intel-xpu-backend-for-triton/releases/tag/v2.1.0) with Intel® XPU Backend for Triton* backend enabled.

-Follow [Intel® Extension for PyTorch\* Installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu&version=v2.1.30%2bxpu) to install `torch` and `intel_extension_for_pytorch` firstly.
+Follow [Intel® Extension for PyTorch\* Installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu&version=v2.1.40%2bxpu) to install `torch` and `intel_extension_for_pytorch` firstly.

 Then install [Intel® XPU Backend for Triton\* backend](https://github.com/intel/intel-xpu-backend-for-triton) for `triton` package. You may install it via prebuilt wheel package or build it from the source. We recommend installing via prebuilt package:

@@ -72,3 +72,4 @@ loss.backward()
 optimizer.step()
 ```

+
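The page touched by this hunk documents graph compilation through `torch.compile` once the Triton XPU backend listed above is installed; a rough inference-side sketch under those assumptions (the training loop whose tail appears in the second hunk follows the same pattern):

```python
# Rough torch.compile-on-XPU sketch; assumes torch, IPEX, and the Triton XPU backend are installed.
import torch
import intel_extension_for_pytorch as ipex

model = torch.nn.Linear(128, 64).eval().to("xpu")
model = ipex.optimize(model)      # apply IPEX optimizations before compiling
compiled = torch.compile(model)   # default inductor backend, lowered via Triton on XPU

with torch.no_grad():
    out = compiled(torch.randn(4, 128, device="xpu"))
print(out.shape)
```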

docs/tutorials/getting_started.md

Lines changed: 2 additions & 1 deletion

@@ -1,6 +1,6 @@
 # Quick Start

-The following instructions assume you have installed the Intel® Extension for PyTorch\*. For installation instructions, refer to [Installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu&version=v2.1.30%2bxpu).
+The following instructions assume you have installed the Intel® Extension for PyTorch\*. For installation instructions, refer to [Installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu&version=v2.1.40%2bxpu).

 To start using the Intel® Extension for PyTorch\* in your code, you need to make the following changes:

@@ -59,3 +59,4 @@ source /opt/intel/oneapi/mkl/latest/env/vars.sh
 python <script>
 ```

+
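The "following changes" the Quick Start page refers to amount to importing the extension, moving model and data to the `xpu` device, and calling `ipex.optimize`; a minimal inference sketch along those lines (shapes, model, and dtype are illustrative):

```python
# Minimal Quick-Start-style inference sketch; model, shapes, and dtype are illustrative.
import torch
import intel_extension_for_pytorch as ipex

model = torch.nn.Linear(128, 64).eval().to("xpu")
data = torch.randn(4, 128).to("xpu")

model = ipex.optimize(model, dtype=torch.float16)
with torch.no_grad(), torch.xpu.amp.autocast(enabled=True, dtype=torch.float16):
    output = model(data)
print(output.shape)
```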

docs/tutorials/installation.rst

Lines changed: 2 additions & 1 deletion

@@ -1,7 +1,8 @@
 Installation
 ============

-Select your preferences and follow the installation instructions provided on the `Installation page <https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu&version=v2.1.30%2bxpu>`_.
+Select your preferences and follow the installation instructions provided on the `Installation page <https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu&version=v2.1.40%2bxpu>`_.

 After successful installation, refer to the `Quick Start <getting_started.md>`_ and `Examples <examples.md>`_ sections to start using the extension in your code.

+

docs/tutorials/introduction.rst

Lines changed: 2 additions & 1 deletion

@@ -9,11 +9,12 @@ For the detailed list of supported features and usage instructions, refer to `Fe

 Get Started
 -----------
-- `Installation <https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu&version=v2.1.30%2bxpu>`_
+- `Installation <https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu&version=v2.1.40%2bxpu>`_
 - `Quick Start <getting_started.md>`_
 - `Examples <examples.md>`_

 API Documentation
 -----------------
 For detailed description of the Intel® Extension for PyTorch* APIs, refer to the `API Documentation <api_doc.html>`_ section.

+

docs/tutorials/llm.rst

Lines changed: 2 additions & 1 deletion

@@ -56,7 +56,7 @@ Optimized Models

 *Note*: The above verified models (including other models in the same model family, like "codellama/CodeLlama-7b-hf" from LLAMA family) are well supported with all optimizations like indirect access KV cache, fused ROPE, and prepacked TPP Linear (fp16). For other LLMs families, we are working in progress to cover those optimizations, which will expand the model list above.

-Check `LLM best known practice <https://github.com/intel/intel-extension-for-pytorch/tree/release/xpu/2.1.30/examples/gpu/inference/python/llm>`_ for instructions to install/setup environment and example scripts..
+Check `LLM best known practice <https://github.com/intel/intel-extension-for-pytorch/tree/release/xpu/2.1.40/examples/gpu/inference/python/llm>`_ for instructions to install/setup environment and example scripts..

 Optimization Methodologies
 --------------------------
@@ -142,3 +142,4 @@ For more detailed information, check `WOQ INT4 <llm/int4_weight_only_quantizatio
 llm/int4_weight_only_quantization


+

docs/tutorials/llm/int4_weight_only_quantization.md

Lines changed: 3 additions & 2 deletions

@@ -119,7 +119,7 @@ After the policy is selected, Intel® Extension for PyTorch\* will use `HGEMM_IN
 Intel® Extension for PyTorch\* implements Weight-Only Quantization for Intel® Data Center GPU Max Series and Intel® Arc™ A-Series Graphics with Intel® Extension for Transformers\*. Below section uses Qwen-7B to demonstrate the detailed usage.

 ### Environment Setup
-Please refer to the [instructions](https://github.com/intel/intel-extension-for-pytorch/blob/v2.1.30%2Bxpu/examples/gpu/inference/python/llm/README.md#environment-setup).
+Please refer to the [instructions](https://github.com/intel/intel-extension-for-pytorch/blob/v2.1.40%2Bxpu/examples/gpu/inference/python/llm/README.md#environment-setup).

 ### Run Weight-Only Quantization LLM on Intel® GPU
 #### Install Intel-extension-for-transformers and Neural-compressor
@@ -177,7 +177,7 @@ output = loaded_model.generate(inputs)

 ```

-#### Execute [WOQ benchmark script](https://github.com/intel/intel-extension-for-pytorch/blob/v2.1.30%2Bxpu/examples/gpu/inference/python/llm/run_benchmark_woq.sh)
+#### Execute [WOQ benchmark script](https://github.com/intel/intel-extension-for-pytorch/blob/v2.1.40%2Bxpu/examples/gpu/inference/python/llm/run_benchmark_woq.sh)

 ```python
 bash run_benchmark_woq.sh
@@ -186,3 +186,4 @@ bash run_benchmark_woq.sh
 >Note:
 > * Do save quantized model before call `optimize_transformers` function.
 > * The optimize_transformers function is designed to optimize transformer-based models within frontend python modules, with a particular focus on Large Language Models (LLMs). It provides optimizations for both model-wise and content-generation-wise. Please refer to [Transformers Optimization Frontend API](../../tutorials/llm/llm_optimize_transformers.md) for the detail of `optimize_transformers`.
+
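As background for the `optimize_transformers` note above, the plain FP16 (non-quantized) flow it refers to looks roughly like the sketch below; the model id and generation arguments are illustrative, and the complete recipes live in the examples directory linked in these hunks:

```python
# Rough FP16 optimize_transformers sketch; model id and generation args are illustrative.
import torch
import intel_extension_for_pytorch as ipex
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen-7B"  # illustrative; matches the Qwen-7B example in this page
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.float16, trust_remote_code=True
).eval().to("xpu")

# Apply the transformers-specific IPEX optimizations on the XPU device.
model = ipex.optimize_transformers(model, dtype=torch.float16, device="xpu")

inputs = tokenizer("Hello, world!", return_tensors="pt").input_ids.to("xpu")
with torch.no_grad():
    out = model.generate(inputs, max_new_tokens=32)
print(tokenizer.decode(out[0]))
```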

docs/tutorials/llm/llm_optimize_transformers.md

Lines changed: 3 additions & 2 deletions

@@ -9,7 +9,7 @@ API documentation is available at [API Docs page](../api_doc.html#ipex.optimize_

 ## Pseudocode of Common Usage Scenarios

-The following sections show pseudocode snippets to invoke Intel® Extension for PyTorch\* APIs to work with LLMs. Complete examples can be found at [the Example directory](https://github.com/intel/intel-extension-for-pytorch/tree/v2.1.30%2Bxpu/examples/gpu/inference/python/llm).
+The following sections show pseudocode snippets to invoke Intel® Extension for PyTorch\* APIs to work with LLMs. Complete examples can be found at [the Example directory](https://github.com/intel/intel-extension-for-pytorch/tree/v2.1.40%2Bxpu/examples/gpu/inference/python/llm).

 ### FP16

@@ -117,7 +117,8 @@ print(modelJit.graph_for(inference_dta))

 Distributed inference can be performed with `DeepSpeed`. Based on original Intel® Extension for PyTorch\* scripts, the following code changes are required.

-Check Distributed Examples in [LLM example](https://github.com/intel/intel-extension-for-pytorch/tree/v2.1.30%2Bxpu/examples/gpu/inference/python/llm) for complete codes.
+Check Distributed Examples in [LLM example](https://github.com/intel/intel-extension-for-pytorch/tree/v2.1.40%2Bxpu/examples/gpu/inference/python/llm) for complete codes.
+


examples/gpu/inference/python/llm/Dockerfile

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,17 +32,17 @@ RUN useradd -m -s /bin/bash ubuntu && \
3232
USER ubuntu
3333
WORKDIR /home/ubuntu
3434

35-
RUN curl -fsSL -v -o miniconda.sh -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
36-
bash miniconda.sh -b -p ./miniconda3 && \
37-
rm miniconda.sh && \
38-
echo "source ~/miniconda3/bin/activate" >> ./.bashrc
35+
RUN curl -fsSL -v -o miniforge.sh -O https://github.com/conda-forge/miniforge/releases/download/24.1.2-0/Miniforge3-24.1.2-0-Linux-x86_64.sh && \
36+
bash miniforge.sh -b -p ./miniforge3 && \
37+
rm miniforge.sh && \
38+
echo "source ~/miniforge3/bin/activate" >> ./.bashrc
3939

4040
FROM base AS dev
4141
# --build-arg COMPILE=ON to compile from source
4242
ARG COMPILE
4343
RUN bash /basekit_driver_install_helper.sh dev
4444
COPY --chown=ubuntu:ubuntu . ./intel-extension-for-pytorch/
45-
RUN . ./miniconda3/bin/activate && \
45+
RUN . ./miniforge3/bin/activate && \
4646
conda create -y -n compile_py310 python=3.10 && conda activate compile_py310 && \
4747
cd intel-extension-for-pytorch/examples/gpu/inference/python/llm && \
4848
if [ -z ${COMPILE} ]; then MODE=6; else MODE=2; fi && \
@@ -54,7 +54,7 @@ COPY --from=dev --chown=ubuntu:ubuntu /home/ubuntu/intel-extension-for-pytorch/e
5454
COPY --from=dev --chown=ubuntu:ubuntu /home/ubuntu/intel-extension-for-pytorch/tools/get_libstdcpp_lib.sh .
5555
COPY --from=dev --chown=ubuntu:ubuntu /home/ubuntu/intel-extension-for-pytorch/tools/basekit_driver_install_helper.sh .
5656
RUN bash ./basekit_driver_install_helper.sh runtime-dev
57-
RUN . ./miniconda3/bin/activate && \
57+
RUN . ./miniforge3/bin/activate && \
5858
conda create -y -n py310 python=3.10 && conda activate py310 && \
5959
echo "conda activate py310" >> ./.bashrc && \
6060
ldpreload=$(bash get_libstdcpp_lib.sh) && echo "export LD_PRELOAD=${ldpreload}" >> ./.bashrc && rm get_libstdcpp_lib.sh && \
