From 77422d27d73b47ecf0f201853068b934302fd27b Mon Sep 17 00:00:00 2001 From: Steve Fowler Date: Wed, 24 Apr 2024 11:00:06 -0500 Subject: [PATCH 1/9] Initial commit --- examples/gen-ai-demo/main.tf | 2 +- examples/gen-ai-gaudi-demo/README.md | 115 ++++++++++++++++++++++ examples/gen-ai-gaudi-demo/cloud_init.yml | 20 ++++ examples/gen-ai-gaudi-demo/main.tf | 97 ++++++++++++++++++ examples/gen-ai-gaudi-demo/outputs.tf | 113 +++++++++++++++++++++ examples/gen-ai-gaudi-demo/providers.tf | 4 + examples/gen-ai-gaudi-demo/recipe.yml | 101 +++++++++++++++++++ examples/gen-ai-gaudi-demo/variables.tf | 50 ++++++++++ examples/gen-ai-gaudi-demo/versions.tf | 13 +++ 9 files changed, 514 insertions(+), 1 deletion(-) create mode 100644 examples/gen-ai-gaudi-demo/README.md create mode 100644 examples/gen-ai-gaudi-demo/cloud_init.yml create mode 100644 examples/gen-ai-gaudi-demo/main.tf create mode 100644 examples/gen-ai-gaudi-demo/outputs.tf create mode 100644 examples/gen-ai-gaudi-demo/providers.tf create mode 100644 examples/gen-ai-gaudi-demo/recipe.yml create mode 100644 examples/gen-ai-gaudi-demo/variables.tf create mode 100644 examples/gen-ai-gaudi-demo/versions.tf diff --git a/examples/gen-ai-demo/main.tf b/examples/gen-ai-demo/main.tf index 70a22c2..c7750ea 100644 --- a/examples/gen-ai-demo/main.tf +++ b/examples/gen-ai-demo/main.tf @@ -80,7 +80,7 @@ module "ec2-vm" { count = var.vm_count source = "intel/aws-vm/intel" key_name = aws_key_pair.TF_key.key_name - instance_type = "m7i.4xlarge" + instance_type = "dl1.24xlarge" availability_zone = "us-east-1c" ami = data.aws_ami.ubuntu-linux-2204.id user_data = data.cloudinit_config.ansible.rendered diff --git a/examples/gen-ai-gaudi-demo/README.md b/examples/gen-ai-gaudi-demo/README.md new file mode 100644 index 0000000..6a287c0 --- /dev/null +++ b/examples/gen-ai-gaudi-demo/README.md @@ -0,0 +1,115 @@ +

+ Intel Logo +

+ +# Intel® Optimized Cloud Modules for Terraform + +© Copyright 2024, Intel Corporation + +## AWS DL1 EC2 Instance with Intel Gaudi Accelerators + +This demo will showcase Large Language Model(LLM) HPU inference using Intel Gaudi AI Accelerators + +## Usage + +### variables.tf + +Modify the region to target a specific AWS Region + +```hcl +variable "region" { + description = "Target AWS region to deploy EC2 in." + type = string + default = "us-east-1" +} +``` + +### main.tf + +Modify settings in this file to choose your AMI as well as other details around the instance that will be created. This demo was tested on Ubuntu 22.04. + +```hcl +## Get latest Ubuntu 22.04 AMI in AWS for x86 +data "aws_ami" "ubuntu-linux-2204" { + most_recent = true + owners = ["099720109477"] # Canonical + filter { + name = "name" + values = ["ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-amd64-server-*"] + } + filter { + name = "virtualization-type" + values = ["hvm"] + } +} + +module "ec2-vm" { + source = "intel/aws-vm/intel" + key_name = aws_key_pair.TF_key.key_name + instance_type = "dl1.24xlarge" + availability_zone = "us-east-1a" + ami = data.aws_ami.ubuntu-linux-2204.id + user_data = data.cloudinit_config.ansible.rendered + + root_block_device = [{ + volume_size = "100" + }] + + tags = { + Name = "my-test-vm-${random_id.rid.dec}" + Owner = "OwnerName-${random_id.rid.dec}", + Duration = "2" + } +} +``` + +Run the Terraform Commands below to deploy the demos. 
+ +```Shell +terraform init +terraform plan +terraform apply +``` + +## Running the Demo using AWS CloudShell + +Open your AWS account and click the CloudShell prompt +At the command prompt, enter these commands to install Terraform into the AWS CloudShell + +```Shell +git clone https://github.com/tfutils/tfenv.git ~/.tfenv +mkdir ~/bin +ln -s ~/.tfenv/bin/* ~/bin/ +tfenv install 1.3.0 +tfenv use 1.3.0 +``` + +Download and run the [Gen-AI-Gaudi-Demo](https://github.com/intel/terraform-intel-aws-vm/tree/main/examples/gen-ai-gaudi-demo) Terraform Module by typing this command + +```Shell +git clone https://github.com/intel/terraform-intel-aws-vm.git +``` + +Change into the `examples/gen-ai-gaudi-demo` example folder + +```Shell +cd terraform-intel-aws-vm/examples/gen-ai-gaudi-demo +``` + +Run the Terraform Commands below to deploy the demos. + +```Shell +terraform init +terraform plan +terraform apply +``` + +After the Terraform module successfully creates the EC2 instance, **wait ~15 minutes** for the recipe to download/install the Intel Gaudi driver and software. + +## Deleting the Demo + +To delete the demo, run `terraform destroy` to delete all resources created. 
+ +## Considerations + +- The AWS region where this example is run should have a default VPC diff --git a/examples/gen-ai-gaudi-demo/cloud_init.yml b/examples/gen-ai-gaudi-demo/cloud_init.yml new file mode 100644 index 0000000..b7a98f4 --- /dev/null +++ b/examples/gen-ai-gaudi-demo/cloud_init.yml @@ -0,0 +1,20 @@ +#cloud-config +package_update: true +package_upgrade: true + +package: + - git + +runcmd: + - apt install ansible docker.io -y + - reboot -h now + +# Run the Ansible playbook from https://github.com/intel/optimized-cloud-recipes/tree/main/recipes/ai-gaudi-ubuntu/recipe.yml +# Path: recipe.yml +ansible: + install_method: distro + package_name: ansible + pull: + url: "https://github.com/intel/optimized-cloud-recipes" + playbook_name: "recipes/ai-gaudi-ubuntu/recipe.yml" + checkout: wsfowler-gaudi diff --git a/examples/gen-ai-gaudi-demo/main.tf b/examples/gen-ai-gaudi-demo/main.tf new file mode 100644 index 0000000..755de29 --- /dev/null +++ b/examples/gen-ai-gaudi-demo/main.tf @@ -0,0 +1,97 @@ +# Provision EC2 Instance on Icelake on Amazon Linux OS in default vpc. It is configured to create the EC2 in +# US-East-1 region. The region is provided in variables.tf in this example folder. + +# This example also create an EC2 key pair. Associate the public key with the EC2 instance. Create the private key +# in the local system where terraform apply is done. 
Create a new security group to open up the SSH port +# 22 to a specific IP CIDR block + +######### PLEASE NOTE TO CHANGE THE IP CIDR BLOCK TO ALLOW SSH FROM YOUR OWN ALLOWED IP ADDRESS FOR SSH ######### + +data "cloudinit_config" "ansible" { + gzip = true + base64_encode = true + + part { + filename = "cloud_init" + content_type = "text/cloud-config" + content = templatefile( + "cloud_init.yml", + {} + ) + } +} + +data "aws_ami" "ubuntu-linux-2204" { + most_recent = true + owners = ["099720109477"] # Canonical + filter { + name = "name" + values = ["ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-amd64-server-*"] + } + filter { + name = "virtualization-type" + values = ["hvm"] + } +} + +resource "random_id" "rid" { + byte_length = 5 +} + +# RSA key of size 4096 bits +resource "tls_private_key" "rsa" { + algorithm = "RSA" + rsa_bits = 4096 +} + +resource "aws_key_pair" "TF_key" { + key_name = "TF_key-${random_id.rid.dec}" + public_key = tls_private_key.rsa.public_key_openssh +} + +resource "local_file" "TF_private_key" { + content = tls_private_key.rsa.private_key_pem + filename = "tfkey.private" +} +resource "aws_security_group" "ssh_security_group" { + description = "security group to configure ports for ssh" + name_prefix = "ssh_security_group" +} + +# Modify the `ingress_rules` variable in the variables.tf file to allow the required ports for your CIDR ranges +resource "aws_security_group_rule" "ingress_rules" { + count = length(var.ingress_rules) + type = "ingress" + security_group_id = aws_security_group.ssh_security_group.id + from_port = var.ingress_rules[count.index].from_port + to_port = var.ingress_rules[count.index].to_port + protocol = var.ingress_rules[count.index].protocol + cidr_blocks = [var.ingress_rules[count.index].cidr_blocks] +} + +resource "aws_network_interface_sg_attachment" "sg_attachment" { + count = length(module.ec2-vm) + security_group_id = aws_security_group.ssh_security_group.id + network_interface_id = 
module.ec2-vm[count.index].primary_network_interface_id +} + +# Modify the `vm_count` variable in the variables.tf file to create the required number of EC2 instances +module "ec2-vm" { + count = var.vm_count + source = "intel/aws-vm/intel" + key_name = aws_key_pair.TF_key.key_name + instance_type = "dl1.24xlarge" + availability_zone = "us-east-1c" + ami = data.aws_ami.ubuntu-linux-2204.id + user_data = data.cloudinit_config.ansible.rendered + + root_block_device = [{ + volume_size = "100" + }] + + tags = { + Name = "my-test-vm-${count.index}-${random_id.rid.dec}" + Owner = "Fowler-${random_id.rid.dec}", + Duration = "2" + } +} \ No newline at end of file diff --git a/examples/gen-ai-gaudi-demo/outputs.tf b/examples/gen-ai-gaudi-demo/outputs.tf new file mode 100644 index 0000000..24448ce --- /dev/null +++ b/examples/gen-ai-gaudi-demo/outputs.tf @@ -0,0 +1,113 @@ +output "id" { + description = "The ID of the instance" + value = try(module.ec2-vm.*.id, module.ec2-vm.*.id, "") +} + +output "arn" { + description = "The ARN of the instance" + value = try(module.ec2-vm.*.arn, "") +} + +output "capacity_reservation_specification" { + description = "Capacity reservation specification of the instance" + value = try(module.ec2-vm.*.capacity_reservation_specification, "") +} + +output "instance_state" { + description = "The state of the instance. One of: `pending`, `running`, `shutting-down`, `terminated`, `stopping`, `stopped`" + value = try(module.ec2-vm.*.instance_state, "") +} + +output "outpost_arn" { + description = "The ARN of the Outpost the instance is assigned to" + value = try(module.ec2-vm.*.outpost_arn, "") +} + +output "password_data" { + description = "Base-64 encoded encrypted password data for the instance. Useful for getting the administrator password for instances running Microsoft Windows. 
This attribute is only exported if `get_password_data` is true" + value = try(module.ec2-vm.*.password_data, "") +} + +output "primary_network_interface_id" { + description = "The ID of the instance's primary network interface" + value = try(module.ec2-vm.*.primary_network_interface_id, "") +} + +output "private_dns" { + description = "The private DNS name assigned to the instance. Can only be used inside the Amazon EC2, and only available if you've enabled DNS hostnames for your VPC" + value = try(module.ec2-vm.*.private_dns, "") +} + +output "public_dns" { + description = "The public DNS name assigned to the instance. For EC2-VPC, this is only available if you've enabled DNS hostnames for your VPC" + value = try(module.ec2-vm.*.public_dns, "") +} + +output "public_ip" { + description = "The public IP address assigned to the instance, if applicable. NOTE: If you are using an aws_eip with your instance, you should refer to the EIP's address directly and not use `public_ip` as this field will change after the EIP is attached" + value = try(module.ec2-vm.*.public_ip, "") +} + +output "private_ip" { + description = "The private IP address assigned to the instance." + value = try(module.ec2-vm.*.private_ip, "") +} + +output "ipv6_addresses" { + description = "The IPv6 address assigned to the instance, if applicable." 
+ value = try(module.ec2-vm.*.ipv6_addresses, []) +} + +output "tags_all" { + description = "A map of tags assigned to the resource, including those inherited from the provider default_tags configuration block" + value = try(module.ec2-vm.*.tags_all, {}) +} + +output "spot_bid_status" { + description = "The current bid status of the Spot Instance Request" + value = try(module.ec2-vm.*.spot_bid_status, "") +} + +output "spot_request_state" { + description = "The current request state of the Spot Instance Request" + value = try(module.ec2-vm.*.spot_request_state, "") +} + +output "spot_instance_id" { + description = "The Instance ID (if any) that is currently fulfilling the Spot Instance request" + value = try(module.ec2-vm.*.spot_instance_id, "") +} + +################################################################################ +# IAM Role / Instance Profile +################################################################################ + +output "iam_role_name" { + description = "The name of the IAM role" + value = try(module.ec2-vm.*.aws_iam_role.name, null) +} + +output "iam_role_arn" { + description = "The Amazon Resource Name (ARN) specifying the IAM role" + value = try(module.ec2-vm.*.aws_iam_role.arn, null) +} + +output "iam_role_unique_id" { + description = "Stable and unique string identifying the IAM role" + value = try(module.ec2-vm.*.aws_iam_role.unique_id, null) +} + +output "iam_instance_profile_arn" { + description = "ARN assigned by AWS to the instance profile" + value = try(module.ec2-vm.*.aws_iam_instance_profile.arn, null) +} + +output "iam_instance_profile_id" { + description = "Instance profile's ID" + value = try(module.ec2-vm.*.aws_iam_instance_profile.id, null) +} + +output "iam_instance_profile_unique" { + description = "Stable and unique string identifying the IAM instance profile" + value = try(module.ec2-vm.*.aws_iam_instance_profile.unique_id, null) +} \ No newline at end of file diff --git a/examples/gen-ai-gaudi-demo/providers.tf 
b/examples/gen-ai-gaudi-demo/providers.tf new file mode 100644 index 0000000..260a2e3 --- /dev/null +++ b/examples/gen-ai-gaudi-demo/providers.tf @@ -0,0 +1,4 @@ +provider "aws" { + # Environment Variables used for Authentication + region = var.region +} \ No newline at end of file diff --git a/examples/gen-ai-gaudi-demo/recipe.yml b/examples/gen-ai-gaudi-demo/recipe.yml new file mode 100644 index 0000000..743f123 --- /dev/null +++ b/examples/gen-ai-gaudi-demo/recipe.yml @@ -0,0 +1,101 @@ +########################################################## +# Host configuration # +########################################################## +--- +- name: Install pre-requisite packages + hosts: localhost + connection: local + tasks: + - name: Install pre-requisite packages + ansible.builtin.apt: + pkg: + - python3 + - python3-pip + - python-is-python3 + - net-tools + - libmkl-dev + state: present + update_cache: true + - name: Install Jupyterlab using pip + ansible.builtin.pip: + name: jupyterlab + state: present + +############################################################################################################################################# +# Setup Gaudi Frameworks # +# # +# Following the instructions here: https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#sw-stack-installation-bare # +# # +# Run the Habana setup script # +# - /tmp/habanalabs-installer.sh install -t base -y # +# - /tmp/habanalabs-installer.sh install -t dependencies -y # +# - /tmp/habanalabs-installer.sh install -t pytorch -y # +############################################################################################################################################# +- name: Install Habana base + hosts: localhost + connection: local + tasks: + - name: Download Habana installer + ansible.builtin.get_url: + url: https://vault.habana.ai/artifactory/gaudi-installer/1.15.1/habanalabs-installer.sh + dest: /tmp/habanalabs-installer.sh + mode: '0755' + - name: Install 
Habana base + ansible.builtin.shell: /tmp/habanalabs-installer.sh install -t base -y +- name: Install Habana dependencies + hosts: localhost + connection: local + tasks: + - name: Install Habana dependencies using script + ansible.builtin.shell: /tmp/habanalabs-installer.sh install -t dependencies -y + become_user: ubuntu + become: true +- name: Install Habana pytorch + hosts: localhost + connection: local + tasks: + - name: Install Habana pytorch + ansible.builtin.shell: /tmp/habanalabs-installer.sh install -t pytorch --venv -y + +# Install the Habana Container Runtime by dowloading the habana artifactory key, adding the artifactory repository to the sources list, and installing the habanalabs-container-runtime package +- name: Install Habana Container Runtime + hosts: localhost + connection: local + tasks: + - name: Download Habana artifactory key + ansible.builtin.get_url: + url: https://vault.habana.ai/artifactory/api/gpg/key/public + dest: /tmp/habana-artifactory-key + - name: Add Habana artifactory repository to sources list + ansible.builtin.copy: + content: | + deb https://vault.habana.ai/artifactory/debian jammy main + dest: /etc/apt/sources.list.d/artifactory.list + - name: Add Habana artifactory key + ansible.builtin.shell: apt-key add /tmp/habana-artifactory-key + - name: Install Habana Container Runtime + ansible.builtin.apt: + name: habanalabs-container-runtime + state: present + update_cache: true + - name: Add Habana Container Runtime to the docker daemon.json + ansible.builtin.copy: + content: | + { + "runtimes": { + "habana": { + "path": "/usr/bin/habana-container-runtime", + "runtimeArgs": [] + } + } + } + dest: /etc/docker/daemon.json + +# Install the Gaudi pytorch container from Habana Labs +- name: Install Gaudi pytorch container + hosts: localhost + connection: local + tasks: + - name: Pull the Gaudi pytorch container + ansible.builtin.shell: docker pull hdocker pull 
vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest + \ No newline at end of file diff --git a/examples/gen-ai-gaudi-demo/variables.tf b/examples/gen-ai-gaudi-demo/variables.tf new file mode 100644 index 0000000..b7d84ab --- /dev/null +++ b/examples/gen-ai-gaudi-demo/variables.tf @@ -0,0 +1,50 @@ +variable "region" { + description = "Target AWS region to deploy EC2 in." + type = string + default = "us-east-1" +} + +# Variable to add ingress rules to the security group. Replace the default values with the required ports and CIDR ranges. +variable "ingress_rules" { + type = list(object({ + from_port = number + to_port = number + protocol = string + cidr_blocks = string + })) + default = [ + { + from_port = 22 + to_port = 22 + protocol = "tcp" + cidr_blocks = "0.0.0.0/0" + + }, + { + from_port = 7860 + to_port = 7860 + protocol = "tcp" + cidr_blocks = "0.0.0.0/0" + + }, + { + from_port = 5000 + to_port = 5000 + protocol = "tcp" + cidr_blocks = "0.0.0.0/0" + }, + { + from_port = 5001 + to_port = 5001 + protocol = "tcp" + cidr_blocks = "0.0.0.0/0" + } + ] +} + +# Variable for how many VMs to build +variable "vm_count" { + description = "Number of VMs to build." 
+ type = number + default = 1 +} diff --git a/examples/gen-ai-gaudi-demo/versions.tf b/examples/gen-ai-gaudi-demo/versions.tf new file mode 100644 index 0000000..ff6e689 --- /dev/null +++ b/examples/gen-ai-gaudi-demo/versions.tf @@ -0,0 +1,13 @@ +terraform { + required_version = ">=1.3.0" + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.31" + } + cloudinit = { + source = "hashicorp/cloudinit" + version = ">=2.2.0" + } + } +} \ No newline at end of file From 5b1feffd5d79e63a3fda5c3e739da7bb84e52e39 Mon Sep 17 00:00:00 2001 From: Steve Fowler Date: Wed, 24 Apr 2024 13:19:37 -0500 Subject: [PATCH 2/9] Update cloudinit flow --- examples/gen-ai-gaudi-demo/cloud_init.yml | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/examples/gen-ai-gaudi-demo/cloud_init.yml b/examples/gen-ai-gaudi-demo/cloud_init.yml index b7a98f4..418dab2 100644 --- a/examples/gen-ai-gaudi-demo/cloud_init.yml +++ b/examples/gen-ai-gaudi-demo/cloud_init.yml @@ -8,13 +8,15 @@ package: runcmd: - apt install ansible docker.io -y - reboot -h now + - git clone https://github.com/intel/optimized-cloud-recipes/tree/wsfowler-gaudi.git /tmp/optimized-cloud-recipes + - ansible-playbook /tmp/optimized-cloud-recipes/recipes/ai-gaudi-ubuntu/recipe.yml # Run the Ansible playbook from https://github.com/intel/optimized-cloud-recipes/tree/main/recipes/ai-gaudi-ubuntu/recipe.yml # Path: recipe.yml -ansible: - install_method: distro - package_name: ansible - pull: - url: "https://github.com/intel/optimized-cloud-recipes" - playbook_name: "recipes/ai-gaudi-ubuntu/recipe.yml" - checkout: wsfowler-gaudi +# ansible: +# install_method: distro +# package_name: ansible +# pull: +# url: "https://github.com/intel/optimized-cloud-recipes" +# playbook_name: "recipes/ai-gaudi-ubuntu/recipe.yml" +# checkout: wsfowler-gaudi From f8d345607b25c2d41be429cca9ca134c3101f0eb Mon Sep 17 00:00:00 2001 From: Steve Fowler Date: Wed, 24 Apr 2024 13:35:41 -0500 Subject: [PATCH 3/9] 
Update cloud-init flow --- examples/gen-ai-gaudi-demo/cloud_init.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/gen-ai-gaudi-demo/cloud_init.yml b/examples/gen-ai-gaudi-demo/cloud_init.yml index 418dab2..042f0bc 100644 --- a/examples/gen-ai-gaudi-demo/cloud_init.yml +++ b/examples/gen-ai-gaudi-demo/cloud_init.yml @@ -7,7 +7,6 @@ package: runcmd: - apt install ansible docker.io -y - - reboot -h now - git clone https://github.com/intel/optimized-cloud-recipes/tree/wsfowler-gaudi.git /tmp/optimized-cloud-recipes - ansible-playbook /tmp/optimized-cloud-recipes/recipes/ai-gaudi-ubuntu/recipe.yml From a563da338f8fa12ea08420aa05078140b5c9c41b Mon Sep 17 00:00:00 2001 From: Steve Fowler Date: Wed, 24 Apr 2024 17:06:02 -0500 Subject: [PATCH 4/9] Updated cloud-init --- examples/gen-ai-gaudi-demo/cloud_init.yml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/examples/gen-ai-gaudi-demo/cloud_init.yml b/examples/gen-ai-gaudi-demo/cloud_init.yml index 042f0bc..aa45c4c 100644 --- a/examples/gen-ai-gaudi-demo/cloud_init.yml +++ b/examples/gen-ai-gaudi-demo/cloud_init.yml @@ -2,13 +2,14 @@ package_update: true package_upgrade: true -package: - - git - runcmd: - - apt install ansible docker.io -y - - git clone https://github.com/intel/optimized-cloud-recipes/tree/wsfowler-gaudi.git /tmp/optimized-cloud-recipes - - ansible-playbook /tmp/optimized-cloud-recipes/recipes/ai-gaudi-ubuntu/recipe.yml + - apt install git ansible docker.io -y + - git clone https://github.com/intel/optimized-cloud-recipes.git /opt/optimized-cloud-recipes + - cd /opt/optimized-cloud-recipes + - git checkout wsfowler-gaudi + - echo "@reboot ansible-playbook /opt/optimized-cloud-recipes/recipes/ai-gaudi-ubuntu/recipe.yml" | crontab - + - reboot + # Run the Ansible playbook from https://github.com/intel/optimized-cloud-recipes/tree/main/recipes/ai-gaudi-ubuntu/recipe.yml # Path: recipe.yml From 8da505ef53464a9600b322fb5a9f7062c40a2126 Mon Sep 17 00:00:00 2001 From: 
Steve Fowler Date: Tue, 7 May 2024 09:51:49 -0500 Subject: [PATCH 5/9] Update Gaudi example --- examples/gen-ai-gaudi-base/README.md | 115 ++++++++++++++++++++++ examples/gen-ai-gaudi-base/cloud_init.yml | 9 ++ examples/gen-ai-gaudi-base/main.tf | 97 ++++++++++++++++++ examples/gen-ai-gaudi-base/outputs.tf | 113 +++++++++++++++++++++ examples/gen-ai-gaudi-base/providers.tf | 4 + examples/gen-ai-gaudi-base/variables.tf | 50 ++++++++++ examples/gen-ai-gaudi-base/versions.tf | 13 +++ 7 files changed, 401 insertions(+) create mode 100644 examples/gen-ai-gaudi-base/README.md create mode 100644 examples/gen-ai-gaudi-base/cloud_init.yml create mode 100644 examples/gen-ai-gaudi-base/main.tf create mode 100644 examples/gen-ai-gaudi-base/outputs.tf create mode 100644 examples/gen-ai-gaudi-base/providers.tf create mode 100644 examples/gen-ai-gaudi-base/variables.tf create mode 100644 examples/gen-ai-gaudi-base/versions.tf diff --git a/examples/gen-ai-gaudi-base/README.md b/examples/gen-ai-gaudi-base/README.md new file mode 100644 index 0000000..6be1d3d --- /dev/null +++ b/examples/gen-ai-gaudi-base/README.md @@ -0,0 +1,115 @@ +

+ Intel Logo +

+ +# Intel® Optimized Cloud Modules for Terraform + +© Copyright 2024, Intel Corporation + +## AWS DL1 EC2 Instance with Intel Gaudi Accelerators + +This demo will showcase Large Language Model(LLM) inference using Intel Gaudi AI Accelerators. This module will install the base software required to run other examples. + +## Usage + +### variables.tf + +Modify the region to target a specific AWS Region + +```hcl +variable "region" { + description = "Target AWS region to deploy EC2 in." + type = string + default = "us-east-1" +} +``` + +### main.tf + +Modify settings in this file to choose your AMI as well as other details around the instance that will be created. This demo was tested on Ubuntu 22.04. + +```hcl +## Get latest Ubuntu 22.04 AMI in AWS for x86 +data "aws_ami" "ubuntu-linux-2204" { + most_recent = true + owners = ["099720109477"] # Canonical + filter { + name = "name" + values = ["ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-amd64-server-*"] + } + filter { + name = "virtualization-type" + values = ["hvm"] + } +} + +module "ec2-vm" { + source = "intel/aws-vm/intel" + key_name = aws_key_pair.TF_key.key_name + instance_type = "dl1.24xlarge" + availability_zone = "us-east-1a" + ami = data.aws_ami.ubuntu-linux-2204.id + user_data = data.cloudinit_config.ansible.rendered + + root_block_device = [{ + volume_size = "100" + }] + + tags = { + Name = "my-test-vm-${random_id.rid.dec}" + Owner = "OwnerName-${random_id.rid.dec}", + Duration = "2" + } +} +``` + +Run the Terraform Commands below to deploy the demos. 
+ +```Shell +terraform init +terraform plan +terraform apply +``` + +## Running the Demo using AWS CloudShell + +Open your AWS account and click the CloudShell prompt +At the command prompt, enter these commands to install Terraform into the AWS CloudShell + +```Shell +git clone https://github.com/tfutils/tfenv.git ~/.tfenv +mkdir ~/bin +ln -s ~/.tfenv/bin/* ~/bin/ +tfenv install 1.3.0 +tfenv use 1.3.0 +``` + +Download and run the [Gen-AI-Gaudi-Demo](https://github.com/intel/terraform-intel-aws-vm/tree/main/examples/gen-ai-gaudi-base) Terraform Module by typing this command + +```Shell +git clone https://github.com/intel/terraform-intel-aws-vm.git +``` + +Change into the `examples/gen-ai-gaudi-base` example folder + +```Shell +cd terraform-intel-aws-vm/examples/gen-ai-gaudi-base +``` + +Run the Terraform Commands below to deploy the demos. + +```Shell +terraform init +terraform plan +terraform apply +``` + +After the Terraform module successfully creates the EC2 instance, **wait ~10 minutes** for the recipe to download/install the Intel Gaudi driver and software. + +## Deleting the Demo + +To delete the demo, run `terraform destroy` to delete all resources created. 
+ +## Considerations + +- The AWS region where this example is run should have a default VPC diff --git a/examples/gen-ai-gaudi-base/cloud_init.yml b/examples/gen-ai-gaudi-base/cloud_init.yml new file mode 100644 index 0000000..033df45 --- /dev/null +++ b/examples/gen-ai-gaudi-base/cloud_init.yml @@ -0,0 +1,9 @@ +#cloud-config +package_update: true +package_upgrade: true + +runcmd: + - apt install git ansible docker.io -y + - git clone https://github.com/intel/optimized-cloud-recipes.git /opt/optimized-cloud-recipes + - echo "@reboot ansible-playbook /opt/optimized-cloud-recipes/recipes/ai-gaudi-ubuntu/recipe.yml" | crontab - + - reboot diff --git a/examples/gen-ai-gaudi-base/main.tf b/examples/gen-ai-gaudi-base/main.tf new file mode 100644 index 0000000..885964e --- /dev/null +++ b/examples/gen-ai-gaudi-base/main.tf @@ -0,0 +1,97 @@ +# Provision EC2 DL1 Instance on Ubuntu Linux OS in default vpc. It is configured to create the EC2 in +# US-East-1 region. The region is provided in variables.tf in this example folder. + +# This example also create an EC2 key pair. Associate the public key with the EC2 instance. Create the private key +# in the local system where terraform apply is done. 
Create a new security group to open up the SSH port +# 22 to a specific IP CIDR block + +######### PLEASE NOTE TO CHANGE THE IP CIDR BLOCK TO ALLOW SSH FROM YOUR OWN ALLOWED IP ADDRESS FOR SSH ######### + +data "cloudinit_config" "ansible" { + gzip = true + base64_encode = true + + part { + filename = "cloud_init" + content_type = "text/cloud-config" + content = templatefile( + "cloud_init.yml", + {} + ) + } +} + +data "aws_ami" "ubuntu-linux-2204" { + most_recent = true + owners = ["099720109477"] # Canonical + filter { + name = "name" + values = ["ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-amd64-server-*"] + } + filter { + name = "virtualization-type" + values = ["hvm"] + } +} + +resource "random_id" "rid" { + byte_length = 5 +} + +# RSA key of size 4096 bits +resource "tls_private_key" "rsa" { + algorithm = "RSA" + rsa_bits = 4096 +} + +resource "aws_key_pair" "TF_key" { + key_name = "TF_key-${random_id.rid.dec}" + public_key = tls_private_key.rsa.public_key_openssh +} + +resource "local_file" "TF_private_key" { + content = tls_private_key.rsa.private_key_pem + filename = "tfkey.private" +} +resource "aws_security_group" "ssh_security_group" { + description = "security group to configure ports for ssh" + name_prefix = "ssh_security_group" +} + +# Modify the `ingress_rules` variable in the variables.tf file to allow the required ports for your CIDR ranges +resource "aws_security_group_rule" "ingress_rules" { + count = length(var.ingress_rules) + type = "ingress" + security_group_id = aws_security_group.ssh_security_group.id + from_port = var.ingress_rules[count.index].from_port + to_port = var.ingress_rules[count.index].to_port + protocol = var.ingress_rules[count.index].protocol + cidr_blocks = [var.ingress_rules[count.index].cidr_blocks] +} + +resource "aws_network_interface_sg_attachment" "sg_attachment" { + count = length(module.ec2-vm) + security_group_id = aws_security_group.ssh_security_group.id + network_interface_id = 
module.ec2-vm[count.index].primary_network_interface_id +} + +# Modify the `vm_count` variable in the variables.tf file to create the required number of EC2 instances +module "ec2-vm" { + count = var.vm_count + source = "intel/aws-vm/intel" + key_name = aws_key_pair.TF_key.key_name + instance_type = "dl1.24xlarge" + availability_zone = "us-east-1c" + ami = data.aws_ami.ubuntu-linux-2204.id + user_data = data.cloudinit_config.ansible.rendered + + root_block_device = [{ + volume_size = "100" + }] + + tags = { + Name = "my-test-vm-${count.index}-${random_id.rid.dec}" + Owner = "OwnerName-${random_id.rid.dec}", + Duration = "2" + } +} \ No newline at end of file diff --git a/examples/gen-ai-gaudi-base/outputs.tf b/examples/gen-ai-gaudi-base/outputs.tf new file mode 100644 index 0000000..24448ce --- /dev/null +++ b/examples/gen-ai-gaudi-base/outputs.tf @@ -0,0 +1,113 @@ +output "id" { + description = "The ID of the instance" + value = try(module.ec2-vm.*.id, module.ec2-vm.*.id, "") +} + +output "arn" { + description = "The ARN of the instance" + value = try(module.ec2-vm.*.arn, "") +} + +output "capacity_reservation_specification" { + description = "Capacity reservation specification of the instance" + value = try(module.ec2-vm.*.capacity_reservation_specification, "") +} + +output "instance_state" { + description = "The state of the instance. One of: `pending`, `running`, `shutting-down`, `terminated`, `stopping`, `stopped`" + value = try(module.ec2-vm.*.instance_state, "") +} + +output "outpost_arn" { + description = "The ARN of the Outpost the instance is assigned to" + value = try(module.ec2-vm.*.outpost_arn, "") +} + +output "password_data" { + description = "Base-64 encoded encrypted password data for the instance. Useful for getting the administrator password for instances running Microsoft Windows. 
This attribute is only exported if `get_password_data` is true" + value = try(module.ec2-vm.*.password_data, "") +} + +output "primary_network_interface_id" { + description = "The ID of the instance's primary network interface" + value = try(module.ec2-vm.*.primary_network_interface_id, "") +} + +output "private_dns" { + description = "The private DNS name assigned to the instance. Can only be used inside the Amazon EC2, and only available if you've enabled DNS hostnames for your VPC" + value = try(module.ec2-vm.*.private_dns, "") +} + +output "public_dns" { + description = "The public DNS name assigned to the instance. For EC2-VPC, this is only available if you've enabled DNS hostnames for your VPC" + value = try(module.ec2-vm.*.public_dns, "") +} + +output "public_ip" { + description = "The public IP address assigned to the instance, if applicable. NOTE: If you are using an aws_eip with your instance, you should refer to the EIP's address directly and not use `public_ip` as this field will change after the EIP is attached" + value = try(module.ec2-vm.*.public_ip, "") +} + +output "private_ip" { + description = "The private IP address assigned to the instance." + value = try(module.ec2-vm.*.private_ip, "") +} + +output "ipv6_addresses" { + description = "The IPv6 address assigned to the instance, if applicable." 
+ value = try(module.ec2-vm.*.ipv6_addresses, []) +} + +output "tags_all" { + description = "A map of tags assigned to the resource, including those inherited from the provider default_tags configuration block" + value = try(module.ec2-vm.*.tags_all, {}) +} + +output "spot_bid_status" { + description = "The current bid status of the Spot Instance Request" + value = try(module.ec2-vm.*.spot_bid_status, "") +} + +output "spot_request_state" { + description = "The current request state of the Spot Instance Request" + value = try(module.ec2-vm.*.spot_request_state, "") +} + +output "spot_instance_id" { + description = "The Instance ID (if any) that is currently fulfilling the Spot Instance request" + value = try(module.ec2-vm.*.spot_instance_id, "") +} + +################################################################################ +# IAM Role / Instance Profile +################################################################################ + +output "iam_role_name" { + description = "The name of the IAM role" + value = try(module.ec2-vm.*.aws_iam_role.name, null) +} + +output "iam_role_arn" { + description = "The Amazon Resource Name (ARN) specifying the IAM role" + value = try(module.ec2-vm.*.aws_iam_role.arn, null) +} + +output "iam_role_unique_id" { + description = "Stable and unique string identifying the IAM role" + value = try(module.ec2-vm.*.aws_iam_role.unique_id, null) +} + +output "iam_instance_profile_arn" { + description = "ARN assigned by AWS to the instance profile" + value = try(module.ec2-vm.*.aws_iam_instance_profile.arn, null) +} + +output "iam_instance_profile_id" { + description = "Instance profile's ID" + value = try(module.ec2-vm.*.aws_iam_instance_profile.id, null) +} + +output "iam_instance_profile_unique" { + description = "Stable and unique string identifying the IAM instance profile" + value = try(module.ec2-vm.*.aws_iam_instance_profile.unique_id, null) +} \ No newline at end of file diff --git a/examples/gen-ai-gaudi-base/providers.tf 
b/examples/gen-ai-gaudi-base/providers.tf new file mode 100644 index 0000000..260a2e3 --- /dev/null +++ b/examples/gen-ai-gaudi-base/providers.tf @@ -0,0 +1,4 @@ +provider "aws" { + # Environment Variables used for Authentication + region = var.region +} \ No newline at end of file diff --git a/examples/gen-ai-gaudi-base/variables.tf b/examples/gen-ai-gaudi-base/variables.tf new file mode 100644 index 0000000..b7d84ab --- /dev/null +++ b/examples/gen-ai-gaudi-base/variables.tf @@ -0,0 +1,50 @@ +variable "region" { + description = "Target AWS region to deploy EC2 in." + type = string + default = "us-east-1" +} + +# Variable to add ingress rules to the security group. Replace the default values with the required ports and CIDR ranges. +variable "ingress_rules" { + type = list(object({ + from_port = number + to_port = number + protocol = string + cidr_blocks = string + })) + default = [ + { + from_port = 22 + to_port = 22 + protocol = "tcp" + cidr_blocks = "0.0.0.0/0" + + }, + { + from_port = 7860 + to_port = 7860 + protocol = "tcp" + cidr_blocks = "0.0.0.0/0" + + }, + { + from_port = 5000 + to_port = 5000 + protocol = "tcp" + cidr_blocks = "0.0.0.0/0" + }, + { + from_port = 5001 + to_port = 5001 + protocol = "tcp" + cidr_blocks = "0.0.0.0/0" + } + ] +} + +# Variable for how many VMs to build +variable "vm_count" { + description = "Number of VMs to build." 
+ type = number + default = 1 +} diff --git a/examples/gen-ai-gaudi-base/versions.tf b/examples/gen-ai-gaudi-base/versions.tf new file mode 100644 index 0000000..ff6e689 --- /dev/null +++ b/examples/gen-ai-gaudi-base/versions.tf @@ -0,0 +1,13 @@ +terraform { + required_version = ">=1.3.0" + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.31" + } + cloudinit = { + source = "hashicorp/cloudinit" + version = ">=2.2.0" + } + } +} \ No newline at end of file From 0f80e756e522f9416dcd78702b190f9fb907fc55 Mon Sep 17 00:00:00 2001 From: Steve Fowler Date: Tue, 7 May 2024 09:54:24 -0500 Subject: [PATCH 6/9] Updated readme --- examples/gen-ai-gaudi-base/README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/gen-ai-gaudi-base/README.md b/examples/gen-ai-gaudi-base/README.md index 6be1d3d..05ddc59 100644 --- a/examples/gen-ai-gaudi-base/README.md +++ b/examples/gen-ai-gaudi-base/README.md @@ -104,7 +104,11 @@ terraform plan terraform apply ``` -After the Terraform module successfully creates the EC2 instance, **wait ~10 minutes** for the recipe to download/install the Intel Gaudi driver and software. +After the Terraform module successfully creates the EC2 instance, **wait ~15 minutes** for the recipe to download/install the Intel Gaudi driver and software. 
After the deployment is done, you can launch the Habana Gaudi PyTorch container using the following: + +```bash +sudo docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest +``` ## Deleting the Demo From b96826d719f0953d4b6dc4c40739a5478ee486bb Mon Sep 17 00:00:00 2001 From: Steve Fowler Date: Tue, 7 May 2024 11:25:53 -0500 Subject: [PATCH 7/9] Remove older files --- examples/gen-ai-gaudi-demo/README.md | 115 ---------------------- examples/gen-ai-gaudi-demo/cloud_init.yml | 22 ----- examples/gen-ai-gaudi-demo/main.tf | 97 ------------------ examples/gen-ai-gaudi-demo/outputs.tf | 113 --------------------- examples/gen-ai-gaudi-demo/providers.tf | 4 - examples/gen-ai-gaudi-demo/recipe.yml | 101 ------------------- examples/gen-ai-gaudi-demo/variables.tf | 50 ---------- examples/gen-ai-gaudi-demo/versions.tf | 13 --- 8 files changed, 515 deletions(-) delete mode 100644 examples/gen-ai-gaudi-demo/README.md delete mode 100644 examples/gen-ai-gaudi-demo/cloud_init.yml delete mode 100644 examples/gen-ai-gaudi-demo/main.tf delete mode 100644 examples/gen-ai-gaudi-demo/outputs.tf delete mode 100644 examples/gen-ai-gaudi-demo/providers.tf delete mode 100644 examples/gen-ai-gaudi-demo/recipe.yml delete mode 100644 examples/gen-ai-gaudi-demo/variables.tf delete mode 100644 examples/gen-ai-gaudi-demo/versions.tf diff --git a/examples/gen-ai-gaudi-demo/README.md b/examples/gen-ai-gaudi-demo/README.md deleted file mode 100644 index 6a287c0..0000000 --- a/examples/gen-ai-gaudi-demo/README.md +++ /dev/null @@ -1,115 +0,0 @@ -

- Intel Logo -

- -# Intel® Optimized Cloud Modules for Terraform - -© Copyright 2024, Intel Corporation - -## AWS DL1 EC2 Instance with Intel Gaudi Accelerators - -This demo will showcase Large Language Model(LLM) HPU inference using Intel Gaudi AI Accelerators - -## Usage - -### variables.tf - -Modify the region to target a specific AWS Region - -```hcl -variable "region" { - description = "Target AWS region to deploy EC2 in." - type = string - default = "us-east-1" -} -``` - -### main.tf - -Modify settings in this file to choose your AMI as well as other details around the instance that will be created. This demo was tested on Ubuntu 22.04. - -```hcl -## Get latest Ubuntu 22.04 AMI in AWS for x86 -data "aws_ami" "ubuntu-linux-2204" { - most_recent = true - owners = ["099720109477"] # Canonical - filter { - name = "name" - values = ["ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-amd64-server-*"] - } - filter { - name = "virtualization-type" - values = ["hvm"] - } -} - -module "ec2-vm" { - source = "intel/aws-vm/intel" - key_name = aws_key_pair.TF_key.key_name - instance_type = "dl1.24xlarge" - availability_zone = "us-east-1a" - ami = data.aws_ami.ubuntu-linux-2204.id - user_data = data.cloudinit_config.ansible.rendered - - root_block_device = [{ - volume_size = "100" - }] - - tags = { - Name = "my-test-vm-${random_id.rid.dec}" - Owner = "OwnerName-${random_id.rid.dec}", - Duration = "2" - } -} -``` - -Run the Terraform Commands below to deploy the demos. 
- -```Shell -terraform init -terraform plan -terraform apply -``` - -## Running the Demo using AWS CloudShell - -Open your AWS account and click the Cloudshell prompt -At the command prompt enter in in these command prompts to install Terraform into the AWS Cloudshell - -```Shell -git clone https://github.com/tfutils/tfenv.git ~/.tfenv -mkdir ~/bin -ln -s ~/.tfenv/bin/* ~/bin/ -tfenv install 1.3.0 -tfenv use 1.3.0 -``` - -Download and run the [Gen-AI-Gaudi-Demo](https://github.com/intel/terraform-intel-aws-vm/tree/main/examples/gen-ai-demo) Terraform Module by typing this command - -```Shell -git clone https://github.com/intel/terraform-intel-aws-vm.git -``` - -Change into the `examples/gen-ai-gaudi-demo` example folder - -```Shell -cd terraform-intel-aws-vm/examples/gen-ai-gaudi-demo -``` - -Run the Terraform Commands below to deploy the demos. - -```Shell -terraform init -terraform plan -terraform apply -``` - -After the Terraform module successfully creates the EC2 instance, **wait ~15 minutes** for the recipe to download/install the Intel Gaudi driver and software. - -## Deleting the Demo - -To delete the demo, run `terraform destroy` to delete all resources created. 
- -## Considerations - -- The AWS region where this example is run should have a default VPC diff --git a/examples/gen-ai-gaudi-demo/cloud_init.yml b/examples/gen-ai-gaudi-demo/cloud_init.yml deleted file mode 100644 index aa45c4c..0000000 --- a/examples/gen-ai-gaudi-demo/cloud_init.yml +++ /dev/null @@ -1,22 +0,0 @@ -#cloud-config -package_update: true -package_upgrade: true - -runcmd: - - apt install git ansible docker.io -y - - git clone https://github.com/intel/optimized-cloud-recipes.git /opt/optimized-cloud-recipes - - cd /opt/optimized-cloud-recipes - - git checkout wsfowler-gaudi - - echo "@reboot ansible-playbook /opt/optimized-cloud-recipes/recipes/ai-gaudi-ubuntu/recipe.yml" | crontab - - - reboot - - -# Run the Ansible playbook from https://github.com/intel/optimized-cloud-recipes/tree/main/recipes/ai-gaudi-ubuntu/recipe.yml -# Path: recipe.yml -# ansible: -# install_method: distro -# package_name: ansible -# pull: -# url: "https://github.com/intel/optimized-cloud-recipes" -# playbook_name: "recipes/ai-gaudi-ubuntu/recipe.yml" -# checkout: wsfowler-gaudi diff --git a/examples/gen-ai-gaudi-demo/main.tf b/examples/gen-ai-gaudi-demo/main.tf deleted file mode 100644 index 755de29..0000000 --- a/examples/gen-ai-gaudi-demo/main.tf +++ /dev/null @@ -1,97 +0,0 @@ -# Provision EC2 Instance on Icelake on Amazon Linux OS in default vpc. It is configured to create the EC2 in -# US-East-1 region. The region is provided in variables.tf in this example folder. - -# This example also create an EC2 key pair. Associate the public key with the EC2 instance. Create the private key -# in the local system where terraform apply is done. 
Create a new scurity group to open up the SSH port -# 22 to a specific IP CIDR block - -######### PLEASE NOTE TO CHANGE THE IP CIDR BLOCK TO ALLOW SSH FROM YOUR OWN ALLOWED IP ADDRESS FOR SSH ######### - -data "cloudinit_config" "ansible" { - gzip = true - base64_encode = true - - part { - filename = "cloud_init" - content_type = "text/cloud-config" - content = templatefile( - "cloud_init.yml", - {} - ) - } -} - -data "aws_ami" "ubuntu-linux-2204" { - most_recent = true - owners = ["099720109477"] # Canonical - filter { - name = "name" - values = ["ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-amd64-server-*"] - } - filter { - name = "virtualization-type" - values = ["hvm"] - } -} - -resource "random_id" "rid" { - byte_length = 5 -} - -# RSA key of size 4096 bits -resource "tls_private_key" "rsa" { - algorithm = "RSA" - rsa_bits = 4096 -} - -resource "aws_key_pair" "TF_key" { - key_name = "TF_key-${random_id.rid.dec}" - public_key = tls_private_key.rsa.public_key_openssh -} - -resource "local_file" "TF_private_key" { - content = tls_private_key.rsa.private_key_pem - filename = "tfkey.private" -} -resource "aws_security_group" "ssh_security_group" { - description = "security group to configure ports for ssh" - name_prefix = "ssh_security_group" -} - -# Modify the `ingress_rules` variable in the variables.tf file to allow the required ports for your CIDR ranges -resource "aws_security_group_rule" "ingress_rules" { - count = length(var.ingress_rules) - type = "ingress" - security_group_id = aws_security_group.ssh_security_group.id - from_port = var.ingress_rules[count.index].from_port - to_port = var.ingress_rules[count.index].to_port - protocol = var.ingress_rules[count.index].protocol - cidr_blocks = [var.ingress_rules[count.index].cidr_blocks] -} - -resource "aws_network_interface_sg_attachment" "sg_attachment" { - count = length(module.ec2-vm) - security_group_id = aws_security_group.ssh_security_group.id - network_interface_id = 
module.ec2-vm[count.index].primary_network_interface_id -} - -# Modify the `vm_count` variable in the variables.tf file to create the required number of EC2 instances -module "ec2-vm" { - count = var.vm_count - source = "intel/aws-vm/intel" - key_name = aws_key_pair.TF_key.key_name - instance_type = "dl1.24xlarge" - availability_zone = "us-east-1c" - ami = data.aws_ami.ubuntu-linux-2204.id - user_data = data.cloudinit_config.ansible.rendered - - root_block_device = [{ - volume_size = "100" - }] - - tags = { - Name = "my-test-vm-${count.index}-${random_id.rid.dec}" - Owner = "Fowler-${random_id.rid.dec}", - Duration = "2" - } -} \ No newline at end of file diff --git a/examples/gen-ai-gaudi-demo/outputs.tf b/examples/gen-ai-gaudi-demo/outputs.tf deleted file mode 100644 index 24448ce..0000000 --- a/examples/gen-ai-gaudi-demo/outputs.tf +++ /dev/null @@ -1,113 +0,0 @@ -output "id" { - description = "The ID of the instance" - value = try(module.ec2-vm.*.id, module.ec2-vm.*.id, "") -} - -output "arn" { - description = "The ARN of the instance" - value = try(module.ec2-vm.*.arn, "") -} - -output "capacity_reservation_specification" { - description = "Capacity reservation specification of the instance" - value = try(module.ec2-vm.*.capacity_reservation_specification, "") -} - -output "instance_state" { - description = "The state of the instance. One of: `pending`, `running`, `shutting-down`, `terminated`, `stopping`, `stopped`" - value = try(module.ec2-vm.*.instance_state, "") -} - -output "outpost_arn" { - description = "The ARN of the Outpost the instance is assigned to" - value = try(module.ec2-vm.*.outpost_arn, "") -} - -output "password_data" { - description = "Base-64 encoded encrypted password data for the instance. Useful for getting the administrator password for instances running Microsoft Windows. 
This attribute is only exported if `get_password_data` is true" - value = try(module.ec2-vm.*.password_data, "") -} - -output "primary_network_interface_id" { - description = "The ID of the instance's primary network interface" - value = try(module.ec2-vm.*.primary_network_interface_id, "") -} - -output "private_dns" { - description = "The private DNS name assigned to the instance. Can only be used inside the Amazon EC2, and only available if you've enabled DNS hostnames for your VPC" - value = try(module.ec2-vm.*.private_dns, "") -} - -output "public_dns" { - description = "The public DNS name assigned to the instance. For EC2-VPC, this is only available if you've enabled DNS hostnames for your VPC" - value = try(module.ec2-vm.*.public_dns, "") -} - -output "public_ip" { - description = "The public IP address assigned to the instance, if applicable. NOTE: If you are using an aws_eip with your instance, you should refer to the EIP's address directly and not use `public_ip` as this field will change after the EIP is attached" - value = try(module.ec2-vm.*.public_ip, "") -} - -output "private_ip" { - description = "The private IP address assigned to the instance." - value = try(module.ec2-vm.*.private_ip, "") -} - -output "ipv6_addresses" { - description = "The IPv6 address assigned to the instance, if applicable." 
- value = try(module.ec2-vm.*.ipv6_addresses, []) -} - -output "tags_all" { - description = "A map of tags assigned to the resource, including those inherited from the provider default_tags configuration block" - value = try(module.ec2-vm.*.tags_all, {}) -} - -output "spot_bid_status" { - description = "The current bid status of the Spot Instance Request" - value = try(module.ec2-vm.*.spot_bid_status, "") -} - -output "spot_request_state" { - description = "The current request state of the Spot Instance Request" - value = try(module.ec2-vm.*.spot_request_state, "") -} - -output "spot_instance_id" { - description = "The Instance ID (if any) that is currently fulfilling the Spot Instance request" - value = try(module.ec2-vm.*.spot_instance_id, "") -} - -################################################################################ -# IAM Role / Instance Profile -################################################################################ - -output "iam_role_name" { - description = "The name of the IAM role" - value = try(module.ec2-vm.*.aws_iam_role.name, null) -} - -output "iam_role_arn" { - description = "The Amazon Resource Name (ARN) specifying the IAM role" - value = try(module.ec2-vm.*.aws_iam_role.arn, null) -} - -output "iam_role_unique_id" { - description = "Stable and unique string identifying the IAM role" - value = try(module.ec2-vm.*.aws_iam_role.unique_id, null) -} - -output "iam_instance_profile_arn" { - description = "ARN assigned by AWS to the instance profile" - value = try(module.ec2-vm.*.aws_iam_instance_profile.arn, null) -} - -output "iam_instance_profile_id" { - description = "Instance profile's ID" - value = try(module.ec2-vm.*.aws_iam_instance_profile.id, null) -} - -output "iam_instance_profile_unique" { - description = "Stable and unique string identifying the IAM instance profile" - value = try(module.ec2-vm.*.aws_iam_instance_profile.unique_id, null) -} \ No newline at end of file diff --git a/examples/gen-ai-gaudi-demo/providers.tf 
b/examples/gen-ai-gaudi-demo/providers.tf deleted file mode 100644 index 260a2e3..0000000 --- a/examples/gen-ai-gaudi-demo/providers.tf +++ /dev/null @@ -1,4 +0,0 @@ -provider "aws" { - # Environment Variables used for Authentication - region = var.region -} \ No newline at end of file diff --git a/examples/gen-ai-gaudi-demo/recipe.yml b/examples/gen-ai-gaudi-demo/recipe.yml deleted file mode 100644 index 743f123..0000000 --- a/examples/gen-ai-gaudi-demo/recipe.yml +++ /dev/null @@ -1,101 +0,0 @@ -########################################################## -# Host configuration # -########################################################## ---- -- name: Install pre-requisite packages - hosts: localhost - connection: local - tasks: - - name: Install pre-requisite packages - ansible.builtin.apt: - pkg: - - python3 - - python3-pip - - python-is-python3 - - net-tools - - libmkl-dev - state: present - update_cache: true - - name: Install Jupyterlab using pip - ansible.builtin.pip: - name: jupyterlab - state: present - -############################################################################################################################################# -# Setup Gaudi Frameworks # -# # -# Following the instructions here: https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#sw-stack-installation-bare # -# # -# Run the Habana setup script # -# - /tmp/habanalabs-installer.sh install -t base -y # -# - /tmp/habanalabs-installer.sh install -t dependencies -y # -# - /tmp/habanalabs-installer.sh install -t pytorch -y # -############################################################################################################################################# -- name: Install Habana base - hosts: localhost - connection: local - tasks: - - name: Download Habana installer - ansible.builtin.get_url: - url: https://vault.habana.ai/artifactory/gaudi-installer/1.15.1/habanalabs-installer.sh - dest: /tmp/habanalabs-installer.sh - mode: '0755' - - name: 
Install Habana base - ansible.builtin.shell: /tmp/habanalabs-installer.sh install -t base -y -- name: Install Habana dependencies - hosts: localhost - connection: local - tasks: - - name: Install Habana dependencies using script - ansible.builtin.shell: /tmp/habanalabs-installer.sh install -t dependencies -y - become_user: ubuntu - become: true -- name: Install Habana pytorch - hosts: localhost - connection: local - tasks: - - name: Install Habana pytorch - ansible.builtin.shell: /tmp/habanalabs-installer.sh install -t pytorch --venv -y - -# Install the Habana Container Runtime by dowloading the habana artifactory key, adding the artifactory repository to the sources list, and installing the habanalabs-container-runtime package -- name: Install Habana Container Runtime - hosts: localhost - connection: local - tasks: - - name: Download Habana artifactory key - ansible.builtin.get_url: - url: https://vault.habana.ai/artifactory/api/gpg/key/public - dest: /tmp/habana-artifactory-key - - name: Add Habana artifactory repository to sources list - ansible.builtin.copy: - content: | - deb https://vault.habana.ai/artifactory/debian jammy main - dest: /etc/apt/sources.list.d/artifactory.list - - name: Add Habana artifactory key - ansible.builtin.shell: apt-key add /tmp/habana-artifactory-key - - name: Install Habana Container Runtime - ansible.builtin.apt: - name: habanalabs-container-runtime - state: present - update_cache: true - - name: Add Habana Container Runtime to the docker daemon.json - ansible.builtin.copy: - content: | - { - "runtimes": { - "habana": { - "path": "/usr/bin/habana-container-runtime", - "runtimeArgs": [] - } - } - } - dest: /etc/docker/daemon.json - -# Install the Gaudi pytorch container from Habana Labs -- name: Install Gaudi pytorch container - hosts: localhost - connection: local - tasks: - - name: Pull the Gaudi pytorch container - ansible.builtin.shell: docker pull hdocker pull 
vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest - \ No newline at end of file diff --git a/examples/gen-ai-gaudi-demo/variables.tf b/examples/gen-ai-gaudi-demo/variables.tf deleted file mode 100644 index b7d84ab..0000000 --- a/examples/gen-ai-gaudi-demo/variables.tf +++ /dev/null @@ -1,50 +0,0 @@ -variable "region" { - description = "Target AWS region to deploy EC2 in." - type = string - default = "us-east-1" -} - -# Variable to add ingress rules to the security group. Replace the default values with the required ports and CIDR ranges. -variable "ingress_rules" { - type = list(object({ - from_port = number - to_port = number - protocol = string - cidr_blocks = string - })) - default = [ - { - from_port = 22 - to_port = 22 - protocol = "tcp" - cidr_blocks = "0.0.0.0/0" - - }, - { - from_port = 7860 - to_port = 7860 - protocol = "tcp" - cidr_blocks = "0.0.0.0/0" - - }, - { - from_port = 5000 - to_port = 5000 - protocol = "tcp" - cidr_blocks = "0.0.0.0/0" - }, - { - from_port = 5001 - to_port = 5001 - protocol = "tcp" - cidr_blocks = "0.0.0.0/0" - } - ] -} - -# Variable for how many VMs to build -variable "vm_count" { - description = "Number of VMs to build." 
- type = number - default = 1 -} diff --git a/examples/gen-ai-gaudi-demo/versions.tf b/examples/gen-ai-gaudi-demo/versions.tf deleted file mode 100644 index ff6e689..0000000 --- a/examples/gen-ai-gaudi-demo/versions.tf +++ /dev/null @@ -1,13 +0,0 @@ -terraform { - required_version = ">=1.3.0" - required_providers { - aws = { - source = "hashicorp/aws" - version = "~> 5.31" - } - cloudinit = { - source = "hashicorp/cloudinit" - version = ">=2.2.0" - } - } -} \ No newline at end of file From abbf1bb379f70e2c11e46c3801d55e79ddf92483 Mon Sep 17 00:00:00 2001 From: Steve Fowler Date: Tue, 7 May 2024 11:55:26 -0500 Subject: [PATCH 8/9] Delete examples/gen-ai-demo/main.tf --- examples/gen-ai-demo/main.tf | 97 ------------------------------------ 1 file changed, 97 deletions(-) delete mode 100644 examples/gen-ai-demo/main.tf diff --git a/examples/gen-ai-demo/main.tf b/examples/gen-ai-demo/main.tf deleted file mode 100644 index c7750ea..0000000 --- a/examples/gen-ai-demo/main.tf +++ /dev/null @@ -1,97 +0,0 @@ -# Provision EC2 Instance on Icelake on Amazon Linux OS in default vpc. It is configured to create the EC2 in -# US-East-1 region. The region is provided in variables.tf in this example folder. - -# This example also create an EC2 key pair. Associate the public key with the EC2 instance. Create the private key -# in the local system where terraform apply is done. 
Create a new scurity group to open up the SSH port -# 22 to a specific IP CIDR block - -######### PLEASE NOTE TO CHANGE THE IP CIDR BLOCK TO ALLOW SSH FROM YOUR OWN ALLOWED IP ADDRESS FOR SSH ######### - -data "cloudinit_config" "ansible" { - gzip = true - base64_encode = true - - part { - filename = "cloud_init" - content_type = "text/cloud-config" - content = templatefile( - "cloud_init.yml", - {} - ) - } -} - -data "aws_ami" "ubuntu-linux-2204" { - most_recent = true - owners = ["099720109477"] # Canonical - filter { - name = "name" - values = ["ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-amd64-server-*"] - } - filter { - name = "virtualization-type" - values = ["hvm"] - } -} - -resource "random_id" "rid" { - byte_length = 5 -} - -# RSA key of size 4096 bits -resource "tls_private_key" "rsa" { - algorithm = "RSA" - rsa_bits = 4096 -} - -resource "aws_key_pair" "TF_key" { - key_name = "TF_key-${random_id.rid.dec}" - public_key = tls_private_key.rsa.public_key_openssh -} - -resource "local_file" "TF_private_key" { - content = tls_private_key.rsa.private_key_pem - filename = "tfkey.private" -} -resource "aws_security_group" "ssh_security_group" { - description = "security group to configure ports for ssh" - name_prefix = "ssh_security_group" -} - -# Modify the `ingress_rules` variable in the variables.tf file to allow the required ports for your CIDR ranges -resource "aws_security_group_rule" "ingress_rules" { - count = length(var.ingress_rules) - type = "ingress" - security_group_id = aws_security_group.ssh_security_group.id - from_port = var.ingress_rules[count.index].from_port - to_port = var.ingress_rules[count.index].to_port - protocol = var.ingress_rules[count.index].protocol - cidr_blocks = [var.ingress_rules[count.index].cidr_blocks] -} - -resource "aws_network_interface_sg_attachment" "sg_attachment" { - count = length(module.ec2-vm) - security_group_id = aws_security_group.ssh_security_group.id - network_interface_id = 
module.ec2-vm[count.index].primary_network_interface_id -} - -# Modify the `vm_count` variable in the variables.tf file to create the required number of EC2 instances -module "ec2-vm" { - count = var.vm_count - source = "intel/aws-vm/intel" - key_name = aws_key_pair.TF_key.key_name - instance_type = "dl1.24xlarge" - availability_zone = "us-east-1c" - ami = data.aws_ami.ubuntu-linux-2204.id - user_data = data.cloudinit_config.ansible.rendered - - root_block_device = [{ - volume_size = "100" - }] - - tags = { - Name = "my-test-vm-${count.index}-${random_id.rid.dec}" - Owner = "OwnerName-${random_id.rid.dec}", - Duration = "2" - } -} \ No newline at end of file From 4a1aebbf3f710da490377bfd1b9f26205b297fa4 Mon Sep 17 00:00:00 2001 From: Steve Fowler Date: Tue, 7 May 2024 15:28:14 -0500 Subject: [PATCH 9/9] Add Habana links to readme --- examples/gen-ai-gaudi-base/README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/examples/gen-ai-gaudi-base/README.md b/examples/gen-ai-gaudi-base/README.md index 05ddc59..61d0228 100644 --- a/examples/gen-ai-gaudi-base/README.md +++ b/examples/gen-ai-gaudi-base/README.md @@ -117,3 +117,9 @@ To delete the demo, run `terraform destroy` to delete all resources created. ## Considerations - The AWS region where this example is run should have a default VPC + +## Links + +[Intel® Gaudi® AI Accelerator](https://www.intel.com/content/www/us/en/products/details/processors/ai-accelerators/gaudi-overview.html) + +[Intel® Gaudi® AI Accelerator - Developer Website](https://developer.habana.ai/)