diff --git a/examples/gen-ai-gaudi-base/README.md b/examples/gen-ai-gaudi-base/README.md new file mode 100644 index 0000000..61d0228 --- /dev/null +++ b/examples/gen-ai-gaudi-base/README.md @@ -0,0 +1,125 @@ +

+ Intel Logo +

+ +# Intel® Optimized Cloud Modules for Terraform + +© Copyright 2024, Intel Corporation + +## AWS DL1 EC2 Instance with Intel Gaudi Accelerators + +This demo will showcase Large Language Model (LLM) inference using Intel Gaudi AI Accelerators. This module will install the base software required to run other examples. + +## Usage + +### variables.tf + +Modify the region to target a specific AWS Region + +```hcl +variable "region" { + description = "Target AWS region to deploy EC2 in." + type = string + default = "us-east-1" +} +``` + +### main.tf + +Modify settings in this file to choose your AMI as well as other details around the instance that will be created. This demo was tested on Ubuntu 22.04. + +```hcl +## Get latest Ubuntu 22.04 AMI in AWS for x86 +data "aws_ami" "ubuntu-linux-2204" { + most_recent = true + owners = ["099720109477"] # Canonical + filter { + name = "name" + values = ["ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-amd64-server-*"] + } + filter { + name = "virtualization-type" + values = ["hvm"] + } +} + +module "ec2-vm" { + source = "intel/aws-vm/intel" + key_name = aws_key_pair.TF_key.key_name + instance_type = "dl1.24xlarge" + availability_zone = "us-east-1c" + ami = data.aws_ami.ubuntu-linux-2204.id + user_data = data.cloudinit_config.ansible.rendered + + root_block_device = [{ + volume_size = "100" + }] + + tags = { + Name = "my-test-vm-${random_id.rid.dec}" + Owner = "OwnerName-${random_id.rid.dec}", + Duration = "2" + } +} +``` + +Run the Terraform Commands below to deploy the demos. 
+ +```Shell +terraform init +terraform plan +terraform apply +``` + +## Running the Demo using AWS CloudShell + +Open your AWS account and click the CloudShell prompt +At the command prompt, enter these commands to install Terraform into AWS CloudShell + +```Shell +git clone https://github.com/tfutils/tfenv.git ~/.tfenv +mkdir ~/bin +ln -s ~/.tfenv/bin/* ~/bin/ +tfenv install 1.3.0 +tfenv use 1.3.0 +``` + +Download and run the [Gen-AI-Gaudi-Demo](https://github.com/intel/terraform-intel-aws-vm/tree/main/examples/gen-ai-gaudi-base) Terraform Module by typing this command + +```Shell +git clone https://github.com/intel/terraform-intel-aws-vm.git +``` + +Change into the `examples/gen-ai-gaudi-base` example folder + +```Shell +cd terraform-intel-aws-vm/examples/gen-ai-gaudi-base +``` + +Run the Terraform Commands below to deploy the demos. + +```Shell +terraform init +terraform plan +terraform apply +``` + +After the Terraform module successfully creates the EC2 instance, **wait ~15 minutes** for the recipe to download/install the Intel Gaudi driver and software. After the deployment is done, you can launch the Habana Gaudi PyTorch container using the following: + +```bash +sudo docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest +``` + +## Deleting the Demo + +To delete the demo, run `terraform destroy` to delete all resources created. 
+ +## Considerations + +- The AWS region where this example is run should have a default VPC + +## Links + +[Intel® Gaudi® AI Accelerator](https://www.intel.com/content/www/us/en/products/details/processors/ai-accelerators/gaudi-overview.html) + +[Intel® Gaudi® AI Accelerator - Developer Website](https://developer.habana.ai/) diff --git a/examples/gen-ai-gaudi-base/cloud_init.yml b/examples/gen-ai-gaudi-base/cloud_init.yml new file mode 100644 index 0000000..033df45 --- /dev/null +++ b/examples/gen-ai-gaudi-base/cloud_init.yml @@ -0,0 +1,9 @@ +#cloud-config +package_update: true +package_upgrade: true + +runcmd: + - apt install git ansible docker.io -y + - git clone https://github.com/intel/optimized-cloud-recipes.git /opt/optimized-cloud-recipes + - echo "@reboot ansible-playbook /opt/optimized-cloud-recipes/recipes/ai-gaudi-ubuntu/recipe.yml" | crontab - + - reboot diff --git a/examples/gen-ai-demo/main.tf b/examples/gen-ai-gaudi-base/main.tf similarity index 95% rename from examples/gen-ai-demo/main.tf rename to examples/gen-ai-gaudi-base/main.tf index 70a22c2..885964e 100644 --- a/examples/gen-ai-demo/main.tf +++ b/examples/gen-ai-gaudi-base/main.tf @@ -1,4 +1,4 @@ -# Provision EC2 Instance on Icelake on Amazon Linux OS in default vpc. It is configured to create the EC2 in +# Provision EC2 DL1 Instance on Ubuntu Linux OS in default vpc. It is configured to create the EC2 in # US-East-1 region. The region is provided in variables.tf in this example folder. # This example also create an EC2 key pair. Associate the public key with the EC2 instance. 
Create the private key @@ -80,7 +80,7 @@ module "ec2-vm" { count = var.vm_count source = "intel/aws-vm/intel" key_name = aws_key_pair.TF_key.key_name - instance_type = "m7i.4xlarge" + instance_type = "dl1.24xlarge" availability_zone = "us-east-1c" ami = data.aws_ami.ubuntu-linux-2204.id user_data = data.cloudinit_config.ansible.rendered diff --git a/examples/gen-ai-gaudi-base/outputs.tf b/examples/gen-ai-gaudi-base/outputs.tf new file mode 100644 index 0000000..24448ce --- /dev/null +++ b/examples/gen-ai-gaudi-base/outputs.tf @@ -0,0 +1,113 @@ +output "id" { + description = "The ID of the instance" + value = try(module.ec2-vm.*.id, "") +} + +output "arn" { + description = "The ARN of the instance" + value = try(module.ec2-vm.*.arn, "") +} + +output "capacity_reservation_specification" { + description = "Capacity reservation specification of the instance" + value = try(module.ec2-vm.*.capacity_reservation_specification, "") +} + +output "instance_state" { + description = "The state of the instance. One of: `pending`, `running`, `shutting-down`, `terminated`, `stopping`, `stopped`" + value = try(module.ec2-vm.*.instance_state, "") +} + +output "outpost_arn" { + description = "The ARN of the Outpost the instance is assigned to" + value = try(module.ec2-vm.*.outpost_arn, "") +} + +output "password_data" { + description = "Base-64 encoded encrypted password data for the instance. Useful for getting the administrator password for instances running Microsoft Windows. This attribute is only exported if `get_password_data` is true" + value = try(module.ec2-vm.*.password_data, "") +} + +output "primary_network_interface_id" { + description = "The ID of the instance's primary network interface" + value = try(module.ec2-vm.*.primary_network_interface_id, "") +} + +output "private_dns" { + description = "The private DNS name assigned to the instance. 
Can only be used inside the Amazon EC2, and only available if you've enabled DNS hostnames for your VPC" + value = try(module.ec2-vm.*.private_dns, "") +} + +output "public_dns" { + description = "The public DNS name assigned to the instance. For EC2-VPC, this is only available if you've enabled DNS hostnames for your VPC" + value = try(module.ec2-vm.*.public_dns, "") +} + +output "public_ip" { + description = "The public IP address assigned to the instance, if applicable. NOTE: If you are using an aws_eip with your instance, you should refer to the EIP's address directly and not use `public_ip` as this field will change after the EIP is attached" + value = try(module.ec2-vm.*.public_ip, "") +} + +output "private_ip" { + description = "The private IP address assigned to the instance." + value = try(module.ec2-vm.*.private_ip, "") +} + +output "ipv6_addresses" { + description = "The IPv6 address assigned to the instance, if applicable." + value = try(module.ec2-vm.*.ipv6_addresses, []) +} + +output "tags_all" { + description = "A map of tags assigned to the resource, including those inherited from the provider default_tags configuration block" + value = try(module.ec2-vm.*.tags_all, {}) +} + +output "spot_bid_status" { + description = "The current bid status of the Spot Instance Request" + value = try(module.ec2-vm.*.spot_bid_status, "") +} + +output "spot_request_state" { + description = "The current request state of the Spot Instance Request" + value = try(module.ec2-vm.*.spot_request_state, "") +} + +output "spot_instance_id" { + description = "The Instance ID (if any) that is currently fulfilling the Spot Instance request" + value = try(module.ec2-vm.*.spot_instance_id, "") +} + +################################################################################ +# IAM Role / Instance Profile +################################################################################ + +output "iam_role_name" { + description = "The name of the IAM role" + value = 
try(module.ec2-vm.*.aws_iam_role.name, null) +} + +output "iam_role_arn" { + description = "The Amazon Resource Name (ARN) specifying the IAM role" + value = try(module.ec2-vm.*.aws_iam_role.arn, null) +} + +output "iam_role_unique_id" { + description = "Stable and unique string identifying the IAM role" + value = try(module.ec2-vm.*.aws_iam_role.unique_id, null) +} + +output "iam_instance_profile_arn" { + description = "ARN assigned by AWS to the instance profile" + value = try(module.ec2-vm.*.aws_iam_instance_profile.arn, null) +} + +output "iam_instance_profile_id" { + description = "Instance profile's ID" + value = try(module.ec2-vm.*.aws_iam_instance_profile.id, null) +} + +output "iam_instance_profile_unique" { + description = "Stable and unique string identifying the IAM instance profile" + value = try(module.ec2-vm.*.aws_iam_instance_profile.unique_id, null) +} \ No newline at end of file diff --git a/examples/gen-ai-gaudi-base/providers.tf b/examples/gen-ai-gaudi-base/providers.tf new file mode 100644 index 0000000..260a2e3 --- /dev/null +++ b/examples/gen-ai-gaudi-base/providers.tf @@ -0,0 +1,4 @@ +provider "aws" { + # Environment Variables used for Authentication + region = var.region +} \ No newline at end of file diff --git a/examples/gen-ai-gaudi-base/variables.tf b/examples/gen-ai-gaudi-base/variables.tf new file mode 100644 index 0000000..b7d84ab --- /dev/null +++ b/examples/gen-ai-gaudi-base/variables.tf @@ -0,0 +1,50 @@ +variable "region" { + description = "Target AWS region to deploy EC2 in." + type = string + default = "us-east-1" +} + +# Variable to add ingress rules to the security group. Replace the default values with the required ports and CIDR ranges. 
+variable "ingress_rules" { + type = list(object({ + from_port = number + to_port = number + protocol = string + cidr_blocks = string + })) + default = [ + { + from_port = 22 + to_port = 22 + protocol = "tcp" + cidr_blocks = "0.0.0.0/0" + + }, + { + from_port = 7860 + to_port = 7860 + protocol = "tcp" + cidr_blocks = "0.0.0.0/0" + + }, + { + from_port = 5000 + to_port = 5000 + protocol = "tcp" + cidr_blocks = "0.0.0.0/0" + }, + { + from_port = 5001 + to_port = 5001 + protocol = "tcp" + cidr_blocks = "0.0.0.0/0" + } + ] +} + +# Variable for how many VMs to build +variable "vm_count" { + description = "Number of VMs to build." + type = number + default = 1 +} diff --git a/examples/gen-ai-gaudi-base/versions.tf b/examples/gen-ai-gaudi-base/versions.tf new file mode 100644 index 0000000..ff6e689 --- /dev/null +++ b/examples/gen-ai-gaudi-base/versions.tf @@ -0,0 +1,13 @@ +terraform { + required_version = ">=1.3.0" + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.31" + } + cloudinit = { + source = "hashicorp/cloudinit" + version = ">=2.2.0" + } + } +} \ No newline at end of file