From 5190054502be940c3aaee00b492415d2a684b3a5 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Sat, 18 Jan 2025 11:55:28 -0300 Subject: [PATCH 1/5] different users in manager and worker nodes in MW clusters --- CCconfig.toml | 12 +- Project.toml | 6 +- docs/src/index.md | 2 +- src/CloudClusters.jl | 3 +- src/cluster_providers/ec2/ec2_backend.jl | 141 ++++++++++++++++---- src/cluster_providers/ec2/ec2_deploy.jl | 17 ++- src/cluster_providers/ec2/ec2_persist.jl | 8 +- src/cluster_providers/local/local_deploy.jl | 2 +- src/deploy.jl | 5 +- src/utils.jl | 2 +- 10 files changed, 149 insertions(+), 49 deletions(-) diff --git a/CCconfig.toml b/CCconfig.toml index 75a2055..a851d02 100644 --- a/CCconfig.toml +++ b/CCconfig.toml @@ -18,10 +18,18 @@ mpiflags = "" [ec2] -imageid = "ami-09121cfdb459a0804" # found at us-east-1 (North Virginia). To use in other regions, copy it. +imageid = "ami-0b869698add04fbdc" # found at us-east-1 (North Virginia). To use in other regions, copy it. # placement_group = "pg-XXXXXXXXXXXX" or "automatic" # security_group_id = "sg-XXXXXXXXXXXX" or "automatic" # subnet_id = "subnet-XXXXXXXXXXXX" -[gcp] \ No newline at end of file +[gcp] + +imageid = "hpc-shelf-311900/global/images/cloudclusters-basic-v3" +zone = "us-central1-a" +project = "hpc-shelf-311900" +user = "heron" +exename = "/home/heron/.juliaup/bin/julia" +directory = "/home/heron" +mpiflags = "--map-by node --hostfile /home/heron/hostfile" diff --git a/Project.toml b/Project.toml index 6a7bdde..a93aa64 100644 --- a/Project.toml +++ b/Project.toml @@ -10,6 +10,7 @@ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" FilePathsBase = "48062228-2e41-5def-b9a4-89aafe57970f" +GoogleCloud = "55e21f81-8b0a-565e-b5ad-6816892a5ee7" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" MPIClusterManagers = "e7922434-ae4b-11e9-05c5-9780451d2c66" PlatformAware = "e7c50b67-2c03-471e-9cf2-69e515d86ecf" @@ -26,8 +27,9 @@ YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6" AWS = "1" Base64 = "1.10.4" Distributed = "1.10.4" -FilePathsBase = "0.9.21" Downloads = "1.6.0" +FilePathsBase = "0.9.21" +GoogleCloud = "0.11.0" JSON = "0.21" MPIClusterManagers = "0.2.4" PlatformAware = "0.6.1" @@ -36,6 +38,6 @@ Reexport = "1" Serialization = "1.10.4" Sockets = "1.10.4" TOML = "1.0.3" -Test = "1.11.0" +Test = "1.11.0" YAML = "0.4" julia = "1" diff --git a/docs/src/index.md b/docs/src/index.md index d54814d..f209d98 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -423,7 +423,7 @@ The parallel code sums the ranks of the processes using the _Reduce_ collective A ___Manager-Workers___ cluster comprises an _access node_ and a homogenous set of _compute nodes_. The compute nodes are only accessible from the access node. The instance type of the access node may be different from the instance type of the compute nodes. -In a ___Manager-Workers___ cluster, the master process, running in the REPL or main program, is called the _driver process_. It is responsible for launching the so-called _entry process_ in the cluster's access node. In turn, the entry process launches _worker processes_ across the compute nodes, using _MPIClusterManagers.jl_. The worker processes perform the computation, while the entry process is responsible for communication between the driver and the worker processes. A global MPI communicator exists between worker processes, like in ___Peer-Workers-MPI___ clusters. 
+In a ___Manager-Workers___ cluster, the manager process, running in the REPL or main program, is called the _driver process_. It is responsible for launching the so-called _entry process_ in the cluster's access node. In turn, the entry process launches _worker processes_ across the compute nodes, using _MPIClusterManagers.jl_. The worker processes perform the computation, while the entry process is responsible for communication between the driver and the worker processes. A global MPI communicator exists between worker processes, like in ___Peer-Workers-MPI___ clusters. A ___Manager-Workers___ cluster is useful when compute nodes are not directly accessible from the external network. This is a common situation in on-premises clusters. However, this is also possible in clusters built from the services of cluster providers specifically tailored to HPC applications. diff --git a/src/CloudClusters.jl b/src/CloudClusters.jl index e429839..b663abe 100644 --- a/src/CloudClusters.jl +++ b/src/CloudClusters.jl @@ -22,7 +22,8 @@ include("cluster_providers/ec2/ec2_persist.jl") include("cluster_providers/ec2/ec2_resolve.jl") include("cluster_providers/ec2/ec2_deploy.jl") include("cluster_providers/gcp/gcp_configs.jl") -#include("cluster_providers/gcp/gcp_backend.jl") +include("cluster_providers/gcp/gcp_backend.jl") +include("cluster_providers/gcp/gcp_persist.jl") include("cluster_providers/gcp/gcp_resolve.jl") include("cluster_providers/gcp/gcp_deploy.jl") include("cluster_providers/local/local_configs.jl") diff --git a/src/cluster_providers/ec2/ec2_backend.jl b/src/cluster_providers/ec2/ec2_backend.jl index 2f3bb62..9cc8b40 100644 --- a/src/cluster_providers/ec2/ec2_backend.jl +++ b/src/cluster_providers/ec2/ec2_backend.jl @@ -27,6 +27,8 @@ mutable struct EC2ManagerWorkers <: ManagerWorkers #Cluster count::Int image_id_manager::String image_id_worker::String + user_manager::String + user_worker::String subnet_id::Union{String, Nothing} placement_group::Union{String, Nothing} auto_pg::Bool @@ -44,6 +46,7 @@ mutable struct EC2PeerWorkers <: PeerWorkers # Cluster instance_type::String count::Int image_id::String + user::String subnet_id::Union{String, Nothing} placement_group::Union{String, Nothing} auto_pg::Bool @@ -60,6 +63,7 @@ mutable struct EC2PeerWorkersMPI <: PeerWorkersMPI # Cluster instance_type::String count::Int image_id::String + user::String subnet_id::Union{String, Nothing} placement_group::Union{String, Nothing} auto_pg::Bool @@ -169,6 +173,50 @@ Criação de Instâncias =# # Funções auxiliares. +# Funções auxiliares. +function ec2_set_up_ssh_connection(cluster_name, comment) + + internal_key_name = cluster_name + + ssh_path = joinpath(homedir(), ".ssh") + + !isdir(ssh_path) && mkdir(ssh_path) + + keypath = joinpath(ssh_path, "$internal_key_name.key") + pubpath = joinpath(ssh_path, "$internal_key_name.key.pub") + + # Criar chave interna pública e privada do SSH. + # chars = ['a':'z'; 'A':'Z'; '0':'9'] + # random_suffix = join(chars[Random.rand(1:length(chars), 5)]) + run(`ssh-keygen -t rsa -b 2048 -f $keypath -C $comment -N ""`) + run(`chmod 400 $keypath`) + private_key = base64encode(read(keypath, String)) + public_key = base64encode(read(pubpath, String)) + + private_key, public_key + end + + function ec2_get_user_data(cluster_name, user, private_key, public_key) + + # Define o script que irá instalar a chave pública e privada no headnode e workers. 
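+    # (Descriptive note: this script runs as root through EC2 instance user data at
+    # first boot. It installs the cluster-internal key pair under the node user's
+    # ~/.ssh, authorizes it for node-to-node logins, raises the sshd keep-alive
+    # limits, and restarts the SSH service.)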
+ user_data = "#!/bin/bash + echo $private_key | base64 -d > /home/$user/.ssh/$cluster_name + echo $public_key | base64 -d > /home/$user/.ssh/$cluster_name.pub + echo 'Host * + IdentityFile /home/$user/.ssh/$cluster_name + StrictHostKeyChecking no' > /home/$user/.ssh/config + cat /home/$user/.ssh/$cluster_name.pub >> /home/$user/.ssh/authorized_keys + chown -R $user:$user /home/$user/.ssh + chmod 600 /home/$user/.ssh/* + sed -i 's/#ClientAliveInterval 0/ClientAliveInterval 1000/g' /etc/ssh/sshd_config + sed -i 's/#ClientAliveCountMax 3/ClientAliveCountMax 100/g' /etc/ssh/sshd_config + systemctl restart ssh + " + + return user_data + end + +#= function ec2_set_up_ssh_connection(cluster_name) internal_key_name = cluster_name @@ -203,8 +251,12 @@ systemctl restart ssh " [internal_key_name, user_data] end +=# function ec2_create_params(cluster::ManagerWorkers, user_data_base64) + + user_data_manager_base64, user_data_worker_base64 = user_data_base64 + params_manager = Dict( "InstanceType" => cluster.instance_type_manager, "ImageId" => cluster.image_id_manager, @@ -214,7 +266,7 @@ function ec2_create_params(cluster::ManagerWorkers, user_data_base64) "Tag" => [Dict("Key" => "cluster", "Value" => cluster.name), Dict("Key" => "Name", "Value" => "manager") ] ), - "UserData" => user_data_base64, + "UserData" => user_data_manager_base64, ) params_workers = Dict( @@ -226,7 +278,7 @@ function ec2_create_params(cluster::ManagerWorkers, user_data_base64) "Tag" => [Dict("Key" => "cluster", "Value" => cluster.name), Dict("Key" => "Name", "Value" => "worker") ] ), - "UserData" => user_data_base64, + "UserData" => user_data_worker_base64, ) if !isnothing(cluster.subnet_id) @@ -283,9 +335,11 @@ function ec2_remove_temp_files(internal_key_name) rm(pubpath) end - +function ec2_set_hostfile(cluster_nodes, internal_key_name, user) + ec2_set_hostfile(cluster_nodes, internal_key_name, user, user) +end -function ec2_set_hostfile(cluster_nodes, internal_key_name) +function ec2_set_hostfile(cluster_nodes, internal_key_name, user_manager, user_worker) # Testando se a conexão SSH está ativa. for instance in keys(cluster_nodes) public_ip = Ec2.describe_instances(Dict("InstanceId" => cluster_nodes[instance]))["reservationSet"]["item"]["instancesSet"]["item"]["ipAddress"] @@ -316,16 +370,17 @@ function ec2_set_hostfile(cluster_nodes, internal_key_name) # Atualiza o hostname e o hostfile. for instance in keys(cluster_nodes) + user = instance == :manager ? 
user_manager : user_worker public_ip = Ec2.describe_instances(Dict("InstanceId" => cluster_nodes[instance]))["reservationSet"]["item"]["instancesSet"]["item"]["ipAddress"] # private_ip = Ec2.describe_instances(Dict("InstanceId" => cluster_nodes[instance]))["reservationSet"]["item"]["instancesSet"]["item"]["privateIpAddress"] - try_run(`ssh -i $keypath -o StrictHostKeyChecking=no ubuntu@$public_ip "sudo hostnamectl set-hostname $instance"`) - try_run(`ssh -i $keypath -o StrictHostKeyChecking=no ubuntu@$public_ip "echo '$hostfilefile_content' > /home/ubuntu/hostfile"`) -# try_run(`ssh -i $keypath -o StrictHostKeyChecking=no ubuntu@$public_ip "awk '{ print \$2 \" \" \$1 }' hostfile >> hosts.tmp"`) - try_run(`ssh -i $keypath -o StrictHostKeyChecking=no ubuntu@$public_ip "echo '$hostfile_content' >> hosts.tmp"`) - try_run(`ssh -i $keypath -o StrictHostKeyChecking=no ubuntu@$public_ip "sudo chown ubuntu:ubuntu /etc/hosts"`) - try_run(`ssh -i $keypath -o StrictHostKeyChecking=no ubuntu@$public_ip "cat hosts.tmp > /etc/hosts"`) - try_run(`ssh -i $keypath -o StrictHostKeyChecking=no ubuntu@$public_ip "sudo chown root:root /etc/hosts"`) - try_run(`ssh -i $keypath -o StrictHostKeyChecking=no ubuntu@$public_ip "rm hosts.tmp"`) + try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user@$public_ip "sudo hostnamectl set-hostname $instance"`) + try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user@$public_ip "echo '$hostfilefile_content' > /home/$user/hostfile"`) +# try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user@$public_ip "awk '{ print \$2 \" \" \$1 }' hostfile >> hosts.tmp"`) + try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user@$public_ip "echo '$hostfile_content' >> hosts.tmp"`) + try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user@$public_ip "sudo chown $user:$user /etc/hosts"`) + try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user@$public_ip "cat hosts.tmp > /etc/hosts"`) + try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user@$public_ip "sudo chown root:root /etc/hosts"`) + try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user@$public_ip "rm hosts.tmp"`) end #wait(h) @@ -342,22 +397,37 @@ function ec2_create_instances(cluster::ManagerWorkers) cluster_nodes = Dict() # Configurando a conexão SSH. 
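+    # (A single internal key pair is generated once and embedded in a separate
+    # user-data script per login user, so the manager and the workers can reach
+    # each other over SSH even when their login users differ.)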
-    internal_key_name, user_data = ec2_set_up_ssh_connection(cluster.name)
+
+    private_key, public_key = ec2_set_up_ssh_connection(cluster.name, cluster.user_manager)
+
+    user_data_manager = ec2_get_user_data(cluster.name, cluster.user_manager, private_key, public_key)
+    user_data_worker = ec2_get_user_data(cluster.name, cluster.user_worker, private_key, public_key)
+
+    internal_key_name = cluster.name
+
+    # The login user names are interpolated by the NFS user-data strings below.
+    user_manager = cluster.user_manager
+    user_worker = cluster.user_worker

     # Configuração do NFS
     if cluster.shared_fs
         file_system_ip = cluster.environment.file_system_ip
-        nfs_user_data = "apt-get -y install nfs-common
-mkdir /home/ubuntu/shared/
-mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 $file_system_ip:/ /home/ubuntu/shared/
-chown -R ubuntu:ubuntu /home/ubuntu/shared
+        nfs_user_data_manager = "apt-get -y install nfs-common
+mkdir /home/$user_manager/shared/
+mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 $file_system_ip:/ /home/$user_manager/shared/
+chown -R $user_manager:$user_manager /home/$user_manager/shared
 "
+        nfs_user_data_worker = "apt-get -y install nfs-common
+mkdir /home/$user_worker/shared/
+mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 $file_system_ip:/ /home/$user_worker/shared/
+chown -R $user_worker:$user_worker /home/$user_worker/shared
+"
+        user_data_manager *= nfs_user_data_manager
+        user_data_worker *= nfs_user_data_worker
     end
-    user_data_base64 = base64encode(user_data)
+
+    user_data_manager_base64 = base64encode(user_data_manager)
+    user_data_worker_base64 = base64encode(user_data_worker)

     # Criando as instâncias
-    params_manager, params_workers = ec2_create_params(cluster, user_data_base64)
+    params_manager, params_workers = ec2_create_params(cluster, (user_data_manager_base64, user_data_worker_base64))
+
     # Criar o headnode
     instance_headnode = run_instances(1, 1, params_manager)
     cluster_nodes[:manager] = instance_headnode["instancesSet"]["item"]["instanceId"]
@@ -382,7 +452,7 @@ chown -R ubuntu:ubuntu /home/ubuntu/shared
     ec2_await_status(cluster_nodes, "running")
     ec2_await_check(cluster_nodes, "ok")
-    ec2_set_hostfile(cluster_nodes, internal_key_name)
+    ec2_set_hostfile(cluster_nodes, internal_key_name, cluster.user_manager, cluster.user_worker)

     #ec2_remove_temp_files(internal_key_name)

@@ -393,15 +463,18 @@ function ec2_create_instances(cluster::PeerWorkers)
     cluster_nodes = Dict()

     # Configurando a conexão SSH.
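+    # (Peer Workers share a single login user, so one user-data script serves
+    # every node.)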
-    internal_key_name, user_data = ec2_set_up_ssh_connection(cluster.name)
+    private_key, public_key = ec2_set_up_ssh_connection(cluster.name, cluster.user)
+    user_data = ec2_get_user_data(cluster.name, cluster.user, private_key, public_key)
+
+    internal_key_name = cluster.name
+
+    # The login user name is interpolated by the NFS user-data string below.
+    user = cluster.user

     # Configuração do NFS
     if cluster.shared_fs
         file_system_ip = cluster.environment.file_system_ip
         nfs_user_data = "apt-get -y install nfs-common
-mkdir /home/ubuntu/shared/
-mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 $file_system_ip:/ /home/ubuntu/shared/
-chown -R ubuntu:ubuntu /home/ubuntu/shared
+mkdir /home/$user/shared/
+mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 $file_system_ip:/ /home/$user/shared/
+chown -R $user:$user /home/$user/shared
 "
         user_data *= nfs_user_data
     end
@@ -427,7 +500,7 @@ chown -R ubuntu:ubuntu /home/ubuntu/shared
     ec2_await_status(cluster_nodes, "running")
     ec2_await_check(cluster_nodes, "ok")
-    ec2_set_hostfile(cluster_nodes, internal_key_name)
+    ec2_set_hostfile(cluster_nodes, internal_key_name, cluster.user)

     # ec2_remove_temp_files(internal_key_name)

@@ -581,7 +654,17 @@ ec2_can_resume(cluster::Cluster) = ec2_cluster_status(cluster, ["stopped"])
 # All instances must be in "interrupted" or "running" state.
 # If some instance is not in "interrupted" or "running" state, raise an exception.
 # PUBLIC
-function ec2_resume_cluster(cluster::Cluster)
+
+function ec2_resume_cluster(cluster::PeerWorkers)
+    ec2_resume_cluster(cluster, cluster.user, cluster.user)
+end
+
+function ec2_resume_cluster(cluster::ManagerWorkers)
+    ec2_resume_cluster(cluster, cluster.user_manager, cluster.user_worker)
+end
+
+function ec2_resume_cluster(cluster::Cluster, user_manager, user_worker)
+
+    home = ENV["HOME"]
     ssh_path = joinpath(homedir(), ".ssh")
     keypath = joinpath(ssh_path, "$(cluster.name).key")
@@ -589,8 +672,10 @@
     ec2_await_status(cluster.cluster_nodes, "running")
     ec2_await_check(cluster.cluster_nodes, "ok")
     for instance in keys(cluster.cluster_nodes)
+        user = instance == :manager ? user_manager : user_worker
         public_ip = Ec2.describe_instances(Dict("InstanceId" => cluster.cluster_nodes[instance]))["reservationSet"]["item"]["instancesSet"]["item"]["ipAddress"]
-        try_run(`ssh -i $keypath -o StrictHostKeyChecking=no ubuntu@$public_ip uptime`)
+        run(`ssh-keygen -f $home/.ssh/known_hosts -R $public_ip`)
+        try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user@$public_ip uptime`)
     end
 end
diff --git a/src/cluster_providers/ec2/ec2_deploy.jl b/src/cluster_providers/ec2/ec2_deploy.jl
index 476c107..6a98353 100644
--- a/src/cluster_providers/ec2/ec2_deploy.jl
+++ b/src/cluster_providers/ec2/ec2_deploy.jl
@@ -17,6 +17,7 @@ function deploy_cluster(_::Type{AmazonEC2},
     count = get(cluster_features, :node_count, 1)
     imageid_manager, imageid_worker = extract_mwfeature(cluster_features, AmazonEC2, :imageid)
+    user_manager, user_worker = extract_mwfeature(cluster_features, AmazonEC2, :user)
     subnet_id = get(cluster_features, :subnet_id, get(defaults_dict[AmazonEC2], :subnet_id, nothing))
     placement_group = get(cluster_features, :placement_group, get(defaults_dict[AmazonEC2], :placement_group, nothing))
@@ -26,7 +27,7 @@ function deploy_cluster(_::Type{AmazonEC2},
     auto_sg, security_group_id = security_group_id == "automatic" ?
(true, ec2_create_security_group(string("sgroup_", cluster_handle), "")) : (false, security_group_id) cluster = EC2ManagerWorkers(string(cluster_handle), instance_type_manager, instance_type_worker, count, - imageid_manager, imageid_worker, + imageid_manager, imageid_worker, user_manager, user_worker, subnet_id, placement_group, auto_pg, security_group_id, auto_sg, nothing, nothing, false, cluster_features) @@ -53,6 +54,8 @@ function deploy_cluster(_::Type{AmazonEC2}, count = get(cluster_features, :node_count, 1) imageid = get(cluster_features, :imageid, defaults_dict[AmazonEC2][:imageid]) + user = get(cluster_features, :user, defaults_dict[GoogleCloud][:user]) + subnet_id = get(cluster_features, :subnet_id, get(defaults_dict[AmazonEC2], :subnet_id, nothing)) placement_group = get(cluster_features, :placement_group, get(defaults_dict[AmazonEC2], :placement_group, nothing)) security_group_id = get(cluster_features, :security_group_id, get(defaults_dict[AmazonEC2], :security_group_id, nothing)) @@ -60,7 +63,7 @@ function deploy_cluster(_::Type{AmazonEC2}, auto_pg, placement_group = placement_group == "automatic" ? (true, ec2_create_placement_group(string("pgroup_", cluster_handle))) : (false, placement_group) auto_sg, security_group_id = security_group_id == "automatic" ? (true, ec2_create_security_group(string("sgroup_", cluster_handle), "")) : (false, security_group_id) - cluster = ec2_build_clusterobj(cluster_type, string(cluster_handle), instance_type, count, imageid, + cluster = ec2_build_clusterobj(cluster_type, string(cluster_handle), instance_type, count, imageid, user, subnet_id, placement_group, auto_pg, security_group_id, auto_sg, cluster_features) ec2_create_cluster(cluster) @@ -72,15 +75,15 @@ function deploy_cluster(_::Type{AmazonEC2}, return cluster end -ec2_build_clusterobj(_::Type{<:PeerWorkers}, cluster_handle, instance_type, count, imageid, subnet_id, +ec2_build_clusterobj(_::Type{<:PeerWorkers}, cluster_handle, instance_type, count, imageid, subnet_id, user, placement_group, auto_pg, security_group_id, auto_sg, cluster_features) = - EC2PeerWorkers(cluster_handle, instance_type, count, imageid, + EC2PeerWorkers(cluster_handle, instance_type, count, imageid, user, subnet_id, placement_group, auto_pg, security_group_id, auto_sg, nothing, nothing, false, cluster_features) -ec2_build_clusterobj(_::Type{<:PeerWorkersMPI}, cluster_handle, instance_type, count, imageid, subnet_id, +ec2_build_clusterobj(_::Type{<:PeerWorkersMPI}, cluster_handle, instance_type, count, imageid, user, subnet_id, placement_group, auto_pg, security_group_id, auto_sg, cluster_features) = - EC2PeerWorkersMPI(cluster_handle, instance_type, count, imageid, + EC2PeerWorkersMPI(cluster_handle, instance_type, count, imageid, user, subnet_id, placement_group, auto_pg, security_group_id, auto_sg, nothing, nothing, false, cluster_features) @@ -123,7 +126,7 @@ function terminate_cluster(_::Type{AmazonEC2}, cluster_handle) ec2_terminate_cluster(cluster) ec2_delete_cluster(cluster_handle) delete!(ec2_cluster_info, cluster_handle) - nothing + return end #==== RESTART CLUSTER ====# diff --git a/src/cluster_providers/ec2/ec2_persist.jl b/src/cluster_providers/ec2/ec2_persist.jl index 0315053..817a89b 100644 --- a/src/cluster_providers/ec2/ec2_persist.jl +++ b/src/cluster_providers/ec2/ec2_persist.jl @@ -91,7 +91,7 @@ function cluster_load(_::Type{AmazonEC2}, _::Type{<:ManagerWorkers}, cluster_han for (node_name, instance_id) in _cluster_nodes cluster_nodes[Symbol(node_name)] = instance_id end - cluster_features = 
contents["cluster_features"] |> adjusttypefeatures + cluster_features = contents["cluster_features"] |> ec2_adjusttypefeatures shared_fs = contents["shared_fs"] cluster = EC2ManagerWorkers(string(cluster_handle), instance_type_manager, instance_type_worker, count, @@ -109,12 +109,12 @@ function cluster_load(_::Type{AmazonEC2}, _::Type{<:ManagerWorkers}, cluster_han end -function adjusttypefeatures(_cluster_features) +function ec2_adjusttypefeatures(_cluster_features) cluster_features = Dict() for (id, vl0) in _cluster_features idsym = Symbol(id) vl1 = idsym in [:cluster_type, :node_machinetype, :provider, :node_provider] ? fetchtype(vl0) : vl0 - vl2 = idsym in [:worker_features, :manager_features] ? adjusttypefeatures(vl1) : vl1 + vl2 = idsym in [:worker_features, :manager_features] ? ec2_adjusttypefeatures(vl1) : vl1 cluster_features[idsym] = vl2 end return cluster_features @@ -137,7 +137,7 @@ function cluster_load(_::Type{AmazonEC2}, _::Type{<:PeerWorkers}, cluster_handle for (node_name, instance_id) in _cluster_nodes cluster_nodes[Symbol(node_name)] = instance_id end - cluster_features = contents["cluster_features"] |> adjusttypefeatures + cluster_features = contents["cluster_features"] |> ec2_adjusttypefeatures shared_fs = contents["shared_fs"] cluster = EC2PeerWorkers(string(cluster_handle), instance_type, count, diff --git a/src/cluster_providers/local/local_deploy.jl b/src/cluster_providers/local/local_deploy.jl index f9a8825..08cffdd 100644 --- a/src/cluster_providers/local/local_deploy.jl +++ b/src/cluster_providers/local/local_deploy.jl @@ -79,7 +79,7 @@ function resume_cluster(type::Type{Localhost}, cluster_handle) end function can_resume(_::Type{Localhost}, cluster_handle) - @assert !haskey(ec2_cluster_info, cluster_handle) + @assert !haskey(local_cluster_info, cluster_handle) @warn "local clusters cannot be interrupted/resumed" false end diff --git a/src/deploy.jl b/src/deploy.jl index e3e75a0..0556cba 100644 --- a/src/deploy.jl +++ b/src/deploy.jl @@ -113,9 +113,10 @@ end function launch_processes_ssh(cluster_features, _::Type{<:ManagerWorkers}, ips) - user = get_user(cluster_features) cluster_provider = cluster_features[:node_provider] + user = get_user(cluster_features) + @info ips ip_manager = ips[:manager] manager_features = Dict(get(cluster_features, :manager_features, cluster_features)) @@ -498,7 +499,7 @@ function check_cluster_handle(cluster_handle; reconnecting = false) !reconnecting && !haskey(cluster_deploy_info, cluster_handle) && error("cluster $cluster_handle not found") end -get_user(cluster_features) = get(cluster_features, :user, defaults_dict[Provider][:user]) +get_user(cluster_features) = get(cluster_features, :user, defaults_dict[cluster_features[:node_provider]][:user]) function cluster_reconnect(cluster_handle::Symbol) diff --git a/src/utils.jl b/src/utils.jl index 10dab6a..201b799 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -20,7 +20,7 @@ function try_run(command) successfull = true catch @error "failed: $command - trying again" - sleep(0.5) + sleep(2) end end From 86fc175eb943dc6370dc6fd3f96c043acffed0af Mon Sep 17 00:00:00 2001 From: "Mykael A." 
Date: Sat, 18 Jan 2025 12:18:49 -0300 Subject: [PATCH 2/5] add gcp support --- src/cluster_providers/gcp/gcp_backend.jl | 646 ++++++++++++----------- src/cluster_providers/gcp/gcp_deploy.jl | 126 ++++- src/cluster_providers/gcp/gcp_persist.jl | 145 +++++ 3 files changed, 610 insertions(+), 307 deletions(-) create mode 100644 src/cluster_providers/gcp/gcp_persist.jl diff --git a/src/cluster_providers/gcp/gcp_backend.jl b/src/cluster_providers/gcp/gcp_backend.jl index a8e4a93..3322064 100644 --- a/src/cluster_providers/gcp/gcp_backend.jl +++ b/src/cluster_providers/gcp/gcp_backend.jl @@ -3,118 +3,98 @@ using AWS: @service using Serialization using Base64 using Sockets -@service Ec2 -@service Efs + +using JSON + +import GoogleCloud as GCPAPI + +gcp_session = Ref{Any}(nothing) + +function gcp_check_session() + if isnothing(gcp_session[]) + # Creates a GCP session and stores it. + gcp_session[] = GCPAPI.GoogleSession(ENV["GOOGLE_APPLICATION_CREDENTIALS"], ["cloud-platform"]) + GCPAPI.set_session!(GCPAPI.compute, gcp_session[]) + end +end + #= Estrutura para Armazenar as informações e função de criação do cluster =# - mutable struct GCPManagerWorkers <: ManagerWorkers #Cluster name::String - instance_type_manager::String - instance_type_worker::String - count::Int image_id_manager::String image_id_worker::String - subnet_id::Union{String, Nothing} - placement_group::Union{String, Nothing} - auto_pg::Bool - security_group_id::Union{String, Nothing} - auto_sg::Bool + count::Int + instance_type_manager::String + instance_type_worker::String + user_manager::String + user_worker::String + zone::String + project::String cluster_nodes::Union{Dict{Symbol, String}, Nothing} - shared_fs::Bool features::Dict{Symbol, Any} end mutable struct GCPPeerWorkers <: PeerWorkers # Cluster name::String - instance_type::String - count::Int image_id::String - subnet_id::Union{String, Nothing} - placement_group::Union{String, Nothing} - auto_pg::Bool - security_group_id::Union{String,Nothing} - auto_sg::Bool + count::Int + instance_type::String + user::String + zone::String + project::String cluster_nodes::Union{Dict{Symbol, String}, Nothing} - shared_fs::Bool features::Dict{Symbol, Any} end mutable struct GCPPeerWorkersMPI <: PeerWorkersMPI # Cluster name::String - instance_type::String - count::Int image_id::String - subnet_id::Union{String, Nothing} - placement_group::Union{String, Nothing} - auto_pg::Bool - security_group_id::Union{String,Nothing} - auto_sg::Bool + count::Int + instance_type::String + zone::String cluster_nodes::Union{Dict{Symbol, String}, Nothing} - shared_fs::Bool features::Dict{Symbol, Any} end # PUBLIC +""" +Creates a compute instances cluster and returns it. 
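+
+Assumes an authenticated GCP session; `gcp_check_session` creates one from the
+service-account key referenced by `ENV["GOOGLE_APPLICATION_CREDENTIALS"]`.
+A minimal usage sketch, where `mycluster` is a hypothetical, already-constructed
+`GCPPeerWorkers` value:
+
+    mycluster = gcp_create_cluster(mycluster)
+    mycluster.cluster_nodes   # Dict mapping node symbols to GCP instance names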
+""" function gcp_create_cluster(cluster::Cluster) - + gcp_check_session() cluster.cluster_nodes = gcp_create_instances(cluster) cluster end - -function gcp_get_ips_instance(instance_id::String) - public_ip = Ec2.describe_instances(Dict("InstanceId" => instance_id))["reservationSet"]["item"]["instancesSet"]["item"]["ipAddress"] - private_ip = Ec2.describe_instances(Dict("InstanceId" => instance_id))["reservationSet"]["item"]["instancesSet"]["item"]["privateIpAddress"] - Dict(:public_ip => public_ip, :private_ip => private_ip) +function gcp_get_ips_instance(cluster::Cluster, name) + public_ip = gcp_get_instance_dict(cluster, name)["networkInterfaces"][1]["accessConfigs"][1]["natIP"] + private_ip = gcp_get_instance_dict(cluster, name)["networkInterfaces"][1]["networkIP"] + + return Dict(:public_ip => public_ip, :private_ip => private_ip) end # PUBLIC function gcp_terminate_cluster(cluster::Cluster) - gcp_delete_instances(cluster.cluster_nodes) - for instance in cluster.cluster_nodes - status = gcp_get_instance_status(instance[2]) - while status != "terminated" - println("Waiting for instances to terminate...") - sleep(2) - status = gcp_get_instance_status(instance[2]) - end - end - - cluster.shared_fs && gcp_delete_efs(cluster.file_system_id) - cluster.auto_sg && gcp_delete_security_group(cluster.security_group_id) - cluster.auto_pg && gcp_delete_placement_group(cluster.placement_group) - - return + gcp_delete_instances(cluster) + gcp_await_status(cluster, cluster.cluster_nodes, "notfound") end - #= Grupo de Alocação =# # PUBLIC function gcp_create_placement_group(name) - params = Dict( - "GroupName" => name, - "Strategy" => "cluster", - "TagSpecification" => - Dict( - "ResourceType" => "placement-group", - "Tag" => [Dict("Key" => "cluster", "Value" => name), - Dict("Key" => "Name", "Value" => name)] - ) - ) - Ec2.create_placement_group(params)["placementGroup"]["groupName"] + @warn "CALLED NOT IMPLEMENTED METHOD!" end function gcp_delete_placement_group(name) - params = Dict("GroupName" => name) - Ec2.delete_placement_group(name) + @warn "CALLED NOT IMPLEMENTED METHOD!" end #= @@ -122,37 +102,11 @@ Grupo de Segurança =# # PUBLIC function gcp_create_security_group(name, description) - # Criamos o grupo - params = Dict( - "TagSpecification" => - Dict( - "ResourceType" => "security-group", - "Tag" => [Dict("Key" => "cluster", "Value" => name), - Dict("Key" => "Name", "Value" => name)] - ) - ) - id = Ec2.create_security_group(name, description, params)["groupId"] - - # Liberamos o SSH. - params = Dict( - "GroupId" => id, - "CidrIp" => "0.0.0.0/0", - "IpProtocol" => "tcp", - "FromPort" => 22, - "ToPort" => 22) - Ec2.authorize_security_group_ingress(params) - - # Liberamos o tráfego interno do grupo. - sg_name = Ec2.describe_security_groups(Dict("GroupId" => id))["securityGroupInfo"]["item"]["groupName"] - params = Dict( - "GroupId" => id, - "SourceSecurityGroupName" => sg_name) - Ec2.authorize_security_group_ingress(params) - id + @warn "CALLED NOT IMPLEMENTED METHOD!" end function gcp_delete_security_group(id) - Ec2.delete_security_group(Dict("GroupId" => id)) + @warn "CALLED NOT IMPLEMENTED METHOD!" end #= @@ -160,162 +114,239 @@ Criação de Instâncias =# # Funções auxiliares. 
-function gcp_set_up_ssh_connection(cluster_name) +function gcp_set_up_ssh_connection(cluster_name, comment) + + internal_key_name = cluster_name + + ssh_path = joinpath(homedir(), ".ssh") + + !isdir(ssh_path) && mkdir(ssh_path) + + keypath = joinpath(ssh_path, "$internal_key_name.key") + pubpath = joinpath(ssh_path, "$internal_key_name.key.pub") + # Criar chave interna pública e privada do SSH. # chars = ['a':'z'; 'A':'Z'; '0':'9'] # random_suffix = join(chars[Random.rand(1:length(chars), 5)]) - internal_key_name = cluster_name - run(`ssh-keygen -f /tmp/$internal_key_name -N ""`) - private_key = base64encode(read("/tmp/$internal_key_name", String)) - public_key = base64encode(read("/tmp/$internal_key_name.pub", String)) + run(`ssh-keygen -t rsa -b 2048 -f $keypath -C $comment -N ""`) + run(`chmod 400 $keypath`) + private_key = base64encode(read(keypath, String)) + public_key = base64encode(read(pubpath, String)) + + private_key, public_key +end +function gcp_get_user_data(cluster_name, user, private_key, public_key) + # Define o script que irá instalar a chave pública e privada no headnode e workers. user_data = "#!/bin/bash -echo $private_key | base64 -d > /home/ubuntu/.ssh/$cluster_name -echo $public_key | base64 -d > /home/ubuntu/.ssh/$cluster_name.pub +echo $private_key | base64 -d > /home/$user/.ssh/$cluster_name +echo $public_key | base64 -d > /home/$user/.ssh/$cluster_name.pub echo 'Host * - IdentityFile /home/ubuntu/.ssh/$cluster_name - StrictHostKeyChecking no' > /home/ubuntu/.ssh/config -cat /home/ubuntu/.ssh/$cluster_name.pub >> /home/ubuntu/.ssh/authorized_keys -chown -R ubuntu:ubuntu /home/ubuntu/.ssh -chmod 600 /home/ubuntu/.ssh/* -sed -i 's/#ClientAliveInterval 0/ClientAliveInterval 1000/g' /etc/ssh/sshd_config -sed -i 's/#ClientAliveCountMax 3/ClientAliveCountMax 100/g' /etc/ssh/sshd_config -systemctl restart ssh + IdentityFile /home/$user/.ssh/$cluster_name + StrictHostKeyChecking no' > /home/$user/.ssh/config +cat /home/$user/.ssh/$cluster_name.pub >> /home/$user/.ssh/authorized_keys +chown -R $user:$user /home/$user/.ssh +chmod 600 /home/$user/.ssh/* +sudo sed -i 's/#ClientAliveInterval 0/ClientAliveInterval 1000/g' /etc/ssh/sshd_config +sudo sed -i 's/#ClientAliveCountMax 3/ClientAliveCountMax 100/g' /etc/ssh/sshd_config +sudo systemctl restart ssh " - [internal_key_name, user_data] -end - -function gcp_create_params(cluster::ManagerWorkers, user_data_base64) - params_manager = Dict( - "InstanceType" => cluster.instance_type_manager, - "ImageId" => cluster.image_id_manager, - "TagSpecification" => - Dict( - "ResourceType" => "instance", - "Tag" => [Dict("Key" => "cluster", "Value" => cluster.name), - Dict("Key" => "Name", "Value" => "manager") ] - ), - "UserData" => user_data_base64, - ) - params_workers = Dict( - "InstanceType" => cluster.instance_type_worker, - "ImageId" => cluster.image_id_worker, - "TagSpecification" => - Dict( - "ResourceType" => "instance", - "Tag" => [Dict("Key" => "cluster", "Value" => cluster.name), - Dict("Key" => "Name", "Value" => "worker") ] - ), - "UserData" => user_data_base64, - ) - - if !isnothing(cluster.subnet_id) - params_manager["SubnetId"] = cluster.subnet_id - params_workers["SubnetId"] = cluster.subnet_id - end - - if !isnothing(cluster.placement_group) - params_manager["Placement"] = Dict("GroupName" => cluster.placement_group) - params_workers["Placement"] = Dict("GroupName" => cluster.placement_group) - end + return user_data +end - if !isnothing(cluster.security_group_id) - params_manager["SecurityGroupId"] = 
[cluster.security_group_id] - params_workers["SecurityGroupId"] = [cluster.security_group_id] +function gcp_create_params(cluster::ManagerWorkers, cluster_nodes, internal_key_name, user_data, private_key, public_key) + + user_data_manager, user_data_worker = user_data + + ssh_path = joinpath(homedir(), ".ssh") + pubpath = joinpath(ssh_path, "$internal_key_name.key.pub") + + params_manager = Vector{Dict}() + + user_manager = cluster.user_manager + user_worker = cluster.user_worker + + push!(params_manager, Dict( + "disks" => [Dict( + "autoDelete" => true, + "boot" => true, + "initializeParams" => Dict( + "diskSizeGb" => 50, + "sourceImage" => "projects/$(cluster.image_id_manager)" + ), + "mode" => "READ_WRITE", + "type" => "PERSISTENT" + )], + "zone" => cluster.zone, + "machineType" => "zones/$(cluster.zone)/machineTypes/$(cluster.instance_type_manager)", + "name" => cluster_nodes[:manager], + "networkInterfaces" => [Dict( + "accessConfigs" => [Dict( + "name" => "external-nat", + "type" => "ONE_TO_ONE_NAT" + )], + "network" => "https://www.googleapis.com/compute/v1/projects/$(cluster.project)/global/networks/default" + )], + "metadata" => + "items" => [Dict( + "key" => "startup-script", + "value" => user_data_manager + ), + Dict( + "key" => "ssh-keys", + "value" => "$user_manager:$pubpath" + )] + )) + + params_workers = Vector{Dict}() + + for i = 1:cluster.count + push!(params_workers, Dict( + "disks" => [Dict( + "autoDelete" => true, + "boot" => true, + "initializeParams" => Dict( + "diskSizeGb" => 50, + "sourceImage" => "projects/$(cluster.image_id_worker)" + ), + "mode" => "READ_WRITE", + "type" => "PERSISTENT" + )], + "zone" => cluster.zone, + "machineType" => "zones/$(cluster.zone)/machineTypes/$(cluster.instance_type_worker)", + "name" => cluster_nodes[Symbol("worker$i")], + "networkInterfaces" => [Dict( + "accessConfigs" => [Dict( + "name" => "external-nat", + "type" => "ONE_TO_ONE_NAT" + )], + "network" => "https://www.googleapis.com/compute/v1/projects/$(cluster.project)/global/networks/default" + )], + "metadata" => + "items" => [Dict( + "key" => "startup-script", + "value" => user_data_worker + ), + Dict( + "key" => "ssh-keys", + "value" => "$user_worker:$pubpath" + )] + )) end - params_manager, params_workers + return params_manager, params_workers end -function gcp_create_params(cluster::PeerWorkers, user_data_base64) - params = Dict( - "InstanceType" => cluster.instance_type, - "ImageId" => cluster.image_id, - "TagSpecification" => - Dict( - "ResourceType" => "instance", - "Tag" => [Dict("Key" => "cluster", "Value" => cluster.name), - Dict("Key" => "Name", "Value" => "peer") ] - ), - "UserData" => user_data_base64, - ) +function gcp_create_params(cluster::PeerWorkers, cluster_nodes, internal_key_name, user_data, private_key, public_key) - if !isnothing(cluster.subnet_id) - params["SubnetId"] = cluster.subnet_id - end + ssh_path = joinpath(homedir(), ".ssh") + pubpath = joinpath(ssh_path, "$internal_key_name.key.pub") - if !isnothing(cluster.placement_group) - params["Placement"] = Dict("GroupName" => cluster.placement_group) - end + user = cluster.user - if !isnothing(cluster.security_group_id) - params["SecurityGroupId"] = [cluster.security_group_id] + params = Vector{Dict}() + + for instance in values(cluster_nodes) + push!(params, Dict( + "disks" => [Dict( + "autoDelete" => true, + "boot" => true, + "initializeParams" => Dict( + "diskSizeGb" => 50, + "sourceImage" => "projects/$(cluster.image_id)" + ), + "mode" => "READ_WRITE", + "type" => "PERSISTENT" + )], + "zone" => 
cluster.zone, + "machineType" => "zones/$(cluster.zone)/machineTypes/$(cluster.instance_type)", + "name" => instance, + "networkInterfaces" => [Dict( + "accessConfigs" => [Dict( + "name" => "external-nat", + "type" => "ONE_TO_ONE_NAT" + )], + "network" => "https://www.googleapis.com/compute/v1/projects/$(cluster.project)/global/networks/default" + )], + "metadata" => + "items" => [Dict( + "key" => "startup-script", + "value" => user_data + ), + Dict( + "key" => "ssh-keys", + "value" => "$user:$pubpath" + )] + )) end - params + return params end function gcp_remove_temp_files(internal_key_name) - run(`rm /tmp/$internal_key_name`) - run(`rm /tmp/$internal_key_name.pub`) + ssh_path = joinpath(homedir(), ".ssh") + keypath = joinpath(ssh_path, "$internal_key_name.key") + pubpath = joinpath(ssh_path, "$internal_key_name.key.pub") + rm(keypath) + rm(pubpath) end +function gcp_set_hostfile(cluster::Cluster, cluster_nodes, internal_key_name, user) + gcp_set_hostfile(cluster, cluster_nodes, internal_key_name, user, user) +end -function gcp_set_hostfile(cluster_nodes, internal_key_name) +function gcp_set_hostfile(cluster::Cluster, cluster_nodes, internal_key_name, user_manager, user_worker) + # Testando se a conexão SSH está ativa. - for instance in keys(cluster_nodes) - public_ip = Ec2.describe_instances(Dict("InstanceId" => cluster_nodes[instance]))["reservationSet"]["item"]["instancesSet"]["item"]["ipAddress"] + for (name, instance) in cluster_nodes + public_ip = gcp_get_ips_instance(cluster, instance)[:public_ip] connection_ok = false + print("Waiting for $name to become accessible .") while !connection_ok try connect(public_ip, 22) connection_ok = true catch e - println("Waiting for $instance to be accessible...") + print(".") end end + println("ok") end # Criando o arquivo hostfile. hostfile_content = "127.0.0.1 localhost\n" hostfilefile_content = "" - for instance in keys(cluster_nodes) - private_ip = Ec2.describe_instances(Dict("InstanceId" => cluster_nodes[instance]))["reservationSet"]["item"]["instancesSet"]["item"]["privateIpAddress"] + for (name, instance) in cluster_nodes + private_ip = gcp_get_ips_instance(cluster, instance)[:private_ip] hostfile_content *= "$private_ip $instance\n" - if instance != :manager + if name != :manager hostfilefile_content *= "$instance\n" end end - #=h = Threads.@spawn begin - public_ip = Ec2.describe_instances(Dict("InstanceId" => cluster_nodes[instance]))["reservationSet"]["item"]["instancesSet"]["item"]["ipAddress"] - for instance in keys(cluster_nodes) - for instance_other in keys(cluster_nodes) - @info "--- $instance -> $instance_other" - try_run(`ssh -i /tmp/$internal_key_name -o StrictHostKeyChecking=no ubuntu@$public_ip "ssh $instance_other uptime"`) - end - end - end=# + ssh_path = joinpath(homedir(), ".ssh") + keypath = joinpath(ssh_path, "$internal_key_name.key") + + home = ENV["HOME"] # Atualiza o hostname e o hostfile. - for instance in keys(cluster_nodes) - public_ip = Ec2.describe_instances(Dict("InstanceId" => cluster_nodes[instance]))["reservationSet"]["item"]["instancesSet"]["item"]["ipAddress"] + for (name, instance) in cluster_nodes + user = name == :manager ? 
user_manager : user_worker + public_ip = gcp_get_ips_instance(cluster, instance)[:public_ip] + run(`ssh-keygen -f $home/.ssh/known_hosts -R $public_ip`) # private_ip = Ec2.describe_instances(Dict("InstanceId" => cluster_nodes[instance]))["reservationSet"]["item"]["instancesSet"]["item"]["privateIpAddress"] - try_run(`ssh -i /tmp/$internal_key_name -o StrictHostKeyChecking=no ubuntu@$public_ip "sudo hostnamectl set-hostname $instance"`) - try_run(`ssh -i /tmp/$internal_key_name -o StrictHostKeyChecking=no ubuntu@$public_ip "echo '$hostfilefile_content' > /home/ubuntu/hostfile"`) -# try_run(`ssh -i /tmp/$internal_key_name -o StrictHostKeyChecking=no ubuntu@$public_ip "awk '{ print \$2 \" \" \$1 }' hostfile >> hosts.tmp"`) - try_run(`ssh -i /tmp/$internal_key_name -o StrictHostKeyChecking=no ubuntu@$public_ip "echo '$hostfile_content' >> hosts.tmp"`) - try_run(`ssh -i /tmp/$internal_key_name -o StrictHostKeyChecking=no ubuntu@$public_ip "sudo chown ubuntu:ubuntu /etc/hosts"`) - try_run(`ssh -i /tmp/$internal_key_name -o StrictHostKeyChecking=no ubuntu@$public_ip "cat hosts.tmp > /etc/hosts"`) - try_run(`ssh -i /tmp/$internal_key_name -o StrictHostKeyChecking=no ubuntu@$public_ip "sudo chown root:root /etc/hosts"`) - try_run(`ssh -i /tmp/$internal_key_name -o StrictHostKeyChecking=no ubuntu@$public_ip "rm hosts.tmp"`) + try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user@$public_ip "sudo hostnamectl set-hostname $instance"`) + try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user@$public_ip "echo '$hostfilefile_content' > /home/$user/hostfile"`) +# try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user@$public_ip "awk '{ print \$2 \" \" \$1 }' hostfile >> hosts.tmp"`) + try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user@$public_ip "echo '$hostfile_content' >> hosts.tmp"`) + try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user@$public_ip "sudo chown $user:$user /etc/hosts"`) + try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user@$public_ip "cat hosts.tmp > /etc/hosts"`) + try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user@$public_ip "sudo chown root:root /etc/hosts"`) + try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user@$public_ip "rm hosts.tmp"`) end - - #wait(h) - end @@ -325,98 +356,92 @@ Cria as instâncias. function gcp_create_instances(cluster::ManagerWorkers) + + new_cluster = cluster + cluster_nodes = Dict() + cluster_nodes[:manager] = lowercase(new_cluster.name) * string(1) + for i in 1:new_cluster.count + cluster_nodes[Symbol("worker$i")] = lowercase(new_cluster.name) * string(i + 1) + end + # Configurando a conexão SSH. 
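+    # (Same scheme as the EC2 backend: one internal key pair, with separate
+    # startup scripts for the manager and worker login users.)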
- internal_key_name, user_data = gcp_set_up_ssh_connection(cluster.name) + + private_key, public_key = gcp_set_up_ssh_connection(cluster.name, cluster.user_manager) + + user_data_manager = gcp_get_user_data(cluster.name, cluster.user_manager, private_key, public_key) + user_data_worker = gcp_get_user_data(cluster.name, cluster.user_worker, private_key, public_key) + + internal_key_name = cluster.name - user_data_base64 = base64encode(user_data) + try gcp_allow_ssh(cluster.project) catch end # Criando as instâncias - params_manager, params_workers = gcp_create_params(cluster, user_data_base64) + params_manager, params_workers = gcp_create_params(cluster, cluster_nodes, internal_key_name, (user_data_manager, user_data_worker), private_key, public_key) + # Criar o headnode - instance_headnode = Ec2.run_instances(1, 1, params_manager) - cluster_nodes[:manager] = instance_headnode["instancesSet"]["item"]["instanceId"] + gcp_compute_instance_insert(new_cluster, params_manager) # Criar os worker nodes. - params_workers["InstanceType"] = cluster.instance_type_worker - params_workers["TagSpecification"]["Tag"][2]["Value"] = "worker" - count = cluster.count - instances_workers = Ec2.run_instances(count, count, params_workers) - workers = count - for i in 1:count - instance = "" - if count > 1 - instance = instances_workers["instancesSet"]["item"][i] - elseif count == 1 - instance = instances_workers["instancesSet"]["item"] - end - instance_id = instance["instanceId"] - cluster_nodes[Symbol("worker$i")] = instance_id - end + gcp_compute_instance_insert(new_cluster, params_workers) - gcp_await_status(cluster_nodes, "running") - gcp_await_check(cluster_nodes, "ok") + gcp_await_status(new_cluster, cluster_nodes, "RUNNING") - gcp_set_hostfile(cluster_nodes, internal_key_name) + gcp_set_hostfile(new_cluster, cluster_nodes, internal_key_name, cluster.user_manager, cluster.user_worker) #gcp_remove_temp_files(internal_key_name) - cluster_nodes + return cluster_nodes end function gcp_create_instances(cluster::PeerWorkers) + + new_cluster = cluster + cluster_nodes = Dict() + for i = 1:new_cluster.count + cluster_nodes[Symbol("peer$i")] = lowercase(new_cluster.name) * string(i) + end # Configurando a conexão SSH. - internal_key_name, user_data = gcp_set_up_ssh_connection(cluster.name) + private_key, public_key = gcp_set_up_ssh_connection(cluster.name, cluster.user) + user_data = gcp_get_user_data(cluster.name, cluster.user, private_key, public_key) + + internal_key_name = cluster.name - user_data_base64 = base64encode(user_data) + try gcp_allow_ssh(cluster.project) catch end # Criando as instâncias - params = gcp_create_params(cluster, user_data_base64) + params = gcp_create_params(new_cluster, cluster_nodes, internal_key_name, user_data, private_key, public_key) # Criar os Peers. 
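+    # (Each peer instance is created from its prebuilt parameter dict through a
+    # Compute API instances.insert call; see gcp_compute_instance_insert.)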
- count = cluster.count - instances_peers = Ec2.run_instances(count, count, params) - for i in 1:count - instance = "" - if count > 1 - instance = instances_peers["instancesSet"]["item"][i] - elseif count == 1 - instance = instances_peers["instancesSet"]["item"] - end - instance_id = instance["instanceId"] - cluster_nodes[Symbol("peer$i")] = instance_id - end + gcp_compute_instance_insert(new_cluster, params) - gcp_await_status(cluster_nodes, "running") - gcp_await_check(cluster_nodes, "ok") + gcp_await_status(new_cluster, cluster_nodes, "RUNNING") - gcp_set_hostfile(cluster_nodes, internal_key_name) + gcp_set_hostfile(new_cluster, cluster_nodes, internal_key_name, cluster.user) - # gcp_remove_temp_files(internal_key_name) + #gcp_remove_temp_files(internal_key_name) - cluster_nodes + return cluster_nodes end -function gcp_await_status(cluster_nodes, status) - for nodeid in keys(cluster_nodes) - print("Waiting for $nodeid to be $status ...") - while gcp_get_instance_status(cluster_nodes[nodeid]) != status - print(".") - sleep(2) - end - println("successfull") +function gcp_compute_instance_insert(cluster::Cluster, params) + vector_size = size(params, 1) + for i = 1:vector_size + GCPAPI.compute(:Instance, :insert, cluster.project, cluster.zone; data=params[i]) end end -function gcp_await_check(cluster_nodes, status) +function gcp_await_status(cluster::Cluster, cluster_nodes, status) for nodeid in keys(cluster_nodes) print("Waiting for $nodeid to be $status ...") - while gcp_get_instance_check(cluster_nodes[nodeid]) != status + current_status = gcp_get_instance_status(cluster, cluster_nodes[nodeid]) + while current_status != status print(".") sleep(2) + current_status = gcp_get_instance_status(cluster, cluster_nodes[nodeid]) end println("successfull") end @@ -425,8 +450,9 @@ end # PUBLIC function gcp_cluster_status(cluster::Cluster, status_list) cluster_nodes = cluster.cluster_nodes - for nodeid in keys(cluster_nodes) - !(gcp_get_instance_status(cluster_nodes[nodeid]) in status_list) && return false + for nodeid in values(cluster_nodes) + current_status = gcp_get_instance_status(cluster, nodeid) + !(current_status in status_list) && return false end return true end @@ -439,43 +465,20 @@ function gcp_cluster_ready(cluster::Cluster; status="ok") return true end -function gcp_delete_instances(cluster_nodes) - for id in values(cluster_nodes) - Ec2.terminate_instances(id) - end -end - -function gcp_get_instance_status(id) - try - description = Ec2.describe_instances(Dict("InstanceId" => id)) - if haskey(description["reservationSet"], "item") - description["reservationSet"]["item"]["instancesSet"]["item"]["instanceState"]["name"] - else - "notfound" - end - catch _ - "notfound" +function gcp_delete_instances(cluster::Cluster) + for id in values(cluster.cluster_nodes) + GCPAPI.compute(:Instance, :delete, cluster.project, cluster.zone, id) end end -function gcp_get_instance_check(id) +function gcp_get_instance_status(cluster::Cluster, id) try - description = Ec2.describe_instance_status(Dict("InstanceId" => id)) - if haskey(description["instanceStatusSet"], "item") - description["instanceStatusSet"]["item"]["instanceStatus"]["status"] - else - "notfound" - end + return gcp_get_instance_dict(cluster, id)["status"] catch _ - "notfound" + return "notfound" end end -function gcp_get_instance_subnet(id) - description = Ec2.describe_instances(Dict("InstanceId" => id)) - description["reservationSet"]["item"]["instancesSet"]["item"]["subnetId"] -end - # PUBLIC gcp_can_interrupt(cluster::Cluster) = 
gcp_cluster_isrunning(cluster) @@ -483,40 +486,68 @@ gcp_can_interrupt(cluster::Cluster) = gcp_cluster_isrunning(cluster) # PUBLIC function gcp_interrupt_cluster(cluster::Cluster) gcp_stop_instances(cluster) - gcp_await_status(cluster.cluster_nodes, "stopped") + gcp_await_status(cluster, cluster.cluster_nodes, "TERMINATED") end # PUBLIC -gcp_can_resume(cluster::Cluster) = gcp_cluster_status(cluster, ["stopped"]) +gcp_can_resume(cluster::Cluster) = gcp_cluster_status(cluster, ["TERMINATED"]) # Start interrupted cluster instances or reboot running cluster instances. # All instances must be in "interrupted" or "running" state. # If some instance is not in "interrupted" or "running" state, raise an exception. # PUBLIC -function gcp_resume_cluster(cluster::Cluster) +function gcp_resume_cluster(cluster::ManagerWorkers) + home = ENV["HOME"] + ssh_path = joinpath(homedir(), ".ssh") + keypath = joinpath(ssh_path, "$(cluster.name).key") + user_manager = cluster.user_manager + user_worker = cluster.user_worker + gcp_start_instances(cluster) - gcp_await_status(cluster.cluster_nodes, "running") - gcp_await_check(cluster.cluster_nodes, "ok") - for instance in keys(cluster.cluster_nodes) - public_ip = Ec2.describe_instances(Dict("InstanceId" => cluster.cluster_nodes[instance]))["reservationSet"]["item"]["instancesSet"]["item"]["ipAddress"] - try_run(`ssh -i /tmp/$(cluster.name) -o StrictHostKeyChecking=no ubuntu@$public_ip uptime`) + gcp_await_status(cluster, cluster.cluster_nodes, "RUNNING") + + public_ip = gcp_get_ips_instance(cluster, cluster.cluster_nodes[:manager])[:public_ip] + + run(`ssh-keygen -f $home/.ssh/known_hosts -R $public_ip`) + try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user_manager@$public_ip uptime`) + + for i in 1:cluster.count + instance = cluster.cluster_nodes[Symbol("worker$i")] + public_ip = gcp_get_ips_instance(cluster, instance)[:public_ip] + run(`ssh-keygen -f $home/.ssh/known_hosts -R $public_ip`) + try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user_worker@$public_ip uptime`) end end +function gcp_resume_cluster(cluster::PeerWorkers) + home = ENV["HOME"] + ssh_path = joinpath(homedir(), ".ssh") + keypath = joinpath(ssh_path, "$(cluster.name).key") + user = cluster.user + + gcp_start_instances(cluster) + gcp_await_status(cluster, cluster.cluster_nodes, "RUNNING") + + for instance in values(cluster.cluster_nodes) + public_ip = gcp_get_ips_instance(cluster, instance)[:public_ip] + run(`ssh-keygen -f $home/.ssh/known_hosts -R $public_ip`) + try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user@$public_ip uptime`) + end +end # Check if the cluster instances are running or interrupted. 
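+# (GCP reports a stopped instance with status TERMINATED, and a deleted
+# instance disappears from the API, which gcp_get_instance_status maps to
+# "notfound".)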
-gcp_cluster_isrunning(cluster::Cluster) = gcp_cluster_status(cluster, ["running"]) && gcp_cluster_ready(cluster) -gcp_cluster_isstopped(cluster::Cluster) = gcp_cluster_status(cluster, ["stopped"]) +gcp_cluster_isrunning(cluster::Cluster) = gcp_cluster_status(cluster, ["RUNNING"]) #&& gcp_cluster_ready(cluster) +gcp_cluster_isstopped(cluster::Cluster) = gcp_cluster_status(cluster, ["TERMINATED"]) function gcp_stop_instances(cluster::Cluster) for id in values(cluster.cluster_nodes) - Ec2.stop_instances(id) + GCPAPI.compute(:Instance, :stop, cluster.project, cluster.zone, id) end end function gcp_start_instances(cluster::Cluster) for id in values(cluster.cluster_nodes) - Ec2.start_instances(id) + GCPAPI.compute(:Instance, :start, cluster.project, cluster.zone, id) end end @@ -524,7 +555,30 @@ end function gcp_get_ips(cluster::Cluster) ips = Dict() for (node, id) in cluster.cluster_nodes - ips[node] = gcp_get_ips_instance(id) + ips[node] = gcp_get_ips_instance(cluster, id) end ips +end + +function gcp_get_instance_dict(cluster::Cluster, name) + gcp_check_session() + return JSON.parse(String(GCPAPI.compute(:Instance, :get, cluster.project, cluster.zone, name))) +end + + +function gcp_allow_ssh(project) + firewall_rule = Dict( + "allowed" => [ + Dict("IPProtocol" => "tcp", + "ports" => ["22"])], + "direction" => "INGRESS", + "kind" => "compute#firewall", + "name" => "allow-ssh", + "network" => "projects/$project/global/networks/default", + "priority" => 1000, + "selfLink" => "projects/$project/global/firewalls/allow-ssh", + "sourceRanges" => ["0.0.0.0/0"] + ) + + GCPAPI.compute(:Firewall, :insert, project; data=firewall_rule) end \ No newline at end of file diff --git a/src/cluster_providers/gcp/gcp_deploy.jl b/src/cluster_providers/gcp/gcp_deploy.jl index 44e83b1..ea42b21 100644 --- a/src/cluster_providers/gcp/gcp_deploy.jl +++ b/src/cluster_providers/gcp/gcp_deploy.jl @@ -1,40 +1,144 @@ +gcp_cluster_info = Dict() + +#=function get_ips(gcptype::Type{GoogleCloud}, cluster_handle) + ips = Vector{Dict}() + cluster = gcp_cluster_info[cluster_handle] + try + for i in cluster.count + name = lowercase(String(cluster_handle)) * string(i) + push!(ips, gcp_get_ips_instance(cluster, name)) + end + catch err + terminate_cluster(gcptype, cluster_handle) + + throw(err) + end + + return ips +end=# + +get_ips(_::Type{GoogleCloud}, cluster_handle) = gcp_cluster_info[cluster_handle] |> gcp_get_ips + + # 1. creates a worker process in the manager node # 2. 
from the manager node, create worker processes in the compute nodes with MPIClusterManager -function deploy_cluster(type::Type{GoogleCloud}, mode::Type{LinkMode}, features) +function deploy_cluster(gcptype::Type{GoogleCloud}, + _::Type{<:ManagerWorkers}, + _::Type{<:CreateMode}, + cluster_handle, + cluster_features, + instance_type) + node_count = get(cluster_features, :node_count, 1) + + imageid_manager, imageid_worker = extract_mwfeature(cluster_features, GoogleCloud, :imageid) + user_manager, user_worker = extract_mwfeature(cluster_features, GoogleCloud, :user) + + #image_id_workers = get(cluster_features, :image_id, defaults_dict[GoogleCloud][:image_id]) + #image_id_manager = get(cluster_features, :image_id_manager, defaults_dict[GoogleCloud][:image_id_manager]) + zone = get(cluster_features, :zone, defaults_dict[GoogleCloud][:zone]) + project = defaults_dict[GoogleCloud][:project] + instance_type_manager = instance_type[1] + instance_type_worker = instance_type[2] + + cluster = GCPManagerWorkers(string(cluster_handle), + imageid_manager, + imageid_worker, + node_count, + instance_type_manager, + instance_type_worker, + user_manager, + user_worker, + zone, + project, + nothing, + cluster_features) + gcp_create_cluster(cluster) + + gcp_cluster_info[cluster_handle] = cluster + + gcp_cluster_save(cluster) + + return cluster end # 1. run the script to clusterize the nodes # 2. call deploy_cluster to link ... -function deploy_cluster(type::Type{GoogleCloud}, mode::Type{ClusterizeMode}, features) +function deploy_cluster(gcptype::Type{GoogleCloud}, + _::Type{<:PeerWorkers}, + _::Type{<:CreateMode}, + cluster_handle, + cluster_features, + instance_type) -end + node_count = get(cluster_features, :node_count, 1) + imageid = get(cluster_features, :imageid, defaults_dict[GoogleCloud][:imageid]) + user = get(cluster_features, :user, defaults_dict[GoogleCloud][:user]) + zone = get(cluster_features, :zone, defaults_dict[GoogleCloud][:zone]) + project = defaults_dict[GoogleCloud][:project] -# 1. create a set of GCP instances using the GCP API -# 2. 
run deploy_cluster to clusterize them and link to them -function deploy_cluster(type::Type{GoogleCloud}, mode::Type{CreateMode}, features) + cluster = GCPPeerWorkers(string(cluster_handle), + imageid, + node_count, + instance_type, + user, + zone, + project, + nothing, + cluster_features) + + gcp_create_cluster(cluster) + + gcp_cluster_info[cluster_handle] = cluster + + gcp_cluster_save(cluster) + return cluster end -function launch_processes(_::Type{GoogleCloud}, cluster_type, cluster_handle, ips, user_id) +function launch_processes(_::Type{GoogleCloud}, cluster_type, cluster_handle, ips) + cluster = gcp_cluster_info[cluster_handle] + + return launch_processes_ssh(cluster.features, cluster_type, ips) end #==== INTERRUPT CLUSTER ====# +can_interrupt(_::Type{GoogleCloud}, cluster_handle) = gcp_cluster_info[cluster_handle] |> gcp_can_interrupt + function interrupt_cluster(_::Type{GoogleCloud}, cluster_handle) - + cluster = gcp_cluster_info[cluster_handle] + gcp_interrupt_cluster(cluster) end #==== CONTINUE CLUSTER ====# -function resume_cluster(type::Type{GoogleCloud}, cluster_handle) - +can_resume(_::Type{GoogleCloud}, cluster_handle) = gcp_cluster_info[cluster_handle] |> gcp_can_resume + +function resume_cluster(_::Type{GoogleCloud}, cluster_handle) + cluster = gcp_cluster_info[cluster_handle] + gcp_resume_cluster(cluster) + return gcp_get_ips(cluster) end + #==== TERMINATE CLUSTER ====# function terminate_cluster(type::Type{GoogleCloud}, cluster_handle) - + cluster = gcp_cluster_info[cluster_handle] + gcp_terminate_cluster(cluster) + gcp_delete_cluster(cluster_handle) + delete!(gcp_cluster_info, cluster_handle) + return +end + +function cluster_isrunning(_::Type{GoogleCloud}, cluster_handle) + try + return gcp_cluster_info[cluster_handle] |> gcp_cluster_isrunning + catch e + @warn "Erro ao verificar o status do cluster: ", e + return false + end end \ No newline at end of file diff --git a/src/cluster_providers/gcp/gcp_persist.jl b/src/cluster_providers/gcp/gcp_persist.jl new file mode 100644 index 0000000..9eb2c2d --- /dev/null +++ b/src/cluster_providers/gcp/gcp_persist.jl @@ -0,0 +1,145 @@ + + +function gcp_cluster_save(cluster::ManagerWorkers) + + contents = Dict() + + contents["type"] = ManagerWorkers + contents["timestamp"] = string(now()) + contents["provider"] = GoogleCloud + + contents["name"] = cluster.name + contents["user_manager"] = cluster.user_manager + contents["user_worker"] = cluster.user_worker + contents["instance_type_manager"] = cluster.instance_type_manager + contents["instance_type_worker"] = cluster.instance_type_worker + contents["count"] = cluster.count + contents["image_id_manager"] = cluster.image_id_manager + contents["image_id_worker"] = cluster.image_id_worker + contents["cluster_nodes"] = cluster.cluster_nodes + contents["cluster_features"] = cluster.features + contents["zone"] = cluster.zone + contents["project"] = cluster.project + + configpath = get(ENV,"CLOUD_CLUSTERS_CONFIG", pwd()) + + open(joinpath(configpath, string(cluster.name, ".cluster")), "w") do io + TOML.print(io, contents) do x + x isa DataType && return string(x) + error("unhandled type $(typeof(x))") + end + end + +end + +function gcp_cluster_save(cluster::PeerWorkers) + + contents = Dict() + + contents["type"] = PeerWorkers + contents["timestamp"] = string(now()) + contents["provider"] = GoogleCloud + + contents["name"] = cluster.name + contents["user"] = cluster.user + contents["instance_type"] = cluster.instance_type + contents["count"] = cluster.count + contents["image_id"] = 
+function gcp_cluster_save(cluster::PeerWorkers)
+
+    contents = Dict()
+
+    contents["type"] = PeerWorkers
+    contents["timestamp"] = string(now())
+    contents["provider"] = GoogleCloud
+
+    contents["name"] = cluster.name
+    contents["user"] = cluster.user
+    contents["instance_type"] = cluster.instance_type
+    contents["count"] = cluster.count
+    contents["image_id"] = cluster.image_id
+    contents["zone"] = cluster.zone
+    contents["project"] = cluster.project
+    contents["cluster_nodes"] = cluster.cluster_nodes
+    contents["cluster_features"] = cluster.features
+
+    configpath = get(ENV, "CLOUD_CLUSTERS_CONFIG", pwd())
+
+    open(joinpath(configpath, string(cluster.name, ".cluster")), "w") do io
+        TOML.print(io, contents) do x
+            x isa DataType && return string(x)
+            error("unhandled type $(typeof(x))")
+        end
+    end
+
+end
+
+
+function cluster_load(_::Type{GoogleCloud}, _::Type{<:ManagerWorkers}, cluster_handle, contents)
+
+    instance_type_manager = contents["instance_type_manager"]
+    instance_type_worker = contents["instance_type_worker"]
+    count = contents["count"]
+    image_id_manager = contents["image_id_manager"]
+    image_id_worker = contents["image_id_worker"]
+    user_manager = contents["user_manager"]
+    user_worker = contents["user_worker"]
+    zone = contents["zone"]
+    project = contents["project"]
+
+    _cluster_nodes = contents["cluster_nodes"]
+    cluster_nodes = Dict()
+    for (node_name, instance_id) in _cluster_nodes
+        cluster_nodes[Symbol(node_name)] = instance_id
+    end
+
+    cluster_features = contents["cluster_features"] |> gcp_adjusttypefeatures
+
+    cluster = GCPManagerWorkers(string(cluster_handle), image_id_manager, image_id_worker, count,
+                                instance_type_manager, instance_type_worker, user_manager, user_worker,
+                                zone, project, cluster_nodes, cluster_features)
+
+    if gcp_cluster_status(cluster, ["RUNNING", "TERMINATED"])
+        gcp_cluster_info[cluster_handle] = cluster
+        return cluster.features
+    else
+        gcp_delete_cluster(cluster_handle)
+        return nothing
+    end
+end
+
+
+function gcp_adjusttypefeatures(_cluster_features)
+    cluster_features = Dict()
+    for (id, vl0) in _cluster_features
+        idsym = Symbol(id)
+        vl1 = idsym in [:cluster_type, :node_machinetype, :provider, :node_provider] ? fetchtype(vl0) : vl0
+        vl2 = idsym in [:worker_features, :manager_features] ? 
gcp_adjusttypefeatures(vl1) : vl1 + cluster_features[idsym] = vl2 + end + return cluster_features +end + +function cluster_load(_::Type{GoogleCloud}, _::Type{<:PeerWorkers}, cluster_handle, contents) + + image_id = contents["image_id"] + count = contents["count"] + instance_type = contents["instance_type"] + user = contents["user"] + zone = contents["zone"] + project = contents["project"] + + _cluster_nodes = contents["cluster_nodes"] + cluster_nodes = Dict() + for (node_name, instance_id) in _cluster_nodes + cluster_nodes[Symbol(node_name)] = instance_id + end + + cluster_features = contents["cluster_features"] |> gcp_adjusttypefeatures + + cluster = GCPPeerWorkers(string(cluster_handle), image_id, count, instance_type, user, zone, project, + cluster_nodes, cluster_features) + + if gcp_cluster_status(cluster, ["RUNNING", "TERMINATED"]) + gcp_cluster_info[cluster_handle] = cluster + return cluster.features + else + gcp_delete_cluster(cluster_handle) + return nothing + end +end + +function gcp_delete_cluster(cluster_handle) + configpath = get(ENV,"CLOUD_CLUSTERS_CONFIG", pwd()) + rm(joinpath(configpath, "$cluster_handle.cluster")) + gcp_remove_temp_files(cluster_handle) +end + + From 745aacdc860dc76830a6d115c1033333548c9e54 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Tue, 28 Jan 2025 08:11:09 -0300 Subject: [PATCH 3/5] GCP support --- src/cluster_providers/ec2/ec2_deploy.jl | 4 +- src/cluster_providers/ec2/ec2_persist.jl | 10 +++- src/cluster_providers/gcp/gcp_backend.jl | 2 + src/cluster_providers/gcp/gcp_deploy.jl | 34 +++++++++----- src/deploy.jl | 60 +++++++++++++++--------- src/utils.jl | 20 ++++++++ 6 files changed, 91 insertions(+), 39 deletions(-) diff --git a/src/cluster_providers/ec2/ec2_deploy.jl b/src/cluster_providers/ec2/ec2_deploy.jl index 6a98353..27d7b7e 100644 --- a/src/cluster_providers/ec2/ec2_deploy.jl +++ b/src/cluster_providers/ec2/ec2_deploy.jl @@ -54,7 +54,7 @@ function deploy_cluster(_::Type{AmazonEC2}, count = get(cluster_features, :node_count, 1) imageid = get(cluster_features, :imageid, defaults_dict[AmazonEC2][:imageid]) - user = get(cluster_features, :user, defaults_dict[GoogleCloud][:user]) + user = get(cluster_features, :user, defaults_dict[AmazonEC2][:user]) subnet_id = get(cluster_features, :subnet_id, get(defaults_dict[AmazonEC2], :subnet_id, nothing)) placement_group = get(cluster_features, :placement_group, get(defaults_dict[AmazonEC2], :placement_group, nothing)) @@ -75,7 +75,7 @@ function deploy_cluster(_::Type{AmazonEC2}, return cluster end -ec2_build_clusterobj(_::Type{<:PeerWorkers}, cluster_handle, instance_type, count, imageid, subnet_id, user, +ec2_build_clusterobj(_::Type{<:PeerWorkers}, cluster_handle, instance_type, count, imageid, user, subnet_id, placement_group, auto_pg, security_group_id, auto_sg, cluster_features) = EC2PeerWorkers(cluster_handle, instance_type, count, imageid, user, subnet_id, placement_group, auto_pg, security_group_id, auto_sg, diff --git a/src/cluster_providers/ec2/ec2_persist.jl b/src/cluster_providers/ec2/ec2_persist.jl index 817a89b..2426450 100644 --- a/src/cluster_providers/ec2/ec2_persist.jl +++ b/src/cluster_providers/ec2/ec2_persist.jl @@ -14,6 +14,8 @@ function ec2_cluster_save(cluster::ManagerWorkers) contents["count"] = cluster.count contents["image_id_manager"] = cluster.image_id_manager contents["image_id_worker"] = cluster.image_id_worker + contents["user_manager"] = cluster.user_manager + contents["user_worker"] = cluster.user_worker !isnothing(cluster.subnet_id) && 
(contents["subnet_id"] = cluster.subnet_id) !isnothing(cluster.placement_group) && (contents["placement_group"] = cluster.placement_group) contents["auto_pg"] = cluster.auto_pg @@ -49,6 +51,7 @@ function ec2_cluster_save(cluster::PeerWorkers) contents["instance_type"] = cluster.instance_type contents["count"] = cluster.count contents["image_id"] = cluster.image_id + contents["user"] = cluster.user !isnothing(cluster.subnet_id) && (contents["subnet_id"] = cluster.subnet_id) !isnothing(cluster.placement_group) && (contents["placement_group"] = cluster.placement_group) contents["auto_pg"] = cluster.auto_pg @@ -79,6 +82,8 @@ function cluster_load(_::Type{AmazonEC2}, _::Type{<:ManagerWorkers}, cluster_han count = contents["count"] image_id_manager = contents["image_id_manager"] image_id_worker = contents["image_id_worker"] + user_manager = contents["user_manager"] + user_worker = contents["user_worker"] subnet_id = haskey(contents, "subnet_id") ? contents["subnet_id"] : nothing placement_group = haskey(contents, "placement_group") ? contents["placement_group"] : nothing auto_pg = contents["auto_pg"] @@ -95,7 +100,7 @@ function cluster_load(_::Type{AmazonEC2}, _::Type{<:ManagerWorkers}, cluster_han shared_fs = contents["shared_fs"] cluster = EC2ManagerWorkers(string(cluster_handle), instance_type_manager, instance_type_worker, count, - image_id_manager, image_id_worker, + image_id_manager, image_id_worker, user_manager, user_worker, subnet_id, placement_group, auto_pg, security_group_id, auto_sg, environment, cluster_nodes, shared_fs, cluster_features) @@ -125,6 +130,7 @@ function cluster_load(_::Type{AmazonEC2}, _::Type{<:PeerWorkers}, cluster_handle instance_type = contents["instance_type"] count = contents["count"] image_id = contents["image_id"] + user = contents["user"] subnet_id = haskey(contents, "subnet_id") ? contents["subnet_id"] : nothing placement_group = haskey(contents, "placement_group") ? contents["placement_group"] : nothing auto_pg = contents["auto_pg"] @@ -141,7 +147,7 @@ function cluster_load(_::Type{AmazonEC2}, _::Type{<:PeerWorkers}, cluster_handle shared_fs = contents["shared_fs"] cluster = EC2PeerWorkers(string(cluster_handle), instance_type, count, - image_id, + image_id, user, subnet_id, placement_group, auto_pg, security_group_id, auto_sg, environment, cluster_nodes, shared_fs, cluster_features) diff --git a/src/cluster_providers/gcp/gcp_backend.jl b/src/cluster_providers/gcp/gcp_backend.jl index 3322064..64dbf72 100644 --- a/src/cluster_providers/gcp/gcp_backend.jl +++ b/src/cluster_providers/gcp/gcp_backend.jl @@ -56,7 +56,9 @@ mutable struct GCPPeerWorkersMPI <: PeerWorkersMPI # Cluster image_id::String count::Int instance_type::String + user::String zone::String + project::String cluster_nodes::Union{Dict{Symbol, String}, Nothing} features::Dict{Symbol, Any} end diff --git a/src/cluster_providers/gcp/gcp_deploy.jl b/src/cluster_providers/gcp/gcp_deploy.jl index ea42b21..0e9eaec 100644 --- a/src/cluster_providers/gcp/gcp_deploy.jl +++ b/src/cluster_providers/gcp/gcp_deploy.jl @@ -66,7 +66,7 @@ end # 1. run the script to clusterize the nodes # 2. call deploy_cluster to link ... 
function deploy_cluster(gcptype::Type{GoogleCloud}, - _::Type{<:PeerWorkers}, + cluster_type::Type{<:PeerWorkers}, _::Type{<:CreateMode}, cluster_handle, cluster_features, @@ -78,15 +78,16 @@ function deploy_cluster(gcptype::Type{GoogleCloud}, zone = get(cluster_features, :zone, defaults_dict[GoogleCloud][:zone]) project = defaults_dict[GoogleCloud][:project] - cluster = GCPPeerWorkers(string(cluster_handle), - imageid, - node_count, - instance_type, - user, - zone, - project, - nothing, - cluster_features) + cluster = gcp_build_clusterobj(cluster_type, + string(cluster_handle), + imageid, + node_count, + instance_type, + user, + zone, + project, + nothing, + cluster_features) gcp_create_cluster(cluster) @@ -97,11 +98,20 @@ function deploy_cluster(gcptype::Type{GoogleCloud}, return cluster end +gcp_build_clusterobj(_::Type{<:PeerWorkers}, name, image_id, count, instance_type, user, zone, project, cluster_nodes, features) = + GCPPeerWorkers(name, image_id, count, instance_type, user, zone, project, cluster_nodes, features) + +gcp_build_clusterobj(_::Type{<:PeerWorkersMPI}, name, image_id, count, instance_type, user, zone, project, cluster_nodes, features) = + GCPPeerWorkersMPI(name, image_id, count, instance_type, user, zone, project, cluster_nodes, features) -function launch_processes(_::Type{GoogleCloud}, cluster_type, cluster_handle, ips) +function launch_processes(_::Type{GoogleCloud}, cluster_type::Type{<:Cluster}, cluster_handle, ips) cluster = gcp_cluster_info[cluster_handle] + launch_processes_ssh(cluster.features, cluster_type, ips) +end - return launch_processes_ssh(cluster.features, cluster_type, ips) +function launch_processes(_::Type{GoogleCloud}, cluster_type::Type{<:PeerWorkersMPI}, cluster_handle, ips) + cluster = gcp_cluster_info[cluster_handle] + launch_processes_mpi(cluster.features, cluster_type, ips) end #==== INTERRUPT CLUSTER ====# diff --git a/src/deploy.jl b/src/deploy.jl index 0556cba..77982c3 100644 --- a/src/deploy.jl +++ b/src/deploy.jl @@ -22,12 +22,12 @@ default_sshflags(provider_type) = defaults_dict[provider_type][:sshflags] function extract_mwfeature(cluster_features, provider_type, featureid) if haskey(cluster_features, :manager_features) && - haskey(cluster_features, :worker_features) && - haskey(cluster_features[:manager_features], featureid) && - haskey(cluster_features[:worker_features], featureid) && - !haskey(cluster_features, featureid) - feature_manager = cluster_features[:manager_features][featureid] - feature_worker = cluster_features[:worker_features][featureid] + haskey(cluster_features, :worker_features) && + haskey(cluster_features[:manager_features], featureid) && + haskey(cluster_features[:worker_features], featureid) && + !haskey(cluster_features, featureid) + feature_manager = cluster_features[:manager_features][featureid] + feature_worker = cluster_features[:worker_features][featureid] elseif haskey(cluster_features, featureid) feature_manager = feature_worker = cluster_features[featureid] else @@ -65,6 +65,7 @@ function cluster_deploy(contract_handle, config_args...) 
cluster_terminate(cluster_handle)
         return :unsupported_mwcluster
     else
+        save_exception_details()
         @error "Some error deploying cluster $cluster_handle ($e)"
         @warn "the cluster will be terminated"
         cluster_terminate(cluster_handle)
@@ -385,12 +386,16 @@ function cluster_interrupt(cluster_handle)
       try
          kill_processes(cluster_handle, cluster_type, cluster_features)
          sleep(1)
+      catch e
+         save_exception_details()
+         @warn "error killing processes of cluster $cluster_handle ($e)"
       finally
          interrupt_cluster(node_provider, cluster_handle)
       end
       #@info "the cluster $cluster_handle has been interrupted"
    catch e
-      println(e)
+      save_exception_details()
+      @error "error interrupting cluster $cluster_handle ($e)"
       return :fail
    end
    return :success
@@ -409,20 +414,21 @@ function cluster_resume(cluster_handle)
       try
          pids = launch_processes(node_provider, cluster_type, cluster_handle, ips)
       catch e
+         save_exception_details()
         @warn "some error creating processes for cluster $cluster_handle ($e)"
+         @warn "use '@restart $cluster_handle' to launch processes of cluster $cluster_handle."
       end
 
       if !isnothing(pids)
          cluster_deploy_info[cluster_handle][:pids] = pids
-      else
-         @error "resume partially failed due to an unrecoverable error in launching processes"
       end
-      #@info "the cluster $cluster_handle has been resumed"
    catch e
-      println(e)
+      save_exception_details()
+      @error "error resuming cluster $cluster_handle ($e)"
      return :fail
    end
+
    return :success
 end
 
@@ -436,14 +442,17 @@ function cluster_terminate(cluster_handle)
    try
       cluster_isrunning(node_provider, cluster_handle) && kill_processes(cluster_handle, cluster_features[:cluster_type], cluster_features)
      sleep(1)
+   catch e
+      save_exception_details()
+      @warn "error killing processes of cluster $cluster_handle ($e)"
    finally
      terminate_cluster(node_provider, cluster_handle)
      terminated_cluster[cluster_handle] = cluster_deploy_info[cluster_handle]
      delete!(cluster_deploy_info, cluster_handle)
    end
-   #@info "the cluster $cluster_handle has been terminated"
    catch e
-      println(e)
+      save_exception_details()
+      @error "error terminating cluster $cluster_handle ($e)"
      return :fail
    end
    return :success
@@ -480,13 +489,16 @@ function cluster_restart(cluster_handle::Symbol)
    cluster_type = cluster_features[:cluster_type]
    try
      kill_processes(cluster_handle, cluster_type, cluster_features)
-   finally
-     ips = get_ips(cluster_provider, cluster_handle)
-     pids = launch_processes(cluster_provider, cluster_type, cluster_handle, ips)
-     cluster_deploy_info[cluster_handle][:pids] = pids
-   end
+   catch e
+      @error "error killing processes of cluster $cluster_handle ($e)"
+      throw(e)
+   end
+   ips = get_ips(cluster_provider, cluster_handle)
+   pids = launch_processes(cluster_provider, cluster_type, cluster_handle, ips)
+   cluster_deploy_info[cluster_handle][:pids] = pids
    catch e
-      println(e)
+      save_exception_details()
+      @error "error restarting processes of cluster $cluster_handle ($e)"
      return :fail
    end
@@ -519,8 +531,8 @@ function cluster_reconnect(cluster_handle::Symbol)
    try
      pids = launch_processes(cluster_provider, cluster_type, cluster_handle, ips)
    catch e
+     save_exception_details()
      @warn "exception caught when launching processes ($e) - fix the problem and try '@restart :$cluster_handle'"
-     @error "error launching processes"
    end
 
    if !isnothing(pids)
@@ -533,6 +545,7 @@ function cluster_reconnect(cluster_handle::Symbol)
      @error "The cluster $cluster_handle is not active"
    end
    catch e
+        save_exception_details()
        println(e)
        return :fail
    end
@@ -544,6 +557,7 @@ function 
report_exception(e) + save_exception_details() if e isa CompositeException @info "reporting composite exception:" for ee in e.exceptions @@ -590,12 +604,12 @@ function load_cluster(cluster_handle::String; from = DateTime(0), cluster_type = result[:timestamp] = timestamp result[:features] = cluster_features else - @warn "$this_cluster_type cluster $cluster_handle is not active" + @warn "$this_cluster_type cluster $cluster_handle is not accessible" end end catch e - @error e - @error "cluster $cluster_handle not found" + save_exception_details() + @warn "cluster $cluster_handle not found" end return result end diff --git a/src/utils.jl b/src/utils.jl index 201b799..770d9ea 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -24,4 +24,24 @@ function try_run(command) end end +end + +last_exceptions = Ref{Vector{Any}}(Vector{Any}()) + +function save_exception_details() + + empty!(last_exceptions[]) + for (exc, bt) in current_exceptions() + push!(last_exceptions[],(exc, bt)) + end + +end + +function show_exceptions() + + for (exc, bt) in last_exceptions[] + showerror(stdout, exc, bt) + println(stdout) + end + end \ No newline at end of file From 75a780e74f2353d4a355eb4a514f70a1ac1428a9 Mon Sep 17 00:00:00 2001 From: Francisco Heron de Carvalho Junior Date: Thu, 6 Feb 2025 10:25:10 -0300 Subject: [PATCH 4/5] GCP support, @status, @config, minor bug corrections --- CCconfig.toml | 6 +- Project.toml | 2 +- README.md | 387 ++++++++++++----------- docs/src/index.md | 387 ++++++++++++----------- src/CloudClusters.jl | 8 +- src/cluster.jl | 2 +- src/cluster_providers/ec2/ec2_deploy.jl | 21 ++ src/cluster_providers/ec2/ec2_persist.jl | 2 - src/cluster_providers/gcp/gcp_backend.jl | 35 +- src/cluster_providers/gcp/gcp_deploy.jl | 42 ++- src/cluster_providers/gcp/gcp_persist.jl | 11 +- src/config/configs.jl | 4 +- src/deploy.jl | 55 ++-- src/macros.jl | 12 + src/utils.jl | 4 +- 15 files changed, 542 insertions(+), 436 deletions(-) diff --git a/CCconfig.toml b/CCconfig.toml index a851d02..6955a09 100644 --- a/CCconfig.toml +++ b/CCconfig.toml @@ -18,7 +18,8 @@ mpiflags = "" [ec2] -imageid = "ami-0b869698add04fbdc" # found at us-east-1 (North Virginia). To use in other regions, copy it. +imageid = "ami-0bec2868f8f28086f" # found at us-east-1 (North Virginia). To use in other regions, copy it. +#security_group_id = "sg-09e2e7c3eebd45160" # placement_group = "pg-XXXXXXXXXXXX" or "automatic" # security_group_id = "sg-XXXXXXXXXXXX" or "automatic" @@ -26,10 +27,11 @@ imageid = "ami-0b869698add04fbdc" # found at us-east-1 (North Virginia). 
To use
 
 [gcp]
 
-imageid = "hpc-shelf-311900/global/images/cloudclusters-basic-v3"
+imageid = "hpc-shelf-311900/global/images/cloudclusters-basic-v5"
 zone = "us-central1-a"
 project = "hpc-shelf-311900"
 user = "heron"
 exename = "/home/heron/.juliaup/bin/julia"
 directory = "/home/heron"
 mpiflags = "--map-by node --hostfile /home/heron/hostfile"
+# network_interface = "default"
diff --git a/Project.toml b/Project.toml
index a93aa64..3d3d8ef 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "CloudClusters"
 uuid = "4ca6f12b-c8f1-4945-b50f-6bb73234c039"
 authors = ["Francisco Heron de Carvalho Junior e João Marcelo Uchôa de Alencar "]
-version = "0.1.2"
+version = "0.2.0"
 
 [deps]
 AWS = "fbe9abb3-538b-5e4e-ba9e-bc94f4f92ebc"
diff --git a/README.md b/README.md
index d54814d..8b9647e 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,12 @@
+
 ![CloudClusters.jl](https://raw.githubusercontent.com/PlatformAwareProgramming/CloudClusters.jl/refs/heads/main/docs/src/assets/logo-text.svg)
 
 [![TagBot](https://github.com/PlatformAwareProgramming/CloudClusters.jl/actions/workflows/TagBot.yml/badge.svg)](https://github.com/PlatformAwareProgramming/CloudClusters.jl/actions/workflows/TagBot.yml)
+
 [![CompatHelper](https://github.com/PlatformAwareProgramming/CloudClusters.jl/actions/workflows/CompatHelper.yml/badge.svg)](https://github.com/PlatformAwareProgramming/CloudClusters.jl/actions/workflows/CompatHelper.yml)
 
 _A package for creating, using, and managing clusters of virtual machine (VM) instances deployed with IaaS cloud providers._
 
-> [!NOTE]
-> Currently, only [EC2](https://aws.amazon.com/ec2/) is supported. Those interested can ask us about progress with other providers.
-> Contributors are welcome.
-
 ## Target users
 
 _CloudClusters.jl_ targets Julia programming language users who need on-demand access to cutting-edge computing resources that IaaS cloud providers provide to meet high-performance computing (HPC) application requirements.
@@ -17,9 +15,9 @@ _CloudClusters.jl_ targets Julia programming language users who need on-demand a
 
 ### Cloud providers' credentials
 
-Even though _CloudClusters.jl_ currently only supports AWS EC2, it plans to support multiple IaaS cloud providers in the future.
+Currently, _CloudClusters.jl_ supports AWS EC2 and Google Cloud Platform (GCP). Support for other IaaS cloud providers may be added in future versions.
 
-_CloudClusters.jl_ assumes that the user has configured their credentials for the services of their preferred cloud providers in the environment.
+_CloudClusters.jl_ assumes that the user has configured the system with the required credentials for the cloud providers' services they will use. For GCP, _CloudClusters.jl_ starts a session using the [JSON credential file](https://cloud.google.com/docs/authentication/application-default-credentials) informed through the GOOGLE_APPLICATION_CREDENTIALS environment variable. In turn, the EC2 API will look for [credential files](https://docs.aws.amazon.com/cli/v1/userguide/cli-configure-files.html#cli-configure-files-where) in the $HOME/.aws folder.
 
 ### The configuration file (_CCconfig.toml_)
 
@@ -28,67 +26,70 @@ Creating clusters with _CloudClusters.jl_ requires specifying some configuratio
 * the current path;
 * the home path.
 
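For illustration, a minimal _CCconfig.toml_ fragment for the EC2 provider could look as follows (the image id is a placeholder, and the commented parameters are optional, as in the template shipped with the repository):

```toml
[ec2]
imageid = "ami-XXXXXXXXXXXXXXXXX"                       # placeholder image id
# placement_group = "pg-XXXXXXXXXXXX" or "automatic"    # optional
# security_group_id = "sg-XXXXXXXXXXXX" or "automatic"  # optional
# subnet_id = "subnet-XXXXXXXXXXXX"                     # optional
```
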
-Section [Configuration parameters](https://github.com/PlatformAwareProgramming/CloudClusters.jl#configuration-parameters) describes default configuration parameters and how they can be overridden in programs. 
+Section [Configuration parameters](https://github.com/PlatformAwareProgramming/CloudClusters.jl#configuration-parameters) describes default configuration parameters and how they can be overridden in programs.
+
+A [_CCconfig.toml_](https://raw.githubusercontent.com/PlatformAwareProgramming/CloudClusters.jl/refs/heads/main/CCconfig.toml) file is provided in the repository's top-level directory. It is downloaded to the current directory if a _CCconfig.toml_ file is not found. It is configured to create clusters using prebuilt virtual machine images for each supported cloud provider. These images are based on the latest version of Ubuntu and include a Julia installation of a recent stable version with all the packages needed to instantiate the clusters added and precompiled. Users can create customized images, possibly derived from the provided image, using their preferred version of Julia and adding the packages they need.
-A [_CCconfig.toml_](https://raw.githubusercontent.com/PlatformAwareProgramming/CloudClusters.jl/refs/heads/main/CCconfig.toml) file is provided in the repository's top-level directory. It is downloaded to the current directory if a _CCconfig.toml_ file is not found. It is configured to create clusters using prebuilt virtual machine images for each supported cloud provider. These images are based on the latest version of Ubuntu and include a Julia installation of a recent stable version with all the packages needed to instantiate the clusters added and precompiled. Users can create customized images, possibly derived from the provided image, using their preferred version of Julia and adding the packages they need.
+
 > [!WARNING]
 > The version of Julia on the host computer using _CloudClusters.jl_ must be the same version as the image used to deploy the clusters.
 
-
 > [!NOTE]
 > The current prebuilt image for EC2 is located in the _us-east-1_ (North Virginia) region. Suppose the user is going to deploy a cluster in another region. In that case, they must create a copy of the image for that region in their account and assign its id to the ```imageid``` parameter of _CCconfig.toml_.
 
-
+
 ### The _PlatformAware.jl_ package
 
-_CloudClusters.jl_ relies on an experimental package called [_PlatformAware.jl_](https://github.com/PlatformAwareProgramming/PlatformAware.jl) for the specification of _platform types_, aimed at specifying assumptions about architectural features of virtual machines instances. Indeed, _PlatformAware.jl_ may be used with _CloudClusters.jl_ to write functions specifically tuned according to the features of VM instances that comprise the clusters. This is called _platform-aware programming_. The users of _CloudClusters.jl_, particularly package developers, are invited to explore and use the ideas behind _PlatformAware.jl_.
+_CloudClusters.jl_ relies on an experimental package called [_PlatformAware.jl_](https://github.com/PlatformAwareProgramming/PlatformAware.jl) for the specification of _platform types_, aimed at specifying assumptions about architectural features of virtual machine instances. Indeed, _PlatformAware.jl_ may be used with _CloudClusters.jl_ to write functions specifically tuned according to the features of VM instances that comprise the clusters. This is called [_platform-aware programming_](https://sol.sbc.org.br/index.php/sscad/article/view/26529). The users of _CloudClusters.jl_, particularly package developers, are invited to explore and use the ideas behind _PlatformAware.jl_.
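As a sketch of this style (based on the ```@platform default```/```@platform aware``` forms documented by _PlatformAware.jl_; consult that package's README for the exact, up-to-date syntax), a function can be specialized according to the features of the nodes it runs on. The function name ```mykernel``` below is hypothetical:

```julia
using PlatformAware

# fallback implementation, used on any platform
@platform default function mykernel(data)
    # generic CPU code path
end

# specialized implementation, selected when the node offers NVIDIA accelerators
@platform aware function mykernel({accelerator_count::(@atleast 1),
                                   accelerator_manufacturer::NVIDIA}, data)
    # GPU code path
end
```
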
Section [The integration with PlatformAware.jl](https://github.com/PlatformAwareProgramming/CloudClusters.jl#the-integration-with-platformawarejl) provides a deeper discussion about the integration of _PlatformAware.jl_ within _CloudClusters.jl_. # Tutorial -Next, we show a tutorial on how _CloudClusters.jl_ works, divided into two parts: _basic use_ and _advanced use_. +Next, we show a tutorial on how _CloudClusters.jl_ works, divided into two parts: _basic use_ and _advanced use_. -The basic tutorial teaches the reader how to create and deploy computations on ___peer-workers___ clusters, comprising a set of homogeneous VM instances deployed in the infrastructure of an IaaS cloud provider. +The basic tutorial teaches the reader how to create and deploy computations on ___peer-workers___ clusters, comprising a set of homogeneous VM instances deployed in the infrastructure of an IaaS cloud provider. The advanced tutorial includes: * [a deeper discussion about _cluster contracts_](https://github.com/PlatformAwareProgramming/CloudClusters.jl#working-with-cluster-contracts); + * [how to use MPI with ___peer-workers___ clusters](https://github.com/PlatformAwareProgramming/CloudClusters.jl#peer-workers-mpi-clusters); + * [how to create ___manager-workers___ clusters, a kind of cluster that comprises an access node and a set of homogenous compute nodes only accessible through the access node](https://github.com/PlatformAwareProgramming/CloudClusters.jl#manager-workers-clusters); -* [a description of configuration parameters and how programs can override the default values from the _CCconfig.toml_ file](https://github.com/PlatformAwareProgramming/CloudClusters.jl#configuration-parameters). -# Basic use +* [a description of configuration parameters and how programs can override the default values from the _CCconfig.toml_ file](https://github.com/PlatformAwareProgramming/CloudClusters.jl#configuration-parameters). -In what follows, we teach how to create ___peer-workers___ clusters and deploy computations on them using _Distributed.jl_ primitives. +# Basic use -Remember that the AWS credentials must be properly configured in the environment where the Julia REPL session or program will be executed. +In what follows, we teach how to create ___peer-workers___ clusters and deploy computations on them using _Distributed.jl_ primitives. -## How to create a cluster +## How to create a cluster -_CloudClusters.jl_ offers six primitives, as _macros_, to create and manage a cluster's lifecycle. They are: __@cluster__, __@resolve__, __@deploy__, __@terminate__, __@interrupt__, and __@resume__. +_CloudClusters.jl_ offers seven primitives, as _macros_, to create and manage a cluster's lifecycle. They are: __@cluster__, __@resolve__, __@deploy__, __@terminate__, __@interrupt__, __@resume__, and __@restart__. -First, let's try a simple scenario where a user creates a cluster comprising four ___t3.xlarge___ virtual machines (VM) instances through the AWS EC2 services. In the simplest way to do this, the user applies the __@cluster__ macro to the number of nodes and instance type, as arguments. +First, let's try a simple scenario where a user creates a cluster comprising four ___t3.xlarge___ virtual machines (VM) instances through the AWS EC2 services. In the simplest way to do this, the user applies the __@cluster__ macro to the number of nodes and instance type, as arguments. 
```julia
 using CloudClusters
+my_first_contract = @cluster node_count => 4 node_machinetype => PlatformAware.EC2Type_T3_xLarge
+```
 
-my_first_cluster_contract = @cluster node_count => 4 node_machinetype => PlatformAware.EC2Type_T3_xLarge
 
+```EC2Type_T3_xLarge``` is a Julia type from the _PlatformAware.jl_ package that represents the ___t3.xlarge___ instance type (and size) of EC2. _PlatformAware.jl_ offers a hierarchy of types representing instance types of supported providers (e.g., ```MachineType``` → ```EC2Type``` → ```EC2Type_T3``` → ```EC2Type_T3_xLarge``` and ```MachineType``` → ```GCPType``` → ```GCPType_E2``` → ```GCPType_E2_Medium```).
 
-```
-```EC2Type_T3_xLarge``` is a Julia type from the _PlatformAware.jl_ package that represents the ___t3.xlarge___ instance type (and size) of EC2. _PlatformAware.jl_ offers a hierarchy of types representing instance types of supported providers (e.g., ```MachineType``` → ```EC2Type``` → ```EC2Type_T3``` → ```EC2Type_T3_xLarge```).
 
-For example, the user may list all the supported EC2 instance types by executing ```subtypes(PlatformAware.EC2Type)``` in the REPL, or ```subtypes(PlatformAware.EC2Type_T3)``` if the user intends to list the available instance sizes for the ___t3___ instance type.
+The user may list all the supported EC2 instance types by executing ```subtypes(PlatformAware.EC2Type)``` in the REPL, or ```subtypes(PlatformAware.EC2Type_T3)``` if the user intends to list the available instance sizes for the ___t3___ instance type. Analogous queries work for the GCP types.
 
-__@cluster__ does not instantiate a cluster yet. It creates a _cluster contract_ and returns a handle for it. In the example, the _contract handle_ is stored in the _my_first_cluster_contract_ variable, from which the user can create one or more clusters later.
+__@cluster__ does not instantiate a cluster yet. It creates a _cluster contract_ and returns a handle for it. In the example, the _contract handle_ is stored in the _my_first_contract_ variable, from which the user can create one or more clusters later.
 
 > [!NOTE]
-> In _CloudClusters.jl_, a handle is a symbol comprising 15 randomly calculated lower and upper case alphabetic characters (e.g.,```:FXqElAnSeTEpQAm``` ). As symbols, they are printable and may be used directly to refer to a cluster contract.
+
+> In _CloudClusters.jl_, a handle is a symbol comprising 15 random lower and upper case alphabetic characters (e.g., ```:FXqElAnSeTEpQAm```). As symbols, they are printable and may be used directly to refer to a cluster contract.
 
 A cluster contract must be resolved before creating clusters using it. For that, the user needs to apply __@resolve__ to the contract handle, as below:
 
 ```julia
-@resolve my_first_cluster_contract
+@resolve my_first_contract
 ```
 
The __@resolve__ macro triggers a resolution procedure to calculate which instance type offered by one of the supported IaaS providers satisfies the contract. For ```my_first_contract```, the result is explicitly specified: the ___t3.xlarge___ instance type of AWS EC2.
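For illustration, this is how the resolution should look in the REPL (a sketch; the output shape follows the ___@resolve___ example shown later in this document, and the exact display may vary):

```julia-repl
julia> @resolve my_first_contract
1-element Vector{Pair{Symbol, SubString{String}}}:
 :instance_type => "t3.xlarge"
```
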
For advanced contract specifications, where cluster contract resolution shows its power, the reader can read the [Working with cluster contracts](https://github.com/PlatformAwareProgramming/CloudClusters.jl#working-with-cluster-contracts) section.

A cluster may be instantiated by using ___@deploy___:

```julia
my_first_cluster = @deploy my_first_contract
```

-The __@deploy__ macro will create a 4-node cluster comprising ___t3.xlarge___ AWS EC2 instances, returning a cluster handle, assigned to the ```my_first_cluster``` variable. 
+The __@deploy__ macro will create a 4-node cluster comprising ___t3.xlarge___ AWS EC2 instances, returning a cluster handle, assigned to the ```my_first_cluster``` variable.
 
-After __@deploy__, a set of _worker processes_ is created, one at each cluster node. Their _PIDs_ may be inspected by applying the ___nodes___ function to the cluster handle. 
+After __@deploy__, a set of _worker processes_ is created, one at each cluster node. Their _PIDs_ may be inspected by applying the ___@nodes___ macro to the cluster handle.
 
-In the following code, the user fetches the _PIDs_ of the processes running at the nodes of the cluster referred to by ```my_first_cluster```.
+In the following code, the user applies __@nodes__ to fetch the _PIDs_ of the processes running at the nodes of the cluster referred to by ```my_first_cluster```.
 
 ```julia-repl
 julia> @nodes my_first_cluster
 4-element Vector{Int64}:
 2
 3
 4
 5
 ```
 
The example shows that the default number of worker processes per cluster node is 1. However, the user may create N worker processes per cluster node using the ```node_process_count => N``` parameter in the contract specification. For example, in the following contract, the number of worker processes per cluster node is set to 2:

```julia
-@cluster node_count => 4 node_process_count => 2 node_machinetype => EC2Type_T3_xLarge 
+@cluster node_count => 4 node_process_count => 2 node_machinetype => EC2Type_T3_xLarge
```
-
 ## Running computations on the cluster

-The user may execute parallel computations on the cluster using _Distributed.jl_ operations. In fact, the user can employ any parallel/distributed computing package in the Julia ecosystem to launch computations across a set of worker processes. For instance, the advanced tutorial will show how to use _MPI.jl_ integrated with _Distributed.jl_. 
+The user may execute parallel computations on the cluster using _Distributed.jl_ operations. In fact, the user can employ any parallel/distributed computing package in the Julia ecosystem to launch computations across a set of worker processes. For instance, the advanced tutorial will show how to use _MPI.jl_ integrated with _Distributed.jl_.

-The following code, adapted from [The ultimate guide to distributed computing in Julia](https://github.com/Arpeggeo/julia-distributed-computing#the-ultimate-guide-to-distributed-computing-in-julia), processes a set of CSV files in a data folder in parallel, using _pmap_, across the worker processes placed at the cluster nodes. The result of each file processing is saved locally, as a CSV file in a results folder. 
+The following code, adapted from [The ultimate guide to distributed computing in Julia](https://github.com/Arpeggeo/julia-distributed-computing#the-ultimate-guide-to-distributed-computing-in-julia), processes a set of CSV files in a data folder in parallel, using _pmap_, across the worker processes placed at the cluster nodes. The result of each file processing is saved locally, as a CSV file in a results folder. ```julia using Distributed @everywhere cluster_nodes(my_first_cluster) begin - # load dependencies - using ProgressMeter - using CSV - # helper functions - function process(infile, outfile) - # read file from disk - csv = CSV.File(infile) + # load dependencies + using ProgressMeter + using CSV - # perform calculations - sleep(60) + # helper functions + function process(infile, outfile) + # read file from disk + csv = CSV.File(infile) - # save new file to disk - CSV.write(outfile, csv) - end + # perform calculations + sleep(60) + + # save new file to disk + CSV.write(outfile, csv) + end end # MAIN SCRIPT # ----------- # relevant directories -indir = joinpath(@__DIR__,"data") +indir = joinpath(@__DIR__,"data") outdir = joinpath(@__DIR__,"results") # files to process -infiles = readdir(indir, join=true) +infiles = readdir(indir, join=true) outfiles = joinpath.(outdir, basename.(infiles)) -nfiles = length(infiles) - -status = @showprogress pmap(1:nfiles; pids=cluster_nodes(my_first_cluster)) do i - try - process(infiles[i], outfiles[i]) - true # success - catch e - false # failure - end +nfiles = length(infiles) + +status = @showprogress pmap(1:nfiles; pids=cluster_nodes(my_first_cluster)) do i + try + process(infiles[i], outfiles[i]) + true # success + catch e + false # failure + end end - ``` ## Multiple clusters -Users can create cluster contracts and deploy clusters from them as many times as they need. For example, the following code creates a second cluster contract, named ```my_second_cluster_contract```, asking for a cluster comprising eight VM instances equipped with exactly eight NVIDIA GPUs of Ada-Lovelace architecture and at least 512GB of memory per node. Then, it creates two clusters from the new contract. +Users can create cluster contracts and deploy clusters from them as many times as they need. For example, the following code creates a second cluster contract, named ```my_second_cluster_contract```, asking for a cluster comprising eight VM instances equipped with exactly eight NVIDIA GPUs of Ada-Lovelace architecture and at least 512GB of memory per node. Then, it creates two clusters from the new contract. ```julia - -my_second_cluster_contract = @cluster(node_count => 8, - node_memory_size => @atleast(512G), - accelerator_count => @just(8), - accelerator_architecture => Ada) +my_second_contract = @cluster(node_count => 8, + node_memory_size => @atleast(512G), + accelerator_count => @just(8), + accelerator_architecture => Ada) @resolve my_second_cluster_contract @@ -213,36 +212,39 @@ julia> @ nodes my_third_cluster 21 ``` -The user may orchestrate the processing power of multiple clusters to run computations of their interest, independent of their providers. This is _multicluster computation_. However, it is important to note that communication operations between processes placed at nodes of different clusters (inter-cluster communication), mainly when these clusters are deployed at different IaaS providers, must be used with care due to the high communication cost, only when necessary and overlapping communication and computation using asynchronous operations. 
+The user may orchestrate the processing power of multiple clusters to run computations of their interest, independent of their providers. This is _multicluster computation_. However, it is important to note that communication operations between processes placed at nodes of different clusters (inter-cluster communication), mainly when these clusters are deployed at different IaaS providers, must be used with care due to their high cost: they should be employed only when necessary, overlapping communication and computation through asynchronous operations.

## Interrupting and resuming a cluster

-A cluster may be interrupted through the ___@interrupt___ macro: 
+A cluster may be interrupted through the ___@interrupt___ macro:

```julia
@interrupt my_first_cluster
```
-The effect of ___@interrupt___ is pausing/stopping the VM instances of the cluster nodes. 
+
+The effect of ___@interrupt___ is pausing/stopping the VM instances of the cluster nodes. The semantics of interrupting a cluster may vary across IaaS providers.

An interrupted cluster can be brought back to the running state using the ___@resume___ macro:

```julia
@resume my_first_cluster
```
-The resuming operation starts the VM instances and creates a fresh set of worker processes, with new _PIDs_. 
+
+The resume operation starts the VM instances and creates a fresh set of worker processes, with new _PIDs_.

> [!CAUTION]
-> ___@interrupt___ does not preserve the state of undergoing computations in the cluster, since it kills the worker processes running at the cluster nodes. The interruption of a cluster may be used to avoid the cost of cloud resources that are not currently being used. The user is responsible for saving the state of undergoing computations in a cluster to be interrupted and reloading the state after resuming, if necessary.
+
+> ___@interrupt___ does not preserve the state of ongoing computations in the cluster, since it kills the worker processes running at the cluster nodes. The interruption of a cluster may be used to avoid the cost of cloud resources that are not currently being used. The user is responsible for saving the state of ongoing computations in a cluster to be interrupted and reloading the state after resuming, if necessary.

## Restarting processes

-A user can restart the processes at the cluster nodes by using the ___@restart___ macro: 
+A user can restart the processes at the cluster nodes by using the ___@restart___ macro:

```julia
@restart my_first_cluster
```
-The restart procedure kills all the current processes at the cluster nodes, losing their current state, and creates new processes, with fresh _PIDs_. 
+The restart procedure kills all the current processes at the cluster nodes, losing their current state, and creates new processes, with fresh _PIDs_.

## Terminating a cluster

@@ -262,91 +264,89 @@ If a cluster was not terminated in a previous execution of a Julia program or RE
 @reconnect :FXqElAnSeTEpQAm
```

-In the above code, ```:FXqElAnSeTEpQAm``` is the handle of a cluster not terminated in a previous execution session. But how may the user discover the cluster handle of a non-terminated cluster? 
For example, after a system crash? For that, the user may invoke the ___@clusters___ macro, which returns a list of non-terminated clusters in previous sessions that are still alive and can be reconnected:

```julia
julia> @clusters
[ Info: PeerWorkers FXqElAnSeTEpQAm, created at 2024-10-08T09:12:40.847 on PlatformAware.AmazonEC2
1-element Vector{Any}:
- Dict{Any, Any}(:handle => :FXqElAnSeTEpQAm, :provider => PlatformAware.AmazonEC2, :type => PeerWorkers, :timestamp => Dates.DateTime("2024-10-08T09:12:40.847"))
+Dict{Any, Any}(:handle => :FXqElAnSeTEpQAm, :provider => PlatformAware.AmazonEC2, :type => PeerWorkers, :timestamp => Dates.DateTime("2024-10-08T09:12:40.847"))
```

## Advanced Use

### Working with cluster contracts

-As shown in the previous examples of using the ___@cluster___ macro, _CloudClusters.jl_ supports _cluster contracts_ to specify _assumptions_ about cluster _features_, with special attention to the types of VM instances comprising cluster nodes. 
+As shown in the previous examples of using the ___@cluster___ macro, _CloudClusters.jl_ supports _cluster contracts_ to specify _assumptions_ about cluster _features_, with special attention to the types of VM instances comprising cluster nodes.

-Cluster contracts are a set of key-value pairs ```k => v``` called _assumption parameters_, where ```k``` is a name and ```v``` is a value or [_platform type_](). A predefined set of assumption parameters is supported, each with a _name_ and a default value or _base platform type_. 
+Cluster contracts are a set of key-value pairs ```k => v``` called _assumption parameters_, where ```k``` is a name and ```v``` is a value or _platform type_. A predefined set of assumption parameters is supported, each with a _name_ and a default value or _base platform type_.

-The currently supported set of assumption parameters is listed [here](https://github.com/PlatformAwareProgramming/CloudClusters.jl#configuration-parameters), providing a wide spectrum of assumptions for users to specify the architectural characteristics of a cluster to satisfy their needs. Note that assumption parameters are classified into cluster and instance parameters, where _instance parameters_ are the assumption parameters considered in the instance resolution procedure (_resolve_). 
+The currently supported set of assumption parameters is listed [here](https://github.com/PlatformAwareProgramming/CloudClusters.jl#configuration-parameters), providing a wide spectrum of assumptions for users to specify the architectural characteristics of a cluster to satisfy their needs. Note that assumption parameters are classified into cluster and instance parameters, where _instance parameters_ are taken into consideration in the instance resolution procedure (__@resolve__).

-In the case of ```my_first_cluster_contract```, the user uses the assumption parameters ___node_count___ and ___nodes_machinetype___ to specify that the required cluster must have four nodes and that the VM instances that comprise the cluster nodes must be of the ___t3.xlarge___ type, offered by the AWS EC2 provider. This is a direct approach, the simplest and least abstract one, where the resolution procedure, triggered by a call to __@resolve__, will return the EC2's ___t3.xlarge___ as the VM instance type that satisfies the contract. 
- -On the other hand, ```my_second_cluster_contract``` employs an indirect approach, demonstrating that the resolution procedure may look for a VM instance type from a set of abstract assumptions. They are specified using the assumption parameters __accelerator_count__, __accelerator_architecture__, and __accelerator_memory__, asking for cluster nodes with eight GPUs of NVIDIA Ada Lovelace architecture and at least 512GB of memory. Under these assumptions, the call to ___@resolve___ returns the __g6.48xlarge__ instance type of AWS EC2. +In the case of ```my_first_contract```, the user uses the assumption parameters ___node_count___ and ___nodes_machinetype___ to specify that the required cluster must have four nodes and that the VM instances that comprise the cluster nodes must be of the ___t3.xlarge___ type, offered by the AWS EC2 provider. This is a direct approach, the simplest and least abstract one, where the resolution procedure, triggered by a call to __@resolve__, will return the EC2's ___t3.xlarge___ as the VM instance type that satisfies the contract. +On the other hand, ```my_second_contract``` employs an indirect approach, demonstrating that the resolution procedure may look for a VM instance type from a set of abstract assumptions. They are specified using the instance parameters __accelerator_count__, __accelerator_architecture__, and __accelerator_memory__, asking for cluster nodes with eight GPUs of NVIDIA Ada Lovelace architecture and at least 512GB of memory. Under these assumptions, the call to ___@resolve___ returns the __g6.48xlarge__ instance type of AWS EC2. #### List of assumption parameters ___Cluster parameters___ specify features of the cluster: - * __cluster_type__::```Cluster```, denoting the cluster type: ManagerWorkers, PeerWorkers, or PeerWorkersMPI; - * __node_count__::```Integer```, denoting the number of cluster nodes (default to _1_); - * __node_process_count__::```Integer```, denoting the number of Julia processes (MPI ranks) per node (default to _1_). -___Instance parameters___, with their respective base platform types, are listed below: +* __cluster_type__::```Cluster```, denoting the cluster type: ManagerWorkers, PeerWorkers, or PeerWorkersMPI; +* __node_count__::```Integer```, denoting the number of cluster nodes (default to _1_); +* __node_process_count__::```Integer```, denoting the number of Julia processes (MPI ranks) per node (default to _1_). - * __node_provider__::```CloudProvider```, the provider of VM instances for the cluster nodes; - * __cluster_locale__::```Locale```, the geographic location where the cluster nodes will be instantiated; - * __node_machinetype__::```InstanceType```, the VM instance type of cluster nodes; - * __node_memory_size__::```@atleast 0```, the memory size of each cluster node; - * __node_ecu_count__::```@atleast 1```, the EC2 compute unit, a processing performance measure for VM instances (only for EC2 instances); - * __node_vcpus_unit__::```@atleast 1```, the number of virtual CPUs in each cluster node; - * __accelerator_count__::```@atleast 0```, the number of accelerators in the cluster node; - * __accelerator_memory__::```@atleast 0```, the amount of memory of the cluster node accelerators; - * __accelerator_type__::```AcceleratorType```, the type of accelerator; - * __accelerator_manufacturer__::```AcceleratorManufacturer```, the manufacturer of the accelerator; - * __accelerator_arch__::```AcceleratorArchitecture```, the architecture of the accelerator, depending on its type and manufacturer. 
- * __accelerator__::```AcceleratorModel```, the accelerator model; - * __processor_manufacturer__::```Manufacturer```, the processor manufacturer; - * __processor_microarchitecture__::```ProcessorArchitecture```, the processor microarchitecture; - * __processor__::```ProcessorModel```, the processor model; - * __storage_type__::```StorageType```, the type of storage in cluster nodes; - * __storage_size__::```@atleast 0```, the size of the storage in cluster nodes; - * __network_performance__::```@atleast 0```, the network performance between cluster nodes. +___Instance parameters___, with their respective base platform types, are listed below: +* __node_provider__::```CloudProvider```, the provider of VM instances for the cluster nodes; +* __cluster_locale__::```Locale```, the geographic location where the cluster nodes will be instantiated; +* __node_machinetype__::```InstanceType```, the VM instance type of cluster nodes; +* __node_memory_size__::```@atleast 0```, the memory size of each cluster node; +* __node_ecu_count__::```@atleast 1```, the EC2 compute unit, a processing performance measure for VM instances (only for EC2 instances); +* __node_vcpus_unit__::```@atleast 1```, the number of virtual CPUs in each cluster node; +* __accelerator_count__::```@atleast 0```, the number of accelerators in the cluster node; +* __accelerator_memory__::```@atleast 0```, the amount of memory of the cluster node accelerators; +* __accelerator_type__::```AcceleratorType```, the type of accelerator; +* __accelerator_manufacturer__::```AcceleratorManufacturer```, the manufacturer of the accelerator; +* __accelerator_arch__::```AcceleratorArchitecture```, the architecture of the accelerator, depending on its type and manufacturer. +* __accelerator__::```AcceleratorModel```, the accelerator model; +* __processor_manufacturer__::```Manufacturer```, the processor manufacturer; +* __processor_microarchitecture__::```ProcessorArchitecture```, the processor microarchitecture; +* __processor__::```ProcessorModel```, the processor model; +* __storage_type__::```StorageType```, the type of storage in cluster nodes; +* __storage_size__::```@atleast 0```, the size of the storage in cluster nodes; +* __network_performance__::```@atleast 0```, the network performance between cluster nodes. Most platform types are specified in the _PlatformAware.jl_ package. The user may open a REPL section to query types defined in _PlatformAware.jl_. For example, the user may apply the [```subtypes``` function](https://www.jlhub.com/julia/manual/en/function/subtypes) to know the subtypes of a given platform type, which define the available choices: ```julia-repl - julia> using PlatformAware julia> subtypes(Accelerator) 3-element Vector{Any}: - NVIDIAAccelerator - AMDAccelerator - IntelAccelerator +NVIDIAAccelerator +AMDAccelerator +IntelAccelerator julia> subtypes(EC2Type_T3) 8-element Vector{Any}: - EC2Type_T3A - EC2Type_T3_2xLarge - EC2Type_T3_Large - EC2Type_T3_Medium - EC2Type_T3_Micro - EC2Type_T3_Nano - EC2Type_T3_Small - EC2Type_T3_xLarge +EC2Type_T3A +EC2Type_T3_2xLarge +EC2Type_T3_Large +EC2Type_T3_Medium +EC2Type_T3_Micro +EC2Type_T3_Nano +EC2Type_T3_Small +EC2Type_T3_xLarge ``` - + #### Querying contracts In the current implementation of _CloudClusters.jl_, since contract resolution, using ___@resolve___, is implemented on top of Julia's multiple dispatch mechanism, it does not support ambiguity, i.e., only a single VM instance type must satisfy the contract. 
diff --git a/docs/src/index.md b/docs/src/index.md
index f209d98..8b9647e 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -1,14 +1,12 @@
+
 ![CloudClusters.jl](https://raw.githubusercontent.com/PlatformAwareProgramming/CloudClusters.jl/refs/heads/main/docs/src/assets/logo-text.svg)

 [![TagBot](https://github.com/PlatformAwareProgramming/CloudClusters.jl/actions/workflows/TagBot.yml/badge.svg)](https://github.com/PlatformAwareProgramming/CloudClusters.jl/actions/workflows/TagBot.yml)
+
 [![CompatHelper](https://github.com/PlatformAwareProgramming/CloudClusters.jl/actions/workflows/CompatHelper.yml/badge.svg)](https://github.com/PlatformAwareProgramming/CloudClusters.jl/actions/workflows/CompatHelper.yml)

 _A package for creating, using, and managing clusters of virtual machine (VM) instances deployed with IaaS cloud providers._

-> [!NOTE]
-> Currently, only [EC2](https://aws.amazon.com/ec2/) is supported. Those interested can ask us about progress with other providers.
-> Contributors are welcome.
-
 ## Target users

 _CloudClusters.jl_ targets Julia programming language users who need on-demand access to cutting-edge computing resources that IaaS cloud providers provide to meet high-performance computing (HPC) application requirements.

@@ -17,9 +15,9 @@ _CloudClusters.jl_ targets Julia programming language users who need on-demand a

 ### Cloud providers' credentials

-Even though _CloudClusters.jl_ currently only supports AWS EC2, it plans to support multiple IaaS cloud providers in the future.
+Currently, _CloudClusters.jl_ supports AWS EC2 and Google Cloud Platform (GCP). Support for other IaaS cloud providers may be added in future versions.

-_CloudClusters.jl_ assumes that the user has configured their credentials for the services of their preferred cloud providers in the environment.
+_CloudClusters.jl_ assumes that the user has configured the system with the required credentials for the cloud providers' services they will use. For GCP, _CloudClusters.jl_ starts a session using the [JSON credential file](https://cloud.google.com/docs/authentication/application-default-credentials) supplied via the GOOGLE_APPLICATION_CREDENTIALS environment variable. In turn, the EC2 API will look for [credential files](https://docs.aws.amazon.com/cli/v1/userguide/cli-configure-files.html#cli-configure-files-where) in the $HOME/.aws folder, as sketched below.

 ### The configuration file (_CCconfig.toml_)

@@ -28,67 +26,70 @@ Creating clusters with _CloudClusters.jl_ requires specifying some configuration
 * the current path;
 * the home path.

-Section [Configuration parameters](https://github.com/PlatformAwareProgramming/CloudClusters.jl#configuration-parameters) describes default configuration parameters and how they can be overridden in programs.
+Section [Configuration parameters](https://github.com/PlatformAwareProgramming/CloudClusters.jl#configuration-parameters) describes default configuration parameters and how they can be overridden in programs.
+
+A [_CCconfig.toml_](https://raw.githubusercontent.com/PlatformAwareProgramming/CloudClusters.jl/refs/heads/main/CCconfig.toml) file is provided in the repository's top-level directory. It is downloaded to the current directory if a _CCconfig.toml_ file is not found. It is configured to create clusters using prebuilt virtual machine images for each supported cloud provider. These images are based on the latest version of Ubuntu and include a Julia installation of a recent stable version with all the packages needed to instantiate the clusters added and precompiled. Users can create customized images, possibly derived from the provided image, using their preferred version of Julia and adding the packages they need.

-A [_CCconfig.toml_](https://raw.githubusercontent.com/PlatformAwareProgramming/CloudClusters.jl/refs/heads/main/CCconfig.toml) file is provided in the repository's top-level directory. It is downloaded to the current directory if a _CCconfig.toml_ file is not found. It is configured to create clusters using prebuilt virtual machine images for each supported cloud provider. These images are based on the latest version of Ubuntu and include a Julia installation of a recent stable version with all the packages needed to instantiate the clusters added and precompiled. Users can create customized images, possibly derived from the provided image, using their preferred version of Julia and adding the packages they need.
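As a minimal sketch of the credential setup described above (the JSON path is illustrative, not a real file):

```julia
# Hypothetical setup: point the GCP session at a service-account JSON file
# before loading the package; EC2 credentials are read from the standard
# $HOME/.aws files, so no environment variable is needed for AWS.
ENV["GOOGLE_APPLICATION_CREDENTIALS"] = "/home/me/my-gcp-credentials.json"

using CloudClusters
```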
+
 > [!WARNING]
 > The version of Julia on the host computer using _CloudClusters.jl_ must be the same version as the image used to deploy the clusters.

 > [!NOTE]
 > The current prebuilt image for EC2 is located at the _us-east-1_ (North Virginia) region. Suppose the user is going to deploy a cluster in another region. In that case, they must create a copy of the image for that region in their account and assign its ID to the ```imageid``` parameter of _CCconfig.toml_.

 ### The _PlatformAware.jl_ package

-_CloudClusters.jl_ relies on an experimental package called [_PlatformAware.jl_](https://github.com/PlatformAwareProgramming/PlatformAware.jl) for the specification of _platform types_, aimed at specifying assumptions about architectural features of virtual machines instances. Indeed, _PlatformAware.jl_ may be used with _CloudClusters.jl_ to write functions specifically tuned according to the features of VM instances that comprise the clusters. This is called _platform-aware programming_. The users of _CloudClusters.jl_, particularly package developers, are invited to explore and use the ideas behind _PlatformAware.jl_.
+_CloudClusters.jl_ relies on an experimental package called [_PlatformAware.jl_](https://github.com/PlatformAwareProgramming/PlatformAware.jl) for the specification of _platform types_, aimed at specifying assumptions about architectural features of virtual machine instances. Indeed, _PlatformAware.jl_ may be used with _CloudClusters.jl_ to write functions specifically tuned according to the features of VM instances that comprise the clusters. This is called [_platform-aware programming_](https://sol.sbc.org.br/index.php/sscad/article/view/26529). The users of _CloudClusters.jl_, particularly package developers, are invited to explore and use the ideas behind _PlatformAware.jl_.

 Section [The integration with PlatformAware.jl](https://github.com/PlatformAwareProgramming/CloudClusters.jl#the-integration-with-platformawarejl) provides a deeper discussion about the integration of _PlatformAware.jl_ within _CloudClusters.jl_.

 # Tutorial

-Next, we show a tutorial on how _CloudClusters.jl_ works, divided into two parts: _basic use_ and _advanced use_.
+Next, we show a tutorial on how _CloudClusters.jl_ works, divided into two parts: _basic use_ and _advanced use_.

-The basic tutorial teaches the reader how to create and deploy computations on ___peer-workers___ clusters, comprising a set of homogeneous VM instances deployed in the infrastructure of an IaaS cloud provider.
+The basic tutorial teaches the reader how to create and deploy computations on ___peer-workers___ clusters, comprising a set of homogeneous VM instances deployed in the infrastructure of an IaaS cloud provider.

 The advanced tutorial includes:

 * [a deeper discussion about _cluster contracts_](https://github.com/PlatformAwareProgramming/CloudClusters.jl#working-with-cluster-contracts);
+
+* [how to use MPI with ___peer-workers___ clusters](https://github.com/PlatformAwareProgramming/CloudClusters.jl#peer-workers-mpi-clusters);
+
+* [how to create ___manager-workers___ clusters, a kind of cluster that comprises an access node and a set of homogeneous compute nodes only accessible through the access node](https://github.com/PlatformAwareProgramming/CloudClusters.jl#manager-workers-clusters);
-# Basic use
+* [a description of configuration parameters and how programs can override the default values from the _CCconfig.toml_ file](https://github.com/PlatformAwareProgramming/CloudClusters.jl#configuration-parameters).

-In what follows, we teach how to create ___peer-workers___ clusters and deploy computations on them using _Distributed.jl_ primitives.
+# Basic use

-Remember that the AWS credentials must be properly configured in the environment where the Julia REPL session or program will be executed.
+In what follows, we teach how to create ___peer-workers___ clusters and deploy computations on them using _Distributed.jl_ primitives.

-## How to create a cluster
+## How to create a cluster

-_CloudClusters.jl_ offers six primitives, as _macros_, to create and manage a cluster's lifecycle. They are: __@cluster__, __@resolve__, __@deploy__, __@terminate__, __@interrupt__, and __@resume__.
+_CloudClusters.jl_ offers seven primitives, as _macros_, to create and manage a cluster's lifecycle. They are: __@cluster__, __@resolve__, __@deploy__, __@terminate__, __@interrupt__, __@resume__, and __@restart__.

-First, let's try a simple scenario where a user creates a cluster comprising four ___t3.xlarge___ virtual machines (VM) instances through the AWS EC2 services. In the simplest way to do this, the user applies the __@cluster__ macro to the number of nodes and instance type, as arguments.
+First, let's try a simple scenario where a user creates a cluster comprising four ___t3.xlarge___ virtual machine (VM) instances through the AWS EC2 services. In the simplest way to do this, the user applies the __@cluster__ macro to the number of nodes and instance type, as arguments.

 ```julia
 using CloudClusters
+my_first_contract = @cluster node_count => 4 node_machinetype => PlatformAware.EC2Type_T3_xLarge
+```

-my_first_cluster_contract = @cluster node_count => 4 node_machinetype => PlatformAware.EC2Type_T3_xLarge
+```EC2Type_T3_xLarge``` is a Julia type from the _PlatformAware.jl_ package that represents the ___t3.xlarge___ instance type (and size) of EC2. _PlatformAware.jl_ offers a hierarchy of types representing instance types of supported providers (e.g., ```MachineType``` → ```EC2Type``` → ```EC2Type_T3``` → ```EC2Type_T3_xLarge``` and ```MachineType``` → ```GCPType``` → ```GCPType_E2``` → ```GCPType_E2_Medium```).

-```
-```EC2Type_T3_xLarge``` is a Julia type from the _PlatformAware.jl_ package that represents the ___t3.xlarge___ instance type (and size) of EC2. _PlatformAware.jl_ offers a hierarchy of types representing instance types of supported providers (e.g., ```MachineType``` → ```EC2Type``` → ```EC2Type_T3``` → ```EC2Type_T3_xLarge```).
-For example, the user may list all the supported EC2 instance types by executing ```subtypes(PlatformAware.EC2Type)``` in the REPL, or ```subtypes(PlatformAware.EC2Type_T3)``` if the user intends to list the available instance sizes for the ___t3___ instance type.
+The user may list all the supported EC2 instance types by executing ```subtypes(PlatformAware.EC2Type)``` in the REPL, or ```subtypes(PlatformAware.EC2Type_T3)``` if the user intends to list the available instance sizes for the ___t3___ instance type. The same can be done, analogously, for the GCP instance types.

+__@cluster__ does not instantiate a cluster yet.
It creates a _cluster contract_ and returns a handle for it. In the example, the _contract handle_ is stored in the _my_first_contract_ variable, from which the user can create one or more clusters later.

 > [!NOTE]
-> In _CloudClusters.jl_, a handle is a symbol comprising 15 randomly calculated lower and upper case alphabetic characters (e.g.,```:FXqElAnSeTEpQAm``` ). As symbols, they are printable and may be used directly to refer to a cluster contract.
+
+> In _CloudClusters.jl_, a handle is a symbol comprising 15 random lower and upper case alphabetic characters (e.g., ```:FXqElAnSeTEpQAm```). As symbols, they are printable and may be used directly to refer to a cluster contract.

 A cluster contract must be resolved before creating clusters using it. For that, the user needs to apply __@resolve__ to the contract handle, as below:

 ```julia
-@resolve my_first_cluster_contract
+@resolve my_first_contract
 ```

-The __@resolve__ macro triggers a resolution procedure to calculate which instance type offered by one of the supported IaaS providers satisfies the contract. For ```my_first_cluster_contract```, the result is explicitly specified: the ___t3.xlarge___ instance type of AWS EC2. For advanced contract specifications, where cluster contract resolution shows its power, the reader can read the [Working with cluster contracts](https://github.com/PlatformAwareProgramming/CloudClusters.jl#working-with-cluster-contracts) section.
+The __@resolve__ macro triggers a resolution procedure to calculate which instance type offered by one of the supported IaaS providers satisfies the contract. For ```my_first_contract```, the result is explicitly specified: the ___t3.xlarge___ instance type of AWS EC2. For advanced contract specifications, where cluster contract resolution shows its power, the reader can read the [Working with cluster contracts](https://github.com/PlatformAwareProgramming/CloudClusters.jl#working-with-cluster-contracts) section.

 A cluster may be instantiated by using ___@deploy___:

@@ -96,11 +97,11 @@ A cluster may be instantiated by using ___@deploy___:
 my_first_cluster = @deploy my_first_contract
 ```

-The __@deploy__ macro will create a 4-node cluster comprising ___t3.xlarge___ AWS EC2 instances, returning a cluster handle, assigned to the ```my_first_cluster``` variable.
+The __@deploy__ macro will create a 4-node cluster comprising ___t3.xlarge___ AWS EC2 instances, returning a cluster handle, assigned to the ```my_first_cluster``` variable.

-After __@deploy__, a set of _worker processes_ is created, one at each cluster node. Their _PIDs_ may be inspected by applying the ___nodes___ function to the cluster handle.
+After __@deploy__, a set of _worker processes_ is created, one at each cluster node. Their _PIDs_ may be inspected by applying the ___@nodes___ macro to the cluster handle.

-In the following code, the user fetches the _PIDs_ of the processes running at the nodes of the cluster referred to by ```my_first_cluster```.
+In the following code, the user applies __@nodes__ to fetch the _PIDs_ of the processes running at the nodes of the cluster referred to by ```my_first_cluster```.

 ```julia-repl
 julia> @nodes my_first_cluster
@@ -114,70 +115,68 @@ julia> @nodes my_first_cluster

 The example shows that the default number of worker processes per cluster node is 1. However, the user may create N worker processes per cluster node using the ```node_process_count => N``` parameter in the contract specification.
For example, in the following contract, the number of worker processes per cluster node is set to 2: ```julia -@cluster node_count => 4 node_process_count => 2 node_machinetype => EC2Type_T3_xLarge +@cluster node_count => 4 node_process_count => 2 node_machinetype => EC2Type_T3_xLarge ``` - ## Running computations on the cluster -The user may execute parallel computations on the cluster using _Distributed.jl_ operations. In fact, the user can employ any parallel/distributed computing package in the Julia ecosystem to launch computations across a set of worker processes. For instance, the advanced tutorial will show how to use _MPI.jl_ integrated with _Distributed.jl_. +The user may execute parallel computations on the cluster using _Distributed.jl_ operations. In fact, the user can employ any parallel/distributed computing package in the Julia ecosystem to launch computations across a set of worker processes. For instance, the advanced tutorial will show how to use _MPI.jl_ integrated with _Distributed.jl_. -The following code, adapted from [The ultimate guide to distributed computing in Julia](https://github.com/Arpeggeo/julia-distributed-computing#the-ultimate-guide-to-distributed-computing-in-julia), processes a set of CSV files in a data folder in parallel, using _pmap_, across the worker processes placed at the cluster nodes. The result of each file processing is saved locally, as a CSV file in a results folder. +The following code, adapted from [The ultimate guide to distributed computing in Julia](https://github.com/Arpeggeo/julia-distributed-computing#the-ultimate-guide-to-distributed-computing-in-julia), processes a set of CSV files in a data folder in parallel, using _pmap_, across the worker processes placed at the cluster nodes. The result of each file processing is saved locally, as a CSV file in a results folder. ```julia using Distributed @everywhere cluster_nodes(my_first_cluster) begin - # load dependencies - using ProgressMeter - using CSV - # helper functions - function process(infile, outfile) - # read file from disk - csv = CSV.File(infile) + # load dependencies + using ProgressMeter + using CSV - # perform calculations - sleep(60) + # helper functions + function process(infile, outfile) + # read file from disk + csv = CSV.File(infile) - # save new file to disk - CSV.write(outfile, csv) - end + # perform calculations + sleep(60) + + # save new file to disk + CSV.write(outfile, csv) + end end # MAIN SCRIPT # ----------- # relevant directories -indir = joinpath(@__DIR__,"data") +indir = joinpath(@__DIR__,"data") outdir = joinpath(@__DIR__,"results") # files to process -infiles = readdir(indir, join=true) +infiles = readdir(indir, join=true) outfiles = joinpath.(outdir, basename.(infiles)) -nfiles = length(infiles) - -status = @showprogress pmap(1:nfiles; pids=cluster_nodes(my_first_cluster)) do i - try - process(infiles[i], outfiles[i]) - true # success - catch e - false # failure - end +nfiles = length(infiles) + +status = @showprogress pmap(1:nfiles; pids=cluster_nodes(my_first_cluster)) do i + try + process(infiles[i], outfiles[i]) + true # success + catch e + false # failure + end end - ``` ## Multiple clusters -Users can create cluster contracts and deploy clusters from them as many times as they need. For example, the following code creates a second cluster contract, named ```my_second_cluster_contract```, asking for a cluster comprising eight VM instances equipped with exactly eight NVIDIA GPUs of Ada-Lovelace architecture and at least 512GB of memory per node. 
Then, it creates two clusters from the new contract.

 ```julia
-my_second_cluster_contract = @cluster(node_count => 8,
-                                      node_memory_size => @atleast(512G),
-                                      accelerator_count => @just(8),
-                                      accelerator_architecture => Ada)
+my_second_contract = @cluster(node_count => 8,
+                              node_memory_size => @atleast(512G),
+                              accelerator_count => @just(8),
+                              accelerator_architecture => Ada)

 @resolve my_second_contract

@@ -213,36 +212,39 @@ julia> @nodes my_third_cluster
 21
 ```

-The user may orchestrate the processing power of multiple clusters to run computations of their interest, independent of their providers. This is _multicluster computation_. However, it is important to note that communication operations between processes placed at nodes of different clusters (inter-cluster communication), mainly when these clusters are deployed at different IaaS providers, must be used with care due to the high communication cost, only when necessary and overlapping communication and computation using asynchronous operations.
+The user may orchestrate the processing power of multiple clusters to run computations of their interest, independent of their providers. This is _multicluster computation_. However, communication between processes placed at nodes of different clusters (inter-cluster communication), mainly when these clusters are deployed at different IaaS providers, must be used with care due to its high cost: only when necessary, and preferably overlapping communication with computation through asynchronous operations.

 ## Interrupting and resuming a cluster

-A cluster may be interrupted through the ___@interrupt___ macro:
+A cluster may be interrupted through the ___@interrupt___ macro:

 ```julia
 @interrupt my_first_cluster
 ```
-The effect of ___@interrupt___ is pausing/stopping the VM instances of the cluster nodes.
+
+The effect of ___@interrupt___ is pausing/stopping the VM instances of the cluster nodes. The semantics of interrupting a cluster may vary across IaaS providers.

 An interrupted cluster can be brought back to the running state using the ___@resume___ macro:

 ```julia
 @resume my_first_cluster
 ```
-The resuming operation starts the VM instances and creates a fresh set of worker processes, with new _PIDs_.
+
+The resume operation starts the VM instances and creates a fresh set of worker processes, with new _PIDs_.

 > [!CAUTION]
-> ___@interrupt___ does not preserve the state of undergoing computations in the cluster, since it kills the worker processes running at the cluster nodes. The interruption of a cluster may be used to avoid the cost of cloud resources that are not currently being used. The user is responsible for saving the state of undergoing computations in a cluster to be interrupted and reloading the state after resuming, if necessary.
+
+> ___@interrupt___ does not preserve the state of ongoing computations in the cluster, since it kills the worker processes running at the cluster nodes. The interruption of a cluster may be used to avoid the cost of cloud resources that are not currently being used.
The user is responsible for saving the state of ongoing computations in a cluster to be interrupted and reloading the state after resuming, if necessary.

 ## Restarting processes

-A user can restart the processes at the cluster nodes by using the ___@restart___ macro:
+A user can restart the processes at the cluster nodes by using the ___@restart___ macro:

 ```julia
 @restart my_first_cluster
 ```

-The restart procedure kills all the current processes at the cluster nodes, losing their current state, and creates new processes, with fresh _PIDs_.
+The restart procedure kills all the current processes at the cluster nodes, losing their current state, and creates new processes, with fresh _PIDs_.

 ## Terminating a cluster

@@ -262,91 +264,89 @@ If a cluster was not terminated in a previous execution of a Julia program or RE

 @reconnect :FXqElAnSeTEpQAm
 ```

-In the above code, ```:FXqElAnSeTEpQAm``` is the handle of a cluster not terminated in a previous execution session. But how may the user discover the cluster handle of a non-terminated cluster? For example, after a system crash? For that, the user may call the ___@clusters___ macro, which returns a list of non-terminated clusters in previous sessions that are still alive and can be reconnected:
+In the above code, ```:FXqElAnSeTEpQAm``` is the handle of a cluster not terminated in a previous execution session. But how may the user discover the handle of a non-terminated cluster, for example, after a system crash? For that, the user may invoke the ___@clusters___ macro, which returns a list of non-terminated clusters from previous sessions that are still alive and can be reconnected:

 ```julia
 julia> @clusters
 [ Info: PeerWorkers FXqElAnSeTEpQAm, created at 2024-10-08T09:12:40.847 on PlatformAware.AmazonEC2
 1-element Vector{Any}:
- Dict{Any, Any}(:handle => :FXqElAnSeTEpQAm, :provider => PlatformAware.AmazonEC2, :type => PeerWorkers, :timestamp => Dates.DateTime("2024-10-08T09:12:40.847"))
+Dict{Any, Any}(:handle => :FXqElAnSeTEpQAm, :provider => PlatformAware.AmazonEC2, :type => PeerWorkers, :timestamp => Dates.DateTime("2024-10-08T09:12:40.847"))
 ```

 ## Advanced Use

-### Working with cluster contracts
+### Working with cluster contracts

-As shown in the previous examples of using the ___@cluster___ macro, _CloudClusters.jl_ supports _cluster contracts_ to specify _assumptions_ about cluster _features_, with special attention to the types of VM instances comprising cluster nodes.
+As shown in the previous examples of using the ___@cluster___ macro, _CloudClusters.jl_ supports _cluster contracts_ to specify _assumptions_ about cluster _features_, with special attention to the types of VM instances comprising cluster nodes.

-Cluster contracts are a set of key-value pairs ```k => v``` called _assumption parameters_, where ```k``` is a name and ```v``` is a value or [_platform type_](). A predefined set of assumption parameters is supported, each with a _name_ and a default value or _base platform type_.
+A cluster contract is a set of key-value pairs ```k => v``` called _assumption parameters_, where ```k``` is a name and ```v``` is a value or _platform type_. A predefined set of assumption parameters is supported, each with a _name_ and a default value or _base platform type_, as illustrated by the sketch below.
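As an illustrative sketch of the ```k => v``` form just described (the contract name is hypothetical), the following contract combines a plain value (```node_count```) with a platform type (```node_provider```), both taken from the assumption parameters listed below:

```julia
# Hypothetical contract: two nodes, restricted to AWS EC2 instance types.
small_ec2_contract = @cluster(node_count => 2,
                              node_provider => AmazonEC2)
```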
-
 The currently supported set of assumption parameters is listed [here](https://github.com/PlatformAwareProgramming/CloudClusters.jl#configuration-parameters), providing a wide spectrum of assumptions for users to specify the architectural characteristics of a cluster to satisfy their needs. Note that assumption parameters are classified into cluster and instance parameters, where _instance parameters_ are the assumption parameters considered in the instance resolution procedure (_resolve_).
+The currently supported set of assumption parameters is listed [here](https://github.com/PlatformAwareProgramming/CloudClusters.jl#configuration-parameters), providing a wide spectrum of assumptions for users to specify the architectural characteristics of a cluster to satisfy their needs. Note that assumption parameters are classified into cluster and instance parameters, where _instance parameters_ are the ones taken into consideration in the instance resolution procedure (__@resolve__).

-In the case of ```my_first_cluster_contract```, the user uses the assumption parameters ___node_count___ and ___nodes_machinetype___ to specify that the required cluster must have four nodes and that the VM instances that comprise the cluster nodes must be of the ___t3.xlarge___ type, offered by the AWS EC2 provider. This is a direct approach, the simplest and least abstract one, where the resolution procedure, triggered by a call to __@resolve__, will return the EC2's ___t3.xlarge___ as the VM instance type that satisfies the contract.
-
-On the other hand, ```my_second_cluster_contract``` employs an indirect approach, demonstrating that the resolution procedure may look for a VM instance type from a set of abstract assumptions. They are specified using the assumption parameters __accelerator_count__, __accelerator_architecture__, and __accelerator_memory__, asking for cluster nodes with eight GPUs of NVIDIA Ada Lovelace architecture and at least 512GB of memory. Under these assumptions, the call to ___@resolve___ returns the __g6.48xlarge__ instance type of AWS EC2.

+In the case of ```my_first_contract```, the user applies the assumption parameters ___node_count___ and ___node_machinetype___ to specify that the required cluster must have four nodes and that the VM instances that comprise the cluster nodes must be of the ___t3.xlarge___ type, offered by the AWS EC2 provider. This is a direct approach, the simplest and least abstract one, where the resolution procedure, triggered by a call to __@resolve__, will return the EC2's ___t3.xlarge___ as the VM instance type that satisfies the contract.
+On the other hand, ```my_second_contract``` employs an indirect approach, demonstrating that the resolution procedure may look for a VM instance type from a set of abstract assumptions. They are specified using the instance parameters __accelerator_count__, __accelerator_architecture__, and __accelerator_memory__, asking for cluster nodes with eight GPUs of NVIDIA Ada Lovelace architecture and at least 512GB of memory. Under these assumptions, the call to ___@resolve___ returns the __g6.48xlarge__ instance type of AWS EC2.

 #### List of assumption parameters

 ___Cluster parameters___ specify features of the cluster:

- * __cluster_type__::```Cluster```, denoting the cluster type: ManagerWorkers, PeerWorkers, or PeerWorkersMPI;
- * __node_count__::```Integer```, denoting the number of cluster nodes (default to _1_);
- * __node_process_count__::```Integer```, denoting the number of Julia processes (MPI ranks) per node (default to _1_).
-___Instance parameters___, with their respective base platform types, are listed below:

+* __cluster_type__::```Cluster```, denoting the cluster type: ManagerWorkers, PeerWorkers, or PeerWorkersMPI;
+* __node_count__::```Integer```, denoting the number of cluster nodes (defaults to _1_);
+* __node_process_count__::```Integer```, denoting the number of Julia processes (MPI ranks) per node (defaults to _1_).

- * __node_provider__::```CloudProvider```, the provider of VM instances for the cluster nodes;
- * __cluster_locale__::```Locale```, the geographic location where the cluster nodes will be instantiated;
- * __node_machinetype__::```InstanceType```, the VM instance type of cluster nodes;
- * __node_memory_size__::```@atleast 0```, the memory size of each cluster node;
- * __node_ecu_count__::```@atleast 1```, the EC2 compute unit, a processing performance measure for VM instances (only for EC2 instances);
- * __node_vcpus_unit__::```@atleast 1```, the number of virtual CPUs in each cluster node;
- * __accelerator_count__::```@atleast 0```, the number of accelerators in the cluster node;
- * __accelerator_memory__::```@atleast 0```, the amount of memory of the cluster node accelerators;
- * __accelerator_type__::```AcceleratorType```, the type of accelerator;
- * __accelerator_manufacturer__::```AcceleratorManufacturer```, the manufacturer of the accelerator;
- * __accelerator_arch__::```AcceleratorArchitecture```, the architecture of the accelerator, depending on its type and manufacturer.
- * __accelerator__::```AcceleratorModel```, the accelerator model;
- * __processor_manufacturer__::```Manufacturer```, the processor manufacturer;
- * __processor_microarchitecture__::```ProcessorArchitecture```, the processor microarchitecture;
- * __processor__::```ProcessorModel```, the processor model;
- * __storage_type__::```StorageType```, the type of storage in cluster nodes;
- * __storage_size__::```@atleast 0```, the size of the storage in cluster nodes;
- * __network_performance__::```@atleast 0```, the network performance between cluster nodes.

+___Instance parameters___, with their respective base platform types, are listed below:

+* __node_provider__::```CloudProvider```, the provider of VM instances for the cluster nodes;
+* __cluster_locale__::```Locale```, the geographic location where the cluster nodes will be instantiated;
+* __node_machinetype__::```InstanceType```, the VM instance type of cluster nodes;
+* __node_memory_size__::```@atleast 0```, the memory size of each cluster node;
+* __node_ecu_count__::```@atleast 1```, the EC2 compute unit, a processing performance measure for VM instances (only for EC2 instances);
+* __node_vcpus_unit__::```@atleast 1```, the number of virtual CPUs in each cluster node;
+* __accelerator_count__::```@atleast 0```, the number of accelerators in the cluster node;
+* __accelerator_memory__::```@atleast 0```, the amount of memory of the cluster node accelerators;
+* __accelerator_type__::```AcceleratorType```, the type of accelerator;
+* __accelerator_manufacturer__::```AcceleratorManufacturer```, the manufacturer of the accelerator;
+* __accelerator_arch__::```AcceleratorArchitecture```, the architecture of the accelerator, depending on its type and manufacturer;
+* __accelerator__::```AcceleratorModel```, the accelerator model;
+* __processor_manufacturer__::```Manufacturer```, the processor manufacturer;
+* __processor_microarchitecture__::```ProcessorArchitecture```, the processor microarchitecture;
+* __processor__::```ProcessorModel```, the processor model;
+* __storage_type__::```StorageType```, the type of storage in cluster nodes;
+* __storage_size__::```@atleast 0```, the size of the storage in cluster nodes;
+* __network_performance__::```@atleast 0```, the network performance between cluster nodes.

 Most platform types are specified in the _PlatformAware.jl_ package. The user may open a REPL session to query the types defined in _PlatformAware.jl_. For example, the user may apply the [```subtypes``` function](https://www.jlhub.com/julia/manual/en/function/subtypes) to know the subtypes of a given platform type, which define the available choices:

 ```julia-repl
 julia> using PlatformAware

 julia> subtypes(Accelerator)
 3-element Vector{Any}:
- NVIDIAAccelerator
- AMDAccelerator
- IntelAccelerator
+NVIDIAAccelerator
+AMDAccelerator
+IntelAccelerator

 julia> subtypes(EC2Type_T3)
 8-element Vector{Any}:
- EC2Type_T3A
- EC2Type_T3_2xLarge
- EC2Type_T3_Large
- EC2Type_T3_Medium
- EC2Type_T3_Micro
- EC2Type_T3_Nano
- EC2Type_T3_Small
- EC2Type_T3_xLarge
+EC2Type_T3A
+EC2Type_T3_2xLarge
+EC2Type_T3_Large
+EC2Type_T3_Medium
+EC2Type_T3_Micro
+EC2Type_T3_Nano
+EC2Type_T3_Small
+EC2Type_T3_xLarge
 ```

 #### Querying contracts

 In the current implementation of _CloudClusters.jl_, since contract resolution, using ___@resolve___, is implemented on top of Julia's multiple dispatch mechanism, it does not support ambiguity, i.e., only a single VM instance type must satisfy the contract. Otherwise, ___@resolve___ returns an ambiguity error, like in the example below:

 ```julia-repl
-julia> cc = @cluster(node_count => 4,
+julia> cc = @cluster(node_count => 4,
                      accelerator_count => @atleast(4),
-                     accelerator_architecture => Ada,
+                     accelerator_architecture => Ada,
                      node_memory_size => @atleast(256G))
 :NKPlCvagfSSpIgD

@@ -354,91 +354,91 @@

 julia> @resolve cc
 ERROR: MethodError: resolve(::Type{CloudProvider}, ::Type{MachineType}, ::Type{Tuple{AtLeast256G, AtMostInf, var"#92#X"} where var"#92#X"}, ::Type{Tuple{AtLeast1, AtMostInf, Q} where Q}, ::Type{Tuple{AtLeast4, AtMostInf, var"#91#X"} where var"#91#X"}, ::Type{AcceleratorType}, ::Type{Ada}, ::Type{Manufacturer}, ::Type{Tuple{AtLeast0, AtMostInf, Q} where Q}, ::Type{Accelerator}, ::Type{Processor}, ::Type{Manufacturer}, ::Type{ProcessorMicroarchitecture}, ::Type{StorageType}, ::Type{Tuple{AtLeast0, AtMostInf, Q} where Q}, ::Type{Tuple{AtLeast0, AtMostInf, Q} where Q}) is ambiguous.
 ```

-The user can use the ___@select___ macro to query which instance types satisfy the ambiguous contract:
+The user can use the ___@select___ macro to query which instance types satisfy the ambiguous contract:

 ```julia-repl
 julia> @select(node_count => 4,
                accelerator_count => @atleast(4),
-               accelerator_architecture => Ada,
+               accelerator_architecture => Ada,
                node_memory_size => @atleast(256G))
┌ Warning: Only instance features are allowed. Ignoring node_count.
└ @ CloudClusters ~/Dropbox/Copy/ufc_mdcc_hpc/CloudClusters.jl/src/resolve.jl:78 Dict{String, Any} with 3 entries: - "g6.48xlarge" => Dict{Symbol, Any}(:processor => Type{>:AMDEPYC_7R13}, :accelerator_architecture => Type{>:Ada}, :processor_manufacturer => Type{>:AMD}, :storage_type => Type{>:StorageType_EC2_NVMeSSD}, :node_memory_size => Type{>:Tuple{AtLeast512G, AtMost1T, 8.24634e11}}, :storage_size => Type{>:Tuple{AtLeast32T, AtMost64T, 6.52835e13}}, :node_provider => Type{>:AmazonEC2}, :node_vcpus_count => Type{>:Tuple{AtLeast128, AtMost256, 192.0}}, :accelerator_count => Type{>:Tuple{AtLeast8, AtMost8, 8.0}}, :network_performance => Type{>:Tuple{AtLeast64G, AtMost128G, 1.07374e11}}, :accelerator => Type{>:NVIDIA_L4}, :accelerator_type => Type{>:GPU}, :accelerator_memory_size => Type{>:Tuple{AtLeast16G, AtMost32G, 2.57698e10}}, :accelerator_manufacturer => Type{>:NVIDIA}, :node_machinetype => Type{>:EC2Type_G6_48xLarge}, :processor_microarchitecture => Type{>:Zen}) - "g2-standard-96" => Dict{Symbol, Any}(:processor => Type{>:IntelXeon_8280L}, :accelerator_architecture => Type{>:Ada}, :processor_manufacturer => Type{>:Intel}, :storage_type => Type{>:StorageType}, :node_memory_size => Type{>:Tuple{AtLeast256G, AtMost512G, 4.12317e11}}, :storage_size => Type{>:Tuple{AtLeast0, AtMostInf, Q} where Q}, :node_provider => Type{>:GoogleCloud}, :node_vcpus_count => Type{>:Tuple{AtLeast64, AtMost128, 96.0}}, :accelerator_count => Type{>:Tuple{AtLeast8, AtMost8, 8.0}}, :network_performance => Type{>:Tuple{AtLeast64G, AtMost128G, 1.07374e11}}, :accelerator => Type{>:NVIDIA_L4}, :accelerator_type => Type{>:GPU}, :accelerator_memory_size => Type{>:Tuple{AtLeast16G, AtMost32G, 2.57698e10}}, :accelerator_manufacturer => Type{>:NVIDIA}, :node_machinetype => Type{>:GCPType_G2}, :processor_microarchitecture => Type{>:CascadeLake}) - "g6.24xlarge" => Dict{Symbol, Any}(:processor => Type{>:AMDEPYC_7R13}, :accelerator_architecture => Type{>:Ada}, :processor_manufacturer => Type{>:AMD}, :storage_type => Type{>:StorageType_EC2_NVMeSSD}, :node_memory_size => Type{>:Tuple{AtLeast256G, AtMost512G, 4.12317e11}}, :storage_size => Type{>:Tuple{AtLeast8T, AtMost16T, 1.63209e13}}, :node_provider => Type{>:AmazonEC2}, :node_vcpus_count => Type{>:Tuple{AtLeast64, AtMost128, 96.0}}, :accelerator_count => Type{>:Tuple{AtLeast4, AtMost4, 4.0}}, :network_performance => Type{>:Tuple{AtLeast32G, AtMost64G, 5.36871e10}}, :accelerator => Type{>:NVIDIA_L4}, :accelerator_type => Type{>:GPU}, :accelerator_memory_size => Type{>:Tuple{AtLeast16G, AtMost32G, 2.57698e10}}, :accelerator_manufacturer => Type{>:NVIDIA}, :node_machinetype => Type{>:EC2Type_G6_24xLarge}, :processor_microarchitecture => Type{>:Zen}) +"g6.48xlarge" => Dict{Symbol, Any}(:processor => Type{>:AMDEPYC_7R13}, :accelerator_architecture => Type{>:Ada}, :processor_manufacturer => Type{>:AMD}, :storage_type => Type{>:StorageType_EC2_NVMeSSD}, :node_memory_size => Type{>:Tuple{AtLeast512G, AtMost1T, 8.24634e11}}, :storage_size => Type{>:Tuple{AtLeast32T, AtMost64T, 6.52835e13}}, :node_provider => Type{>:AmazonEC2}, :node_vcpus_count => Type{>:Tuple{AtLeast128, AtMost256, 192.0}}, :accelerator_count => Type{>:Tuple{AtLeast8, AtMost8, 8.0}}, :network_performance => Type{>:Tuple{AtLeast64G, AtMost128G, 1.07374e11}}, :accelerator => Type{>:NVIDIA_L4}, :accelerator_type => Type{>:GPU}, :accelerator_memory_size => Type{>:Tuple{AtLeast16G, AtMost32G, 2.57698e10}}, :accelerator_manufacturer => Type{>:NVIDIA}, :node_machinetype => Type{>:EC2Type_G6_48xLarge}, 
:processor_microarchitecture => Type{>:Zen}) +"g2-standard-96" => Dict{Symbol, Any}(:processor => Type{>:IntelXeon_8280L}, :accelerator_architecture => Type{>:Ada}, :processor_manufacturer => Type{>:Intel}, :storage_type => Type{>:StorageType}, :node_memory_size => Type{>:Tuple{AtLeast256G, AtMost512G, 4.12317e11}}, :storage_size => Type{>:Tuple{AtLeast0, AtMostInf, Q} where Q}, :node_provider => Type{>:GoogleCloud}, :node_vcpus_count => Type{>:Tuple{AtLeast64, AtMost128, 96.0}}, :accelerator_count => Type{>:Tuple{AtLeast8, AtMost8, 8.0}}, :network_performance => Type{>:Tuple{AtLeast64G, AtMost128G, 1.07374e11}}, :accelerator => Type{>:NVIDIA_L4}, :accelerator_type => Type{>:GPU}, :accelerator_memory_size => Type{>:Tuple{AtLeast16G, AtMost32G, 2.57698e10}}, :accelerator_manufacturer => Type{>:NVIDIA}, :node_machinetype => Type{>:GCPType_G2}, :processor_microarchitecture => Type{>:CascadeLake}) +"g6.24xlarge" => Dict{Symbol, Any}(:processor => Type{>:AMDEPYC_7R13}, :accelerator_architecture => Type{>:Ada}, :processor_manufacturer => Type{>:AMD}, :storage_type => Type{>:StorageType_EC2_NVMeSSD}, :node_memory_size => Type{>:Tuple{AtLeast256G, AtMost512G, 4.12317e11}}, :storage_size => Type{>:Tuple{AtLeast8T, AtMost16T, 1.63209e13}}, :node_provider => Type{>:AmazonEC2}, :node_vcpus_count => Type{>:Tuple{AtLeast64, AtMost128, 96.0}}, :accelerator_count => Type{>:Tuple{AtLeast4, AtMost4, 4.0}}, :network_performance => Type{>:Tuple{AtLeast32G, AtMost64G, 5.36871e10}}, :accelerator => Type{>:NVIDIA_L4}, :accelerator_type => Type{>:GPU}, :accelerator_memory_size => Type{>:Tuple{AtLeast16G, AtMost32G, 2.57698e10}}, :accelerator_manufacturer => Type{>:NVIDIA}, :node_machinetype => Type{>:EC2Type_G6_24xLarge}, :processor_microarchitecture => Type{>:Zen}) ``` + Notice that ___@select___ emits a warning because __node_count__ is ignored since only instance features are considered in contract resolution. Three VM instance types satisfy the contract, since they provide at least 256GB of memory and at least four NVIDIA GPUs of Ada architecture (L4 Tensor Core). They are: ___g6.48xlarge___, ___g2-standard-96___, and ___g6.24xlarge___. The user may inspect the features of each instance type and write a contract that selects one directly. ```julia-repl -julia> cc = @cluster node_count => 4 node_machinetype => EC2Type_G6_48xLarge +julia> cc = @cluster node_count => 4 node_machinetype => EC2Type_G6_48xLarge :mBrvXUsilkpxWJC julia> @resolve cc 1-element Vector{Pair{Symbol, SubString{String}}}: - :instance_type => "g6.48xlarge" +:instance_type => "g6.48xlarge" ``` ### Peer-Workers-MPI clusters -___Peer-Workers-MPI___ is a variation of ___Peer-Workers___ clusters, where worker processes are connected through a global MPI communicator. This is possible through _MPI.jl_ and _MPIClusterManagers.jl_. +___Peer-Workers-MPI___ is a variation of ___Peer-Workers___ clusters, where worker processes are connected through a global MPI communicator. This is possible through _MPI.jl_ and _MPIClusterManagers.jl_. 
 
 ### Peer-Workers-MPI clusters
 
-___Peer-Workers-MPI___ is a variation of ___Peer-Workers___ clusters, where worker processes are connected through a global MPI communicator. This is possible through _MPI.jl_ and _MPIClusterManagers.jl_. 
+___Peer-Workers-MPI___ is a variation of ___Peer-Workers___ clusters, where worker processes are connected through a global MPI communicator. This is possible through _MPI.jl_ and _MPIClusterManagers.jl_.
 
-In what follows, we modify the ```my_second_cluster_contract``` to build a ___Peer-Workers-MPI___ cluster that will be referred by ```my_fourth_cluster``´, by using the ```cluster_type``` parameter:
+In what follows, we modify ```my_second_contract``` to build a ___Peer-Workers-MPI___ cluster, referred to as ```my_fourth_cluster```, by using the ```cluster_type``` parameter:
 
 ```julia
-my_third_cluster_contract = @cluster(cluster_type => PeerWorkersMPI,
-        node_count => 8,
-        node_memory_size => @atleast(512G),
-        accelerator_count => @just(8),
-        accelerator_architecture => Ada)
+my_third_cluster_contract = @cluster(cluster_type => PeerWorkersMPI,
+                                     node_count => 8,
+                                     node_memory_size => @atleast(512G),
+                                     accelerator_count => @just(8),
+                                     accelerator_architecture => Ada)
+
 my_fourth_cluster = @deploy my_third_cluster_contract
 ```
-
-The following code launches a simple _MPI.jl_ code in _my_fourth_cluster_, using the ```@everywhere``` primitive of _Distributed.jl_.
+The following code launches a simple _MPI.jl_ program in _my_fourth_cluster_, using the ```@everywhere``` primitive of _Distributed.jl_.
 
 ```julia
-
-@everywhere cluster_nodes(my_fourth_cluster) begin
-    @eval using MPI
-    MPI.Init()
-    rank = MPI.Comm_rank(MPI.COMM_WORLD)
-    size = MPI.Comm_size(MPI.COMM_WORLD)
-    @info "I am $rank among $size processes"
-    root_rank = 0
-    rank_sum = MPI.Reduce(rank, (x,y) -> x + y, root_rank, MPI.COMM_WORLD)
+@everywhere cluster_nodes(my_fourth_cluster) begin
+    @eval using MPI
+    MPI.Init()
+    rank = MPI.Comm_rank(MPI.COMM_WORLD)
+    size = MPI.Comm_size(MPI.COMM_WORLD)
+    @info "I am $rank among $size processes"
+    root_rank = 0
+    rank_sum = MPI.Reduce(rank, (x,y) -> x + y, root_rank, MPI.COMM_WORLD)
 end
-result = @fetchfrom ranks(my_first_cluster)[0] rank_sum
-@info "The sum of ranks in the cluster is $result"
+
+result = @fetchfrom ranks(my_fourth_cluster)[0] rank_sum
+@info "The sum of ranks in the cluster is $result"
 ```
-The parallel code sums the ranks of the processes using the _Reduce_ collective operation of _MPI.jl_ and stores the result in the global variable _rank_sum_ of the root process (rank 0). Then, this value is fetched by the program and assigned to the result variable using ```@fetchfrom```. For that, the ```ranks``` function is used to discover the _PID_ of the root process.
-
+The parallel code sums the ranks of the processes using the _Reduce_ collective operation of _MPI.jl_ and stores the result in the global variable _rank_sum_ of the root process (rank 0). Then, this value is fetched by the program and assigned to the ```result``` variable using ```@fetchfrom```. For that, the ```ranks``` function is used to discover the _PID_ of the root process.
 
 ### Manager-Workers clusters
 
+A ___Manager-Workers___ cluster comprises an _access node_ and a homogeneous set of _compute nodes_. The compute nodes are only accessible from the access node. The instance type of the access node may be different from the instance type of the compute nodes.
-A ___Manager-Workers___ cluster comprises an _access node_ and a homogenous set of _compute nodes_. The compute nodes are only accessible from the access node. The instance type of the access node may be different from the instance type of the compute nodes.
 
+In a ___Manager-Workers___ cluster, the manager process, running in the REPL or main program, is called the _driver process_. It is responsible for launching the so-called _entry process_ in the cluster's access node.
+In turn, the entry process launches _worker processes_ across the compute nodes, using _MPIClusterManagers.jl_. The worker processes perform the computation, while the entry process is responsible for the communication between the driver and the worker processes. A global MPI communicator exists between worker processes, like in ___Peer-Workers-MPI___ clusters.
-In a ___Manager-Workers___ cluster, the manager process, running in the REPL or main program, is called the _driver process_. It is responsible for launching the so-called _entry process_ in the cluster's access node. In turn, the entry process launches _worker processes_ across the compute nodes, using _MPIClusterManagers.jl_. The worker processes perform the computation, while the entry process is responsible for communication between the driver and the worker processes. A global MPI communicator exists between worker processes, like in ___Peer-Workers-MPI___ clusters.
 
+A ___Manager-Workers___ cluster is useful when compute nodes are not directly accessible from the external network. This is a common situation in on-premises clusters. However, this is also possible in clusters built from the services of cluster providers specifically tailored to HPC applications.
-A ___Manager-Workers___ cluster is useful when compute nodes are not directly accessible from the external network. This is a common situation in on-premises clusters. However, this is also possible in clusters built from the services of cluster providers specifically tailored to HPC applications.
 
+> [!IMPORTANT]
+> ___Manager-Workers___ clusters are not natively supported by Julia, because _Distributed.jl_ does not allow worker processes to create new processes, as shown below:
-> [!IMPORTANT]
-> ___Manager-Workers___ are not natively supported by Julia, because _Distributed.jl_ does not support that worker processes create new processes, as shown below:
-> ```julia
-> julia>addprocs(1)
-> 1-element Vector{Int64}:
->  2
-> julia> @fetchfrom 2 addprocs(1)
-> ERROR: On worker 2:
-> Only process 1 can add or remove workers
+> ```julia-repl
+> julia> addprocs(1)
+> 1-element Vector{Int64}:
+>  2
+>
+> julia> @fetchfrom 2 addprocs(1)
+> ERROR: On worker 2:
+> Only process 1 can add or remove workers
 > ```
+>
+> The _CloudClusters.jl_ developers provide an extended version of _Distributed.jl_ that removes this limitation, making it possible to create hierarchies of Julia processes [2]. However, the multilevel extension of _Distributed.jl_ is necessary only in the access node of a ___Manager-Workers___ cluster, where the so-called _entry processes_, launched by the driver process at the REPL/program and responsible for launching the worker processes across the compute nodes of the cluster, will be running.
-> The _CloudClusters.jl_ developers have developed an extended version of _Distributed.jl_ that removes this limitation, making it possible to create hierarchies of Julia processes [2]. However, the multilevel extension of _Distributed.jl_ is necessary only for the access node of ___Manager-Workers___ cluster, where the so-called _entry processes_, launched by the master process at the REPL/program and responsible for launching the worker processes across computing nodes of the cluster, will be running.
 >
 > So, only users who need to develop customized images to instantiate cluster nodes must be concerned with adapting the Julia installation for the extended _Distributed.jl_ version, and only if an image is intended to be used for manager nodes of ___Manager-Workers___ clusters.
 >
@@ -448,8 +448,8 @@ Users may apply the __cluster_type__ parameter to command the creation of a ___M
 
 ```julia
-my_first_cluster_contract = @cluster(cluster_type => ManageWorkers,
-                                     node_count => 4,
-                                     node_machinetype => EC2Type_T3_xLarge)
+my_first_cluster_contract = @cluster(cluster_type => ManagerWorkers,
+                                     node_count => 4,
+                                     node_machinetype => EC2Type_T3_xLarge)
 ```
 
 In this case, the __node_count__ parameter specifies the number of worker nodes. So, for a cluster deployed using ```my_first_cluster_contract```, five VM instances will be created, including the manager node.
@@ -457,12 +457,12 @@ In this case, the __node_count__ parameter specifies the number of worker nodes
 The user may use "dot notation" to specify different assumptions for manager and worker nodes. For example:
 
 ```julia
-my_second_cluster_contract = @cluster(cluster_type => ManageWorkers,
-                                      node_count => 8,
-                                      manager.node_machinetype => EC2Type_T3_xLarge,
-                                      worker.accelerator_count => @just(8),
-                                      worker.accelerator_architecture => Ada,
-                                      worker.accelerator_memory => @atleast(512G))
+my_second_contract = @cluster(cluster_type => ManagerWorkers,
+                              node_count => 8,
+                              manager.node_machinetype => EC2Type_T3_xLarge,
+                              worker.accelerator_count => @just(8),
+                              worker.accelerator_architecture => Ada,
+                              worker.accelerator_memory => @atleast(512G))
 ```
 
 This contract specifies that the manager node must be a ___t3.xlarge___ VM instance, while the worker nodes will have eight NVIDIA GPUs of Ada architecture and at least 512GB of memory.
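+
+As with the other cluster types, a cluster can then be deployed from this contract in the usual way. A minimal sketch, assuming the contract resolves to instance types available in the configured region (the variable name is illustrative):
+
+```julia
+my_second_cluster = @deploy my_second_contract
+```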
@@ -470,15 +470,14 @@ This contract specifies that the manager node must be a ___t3.xlarge___ VM insta
 ### Configuration parameters
 
 Configuration parameters exist for the proper instantiation of clusters, whose default values are specified in the _CCconfig.toml_ file. The user may override the default values by passing configuration parameters through ___@cluster___ and ___@deploy___ operations. For instance:
- 
+
 ```julia
 my_cluster_contract = @cluster(node_count => 4,
                                node_machinetype => EC2Type_T3_xLarge,
                                image_id => "ami-07f6c5b6de73ce7ae")
-
-my_cluster = @deploy(my_first_cluster,
-                     user => "ubuntu",
-                     sshflags => "-i mykey.pem")
+
+my_cluster = @deploy(my_cluster_contract,
+                     user => "ubuntu",
+                     sshflags => "-i mykey.pem")
 ```
 
 In the above code, ```image_id``` specifies that the EC2 image identified by ```ami-07f6c5b6de73ce7ae``` must be used when creating clusters from _my_cluster_contract_. On the other hand, ```user``` and ```sshflags``` will be used to access the nodes of _my_cluster_. For instance, ```ami-07f6c5b6de73ce7ae``` may provide a set of predefined users with different privileges to access the features offered by such an image.
@@ -486,25 +485,38 @@ In the above code, ```image_id``` specifies that the EC2 image identified by ``
 Currently, there are four categories of configuration parameters. They are described in the following paragraphs.
 
 The following configuration parameters set up the SSH connections to nodes of ___Peer-Workers___ clusters and the manager node of ___Manager-Workers___ clusters, i.e., those nodes that are externally accessible:
-* __user__::```String```, the user login to access VM instances (e.g., ```user@xxx.xxx.xxx.xxx```, where ```xxx.xxx.xxx.xxx``` is the public IP of the VM instance);
-* __sshflags__::```String```, the flags that must be passed to the ssh command to access the VM instances;
-* __tunneled__::```Bool```, a keyword argument to be passed to ```addprocs``` to determine whether or not ssh access should be [tunneled](https://www.ssh.com/academy/ssh/tunneling).
+
+* __user__::```String```, the user login to access VM instances (e.g., ```user@xxx.xxx.xxx.xxx```, where ```xxx.xxx.xxx.xxx``` is the public IP of the VM instance);
+* __sshflags__::```String```, the flags that must be passed to the ssh command to access the VM instances;
+* __tunneled__::```Bool```, a keyword argument to be passed to ```addprocs``` to determine whether or not ssh access should be [tunneled](https://www.ssh.com/academy/ssh/tunneling).
 
 The following configuration parameters apply to cluster nodes of any cluster type:
-* __exename__::```String```, the full path to the ```julia``` executable (e.g., /home/ubuntu/.juliaup/bin/julia);
-* __exeflags__::```String```, flags to be passed to the ```julia``` executable when starting processes on cluster nodes;
-* __directory__::```String```, the current directory of the ```julia``` execution in the VM instance.
+
+* __exename__::```String```, the full path to the ```julia``` executable (e.g., ```/home/ubuntu/.juliaup/bin/julia```);
+* __exeflags__::```String```, flags to be passed to the ```julia``` executable when starting processes on cluster nodes;
+* __directory__::```String```, the working directory of the ```julia``` execution in the VM instance.
 
 The following configuration parameters apply to nodes of ___Peer-Workers-MPI___ and worker nodes of ___Manager-Workers___ clusters, i.e., the ones with MPI-based message-passing enabled:
-* __threadlevel__::```Symbol```, a keyword argument passed to ```MPI.Init```, whose possible values are: [```single```, ```:serialized```, ```:funneled```, ```:multiple```](https://juliaparallel.org/MPI.jl/stable/reference/environment/#MPI.ThreadLevel);
-* __mpiflags__::```String```, a keyword argument passed to MPI (e.g., ```"--map-by node --hostfile /home/ubuntu/hostfile"```).
-The last set of configuration parameters depends on the IaaS provider selected through __@resolve__. For AWS EC2, they are:
-* __imageid__::```String```, the _ID_ of the image used to instantiate the VM instances that form the cluster nodes;
-* __subnet_id__::```String```, the _ID_ of a subnet for the communication between VM instances that form the cluster nodes;
-* __placement_group__::```String```, the _ID_ of an existing placement group where the user wishes to colocate the VM instances that form the cluster nodes (the default is to create a temporary placement group);
-* __security_group_id__::```String```, the _ID_ of an existing security group for the VM instances that form the cluster nodes.
+* __threadlevel__::```Symbol```, a keyword argument passed to ```MPI.Init```, whose possible values are: [```:single```, ```:serialized```, ```:funneled```, ```:multiple```](https://juliaparallel.org/MPI.jl/stable/reference/environment/#MPI.ThreadLevel);
+* __mpiflags__::```String```, a keyword argument passed to MPI (e.g., ```"--map-by node --hostfile /home/ubuntu/hostfile"```); a combined example is shown below.
+
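+Putting these together, the following sketch overrides several defaults from _CCconfig.toml_ in a single contract and deployment. The values are illustrative only: the AMI ID is a placeholder, and the paths assume an Ubuntu-based image with ```juliaup```:
+
+```julia
+mpi_contract = @cluster(cluster_type => PeerWorkersMPI,
+                        node_count => 4,
+                        node_machinetype => EC2Type_T3_xLarge,
+                        imageid => "ami-XXXXXXXXXXXXXXXXX",   # placeholder image ID
+                        exename => "/home/ubuntu/.juliaup/bin/julia",
+                        directory => "/home/ubuntu",
+                        threadlevel => :serialized,
+                        mpiflags => "--map-by node --hostfile /home/ubuntu/hostfile")
+
+my_mpi_cluster = @deploy(mpi_contract, user => "ubuntu", sshflags => "-i mykey.pem")
+```
+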
+The last set of configuration parameters depends on the IaaS provider selected through __@resolve__.
+For AWS EC2, they are:
+
+* __imageid__::```String```, the _ID_ of the image used to instantiate the VM instances that form the cluster nodes;
+* __subnet_id__::```String```, the _ID_ of a subnet for the communication between VM instances that form the cluster nodes;
+* __placement_group__::```String```, the _ID_ of an existing placement group where the user wishes to colocate the VM instances that form the cluster nodes (the default is to create a temporary placement group);
+* __security_group_id__::```String```, the _ID_ of an existing security group for the VM instances that form the cluster nodes.
+
+Finally, for GCP, they are:
+
+* __imageid__::```String```, the _ID_ of the image used to instantiate the VM instances that form the cluster nodes;
+* __zone__::```String```, the [zone](https://cloud.google.com/compute/docs/regions-zones) where the cluster node instances will be placed;
+* __project__::```String```, the [project](https://cloud.google.com/storage/docs/projects) where the cluster node instances will be created;
+* __network_interface__::```String```, the name of the network to which the cluster node instances will be attached (when omitted, the ```default``` network is used).
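+
+For instance, a GCP-targeted contract may override these defaults as follows. This is a sketch only, with placeholder names for the image, zone, project, and network, and ```node_provider``` used merely to pin resolution to GCP:
+
+```julia
+gcp_contract = @cluster(node_count => 4,
+                        node_provider => GoogleCloud,
+                        imageid => "my-project/global/images/my-image",
+                        zone => "us-central1-a",
+                        project => "my-project",
+                        network_interface => "default")
+```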
+
 ### The integration with PlatformAware.jl
 
 UNDER CONSTRUCTION
@@ -513,5 +525,4 @@ UNDER CONSTRUCTION
 
 * Francisco Heron de Carvalho Junior, João Marcelo Uchoa de Alencar, and Claro Henrique Silva Sales. 2024. ___Cloud-based parallel computing across multiple clusters in Julia___. In Proceedings of the _28th Brazilian Symposium on Programming Languages_ (SBLP'2024), September 30, 2024, Curitiba, Brazil. SBC, Porto Alegre, Brazil, 44-52. DOI: https://doi.org/10.5753/sblp.2024.3470.
-* Francisco Heron de Carvalho Junior and Tiago Carneiro. 2024. ___Towards multicluster computations with Julia___. In Proceedings of the XXV Symposium on High-Performance Computational Systems (SSCAD'2024), October 25, 2024, São Carlos, Brazil. SBC, Porto Alegre, Brazil. DOI: https://doi.org/10.5753/sscad.2024.244307
-
+* Francisco Heron de Carvalho Junior and Tiago Carneiro. 2024. ___Towards multicluster computations with Julia___. In Proceedings of the XXV Symposium on High-Performance Computational Systems (SSCAD'2024), October 25, 2024, São Carlos, Brazil. SBC, Porto Alegre, Brazil. DOI: https://doi.org/10.5753/sscad.2024.244307
\ No newline at end of file
diff --git a/src/CloudClusters.jl b/src/CloudClusters.jl
index b663abe..ee90ff6 100644
--- a/src/CloudClusters.jl
+++ b/src/CloudClusters.jl
@@ -46,10 +46,10 @@ export cluster_create, @cluster,
        cluster_restart, @restart,
        cluster_features, @features,
        cluster_nodes, @nodes,
-       cluster_defaultconfig,
-       cluster_providers,
-       cluster_features
-
+       cluster_status, @status,
+       cluster_config, @config,
+       cluster_providers, @providers
+
 # Cluster types
 export ManagerWorkers, PeerWorkers, PeerWorkersMPI, Localhost
 
diff --git a/src/cluster.jl b/src/cluster.jl
index 000ee5c..58de6f2 100644
--- a/src/cluster.jl
+++ b/src/cluster.jl
@@ -66,7 +66,7 @@ function cluster_list(;from = DateTime(0), cluster_type = :AnyCluster)
     path_contents = readdir(configpath; join = true)
 
     for cluster_file in path_contents
-        if occursin(r"\s*.cluster", cluster_file)
+        if file_extension(cluster_file) == "cluster"
             cluster_data = load_cluster(cluster_file; from=from, cluster_type=cluster_type)
             !isempty(cluster_data) && push!(result, cluster_data)
         end
diff --git a/src/cluster_providers/ec2/ec2_deploy.jl b/src/cluster_providers/ec2/ec2_deploy.jl
index 27d7b7e..c56a591 100644
--- a/src/cluster_providers/ec2/ec2_deploy.jl
+++ b/src/cluster_providers/ec2/ec2_deploy.jl
@@ -135,3 +135,24 @@ cluster_isrunning(_::Type{AmazonEC2}, cluster_handle) = ec2_cluster_info[cluster
 cluster_isstopped(_::Type{AmazonEC2}, cluster_handle) = ec2_cluster_info[cluster_handle] |> ec2_cluster_isstopped
 
+function cluster_status(_::Type{AmazonEC2}, cluster_handle)
+    cluster = ec2_cluster_info[cluster_handle]
+    cluster_nodes = cluster.cluster_nodes
+    inconsistent = false    # accumulated across nodes; named to avoid shadowing Base.error
+    status = nothing
+    for (nodeid, instanceid) in cluster_nodes
+        node_status = ec2_get_instance_status(instanceid)
+        @info "$nodeid ($instanceid) is $node_status"
+        inconsistent = inconsistent || (!isnothing(status) && status != node_status)
+        status = node_status
+    end
+    if inconsistent
+        @error "The EC2 cluster is in an inconsistent status (all nodes must be in the same status)"
+    else
+        @info "The cluster $cluster_handle at EC2 is in $status status"
+    end
+end
+
+function cluster_delete(_::Type{AmazonEC2}, cluster_handle)
+    ec2_delete_cluster(cluster_handle)
+end
\ No newline at end of file
diff --git a/src/cluster_providers/ec2/ec2_persist.jl b/src/cluster_providers/ec2/ec2_persist.jl
index 2426450..35bfd22 100644
--- a/src/cluster_providers/ec2/ec2_persist.jl
+++ b/src/cluster_providers/ec2/ec2_persist.jl
@@ -108,7 +108,6 @@ function cluster_load(_::Type{AmazonEC2}, _::Type{<:ManagerWorkers}, cluster_han
         ec2_cluster_info[cluster_handle] = cluster
         return cluster.features
     else
-        ec2_delete_cluster(cluster_handle)
         return nothing
     end
 end
@@ -155,7 +154,6 @@ function cluster_load(_::Type{AmazonEC2}, _::Type{<:PeerWorkers}, cluster_handle
         ec2_cluster_info[cluster_handle] = cluster
         return cluster.features
     else
-        ec2_delete_cluster(cluster_handle)
         return nothing
    end
 end
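
For illustration, a REPL session using the new status entry point might look as follows; the cluster handle, node names, instance IDs, and statuses are hypothetical, with only the message format taken from the code above:

```julia-repl
julia> @status my_cluster
[ Info: worker1 (i-0aaaaaaaaaaaaaaaa) is running
[ Info: worker2 (i-0bbbbbbbbbbbbbbbb) is running
[ Info: The cluster my_cluster at EC2 is in running status
```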
diff --git a/src/cluster_providers/gcp/gcp_backend.jl b/src/cluster_providers/gcp/gcp_backend.jl
index 64dbf72..497bf59 100644
--- a/src/cluster_providers/gcp/gcp_backend.jl
+++ b/src/cluster_providers/gcp/gcp_backend.jl
@@ -34,6 +34,7 @@ mutable struct GCPManagerWorkers <: ManagerWorkers #Cluster
     user_worker::String
     zone::String
     project::String
+    network_interface::String
     cluster_nodes::Union{Dict{Symbol, String}, Nothing}
     features::Dict{Symbol, Any}
 end
@@ -47,6 +48,7 @@ mutable struct GCPPeerWorkers <: PeerWorkers # Cluster
     user::String
     zone::String
     project::String
+    network_interface::String
     cluster_nodes::Union{Dict{Symbol, String}, Nothing}
     features::Dict{Symbol, Any}
 end
@@ -59,6 +61,7 @@ mutable struct GCPPeerWorkersMPI <: PeerWorkersMPI # Cluster
     user::String
     zone::String
     project::String
+    network_interface::String
     cluster_nodes::Union{Dict{Symbol, String}, Nothing}
     features::Dict{Symbol, Any}
 end
@@ -189,7 +192,7 @@ function gcp_create_params(cluster::ManagerWorkers, cluster_nodes, internal_key_
                 "name" => "external-nat",
                 "type" => "ONE_TO_ONE_NAT"
             )],
-            "network" => "https://www.googleapis.com/compute/v1/projects/$(cluster.project)/global/networks/default"
+            "network" => "https://www.googleapis.com/compute/v1/projects/$(cluster.project)/global/networks/$(cluster.network_interface)"
         )],
         "metadata" =>
             "items" => [Dict(
@@ -224,7 +227,7 @@
                 "name" => "external-nat",
                 "type" => "ONE_TO_ONE_NAT"
             )],
-            "network" => "https://www.googleapis.com/compute/v1/projects/$(cluster.project)/global/networks/default"
+            "network" => "https://www.googleapis.com/compute/v1/projects/$(cluster.project)/global/networks/$(cluster.network_interface)"
         )],
         "metadata" =>
             "items" => [Dict(
@@ -270,7 +273,7 @@ function gcp_create_params(cluster::PeerWorkers, cluster_nodes, internal_key_nam
                 "name" => "external-nat",
                 "type" => "ONE_TO_ONE_NAT"
             )],
-            "network" => "https://www.googleapis.com/compute/v1/projects/$(cluster.project)/global/networks/default"
+            "network" => "https://www.googleapis.com/compute/v1/projects/$(cluster.project)/global/networks/$(cluster.network_interface)"
         )],
         "metadata" =>
             "items" => [Dict(
@@ -377,7 +380,7 @@ function gcp_create_instances(cluster::ManagerWorkers)
 
     internal_key_name = cluster.name
 
-    try gcp_allow_ssh(cluster.project) catch end
+    try gcp_allow_ssh(cluster) catch end
 
     # Creating the instances
     params_manager, params_workers = gcp_create_params(cluster, cluster_nodes, internal_key_name, (user_data_manager, user_data_worker), private_key, public_key)
@@ -412,7 +415,7 @@ function gcp_create_instances(cluster::PeerWorkers)
 
     internal_key_name = cluster.name
 
-    try gcp_allow_ssh(cluster.project) catch end
+    try gcp_allow_ssh(cluster) catch end
 
     # Creating the instances
     params = gcp_create_params(new_cluster, cluster_nodes, internal_key_name, user_data, private_key, public_key)
@@ -568,18 +571,20 @@ function gcp_get_instance_dict(cluster::Cluster, name)
 end
 
-function gcp_allow_ssh(project)
+function gcp_allow_ssh(cluster)
     firewall_rule = Dict(
-        "allowed" => [
-            Dict("IPProtocol" => "tcp",
-                 "ports" => ["22"])],
-        "direction" => "INGRESS",
+#       "allowed" => [
+#           Dict("IPProtocol" => "tcp",
+#                "ports" => ["22"])],
+#       "direction" => "INGRESS",
         "kind" => "compute#firewall",
-        "name" => "allow-ssh",
-        "network" => "projects/$project/global/networks/default",
-        "priority" => 1000,
-        "selfLink" => "projects/$project/global/firewalls/allow-ssh",
-        "sourceRanges" => ["0.0.0.0/0"]
+#       "name" => "allow-ssh",
+        "name" => "hpcshelf-virtualplatform-network-rules",
+        "network" => "projects/$(cluster.project)/global/networks/$(cluster.network_interface)",
+#       "priority" => 1000,
+#       "selfLink" => "projects/$(cluster.project)/global/firewalls/allow-ssh",
+        "selfLink" => "projects/$(cluster.project)/global/firewalls/hpcshelf-virtualplatform-network-rules",
+        "sourceRanges" => ["0.0.0.0/0"]
     )
 
-    GCPAPI.compute(:Firewall, :insert, project; data=firewall_rule)
+    GCPAPI.compute(:Firewall, :insert, cluster.project; data=firewall_rule)
diff --git a/src/cluster_providers/gcp/gcp_deploy.jl b/src/cluster_providers/gcp/gcp_deploy.jl
index 0e9eaec..b550fb7 100644
--- a/src/cluster_providers/gcp/gcp_deploy.jl
+++ b/src/cluster_providers/gcp/gcp_deploy.jl
@@ -37,7 +37,8 @@ function deploy_cluster(gcptype::Type{GoogleCloud},
     #image_id_workers = get(cluster_features, :image_id, defaults_dict[GoogleCloud][:image_id])
     #image_id_manager = get(cluster_features, :image_id_manager, defaults_dict[GoogleCloud][:image_id_manager])
     zone = get(cluster_features, :zone, defaults_dict[GoogleCloud][:zone])
-    project = defaults_dict[GoogleCloud][:project]
+    project = get(cluster_features, :project, defaults_dict[GoogleCloud][:project])
+    network_interface = get(cluster_features, :network_interface, get(defaults_dict[GoogleCloud], :network_interface, "default"))
 
     instance_type_manager = instance_type[1]
     instance_type_worker = instance_type[2]
@@ -51,6 +52,7 @@ function deploy_cluster(gcptype::Type{GoogleCloud},
                 user_worker,
                 zone,
                 project,
+                network_interface,
                 nothing,
                 cluster_features)
 
@@ -76,7 +78,8 @@ function deploy_cluster(gcptype::Type{GoogleCloud},
     imageid = get(cluster_features, :imageid, defaults_dict[GoogleCloud][:imageid])
     user = get(cluster_features, :user, defaults_dict[GoogleCloud][:user])
     zone = get(cluster_features, :zone, defaults_dict[GoogleCloud][:zone])
-    project = defaults_dict[GoogleCloud][:project]
+    project = get(cluster_features, :project, defaults_dict[GoogleCloud][:project])
+    network_interface = get(cluster_features, :network_interface, get(defaults_dict[GoogleCloud], :network_interface, "default"))
 
     cluster = gcp_build_clusterobj(cluster_type,
                 string(cluster_handle),
@@ -86,6 +92,7 @@ function deploy_cluster(gcptype::Type{GoogleCloud},
                 user,
                 zone,
                 project,
+                network_interface,
                 nothing,
                 cluster_features)
 
@@ -98,11 +105,11 @@ function deploy_cluster(gcptype::Type{GoogleCloud},
     return cluster
 end
 
-gcp_build_clusterobj(_::Type{<:PeerWorkers}, name, image_id, count, instance_type, user, zone, project, cluster_nodes, features) =
-    GCPPeerWorkers(name, image_id, count, instance_type, user, zone, project, cluster_nodes, features)
+gcp_build_clusterobj(_::Type{<:PeerWorkers}, name, image_id, count, instance_type, user, zone, project, network_interface, cluster_nodes, features) =
+    GCPPeerWorkers(name, image_id, count, instance_type, user, zone, project, network_interface, cluster_nodes, features)
 
-gcp_build_clusterobj(_::Type{<:PeerWorkersMPI}, name, image_id, count, instance_type, user, zone, project, cluster_nodes, features) =
-    GCPPeerWorkersMPI(name, image_id, count, instance_type, user, zone, project, cluster_nodes, features)
+gcp_build_clusterobj(_::Type{<:PeerWorkersMPI}, name, image_id, count, instance_type, user, zone, project, network_interface, cluster_nodes, features) =
+    GCPPeerWorkersMPI(name, image_id, count, instance_type, user, zone, project, network_interface, cluster_nodes, features)
 
 function launch_processes(_::Type{GoogleCloud}, cluster_type::Type{<:Cluster}, cluster_handle, ips)
     cluster = gcp_cluster_info[cluster_handle]
@@ -151,4 +158,27 @@ function cluster_isrunning(_::Type{GoogleCloud}, cluster_handle)
         @warn "Error while checking the cluster status: ", e
         return false
     end
+end
+
+
+function cluster_status(_::Type{GoogleCloud}, cluster_handle)
+    cluster = gcp_cluster_info[cluster_handle]
+    cluster_nodes = cluster.cluster_nodes
+    inconsistent = false    # accumulated across nodes; named to avoid shadowing Base.error
+    status = nothing
+    for (nodeid, instanceid) in cluster_nodes
+        node_status = gcp_get_instance_status(cluster, instanceid)
+        @info "$nodeid ($instanceid) is $node_status"
+        inconsistent = inconsistent || (!isnothing(status) && status != node_status)
+        status = node_status
+    end
+    if inconsistent
+        @error "The GCP cluster is in an inconsistent status (all nodes must be in the same status)"
+    else
+        @info "The cluster $cluster_handle at GCP is in $status status"
+    end
+end
+
+function cluster_delete(_::Type{GoogleCloud}, cluster_handle)
+    gcp_delete_cluster(cluster_handle)
+end
\ No newline at end of file
diff --git a/src/cluster_providers/gcp/gcp_persist.jl b/src/cluster_providers/gcp/gcp_persist.jl
index 9eb2c2d..46ce32a 100644
--- a/src/cluster_providers/gcp/gcp_persist.jl
+++ b/src/cluster_providers/gcp/gcp_persist.jl
@@ -20,6 +20,7 @@ function gcp_cluster_save(cluster::ManagerWorkers)
     contents["cluster_features"] = cluster.features
     contents["zone"] = cluster.zone
     contents["project"] = cluster.project
+    contents["network_interface"] = cluster.network_interface
 
     configpath = get(ENV,"CLOUD_CLUSTERS_CONFIG", pwd())
 
@@ -47,6 +48,7 @@ function gcp_cluster_save(cluster::PeerWorkers)
     contents["image_id"] = cluster.image_id
     contents["zone"] = cluster.zone
     contents["project"] = cluster.project
+    contents["network_interface"] = cluster.network_interface
     contents["cluster_nodes"] = cluster.cluster_nodes
     contents["cluster_features"] = cluster.features
 
@@ -73,6 +75,7 @@ function cluster_load(_::Type{GoogleCloud}, _::Type{<:ManagerWorkers}, cluster_h
     user_worker = contents["user_worker"]
     zone = contents["zone"]
     project = contents["project"]
+    network_interface = contents["network_interface"]
     _cluster_nodes = contents["cluster_nodes"]
 
     cluster_nodes = Dict()
@@ -84,13 +87,12 @@ function cluster_load(_::Type{GoogleCloud}, _::Type{<:ManagerWorkers}, cluster_h
 
     cluster = GCPManagerWorkers(string(cluster_handle), image_id_manager, image_id_worker, count,
                    instance_type_manager, instance_type_worker, user_manager, user_worker,
-                   zone, project, cluster_nodes, cluster_features)
+                   zone, project, network_interface, cluster_nodes, cluster_features)
 
     if gcp_cluster_status(cluster, ["RUNNING", "TERMINATED"])
         gcp_cluster_info[cluster_handle] = cluster
         return cluster.features
     else
-        gcp_delete_cluster(cluster_handle)
         return nothing
     end
 end
@@ -115,6 +117,7 @@ function cluster_load(_::Type{GoogleCloud}, _::Type{<:PeerWorkers}, cluster_hand
     user = contents["user"]
     zone = contents["zone"]
     project = contents["project"]
+    network_interface = contents["network_interface"]
     _cluster_nodes = contents["cluster_nodes"]
 
     cluster_nodes = Dict()
@@ -124,14 +127,12 @@ function cluster_load(_::Type{GoogleCloud}, _::Type{<:PeerWorkers}, cluster_hand
 
     cluster_features = contents["cluster_features"] |> gcp_adjusttypefeatures
 
-    cluster = GCPPeerWorkers(string(cluster_handle), image_id, count, instance_type, user, zone, project,
-                  cluster_nodes, cluster_features)
+    cluster = GCPPeerWorkers(string(cluster_handle), image_id, count, instance_type, user, zone, project, network_interface, cluster_nodes, cluster_features)
 
     if gcp_cluster_status(cluster, ["RUNNING", "TERMINATED"])
         gcp_cluster_info[cluster_handle] = cluster
         return cluster.features
     else
-        gcp_delete_cluster(cluster_handle)
         return nothing
     end
 end
diff --git a/src/config/configs.jl b/src/config/configs.jl
index f9c1d17..366cf5f 100644
--- a/src/config/configs.jl
+++ b/src/config/configs.jl
@@ -73,11 +73,11 @@ function load!()
     end
 end
 
-function cluster_defaultconfig(provider_type)
+function cluster_config(provider_type)
     defaults_dict[provider_type]
 end
 
-function cluster_defaultconfig()
+function cluster_config()
     defaults_dict[Provider]
 end
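
For illustration, after this rename the provider defaults become reachable through the new ```@config``` macro defined later in this patch; a hypothetical session:

```julia-repl
julia> @config AmazonEC2   # returns the defaults Dict read from CCconfig.toml for EC2
```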
diff --git a/src/deploy.jl b/src/deploy.jl
index 77982c3..9efc557 100644
--- a/src/deploy.jl
+++ b/src/deploy.jl
@@ -123,7 +123,10 @@ function launch_processes_ssh(cluster_features, _::Type{<:ManagerWorkers}, ips)
     manager_features = Dict(get(cluster_features, :manager_features, cluster_features))
     worker_features = Dict(get(cluster_features, :worker_features, cluster_features))
 
-    exeflags_manager = get(manager_features, :exeflags, default_exeflags(cluster_provider)) |> x -> Cmd(convert(Vector{String}, split(x)))
+    exeflags_manager_ = get(manager_features, :exeflags, default_exeflags(cluster_provider))
+    exeflags_manager_ *= " --bind-to $(ip_manager[:private_ip])"
+    exeflags_manager = exeflags_manager_ |> x -> Cmd(convert(Vector{String}, split(x)))
+
     exeflags_worker = get(worker_features, :exeflags, default_exeflags(cluster_provider)) |> x -> Cmd(convert(Vector{String}, split(x)))
 
     exename_manager = get(manager_features, :exename, default_exename(cluster_provider)) |> x -> Cmd(convert(Vector{String}, split(x)))
@@ -142,15 +145,15 @@ function launch_processes_ssh(cluster_features, _::Type{<:ManagerWorkers}, ips)
     threadlevel = get(cluster_features, :threadlevel, default_threadlevel(cluster_provider))
     mpiflags = get(cluster_features, :mpiflags, default_mpiflags(cluster_provider)) |> x -> Cmd(convert(Vector{String}, split(x)))
 
-    #= FOR DEBUGGING
-    @info "user_id=$user_id"
+    #= FOR DEBUGGING
+    @info "user=$user"
     @info "sshflags=$sshflags"
     @info "exename_manager=$exename_manager"
     @info "exeflags_manager=$exeflags_manager"
     @info "tunnel=$tunnel"
     @info "directory_manager=$directory_manager"
-    @info "===> $user_id@$(ip_manager[:public_ip])"
-    =#
+    @info "===> $user@$(ip_manager[:public_ip])"
+    =#
 
     master_id = nothing
     ntries = 1
@@ -583,18 +586,18 @@ load_cluster(cluster_handle::Symbol; from = DateTime(0), cluster_type = :AnyClus
 function load_cluster(cluster_handle::String; from = DateTime(0), cluster_type = :AnyCluster)
     result = Dict()
 
-    try
-        configpath = get(ENV,"CLOUD_CLUSTERS_CONFIG", pwd())
-        cluster_file = occursin(r"\s*.cluster", cluster_handle) ? cluster_handle : cluster_handle * ".cluster"
-        cluster_path = joinpath(configpath, cluster_file)
-        contents = TOML.parsefile(cluster_path)
-        timestamp = DateTime(contents["timestamp"])
-        this_cluster_type = contents["type"]
-        cluster_handle = Symbol(contents["name"])
-        if timestamp > from && (cluster_type == :AnyCluster || cluster_type == Symbol(this_cluster_type))
-            cluster_provider = contents["provider"]
-            cluster_provider_type = fetchtype(cluster_provider)
-            this_cluster_type_type = fetchtype(this_cluster_type)
+    configpath = get(ENV,"CLOUD_CLUSTERS_CONFIG", pwd())
+    cluster_file = endswith(cluster_handle, ".cluster") ? cluster_handle : cluster_handle * ".cluster"
+    cluster_path = joinpath(configpath, cluster_file)
+    contents = TOML.parsefile(cluster_path)
+    timestamp = DateTime(contents["timestamp"])
+    this_cluster_type = contents["type"]
+    cluster_handle = Symbol(contents["name"])
+    if timestamp > from && (cluster_type == :AnyCluster || cluster_type == Symbol(this_cluster_type))
+        cluster_provider = contents["provider"]
+        cluster_provider_type = fetchtype(cluster_provider)
+        this_cluster_type_type = fetchtype(this_cluster_type)
+        try
             cluster_features = cluster_load(cluster_provider_type, this_cluster_type_type, cluster_handle, contents)
             if !isnothing(cluster_features)
                 @info "$this_cluster_type $cluster_handle, created at $timestamp on $cluster_provider"
@@ -604,16 +607,26 @@ function load_cluster(cluster_handle::String; from = DateTime(0), cluster_type =
                 result[:timestamp] = timestamp
                 result[:features] = cluster_features
             else
+                cluster_delete(cluster_provider_type, cluster_handle)
                 @warn "$this_cluster_type cluster $cluster_handle is not accessible"
             end
+        catch e
+            save_exception_details()
+            cluster_delete(cluster_provider_type, cluster_handle)
+            @warn "cluster $cluster_handle not found"
         end
-    catch e
-        save_exception_details()
-        @warn "cluster $cluster_handle not found"
     end
 
     return result
 end
 
 function cluster_features(cluster_handle)
     Dict(cluster_deploy_info[cluster_handle][:features])
-end
\ No newline at end of file
+end
+
+function cluster_status(cluster_handle)
+    check_cluster_handle(cluster_handle)
+    cluster_features = cluster_deploy_info[cluster_handle][:features]
+    cluster_provider = cluster_features[:node_provider]
+    cluster_status(cluster_provider, cluster_handle)
+end
+
diff --git a/src/macros.jl b/src/macros.jl
index 117d9ae..4aeba2e 100644
--- a/src/macros.jl
+++ b/src/macros.jl
@@ -151,4 +151,16 @@ macro nodes(cluster_handle)
 end
 
 macro status(cluster_handle)
+    call = Expr(:call, cluster_status, cluster_handle)
+    esc(call)
+end
+
+macro config(provider_type)
+    call = Expr(:call, cluster_config, provider_type)
+    esc(call)
 end
+
+macro providers()
+    call = Expr(:call, cluster_providers)
+    esc(call)
+end
\ No newline at end of file
diff --git a/src/utils.jl b/src/utils.jl
index 770d9ea..b5b4a11 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -44,4 +44,6 @@ function show_exceptions()
         println(stdout)
     end
 
-end
\ No newline at end of file
+end
+
+# Return the substring after the last '.' in `file`, or "" if there is no '.'.
+file_extension(file::String) = (i = findlast(==('.'), file); isnothing(i) ? "" : file[i+1:end])
\ No newline at end of file

From 6d79f276ddf32a6b8ee6c900d239a1fb977cec49 Mon Sep 17 00:00:00 2001
From: Francisco Heron de Carvalho Junior
Date: Thu, 6 Feb 2025 11:03:07 -0300
Subject: [PATCH 5/5] Version 0.1.3

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 3d3d8ef..2aab40b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "CloudClusters"
 uuid = "4ca6f12b-c8f1-4945-b50f-6bb73234c039"
 authors = ["Francisco Heron de Carvalho Junior e João Marcelo Uchôa de Alencar "]
-version = "0.2.0"
+version = "0.1.3"

 [deps]
 AWS = "fbe9abb3-538b-5e4e-ba9e-bc94f4f92ebc"