vignettes/custom_tabular_online.Rmd
This vignette emulates the blog post "Train and deploy ML models with R and plumber on Vertex AI" (Google Cloud Blog) and the related notebook get_started_vertex_training_r_using_r_kernel.ipynb from the GoogleCloudPlatform/vertex-ai-samples repository.
Core concept: create a TrainingPipeline that runs a CustomJob and imports the resulting artifacts as a Model.
Run the following chunk to install googleCloudVertexAIR and the other R packages required to complete this tutorial (checking first whether each is installed, and installing only those that are missing):
required_packages <- c("remotes", "googleAuthR", "withr", "glue", "jsonlite")
missing_packages <- required_packages[
  !(required_packages %in% installed.packages()[, "Package"])
]
if (length(missing_packages)) install.packages(missing_packages)
# remotes::install_github("justinjm/googleCloudVertexAIR") # run first time
# options(googleAuthR.verbose = 0) # set when debugging
library(withr) # for gcloud sdk: https://stackoverflow.com/a/64381848/1812363
library(glue)
sh <- function(cmd, args = c(), intern = FALSE) {
  with_path(path.expand("~/google-cloud-sdk/bin/"), {
    if (is.null(args)) {
      # split a single glue-templated command string into command + args
      cmd <- glue(cmd)
      s <- strsplit(cmd, " ")[[1]]
      cmd <- s[1]
      args <- s[2:length(s)]
    }
    ret <- system2(cmd, args, stdout = TRUE, stderr = TRUE)
    if ("errmsg" %in% names(attributes(ret))) cat(attr(ret, "errmsg"), "\n")
    if (intern) return(ret) else cat(paste(ret, collapse = "\n"))
  })
}
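A quick smoke test of the helper (optional; assumes the Google Cloud SDK is installed at the path above):
# print the installed gcloud version via the sh() helper
sh("gcloud --version")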
Create a file called .Renviron in your project’s working directory and set the following environment variables:
GAR_SERVICE_JSON - path to the service account keyfile (JSON) you downloaded and copied earlier
GCVA_DEFAULT_PROJECT_ID - string of the GCP project ID you configured earlier
GCVA_DEFAULT_REGION - region of your GCP resources; one of "us-central1" or "eu"
For example, your .Renviron should look like:
# .Renviron
GAR_SERVICE_JSON="/Users/me/auth/auth.json"
GCVA_DEFAULT_PROJECT_ID="my-project"
GCVA_DEFAULT_REGION="us-central1"
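After restarting your R session so the file is read, you can confirm the values are visible (optional check):
# confirm the .Renviron values are loaded into the session
Sys.getenv(c("GAR_SERVICE_JSON", "GCVA_DEFAULT_PROJECT_ID", "GCVA_DEFAULT_REGION"))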
library(googleAuthR)
library(googleCloudVertexAIR)
options(googleAuthR.scopes.selected = "https://www.googleapis.com/auth/cloud-platform")
gar_auth_service(json_file = Sys.getenv("GAR_SERVICE_JSON"))
gcva_region_set(region = "us-central1")
## 2024-07-08 12:35:05.391413> Region set to 'us-central1'
gcva_project_set(projectId = Sys.getenv("GCVA_DEFAULT_PROJECT_ID"))
## 2024-07-08 12:35:05.391977> ProjectId set to 'gc-vertex-ai-r'
## [1] "timestamp: 20240708123505"
datasetDisplayName <- sprintf("california-housing-%s", timestamp)
paste0("datasetDisplayName: ", datasetDisplayName)
## [1] "datasetDisplayName: california-housing-20240708123505"
bucket_name <- paste0(gcva_project_get(), "-aip-", timestamp)
bucket_uri <- paste0("gs://", bucket_name)
paste0("bucket_name: ", bucket_name)
## [1] "bucket_name: gc-vertex-ai-r-aip-20240708123505"
paste0("bucket_uri: ", bucket_uri)
## [1] "bucket_uri: gs://gc-vertex-ai-r-aip-20240708123505"
private_repo <- "my-docker-repo"
image_name <- "vertex-r"
image_tag <- "latest"
image_uri <- glue(
"{gcva_region_get()}-docker.pkg.dev/{gcva_project_get()}/{private_repo}/{image_name}:{image_tag}"
)
image_uri
## us-central1-docker.pkg.dev/gc-vertex-ai-r/my-docker-repo/vertex-r:latest
sh("gcloud config set project {gcva_project_get()}")
## Updated property [core/project].
sh("gsutil mb -l {gcva_region_get()} -p {gcva_project_get()} {bucket_uri}")
## Creating gs://gc-vertex-ai-r-aip-20240708123505/...
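To confirm the bucket was created (optional check):
# list bucket metadata to verify creation
sh("gsutil ls -b {bucket_uri}")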
Set the newly created bucket as a global argument:
gcva_bucket_set(bucket_uri)
## 2024-07-08 12:35:07.513878> bucket set to 'gs://gc-vertex-ai-r-aip-20240708123505'
Your first step is to create your own Docker repository in Google Artifact Registry.
Run the gcloud artifacts repositories create command to create a new Docker repository in your region with the description “docker repository”.
Then run the gcloud artifacts repositories list command to verify that your repository was created.
sh('gcloud artifacts repositories create {private_repo} --repository-format=docker --location={gcva_region_get()} --description="Docker repository"')
sh("gcloud artifacts repositories list")
## Listing items under project gc-vertex-ai-r, across all locations.
##
## ARTIFACT_REGISTRY
## REPOSITORY FORMAT MODE DESCRIPTION LOCATION LABELS ENCRYPTION CREATE_TIME UPDATE_TIME SIZE (MB)
## patient-summary DOCKER STANDARD_REPOSITORY Docker repository for patient summary demo app us-central1 Google-managed key 2023-07-15T14:44:05 2023-07-15T15:54:42 672.668
Before you push or pull container images, configure Docker to use the gcloud command-line tool to authenticate requests to Artifact Registry for your region.
sh("gcloud auth configure-docker {gcva_project_get()}-docker.pkg.dev --quiet")
## WARNING: `docker` not in system PATH.
## `docker` and `docker-credential-gcloud` need to be in the same PATH in order to work correctly together.
## gcloud's Docker credential helper can be configured but it will not work until this is corrected.
## WARNING: Your config file at [/Users/justin/.docker/config.json] contains these credential helper entries:
##
## {
## "credHelpers": {
## "us-central1-docker.pkg.dev": "gcloud"
## }
## }
## gcloud credential helpers already registered correctly.
dir.create("src", showWarnings=FALSE)
src/Dockerfile
cat(
"FROM gcr.io/deeplearning-platform-release/r-cpu.4-1:latest
WORKDIR /root
COPY train.R /root/train.R
COPY serve.R /root/serve.R
# Install Fortran
RUN apt-get update
RUN apt-get install gfortran -yy
# Install R packages
RUN Rscript -e \"install.packages('plumber')\"
RUN Rscript -e \"install.packages('randomForest')\"
EXPOSE 8080
", file = "src/Dockerfile")
src/train.R
cat(
'#!/usr/bin/env Rscript
library(tidyverse)
library(data.table)
library(randomForest)
Sys.getenv()
# The GCP Project ID
project_id <- Sys.getenv("CLOUD_ML_PROJECT_ID")
# The GCP Region
location <- Sys.getenv("CLOUD_ML_REGION")
# The Cloud Storage URI to upload the trained model artifact to
model_dir <- Sys.getenv("AIP_MODEL_DIR")
# Next, create directories to download the training, validation, and test sets into.
dir.create("training")
dir.create("validation")
dir.create("test")
# Download the Vertex AI managed datasets into the local container environment.
system2("gsutil", c("cp", Sys.getenv("AIP_TRAINING_DATA_URI"), "training/"))
system2("gsutil", c("cp", Sys.getenv("AIP_VALIDATION_DATA_URI"), "validation/"))
system2("gsutil", c("cp", Sys.getenv("AIP_TEST_DATA_URI"), "test/"))
# Each dataset may arrive as one or more CSV files; read them all into a single data frame.
training_df <- list.files("training", full.names = TRUE) %>% map_df(~fread(.))
validation_df <- list.files("validation", full.names = TRUE) %>% map_df(~fread(.))
test_df <- list.files("test", full.names = TRUE) %>% map_df(~fread(.))
print("Starting Model Training")
rf <- randomForest(median_house_value ~ ., data=training_df, ntree=100)
rf
saveRDS(rf, "rf.rds")
system2("gsutil", c("cp", "rf.rds", model_dir))
', file = "src/train.R")
src/serve.R
cat(
'#!/usr/bin/env Rscript
Sys.getenv()
library(plumber)
system2("gsutil", c("cp", "-r", Sys.getenv("AIP_STORAGE_URI"), "."))
system("du -a .")
rf <- readRDS("artifacts/rf.rds")
library(randomForest)
predict_route <- function(req, res) {
  print("Handling prediction request")
  df <- as.data.frame(req$body$instances)
  preds <- predict(rf, df)
  return(list(predictions = preds))
}

print("Starting Serving")

pr() %>%
  pr_get(Sys.getenv("AIP_HEALTH_ROUTE"), function() "OK") %>%
  pr_post(Sys.getenv("AIP_PREDICT_ROUTE"), predict_route) %>%
  pr_run(host = "0.0.0.0", port = as.integer(Sys.getenv("AIP_HTTP_PORT", 8080)))
', file = "src/serve.R")
sh("gcloud services enable artifactregistry.googleapis.com")
sh("cd src && gcloud builds submit --region={gcva_region_get()} --tag={image_uri} --timeout=1h")
data_uri <- "gs://cloud-samples-data/ai-platform-unified/datasets/tabular/california-housing-tabular-regression.csv"
dataset <- gcva_create_tabluar_dataset(
displayName = datasetDisplayName,
gcsSource = data_uri)
## 2024-07-08 12:35:14.20546> Waiting 2 seconds...
dataset
## ==Google Cloud Vertex AI Dataset==
## name: projects/442003009360/locations/us-central1/datasets/3846820650169663488
## displayName: california-housing-20240708123505
## createTime: 2024-07-08 16:35:12
## gcsSource: gs://cloud-samples-data/ai-platform-unified/datasets/tabular/california-housing-tabular-regression.csv
job <- gcva_custom_container_training_job(
displayName = image_name,
containerUri = image_uri,
command = c("Rscript", "train.R"),
modelServingContainerCommand = c("Rscript", "serve.R"),
modelServingContainerImageUri = image_uri
)
job
## ==Google Cloud Vertex AI Custom Container Training Job================
## {
## "displayName": "vertex-r",
## "inputDataConfig": {
## "datasetId": "",
## "gcsDestination": {
## "outputUriPrefix": "gs://gc-vertex-ai-r-aip-20240708123505"
## }
## },
## "trainingTaskDefinition": "gs://google-cloud-aiplatform/schema/trainingjob/definition/custom_task_1.0.0.yaml",
## "trainingTaskInputs": {
## "workerPoolSpecs": [
## {
## "containerSpec": {
## "imageUri": "us-central1-docker.pkg.dev/gc-vertex-ai-r/my-docker-repo/vertex-r:latest",
## "command": ["Rscript", "train.R"]
## }
## }
## ],
## "baseOutputDirectory": {
## "outputUriPrefix": ""
## }
## },
## "modelToUpload": {
## "containerSpec": {
## "imageUri": "us-central1-docker.pkg.dev/gc-vertex-ai-r/my-docker-repo/vertex-r:latest",
## "command": ["Rscript", "serve.R"]
## }
## }
## }
## ====================================================================
gcva_bucket_set(bucket_uri) # set here during testing only
model <- gcva_run_job(
job = job,
dataset = dataset,
modelDisplayName = "vertex-r-model",
machineType = "n1-standard-4"
)
model
## 2023-01-20 17:37:59> bucket set to 'gs://gc-vertex-ai-r-aip-20230120172747'
## 2023-01-20 17:38:00> view training: https://console.cloud.google.com/vertex-ai/locations/us-central1/training/947230504279605248?project=gc-vertex-ai-r
## 2023-01-20 17:38:00> pipeline state: PIPELINE_STATE_RUNNING
## 2023-01-20 17:43:00> view training: https://console.cloud.google.com/vertex-ai/locations/us-central1/training/947230504279605248?project=gc-vertex-ai-r
## 2023-01-20 17:43:00> pipeline state: PIPELINE_STATE_RUNNING
## 2023-01-20 17:48:01> view training: https://console.cloud.google.com/vertex-ai/locations/us-central1/training/947230504279605248?project=gc-vertex-ai-r
## 2023-01-20 17:48:01> pipeline state: PIPELINE_STATE_SUCCEEDED
## ==Google Cloud Vertex AI TrainingPipeline Job==
## console: https://console.cloud.google.com/vertex-ai/locations/us-central1/training/947230504279605248?project=442003009360
## state: PIPELINE_STATE_SUCCEEDED
Doc: Overview of getting predictions on Vertex AI (Google Cloud). API reference: https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.endpoints/create
endpoint <- gcva_create_endpoint(
displayName = "CaliforniaHousingEndpoint"
)
endpoint
# optionally retrieve a previously uploaded model by resource name (used here during testing)
model <- gcva_model(model = Sys.getenv("GCVA_TEST_MODEL_NAME_CUSTOM"))
gcva_deploy(
model = model,
endpoint = endpoint,
machineType = "n1-standard-4"
)
Now we will test the deployed model by performing an online prediction.
First, we create a sample set of input data from the original source, selecting the first 5 rows for ease of demonstration:
library(jsonlite)
## create example data for prediction request
data_raw <- read.csv(text=sh("gsutil cat {data_uri}", intern = TRUE))
## convert all to string values to build correct prediction request
data <- as.data.frame(lapply(data_raw, as.character))
## build list of 5 examples/rows only for testing
instances <- list(instances=head(data[, names(data) != "median_house_value"], 5))
## convert to required format of JSON
json_instances <- toJSON(instances, auto_unbox = TRUE)
json_instances
## {"instances":[{"longitude":"-122.23","latitude":"37.88","housing_median_age":"41","total_rooms":"880","total_bedrooms":"129","population":"322","households":"126","median_income":"8.3252"},{"longitude":"-122.22","latitude":"37.86","housing_median_age":"21","total_rooms":"7099","total_bedrooms":"1106","population":"2401","households":"1138","median_income":"8.3014"},{"longitude":"-122.24","latitude":"37.85","housing_median_age":"52","total_rooms":"1467","total_bedrooms":"190","population":"496","households":"177","median_income":"7.2574"},{"longitude":"-122.25","latitude":"37.85","housing_median_age":"52","total_rooms":"1274","total_bedrooms":"235","population":"558","households":"219","median_income":"5.6431"},{"longitude":"-122.25","latitude":"37.85","housing_median_age":"52","total_rooms":"1627","total_bedrooms":"280","population":"565","households":"259","median_income":"3.8462"}]}
gcva_predict(
endpoint = endpoint,
instances = json_instances
)
gcva_delete_endpoint(endpoint = endpoint)
gcva_delete_job()
gcva_delete_dataset(displayName = datasetDisplayName)
## 2024-07-08 12:35:16.531893> Dataset successfully deleted.
Local directory:
unlink("src", recursive = TRUE) # delete the local directory 'src'
GCS bucket:
delete_bucket <- TRUE
if (delete_bucket == TRUE) sh("gsutil -m rm -r {bucket_uri}")
## If you experience problems with multiprocessing on MacOS, they might be related to https://bugs.python.org/issue33725. You can disable multiprocessing by editing your .boto config or by adding the following flag to your command: `-o "GSUtil:parallel_process_count=1"`. Note that multithreading is still available even if you disable multiprocessing.
##
## Removing gs://gc-vertex-ai-r-aip-20240708123505/...