diff --git a/handwritten_digit_recog/Makefile b/handwritten_digit_recog/Makefile new file mode 100644 index 0000000..53181bc --- /dev/null +++ b/handwritten_digit_recog/Makefile @@ -0,0 +1,16 @@ +MAKE := $(MAKE) -e +DOCKER_BUILD_CMD = docker build + +all: + $(MAKE) train serve website + +train: + $(DOCKER_BUILD_CMD) --build-arg stage=train -f docker/Dockerfile.$(TYPE) -t $(REGISTRY)/dlrs-train-$(TYPE) . + +serve: + $(DOCKER_BUILD_CMD) --build-arg stage=serve -f docker/Dockerfile.$(TYPE) -t $(REGISTRY)/dlrs-serve-$(TYPE) . + +website: + $(DOCKER_BUILD_CMD) -f docker/Dockerfile.website -t $(REGISTRY)/dlrs-website . + +.PHONY: all train serve website diff --git a/handwritten_digit_recog/README.md b/handwritten_digit_recog/README.md new file mode 100644 index 0000000..5253bc6 --- /dev/null +++ b/handwritten_digit_recog/README.md @@ -0,0 +1,50 @@ +# Handwritten Digit Recognition with DLRS + +The content in this directory is for running a handwritten digit recognition example using the Deep Learning Reference Stack, Pytorch and MNIST. + +#### Pre-requisites + +* Docker + +## Running on containers + +Please follow these instructions to train the model and classify random handwritten digits on DLRS based Docker containers. + +### Set up + +Set TYPE and REGISTRY env variables +TYPE options: mkl or oss +REGISTRY options: registry name + +```bash +export TYPE= +export REGISTRY= +make +``` + +### Train + +```bash +mkdir models +docker run --rm -ti -v ${PWD}/models:/workdir/models $REGISTRY/dlrs-train-$TYPE:latest "-s train" +``` + +### Serving the model for live classification + +```bash +docker run -p 5059:5059 -it -v ${PWD}/models:/workdir/models $REGISTRY/dlrs-serve-$TYPE:latest "-s serve" +curl -i -X POST -d 'Classify' http://localhost:5059/digit_recognition/classify +``` + +### Website + +We have created a simple website template for you to interact with. 
+ +```bash +docker run --rm -p 8080:5000 -it $REGISTRY/dlrs-website:latest --website_endpoint 0.0.0.0 +Open localhost:8080 on a web browser +``` + +## Running on Kubeflow pipelines + +We have created a Kubeflow Pipeline to run this example. Please go to [Kubeflow Pipelines](https://github.com/intel/stacks-usecase/kubeflow/pipelines) for more details. diff --git a/handwritten_digit_recog/docker/Dockerfile.mkl b/handwritten_digit_recog/docker/Dockerfile.mkl new file mode 100644 index 0000000..7db4f62 --- /dev/null +++ b/handwritten_digit_recog/docker/Dockerfile.mkl @@ -0,0 +1,15 @@ +FROM clearlinux/stacks-pytorch-mkl:v0.5.0 + +ARG stage + +ENV PATH=$PATH:/opt/conda/bin/ \ + LD_LIBRARY_PATH=/usr/lib64:/opt/conda/lib \ + STAGE=$stage + +WORKDIR /workdir +COPY python/ python/ +COPY scripts/entrypoint.sh . + +RUN chmod +x entrypoint.sh + +ENTRYPOINT ["./entrypoint.sh"] diff --git a/handwritten_digit_recog/docker/Dockerfile.oss b/handwritten_digit_recog/docker/Dockerfile.oss new file mode 100644 index 0000000..f777075 --- /dev/null +++ b/handwritten_digit_recog/docker/Dockerfile.oss @@ -0,0 +1,14 @@ +FROM clearlinux/stacks-pytorch-oss:v0.5.0 + +ARG stage +ENV STAGE=$stage + +WORKDIR /workdir +COPY python/ python/ +COPY scripts/entrypoint.sh . 
# Load the pre-trained model onto the CPU.
model_path = "/workdir/models/mnist_cnn.pt"
device = torch.device("cpu")
model = Net().to(device)
# map_location makes the checkpoint load on this CPU-only service regardless
# of the device it was saved from.
model.load_state_dict(torch.load(model_path, map_location=device))
# This module only runs inference: eval mode turns the dropout layers off,
# otherwise they keep randomly zeroing activations and perturb predictions.
model.eval()

# Use a transform to normalize data (same as in training)
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
)

# Load the MNIST *test* split (train=False).
# NOTE: despite its name, ``test_loader`` is the Dataset; ``data_loader`` is
# the DataLoader that yields shuffled batches from it.
# ``kwargs`` is kept for compatibility; it is not passed to the DataLoader.
kwargs = {"num_workers": 1, "pin_memory": True}
test_loader = datasets.MNIST("data", download=True, train=False, transform=transform)
data_loader = torch.utils.data.DataLoader(test_loader, batch_size=64, shuffle=True)


def img_show(img, ps, probab):
    """Plot the classified image next to a bar chart of class probabilities.

    Args:
        img: image tensor; it is resized in place to (1, 28, 28) for display.
        ps: tensor of per-class probabilities (plotted as 10 bars).
        probab: the same probabilities as a Python list; the index of its
            maximum is shown as the predicted digit.
    """
    ps = ps.data.numpy().squeeze()
    fig, (ax1, ax2) = plt.subplots(figsize=(5, 3), ncols=2)
    ax1.imshow(img.resize_(1, 28, 28).numpy().squeeze(), cmap="gray_r")
    ax1.axis("off")
    ax1.set_title("Random Test Image")
    ax1.text(
        5,
        30,
        "Predicted value: %s" % probab.index(max(probab)),
        fontsize=10,
        bbox={"facecolor": "cornsilk", "boxstyle": "round", "alpha": 0.5},
    )
    ax2.barh(np.arange(10), ps, color="gold")
    ax2.set_aspect(0.1)
    ax2.set_yticks(np.arange(10))
    ax2.set_yticklabels(np.arange(10))
    ax2.set_title("Probability Chart")
    ax2.set_xlim(0, 1.1)
    plt.grid(True)
    plt.tight_layout()
    plt.show()


# Function for classifying random handwritten numbers from the MNIST dataset
def classify(imgshow=False):
    """Classify one random image from the MNIST test split.

    Args:
        imgshow: when true, also display the image and probability chart.

    Returns:
        A tuple ``(img, probab)``: the image as a (1, 1, 28, 28) tensor and
        the list of class probabilities (exp of the model's log-softmax
        output for that image).
    """
    # The loader shuffles, so the first image of a fresh batch is random.
    images, labels = next(iter(data_loader))
    img = images[0].view(1, 1, 28, 28)
    with torch.no_grad():
        logps = model(img)
    # The model returns log-probabilities; exponentiate to get probabilities.
    ps = torch.exp(logps)
    probab = list(ps.numpy()[0])
    if imgshow:
        img_show(img, ps, probab)
    return img, probab


if __name__ == "__main__":
    classify(imgshow=True)
app = flask.Flask("Handwritten Digit Recognition")

# Self-describing payload served by the index route.
banner = {
    "what": "Handwritten Digit Recognition",
    "usage": {
        "Client": "curl -i -X POST -d 'Classify' http://localhost:5059/digit_recognition/classify",
        "Server": "docker run -d -p 5059:5059 stacks_handwritten_digit_recog",
    },
}


@app.route("/digit_recognition/", methods=["GET"])
def index():
    """Return the usage banner as JSON with status 201."""
    payload = flask.jsonify(banner)
    return payload, 201


@app.route("/digit_recognition/classify", methods=["POST"])
def digit_recog():
    """Classify a random test digit and return it as ``{"Prediction": n}``.

    The request body is not read; ``classify`` picks the image itself.
    """
    _, probabilities = classify(imgshow=False)
    # The predicted digit is the position of the highest probability.
    highest = max(probabilities)
    predicted_digit = probabilities.index(highest)
    return flask.jsonify({"Prediction": predicted_digit}), 201


@app.errorhandler(404)
def not_found(error):
    """Answer unknown routes with a JSON error body and status 404."""
    body = flask.jsonify({"error": "Not found"})
    return flask.make_response(body, 404)


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5059)
class Net(nn.Module):
    """Small CNN for 28x28 single-channel MNIST digits.

    Two 3x3 convolutions, a 2x2 max-pool, then two fully connected layers.
    ``forward`` returns log-probabilities over the 10 digit classes.
    """

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        # Channel-wise dropout on the 4-D feature maps.
        self.dropout1 = nn.Dropout2d(0.25)
        # Applied to the 2-D (batch, 128) output of fc1. Dropout2d on 2-D
        # input is deprecated in PyTorch; plain Dropout is the documented
        # equivalent. Dropout has no parameters, so saved checkpoints
        # (state_dict keys) are unaffected.
        self.dropout2 = nn.Dropout(0.5)
        # 9216 = 64 channels * 12 * 12 after the two convs and the pool.
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return log-softmax class scores of shape (batch, 10) for ``x``."""
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output
def test(args, model, device, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print loss and accuracy.

    Args:
        args: parsed command-line arguments (not read by this function).
        model: the network to evaluate; it is switched to eval mode.
        device: device the batches are moved to.
        test_loader: iterable of ``(data, target)`` batches exposing
            ``.dataset`` for the total sample count.
    """
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(
                output, target, reduction="sum"
            ).item()  # sum up batch loss
            pred = output.argmax(
                dim=1, keepdim=True
            )  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    # Losses were summed per sample, so divide by the dataset size.
    test_loss /= len(test_loader.dataset)
    print(
        "\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format(
            test_loss,
            correct,
            len(test_loader.dataset),
            100.0 * correct / len(test_loader.dataset),
        )
    )


def main():
    """Parse the CLI options, train on MNIST, evaluate, and save the weights.

    The trained ``state_dict`` is written to ``/workdir/models/mnist_cnn.pt``.
    """
    # Local import: the original called an undefined ``cpu_count()``.
    import os

    # Training settings
    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument(
        "--batch-size",
        type=int,
        default=64,
        metavar="N",
        help="input batch size for training (default: 64)",
    )
    parser.add_argument(
        "--test-batch-size",
        type=int,
        default=1000,
        metavar="N",
        help="input batch size for testing (default: 1000)",
    )
    parser.add_argument(
        "--epochs",
        type=int,
        default=1,
        metavar="N",
        help="number of epochs to train (default: 1)",
    )
    parser.add_argument(
        "--lr", type=float, default=0.01, metavar="LR", help="learning rate (default: 0.01)"
    )
    parser.add_argument(
        "--gamma",
        type=float,
        default=0.7,
        metavar="M",
        help="Learning rate step gamma (default: 0.7)",
    )
    parser.add_argument(
        "--seed", type=int, default=1, metavar="S", help="random seed (default: 1)"
    )
    parser.add_argument(
        "--log-interval",
        type=int,
        default=10,
        metavar="N",
        help="how many batches to wait before logging training status",
    )

    args = parser.parse_args()

    torch.manual_seed(args.seed)

    # Use half of the available CPUs as loader workers. os.cpu_count() may
    # return None; a result of 0 makes the DataLoader load in-process.
    num_workers = (os.cpu_count() or 1) // 2
    kwargs = {"num_workers": num_workers, "pin_memory": True}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST("data", train=True, download=True, transform=transform),
        batch_size=args.batch_size,
        shuffle=True,
        **kwargs,
    )
    # The train loader above has already downloaded the dataset into "data".
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST("data", train=False, transform=transform),
        batch_size=args.test_batch_size,
        shuffle=True,
        **kwargs,
    )

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    # Multiply the learning rate by gamma after every epoch.
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(args, model, device, test_loader)
        scheduler.step()

    # Make sure the output directory exists so a finished training run is not
    # lost when the script is started outside the container entrypoint.
    model_dir = "/workdir/models"
    os.makedirs(model_dir, exist_ok=True)
    torch.save(model.state_dict(), model_dir + "/mnist_cnn.pt")


if __name__ == "__main__":
    main()
case $opt in + b) + bucket=$OPTARG + ;; + c) + cloud=true + ;; + s) + stage=$OPTARG + ;; + t) + token=$OPTARG + ;; + h) + usage + ;; + \?) + echo "Invalid option: -$OPTARG" >&2 + usage + ;; + :) + echo "Option -$OPTARG requires an argument." >&2 + usage + ;; + esac +done + +if [[ ! -d "/workdir/models/" ]]; then + mkdir /workdir/models/ +fi + +cd python/ + +if [[ ${cloud} ]]; then + echo "Deploying ${stage} stage on cloud mode" + if [[ ${stage} == "serve" ]]; then + fetch_model && python ${stage}.py + elif [[ ${stage} == "train" ]]; then + python ${stage}.py && upload_model + fi +else + echo "Deploying ${stage} stage" + python ${stage}.py +fi diff --git a/handwritten_digit_recog/website/app.py b/handwritten_digit_recog/website/app.py new file mode 100644 index 0000000..b6028e5 --- /dev/null +++ b/handwritten_digit_recog/website/app.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python +# +# Copyright (c) 2019 Intel Corporation +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
app = Flask("Website for Pytorch MNIST handwritten digit recognition example")
parser = argparse.ArgumentParser()
parser.add_argument(
    "-ip",
    "--endpoint",
    required=False,
    default="localhost",
    help="Endpoint address of the inference API",
)
parser.add_argument(
    "-p",
    "--port",
    required=False,
    default="5059",
    help="Endpoint port of the inference API",
)
parser.add_argument(
    "-wip",
    "--website_endpoint",
    required=False,
    default="localhost",
    help="Endpoint address for the website",
)
parser.add_argument(
    "-wp",
    "--website_port",
    required=False,
    default="5000",
    help="Endpoint port for the website",
)
parser.add_argument(
    "-s",
    "--tls",
    required=False,
    action="store_true",
    default=False,
    help="Is the inference endpoint encrypted with TLS ",
)
# Debug mode is opt-in: the Flask/Werkzeug debugger lets a browser execute
# arbitrary Python, and this site is documented to listen on 0.0.0.0.
parser.add_argument(
    "--debug",
    required=False,
    action="store_true",
    default=False,
    help="Run the website in Flask debug mode (local development only)",
)
args = vars(parser.parse_args())


@app.route("/", methods=["GET", "POST"])
def homepage():
    """Render the home page; on POST, ask the inference API for a prediction.

    A POST forwards a classify request to
    ``<protocol>://<endpoint>:<port>/digit_recognition/classify`` and renders
    the returned ``"Prediction"`` value into the page.
    """
    args["protocol"] = "https" if args["tls"] else "http"
    if request.method == "POST":
        response = requests.post(
            "{}://{}:{}/digit_recognition/classify".format(
                args["protocol"], args["endpoint"], args["port"]
            ),
            json={"Prediction": "Classify"},
            # requests waits forever by default; fail the page instead of
            # hanging the worker when the inference API is unreachable.
            timeout=10,
        )
        prediction_json = response.json()
        prediction = prediction_json["Prediction"]
        return render_template("homepage.html", optText=prediction,)
    return render_template("homepage.html", optText=None)


if __name__ == "__main__":
    app.run(
        debug=args["debug"],
        host="{}".format(args["website_endpoint"]),
        port=int(args["website_port"]),
    )
margin: auto; + text-align: center; + font-size: 24pt; + font-weight: bold; +} + +div#author +{ + margin: 0 auto; + text-align: center; + font-size: 10pt; +} + +.fill-text +{ +margin: auto; + text-align: justify; + font-size: 12pt; +} diff --git a/handwritten_digit_recog/website/templates/homepage.html b/handwritten_digit_recog/website/templates/homepage.html new file mode 100644 index 0000000..f28b084 --- /dev/null +++ b/handwritten_digit_recog/website/templates/homepage.html @@ -0,0 +1,25 @@ + + + + + + Handwritten Digit Recognition with Intel's DLRS +
Handwritten Digit Recognition with Intel's DLRS
+
Interactive handwritten digit recognition using PyTorch, running on an Intel Deep Learning Reference Stack (DLRS) backend. See clearlinux.org for more information.
+ + + +
+ +
+

+ +

+ +
+ +
+ +
+ + diff --git a/kubeflow/README.md b/kubeflow/README.md new file mode 100644 index 0000000..61b0f40 --- /dev/null +++ b/kubeflow/README.md @@ -0,0 +1,77 @@ +# Kubeflow Specific Files + +This folder is home for kubeflow specific files to enable DLRS images with various workloads that exist as part of kubeflow. Long term plan would be to upstream some of these to kubeflow. + +# Getting started with Kubeflow + +>IMPORTANT: For troubleshooting, please refer to Kubeflow [documentation](https://www.kubeflow.org/docs/started/k8s/kfctl-k8s-istio/). + +#### Pre-requisites: + +* A running Kubernetes cluster + +Please refer to: [Run Kubernetes on Clear Linux OS](https://clearlinux.org/documentation/clear-linux/tutorials/kubernetes) + +## Deploying Kubeflow with kfctl/kustomize + +1. Get kfctl tarball, untar and add to your PATH if necessary + +```bash +KFCTL_PATH="path_desired_to_kfctl" +kfctl_ver=v0.7.0 +KFCTL_URL="https://github.com/kubeflow/kubeflow/releases/download/${kfctl_ver}/kfctl_${kfctl_ver}_linux.tar.gz" +wget -P ${KFCTL_PATH} ${KFCTL_URL} +tar -C ${KFCTL_PATH} -xvf ${KFCTL_PATH}/kfctl_${kfctl_ver}_linux.tar.gz +export PATH=$PATH:${KFCTL_PATH} +``` + +2. Install Kubeflow resources +```bash +# Env variables needed for your deployment +export KF_NAME= +export BASE_DIR= +export KF_DIR=${BASE_DIR}/${KF_NAME} +export CONFIG_URI="https://raw.githubusercontent.com/kubeflow/manifests/v0.7-branch/kfdef/kfctl_k8s_istio.0.7.0.yaml" +``` + +3. Set up and deploy Kubeflow +```bash +mkdir -p ${KF_DIR} +cd ${KF_DIR} +kfctl apply -V -f ${CONFIG_URI} +``` + +Deployment takes around 10 minutes (or more depending on the hardware) to be ready to use. After that you can do +```bash +kubectl get pods -n kubeflow +``` +to list all the Kubeflow resources deployed and monitor their status. + + +## Deploying Kubeflow on Google Cloud Platform (GCP) + +Kubeflow has excellent documentation on how to deploy on GCP [here](https://www.kubeflow.org/docs/gke/deploy/deploy-cli/). 
However, the [Deep Learning Reference Stack](https://clearlinux.org/stacks/deep-learning) (DLRS) utilizes hardware advancements that are only available on certain Intel chips (Skylake), and there is no single document explaining how to specify a minimum chip during kubeflow deployment. Those instructions are provided here. + +1. Choose a zone you want to deploy in that has Intel Skylake cpus. Zones are listed [here](https://cloud.google.com/compute/docs/regions-zones/). + +2. Deploy Kubeflow normally as specified [here](https://www.kubeflow.org/docs/gke/deploy/deploy-cli/) but stop at section ["Set up and deploy Kubeflow"](https://www.kubeflow.org/docs/gke/deploy/deploy-cli/#set-up-and-deploy-kubeflow). Instead, navigate to section ["Alternatively, set up your configuration for later deployment"](https://www.kubeflow.org/docs/gke/deploy/deploy-cli/#alternatively-set-up-your-configuration-for-later-deployment). Then follow step 1. + +3. In step 2, you are instructed to edit the configuration files. There are two changes required for a DLRS compatible GCP cluster, and they are detailed below. + +4. Navigate to the gcp_config directory and open the cluster.jinja file. Change the cluster property "minCpuPlatform" from "Intel Broadwell" to "Intel Skylake". Note: you may notice there are two minCpuPlatform properties in the file. One of them is for gpu node pools, and not all cpu/gpu combinations are compatible. Leave the gpu property untouched, and we will disable gpu node pools in the next step. + +5. Open the "cluster-kubeflow.yaml" file and change the "gpu-pool-max-nodes" property to 0. + +6. Follow steps 3-4 of ["Alternatively, set up your configuration for later deployment"](https://www.kubeflow.org/docs/gke/deploy/deploy-cli/#alternatively-set-up-your-configuration-for-later-deployment). + +That's it! You now have a GCP cluster with Intel Skylake cpus. + + +# Kubeflow components + +There is a list of Kubeflow components you can interact with. 
In this repository you will find a set of components using the System Stacks for different workloads: +- TfJobs +- PytorchJobs +- Jupyter Notebooks +- Seldon Model Server +- Pipelines diff --git a/kubeflow/pipelines/pytorch-mnist/README.md b/kubeflow/pipelines/pytorch-mnist/README.md new file mode 100644 index 0000000..55c9766 --- /dev/null +++ b/kubeflow/pipelines/pytorch-mnist/README.md @@ -0,0 +1,90 @@ +# MNIST Kubeflow Pipeline on DLRS + +The following document describes how to run the Pytorch MNIST example on Kubeflow Pipelines on Google Cloud Platform (GCP). + +## Prerequisites + +* A [GCS bucket](https://cloud.google.com/storage/) to hold a Pytorch trained model. +* A Kubernetes deployment on GKE (v1.12.10-gke.20) +* A Kubeflow deployment on GKE (v0.6.1) +* Docker +* All images in the [Pytorch MNIST example](https://github.com/intel/stacks-usecase/tree/master/handwritten_digit_recog) already built and available in an accessible registry + +## Host set up + +Kubeflow Pipelines must be compiled into a domain-specific language (DSL). Please follow these steps to set up the machine where you will be compiling all pipelines. + +The following commands will set up a virtual environment for Kubeflow Pipelines to work. + +```bash +# In case you don't have conda already + +wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh +bash Miniconda3-latest-Linux-x86_64.sh + +# Create and activate the mlpipeline virtual environment +conda create --name mlpipeline python=3.7 +conda activate mlpipeline + +# Install Kubeflow Pipelines SDK and other dependencies +wget https://raw.githubusercontent.com/kubeflow/examples/master/pipelines/mnist-pipelines/requirements.txt +pip install -r requirements.txt --upgrade +``` + +## Modify files + +All files in this repository have fields that you have to manually edit. 
Before running any `docker build` command make sure you have already modified the following: + +```bash +# Replace REGISTRY and TYPE where needed in handwritten-digit-recog-pipeline.py + image='REGISTRY/dlrs-train-TYPE' +# Replace REGISTRY and TYPE where needed in all yaml files inside manifests/ + image: REGISTRY/dlrs-serve-TYPE +``` + +## Compile the Handwritten Digit Recognition Pipeline + +Pipelines are written in Python, but they need to be compiled into a DSL, after activating the `mlpipeline` virtual environment, run the following command: + +```bash +python3 handwritten-digit-recog-pipeline.py +``` + +This will produce a compiled `handwritten-digit-recog-pipeline.py.tar.gz` file. + +## Running the pipeline + +### Upload the pipeline through Kubeflow UI + +Uploading the compiled pipeline using the UI is fairly easy. Hit the Upload pipeline button and follow the on screen instructions. + +![Upload Pipeline](./img/upload-pipeline.png "Upload Pipeline") + +### Run the Handwritten Digit Recognition Pipeline + +Start a new run and fill all the required fields. +This pipeline depends on two main user definitions: + +* `model-bucket`: The GCS Bucket you already have (see prerequisites) +* `gcloud-access-token`: This is gcloud auth token, you can get it by running `gcloud auth print-access-token` on the Cloud Shell + +![Start Run](./img/start-run.png "Start Run") + +This pipeline is comprised of three stages: + +#### Train + +The train stage will run `train.py`, which will train a Pytorch model using the MNIST dataset for handwritten digit recognition. The recently trained model will be then uploaded to the specified GCS Bucket. + + +#### Serve + +The serve stage will expose an API that receives POST requests and calls `classify.py` for testing random handwritten digits. + +#### Website + +The last stage creates a service hosting a simple Web UI that makes calls to the model server (see previous stage) for testing random handwritten digits. 
+ +After all three stages of the pipeline finish, you can interact with the Web UI by going to the URL specified in the logs of the `web-ui` stage. + +![Web UI](./img/web-ui-url.png "Web UI") diff --git a/kubeflow/pipelines/pytorch-mnist/docker/Dockerfile b/kubeflow/pipelines/pytorch-mnist/docker/Dockerfile new file mode 100644 index 0000000..e701e29 --- /dev/null +++ b/kubeflow/pipelines/pytorch-mnist/docker/Dockerfile @@ -0,0 +1,49 @@ +# Copyright (c) 2019 Intel Corporation +# Copyright 2018 The Kubeflow Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +FROM debian + +RUN apt-get update -q && apt-get upgrade -y && \ + apt-get install -y -qq --no-install-recommends \ + apt-transport-https \ + ca-certificates \ + git \ + gnupg \ + lsb-release \ + unzip \ + wget && \ + wget --no-verbose -O /bin/kubectl \ + https://storage.googleapis.com/kubernetes-release/release/v1.11.2/bin/linux/amd64/kubectl && \ + chmod u+x /bin/kubectl && \ + wget --no-verbose -O /opt/kubernetes_v1.11.2 \ + https://github.com/kubernetes/kubernetes/archive/v1.11.2.tar.gz && \ + mkdir -p /src && \ + tar -C /src -xzf /opt/kubernetes_v1.11.2 && \ + rm -rf /opt/kubernetes_v1.11.2 && \ + wget --no-verbose -O /opt/google-apt-key.gpg \ + https://packages.cloud.google.com/apt/doc/apt-key.gpg && \ + apt-key add /opt/google-apt-key.gpg && \ + export CLOUD_SDK_REPO="cloud-sdk-$(lsb_release -c -s)" && \ + echo "deb https://packages.cloud.google.com/apt $CLOUD_SDK_REPO main" >> \ + /etc/apt/sources.list.d/google-cloud-sdk.list && \ + apt-get update -q && \ + apt-get install -y -qq --no-install-recommends google-cloud-sdk && \ + gcloud config set component_manager/disable_update_check true + +COPY manifests/ /workdir/manifests/ +COPY scripts/deploy.sh /workdir/. 
+RUN chmod +x /workdir/deploy.sh + +ENTRYPOINT ["/workdir/deploy.sh"] diff --git a/kubeflow/pipelines/pytorch-mnist/docker/manifests/model-server.yaml b/kubeflow/pipelines/pytorch-mnist/docker/manifests/model-server.yaml new file mode 100644 index 0000000..4d8d88e --- /dev/null +++ b/kubeflow/pipelines/pytorch-mnist/docker/manifests/model-server.yaml @@ -0,0 +1,25 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dlrs-mnist-model-server + namespace: kubeflow + labels: + app: mnist-model-server +spec: + selector: + matchLabels: + app: mnist-model-server + tier: web + template: + metadata: + labels: + app: mnist-model-server + tier: web + spec: + containers: + - name: dlrs-mnist-model-server + image: REGISTRY/dlrs-serve-oss + command: ["/workdir/entrypoint.sh"] + args: ["-cb", "model-bucket", "-s", "serve", "-t", "gcloud_access_token"] + ports: + - containerPort: 5059 diff --git a/kubeflow/pipelines/pytorch-mnist/docker/manifests/website.yaml b/kubeflow/pipelines/pytorch-mnist/docker/manifests/website.yaml new file mode 100644 index 0000000..0f03afd --- /dev/null +++ b/kubeflow/pipelines/pytorch-mnist/docker/manifests/website.yaml @@ -0,0 +1,24 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dlrs-mnist-website + namespace: kubeflow + labels: + app: digit-classification +spec: + selector: + matchLabels: + app: digit-classification + tier: web + template: + metadata: + labels: + app: digit-classification + tier: web + spec: + containers: + - name: dlrs-mnist-website + image: REGISTRY/dlrs-website + args: ["--endpoint", "inference_API_endpoint", "--website_endpoint", "0.0.0.0"] + ports: + - containerPort: 5000 diff --git a/kubeflow/pipelines/pytorch-mnist/docker/scripts/deploy.sh b/kubeflow/pipelines/pytorch-mnist/docker/scripts/deploy.sh new file mode 100644 index 0000000..1ffbe33 --- /dev/null +++ b/kubeflow/pipelines/pytorch-mnist/docker/scripts/deploy.sh @@ -0,0 +1,87 @@ +#!/usr/bin/env bash + +# Copyright (c) 2019 Intel Corporation +# 
# Poll the given service in the kubeflow namespace until its load balancer
# has an external IP, then leave that address in EXTERNAL_IP.
#   $1: service name
function get_ip() {
    local service=$1
    EXTERNAL_IP=""
    while true; do
        # Column 4 of the last output row is the EXTERNAL-IP field.
        EXTERNAL_IP=$(kubectl get service "${service}" -n kubeflow | awk '{print $4}' | tail -n 1)
        # kubectl prints "<pending>" until the load balancer is provisioned;
        # that placeholder must not be reported as an address.
        if [[ -n "${EXTERNAL_IP}" && "${EXTERNAL_IP}" != "<pending>" ]]; then
            break
        fi
        # Pause between polls instead of hammering the API server.
        sleep 5
    done
}

# Point kubectl at the cluster this pod runs in, using the pod's mounted
# service-account CA certificate and token.
function cluster_connect() {
    # Fall back to the GCE metadata server when CLUSTER_NAME is not provided.
    if [ -z "${CLUSTER_NAME}" ]; then
      CLUSTER_NAME=$(wget -q -O- --header="Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/attributes/cluster-name)
    fi

    # NOTE(review): SERVER_NAME is never assigned in this script; it is only
    # non-empty if the environment provides it.
    # Ensure the server name is not more than 63 characters.
    SERVER_NAME="${SERVER_NAME:0:63}"
    # Trim any trailing hyphens from the server name.
    while [[ "${SERVER_NAME:(-1)}" == "-" ]]; do SERVER_NAME="${SERVER_NAME::-1}"; done

    echo "Deploying ${SERVER_NAME} to the cluster ${CLUSTER_NAME}"

    # Connect kubectl to the local cluster
    kubectl config set-cluster "${CLUSTER_NAME}" --server=https://kubernetes.default --certificate-authority=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    kubectl config set-credentials pipeline --token "$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
    kubectl config set-context kubeflow --cluster "${CLUSTER_NAME}" --user pipeline
    kubectl config use-context kubeflow
}

# Deploy the stage named in ${stage}: "serve" creates and exposes the model
# server, "website" creates and exposes the web UI pointed at that server.
function deploy() {
    if [ "${stage}" == "serve" ]; then
        echo "Deploying MODEL SERVER"
        # "|" as the sed delimiter so a "/" in the substituted value cannot
        # break the expression.
        sed -i "s|model-bucket|${bucket}|g" /workdir/manifests/model-server.yaml
        sed -i "s|gcloud_access_token|${token}|g" /workdir/manifests/model-server.yaml
        kubectl create -f /workdir/manifests/model-server.yaml
        kubectl expose deployment -n kubeflow dlrs-mnist-model-server --type LoadBalancer --port 5059 --target-port 5059
        echo "Getting API's External IP" && get_ip "dlrs-mnist-model-server"
        echo "Finish with API's External IP: ${EXTERNAL_IP}"
    elif [ "${stage}" == "website" ]; then
        echo "Deploying WEBSITE"
        # Wait for the model server's address rather than reading it once, so
        # a still-pending load balancer is never written into the manifest.
        get_ip "dlrs-mnist-model-server"
        MODEL_EXTERNAL_IP=${EXTERNAL_IP}
        echo "API's External IP: ${MODEL_EXTERNAL_IP}"
        sed -i "s|inference_API_endpoint|${MODEL_EXTERNAL_IP}|g" /workdir/manifests/website.yaml
        kubectl create -f /workdir/manifests/website.yaml
        kubectl expose deployment -n kubeflow dlrs-mnist-website --type LoadBalancer --port 8080 --target-port 5000
        echo "Getting Website's External IP" && get_ip "dlrs-mnist-website"
        echo "Website URL : ${EXTERNAL_IP}:8080"
    else
        # Fail loudly instead of silently deploying nothing.
        echo "Unknown stage: '${stage}' (expected 'serve' or 'website')" >&2
        return 1
    fi
}
>&2 + ;; + esac +done + +echo "Connecting kubectl to local cluster" && cluster_connect +echo "Deploying ${stage} stage" && deploy diff --git a/kubeflow/pipelines/pytorch-mnist/handwritten-digit-recog-pipeline.py b/kubeflow/pipelines/pytorch-mnist/handwritten-digit-recog-pipeline.py new file mode 100644 index 0000000..9a81da2 --- /dev/null +++ b/kubeflow/pipelines/pytorch-mnist/handwritten-digit-recog-pipeline.py @@ -0,0 +1,71 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Pytorch MNIST example running on Kubeflow Pipelines + +Run this script to compile pipeline +""" + + +import kfp.dsl as dsl +import kfp.gcp as gcp + +@dsl.pipeline( + name='dlrs-mnist-pipeline', + description='A pipeline to train and serve the Pytorch MNIST example.' +) +def mnist_pipeline(model_bucket='your-gs-bucket-name', gcloud_access_token='your-access-token'): + """ + Pipeline with three stages: + 1. Train a MNIST handwritten digit classifier + 2. Deploy a model server to the cluster + 3. 
Deploy a web-ui to interact with it + """ + train = dsl.ContainerOp( + name='train', + image='REGISTRY/dlrs-train-TYPE', + arguments=[ + "-cb", model_bucket, + "-s", "train", + "-t", gcloud_access_token + ] + ) + + serve = dsl.ContainerOp( + name='serve', + image='REGISTRY/dlrs-pipelines-deployer', + arguments=[ + "-cb", model_bucket, + "-s", "serve", + "-t", gcloud_access_token + ] + ) + serve.after(train) + + web_ui = dsl.ContainerOp( + name='web-ui', + image='REGISTRY/dlrs-pipelines-deployer' + arguments=[ + "-s", "website" + ] + ) + web_ui.after(serve) + + steps = [train, serve, web_ui] + for step in steps: + step.apply(gcp.use_gcp_secret('user-gcp-sa')) + +if __name__ == '__main__': + import kfp.compiler as compiler + compiler.Compiler().compile(mnist_pipeline, __file__ + '.tar.gz') diff --git a/kubeflow/pipelines/pytorch-mnist/img/start-run.png b/kubeflow/pipelines/pytorch-mnist/img/start-run.png new file mode 100644 index 0000000..766bff2 Binary files /dev/null and b/kubeflow/pipelines/pytorch-mnist/img/start-run.png differ diff --git a/kubeflow/pipelines/pytorch-mnist/img/upload-pipeline.png b/kubeflow/pipelines/pytorch-mnist/img/upload-pipeline.png new file mode 100644 index 0000000..dd73d7c Binary files /dev/null and b/kubeflow/pipelines/pytorch-mnist/img/upload-pipeline.png differ diff --git a/kubeflow/pipelines/pytorch-mnist/img/web-ui-url.png b/kubeflow/pipelines/pytorch-mnist/img/web-ui-url.png new file mode 100644 index 0000000..dd1953f Binary files /dev/null and b/kubeflow/pipelines/pytorch-mnist/img/web-ui-url.png differ