From ab1a48ab1178e373f33889c5c7a1d7464d939b4b Mon Sep 17 00:00:00 2001
From: Tomoyuki Shimizu <tomoyuki.labs@gmail.com>
Date: Thu, 8 Nov 2018 16:38:48 +0900
Subject: [PATCH 1/5] add use cases

---
 index.bs   | 159 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 index.html |  93 +++++++++++++++++++++++++++++--
 2 files changed, 244 insertions(+), 8 deletions(-)
diff --git a/index.bs b/index.bs
index 792d120c..47fc7f60 100644
--- a/index.bs
+++ b/index.bs
@@ -6,7 +6,7 @@ Status: CG-DRAFT
 Group: webml
 URL: https://webmachinelearning.github.io/webnn/
 Editor: Your Name, Your Company http://example.com/your-company, your-email@example.com, http://example.com/your-personal-website
-Abstract: A dedicated API for neural network inference hardware acceleration.
+Abstract: This document describes a dedicated low-level API for neural network inference hardware acceleration.
 Repository: https://github.com/webmachinelearning/webnn
 </pre>
 
@@ -18,5 +18,160 @@ Introduction here.
 Use cases {#usecases}
 =====================
 
-Use cases here.
+## High-Level Use Cases ## {#usecases-highlevel}
 
+This section illustrates application-level use cases for the Web Machine
+Learning API (WebML API). All applications in those use cases can be built on
+top of pre-trained deep neural network (DNN) models.
+
+### Person Detection ### {#usecase-person-detection}
+
+A user is browsing a social media site and wishes to take a photo and upload it
+to the site. Before the photo is uploaded, the site runs [[SSD]] or [[YOLO]] on
+the WebML API to detect regions that include persons so that the user can filter
+and de-personalize irrelevant persons on it.
+
+### Skeleton Detecton ### {#usecase-skeleton-detection}
+
+A user opens a web application that continuously captures her body with her
+smartphone's camera. The web application extracts her skeleton by running
+[[PoseNet]] on the WebML API to recognize her gesture or body language. When she
+strikes a specified pose like raising a hand, the web application automatically
+takes a photo and upload it to an online storage.
+
+### Random Image Generation ### {#usecase-image-generation}
+
+A user wishes to make her new account and looks for a new icon image. When she
+clicks a "Generate" button on the webpage for creating an account, the webpage
+runs a generator model of generative adversarial network (GAN) for icon
+synthesis [[LogoSynthesis]] on the WebML API. She can repeat random icon
+generation until she finds her favorite one.
+
+## Low-Level Use Cases ## {#usecases-lowlevel}
+
+This section collects API-level use cases for the WebML API. It is supposed that
+ML frameworks refer to WebML API so that application developers can make use of
+those capabilities via the frameworks.
+
+### Custom Layer ### {#usecase-custom-layer}
+
+A web application developer wants to run a DNN model on the WebML. However, she
+has found that some of activation functions like [[LeakyReLU]], [[ELU]], etc. are
+not included in the WebML API. So she constructs custom layers of the additional
+activation functions on top of the WebML API. Note that the scope of custom
+layers may include convolution, normalization, etc. as well as activation.
+
+### Network Concatenation ### {#usecase-network-concat}
+
+A web application developer is trying to implement a DNN model that refers to
+outputs from some convolutional layer in [[MobileNets]] or [[ResNet]] as a
+feature map. She wants to use a pre-trained model of these networks to reduce
+training time. So she implements her network architecture so that its input is
+the output of convolutional layers of pre-trained MobileNets or ResNet models.
+
+### Performance Adaptation ### {#usecase-perf-adapt}
+
+A web application developer has a concern about performance of her DNN model on
+mobile devices. She has confirmed that the model runs too slow on mobile devices
+which does not have GPU acceleration. So her web application refers to the WebML
+API to confirm whether acceleration is available or not, so that the application
+can display the warning for devices without acceleration.
+
+After several weeks, she has developed a tiny DNN model that can even run on
+CPU. So she modifies the application so that the application loads the tiny
+model in the case of CPU-only devices.
+
+<pre class="biblio">
+{
+  "SSD": {
+    "href": "https://arxiv.org/abs/1512.02325",
+    "title": "SSD: Single Shot MultiBox Detector",
+    "authors": [
+      "Wei Liu",
+      "Dragomir Anguelov",
+      "Dumitru Erhan",
+      "Christian Szegedy",
+      "Scott Reed",
+      "Cheng-Yang Fu",
+      "Alexander C. Berg"
+    ],
+    "date": "December 2016"
+  },
+  "YOLO": {
+    "href": "https://arxiv.org/abs/1506.02640",
+    "title": "You Only Look Once: Unified, Real-Time Object Detection",
+    "authors": [
+      "Joseph Redmon",
+      "Santosh Divvala",
+      "Ross Girshick",
+      "Ali Farhadi"
+    ],
+    "date": "May 2016"
+  },
+  "PoseNet": {
+    "href": "https://medium.com/tensorflow/real-time-human-pose-estimation-in-the-browser-with-tensorflow-js-7dd0bc881cd5",
+    "title": "Real-time Human Pose Estimation in the Browser with TensorFlow.js",
+    "authors": [
+      "Dan Oved"
+    ],
+    "date": "May 2018"
+  },
+  "LogoSynthesis": {
+    "href": "https://arxiv.org/abs/1712.04407",
+    "title": "Logo Synthesis and Manipulation with Clustered Generative Adversarial Networks",
+    "authors": [
+      "Alexander Sage",
+      "Eirikur Agustsson",
+      "Radu Timofte",
+      "Luc Van Gool"
+    ],
+    "date": "December 2017"
+  },
+  "LeakyReLU": {
+    "href": "https://pdfs.semanticscholar.org/367f/2c63a6f6a10b3b64b8729d601e69337ee3cc.pdf",
+    "title": "Rectifier Nonlinearities Improve Neural Network Acoustic Models",
+    "authors": [
+      "Andrew L. Maas",
+      "Awni Y. Hannun",
+      "Andrew Y. Ng"
+    ],
+    "date": "June 2013"
+  },
+  "ELU": {
+    "href": "https://arxiv.org/abs/1511.07289",
+    "title": "Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)",
+    "authors": [
+      "Djork-Arné Clevert",
+      "Thomas Unterthiner",
+      "Sepp Hochreiter"
+    ],
+    "date": "February 2016"
+  },
+  "MobileNets": {
+    "href": "https://arxiv.org/abs/1704.04861",
+    "title": "MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications",
+    "authors": [
+      "Andrew G. Howard",
+      "Menglong Zhu",
+      "Bo Chen",
+      "Dmitry Kalenichenko",
+      "Weijun Wang",
+      "Tobias Weyand",
+      "Marco Andreetto",
+      "Hartwig Adam"
+    ],
+    "date": "April 2017"
+  },
+  "ResNet": {
+    "href": "https://arxiv.org/abs/1512.03385",
+    "title": "Deep Residual Learning for Image Recognition",
+    "authors": [
+      "Kaiming He",
+      "Xiangyu Zhang",
+      "Shaoqing Ren",
+      "Jian Sun"
+    ],
+    "date": "December 2015"
+  }
+}
+</pre>
\ No newline at end of file
diff --git a/index.html b/index.html
index 5230b746..d567a2c5 100644
--- a/index.html
+++ b/index.html
@@ -1212,9 +1212,9 @@
 		}
 	}
 </style>
-  <meta content="Bikeshed version b8fb82cae4b66ce32bcb256c30253189edf1cc92" name="generator">
+  <meta content="Bikeshed version 3d3d56b4350a4c381fdf2373ffcf534daaa9ba33" name="generator">
   <link href="https://webmachinelearning.github.io/webnn/" rel="canonical">
-  <meta content="c882572e42e611dd3ee145277681f5101b21f363" name="document-revision">
+  <meta content="277e29eed7e67b8fdb00431115c99d940b8d2ed8" name="document-revision">
 <style>/* style-md-lists */
 
 /* This is a weird hack for me not yet following the commonmark spec
@@ -1366,7 +1366,7 @@
   <div class="head">
    <p data-fill-with="logo"></p>
    <h1 class="p-name no-ref" id="title">Web Neural Network API</h1>
-   <h2 class="no-num no-toc no-ref heading settled" id="subtitle"><span class="content">Draft Community Group Report, <time class="dt-updated" datetime="2018-11-02">2 November 2018</time></span></h2>
+   <h2 class="no-num no-toc no-ref heading settled" id="subtitle"><span class="content">Draft Community Group Report, <time class="dt-updated" datetime="2018-11-08">8 November 2018</time></span></h2>
    <div data-fill-with="spec-metadata">
     <dl>
      <dt>This version:
@@ -1384,7 +1384,7 @@ <h2 class="no-num no-toc no-ref heading settled" id="subtitle"><span class="cont
   </div>
   <div class="p-summary" data-fill-with="abstract">
    <h2 class="no-num no-toc no-ref heading settled" id="abstract"><span class="content">Abstract</span></h2>
-   <p>A dedicated API for neural network inference hardware acceleration.</p>
+   <p>This document describes a dedicated low-level API for neural network inference hardware acceleration.</p>
   </div>
   <div data-fill-with="at-risk"></div>
   <h2 class="no-num no-toc no-ref heading settled" id="status"><span class="content">Status of this document</span></h2>
@@ -1402,12 +1402,30 @@ <h2 class="no-num no-toc no-ref heading settled" id="status"><span class="conten
    <h2 class="no-num no-toc no-ref" id="contents">Table of Contents</h2>
    <ol class="toc" role="directory">
     <li><a href="#intro"><span class="secno">1</span> <span class="content">Introduction</span></a>
-    <li><a href="#usecases"><span class="secno">2</span> <span class="content">Use cases</span></a>
+    <li>
+     <a href="#usecases"><span class="secno">2</span> <span class="content">Use cases</span></a>
+     <ol class="toc">
+      <li>
+       <a href="#usecases-highlevel"><span class="secno">2.1</span> <span class="content">High-Level Use Cases</span></a>
+       <ol class="toc">
+        <li><a href="#usecase-person-detection"><span class="secno">2.1.1</span> <span class="content">Person Detection</span></a>
+        <li><a href="#usecase-skeleton-detection"><span class="secno">2.1.2</span> <span class="content">Skeleton Detecton</span></a>
+        <li><a href="#usecase-image-generation"><span class="secno">2.1.3</span> <span class="content">Random Image Generation</span></a>
+       </ol>
+      <li>
+       <a href="#usecases-lowlevel"><span class="secno">2.2</span> <span class="content">Low-Level Use Cases</span></a>
+       <ol class="toc">
+        <li><a href="#usecase-custom-layer"><span class="secno">2.2.1</span> <span class="content">Custom Layer</span></a>
+        <li><a href="#usecase-network-concat"><span class="secno">2.2.2</span> <span class="content">Network Concatenation</span></a>
+        <li><a href="#usecase-perf-adapt"><span class="secno">2.2.3</span> <span class="content">Performance Adaptation</span></a>
+       </ol>
+     </ol>
     <li><a href="#conformance"><span class="secno"></span> <span class="content"> Conformance</span></a>
     <li>
      <a href="#references"><span class="secno"></span> <span class="content">References</span></a>
      <ol class="toc">
       <li><a href="#normative"><span class="secno"></span> <span class="content">Normative References</span></a>
+      <li><a href="#informative"><span class="secno"></span> <span class="content">Informative References</span></a>
      </ol>
    </ol>
   </nav>
@@ -1415,7 +1433,51 @@ <h2 class="no-num no-toc no-ref" id="contents">Table of Contents</h2>
    <h2 class="heading settled" data-level="1" id="intro"><span class="secno">1. </span><span class="content">Introduction</span><a class="self-link" href="#intro"></a></h2>
    <p>Introduction here.</p>
    <h2 class="heading settled" data-level="2" id="usecases"><span class="secno">2. </span><span class="content">Use cases</span><a class="self-link" href="#usecases"></a></h2>
-   <p>Use cases here.</p>
+   <h3 class="heading settled" data-level="2.1" id="usecases-highlevel"><span class="secno">2.1. </span><span class="content">High-Level Use Cases</span><a class="self-link" href="#usecases-highlevel"></a></h3>
+   <p>This section illustrates application-level use cases for the Web Machine
+Learning API (WebML API). All applications in those use cases can be built on
+top of pre-trained deep neural network (DNN) models.</p>
+   <h4 class="heading settled" data-level="2.1.1" id="usecase-person-detection"><span class="secno">2.1.1. </span><span class="content">Person Detection</span><a class="self-link" href="#usecase-person-detection"></a></h4>
+   <p>A user is browsing a social media site and wishes to take a photo and upload it
+to the site. Before the photo is uploaded, the site runs <a data-link-type="biblio" href="#biblio-ssd">[SSD]</a> or <a data-link-type="biblio" href="#biblio-yolo">[YOLO]</a> on
+the WebML API to detect regions that include persons so that the user can filter
+and de-personalize irrelevant persons on it.</p>
+   <h4 class="heading settled" data-level="2.1.2" id="usecase-skeleton-detection"><span class="secno">2.1.2. </span><span class="content">Skeleton Detecton</span><a class="self-link" href="#usecase-skeleton-detection"></a></h4>
+   <p>A user opens a web application that continuously captures her body with her
+smartphone’s camera. The web application extracts her skeleton by running <a data-link-type="biblio" href="#biblio-posenet">[PoseNet]</a> on the WebML API to recognize her gesture or body language. When she
+strikes a specified pose like raising a hand, the web application automatically
+takes a photo and upload it to an online storage.</p>
+   <h4 class="heading settled" data-level="2.1.3" id="usecase-image-generation"><span class="secno">2.1.3. </span><span class="content">Random Image Generation</span><a class="self-link" href="#usecase-image-generation"></a></h4>
+   <p>A user wishes to make her new account and looks for a new icon image. When she
+clicks a "Generate" button on the webpage for creating an account, the webpage
+runs a generator model of generative adversarial network (GAN) for icon
+synthesis <a data-link-type="biblio" href="#biblio-logosynthesis">[LogoSynthesis]</a> on the WebML API. She can repeat random icon
+generation until she finds her favorite one.</p>
+   <h3 class="heading settled" data-level="2.2" id="usecases-lowlevel"><span class="secno">2.2. </span><span class="content">Low-Level Use Cases</span><a class="self-link" href="#usecases-lowlevel"></a></h3>
+   <p>This section collects API-level use cases for the WebML API. It is supposed that
+ML frameworks refer to WebML API so that application developers can make use of
+those capabilities via the frameworks.</p>
+   <h4 class="heading settled" data-level="2.2.1" id="usecase-custom-layer"><span class="secno">2.2.1. </span><span class="content">Custom Layer</span><a class="self-link" href="#usecase-custom-layer"></a></h4>
+   <p>A web application developer wants to run a DNN model on the WebML. However, she
+has found that some of activation functions like <a data-link-type="biblio" href="#biblio-leakyrelu">[LeakyReLU]</a>, <a data-link-type="biblio" href="#biblio-elu">[ELU]</a>, etc. are
+not included in the WebML API. So she constructs custom layers of the additional
+activation functions on top of the WebML API. Note that the scope of custom
+layers may include convolution, normalization, etc. as well as activation.</p>
+   <h4 class="heading settled" data-level="2.2.2" id="usecase-network-concat"><span class="secno">2.2.2. </span><span class="content">Network Concatenation</span><a class="self-link" href="#usecase-network-concat"></a></h4>
+   <p>A web application developer is trying to implement a DNN model that refers to
+outputs from some convolutional layer in <a data-link-type="biblio" href="#biblio-mobilenets">[MobileNets]</a> or <a data-link-type="biblio" href="#biblio-resnet">[ResNet]</a> as a
+feature map. She wants to use a pre-trained model of these networks to reduce
+training time. So she implements her network architecture so that its input is
+the output of convolutional layers of pre-trained MobileNets or ResNet models.</p>
+   <h4 class="heading settled" data-level="2.2.3" id="usecase-perf-adapt"><span class="secno">2.2.3. </span><span class="content">Performance Adaptation</span><a class="self-link" href="#usecase-perf-adapt"></a></h4>
+   <p>A web application developer has a concern about performance of her DNN model on
+mobile devices. She has confirmed that the model runs too slow on mobile devices
+which does not have GPU acceleration. So her web application refers to the WebML
+API to confirm whether acceleration is available or not, so that the application
+can display the warning for devices without acceleration.</p>
+   <p>After several weeks, she has developed a tiny DNN model that can even run on
+CPU. So she modifies the application so that the application loads the tiny
+model in the case of CPU-only devices.</p>
   </main>
   <div data-fill-with="conformance">
    <h2 class="no-ref no-num heading settled" id="conformance"><span class="content"> Conformance</span><a class="self-link" href="#conformance"></a></h2>
@@ -1568,4 +1630,23 @@ <h3 class="no-num no-ref heading settled" id="normative"><span class="content">N
   <dl>
    <dt id="biblio-rfc2119">[RFC2119]
    <dd>S. Bradner. <a href="https://tools.ietf.org/html/rfc2119">Key words for use in RFCs to Indicate Requirement Levels</a>. March 1997. Best Current Practice. URL: <a href="https://tools.ietf.org/html/rfc2119">https://tools.ietf.org/html/rfc2119</a>
+  </dl>
+  <h3 class="no-num no-ref heading settled" id="informative"><span class="content">Informative References</span><a class="self-link" href="#informative"></a></h3>
+  <dl>
+   <dt id="biblio-elu">[ELU]
+   <dd>Djork-Arné Clevert; Thomas Unterthiner; Sepp Hochreiter. <a href="https://arxiv.org/abs/1511.07289">Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)</a>. February 2016. URL: <a href="https://arxiv.org/abs/1511.07289">https://arxiv.org/abs/1511.07289</a>
+   <dt id="biblio-leakyrelu">[LeakyReLU]
+   <dd>Andrew L. Maas; Awni Y. Hannun; Andrew Y. Ng. <a href="https://pdfs.semanticscholar.org/367f/2c63a6f6a10b3b64b8729d601e69337ee3cc.pdf">Rectifier Nonlinearities Improve Neural Network Acoustic Models</a>. June 2013. URL: <a href="https://pdfs.semanticscholar.org/367f/2c63a6f6a10b3b64b8729d601e69337ee3cc.pdf">https://pdfs.semanticscholar.org/367f/2c63a6f6a10b3b64b8729d601e69337ee3cc.pdf</a>
+   <dt id="biblio-logosynthesis">[LogoSynthesis]
+   <dd>Alexander Sage; et al. <a href="https://arxiv.org/abs/1712.04407">Logo Synthesis and Manipulation with Clustered Generative Adversarial Networks</a>. December 2017. URL: <a href="https://arxiv.org/abs/1712.04407">https://arxiv.org/abs/1712.04407</a>
+   <dt id="biblio-mobilenets">[MobileNets]
+   <dd>Andrew G. Howard; et al. <a href="https://arxiv.org/abs/1704.04861">MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications</a>. April 2017. URL: <a href="https://arxiv.org/abs/1704.04861">https://arxiv.org/abs/1704.04861</a>
+   <dt id="biblio-posenet">[PoseNet]
+   <dd>Dan Oved. <a href="https://medium.com/tensorflow/real-time-human-pose-estimation-in-the-browser-with-tensorflow-js-7dd0bc881cd5">Real-time Human Pose Estimation in the Browser with TensorFlow.js</a>. May 2018. URL: <a href="https://medium.com/tensorflow/real-time-human-pose-estimation-in-the-browser-with-tensorflow-js-7dd0bc881cd5">https://medium.com/tensorflow/real-time-human-pose-estimation-in-the-browser-with-tensorflow-js-7dd0bc881cd5</a>
+   <dt id="biblio-resnet">[ResNet]
+   <dd>Kaiming He; et al. <a href="https://arxiv.org/abs/1512.03385">Deep Residual Learning for Image Recognition</a>. December 2015. URL: <a href="https://arxiv.org/abs/1512.03385">https://arxiv.org/abs/1512.03385</a>
+   <dt id="biblio-ssd">[SSD]
+   <dd>Wei Liu; et al. <a href="https://arxiv.org/abs/1512.02325">SSD: Single Shot MultiBox Detector</a>. December 2016. URL: <a href="https://arxiv.org/abs/1512.02325">https://arxiv.org/abs/1512.02325</a>
+   <dt id="biblio-yolo">[YOLO]
+   <dd>Joseph Redmon; et al. <a href="https://arxiv.org/abs/1506.02640">You Only Look Once: Unified, Real-Time Object Detection</a>. May 2016. URL: <a href="https://arxiv.org/abs/1506.02640">https://arxiv.org/abs/1506.02640</a>
   </dl>
\ No newline at end of file

From 57a14adb05b34c64ad6da0c4cbf7c10fdc4d733f Mon Sep 17 00:00:00 2001
From: Tomoyuki Shimizu <tomoyuki.labs@gmail.com>
Date: Wed, 14 Nov 2018 11:35:45 +0900
Subject: [PATCH 2/5] Follow review by @anssiko

---
 index.bs   | 48 ++++++++++++++++++++++++++---------------------
 index.html | 55 ++++++++++++++++++++++++++++++------------------------
 2 files changed, 58 insertions(+), 45 deletions(-)

diff --git a/index.bs b/index.bs
index 47fc7f60..c5aeecca 100644
--- a/index.bs
+++ b/index.bs
@@ -20,46 +20,52 @@ Use cases {#usecases}
 
 ## High-Level Use Cases ## {#usecases-highlevel}
 
-This section illustrates application-level use cases for the Web Machine
-Learning API (WebML API). All applications in those use cases can be built on
-top of pre-trained deep neural network (DNN) models.
+This section illustrates application-level use cases for neural network
+inference hardware acceleration. All applications in those use cases can be
+built on top of pre-trained deep neural network (DNN) models.
 
 ### Person Detection ### {#usecase-person-detection}
 
 A user is browsing a social media site and wishes to take a photo and upload it
-to the site. Before the photo is uploaded, the site runs [[SSD]] or [[YOLO]] on
-the WebML API to detect regions that include persons so that the user can filter
+to the site. Before the photo is uploaded, the site does object detection (for
+example, using object detection approaches such as [[SSD]] or [[YOLO]] that use
+a single DNN) to detect regions that include persons so that the user can filter
 and de-personalize irrelevant persons on it.
 
-### Skeleton Detecton ### {#usecase-skeleton-detection}
+### Skeleton Detection ### {#usecase-skeleton-detection}
 
-A user opens a web application that continuously captures her body with her
-smartphone's camera. The web application extracts her skeleton by running
-[[PoseNet]] on the WebML API to recognize her gesture or body language. When she
-strikes a specified pose like raising a hand, the web application automatically
-takes a photo and upload it to an online storage.
+The web application extracts her skeleton by running a machine learning model
+which allows for real-time human pose estimation such as [[PoseNet]] to
+recognize her gesture and body language. When she strikes a specified pose like
+raising a hand, the web application automatically takes a photo and upload it to
+an online storage.
 
 ### Random Image Generation ### {#usecase-image-generation}
 
 A user wishes to make her new account and looks for a new icon image. When she
 clicks a "Generate" button on the webpage for creating an account, the webpage
 runs a generator model of generative adversarial network (GAN) for icon
-synthesis [[LogoSynthesis]] on the WebML API. She can repeat random icon
+synthesis [[LogoSynthesis]]. She can repeat random icon
 generation until she finds her favorite one.
 
 ## Low-Level Use Cases ## {#usecases-lowlevel}
 
-This section collects API-level use cases for the WebML API. It is supposed that
-ML frameworks refer to WebML API so that application developers can make use of
-those capabilities via the frameworks.
+This section collects API-level use cases for a dedicated low-level API for
+neural network inference hardware acceleration. It is expected that Machine
+Learning frameworks will be key consumers of the Web Neural Network API (WebNN
+API) and the low-level details exposed through the WebNN API are abstracted out
+from typical web developers. However, it is also expected that web developers
+with specific interest and competence in Machine Learning will want to interface
+with the WebNN API directly instead of a higher-level ML framework.
 
 ### Custom Layer ### {#usecase-custom-layer}
 
-A web application developer wants to run a DNN model on the WebML. However, she
-has found that some of activation functions like [[LeakyReLU]], [[ELU]], etc. are
-not included in the WebML API. So she constructs custom layers of the additional
-activation functions on top of the WebML API. Note that the scope of custom
-layers may include convolution, normalization, etc. as well as activation.
+A web application developer wants to run a DNN model on the WebNN API. However,
+she has found that some of activation functions like [[LeakyReLU]], [[ELU]],
+etc. are not included in the WebNN API. So she constructs custom layers of the
+additional activation functions on top of the WebNN API. Note that the scope of
+custom layers may include convolution, normalization, etc. as well as
+activation.
 
 ### Network Concatenation ### {#usecase-network-concat}
 
@@ -73,7 +79,7 @@ the output of convolutional layers of pre-trained MobileNets or ResNet models.
 
 A web application developer has a concern about performance of her DNN model on
 mobile devices. She has confirmed that the model runs too slow on mobile devices
-which does not have GPU acceleration. So her web application refers to the WebML
+which does not have GPU acceleration. So her web application refers to the WebNN
 API to confirm whether acceleration is available or not, so that the application
 can display the warning for devices without acceleration.
 
diff --git a/index.html b/index.html
index d567a2c5..5c885e26 100644
--- a/index.html
+++ b/index.html
@@ -1212,9 +1212,9 @@
 		}
 	}
 </style>
-  <meta content="Bikeshed version 3d3d56b4350a4c381fdf2373ffcf534daaa9ba33" name="generator">
+  <meta content="Bikeshed version db559f98d6c99655936abc96647a0e88ba98a9cf" name="generator">
   <link href="https://webmachinelearning.github.io/webnn/" rel="canonical">
-  <meta content="277e29eed7e67b8fdb00431115c99d940b8d2ed8" name="document-revision">
+  <meta content="ab1a48ab1178e373f33889c5c7a1d7464d939b4b" name="document-revision">
 <style>/* style-md-lists */
 
 /* This is a weird hack for me not yet following the commonmark spec
@@ -1366,7 +1366,7 @@
   <div class="head">
    <p data-fill-with="logo"></p>
    <h1 class="p-name no-ref" id="title">Web Neural Network API</h1>
-   <h2 class="no-num no-toc no-ref heading settled" id="subtitle"><span class="content">Draft Community Group Report, <time class="dt-updated" datetime="2018-11-08">8 November 2018</time></span></h2>
+   <h2 class="no-num no-toc no-ref heading settled" id="subtitle"><span class="content">Draft Community Group Report, <time class="dt-updated" datetime="2018-11-14">14 November 2018</time></span></h2>
    <div data-fill-with="spec-metadata">
     <dl>
      <dt>This version:
@@ -1409,7 +1409,7 @@ <h2 class="no-num no-toc no-ref" id="contents">Table of Contents</h2>
        <a href="#usecases-highlevel"><span class="secno">2.1</span> <span class="content">High-Level Use Cases</span></a>
        <ol class="toc">
         <li><a href="#usecase-person-detection"><span class="secno">2.1.1</span> <span class="content">Person Detection</span></a>
-        <li><a href="#usecase-skeleton-detection"><span class="secno">2.1.2</span> <span class="content">Skeleton Detecton</span></a>
+        <li><a href="#usecase-skeleton-detection"><span class="secno">2.1.2</span> <span class="content">Skeleton Detection</span></a>
         <li><a href="#usecase-image-generation"><span class="secno">2.1.3</span> <span class="content">Random Image Generation</span></a>
        </ol>
       <li>
@@ -1434,35 +1434,42 @@ <h2 class="heading settled" data-level="1" id="intro"><span class="secno">1. </s
    <p>Introduction here.</p>
    <h2 class="heading settled" data-level="2" id="usecases"><span class="secno">2. </span><span class="content">Use cases</span><a class="self-link" href="#usecases"></a></h2>
    <h3 class="heading settled" data-level="2.1" id="usecases-highlevel"><span class="secno">2.1. </span><span class="content">High-Level Use Cases</span><a class="self-link" href="#usecases-highlevel"></a></h3>
-   <p>This section illustrates application-level use cases for the Web Machine
-Learning API (WebML API). All applications in those use cases can be built on
-top of pre-trained deep neural network (DNN) models.</p>
+   <p>This section illustrates application-level use cases for neural network
+inference hardware acceleration. All applications in those use cases can be
+built on top of pre-trained deep neural network (DNN) models.</p>
    <h4 class="heading settled" data-level="2.1.1" id="usecase-person-detection"><span class="secno">2.1.1. </span><span class="content">Person Detection</span><a class="self-link" href="#usecase-person-detection"></a></h4>
    <p>A user is browsing a social media site and wishes to take a photo and upload it
-to the site. Before the photo is uploaded, the site runs <a data-link-type="biblio" href="#biblio-ssd">[SSD]</a> or <a data-link-type="biblio" href="#biblio-yolo">[YOLO]</a> on
-the WebML API to detect regions that include persons so that the user can filter
+to the site. Before the photo is uploaded, the site does object detection (for
+example, using object detection approaches such as <a data-link-type="biblio" href="#biblio-ssd">[SSD]</a> or <a data-link-type="biblio" href="#biblio-yolo">[YOLO]</a> that use
+a single DNN) to detect regions that include persons so that the user can filter
 and de-personalize irrelevant persons on it.</p>
-   <h4 class="heading settled" data-level="2.1.2" id="usecase-skeleton-detection"><span class="secno">2.1.2. </span><span class="content">Skeleton Detecton</span><a class="self-link" href="#usecase-skeleton-detection"></a></h4>
-   <p>A user opens a web application that continuously captures her body with her
-smartphone’s camera. The web application extracts her skeleton by running <a data-link-type="biblio" href="#biblio-posenet">[PoseNet]</a> on the WebML API to recognize her gesture or body language. When she
-strikes a specified pose like raising a hand, the web application automatically
-takes a photo and upload it to an online storage.</p>
+   <h4 class="heading settled" data-level="2.1.2" id="usecase-skeleton-detection"><span class="secno">2.1.2. </span><span class="content">Skeleton Detection</span><a class="self-link" href="#usecase-skeleton-detection"></a></h4>
+   <p>The web application extracts her skeleton by running a machine learning model
+which allows for real-time human pose estimation such as <a data-link-type="biblio" href="#biblio-posenet">[PoseNet]</a> to
+recognize her gesture and body language. When she strikes a specified pose like
+raising a hand, the web application automatically takes a photo and upload it to
+an online storage.</p>
    <h4 class="heading settled" data-level="2.1.3" id="usecase-image-generation"><span class="secno">2.1.3. </span><span class="content">Random Image Generation</span><a class="self-link" href="#usecase-image-generation"></a></h4>
    <p>A user wishes to make her new account and looks for a new icon image. When she
 clicks a "Generate" button on the webpage for creating an account, the webpage
 runs a generator model of generative adversarial network (GAN) for icon
-synthesis <a data-link-type="biblio" href="#biblio-logosynthesis">[LogoSynthesis]</a> on the WebML API. She can repeat random icon
+synthesis <a data-link-type="biblio" href="#biblio-logosynthesis">[LogoSynthesis]</a>. She can repeat random icon
 generation until she finds her favorite one.</p>
    <h3 class="heading settled" data-level="2.2" id="usecases-lowlevel"><span class="secno">2.2. </span><span class="content">Low-Level Use Cases</span><a class="self-link" href="#usecases-lowlevel"></a></h3>
-   <p>This section collects API-level use cases for the WebML API. It is supposed that
-ML frameworks refer to WebML API so that application developers can make use of
-those capabilities via the frameworks.</p>
+   <p>This section collects API-level use cases for a dedicated low-level API for
+neural network inference hardware acceleration. It is expected that Machine
+Learning frameworks will be key consumers of the Web Neural Network API (WebNN
+API) and the low-level details exposed through the WebNN API are abstracted out
+from typical web developers. However, it is also expected that web developers
+with specific interest and competence in Machine Learning will want to interface
+with the WebNN API directly instead of a higher-level ML framework.</p>
    <h4 class="heading settled" data-level="2.2.1" id="usecase-custom-layer"><span class="secno">2.2.1. </span><span class="content">Custom Layer</span><a class="self-link" href="#usecase-custom-layer"></a></h4>
-   <p>A web application developer wants to run a DNN model on the WebML. However, she
-has found that some of activation functions like <a data-link-type="biblio" href="#biblio-leakyrelu">[LeakyReLU]</a>, <a data-link-type="biblio" href="#biblio-elu">[ELU]</a>, etc. are
-not included in the WebML API. So she constructs custom layers of the additional
-activation functions on top of the WebML API. Note that the scope of custom
-layers may include convolution, normalization, etc. as well as activation.</p>
+   <p>A web application developer wants to run a DNN model on the WebNN API. However,
+she has found that some of activation functions like <a data-link-type="biblio" href="#biblio-leakyrelu">[LeakyReLU]</a>, <a data-link-type="biblio" href="#biblio-elu">[ELU]</a>,
+etc. are not included in the WebNN API. So she constructs custom layers of the
+additional activation functions on top of the WebNN API. Note that the scope of
+custom layers may include convolution, normalization, etc. as well as
+activation.</p>
    <h4 class="heading settled" data-level="2.2.2" id="usecase-network-concat"><span class="secno">2.2.2. </span><span class="content">Network Concatenation</span><a class="self-link" href="#usecase-network-concat"></a></h4>
    <p>A web application developer is trying to implement a DNN model that refers to
 outputs from some convolutional layer in <a data-link-type="biblio" href="#biblio-mobilenets">[MobileNets]</a> or <a data-link-type="biblio" href="#biblio-resnet">[ResNet]</a> as a
@@ -1472,7 +1479,7 @@ <h4 class="heading settled" data-level="2.2.2" id="usecase-network-concat"><span
    <h4 class="heading settled" data-level="2.2.3" id="usecase-perf-adapt"><span class="secno">2.2.3. </span><span class="content">Performance Adaptation</span><a class="self-link" href="#usecase-perf-adapt"></a></h4>
    <p>A web application developer has a concern about performance of her DNN model on
 mobile devices. She has confirmed that the model runs too slow on mobile devices
-which does not have GPU acceleration. So her web application refers to the WebML
+which does not have GPU acceleration. So her web application refers to the WebNN
 API to confirm whether acceleration is available or not, so that the application
 can display the warning for devices without acceleration.</p>
    <p>After several weeks, she has developed a tiny DNN model that can even run on

From 46a320a9e874dc486e07b4b57f8ebc067d5a224d Mon Sep 17 00:00:00 2001
From: Tomoyuki Shimizu <tomoyuki.labs@gmail.com>
Date: Wed, 5 Dec 2018 10:09:55 +0900
Subject: [PATCH 3/5] add several use cases and revise existing ones

---
 index.bs   | 231 ++++++++++++++++++++++++++++++++++++++++-------------
 index.html | 116 +++++++++++++++++++--------
 2 files changed, 260 insertions(+), 87 deletions(-)

diff --git a/index.bs b/index.bs
index c5aeecca..eb4e1e24 100644
--- a/index.bs
+++ b/index.bs
@@ -26,27 +26,74 @@ built on top of pre-trained deep neural network (DNN) models.
 
 ### Person Detection ### {#usecase-person-detection}
 
-A user is browsing a social media site and wishes to take a photo and upload it
-to the site. Before the photo is uploaded, the site does object detection (for
-example, using object detection approaches such as [[SSD]] or [[YOLO]] that use
-a single DNN) to detect regions that include persons so that the user can filter
-and de-personalize irrelevant persons on it.
+A user opens a web-based video conferencing application, but she temporarily
+leaves her room. The application is watching whether she is in front of her
+PC by using object detection (for example, using object detection approaches
+such as [[SSD]] or [[YOLO]] that use a single DNN) to detect regions in a camera
+input frame that include persons.
+
+When she comes back, the application automatically detects her and notifies
+other online users that she is active now.
+
+### Semantic Segmentation ### {#usecase-segmentation}
+
+A user joins a teleconference via a web-based video conferencing application
+from her room. However, she does not wish that her room is visible on the
+screen. So the application runs a machine learning model such as [[DeepLabv3+]]
+or [[MaskR-CNN]] to semantically split an image into segments and replaces
+background segments with another picture.
 
 ### Skeleton Detection ### {#usecase-skeleton-detection}
 
-The web application extracts her skeleton by running a machine learning model
-which allows for real-time human pose estimation such as [[PoseNet]] to
-recognize her gesture and body language. When she strikes a specified pose like
-raising a hand, the web application automatically takes a photo and upload it to
-an online storage.
+A web-based video conferencing application tracks the pose of the user's skeleton by
+running a machine learning model, which allows for real-time human pose
+estimation, such as [[PoseNet]] to recognize her gesture and body language. When
+she raises her hand, her microphone is automatically unmuted and she can start
+speaking on the teleconference.
+
+### Face Recognition ### {#usecase-face-recognition}
+
+There are multiple people in the conference room and they join an online meeting
+using a web-based video conferencing application. The application detects faces
+of participants by using object detection (for example, using object detection
+approaches such as [[SSD]]) and checks whether each face was present at the
+previous meeting or not by running a machine learning model such as [[FaceNet]],
+which verifies whether two faces would be identical or not.
+
+### Super Resolution ### {#usecase-super-resolution}
+
+A web-based video conferencing is receiving a video stream from its peer, but
+the resolution of the video becomes lower due to network congestion. So the
+application runs a machine learning model for super-resolution such as [[SRGAN]]
+to generate higher-resolution video frames.
+
+### Image Captioning ### {#usecase-image-captioning}
+
+For better accessibility, a web-based presentation application provides
+automatic image captioning by running a machine learning model such as
+[[im2txt]] which predicts explanatory words of the presentation slides.
+
+### Machine Translation ### {#usecase-translation}
+
+Multiple people from various countries are talking via a web-based real-time
+text chat application. The application translates their conversation by using a
+machine learning model such as [[GNMT]] or [[OpenNMT]], which translates every
+text into a different language.
+
+### Emotion Analysis ### {#usecase-emotion-analysis}
+
+A user is talking to her friend via a web-based real-time text chat application,
+and she is wondering how the friend feels because she cannot see the friend's
+face. The application analyses the friend's emotion by using a machine learning
+model such as [[DeepMoji]], which infers emotion from input texts, and displays
+an emoji that represents the estimated emotion.
 
-### Random Image Generation ### {#usecase-image-generation}
+### Video Summarization ### {#usecase-video-summarization}
 
-A user wishes to make her new account and looks for a new icon image. When she
-clicks a "Generate" button on the webpage for creating an account, the webpage
-runs a generator model of generative adversarial network (GAN) for icon
-synthesis [[LogoSynthesis]]. She can repeat random icon
-generation until she finds her favorite one.
+A web-based video conferencing application records received video streams, and
+it needs to reduce recorded video data to be stored. The application generates
+the short version of the recoreded video by using a machine learning model for
+video summarization such as [[Video-Summarization-with-LSTM]].
 
 ## Low-Level Use Cases ## {#usecases-lowlevel}
 
@@ -69,11 +116,15 @@ activation.
 
 ### Network Concatenation ### {#usecase-network-concat}
 
-A web application developer is trying to implement a DNN model that refers to
-outputs from some convolutional layer in [[MobileNets]] or [[ResNet]] as a
-feature map. She wants to use a pre-trained model of these networks to reduce
-training time. So she implements her network architecture so that its input is
-the output of convolutional layers of pre-trained MobileNets or ResNet models.
+A web application uses a DNN model, and its model data of upper convolutional
+layers and lower fully-connected layers are stored in separate files, since
+model data of the fully-connected layers are periodically updated due to fine
+tuning at the server side.
+
+Therefore, the application downloads both partial model files at first and
+concatenates them into a single model. When the model is updated, the
+application downloads the fine-tuned part of the model and replaces only the
+fully-connected layers with it.
 
 ### Performance Adaptation ### {#usecase-perf-adapt}
 
@@ -114,6 +165,29 @@ model in the case of CPU-only devices.
     ],
     "date": "May 2016"
   },
+  "DeepLabv3+": {
+    "href": "https://arxiv.org/abs/1802.02611",
+    "title": "Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation",
+    "authors": [
+      "Liang-Chieh Chen",
+      "Yukun Zhu",
+      "George Papandreou",
+      "Florian Schroff",
+      "Hartwig Adam"
+    ],
+    "date": "August 2018"
+  },
+  "MaskR-CNN": {
+    "href": "https://arxiv.org/abs/1703.06870",
+    "title": "Mask R-CNN",
+    "authors": [
+      "Kaiming He",
+      "Georgia Gkioxari",
+      "Piotr Dollár",
+      "Ross Girshick"
+    ],
+    "date": "January 2018"
+  },
   "PoseNet": {
     "href": "https://medium.com/tensorflow/real-time-human-pose-estimation-in-the-browser-with-tensorflow-js-7dd0bc881cd5",
     "title": "Real-time Human Pose Estimation in the Browser with TensorFlow.js",
@@ -122,16 +196,89 @@ model in the case of CPU-only devices.
     ],
     "date": "May 2018"
   },
-  "LogoSynthesis": {
-    "href": "https://arxiv.org/abs/1712.04407",
-    "title": "Logo Synthesis and Manipulation with Clustered Generative Adversarial Networks",
+  "FaceNet": {
+    "href": "https://arxiv.org/abs/1503.03832",
+    "title": "FaceNet: A Unified Embedding for Face Recognition and Clustering",
+    "authors": [
+      "Florian Schroff",
+      "Dmitry Kalenichenko",
+      "James Philbin"
+    ],
+    "date": "June 2015"
+  },
+  "SRGAN": {
+    "href": "https://arxiv.org/abs/1609.04802",
+    "title": "Photo-Realistic Single Image Super-Resolution Using a Generative Adversarial Network",
+    "authors": [
+      "Christian Ledig",
+      "Lucas Theis",
+      "Ferenc Huszar",
+      "Jose Caballero",
+      "Andrew Cunningham",
+      "Alejandro Acosta",
+      "Andrew Aitken",
+      "Alykhan Tejani",
+      "Johannes Totz",
+      "Zehan Wang",
+      "Wenzhe Shi"
+    ],
+    "date": "May 2017"
+  },
+  "im2txt": {
+    "href": "https://arxiv.org/abs/1609.06647",
+    "title": "Show and Tell: Lessons learned from the 2015 MSCOCO Image Captioning Challenge",
+    "authors": [
+      "Oriol Vinyals",
+      "Alexander Toshev",
+      "Samy Bengio",
+      "Dumitru Erhan"
+    ],
+    "date": "September 2016"
+  },
+  "GNMT": {
+    "href": "https://github.com/tensorflow/nmt",
+    "title": "Neural Machine Translation (seq2seq) Tutorial",
+    "authors": [
+      "Minh-Thang Luong",
+      "Eugene Brevdo",
+      "Rui Zhao"
+    ],
+    "date": "May 2017"
+  },
+  "OpenNMT": {
+    "href": "https://arxiv.org/abs/1701.02810",
+    "title": "OpenNMT: Open-Source Toolkit for Neural Machine Translation",
+    "authors": [
+      "Guillaume Klein",
+      "Yoon Kim",
+      "Yuntian Deng",
+      "Jean Senellart",
+      "Alexander M. Rush"
+    ],
+    "date": "March 2017"
+  },
+  "DeepMoji": {
+    "href": "https://arxiv.org/abs/1708.00524",
+    "title": "Using millions of emoji occurrences to learn any-domain representations for detecting sentiment, emotion and sarcasm",
     "authors": [
-      "Alexander Sage",
-      "Eirikur Agustsson",
-      "Radu Timofte",
-      "Luc Van Gool"
+      "Bjarke Felbo",
+      "Alan Mislove",
+      "Anders Søgaard",
+      "Iyad Rahwan",
+      "Sune Lehmann"
     ],
-    "date": "December 2017"
+    "date": "October 2017"
+  },
+  "Video-Summarization-with-LSTM": {
+    "href": "http://www-scf.usc.edu/~zhan355/ke_eccv2016.pdf",
+    "title": "Video summarization with long short-term memory",
+    "authors": [
+      "Ke Zhang",
+      "Wei-Lun Chao",
+      "Fei Sha",
+      "Kristen Grauman"
+    ],
+    "date": "October 2016"
   },
   "LeakyReLU": {
     "href": "https://pdfs.semanticscholar.org/367f/2c63a6f6a10b3b64b8729d601e69337ee3cc.pdf",
@@ -152,32 +299,6 @@ model in the case of CPU-only devices.
       "Sepp Hochreiter"
     ],
     "date": "February 2016"
-  },
-  "MobileNets": {
-    "href": "https://arxiv.org/abs/1704.04861",
-    "title": "MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications",
-    "authors": [
-      "Andrew G. Howard",
-      "Menglong Zhu",
-      "Bo Chen",
-      "Dmitry Kalenichenko",
-      "Weijun Wang",
-      "Tobias Weyand",
-      "Marco Andreetto",
-      "Hartwig Adam"
-    ],
-    "date": "April 2017"
-  },
-  "ResNet": {
-    "href": "https://arxiv.org/abs/1512.03385",
-    "title": "Deep Residual Learning for Image Recognition",
-    "authors": [
-      "Kaiming He",
-      "Xiangyu Zhang",
-      "Shaoqing Ren",
-      "Jian Sun"
-    ],
-    "date": "December 2015"
   }
 }
 </pre>
\ No newline at end of file
diff --git a/index.html b/index.html
index 5c885e26..d2919387 100644
--- a/index.html
+++ b/index.html
@@ -1214,7 +1214,7 @@
 </style>
   <meta content="Bikeshed version db559f98d6c99655936abc96647a0e88ba98a9cf" name="generator">
   <link href="https://webmachinelearning.github.io/webnn/" rel="canonical">
-  <meta content="ab1a48ab1178e373f33889c5c7a1d7464d939b4b" name="document-revision">
+  <meta content="57a14adb05b34c64ad6da0c4cbf7c10fdc4d733f" name="document-revision">
 <style>/* style-md-lists */
 
 /* This is a weird hack for me not yet following the commonmark spec
@@ -1366,7 +1366,7 @@
   <div class="head">
    <p data-fill-with="logo"></p>
    <h1 class="p-name no-ref" id="title">Web Neural Network API</h1>
-   <h2 class="no-num no-toc no-ref heading settled" id="subtitle"><span class="content">Draft Community Group Report, <time class="dt-updated" datetime="2018-11-14">14 November 2018</time></span></h2>
+   <h2 class="no-num no-toc no-ref heading settled" id="subtitle"><span class="content">Draft Community Group Report, <time class="dt-updated" datetime="2018-12-05">5 December 2018</time></span></h2>
    <div data-fill-with="spec-metadata">
     <dl>
      <dt>This version:
@@ -1409,8 +1409,14 @@ <h2 class="no-num no-toc no-ref" id="contents">Table of Contents</h2>
        <a href="#usecases-highlevel"><span class="secno">2.1</span> <span class="content">High-Level Use Cases</span></a>
        <ol class="toc">
         <li><a href="#usecase-person-detection"><span class="secno">2.1.1</span> <span class="content">Person Detection</span></a>
-        <li><a href="#usecase-skeleton-detection"><span class="secno">2.1.2</span> <span class="content">Skeleton Detection</span></a>
-        <li><a href="#usecase-image-generation"><span class="secno">2.1.3</span> <span class="content">Random Image Generation</span></a>
+        <li><a href="#usecase-segmentation"><span class="secno">2.1.2</span> <span class="content">Semantic Segmentation</span></a>
+        <li><a href="#usecase-skeleton-detection"><span class="secno">2.1.3</span> <span class="content">Skeleton Detection</span></a>
+        <li><a href="#usecase-face-recognition"><span class="secno">2.1.4</span> <span class="content">Face Recognition</span></a>
+        <li><a href="#usecase-super-resolution"><span class="secno">2.1.5</span> <span class="content">Super Resolution</span></a>
+        <li><a href="#usecase-image-captioning"><span class="secno">2.1.6</span> <span class="content">Image Captioning</span></a>
+        <li><a href="#usecase-translation"><span class="secno">2.1.7</span> <span class="content">Machine Translation</span></a>
+        <li><a href="#usecase-emotion-analysis"><span class="secno">2.1.8</span> <span class="content">Emotion Analysis</span></a>
+        <li><a href="#usecase-video-summalization"><span class="secno">2.1.9</span> <span class="content">Video Summarization</span></a>
        </ol>
       <li>
        <a href="#usecases-lowlevel"><span class="secno">2.2</span> <span class="content">Low-Level Use Cases</span></a>
@@ -1438,23 +1444,54 @@ <h3 class="heading settled" data-level="2.1" id="usecases-highlevel"><span class
 inference hardware acceleration. All applications in those use cases can be
 built on top of pre-trained deep neural network (DNN) models.</p>
    <h4 class="heading settled" data-level="2.1.1" id="usecase-person-detection"><span class="secno">2.1.1. </span><span class="content">Person Detection</span><a class="self-link" href="#usecase-person-detection"></a></h4>
-   <p>A user is browsing a social media site and wishes to take a photo and upload it
-to the site. Before the photo is uploaded, the site does object detection (for
-example, using object detection approaches such as <a data-link-type="biblio" href="#biblio-ssd">[SSD]</a> or <a data-link-type="biblio" href="#biblio-yolo">[YOLO]</a> that use
-a single DNN) to detect regions that include persons so that the user can filter
-and de-personalize irrelevant persons on it.</p>
-   <h4 class="heading settled" data-level="2.1.2" id="usecase-skeleton-detection"><span class="secno">2.1.2. </span><span class="content">Skeleton Detection</span><a class="self-link" href="#usecase-skeleton-detection"></a></h4>
-   <p>The web application extracts her skeleton by running a machine learning model
-which allows for real-time human pose estimation such as <a data-link-type="biblio" href="#biblio-posenet">[PoseNet]</a> to
-recognize her gesture and body language. When she strikes a specified pose like
-raising a hand, the web application automatically takes a photo and upload it to
-an online storage.</p>
-   <h4 class="heading settled" data-level="2.1.3" id="usecase-image-generation"><span class="secno">2.1.3. </span><span class="content">Random Image Generation</span><a class="self-link" href="#usecase-image-generation"></a></h4>
-   <p>A user wishes to make her new account and looks for a new icon image. When she
-clicks a "Generate" button on the webpage for creating an account, the webpage
-runs a generator model of generative adversarial network (GAN) for icon
-synthesis <a data-link-type="biblio" href="#biblio-logosynthesis">[LogoSynthesis]</a>. She can repeat random icon
-generation until she finds her favorite one.</p>
+   <p>A user opens a web-based video conferencing application, but she temporarily
+leaves her room. The application is watching whether she is in front of her
+PC by using object detection (for example, using object detection approaches
+such as <a data-link-type="biblio" href="#biblio-ssd">[SSD]</a> or <a data-link-type="biblio" href="#biblio-yolo">[YOLO]</a> that use a single DNN) to detect regions in a camera
+input frame that include persons.</p>
+   <p>When she comes back, the application automatically detects her and notifies
+other online users that she is active now.</p>
+   <h4 class="heading settled" data-level="2.1.2" id="usecase-segmentation"><span class="secno">2.1.2. </span><span class="content">Semantic Segmentation</span><a class="self-link" href="#usecase-segmentation"></a></h4>
+   <p>A user joins a teleconference via a web-based video conferencing application
+from her room. However, she does not wish that her room is visible on the
+screen. So the application runs a machine learning model such as <a data-link-type="biblio" href="#biblio-deeplabv3">[DeepLabv3+]</a> or <a data-link-type="biblio" href="#biblio-maskr-cnn">[MaskR-CNN]</a> to semantically split an image into segments and replaces
+background segments with another picture.</p>
+   <h4 class="heading settled" data-level="2.1.3" id="usecase-skeleton-detection"><span class="secno">2.1.3. </span><span class="content">Skeleton Detection</span><a class="self-link" href="#usecase-skeleton-detection"></a></h4>
+   <p>A web-based video conferencing application tracks the pose of the user’s skeleton by
+running a machine learning model, which allows for real-time human pose
+estimation, such as <a data-link-type="biblio" href="#biblio-posenet">[PoseNet]</a> to recognize her gesture and body language. When
+she raises her hand, her microphone is automatically unmuted and she can start
+speaking on the teleconference.</p>
+   <h4 class="heading settled" data-level="2.1.4" id="usecase-face-recognition"><span class="secno">2.1.4. </span><span class="content">Face Recognition</span><a class="self-link" href="#usecase-face-recognition"></a></h4>
+   <p>There are multiple people in the conference room and they join an online meeting
+using a web-based video conferencing application. The application detects faces
+of participants by using object detection (for example, using object detection
+approaches such as <a data-link-type="biblio" href="#biblio-ssd">[SSD]</a>) and checks whether each face was present at the
+previous meeting or not by running a machine learning model such as <a data-link-type="biblio" href="#biblio-facenet">[FaceNet]</a>,
+which verifies whether two faces would be identical or not.</p>
+   <h4 class="heading settled" data-level="2.1.5" id="usecase-super-resolution"><span class="secno">2.1.5. </span><span class="content">Super Resolution</span><a class="self-link" href="#usecase-super-resolution"></a></h4>
+   <p>A web-based video conferencing is receiving a video stream from its peer, but
+the resolution of the video becomes lower due to network congestion. So the
+application runs a machine learning model for super-resolution such as <a data-link-type="biblio" href="#biblio-srgan">[SRGAN]</a> to generate higher-resolution video frames.</p>
+   <h4 class="heading settled" data-level="2.1.6" id="usecase-image-captioning"><span class="secno">2.1.6. </span><span class="content">Image Captioning</span><a class="self-link" href="#usecase-image-captioning"></a></h4>
+   <p>For better accessibility, a web-based presentation application provides
+automatic image captioning by running a machine learning model such as <a data-link-type="biblio" href="#biblio-im2txt">[im2txt]</a> which predicts explanatory words of the presentation slides.</p>
+   <h4 class="heading settled" data-level="2.1.7" id="usecase-translation"><span class="secno">2.1.7. </span><span class="content">Machine Translation</span><a class="self-link" href="#usecase-translation"></a></h4>
+   <p>Multiple people from various countries are talking via a web-based real-time
+text chat application. The application translates their conversation by using a
+machine learning model such as <a data-link-type="biblio" href="#biblio-gnmt">[GNMT]</a> or <a data-link-type="biblio" href="#biblio-opennmt">[OpenNMT]</a>, which translates every
+text into a different language.</p>
+   <h4 class="heading settled" data-level="2.1.8" id="usecase-emotion-analysis"><span class="secno">2.1.8. </span><span class="content">Emotion Analysis</span><a class="self-link" href="#usecase-emotion-analysis"></a></h4>
+   <p>A user is talking to her friend via a web-based real-time text chat application,
+and she is wondering how the friend feels because she cannot see the friend’s
+face. The application analyses the friend’s emotion by using a machine learning
+model such as <a data-link-type="biblio" href="#biblio-deepmoji">[DeepMoji]</a>, which infers emotion from input texts, and displays
+an emoji that represents the estimated emotion.</p>
+   <h4 class="heading settled" data-level="2.1.9" id="usecase-video-summalization"><span class="secno">2.1.9. </span><span class="content">Video Summarization</span><a class="self-link" href="#usecase-video-summalization"></a></h4>
+   <p>A web-based video conferencing application records received video streams, and
+it needs to reduce recorded video data to be stored. The application generates
+the short version of the recorded video by using a machine learning model for
+video summarization such as <a data-link-type="biblio" href="#biblio-video-summarization-with-lstm">[Video-Summarization-with-LSTM]</a>.</p>
    <h3 class="heading settled" data-level="2.2" id="usecases-lowlevel"><span class="secno">2.2. </span><span class="content">Low-Level Use Cases</span><a class="self-link" href="#usecases-lowlevel"></a></h3>
    <p>This section collects API-level use cases for a dedicated low-level API for
 neural network inference hardware acceleration. It is expected that Machine
@@ -1471,11 +1508,14 @@ <h4 class="heading settled" data-level="2.2.1" id="usecase-custom-layer"><span c
 custom layers may include convolution, normalization, etc. as well as
 activation.</p>
    <h4 class="heading settled" data-level="2.2.2" id="usecase-network-concat"><span class="secno">2.2.2. </span><span class="content">Network Concatenation</span><a class="self-link" href="#usecase-network-concat"></a></h4>
-   <p>A web application developer is trying to implement a DNN model that refers to
-outputs from some convolutional layer in <a data-link-type="biblio" href="#biblio-mobilenets">[MobileNets]</a> or <a data-link-type="biblio" href="#biblio-resnet">[ResNet]</a> as a
-feature map. She wants to use a pre-trained model of these networks to reduce
-training time. So she implements her network architecture so that its input is
-the output of convolutional layers of pre-trained MobileNets or ResNet models.</p>
+   <p>A web application uses a DNN model, and its model data of upper convolutional
+layers and lower fully-connected layers are stored in separate files, since
+model data of the fully-connected layers are periodically updated due to fine
+tuning at the server side.</p>
+   <p>Therefore, the application downloads both partial model files at first and
+concatenates them into a single model. When the model is updated, the
+application downloads the fine-tuned part of the model and replaces only the
+fully-connected layers with it.</p>
    <h4 class="heading settled" data-level="2.2.3" id="usecase-perf-adapt"><span class="secno">2.2.3. </span><span class="content">Performance Adaptation</span><a class="self-link" href="#usecase-perf-adapt"></a></h4>
    <p>A web application developer has a concern about performance of her DNN model on
 mobile devices. She has confirmed that the model runs too slow on mobile devices
@@ -1640,20 +1680,32 @@ <h3 class="no-num no-ref heading settled" id="normative"><span class="content">N
   </dl>
   <h3 class="no-num no-ref heading settled" id="informative"><span class="content">Informative References</span><a class="self-link" href="#informative"></a></h3>
   <dl>
+   <dt id="biblio-deeplabv3">[DeepLabv3+]
+   <dd>Liang-Chieh Chen; et al. <a href="https://arxiv.org/abs/1802.02611">Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation</a>. August 2018. URL: <a href="https://arxiv.org/abs/1802.02611">https://arxiv.org/abs/1802.02611</a>
+   <dt id="biblio-deepmoji">[DeepMoji]
+   <dd>Bjarke Felbo; et al. <a href="https://arxiv.org/abs/1708.00524">Using millions of emoji occurrences to learn any-domain representations for detecting sentiment, emotion and sarcasm</a>. October 2017. URL: <a href="https://arxiv.org/abs/1708.00524">https://arxiv.org/abs/1708.00524</a>
    <dt id="biblio-elu">[ELU]
    <dd>Djork-Arné Clevert; Thomas Unterthiner; Sepp Hochreiter. <a href="https://arxiv.org/abs/1511.07289">Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)</a>. February 2016. URL: <a href="https://arxiv.org/abs/1511.07289">https://arxiv.org/abs/1511.07289</a>
+   <dt id="biblio-facenet">[FaceNet]
+   <dd>Florian Schroff; Dmitry Kalenichenko; James Philbin. <a href="https://arxiv.org/abs/1503.03832">FaceNet: A Unified Embedding for Face Recognition and Clustering</a>. June 2015. URL: <a href="https://arxiv.org/abs/1503.03832">https://arxiv.org/abs/1503.03832</a>
+   <dt id="biblio-gnmt">[GNMT]
+   <dd>Minh-Thang Luong; Eugene Brevdo; Rui Zhao. <a href="https://github.com/tensorflow/nmt">Neural Machine Translation (seq2seq) Tutorial</a>. May 2017. URL: <a href="https://github.com/tensorflow/nmt">https://github.com/tensorflow/nmt</a>
+   <dt id="biblio-im2txt">[IM2TXT]
+   <dd>Oriol Vinyals; et al. <a href="https://arxiv.org/abs/1609.06647">Show and Tell: Lessons learned from the 2015 MSCOCO Image Captioning Challenge</a>. September 2016. URL: <a href="https://arxiv.org/abs/1609.06647">https://arxiv.org/abs/1609.06647</a>
    <dt id="biblio-leakyrelu">[LeakyReLU]
    <dd>Andrew L. Maas; Awni Y. Hannun; Andrew Y. Ng. <a href="https://pdfs.semanticscholar.org/367f/2c63a6f6a10b3b64b8729d601e69337ee3cc.pdf">Rectifier Nonlinearities Improve Neural Network Acoustic Models</a>. June 2013. URL: <a href="https://pdfs.semanticscholar.org/367f/2c63a6f6a10b3b64b8729d601e69337ee3cc.pdf">https://pdfs.semanticscholar.org/367f/2c63a6f6a10b3b64b8729d601e69337ee3cc.pdf</a>
-   <dt id="biblio-logosynthesis">[LogoSynthesis]
-   <dd>Alexander Sage; et al. <a href="https://arxiv.org/abs/1712.04407">Logo Synthesis and Manipulation with Clustered Generative Adversarial Networks</a>. December 2017. URL: <a href="https://arxiv.org/abs/1712.04407">https://arxiv.org/abs/1712.04407</a>
-   <dt id="biblio-mobilenets">[MobileNets]
-   <dd>Andrew G. Howard; et al. <a href="https://arxiv.org/abs/1704.04861">MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications</a>. April 2017. URL: <a href="https://arxiv.org/abs/1704.04861">https://arxiv.org/abs/1704.04861</a>
+   <dt id="biblio-maskr-cnn">[MaskR-CNN]
+   <dd>Kaiming He; et al. <a href="https://arxiv.org/abs/1703.06870">Mask R-CNN</a>. January 2018. URL: <a href="https://arxiv.org/abs/1703.06870">https://arxiv.org/abs/1703.06870</a>
+   <dt id="biblio-opennmt">[OpenNMT]
+   <dd>Guillaume Klein; et al. <a href="https://arxiv.org/abs/1701.02810">OpenNMT: Open-Source Toolkit for Neural Machine Translation</a>. March 2017. URL: <a href="https://arxiv.org/abs/1701.02810">https://arxiv.org/abs/1701.02810</a>
    <dt id="biblio-posenet">[PoseNet]
    <dd>Dan Oved. <a href="https://medium.com/tensorflow/real-time-human-pose-estimation-in-the-browser-with-tensorflow-js-7dd0bc881cd5">Real-time Human Pose Estimation in the Browser with TensorFlow.js</a>. May 2018. URL: <a href="https://medium.com/tensorflow/real-time-human-pose-estimation-in-the-browser-with-tensorflow-js-7dd0bc881cd5">https://medium.com/tensorflow/real-time-human-pose-estimation-in-the-browser-with-tensorflow-js-7dd0bc881cd5</a>
-   <dt id="biblio-resnet">[ResNet]
-   <dd>Kaiming He; et al. <a href="https://arxiv.org/abs/1512.03385">Deep Residual Learning for Image Recognition</a>. December 2015. URL: <a href="https://arxiv.org/abs/1512.03385">https://arxiv.org/abs/1512.03385</a>
+   <dt id="biblio-srgan">[SRGAN]
+   <dd>Christian Ledig; et al. <a href="https://arxiv.org/abs/1609.04802">Photo-Realistic Single Image Super-Resolution Using a Generative Adversarial Network</a>. May 2017. URL: <a href="https://arxiv.org/abs/1609.04802">https://arxiv.org/abs/1609.04802</a>
    <dt id="biblio-ssd">[SSD]
    <dd>Wei Liu; et al. <a href="https://arxiv.org/abs/1512.02325">SSD: Single Shot MultiBox Detector</a>. December 2016. URL: <a href="https://arxiv.org/abs/1512.02325">https://arxiv.org/abs/1512.02325</a>
+   <dt id="biblio-video-summarization-with-lstm">[Video-Summarization-with-LSTM]
+   <dd>Ke Zhang; et al. <a href="http://www-scf.usc.edu/~zhan355/ke_eccv2016.pdf">Video summarization with long short-term memory</a>. October 2016. URL: <a href="http://www-scf.usc.edu/~zhan355/ke_eccv2016.pdf">http://www-scf.usc.edu/~zhan355/ke_eccv2016.pdf</a>
    <dt id="biblio-yolo">[YOLO]
    <dd>Joseph Redmon; et al. <a href="https://arxiv.org/abs/1506.02640">You Only Look Once: Unified, Real-Time Object Detection</a>. May 2016. URL: <a href="https://arxiv.org/abs/1506.02640">https://arxiv.org/abs/1506.02640</a>
   </dl>
\ No newline at end of file

From f46f10701f2c2ff59725ae71d5a53702f1106f4b Mon Sep 17 00:00:00 2001
From: Tomoyuki Shimizu <tomoyuki.labs@gmail.com>
Date: Tue, 11 Dec 2018 10:30:57 +0900
Subject: [PATCH 4/5] minor revision

---
 index.bs | 41 ++++++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 19 deletions(-)

diff --git a/index.bs b/index.bs
index eb4e1e24..62867f14 100644
--- a/index.bs
+++ b/index.bs
@@ -37,11 +37,13 @@ other online users that she is active now.
 
 ### Semantic Segmentation ### {#usecase-segmentation}
 
-A user joins a teleconference via a web-based video conferencing application
-from her room. However, she does not wish that her room is visible on the
-screen. So the application runs a machine learning model such as [[DeepLabv3+]]
-or [[MaskR-CNN]] to semantically split an image into segments and replaces
-background segments with another picture.
+A user joins a teleconference via a web-based video conferencing application at
+her desk since no meeting room in her office is available. During the
+teleconference, she does not want her room and the people in the background to
+be visible. To protect the privacy of the other people and the surroundings, the
+application runs a machine learning model such as [[DeepLabv3+]] or
+[[MaskR-CNN]] to semantically split an image into segments and replaces
+segments that represent other people and background with another picture.
 
 ### Skeleton Detection ### {#usecase-skeleton-detection}
 
@@ -63,9 +65,10 @@ which verifies whether two faces would be identical or not.
 ### Super Resolution ### {#usecase-super-resolution}
 
 A web-based video conferencing is receiving a video stream from its peer, but
-the resolution of the video becomes lower due to network congestion. So the
-application runs a machine learning model for super-resolution such as [[SRGAN]]
-to generate higher-resolution video frames.
+the resolution of the video becomes lower due to network congestion. To prevent
+degradation of the perceived video quality, the application runs a machine
+learning model for super-resolution such as [[SRGAN]] to generate
+higher-resolution video frames.
 
 ### Image Captioning ### {#usecase-image-captioning}
 
@@ -92,7 +95,7 @@ an emoji that represents the estimated emotion.
 
 A web-based video conferencing application records received video streams, and
 it needs to reduce recorded video data to be stored. The application generates
-the short version of the recoreded video by using a machine learning model for
+the short version of the recorded video by using a machine learning model for
 video summarization such as [[Video-Summarization-with-LSTM]].
 
 ## Low-Level Use Cases ## {#usecases-lowlevel}
@@ -109,10 +112,10 @@ with the WebNN API directly instead of a higher-level ML framework.
 
 A web application developer wants to run a DNN model on the WebNN API. However,
 she has found that some of activation functions like [[LeakyReLU]], [[ELU]],
-etc. are not included in the WebNN API. So she constructs custom layers of the
-additional activation functions on top of the WebNN API. Note that the scope of
-custom layers may include convolution, normalization, etc. as well as
-activation.
+etc. are not included in the WebNN API. To address this issue, she constructs
+custom layers of the additional activation functions on top of the WebNN API.
+Note that the scope of custom layers may include convolution, normalization,
+etc. as well as activation.
 
 ### Network Concatenation ### {#usecase-network-concat}
 
@@ -129,14 +132,14 @@ fully-connected layers with it.
 ### Performance Adaptation ### {#usecase-perf-adapt}
 
 A web application developer has a concern about performance of her DNN model on
-mobile devices. She has confirmed that the model runs too slow on mobile devices
-which does not have GPU acceleration. So her web application refers to the WebNN
-API to confirm whether acceleration is available or not, so that the application
-can display the warning for devices without acceleration.
+mobile devices. She has confirmed that it may run too slow on mobile devices
+which do not have GPU acceleration. To address this issue, her web application
+refers to the WebNN API to confirm whether acceleration is available or not, so
+that the application can display the warning for devices without acceleration.
 
 After several weeks, she has developed a tiny DNN model that can even run on
-CPU. So she modifies the application so that the application loads the tiny
-model in the case of CPU-only devices.
+CPU. In order to accommodate CPU execution, she modifies the application
+so that the application loads the tiny model in the case of CPU-only devices.
 
 <pre class="biblio">
 {

From 5a69c57e7a477fab901eaf211f1cfdeda783f1f6 Mon Sep 17 00:00:00 2001
From: Tomoyuki Shimizu <tomoyuki.labs@gmail.com>
Date: Tue, 18 Dec 2018 17:41:28 +0900
Subject: [PATCH 5/5] add a couple of use cases about face apps

---
 index.bs   | 48 +++++++++++++++++++++++++++++++
 index.html | 83 +++++++++++++++++++++++++++++++++++-------------------
 2 files changed, 102 insertions(+), 29 deletions(-)

diff --git a/index.bs b/index.bs
index 62867f14..9cc45b7b 100644
--- a/index.bs
+++ b/index.bs
@@ -62,6 +62,24 @@ approaches such as [[SSD]]) and checks whether each face was present at the
 previous meeting or not by running a machine learning model such as [[FaceNet]],
 which verifies whether two faces would be identical or not.
 
+### Facial Landmark Detection ### {#usecase-facial-landmarks}
+
+A user wants to find new glasses that beautifully fit her on an online glasses
+store. The online store offers a web-based try-on simulator that runs a machine
+learning model such as Face Alignment Network [[FAN]] to detect facial landmarks
+like eyes, nose, mouth, etc. When she chooses a pair of glasses, the simulator
+properly renders the selected glasses on the detected position of eyes on her
+facial image.
+
+### Style Transfer ### {#usecase-style-transfer}
+
+A user is looking for cosmetics on an online store and wondering which color may
+fit her face. The online store shows sample facial makeup images of cosmetics,
+and offers a makeup simulator that runs a machine learning model like
+[[ContextualLoss]] or [[PairedCycleGAN]] to transfer the makeup style of the
+sample makeup image to her facial image. She can check how the selected makeup
+looks on her face by using the simulator.
+
 ### Super Resolution ### {#usecase-super-resolution}
 
 A web-based video conferencing is receiving a video stream from its peer, but
@@ -209,6 +227,36 @@ so that the application loads the tiny model in the case of CPU-only devices.
     ],
     "date": "June 2015"
   },
+  "FAN": {
+    "href": "https://arxiv.org/abs/1703.07332",
+    "title": "How far are we from solving the 2D & 3D Face Alignment problem? (and a dataset of 230,000 3D facial landmarks)",
+    "authors": [
+      "Adrian Bulat",
+      "Georgios Tzimiropoulos"
+    ],
+    "date": "September 2017"
+  },
+  "ContextualLoss": {
+    "href": "https://arxiv.org/abs/1803.02077",
+    "title": "The Contextual Loss for Image Transformation with Non-Aligned Data",
+    "authors": [
+      "Roey Mechrez",
+      "Itamar Talmi",
+      "Lihi Zelnik-Manor"
+    ],
+    "date": "July 2018"
+  },
+  "PairedCycleGAN": {
+    "href": "http://openaccess.thecvf.com/content_cvpr_2018/html/Chang_PairedCycleGAN_Asymmetric_Style_CVPR_2018_paper.html",
+    "title": "PairedCycleGAN: Asymmetric Style Transfer for Applying and Removing Makeup",
+    "authors": [
+      "Huiwen Chang",
+      "Jingwan Lu",
+      "Fisher Yu",
+      "Adam Finkelstein"
+    ],
+    "date": "June 2018"
+  },
   "SRGAN": {
     "href": "https://arxiv.org/abs/1609.04802",
     "title": "Photo-Realistic Single Image Super-Resolution Using a Generative Adversarial Network",
diff --git a/index.html b/index.html
index d2919387..5148a755 100644
--- a/index.html
+++ b/index.html
@@ -1214,7 +1214,7 @@
 </style>
   <meta content="Bikeshed version db559f98d6c99655936abc96647a0e88ba98a9cf" name="generator">
   <link href="https://webmachinelearning.github.io/webnn/" rel="canonical">
-  <meta content="57a14adb05b34c64ad6da0c4cbf7c10fdc4d733f" name="document-revision">
+  <meta content="f46f10701f2c2ff59725ae71d5a53702f1106f4b" name="document-revision">
 <style>/* style-md-lists */
 
 /* This is a weird hack for me not yet following the commonmark spec
@@ -1366,7 +1366,7 @@
   <div class="head">
    <p data-fill-with="logo"></p>
    <h1 class="p-name no-ref" id="title">Web Neural Network API</h1>
-   <h2 class="no-num no-toc no-ref heading settled" id="subtitle"><span class="content">Draft Community Group Report, <time class="dt-updated" datetime="2018-12-05">5 December 2018</time></span></h2>
+   <h2 class="no-num no-toc no-ref heading settled" id="subtitle"><span class="content">Draft Community Group Report, <time class="dt-updated" datetime="2018-12-18">18 December 2018</time></span></h2>
    <div data-fill-with="spec-metadata">
     <dl>
      <dt>This version:
@@ -1412,11 +1412,13 @@ <h2 class="no-num no-toc no-ref" id="contents">Table of Contents</h2>
         <li><a href="#usecase-segmentation"><span class="secno">2.1.2</span> <span class="content">Semantic Segmentation</span></a>
         <li><a href="#usecase-skeleton-detection"><span class="secno">2.1.3</span> <span class="content">Skeleton Detection</span></a>
         <li><a href="#usecase-face-recognition"><span class="secno">2.1.4</span> <span class="content">Face Recognition</span></a>
-        <li><a href="#usecase-super-resolution"><span class="secno">2.1.5</span> <span class="content">Super Resolution</span></a>
-        <li><a href="#usecase-image-captioning"><span class="secno">2.1.6</span> <span class="content">Image Captioning</span></a>
-        <li><a href="#usecase-translation"><span class="secno">2.1.7</span> <span class="content">Machine Translation</span></a>
-        <li><a href="#usecase-emotion-analysis"><span class="secno">2.1.8</span> <span class="content">Emotion Analysis</span></a>
-        <li><a href="#usecase-video-summalization"><span class="secno">2.1.9</span> <span class="content">Video Summarization</span></a>
+        <li><a href="#usecase-facial-landmarks"><span class="secno">2.1.5</span> <span class="content">Facial Landmark Detection</span></a>
+        <li><a href="#usecase-style-transfer"><span class="secno">2.1.6</span> <span class="content">Style Transfer</span></a>
+        <li><a href="#usecase-super-resolution"><span class="secno">2.1.7</span> <span class="content">Super Resolution</span></a>
+        <li><a href="#usecase-image-captioning"><span class="secno">2.1.8</span> <span class="content">Image Captioning</span></a>
+        <li><a href="#usecase-translation"><span class="secno">2.1.9</span> <span class="content">Machine Translation</span></a>
+        <li><a href="#usecase-emotion-analysis"><span class="secno">2.1.10</span> <span class="content">Emotion Analysis</span></a>
+        <li><a href="#usecase-video-summalization"><span class="secno">2.1.11</span> <span class="content">Video Summarization</span></a>
        </ol>
       <li>
        <a href="#usecases-lowlevel"><span class="secno">2.2</span> <span class="content">Low-Level Use Cases</span></a>
@@ -1452,10 +1454,12 @@ <h4 class="heading settled" data-level="2.1.1" id="usecase-person-detection"><sp
    <p>When she comes back, the application automatically detects her and notifies
 other online users that she is active now.</p>
    <h4 class="heading settled" data-level="2.1.2" id="usecase-segmentation"><span class="secno">2.1.2. </span><span class="content">Semantic Segmentation</span><a class="self-link" href="#usecase-segmentation"></a></h4>
-   <p>A user joins a teleconference via a web-based video conferencing application
-from her room. However, she does not wish that her room is visible on the
-screen. So the application runs a machine learning model such as <a data-link-type="biblio" href="#biblio-deeplabv3">[DeepLabv3+]</a> or <a data-link-type="biblio" href="#biblio-maskr-cnn">[MaskR-CNN]</a> to semantically split an image into segments and replaces
-background segments with another picture.</p>
+   <p>A user joins a teleconference via a web-based video conferencing application at
+her desk since no meeting room in her office is available. During the
+teleconference, she does not wish that her room and people in the background are
+visible. To protect the privacy of the other people and the surroundings, the
+application runs a machine learning model such as <a data-link-type="biblio" href="#biblio-deeplabv3">[DeepLabv3+]</a> or <a data-link-type="biblio" href="#biblio-maskr-cnn">[MaskR-CNN]</a> to semantically split an image into segments and replaces
+segments that represent other people and background with another picture.</p>
    <h4 class="heading settled" data-level="2.1.3" id="usecase-skeleton-detection"><span class="secno">2.1.3. </span><span class="content">Skeleton Detection</span><a class="self-link" href="#usecase-skeleton-detection"></a></h4>
    <p>A web-based video conferencing application tracks a pose of user’s skeleton by
 running a machine learning model, which allows for real-time human pose
@@ -1469,28 +1473,43 @@ <h4 class="heading settled" data-level="2.1.4" id="usecase-face-recognition"><sp
 approaches such as <a data-link-type="biblio" href="#biblio-ssd">[SSD]</a>) and checks whether each face was present at the
 previous meeting or not by running a machine learning model such as <a data-link-type="biblio" href="#biblio-facenet">[FaceNet]</a>,
 which verifies whether two faces would be identical or not.</p>
-   <h4 class="heading settled" data-level="2.1.5" id="usecase-super-resolution"><span class="secno">2.1.5. </span><span class="content">Super Resolution</span><a class="self-link" href="#usecase-super-resolution"></a></h4>
+   <h4 class="heading settled" data-level="2.1.5" id="usecase-facial-landmarks"><span class="secno">2.1.5. </span><span class="content">Facial Landmark Detection</span><a class="self-link" href="#usecase-facial-landmarks"></a></h4>
+   <p>A user wants to find new glasses that beautifully fit her on an online glasses
+store. The online store offers a web-based try-on simulator that runs a machine
+learning model such as Face Alignment Network <a data-link-type="biblio" href="#biblio-fan">[FAN]</a> to detect facial landmarks
+like eyes, nose, mouth, etc. When she chooses a pair of glasses, the simulator
+properly renders the selected glasses on the detected position of eyes on her
+facial image.</p>
+   <h4 class="heading settled" data-level="2.1.6" id="usecase-style-transfer"><span class="secno">2.1.6. </span><span class="content">Style Transfer</span><a class="self-link" href="#usecase-style-transfer"></a></h4>
+   <p>A user is looking for cosmetics on an online store and wondering which color may
+fit her face. The online store shows sample facial makeup images of cosmetics,
+and offers a makeup simulator that runs a machine learning model like <a data-link-type="biblio" href="#biblio-contextualloss">[ContextualLoss]</a> or <a data-link-type="biblio" href="#biblio-pairedcyclegan">[PairedCycleGAN]</a> to transfer the makeup style of the
+sample makeup image to her facial image. She can check how the selected makeup
+looks on her face by using the simulator.</p>
+   <h4 class="heading settled" data-level="2.1.7" id="usecase-super-resolution"><span class="secno">2.1.7. </span><span class="content">Super Resolution</span><a class="self-link" href="#usecase-super-resolution"></a></h4>
    <p>A web-based video conferencing is receiving a video stream from its peer, but
-the resolution of the video becomes lower due to network congestion. So the
-application runs a machine learning model for super-resolution such as <a data-link-type="biblio" href="#biblio-srgan">[SRGAN]</a> to generate higher-resolution video frames.</p>
-   <h4 class="heading settled" data-level="2.1.6" id="usecase-image-captioning"><span class="secno">2.1.6. </span><span class="content">Image Captioning</span><a class="self-link" href="#usecase-image-captioning"></a></h4>
+the resolution of the video becomes lower due to network congestion. To prevent
+degradation of the perceived video quality, the application runs a machine
+learning model for super-resolution such as <a data-link-type="biblio" href="#biblio-srgan">[SRGAN]</a> to generate
+higher-resolution video frames.</p>
+   <h4 class="heading settled" data-level="2.1.8" id="usecase-image-captioning"><span class="secno">2.1.8. </span><span class="content">Image Captioning</span><a class="self-link" href="#usecase-image-captioning"></a></h4>
    <p>For better accessibility, a web-based presentation application provides
 automatic image captioning by running a machine learning model such as <a data-link-type="biblio" href="#biblio-im2txt">[im2txt]</a> which predicts explanatory words of the presentation slides.</p>
-   <h4 class="heading settled" data-level="2.1.7" id="usecase-translation"><span class="secno">2.1.7. </span><span class="content">Machine Translation</span><a class="self-link" href="#usecase-translation"></a></h4>
+   <h4 class="heading settled" data-level="2.1.9" id="usecase-translation"><span class="secno">2.1.9. </span><span class="content">Machine Translation</span><a class="self-link" href="#usecase-translation"></a></h4>
    <p>Multiple people from various countries are talking via a web-based real-time
 text chat application. The application translates their conversation by using a
 machine learning model such as <a data-link-type="biblio" href="#biblio-gnmt">[GNMT]</a> or <a data-link-type="biblio" href="#biblio-opennmt">[OpenNMT]</a>, which translates every
 text into different language.</p>
-   <h4 class="heading settled" data-level="2.1.8" id="usecase-emotion-analysis"><span class="secno">2.1.8. </span><span class="content">Emotion Analysis</span><a class="self-link" href="#usecase-emotion-analysis"></a></h4>
+   <h4 class="heading settled" data-level="2.1.10" id="usecase-emotion-analysis"><span class="secno">2.1.10. </span><span class="content">Emotion Analysis</span><a class="self-link" href="#usecase-emotion-analysis"></a></h4>
    <p>A user is talking to her friend via a web-based real-time text chat application,
 and she is wondering how the friend feels because she cannot see the friend’s
 face. The application analyses the friend’s emotion by using a machine learning
 model such as <a data-link-type="biblio" href="#biblio-deepmoji">[DeepMoji]</a>, which infers emotion from input texts, and displays
 an emoji that represents the estimated emotion.</p>
-   <h4 class="heading settled" data-level="2.1.9" id="usecase-video-summalization"><span class="secno">2.1.9. </span><span class="content">Video Summarization</span><a class="self-link" href="#usecase-video-summalization"></a></h4>
+   <h4 class="heading settled" data-level="2.1.11" id="usecase-video-summalization"><span class="secno">2.1.11. </span><span class="content">Video Summarization</span><a class="self-link" href="#usecase-video-summalization"></a></h4>
    <p>A web-based video conferencing application records received video streams, and
 it needs to reduce recorded video data to be stored. The application generates
-the short version of the recoreded video by using a machine learning model for
+the short version of the recorded video by using a machine learning model for
 video summarization such as <a data-link-type="biblio" href="#biblio-video-summarization-with-lstm">[Video-Summarization-with-LSTM]</a>.</p>
    <h3 class="heading settled" data-level="2.2" id="usecases-lowlevel"><span class="secno">2.2. </span><span class="content">Low-Level Use Cases</span><a class="self-link" href="#usecases-lowlevel"></a></h3>
    <p>This section collects API-level use cases for a dedicated low-level API for
@@ -1503,10 +1522,10 @@ <h3 class="heading settled" data-level="2.2" id="usecases-lowlevel"><span class=
    <h4 class="heading settled" data-level="2.2.1" id="usecase-custom-layer"><span class="secno">2.2.1. </span><span class="content">Custom Layer</span><a class="self-link" href="#usecase-custom-layer"></a></h4>
    <p>A web application developer wants to run a DNN model on the WebNN API. However,
 she has found that some of activation functions like <a data-link-type="biblio" href="#biblio-leakyrelu">[LeakyReLU]</a>, <a data-link-type="biblio" href="#biblio-elu">[ELU]</a>,
-etc. are not included in the WebNN API. So she constructs custom layers of the
-additional activation functions on top of the WebNN API. Note that the scope of
-custom layers may include convolution, normalization, etc. as well as
-activation.</p>
+etc. are not included in the WebNN API. To address this issue, she constructs
+custom layers of the additional activation functions on top of the WebNN API.
+Note that the scope of custom layers may include convolution, normalization,
+etc. as well as activation.</p>
    <h4 class="heading settled" data-level="2.2.2" id="usecase-network-concat"><span class="secno">2.2.2. </span><span class="content">Network Concatenation</span><a class="self-link" href="#usecase-network-concat"></a></h4>
    <p>A web application uses a DNN model, and its model data of upper convolutional
 layers and lower fully-connected layers are stored in separate files, since
@@ -1518,13 +1537,13 @@ <h4 class="heading settled" data-level="2.2.2" id="usecase-network-concat"><span
 fully-connected layers with it.</p>
    <h4 class="heading settled" data-level="2.2.3" id="usecase-perf-adapt"><span class="secno">2.2.3. </span><span class="content">Performance Adaptation</span><a class="self-link" href="#usecase-perf-adapt"></a></h4>
    <p>A web application developer has a concern about performance of her DNN model on
-mobile devices. She has confirmed that the model runs too slow on mobile devices
-which does not have GPU acceleration. So her web application refers to the WebNN
-API to confirm whether acceleration is available or not, so that the application
-can display the warning for devices without acceleration.</p>
+mobile devices. She has confirmed that it may run too slow on mobile devices
+which do not have GPU acceleration. To address this issue, her web application
+refers to the WebNN API to confirm whether acceleration is available or not, so
+that the application can display the warning for devices without acceleration.</p>
    <p>After several weeks, she has developed a tiny DNN model that can even run on
-CPU. So she modifies the application so that the application loads the tiny
-model in the case of CPU-only devices.</p>
+CPU. In order to accommodate CPU execution, she modifies the application
+so that the application loads the tiny model in the case of CPU-only devices.</p>
   </main>
   <div data-fill-with="conformance">
    <h2 class="no-ref no-num heading settled" id="conformance"><span class="content"> Conformance</span><a class="self-link" href="#conformance"></a></h2>
@@ -1680,6 +1699,8 @@ <h3 class="no-num no-ref heading settled" id="normative"><span class="content">N
   </dl>
   <h3 class="no-num no-ref heading settled" id="informative"><span class="content">Informative References</span><a class="self-link" href="#informative"></a></h3>
   <dl>
+   <dt id="biblio-contextualloss">[ContextualLoss]
+   <dd>Roey Mechrez; Itamar Talmi; Lihi Zelnik-Manor. <a href="https://arxiv.org/abs/1803.02077">The Contextual Loss for Image Transformation with Non-Aligned Data</a>. July 2018. URL: <a href="https://arxiv.org/abs/1803.02077">https://arxiv.org/abs/1803.02077</a>
    <dt id="biblio-deeplabv3">[DeepLabv3+]
    <dd>Liang-Chieh Chen; et al. <a href="https://arxiv.org/abs/1802.02611">Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation</a>. August 2018. URL: <a href="https://arxiv.org/abs/1802.02611">https://arxiv.org/abs/1802.02611</a>
    <dt id="biblio-deepmoji">[DeepMoji]
@@ -1688,6 +1709,8 @@ <h3 class="no-num no-ref heading settled" id="informative"><span class="content"
    <dd>Djork-Arné Clevert; Thomas Unterthiner; Sepp Hochreiter. <a href="https://arxiv.org/abs/1511.07289">Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)</a>. February 2016. URL: <a href="https://arxiv.org/abs/1511.07289">https://arxiv.org/abs/1511.07289</a>
    <dt id="biblio-facenet">[FaceNet]
    <dd>Florian Schroff; Dmitry Kalenichenko; James Philbin. <a href="https://arxiv.org/abs/1503.03832">FaceNet: A Unified Embedding for Face Recognition and Clustering</a>. June 2015. URL: <a href="https://arxiv.org/abs/1503.03832">https://arxiv.org/abs/1503.03832</a>
+   <dt id="biblio-fan">[FAN]
+   <dd>Adrian Bulat; Georgios Tzimiropoulos. <a href="https://arxiv.org/abs/1703.07332">How far are we from solving the 2D &amp; 3D Face Alignment problem? (and a dataset of 230,000 3D facial landmarks)</a>. September 2017. URL: <a href="https://arxiv.org/abs/1703.07332">https://arxiv.org/abs/1703.07332</a>
    <dt id="biblio-gnmt">[GNMT]
    <dd>Minh-Thang Luong; Eugene Brevdo; Rui Zhao. <a href="https://github.com/tensorflow/nmt">Neural Machine Translation (seq2seq) Tutorial</a>. May 2017. URL: <a href="https://github.com/tensorflow/nmt">https://github.com/tensorflow/nmt</a>
    <dt id="biblio-im2txt">[IM2TXT]
@@ -1698,6 +1721,8 @@ <h3 class="no-num no-ref heading settled" id="informative"><span class="content"
    <dd>Kaiming He; et al. <a href="https://arxiv.org/abs/1703.06870">Mask R-CNN</a>. January 2018. URL: <a href="https://arxiv.org/abs/1703.06870">https://arxiv.org/abs/1703.06870</a>
    <dt id="biblio-opennmt">[OpenNMT]
    <dd>Guillaume Klein; et al. <a href="https://arxiv.org/abs/1701.02810">OpenNMT: Open-Source Toolkit for Neural Machine Translation</a>. March 2017. URL: <a href="https://arxiv.org/abs/1701.02810">https://arxiv.org/abs/1701.02810</a>
+   <dt id="biblio-pairedcyclegan">[PairedCycleGAN]
+   <dd>Huiwen Chang; et al. <a href="http://openaccess.thecvf.com/content_cvpr_2018/html/Chang_PairedCycleGAN_Asymmetric_Style_CVPR_2018_paper.html">PairedCycleGAN: Asymmetric Style Transfer for Applying and Removing Makeup</a>. June 2018. URL: <a href="http://openaccess.thecvf.com/content_cvpr_2018/html/Chang_PairedCycleGAN_Asymmetric_Style_CVPR_2018_paper.html">http://openaccess.thecvf.com/content_cvpr_2018/html/Chang_PairedCycleGAN_Asymmetric_Style_CVPR_2018_paper.html</a>
    <dt id="biblio-posenet">[PoseNet]
    <dd>Dan Oved. <a href="https://medium.com/tensorflow/real-time-human-pose-estimation-in-the-browser-with-tensorflow-js-7dd0bc881cd5">Real-time Human Pose Estimation in the Browser with TensorFlow.js</a>. May 2018. URL: <a href="https://medium.com/tensorflow/real-time-human-pose-estimation-in-the-browser-with-tensorflow-js-7dd0bc881cd5">https://medium.com/tensorflow/real-time-human-pose-estimation-in-the-browser-with-tensorflow-js-7dd0bc881cd5</a>
    <dt id="biblio-srgan">[SRGAN]