diff --git a/README.md b/README.md
index f3618b42de..b1bcb3fd6f 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ code.
Make sure you have the latest LTS version of [Node.js](https://nodejs.org) and
[Yarn](https://yarnpkg.com) installed.
-Run `yarn command`.
+Run `yarn`.
## Commands
diff --git a/content/docs/command-reference/cache/index.md b/content/docs/command-reference/cache/index.md
index 04999a469a..d8e565a806 100644
--- a/content/docs/command-reference/cache/index.md
+++ b/content/docs/command-reference/cache/index.md
@@ -15,9 +15,9 @@ positional arguments:
## Description
-At DVC initialization, a new `.dvc/` directory will be created for internal
-configuration and cache
-[files and directories](/doc/user-guide/dvc-files-and-directories) that are
+At DVC initialization, a new `.dvc/` directory is created for internal
+configuration and cache
+[files and directories](/doc/user-guide/dvc-files-and-directories), which are
hidden from the user.
The cache is where your data files, models, etc. (anything you want to version
diff --git a/content/docs/command-reference/init.md b/content/docs/command-reference/init.md
index feddb5ec96..ae9952a2b3 100644
--- a/content/docs/command-reference/init.md
+++ b/content/docs/command-reference/init.md
@@ -22,9 +22,9 @@ advanced scenarios:
- [Initializing DVC without Git](#how-does-it-affect-dvc-commands) - support for
SCM other than Git, deployment automation cases, etc.
-At DVC initialization, a new `.dvc/` directory will be created for internal
-configuration and cache
-[files and directories](/doc/user-guide/dvc-files-and-directories) that are
+At DVC initialization, a new `.dvc/` directory is created for internal
+configuration and cache
+[files and directories](/doc/user-guide/dvc-files-and-directories), which are
hidden from the user.
### Initializing DVC in subdirectories
diff --git a/content/docs/command-reference/remote/index.md b/content/docs/command-reference/remote/index.md
index c0ca9c2f7a..a933a54732 100644
--- a/content/docs/command-reference/remote/index.md
+++ b/content/docs/command-reference/remote/index.md
@@ -74,9 +74,9 @@ For the typical process to share the project via remote, see
### What is a "local remote" ?
While the term may seem contradictory, it doesn't have to be. The "local" part
-refers to the machine where the project is stored, so it can be any directory
-accessible to the same system. The "remote" part refers specifically to the
-project/repository itself. Read "local, but external" storage.
+refers to the location of the storage relative to the project, so it can be any
+directory in the file system. "Remote" is the term that refers to the storage.
+Read "local cache backup".
diff --git a/content/docs/command-reference/remote/list.md b/content/docs/command-reference/remote/list.md
index 57b39278cb..30880fecaf 100644
--- a/content/docs/command-reference/remote/list.md
+++ b/content/docs/command-reference/remote/list.md
@@ -45,9 +45,9 @@ Let's for simplicity add a _default_ local remote:
### What is a "local remote" ?
While the term may seem contradictory, it doesn't have to be. The "local" part
-refers to the machine where the project is stored, so it can be any directory
-accessible to the same system. The "remote" part refers specifically to the
-project/repository itself. Read "local, but external" storage.
+refers to the location of the storage relative to the project, so it can be any
+directory in the file system. "Remote" is the term that refers to the storage.
+Read "local cache backup".
diff --git a/content/docs/index.md b/content/docs/index.md
index f23d119383..7de213aaaa 100644
--- a/content/docs/index.md
+++ b/content/docs/index.md
@@ -1,8 +1,27 @@
# DVC Documentation
-Welcome! In here you may find all the guiding material and technical documents
-needed to learn about DVC: how to use it, how it works, and where to go for
-additional resources.
+Welcome! In here you may find all the material and technical details needed to
+learn about DVC: how to use it, how it works, and where to go for additional
+resources.
+
+## What is DVC?
+
+Data Version Control, or DVC, is **a new type of experiment management
+software** built on top of the existing engineering toolset that you're already
+used to, particularly [Git](https://git-scm.com) source code management. DVC
+reduces the gap between existing tools and data science needs.
+
+If you store and process data files or datasets to produce other data or machine
+learning models, and you want to:
+
+- capture and save data artifacts the same way you capture code;
+- track and switch between different versions of data or models easily;
+- understand how data or models were built in the first place;
+- be able to compare models and metrics to each other;
+- bring software engineering best practices to your data science team;
+- among other [use cases](/doc/use-cases)
+
+DVC is for you!
## Before you start
diff --git a/content/docs/sidebar.json b/content/docs/sidebar.json
index 7e3de822bc..618988e1db 100644
--- a/content/docs/sidebar.json
+++ b/content/docs/sidebar.json
@@ -1,6 +1,7 @@
[
{
- "slug": "home",
+ "slug": "",
+ "label": "Home",
"source": "index.md"
},
{
@@ -30,32 +31,14 @@
"children": [
{
"slug": "get-started",
- "source": false,
+ "source": "get-started/index.md",
"tutorials": {
"katacoda": "https://katacoda.com/dvc/courses/get-started/initialize"
},
"children": [
- "agenda",
- "initialize",
- "configure",
- "add-files",
- "store-data",
- "retrieve-data",
- "import-data",
- {
- "label": "Connect with Code",
- "slug": "connect-code-and-data"
- },
- "pipeline",
- "visualize",
- "reproduce",
- "metrics",
- "experiments",
- "compare-experiments",
- {
- "label": "Get Older Files",
- "slug": "older-versions"
- }
+ "versioning-basics",
+ "data-pipelines",
+ "experiment-management"
]
},
{
diff --git a/content/docs/tutorials/deep/preparation.md b/content/docs/tutorials/deep/preparation.md
index 6db7335400..bd25bbaf97 100644
--- a/content/docs/tutorials/deep/preparation.md
+++ b/content/docs/tutorials/deep/preparation.md
@@ -61,9 +61,9 @@ $ pip install -r code/requirements.txt
DVC works on top of Git repositories. You run DVC initialization in a repository
directory to create DVC meta files and directories.
-At DVC initialization, a new `.dvc/` directory will be created for internal
-configuration and cache
-[files and directories](/doc/user-guide/dvc-files-and-directories) that are
+At DVC initialization, a new `.dvc/` directory is created for internal
+configuration and cache
+[files and directories](/doc/user-guide/dvc-files-and-directories), which are
hidden from the user. We describe some DVC internals below for a better
understanding of how it works.
diff --git a/content/docs/tutorials/get-started/add-files.md b/content/docs/tutorials/get-started/add-files.md
deleted file mode 100644
index 048aafa213..0000000000
--- a/content/docs/tutorials/get-started/add-files.md
+++ /dev/null
@@ -1,89 +0,0 @@
-# Add Files or Directories
-
-DVC allows storing and versioning data files, ML models, directories,
-intermediate results with Git, without tracking the file contents with Git.
-Let's get a dataset example to play with:
-
-```dvc
-$ mkdir data
-$ dvc get https://github.com/iterative/dataset-registry \
- get-started/data.xml -o data/data.xml
-```
-
-> `dvc get` can use any DVC repository to find the appropriate
-> [remote storage](/doc/command-reference/remote) and download data
-> artifacts from it (analogous to `wget`, but for repositories). In this
-> case we use [dataset-registry](https://github.com/iterative/dataset-registry))
-> as the source repo. (Refer to
-> [Data Registries](/doc/use-cases/data-registries) for more info about this
-> setup.)
-
-To track a file (or a directory) with DVC just run `dvc add` on it. For example:
-
-```dvc
-$ dvc add data/data.xml
-```
-
-DVC stores information about the added data in a special file called a
-**DVC-file**. DVC-files are small text files with a human-readable
-[format](/doc/user-guide/dvc-file-format) and they can be committed with Git:
-
-```dvc
-$ git add data/.gitignore data/data.xml.dvc
-$ git commit -m "Add raw data to project"
-```
-
-Committing DVC-files with Git allows us to track different versions of the
-project data as it evolves with the source code tracked by Git.
-
-
-
-### Expand to learn about DVC internals
-
-`dvc add` moves the actual data file to the cache directory (see
-[DVC Files and Directories](/doc/user-guide/dvc-files-and-directories)), while
-the entries in the workspace may be file links to the actual files in the DVC
-cache.
-
-```dvc
-$ ls -R .dvc/cache
- .dvc/cache/a3:
- 04afb96060aad90176268345e10355
-```
-
-`a304afb96060aad90176268345e10355` above is the hash value of the `data.xml`
-file we just added with DVC. If you check the `data/data.xml.dvc` DVC-file, you
-will see that it has this string inside.
-
-### Important note on cache performance
-
-DVC tries to use reflinks\* by default to link your data files from the DVC
-cache to the workspace, optimizing speed and storage space. However, reflinks
-are not widely supported yet and DVC falls back to actually copying data files
-to/from the cache. **Copying can be very slow with large files**, and duplicates
-storage requirements.
-
-Hardlinks and symlinks are also available for optimized cache linking but,
-(unlike reflinks) they carry the risk of accidentally corrupting the cache if
-tracked data files are modified in the workspace.
-
-See [Large Dataset Optimization](/doc/user-guide/large-dataset-optimization) and
-`dvc config cache` for more information.
-
-> \***copy-on-write links or "reflinks"** are a relatively new way to link files
-> in UNIX-style file systems. Unlike hardlinks or symlinks, they support
-> transparent [copy on write](https://en.wikipedia.org/wiki/Copy-on-write). This
-> means that editing a reflinked file is always safe as all the other links to
-> the file will reflect the changes.
-
-
-
-If your workspace uses Git, without DVC you would have to manually put each data
-file or directory into `.gitignore`. DVC commands that track data files
-automatically takes care of this for you! (You just have to add the changes with
-Git.)
-
-Refer to
-[Versioning Data and Model Files](/doc/use-cases/versioning-data-and-model-files),
-`dvc add`, and `dvc run` for more information on storing and versioning data
-files with DVC.
diff --git a/content/docs/tutorials/get-started/agenda.md b/content/docs/tutorials/get-started/agenda.md
deleted file mode 100644
index ca56dfbdec..0000000000
--- a/content/docs/tutorials/get-started/agenda.md
+++ /dev/null
@@ -1,39 +0,0 @@
-# Agenda
-
-You'll need [Git](https://git-scm.com) to run the commands in this guide. Also,
-if DVC is not installed, please follow these [instructions](/doc/install) to do
-so.
-
-In the next few sections we'll build a simple natural language processing (NLP)
-project from scratch. If you'd like to get the final result or have any issues
-along the way, you can download the fully reproducible
-[GitHub project](https://github.com/iterative/example-get-started) by running:
-
-```dvc
-$ git clone https://github.com/iterative/example-get-started
-```
-
-Otherwise, bear with us and we'll introduce some basic DVC concepts to get the
-same results together!
-
-The idea for this project is a simplified version of our
-[Deep Dive Tutorial](/doc/tutorials/deep). It explores the NLP problem of
-predicting tags for a given StackOverflow question. For example, we might want a
-classifier that can classify (or predict) posts about Python by tagging them
-with `python`.
-
-
-
-This is a natural language processing context, but NLP isn't the only area of
-data science where DVC can help. DVC is designed to be agnostic of frameworks,
-languages, etc. If you have data files or datasets and/or you produce data
-files, models, or datasets and you want to:
-
-- Capture and save those data artifacts the same way you capture
- code
-- Track and switch between different versions of data easily
-- Understand how data artifacts (e.g. ML models) were built in the first place
-- Be able to compare models to each other
-- Bring software best practices to your team and get everyone on the same page
-
-Then you're in the right place! Click the `Next` button below to start ↘
diff --git a/content/docs/tutorials/get-started/compare-experiments.md b/content/docs/tutorials/get-started/compare-experiments.md
deleted file mode 100644
index 21e09a8450..0000000000
--- a/content/docs/tutorials/get-started/compare-experiments.md
+++ /dev/null
@@ -1,42 +0,0 @@
-# Compare Experiments
-
-DVC makes it easy to iterate on your project using Git commits with tags or Git
-branches. It provides a way to try different ideas, keep track of them, switch
-back and forth. To find the best performing experiment or track the progress,
-[project metrics](/doc/command-reference/metrics) are supported in DVC (as
-described in one of the previous chapters).
-
-Let's run evaluate for the latest `bigrams` experiment we created in previous
-chapters. It mostly takes just running the `dvc repro`:
-
-```dvc
-$ git checkout master
-$ dvc checkout
-$ dvc repro evaluate.dvc
-```
-
-`git checkout master` and `dvc checkout` commands ensure that we have the latest
-experiment code and data respectively. And `dvc repro`, as we discussed in the
-[Reproduce](/doc/tutorials/get-started/reproduce) chapter, is a way to run all
-the necessary commands to build the model and measure its performance.
-
-```dvc
-$ git commit -am "Evaluate bigrams model"
-$ git tag -a "bigrams-experiment" -m "Bigrams experiment evaluation"
-```
-
-Now, we can use `-T` option of the `dvc metrics show` command to see the
-difference between the `baseline` and `bigrams` experiments:
-
-```dvc
-$ dvc metrics show -T
-
-baseline-experiment:
- auc.metric: 0.588426
-bigrams-experiment:
- auc.metric: 0.602818
-```
-
-DVC provides built-in support to track and navigate `JSON`, `TSV` or `CSV`
-metric files if you want to track additional information. See `dvc metrics` to
-learn more.
diff --git a/content/docs/tutorials/get-started/configure.md b/content/docs/tutorials/get-started/configure.md
deleted file mode 100644
index 99e8ca9279..0000000000
--- a/content/docs/tutorials/get-started/configure.md
+++ /dev/null
@@ -1,67 +0,0 @@
-# Configure
-
-Once you install DVC, you'll be able to start using it (in its local setup)
-immediately.
-
-However, remote storage should be set up (see `dvc remote`) if you need to share
-data or models outside of the context of a single project, for example with
-other collaborators or even with yourself, in a different computing environment.
-It's similar to the way you would use GitHub or any other Git server to store
-and share your code.
-
-For simplicity, let's setup a local remote:
-
-
-
-### What is a "local remote" ?
-
-While the term may seem contradictory, it doesn't have to be. The "local" part
-refers to the machine where the project is stored, so it can be any directory
-accessible to the same system. The "remote" part refers specifically to the
-project/repository itself. Read "local, but external" storage.
-
-
-
-```dvc
-$ dvc remote add -d myremote /tmp/dvc-storage
-$ git commit .dvc/config -m "Configure local remote"
-```
-
-> We only use a local remote in this section for simplicity's sake as you learn
-> to use DVC. For most [use cases](/doc/use-cases), other "more remote" types of
-> remotes will be required.
-
-[Adding a remote](/doc/command-reference/remote/add) should be specified by both
-its type (protocol) and its path. DVC currently supports these types of remotes:
-
-- `s3`: Amazon Simple Storage Service
-- `azure`: Microsoft Azure Blob Storage
-- `gdrive` : Google Drive
-- `gs`: Google Cloud Storage
-- `ssh`: Secure Shell (requires SFTP)
-- `hdfs`: Hadoop Distributed File System
-- `http`: HTTP and HTTPS protocols
-- `local`: Directory in the local file system
-
-> If you installed DVC via `pip` and plan to use cloud services as remote
-> storage, you might need to install these optional dependencies: `[s3]`,
-> `[azure]`, `[gdrive]`, `[gs]`, `[oss]`, `[ssh]`. Alternatively, use `[all]` to
-> include them all. The command should look like this: `pip install "dvc[s3]"`.
-> (This example installs `boto3` library along with DVC to support S3 storage.)
-
-For example, to setup an S3 remote we would use something like this (make sure
-that `mybucket` exists):
-
-```dvc
-$ dvc remote add -d s3remote s3://mybucket/myproject
-```
-
-> This command is only shown for informational purposes. No need to actually run
-> it in order to continue with the Get Started.
-
-You can see that DVC doesn't require installing any databases, servers, or
-warehouses. It can use bare S3 or SSH to store data, intermediate results, and
-models.
-
-See `dvc config` to get information about more configuration options and
-`dvc remote` to learn more about remotes and get more examples.
diff --git a/content/docs/tutorials/get-started/connect-code-and-data.md b/content/docs/tutorials/get-started/connect-code-and-data.md
deleted file mode 100644
index 1bec301c0c..0000000000
--- a/content/docs/tutorials/get-started/connect-code-and-data.md
+++ /dev/null
@@ -1,165 +0,0 @@
-# Connect Code and Data
-
-Even in its basic scenarios, commands like `dvc add`, `dvc push`, `dvc pull`
-described in the previous sections could be used independently and provide a
-basic useful framework to track, save and share models and large data files. To
-achieve full reproducibility though, we'll have to connect code and
-configuration with the data it processes to produce the result.
-
-
-
-### Expand to prepare example code
-
-If you've followed this _Get Started_ section from the beginning, run these
-commands to get the example code:
-
-```dvc
-$ wget https://code.dvc.org/get-started/code.zip
-$ unzip code.zip
-$ rm -f code.zip
-```
-
-Windows doesn't include the `wget` utility by default, but you can use the
-browser to download `code.zip`. (Right-click
-[this link](https://code.dvc.org/get-started/code.zip) and select
-`Save Link As...` (Chrome). Save it into the project directory.
-
-The workspace should now look like this:
-
-```dvc
-$ tree
-.
-├── data
-│ ├── data.xml
-│ └── data.xml.dvc
-└── src
- ├── evaluate.py
- ├── featurization.py
- ├── prepare.py
- ├── requirements.txt
- └── train.py
-```
-
-Now let's install the requirements. But before we do that, we **strongly**
-recommend creating a
-[virtual environment](https://packaging.python.org/tutorials/installing-packages/#creating-virtual-environments):
-
-```dvc
-$ virtualenv -p python3 .env
-$ echo ".env/" >> .gitignore
-$ source .env/bin/activate
-$ pip install -r src/requirements.txt
-```
-
-Optionally, save the progress with Git:
-
-```dvc
-$ git add .
-$ git commit -m "Add source code files to repo"
-```
-
-
-
-Having installed the `src/prepare.py` script in your repo, the following command
-transforms it into a reproducible [stage](/doc/command-reference/run) for the ML
-pipeline we're building (described in the
-[next chapter](/doc/tutorials/pipelines)).
-
-```dvc
-$ dvc run -f prepare.dvc \
- -d src/prepare.py -d data/data.xml \
- -o data/prepared \
- python src/prepare.py data/data.xml
-```
-
-`dvc run` generates the `prepare.dvc` DVC-file. It has the same
-[format](/doc/user-guide/dvc-file-format) as the file we created in the
-[previous section](/doc/tutorials/get-started/add-files) to track `data.xml`,
-except in this case it has additional information about the `data/prepared`
-output (a directory where two files, `train.tsv` and `test.tsv`, will be written
-to), and about the Python command that is required to build it.
-
-
-
-### Expand to learn more about what has just happened
-
-This is how the result should look like now:
-
-```diff
- .
- ├── data
- │ ├── data.xml
- │ ├── data.xml.dvc
-+ │ └── prepared
-+ │ ├── test.tsv
-+ │ └── train.tsv
-+ ├── prepare.dvc
- └── src
- ├── evaluate.py
- ├── featurization.py
- ├── prepare.py
- ├── requirements.txt
- └── train.py
-```
-
-This is how `prepare.dvc` looks like:
-
-```yaml
-cmd: python src/prepare.py data/data.xml
-deps:
- - md5: b4801c88a83f3bf5024c19a942993a48
- path: src/prepare.py
- - md5: a304afb96060aad90176268345e10355
- path: data/data.xml
-md5: c3a73109be6c186b9d72e714bcedaddb
-outs:
- - cache: true
- md5: 6836f797f3924fb46fcfd6b9f6aa6416.dir
- metric: false
- path: data/prepared
-wdir: .
-```
-
-> `dvc run` is just the first of a set of DVC command required to generate a
-> [pipeline](/doc/tutorials/get-started/pipeline), or in other words,
-> instructions on how to build a ML model (data file) from previous data files
-> (or directories).
-
-Let's briefly mention what the command options used above mean for this
-particular example:
-
-`-f prepare.dvc` specifies a name for the DVC-file (pipeline stage). It's
-optional but we recommend using it to make your project structure more readable.
-
-`-d src/prepare.py` and `-d data/data.xml` mean that the `prepare.dvc` stage
-file depends on them to produce the result. When you run `dvc repro` next time
-(see next chapter) DVC will automatically check these dependencies and decide
-whether this stage is up to date or whether it should be executed to regenerate
-its outputs.
-
-`-o data/prepared` specifies the output directory processed data will be put
-into. The script creates two files in it – that will be used later to generate
-features, train and evaluate the model.
-
-And, the last line, `python src/prepare.py data/data.xml`, specifies a command
-to run. This command is saved to the generated DVC-file, and used later by
-`dvc repro`.
-
-Hopefully, `dvc run` (and `dvc repro`) will become intuitive after a few more
-Get Started chapters. You can always refer to the the command references for
-more details on their behavior and options.
-
-
-
-You don't need to run `dvc add` to track output files (`prepared/train.tsv` and
-`prepared/test.tsv`) with DVC. `dvc run` takes care of this. You only need to
-run `dvc push` (usually along with `git commit`) to save them to the remote when
-you are done.
-
-Let's commit the changes to save the stage we built:
-
-```dvc
-$ git add data/.gitignore prepare.dvc
-$ git commit -m "Create data preparation stage"
-$ dvc push
-```
diff --git a/content/docs/tutorials/get-started/data-pipelines.md b/content/docs/tutorials/get-started/data-pipelines.md
new file mode 100644
index 0000000000..2855b7acfa
--- /dev/null
+++ b/content/docs/tutorials/get-started/data-pipelines.md
@@ -0,0 +1,335 @@
+# Connect Code and Data
+
+Even in its basic scenarios, commands like `dvc add`, `dvc push`, `dvc pull`
+described in the previous sections could be used independently and provide a
+basic useful framework to track, save and share models and large data files. To
+achieve full reproducibility though, we'll have to connect code and
+configuration with the data it processes to produce the result.
+
+
+
+### Expand to prepare example code
+
+If you've followed this _Get Started_ section from the beginning, run these
+commands to get the example code:
+
+```dvc
+$ wget https://code.dvc.org/get-started/code.zip
+$ unzip code.zip
+$ rm -f code.zip
+```
+
+Windows doesn't include the `wget` utility by default, but you can use the
+browser to download `code.zip`. (Right-click
+[this link](https://code.dvc.org/get-started/code.zip) and select
+`Save Link As...` (Chrome). Save it into the project directory.)
+
+The workspace should now look like this:
+
+```dvc
+$ tree
+.
+├── data
+│ ├── data.xml
+│ └── data.xml.dvc
+└── src
+ ├── evaluate.py
+ ├── featurization.py
+ ├── prepare.py
+ ├── requirements.txt
+ └── train.py
+```
+
+Now let's install the requirements. But before we do that, we **strongly**
+recommend creating a
+[virtual environment](https://packaging.python.org/tutorials/installing-packages/#creating-virtual-environments):
+
+```dvc
+$ virtualenv -p python3 .env
+$ echo ".env/" >> .gitignore
+$ source .env/bin/activate
+$ pip install -r src/requirements.txt
+```
+
+Optionally, save the progress with Git:
+
+```dvc
+$ git add .
+$ git commit -m "Add source code files to repo"
+```
+
+
+
+Having installed the `src/prepare.py` script in your repo, the following command
+transforms it into a reproducible [stage](/doc/command-reference/run) for the ML
+pipeline we're building (described in the
+[next chapter](/doc/tutorials/pipelines)).
+
+```dvc
+$ dvc run -f prepare.dvc \
+ -d src/prepare.py -d data/data.xml \
+ -o data/prepared \
+ python src/prepare.py data/data.xml
+```
+
+`dvc run` generates the `prepare.dvc` DVC-file. It has the same
+[format](/doc/user-guide/dvc-file-format) as the file we created in the
+[previous section](/doc/tutorials/get-started/add-files) to track `data.xml`,
+except in this case it has additional information about the `data/prepared`
+output (a directory where two files, `train.tsv` and `test.tsv`, will be written
+to), and about the Python command that is required to build it.
+
+
+
+### Expand to learn more about what has just happened
+
+This is what the result should look like now:
+
+```diff
+ .
+ ├── data
+ │ ├── data.xml
+ │ ├── data.xml.dvc
++ │ └── prepared
++ │ ├── test.tsv
++ │ └── train.tsv
++ ├── prepare.dvc
+ └── src
+ ├── evaluate.py
+ ├── featurization.py
+ ├── prepare.py
+ ├── requirements.txt
+ └── train.py
+```
+
+This is what `prepare.dvc` looks like:
+
+```yaml
+cmd: python src/prepare.py data/data.xml
+deps:
+ - md5: b4801c88a83f3bf5024c19a942993a48
+ path: src/prepare.py
+ - md5: a304afb96060aad90176268345e10355
+ path: data/data.xml
+md5: c3a73109be6c186b9d72e714bcedaddb
+outs:
+ - cache: true
+ md5: 6836f797f3924fb46fcfd6b9f6aa6416.dir
+ metric: false
+ path: data/prepared
+wdir: .
+```
+
+> `dvc run` is just the first of a set of DVC commands required to generate a
+> [pipeline](/doc/tutorials/get-started/pipeline), or in other words,
+> instructions on how to build an ML model (data file) from previous data files
+> (or directories).
+
+Let's briefly mention what the command options used above mean for this
+particular example:
+
+`-f prepare.dvc` specifies a name for the DVC-file (pipeline stage). It's
+optional but we recommend using it to make your project structure more readable.
+
+`-d src/prepare.py` and `-d data/data.xml` mean that the `prepare.dvc` stage
+file depends on them to produce the result. When you run `dvc repro` next time
+(see next chapter) DVC will automatically check these dependencies and decide
+whether this stage is up to date or whether it should be executed to regenerate
+its outputs.
+
+`-o data/prepared` specifies the output directory that processed data will be
+put into. The script creates two files in it, which will be used later to
+generate features, train and evaluate the model.
+
+And, the last line, `python src/prepare.py data/data.xml`, specifies a command
+to run. This command is saved to the generated DVC-file, and used later by
+`dvc repro`.
+
+Hopefully, `dvc run` (and `dvc repro`) will become intuitive after a few more
+Get Started chapters. You can always refer to the command references for
+more details on their behavior and options.
+
+
+
+You don't need to run `dvc add` to track output files (`prepared/train.tsv` and
+`prepared/test.tsv`) with DVC. `dvc run` takes care of this. You only need to
+run `dvc push` (usually along with `git commit`) to save them to the remote when
+you are done.
+
+Let's commit the changes to save the stage we built:
+
+```dvc
+$ git add data/.gitignore prepare.dvc
+$ git commit -m "Create data preparation stage"
+$ dvc push
+```
+
+# Pipeline
+
+Support for [pipelines](/doc/command-reference/pipeline) is the biggest
+difference between DVC and other version control tools that can handle large
+data files (e.g. `git lfs`). By using `dvc run` multiple times, and specifying
+outputs of a command (stage) as dependencies in another one, we can describe a
+sequence of commands that gets to a desired result. This is what we call a
+**data pipeline** or dependency graph.
+
+Let's create a second stage (after `prepare.dvc`, created in the previous
+chapter) to perform feature extraction:
+
+```dvc
+$ dvc run -f featurize.dvc \
+ -d src/featurization.py -d data/prepared \
+ -o data/features \
+ python src/featurization.py \
+ data/prepared data/features
+```
+
+And a third stage for training:
+
+```dvc
+$ dvc run -f train.dvc \
+ -d src/train.py -d data/features \
+ -o model.pkl \
+ python src/train.py data/features model.pkl
+```
+
+Let's commit DVC-files that describe our pipeline so far:
+
+```dvc
+$ git add data/.gitignore .gitignore featurize.dvc train.dvc
+$ git commit -m "Create featurization and training stages"
+$ dvc push
+```
+
+This example is simplified just to show you a basic pipeline. See a more
+advanced [example](/doc/tutorials/pipelines) or
+[complete tutorial](/doc/tutorials/pipelines) to create an
+[NLP](https://en.wikipedia.org/wiki/Natural_language_processing) pipeline
+end-to-end.
+
+> See also the `dvc pipeline` command.
+
+# Visualize
+
+Now that we have built our pipeline, we need a good way to visualize it to be
+able to wrap our heads around it. Luckily, DVC allows us to do that without
+leaving the terminal, making the experience distraction-free.
+
+We are using the `--ascii` option below to better illustrate this pipeline.
+Please refer to `dvc pipeline show` to explore other options this command
+supports (e.g. `.dot` files that can then be used in other tools).
+
+## Stages
+
+```dvc
+$ dvc pipeline show --ascii train.dvc
+ +-------------------+
+ | data/data.xml.dvc |
+ +-------------------+
+ *
+ *
+ *
+ +-------------+
+ | prepare.dvc |
+ +-------------+
+ *
+ *
+ *
+ +---------------+
+ | featurize.dvc |
+ +---------------+
+ *
+ *
+ *
+ +-----------+
+ | train.dvc |
+ +-----------+
+```
+
+## Commands
+
+```dvc
+$ dvc pipeline show --ascii train.dvc --commands
+ +-------------------------------------+
+ | python src/prepare.py data/data.xml |
+ +-------------------------------------+
+ *
+ *
+ *
+ +---------------------------------------------------------+
+ | python src/featurization.py data/prepared data/features |
+ +---------------------------------------------------------+
+ *
+ *
+ *
+ +---------------------------------------------+
+ | python src/train.py data/features model.pkl |
+ +---------------------------------------------+
+```
+
+## Outputs
+
+```dvc
+$ dvc pipeline show --ascii train.dvc --outs
+ +---------------+
+ | data/data.xml |
+ +---------------+
+ *
+ *
+ *
+ +---------------+
+ | data/prepared |
+ +---------------+
+ *
+ *
+ *
+ +---------------+
+ | data/features |
+ +---------------+
+ *
+ *
+ *
+ +-----------+
+ | model.pkl |
+ +-----------+
+```
+
+# Reproduce
+
+In the previous chapters, we described our first
+[pipeline](/doc/command-reference/pipeline). Basically, we generated a number of
+[stage files](/doc/command-reference/run)
+([DVC-files](/doc/user-guide/dvc-file-format)). These stages define individual
+commands to execute towards a final result. Each depends on some data (either
+raw data files or intermediate results from previous stages) and code files.
+
+If you just cloned the
+[project](https://github.com/iterative/example-get-started), make sure you first
+fetch the input data from DVC by calling `dvc pull`.
+
+It's now extremely easy for you or your colleagues to reproduce the result
+end-to-end:
+
+```dvc
+$ dvc repro train.dvc
+```
+
+> If you've just followed the previous chapters, the command above will have
+> nothing to reproduce since you've recently executed all the pipeline stages.
+> To easily try this command, clone this example
+> [GitHub project](https://github.com/iterative/example-get-started) and run it
+> from there.
+
+`train.dvc` describes which source code and data files to use, and how to run
+the command in order to get the resulting model file. For each data file it
+depends on, we can in turn do the same analysis: find a corresponding DVC-file
+that includes the data file in its outputs, get dependencies and commands, and
+so on. It means that DVC can recursively build a complete sequence of commands
+it needs to execute to get the model file.
+
+`dvc repro` essentially builds a dependency graph, detects stages with modified
+dependencies or missing outputs and recursively executes commands (nodes in this
+graph or pipeline) starting from the first stage with changes.
+
+Thus, `dvc run` and `dvc repro` provide a powerful framework for _reproducible
+experiments_ and _reproducible projects_.
diff --git a/content/docs/tutorials/get-started/experiment-management.md b/content/docs/tutorials/get-started/experiment-management.md
new file mode 100644
index 0000000000..d6cd1c5d30
--- /dev/null
+++ b/content/docs/tutorials/get-started/experiment-management.md
@@ -0,0 +1,192 @@
+# Experiment Metrics
+
+Finally, we'd like to add an evaluation stage to our
+[pipeline](/doc/command-reference/pipeline). Data science is a metric-driven
+R&D-like process and `dvc metrics` commands along with DVC metric files provide
+a framework to capture and compare experiment performance. It doesn't require
+installing any databases or instrumenting your code to use some API; everything
+is tracked by Git and is stored in Git or DVC remote storage:
+
+```dvc
+$ dvc run -f evaluate.dvc \
+ -d src/evaluate.py -d model.pkl -d data/features \
+ -M auc.metric \
+ python src/evaluate.py model.pkl \
+ data/features auc.metric
+```
+
+`evaluate.py` calculates AUC value using the test dataset. It reads features
+from the `features/test.pkl` file and produces a
+[metric](/doc/command-reference/metrics) file (`auc.metric`). Any
+output (in this case just a plain text file containing a single
+numeric value) can be marked as a metric, for example by using the `-M` option
+of `dvc run`.
+
+> Please, refer to the `dvc metrics` command documentation to see more details.
+
+Let's save the updated results:
+
+```dvc
+$ git add evaluate.dvc auc.metric
+$ git commit -m "Create evaluation stage"
+$ dvc push
+```
+
+Let's also assign a Git tag. It will serve as a checkpoint for us to compare
+experiments in the future, or if we need to go back and check out this version
+and the corresponding data:
+
+```dvc
+$ git tag -a "baseline-experiment" -m "Baseline experiment evaluation"
+```
+
+The `dvc metrics show` command provides a way to compare different experiments,
+by analyzing metric files across different branches, tags, etc. But first we
+need to create a new experiment to compare the baseline with.
+
+# Experiments
+
+The data science process is inherently iterative and R&D-like. Data scientists
+may try many different approaches, different hyper-parameter values, and "fail"
+many times before the required level of a metric is achieved.
+
+DVC is built to provide a way to capture different experiments and navigate
+easily between them. Let's say we want to try a modified feature extraction:
+
+
+
+### Expand to see code modifications
+
+Edit `src/featurization.py` to enable bigrams and increase the number of
+features. Find and change the `CountVectorizer` arguments, specify `ngram_range`
+and increase number of features:
+
+```python
+bag_of_words = CountVectorizer(stop_words='english',
+ max_features=6000,
+ ngram_range=(1, 2))
+```
+
+
+
+```dvc
+$ vi src/featurization.py # edit to use bigrams (see above)
+$ dvc repro train.dvc # regenerate the new model.pkl
+$ git commit -am "Reproduce model using bigrams"
+```
+
+> Notice that `git commit -a` stages all the changes produced by `dvc repro`
+> before committing them with Git. Refer to the
+> [command reference](https://git-scm.com/docs/git-commit#Documentation/git-commit.txt--a)
+> for more details.
+
+Now, we have a new `model.pkl` captured and saved. To get back to the initial
+version, we run `git checkout` along with the `dvc checkout` command:
+
+```dvc
+$ git checkout baseline-experiment
+$ dvc checkout
+```
+
+DVC is designed to checkout large data files (no matter how large they are) into
+your workspace almost instantly on almost all modern operating
+systems with file links. See
+[Large Dataset Optimization](/doc/user-guide/large-dataset-optimization) for
+more information.
+
+# Compare Experiments
+
+DVC makes it easy to iterate on your project using Git commits with tags or Git
+branches. It provides a way to try different ideas, keep track of them, switch
+back and forth. To find the best performing experiment or track the progress,
+[project metrics](/doc/command-reference/metrics) are supported in DVC (as
+described in one of the previous chapters).
+
+Let's run the evaluation stage for the latest `bigrams` experiment we created
+earlier. It mostly takes just running `dvc repro`:
+
+```dvc
+$ git checkout master
+$ dvc checkout
+$ dvc repro evaluate.dvc
+```
+
+`git checkout master` and `dvc checkout` commands ensure that we have the latest
+experiment code and data respectively. And `dvc repro`, as we discussed in the
+[Reproduce](/doc/tutorials/get-started/reproduce) chapter, is a way to run all
+the necessary commands to build the model and measure its performance.
+
+```dvc
+$ git commit -am "Evaluate bigrams model"
+$ git tag -a "bigrams-experiment" -m "Bigrams experiment evaluation"
+```
+
+Now, we can use the `-T` option of the `dvc metrics show` command to see the
+difference between the `baseline` and `bigrams` experiments:
+
+```dvc
+$ dvc metrics show -T
+
+baseline-experiment:
+ auc.metric: 0.588426
+bigrams-experiment:
+ auc.metric: 0.602818
+```
+
+DVC provides built-in support to track and navigate `JSON`, `TSV` or `CSV`
+metric files if you want to track additional information. See `dvc metrics` to
+learn more.
+
+# Get Older Data Version
+
+Now that we have multiple experiments, models, processed datasets, the question
+is how do we revert back to an older version of a model file? Or how can we get
+the previous version of the dataset if it was changed at some point?
+
+The answer is the `dvc checkout` command, and we already briefly touched on the
+process of switching between different data versions in the
+[Experiments](/doc/tutorials/get-started/experiment-management#experiments)
+chapter of this _Get Started_ section.
+
+Let's say we want to get the previous `model.pkl` file. The short answer is:
+
+```dvc
+$ git checkout baseline-experiment train.dvc
+$ dvc checkout train.dvc
+```
+
+These two commands will bring the previous model file to its place in the
+workspace.
+
+
+
+### Expand to learn about DVC internals
+
+DVC uses special [DVC-files](/doc/user-guide/dvc-file-format) to track data
+files, directories, end results. In this case, `train.dvc` among other things
+describes the `model.pkl` file this way:
+
+```yaml
+outs:
+  - md5: a66489653d1b6a8ba989799367b32c43
+    path: model.pkl
+```
+
+`a664...2c43` is the "address" of the file in the local or remote DVC storage.
+
+It means that if we want to get to the previous version, we need to restore the
+DVC-file first with the `git checkout` command. Only after that can DVC restore
+the model file using the new "address" from the DVC-file.
+
+
+
+To fully restore the previous experiment we just run `git checkout` and
+`dvc checkout` without specifying a target:
+
+```dvc
+$ git checkout baseline-experiment
+$ dvc checkout
+```
+
+Read the `dvc checkout` command reference and a dedicated data versioning
+[example](/doc/tutorials/versioning) for more information.
diff --git a/content/docs/tutorials/get-started/experiments.md b/content/docs/tutorials/get-started/experiments.md
deleted file mode 100644
index b716872a2e..0000000000
--- a/content/docs/tutorials/get-started/experiments.md
+++ /dev/null
@@ -1,49 +0,0 @@
-# Experiments
-
-Data science process is inherently iterative and R&D like. Data scientist may
-try many different approaches, different hyper-parameter values, and "fail" many
-times before the required level of a metric is achieved.
-
-DVC is built to provide a way to capture different experiments and navigate
-easily between them. Let's say we want to try a modified feature extraction:
-
-
-
-### Expand to see code modifications
-
-Edit `src/featurization.py` to enable bigrams and increase the number of
-features. Find and change the `CountVectorizer` arguments, specify `ngram_range`
-and increase number of features:
-
-```python
-bag_of_words = CountVectorizer(stop_words='english',
- max_features=6000,
- ngram_range=(1, 2))
-```
-
-
-
-```dvc
-$ vi src/featurization.py # edit to use bigrams (see above)
-$ dvc repro train.dvc # regenerate the new model.pkl
-$ git commit -am "Reproduce model using bigrams"
-```
-
-> Notice that `git commit -a` stages all the changes produced by `dvc repro`
-> before committing them with Git. Refer to the
-> [command reference](https://git-scm.com/docs/git-commit#Documentation/git-commit.txt--a)
-> for more details.
-
-Now, we have a new `model.pkl` captured and saved. To get back to the initial
-version, we run `git checkout` along with `dvc checkout` command:
-
-```dvc
-$ git checkout baseline-experiment
-$ dvc checkout
-```
-
-DVC is designed to checkout large data files (no matter how large they are) into
-your workspace almost instantly on almost all modern operating
-systems with file links. See
-[Large Dataset Optimization](/doc/user-guide/large-dataset-optimization) for
-more information.
diff --git a/content/docs/tutorials/get-started/import-data.md b/content/docs/tutorials/get-started/import-data.md
deleted file mode 100644
index 6900533d5c..0000000000
--- a/content/docs/tutorials/get-started/import-data.md
+++ /dev/null
@@ -1,87 +0,0 @@
-# Import Data
-
-We've seen how to [push](/doc/tutorials/get-started/store-data) and
-[pull](/doc/tutorials/get-started/retrieve-data) data from/to a DVC
-project's [remote](/doc/command-reference/remote). But what if we wanted
-to integrate a dataset or ML model produced in one project into another one?
-
-One way is to manually download the data (with `wget` or `dvc get`, for example)
-and use `dvc add` to track it, but the connection between the projects would be
-lost. We wouldn't be able to tell where the data came from or whether there are
-new versions available. A better alternative is the `dvc import` command:
-
-
-
-```dvc
-$ dvc import https://github.com/iterative/dataset-registry \
- get-started/data.xml
-```
-
-This downloads `data.xml` from our
-[dataset-registry](https://github.com/iterative/dataset-registry) project into
-the current working directory, adds it to `.gitignore`, and creates the
-`data.xml.dvc` [DVC-file](/doc/user-guide/dvc-file-format) to track changes in
-the source data. With _imports_, we can use `dvc update` to bring in changes in
-the external data source before
-[reproducing](/doc/tutorials/get-started/reproduce) any pipeline
-that depends on this data.
-
-
-
-### Expand to learn more about imports
-
-Note that the [dataset-registry](https://github.com/iterative/dataset-registry)
-repository doesn't actually contain a `get-started/data.xml` file. Instead, DVC
-inspects
-[get-started/data.xml.dvc](https://github.com/iterative/dataset-registry/blob/master/get-started/data.xml.dvc)
-and tries to retrieve the file using the project's default remote (configured
-[here](https://github.com/iterative/dataset-registry/blob/master/.dvc/config)).
-
-DVC-files created by `dvc import` are called _import stages_. They use the
-`repo` field in the dependencies section (`deps`) in order to track source data
-changes (as an [external dependency](/doc/user-guide/external-dependencies)),
-enabling the reusability of data artifacts. For example:
-
-```yaml
-md5: fd56a1794c147fea48d408f2bc95a33a
-locked: true
-deps:
- - path: get-started/data.xml
- repo:
- url: https://github.com/iterative/dataset-registry
- rev_lock: 7476a858f6200864b5755863c729bff41d0fb045
-outs:
- - md5: a304afb96060aad90176268345e10355
- path: data.xml
- cache: true
- metric: false
- persist: false
-```
-
-The `url` and `rev_lock` subfields under `repo` are used to save the origin and
-[version](https://git-scm.com/docs/revisions) of the dependency, respectively.
-
-> Note that `dvc update` updates the `rev_lock` field of the corresponding
-> DVC-file (when there are changes to bring in).
-
-
-
-Since this is not an official part of this _Get Started_, bring everything back
-to normal with:
-
-```dvc
-$ git reset --hard
-$ rm -f data.*
-```
-
-> See also `dvc import-url`.
diff --git a/content/docs/tutorials/get-started/index.md b/content/docs/tutorials/get-started/index.md
new file mode 100644
index 0000000000..77bfcd7fc2
--- /dev/null
+++ b/content/docs/tutorials/get-started/index.md
@@ -0,0 +1,104 @@
+# Get Started with DVC!
+
+You'll need [Git](https://git-scm.com) to run the commands in this tutorial.
+Also, if DVC is not installed, please follow these [instructions](/doc/install)
+first.
+
+In the next few pages we'll build a simple natural language processing (NLP)
+project from scratch. It explores the NLP problem of predicting tags for a given
+StackOverflow question. For example, we might want a classifier that can
+classify (or predict) posts about Python by tagging them with `python`.
+
+ _Data modeling overview_
+
+> This is a simplified version of our [Deep Dive Tutorial](/doc/tutorials/deep).
+
+Keep in mind that NLP is not the only area of data science where DVC can help.
+DVC is designed to be agnostic of frameworks, programming languages, etc.
+
+> In case you'd like to get the complete code base and results, or have any
+> issues along the way, please note we have a fully reproducible
+> [**example-get-started**](https://github.com/iterative/example-get-started)
+> repo on GitHub:
+>
+> ```dvc
+> $ git clone https://github.com/iterative/example-get-started
+> $ cd example-get-started
+> $ dvc pull
+> ```
+
+## Initialize
+
+Let's start by creating a workspace in your home directory that we can
+version with Git. Then run `dvc init` inside to create a DVC
+repository:
+
+```dvc
+$ cd ~
+$ mkdir sotag-predictions
+$ cd sotag-predictions
+$ git init
+$ dvc init
+$ git commit -m "Initialize DVC repository"
+```
+
+At DVC initialization, a new `.dvc/` directory is created for internal
+configuration and cache
+[files and directories](/doc/user-guide/dvc-files-and-directories), that are
+hidden from the user.
+
+> See [DVC Files and Directories](/doc/user-guide/dvc-files-and-directories) to
+> learn more about the DVC internal file and directory structure.
+
+The last command, `git commit`, versions the `.dvc/config` and `.dvc/.gitignore`
+DVC internal files with Git.
+
+## Configure
+
+Because we'll want to share data and models outside of the local context later
+(for example with other collaborators or for access from a different computing
+environment), we're going to set up a remote storage for the DVC
+project. For simplicity, let's set up a _local remote_.
+
+
+
+### What is a "local remote" ?
+
+While the term may seem contradictory, it doesn't have to be. The "local" part
+refers to the location of the storage relative to the project, so it can be any
+directory in the file system. "Remote" is the term that refers to the storage.
+Read it as "local cache backup".
+
+
+
+```dvc
+$ dvc remote add -d myremote /tmp/dvc-storage
+$ git commit .dvc/config -m "Configure local remote"
+```
+
+> We only use a local remote in this tutorial for simplicity's sake. For most
+> cases, other "more remote" types of storage will be required.
+
+That's it! DVC doesn't require installing any databases, servers, or warehouses.
+It can simply use cloud services, local or network file systems to store data,
+intermediate results, and ML models. The following remote types are currently
+supported:
+
+- Amazon **S3** (Simple Storage Service)
+- Microsoft **Azure** Blob Storage
+- **Google Drive**
+- **Google Cloud** Storage
+- Aliyun **OSS** (Object Storage Service)
+- **SSH** (Secure Shell) — requires SFTP
+- **HDFS** (Hadoop Distributed File System)
+- **HTTP** (and HTTPS) — read-only
+- Directory in the **local** file system
+
+> Refer to `dvc remote` for more details and examples.
+
+There are other features and options that can be configured in DVC. Please see
+`dvc config` for more information.
+
+---
+
+Go to the next page to continue ↘
diff --git a/content/docs/tutorials/get-started/initialize.md b/content/docs/tutorials/get-started/initialize.md
deleted file mode 100644
index 1e227d96c9..0000000000
--- a/content/docs/tutorials/get-started/initialize.md
+++ /dev/null
@@ -1,29 +0,0 @@
-# Initialize
-
-There are a few recommended ways to install DVC: OS-specific package/installer,
-`pip`, `conda`, and Homebrew. See [Installation](/doc/install) for all the
-alternatives and details.
-
-Let's start by creating a workspace we can version with Git. Then
-run `dvc init` inside to create the DVC project:
-
-```dvc
-$ mkdir example-get-started
-$ cd example-get-started
-$ git init
-$ dvc init
-$ git commit -m "Initialize DVC project"
-```
-
-At DVC initialization, a new `.dvc/` directory will be created for internal
-configuration and cache
-[files and directories](/doc/user-guide/dvc-files-and-directories) that are
-hidden from the user.
-
-> See `dvc init` if you want to get more details about the initialization
-> process, and
-> [DVC Files and Directories](/doc/user-guide/dvc-files-and-directories) to
-> learn about the DVC internal file and directory structure.
-
-The last command, `git commit`, versions the `.dvc/config` and `.dvc/.gitignore`
-files (DVC internals) with Git.
diff --git a/content/docs/tutorials/get-started/metrics.md b/content/docs/tutorials/get-started/metrics.md
deleted file mode 100644
index e91ba6371f..0000000000
--- a/content/docs/tutorials/get-started/metrics.md
+++ /dev/null
@@ -1,45 +0,0 @@
-# Experiment Metrics
-
-Finally, we'd like to add an evaluation stage to our
-[pipeline](/doc/command-reference/pipeline). Data science is a metric-driven
-R&D-like process and `dvc metrics` commands along with DVC metric files provide
-a framework to capture and compare experiments performance. It doesn't require
-installing any databases or instrumenting your code to use some API, all is
-tracked by Git and is stored in Git or DVC remote storage:
-
-```dvc
-$ dvc run -f evaluate.dvc \
- -d src/evaluate.py -d model.pkl -d data/features \
- -M auc.metric \
- python src/evaluate.py model.pkl \
- data/features auc.metric
-```
-
-`evaluate.py` calculates AUC value using the test dataset. It reads features
-from the `features/test.pkl` file and produces a
-[metric](/doc/command-reference/metrics) file (`auc.metric`). Any
-output (in this case just a plain text file containing a single
-numeric value) can be marked as a metric, for example by using the `-M` option
-of `dvc run`.
-
-> Please, refer to the `dvc metrics` command documentation to see more details.
-
-Let's save the updated results:
-
-```dvc
-$ git add evaluate.dvc auc.metric
-$ git commit -m "Create evaluation stage"
-$ dvc push
-```
-
-Let's also assign a Git tag, it will serve as a checkpoint for us to compare
-experiments in the future, or if we need to go back and checkout it and the
-corresponding data:
-
-```dvc
-$ git tag -a "baseline-experiment" -m "Baseline experiment evaluation"
-```
-
-The `dvc metrics show` command provides a way to compare different experiments,
-by analyzing metric files across different branches, tags, etc. But first we
-need to create a new experiment to compare the baseline with.
diff --git a/content/docs/tutorials/get-started/older-versions.md b/content/docs/tutorials/get-started/older-versions.md
deleted file mode 100644
index bde6bce562..0000000000
--- a/content/docs/tutorials/get-started/older-versions.md
+++ /dev/null
@@ -1,53 +0,0 @@
-# Get Older Data Version
-
-Now that we have multiple experiments, models, processed datasets, the question
-is how do we revert back to an older version of a model file? Or how can we get
-the previous version of the dataset if it was changed at some point?
-
-The answer is the `dvc checkout` command, and we already touched briefly the
-process of switching between different data versions in the
-[Experiments](/doc/tutorials/get-started/experiments) chapter of this _Get
-Started_ section.
-
-Let's say we want to get the previous `model.pkl` file. The short answer is:
-
-```dvc
-$ git checkout baseline-experiment train.dvc
-$ dvc checkout train.dvc
-```
-
-These two commands will bring the previous model file to its place in the
-workspace.
-
-
-
-### Expand to learn about DVC internals
-
-DVC uses special [DVC-files](/doc/user-guide/dvc-file-format) to track data
-files, directories, end results. In this case, `train.dvc` among other things
-describes the `model.pkl` file this way:
-
-```yaml
-outs:
-md5: a66489653d1b6a8ba989799367b32c43
-path: model.pkl
-```
-
-`a664...2c43` is the "address" of the file in the local or remote DVC storage.
-
-It means that if we want to get to the previous version, we need to restore the
-DVC-file first with the `git checkout` command. Only after that can DVC restore
-the model file using the new "address" from the DVC-file.
-
-
-
-To fully restore the previous experiment we just run `git checkout` and
-`dvc checkout` without specifying a target:
-
-```dvc
-$ git checkout baseline-experiment
-$ dvc checkout
-```
-
-Read the `dvc checkout` command reference and a dedicated data versioning
-[example](/doc/tutorials/versioning) for more information.
diff --git a/content/docs/tutorials/get-started/pipeline.md b/content/docs/tutorials/get-started/pipeline.md
deleted file mode 100644
index d9f0f19390..0000000000
--- a/content/docs/tutorials/get-started/pipeline.md
+++ /dev/null
@@ -1,44 +0,0 @@
-# Pipeline
-
-Support for [pipelines](/doc/command-reference/pipeline) is the biggest
-difference between DVC and other version control tools that can handle large
-data files (e.g. `git lfs`). By using `dvc run` multiple times, and specifying
-outputs of a command (stage) as dependencies in another one, we can describe a
-sequence of commands that gets to a desired result. This is what we call a
-**data pipeline** or dependency graph.
-
-Let's create a second stage (after `prepare.dvc`, created in the previous
-chapter) to perform feature extraction:
-
-```dvc
-$ dvc run -f featurize.dvc \
- -d src/featurization.py -d data/prepared \
- -o data/features \
- python src/featurization.py \
- data/prepared data/features
-```
-
-And a third stage for training:
-
-```dvc
-$ dvc run -f train.dvc \
- -d src/train.py -d data/features \
- -o model.pkl \
- python src/train.py data/features model.pkl
-```
-
-Let's commit DVC-files that describe our pipeline so far:
-
-```dvc
-$ git add data/.gitignore .gitignore featurize.dvc train.dvc
-$ git commit -m "Create featurization and training stages"
-$ dvc push
-```
-
-This example is simplified just to show you a basic pipeline, see a more
-advanced [example](/doc/tutorials/pipelines) or
-[complete tutorial](/doc/tutorials/pipelines) to create an
-[NLP](https://en.wikipedia.org/wiki/Natural_language_processing) pipeline
-end-to-end.
-
-> See also the `dvc pipeline` command.
diff --git a/content/docs/tutorials/get-started/reproduce.md b/content/docs/tutorials/get-started/reproduce.md
deleted file mode 100644
index d6e6375878..0000000000
--- a/content/docs/tutorials/get-started/reproduce.md
+++ /dev/null
@@ -1,39 +0,0 @@
-# Reproduce
-
-In the previous chapters, we described our first
-[pipeline](/doc/command-reference/pipeline). Basically, we generated a number of
-[stage files](/doc/command-reference/run)
-([DVC-files](/doc/user-guide/dvc-file-format)). These stages define individual
-commands to execute towards a final result. Each depends on some data (either
-raw data files or intermediate results from previous stages) and code files.
-
-If you just cloned the
-[project](https://github.com/iterative/example-get-started), make sure you first
-fetch the input data from DVC by calling `dvc pull`.
-
-It's now extremely easy for you or your colleagues to reproduce the result
-end-to-end:
-
-```dvc
-$ dvc repro train.dvc
-```
-
-> If you've just followed the previous chapters, the command above will have
-> nothing to reproduce since you've recently executed all the pipeline stages.
-> To easily try this command, clone this example
-> [GitHub project](https://github.com/iterative/example-get-started) and run it
-> from there.
-
-`train.dvc` describes which source code and data files to use, and how to run
-the command in order to get the resulting model file. For each data file it
-depends on, we can in turn do the same analysis: find a corresponding DVC-file
-that includes the data file in its outputs, get dependencies and commands, and
-so on. It means that DVC can recursively build a complete sequence of commands
-it needs to execute to get the model file.
-
-`dvc repro` essentially builds a dependency graph, detects stages with modified
-dependencies or missing outputs and recursively executes commands (nodes in this
-graph or pipeline) starting from the first stage with changes.
-
-Thus, `dvc run` and `dvc repro` provide a powerful framework for _reproducible
-experiments_ and _reproducible projects_.
diff --git a/content/docs/tutorials/get-started/retrieve-data.md b/content/docs/tutorials/get-started/retrieve-data.md
deleted file mode 100644
index 2a11926903..0000000000
--- a/content/docs/tutorials/get-started/retrieve-data.md
+++ /dev/null
@@ -1,29 +0,0 @@
-# Retrieve Data
-
-> You'll need to complete the
-> [initialization](/doc/tutorials/get-started/initialize) and
-> [configuration](/doc/tutorials/get-started/configure) chapters before being
-> able to run the commands explained here.
-
-To retrieve data files into the workspace in your local machine,
-run:
-
-```dvc
-$ rm -f data/data.xml
-$ dvc pull
-```
-
-This command downloads data files that are referenced in all
-[DVC-files](/doc/user-guide/dvc-file-format) in the project. So,
-you usually run it after `git clone`, `git pull`, or `git checkout`.
-
-Alternatively, if you want to retrieve a single dataset or a file you can use:
-
-```dvc
-$ dvc pull data/data.xml.dvc
-```
-
-DVC remotes, `dvc push`, and `dvc pull` provide a basic collaboration workflow,
-the same way as Git remotes, `git push` and `git pull`. See
-[Sharing Data and Model Files](/doc/use-cases/sharing-data-and-model-files) for
-more information.
diff --git a/content/docs/tutorials/get-started/store-data.md b/content/docs/tutorials/get-started/store-data.md
deleted file mode 100644
index 1306681e27..0000000000
--- a/content/docs/tutorials/get-started/store-data.md
+++ /dev/null
@@ -1,44 +0,0 @@
-# Store and Share Data
-
-Now, that your data files are managed by DVC (see
-[Add Files](/doc/tutorials/get-started/add-files)), you can push them from your
-repository to the default [remote](/doc/command-reference/remote) storage\*:
-
-```dvc
-$ dvc push
-```
-
-The same way as with Git remote, it ensures that your data files and your models
-are safely stored remotely and are shareable. This means that the data can be
-pulled by yourself or your colleagues whenever you need it.
-
-Usually, you run it along with `git commit` and `git push` to save the changed
-[DVC-files](/doc/user-guide/dvc-file-format).
-
-The `dvc push` command allows one to upload data to remote storage. It doesn't
-save any changes in the code or DVC-files. Those should be saved by using
-`git commit` and `git push`.
-
-> \*As noted in the DVC [configuration](/doc/tutorials/get-started/configure)
-> chapter, we are using a **local remote** in this section for illustrative
-> purposes.
-
-
-
-### Expand to learn more about DVC internals
-
-You can check now that actual data file has been copied to the remote we created
-in the [configuration](/doc/tutorials/get-started/configure) chapter:
-
-```dvc
-$ ls -R /tmp/dvc-storage
-/tmp/dvc-storage/a3:
-04afb96060aad90176268345e10355
-```
-
-`a304afb96060aad90176268345e10355` above is the hash value of the `data.xml`
-file. If you check the `data.xml.dvc`
-[DVC-file](/doc/user-guide/dvc-file-format), you will see that it has this
-string inside.
-
-
diff --git a/content/docs/tutorials/get-started/versioning-basics.md b/content/docs/tutorials/get-started/versioning-basics.md
new file mode 100644
index 0000000000..6a887a22de
--- /dev/null
+++ b/content/docs/tutorials/get-started/versioning-basics.md
@@ -0,0 +1,236 @@
+# Data Versioning Basics
+
+DVC allows storing and versioning data files or directories, ML models, and
+intermediate results with a regular Git workflow, without actually tracking the
+file contents with Git. Let's get a dataset example to play with:
+
+```dvc
+$ mkdir data
+$ dvc get https://github.com/iterative/dataset-registry \
+ get-started/data.xml -o data/data.xml
+```
+
+> `dvc get` can download any data artifact tracked in a DVC
+> repository, using the appropriate
+> [remote storage](/doc/command-reference/remote) (analogous to `wget`, but for
+> DVC/Git repos). In this case we use our
+> [dataset-registry](https://github.com/iterative/dataset-registry) as the
+> source repository (refer to [Data Registries](/doc/use-cases/data-registries)
+> for more info.)
+
+## Start tracking data
+
+To track a file with DVC, just run `dvc add` on it:
+
+```dvc
+$ dvc add data/data.xml
+```
+
+DVC stores information about the added data in a special **DVC-file**
+(`data/data.xml.dvc`), a small text file with a human-readable
+[format](/doc/user-guide/dvc-file-format). The above command also tells Git to
+ignore the actual data contents, so that this version of the data can be safely
+committed to the repository, using Git:
+
+```dvc
+$ git add data/.gitignore data/data.xml.dvc
+$ git commit -m "Add raw data"
+```
+
+
+
+### Expand to learn about DVC internals
+
+`dvc add` moves the data file to the project's cache (see
+[DVC Files and Directories](/doc/user-guide/dvc-files-and-directories)), and
+makes file links (or copies) with the original file names back in the
+workspace, which is what you see inside the project.
+
+```dvc
+$ ls -R .dvc/cache
+...
+ .dvc/cache/a3:
+ 04afb96060aad90176268345e10355
+```
+
+The hash value of the `data/data.xml` file we just added,
+`a304afb96060aad90176268345e10355`, determines the path and file name shown
+above. And if you check the `data/data.xml.dvc` DVC-file created by DVC, you
+will see that it has this string inside.
+
+### Important note on cache performance
+
+DVC tries to use reflinks\* by default to link your data files from the DVC
+cache to the workspace, optimizing speed and storage space. However, reflinks
+are not widely supported yet and DVC falls back to actually copying data files
+to/from the cache. **Copying can be very slow with large files**, and duplicates
+storage requirements.
+
+Hardlinks and symlinks are also available for optimized cache linking, but
+(unlike reflinks) they carry the risk of accidentally corrupting the cache if
+tracked data files are modified in the workspace.
+
+See [Large Dataset Optimization](/doc/user-guide/large-dataset-optimization) and
+`dvc config cache` for more information.
+
+> \***copy-on-write links or "reflinks"** are a relatively new way to link files
+> in UNIX-style file systems. Unlike hardlinks or symlinks, they support
+> transparent [copy on write](https://en.wikipedia.org/wiki/Copy-on-write). This
+> means that editing a reflinked file is always safe, as none of the other links
+> to the file will be affected by the changes.
+
+
+
+Refer to
+[Versioning Data and Model Files](/doc/use-cases/versioning-data-and-model-files),
+`dvc add`, and `dvc run` for more information on storing and versioning data
+files with DVC.
+
+## Store and share data
+
+Now that your raw data is tracked by DVC, you can push it from your repository
+to the default [remote storage](/doc/command-reference/remote).
+
+> As seen in the intro's [Configure](/doc/tutorials/get-started#configure)
+> section, we are using a **local remote** in this section for illustrative
+> purposes.
+
+```dvc
+$ dvc push
+```
+
+Similar to pushing source code to a _Git remote_, `dvc push` ensures that your
+data files and models are safely backed up remotely. This means that the data
+can be pulled by yourself or by colleagues when and where needed. Usually, we
+also want to `git commit` and `git push`, to save the new (or changed versions
+of) [DVC-files](/doc/user-guide/dvc-file-format).
+
+
+
+### Expand to learn more about DVC internals
+
+You can check that the data has been backed up to the remote (`/tmp/dvc-storage`
+local directory) with:
+
+```dvc
+$ ls -R /tmp/dvc-storage
+...
+/tmp/dvc-storage/a3:
+04afb96060aad90176268345e10355
+```
+
+
+
+## Retrieve data
+
+Imagine you're just cloning the Git repo that has been created so far in another
+computer. This can be simulated by cloning our **example-get-started** repo from
+GitHub, and checking out the
+[`3-add-file`](https://github.com/iterative/example-get-started/tree/3-add-file)
+tag:
+
+```dvc
+$ cd ~
+$ git clone https://github.com/iterative/example-get-started
+$ cd example-get-started
+$ git checkout 3-add-file
+```
+
+If you list the files in this fresh workspace, or even in the
+cache, you'll notice that the `data/data.xml` file is not there yet. This is
+because it's not stored by Git! To get it, simply run:
+
+```dvc
+$ dvc pull
+```
+
+`dvc pull` downloads data files that are referenced in all present
+[DVC-files](/doc/user-guide/dvc-file-format) from the project's
+remote storage, so usually we run it after `git clone`, `git pull`, or
+`git checkout`.
+
+Alternatively, if you want to retrieve a single file or directory, you can
+specify the target like this:
+
+```dvc
+$ dvc pull data/data.xml.dvc
+```
+
+> In this case, both commands have the same result, as there's currently just
+> one DVC-tracked file in the repo.
+
+[DVC remotes](/doc/command-reference/remote), `dvc push`, and `dvc pull` provide
+a basic collaboration workflow, the same way as Git remotes, `git push` and
+`git pull`. See
+[Sharing Data and Model Files](/doc/use-cases/sharing-data-and-model-files) for
+more information.
+
+## Import data
+
+We've seen how to [push](#store-and-share-data) and [pull](#retrieve-data) data
+to/from remote storage. But what if we wanted to integrate a dataset or ML
+model produced in one project into another one?
+
+One way is to manually download the data and use `dvc add` to track it, like in
+the beginning of this page. But the connection between the projects is only
+known by the person doing this. Others wouldn't be able to tell where the data
+came from or whether there are new versions available.
+
+A better alternative is the `dvc import` command! Let's go back to the
+project we're building, and replace `data/data.xml` by importing it
+from the same source:
+
+```dvc
+$ cd ~/sotag-predictions
+$ dvc import https://github.com/iterative/dataset-registry \
+ get-started/data.xml -o data/data.xml
+```
+
+This downloads and overwrites the same `data/data.xml`, checks that it's in
+`data/.gitignore`, and creates the `data/data.xml.dvc`
+[DVC-file](/doc/user-guide/dvc-file-format). So far this seems identical to our
+previous strategy, except that this time `data.xml.dvc` has additional metadata
+that allows DVC to track changes in the source data. This allows `dvc update` to
+bring in changes from the data source.
+
+
+
+### Expand to learn more about DVC internals
+
+DVC-files created by `dvc import` are called _import stages_. If we check the
+difference against the regular DVC-file we previously had, we can see that the
+new one has more fields, such as the data source `repo`, and `path` within it:
+
+```dvc
+$ git diff
+...
+--- a/data/data.xml.dvc
++++ b/data/data.xml.dvc
+...
++deps:
++- path: get-started/data.xml
++ repo:
++ url: https://github.com/iterative/dataset-registry
++ rev_lock: f31f5c4cdae787b4bdeb97a717687d44667d9e62
+```
+
+The `url` and `rev_lock` subfields under `repo` are used to save the origin and
+[version](https://git-scm.com/docs/revisions) of the dependency, respectively.
+
+> `dvc update` updates the `rev_lock` field of the corresponding DVC-file (when
+> there are changes to bring in).
+
+Note that the [dataset-registry](https://github.com/iterative/dataset-registry)
+repository doesn't actually contain a `get-started/data.xml` file. Like
+`dvc get`, importing also downloads the data from the appropriate
+[remote storage](/doc/command-reference/remote).
+
+
+
+Let's wrap up by committing the import stage with Git:
+
+```dvc
+$ git add data/data.xml.dvc
+$ git commit -m "Import raw data (overwrite)"
+$ dvc push # so others can pull the imported data in their repo copies
+```
diff --git a/content/docs/tutorials/get-started/visualize.md b/content/docs/tutorials/get-started/visualize.md
deleted file mode 100644
index 5b7e5c293f..0000000000
--- a/content/docs/tutorials/get-started/visualize.md
+++ /dev/null
@@ -1,84 +0,0 @@
-# Visualize
-
-Now that we have built our pipeline, we need a good way to visualize it to be
-able to wrap our heads around it. Luckily, DVC allows us to do that without
-leaving the terminal, making the experience distraction-less.
-
-We are using the `--ascii` option below to better illustrate this pipeline.
-Please, refer to `dvc pipeline show` to explore other options this command
-supports (e.g. `.dot` files that can be used then in other tools).
-
-## Stages
-
-```dvc
-$ dvc pipeline show --ascii train.dvc
- +-------------------+
- | data/data.xml.dvc |
- +-------------------+
- *
- *
- *
- +-------------+
- | prepare.dvc |
- +-------------+
- *
- *
- *
- +---------------+
- | featurize.dvc |
- +---------------+
- *
- *
- *
- +-----------+
- | train.dvc |
- +-----------+
-```
-
-## Commands
-
-```dvc
-$ dvc pipeline show --ascii train.dvc --commands
- +-------------------------------------+
- | python src/prepare.py data/data.xml |
- +-------------------------------------+
- *
- *
- *
- +---------------------------------------------------------+
- | python src/featurization.py data/prepared data/features |
- +---------------------------------------------------------+
- *
- *
- *
- +---------------------------------------------+
- | python src/train.py data/features model.pkl |
- +---------------------------------------------+
-```
-
-## Outputs
-
-```dvc
-$ dvc pipeline show --ascii train.dvc --outs
- +---------------+
- | data/data.xml |
- +---------------+
- *
- *
- *
- +---------------+
- | data/prepared |
- +---------------+
- *
- *
- *
- +---------------+
- | data/features |
- +---------------+
- *
- *
- *
- +-----------+
- | model.pkl |
- +-----------+
-```
diff --git a/content/docs/tutorials/pipelines.md b/content/docs/tutorials/pipelines.md
index 4bcd33da7e..10f21a352b 100644
--- a/content/docs/tutorials/pipelines.md
+++ b/content/docs/tutorials/pipelines.md
@@ -50,13 +50,13 @@ $ git add code/
$ git commit -m "Download and add code to new Git repo"
```
-> `dvc get` can use any DVC repository to find the appropriate
-> [remote storage](/doc/command-reference/remote) and download data
-> artifacts from it (analogous to `wget`, but for repositories). In this
-> case we use [dataset-registry](https://github.com/iterative/dataset-registry))
-> as the source repo. (Refer to
-> [Data Registries](/doc/use-cases/data-registries) for more info about this
-> setup.)
+> `dvc get` can download any data artifact tracked in a DVC
+> repository, using the appropriate
+> [remote storage](/doc/command-reference/remote) (analogous to `wget`, but for
+> DVC/Git repos). In this case we use our
+> [dataset-registry](https://github.com/iterative/dataset-registry) as the
+> source repository (refer to [Data Registries](/doc/use-cases/data-registries)
+> for more info).
Now let's install the requirements. But before we do that, we **strongly**
recommend creating a
@@ -102,9 +102,9 @@ When we run `dvc add` `Posts.xml.zip`, DVC creates a
### Expand to learn about DVC internals
-At DVC initialization, a new `.dvc/` directory will be created for internal
-configuration and cache
-[files and directories](/doc/user-guide/dvc-files-and-directories) that are
+At DVC initialization, a new `.dvc/` directory is created for internal
+configuration and cache
+[files and directories](/doc/user-guide/dvc-files-and-directories), which are
hidden from the user.
Note that the DVC-file created by `dvc add` has no dependencies, a.k.a. an
diff --git a/content/docs/tutorials/versioning.md b/content/docs/tutorials/versioning.md
index 370fc95165..1ae0768a9a 100644
--- a/content/docs/tutorials/versioning.md
+++ b/content/docs/tutorials/versioning.md
@@ -83,13 +83,13 @@ $ unzip -q data.zip
$ rm -f data.zip
```
-> `dvc get` can use any DVC repository to find the appropriate
-> [remote storage](/doc/command-reference/remote) and download data
-> artifacts from it (analogous to `wget`, but for repositories). In this
-> case we use [dataset-registry](https://github.com/iterative/dataset-registry))
-> as the source repo. (Refer to
-> [Data Registries](/doc/use-cases/data-registries) for more info about this
-> setup.)
+> `dvc get` can download any data artifact tracked in a DVC
+> repository, using the appropriate
+> [remote storage](/doc/command-reference/remote) (analogous to `wget`, but for
+> DVC/Git repos). In this case we use our
+> [dataset-registry](https://github.com/iterative/dataset-registry) as the
+> source repository (refer to [Data Registries](/doc/use-cases/data-registries)
+> for more info).
This command downloads and extracts our raw dataset, consisting of 1000 labeled
images for training and 800 labeled images for validation. In total, it's a 43
diff --git a/content/docs/understanding-dvc/what-is-dvc.md b/content/docs/understanding-dvc/what-is-dvc.md
index 444d7a6774..7f21206a88 100644
--- a/content/docs/understanding-dvc/what-is-dvc.md
+++ b/content/docs/understanding-dvc/what-is-dvc.md
@@ -1,16 +1,15 @@
# What Is DVC?
Data Version Control, or DVC, is **a new type of experiment management
-software** that has been built **on top of the existing engineering toolset that
-you're already used to**, and particularly on a source code version control
-system (currently Git). DVC reduces the gap between existing tools and data
-science needs, allowing users to take advantage of experiment management
-software while reusing existing skills and intuition.
-
-The underlying source code control system eliminates the need to use external
-services. Data science experiment sharing and collaboration can be done through
-regular Git tools (commit messages, merges, pull requests, etc) the same way it
-works for software engineers.
+software** built on top of the existing engineering toolset that you're already
+used to, and particularly on a source code management system (Git). DVC reduces
+the gap between existing tools and data science needs, allowing users to take
+advantage of experiment management while reusing existing skills and intuition.
+
+Leveraging an underlying source code management system eliminates the need to
+use external services. Data science experiment sharing and collaboration can be
+done through regular Git features (commit messages, merges, pull requests, etc.)
+the same way it works for software engineers.
DVC implements a **Git experimentation methodology** where each experiment
exists with its code as well as data, and can be represented as a separate Git
diff --git a/content/docs/use-cases/versioning-data-and-model-files.md b/content/docs/use-cases/versioning-data-and-model-files.md
index dbf45e37ce..a434db583a 100644
--- a/content/docs/use-cases/versioning-data-and-model-files.md
+++ b/content/docs/use-cases/versioning-data-and-model-files.md
@@ -42,9 +42,9 @@ initialize the DVC project on top of the existing repository:
$ dvc init
```
-At DVC initialization, a new `.dvc/` directory will be created for internal
-configuration and cache
-[files and directories](/doc/user-guide/dvc-files-and-directories) that are
+At DVC initialization, a new `.dvc/` directory is created for internal
+configuration and cache
+[files and directories](/doc/user-guide/dvc-files-and-directories), which are
hidden from the user. These can safely be tracked with Git:
```dvc
diff --git a/package.json b/package.json
index 316479f009..2b33043311 100644
--- a/package.json
+++ b/package.json
@@ -5,7 +5,6 @@
"main": "index.js",
"scripts": {
"develop": "gatsby develop",
- "debug": "node --inspect-brk server.js",
"build": "gatsby build",
"heroku-postbuild": "./scripts/deploy-with-s3.js",
"test": "jest",
@@ -15,8 +14,8 @@
"format-all": "prettier --write './**/*.{js,jsx,md,tsx,ts,json}'",
"lint-ts": "tsc --noEmit --skipLibCheck && eslint --ext .json,.js,.ts,.tsx src scripts",
"lint-css": "stylelint \"src/**/*.css\"",
- "link-check": "scripts/link-check-git-all.sh",
- "link-check-diff": "scripts/link-check-git-diff.sh"
+ "link-check": "./scripts/link-check-git-all.sh",
+ "link-check-diff": "./scripts/link-check-git-diff.sh"
},
"repository": {
"type": "git",
diff --git a/scripts/link-check-git-all.sh b/scripts/link-check-git-all.sh
index 84da16a85b..27c78fea74 100755
--- a/scripts/link-check-git-all.sh
+++ b/scripts/link-check-git-all.sh
@@ -2,5 +2,5 @@
repo="$(dirname "$(realpath "$(dirname "$0")")")"
-(find "$repo"/pages/ "$repo"/content/docs/ "$repo"/src/ "$repo"/.github/ -name '*.md' -o -name '*.js' && ls "$repo"/*.md "$repo"/*.js) \
+(find "$repo"/.github/ "$repo"/content/docs/ "$repo"/src/ -name '*.md' -o -name '*.js' && ls "$repo"/*.md "$repo"/*.js) \
| xargs -n1 -P8 $(dirname "$0")/link-check.sh
diff --git a/src/utils/sidebar.js b/src/utils/sidebar.js
index 46e12bab9c..97f2e640f2 100644
--- a/src/utils/sidebar.js
+++ b/src/utils/sidebar.js
@@ -21,14 +21,14 @@
const startCase = require('lodash.startcase')
const sidebar = require('../../content/docs/sidebar.json')
-const PATH_ROOT = '/doc/'
+const PATH_ROOT = '/doc'
const FILE_ROOT = '/docs/'
const FILE_EXTENSION = '.md'
function validateRawItem({ slug, source, children }) {
const isSourceDisabled = source === false
- if (!slug) {
+ if (typeof slug !== 'string') {
throw Error("'slug' field is required in objects in sidebar.json")
}
@@ -81,7 +81,7 @@ function normalizeItem({ rawItem, parentPath, resultRef, prevRef }) {
const sourcePath = FILE_ROOT + parentPath + sourceFileName
return {
- path: PATH_ROOT + parentPath + slug,
+ path: PATH_ROOT + (parentPath || slug ? '/' : '') + parentPath + slug,
source: source === false ? false : sourcePath,
label: label ? label : startCase(slug),
tutorials: tutorials || {},
@@ -152,7 +152,7 @@ function getFirstPage() {
function getItemByPath(path) {
const normalizedPath = path.replace(/\/$/, '')
- const isRoot = normalizedPath === PATH_ROOT.slice(0, -1)
+ const isRoot = normalizedPath === PATH_ROOT
const item = isRoot
? normalizedSidebar[0]
: findItemByField(normalizedSidebar, 'path', normalizedPath)
@@ -173,10 +173,14 @@ function getPathWithSoruce(path) {
}
function getParentsListFromPath(path) {
- let currentPath = PATH_ROOT.slice(0, -1)
+ let currentPath = PATH_ROOT
+
+ if (path === PATH_ROOT) {
+ return [PATH_ROOT]
+ }
return path
- .replace(PATH_ROOT, '')
+ .replace(`${PATH_ROOT}/`, '')
.split('/')
.map(part => {
const path = `${currentPath}/${part}`