From 9ddaf888e080cd964632875907c047c4db9c1dcf Mon Sep 17 00:00:00 2001 From: Jorge Orpinel Date: Thu, 19 Mar 2020 01:44:56 -0600 Subject: [PATCH 1/9] tutorials: consolidating agenda, initialize, and configure into get-started index (WIP) --- content/docs/sidebar.json | 5 +- content/docs/tutorials/get-started/agenda.md | 39 ----- .../docs/tutorials/get-started/configure.md | 67 --------- content/docs/tutorials/get-started/index.md | 135 ++++++++++++++++++ .../docs/tutorials/get-started/initialize.md | 29 ---- 5 files changed, 136 insertions(+), 139 deletions(-) delete mode 100644 content/docs/tutorials/get-started/agenda.md delete mode 100644 content/docs/tutorials/get-started/configure.md create mode 100644 content/docs/tutorials/get-started/index.md delete mode 100644 content/docs/tutorials/get-started/initialize.md diff --git a/content/docs/sidebar.json b/content/docs/sidebar.json index 7d097daf07..91799b11c9 100644 --- a/content/docs/sidebar.json +++ b/content/docs/sidebar.json @@ -30,14 +30,11 @@ "children": [ { "slug": "get-started", - "source": false, + "source": "get-started/index.md", "tutorials": { "katacoda": "https://katacoda.com/dvc/courses/get-started/initialize" }, "children": [ - "agenda", - "initialize", - "configure", "add-files", "store-data", "retrieve-data", diff --git a/content/docs/tutorials/get-started/agenda.md b/content/docs/tutorials/get-started/agenda.md deleted file mode 100644 index ca56dfbdec..0000000000 --- a/content/docs/tutorials/get-started/agenda.md +++ /dev/null @@ -1,39 +0,0 @@ -# Agenda - -You'll need [Git](https://git-scm.com) to run the commands in this guide. Also, -if DVC is not installed, please follow these [instructions](/doc/install) to do -so. - -In the next few sections we'll build a simple natural language processing (NLP) -project from scratch. 
If you'd like to get the final result or have any issues -along the way, you can download the fully reproducible -[GitHub project](https://github.com/iterative/example-get-started) by running: - -```dvc -$ git clone https://github.com/iterative/example-get-started -``` - -Otherwise, bear with us and we'll introduce some basic DVC concepts to get the -same results together! - -The idea for this project is a simplified version of our -[Deep Dive Tutorial](/doc/tutorials/deep). It explores the NLP problem of -predicting tags for a given StackOverflow question. For example, we might want a -classifier that can classify (or predict) posts about Python by tagging them -with `python`. - -![](/img/example-flow-2x.png) - -This is a natural language processing context, but NLP isn't the only area of -data science where DVC can help. DVC is designed to be agnostic of frameworks, -languages, etc. If you have data files or datasets and/or you produce data -files, models, or datasets and you want to: - -- Capture and save those data artifacts the same way you capture - code -- Track and switch between different versions of data easily -- Understand how data artifacts (e.g. ML models) were built in the first place -- Be able to compare models to each other -- Bring software best practices to your team and get everyone on the same page - -Then you're in the right place! Click the `Next` button below to start ↘ diff --git a/content/docs/tutorials/get-started/configure.md b/content/docs/tutorials/get-started/configure.md deleted file mode 100644 index 99e8ca9279..0000000000 --- a/content/docs/tutorials/get-started/configure.md +++ /dev/null @@ -1,67 +0,0 @@ -# Configure - -Once you install DVC, you'll be able to start using it (in its local setup) -immediately. 
- -However, remote storage should be set up (see `dvc remote`) if you need to share -data or models outside of the context of a single project, for example with -other collaborators or even with yourself, in a different computing environment. -It's similar to the way you would use GitHub or any other Git server to store -and share your code. - -For simplicity, let's setup a local remote: - -
- -### What is a "local remote" ? - -While the term may seem contradictory, it doesn't have to be. The "local" part -refers to the machine where the project is stored, so it can be any directory -accessible to the same system. The "remote" part refers specifically to the -project/repository itself. Read "local, but external" storage. - -
- -```dvc -$ dvc remote add -d myremote /tmp/dvc-storage -$ git commit .dvc/config -m "Configure local remote" -``` - -> We only use a local remote in this section for simplicity's sake as you learn -> to use DVC. For most [use cases](/doc/use-cases), other "more remote" types of -> remotes will be required. - -[Adding a remote](/doc/command-reference/remote/add) should be specified by both -its type (protocol) and its path. DVC currently supports these types of remotes: - -- `s3`: Amazon Simple Storage Service -- `azure`: Microsoft Azure Blob Storage -- `gdrive` : Google Drive -- `gs`: Google Cloud Storage -- `ssh`: Secure Shell (requires SFTP) -- `hdfs`: Hadoop Distributed File System -- `http`: HTTP and HTTPS protocols -- `local`: Directory in the local file system - -> If you installed DVC via `pip` and plan to use cloud services as remote -> storage, you might need to install these optional dependencies: `[s3]`, -> `[azure]`, `[gdrive]`, `[gs]`, `[oss]`, `[ssh]`. Alternatively, use `[all]` to -> include them all. The command should look like this: `pip install "dvc[s3]"`. -> (This example installs `boto3` library along with DVC to support S3 storage.) - -For example, to setup an S3 remote we would use something like this (make sure -that `mybucket` exists): - -```dvc -$ dvc remote add -d s3remote s3://mybucket/myproject -``` - -> This command is only shown for informational purposes. No need to actually run -> it in order to continue with the Get Started. - -You can see that DVC doesn't require installing any databases, servers, or -warehouses. It can use bare S3 or SSH to store data, intermediate results, and -models. - -See `dvc config` to get information about more configuration options and -`dvc remote` to learn more about remotes and get more examples. 
diff --git a/content/docs/tutorials/get-started/index.md b/content/docs/tutorials/get-started/index.md new file mode 100644 index 0000000000..eeed4adbef --- /dev/null +++ b/content/docs/tutorials/get-started/index.md @@ -0,0 +1,135 @@ +# Get Started with DVC! + +You'll need [Git](https://git-scm.com) to run the commands in this guide. Also, +if DVC is not installed, please follow these [instructions](/doc/install) to do +so. + +In the next few sections we'll build a simple natural language processing (NLP) +project from scratch. If you'd like to get the final result or have any issues +along the way, you can download the fully reproducible +[GitHub project](https://github.com/iterative/example-get-started) by running: + +```dvc +$ git clone https://github.com/iterative/example-get-started +``` + +Otherwise, bear with us and we'll introduce some basic DVC concepts to get the +same results together! + +The idea for this project is a simplified version of our +[Deep Dive Tutorial](/doc/tutorials/deep). It explores the NLP problem of +predicting tags for a given StackOverflow question. For example, we might want a +classifier that can classify (or predict) posts about Python by tagging them +with `python`. + +![](/img/example-flow-2x.png) + +This is a natural language processing context, but NLP isn't the only area of +data science where DVC can help. DVC is designed to be agnostic of frameworks, +languages, etc. If you have data files or datasets and/or you produce data +files, models, or datasets and you want to: + +- Capture and save those data artifacts the same way you capture + code +- Track and switch between different versions of data easily +- Understand how data artifacts (e.g. 
ML models) were built in the first place +- Be able to compare models to each other +- Bring software best practices to your team and get everyone on the same page + +# Initialize + +There are a few recommended ways to install DVC: OS-specific package/installer, +`pip`, `conda`, and Homebrew. See [Installation](/doc/install) for all the +alternatives and details. + +Let's start by creating a workspace we can version with Git. Then +run `dvc init` inside to create the DVC project: + +```dvc +$ mkdir example-get-started +$ cd example-get-started +$ git init +$ dvc init +$ git commit -m "Initialize DVC project" +``` + +At DVC initialization, a new `.dvc/` directory will be created for internal +configuration and cache +[files and directories](/doc/user-guide/dvc-files-and-directories) that are +hidden from the user. + +> See `dvc init` if you want to get more details about the initialization +> process, and +> [DVC Files and Directories](/doc/user-guide/dvc-files-and-directories) to +> learn about the DVC internal file and directory structure. + +The last command, `git commit`, versions the `.dvc/config` and `.dvc/.gitignore` +files (DVC internals) with Git. + +# Configure + +Once you install DVC, you'll be able to start using it (in its local setup) +immediately. + +However, remote storage should be set up (see `dvc remote`) if you need to share +data or models outside of the context of a single project, for example with +other collaborators or even with yourself, in a different computing environment. +It's similar to the way you would use GitHub or any other Git server to store +and share your code. + +For simplicity, let's set up a local remote: + +
+ +### What is a "local remote"? + +While the term may seem contradictory, it doesn't have to be. The "local" part +refers to the machine where the project is stored, so it can be any directory +accessible to the same system. The "remote" part means the storage is external to the +project/repository itself. Read it as "local, but external" storage. + +
+ +```dvc +$ dvc remote add -d myremote /tmp/dvc-storage +$ git commit .dvc/config -m "Configure local remote" +``` + +> We only use a local remote in this section for simplicity's sake as you learn +> to use DVC. For most [use cases](/doc/use-cases), other "more remote" types of +> remotes will be required. + +[Adding a remote](/doc/command-reference/remote/add) requires specifying both +its type (protocol) and its path. DVC currently supports these types of remotes: + +- `s3`: Amazon Simple Storage Service +- `azure`: Microsoft Azure Blob Storage +- `gdrive`: Google Drive +- `gs`: Google Cloud Storage +- `ssh`: Secure Shell (requires SFTP) +- `hdfs`: Hadoop Distributed File System +- `http`: HTTP and HTTPS protocols +- `local`: Directory in the local file system + +> If you installed DVC via `pip` and plan to use cloud services as remote +> storage, you might need to install these optional dependencies: `[s3]`, +> `[azure]`, `[gdrive]`, `[gs]`, `[oss]`, `[ssh]`. Alternatively, use `[all]` to +> include them all. The command should look like this: `pip install "dvc[s3]"`. +> (This example installs the `boto3` library along with DVC to support S3 storage.) + +For example, to set up an S3 remote we would use something like this (make sure +that `mybucket` exists): + +```dvc +$ dvc remote add -d s3remote s3://mybucket/myproject +``` + +> This command is only shown for informational purposes. No need to actually run +> it in order to continue with the Get Started. + +You can see that DVC doesn't require installing any databases, servers, or +warehouses. It can use bare S3 or SSH to store data, intermediate results, and +models. + +See `dvc config` to get information about more configuration options and +`dvc remote` to learn more about remotes and get more examples. 
diff --git a/content/docs/tutorials/get-started/initialize.md b/content/docs/tutorials/get-started/initialize.md deleted file mode 100644 index 1e227d96c9..0000000000 --- a/content/docs/tutorials/get-started/initialize.md +++ /dev/null @@ -1,29 +0,0 @@ -# Initialize - -There are a few recommended ways to install DVC: OS-specific package/installer, -`pip`, `conda`, and Homebrew. See [Installation](/doc/install) for all the -alternatives and details. - -Let's start by creating a workspace we can version with Git. Then -run `dvc init` inside to create the DVC project: - -```dvc -$ mkdir example-get-started -$ cd example-get-started -$ git init -$ dvc init -$ git commit -m "Initialize DVC project" -``` - -At DVC initialization, a new `.dvc/` directory will be created for internal -configuration and cache -[files and directories](/doc/user-guide/dvc-files-and-directories) that are -hidden from the user. - -> See `dvc init` if you want to get more details about the initialization -> process, and -> [DVC Files and Directories](/doc/user-guide/dvc-files-and-directories) to -> learn about the DVC internal file and directory structure. - -The last command, `git commit`, versions the `.dvc/config` and `.dvc/.gitignore` -files (DVC internals) with Git. 
From fff66f3e97c71be6d9f875af423109f37aedde31 Mon Sep 17 00:00:00 2001 From: Pavel Grinchenko Date: Thu, 19 Mar 2020 14:23:50 +0300 Subject: [PATCH 2/9] Allow to set empty slug in sidebar.json --- src/utils/sidebar.js | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/utils/sidebar.js b/src/utils/sidebar.js index 46e12bab9c..97f2e640f2 100644 --- a/src/utils/sidebar.js +++ b/src/utils/sidebar.js @@ -21,14 +21,14 @@ const startCase = require('lodash.startcase') const sidebar = require('../../content/docs/sidebar.json') -const PATH_ROOT = '/doc/' +const PATH_ROOT = '/doc' const FILE_ROOT = '/docs/' const FILE_EXTENSION = '.md' function validateRawItem({ slug, source, children }) { const isSourceDisabled = source === false - if (!slug) { + if (typeof slug !== 'string') { throw Error("'slug' field is required in objects in sidebar.json") } @@ -81,7 +81,7 @@ function normalizeItem({ rawItem, parentPath, resultRef, prevRef }) { const sourcePath = FILE_ROOT + parentPath + sourceFileName return { - path: PATH_ROOT + parentPath + slug, + path: PATH_ROOT + (parentPath || slug ? '/' : '') + parentPath + slug, source: source === false ? false : sourcePath, label: label ? label : startCase(slug), tutorials: tutorials || {}, @@ -152,7 +152,7 @@ function getFirstPage() { function getItemByPath(path) { const normalizedPath = path.replace(/\/$/, '') - const isRoot = normalizedPath === PATH_ROOT.slice(0, -1) + const isRoot = normalizedPath === PATH_ROOT const item = isRoot ? 
normalizedSidebar[0] : findItemByField(normalizedSidebar, 'path', normalizedPath) @@ -173,10 +173,14 @@ function getPathWithSoruce(path) { } function getParentsListFromPath(path) { - let currentPath = PATH_ROOT.slice(0, -1) + let currentPath = PATH_ROOT + + if (path === PATH_ROOT) { + return [PATH_ROOT] + } return path - .replace(PATH_ROOT, '') + .replace(`${PATH_ROOT}/`, '') .split('/') .map(part => { const path = `${currentPath}/${part}` From ef4ece676c44e00f1de0d888ab4e94931d96d507 Mon Sep 17 00:00:00 2001 From: Jorge Orpinel Date: Thu, 19 Mar 2020 09:03:07 -0600 Subject: [PATCH 3/9] home: put it in /doc directly to close #1073 --- content/docs/sidebar.json | 3 ++- redirects-list.json | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/content/docs/sidebar.json b/content/docs/sidebar.json index 91799b11c9..20fe842688 100644 --- a/content/docs/sidebar.json +++ b/content/docs/sidebar.json @@ -1,6 +1,7 @@ [ { - "slug": "home", + "slug": "", + "label": "Home", "source": "index.md" }, { diff --git a/redirects-list.json b/redirects-list.json index 4ff9f6395a..f3cad33d69 100644 --- a/redirects-list.json +++ b/redirects-list.json @@ -6,7 +6,6 @@ "^/((?:deb|rpm)/.+) https://s3-us-east-2.amazonaws.com/dvc-s3-repo/$1 303", "^/(?:help|chat)/?$ https://discordapp.com/invite/dvwXA2N 303", "^/(?:docs|documentation)(/.*)?$ /doc$1", - "^/doc/?$ /doc/tutorials/get-started 307", "^/doc/get-started /doc/tutorials/get-started", "^/doc/tutorial/?$ /doc/tutorials", "^/doc/tutorial/(.*)? 
/doc/tutorials/deep/$1", From 61dd5376093839434885e661daa07aa6cba0b977 Mon Sep 17 00:00:00 2001 From: Jorge Orpinel Date: Fri, 20 Mar 2020 22:39:05 -0600 Subject: [PATCH 4/9] [WIP] get-started: combine chapters into index + 3 pages --- content/docs/sidebar.json | 21 +- .../docs/tutorials/get-started/add-files.md | 89 ----- .../get-started/compare-experiments.md | 42 --- .../get-started/connect-code-and-data.md | 165 --------- .../tutorials/get-started/data-pipelines.md | 335 ++++++++++++++++++ .../get-started/experiment-management.md | 192 ++++++++++ .../docs/tutorials/get-started/experiments.md | 49 --- .../docs/tutorials/get-started/import-data.md | 87 ----- content/docs/tutorials/get-started/metrics.md | 45 --- .../tutorials/get-started/older-versions.md | 53 --- .../docs/tutorials/get-started/pipeline.md | 44 --- .../docs/tutorials/get-started/reproduce.md | 39 -- .../tutorials/get-started/retrieve-data.md | 29 -- .../docs/tutorials/get-started/store-data.md | 44 --- .../get-started/versioning-basics.md | 252 +++++++++++++ .../docs/tutorials/get-started/visualize.md | 84 ----- 16 files changed, 782 insertions(+), 788 deletions(-) delete mode 100644 content/docs/tutorials/get-started/add-files.md delete mode 100644 content/docs/tutorials/get-started/compare-experiments.md delete mode 100644 content/docs/tutorials/get-started/connect-code-and-data.md create mode 100644 content/docs/tutorials/get-started/data-pipelines.md create mode 100644 content/docs/tutorials/get-started/experiment-management.md delete mode 100644 content/docs/tutorials/get-started/experiments.md delete mode 100644 content/docs/tutorials/get-started/import-data.md delete mode 100644 content/docs/tutorials/get-started/metrics.md delete mode 100644 content/docs/tutorials/get-started/older-versions.md delete mode 100644 content/docs/tutorials/get-started/pipeline.md delete mode 100644 content/docs/tutorials/get-started/reproduce.md delete mode 100644 
content/docs/tutorials/get-started/retrieve-data.md delete mode 100644 content/docs/tutorials/get-started/store-data.md create mode 100644 content/docs/tutorials/get-started/versioning-basics.md delete mode 100644 content/docs/tutorials/get-started/visualize.md diff --git a/content/docs/sidebar.json b/content/docs/sidebar.json index 20fe842688..7502e7cd0e 100644 --- a/content/docs/sidebar.json +++ b/content/docs/sidebar.json @@ -36,24 +36,9 @@ "katacoda": "https://katacoda.com/dvc/courses/get-started/initialize" }, "children": [ - "add-files", - "store-data", - "retrieve-data", - "import-data", - { - "label": "Connect with Code", - "slug": "connect-code-and-data" - }, - "pipeline", - "visualize", - "reproduce", - "metrics", - "experiments", - "compare-experiments", - { - "label": "Get Older Files", - "slug": "older-versions" - } + "versioning-basics", + "data-pipelines", + "experiment-management" ] }, { diff --git a/content/docs/tutorials/get-started/add-files.md b/content/docs/tutorials/get-started/add-files.md deleted file mode 100644 index 048aafa213..0000000000 --- a/content/docs/tutorials/get-started/add-files.md +++ /dev/null @@ -1,89 +0,0 @@ -# Add Files or Directories - -DVC allows storing and versioning data files, ML models, directories, -intermediate results with Git, without tracking the file contents with Git. -Let's get a dataset example to play with: - -```dvc -$ mkdir data -$ dvc get https://github.com/iterative/dataset-registry \ - get-started/data.xml -o data/data.xml -``` - -> `dvc get` can use any DVC repository to find the appropriate -> [remote storage](/doc/command-reference/remote) and download data -> artifacts from it (analogous to `wget`, but for repositories). In this -> case we use [dataset-registry](https://github.com/iterative/dataset-registry)) -> as the source repo. (Refer to -> [Data Registries](/doc/use-cases/data-registries) for more info about this -> setup.) - -To track a file (or a directory) with DVC just run `dvc add` on it. 
For example: - -```dvc -$ dvc add data/data.xml -``` - -DVC stores information about the added data in a special file called a -**DVC-file**. DVC-files are small text files with a human-readable -[format](/doc/user-guide/dvc-file-format) and they can be committed with Git: - -```dvc -$ git add data/.gitignore data/data.xml.dvc -$ git commit -m "Add raw data to project" -``` - -Committing DVC-files with Git allows us to track different versions of the -project data as it evolves with the source code tracked by Git. - -
- -### Expand to learn about DVC internals - -`dvc add` moves the actual data file to the cache directory (see -[DVC Files and Directories](/doc/user-guide/dvc-files-and-directories)), while -the entries in the workspace may be file links to the actual files in the DVC -cache. - -```dvc -$ ls -R .dvc/cache - .dvc/cache/a3: - 04afb96060aad90176268345e10355 -``` - -`a304afb96060aad90176268345e10355` above is the hash value of the `data.xml` -file we just added with DVC. If you check the `data/data.xml.dvc` DVC-file, you -will see that it has this string inside. - -### Important note on cache performance - -DVC tries to use reflinks\* by default to link your data files from the DVC -cache to the workspace, optimizing speed and storage space. However, reflinks -are not widely supported yet and DVC falls back to actually copying data files -to/from the cache. **Copying can be very slow with large files**, and duplicates -storage requirements. - -Hardlinks and symlinks are also available for optimized cache linking but, -(unlike reflinks) they carry the risk of accidentally corrupting the cache if -tracked data files are modified in the workspace. - -See [Large Dataset Optimization](/doc/user-guide/large-dataset-optimization) and -`dvc config cache` for more information. - -> \***copy-on-write links or "reflinks"** are a relatively new way to link files -> in UNIX-style file systems. Unlike hardlinks or symlinks, they support -> transparent [copy on write](https://en.wikipedia.org/wiki/Copy-on-write). This -> means that editing a reflinked file is always safe as all the other links to -> the file will reflect the changes. - -
- -If your workspace uses Git, without DVC you would have to manually put each data -file or directory into `.gitignore`. DVC commands that track data files -automatically takes care of this for you! (You just have to add the changes with -Git.) - -Refer to -[Versioning Data and Model Files](/doc/use-cases/versioning-data-and-model-files), -`dvc add`, and `dvc run` for more information on storing and versioning data -files with DVC. diff --git a/content/docs/tutorials/get-started/compare-experiments.md b/content/docs/tutorials/get-started/compare-experiments.md deleted file mode 100644 index 21e09a8450..0000000000 --- a/content/docs/tutorials/get-started/compare-experiments.md +++ /dev/null @@ -1,42 +0,0 @@ -# Compare Experiments - -DVC makes it easy to iterate on your project using Git commits with tags or Git -branches. It provides a way to try different ideas, keep track of them, switch -back and forth. To find the best performing experiment or track the progress, -[project metrics](/doc/command-reference/metrics) are supported in DVC (as -described in one of the previous chapters). - -Let's run evaluate for the latest `bigrams` experiment we created in previous -chapters. It mostly takes just running the `dvc repro`: - -```dvc -$ git checkout master -$ dvc checkout -$ dvc repro evaluate.dvc -``` - -`git checkout master` and `dvc checkout` commands ensure that we have the latest -experiment code and data respectively. And `dvc repro`, as we discussed in the -[Reproduce](/doc/tutorials/get-started/reproduce) chapter, is a way to run all -the necessary commands to build the model and measure its performance. 
- -```dvc -$ git commit -am "Evaluate bigrams model" -$ git tag -a "bigrams-experiment" -m "Bigrams experiment evaluation" -``` - -Now, we can use `-T` option of the `dvc metrics show` command to see the -difference between the `baseline` and `bigrams` experiments: - -```dvc -$ dvc metrics show -T - -baseline-experiment: - auc.metric: 0.588426 -bigrams-experiment: - auc.metric: 0.602818 -``` - -DVC provides built-in support to track and navigate `JSON`, `TSV` or `CSV` -metric files if you want to track additional information. See `dvc metrics` to -learn more. diff --git a/content/docs/tutorials/get-started/connect-code-and-data.md b/content/docs/tutorials/get-started/connect-code-and-data.md deleted file mode 100644 index 1bec301c0c..0000000000 --- a/content/docs/tutorials/get-started/connect-code-and-data.md +++ /dev/null @@ -1,165 +0,0 @@ -# Connect Code and Data - -Even in its basic scenarios, commands like `dvc add`, `dvc push`, `dvc pull` -described in the previous sections could be used independently and provide a -basic useful framework to track, save and share models and large data files. To -achieve full reproducibility though, we'll have to connect code and -configuration with the data it processes to produce the result. - -
- -### Expand to prepare example code - -If you've followed this _Get Started_ section from the beginning, run these -commands to get the example code: - -```dvc -$ wget https://code.dvc.org/get-started/code.zip -$ unzip code.zip -$ rm -f code.zip -``` - -Windows doesn't include the `wget` utility by default, but you can use the -browser to download `code.zip`. (Right-click -[this link](https://code.dvc.org/get-started/code.zip) and select -`Save Link As...` (Chrome). Save it into the project directory. - -The workspace should now look like this: - -```dvc -$ tree -. -├── data -│   ├── data.xml -│   └── data.xml.dvc -└── src -    ├── evaluate.py -    ├── featurization.py -    ├── prepare.py -    ├── requirements.txt -  └── train.py -``` - -Now let's install the requirements. But before we do that, we **strongly** -recommend creating a -[virtual environment](https://packaging.python.org/tutorials/installing-packages/#creating-virtual-environments): - -```dvc -$ virtualenv -p python3 .env -$ echo ".env/" >> .gitignore -$ source .env/bin/activate -$ pip install -r src/requirements.txt -``` - -Optionally, save the progress with Git: - -```dvc -$ git add . -$ git commit -m "Add source code files to repo" -``` - -
- -Having installed the `src/prepare.py` script in your repo, the following command -transforms it into a reproducible [stage](/doc/command-reference/run) for the ML -pipeline we're building (described in the -[next chapter](/doc/tutorials/pipelines)). - -```dvc -$ dvc run -f prepare.dvc \ - -d src/prepare.py -d data/data.xml \ - -o data/prepared \ - python src/prepare.py data/data.xml -``` - -`dvc run` generates the `prepare.dvc` DVC-file. It has the same -[format](/doc/user-guide/dvc-file-format) as the file we created in the -[previous section](/doc/tutorials/get-started/add-files) to track `data.xml`, -except in this case it has additional information about the `data/prepared` -output (a directory where two files, `train.tsv` and `test.tsv`, will be written -to), and about the Python command that is required to build it. - -
- -### Expand to learn more about what has just happened - -This is how the result should look like now: - -```diff - . - ├── data - │ ├── data.xml - │ ├── data.xml.dvc -+ │ └── prepared -+ │ ├── test.tsv -+ │ └── train.tsv -+ ├── prepare.dvc - └── src - ├── evaluate.py - ├── featurization.py - ├── prepare.py - ├── requirements.txt - └── train.py -``` - -This is how `prepare.dvc` looks like: - -```yaml -cmd: python src/prepare.py data/data.xml -deps: - - md5: b4801c88a83f3bf5024c19a942993a48 - path: src/prepare.py - - md5: a304afb96060aad90176268345e10355 - path: data/data.xml -md5: c3a73109be6c186b9d72e714bcedaddb -outs: - - cache: true - md5: 6836f797f3924fb46fcfd6b9f6aa6416.dir - metric: false - path: data/prepared -wdir: . -``` - -> `dvc run` is just the first of a set of DVC command required to generate a -> [pipeline](/doc/tutorials/get-started/pipeline), or in other words, -> instructions on how to build a ML model (data file) from previous data files -> (or directories). - -Let's briefly mention what the command options used above mean for this -particular example: - -`-f prepare.dvc` specifies a name for the DVC-file (pipeline stage). It's -optional but we recommend using it to make your project structure more readable. - -`-d src/prepare.py` and `-d data/data.xml` mean that the `prepare.dvc` stage -file depends on them to produce the result. When you run `dvc repro` next time -(see next chapter) DVC will automatically check these dependencies and decide -whether this stage is up to date or whether it should be executed to regenerate -its outputs. - -`-o data/prepared` specifies the output directory processed data will be put -into. The script creates two files in it – that will be used later to generate -features, train and evaluate the model. - -And, the last line, `python src/prepare.py data/data.xml`, specifies a command -to run. This command is saved to the generated DVC-file, and used later by -`dvc repro`. 
- -Hopefully, `dvc run` (and `dvc repro`) will become intuitive after a few more -Get Started chapters. You can always refer to the the command references for -more details on their behavior and options. - -
- -You don't need to run `dvc add` to track output files (`prepared/train.tsv` and -`prepared/test.tsv`) with DVC. `dvc run` takes care of this. You only need to -run `dvc push` (usually along with `git commit`) to save them to the remote when -you are done. - -Let's commit the changes to save the stage we built: - -```dvc -$ git add data/.gitignore prepare.dvc -$ git commit -m "Create data preparation stage" -$ dvc push -``` diff --git a/content/docs/tutorials/get-started/data-pipelines.md b/content/docs/tutorials/get-started/data-pipelines.md new file mode 100644 index 0000000000..2855b7acfa --- /dev/null +++ b/content/docs/tutorials/get-started/data-pipelines.md @@ -0,0 +1,335 @@ +# Connect Code and Data + +Even in its basic scenarios, commands like `dvc add`, `dvc push`, `dvc pull` +described in the previous sections could be used independently and provide a +basic useful framework to track, save and share models and large data files. To +achieve full reproducibility though, we'll have to connect code and +configuration with the data it processes to produce the result. + +
+ +### Expand to prepare example code + +If you've followed this _Get Started_ section from the beginning, run these +commands to get the example code: + +```dvc +$ wget https://code.dvc.org/get-started/code.zip +$ unzip code.zip +$ rm -f code.zip +``` + +Windows doesn't include the `wget` utility by default, but you can use the +browser to download `code.zip`. (Right-click +[this link](https://code.dvc.org/get-started/code.zip) and select +`Save Link As...` (Chrome). Save it into the project directory. + +The workspace should now look like this: + +```dvc +$ tree +. +├── data +│   ├── data.xml +│   └── data.xml.dvc +└── src +    ├── evaluate.py +    ├── featurization.py +    ├── prepare.py +    ├── requirements.txt +  └── train.py +``` + +Now let's install the requirements. But before we do that, we **strongly** +recommend creating a +[virtual environment](https://packaging.python.org/tutorials/installing-packages/#creating-virtual-environments): + +```dvc +$ virtualenv -p python3 .env +$ echo ".env/" >> .gitignore +$ source .env/bin/activate +$ pip install -r src/requirements.txt +``` + +Optionally, save the progress with Git: + +```dvc +$ git add . +$ git commit -m "Add source code files to repo" +``` + +
+ +Having installed the `src/prepare.py` script in your repo, the following command +transforms it into a reproducible [stage](/doc/command-reference/run) for the ML +pipeline we're building (described in the +[next chapter](/doc/tutorials/pipelines)). + +```dvc +$ dvc run -f prepare.dvc \ + -d src/prepare.py -d data/data.xml \ + -o data/prepared \ + python src/prepare.py data/data.xml +``` + +`dvc run` generates the `prepare.dvc` DVC-file. It has the same +[format](/doc/user-guide/dvc-file-format) as the file we created in the +[previous section](/doc/tutorials/get-started/add-files) to track `data.xml`, +except in this case it has additional information about the `data/prepared` +output (a directory where two files, `train.tsv` and `test.tsv`, will be written +to), and about the Python command that is required to build it. + +
+ +### Expand to learn more about what has just happened + +This is what the result should look like now: + +```diff + . + ├── data + │ ├── data.xml + │ ├── data.xml.dvc ++ │ └── prepared ++ │ ├── test.tsv ++ │ └── train.tsv ++ ├── prepare.dvc + └── src + ├── evaluate.py + ├── featurization.py + ├── prepare.py + ├── requirements.txt + └── train.py +``` + +This is what `prepare.dvc` looks like: + +```yaml +cmd: python src/prepare.py data/data.xml +deps: + - md5: b4801c88a83f3bf5024c19a942993a48 + path: src/prepare.py + - md5: a304afb96060aad90176268345e10355 + path: data/data.xml +md5: c3a73109be6c186b9d72e714bcedaddb +outs: + - cache: true + md5: 6836f797f3924fb46fcfd6b9f6aa6416.dir + metric: false + path: data/prepared +wdir: . +``` + +> `dvc run` is just the first of a set of DVC commands required to generate a +> [pipeline](/doc/tutorials/get-started/pipeline), or in other words, +> instructions on how to build an ML model (data file) from previous data files +> (or directories). + +Let's briefly mention what the command options used above mean for this +particular example: + +`-f prepare.dvc` specifies a name for the DVC-file (pipeline stage). It's +optional but we recommend using it to make your project structure more readable. + +`-d src/prepare.py` and `-d data/data.xml` mean that the `prepare.dvc` stage +file depends on them to produce the result. When you run `dvc repro` next time +(see next chapter) DVC will automatically check these dependencies and decide +whether this stage is up to date or whether it should be executed to regenerate +its outputs. + +`-o data/prepared` specifies the output directory processed data will be put +into. The script creates two files in it – that will be used later to generate +features, train and evaluate the model. + +And, the last line, `python src/prepare.py data/data.xml`, specifies a command +to run. This command is saved to the generated DVC-file, and used later by +`dvc repro`.
+ +Hopefully, `dvc run` (and `dvc repro`) will become intuitive after a few more +Get Started chapters. You can always refer to the command references for +more details on their behavior and options. + +
+ +You don't need to run `dvc add` to track output files (`prepared/train.tsv` and +`prepared/test.tsv`) with DVC. `dvc run` takes care of this. You only need to +run `dvc push` (usually along with `git commit`) to save them to the remote when +you are done. + +Let's commit the changes to save the stage we built: + +```dvc +$ git add data/.gitignore prepare.dvc +$ git commit -m "Create data preparation stage" +$ dvc push +``` + +# Pipeline + +Support for [pipelines](/doc/command-reference/pipeline) is the biggest +difference between DVC and other version control tools that can handle large +data files (e.g. `git lfs`). By using `dvc run` multiple times, and specifying +outputs of a command (stage) as dependencies in another one, we can describe a +sequence of commands that gets to a desired result. This is what we call a +**data pipeline** or dependency graph. + +Let's create a second stage (after `prepare.dvc`, created in the previous +chapter) to perform feature extraction: + +```dvc +$ dvc run -f featurize.dvc \ + -d src/featurization.py -d data/prepared \ + -o data/features \ + python src/featurization.py \ + data/prepared data/features +``` + +And a third stage for training: + +```dvc +$ dvc run -f train.dvc \ + -d src/train.py -d data/features \ + -o model.pkl \ + python src/train.py data/features model.pkl +``` + +Let's commit DVC-files that describe our pipeline so far: + +```dvc +$ git add data/.gitignore .gitignore featurize.dvc train.dvc +$ git commit -m "Create featurization and training stages" +$ dvc push +``` + +This example is simplified just to show you a basic pipeline, see a more +advanced [example](/doc/tutorials/pipelines) or +[complete tutorial](/doc/tutorials/pipelines) to create an +[NLP](https://en.wikipedia.org/wiki/Natural_language_processing) pipeline +end-to-end. + +> See also the `dvc pipeline` command. + +# Visualize + +Now that we have built our pipeline, we need a good way to visualize it to be +able to wrap our heads around it. 
Luckily, DVC allows us to do that without +leaving the terminal, making the experience distraction-less. + +We are using the `--ascii` option below to better illustrate this pipeline. +Please, refer to `dvc pipeline show` to explore other options this command +supports (e.g. `.dot` files that can be used then in other tools). + +## Stages + +```dvc +$ dvc pipeline show --ascii train.dvc + +-------------------+ + | data/data.xml.dvc | + +-------------------+ + * + * + * + +-------------+ + | prepare.dvc | + +-------------+ + * + * + * + +---------------+ + | featurize.dvc | + +---------------+ + * + * + * + +-----------+ + | train.dvc | + +-----------+ +``` + +## Commands + +```dvc +$ dvc pipeline show --ascii train.dvc --commands + +-------------------------------------+ + | python src/prepare.py data/data.xml | + +-------------------------------------+ + * + * + * + +---------------------------------------------------------+ + | python src/featurization.py data/prepared data/features | + +---------------------------------------------------------+ + * + * + * + +---------------------------------------------+ + | python src/train.py data/features model.pkl | + +---------------------------------------------+ +``` + +## Outputs + +```dvc +$ dvc pipeline show --ascii train.dvc --outs + +---------------+ + | data/data.xml | + +---------------+ + * + * + * + +---------------+ + | data/prepared | + +---------------+ + * + * + * + +---------------+ + | data/features | + +---------------+ + * + * + * + +-----------+ + | model.pkl | + +-----------+ +``` + +# Reproduce + +In the previous chapters, we described our first +[pipeline](/doc/command-reference/pipeline). Basically, we generated a number of +[stage files](/doc/command-reference/run) +([DVC-files](/doc/user-guide/dvc-file-format)). These stages define individual +commands to execute towards a final result. Each depends on some data (either +raw data files or intermediate results from previous stages) and code files. 
+ +If you just cloned the +[project](https://github.com/iterative/example-get-started), make sure you first +fetch the input data from DVC by calling `dvc pull`. + +It's now extremely easy for you or your colleagues to reproduce the result +end-to-end: + +```dvc +$ dvc repro train.dvc +``` + +> If you've just followed the previous chapters, the command above will have +> nothing to reproduce since you've recently executed all the pipeline stages. +> To easily try this command, clone this example +> [GitHub project](https://github.com/iterative/example-get-started) and run it +> from there. + +`train.dvc` describes which source code and data files to use, and how to run +the command in order to get the resulting model file. For each data file it +depends on, we can in turn do the same analysis: find a corresponding DVC-file +that includes the data file in its outputs, get dependencies and commands, and +so on. It means that DVC can recursively build a complete sequence of commands +it needs to execute to get the model file. + +`dvc repro` essentially builds a dependency graph, detects stages with modified +dependencies or missing outputs and recursively executes commands (nodes in this +graph or pipeline) starting from the first stage with changes. + +Thus, `dvc run` and `dvc repro` provide a powerful framework for _reproducible +experiments_ and _reproducible projects_. diff --git a/content/docs/tutorials/get-started/experiment-management.md b/content/docs/tutorials/get-started/experiment-management.md new file mode 100644 index 0000000000..d6cd1c5d30 --- /dev/null +++ b/content/docs/tutorials/get-started/experiment-management.md @@ -0,0 +1,192 @@ +# Experiment Metrics + +Finally, we'd like to add an evaluation stage to our +[pipeline](/doc/command-reference/pipeline). Data science is a metric-driven +R&D-like process and `dvc metrics` commands along with DVC metric files provide +a framework to capture and compare experiments performance. 
It doesn't require +installing any databases or instrumenting your code to use some API, all is +tracked by Git and is stored in Git or DVC remote storage: + +```dvc +$ dvc run -f evaluate.dvc \ + -d src/evaluate.py -d model.pkl -d data/features \ + -M auc.metric \ + python src/evaluate.py model.pkl \ + data/features auc.metric +``` + +`evaluate.py` calculates the AUC value using the test dataset. It reads features +from the `data/features/test.pkl` file and produces a +[metric](/doc/command-reference/metrics) file (`auc.metric`). Any +output (in this case just a plain text file containing a single +numeric value) can be marked as a metric, for example by using the `-M` option +of `dvc run`. + +> Please, refer to the `dvc metrics` command documentation to see more details. + +Let's save the updated results: + +```dvc +$ git add evaluate.dvc auc.metric +$ git commit -m "Create evaluation stage" +$ dvc push +``` + +Let's also assign a Git tag. It will serve as a checkpoint for us to compare +experiments in the future, or if we need to go back and check it out along with the +corresponding data: + +```dvc +$ git tag -a "baseline-experiment" -m "Baseline experiment evaluation" +``` + +The `dvc metrics show` command provides a way to compare different experiments, +by analyzing metric files across different branches, tags, etc. But first we +need to create a new experiment to compare the baseline with. + +# Experiments + +The data science process is inherently iterative and R&D-like. Data scientists may +try many different approaches, different hyper-parameter values, and "fail" many +times before the required level of a metric is achieved. + +DVC is built to provide a way to capture different experiments and navigate +easily between them. Let's say we want to try a modified feature extraction: + +
+ +### Expand to see code modifications + +Edit `src/featurization.py` to enable bigrams and increase the number of +features. Find and change the `CountVectorizer` arguments, specify `ngram_range` +and increase the number of features: + +```python +bag_of_words = CountVectorizer(stop_words='english', + max_features=6000, + ngram_range=(1, 2)) +``` + +
+ +```dvc +$ vi src/featurization.py # edit to use bigrams (see above) +$ dvc repro train.dvc # regenerate the new model.pkl +$ git commit -am "Reproduce model using bigrams" +``` + +> Notice that `git commit -a` stages all the changes produced by `dvc repro` +> before committing them with Git. Refer to the +> [command reference](https://git-scm.com/docs/git-commit#Documentation/git-commit.txt--a) +> for more details. + +Now, we have a new `model.pkl` captured and saved. To get back to the initial +version, we run `git checkout` along with `dvc checkout` command: + +```dvc +$ git checkout baseline-experiment +$ dvc checkout +``` + +DVC is designed to checkout large data files (no matter how large they are) into +your workspace almost instantly on almost all modern operating +systems with file links. See +[Large Dataset Optimization](/doc/user-guide/large-dataset-optimization) for +more information. + +# Compare Experiments + +DVC makes it easy to iterate on your project using Git commits with tags or Git +branches. It provides a way to try different ideas, keep track of them, switch +back and forth. To find the best performing experiment or track the progress, +[project metrics](/doc/command-reference/metrics) are supported in DVC (as +described in one of the previous chapters). + +Let's run evaluate for the latest `bigrams` experiment we created in previous +chapters. It mostly takes just running the `dvc repro`: + +```dvc +$ git checkout master +$ dvc checkout +$ dvc repro evaluate.dvc +``` + +`git checkout master` and `dvc checkout` commands ensure that we have the latest +experiment code and data respectively. And `dvc repro`, as we discussed in the +[Reproduce](/doc/tutorials/get-started/reproduce) chapter, is a way to run all +the necessary commands to build the model and measure its performance. 
+ +```dvc +$ git commit -am "Evaluate bigrams model" +$ git tag -a "bigrams-experiment" -m "Bigrams experiment evaluation" +``` + +Now, we can use `-T` option of the `dvc metrics show` command to see the +difference between the `baseline` and `bigrams` experiments: + +```dvc +$ dvc metrics show -T + +baseline-experiment: + auc.metric: 0.588426 +bigrams-experiment: + auc.metric: 0.602818 +``` + +DVC provides built-in support to track and navigate `JSON`, `TSV` or `CSV` +metric files if you want to track additional information. See `dvc metrics` to +learn more. + +# Get Older Data Version + +Now that we have multiple experiments, models, processed datasets, the question +is how do we revert back to an older version of a model file? Or how can we get +the previous version of the dataset if it was changed at some point? + +The answer is the `dvc checkout` command, and we already touched briefly the +process of switching between different data versions in the +[Experiments](/doc/tutorials/get-started/experiments) chapter of this _Get +Started_ section. + +Let's say we want to get the previous `model.pkl` file. The short answer is: + +```dvc +$ git checkout baseline-experiment train.dvc +$ dvc checkout train.dvc +``` + +These two commands will bring the previous model file to its place in the +workspace. + +
+ +### Expand to learn about DVC internals + +DVC uses special [DVC-files](/doc/user-guide/dvc-file-format) to track data +files, directories, and end results. In this case, `train.dvc` among other things +describes the `model.pkl` file this way: + +```yaml +outs: +  - md5: a66489653d1b6a8ba989799367b32c43 +    path: model.pkl +``` + +`a664...2c43` is the "address" of the file in the local or remote DVC storage. + +It means that if we want to get to the previous version, we need to restore the +DVC-file first with the `git checkout` command. Only after that can DVC restore +the model file using the new "address" from the DVC-file. + +
+ +To fully restore the previous experiment we just run `git checkout` and +`dvc checkout` without specifying a target: + +```dvc +$ git checkout baseline-experiment +$ dvc checkout +``` + +Read the `dvc checkout` command reference and a dedicated data versioning +[example](/doc/tutorials/versioning) for more information. diff --git a/content/docs/tutorials/get-started/experiments.md b/content/docs/tutorials/get-started/experiments.md deleted file mode 100644 index b716872a2e..0000000000 --- a/content/docs/tutorials/get-started/experiments.md +++ /dev/null @@ -1,49 +0,0 @@ -# Experiments - -Data science process is inherently iterative and R&D like. Data scientist may -try many different approaches, different hyper-parameter values, and "fail" many -times before the required level of a metric is achieved. - -DVC is built to provide a way to capture different experiments and navigate -easily between them. Let's say we want to try a modified feature extraction: - -
- -### Expand to see code modifications - -Edit `src/featurization.py` to enable bigrams and increase the number of -features. Find and change the `CountVectorizer` arguments, specify `ngram_range` -and increase number of features: - -```python -bag_of_words = CountVectorizer(stop_words='english', - max_features=6000, - ngram_range=(1, 2)) -``` - -
- -```dvc -$ vi src/featurization.py # edit to use bigrams (see above) -$ dvc repro train.dvc # regenerate the new model.pkl -$ git commit -am "Reproduce model using bigrams" -``` - -> Notice that `git commit -a` stages all the changes produced by `dvc repro` -> before committing them with Git. Refer to the -> [command reference](https://git-scm.com/docs/git-commit#Documentation/git-commit.txt--a) -> for more details. - -Now, we have a new `model.pkl` captured and saved. To get back to the initial -version, we run `git checkout` along with `dvc checkout` command: - -```dvc -$ git checkout baseline-experiment -$ dvc checkout -``` - -DVC is designed to checkout large data files (no matter how large they are) into -your workspace almost instantly on almost all modern operating -systems with file links. See -[Large Dataset Optimization](/doc/user-guide/large-dataset-optimization) for -more information. diff --git a/content/docs/tutorials/get-started/import-data.md b/content/docs/tutorials/get-started/import-data.md deleted file mode 100644 index 6900533d5c..0000000000 --- a/content/docs/tutorials/get-started/import-data.md +++ /dev/null @@ -1,87 +0,0 @@ -# Import Data - -We've seen how to [push](/doc/tutorials/get-started/store-data) and -[pull](/doc/tutorials/get-started/retrieve-data) data from/to a DVC -project's [remote](/doc/command-reference/remote). But what if we wanted -to integrate a dataset or ML model produced in one project into another one? - -One way is to manually download the data (with `wget` or `dvc get`, for example) -and use `dvc add` to track it, but the connection between the projects would be -lost. We wouldn't be able to tell where the data came from or whether there are -new versions available. 
A better alternative is the `dvc import` command: - - - -```dvc -$ dvc import https://github.com/iterative/dataset-registry \ - get-started/data.xml -``` - -This downloads `data.xml` from our -[dataset-registry](https://github.com/iterative/dataset-registry) project into -the current working directory, adds it to `.gitignore`, and creates the -`data.xml.dvc` [DVC-file](/doc/user-guide/dvc-file-format) to track changes in -the source data. With _imports_, we can use `dvc update` to bring in changes in -the external data source before -[reproducing](/doc/tutorials/get-started/reproduce) any pipeline -that depends on this data. - -
- -### Expand to learn more about imports - -Note that the [dataset-registry](https://github.com/iterative/dataset-registry) -repository doesn't actually contain a `get-started/data.xml` file. Instead, DVC -inspects -[get-started/data.xml.dvc](https://github.com/iterative/dataset-registry/blob/master/get-started/data.xml.dvc) -and tries to retrieve the file using the project's default remote (configured -[here](https://github.com/iterative/dataset-registry/blob/master/.dvc/config)). - -DVC-files created by `dvc import` are called _import stages_. They use the -`repo` field in the dependencies section (`deps`) in order to track source data -changes (as an [external dependency](/doc/user-guide/external-dependencies)), -enabling the reusability of data artifacts. For example: - -```yaml -md5: fd56a1794c147fea48d408f2bc95a33a -locked: true -deps: - - path: get-started/data.xml - repo: - url: https://github.com/iterative/dataset-registry - rev_lock: 7476a858f6200864b5755863c729bff41d0fb045 -outs: - - md5: a304afb96060aad90176268345e10355 - path: data.xml - cache: true - metric: false - persist: false -``` - -The `url` and `rev_lock` subfields under `repo` are used to save the origin and -[version](https://git-scm.com/docs/revisions) of the dependency, respectively. - -> Note that `dvc update` updates the `rev_lock` field of the corresponding -> DVC-file (when there are changes to bring in). - -
- -Since this is not an official part of this _Get Started_, bring everything back -to normal with: - -```dvc -$ git reset --hard -$ rm -f data.* -``` - -> See also `dvc import-url`. diff --git a/content/docs/tutorials/get-started/metrics.md b/content/docs/tutorials/get-started/metrics.md deleted file mode 100644 index e91ba6371f..0000000000 --- a/content/docs/tutorials/get-started/metrics.md +++ /dev/null @@ -1,45 +0,0 @@ -# Experiment Metrics - -Finally, we'd like to add an evaluation stage to our -[pipeline](/doc/command-reference/pipeline). Data science is a metric-driven -R&D-like process and `dvc metrics` commands along with DVC metric files provide -a framework to capture and compare experiments performance. It doesn't require -installing any databases or instrumenting your code to use some API, all is -tracked by Git and is stored in Git or DVC remote storage: - -```dvc -$ dvc run -f evaluate.dvc \ - -d src/evaluate.py -d model.pkl -d data/features \ - -M auc.metric \ - python src/evaluate.py model.pkl \ - data/features auc.metric -``` - -`evaluate.py` calculates AUC value using the test dataset. It reads features -from the `features/test.pkl` file and produces a -[metric](/doc/command-reference/metrics) file (`auc.metric`). Any -output (in this case just a plain text file containing a single -numeric value) can be marked as a metric, for example by using the `-M` option -of `dvc run`. - -> Please, refer to the `dvc metrics` command documentation to see more details. 
- -Let's save the updated results: - -```dvc -$ git add evaluate.dvc auc.metric -$ git commit -m "Create evaluation stage" -$ dvc push -``` - -Let's also assign a Git tag, it will serve as a checkpoint for us to compare -experiments in the future, or if we need to go back and checkout it and the -corresponding data: - -```dvc -$ git tag -a "baseline-experiment" -m "Baseline experiment evaluation" -``` - -The `dvc metrics show` command provides a way to compare different experiments, -by analyzing metric files across different branches, tags, etc. But first we -need to create a new experiment to compare the baseline with. diff --git a/content/docs/tutorials/get-started/older-versions.md b/content/docs/tutorials/get-started/older-versions.md deleted file mode 100644 index bde6bce562..0000000000 --- a/content/docs/tutorials/get-started/older-versions.md +++ /dev/null @@ -1,53 +0,0 @@ -# Get Older Data Version - -Now that we have multiple experiments, models, processed datasets, the question -is how do we revert back to an older version of a model file? Or how can we get -the previous version of the dataset if it was changed at some point? - -The answer is the `dvc checkout` command, and we already touched briefly the -process of switching between different data versions in the -[Experiments](/doc/tutorials/get-started/experiments) chapter of this _Get -Started_ section. - -Let's say we want to get the previous `model.pkl` file. The short answer is: - -```dvc -$ git checkout baseline-experiment train.dvc -$ dvc checkout train.dvc -``` - -These two commands will bring the previous model file to its place in the -workspace. - -
- -### Expand to learn about DVC internals - -DVC uses special [DVC-files](/doc/user-guide/dvc-file-format) to track data -files, directories, end results. In this case, `train.dvc` among other things -describes the `model.pkl` file this way: - -```yaml -outs: -md5: a66489653d1b6a8ba989799367b32c43 -path: model.pkl -``` - -`a664...2c43` is the "address" of the file in the local or remote DVC storage. - -It means that if we want to get to the previous version, we need to restore the -DVC-file first with the `git checkout` command. Only after that can DVC restore -the model file using the new "address" from the DVC-file. - -
- -To fully restore the previous experiment we just run `git checkout` and -`dvc checkout` without specifying a target: - -```dvc -$ git checkout baseline-experiment -$ dvc checkout -``` - -Read the `dvc checkout` command reference and a dedicated data versioning -[example](/doc/tutorials/versioning) for more information. diff --git a/content/docs/tutorials/get-started/pipeline.md b/content/docs/tutorials/get-started/pipeline.md deleted file mode 100644 index d9f0f19390..0000000000 --- a/content/docs/tutorials/get-started/pipeline.md +++ /dev/null @@ -1,44 +0,0 @@ -# Pipeline - -Support for [pipelines](/doc/command-reference/pipeline) is the biggest -difference between DVC and other version control tools that can handle large -data files (e.g. `git lfs`). By using `dvc run` multiple times, and specifying -outputs of a command (stage) as dependencies in another one, we can describe a -sequence of commands that gets to a desired result. This is what we call a -**data pipeline** or dependency graph. - -Let's create a second stage (after `prepare.dvc`, created in the previous -chapter) to perform feature extraction: - -```dvc -$ dvc run -f featurize.dvc \ - -d src/featurization.py -d data/prepared \ - -o data/features \ - python src/featurization.py \ - data/prepared data/features -``` - -And a third stage for training: - -```dvc -$ dvc run -f train.dvc \ - -d src/train.py -d data/features \ - -o model.pkl \ - python src/train.py data/features model.pkl -``` - -Let's commit DVC-files that describe our pipeline so far: - -```dvc -$ git add data/.gitignore .gitignore featurize.dvc train.dvc -$ git commit -m "Create featurization and training stages" -$ dvc push -``` - -This example is simplified just to show you a basic pipeline, see a more -advanced [example](/doc/tutorials/pipelines) or -[complete tutorial](/doc/tutorials/pipelines) to create an -[NLP](https://en.wikipedia.org/wiki/Natural_language_processing) pipeline -end-to-end. 
- -> See also the `dvc pipeline` command. diff --git a/content/docs/tutorials/get-started/reproduce.md b/content/docs/tutorials/get-started/reproduce.md deleted file mode 100644 index d6e6375878..0000000000 --- a/content/docs/tutorials/get-started/reproduce.md +++ /dev/null @@ -1,39 +0,0 @@ -# Reproduce - -In the previous chapters, we described our first -[pipeline](/doc/command-reference/pipeline). Basically, we generated a number of -[stage files](/doc/command-reference/run) -([DVC-files](/doc/user-guide/dvc-file-format)). These stages define individual -commands to execute towards a final result. Each depends on some data (either -raw data files or intermediate results from previous stages) and code files. - -If you just cloned the -[project](https://github.com/iterative/example-get-started), make sure you first -fetch the input data from DVC by calling `dvc pull`. - -It's now extremely easy for you or your colleagues to reproduce the result -end-to-end: - -```dvc -$ dvc repro train.dvc -``` - -> If you've just followed the previous chapters, the command above will have -> nothing to reproduce since you've recently executed all the pipeline stages. -> To easily try this command, clone this example -> [GitHub project](https://github.com/iterative/example-get-started) and run it -> from there. - -`train.dvc` describes which source code and data files to use, and how to run -the command in order to get the resulting model file. For each data file it -depends on, we can in turn do the same analysis: find a corresponding DVC-file -that includes the data file in its outputs, get dependencies and commands, and -so on. It means that DVC can recursively build a complete sequence of commands -it needs to execute to get the model file. - -`dvc repro` essentially builds a dependency graph, detects stages with modified -dependencies or missing outputs and recursively executes commands (nodes in this -graph or pipeline) starting from the first stage with changes. 
- -Thus, `dvc run` and `dvc repro` provide a powerful framework for _reproducible -experiments_ and _reproducible projects_. diff --git a/content/docs/tutorials/get-started/retrieve-data.md b/content/docs/tutorials/get-started/retrieve-data.md deleted file mode 100644 index 2a11926903..0000000000 --- a/content/docs/tutorials/get-started/retrieve-data.md +++ /dev/null @@ -1,29 +0,0 @@ -# Retrieve Data - -> You'll need to complete the -> [initialization](/doc/tutorials/get-started/initialize) and -> [configuration](/doc/tutorials/get-started/configure) chapters before being -> able to run the commands explained here. - -To retrieve data files into the workspace in your local machine, -run: - -```dvc -$ rm -f data/data.xml -$ dvc pull -``` - -This command downloads data files that are referenced in all -[DVC-files](/doc/user-guide/dvc-file-format) in the project. So, -you usually run it after `git clone`, `git pull`, or `git checkout`. - -Alternatively, if you want to retrieve a single dataset or a file you can use: - -```dvc -$ dvc pull data/data.xml.dvc -``` - -DVC remotes, `dvc push`, and `dvc pull` provide a basic collaboration workflow, -the same way as Git remotes, `git push` and `git pull`. See -[Sharing Data and Model Files](/doc/use-cases/sharing-data-and-model-files) for -more information. diff --git a/content/docs/tutorials/get-started/store-data.md b/content/docs/tutorials/get-started/store-data.md deleted file mode 100644 index 1306681e27..0000000000 --- a/content/docs/tutorials/get-started/store-data.md +++ /dev/null @@ -1,44 +0,0 @@ -# Store and Share Data - -Now, that your data files are managed by DVC (see -[Add Files](/doc/tutorials/get-started/add-files)), you can push them from your -repository to the default [remote](/doc/command-reference/remote) storage\*: - -```dvc -$ dvc push -``` - -The same way as with Git remote, it ensures that your data files and your models -are safely stored remotely and are shareable. 
This means that the data can be -pulled by yourself or your colleagues whenever you need it. - -Usually, you run it along with `git commit` and `git push` to save the changed -[DVC-files](/doc/user-guide/dvc-file-format). - -The `dvc push` command allows one to upload data to remote storage. It doesn't -save any changes in the code or DVC-files. Those should be saved by using -`git commit` and `git push`. - -> \*As noted in the DVC [configuration](/doc/tutorials/get-started/configure) -> chapter, we are using a **local remote** in this section for illustrative -> purposes. - -
- -### Expand to learn more about DVC internals - -You can check now that actual data file has been copied to the remote we created -in the [configuration](/doc/tutorials/get-started/configure) chapter: - -```dvc -$ ls -R /tmp/dvc-storage -/tmp/dvc-storage/a3: -04afb96060aad90176268345e10355 -``` - -`a304afb96060aad90176268345e10355` above is the hash value of the `data.xml` -file. If you check the `data.xml.dvc` -[DVC-file](/doc/user-guide/dvc-file-format), you will see that it has this -string inside. - -
diff --git a/content/docs/tutorials/get-started/versioning-basics.md b/content/docs/tutorials/get-started/versioning-basics.md new file mode 100644 index 0000000000..257d43c417 --- /dev/null +++ b/content/docs/tutorials/get-started/versioning-basics.md @@ -0,0 +1,252 @@ +# Add Files or Directories + +DVC allows storing and versioning data files, ML models, directories, +intermediate results with Git, without tracking the file contents with Git. +Let's get a dataset example to play with: + +```dvc +$ mkdir data +$ dvc get https://github.com/iterative/dataset-registry \ + get-started/data.xml -o data/data.xml +``` + +> `dvc get` can use any DVC repository to find the appropriate +> [remote storage](/doc/command-reference/remote) and download data +> artifacts from it (analogous to `wget`, but for repositories). In this +> case we use [dataset-registry](https://github.com/iterative/dataset-registry) +> as the source repo. (Refer to +> [Data Registries](/doc/use-cases/data-registries) for more info about this +> setup.) + +To track a file (or a directory) with DVC just run `dvc add` on it. For example: + +```dvc +$ dvc add data/data.xml +``` + +DVC stores information about the added data in a special file called a +**DVC-file**. DVC-files are small text files with a human-readable +[format](/doc/user-guide/dvc-file-format) and they can be committed with Git: + +```dvc +$ git add data/.gitignore data/data.xml.dvc +$ git commit -m "Add raw data to project" +``` + +Committing DVC-files with Git allows us to track different versions of the +project data as it evolves with the source code tracked by Git. + +
+ +### Expand to learn about DVC internals + +`dvc add` moves the actual data file to the cache directory (see +[DVC Files and Directories](/doc/user-guide/dvc-files-and-directories)), while +the entries in the workspace may be file links to the actual files in the DVC +cache. + +```dvc +$ ls -R .dvc/cache + .dvc/cache/a3: + 04afb96060aad90176268345e10355 +``` + +`a304afb96060aad90176268345e10355` above is the hash value of the `data.xml` +file we just added with DVC. If you check the `data/data.xml.dvc` DVC-file, you +will see that it has this string inside. + +### Important note on cache performance + +DVC tries to use reflinks\* by default to link your data files from the DVC +cache to the workspace, optimizing speed and storage space. However, reflinks +are not widely supported yet and DVC falls back to actually copying data files +to/from the cache. **Copying can be very slow with large files**, and duplicates +storage requirements. + +Hardlinks and symlinks are also available for optimized cache linking but, +(unlike reflinks) they carry the risk of accidentally corrupting the cache if +tracked data files are modified in the workspace. + +See [Large Dataset Optimization](/doc/user-guide/large-dataset-optimization) and +`dvc config cache` for more information. + +> \***copy-on-write links or "reflinks"** are a relatively new way to link files +> in UNIX-style file systems. Unlike hardlinks or symlinks, they support +> transparent [copy on write](https://en.wikipedia.org/wiki/Copy-on-write). This +> means that editing a reflinked file is always safe as all the other links to +> the file will reflect the changes. + +
+ +If your workspace uses Git, without DVC you would have to manually put each data +file or directory into `.gitignore`. DVC commands that track data files +automatically take care of this for you! (You just have to add the changes with +Git.) + +Refer to +[Versioning Data and Model Files](/doc/use-cases/versioning-data-and-model-files), +`dvc add`, and `dvc run` for more information on storing and versioning data +files with DVC. + +# Store and Share Data + +Now that your data files are managed by DVC (see +[Add Files](/doc/tutorials/get-started/add-files)), you can push them from your +repository to the default [remote](/doc/command-reference/remote) storage\*: + +```dvc +$ dvc push +``` + +The same way as with Git remote, it ensures that your data files and your models +are safely stored remotely and are shareable. This means that the data can be +pulled by yourself or your colleagues whenever you need it. + +Usually, you run it along with `git commit` and `git push` to save the changed +[DVC-files](/doc/user-guide/dvc-file-format). + +The `dvc push` command allows one to upload data to remote storage. It doesn't +save any changes in the code or DVC-files. Those should be saved by using +`git commit` and `git push`. + +> \*As noted in the DVC [configuration](/doc/tutorials/get-started/configure) +> chapter, we are using a **local remote** in this section for illustrative +> purposes. + 
+ +### Expand to learn more about DVC internals + +You can now check that the actual data file has been copied to the remote we created +in the [configuration](/doc/tutorials/get-started/configure) chapter: + +```dvc +$ ls -R /tmp/dvc-storage +/tmp/dvc-storage/a3: +04afb96060aad90176268345e10355 +``` + +`a304afb96060aad90176268345e10355` above is the hash value of the `data.xml` +file. If you check the `data.xml.dvc` +[DVC-file](/doc/user-guide/dvc-file-format), you will see that it has this +string inside. + 
+ +# Import Data + +We've seen how to [push](/doc/tutorials/get-started/store-data) and +[pull](/doc/tutorials/get-started/retrieve-data) data from/to a DVC +project's [remote](/doc/command-reference/remote). But what if we wanted +to integrate a dataset or ML model produced in one project into another one? + +One way is to manually download the data (with `wget` or `dvc get`, for example) +and use `dvc add` to track it, but the connection between the projects would be +lost. We wouldn't be able to tell where the data came from or whether there are +new versions available. A better alternative is the `dvc import` command: + + + +```dvc +$ dvc import https://github.com/iterative/dataset-registry \ + get-started/data.xml +``` + +This downloads `data.xml` from our +[dataset-registry](https://github.com/iterative/dataset-registry) project into +the current working directory, adds it to `.gitignore`, and creates the +`data.xml.dvc` [DVC-file](/doc/user-guide/dvc-file-format) to track changes in +the source data. With _imports_, we can use `dvc update` to bring in changes in +the external data source before +[reproducing](/doc/tutorials/get-started/reproduce) any pipeline +that depends on this data. + +
+ +### Expand to learn more about imports + +Note that the [dataset-registry](https://github.com/iterative/dataset-registry) +repository doesn't actually contain a `get-started/data.xml` file. Instead, DVC +inspects +[get-started/data.xml.dvc](https://github.com/iterative/dataset-registry/blob/master/get-started/data.xml.dvc) +and tries to retrieve the file using the project's default remote (configured +[here](https://github.com/iterative/dataset-registry/blob/master/.dvc/config)). + +DVC-files created by `dvc import` are called _import stages_. They use the +`repo` field in the dependencies section (`deps`) in order to track source data +changes (as an [external dependency](/doc/user-guide/external-dependencies)), +enabling the reusability of data artifacts. For example: + +```yaml +md5: fd56a1794c147fea48d408f2bc95a33a +locked: true +deps: + - path: get-started/data.xml + repo: + url: https://github.com/iterative/dataset-registry + rev_lock: 7476a858f6200864b5755863c729bff41d0fb045 +outs: + - md5: a304afb96060aad90176268345e10355 + path: data.xml + cache: true + metric: false + persist: false +``` + +The `url` and `rev_lock` subfields under `repo` are used to save the origin and +[version](https://git-scm.com/docs/revisions) of the dependency, respectively. + +> Note that `dvc update` updates the `rev_lock` field of the corresponding +> DVC-file (when there are changes to bring in). + +
+ +Since this is not an official part of this _Get Started_, bring everything back +to normal with: + +```dvc +$ git reset --hard +$ rm -f data.* +``` + +> See also `dvc import-url`. + +# Retrieve Data + +> You'll need to complete the +> [initialization](/doc/tutorials/get-started/initialize) and +> [configuration](/doc/tutorials/get-started/configure) chapters before being +> able to run the commands explained here. + +To retrieve data files into the workspace in your local machine, +run: + +```dvc +$ rm -f data/data.xml +$ dvc pull +``` + +This command downloads data files that are referenced in all +[DVC-files](/doc/user-guide/dvc-file-format) in the project. So, +you usually run it after `git clone`, `git pull`, or `git checkout`. + +Alternatively, if you want to retrieve a single dataset or a file you can use: + +```dvc +$ dvc pull data/data.xml.dvc +``` + +DVC remotes, `dvc push`, and `dvc pull` provide a basic collaboration workflow, +the same way as Git remotes, `git push` and `git pull`. See +[Sharing Data and Model Files](/doc/use-cases/sharing-data-and-model-files) for +more information. diff --git a/content/docs/tutorials/get-started/visualize.md b/content/docs/tutorials/get-started/visualize.md deleted file mode 100644 index 5b7e5c293f..0000000000 --- a/content/docs/tutorials/get-started/visualize.md +++ /dev/null @@ -1,84 +0,0 @@ -# Visualize - -Now that we have built our pipeline, we need a good way to visualize it to be -able to wrap our heads around it. Luckily, DVC allows us to do that without -leaving the terminal, making the experience distraction-less. - -We are using the `--ascii` option below to better illustrate this pipeline. -Please, refer to `dvc pipeline show` to explore other options this command -supports (e.g. `.dot` files that can be used then in other tools). 
- -## Stages - -```dvc -$ dvc pipeline show --ascii train.dvc - +-------------------+ - | data/data.xml.dvc | - +-------------------+ - * - * - * - +-------------+ - | prepare.dvc | - +-------------+ - * - * - * - +---------------+ - | featurize.dvc | - +---------------+ - * - * - * - +-----------+ - | train.dvc | - +-----------+ -``` - -## Commands - -```dvc -$ dvc pipeline show --ascii train.dvc --commands - +-------------------------------------+ - | python src/prepare.py data/data.xml | - +-------------------------------------+ - * - * - * - +---------------------------------------------------------+ - | python src/featurization.py data/prepared data/features | - +---------------------------------------------------------+ - * - * - * - +---------------------------------------------+ - | python src/train.py data/features model.pkl | - +---------------------------------------------+ -``` - -## Outputs - -```dvc -$ dvc pipeline show --ascii train.dvc --outs - +---------------+ - | data/data.xml | - +---------------+ - * - * - * - +---------------+ - | data/prepared | - +---------------+ - * - * - * - +---------------+ - | data/features | - +---------------+ - * - * - * - +-----------+ - | model.pkl | - +-----------+ -``` From e0a424d8fdf29e980d7748e2ddb7abce2bab7daa Mon Sep 17 00:00:00 2001 From: Jorge Orpinel Date: Mon, 23 Mar 2020 03:29:56 -0600 Subject: [PATCH 5/9] tutorials: finish get-started index page (part 0) --- content/docs/command-reference/cache/index.md | 2 +- content/docs/command-reference/init.md | 2 +- .../docs/command-reference/remote/index.md | 6 +- content/docs/command-reference/remote/list.md | 6 +- content/docs/index.md | 25 +++- content/docs/tutorials/deep/preparation.md | 2 +- content/docs/tutorials/get-started/index.md | 116 +++++++----------- content/docs/tutorials/pipelines.md | 2 +- content/docs/understanding-dvc/what-is-dvc.md | 19 ++- .../versioning-data-and-model-files.md | 2 +- 10 files changed, 83 insertions(+), 99 deletions(-) diff 
--git a/content/docs/command-reference/cache/index.md b/content/docs/command-reference/cache/index.md index fb802105d4..61a3c7ee62 100644 --- a/content/docs/command-reference/cache/index.md +++ b/content/docs/command-reference/cache/index.md @@ -15,7 +15,7 @@ positional arguments: ## Description -At DVC initialization, a new `.dvc/` directory will be created for internal +At DVC initialization, a new `.dvc/` directory is created for internal configuration and cache [files and directories](/doc/user-guide/dvc-files-and-directories) that are hidden from the user. diff --git a/content/docs/command-reference/init.md b/content/docs/command-reference/init.md index ec1e153ddb..d667d55e9f 100644 --- a/content/docs/command-reference/init.md +++ b/content/docs/command-reference/init.md @@ -22,7 +22,7 @@ advanced scenarios: - [Initializing DVC without Git](#how-does-it-affect-dvc-commands) - support for SCM other than Git, deployment automation cases, etc. -At DVC initialization, a new `.dvc/` directory will be created for internal +At DVC initialization, a new `.dvc/` directory is created for internal configuration and cache [files and directories](/doc/user-guide/dvc-files-and-directories) that are hidden from the user. diff --git a/content/docs/command-reference/remote/index.md b/content/docs/command-reference/remote/index.md index cbe8a8947c..d6b739ec9a 100644 --- a/content/docs/command-reference/remote/index.md +++ b/content/docs/command-reference/remote/index.md @@ -74,9 +74,9 @@ For the typical process to share the project via remote, see ### What is a "local remote" ? While the term may seem contradictory, it doesn't have to be. The "local" part -refers to the machine where the project is stored, so it can be any directory -accessible to the same system. The "remote" part refers specifically to the -project/repository itself. Read "local, but external" storage. 
+refers to the location of the storage relative to the project, so it can be any +directory in the file system. "Remote" is the term that refers to the storage. +Read "local cache backup". diff --git a/content/docs/command-reference/remote/list.md b/content/docs/command-reference/remote/list.md index 57b39278cb..30880fecaf 100644 --- a/content/docs/command-reference/remote/list.md +++ b/content/docs/command-reference/remote/list.md @@ -45,9 +45,9 @@ Let's for simplicity add a _default_ local remote: ### What is a "local remote" ? While the term may seem contradictory, it doesn't have to be. The "local" part -refers to the machine where the project is stored, so it can be any directory -accessible to the same system. The "remote" part refers specifically to the -project/repository itself. Read "local, but external" storage. +refers to the location of the storage relative to the project, so it can be any +directory in the file system. "Remote" is the term that refers to the storage. +Read "local cache backup". diff --git a/content/docs/index.md b/content/docs/index.md index f23d119383..7de213aaaa 100644 --- a/content/docs/index.md +++ b/content/docs/index.md @@ -1,8 +1,27 @@ # DVC Documentation -Welcome! In here you may find all the guiding material and technical documents -needed to learn about DVC: how to use it, how it works, and where to go for -additional resources. +Welcome! In here you may find all the material and technical details needed to +learn about DVC: how to use it, how it works, and where to go for additional +resources. + +## What is DVC? + +Data Version Control, or DVC, is **a new type of experiment management +software** built on top of the existing engineering toolset that you're already +used to, particularly [Git](https://git-scm.com) source code management. DVC +reduces the gap between existing tools and data science needs. 
+ +If you store and process data files or datasets to produce other data or machine +learning models, and you want to + +- capture and save data artifacts the same way you capture code; +- track and switch between different versions of data or models easily; +- understand how data or models were built in the first place; +- be able to compare models and metrics to each other; +- bring software engineering best practices to your data science team; +- among other [use cases](/doc/use-cases) + +DVC is for you! ## Before you start diff --git a/content/docs/tutorials/deep/preparation.md b/content/docs/tutorials/deep/preparation.md index 6db7335400..4d81d1501b 100644 --- a/content/docs/tutorials/deep/preparation.md +++ b/content/docs/tutorials/deep/preparation.md @@ -61,7 +61,7 @@ $ pip install -r code/requirements.txt DVC works on top of Git repositories. You run DVC initialization in a repository directory to create DVC meta files and directories. -At DVC initialization, a new `.dvc/` directory will be created for internal +At DVC initialization, a new `.dvc/` directory is created for internal configuration and cache [files and directories](/doc/user-guide/dvc-files-and-directories) that are hidden from the user. We describe some DVC internals below for a better diff --git a/content/docs/tutorials/get-started/index.md b/content/docs/tutorials/get-started/index.md index eeed4adbef..62e7e19591 100644 --- a/content/docs/tutorials/get-started/index.md +++ b/content/docs/tutorials/get-started/index.md @@ -1,92 +1,71 @@ # Get Started with DVC! -You'll need [Git](https://git-scm.com) to run the commands in this guide. Also, -if DVC is not installed, please follow these [instructions](/doc/install) to do -so. +You'll need [Git](https://git-scm.com) to run the commands in this tutorial. +Also, if DVC is not installed, please follow these [instructions](/doc/install) +first. -In the next few sections we'll build a simple natural language processing (NLP) -project from scratch. 
If you'd like to get the final result or have any issues -along the way, you can download the fully reproducible -[GitHub project](https://github.com/iterative/example-get-started) by running: +In the next few pages we'll build a simple natural language processing (NLP) +project from scratch. If you'd like to get the complete project or have any +issues along the way, you can clone the fully reproducible +[GitHub project](https://github.com/iterative/example-get-started): ```dvc $ git clone https://github.com/iterative/example-get-started ``` -Otherwise, bear with us and we'll introduce some basic DVC concepts to get the -same results together! +This project explores the NLP problem of predicting tags for a given +StackOverflow question. For example, we might want a classifier that can +classify (or predict) posts about Python by tagging them with `python`. -The idea for this project is a simplified version of our -[Deep Dive Tutorial](/doc/tutorials/deep). It explores the NLP problem of -predicting tags for a given StackOverflow question. For example, we might want a -classifier that can classify (or predict) posts about Python by tagging them -with `python`. +![](/img/example-flow-2x.png) _Data modeling overview_ -![](/img/example-flow-2x.png) +> This is a simplified version of our [Deep Dive Tutorial](/doc/tutorials/deep). -This is a natural language processing context, but NLP isn't the only area of -data science where DVC can help. DVC is designed to be agnostic of frameworks, -languages, etc. If you have data files or datasets and/or you produce data -files, models, or datasets and you want to: +Keep in mind that NLP is not the only area of data science where DVC can help. +DVC is designed to be agnostic of frameworks, programming languages, etc. -- Capture and save those data artifacts the same way you capture - code -- Track and switch between different versions of data easily -- Understand how data artifacts (e.g. 
ML models) were built in the first place -- Be able to compare models to each other -- Bring software best practices to your team and get everyone on the same page - -# Initialize - -There are a few recommended ways to install DVC: OS-specific package/installer, -`pip`, `conda`, and Homebrew. See [Installation](/doc/install) for all the -alternatives and details. +## Initialize Let's start by creating a workspace we can version with Git. Then -run `dvc init` inside to create the DVC project: +run `dvc init` inside to create a DVC repository: ```dvc $ mkdir example-get-started $ cd example-get-started $ git init $ dvc init +... $ git commit -m "Initialize DVC project" ``` -At DVC initialization, a new `.dvc/` directory will be created for internal +At DVC initialization, a new `.dvc/` directory is created for internal configuration and cache [files and directories](/doc/user-guide/dvc-files-and-directories) that are hidden from the user. -> See `dvc init` if you want to get more details about the initialization -> process, and -> [DVC Files and Directories](/doc/user-guide/dvc-files-and-directories) to -> learn about the DVC internal file and directory structure. +> See [DVC Files and Directories](/doc/user-guide/dvc-files-and-directories) to +> learn more about the DVC internal file and directory structure. The last command, `git commit`, versions the `.dvc/config` and `.dvc/.gitignore` -files (DVC internals) with Git. +DVC internal files with Git. -# Configure +## Configure -Once you install DVC, you'll be able to start using it (in its local setup) -immediately. +Remote storage for the project (see `dvc remote`) should be set up +if you need to share data or models outside of the local context, for example +with other collaborators or even for yourself to access from a different +computing environment. 
-However, remote storage should be set up (see `dvc remote`) if you need to share -data or models outside of the context of a single project, for example with -other collaborators or even with yourself, in a different computing environment. -It's similar to the way you would use GitHub or any other Git server to store -and share your code. - -For simplicity, let's setup a local remote: +For simplicity, let's set up a _local remote_.
### What is a "local remote" ? While the term may seem contradictory, it doesn't have to be. The "local" part -refers to the machine where the project is stored, so it can be any directory -accessible to the same system. The "remote" part refers specifically to the -project/repository itself. Read "local, but external" storage. +refers to the location of the storage relative to the project, so it can be any +directory in the file system. "Remote" is the term that refers to the storage. +Read "local cache backup".
@@ -95,41 +74,28 @@ $ dvc remote add -d myremote /tmp/dvc-storage $ git commit .dvc/config -m "Configure local remote" ``` -> We only use a local remote in this section for simplicity's sake as you learn -> to use DVC. For most [use cases](/doc/use-cases), other "more remote" types of -> remotes will be required. +> We only use a local remote in this tutorial for simplicity's sake.. For most +> cases, other "more remote" types of remotes will be required. -[Adding a remote](/doc/command-reference/remote/add) should be specified by both -its type (protocol) and its path. DVC currently supports these types of remotes: +[Adding a remote](/doc/command-reference/remote/add) requires specifying both +its type and its path. DVC currently supports these protocols: - `s3`: Amazon Simple Storage Service - `azure`: Microsoft Azure Blob Storage -- `gdrive` : Google Drive +- `gdrive`: Google Drive - `gs`: Google Cloud Storage - `ssh`: Secure Shell (requires SFTP) - `hdfs`: Hadoop Distributed File System - `http`: HTTP and HTTPS protocols - `local`: Directory in the local file system -> If you installed DVC via `pip` and plan to use cloud services as remote -> storage, you might need to install these optional dependencies: `[s3]`, -> `[azure]`, `[gdrive]`, `[gs]`, `[oss]`, `[ssh]`. Alternatively, use `[all]` to -> include them all. The command should look like this: `pip install "dvc[s3]"`. -> (This example installs `boto3` library along with DVC to support S3 storage.) +> Refer to `dvc remote` for more details and examples. -For example, to setup an S3 remote we would use something like this (make sure -that `mybucket` exists): +There are several more things that can be optionally configured in DVC +projects, please see `dvc config` for more information. -```dvc -$ dvc remote add -d s3remote s3://mybucket/myproject -``` - -> This command is only shown for informational purposes. No need to actually run -> it in order to continue with the Get Started. +## Ready to go! 
You can see that DVC doesn't require installing any databases, servers, or -warehouses. It can use bare S3 or SSH to store data, intermediate results, and -models. - -See `dvc config` to get information about more configuration options and -`dvc remote` to learn more about remotes and get more examples. +warehouses. It can use simple S3 or SSH to store data, intermediate results, and +ML models. Please go to the next page of this tutorial to continue ↘ diff --git a/content/docs/tutorials/pipelines.md b/content/docs/tutorials/pipelines.md index 4bcd33da7e..bdcc4bbce5 100644 --- a/content/docs/tutorials/pipelines.md +++ b/content/docs/tutorials/pipelines.md @@ -102,7 +102,7 @@ When we run `dvc add` `Posts.xml.zip`, DVC creates a ### Expand to learn about DVC internals -At DVC initialization, a new `.dvc/` directory will be created for internal +At DVC initialization, a new `.dvc/` directory is created for internal configuration and cache [files and directories](/doc/user-guide/dvc-files-and-directories) that are hidden from the user. diff --git a/content/docs/understanding-dvc/what-is-dvc.md b/content/docs/understanding-dvc/what-is-dvc.md index 444d7a6774..7f21206a88 100644 --- a/content/docs/understanding-dvc/what-is-dvc.md +++ b/content/docs/understanding-dvc/what-is-dvc.md @@ -1,16 +1,15 @@ # What Is DVC? Data Version Control, or DVC, is **a new type of experiment management -software** that has been built **on top of the existing engineering toolset that -you're already used to**, and particularly on a source code version control -system (currently Git). DVC reduces the gap between existing tools and data -science needs, allowing users to take advantage of experiment management -software while reusing existing skills and intuition. - -The underlying source code control system eliminates the need to use external -services. 
Data science experiment sharing and collaboration can be done through -regular Git tools (commit messages, merges, pull requests, etc) the same way it -works for software engineers. +software** built on top of the existing engineering toolset that you're already +used to, and particularly on a source code management system (Git). DVC reduces the gap +between existing tools and data science needs, allowing users to take advantage +of experiment management while reusing existing skills and intuition. + +Leveraging an underlying source code management system eliminates the need to +use external services. Data science experiment sharing and collaboration can be +done through regular Git features (commit messages, merges, pull requests, etc.) +the same way it works for software engineers. DVC implements a **Git experimentation methodology** where each experiment exists with its code as well as data, and can be represented as a separate Git diff --git a/content/docs/use-cases/versioning-data-and-model-files.md b/content/docs/use-cases/versioning-data-and-model-files.md index dbf45e37ce..887bf62e15 100644 --- a/content/docs/use-cases/versioning-data-and-model-files.md +++ b/content/docs/use-cases/versioning-data-and-model-files.md @@ -42,7 +42,7 @@ initialize the DVC project on top of the existing repository: $ dvc init ``` -At DVC initialization, a new `.dvc/` directory will be created for internal +At DVC initialization, a new `.dvc/` directory is created for internal configuration and cache [files and directories](/doc/user-guide/dvc-files-and-directories) that are hidden from the user. 
These can safely be tracked with Git: From 8c11f740df3a83ad8e1c3d256d757ee530c26252 Mon Sep 17 00:00:00 2001 From: Jorge Orpinel Date: Mon, 23 Mar 2020 04:18:33 -0600 Subject: [PATCH 6/9] tutorials: a few more impovements to get-started index --- content/docs/tutorials/get-started/index.md | 60 +++++++++++---------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/content/docs/tutorials/get-started/index.md b/content/docs/tutorials/get-started/index.md index 62e7e19591..5b9f2b9e49 100644 --- a/content/docs/tutorials/get-started/index.md +++ b/content/docs/tutorials/get-started/index.md @@ -5,12 +5,14 @@ Also, if DVC is not installed, please follow these [instructions](/doc/install) first. In the next few pages we'll build a simple natural language processing (NLP) -project from scratch. If you'd like to get the complete project or have any -issues along the way, you can clone the fully reproducible -[GitHub project](https://github.com/iterative/example-get-started): +project from scratch. In case you'd like to get the complete code base and +results, or have any issues along the way, here's a fully reproducible +[repo on GitHub](https://github.com/iterative/example-get-started): ```dvc $ git clone https://github.com/iterative/example-get-started +$ cd example-get-started +$ dvc pull ``` This project explores the NLP problem of predicting tags for a given @@ -35,7 +37,7 @@ $ cd example-get-started $ git init $ dvc init ... -$ git commit -m "Initialize DVC project" +$ git commit -m "Initialize DVC repo" ``` At DVC initialization, a new `.dvc/` directory is created for internal @@ -51,12 +53,10 @@ DVC internal files with Git. ## Configure -Remote storage for the project (see `dvc remote`) should be set up -if you need to share data or models outside of the local context, for example -with other collaborators or even for yourself to access from a different -computing environment. - -For simplicity, let's set up a _local remote_. 
+Because we'll want to share data and models outside of the local context later +(for example with other collaborators or for access from a different computing +environment), we're going to set up remote storage for the DVC +project. For simplicity, let's set up a _local remote_.
@@ -65,7 +65,7 @@ For simplicity, let's set up a _local remote_. While the term may seem contradictory, it doesn't have to be. The "local" part refers to the location of the storage relative to the project, so it can be any directory in the file system. "Remote" is the term that refers to the storage. -Read "local cache backup". +Read "local cache backup".
@@ -74,28 +74,32 @@ $ dvc remote add -d myremote /tmp/dvc-storage $ git commit .dvc/config -m "Configure local remote" ``` -> We only use a local remote in this tutorial for simplicity's sake.. For most +> We only use a local remote in this tutorial for simplicity's sake. For most > cases, other "more remote" types of remotes will be required. -[Adding a remote](/doc/command-reference/remote/add) requires specifying both -its type and its path. DVC currently supports these protocols: +[Adding a remote](/doc/command-reference/remote/add) requires providing a +location which includes both a type (protocol) and its path. DVC currently +supports these types: -- `s3`: Amazon Simple Storage Service -- `azure`: Microsoft Azure Blob Storage -- `gdrive`: Google Drive -- `gs`: Google Cloud Storage -- `ssh`: Secure Shell (requires SFTP) -- `hdfs`: Hadoop Distributed File System -- `http`: HTTP and HTTPS protocols -- `local`: Directory in the local file system +- Amazon **S3** (Simple Storage Service) +- Microsoft **Azure** Blob Storage +- **Google Drive** +- **Google Cloud** Storage +- Aliyun **OSS** (Object Storage Service) +- **SSH** (Secure Shell) — requires SFTP +- HDFS (Hadoop Distributed File System) +- **HTTP** (and HTTPS) — read-only +- Directory in the **local** file system > Refer to `dvc remote` for more details and examples. -There are several more things that can be optionally configured in DVC -projects, please see `dvc config` for more information. +DVC doesn't require installing any databases, servers, or warehouses. It can +simply use cloud services, local or network file systems to store data, +intermediate results, and ML models. + +There are other features and options that can optionally be configured in DVC. +Please see `dvc config` for more information. -## Ready to go! +--- -You can see that DVC doesn't require installing any databases, servers, or -warehouses. It can use simple S3 or SSH to store data, intermediate results, and -ML models. 
Please go to the next page of this tutorial to continue ↘ +Go to the next page to continue ↘ From 3f13e2b442c9b04ae0d57368185b62fdcbc7dc85 Mon Sep 17 00:00:00 2001 From: Jorge Orpinel Date: Tue, 24 Mar 2020 12:21:59 -0600 Subject: [PATCH 7/9] tutorials: finish get-started/versioning-basics page (page 1/3) --- content/docs/command-reference/cache/index.md | 4 +- content/docs/command-reference/init.md | 4 +- content/docs/tutorials/deep/preparation.md | 4 +- content/docs/tutorials/get-started/index.md | 61 ++-- .../get-started/versioning-basics.md | 268 ++++++++---------- content/docs/tutorials/pipelines.md | 18 +- content/docs/tutorials/versioning.md | 14 +- .../versioning-data-and-model-files.md | 4 +- 8 files changed, 180 insertions(+), 197 deletions(-) diff --git a/content/docs/command-reference/cache/index.md b/content/docs/command-reference/cache/index.md index 61a3c7ee62..748a47342f 100644 --- a/content/docs/command-reference/cache/index.md +++ b/content/docs/command-reference/cache/index.md @@ -16,8 +16,8 @@ positional arguments: ## Description At DVC initialization, a new `.dvc/` directory is created for internal -configuration and cache -[files and directories](/doc/user-guide/dvc-files-and-directories) that are +configuration and cache +[files and directories](/doc/user-guide/dvc-files-and-directories), that are hidden from the user. The cache is where your data files, models, etc (anything you want to version diff --git a/content/docs/command-reference/init.md b/content/docs/command-reference/init.md index d667d55e9f..cc811d3c61 100644 --- a/content/docs/command-reference/init.md +++ b/content/docs/command-reference/init.md @@ -23,8 +23,8 @@ advanced scenarios: SCM other than Git, deployment automation cases, etc. 
At DVC initialization, a new `.dvc/` directory is created for internal -configuration and cache -[files and directories](/doc/user-guide/dvc-files-and-directories) that are +configuration and cache +[files and directories](/doc/user-guide/dvc-files-and-directories), that are hidden from the user. ### Initializing DVC in subdirectories diff --git a/content/docs/tutorials/deep/preparation.md b/content/docs/tutorials/deep/preparation.md index 4d81d1501b..bd25bbaf97 100644 --- a/content/docs/tutorials/deep/preparation.md +++ b/content/docs/tutorials/deep/preparation.md @@ -62,8 +62,8 @@ DVC works on top of Git repositories. You run DVC initialization in a repository directory to create DVC meta files and directories. At DVC initialization, a new `.dvc/` directory is created for internal -configuration and cache -[files and directories](/doc/user-guide/dvc-files-and-directories) that are +configuration and cache +[files and directories](/doc/user-guide/dvc-files-and-directories), that are hidden from the user. We describe some DVC internals below for a better understanding of how it works. diff --git a/content/docs/tutorials/get-started/index.md b/content/docs/tutorials/get-started/index.md index 5b9f2b9e49..77bfcd7fc2 100644 --- a/content/docs/tutorials/get-started/index.md +++ b/content/docs/tutorials/get-started/index.md @@ -5,17 +5,7 @@ Also, if DVC is not installed, please follow these [instructions](/doc/install) first. In the next few pages we'll build a simple natural language processing (NLP) -project from scratch. In case you'd like to get the complete code base and -results, or have any issues along the way, here's a fully reproducible -[repo on GitHub](https://github.com/iterative/example-get-started): - -```dvc -$ git clone https://github.com/iterative/example-get-started -$ cd example-get-started -$ dvc pull -``` - -This project explores the NLP problem of predicting tags for a given +project from scratch. 
It explores the NLP problem of predicting tags for a given StackOverflow question. For example, we might want a classifier that can classify (or predict) posts about Python by tagging them with `python`. @@ -26,23 +16,35 @@ classify (or predict) posts about Python by tagging them with `python`. Keep in mind that NLP is not the only area of data science where DVC can help. DVC is designed to be agnostic of frameworks, programming languages, etc. +> In case you'd like to get the complete code base and results, or have any +> issues along the way, please note we have a fully reproducible +> [**example-get-started**](https://github.com/iterative/example-get-started) +> repo on GitHub: +> +> ```dvc +> $ git clone https://github.com/iterative/example-get-started +> $ cd example-get-started +> $ dvc pull +> ``` + ## Initialize -Let's start by creating a workspace we can version with Git. Then -run `dvc init` inside to create a DVC repository: +Let's start by creating a workspace in your home directory that we can +version with Git. Then run `dvc init` inside to create a DVC +repository: ```dvc -$ mkdir example-get-started -$ cd example-get-started +$ cd ~ +$ mkdir sotag-predictions +$ cd sotag-predictions $ git init $ dvc init -... -$ git commit -m "Initialize DVC repo" +$ git commit -m "Initialize DVC repository" ``` At DVC initialization, a new `.dvc/` directory is created for internal -configuration and cache -[files and directories](/doc/user-guide/dvc-files-and-directories) that are +configuration and cache +[files and directories](/doc/user-guide/dvc-files-and-directories), that are hidden from the user. > See [DVC Files and Directories](/doc/user-guide/dvc-files-and-directories) to @@ -55,7 +57,7 @@ DVC internal files with Git. 
Because we'll want to share data and models outside of the local context later (for example with other collaborators or for access from a different computing -environment), we're going to set up remote storage for the DVC +environment), we're going to set up a remote storage for the DVC project. For simplicity, let's set up a _local remote_.
@@ -75,11 +77,12 @@ $ git commit .dvc/config -m "Configure local remote" ``` > We only use a local remote in this tutorial for simplicity's sake. For most -> cases, other "more remote" types of remotes will be required. +> cases, other "more remote" types of storage will be required. -[Adding a remote](/doc/command-reference/remote/add) requires providing a -location which includes both a type (protocol) and its path. DVC currently -supports these types: +That's it! DVC doesn't require installing any databases, servers, or warehouses. +It can simply use cloud services, local or network file systems to store data, +intermediate results, and ML models. The following remote types are currently +supported: - Amazon **S3** (Simple Storage Service) - Microsoft **Azure** Blob Storage @@ -87,18 +90,14 @@ supports these types: - **Google Cloud** Storage - Aliyun **OSS** (Object Storage Service) - **SSH** (Secure Shell) — requires SFTP -- HDFS (Hadoop Distributed File System) +- **HDFS** (Hadoop Distributed File System) - **HTTP** (and HTTPS) — read-only - Directory in the **local** file system > Refer to `dvc remote` for more details and examples. -DVC doesn't require installing any databases, servers, or warehouses. It can -simply use cloud services, local or network file systems to store data, -intermediate results, and ML models. - -There are other features and options that can optionally be configured in DVC. -Please see `dvc config` for more information. +There are other features and options that can be configured in DVC. Please see +`dvc config` for more information. 
--- diff --git a/content/docs/tutorials/get-started/versioning-basics.md b/content/docs/tutorials/get-started/versioning-basics.md index 257d43c417..6a887a22de 100644 --- a/content/docs/tutorials/get-started/versioning-basics.md +++ b/content/docs/tutorials/get-started/versioning-basics.md @@ -1,8 +1,8 @@ -# Add Files or Directories +# Data Versioning Basics -DVC allows storing and versioning data files, ML models, directories, -intermediate results with Git, without tracking the file contents with Git. -Let's get a dataset example to play with: +DVC allows storing and versioning data files or directories, ML models, and +intermediate results with a regular Git workflow, without actually tracking the +file contents with Git. Let's get a dataset example to play with: ```dvc $ mkdir data @@ -10,49 +10,52 @@ $ dvc get https://github.com/iterative/dataset-registry \ get-started/data.xml -o data/data.xml ``` -> `dvc get` can use any DVC repository to find the appropriate -> [remote storage](/doc/command-reference/remote) and download data -> artifacts from it (analogous to `wget`, but for repositories). In this -> case we use [dataset-registry](https://github.com/iterative/dataset-registry)) -> as the source repo. (Refer to -> [Data Registries](/doc/use-cases/data-registries) for more info about this -> setup.) +> `dvc get` can download any data artifact tracked in a DVC +> repository, using the appropriate +> [remote storage](/doc/command-reference/remote) (analogous to `wget`, but for +> DVC/Git repos). In this case we use our +> [dataset-registry](https://github.com/iterative/dataset-registry) as the +> source repository (refer to [Data Registries](/doc/use-cases/data-registries) +> for more info). -To track a file (or a directory) with DVC just run `dvc add` on it.
For example: +## Start tracking data + +To track a file with DVC, just run `dvc add` on it: ```dvc $ dvc add data/data.xml ``` -DVC stores information about the added data in a special file called a -**DVC-file**. DVC-files are small text files with a human-readable -[format](/doc/user-guide/dvc-file-format) and they can be committed with Git: +DVC stores information about the added data in a special **DVC-file** +(`data/data.xml.dvc`), a small text file with a human-readable +[format](/doc/user-guide/dvc-file-format). The above command also tells Git to +ignore the actual data contents, so that this version of the data can be safely +committed to the repository, using Git: ```dvc $ git add data/.gitignore data/data.xml.dvc -$ git commit -m "Add raw data to project" +$ git commit -m "Add raw data" ``` -Committing DVC-files with Git allows us to track different versions of the -project data as it evolves with the source code tracked by Git. -
### Expand to learn about DVC internals -`dvc add` moves the actual data file to the cache directory (see -[DVC Files and Directories](/doc/user-guide/dvc-files-and-directories)), while -the entries in the workspace may be file links to the actual files in the DVC -cache. +`dvc add` moves the data file to the project's cache (see +[DVC Files and Directories](/doc/user-guide/dvc-files-and-directories)), and +makes file links (or copies) with the original file names back in the +workspace, which is what you see inside the project. ```dvc $ ls -R .dvc/cache +... .dvc/cache/a3: 04afb96060aad90176268345e10355 ``` -`a304afb96060aad90176268345e10355` above is the hash value of the `data.xml` -file we just added with DVC. If you check the `data/data.xml.dvc` DVC-file, you +The hash value of the `data/data.xml` file we just added, +`a304afb96060aad90176268345e10355`, determines the path and file name shown +above. And if you check the `data/data.xml.dvc` DVC-file created by DVC, you will see that it has this string inside. ### Important note on cache performance @@ -78,175 +81,156 @@ See [Large Dataset Optimization](/doc/user-guide/large-dataset-optimization) and
-If your workspace uses Git, without DVC you would have to manually put each data -file or directory into `.gitignore`. DVC commands that track data files -automatically takes care of this for you! (You just have to add the changes with -Git.) - Refer to [Versioning Data and Model Files](/doc/use-cases/versioning-data-and-model-files), `dvc add`, and `dvc run` for more information on storing and versioning data files with DVC. -# Store and Share Data +## Store and share data -Now, that your data files are managed by DVC (see -[Add Files](/doc/tutorials/get-started/add-files)), you can push them from your -repository to the default [remote](/doc/command-reference/remote) storage\*: +Now that your raw data is tracked by DVC, you can push it from your repository +to the default [remote storage](/doc/command-reference/remote). + +> As seen in the intro's [Configure](/doc/tutorials/get-started#configure) +> section, we are using a **local remote** in this section for illustrative +> purposes. ```dvc $ dvc push ``` -The same way as with Git remote, it ensures that your data files and your models -are safely stored remotely and are shareable. This means that the data can be -pulled by yourself or your colleagues whenever you need it. - -Usually, you run it along with `git commit` and `git push` to save the changed -[DVC-files](/doc/user-guide/dvc-file-format). - -The `dvc push` command allows one to upload data to remote storage. It doesn't -save any changes in the code or DVC-files. Those should be saved by using -`git commit` and `git push`. - -> \*As noted in the DVC [configuration](/doc/tutorials/get-started/configure) -> chapter, we are using a **local remote** in this section for illustrative -> purposes. +Similar to pushing source code to a _Git remote_, `dvc push` ensures that your +data files and models are safely backed up remotely. This means that the data +can be pulled by yourself or by colleagues when and where needed. 
Usually, we +also want to `git commit` and `git push`, to save the new (or changed versions +of) [DVC-files](/doc/user-guide/dvc-file-format).
### Expand to learn more about DVC internals -You can check now that actual data file has been copied to the remote we created -in the [configuration](/doc/tutorials/get-started/configure) chapter: +You can check that the data has been backed up to the remote (`/tmp/dvc-storage` +local directory) with: ```dvc $ ls -R /tmp/dvc-storage +... /tmp/dvc-storage/a3: 04afb96060aad90176268345e10355 ``` -`a304afb96060aad90176268345e10355` above is the hash value of the `data.xml` -file. If you check the `data.xml.dvc` -[DVC-file](/doc/user-guide/dvc-file-format), you will see that it has this -string inside. -
-# Import Data - -We've seen how to [push](/doc/tutorials/get-started/store-data) and -[pull](/doc/tutorials/get-started/retrieve-data) data from/to a DVC -project's [remote](/doc/command-reference/remote). But what if we wanted -to integrate a dataset or ML model produced in one project into another one? - -One way is to manually download the data (with `wget` or `dvc get`, for example) -and use `dvc add` to track it, but the connection between the projects would be -lost. We wouldn't be able to tell where the data came from or whether there are -new versions available. A better alternative is the `dvc import` command: +## Retrieve data - + +If you list the files in this fresh workspace, or even in the +cache, you'll notice that the `data/data.xml` file is not there yet. This is +because it's not stored by Git! To get it, simply run: ```dvc -$ dvc import https://github.com/iterative/dataset-registry \ - get-started/data.xml +$ dvc pull ``` -This downloads `data.xml` from our -[dataset-registry](https://github.com/iterative/dataset-registry) project into -the current working directory, adds it to `.gitignore`, and creates the -`data.xml.dvc` [DVC-file](/doc/user-guide/dvc-file-format) to track changes in -the source data. With _imports_, we can use `dvc update` to bring in changes in -the external data source before -[reproducing](/doc/tutorials/get-started/reproduce) any pipeline -that depends on this data. +`dvc pull` downloads data files that are referenced in all present +[DVC-files](/doc/user-guide/dvc-file-format) from the project's +remote storage, so usually we run it after `git clone`, `git pull`, or +`git checkout`. -
+Alternatively, if you want to retrieve a single file or directory, you can +specify the target like this: -### Expand to learn more about imports - -Note that the [dataset-registry](https://github.com/iterative/dataset-registry) -repository doesn't actually contain a `get-started/data.xml` file. Instead, DVC -inspects -[get-started/data.xml.dvc](https://github.com/iterative/dataset-registry/blob/master/get-started/data.xml.dvc) -and tries to retrieve the file using the project's default remote (configured -[here](https://github.com/iterative/dataset-registry/blob/master/.dvc/config)). - -DVC-files created by `dvc import` are called _import stages_. They use the -`repo` field in the dependencies section (`deps`) in order to track source data -changes (as an [external dependency](/doc/user-guide/external-dependencies)), -enabling the reusability of data artifacts. For example: - -```yaml -md5: fd56a1794c147fea48d408f2bc95a33a -locked: true -deps: - - path: get-started/data.xml - repo: - url: https://github.com/iterative/dataset-registry - rev_lock: 7476a858f6200864b5755863c729bff41d0fb045 -outs: - - md5: a304afb96060aad90176268345e10355 - path: data.xml - cache: true - metric: false - persist: false +```dvc +$ dvc pull data/data.xml.dvc ``` -The `url` and `rev_lock` subfields under `repo` are used to save the origin and -[version](https://git-scm.com/docs/revisions) of the dependency, respectively. +> In this case, both commands have the same result, as there's currently just +> one DVC-tracked file in the repo. + +[DVC remotes](/doc/command-reference/remote), `dvc push`, and `dvc pull` provide +a basic collaboration workflow, the same way as Git remotes, `git push` and +`git pull`. See +[Sharing Data and Model Files](/doc/use-cases/sharing-data-and-model-files) for +more information. -> Note that `dvc update` updates the `rev_lock` field of the corresponding -> DVC-file (when there are changes to bring in). +## Import data -
+We've seen how to [push](#store-and-share-data) and [pull](#retrieve-data) data +to/from remote storage. But what if we wanted to integrate a dataset or ML +model produced in one project into another one? + +One way is to manually download the data and use `dvc add` to track it, like in +the beginning of this page. But the connection between the projects is only +known by the person doing this. Others wouldn't be able to tell where the data +came from or whether there are new versions available. -Since this is not an official part of this _Get Started_, bring everything back -to normal with: +A better alternative is the `dvc import` command! Let's go back to the +project we're building, and replace `data/data.xml` by importing it +from the same source: ```dvc -$ git reset --hard -$ rm -f data.* +$ cd ~/sotag-predictions +$ dvc import https://github.com/iterative/dataset-registry \ + get-started/data.xml -o data/data.xml ``` -> See also `dvc import-url`. +This downloads and overwrites the same `data/data.xml`, checks that it's in +`data/.gitignore`, and creates the `data/data.xml.dvc` +[DVC-file](/doc/user-guide/dvc-file-format). So far this seems identical to our +previous strategy, except that this time `data.xml.dvc` has additional metadata +that allows DVC to track changes in the source data. This allows `dvc update` to +bring in changes from the data source. -# Retrieve Data +
-> You'll need to complete the -> [initialization](/doc/tutorials/get-started/initialize) and -> [configuration](/doc/tutorials/get-started/configure) chapters before being -> able to run the commands explained here. +### Expand to learn more about DVC internals -To retrieve data files into the workspace in your local machine, -run: +DVC-files created by `dvc import` are called _import stages_. If we check the +difference against the regular DVC-file we previously had, we can see that the +import stage has more fields, such as the data source `repo`, and `path` within it: ```dvc -$ rm -f data/data.xml -$ dvc pull +$ git diff +... +--- a/data/data.xml.dvc ++++ b/data/data.xml.dvc +... ++deps: ++- path: get-started/data.xml ++ repo: ++ url: https://github.com/iterative/dataset-registry ++ rev_lock: f31f5c4cdae787b4bdeb97a717687d44667d9e62 ``` -This command downloads data files that are referenced in all -[DVC-files](/doc/user-guide/dvc-file-format) in the project. So, -you usually run it after `git clone`, `git pull`, or `git checkout`. +The `url` and `rev_lock` subfields under `repo` are used to save the origin and +[version](https://git-scm.com/docs/revisions) of the dependency, respectively. + +> `dvc update` updates the `rev_lock` field of the corresponding DVC-file (when +> there are changes to bring in). -Alternatively, if you want to retrieve a single dataset or a file you can use: +Note that the [dataset-registry](https://github.com/iterative/dataset-registry) +repository doesn't actually contain a `get-started/data.xml` file. Like +`dvc get`, importing also downloads the data from the appropriate +[remote storage](/doc/command-reference/remote). + +
+ +Let's wrap up by committing the import stage with Git: ```dvc -$ dvc pull data/data.xml.dvc +$ git add data/data.xml.dvc +$ git commit -m "Import raw data (overwrite)" +$ dvc push # so others can pull the imported data in their repo copies ``` - -DVC remotes, `dvc push`, and `dvc pull` provide a basic collaboration workflow, -the same way as Git remotes, `git push` and `git pull`. See -[Sharing Data and Model Files](/doc/use-cases/sharing-data-and-model-files) for -more information. diff --git a/content/docs/tutorials/pipelines.md b/content/docs/tutorials/pipelines.md index bdcc4bbce5..10f21a352b 100644 --- a/content/docs/tutorials/pipelines.md +++ b/content/docs/tutorials/pipelines.md @@ -50,13 +50,13 @@ $ git add code/ $ git commit -m "Download and add code to new Git repo" ``` -> `dvc get` can use any DVC repository to find the appropriate -> [remote storage](/doc/command-reference/remote) and download data -> artifacts from it (analogous to `wget`, but for repositories). In this -> case we use [dataset-registry](https://github.com/iterative/dataset-registry)) -> as the source repo. (Refer to -> [Data Registries](/doc/use-cases/data-registries) for more info about this -> setup.) +> `dvc get` can download any data artifact tracked in a DVC +> repository, using the appropriate +> [remote storage](/doc/command-reference/remote) (analogous to `wget`, but for +> DVC/Git repos). In this case we use our +> [dataset-registry](https://github.com/iterative/dataset-registry) as the +> source repository (refer to [Data Registries](/doc/use-cases/data-registries) +> for more info). Now let's install the requirements.
But before we do that, we **strongly** recommend creating a @@ -103,8 +103,8 @@ When we run `dvc add` `Posts.xml.zip`, DVC creates a ### Expand to learn about DVC internals At DVC initialization, a new `.dvc/` directory is created for internal -configuration and cache -[files and directories](/doc/user-guide/dvc-files-and-directories) that are +configuration and cache +[files and directories](/doc/user-guide/dvc-files-and-directories), that are hidden from the user. Note that the DVC-file created by `dvc add` has no dependencies, a.k.a. an diff --git a/content/docs/tutorials/versioning.md b/content/docs/tutorials/versioning.md index 370fc95165..1ae0768a9a 100644 --- a/content/docs/tutorials/versioning.md +++ b/content/docs/tutorials/versioning.md @@ -83,13 +83,13 @@ $ unzip -q data.zip $ rm -f data.zip ``` -> `dvc get` can use any DVC repository to find the appropriate -> [remote storage](/doc/command-reference/remote) and download data -> artifacts from it (analogous to `wget`, but for repositories). In this -> case we use [dataset-registry](https://github.com/iterative/dataset-registry)) -> as the source repo. (Refer to -> [Data Registries](/doc/use-cases/data-registries) for more info about this -> setup.) +> `dvc get` can download any data artifact tracked in a DVC +> repository, using the appropriate +> [remote storage](/doc/command-reference/remote) (analogous to `wget`, but for +> DVC/Git repos). In this case we use our +> [dataset-registry](https://github.com/iterative/dataset-registry) as the +> source repository (refer to [Data Registries](/doc/use-cases/data-registries) +> for more info). This command downloads and extracts our raw dataset, consisting of 1000 labeled images for training and 800 labeled images for validation.
In total, it's a 43 diff --git a/content/docs/use-cases/versioning-data-and-model-files.md b/content/docs/use-cases/versioning-data-and-model-files.md index 887bf62e15..a434db583a 100644 --- a/content/docs/use-cases/versioning-data-and-model-files.md +++ b/content/docs/use-cases/versioning-data-and-model-files.md @@ -43,8 +43,8 @@ $ dvc init ``` At DVC initialization, a new `.dvc/` directory is created for internal -configuration and cache -[files and directories](/doc/user-guide/dvc-files-and-directories) that are +configuration and cache +[files and directories](/doc/user-guide/dvc-files-and-directories), that are hidden from the user. These can safely be tracked with Git: ```dvc From d6a300fa4a097c2b4314b8aa5c324eb1a5ebefa3 Mon Sep 17 00:00:00 2001 From: Jorge Orpinel Date: Wed, 8 Apr 2020 00:22:30 -0500 Subject: [PATCH 8/9] scripts: small update for link-check --- README.md | 2 +- package.json | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index f3618b42de..b1bcb3fd6f 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ code. Make sure you have the latest LTS version of [Node.js](https://nodejs.org) and [Yarn](https://yarnpkg.com) installed. -Run `yarn command`. +Run `yarn`. 
## Commands diff --git a/package.json b/package.json index 316479f009..2b33043311 100644 --- a/package.json +++ b/package.json @@ -5,7 +5,6 @@ "main": "index.js", "scripts": { "develop": "gatsby develop", - "debug": "node --inspect-brk server.js", "build": "gatsby build", "heroku-postbuild": "./scripts/deploy-with-s3.js", "test": "jest", @@ -15,8 +14,8 @@ "format-all": "prettier --write './**/*.{js,jsx,md,tsx,ts,json}'", "lint-ts": "tsc --noEmit --skipLibCheck && eslint --ext .json,.js,.ts,.tsx src scripts", "lint-css": "stylelint \"src/**/*.css\"", - "link-check": "scripts/link-check-git-all.sh", - "link-check-diff": "scripts/link-check-git-diff.sh" + "link-check": "./scripts/link-check-git-all.sh", + "link-check-diff": "./scripts/link-check-git-diff.sh" }, "repository": { "type": "git", From 9665edc369a725bd3fab0f4a5f4645ec574acdce Mon Sep 17 00:00:00 2001 From: Jorge Orpinel Date: Wed, 8 Apr 2020 01:10:34 -0500 Subject: [PATCH 9/9] scripts: fix link-check bad pages path per https://github.com/iterative/dvc.org/issues/1123#issuecomment-610766116 --- scripts/link-check-git-all.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/link-check-git-all.sh b/scripts/link-check-git-all.sh index 84da16a85b..27c78fea74 100755 --- a/scripts/link-check-git-all.sh +++ b/scripts/link-check-git-all.sh @@ -2,5 +2,5 @@ repo="$(dirname "$(realpath "$(dirname "$0")")")" -(find "$repo"/pages/ "$repo"/content/docs/ "$repo"/src/ "$repo"/.github/ -name '*.md' -o -name '*.js' && ls "$repo"/*.md "$repo"/*.js) \ +(find "$repo"/.github/ "$repo"/content/docs/ "$repo"/src/ -name '*.md' -o -name '*.js' && ls "$repo"/*.md "$repo"/*.js) \ | xargs -n1 -P8 $(dirname "$0")/link-check.sh