diff --git a/README.md b/README.md index f3618b42de..b1bcb3fd6f 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ code. Make sure you have the latest LTS version of [Node.js](https://nodejs.org) and [Yarn](https://yarnpkg.com) installed. -Run `yarn command`. +Run `yarn`. ## Commands diff --git a/content/docs/command-reference/cache/index.md b/content/docs/command-reference/cache/index.md index 04999a469a..d8e565a806 100644 --- a/content/docs/command-reference/cache/index.md +++ b/content/docs/command-reference/cache/index.md @@ -15,9 +15,9 @@ positional arguments: ## Description -At DVC initialization, a new `.dvc/` directory will be created for internal -configuration and cache -[files and directories](/doc/user-guide/dvc-files-and-directories) that are +At DVC initialization, a new `.dvc/` directory is created for internal +configuration and cache +[files and directories](/doc/user-guide/dvc-files-and-directories), which are hidden from the user. The cache is where your data files, models, etc. (anything you want to version diff --git a/content/docs/command-reference/init.md b/content/docs/command-reference/init.md index feddb5ec96..ae9952a2b3 100644 --- a/content/docs/command-reference/init.md +++ b/content/docs/command-reference/init.md @@ -22,9 +22,9 @@ advanced scenarios: - [Initializing DVC without Git](#how-does-it-affect-dvc-commands) - support for SCM other than Git, deployment automation cases, etc. -At DVC initialization, a new `.dvc/` directory will be created for internal -configuration and cache -[files and directories](/doc/user-guide/dvc-files-and-directories) that are +At DVC initialization, a new `.dvc/` directory is created for internal +configuration and cache +[files and directories](/doc/user-guide/dvc-files-and-directories), which are hidden from the user. 
### Initializing DVC in subdirectories diff --git a/content/docs/command-reference/remote/index.md b/content/docs/command-reference/remote/index.md index c0ca9c2f7a..a933a54732 100644 --- a/content/docs/command-reference/remote/index.md +++ b/content/docs/command-reference/remote/index.md @@ -74,9 +74,9 @@ For the typical process to share the project via remote, see ### What is a "local remote" ? While the term may seem contradictory, it doesn't have to be. The "local" part -refers to the machine where the project is stored, so it can be any directory -accessible to the same system. The "remote" part refers specifically to the -project/repository itself. Read "local, but external" storage. +refers to the location of the storage relative to the project, so it can be any +directory in the file system. "Remote" is the term that refers to the storage. +Read "local cache backup". diff --git a/content/docs/command-reference/remote/list.md b/content/docs/command-reference/remote/list.md index 57b39278cb..30880fecaf 100644 --- a/content/docs/command-reference/remote/list.md +++ b/content/docs/command-reference/remote/list.md @@ -45,9 +45,9 @@ Let's for simplicity add a _default_ local remote: ### What is a "local remote" ? While the term may seem contradictory, it doesn't have to be. The "local" part -refers to the machine where the project is stored, so it can be any directory -accessible to the same system. The "remote" part refers specifically to the -project/repository itself. Read "local, but external" storage. +refers to the location of the storage relative to the project, so it can be any +directory in the file system. "Remote" is the term that refers to the storage. +Read "local cache backup". diff --git a/content/docs/index.md b/content/docs/index.md index f23d119383..7de213aaaa 100644 --- a/content/docs/index.md +++ b/content/docs/index.md @@ -1,8 +1,27 @@ # DVC Documentation -Welcome! 
In here you may find all the guiding material and technical documents -needed to learn about DVC: how to use it, how it works, and where to go for -additional resources. +Welcome! Here you can find all the material and technical details needed to +learn about DVC: how to use it, how it works, and where to go for additional +resources. + +## What is DVC? + +Data Version Control, or DVC, is **a new type of experiment management +software** built on top of the existing engineering toolset that you're already +used to, particularly [Git](https://git-scm.com) source code management. DVC +reduces the gap between existing tools and data science needs. + +If you store and process data files or datasets to produce other data or machine +learning models, and you want to: + +- capture and save data artifacts the same way you capture code; +- track and switch between different versions of data or models easily; +- understand how data or models were built in the first place; +- be able to compare models and metrics to each other; +- bring software engineering best practices to your data science team; +- or pursue any of our other [use cases](/doc/use-cases), + +then DVC is for you! 
## Before you start diff --git a/content/docs/sidebar.json b/content/docs/sidebar.json index 7e3de822bc..618988e1db 100644 --- a/content/docs/sidebar.json +++ b/content/docs/sidebar.json @@ -1,6 +1,7 @@ [ { - "slug": "home", + "slug": "", + "label": "Home", "source": "index.md" }, { @@ -30,32 +31,14 @@ "children": [ { "slug": "get-started", - "source": false, + "source": "get-started/index.md", "tutorials": { "katacoda": "https://katacoda.com/dvc/courses/get-started/initialize" }, "children": [ - "agenda", - "initialize", - "configure", - "add-files", - "store-data", - "retrieve-data", - "import-data", - { - "label": "Connect with Code", - "slug": "connect-code-and-data" - }, - "pipeline", - "visualize", - "reproduce", - "metrics", - "experiments", - "compare-experiments", - { - "label": "Get Older Files", - "slug": "older-versions" - } + "versioning-basics", + "data-pipelines", + "experiment-management" ] }, { diff --git a/content/docs/tutorials/deep/preparation.md b/content/docs/tutorials/deep/preparation.md index 6db7335400..bd25bbaf97 100644 --- a/content/docs/tutorials/deep/preparation.md +++ b/content/docs/tutorials/deep/preparation.md @@ -61,9 +61,9 @@ $ pip install -r code/requirements.txt DVC works on top of Git repositories. You run DVC initialization in a repository directory to create DVC meta files and directories. -At DVC initialization, a new `.dvc/` directory will be created for internal -configuration and cache -[files and directories](/doc/user-guide/dvc-files-and-directories) that are +At DVC initialization, a new `.dvc/` directory is created for internal +configuration and cache +[files and directories](/doc/user-guide/dvc-files-and-directories), which are hidden from the user. We describe some DVC internals below for a better understanding of how it works. 
diff --git a/content/docs/tutorials/get-started/add-files.md b/content/docs/tutorials/get-started/add-files.md deleted file mode 100644 index 048aafa213..0000000000 --- a/content/docs/tutorials/get-started/add-files.md +++ /dev/null @@ -1,89 +0,0 @@ -# Add Files or Directories - -DVC allows storing and versioning data files, ML models, directories, -intermediate results with Git, without tracking the file contents with Git. -Let's get a dataset example to play with: - -```dvc -$ mkdir data -$ dvc get https://github.com/iterative/dataset-registry \ - get-started/data.xml -o data/data.xml -``` - -> `dvc get` can use any DVC repository to find the appropriate -> [remote storage](/doc/command-reference/remote) and download data -> artifacts from it (analogous to `wget`, but for repositories). In this -> case we use [dataset-registry](https://github.com/iterative/dataset-registry)) -> as the source repo. (Refer to -> [Data Registries](/doc/use-cases/data-registries) for more info about this -> setup.) - -To track a file (or a directory) with DVC just run `dvc add` on it. For example: - -```dvc -$ dvc add data/data.xml -``` - -DVC stores information about the added data in a special file called a -**DVC-file**. DVC-files are small text files with a human-readable -[format](/doc/user-guide/dvc-file-format) and they can be committed with Git: - -```dvc -$ git add data/.gitignore data/data.xml.dvc -$ git commit -m "Add raw data to project" -``` - -Committing DVC-files with Git allows us to track different versions of the -project data as it evolves with the source code tracked by Git. - -
- -### Expand to learn about DVC internals - -`dvc add` moves the actual data file to the cache directory (see -[DVC Files and Directories](/doc/user-guide/dvc-files-and-directories)), while -the entries in the workspace may be file links to the actual files in the DVC -cache. - -```dvc -$ ls -R .dvc/cache - .dvc/cache/a3: - 04afb96060aad90176268345e10355 -``` - -`a304afb96060aad90176268345e10355` above is the hash value of the `data.xml` -file we just added with DVC. If you check the `data/data.xml.dvc` DVC-file, you -will see that it has this string inside. - -### Important note on cache performance - -DVC tries to use reflinks\* by default to link your data files from the DVC -cache to the workspace, optimizing speed and storage space. However, reflinks -are not widely supported yet and DVC falls back to actually copying data files -to/from the cache. **Copying can be very slow with large files**, and duplicates -storage requirements. - -Hardlinks and symlinks are also available for optimized cache linking but, -(unlike reflinks) they carry the risk of accidentally corrupting the cache if -tracked data files are modified in the workspace. - -See [Large Dataset Optimization](/doc/user-guide/large-dataset-optimization) and -`dvc config cache` for more information. - -> \***copy-on-write links or "reflinks"** are a relatively new way to link files -> in UNIX-style file systems. Unlike hardlinks or symlinks, they support -> transparent [copy on write](https://en.wikipedia.org/wiki/Copy-on-write). This -> means that editing a reflinked file is always safe as all the other links to -> the file will reflect the changes. - -
- -If your workspace uses Git, without DVC you would have to manually put each data -file or directory into `.gitignore`. DVC commands that track data files -automatically takes care of this for you! (You just have to add the changes with -Git.) - -Refer to -[Versioning Data and Model Files](/doc/use-cases/versioning-data-and-model-files), -`dvc add`, and `dvc run` for more information on storing and versioning data -files with DVC. diff --git a/content/docs/tutorials/get-started/agenda.md b/content/docs/tutorials/get-started/agenda.md deleted file mode 100644 index ca56dfbdec..0000000000 --- a/content/docs/tutorials/get-started/agenda.md +++ /dev/null @@ -1,39 +0,0 @@ -# Agenda - -You'll need [Git](https://git-scm.com) to run the commands in this guide. Also, -if DVC is not installed, please follow these [instructions](/doc/install) to do -so. - -In the next few sections we'll build a simple natural language processing (NLP) -project from scratch. If you'd like to get the final result or have any issues -along the way, you can download the fully reproducible -[GitHub project](https://github.com/iterative/example-get-started) by running: - -```dvc -$ git clone https://github.com/iterative/example-get-started -``` - -Otherwise, bear with us and we'll introduce some basic DVC concepts to get the -same results together! - -The idea for this project is a simplified version of our -[Deep Dive Tutorial](/doc/tutorials/deep). It explores the NLP problem of -predicting tags for a given StackOverflow question. For example, we might want a -classifier that can classify (or predict) posts about Python by tagging them -with `python`. - -![](/img/example-flow-2x.png) - -This is a natural language processing context, but NLP isn't the only area of -data science where DVC can help. DVC is designed to be agnostic of frameworks, -languages, etc. 
If you have data files or datasets and/or you produce data -files, models, or datasets and you want to: - -- Capture and save those data artifacts the same way you capture - code -- Track and switch between different versions of data easily -- Understand how data artifacts (e.g. ML models) were built in the first place -- Be able to compare models to each other -- Bring software best practices to your team and get everyone on the same page - -Then you're in the right place! Click the `Next` button below to start ↘ diff --git a/content/docs/tutorials/get-started/compare-experiments.md b/content/docs/tutorials/get-started/compare-experiments.md deleted file mode 100644 index 21e09a8450..0000000000 --- a/content/docs/tutorials/get-started/compare-experiments.md +++ /dev/null @@ -1,42 +0,0 @@ -# Compare Experiments - -DVC makes it easy to iterate on your project using Git commits with tags or Git -branches. It provides a way to try different ideas, keep track of them, switch -back and forth. To find the best performing experiment or track the progress, -[project metrics](/doc/command-reference/metrics) are supported in DVC (as -described in one of the previous chapters). - -Let's run evaluate for the latest `bigrams` experiment we created in previous -chapters. It mostly takes just running the `dvc repro`: - -```dvc -$ git checkout master -$ dvc checkout -$ dvc repro evaluate.dvc -``` - -`git checkout master` and `dvc checkout` commands ensure that we have the latest -experiment code and data respectively. And `dvc repro`, as we discussed in the -[Reproduce](/doc/tutorials/get-started/reproduce) chapter, is a way to run all -the necessary commands to build the model and measure its performance. 
- -```dvc -$ git commit -am "Evaluate bigrams model" -$ git tag -a "bigrams-experiment" -m "Bigrams experiment evaluation" -``` - -Now, we can use `-T` option of the `dvc metrics show` command to see the -difference between the `baseline` and `bigrams` experiments: - -```dvc -$ dvc metrics show -T - -baseline-experiment: - auc.metric: 0.588426 -bigrams-experiment: - auc.metric: 0.602818 -``` - -DVC provides built-in support to track and navigate `JSON`, `TSV` or `CSV` -metric files if you want to track additional information. See `dvc metrics` to -learn more. diff --git a/content/docs/tutorials/get-started/configure.md b/content/docs/tutorials/get-started/configure.md deleted file mode 100644 index 99e8ca9279..0000000000 --- a/content/docs/tutorials/get-started/configure.md +++ /dev/null @@ -1,67 +0,0 @@ -# Configure - -Once you install DVC, you'll be able to start using it (in its local setup) -immediately. - -However, remote storage should be set up (see `dvc remote`) if you need to share -data or models outside of the context of a single project, for example with -other collaborators or even with yourself, in a different computing environment. -It's similar to the way you would use GitHub or any other Git server to store -and share your code. - -For simplicity, let's setup a local remote: - -
- -### What is a "local remote" ? - -While the term may seem contradictory, it doesn't have to be. The "local" part -refers to the machine where the project is stored, so it can be any directory -accessible to the same system. The "remote" part refers specifically to the -project/repository itself. Read "local, but external" storage. - -
- -```dvc -$ dvc remote add -d myremote /tmp/dvc-storage -$ git commit .dvc/config -m "Configure local remote" -``` - -> We only use a local remote in this section for simplicity's sake as you learn -> to use DVC. For most [use cases](/doc/use-cases), other "more remote" types of -> remotes will be required. - -[Adding a remote](/doc/command-reference/remote/add) should be specified by both -its type (protocol) and its path. DVC currently supports these types of remotes: - -- `s3`: Amazon Simple Storage Service -- `azure`: Microsoft Azure Blob Storage -- `gdrive` : Google Drive -- `gs`: Google Cloud Storage -- `ssh`: Secure Shell (requires SFTP) -- `hdfs`: Hadoop Distributed File System -- `http`: HTTP and HTTPS protocols -- `local`: Directory in the local file system - -> If you installed DVC via `pip` and plan to use cloud services as remote -> storage, you might need to install these optional dependencies: `[s3]`, -> `[azure]`, `[gdrive]`, `[gs]`, `[oss]`, `[ssh]`. Alternatively, use `[all]` to -> include them all. The command should look like this: `pip install "dvc[s3]"`. -> (This example installs `boto3` library along with DVC to support S3 storage.) - -For example, to setup an S3 remote we would use something like this (make sure -that `mybucket` exists): - -```dvc -$ dvc remote add -d s3remote s3://mybucket/myproject -``` - -> This command is only shown for informational purposes. No need to actually run -> it in order to continue with the Get Started. - -You can see that DVC doesn't require installing any databases, servers, or -warehouses. It can use bare S3 or SSH to store data, intermediate results, and -models. - -See `dvc config` to get information about more configuration options and -`dvc remote` to learn more about remotes and get more examples. 
diff --git a/content/docs/tutorials/get-started/connect-code-and-data.md b/content/docs/tutorials/get-started/connect-code-and-data.md deleted file mode 100644 index 1bec301c0c..0000000000 --- a/content/docs/tutorials/get-started/connect-code-and-data.md +++ /dev/null @@ -1,165 +0,0 @@ -# Connect Code and Data - -Even in its basic scenarios, commands like `dvc add`, `dvc push`, `dvc pull` -described in the previous sections could be used independently and provide a -basic useful framework to track, save and share models and large data files. To -achieve full reproducibility though, we'll have to connect code and -configuration with the data it processes to produce the result. - -
- -### Expand to prepare example code - -If you've followed this _Get Started_ section from the beginning, run these -commands to get the example code: - -```dvc -$ wget https://code.dvc.org/get-started/code.zip -$ unzip code.zip -$ rm -f code.zip -``` - -Windows doesn't include the `wget` utility by default, but you can use the -browser to download `code.zip`. (Right-click -[this link](https://code.dvc.org/get-started/code.zip) and select -`Save Link As...` (Chrome). Save it into the project directory. - -The workspace should now look like this: - -```dvc -$ tree -. -├── data -│   ├── data.xml -│   └── data.xml.dvc -└── src -    ├── evaluate.py -    ├── featurization.py -    ├── prepare.py -    ├── requirements.txt -  └── train.py -``` - -Now let's install the requirements. But before we do that, we **strongly** -recommend creating a -[virtual environment](https://packaging.python.org/tutorials/installing-packages/#creating-virtual-environments): - -```dvc -$ virtualenv -p python3 .env -$ echo ".env/" >> .gitignore -$ source .env/bin/activate -$ pip install -r src/requirements.txt -``` - -Optionally, save the progress with Git: - -```dvc -$ git add . -$ git commit -m "Add source code files to repo" -``` - -
- -Having installed the `src/prepare.py` script in your repo, the following command -transforms it into a reproducible [stage](/doc/command-reference/run) for the ML -pipeline we're building (described in the -[next chapter](/doc/tutorials/pipelines)). - -```dvc -$ dvc run -f prepare.dvc \ - -d src/prepare.py -d data/data.xml \ - -o data/prepared \ - python src/prepare.py data/data.xml -``` - -`dvc run` generates the `prepare.dvc` DVC-file. It has the same -[format](/doc/user-guide/dvc-file-format) as the file we created in the -[previous section](/doc/tutorials/get-started/add-files) to track `data.xml`, -except in this case it has additional information about the `data/prepared` -output (a directory where two files, `train.tsv` and `test.tsv`, will be written -to), and about the Python command that is required to build it. - -
- -### Expand to learn more about what has just happened - -This is how the result should look like now: - -```diff - . - ├── data - │ ├── data.xml - │ ├── data.xml.dvc -+ │ └── prepared -+ │ ├── test.tsv -+ │ └── train.tsv -+ ├── prepare.dvc - └── src - ├── evaluate.py - ├── featurization.py - ├── prepare.py - ├── requirements.txt - └── train.py -``` - -This is how `prepare.dvc` looks like: - -```yaml -cmd: python src/prepare.py data/data.xml -deps: - - md5: b4801c88a83f3bf5024c19a942993a48 - path: src/prepare.py - - md5: a304afb96060aad90176268345e10355 - path: data/data.xml -md5: c3a73109be6c186b9d72e714bcedaddb -outs: - - cache: true - md5: 6836f797f3924fb46fcfd6b9f6aa6416.dir - metric: false - path: data/prepared -wdir: . -``` - -> `dvc run` is just the first of a set of DVC command required to generate a -> [pipeline](/doc/tutorials/get-started/pipeline), or in other words, -> instructions on how to build a ML model (data file) from previous data files -> (or directories). - -Let's briefly mention what the command options used above mean for this -particular example: - -`-f prepare.dvc` specifies a name for the DVC-file (pipeline stage). It's -optional but we recommend using it to make your project structure more readable. - -`-d src/prepare.py` and `-d data/data.xml` mean that the `prepare.dvc` stage -file depends on them to produce the result. When you run `dvc repro` next time -(see next chapter) DVC will automatically check these dependencies and decide -whether this stage is up to date or whether it should be executed to regenerate -its outputs. - -`-o data/prepared` specifies the output directory processed data will be put -into. The script creates two files in it – that will be used later to generate -features, train and evaluate the model. - -And, the last line, `python src/prepare.py data/data.xml`, specifies a command -to run. This command is saved to the generated DVC-file, and used later by -`dvc repro`. 
- -Hopefully, `dvc run` (and `dvc repro`) will become intuitive after a few more -Get Started chapters. You can always refer to the the command references for -more details on their behavior and options. - -
- -You don't need to run `dvc add` to track output files (`prepared/train.tsv` and -`prepared/test.tsv`) with DVC. `dvc run` takes care of this. You only need to -run `dvc push` (usually along with `git commit`) to save them to the remote when -you are done. - -Let's commit the changes to save the stage we built: - -```dvc -$ git add data/.gitignore prepare.dvc -$ git commit -m "Create data preparation stage" -$ dvc push -``` diff --git a/content/docs/tutorials/get-started/data-pipelines.md b/content/docs/tutorials/get-started/data-pipelines.md new file mode 100644 index 0000000000..2855b7acfa --- /dev/null +++ b/content/docs/tutorials/get-started/data-pipelines.md @@ -0,0 +1,335 @@ +# Connect Code and Data + +Even in its basic scenarios, commands like `dvc add`, `dvc push`, `dvc pull` +described in the previous sections could be used independently and provide a +basic useful framework to track, save and share models and large data files. To +achieve full reproducibility though, we'll have to connect code and +configuration with the data it processes to produce the result. + +
+ +### Expand to prepare example code + +If you've followed this _Get Started_ section from the beginning, run these +commands to get the example code: + +```dvc +$ wget https://code.dvc.org/get-started/code.zip +$ unzip code.zip +$ rm -f code.zip +``` + +Windows doesn't include the `wget` utility by default, but you can use the +browser to download `code.zip`. (Right-click +[this link](https://code.dvc.org/get-started/code.zip) and select +`Save Link As...` (Chrome). Save it into the project directory. + +The workspace should now look like this: + +```dvc +$ tree +. +├── data +│   ├── data.xml +│   └── data.xml.dvc +└── src +    ├── evaluate.py +    ├── featurization.py +    ├── prepare.py +    ├── requirements.txt +  └── train.py +``` + +Now let's install the requirements. But before we do that, we **strongly** +recommend creating a +[virtual environment](https://packaging.python.org/tutorials/installing-packages/#creating-virtual-environments): + +```dvc +$ virtualenv -p python3 .env +$ echo ".env/" >> .gitignore +$ source .env/bin/activate +$ pip install -r src/requirements.txt +``` + +Optionally, save the progress with Git: + +```dvc +$ git add . +$ git commit -m "Add source code files to repo" +``` + +
+ +Having installed the `src/prepare.py` script in your repo, the following command +transforms it into a reproducible [stage](/doc/command-reference/run) for the ML +pipeline we're building (described in the +[next chapter](/doc/tutorials/pipelines)). + +```dvc +$ dvc run -f prepare.dvc \ + -d src/prepare.py -d data/data.xml \ + -o data/prepared \ + python src/prepare.py data/data.xml +``` + +`dvc run` generates the `prepare.dvc` DVC-file. It has the same +[format](/doc/user-guide/dvc-file-format) as the file we created in the +[previous section](/doc/tutorials/get-started/versioning-basics) to track `data.xml`, +except in this case it has additional information about the `data/prepared` +output (a directory where two files, `train.tsv` and `test.tsv`, will be written +to), and about the Python command that is required to build it. + +
+ +### Expand to learn more about what has just happened + +This is what the result should look like now: + +```diff + . + ├── data + │ ├── data.xml + │ ├── data.xml.dvc ++ │ └── prepared ++ │ ├── test.tsv ++ │ └── train.tsv ++ ├── prepare.dvc + └── src + ├── evaluate.py + ├── featurization.py + ├── prepare.py + ├── requirements.txt + └── train.py +``` + +This is what `prepare.dvc` looks like: + +```yaml +cmd: python src/prepare.py data/data.xml +deps: + - md5: b4801c88a83f3bf5024c19a942993a48 + path: src/prepare.py + - md5: a304afb96060aad90176268345e10355 + path: data/data.xml +md5: c3a73109be6c186b9d72e714bcedaddb +outs: + - cache: true + md5: 6836f797f3924fb46fcfd6b9f6aa6416.dir + metric: false + path: data/prepared +wdir: . +``` + +> `dvc run` is just the first of a set of DVC commands required to generate a +> [pipeline](/doc/tutorials/get-started/pipeline), or in other words, +> instructions on how to build an ML model (data file) from previous data files +> (or directories). + +Let's briefly mention what the command options used above mean for this +particular example: + +`-f prepare.dvc` specifies a name for the DVC-file (pipeline stage). It's +optional but we recommend using it to make your project structure more readable. + +`-d src/prepare.py` and `-d data/data.xml` mean that the `prepare.dvc` stage +file depends on them to produce the result. When you run `dvc repro` next time +(see next chapter) DVC will automatically check these dependencies and decide +whether this stage is up to date or whether it should be executed to regenerate +its outputs. + +`-o data/prepared` specifies the output directory processed data will be put +into. The script creates two files in it that will be used later to generate +features, train and evaluate the model. + +And, the last line, `python src/prepare.py data/data.xml`, specifies a command +to run. This command is saved to the generated DVC-file, and used later by +`dvc repro`. 
+ +Hopefully, `dvc run` (and `dvc repro`) will become intuitive after a few more +Get Started chapters. You can always refer to the command references for +more details on their behavior and options. + +
+ +You don't need to run `dvc add` to track output files (`prepared/train.tsv` and +`prepared/test.tsv`) with DVC. `dvc run` takes care of this. You only need to +run `dvc push` (usually along with `git commit`) to save them to the remote when +you are done. + +Let's commit the changes to save the stage we built: + +```dvc +$ git add data/.gitignore prepare.dvc +$ git commit -m "Create data preparation stage" +$ dvc push +``` + +# Pipeline + +Support for [pipelines](/doc/command-reference/pipeline) is the biggest +difference between DVC and other version control tools that can handle large +data files (e.g. `git lfs`). By using `dvc run` multiple times, and specifying +outputs of a command (stage) as dependencies in another one, we can describe a +sequence of commands that gets to a desired result. This is what we call a +**data pipeline** or dependency graph. + +Let's create a second stage (after `prepare.dvc`, created in the previous +chapter) to perform feature extraction: + +```dvc +$ dvc run -f featurize.dvc \ + -d src/featurization.py -d data/prepared \ + -o data/features \ + python src/featurization.py \ + data/prepared data/features +``` + +And a third stage for training: + +```dvc +$ dvc run -f train.dvc \ + -d src/train.py -d data/features \ + -o model.pkl \ + python src/train.py data/features model.pkl +``` + +Let's commit DVC-files that describe our pipeline so far: + +```dvc +$ git add data/.gitignore .gitignore featurize.dvc train.dvc +$ git commit -m "Create featurization and training stages" +$ dvc push +``` + +This example is simplified just to show you a basic pipeline, see a more +advanced [example](/doc/tutorials/pipelines) or +[complete tutorial](/doc/tutorials/pipelines) to create an +[NLP](https://en.wikipedia.org/wiki/Natural_language_processing) pipeline +end-to-end. + +> See also the `dvc pipeline` command. + +# Visualize + +Now that we have built our pipeline, we need a good way to visualize it to be +able to wrap our heads around it. 
Luckily, DVC allows us to do that without +leaving the terminal, making the experience distraction-less. + +We are using the `--ascii` option below to better illustrate this pipeline. +Please, refer to `dvc pipeline show` to explore other options this command +supports (e.g. `.dot` files that can be used then in other tools). + +## Stages + +```dvc +$ dvc pipeline show --ascii train.dvc + +-------------------+ + | data/data.xml.dvc | + +-------------------+ + * + * + * + +-------------+ + | prepare.dvc | + +-------------+ + * + * + * + +---------------+ + | featurize.dvc | + +---------------+ + * + * + * + +-----------+ + | train.dvc | + +-----------+ +``` + +## Commands + +```dvc +$ dvc pipeline show --ascii train.dvc --commands + +-------------------------------------+ + | python src/prepare.py data/data.xml | + +-------------------------------------+ + * + * + * + +---------------------------------------------------------+ + | python src/featurization.py data/prepared data/features | + +---------------------------------------------------------+ + * + * + * + +---------------------------------------------+ + | python src/train.py data/features model.pkl | + +---------------------------------------------+ +``` + +## Outputs + +```dvc +$ dvc pipeline show --ascii train.dvc --outs + +---------------+ + | data/data.xml | + +---------------+ + * + * + * + +---------------+ + | data/prepared | + +---------------+ + * + * + * + +---------------+ + | data/features | + +---------------+ + * + * + * + +-----------+ + | model.pkl | + +-----------+ +``` + +# Reproduce + +In the previous chapters, we described our first +[pipeline](/doc/command-reference/pipeline). Basically, we generated a number of +[stage files](/doc/command-reference/run) +([DVC-files](/doc/user-guide/dvc-file-format)). These stages define individual +commands to execute towards a final result. Each depends on some data (either +raw data files or intermediate results from previous stages) and code files. 
+ +If you just cloned the +[project](https://github.com/iterative/example-get-started), make sure you first +fetch the input data from DVC by calling `dvc pull`. + +It's now extremely easy for you or your colleagues to reproduce the result +end-to-end: + +```dvc +$ dvc repro train.dvc +``` + +> If you've just followed the previous chapters, the command above will have +> nothing to reproduce since you've recently executed all the pipeline stages. +> To easily try this command, clone this example +> [GitHub project](https://github.com/iterative/example-get-started) and run it +> from there. + +`train.dvc` describes which source code and data files to use, and how to run +the command in order to get the resulting model file. For each data file it +depends on, we can in turn do the same analysis: find a corresponding DVC-file +that includes the data file in its outputs, get dependencies and commands, and +so on. It means that DVC can recursively build a complete sequence of commands +it needs to execute to get the model file. + +`dvc repro` essentially builds a dependency graph, detects stages with modified +dependencies or missing outputs and recursively executes commands (nodes in this +graph or pipeline) starting from the first stage with changes. + +Thus, `dvc run` and `dvc repro` provide a powerful framework for _reproducible +experiments_ and _reproducible projects_. diff --git a/content/docs/tutorials/get-started/experiment-management.md b/content/docs/tutorials/get-started/experiment-management.md new file mode 100644 index 0000000000..d6cd1c5d30 --- /dev/null +++ b/content/docs/tutorials/get-started/experiment-management.md @@ -0,0 +1,192 @@ +# Experiment Metrics + +Finally, we'd like to add an evaluation stage to our +[pipeline](/doc/command-reference/pipeline). Data science is a metric-driven +R&D-like process and `dvc metrics` commands along with DVC metric files provide +a framework to capture and compare experiments performance. 
It doesn't require +installing any databases or instrumenting your code to use some API, all is +tracked by Git and is stored in Git or DVC remote storage: + +```dvc +$ dvc run -f evaluate.dvc \ + -d src/evaluate.py -d model.pkl -d data/features \ + -M auc.metric \ + python src/evaluate.py model.pkl \ + data/features auc.metric +``` + +`evaluate.py` calculates AUC value using the test dataset. It reads features +from the `features/test.pkl` file and produces a +[metric](/doc/command-reference/metrics) file (`auc.metric`). Any +output (in this case just a plain text file containing a single +numeric value) can be marked as a metric, for example by using the `-M` option +of `dvc run`. + +> Please, refer to the `dvc metrics` command documentation to see more details. + +Let's save the updated results: + +```dvc +$ git add evaluate.dvc auc.metric +$ git commit -m "Create evaluation stage" +$ dvc push +``` + +Let's also assign a Git tag, it will serve as a checkpoint for us to compare +experiments in the future, or if we need to go back and checkout it and the +corresponding data: + +```dvc +$ git tag -a "baseline-experiment" -m "Baseline experiment evaluation" +``` + +The `dvc metrics show` command provides a way to compare different experiments, +by analyzing metric files across different branches, tags, etc. But first we +need to create a new experiment to compare the baseline with. + +# Experiments + +Data science process is inherently iterative and R&D like. Data scientist may +try many different approaches, different hyper-parameter values, and "fail" many +times before the required level of a metric is achieved. + +DVC is built to provide a way to capture different experiments and navigate +easily between them. Let's say we want to try a modified feature extraction: + +
+ +### Expand to see code modifications + +Edit `src/featurization.py` to enable bigrams and increase the number of +features. Find and change the `CountVectorizer` arguments, specify `ngram_range` +and increase number of features: + +```python +bag_of_words = CountVectorizer(stop_words='english', + max_features=6000, + ngram_range=(1, 2)) +``` + +
+ +```dvc +$ vi src/featurization.py # edit to use bigrams (see above) +$ dvc repro train.dvc # regenerate the new model.pkl +$ git commit -am "Reproduce model using bigrams" +``` + +> Notice that `git commit -a` stages all the changes produced by `dvc repro` +> before committing them with Git. Refer to the +> [command reference](https://git-scm.com/docs/git-commit#Documentation/git-commit.txt--a) +> for more details. + +Now, we have a new `model.pkl` captured and saved. To get back to the initial +version, we run `git checkout` along with `dvc checkout` command: + +```dvc +$ git checkout baseline-experiment +$ dvc checkout +``` + +DVC is designed to checkout large data files (no matter how large they are) into +your workspace almost instantly on almost all modern operating +systems with file links. See +[Large Dataset Optimization](/doc/user-guide/large-dataset-optimization) for +more information. + +# Compare Experiments + +DVC makes it easy to iterate on your project using Git commits with tags or Git +branches. It provides a way to try different ideas, keep track of them, switch +back and forth. To find the best performing experiment or track the progress, +[project metrics](/doc/command-reference/metrics) are supported in DVC (as +described in one of the previous chapters). + +Let's run evaluate for the latest `bigrams` experiment we created in previous +chapters. It mostly takes just running the `dvc repro`: + +```dvc +$ git checkout master +$ dvc checkout +$ dvc repro evaluate.dvc +``` + +`git checkout master` and `dvc checkout` commands ensure that we have the latest +experiment code and data respectively. And `dvc repro`, as we discussed in the +[Reproduce](/doc/tutorials/get-started/reproduce) chapter, is a way to run all +the necessary commands to build the model and measure its performance. 
+ +```dvc +$ git commit -am "Evaluate bigrams model" +$ git tag -a "bigrams-experiment" -m "Bigrams experiment evaluation" +``` + +Now, we can use `-T` option of the `dvc metrics show` command to see the +difference between the `baseline` and `bigrams` experiments: + +```dvc +$ dvc metrics show -T + +baseline-experiment: + auc.metric: 0.588426 +bigrams-experiment: + auc.metric: 0.602818 +``` + +DVC provides built-in support to track and navigate `JSON`, `TSV` or `CSV` +metric files if you want to track additional information. See `dvc metrics` to +learn more. + +# Get Older Data Version + +Now that we have multiple experiments, models, processed datasets, the question +is how do we revert back to an older version of a model file? Or how can we get +the previous version of the dataset if it was changed at some point? + +The answer is the `dvc checkout` command, and we already touched briefly the +process of switching between different data versions in the +[Experiments](/doc/tutorials/get-started/experiments) chapter of this _Get +Started_ section. + +Let's say we want to get the previous `model.pkl` file. The short answer is: + +```dvc +$ git checkout baseline-experiment train.dvc +$ dvc checkout train.dvc +``` + +These two commands will bring the previous model file to its place in the +workspace. + +
+ +### Expand to learn about DVC internals + +DVC uses special [DVC-files](/doc/user-guide/dvc-file-format) to track data +files, directories, end results. In this case, `train.dvc` among other things +describes the `model.pkl` file this way: + +```yaml +outs: +md5: a66489653d1b6a8ba989799367b32c43 +path: model.pkl +``` + +`a664...2c43` is the "address" of the file in the local or remote DVC storage. + +It means that if we want to get to the previous version, we need to restore the +DVC-file first with the `git checkout` command. Only after that can DVC restore +the model file using the new "address" from the DVC-file. + +
+ +To fully restore the previous experiment we just run `git checkout` and +`dvc checkout` without specifying a target: + +```dvc +$ git checkout baseline-experiment +$ dvc checkout +``` + +Read the `dvc checkout` command reference and a dedicated data versioning +[example](/doc/tutorials/versioning) for more information. diff --git a/content/docs/tutorials/get-started/experiments.md b/content/docs/tutorials/get-started/experiments.md deleted file mode 100644 index b716872a2e..0000000000 --- a/content/docs/tutorials/get-started/experiments.md +++ /dev/null @@ -1,49 +0,0 @@ -# Experiments - -Data science process is inherently iterative and R&D like. Data scientist may -try many different approaches, different hyper-parameter values, and "fail" many -times before the required level of a metric is achieved. - -DVC is built to provide a way to capture different experiments and navigate -easily between them. Let's say we want to try a modified feature extraction: - -
- -### Expand to see code modifications - -Edit `src/featurization.py` to enable bigrams and increase the number of -features. Find and change the `CountVectorizer` arguments, specify `ngram_range` -and increase number of features: - -```python -bag_of_words = CountVectorizer(stop_words='english', - max_features=6000, - ngram_range=(1, 2)) -``` - -
- -```dvc -$ vi src/featurization.py # edit to use bigrams (see above) -$ dvc repro train.dvc # regenerate the new model.pkl -$ git commit -am "Reproduce model using bigrams" -``` - -> Notice that `git commit -a` stages all the changes produced by `dvc repro` -> before committing them with Git. Refer to the -> [command reference](https://git-scm.com/docs/git-commit#Documentation/git-commit.txt--a) -> for more details. - -Now, we have a new `model.pkl` captured and saved. To get back to the initial -version, we run `git checkout` along with `dvc checkout` command: - -```dvc -$ git checkout baseline-experiment -$ dvc checkout -``` - -DVC is designed to checkout large data files (no matter how large they are) into -your workspace almost instantly on almost all modern operating -systems with file links. See -[Large Dataset Optimization](/doc/user-guide/large-dataset-optimization) for -more information. diff --git a/content/docs/tutorials/get-started/import-data.md b/content/docs/tutorials/get-started/import-data.md deleted file mode 100644 index 6900533d5c..0000000000 --- a/content/docs/tutorials/get-started/import-data.md +++ /dev/null @@ -1,87 +0,0 @@ -# Import Data - -We've seen how to [push](/doc/tutorials/get-started/store-data) and -[pull](/doc/tutorials/get-started/retrieve-data) data from/to a DVC -project's [remote](/doc/command-reference/remote). But what if we wanted -to integrate a dataset or ML model produced in one project into another one? - -One way is to manually download the data (with `wget` or `dvc get`, for example) -and use `dvc add` to track it, but the connection between the projects would be -lost. We wouldn't be able to tell where the data came from or whether there are -new versions available. 
A better alternative is the `dvc import` command: - - - -```dvc -$ dvc import https://github.com/iterative/dataset-registry \ - get-started/data.xml -``` - -This downloads `data.xml` from our -[dataset-registry](https://github.com/iterative/dataset-registry) project into -the current working directory, adds it to `.gitignore`, and creates the -`data.xml.dvc` [DVC-file](/doc/user-guide/dvc-file-format) to track changes in -the source data. With _imports_, we can use `dvc update` to bring in changes in -the external data source before -[reproducing](/doc/tutorials/get-started/reproduce) any pipeline -that depends on this data. - -
- -### Expand to learn more about imports - -Note that the [dataset-registry](https://github.com/iterative/dataset-registry) -repository doesn't actually contain a `get-started/data.xml` file. Instead, DVC -inspects -[get-started/data.xml.dvc](https://github.com/iterative/dataset-registry/blob/master/get-started/data.xml.dvc) -and tries to retrieve the file using the project's default remote (configured -[here](https://github.com/iterative/dataset-registry/blob/master/.dvc/config)). - -DVC-files created by `dvc import` are called _import stages_. They use the -`repo` field in the dependencies section (`deps`) in order to track source data -changes (as an [external dependency](/doc/user-guide/external-dependencies)), -enabling the reusability of data artifacts. For example: - -```yaml -md5: fd56a1794c147fea48d408f2bc95a33a -locked: true -deps: - - path: get-started/data.xml - repo: - url: https://github.com/iterative/dataset-registry - rev_lock: 7476a858f6200864b5755863c729bff41d0fb045 -outs: - - md5: a304afb96060aad90176268345e10355 - path: data.xml - cache: true - metric: false - persist: false -``` - -The `url` and `rev_lock` subfields under `repo` are used to save the origin and -[version](https://git-scm.com/docs/revisions) of the dependency, respectively. - -> Note that `dvc update` updates the `rev_lock` field of the corresponding -> DVC-file (when there are changes to bring in). - -
- -Since this is not an official part of this _Get Started_, bring everything back -to normal with: - -```dvc -$ git reset --hard -$ rm -f data.* -``` - -> See also `dvc import-url`. diff --git a/content/docs/tutorials/get-started/index.md b/content/docs/tutorials/get-started/index.md new file mode 100644 index 0000000000..77bfcd7fc2 --- /dev/null +++ b/content/docs/tutorials/get-started/index.md @@ -0,0 +1,104 @@ +# Get Started with DVC! + +You'll need [Git](https://git-scm.com) to run the commands in this tutorial. +Also, if DVC is not installed, please follow these [instructions](/doc/install) +first. + +In the next few pages we'll build a simple natural language processing (NLP) +project from scratch. It explores the NLP problem of predicting tags for a given +StackOverflow question. For example, we might want a classifier that can +classify (or predict) posts about Python by tagging them with `python`. + +![](/img/example-flow-2x.png) _Data modeling overview_ + +> This is a simplified version of our [Deep Dive Tutorial](/doc/tutorials/deep). + +Keep in mind that NLP is not the only area of data science where DVC can help. +DVC is designed to be agnostic of frameworks, programming languages, etc. + +> In case you'd like to get the complete code base and results, or have any +> issues along the way, please note we have a fully reproducible +> [**example-get-started**](https://github.com/iterative/example-get-started) +> repo on GitHub: +> +> ```dvc +> $ git clone https://github.com/iterative/example-get-started +> $ cd example-get-started +> $ dvc pull +> ``` + +## Initialize + +Let's start by creating a workspace in your home directory that we can +version with Git. 
Then run `dvc init` inside to create a DVC +repository: + +```dvc +$ cd ~ +$ mkdir sotag-predictions +$ cd sotag-predictions +$ git init +$ dvc init +$ git commit -m "Initialize DVC repository" +``` + +At DVC initialization, a new `.dvc/` directory is created for internal +configuration and cache +[files and directories](/doc/user-guide/dvc-files-and-directories), that are +hidden from the user. + +> See [DVC Files and Directories](/doc/user-guide/dvc-files-and-directories) to +> learn more about the DVC internal file and directory structure. + +The last command, `git commit`, versions the `.dvc/config` and `.dvc/.gitignore` +DVC internal files with Git. + +## Configure + +Because we'll want to share data and models outside of the local context later +(for example with other collaborators or for access from a different computing +environment), we're going to set up a remote storage for the DVC +project. For simplicity, let's set up a _local remote_. + +
+ +### What is a "local remote" ? + +While the term may seem contradictory, it doesn't have to be. The "local" part +refers to the location of the storage relative to the project, so it can be any +directory in the file system. "Remote" is the term that refers to the storage. +Read "local cache backup". + +
+ +```dvc +$ dvc remote add -d myremote /tmp/dvc-storage +$ git commit .dvc/config -m "Configure local remote" +``` + +> We only use a local remote in this tutorial for simplicity's sake. For most +> cases, other "more remote" types of storage will be required. + +That's it! DVC doesn't require installing any databases, servers, or warehouses. +It can simply use cloud services, local or network file systems to store data, +intermediate results, and ML models. The following remote types are currently +supported: + +- Amazon **S3** (Simple Storage Service) +- Microsoft **Azure** Blob Storage +- **Google Drive** +- **Google Cloud** Storage +- Aliyun **OSS** (Object Storage Service) +- **SSH** (Secure Shell) — requires SFTP +- **HDFS** (Hadoop Distributed File System) +- **HTTP** (and HTTPS) — read-only +- Directory in the **local** file system + +> Refer to `dvc remote` for more details and examples. + +There are other features and options that can be configured in DVC. Please see +`dvc config` for more information. + +--- + +Go to the next page to continue ↘ diff --git a/content/docs/tutorials/get-started/initialize.md b/content/docs/tutorials/get-started/initialize.md deleted file mode 100644 index 1e227d96c9..0000000000 --- a/content/docs/tutorials/get-started/initialize.md +++ /dev/null @@ -1,29 +0,0 @@ -# Initialize - -There are a few recommended ways to install DVC: OS-specific package/installer, -`pip`, `conda`, and Homebrew. See [Installation](/doc/install) for all the -alternatives and details. - -Let's start by creating a workspace we can version with Git. Then -run `dvc init` inside to create the DVC project: - -```dvc -$ mkdir example-get-started -$ cd example-get-started -$ git init -$ dvc init -$ git commit -m "Initialize DVC project" -``` - -At DVC initialization, a new `.dvc/` directory will be created for internal -configuration and cache -[files and directories](/doc/user-guide/dvc-files-and-directories) that are -hidden from the user. 
- -> See `dvc init` if you want to get more details about the initialization -> process, and -> [DVC Files and Directories](/doc/user-guide/dvc-files-and-directories) to -> learn about the DVC internal file and directory structure. - -The last command, `git commit`, versions the `.dvc/config` and `.dvc/.gitignore` -files (DVC internals) with Git. diff --git a/content/docs/tutorials/get-started/metrics.md b/content/docs/tutorials/get-started/metrics.md deleted file mode 100644 index e91ba6371f..0000000000 --- a/content/docs/tutorials/get-started/metrics.md +++ /dev/null @@ -1,45 +0,0 @@ -# Experiment Metrics - -Finally, we'd like to add an evaluation stage to our -[pipeline](/doc/command-reference/pipeline). Data science is a metric-driven -R&D-like process and `dvc metrics` commands along with DVC metric files provide -a framework to capture and compare experiments performance. It doesn't require -installing any databases or instrumenting your code to use some API, all is -tracked by Git and is stored in Git or DVC remote storage: - -```dvc -$ dvc run -f evaluate.dvc \ - -d src/evaluate.py -d model.pkl -d data/features \ - -M auc.metric \ - python src/evaluate.py model.pkl \ - data/features auc.metric -``` - -`evaluate.py` calculates AUC value using the test dataset. It reads features -from the `features/test.pkl` file and produces a -[metric](/doc/command-reference/metrics) file (`auc.metric`). Any -output (in this case just a plain text file containing a single -numeric value) can be marked as a metric, for example by using the `-M` option -of `dvc run`. - -> Please, refer to the `dvc metrics` command documentation to see more details. 
- -Let's save the updated results: - -```dvc -$ git add evaluate.dvc auc.metric -$ git commit -m "Create evaluation stage" -$ dvc push -``` - -Let's also assign a Git tag, it will serve as a checkpoint for us to compare -experiments in the future, or if we need to go back and checkout it and the -corresponding data: - -```dvc -$ git tag -a "baseline-experiment" -m "Baseline experiment evaluation" -``` - -The `dvc metrics show` command provides a way to compare different experiments, -by analyzing metric files across different branches, tags, etc. But first we -need to create a new experiment to compare the baseline with. diff --git a/content/docs/tutorials/get-started/older-versions.md b/content/docs/tutorials/get-started/older-versions.md deleted file mode 100644 index bde6bce562..0000000000 --- a/content/docs/tutorials/get-started/older-versions.md +++ /dev/null @@ -1,53 +0,0 @@ -# Get Older Data Version - -Now that we have multiple experiments, models, processed datasets, the question -is how do we revert back to an older version of a model file? Or how can we get -the previous version of the dataset if it was changed at some point? - -The answer is the `dvc checkout` command, and we already touched briefly the -process of switching between different data versions in the -[Experiments](/doc/tutorials/get-started/experiments) chapter of this _Get -Started_ section. - -Let's say we want to get the previous `model.pkl` file. The short answer is: - -```dvc -$ git checkout baseline-experiment train.dvc -$ dvc checkout train.dvc -``` - -These two commands will bring the previous model file to its place in the -workspace. - -
- -### Expand to learn about DVC internals - -DVC uses special [DVC-files](/doc/user-guide/dvc-file-format) to track data -files, directories, end results. In this case, `train.dvc` among other things -describes the `model.pkl` file this way: - -```yaml -outs: -md5: a66489653d1b6a8ba989799367b32c43 -path: model.pkl -``` - -`a664...2c43` is the "address" of the file in the local or remote DVC storage. - -It means that if we want to get to the previous version, we need to restore the -DVC-file first with the `git checkout` command. Only after that can DVC restore -the model file using the new "address" from the DVC-file. - -
- -To fully restore the previous experiment we just run `git checkout` and -`dvc checkout` without specifying a target: - -```dvc -$ git checkout baseline-experiment -$ dvc checkout -``` - -Read the `dvc checkout` command reference and a dedicated data versioning -[example](/doc/tutorials/versioning) for more information. diff --git a/content/docs/tutorials/get-started/pipeline.md b/content/docs/tutorials/get-started/pipeline.md deleted file mode 100644 index d9f0f19390..0000000000 --- a/content/docs/tutorials/get-started/pipeline.md +++ /dev/null @@ -1,44 +0,0 @@ -# Pipeline - -Support for [pipelines](/doc/command-reference/pipeline) is the biggest -difference between DVC and other version control tools that can handle large -data files (e.g. `git lfs`). By using `dvc run` multiple times, and specifying -outputs of a command (stage) as dependencies in another one, we can describe a -sequence of commands that gets to a desired result. This is what we call a -**data pipeline** or dependency graph. - -Let's create a second stage (after `prepare.dvc`, created in the previous -chapter) to perform feature extraction: - -```dvc -$ dvc run -f featurize.dvc \ - -d src/featurization.py -d data/prepared \ - -o data/features \ - python src/featurization.py \ - data/prepared data/features -``` - -And a third stage for training: - -```dvc -$ dvc run -f train.dvc \ - -d src/train.py -d data/features \ - -o model.pkl \ - python src/train.py data/features model.pkl -``` - -Let's commit DVC-files that describe our pipeline so far: - -```dvc -$ git add data/.gitignore .gitignore featurize.dvc train.dvc -$ git commit -m "Create featurization and training stages" -$ dvc push -``` - -This example is simplified just to show you a basic pipeline, see a more -advanced [example](/doc/tutorials/pipelines) or -[complete tutorial](/doc/tutorials/pipelines) to create an -[NLP](https://en.wikipedia.org/wiki/Natural_language_processing) pipeline -end-to-end. 
- -> See also the `dvc pipeline` command. diff --git a/content/docs/tutorials/get-started/reproduce.md b/content/docs/tutorials/get-started/reproduce.md deleted file mode 100644 index d6e6375878..0000000000 --- a/content/docs/tutorials/get-started/reproduce.md +++ /dev/null @@ -1,39 +0,0 @@ -# Reproduce - -In the previous chapters, we described our first -[pipeline](/doc/command-reference/pipeline). Basically, we generated a number of -[stage files](/doc/command-reference/run) -([DVC-files](/doc/user-guide/dvc-file-format)). These stages define individual -commands to execute towards a final result. Each depends on some data (either -raw data files or intermediate results from previous stages) and code files. - -If you just cloned the -[project](https://github.com/iterative/example-get-started), make sure you first -fetch the input data from DVC by calling `dvc pull`. - -It's now extremely easy for you or your colleagues to reproduce the result -end-to-end: - -```dvc -$ dvc repro train.dvc -``` - -> If you've just followed the previous chapters, the command above will have -> nothing to reproduce since you've recently executed all the pipeline stages. -> To easily try this command, clone this example -> [GitHub project](https://github.com/iterative/example-get-started) and run it -> from there. - -`train.dvc` describes which source code and data files to use, and how to run -the command in order to get the resulting model file. For each data file it -depends on, we can in turn do the same analysis: find a corresponding DVC-file -that includes the data file in its outputs, get dependencies and commands, and -so on. It means that DVC can recursively build a complete sequence of commands -it needs to execute to get the model file. - -`dvc repro` essentially builds a dependency graph, detects stages with modified -dependencies or missing outputs and recursively executes commands (nodes in this -graph or pipeline) starting from the first stage with changes. 
- -Thus, `dvc run` and `dvc repro` provide a powerful framework for _reproducible -experiments_ and _reproducible projects_. diff --git a/content/docs/tutorials/get-started/retrieve-data.md b/content/docs/tutorials/get-started/retrieve-data.md deleted file mode 100644 index 2a11926903..0000000000 --- a/content/docs/tutorials/get-started/retrieve-data.md +++ /dev/null @@ -1,29 +0,0 @@ -# Retrieve Data - -> You'll need to complete the -> [initialization](/doc/tutorials/get-started/initialize) and -> [configuration](/doc/tutorials/get-started/configure) chapters before being -> able to run the commands explained here. - -To retrieve data files into the workspace in your local machine, -run: - -```dvc -$ rm -f data/data.xml -$ dvc pull -``` - -This command downloads data files that are referenced in all -[DVC-files](/doc/user-guide/dvc-file-format) in the project. So, -you usually run it after `git clone`, `git pull`, or `git checkout`. - -Alternatively, if you want to retrieve a single dataset or a file you can use: - -```dvc -$ dvc pull data/data.xml.dvc -``` - -DVC remotes, `dvc push`, and `dvc pull` provide a basic collaboration workflow, -the same way as Git remotes, `git push` and `git pull`. See -[Sharing Data and Model Files](/doc/use-cases/sharing-data-and-model-files) for -more information. diff --git a/content/docs/tutorials/get-started/store-data.md b/content/docs/tutorials/get-started/store-data.md deleted file mode 100644 index 1306681e27..0000000000 --- a/content/docs/tutorials/get-started/store-data.md +++ /dev/null @@ -1,44 +0,0 @@ -# Store and Share Data - -Now, that your data files are managed by DVC (see -[Add Files](/doc/tutorials/get-started/add-files)), you can push them from your -repository to the default [remote](/doc/command-reference/remote) storage\*: - -```dvc -$ dvc push -``` - -The same way as with Git remote, it ensures that your data files and your models -are safely stored remotely and are shareable. 
This means that the data can be -pulled by yourself or your colleagues whenever you need it. - -Usually, you run it along with `git commit` and `git push` to save the changed -[DVC-files](/doc/user-guide/dvc-file-format). - -The `dvc push` command allows one to upload data to remote storage. It doesn't -save any changes in the code or DVC-files. Those should be saved by using -`git commit` and `git push`. - -> \*As noted in the DVC [configuration](/doc/tutorials/get-started/configure) -> chapter, we are using a **local remote** in this section for illustrative -> purposes. - -
- -### Expand to learn more about DVC internals - -You can check now that actual data file has been copied to the remote we created -in the [configuration](/doc/tutorials/get-started/configure) chapter: - -```dvc -$ ls -R /tmp/dvc-storage -/tmp/dvc-storage/a3: -04afb96060aad90176268345e10355 -``` - -`a304afb96060aad90176268345e10355` above is the hash value of the `data.xml` -file. If you check the `data.xml.dvc` -[DVC-file](/doc/user-guide/dvc-file-format), you will see that it has this -string inside. - -
diff --git a/content/docs/tutorials/get-started/versioning-basics.md b/content/docs/tutorials/get-started/versioning-basics.md new file mode 100644 index 0000000000..6a887a22de --- /dev/null +++ b/content/docs/tutorials/get-started/versioning-basics.md @@ -0,0 +1,236 @@ +# Data Versioning Basics + +DVC allows storing and versioning data files or directories, ML models, and +intermediate results with a regular Git workflow, without actually tracking the +file contents with Git. Let's get a dataset example to play with: + +```dvc +$ mkdir data +$ dvc get https://github.com/iterative/dataset-registry \ + get-started/data.xml -o data/data.xml +``` + +> `dvc get` can download any data artifact tracked in a DVC +> repository, using the appropriate +> [remote storage](/doc/command-reference/remote) (analogous to `wget`, but for +> DVC/Git repos). In this case we use our +> [dataset-registry](https://github.com/iterative/dataset-registry) as the +> source repository (refer to [Data Registries](/doc/use-cases/data-registries) +> for more info). + +## Start tracking data + +To track a file with DVC, just run `dvc add` on it: + +```dvc +$ dvc add data/data.xml +``` + +DVC stores information about the added data in a special **DVC-file** +(`data/data.xml.dvc`), a small text file with a human-readable +[format](/doc/user-guide/dvc-file-format). The above command also tells Git to +ignore the actual data contents, so that this version of the data can be safely +committed to the repository, using Git: + +```dvc +$ git add data/.gitignore data/data.xml.dvc +$ git commit -m "Add raw data" +``` + +
+ +### Expand to learn about DVC internals + +`dvc add` moves the data file to the project's cache (see +[DVC Files and Directories](/doc/user-guide/dvc-files-and-directories)), and +makes file links (or copies) with the original file names back in the +workspace, which is what you see inside the project. + +```dvc +$ ls -R .dvc/cache +... + .dvc/cache/a3: + 04afb96060aad90176268345e10355 +``` + +The hash value of the `data/data.xml` file we just added, +`a304afb96060aad90176268345e10355`, determines the path and file name shown +above. And if you check the `data/data.xml.dvc` DVC-file created by DVC, you +will see that it has this string inside. + +### Important note on cache performance + +DVC tries to use reflinks\* by default to link your data files from the DVC +cache to the workspace, optimizing speed and storage space. However, reflinks +are not widely supported yet and DVC falls back to actually copying data files +to/from the cache. **Copying can be very slow with large files**, and duplicates +storage requirements. + +Hardlinks and symlinks are also available for optimized cache linking, but +(unlike reflinks) they carry the risk of accidentally corrupting the cache if +tracked data files are modified in the workspace. + +See [Large Dataset Optimization](/doc/user-guide/large-dataset-optimization) and +`dvc config cache` for more information. + +> \***copy-on-write links or "reflinks"** are a relatively new way to link files +> in UNIX-style file systems. Unlike hardlinks or symlinks, they support +> transparent [copy on write](https://en.wikipedia.org/wiki/Copy-on-write). This +> means that editing a reflinked file is always safe, as the other links to +> the file (e.g. the cached copy) are left unchanged. + +
+ +Refer to +[Versioning Data and Model Files](/doc/use-cases/versioning-data-and-model-files), +`dvc add`, and `dvc run` for more information on storing and versioning data +files with DVC. + +## Store and share data + +Now that your raw data is tracked by DVC, you can push it from your repository +to the default [remote storage](/doc/command-reference/remote). + +> As seen in the intro's [Configure](/doc/tutorials/get-started#configure) +> section, we are using a **local remote** in this section for illustrative +> purposes. + +```dvc +$ dvc push +``` + +Similar to pushing source code to a _Git remote_, `dvc push` ensures that your +data files and models are safely backed up remotely. This means that the data +can be pulled by yourself or by colleagues when and where needed. Usually, we +also want to `git commit` and `git push`, to save the new (or changed versions +of) [DVC-files](/doc/user-guide/dvc-file-format). + +
+ +### Expand to learn more about DVC internals + +You can check that the data has been backed up to the remote (`/tmp/dvc-storage` +local directory) with: + +```dvc +$ ls -R /tmp/dvc-storage +... +/tmp/dvc-storage/a3: +04afb96060aad90176268345e10355 +``` + +
+ +## Retrieve data + +Imagine you're just cloning the Git repo that has been created so far on another +computer. This can be simulated by cloning our **example-get-started** repo from +GitHub, and checking out the +[`3-add-file`](https://github.com/iterative/example-get-started/tree/3-add-file) +tag: + +```dvc +$ cd ~ +$ git clone https://github.com/iterative/example-get-started +$ cd example-get-started +$ git checkout 3-add-file +``` + +If you list the files in this fresh workspace, or even in the +cache, you'll notice that the `data/data.xml` file is not there yet. This is +because it's not stored by Git! To get it, simply run: + +```dvc +$ dvc pull +``` + +`dvc pull` downloads data files that are referenced in all present +[DVC-files](/doc/user-guide/dvc-file-format) from the project's +remote storage, so usually we run it after `git clone`, `git pull`, or +`git checkout`. + +Alternatively, if you want to retrieve a single file or directory, you can +specify the target like this: + +```dvc +$ dvc pull data/data.xml.dvc +``` + +> In this case, both commands have the same result, as there's currently just +> one DVC-tracked file in the repo. + +[DVC remotes](/doc/command-reference/remote), `dvc push`, and `dvc pull` provide +a basic collaboration workflow, the same way as Git remotes, `git push` and +`git pull`. See +[Sharing Data and Model Files](/doc/use-cases/sharing-data-and-model-files) for +more information. + +## Import data + +We've seen how to [push](#store-and-share-data) and [pull](#retrieve-data) data +to/from remote storage. But what if we wanted to integrate a dataset or ML +model produced in one project into another one? + +One way is to manually download the data and use `dvc add` to track it, like in +the beginning of this page. But the connection between the projects is only +known by the person doing this. Others wouldn't be able to tell where the data +came from or whether there are new versions available. 
+ +A better alternative is the `dvc import` command! Let's go back to the +project we're building, and replace `data/data.xml` by importing it +from the same source: + +```dvc +$ cd ~/sotag-predictions +$ dvc import https://github.com/iterative/dataset-registry \ + get-started/data.xml -o data/data.xml +``` + +This downloads and overwrites the same `data/data.xml`, checks that it's in +`data/.gitignore`, and creates the `data/data.xml.dvc` +[DVC-file](/doc/user-guide/dvc-file-format). So far this seems identical to our +previous strategy, except that this time `data.xml.dvc` has additional metadata +that allows DVC to track changes in the source data. This allows `dvc update` to +bring in changes from the data source. + +
+ +### Expand to learn more about DVC internals + +DVC-files created by `dvc import` are called _import stages_. If we check the +difference against the regular DVC-file we previously had, we can see that the +import stage has more fields, such as the data source `repo`, and `path` within it: + +```dvc +$ git diff +... +--- a/data/data.xml.dvc ++++ b/data/data.xml.dvc +... ++deps: ++- path: get-started/data.xml ++ repo: ++ url: https://github.com/iterative/dataset-registry ++ rev_lock: f31f5c4cdae787b4bdeb97a717687d44667d9e62 +``` + +The `url` and `rev_lock` subfields under `repo` are used to save the origin and +[version](https://git-scm.com/docs/revisions) of the dependency, respectively. + +> `dvc update` updates the `rev_lock` field of the corresponding DVC-file (when +> there are changes to bring in). + +Note that the [dataset-registry](https://github.com/iterative/dataset-registry) +repository doesn't actually contain a `get-started/data.xml` file. Like +`dvc get`, importing also downloads the data from the appropriate +[remote storage](/doc/command-reference/remote). + +
+ +Let's wrap up by committing the import stage with Git: + +```dvc +$ git add data/data.xml.dvc +$ git commit -m "Import raw data (overwrite)" +$ dvc push # so others can pull the imported data in their repo copies +``` diff --git a/content/docs/tutorials/get-started/visualize.md b/content/docs/tutorials/get-started/visualize.md deleted file mode 100644 index 5b7e5c293f..0000000000 --- a/content/docs/tutorials/get-started/visualize.md +++ /dev/null @@ -1,84 +0,0 @@ -# Visualize - -Now that we have built our pipeline, we need a good way to visualize it to be -able to wrap our heads around it. Luckily, DVC allows us to do that without -leaving the terminal, making the experience distraction-less. - -We are using the `--ascii` option below to better illustrate this pipeline. -Please, refer to `dvc pipeline show` to explore other options this command -supports (e.g. `.dot` files that can be used then in other tools). - -## Stages - -```dvc -$ dvc pipeline show --ascii train.dvc - +-------------------+ - | data/data.xml.dvc | - +-------------------+ - * - * - * - +-------------+ - | prepare.dvc | - +-------------+ - * - * - * - +---------------+ - | featurize.dvc | - +---------------+ - * - * - * - +-----------+ - | train.dvc | - +-----------+ -``` - -## Commands - -```dvc -$ dvc pipeline show --ascii train.dvc --commands - +-------------------------------------+ - | python src/prepare.py data/data.xml | - +-------------------------------------+ - * - * - * - +---------------------------------------------------------+ - | python src/featurization.py data/prepared data/features | - +---------------------------------------------------------+ - * - * - * - +---------------------------------------------+ - | python src/train.py data/features model.pkl | - +---------------------------------------------+ -``` - -## Outputs - -```dvc -$ dvc pipeline show --ascii train.dvc --outs - +---------------+ - | data/data.xml | - +---------------+ - * - * - * - +---------------+ - | 
data/prepared | - +---------------+ - * - * - * - +---------------+ - | data/features | - +---------------+ - * - * - * - +-----------+ - | model.pkl | - +-----------+ -``` diff --git a/content/docs/tutorials/pipelines.md b/content/docs/tutorials/pipelines.md index 4bcd33da7e..10f21a352b 100644 --- a/content/docs/tutorials/pipelines.md +++ b/content/docs/tutorials/pipelines.md @@ -50,13 +50,13 @@ $ git add code/ $ git commit -m "Download and add code to new Git repo" ``` -> `dvc get` can use any DVC repository to find the appropriate -> [remote storage](/doc/command-reference/remote) and download data -> artifacts from it (analogous to `wget`, but for repositories). In this -> case we use [dataset-registry](https://github.com/iterative/dataset-registry)) -> as the source repo. (Refer to -> [Data Registries](/doc/use-cases/data-registries) for more info about this -> setup.) +> `dvc get` can download any data artifact tracked in a DVC +> repository, using the appropriate +> [remote storage](/doc/command-reference/remote) (analogous to `wget`, but for +> DVC/Git repos). In this case we use our +> [dataset-registry](https://github.com/iterative/dataset-registry) as the +> source repository (refer to [Data Registries](/doc/use-cases/data-registries) +> for more info). Now let's install the requirements. But before we do that, we **strongly** recommend creating a @@ -102,9 +102,9 @@ When we run `dvc add` `Posts.xml.zip`, DVC creates a ### Expand to learn about DVC internals -At DVC initialization, a new `.dvc/` directory will be created for internal -configuration and cache -[files and directories](/doc/user-guide/dvc-files-and-directories) that are +At DVC initialization, a new `.dvc/` directory is created for internal +configuration and cache +[files and directories](/doc/user-guide/dvc-files-and-directories), that are hidden from the user. Note that the DVC-file created by `dvc add` has no dependencies, a.k.a.
an diff --git a/content/docs/tutorials/versioning.md b/content/docs/tutorials/versioning.md index 370fc95165..1ae0768a9a 100644 --- a/content/docs/tutorials/versioning.md +++ b/content/docs/tutorials/versioning.md @@ -83,13 +83,13 @@ $ unzip -q data.zip $ rm -f data.zip ``` -> `dvc get` can use any DVC repository to find the appropriate -> [remote storage](/doc/command-reference/remote) and download data -> artifacts from it (analogous to `wget`, but for repositories). In this -> case we use [dataset-registry](https://github.com/iterative/dataset-registry)) -> as the source repo. (Refer to -> [Data Registries](/doc/use-cases/data-registries) for more info about this -> setup.) +> `dvc get` can download any data artifact tracked in a DVC +> repository, using the appropriate +> [remote storage](/doc/command-reference/remote) (analogous to `wget`, but for +> DVC/Git repos). In this case we use our +> [dataset-registry](https://github.com/iterative/dataset-registry) as the +> source repository (refer to [Data Registries](/doc/use-cases/data-registries) +> for more info). This command downloads and extracts our raw dataset, consisting of 1000 labeled images for training and 800 labeled images for validation. In total, it's a 43 diff --git a/content/docs/understanding-dvc/what-is-dvc.md b/content/docs/understanding-dvc/what-is-dvc.md index 444d7a6774..7f21206a88 100644 --- a/content/docs/understanding-dvc/what-is-dvc.md +++ b/content/docs/understanding-dvc/what-is-dvc.md @@ -1,16 +1,15 @@ # What Is DVC? Data Version Control, or DVC, is **a new type of experiment management -software** that has been built **on top of the existing engineering toolset that -you're already used to**, and particularly on a source code version control -system (currently Git). DVC reduces the gap between existing tools and data -science needs, allowing users to take advantage of experiment management -software while reusing existing skills and intuition.
- -The underlying source code control system eliminates the need to use external -services. Data science experiment sharing and collaboration can be done through -regular Git tools (commit messages, merges, pull requests, etc) the same way it -works for software engineers. +software** built on top of the existing engineering toolset that you're already +used to, and particularly on a source code management system (Git). DVC reduces the gap +between existing tools and data science needs, allowing users to take advantage +of experiment management while reusing existing skills and intuition. + +Leveraging an underlying source code management system eliminates the need to +use external services. Data science experiment sharing and collaboration can be +done through regular Git features (commit messages, merges, pull requests, etc.) +the same way it works for software engineers. DVC implements a **Git experimentation methodology** where each experiment exists with its code as well as data, and can be represented as a separate Git diff --git a/content/docs/use-cases/versioning-data-and-model-files.md b/content/docs/use-cases/versioning-data-and-model-files.md index dbf45e37ce..a434db583a 100644 --- a/content/docs/use-cases/versioning-data-and-model-files.md +++ b/content/docs/use-cases/versioning-data-and-model-files.md @@ -42,9 +42,9 @@ initialize the DVC project on top of the existing repository: $ dvc init ``` -At DVC initialization, a new `.dvc/` directory will be created for internal -configuration and cache -[files and directories](/doc/user-guide/dvc-files-and-directories) that are +At DVC initialization, a new `.dvc/` directory is created for internal +configuration and cache +[files and directories](/doc/user-guide/dvc-files-and-directories), that are hidden from the user.
These can safely be tracked with Git: ```dvc diff --git a/package.json b/package.json index 316479f009..2b33043311 100644 --- a/package.json +++ b/package.json @@ -5,7 +5,6 @@ "main": "index.js", "scripts": { "develop": "gatsby develop", - "debug": "node --inspect-brk server.js", "build": "gatsby build", "heroku-postbuild": "./scripts/deploy-with-s3.js", "test": "jest", @@ -15,8 +14,8 @@ "format-all": "prettier --write './**/*.{js,jsx,md,tsx,ts,json}'", "lint-ts": "tsc --noEmit --skipLibCheck && eslint --ext .json,.js,.ts,.tsx src scripts", "lint-css": "stylelint \"src/**/*.css\"", - "link-check": "scripts/link-check-git-all.sh", - "link-check-diff": "scripts/link-check-git-diff.sh" + "link-check": "./scripts/link-check-git-all.sh", + "link-check-diff": "./scripts/link-check-git-diff.sh" }, "repository": { "type": "git", diff --git a/scripts/link-check-git-all.sh b/scripts/link-check-git-all.sh index 84da16a85b..27c78fea74 100755 --- a/scripts/link-check-git-all.sh +++ b/scripts/link-check-git-all.sh @@ -2,5 +2,5 @@ repo="$(dirname "$(realpath "$(dirname "$0")")")" -(find "$repo"/pages/ "$repo"/content/docs/ "$repo"/src/ "$repo"/.github/ -name '*.md' -o -name '*.js' && ls "$repo"/*.md "$repo"/*.js) \ +(find "$repo"/.github/ "$repo"/content/docs/ "$repo"/src/ -name '*.md' -o -name '*.js' && ls "$repo"/*.md "$repo"/*.js) \ | xargs -n1 -P8 $(dirname "$0")/link-check.sh diff --git a/src/utils/sidebar.js b/src/utils/sidebar.js index 46e12bab9c..97f2e640f2 100644 --- a/src/utils/sidebar.js +++ b/src/utils/sidebar.js @@ -21,14 +21,14 @@ const startCase = require('lodash.startcase') const sidebar = require('../../content/docs/sidebar.json') -const PATH_ROOT = '/doc/' +const PATH_ROOT = '/doc' const FILE_ROOT = '/docs/' const FILE_EXTENSION = '.md' function validateRawItem({ slug, source, children }) { const isSourceDisabled = source === false - if (!slug) { + if (typeof slug !== 'string') { throw Error("'slug' field is required in objects in sidebar.json") } @@ -81,7 +81,7 
@@ function normalizeItem({ rawItem, parentPath, resultRef, prevRef }) { const sourcePath = FILE_ROOT + parentPath + sourceFileName return { - path: PATH_ROOT + parentPath + slug, + path: PATH_ROOT + (parentPath || slug ? '/' : '') + parentPath + slug, source: source === false ? false : sourcePath, label: label ? label : startCase(slug), tutorials: tutorials || {}, @@ -152,7 +152,7 @@ function getFirstPage() { function getItemByPath(path) { const normalizedPath = path.replace(/\/$/, '') - const isRoot = normalizedPath === PATH_ROOT.slice(0, -1) + const isRoot = normalizedPath === PATH_ROOT const item = isRoot ? normalizedSidebar[0] : findItemByField(normalizedSidebar, 'path', normalizedPath) @@ -173,10 +173,14 @@ function getPathWithSoruce(path) { } function getParentsListFromPath(path) { - let currentPath = PATH_ROOT.slice(0, -1) + let currentPath = PATH_ROOT + + if (path === PATH_ROOT) { + return [PATH_ROOT] + } return path - .replace(PATH_ROOT, '') + .replace(`${PATH_ROOT}/`, '') .split('/') .map(part => { const path = `${currentPath}/${part}`