From 54706b6b7abc650cd09e9dbb03df78d05b3c3f45 Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Tue, 2 Jun 2020 05:38:19 +0300 Subject: [PATCH 1/4] docs: use `--external` for `add/run` Per https://github.com/iterative/dvc/pull/3929 --- content/docs/command-reference/add.md | 6 +++++- content/docs/command-reference/run.md | 5 ++++- content/docs/user-guide/managing-external-data.md | 15 ++++++++++----- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/content/docs/command-reference/add.md b/content/docs/command-reference/add.md index 49fc6fc36b..c6ed1e66d2 100644 --- a/content/docs/command-reference/add.md +++ b/content/docs/command-reference/add.md @@ -6,7 +6,8 @@ Track data files or directories with DVC, by creating a corresponding ## Synopsis ```usage -usage: dvc add [-h] [-q | -v] [-R] [--no-commit] [-f ] +usage: dvc add [-h] [-q | -v] [-R] [--no-commit] [--external] + [-f ] targets [targets ...] positional arguments: @@ -98,6 +99,9 @@ This way you bring data provenance and make your project when ready to commit outputs with DVC. This is analogous to using `git add` before `git commit`. +- `--external` - allow targets that are outside of the DVC repository. See + [Managing External Data](/doc/user-guide/managing-external-data). + - `-f `, `--file ` - specify name of the DVC-file it generates. This option works only if there is a single target. By default the name of the generated DVC-file is `.dvc`, where `` is the file diff --git a/content/docs/command-reference/run.md b/content/docs/command-reference/run.md index 24b93d71d7..2b0a0dba6d 100644 --- a/content/docs/command-reference/run.md +++ b/content/docs/command-reference/run.md @@ -11,7 +11,7 @@ usage: dvc run [-h] [-q | -v] [-d ] [-o ] [-O ] [-w ] [--no-exec] [--overwrite-dvcfile] [--no-run-cache] [--no-commit] [--outs-persist ] [--outs-persist-no-cache ] - [--always-changed] + [--always-changed] [--external] command positional arguments: @@ -183,6 +183,9 @@ data pipeline (e.g. 
random numbers, time functions, hardware dependency, etc.) > Note that DVC-files without dependencies are automatically considered > "always changed", so this option has no effect in those cases. +- `--external` - allow outputs that are outside of the DVC repository. See + [Managing External Data](/doc/user-guide/managing-external-data). + - `-h`, `--help` - prints the usage/help message, and exit. - `-q`, `--quiet` - do not write anything to standard output. Exit with 0 if no diff --git a/content/docs/user-guide/managing-external-data.md b/content/docs/user-guide/managing-external-data.md index 994640eadc..6bc4511fee 100644 --- a/content/docs/user-guide/managing-external-data.md +++ b/content/docs/user-guide/managing-external-data.md @@ -52,8 +52,9 @@ The default local cache location is `.dvc/cache`, so there is no need to specify it explicitly. ```dvc -$ dvc add /home/shared/mydata +$ dvc add /home/shared/mydata --external $ dvc run -d data.txt \ + --external \ -o /home/shared/data.txt \ cp data.txt /home/shared/data.txt ``` @@ -68,10 +69,11 @@ $ dvc remote add sshcache ssh://user@example.com:/cache $ dvc config cache.ssh sshcache # Add data on SSH directly -$ dvc add ssh://user@example.com:/mydata +$ dvc add ssh://user@example.com:/mydata --external # Create the stage with external SSH output $ dvc run -d data.txt \ + --external \ -o ssh://user@example.com:/home/shared/data.txt \ scp data.txt user@example.com:/home/shared/data.txt ``` @@ -86,10 +88,11 @@ $ dvc remote add s3cache s3://mybucket/cache $ dvc config cache.s3 s3cache # Add data on S3 directly -$ dvc add s3://mybucket/mydata +$ dvc add s3://mybucket/mydata --external # Create the stage with external S3 output $ dvc run -d data.txt \ + --external \ -o s3://mybucket/data.txt \ aws s3 cp data.txt s3://mybucket/data.txt ``` @@ -104,10 +107,11 @@ $ dvc remote add gscache gs://mybucket/cache $ dvc config cache.gs gscache # Add data on GS directly -$ dvc add gs://mybucket/mydata +$ dvc add 
gs://mybucket/mydata --external # Create the stage with external GS output $ dvc run -d data.txt \ + --external \ -o gs://mybucket/data.txt \ gsutil cp data.txt gs://mybucket/data.txt ``` @@ -122,10 +126,11 @@ $ dvc remote add hdfscache hdfs://user@example.com/cache $ dvc config cache.hdfs hdfscache # Add data on HDFS directly -$ dvc add hdfs://user@example.com/mydata +$ dvc add hdfs://user@example.com/mydata --external # Create the stage with external HDFS output $ dvc run -d data.txt \ + --external \ -o hdfs://user@example.com/home/shared/data.txt \ hdfs fs -copyFromLocal \ data.txt \ From d57a6d82245e174cce56d10ef8e27cd4b4a7392b Mon Sep 17 00:00:00 2001 From: Jorge Orpinel Date: Wed, 3 Jun 2020 15:05:31 -0500 Subject: [PATCH 2/4] Update content/docs/command-reference/add.md --- content/docs/command-reference/add.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/content/docs/command-reference/add.md b/content/docs/command-reference/add.md index c6ed1e66d2..deb8d41af4 100644 --- a/content/docs/command-reference/add.md +++ b/content/docs/command-reference/add.md @@ -7,8 +7,7 @@ Track data files or directories with DVC, by creating a corresponding ```usage usage: dvc add [-h] [-q | -v] [-R] [--no-commit] [--external] - [-f ] - targets [targets ...] + [-f ] targets [targets ...] positional arguments: targets Input files/directories to add. From 6a8332a5cad5fe9c79fe793670770d22d75347ad Mon Sep 17 00:00:00 2001 From: Jorge Orpinel Date: Wed, 3 Jun 2020 15:53:17 -0500 Subject: [PATCH 3/4] Update content/docs/command-reference/add.md --- content/docs/command-reference/add.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/docs/command-reference/add.md b/content/docs/command-reference/add.md index deb8d41af4..e2d1cd44b5 100644 --- a/content/docs/command-reference/add.md +++ b/content/docs/command-reference/add.md @@ -98,7 +98,7 @@ This way you bring data provenance and make your project when ready to commit outputs with DVC. 
This is analogous to using `git add` before `git commit`. -- `--external` - allow targets that are outside of the DVC repository. See +- `--external` - allow `targets` that are outside of the DVC repository. See [Managing External Data](/doc/user-guide/managing-external-data). - `-f `, `--file ` - specify name of the DVC-file it From 12a9ff6f3003857a2c3f7eee561fa1e3c842fd31 Mon Sep 17 00:00:00 2001 From: Jorge Orpinel Date: Sat, 6 Jun 2020 02:46:04 -0500 Subject: [PATCH 4/4] term: review usage of "external" --- content/docs/command-reference/get.md | 5 ++--- content/docs/understanding-dvc/existing-tools.md | 4 ++-- content/docs/understanding-dvc/what-is-dvc.md | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/content/docs/command-reference/get.md b/content/docs/command-reference/get.md index ce3cd491c2..fac034b3fa 100644 --- a/content/docs/command-reference/get.md +++ b/content/docs/command-reference/get.md @@ -94,12 +94,11 @@ model.pkl Note that the `model.pkl` file doesn't actually exist in the [root directory](https://github.com/iterative/example-get-started/tree/master/) -of the external Git repo. Instead, the corresponding DVC-file +of the source Git repo. Instead, the corresponding DVC-file [train.dvc](https://github.com/iterative/example-get-started/blob/master/train.dvc) is found, that contains `model.pkl` (in the `outs` field). DVC then [pulls](/doc/command-reference/pull) the file from the default -[remote](/doc/command-reference/remote) of the external DVC project (found in -its +[remote](/doc/command-reference/remote) of the source DVC project (found in its [config file](https://github.com/iterative/example-get-started/blob/master/.dvc/config)). 
> A recommended use for downloading binary files from DVC repositories, as done diff --git a/content/docs/understanding-dvc/existing-tools.md b/content/docs/understanding-dvc/existing-tools.md index 279434f94a..0d007a1532 100644 --- a/content/docs/understanding-dvc/existing-tools.md +++ b/content/docs/understanding-dvc/existing-tools.md @@ -29,6 +29,6 @@ integrated with local environments either. The separation of the local data scientist environment and the experimentation cloud environment creates another discrepancy issue, and environment synchronization requires addition work. Also, this style of software usually -requires external services that aren't free. This might be a good solution for a -particular companies or groups of data scientists. but a more accessible, free +requires 3rd-party services that aren't free. This might be a good solution for +particular companies or groups of data scientists, but a more accessible, free tool is needed for a wider audience. diff --git a/content/docs/understanding-dvc/what-is-dvc.md b/content/docs/understanding-dvc/what-is-dvc.md index 39aab4b8e9..f6b464e409 100644 --- a/content/docs/understanding-dvc/what-is-dvc.md +++ b/content/docs/understanding-dvc/what-is-dvc.md @@ -7,7 +7,7 @@ system (currently Git). DVC reduces the gap between existing tools and data science needs, allowing users to take advantage of experiment management software while reusing existing skills and intuition. -The underlying source code control system eliminates the need to use external +The underlying source code control system eliminates the need to use 3rd-party services. Data science experiment sharing and collaboration can be done through regular Git tools (commit messages, merges, pull requests, etc) the same way it works for software engineers.