From 6d17d7a3d4111ad1685f2f1ff168a3c3101bdfce Mon Sep 17 00:00:00 2001 From: Suneet Saldanha Date: Wed, 8 Jan 2020 13:54:51 -0800 Subject: [PATCH 1/2] Tutorials use new ingestion spec where possible There are 2 main changes * Use task type index_parallel instead of index * Remove the use of parser + firehose in favor of inputFormat + inputSource index_parallel is the preferred method starting in 0.17. Setting the job to index_parallel with the default maxNumConcurrentSubTasks(1) is the equivalent of an index task Instead of using a parserSpec, dimensionSpec and timestampSpec have been promoted to the dataSchema. The format is described in the ioConfig as the inputFormat. There are a few cases where the new format is not supported * Hadoop must use firehoses instead of the inputSource and inputFormat * There is no equivalent of a combining firehose as an inputSource * A Combining firehose does not support index_parallel --- docs/development/extensions-core/mysql.md | 6 +- .../development/extensions-core/postgresql.md | 6 +- docs/tutorials/tutorial-batch.md | 69 ++-- docs/tutorials/tutorial-compaction.md | 4 +- docs/tutorials/tutorial-ingestion-spec.md | 342 ++++++++---------- docs/tutorials/tutorial-rollup.md | 35 +- docs/tutorials/tutorial-transform-spec.md | 35 +- .../tutorial/compaction-day-granularity.json | 2 +- .../tutorial/compaction-init-index.json | 69 ++-- .../tutorial/compaction-keep-granularity.json | 2 +- .../quickstart/tutorial/deletion-index.json | 69 ++-- .../quickstart/tutorial/retention-index.json | 69 ++-- .../quickstart/tutorial/rollup-index.json | 35 +- .../quickstart/tutorial/transform-index.json | 35 +- .../tutorial/updates-append-index2.json | 33 +- .../tutorial/updates-init-index.json | 33 +- .../tutorial/updates-overwrite-index.json | 33 +- .../quickstart/tutorial/wikipedia-index.json | 69 ++-- 18 files changed, 427 insertions(+), 519 deletions(-) diff --git a/docs/development/extensions-core/mysql.md 
b/docs/development/extensions-core/mysql.md index 5445e1b47587..2ce85ddc5fc7 100644 --- a/docs/development/extensions-core/mysql.md +++ b/docs/development/extensions-core/mysql.md @@ -112,7 +112,7 @@ The MySQL extension provides an implementation of an [SqlFirehose](../../ingesti ```json { - "type": "index", + "type": "index_parallel", "spec": { "dataSchema": { "dataSource": "some_datasource", @@ -149,7 +149,7 @@ The MySQL extension provides an implementation of an [SqlFirehose](../../ingesti } }, "ioConfig": { - "type": "index", + "type": "index_parallel", "firehose": { "type": "sql", "database": { @@ -166,7 +166,7 @@ The MySQL extension provides an implementation of an [SqlFirehose](../../ingesti } }, "tuningconfig": { - "type": "index" + "type": "index_parallel" } } } diff --git a/docs/development/extensions-core/postgresql.md b/docs/development/extensions-core/postgresql.md index a51112b0cbcc..4be3a7678dfe 100644 --- a/docs/development/extensions-core/postgresql.md +++ b/docs/development/extensions-core/postgresql.md @@ -91,7 +91,7 @@ The PostgreSQL extension provides an implementation of an [SqlFirehose](../../in ```json { - "type": "index", + "type": "index_parallel", "spec": { "dataSchema": { "dataSource": "some_datasource", @@ -128,7 +128,7 @@ The PostgreSQL extension provides an implementation of an [SqlFirehose](../../in } }, "ioConfig": { - "type": "index", + "type": "index_parallel", "firehose": { "type": "sql", "database": { @@ -145,7 +145,7 @@ The PostgreSQL extension provides an implementation of an [SqlFirehose](../../in } }, "tuningconfig": { - "type": "index" + "type": "index_parallel" } } } diff --git a/docs/tutorials/tutorial-batch.md b/docs/tutorials/tutorial-batch.md index e175d4a272c2..75fab9f7c2ec 100644 --- a/docs/tutorials/tutorial-batch.md +++ b/docs/tutorials/tutorial-batch.md @@ -134,42 +134,36 @@ which has been configured to read the `quickstart/tutorial/wikiticker-2015-09-12 ```json { - "type" : "index", + "type" : "index_parallel", 
"spec" : { "dataSchema" : { "dataSource" : "wikipedia", - "parser" : { - "type" : "string", - "parseSpec" : { - "format" : "json", - "dimensionsSpec" : { - "dimensions" : [ - "channel", - "cityName", - "comment", - "countryIsoCode", - "countryName", - "isAnonymous", - "isMinor", - "isNew", - "isRobot", - "isUnpatrolled", - "metroCode", - "namespace", - "page", - "regionIsoCode", - "regionName", - "user", - { "name": "added", "type": "long" }, - { "name": "deleted", "type": "long" }, - { "name": "delta", "type": "long" } - ] - }, - "timestampSpec": { - "column": "time", - "format": "iso" - } - } + "dimensionsSpec" : { + "dimensions" : [ + "channel", + "cityName", + "comment", + "countryIsoCode", + "countryName", + "isAnonymous", + "isMinor", + "isNew", + "isRobot", + "isUnpatrolled", + "metroCode", + "namespace", + "page", + "regionIsoCode", + "regionName", + "user", + { "name": "added", "type": "long" }, + { "name": "deleted", "type": "long" }, + { "name": "delta", "type": "long" } + ] + }, + "timestampSpec": { + "column": "time", + "format": "iso" }, "metricsSpec" : [], "granularitySpec" : { @@ -181,16 +175,19 @@ which has been configured to read the `quickstart/tutorial/wikiticker-2015-09-12 } }, "ioConfig" : { - "type" : "index", - "firehose" : { + "type" : "index_parallel", + "inputSource" : { "type" : "local", "baseDir" : "quickstart/tutorial/", "filter" : "wikiticker-2015-09-12-sampled.json.gz" }, + "inputFormat" : { + "type": "json" + }, "appendToExisting" : false }, "tuningConfig" : { - "type" : "index", + "type" : "index_parallel", "maxRowsPerSegment" : 5000000, "maxRowsInMemory" : 25000 } diff --git a/docs/tutorials/tutorial-compaction.md b/docs/tutorials/tutorial-compaction.md index 8da333b0d735..98052170cf71 100644 --- a/docs/tutorials/tutorial-compaction.md +++ b/docs/tutorials/tutorial-compaction.md @@ -81,7 +81,7 @@ We have included a compaction task spec for this tutorial datasource at `quickst "dataSource": "compaction-tutorial", "interval": 
"2015-09-12/2015-09-13", "tuningConfig" : { - "type" : "index", + "type" : "index_parallel", "maxRowsPerSegment" : 5000000, "maxRowsInMemory" : 25000 } @@ -143,7 +143,7 @@ We have included a compaction task spec that will create DAY granularity segment "interval": "2015-09-12/2015-09-13", "segmentGranularity": "DAY", "tuningConfig" : { - "type" : "index", + "type" : "index_parallel", "maxRowsPerSegment" : 5000000, "maxRowsInMemory" : 25000, "forceExtendableShardSpecs" : true diff --git a/docs/tutorials/tutorial-ingestion-spec.md b/docs/tutorials/tutorial-ingestion-spec.md index b722ed736750..9bc1a65d624a 100644 --- a/docs/tutorials/tutorial-ingestion-spec.md +++ b/docs/tutorials/tutorial-ingestion-spec.md @@ -88,42 +88,18 @@ The datasource name is specified by the `dataSource` parameter in the `dataSchem Let's call the tutorial datasource `ingestion-tutorial`. -### Choose a parser - -A `dataSchema` has a `parser` field, which defines the parser that Druid will use to interpret the input data. - -Since our input data is represented as JSON strings, we'll use a `string` parser with `json` format: - -```json -"dataSchema" : { - "dataSource" : "ingestion-tutorial", - "parser" : { - "type" : "string", - "parseSpec" : { - "format" : "json" - } - } -} -``` - ### Time column -The `parser` needs to know how to extract the main timestamp field from the input data. When using a `json` type `parseSpec`, the timestamp is defined in a `timestampSpec`. +The `dataSchema` needs to know how to extract the main timestamp field from the input data. 
-The timestamp column in our input data is named "ts", containing ISO 8601 timestamps, so let's add a `timestampSpec` with that information to the `parseSpec`: +The timestamp column in our input data is named "ts", containing ISO 8601 timestamps, so let's add a `timestampSpec` with that information to the `dataSchema`: ```json "dataSchema" : { "dataSource" : "ingestion-tutorial", - "parser" : { - "type" : "string", - "parseSpec" : { - "format" : "json", - "timestampSpec" : { - "format" : "iso", - "column" : "ts" - } - } + "timestampSpec" : { + "format" : "iso", + "column" : "ts" } } ``` @@ -146,26 +122,17 @@ When ingesting data, we must consider whether we wish to use rollup or not. For this tutorial, let's enable rollup. This is specified with a `granularitySpec` on the `dataSchema`. -Note that the `granularitySpec` lies outside of the `parser`. We will revisit the `parser` soon when we define our dimensions and metrics. - ```json "dataSchema" : { "dataSource" : "ingestion-tutorial", - "parser" : { - "type" : "string", - "parseSpec" : { - "format" : "json", - "timestampSpec" : { - "format" : "iso", - "column" : "ts" - } - } + "timestampSpec" : { + "format" : "iso", + "column" : "ts" }, "granularitySpec" : { "rollup" : true } } - ``` #### Choosing dimensions and metrics @@ -181,29 +148,23 @@ Let's look at how to define these dimensions and metrics within the ingestion sp #### Dimensions -Dimensions are specified with a `dimensionsSpec` inside the `parseSpec`. +Dimensions are specified with a `dimensionsSpec` inside the `dataSchema`. 
```json "dataSchema" : { "dataSource" : "ingestion-tutorial", - "parser" : { - "type" : "string", - "parseSpec" : { - "format" : "json", - "timestampSpec" : { - "format" : "iso", - "column" : "ts" - }, - "dimensionsSpec" : { - "dimensions": [ - "srcIP", - { "name" : "srcPort", "type" : "long" }, - { "name" : "dstIP", "type" : "string" }, - { "name" : "dstPort", "type" : "long" }, - { "name" : "protocol", "type" : "string" } - ] - } - } + "timestampSpec" : { + "format" : "iso", + "column" : "ts" + }, + "dimensionsSpec" : { + "dimensions": [ + "srcIP", + { "name" : "srcPort", "type" : "long" }, + { "name" : "dstIP", "type" : "string" }, + { "name" : "dstPort", "type" : "long" }, + { "name" : "protocol", "type" : "string" } + ] }, "granularitySpec" : { "rollup" : true @@ -232,24 +193,18 @@ Metrics are specified with a `metricsSpec` inside the `dataSchema`: ```json "dataSchema" : { "dataSource" : "ingestion-tutorial", - "parser" : { - "type" : "string", - "parseSpec" : { - "format" : "json", - "timestampSpec" : { - "format" : "iso", - "column" : "ts" - }, - "dimensionsSpec" : { - "dimensions": [ - "srcIP", - { "name" : "srcPort", "type" : "long" }, - { "name" : "dstIP", "type" : "string" }, - { "name" : "dstPort", "type" : "long" }, - { "name" : "protocol", "type" : "string" } - ] - } - } + "timestampSpec" : { + "format" : "iso", + "column" : "ts" + }, + "dimensionsSpec" : { + "dimensions": [ + "srcIP", + { "name" : "srcPort", "type" : "long" }, + { "name" : "dstIP", "type" : "string" }, + { "name" : "dstPort", "type" : "long" }, + { "name" : "protocol", "type" : "string" } + ] }, "metricsSpec" : [ { "type" : "count", "name" : "count" }, @@ -307,24 +262,18 @@ Segment granularity is configured by the `segmentGranularity` property in the `g ```json "dataSchema" : { "dataSource" : "ingestion-tutorial", - "parser" : { - "type" : "string", - "parseSpec" : { - "format" : "json", - "timestampSpec" : { - "format" : "iso", - "column" : "ts" - }, - "dimensionsSpec" : { - 
"dimensions": [ - "srcIP", - { "name" : "srcPort", "type" : "long" }, - { "name" : "dstIP", "type" : "string" }, - { "name" : "dstPort", "type" : "long" }, - { "name" : "protocol", "type" : "string" } - ] - } - } + "timestampSpec" : { + "format" : "iso", + "column" : "ts" + }, + "dimensionsSpec" : { + "dimensions": [ + "srcIP", + { "name" : "srcPort", "type" : "long" }, + { "name" : "dstIP", "type" : "string" }, + { "name" : "dstPort", "type" : "long" }, + { "name" : "protocol", "type" : "string" } + ] }, "metricsSpec" : [ { "type" : "count", "name" : "count" }, @@ -349,24 +298,18 @@ The query granularity is configured by the `queryGranularity` property in the `g ```json "dataSchema" : { "dataSource" : "ingestion-tutorial", - "parser" : { - "type" : "string", - "parseSpec" : { - "format" : "json", - "timestampSpec" : { - "format" : "iso", - "column" : "ts" - }, - "dimensionsSpec" : { - "dimensions": [ - "srcIP", - { "name" : "srcPort", "type" : "long" }, - { "name" : "dstIP", "type" : "string" }, - { "name" : "dstPort", "type" : "long" }, - { "name" : "protocol", "type" : "string" } - ] - } - } + "timestampSpec" : { + "format" : "iso", + "column" : "ts" + }, + "dimensionsSpec" : { + "dimensions": [ + "srcIP", + { "name" : "srcPort", "type" : "long" }, + { "name" : "dstIP", "type" : "string" }, + { "name" : "dstPort", "type" : "long" }, + { "name" : "protocol", "type" : "string" } + ] }, "metricsSpec" : [ { "type" : "count", "name" : "count" }, @@ -377,7 +320,7 @@ The query granularity is configured by the `queryGranularity` property in the `g "granularitySpec" : { "type" : "uniform", "segmentGranularity" : "HOUR", - "queryGranularity" : "MINUTE" + "queryGranularity" : "MINUTE", "rollup" : true } } @@ -404,24 +347,18 @@ The interval is also specified in the `granularitySpec`: ```json "dataSchema" : { "dataSource" : "ingestion-tutorial", - "parser" : { - "type" : "string", - "parseSpec" : { - "format" : "json", - "timestampSpec" : { - "format" : "iso", - "column" : 
"ts" - }, - "dimensionsSpec" : { - "dimensions": [ - "srcIP", - { "name" : "srcPort", "type" : "long" }, - { "name" : "dstIP", "type" : "string" }, - { "name" : "dstPort", "type" : "long" }, - { "name" : "protocol", "type" : "string" } - ] - } - } + "timestampSpec" : { + "format" : "iso", + "column" : "ts" + }, + "dimensionsSpec" : { + "dimensions": [ + "srcIP", + { "name" : "srcPort", "type" : "long" }, + { "name" : "dstIP", "type" : "string" }, + { "name" : "dstPort", "type" : "long" }, + { "name" : "protocol", "type" : "string" } + ] }, "metricsSpec" : [ { "type" : "count", "name" : "count" }, @@ -447,28 +384,22 @@ The `dataSchema` is shared across all task types, but each task type has its own ```json { - "type" : "index", + "type" : "index_parallel", "spec" : { "dataSchema" : { "dataSource" : "ingestion-tutorial", - "parser" : { - "type" : "string", - "parseSpec" : { - "format" : "json", - "timestampSpec" : { - "format" : "iso", - "column" : "ts" - }, - "dimensionsSpec" : { - "dimensions": [ - "srcIP", - { "name" : "srcPort", "type" : "long" }, - { "name" : "dstIP", "type" : "string" }, - { "name" : "dstPort", "type" : "long" }, - { "name" : "protocol", "type" : "string" } - ] - } - } + "timestampSpec" : { + "format" : "iso", + "column" : "ts" + }, + "dimensionsSpec" : { + "dimensions": [ + "srcIP", + { "name" : "srcPort", "type" : "long" }, + { "name" : "dstIP", "type" : "string" }, + { "name" : "dstPort", "type" : "long" }, + { "name" : "protocol", "type" : "string" } + ] }, "metricsSpec" : [ { "type" : "count", "name" : "count" }, @@ -490,13 +421,13 @@ The `dataSchema` is shared across all task types, but each task type has its own ## Define the input source -Now let's define our input source, which is specified in an `ioConfig` object. Each task type has its own type of `ioConfig`. 
The native batch task uses "firehoses" to read input data, so let's configure a "local" firehose to read the example netflow data we saved earlier: +Now let's define our input source, which is specified in an `ioConfig` object. Each task type has its own type of `ioConfig`. To read input data, we need to specify an `inputSource`. The example netflow data we saved earlier needs to be read from a local file, which is configured below: ```json "ioConfig" : { - "type" : "index", - "firehose" : { + "type" : "index_parallel", + "inputSource" : { "type" : "local", "baseDir" : "quickstart/", "filter" : "ingestion-tutorial-data.json" @@ -504,30 +435,43 @@ Now let's define our input source, which is specified in an `ioConfig` object. E } ``` + +### Define the format of the data + +Since our input data is represented as JSON strings, we'll set the `inputFormat` to `json` format: + +```json + "ioConfig" : { + "type" : "index_parallel", + "inputSource" : { + "type" : "local", + "baseDir" : "quickstart/", + "filter" : "ingestion-tutorial-data.json" + }, + "inputFormat" : { + "type" : "json" + } + } +``` + ```json { - "type" : "index", + "type" : "index_parallel", "spec" : { "dataSchema" : { "dataSource" : "ingestion-tutorial", - "parser" : { - "type" : "string", - "parseSpec" : { - "format" : "json", - "timestampSpec" : { - "format" : "iso", - "column" : "ts" - }, - "dimensionsSpec" : { - "dimensions": [ - "srcIP", - { "name" : "srcPort", "type" : "long" }, - { "name" : "dstIP", "type" : "string" }, - { "name" : "dstPort", "type" : "long" }, - { "name" : "protocol", "type" : "string" } - ] - } - } + "timestampSpec" : { + "format" : "iso", + "column" : "ts" + }, + "dimensionsSpec" : { + "dimensions": [ + "srcIP", + { "name" : "srcPort", "type" : "long" }, + { "name" : "dstIP", "type" : "string" }, + { "name" : "dstPort", "type" : "long" }, + { "name" : "protocol", "type" : "string" } + ] }, "metricsSpec" : [ { "type" : "count", "name" : "count" }, @@ -544,11 +488,14 @@ Now 
let's define our input source, which is specified in an `ioConfig` object. E } }, "ioConfig" : { - "type" : "index", - "firehose" : { + "type" : "index_parallel", + "inputSource" : { "type" : "local", "baseDir" : "quickstart/", "filter" : "ingestion-tutorial-data.json" + }, + "inputFormat" : { + "type" : "json" } } } @@ -563,7 +510,7 @@ As an example, let's add a `tuningConfig` that sets a target segment size for th ```json "tuningConfig" : { - "type" : "index", + "type" : "index_parallel", "maxRowsPerSegment" : 5000000 } ``` @@ -576,28 +523,22 @@ We've finished defining the ingestion spec, it should now look like the followin ```json { - "type" : "index", + "type" : "index_parallel", "spec" : { "dataSchema" : { "dataSource" : "ingestion-tutorial", - "parser" : { - "type" : "string", - "parseSpec" : { - "format" : "json", - "timestampSpec" : { - "format" : "iso", - "column" : "ts" - }, - "dimensionsSpec" : { - "dimensions": [ - "srcIP", - { "name" : "srcPort", "type" : "long" }, - { "name" : "dstIP", "type" : "string" }, - { "name" : "dstPort", "type" : "long" }, - { "name" : "protocol", "type" : "string" } - ] - } - } + "timestampSpec" : { + "format" : "iso", + "column" : "ts" + }, + "dimensionsSpec" : { + "dimensions": [ + "srcIP", + { "name" : "srcPort", "type" : "long" }, + { "name" : "dstIP", "type" : "string" }, + { "name" : "dstPort", "type" : "long" }, + { "name" : "protocol", "type" : "string" } + ] }, "metricsSpec" : [ { "type" : "count", "name" : "count" }, @@ -614,15 +555,18 @@ We've finished defining the ingestion spec, it should now look like the followin } }, "ioConfig" : { - "type" : "index", - "firehose" : { + "type" : "index_parallel", + "inputSource" : { "type" : "local", "baseDir" : "quickstart/", "filter" : "ingestion-tutorial-data.json" + }, + "inputFormat" : { + "type" : "json" } }, "tuningConfig" : { - "type" : "index", + "type" : "index_parallel", "maxRowsPerSegment" : 5000000 } } diff --git a/docs/tutorials/tutorial-rollup.md 
b/docs/tutorials/tutorial-rollup.md index 8b4f1ad1d6f5..79276176925c 100644 --- a/docs/tutorials/tutorial-rollup.md +++ b/docs/tutorials/tutorial-rollup.md @@ -55,25 +55,19 @@ We'll ingest this data using the following ingestion task spec, located at `quic ```json { - "type" : "index", + "type" : "index_parallel", "spec" : { "dataSchema" : { "dataSource" : "rollup-tutorial", - "parser" : { - "type" : "string", - "parseSpec" : { - "format" : "json", - "dimensionsSpec" : { - "dimensions" : [ - "srcIP", - "dstIP" - ] - }, - "timestampSpec": { - "column": "timestamp", - "format": "iso" - } - } + "dimensionsSpec" : { + "dimensions" : [ + "srcIP", + "dstIP" + ] + }, + "timestampSpec": { + "column": "timestamp", + "format": "iso" }, "metricsSpec" : [ { "type" : "count", "name" : "count" }, @@ -89,16 +83,19 @@ We'll ingest this data using the following ingestion task spec, located at `quic } }, "ioConfig" : { - "type" : "index", - "firehose" : { + "type" : "index_parallel", + "inputSource" : { "type" : "local", "baseDir" : "quickstart/tutorial", "filter" : "rollup-data.json" }, + "inputFormat" : { + "type" : "json" + }, "appendToExisting" : false }, "tuningConfig" : { - "type" : "index", + "type" : "index_parallel", "maxRowsPerSegment" : 5000000, "maxRowsInMemory" : 25000 } diff --git a/docs/tutorials/tutorial-transform-spec.md b/docs/tutorials/tutorial-transform-spec.md index c90ca6077e33..35695de2497b 100644 --- a/docs/tutorials/tutorial-transform-spec.md +++ b/docs/tutorials/tutorial-transform-spec.md @@ -48,25 +48,19 @@ We will ingest the sample data using the following spec, which demonstrates the ```json { - "type" : "index", + "type" : "index_parallel", "spec" : { "dataSchema" : { "dataSource" : "transform-tutorial", - "parser" : { - "type" : "string", - "parseSpec" : { - "format" : "json", - "dimensionsSpec" : { - "dimensions" : [ - "animal", - { "name": "location", "type": "long" } - ] - }, - "timestampSpec": { - "column": "timestamp", - "format": "iso" - } - } + 
"timestampSpec": { + "column": "timestamp", + "format": "iso" + }, + "dimensionsSpec" : { + "dimensions" : [ + "animal", + { "name": "location", "type": "long" } + ] }, "metricsSpec" : [ { "type" : "count", "name" : "count" }, @@ -104,16 +98,19 @@ We will ingest the sample data using the following spec, which demonstrates the } }, "ioConfig" : { - "type" : "index", - "firehose" : { + "type" : "index_parallel", + "inputSource" : { "type" : "local", "baseDir" : "quickstart/tutorial", "filter" : "transform-data.json" }, + "inputFormat" : { + "type" :"json" + }, "appendToExisting" : false }, "tuningConfig" : { - "type" : "index", + "type" : "index_parallel", "maxRowsPerSegment" : 5000000, "maxRowsInMemory" : 25000 } diff --git a/examples/quickstart/tutorial/compaction-day-granularity.json b/examples/quickstart/tutorial/compaction-day-granularity.json index 4855821c0646..eb39276d254a 100644 --- a/examples/quickstart/tutorial/compaction-day-granularity.json +++ b/examples/quickstart/tutorial/compaction-day-granularity.json @@ -4,7 +4,7 @@ "interval": "2015-09-12/2015-09-13", "segmentGranularity": "DAY", "tuningConfig" : { - "type" : "index", + "type" : "index_parallel", "maxRowsPerSegment" : 5000000, "maxRowsInMemory" : 25000, "forceExtendableShardSpecs" : true diff --git a/examples/quickstart/tutorial/compaction-init-index.json b/examples/quickstart/tutorial/compaction-init-index.json index b6b59b60550f..f2c00481c36b 100644 --- a/examples/quickstart/tutorial/compaction-init-index.json +++ b/examples/quickstart/tutorial/compaction-init-index.json @@ -1,40 +1,34 @@ { - "type" : "index", + "type" : "index_parallel", "spec" : { "dataSchema" : { "dataSource" : "compaction-tutorial", - "parser" : { - "type" : "string", - "parseSpec" : { - "format" : "json", - "dimensionsSpec" : { - "dimensions" : [ - "channel", - "cityName", - "comment", - "countryIsoCode", - "countryName", - "isAnonymous", - "isMinor", - "isNew", - "isRobot", - "isUnpatrolled", - "metroCode", - "namespace", 
- "page", - "regionIsoCode", - "regionName", - "user", - { "name": "added", "type": "long" }, - { "name": "deleted", "type": "long" }, - { "name": "delta", "type": "long" } - ] - }, - "timestampSpec": { - "column": "time", - "format": "iso" - } - } + "timestampSpec": { + "column": "time", + "format": "iso" + }, + "dimensionsSpec" : { + "dimensions" : [ + "channel", + "cityName", + "comment", + "countryIsoCode", + "countryName", + "isAnonymous", + "isMinor", + "isNew", + "isRobot", + "isUnpatrolled", + "metroCode", + "namespace", + "page", + "regionIsoCode", + "regionName", + "user", + { "name": "added", "type": "long" }, + { "name": "deleted", "type": "long" }, + { "name": "delta", "type": "long" } + ] }, "metricsSpec" : [], "granularitySpec" : { @@ -46,16 +40,19 @@ } }, "ioConfig" : { - "type" : "index", - "firehose" : { + "type" : "index_parallel", + "inputSource" : { "type" : "local", "baseDir" : "quickstart/tutorial/", "filter" : "wikiticker-2015-09-12-sampled.json.gz" }, + "inputFormat" : { + "type" : "json" + }, "appendToExisting" : false }, "tuningConfig" : { - "type" : "index", + "type" : "index_parallel", "maxRowsPerSegment" : 1000 } } diff --git a/examples/quickstart/tutorial/compaction-keep-granularity.json b/examples/quickstart/tutorial/compaction-keep-granularity.json index 6721e7bfcc65..ba76d612bddc 100644 --- a/examples/quickstart/tutorial/compaction-keep-granularity.json +++ b/examples/quickstart/tutorial/compaction-keep-granularity.json @@ -3,7 +3,7 @@ "dataSource": "compaction-tutorial", "interval": "2015-09-12/2015-09-13", "tuningConfig" : { - "type" : "index", + "type" : "index_parallel", "maxRowsPerSegment" : 5000000, "maxRowsInMemory" : 25000 } diff --git a/examples/quickstart/tutorial/deletion-index.json b/examples/quickstart/tutorial/deletion-index.json index 0faf4689803e..d32ddd9ee28f 100644 --- a/examples/quickstart/tutorial/deletion-index.json +++ b/examples/quickstart/tutorial/deletion-index.json @@ -1,40 +1,34 @@ { - "type" : "index", + 
"type" : "index_parallel", "spec" : { "dataSchema" : { "dataSource" : "deletion-tutorial", - "parser" : { - "type" : "string", - "parseSpec" : { - "format" : "json", - "dimensionsSpec" : { - "dimensions" : [ - "channel", - "cityName", - "comment", - "countryIsoCode", - "countryName", - "isAnonymous", - "isMinor", - "isNew", - "isRobot", - "isUnpatrolled", - "metroCode", - "namespace", - "page", - "regionIsoCode", - "regionName", - "user", - { "name": "added", "type": "long" }, - { "name": "deleted", "type": "long" }, - { "name": "delta", "type": "long" } - ] - }, - "timestampSpec": { - "column": "time", - "format": "iso" - } - } + "timestampSpec": { + "column": "time", + "format": "iso" + }, + "dimensionsSpec" : { + "dimensions" : [ + "channel", + "cityName", + "comment", + "countryIsoCode", + "countryName", + "isAnonymous", + "isMinor", + "isNew", + "isRobot", + "isUnpatrolled", + "metroCode", + "namespace", + "page", + "regionIsoCode", + "regionName", + "user", + { "name": "added", "type": "long" }, + { "name": "deleted", "type": "long" }, + { "name": "delta", "type": "long" } + ] }, "metricsSpec" : [], "granularitySpec" : { @@ -46,16 +40,19 @@ } }, "ioConfig" : { - "type" : "index", - "firehose" : { + "type" : "index_parallel", + "inputSource" : { "type" : "local", "baseDir" : "quickstart/tutorial/", "filter" : "wikiticker-2015-09-12-sampled.json.gz" }, + "inputFormat" : { + "type" : "json" + }, "appendToExisting" : false }, "tuningConfig" : { - "type" : "index", + "type" : "index_parallel", "maxRowsPerSegment" : 5000000, "maxRowsInMemory" : 25000 } diff --git a/examples/quickstart/tutorial/retention-index.json b/examples/quickstart/tutorial/retention-index.json index 95416e200791..4c6b33f05c6f 100644 --- a/examples/quickstart/tutorial/retention-index.json +++ b/examples/quickstart/tutorial/retention-index.json @@ -1,40 +1,34 @@ { - "type" : "index", + "type" : "index_parallel", "spec" : { "dataSchema" : { "dataSource" : "retention-tutorial", - "parser" : { - 
"type" : "string", - "parseSpec" : { - "format" : "json", - "dimensionsSpec" : { - "dimensions" : [ - "channel", - "cityName", - "comment", - "countryIsoCode", - "countryName", - "isAnonymous", - "isMinor", - "isNew", - "isRobot", - "isUnpatrolled", - "metroCode", - "namespace", - "page", - "regionIsoCode", - "regionName", - "user", - { "name": "added", "type": "long" }, - { "name": "deleted", "type": "long" }, - { "name": "delta", "type": "long" } - ] - }, - "timestampSpec": { - "column": "time", - "format": "iso" - } - } + "timestampSpec": { + "column": "time", + "format": "iso" + }, + "dimensionsSpec" : { + "dimensions" : [ + "channel", + "cityName", + "comment", + "countryIsoCode", + "countryName", + "isAnonymous", + "isMinor", + "isNew", + "isRobot", + "isUnpatrolled", + "metroCode", + "namespace", + "page", + "regionIsoCode", + "regionName", + "user", + { "name": "added", "type": "long" }, + { "name": "deleted", "type": "long" }, + { "name": "delta", "type": "long" } + ] }, "metricsSpec" : [], "granularitySpec" : { @@ -46,16 +40,19 @@ } }, "ioConfig" : { - "type" : "index", - "firehose" : { + "type" : "index_parallel", + "inputSource" : { "type" : "local", "baseDir" : "quickstart/tutorial/", "filter" : "wikiticker-2015-09-12-sampled.json.gz" }, + "inputFormat" : { + "type" : "json" + }, "appendToExisting" : false }, "tuningConfig" : { - "type" : "index", + "type" : "index_parallel", "maxRowsPerSegment" : 5000000, "maxRowsInMemory" : 25000 } diff --git a/examples/quickstart/tutorial/rollup-index.json b/examples/quickstart/tutorial/rollup-index.json index 2c1426e58fd4..7c0b5815d2ce 100644 --- a/examples/quickstart/tutorial/rollup-index.json +++ b/examples/quickstart/tutorial/rollup-index.json @@ -1,23 +1,17 @@ { - "type" : "index", + "type" : "index_parallel", "spec" : { "dataSchema" : { "dataSource" : "rollup-tutorial", - "parser" : { - "type" : "string", - "parseSpec" : { - "format" : "json", - "dimensionsSpec" : { - "dimensions" : [ - "srcIP", - "dstIP" - ] 
- }, - "timestampSpec": { - "column": "timestamp", - "format": "iso" - } - } + "timestampSpec": { + "column": "timestamp", + "format": "iso" + }, + "dimensionsSpec" : { + "dimensions" : [ + "srcIP", + "dstIP" + ] }, "metricsSpec" : [ { "type" : "count", "name" : "count" }, @@ -33,16 +27,19 @@ } }, "ioConfig" : { - "type" : "index", - "firehose" : { + "type" : "index_parallel", + "inputSource" : { "type" : "local", "baseDir" : "quickstart/tutorial", "filter" : "rollup-data.json" }, + "inputFormat" : { + "type" : "json" + }, "appendToExisting" : false }, "tuningConfig" : { - "type" : "index", + "type" : "index_parallel", "maxRowsPerSegment" : 5000000, "maxRowsInMemory" : 25000 } diff --git a/examples/quickstart/tutorial/transform-index.json b/examples/quickstart/tutorial/transform-index.json index 8d40b1917772..bf605fcfdbf0 100644 --- a/examples/quickstart/tutorial/transform-index.json +++ b/examples/quickstart/tutorial/transform-index.json @@ -1,23 +1,17 @@ { - "type" : "index", + "type" : "index_parallel", "spec" : { "dataSchema" : { "dataSource" : "transform-tutorial", - "parser" : { - "type" : "string", - "parseSpec" : { - "format" : "json", - "dimensionsSpec" : { - "dimensions" : [ - "animal", - { "name": "location", "type": "long" } - ] - }, - "timestampSpec": { - "column": "timestamp", - "format": "iso" - } - } + "timestampSpec": { + "column": "timestamp", + "format": "iso" + }, + "dimensionsSpec" : { + "dimensions" : [ + "animal", + { "name": "location", "type": "long" } + ] }, "metricsSpec" : [ { "type" : "count", "name" : "count" }, @@ -55,16 +49,19 @@ } }, "ioConfig" : { - "type" : "index", - "firehose" : { + "type" : "index_parallel", + "inputSource" : { "type" : "local", "baseDir" : "quickstart/tutorial", "filter" : "transform-data.json" }, + "inputFormat" : { + "type" : "json" + }, "appendToExisting" : false }, "tuningConfig" : { - "type" : "index", + "type" : "index_parallel", "maxRowsPerSegment" : 5000000, "maxRowsInMemory" : 25000 } diff --git 
a/examples/quickstart/tutorial/updates-append-index2.json b/examples/quickstart/tutorial/updates-append-index2.json index 247192a3f5d0..921b8cf0e2d6 100644 --- a/examples/quickstart/tutorial/updates-append-index2.json +++ b/examples/quickstart/tutorial/updates-append-index2.json @@ -1,22 +1,16 @@ { - "type" : "index", + "type" : "index_parallel", "spec" : { "dataSchema" : { "dataSource" : "updates-tutorial", - "parser" : { - "type" : "string", - "parseSpec" : { - "format" : "json", - "dimensionsSpec" : { - "dimensions" : [ - "animal" - ] - }, - "timestampSpec": { - "column": "timestamp", - "format": "iso" - } - } + "timestampSpec": { + "column": "timestamp", + "format": "iso" + }, + "dimensionsSpec" : { + "dimensions" : [ + "animal" + ] }, "metricsSpec" : [ { "type" : "count", "name" : "count" }, @@ -31,16 +25,19 @@ } }, "ioConfig" : { - "type" : "index", - "firehose" : { + "type" : "index_parallel", + "inputSource" : { "type" : "local", "baseDir" : "quickstart/tutorial", "filter" : "updates-data4.json" }, + "inputFormat" : { + "type" : "json" + }, "appendToExisting" : true }, "tuningConfig" : { - "type" : "index", + "type" : "index_parallel", "maxRowsPerSegment" : 5000000, "maxRowsInMemory" : 25000 } diff --git a/examples/quickstart/tutorial/updates-init-index.json b/examples/quickstart/tutorial/updates-init-index.json index 71c449bf59a3..ed4b349c6e02 100644 --- a/examples/quickstart/tutorial/updates-init-index.json +++ b/examples/quickstart/tutorial/updates-init-index.json @@ -1,22 +1,16 @@ { - "type" : "index", + "type" : "index_parallel", "spec" : { "dataSchema" : { "dataSource" : "updates-tutorial", - "parser" : { - "type" : "string", - "parseSpec" : { - "format" : "json", - "dimensionsSpec" : { - "dimensions" : [ - "animal" - ] - }, - "timestampSpec": { - "column": "timestamp", - "format": "iso" - } - } + "timestampSpec": { + "column": "timestamp", + "format": "iso" + }, + "dimensionsSpec" : { + "dimensions" : [ + "animal" + ] }, "metricsSpec" : [ { "type" : 
"count", "name" : "count" }, @@ -31,16 +25,19 @@ } }, "ioConfig" : { - "type" : "index", - "firehose" : { + "type" : "index_parallel", + "inputSource" : { "type" : "local", "baseDir" : "quickstart/tutorial", "filter" : "updates-data.json" }, + "inputFormat" : { + "type" : "json" + }, "appendToExisting" : false }, "tuningConfig" : { - "type" : "index", + "type" : "index_parallel", "maxRowsPerSegment" : 5000000, "maxRowsInMemory" : 25000 } diff --git a/examples/quickstart/tutorial/updates-overwrite-index.json b/examples/quickstart/tutorial/updates-overwrite-index.json index 451750e726ce..b2545f04dd18 100644 --- a/examples/quickstart/tutorial/updates-overwrite-index.json +++ b/examples/quickstart/tutorial/updates-overwrite-index.json @@ -1,22 +1,16 @@ { - "type" : "index", + "type" : "index_parallel", "spec" : { "dataSchema" : { "dataSource" : "updates-tutorial", - "parser" : { - "type" : "string", - "parseSpec" : { - "format" : "json", - "dimensionsSpec" : { - "dimensions" : [ - "animal" - ] - }, - "timestampSpec": { - "column": "timestamp", - "format": "iso" - } - } + "timestampSpec": { + "column": "timestamp", + "format": "iso" + }, + "dimensionsSpec" : { + "dimensions" : [ + "animal" + ] }, "metricsSpec" : [ { "type" : "count", "name" : "count" }, @@ -31,16 +25,19 @@ } }, "ioConfig" : { - "type" : "index", - "firehose" : { + "type" : "index_parallel", + "inputSource" : { "type" : "local", "baseDir" : "quickstart/tutorial", "filter" : "updates-data2.json" }, + "inputFormat" : { + "type" : "json" + }, "appendToExisting" : false }, "tuningConfig" : { - "type" : "index", + "type" : "index_parallel", "maxRowsPerSegment" : 5000000, "maxRowsInMemory" : 25000 } diff --git a/examples/quickstart/tutorial/wikipedia-index.json b/examples/quickstart/tutorial/wikipedia-index.json index 785fbda91679..60d7670a577c 100644 --- a/examples/quickstart/tutorial/wikipedia-index.json +++ b/examples/quickstart/tutorial/wikipedia-index.json @@ -1,40 +1,34 @@ { - "type" : "index", + "type" 
: "index_parallel", "spec" : { "dataSchema" : { "dataSource" : "wikipedia", - "parser" : { - "type" : "string", - "parseSpec" : { - "format" : "json", - "dimensionsSpec" : { - "dimensions" : [ - "channel", - "cityName", - "comment", - "countryIsoCode", - "countryName", - "isAnonymous", - "isMinor", - "isNew", - "isRobot", - "isUnpatrolled", - "metroCode", - "namespace", - "page", - "regionIsoCode", - "regionName", - "user", - { "name": "added", "type": "long" }, - { "name": "deleted", "type": "long" }, - { "name": "delta", "type": "long" } - ] - }, - "timestampSpec": { - "column": "time", - "format": "iso" - } - } + "timestampSpec": { + "column": "time", + "format": "iso" + }, + "dimensionsSpec" : { + "dimensions" : [ + "channel", + "cityName", + "comment", + "countryIsoCode", + "countryName", + "isAnonymous", + "isMinor", + "isNew", + "isRobot", + "isUnpatrolled", + "metroCode", + "namespace", + "page", + "regionIsoCode", + "regionName", + "user", + { "name": "added", "type": "long" }, + { "name": "deleted", "type": "long" }, + { "name": "delta", "type": "long" } + ] }, "metricsSpec" : [], "granularitySpec" : { @@ -46,16 +40,19 @@ } }, "ioConfig" : { - "type" : "index", - "firehose" : { + "type" : "index_parallel", + "inputSource" : { "type" : "local", "baseDir" : "quickstart/tutorial/", "filter" : "wikiticker-2015-09-12-sampled.json.gz" }, + "inputFormat" : { + "type" : "json" + }, "appendToExisting" : false }, "tuningConfig" : { - "type" : "index", + "type" : "index_parallel", "maxRowsPerSegment" : 5000000, "maxRowsInMemory" : 25000 } From 0988dec2d6acec75ec6cea891ac67c06d978c909 Mon Sep 17 00:00:00 2001 From: Suneet Saldanha Date: Wed, 15 Jan 2020 12:00:00 -0800 Subject: [PATCH 2/2] fix typo --- docs/tutorials/tutorial-ingestion-spec.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/tutorial-ingestion-spec.md b/docs/tutorials/tutorial-ingestion-spec.md index 9bc1a65d624a..773b920b34b2 100644 --- 
a/docs/tutorials/tutorial-ingestion-spec.md +++ b/docs/tutorials/tutorial-ingestion-spec.md @@ -421,7 +421,7 @@ The `dataSchema` is shared across all task types, but each task type has its own ## Define the input source -Now let's define our input source, which is specified in an `ioConfig` object. Each task type has its own type of `ioConfig`. To read input data, we need to specify an `inputSource`. The the example netflow data we saved earlier needs to be read from a local file, which is configured below: +Now let's define our input source, which is specified in an `ioConfig` object. Each task type has its own type of `ioConfig`. To read input data, we need to specify an `inputSource`. The example netflow data we saved earlier needs to be read from a local file, which is configured below: ```json