Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion docs/content/querying/dimensionspecs.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,23 @@ Returns the first matching group for the given regular expression.
If there is no match, it returns the dimension value as is, unless `replaceMissingValues` is set to true (see below).

```json
{ "type" : "regex", "expr" : <regular_expression> }
{
"type" : "regex", "expr" : <regular_expression>,
"replaceMissingValues" : true,
"replaceMissingValuesWith" : "foobar"
}
```

For example, using `"expr" : "(\\w\\w\\w).*"` will transform
`'Monday'`, `'Tuesday'`, `'Wednesday'` into `'Mon'`, `'Tue'`, `'Wed'`.

If the `replaceMissingValues` property is true, the extraction function will transform dimension values that do not match the regex pattern to a user-specified String. Default value is `false`.

The `replaceMissingValuesWith` property sets the String that unmatched dimension values will be replaced with, if `replaceMissingValues` is true. If `replaceMissingValuesWith` is not specified, unmatched dimension values will be replaced with nulls.

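For example, if `expr` is `"^(a\\w+)"` in the example JSON above, a regex that matches values starting with the letter `a`, the extraction function will convert a dimension value like `banana` to `foobar`. (The pattern is searched for anywhere in the value, so without the leading `^` anchor `banana` would match at `anana` and would not be replaced.)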


### Partial Extraction Function

Returns the dimension value unchanged if the regular expression matches, otherwise returns null.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,28 +34,53 @@
public class RegexDimExtractionFn extends DimExtractionFn
{
private static final byte CACHE_TYPE_ID = 0x1;
private static final byte CACHE_KEY_SEPARATOR = (byte) 0xFF;

private final String expr;
private final Pattern pattern;
private final boolean replaceMissingValues;
private final String replaceMissingValuesWith;

/**
 * Creates a regex-based extraction function; also used as the JSON creator.
 *
 * @param expr                     regular expression to apply; must not be null. It is compiled
 *                                 once here and the compiled pattern is kept for later use.
 * @param replaceMissingValues     whether values that do not match should be replaced;
 *                                 a null value is treated as {@code false}
 * @param replaceMissingValuesWith replacement used for non-matching values when replacement is
 *                                 enabled; stored as given and may be null
 */
@JsonCreator
public RegexDimExtractionFn(
@JsonProperty("expr") String expr
@JsonProperty("expr") String expr,
@JsonProperty("replaceMissingValues") Boolean replaceMissingValues,
@JsonProperty("replaceMissingValuesWith") String replaceMissingValuesWith
)
{
// Reject a missing expression up front, before attempting to compile it.
Preconditions.checkNotNull(expr, "expr must not be null");

this.expr = expr;
this.pattern = Pattern.compile(expr);
// The boxed Boolean lets an absent JSON property default to false.
this.replaceMissingValues = replaceMissingValues == null ? false : replaceMissingValues;
this.replaceMissingValuesWith = replaceMissingValuesWith;
}

/**
 * Builds the cache key for this extraction function.
 *
 * Layout, in order: the type id byte, the bytes of {@code expr}, a separator byte,
 * the bytes of {@code replaceMissingValuesWith} (zero bytes when it is null),
 * a separator byte, and a single flag byte that is 1 when
 * {@code replaceMissingValues} is true and 0 otherwise.
 *
 * @return a newly allocated byte array holding the key
 */
@Override
public byte[] getCacheKey()
{
byte[] exprBytes = StringUtils.toUtf8(expr);
return ByteBuffer.allocate(1 + exprBytes.length)
byte[] replaceBytes = replaceMissingValues ? new byte[]{1} : new byte[]{0};
byte[] replaceStrBytes;
// A null replacement contributes no bytes, so it encodes the same as an empty string.
if (replaceMissingValuesWith == null) {
replaceStrBytes = new byte[]{};
} else {
replaceStrBytes = StringUtils.toUtf8(replaceMissingValuesWith);
}

// The leading 1 accounts for the type id byte.
int totalLen = 1
+ exprBytes.length
+ replaceBytes.length
+ replaceStrBytes.length; // fields
totalLen += 2; // separators

return ByteBuffer.allocate(totalLen)
.put(CACHE_TYPE_ID)
.put(exprBytes)
.put(CACHE_KEY_SEPARATOR)
.put(replaceStrBytes)
.put(CACHE_KEY_SEPARATOR)
.put(replaceBytes)
.array();
}

Expand All @@ -65,8 +90,14 @@ public String apply(String dimValue)
if (dimValue == null) {
return null;
}
String retVal;
Matcher matcher = pattern.matcher(dimValue);
return Strings.emptyToNull(matcher.find() ? matcher.group(1) : dimValue);
if (matcher.find()) {
retVal = matcher.group(1);
} else {
retVal = replaceMissingValues ? replaceMissingValuesWith : dimValue;
}
return Strings.emptyToNull(retVal);
}

@JsonProperty("expr")
Expand All @@ -75,6 +106,18 @@ public String getExpr()
return expr;
}

/**
 * Returns the flag serialized as the {@code replaceMissingValues} JSON property.
 *
 * @return the value of the {@code replaceMissingValues} field
 */
@JsonProperty("replaceMissingValues")
public boolean isReplaceMissingValues()
{
  return this.replaceMissingValues;
}

/**
 * Returns the replacement string serialized as the {@code replaceMissingValuesWith} JSON property.
 *
 * @return the value of the {@code replaceMissingValuesWith} field
 */
@JsonProperty("replaceMissingValuesWith")
public String getReplaceMissingValuesWith()
{
  return this.replaceMissingValuesWith;
}

@Override
public boolean preservesOrdering()
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
package io.druid.query.extraction;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Sets;
import io.druid.jackson.DefaultObjectMapper;
import org.junit.Assert;
Expand Down Expand Up @@ -55,59 +56,58 @@ public class RegexDimExtractionFnTest
public void testPathExtraction()
{
String regex = "/([^/]+)/";
ExtractionFn extractionFn = new RegexDimExtractionFn(regex);
Set<String> extracted = Sets.newHashSet();
ExtractionFn extractionFn = new RegexDimExtractionFn(regex, false, null);
Set<String> extracted = Sets.newLinkedHashSet();

for (String path : paths) {
extracted.add(extractionFn.apply(path));
}

Assert.assertEquals(2, extracted.size());
Assert.assertTrue(extracted.contains("druid"));
Assert.assertTrue(extracted.contains("dash"));
Set<String> expected = Sets.newLinkedHashSet(ImmutableList.of("druid", "dash"));
Assert.assertEquals(expected, extracted);
}

/**
 * Applies a regex whose first group captures the first two slash-separated segments
 * after the leading slash to every entry in {@code paths}, and checks the distinct
 * extracted values.
 */
@Test
public void testDeeperPathExtraction()
{
String regex = "^/([^/]+/[^/]+)(/|$)";
ExtractionFn extractionFn = new RegexDimExtractionFn(regex);
Set<String> extracted = Sets.newHashSet();
ExtractionFn extractionFn = new RegexDimExtractionFn(regex, false, null);
Set<String> extracted = Sets.newLinkedHashSet();

// Collecting into a set de-duplicates the extracted values.
for (String path : paths) {
extracted.add(extractionFn.apply(path));
}

Assert.assertEquals(4, extracted.size());
Assert.assertTrue(extracted.contains("druid/prod"));
Assert.assertTrue(extracted.contains("druid/demo"));
Assert.assertTrue(extracted.contains("dash/aloe"));
Assert.assertTrue(extracted.contains("dash/baloo"));
Set<String> expected = Sets.newLinkedHashSet(
ImmutableList.of(
"druid/prod", "druid/demo",
"dash/aloe", "dash/baloo"
)
);
Assert.assertEquals(expected, extracted);
}

/**
 * Applies a regex whose first group captures a single character to every entry in
 * {@code testStrings}, and checks the distinct extracted values.
 */
@Test
public void testStringExtraction()
{
String regex = "(.)";
ExtractionFn extractionFn = new RegexDimExtractionFn(regex);
Set<String> extracted = Sets.newHashSet();
ExtractionFn extractionFn = new RegexDimExtractionFn(regex, false, null);
Set<String> extracted = Sets.newLinkedHashSet();

// Collecting into a set de-duplicates the extracted values.
for (String testString : testStrings) {
extracted.add(extractionFn.apply(testString));
}

Assert.assertEquals(3, extracted.size());
Assert.assertTrue(extracted.contains("a"));
Assert.assertTrue(extracted.contains("b"));
Assert.assertTrue(extracted.contains("c"));
Set<String> expected = Sets.newLinkedHashSet(ImmutableList.of("a", "b", "c"));
Assert.assertEquals(expected, extracted);
}


@Test
public void testNullAndEmpty()
{
String regex = "(.*)/.*/.*";
ExtractionFn extractionFn = new RegexDimExtractionFn(regex);
ExtractionFn extractionFn = new RegexDimExtractionFn(regex, false, null);
// no match, map empty input value to null
Assert.assertEquals(null, extractionFn.apply(""));
// null value, returns null
Expand All @@ -116,14 +116,54 @@ public void testNullAndEmpty()
Assert.assertEquals(null, extractionFn.apply("/a/b"));
}

/**
 * Verifies replacement of non-matching values, first with an explicit replacement
 * string and then with a null replacement, and pins the cache key produced in each case.
 */
@Test
public void testMissingValueReplacement()
{
  final String pattern = "(a\\w*)";

  // Case 1: non-matching values are replaced with "foobar".
  final ExtractionFn withReplacement = new RegexDimExtractionFn(pattern, true, "foobar");
  final Set<String> replacedResults = Sets.newLinkedHashSet();
  for (String input : testStrings) {
    replacedResults.add(withReplacement.apply(input));
  }
  Assert.assertEquals(
      Sets.newLinkedHashSet(ImmutableList.of("apple", "awesome", "asylum", "foobar")),
      replacedResults
  );

  // Key layout: type id, the expression bytes, 0xFF, the replacement bytes, 0xFF, flag byte.
  Assert.assertArrayEquals(
      new byte[]{
          0x01, 0x28, 0x61, 0x5C, 0x77, 0x2A, 0x29, (byte) 0xFF,
          0x66, 0x6F, 0x6F, 0x62, 0x61, 0x72, (byte) 0xFF, 0x01
      },
      withReplacement.getCacheKey()
  );

  // Case 2: replacement enabled but no replacement string given, so non-matching values become null.
  final ExtractionFn withNullReplacement = new RegexDimExtractionFn(pattern, true, null);
  final Set<String> nulledResults = Sets.newLinkedHashSet();
  for (String input : testStrings) {
    nulledResults.add(withNullReplacement.apply(input));
  }
  // null is added separately instead of being listed in ImmutableList.of(...).
  final Set<String> expectedWithNull = Sets.newLinkedHashSet(ImmutableList.of("apple", "awesome", "asylum"));
  expectedWithNull.add(null);
  Assert.assertEquals(expectedWithNull, nulledResults);

  // With a null replacement the two separators are adjacent in the key.
  Assert.assertArrayEquals(
      new byte[]{0x01, 0x28, 0x61, 0x5C, 0x77, 0x2A, 0x29, (byte) 0xFF, (byte) 0xFF, 0x01},
      withNullReplacement.getCacheKey()
  );
}

@Test
public void testSerde() throws Exception
{
final ObjectMapper objectMapper = new DefaultObjectMapper();
final String json = "{ \"type\" : \"regex\", \"expr\" : \".(...)?\" }";
final String json = "{ \"type\" : \"regex\", \"expr\" : \".(...)?\" , " +
"\"replaceMissingValues\": true, \"replaceMissingValuesWith\":\"foobar\"}";
RegexDimExtractionFn extractionFn = (RegexDimExtractionFn) objectMapper.readValue(json, ExtractionFn.class);

Assert.assertEquals(".(...)?", extractionFn.getExpr());
Assert.assertTrue(extractionFn.isReplaceMissingValues());
Assert.assertEquals("foobar", extractionFn.getReplaceMissingValuesWith());

// round trip
Assert.assertEquals(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -730,7 +730,7 @@ public void testGroupByWithCardinality()
@Test
public void testGroupByWithNullProducingDimExtractionFn()
{
final ExtractionFn nullExtractionFn = new RegexDimExtractionFn("(\\w{1})")
final ExtractionFn nullExtractionFn = new RegexDimExtractionFn("(\\w{1})", false, null)
{
@Override
public byte[] getCacheKey()
Expand Down Expand Up @@ -797,7 +797,7 @@ public String apply(String dimValue)
*/
public void testGroupByWithEmptyStringProducingDimExtractionFn()
{
final ExtractionFn emptyStringExtractionFn = new RegexDimExtractionFn("(\\w{1})")
final ExtractionFn emptyStringExtractionFn = new RegexDimExtractionFn("(\\w{1})", false, null)
{
@Override
public byte[] getCacheKey()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1519,7 +1519,7 @@ public void testTopNCollapsingDimExtraction()
.dimension(
new ExtractionDimensionSpec(
QueryRunnerTestHelper.qualityDimension, QueryRunnerTestHelper.qualityDimension,
new RegexDimExtractionFn(".(.)"), null
new RegexDimExtractionFn(".(.)", false, null), null
)
)
.metric("index")
Expand Down Expand Up @@ -1568,7 +1568,7 @@ public void testTopNDimExtraction()
new ExtractionDimensionSpec(
QueryRunnerTestHelper.marketDimension,
QueryRunnerTestHelper.marketDimension,
new RegexDimExtractionFn("(.)"),
new RegexDimExtractionFn("(.)", false, null),
null
)
)
Expand Down Expand Up @@ -2074,7 +2074,7 @@ public void testTopNLexicographicDimExtraction()
new ExtractionDimensionSpec(
QueryRunnerTestHelper.marketDimension,
QueryRunnerTestHelper.marketDimension,
new RegexDimExtractionFn("(.)"),
new RegexDimExtractionFn("(.)", false, null),
null
)
)
Expand Down Expand Up @@ -2128,7 +2128,7 @@ public void testInvertedTopNLexicographicDimExtraction2()
new ExtractionDimensionSpec(
QueryRunnerTestHelper.marketDimension,
QueryRunnerTestHelper.marketDimension,
new RegexDimExtractionFn("..(.)"),
new RegexDimExtractionFn("..(.)", false, null),
null
)
)
Expand Down Expand Up @@ -2182,7 +2182,7 @@ public void testTopNLexicographicDimExtractionWithPreviousStop()
new ExtractionDimensionSpec(
QueryRunnerTestHelper.marketDimension,
QueryRunnerTestHelper.marketDimension,
new RegexDimExtractionFn("(.)"),
new RegexDimExtractionFn("(.)", false, null),
null
)
)
Expand Down Expand Up @@ -2300,7 +2300,7 @@ public void testInvertedTopNLexicographicDimExtractionWithPreviousStop()
new ExtractionDimensionSpec(
QueryRunnerTestHelper.marketDimension,
QueryRunnerTestHelper.marketDimension,
new RegexDimExtractionFn("(.)"),
new RegexDimExtractionFn("(.)", false, null),
null
)
)
Expand Down Expand Up @@ -2347,7 +2347,7 @@ public void testInvertedTopNLexicographicDimExtractionWithPreviousStop2()
new ExtractionDimensionSpec(
QueryRunnerTestHelper.marketDimension,
QueryRunnerTestHelper.marketDimension,
new RegexDimExtractionFn("..(.)"),
new RegexDimExtractionFn("..(.)", false, null),
null
)
)
Expand Down