Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions docs/content/misc/math-expr.md
Original file line number Diff line number Diff line change
Expand Up @@ -195,3 +195,17 @@ See javadoc of java.lang.Math for detailed explanation for each function.
| cartesian_fold(lambda,arr1,arr2,...) | folds a multi argument lambda across the cartesian product of all input arrays. The first arguments of the lambda is the array element and the last is the accumulator, returning a single accumulated value. |
| any(lambda,arr) | returns 1 if any element in the array matches the lambda expression, else 0 |
| all(lambda,arr) | returns 1 if all elements in the array matches the lambda expression, else 0 |


## IP Address Functions

For the IPv4 address functions, the `address` argument can either be an IPv4 dotted-decimal string
(e.g., "192.168.0.1") or an IP address represented as a long (e.g., 3232235521). The `subnet`
argument should be a string formatted as an IPv4 address subnet in CIDR notation (e.g.,
"192.168.0.0/16").

| function | description |
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any reason not to use shorter names like ipv4_match, etc?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since it's more convenient to type and it's reasonable that users will understand "ipv4" to mean an "IP address", I'm ok with using the shorter names you suggested.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I was thinking about balancing between clarity and keystrokes. I think the current names are great, thanks.

| --- | --- |
| ipv4_match(address, subnet) | Returns 1 if the `address` belongs to the `subnet` literal, else 0. If `address` is not a valid IPv4 address, then 0 is returned. This function is more efficient if `address` is a long instead of a string.|
| ipv4_parse(address) | Parses `address` into an IPv4 address stored as a long. If `address` is a long that is a valid IPv4 address, then it is passed through. Returns null if `address` cannot be represented as an IPv4 address. |
| ipv4_stringify(address) | Converts `address` into an IPv4 address dotted-decimal string. If `address` is a string that is a valid IPv4 address, then it is passed through. Returns null if `address` cannot be represented as an IPv4 address.|
10 changes: 10 additions & 0 deletions licenses.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,16 @@ libraries:

---

name: Apache Commons Net
license_category: binary
module: java-core
license_name: Apache License version 2.0
version: 3.6
libraries:
- commons-net: commons-net

---

name: Apache Commons Pool
license_category: binary
module: java-core
Expand Down
5 changes: 5 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,11 @@
<artifactId>commons-lang</artifactId>
<version>2.6</version>
</dependency>
<dependency>
<groupId>commons-net</groupId>
<artifactId>commons-net</artifactId>
<version>3.6</version>
</dependency>
<dependency>
<groupId>com.amazonaws</groupId>
<artifactId>aws-java-sdk-ec2</artifactId>
Expand Down
4 changes: 4 additions & 0 deletions processing/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,10 @@
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
</dependency>
<dependency>
<groupId>commons-net</groupId>
<artifactId>commons-net</artifactId>
</dependency>
<dependency>
<groupId>com.google.errorprone</groupId>
<artifactId>error_prone_annotations</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

package org.apache.druid.query.expression;

import com.google.common.base.Preconditions;
import org.apache.druid.common.config.NullHandling;
import org.apache.druid.java.util.common.DateTimes;
import org.apache.druid.java.util.common.IAE;
Expand All @@ -41,7 +42,7 @@ public static Expr.ObjectBinding nilBindings()
return NIL_BINDINGS;
}

public static DateTimeZone toTimeZone(final Expr timeZoneArg)
static DateTimeZone toTimeZone(final Expr timeZoneArg)
{
if (!timeZoneArg.isLiteral()) {
throw new IAE("Time zone must be a literal");
Expand All @@ -51,7 +52,7 @@ public static DateTimeZone toTimeZone(final Expr timeZoneArg)
return literalValue == null ? DateTimeZone.UTC : DateTimes.inferTzFromString((String) literalValue);
}

public static PeriodGranularity toPeriodGranularity(
static PeriodGranularity toPeriodGranularity(
final Expr periodArg,
@Nullable final Expr originArg,
@Nullable final Expr timeZoneArg,
Expand Down Expand Up @@ -87,4 +88,14 @@ public static PeriodGranularity toPeriodGranularity(
return new PeriodGranularity(period, origin, timeZone);
}

static String createErrMsg(String functionName, String msg)
{
String prefix = "Function[" + functionName + "] ";
return prefix + msg;
}

static void checkLiteralArgument(String functionName, Expr arg, String argName)
{
Preconditions.checkArgument(arg.isLiteral(), createErrMsg(functionName, argName + " arg must be a literal"));
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.druid.query.expression;

import com.google.common.net.InetAddresses;

import javax.annotation.Nullable;
import java.net.Inet4Address;
import java.net.InetAddress;
import java.util.regex.Pattern;

class IPv4AddressExprUtils
{
private static final Pattern IPV4_PATTERN = Pattern.compile(
"^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$"
);

/**
* @return True if argument cannot be represented by an unsigned integer (4 bytes), else false
*/
static boolean overflowsUnsignedInt(long value)
{
return value < 0L || 0xff_ff_ff_ffL < value;
}

/**
* @return True if argument is a valid IPv4 address dotted-decimal string
*/
static boolean isValidAddress(@Nullable String string)
{
return string != null && IPV4_PATTERN.matcher(string).matches();
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This'll accept some invalid IP-like strings, such as 300.300.300.300. It seems like it's mostly being used to avoid exception-throwing, so how about changing the name to isPossiblyValidAddress and emphasizing the might-be-false-positive nature of the check?

Or alternatively, making the check exact and keeping the name the same.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah! I'll change the regex to be tighter and add more test cases.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks!

}

@Nullable
static Inet4Address parse(@Nullable String string)
{
// Explicitly check for valid address to avoid overhead of InetAddresses#forString() potentially
// throwing IllegalArgumentException
if (isValidAddress(string)) {
// Do not use java.lang.InetAddress#getByName() as it may do DNS lookups
InetAddress address = InetAddresses.forString(string);
if (address instanceof Inet4Address) {
return (Inet4Address) address;
}
}
return null;
}

static Inet4Address parse(int value)
{
return InetAddresses.fromInteger(value);
}

/**
* @return IPv4 address dotted-decimal notated string
*/
static String toString(Inet4Address address)
{
return address.getHostAddress();
}

static long toLong(Inet4Address address)
{
int value = InetAddresses.coerceToInteger(address);
return Integer.toUnsignedLong(value);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.druid.query.expression;

import org.apache.commons.net.util.SubnetUtils;
import org.apache.druid.java.util.common.IAE;
import org.apache.druid.math.expr.Expr;
import org.apache.druid.math.expr.ExprEval;
import org.apache.druid.math.expr.ExprMacroTable;
import org.apache.druid.math.expr.ExprType;

import javax.annotation.Nonnull;
import java.util.List;

/**
* <pre>
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No reason to include these tags, since we generally write javadocs with reading in source form in mind. (Nobody is looking at the generated docs AFAIK)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The tags also preserve the formatting when invoking "Quick Documentation" in IntelliJ.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, well, good point, in that case feel free to leave them.

* Implements an expression that checks if an IPv4 address belongs to a particular subnet.
*
* Expression signatures:
* - long ipv4_match(string address, string subnet)
* - long ipv4_match(long address, string subnet)
*
* Valid "address" argument formats are:
* - unsigned int long (e.g., 3232235521)
* - IPv4 address dotted-decimal string (e.g., "198.168.0.1")
*
* The argument format for the "subnet" argument should be a literal in CIDR notation
* (e.g., "198.168.0.0/16").
*
* If the "address" argument does not represent an IPv4 address then false is returned.
* </pre>
*
* @see IPv4AddressParseExprMacro
* @see IPv4AddressStringifyExprMacro
*/
public class IPv4AddressMatchExprMacro implements ExprMacroTable.ExprMacro
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any reason these functions defined via ExprMacro in druid-processing instead of in Function.java in druid-core? As far as I can tell this doesn't really rely on anything in the processing module, and is not an extension, so it seems like adding it there instead it would be a much lighter touch approach with a lot less boilerplate code and test surface area.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I made match an ExprMacro so that it could pre-compute the subnet matcher at parse time. parse and stringify don't do any precomputation, but since I wanted to colocate related expressions, I made both of those ExprMacros too.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IsSubnetInfo expensive to compute? Also, is it best that it only takes literal arguments and not be able to write an expression to compute the subnet argument?

There is probably a sort of janky way that stuff could be pre-computed in Function involving using validateArguments since that is called at parse time, but maybe it would be nice to plumb a more official way to do such things in the future.

Side note, please don't mark comments as resolved, leave that for the reviewer who made the comment to do.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Building SubnetInfo requires validating if the subnet argument has a valid (CIDR-notation) format, which only needs to be done once. What kinds of expressions are you thinking of for the subnet argument?

I've undone all the comment threads that I marked as resolved (was using that to keep track of the ones I've addressed in my branch).

Copy link
Copy Markdown
Contributor

@gianm gianm Jul 31, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is probably a sort of janky way that stuff could be pre-computed in Function involving using validateArguments since that is called at parse time, but maybe it would be nice to plumb a more official way to do such things in the future.

The purpose of ExprMacro is to provide a way to precompute certain things before doing the row-by-row evaluation, so IMO it does make more sense to use ExprMacro (which is designed for this) rather than hook into parse-time Function methods.

Any reason these functions defined via ExprMacro in druid-processing instead of in Function.java in druid-core?

It's probably an accident that Function and ExprMacro are in different packages. I added ExprMacro in #4365 and I don't recall having a good reason for having them be in different places. Fwiw, I don't have a strong separation in my mind between the responsibility of processing and core, anyway, largely agreeing with this comment: #4312 (comment)

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The purpose of ExprMacro is to provide a way to precompute certain things before doing the row-by-row evaluation, so IMO it does make more sense to use ExprMacro (which is designed for this) rather than hook into parse-time Function methods.

Thanks for the explanation, 👍 I had stored away in my head for whatever reason that the main purpose of ExprMacro was so extensions could define expressions. I wonder if there is a way we could cut down the amount of code/test that needs to be added for ExprMacro that live in main Druid, because it does seem to take a lot more to add an ExprMacro compared to a Function, but I don't have any good ideas at the moment, nor am I sure that allowing Parser.getFunction to also take the list of args and allow initializing a Function when creating the FunctionExpr would be the appropriate.

{
public static final String NAME = "ipv4_match";
private static final int ARG_SUBNET = 1;

@Override
public String name()
{
return NAME;
}

@Override
public Expr apply(final List<Expr> args)
{
if (args.size() != 2) {
throw new IAE(ExprUtils.createErrMsg(name(), "must have 2 arguments"));
}

SubnetUtils.SubnetInfo subnetInfo = getSubnetInfo(args);
Expr arg = args.get(0);

class IPv4AddressMatchExpr extends ExprMacroTable.BaseScalarUnivariateMacroFunctionExpr
{
private final SubnetUtils.SubnetInfo subnetInfo;

private IPv4AddressMatchExpr(Expr arg, SubnetUtils.SubnetInfo subnetInfo)
{
super(arg);
this.subnetInfo = subnetInfo;
}

@Nonnull
@Override
public ExprEval eval(final ObjectBinding bindings)
{
ExprEval eval = arg.eval(bindings);
boolean match;
switch (eval.type()) {
case STRING:
match = isStringMatch(eval.asString());
break;
case LONG:
match = !eval.isNumericNull() && isLongMatch(eval.asLong());
break;
default:
match = false;
}
return ExprEval.of(match, ExprType.LONG);
}

private boolean isStringMatch(String stringValue)
{
return IPv4AddressExprUtils.isValidAddress(stringValue) && subnetInfo.isInRange(stringValue);
}

private boolean isLongMatch(long longValue)
{
return !IPv4AddressExprUtils.overflowsUnsignedInt(longValue) && subnetInfo.isInRange((int) longValue);
}

@Override
public Expr visit(Shuttle shuttle)
{
Expr newArg = arg.visit(shuttle);
return shuttle.visit(new IPv4AddressMatchExpr(newArg, subnetInfo));
}
}

return new IPv4AddressMatchExpr(arg, subnetInfo);
}

private SubnetUtils.SubnetInfo getSubnetInfo(List<Expr> args)
{
String subnetArgName = "subnet";
Expr arg = args.get(ARG_SUBNET);
ExprUtils.checkLiteralArgument(name(), arg, subnetArgName);
String subnet = (String) arg.getLiteralValue();

SubnetUtils subnetUtils;
try {
subnetUtils = new SubnetUtils(subnet);
}
catch (IllegalArgumentException e) {
throw new IAE(e, ExprUtils.createErrMsg(name(), subnetArgName + " arg has an invalid format: " + subnet));
}
subnetUtils.setInclusiveHostCount(true); // make network and broadcast addresses match

return subnetUtils.getInfo();
}
}
Loading