diff --git a/pom.xml b/pom.xml index 91e56f15..bd09c711 100644 --- a/pom.xml +++ b/pom.xml @@ -15,7 +15,8 @@ ~ limitations under the License. --> - + 4.0.0 @@ -107,6 +108,11 @@ joda-time 1.6 + + org.mozilla + rhino + 1.7R5 + diff --git a/src/main/java/com/metamx/common/parsers/JavaScriptParser.java b/src/main/java/com/metamx/common/parsers/JavaScriptParser.java new file mode 100644 index 00000000..f8d0a441 --- /dev/null +++ b/src/main/java/com/metamx/common/parsers/JavaScriptParser.java @@ -0,0 +1,102 @@ +/* +* Licensed to Metamarkets Group Inc. (Metamarkets) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. Metamarkets licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +*/ + +package com.metamx.common.parsers; + +import com.google.common.base.Function; +import org.mozilla.javascript.Context; +import org.mozilla.javascript.ContextFactory; +import org.mozilla.javascript.ScriptableObject; + +import java.util.List; +import java.util.Map; + +/** + */ +public class JavaScriptParser implements Parser +{ + private static Function compile(String function) + { + final ContextFactory contextFactory = ContextFactory.getGlobal(); + final Context context = contextFactory.enterContext(); + context.setOptimizationLevel(9); + + final ScriptableObject scope = context.initStandardObjects(); + + final org.mozilla.javascript.Function fn = context.compileFunction(scope, function, "fn", 1, null); + Context.exit(); + + return new Function() + { + public Object apply(Object input) + { + // ideally we need a close() function to discard the context once it is not used anymore + Context cx = Context.getCurrentContext(); + if (cx == null) { + cx = contextFactory.enterContext(); + } + + final Object res = fn.call(cx, scope, scope, new Object[]{input}); + return res != null ? Context.toObject(res, scope) : null; + } + }; + } + + private final Function fn; + + public JavaScriptParser( + final String function + ) + { + this.fn = compile(function); + } + + public Function getFn() + { + return fn; + } + + @Override + public Map parse(String input) + { + try { + final Object compiled = fn.apply(input); + if (!(compiled instanceof Map)) { + throw new ParseException("JavaScript parsed value must be in {key: value} format!"); + } + + return (Map) compiled; + } + catch (Exception e) { + throw new ParseException(e, "Unable to parse row [%s]", input); + } + } + + @Override + public void setFieldNames(Iterable fieldNames) + { + throw new UnsupportedOperationException(); + } + + @Override + public List getFieldNames() + { + throw new UnsupportedOperationException(); + } +} diff --git a/src/main/java/com/metamx/common/parsers/RegexParser.java b/src/main/java/com/metamx/common/parsers/RegexParser.java new file mode 100644 index 00000000..811a2dc4 --- /dev/null +++ b/src/main/java/com/metamx/common/parsers/RegexParser.java @@ -0,0 +1,125 @@ +/* +* Licensed to Metamarkets Group Inc. (Metamarkets) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. Metamarkets licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +*/ + +package com.metamx.common.parsers; + +import com.google.common.base.Function; +import com.google.common.base.Optional; +import com.google.common.base.Splitter; +import com.google.common.collect.Iterables; +import com.google.common.collect.Lists; +import com.metamx.common.collect.Utils; + +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + */ +public class RegexParser implements Parser +{ + private final String pattern; + private final Splitter listSplitter; + private final Function valueFunction; + private final Pattern compiled; + + private List fieldNames = null; + + public RegexParser( + final String pattern, + final Optional listDelimiter + ) + { + this.pattern = pattern; + this.listSplitter = Splitter.onPattern(listDelimiter.isPresent() + ? listDelimiter.get() + : Parsers.DEFAULT_LIST_DELIMITER); + this.valueFunction = new Function() + { + @Override + public Object apply(String input) + { + final List retVal = Lists.newArrayList( + Iterables.transform( + listSplitter.split(input), + ParserUtils.nullEmptyStringFunction + ) + ); + if (retVal.size() == 1) { + return retVal.get(0); + } else { + return retVal; + } + } + }; + this.compiled = Pattern.compile(pattern); + } + + public RegexParser( + final String pattern, + final Optional listDelimiter, + final Iterable fieldNames + ) + { + this(pattern, listDelimiter); + + setFieldNames(fieldNames); + } + + @Override + public Map parse(String input) + { + try { + final Matcher matcher = compiled.matcher(input); + + if (!matcher.matches()) { + throw new ParseException("Incorrect Regex: %s . No match found.", pattern); + } + + List values = Lists.newArrayList(); + for (int i = 1; i <= matcher.groupCount(); i++) { + values.add(matcher.group(i)); + } + + if (fieldNames == null) { + setFieldNames(ParserUtils.generateFieldNames(values.size())); + } + + return Utils.zipMapPartial(fieldNames, Iterables.transform(values, valueFunction)); + } + catch (Exception e) { + throw new ParseException(e, "Unable to parse row [%s]", input); + } + } + + @Override + public void setFieldNames(Iterable fieldNames) + { + ParserUtils.validateFields(fieldNames); + this.fieldNames = Lists.newArrayList(fieldNames); + } + + @Override + + public List getFieldNames() + { + return fieldNames; + } +} diff --git a/src/test/java/com/metamx/common/parsers/JavaScriptParserTest.java b/src/test/java/com/metamx/common/parsers/JavaScriptParserTest.java new file mode 100644 index 00000000..9fa7573a --- /dev/null +++ b/src/test/java/com/metamx/common/parsers/JavaScriptParserTest.java @@ -0,0 +1,81 @@ +/* +* Licensed to Metamarkets Group Inc. (Metamarkets) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. Metamarkets licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +*/ + +package com.metamx.common.parsers; + +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Lists; +import junit.framework.Assert; +import org.junit.Test; + +import java.util.Map; + +/** + */ +public class JavaScriptParserTest +{ + @Test + public void testParse() + { + final String function = "function(str) { var parts = str.split(\"-\"); return { one: parts[0], two: parts[1] } }"; + + final Parser parser = new JavaScriptParser( + function + ); + String data = "foo-val1"; + + final Map parsed = parser.parse(data); + ImmutableMap.Builder builder = ImmutableMap.builder(); + builder.put("one", "foo"); + builder.put("two", "val1"); + Assert.assertEquals( + "result", + builder.build(), + parsed + ); + } + + @Test + public void testParseWithMultiVal() + { + final String function = "function(str) { var parts = str.split(\"-\"); return { one: [parts[0], parts[1]] } }"; + + final Parser parser = new JavaScriptParser( + function + ); + String data = "val1-val2"; + + final Map parsed = parser.parse(data); + ImmutableMap.Builder builder = ImmutableMap.builder(); + builder.put("one", Lists.newArrayList("val1", "val2")); + Assert.assertEquals( + "result", + builder.build(), + parsed + ); + } + + @Test(expected = org.mozilla.javascript.EvaluatorException.class) + public void testFailure() + { + final String function = "i am bad javascript"; + + new JavaScriptParser(function); + } +} diff --git a/src/test/java/com/metamx/common/parsers/RegexParserTest.java b/src/test/java/com/metamx/common/parsers/RegexParserTest.java new file mode 100644 index 00000000..07addda8 --- /dev/null +++ b/src/test/java/com/metamx/common/parsers/RegexParserTest.java @@ -0,0 +1,231 @@ +/* +* Licensed to Metamarkets Group Inc. (Metamarkets) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. Metamarkets licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +*/ + +package com.metamx.common.parsers; + +import com.google.common.base.Optional; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Lists; +import junit.framework.Assert; +import org.junit.Test; + +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +/** + */ +public class RegexParserTest +{ + @Test + public void testAWSLog() + { + final String pattern = "^([0-9a-f]+) ([\\w.-]+) \\[([\\w\\/: +-]+)\\] ([\\d.]+) ([^\\s]+) ([\\w]+) ([\\w.-]+) ([^\\s\"]+) \"([^\"]*)\" ([\\d-]+) ([\\w-]+) ([\\d-]+) ([\\d-]+) ([\\d-]+) ([\\d-]+) \"(.+)\" \"(.+)\" ([\\w-]+)$"; + + final List fieldNames = Arrays.asList( + "Bucket Owner", + "Bucket", + "Time", + "Remote IP", + "Requester", + "Request ID", + "Operation", + "Key", + "Request-URI", + "HTTP status", + "Error Code", + "Bytes Sent", + "Object Size", + "Total Time", + "Turn-Around Time", + "Referrer", + "User-Agent", + "Version ID" + ); + + final Parser parser = new RegexParser( + pattern, + Optional.absent(), + fieldNames + ); + String data = "79a59df900b949e55d96a1e698fbacedfd6e09d98eacf8f8d5218e7cd47ef2be mybucket [06/Feb/2014:00:00:38 +0000] 192.0.2.3 79a59df900b949e55d96a1e698fbacedfd6e09d98eacf8f8d5218e7cd47ef2be 3E57427F3EXAMPLE REST.GET.VERSIONING - \"GET /mybucket?versioning HTTP/1.1\" 200 - 113 - 7 - \"-\" \"S3Console/0.4\" -"; + + final Map parsed = parser.parse(data); + ImmutableMap.Builder builder = ImmutableMap.builder(); + builder.put("Bucket Owner", "79a59df900b949e55d96a1e698fbacedfd6e09d98eacf8f8d5218e7cd47ef2be"); + builder.put("Bucket", "mybucket"); + builder.put("Time", "06/Feb/2014:00:00:38 +0000"); + builder.put("Remote IP", "192.0.2.3"); + builder.put("Requester", "79a59df900b949e55d96a1e698fbacedfd6e09d98eacf8f8d5218e7cd47ef2be"); + builder.put("Request ID", "3E57427F3EXAMPLE"); + builder.put("Operation", "REST.GET.VERSIONING"); + builder.put("Key", "-"); + builder.put("Request-URI", "GET /mybucket?versioning HTTP/1.1"); + builder.put("HTTP status", "200"); + builder.put("Error Code", "-"); + builder.put("Bytes Sent", "113"); + builder.put("Object Size", "-"); + builder.put("Total Time", "7"); + builder.put("Turn-Around Time", "-"); + builder.put("Referrer", "-"); + builder.put("User-Agent", "S3Console/0.4"); + builder.put("Version ID", "-"); + + Assert.assertEquals( + "result", + builder.build(), + parsed + ); + } + + @Test + public void testAWSLogWithCrazyUserAgent() + { + final String pattern = "^([0-9a-f]+) ([\\w.-]+) \\[([\\w\\/: +-]+)\\] ([\\d.]+) ([^\\s]+) ([\\w]+) ([\\w.-]+) ([^\\s\"]+) \"([^\"]*)\" ([\\d-]+) ([\\w-]+) ([\\d-]+) ([\\d-]+) ([\\d-]+) ([\\d-]+) \"(.+)\" \"(.+)\" ([\\w-]+)$"; + + final List fieldNames = Arrays.asList( + "Bucket Owner", + "Bucket", + "Time", + "Remote IP", + "Requester", + "Request ID", + "Operation", + "Key", + "Request-URI", + "HTTP status", + "Error Code", + "Bytes Sent", + "Object Size", + "Total Time", + "Turn-Around Time", + "Referrer", + "User-Agent", + "Version ID" + ); + + final Parser parser = new RegexParser( + pattern, + Optional.absent(), + fieldNames + ); + String data = "79a59df900b949e55d96a1e698fbacedfd6e09d98eacf8f8d5218e7cd47ef2be mybucket [06/Feb/2014:00:01:00 +0000] 192.0.2.3 79a59df900b949e55d96a1e698fbacedfd6e09d98eacf8f8d5218e7cd47ef2be 7B4A0FABBEXAMPLE REST.GET.VERSIONING - \"GET /mybucket?versioning HTTP/1.1\" 200 - 139 139 27 26 \"-\" \"() { foo;};echo; /bin/bash -c \"expr 299663299665 / 3; echo 333:; uname -a; echo 333:; id;\"\" -"; + + final Map parsed = parser.parse(data); + ImmutableMap.Builder builder = ImmutableMap.builder(); + builder.put("Bucket Owner", "79a59df900b949e55d96a1e698fbacedfd6e09d98eacf8f8d5218e7cd47ef2be"); + builder.put("Bucket", "mybucket"); + builder.put("Time", "06/Feb/2014:00:01:00 +0000"); + builder.put("Remote IP", "192.0.2.3"); + builder.put("Requester", "79a59df900b949e55d96a1e698fbacedfd6e09d98eacf8f8d5218e7cd47ef2be"); + builder.put("Request ID", "7B4A0FABBEXAMPLE"); + builder.put("Operation", "REST.GET.VERSIONING"); + builder.put("Key", "-"); + builder.put("Request-URI", "GET /mybucket?versioning HTTP/1.1"); + builder.put("HTTP status", "200"); + builder.put("Error Code", "-"); + builder.put("Bytes Sent", "139"); + builder.put("Object Size", "139"); + builder.put("Total Time", "27"); + builder.put("Turn-Around Time", "26"); + builder.put("Referrer", "-"); + builder.put( + "User-Agent", + "() { foo;};echo; /bin/bash -c \"expr 299663299665 / 3; echo 333:; uname -a; echo 333:; id;\"" + ); + builder.put("Version ID", "-"); + + Assert.assertEquals( + "result", + builder.build(), + parsed + ); + } + + @Test + public void testMultiVal() + { + final String pattern = "^([0-9a-f]+) (.*)"; + + final List fieldNames = Arrays.asList( + "Bucket Owner", + "Bucket" + ); + + final Parser parser = new RegexParser( + pattern, + Optional.of("@"), + fieldNames + ); + String data = "79a59df900b949e55d96a1e698fbacedfd6e09d98eacf8f8d5218e7cd47ef2be mybucket@mybucket2"; + + final Map parsed = parser.parse(data); + ImmutableMap.Builder builder = ImmutableMap.builder(); + builder.put("Bucket Owner", "79a59df900b949e55d96a1e698fbacedfd6e09d98eacf8f8d5218e7cd47ef2be"); + builder.put("Bucket", Lists.newArrayList("mybucket", "mybucket2")); + + Assert.assertEquals( + "result", + builder.build(), + parsed + ); + } + + @Test + public void testMultiValWithRegexSplit() + { + final String pattern = "(.*)"; + final String listPattern = "[a-f]"; + + final Parser parser = new RegexParser( + pattern, + Optional.of(listPattern) + ); + String data = "1a2"; + + final Map parsed = parser.parse(data); + ImmutableMap.Builder builder = ImmutableMap.builder(); + builder.put("column_1", Lists.newArrayList("1", "2")); + + Assert.assertEquals( + "result", + builder.build(), + parsed + ); + } + + @Test(expected = ParseException.class) + public void testFailure() + { + final String pattern = "AAAAA"; + + final List fieldNames = Arrays.asList( + "dummy" + ); + + final Parser parser = new RegexParser( + pattern, + Optional.of("@"), + fieldNames + ); + String data = "BBBB"; + + parser.parse(data); + } +}