From 6f7da13237dc10d058d9df7c9713927ee14f7f45 Mon Sep 17 00:00:00 2001 From: jon-wei Date: Tue, 10 Nov 2015 13:02:13 -0800 Subject: [PATCH] Add JsonPath parser --- pom.xml | 5 + .../metamx/common/parsers/JSONPathParser.java | 272 ++++++++++++++++++ .../common/parsers/JSONPathParserTest.java | 212 ++++++++++++++ 3 files changed, 489 insertions(+) create mode 100644 src/main/java/com/metamx/common/parsers/JSONPathParser.java create mode 100644 src/test/java/com/metamx/common/parsers/JSONPathParserTest.java diff --git a/pom.xml b/pom.xml index bd09c711..1c2a0d0d 100644 --- a/pom.xml +++ b/pom.xml @@ -113,6 +113,11 @@ rhino 1.7R5 + + com.jayway.jsonpath + json-path + 2.0.0 + diff --git a/src/main/java/com/metamx/common/parsers/JSONPathParser.java b/src/main/java/com/metamx/common/parsers/JSONPathParser.java new file mode 100644 index 00000000..b6cbf499 --- /dev/null +++ b/src/main/java/com/metamx/common/parsers/JSONPathParser.java @@ -0,0 +1,272 @@ +/* + * Licensed to Metamarkets Group Inc. (Metamarkets) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Metamarkets licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package com.metamx.common.parsers; + +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.base.Charsets; +import com.jayway.jsonpath.Configuration; +import com.jayway.jsonpath.JsonPath; +import com.jayway.jsonpath.Option; +import com.metamx.common.Pair; +import com.metamx.common.StringUtils; + +import java.math.BigInteger; +import java.nio.charset.CharsetEncoder; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +/** + * JSON parser class that uses the JsonPath library to access fields via path expressions. + */ +public class JSONPathParser implements Parser +{ + private final Map> fieldPathMap; + private final List fieldSpecs; + private final boolean useFieldDiscovery; + private final ObjectMapper mapper; + private final CharsetEncoder enc = Charsets.UTF_8.newEncoder(); + private final Configuration jsonPathConfig; + + /** + * Constructor + * + * @param fieldSpecs List of field specifications. + * @param useFieldDiscovery If true, automatically add root fields seen in the JSON document to the parsed object Map. + * Only fields that contain a singular value or flat list (list containing no subobjects or lists) are automatically added. + * @param mapper Optionally provide an ObjectMapper, used by the parser for reading the input JSON. + */ + public JSONPathParser(List fieldSpecs, boolean useFieldDiscovery, ObjectMapper mapper) + { + this.fieldSpecs = fieldSpecs; + this.fieldPathMap = generateFieldPaths(fieldSpecs); + this.useFieldDiscovery = useFieldDiscovery; + this.mapper = mapper == null ? new ObjectMapper() : mapper; + this.jsonPathConfig = Configuration.defaultConfiguration().addOptions(Option.SUPPRESS_EXCEPTIONS); + } + + @Override + public List getFieldNames() + { + return null; + } + + @Override + public void setFieldNames(Iterable fieldNames) + { + } + + /** + * + * @param input JSON string. The root must be a JSON object, not an array. + * e.g., {"valid": "true"} and {"valid":[1,2,3]} are supported + * but [{"invalid": "true"}] and [1,2,3] are not. + * @return A map of field names and values + */ + @Override + public Map parse(String input) + { + try { + Map map = new LinkedHashMap<>(); + Map document = mapper.readValue(input, new TypeReference>() {}); + for (Map.Entry> entry : fieldPathMap.entrySet()) { + String fieldName = entry.getKey(); + Pair pair = entry.getValue(); + JsonPath path = pair.rhs; + Object parsedVal; + if (pair.lhs == FieldType.ROOT) { + parsedVal = document.get(fieldName); + } else { + parsedVal = path.read(document, jsonPathConfig); + } + if (parsedVal == null) { + continue; + } + parsedVal = valueConversionFunction(parsedVal); + map.put(fieldName, parsedVal); + } + if (useFieldDiscovery) { + discoverFields(map, document); + } + return map; + } + catch (Exception e) { + throw new ParseException(e, "Unable to parse row [%s]", input); + } + } + + private Map> generateFieldPaths(List fieldSpecs) + { + Map> map = new LinkedHashMap<>(); + for (FieldSpec fieldSpec : fieldSpecs) { + String fieldName = fieldSpec.getName(); + if(map.get(fieldName) != null) { + throw new IllegalArgumentException("Cannot have duplicate field definition: " + fieldName); + } + JsonPath path = JsonPath.compile(fieldSpec.getExpr()); + Pair pair = new Pair<>(fieldSpec.getType(), path); + map.put(fieldName, pair); + } + return map; + } + + private void discoverFields(Map map, Map document) + { + for (String field : document.keySet()) { + if (!map.containsKey(field)) { + Object val = document.get(field); + if (val == null) { + continue; + } + if (val instanceof Map) { + continue; + } + if (val instanceof List) { + if (!isFlatList((List) val)) { + continue; + } + } + val = valueConversionFunction(val); + map.put(field, val); + } + } + } + + private Object valueConversionFunction(Object val) + { + if (val instanceof Integer) { + return Long.valueOf((Integer) val); + } + + if (val instanceof BigInteger) { + return Double.valueOf(((BigInteger) val).doubleValue()); + } + + if (val instanceof String) { + return charsetFix((String) val); + } + + if (val instanceof List) { + List newList = new ArrayList<>(); + for(Object entry : ((List) val)) { + newList.add(valueConversionFunction(entry)); + } + return newList; + } + + if (val instanceof Map) { + Map newMap = new LinkedHashMap<>(); + Map valMap = (Map) val; + for(Map.Entry entry : valMap.entrySet()) { + newMap.put(entry.getKey(), valueConversionFunction(entry.getValue())); + } + return newMap; + } + + return val; + } + + private String charsetFix(String s) + { + if (s != null && !enc.canEncode(s)) { + // Some whacky characters are in this string (e.g. \uD900). These are problematic because they are decodeable + // by new String(...) but will not encode into the same character. This dance here will replace these + // characters with something more sane. + return StringUtils.fromUtf8(StringUtils.toUtf8(s)); + } else { + return s; + } + } + + private boolean isFlatList(List list) + { + for (Object obj : list) { + if ((obj instanceof Map) || (obj instanceof List)) { + return false; + } + } + return true; + } + + /** + * Specifies access behavior for a field. + */ + public enum FieldType + { + /** + * A ROOT field is read directly from the JSON document root without using the JsonPath library. + */ + ROOT, + + /** + * A PATH field uses a JsonPath expression to retrieve the field value + */ + PATH; + } + + /** + * Specifies a field to be added to the parsed object Map, using JsonPath notation. + * + * See https://github.com/jayway/JsonPath for more information. + */ + public static class FieldSpec + { + private final FieldType type; + private final String name; + private final String expr; + + /** + * Constructor + * + * @param type Specifies how this field should be retrieved. + * @param name Name of the field, used as the key in the Object map returned by the parser. + * For ROOT fields, this must match the field name as it appears in the JSON document. + * @param expr Only used by PATH type fields, specifies the JsonPath expression used to access the field. + */ + public FieldSpec( + FieldType type, + String name, + String expr + ) + { + this.type = type; + this.name = name; + this.expr = expr; + } + + public FieldType getType() + { + return type; + } + + public String getName() + { + return name; + } + + public String getExpr() + { + return expr; + } + } + +} diff --git a/src/test/java/com/metamx/common/parsers/JSONPathParserTest.java b/src/test/java/com/metamx/common/parsers/JSONPathParserTest.java new file mode 100644 index 00000000..b7afb6a5 --- /dev/null +++ b/src/test/java/com/metamx/common/parsers/JSONPathParserTest.java @@ -0,0 +1,212 @@ +/* + * Licensed to Metamarkets Group Inc. (Metamarkets) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Metamarkets licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package com.metamx.common.parsers; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import org.junit.Assert; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +public class JSONPathParserTest +{ + private static final String json = + "{\"one\": \"foo\", \"two\" : [\"bar\", \"baz\"], \"three\" : \"qux\", \"four\" : null}"; + private static final String numbersJson = + "{\"five\" : 5.0, \"six\" : 6, \"many\" : 1234567878900, \"toomany\" : 1234567890000000000000}"; + private static final String whackyCharacterJson = + "{\"one\": \"foo\\uD900\"}"; + private static final String nestedJson = + "{\"simpleVal\":\"text\", \"ignore_me\":[1, {\"x\":2}], \"blah\":[4,5,6], \"newmet\":5, " + + "\"foo\":{\"bar1\":\"aaa\", \"bar2\":\"bbb\"}, " + + "\"baz\":[1,2,3], \"timestamp\":\"2999\", \"foo.bar1\":\"Hello world!\", " + + "\"testListConvert\":[1234567890000000000000, \"foo\\uD900\"], " + + "\"testListConvert2\":[1234567890000000000000, \"foo\\uD900\", [1234567890000000000000]], " + + "\"testMapConvert\":{\"big\": 1234567890000000000000, \"big2\":{\"big2\":1234567890000000000000}}, " + + "\"testEmptyList\": [], " + + "\"hey\":[{\"barx\":\"asdf\"}], \"met\":{\"a\":[7,8,9]}}"; + private static final String notJson = "***@#%R#*(TG@(*H(#@(#@((H#(@TH@(#TH(@SDHGKJDSKJFBSBJK"; + + @Rule + public ExpectedException thrown = ExpectedException.none(); + + @Test + public void testSimple() + { + List fields = new ArrayList<>(); + final Parser jsonParser = new JSONPathParser(fields, true, null); + final Map jsonMap = jsonParser.parse(json); + Assert.assertEquals( + "jsonMap", + ImmutableMap.of("one", "foo", "two", ImmutableList.of("bar", "baz"), "three", "qux"), + jsonMap + ); + } + + @Test + public void testWithNumbers() + { + List fields = new ArrayList<>(); + final Parser jsonParser = new JSONPathParser(fields, true, null); + final Map jsonMap = jsonParser.parse(numbersJson); + Assert.assertEquals( + "jsonMap", + ImmutableMap.of("five", 5.0, "six", 6L, "many", 1234567878900L, "toomany", 1.23456789E21), + jsonMap + ); + } + + @Test + public void testWithWhackyCharacters() + { + List fields = new ArrayList<>(); + final Parser jsonParser = new JSONPathParser(fields, true, null); + final Map jsonMap = jsonParser.parse(whackyCharacterJson); + Assert.assertEquals( + "jsonMap", + ImmutableMap.of("one", "foo?"), + jsonMap + ); + } + + @Test + public void testNestingWithFieldDiscovery() + { + List fields = new ArrayList<>(); + fields.add(new JSONPathParser.FieldSpec(JSONPathParser.FieldType.ROOT, "baz", "baz")); + fields.add(new JSONPathParser.FieldSpec(JSONPathParser.FieldType.PATH, "nested-foo.bar1", "$.foo.bar1")); + fields.add(new JSONPathParser.FieldSpec(JSONPathParser.FieldType.PATH, "nested-foo.bar2", "$.foo.bar2")); + fields.add(new JSONPathParser.FieldSpec(JSONPathParser.FieldType.PATH, "heybarx0", "$.hey[0].barx")); + fields.add(new JSONPathParser.FieldSpec(JSONPathParser.FieldType.PATH, "met-array", "$.met.a")); + fields.add(new JSONPathParser.FieldSpec(JSONPathParser.FieldType.ROOT, "testListConvert2", "testListConvert2")); + fields.add(new JSONPathParser.FieldSpec(JSONPathParser.FieldType.ROOT, "testMapConvert", "testMapConvert")); + + fields.add(new JSONPathParser.FieldSpec(JSONPathParser.FieldType.ROOT, "INVALID_ROOT", "INVALID_ROOT_EXPR")); + fields.add(new JSONPathParser.FieldSpec(JSONPathParser.FieldType.PATH, "INVALID_PATH", "INVALID_PATH_EXPR")); + + + final Parser jsonParser = new JSONPathParser(fields, true, null); + final Map jsonMap = jsonParser.parse(nestedJson); + + // Root fields + Assert.assertEquals(ImmutableList.of(1L, 2L, 3L), jsonMap.get("baz")); + Assert.assertEquals(ImmutableList.of(4L, 5L, 6L), jsonMap.get("blah")); + Assert.assertEquals("text", jsonMap.get("simpleVal")); + Assert.assertEquals(5L, jsonMap.get("newmet")); + Assert.assertEquals("2999", jsonMap.get("timestamp")); + Assert.assertEquals("Hello world!", jsonMap.get("foo.bar1")); + + List testListConvert = (List)jsonMap.get("testListConvert"); + Assert.assertEquals(1.23456789E21, testListConvert.get(0)); + Assert.assertEquals("foo?", testListConvert.get(1)); + + List testListConvert2 = (List)jsonMap.get("testListConvert2"); + Assert.assertEquals(1.23456789E21, testListConvert2.get(0)); + Assert.assertEquals("foo?", testListConvert2.get(1)); + Assert.assertEquals(1.23456789E21, ((List) testListConvert2.get(2)).get(0)); + + Map testMapConvert = (Map) jsonMap.get("testMapConvert"); + Assert.assertEquals(1.23456789E21, testMapConvert.get("big")); + Assert.assertEquals(1.23456789E21, ((Map) testMapConvert.get("big2")).get("big2")); + + Assert.assertEquals(ImmutableList.of(), jsonMap.get("testEmptyList")); + + // Nested fields + Assert.assertEquals("aaa", jsonMap.get("nested-foo.bar1")); + Assert.assertEquals("bbb", jsonMap.get("nested-foo.bar2")); + Assert.assertEquals("asdf", jsonMap.get("heybarx0")); + Assert.assertEquals(ImmutableList.of(7L, 8L, 9L), jsonMap.get("met-array")); + + // Fields that should not be discovered + Assert.assertNull(jsonMap.get("hey")); + Assert.assertNull(jsonMap.get("met")); + Assert.assertNull(jsonMap.get("ignore_me")); + Assert.assertNull(jsonMap.get("foo")); + + // Invalid fields + Assert.assertNull(jsonMap.get("INVALID_ROOT")); + Assert.assertNull(jsonMap.get("INVALID_PATH")); + } + + @Test + public void testNestingNoDiscovery() + { + List fields = new ArrayList<>(); + fields.add(new JSONPathParser.FieldSpec(JSONPathParser.FieldType.ROOT, "simpleVal", "simpleVal")); + fields.add(new JSONPathParser.FieldSpec(JSONPathParser.FieldType.ROOT, "timestamp", "timestamp")); + fields.add(new JSONPathParser.FieldSpec(JSONPathParser.FieldType.PATH, "nested-foo.bar2", "$.foo.bar2")); + fields.add(new JSONPathParser.FieldSpec(JSONPathParser.FieldType.PATH, "heybarx0", "$.hey[0].barx")); + fields.add(new JSONPathParser.FieldSpec(JSONPathParser.FieldType.PATH, "met-array", "$.met.a")); + + final Parser jsonParser = new JSONPathParser(fields, false, null); + final Map jsonMap = jsonParser.parse(nestedJson); + + // Root fields + Assert.assertEquals("text", jsonMap.get("simpleVal")); + Assert.assertEquals("2999", jsonMap.get("timestamp")); + + // Nested fields + Assert.assertEquals("bbb", jsonMap.get("nested-foo.bar2")); + Assert.assertEquals("asdf", jsonMap.get("heybarx0")); + Assert.assertEquals(ImmutableList.of(7L, 8L, 9L), jsonMap.get("met-array")); + + // Fields that should not be discovered + Assert.assertNull(jsonMap.get("newmet")); + Assert.assertNull(jsonMap.get("foo.bar1")); + Assert.assertNull(jsonMap.get("baz")); + Assert.assertNull(jsonMap.get("blah")); + Assert.assertNull(jsonMap.get("nested-foo.bar1")); + Assert.assertNull(jsonMap.get("hey")); + Assert.assertNull(jsonMap.get("met")); + Assert.assertNull(jsonMap.get("ignore_me")); + Assert.assertNull(jsonMap.get("foo")); + } + + @Test + public void testRejectDuplicates() + { + List fields = new ArrayList<>(); + fields.add(new JSONPathParser.FieldSpec(JSONPathParser.FieldType.PATH, "met-array", "$.met.a")); + fields.add(new JSONPathParser.FieldSpec(JSONPathParser.FieldType.PATH, "met-array", "$.met.a")); + + thrown.expect(IllegalArgumentException.class); + thrown.expectMessage("Cannot have duplicate field definition: met-array"); + + final Parser jsonParser = new JSONPathParser(fields, false, null); + final Map jsonMap = jsonParser.parse(nestedJson); + } + + @Test + public void testParseFail() + { + List fields = new ArrayList<>(); + + thrown.expect(ParseException.class); + thrown.expectMessage("Unable to parse row [" + notJson + "]"); + + final Parser jsonParser = new JSONPathParser(fields, true, null); + final Map jsonMap = jsonParser.parse(notJson); + } +}