From 3d0aec1c25f3d4353d4102994e0178ef2da4172d Mon Sep 17 00:00:00 2001 From: ptkool Date: Sun, 21 Aug 2016 21:31:47 -0400 Subject: [PATCH 1/3] Quoted identifiers in column names. --- .../apache/parquet/avro/TestReadWrite.java | 2 +- .../org/apache/parquet/schema/GroupType.java | 2 +- .../apache/parquet/schema/MessageType.java | 2 +- .../parquet/schema/MessageTypeParser.java | 55 ++++++++++++-- .../apache/parquet/schema/PrimitiveType.java | 13 ++-- .../java/org/apache/parquet/schema/Type.java | 22 +++++- .../parquet/column/TestColumnDescriptor.java | 4 + .../predicate/TestFilterApiMethods.java | 9 +++ .../filter2/predicate/TestValidTypeMap.java | 2 +- .../org/apache/parquet/io/TestColumnIO.java | 16 ++++ .../parquet/parser/TestParquetParser.java | 40 +++++++--- .../parquet/schema/TestMessageType.java | 12 +++ .../parquet/schema/TestTypeBuilders.java | 9 +++ .../parquet/hadoop/metadata/ColumnPath.java | 32 +++++++- .../parquet/hadoop/TestParquetWriter.java | 76 +++++++++++++++++++ 15 files changed, 269 insertions(+), 27 deletions(-) diff --git a/parquet-avro/src/test/java/org/apache/parquet/avro/TestReadWrite.java b/parquet-avro/src/test/java/org/apache/parquet/avro/TestReadWrite.java index 4fa71ea986..da5520e15c 100644 --- a/parquet-avro/src/test/java/org/apache/parquet/avro/TestReadWrite.java +++ b/parquet-avro/src/test/java/org/apache/parquet/avro/TestReadWrite.java @@ -426,7 +426,7 @@ public void testAll() throws Exception { assertEquals(emptyMap, nextRecord.get("myemptymap")); assertEquals(genericFixed, nextRecord.get("myfixed")); } - + @Test public void testAllUsingDefaultAvroSchema() throws Exception { File tmp = File.createTempFile(getClass().getSimpleName(), ".tmp"); diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/GroupType.java b/parquet-column/src/main/java/org/apache/parquet/schema/GroupType.java index f8404a1cd0..6618eb191d 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/GroupType.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/GroupType.java @@ -211,7 +211,7 @@ public void writeToStringBuilder(StringBuilder sb, String indent) { sb.append(indent) .append(getRepetition().name().toLowerCase(Locale.ENGLISH)) .append(" group ") - .append(getName()) + .append(getQuotedName()) .append(getOriginalType() == null ? "" : " (" + getOriginalType() +")") .append(getId() == null ? "" : " = " + getId()) .append(" {\n"); diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/MessageType.java b/parquet-column/src/main/java/org/apache/parquet/schema/MessageType.java index 1e26ed2425..ff2c8a42b4 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/MessageType.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/MessageType.java @@ -64,7 +64,7 @@ public void accept(TypeVisitor visitor) { @Override public void writeToStringBuilder(StringBuilder sb, String indent) { sb.append("message ") - .append(getName()) + .append(getQuotedName()) .append(getOriginalType() == null ? "" : " (" + getOriginalType() +")") .append(" {\n"); membersDisplayString(sb, " "); diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/MessageTypeParser.java b/parquet-column/src/main/java/org/apache/parquet/schema/MessageTypeParser.java index b7274c2c5c..520b6432f4 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/MessageTypeParser.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/MessageTypeParser.java @@ -38,18 +38,24 @@ public class MessageTypeParser { private static class Tokenizer { - private StringTokenizer st; + private String[] st; + private int stpos = 0; private int line = 0; private StringBuffer currentLine = new StringBuffer(); public Tokenizer(String schemaString, String string) { - st = new StringTokenizer(schemaString, " ,;{}()\n\t=", true); + StringTokenizer tokenizer = new StringTokenizer(schemaString, " ,;{}()`\n\t=", true); + st = new String[tokenizer.countTokens()]; + int i = 0; + while (tokenizer.hasMoreTokens()) { + st[i++] = tokenizer.nextToken(); + } } public String nextToken() { - while (st.hasMoreTokens()) { - String t = st.nextToken(); + while (stpos < st.length) { + String t = st[stpos++]; if (t.equals("\n")) { ++ line; currentLine.setLength(0); @@ -63,6 +69,43 @@ public String nextToken() { throw new IllegalArgumentException("unexpected end of schema"); } + public String getName() { + while (stpos < st.length) { + String t = st[stpos++]; + if (t.equals("\n")) { + ++line; + currentLine.setLength(0); + } else { + currentLine.append(t); + } + if (t.equals("`")) { + StringBuilder sb = new StringBuilder(); + while (stpos < st.length) { + t = st[stpos++]; + if (t.equals("`")) { + if (stpos < st.length) { + t = st[stpos]; + if (t.equals("`")) { + sb.append(t); + ++stpos; + continue; + } + } + String name = sb.toString(); + currentLine.append(name); + return name; + } + sb.append(t); + } + throw new IllegalArgumentException("unexpected end of schema"); + } + if (!isWhitespace(t)) { + return t; + } + } + throw new IllegalArgumentException("unexpected end of schema"); + } + private boolean isWhitespace(String t) { return t.equals(" ") || t.equals("\t") || t.equals("\n"); } @@ -84,7 +127,7 @@ public static MessageType parseMessageType(String input) { } private static MessageType parse(String schemaString) { - Tokenizer st = new Tokenizer(schemaString, " ;{}()\n\t"); + Tokenizer st = new Tokenizer(schemaString, " ;{}()`\n\t"); Types.MessageTypeBuilder builder = Types.buildMessage(); String t = st.nextToken(); @@ -154,7 +197,7 @@ private static void addPrimitiveType(Tokenizer st, PrimitiveTypeName type, Repet check(st.nextToken(), ")", "type length ended by )", st); } - String name = st.nextToken(); + String name = st.getName(); // Read annotation, if any. t = st.nextToken(); diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java index 8056188d25..8a5533c144 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java @@ -18,16 +18,19 @@ */ package org.apache.parquet.schema; -import java.util.Arrays; -import java.util.List; -import java.util.Locale; - import org.apache.parquet.column.ColumnReader; import org.apache.parquet.io.InvalidRecordException; import org.apache.parquet.io.api.Binary; import org.apache.parquet.io.api.PrimitiveConverter; import org.apache.parquet.io.api.RecordConsumer; +import java.util.Arrays; +import java.util.List; +import java.util.Locale; +import java.util.regex.Pattern; +import java.util.regex.Matcher; + + /** * @@ -401,7 +404,7 @@ public void writeToStringBuilder(StringBuilder sb, String indent) { if (primitive == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) { sb.append("(" + length + ")"); } - sb.append(" ").append(getName()); + sb.append(" ").append(getQuotedName()); if (getOriginalType() != null) { sb.append(" (").append(getOriginalType()); DecimalMetadata meta = getDecimalMetadata(); diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/Type.java b/parquet-column/src/main/java/org/apache/parquet/schema/Type.java index 99222f96c3..8ead7610cb 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/Type.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/Type.java @@ -21,6 +21,8 @@ import static org.apache.parquet.Preconditions.checkNotNull; import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.parquet.io.InvalidRecordException; @@ -113,6 +115,11 @@ public boolean isMoreRestrictiveThan(Repetition other) { private final OriginalType originalType; private final ID id; + /** + * Pattern for matching column names that need to wrapped in backquotes. + */ + private static final Pattern pattern = Pattern.compile("[^a-zA-Z\\.\\d_-]"); + /** * @param name the name of the type * @param repetition OPTIONAL, REPEATED, REQUIRED @@ -140,8 +147,14 @@ public Type(String name, Repetition repetition, OriginalType originalType) { */ Type(String name, Repetition repetition, OriginalType originalType, ID id) { super(); - this.name = checkNotNull(name, "name"); + + name = checkNotNull(name, "name"); + if (name.charAt(0) == '`') { + name = name.substring(1, name.length() - 1).replace("``", "`"); + } + this.repetition = checkNotNull(repetition, "repetition"); + this.name = name; this.originalType = originalType; this.id = id; } @@ -159,6 +172,13 @@ public String getName() { return name; } + /** + * @return the name of the type, possibly wrapped in backquotes. + */ + public String getQuotedName() { + return pattern.matcher(name).find() ? "`" + name.replace("`", "``") + "`" : name; + } + /** * @param rep * @return if repetition of the type is rep diff --git a/parquet-column/src/test/java/org/apache/parquet/column/TestColumnDescriptor.java b/parquet-column/src/test/java/org/apache/parquet/column/TestColumnDescriptor.java index 33d9cea43d..5638089aba 100644 --- a/parquet-column/src/test/java/org/apache/parquet/column/TestColumnDescriptor.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/TestColumnDescriptor.java @@ -48,5 +48,9 @@ public void testComparesTo() throws Exception { assertEquals(column("").compareTo(column("")), 0); assertEquals(column("").compareTo(column("a")), -1); assertEquals(column("a").compareTo(column("")), 1); + + assertEquals(column("foo bar").compareTo(column("foo bar")), 0); + assertEquals(column("foo`bar").compareTo(column("foo`bar")), 0); + assertEquals(column("foo#@bar").compareTo(column("foo#@bar")), 0); } } diff --git a/parquet-column/src/test/java/org/apache/parquet/filter2/predicate/TestFilterApiMethods.java b/parquet-column/src/test/java/org/apache/parquet/filter2/predicate/TestFilterApiMethods.java index 009da1c6f5..b01029bfc8 100644 --- a/parquet-column/src/test/java/org/apache/parquet/filter2/predicate/TestFilterApiMethods.java +++ b/parquet-column/src/test/java/org/apache/parquet/filter2/predicate/TestFilterApiMethods.java @@ -100,6 +100,15 @@ public void testToString() { pred.toString()); } + @Test + public void testQuotedColumnNames() { + IntColumn c1 = intColumn("a.b.`foo bar`"); + IntColumn c2 = intColumn("a.b.`foo.bar`"); + IntColumn c3 = intColumn("`a.b.foo.bar`"); + FilterPredicate pred = or(eq(c1, 10), or(eq(c2, 20), eq(c3, 30))); + assertEquals("or(eq(a.b.`foo bar`, 10), or(eq(a.b.`foo.bar`, 20), eq(`a.b.foo.bar`, 30)))", pred.toString()); + } + @Test public void testUdp() { FilterPredicate predicate = or(eq(doubleColumn, 12.0), userDefined(intColumn, DummyUdp.class)); diff --git a/parquet-column/src/test/java/org/apache/parquet/filter2/predicate/TestValidTypeMap.java b/parquet-column/src/test/java/org/apache/parquet/filter2/predicate/TestValidTypeMap.java index d44136998d..c23ed82f5a 100644 --- a/parquet-column/src/test/java/org/apache/parquet/filter2/predicate/TestValidTypeMap.java +++ b/parquet-column/src/test/java/org/apache/parquet/filter2/predicate/TestValidTypeMap.java @@ -90,7 +90,7 @@ public void testUnsupportedType() { assertTypeValid(invalidColumn, PrimitiveTypeName.INT32); fail("This should throw!"); } catch (IllegalArgumentException e) { - assertEquals("Column invalid.column was declared as type: " + assertEquals("Column `invalid.column` was declared as type: " + "org.apache.parquet.filter2.predicate.TestValidTypeMap$InvalidColumnType which is not supported " + "in FilterPredicates. Supported types for this column are: [class java.lang.Integer]", e.getMessage()); } diff --git a/parquet-column/src/test/java/org/apache/parquet/io/TestColumnIO.java b/parquet-column/src/test/java/org/apache/parquet/io/TestColumnIO.java index e9e599affe..aa53bd04ed 100644 --- a/parquet-column/src/test/java/org/apache/parquet/io/TestColumnIO.java +++ b/parquet-column/src/test/java/org/apache/parquet/io/TestColumnIO.java @@ -457,6 +457,22 @@ public void testOptionalRequiredInteraction() { } } + @Test + public void testQuotedColumnName() { + MessageType schema = MessageTypeParser.parseMessageType( + "message Document {\n" + + " required group foo {\n" + + " required int64 `bar?@`;\n" + + " }\n" + + "}\n"); + + GroupFactory gf = new SimpleGroupFactory(schema); + Group g1 = gf.newGroup(); + g1.addGroup("foo").append("bar?@", 2l); + + testSchema(schema, Arrays.asList(g1)); + } + private void testSchema(MessageType messageSchema, List groups) { MemPageStore memPageStore = new MemPageStore(groups.size()); ColumnWriteStoreV1 columns = newColumnWriteStore(memPageStore); diff --git a/parquet-column/src/test/java/org/apache/parquet/parser/TestParquetParser.java b/parquet-column/src/test/java/org/apache/parquet/parser/TestParquetParser.java index dc9369bfaa..d1cbf66cd2 100644 --- a/parquet-column/src/test/java/org/apache/parquet/parser/TestParquetParser.java +++ b/parquet-column/src/test/java/org/apache/parquet/parser/TestParquetParser.java @@ -62,11 +62,11 @@ public void testPaperExample() throws Exception { new GroupType(OPTIONAL, "Links", new PrimitiveType(REPEATED, INT64, "Backward"), new PrimitiveType(REPEATED, INT64, "Forward")), - new GroupType(REPEATED, "Name", - new GroupType(REPEATED, "Language", - new PrimitiveType(REQUIRED, BINARY, "Code"), - new PrimitiveType(REQUIRED, BINARY, "Country")), - new PrimitiveType(OPTIONAL, BINARY, "Url"))); + new GroupType(REPEATED, "Name", + new GroupType(REPEATED, "Language", + new PrimitiveType(REQUIRED, BINARY, "Code"), + new PrimitiveType(REQUIRED, BINARY, "Country")), + new PrimitiveType(OPTIONAL, BINARY, "Url"))); assertEquals(manuallyMade, parsed); MessageType parsedThenReparsed = parseMessageType(parsed.toString()); @@ -77,9 +77,9 @@ public void testPaperExample() throws Exception { manuallyMade = new MessageType("m", new GroupType(REQUIRED, "a", - new PrimitiveType(REQUIRED, BINARY, "b")), - new GroupType(REQUIRED, "c", - new PrimitiveType(REQUIRED, INT64, "d"))); + new PrimitiveType(REQUIRED, BINARY, "b")), + new GroupType(REQUIRED, "c", + new PrimitiveType(REQUIRED, INT64, "d"))); assertEquals(manuallyMade, parsed); @@ -100,7 +100,7 @@ public void testEachPrimitiveType() { builder.required(FIXED_LEN_BYTE_ARRAY).length(3).named("fixed_"); } else { schema.append(" required ").append(type) - .append(" ").append(type).append("_;\n"); + .append(" ").append(type).append("_;\n"); builder.required(type).named(type.toString() + "_"); } } @@ -310,4 +310,26 @@ public void testEmbeddedAnnotations() { MessageType reparsed = MessageTypeParser.parseMessageType(parsed.toString()); assertEquals(expected, reparsed); } + + @Test + public void testQuotedColumnNames() { + String message = "message IntMessage {" + + " required int32 `foo bar`;" + + " required int32 `foo.bar`;" + + " required int32 `foo``bar`;" + + " required int64 `foo@#$bar`;" + + "}\n"; + + MessageType parsed = MessageTypeParser.parseMessageType(message); + MessageType expected = Types.buildMessage() + .required(INT32).as(INT_8).named("foo bar") + .required(INT32).as(INT_16).named("foo.bar") + .required(INT32).as(INT_32).named("foo`bar") + .required(INT64).as(INT_64).named("foo@#$bar") + .named("IntMessage"); + + assertEquals(expected, parsed); + MessageType reparsed = MessageTypeParser.parseMessageType(parsed.toString()); + assertEquals(expected, reparsed); + } } diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestMessageType.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestMessageType.java index 438fae968e..5545e7681e 100644 --- a/parquet-column/src/test/java/org/apache/parquet/schema/TestMessageType.java +++ b/parquet-column/src/test/java/org/apache/parquet/schema/TestMessageType.java @@ -173,4 +173,16 @@ public void testIDs() throws Exception { assertEquals(schema, schema2); assertEquals(schema.toString(), schema2.toString()); } + + @Test + public void testQuotedIdentifiers() throws Exception { + MessageType schema = new MessageType("test", + new PrimitiveType(REQUIRED, BINARY, "foo bar"), + new PrimitiveType(REQUIRED, BINARY, "foo()`bar"), + new PrimitiveType(REQUIRED, BINARY, "foo@#$%^bar") + ); + MessageType schema2 = MessageTypeParser.parseMessageType(schema.toString()); + assertEquals(schema, schema2); + assertEquals(schema.toString(), schema2.toString()); + } } diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuilders.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuilders.java index 0c39ef2ba1..108e08ed63 100644 --- a/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuilders.java +++ b/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuilders.java @@ -127,6 +127,15 @@ public void testPrimitiveTypeConstruction() { } } + @Test + public void testPrimitiveTypeConstructionWithQuotedColumnName() { + String name = "Foo Bar"; + Type.Repetition repetition = Type.Repetition.REQUIRED; + PrimitiveType expected = new PrimitiveType(repetition, INT32, name); + PrimitiveType built = Types.primitive(INT32, repetition).named(name); + Assert.assertEquals(expected, built); + } + @Test public void testFixedTypeConstruction() { String name = "fixed_"; diff --git a/parquet-common/src/main/java/org/apache/parquet/hadoop/metadata/ColumnPath.java b/parquet-common/src/main/java/org/apache/parquet/hadoop/metadata/ColumnPath.java index 4383b0f2f6..71b4cce53d 100644 --- a/parquet-common/src/main/java/org/apache/parquet/hadoop/metadata/ColumnPath.java +++ b/parquet-common/src/main/java/org/apache/parquet/hadoop/metadata/ColumnPath.java @@ -19,8 +19,12 @@ package org.apache.parquet.hadoop.metadata; import java.io.Serializable; +import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; +import java.util.List; +import java.util.regex.Pattern; +import java.util.regex.Matcher; import org.apache.parquet.Strings; @@ -28,6 +32,8 @@ public final class ColumnPath implements Iterable, Serializable { + private static Pattern pattern = Pattern.compile("[^a-zA-Z\\d_-]"); + private static Canonicalizer paths = new Canonicalizer() { @Override protected ColumnPath toCanonical(ColumnPath value) { @@ -41,7 +47,14 @@ protected ColumnPath toCanonical(ColumnPath value) { public static ColumnPath fromDotString(String path) { checkNotNull(path, "path"); - return get(path.split("\\.")); + String[] parts = path.split("\\.(?=([^`]*`[^`]*`)*[^`]*$)"); + for (int i = 0; i < parts.length; ++i) { + String part = parts[i]; + if (part.startsWith("`") && part.endsWith("`")) { + parts[i] = part.substring(1, part.length() - 1); + } + } + return get(parts); } public static ColumnPath get(String... path){ @@ -54,6 +67,14 @@ private ColumnPath(String[] path) { this.p = path; } + /** + * @return the name of the type, possibly wrapped in backquotes. + */ + private String getQuotedName(String name) { + Matcher matcher = pattern.matcher(name); + return matcher.find() ? "`" + name.replace("`", "``") + "`" : name; + } + @Override public boolean equals(Object obj) { if (obj instanceof ColumnPath) { @@ -68,7 +89,14 @@ public int hashCode() { } public String toDotString() { - return Strings.join(p, "."); + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < p.length; ++i) { + if (i > 0) { + sb.append("."); + } + sb.append(getQuotedName(p[i])); + } + return sb.toString(); } @Override diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java index 6fc3c72f84..9ed7bcac67 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java @@ -142,6 +142,82 @@ public void test() throws Exception { } } + @Test + public void testQuotedColumns() throws Exception { + Configuration conf = new Configuration(); + Path root = new Path("target/tests/TestParquetWriter/"); + enforceEmptyDir(conf, root); + MessageType schema = parseMessageType( + "message test { " + + "required binary `binary field`; " + + "required int32 `int32 field`; " + + "required int64 `int64 field`; " + + "required boolean `boolean field`; " + + "required float `float field`; " + + "required double `double field`; " + + "required fixed_len_byte_array(3) `flba field`; " + + "required int96 `int96 field`; " + + "} "); + GroupWriteSupport.setSchema(schema, conf); + SimpleGroupFactory f = new SimpleGroupFactory(schema); + Map expected = new HashMap(); + expected.put("10-" + PARQUET_1_0, PLAIN_DICTIONARY); + expected.put("1000-" + PARQUET_1_0, PLAIN); + expected.put("10-" + PARQUET_2_0, RLE_DICTIONARY); + expected.put("1000-" + PARQUET_2_0, DELTA_BYTE_ARRAY); + for (int modulo : asList(10, 1000)) { + for (WriterVersion version : WriterVersion.values()) { + Path file = new Path(root, version.name() + "_" + modulo); + ParquetWriter writer = new ParquetWriter( + file, + new GroupWriteSupport(), + UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf); + for (int i = 0; i < 1000; i++) { + writer.write( + f.newGroup() + .append("binary field", "test" + (i % modulo)) + .append("int32 field", 32) + .append("int64 field", 64l) + .append("boolean field", true) + .append("float field", 1.0f) + .append("double field", 2.0d) + .append("flba field", "foo") + .append("int96 field", Binary.fromConstantByteArray(new byte[12]))); + } + writer.close(); + ParquetReader reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build(); + for (int i = 0; i < 1000; i++) { + Group group = reader.read(); + assertEquals("test" + (i % modulo), group.getBinary("binary field", 0).toStringUsingUTF8()); + assertEquals(32, group.getInteger("int32 field", 0)); + assertEquals(64l, group.getLong("int64 field", 0)); + assertEquals(true, group.getBoolean("boolean field", 0)); + assertEquals(1.0f, group.getFloat("float field", 0), 0.001); + assertEquals(2.0d, group.getDouble("double field", 0), 0.001); + assertEquals("foo", group.getBinary("flba field", 0).toStringUsingUTF8()); + assertEquals(Binary.fromConstantByteArray(new byte[12]), + group.getInt96("int96 field",0)); + } + reader.close(); + ParquetMetadata footer = readFooter(conf, file, NO_FILTER); + for (BlockMetaData blockMetaData : footer.getBlocks()) { + for (ColumnChunkMetaData column : blockMetaData.getColumns()) { + if (column.getPath().toDotString().equals("binary field")) { + String key = modulo + "-" + version; + Encoding expectedEncoding = expected.get(key); + assertTrue( + key + ":" + column.getEncodings() + " should contain " + expectedEncoding, + column.getEncodings().contains(expectedEncoding)); + } + } + } + assertEquals("Object model property should be example", + "example", footer.getFileMetaData().getKeyValueMetaData() + .get(ParquetWriter.OBJECT_MODEL_NAME_PROP)); + } + } + } + @Rule public TemporaryFolder temp = new TemporaryFolder(); From 33d50888cd332d6ae38a73d0f0de6a38f6f967f5 Mon Sep 17 00:00:00 2001 From: ptkool Date: Sat, 22 Oct 2016 12:12:21 -0400 Subject: [PATCH 2/3] Changes based on comments. --- .../apache/parquet/avro/TestReadWrite.java | 1 - .../apache/parquet/schema/PrimitiveType.java | 8 ++-- .../java/org/apache/parquet/schema/Type.java | 2 +- .../parquet/hadoop/TestParquetWriter.java | 42 +++++++++---------- 4 files changed, 26 insertions(+), 27 deletions(-) diff --git a/parquet-avro/src/test/java/org/apache/parquet/avro/TestReadWrite.java b/parquet-avro/src/test/java/org/apache/parquet/avro/TestReadWrite.java index da5520e15c..ad38b38616 100644 --- a/parquet-avro/src/test/java/org/apache/parquet/avro/TestReadWrite.java +++ b/parquet-avro/src/test/java/org/apache/parquet/avro/TestReadWrite.java @@ -426,7 +426,6 @@ public void testAll() throws Exception { assertEquals(emptyMap, nextRecord.get("myemptymap")); assertEquals(genericFixed, nextRecord.get("myfixed")); } - @Test public void testAllUsingDefaultAvroSchema() throws Exception { File tmp = File.createTempFile(getClass().getSimpleName(), ".tmp"); diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java index 8a5533c144..a7de643387 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java @@ -18,15 +18,15 @@ */ package org.apache.parquet.schema; +import java.util.Arrays; +import java.util.List; +import java.util.Locale; + import org.apache.parquet.column.ColumnReader; import org.apache.parquet.io.InvalidRecordException; import org.apache.parquet.io.api.Binary; import org.apache.parquet.io.api.PrimitiveConverter; import org.apache.parquet.io.api.RecordConsumer; - -import java.util.Arrays; -import java.util.List; -import java.util.Locale; import java.util.regex.Pattern; import java.util.regex.Matcher; diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/Type.java b/parquet-column/src/main/java/org/apache/parquet/schema/Type.java index 8ead7610cb..477689c5fe 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/Type.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/Type.java @@ -118,7 +118,7 @@ public boolean isMoreRestrictiveThan(Repetition other) { /** * Pattern for matching column names that need to wrapped in backquotes. */ - private static final Pattern pattern = Pattern.compile("[^a-zA-Z\\.\\d_-]"); + private static final Pattern pattern = Pattern.compile("[^a-zA-Z\\d_-]"); /** * @param name the name of the type diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java index 9ed7bcac67..3baec5d42d 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java @@ -150,13 +150,13 @@ public void testQuotedColumns() throws Exception { MessageType schema = parseMessageType( "message test { " + "required binary `binary field`; " - + "required int32 `int32 field`; " - + "required int64 `int64 field`; " - + "required boolean `boolean field`; " - + "required float `float field`; " - + "required double `double field`; " - + "required fixed_len_byte_array(3) `flba field`; " - + "required int96 `int96 field`; " + + "required int32 `int32.field`; " + + "required int64 `int64``field`; " + + "required boolean `boolean,field`; " + + "required float `!float field!`; " + + "required double `double@#$field`; " + + "required fixed_len_byte_array(3) `flba+& field`; " + + "required int96 `int96!`; " + "} "); GroupWriteSupport.setSchema(schema, conf); SimpleGroupFactory f = new SimpleGroupFactory(schema); @@ -176,27 +176,27 @@ public void testQuotedColumns() throws Exception { writer.write( f.newGroup() .append("binary field", "test" + (i % modulo)) - .append("int32 field", 32) - .append("int64 field", 64l) - .append("boolean field", true) - .append("float field", 1.0f) - .append("double field", 2.0d) - .append("flba field", "foo") - .append("int96 field", Binary.fromConstantByteArray(new byte[12]))); + .append("int32.field", 32) + .append("int64`field", 64l) + .append("boolean,field", true) + .append("!float field!", 1.0f) + .append("double@#$field", 2.0d) + .append("flba+& field", "foo") + .append("int96!", Binary.fromConstantByteArray(new byte[12]))); } writer.close(); ParquetReader reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build(); for (int i = 0; i < 1000; i++) { Group group = reader.read(); assertEquals("test" + (i % modulo), group.getBinary("binary field", 0).toStringUsingUTF8()); - assertEquals(32, group.getInteger("int32 field", 0)); - assertEquals(64l, group.getLong("int64 field", 0)); - assertEquals(true, group.getBoolean("boolean field", 0)); - assertEquals(1.0f, group.getFloat("float field", 0), 0.001); - assertEquals(2.0d, group.getDouble("double field", 0), 0.001); - assertEquals("foo", group.getBinary("flba field", 0).toStringUsingUTF8()); + assertEquals(32, group.getInteger("int32.field", 0)); + assertEquals(64l, group.getLong("int64`field", 0)); + assertEquals(true, group.getBoolean("boolean,field", 0)); + assertEquals(1.0f, group.getFloat("!float field!", 0), 0.001); + assertEquals(2.0d, group.getDouble("double@#$field", 0), 0.001); + assertEquals("foo", group.getBinary("flba+& field", 0).toStringUsingUTF8()); assertEquals(Binary.fromConstantByteArray(new byte[12]), - group.getInt96("int96 field",0)); + group.getInt96("int96!",0)); } reader.close(); ParquetMetadata footer = readFooter(conf, file, NO_FILTER); From 9b9fc6735eff9563b82ef7cae8674f283e3ad8a6 Mon Sep 17 00:00:00 2001 From: ptkool Date: Tue, 21 Feb 2017 00:45:05 -0500 Subject: [PATCH 3/3] Changes based on feedback. --- .../java/org/apache/parquet/schema/Type.java | 13 +---- .../predicate/TestFilterApiMethods.java | 18 ++++--- .../org/apache/parquet/io/TestColumnIO.java | 5 +- .../parquet/parser/TestParquetParser.java | 17 +++--- .../org/apache/parquet/QuotedIdentifiers.java | 52 +++++++++++++++++++ .../parquet/hadoop/metadata/ColumnPath.java | 27 ++-------- .../parquet/hadoop/TestParquetWriter.java | 10 ++-- 7 files changed, 84 insertions(+), 58 deletions(-) create mode 100644 parquet-common/src/main/java/org/apache/parquet/QuotedIdentifiers.java diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/Type.java b/parquet-column/src/main/java/org/apache/parquet/schema/Type.java index 477689c5fe..af42f5cc39 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/Type.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/Type.java @@ -21,9 +21,8 @@ import static org.apache.parquet.Preconditions.checkNotNull; import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; +import org.apache.parquet.QuotedIdentifiers; import org.apache.parquet.io.InvalidRecordException; /** @@ -115,11 +114,6 @@ public boolean isMoreRestrictiveThan(Repetition other) { private final OriginalType originalType; private final ID id; - /** - * Pattern for matching column names that need to wrapped in backquotes. - */ - private static final Pattern pattern = Pattern.compile("[^a-zA-Z\\d_-]"); - /** * @param name the name of the type * @param repetition OPTIONAL, REPEATED, REQUIRED @@ -149,9 +143,6 @@ public Type(String name, Repetition repetition, OriginalType originalType) { super(); name = checkNotNull(name, "name"); - if (name.charAt(0) == '`') { - name = name.substring(1, name.length() - 1).replace("``", "`"); - } this.repetition = checkNotNull(repetition, "repetition"); this.name = name; @@ -176,7 +167,7 @@ public String getName() { * @return the name of the type, possibly wrapped in backquotes. */ public String getQuotedName() { - return pattern.matcher(name).find() ? "`" + name.replace("`", "``") + "`" : name; + return QuotedIdentifiers.getName(name); } /** diff --git a/parquet-column/src/test/java/org/apache/parquet/filter2/predicate/TestFilterApiMethods.java b/parquet-column/src/test/java/org/apache/parquet/filter2/predicate/TestFilterApiMethods.java index b01029bfc8..457be6bb9b 100644 --- a/parquet-column/src/test/java/org/apache/parquet/filter2/predicate/TestFilterApiMethods.java +++ b/parquet-column/src/test/java/org/apache/parquet/filter2/predicate/TestFilterApiMethods.java @@ -105,8 +105,12 @@ public void testQuotedColumnNames() { IntColumn c1 = intColumn("a.b.`foo bar`"); IntColumn c2 = intColumn("a.b.`foo.bar`"); IntColumn c3 = intColumn("`a.b.foo.bar`"); - FilterPredicate pred = or(eq(c1, 10), or(eq(c2, 20), eq(c3, 30))); - assertEquals("or(eq(a.b.`foo bar`, 10), or(eq(a.b.`foo.bar`, 20), eq(`a.b.foo.bar`, 30)))", pred.toString()); + FilterPredicate pred1 = or(eq(c1, 10), or(eq(c2, 20), eq(c3, 30))); + assertEquals("or(eq(a.b.`foo bar`, 10), or(eq(a.b.`foo.bar`, 20), eq(`a.b.foo.bar`, 30)))", pred1.toString()); + + IntColumn c4 = intColumn("`a.b``foo.bar`"); + FilterPredicate pred2 = eq(c4, 40); + assertEquals("eq(`a.b``foo.bar`, 40)", pred2.toString()); } @Test @@ -120,7 +124,7 @@ public void testUdp() { } @Test - public void testSerializable() throws Exception { + public void testSerializable() throws Exception { BinaryColumn binary = binaryColumn("foo"); FilterPredicate p = and(or(and(userDefined(intColumn, DummyUdp.class), predicate), eq(binary, Binary.fromString("hi"))), userDefined(longColumn, new IsMultipleOf(7))); ByteArrayOutputStream baos = new ByteArrayOutputStream(); @@ -135,7 +139,7 @@ public void testSerializable() throws Exception { public static class IsMultipleOf extends UserDefinedPredicate implements Serializable { - private long of; + private long of; public IsMultipleOf(long of) { this.of = of; @@ -158,7 +162,7 @@ public boolean canDrop(Statistics statistics) { public boolean inverseCanDrop(Statistics statistics) { return false; } - + @Override public boolean equals(Object o) { if (this == o) return true; @@ -167,12 +171,12 @@ public boolean equals(Object o) { IsMultipleOf that = (IsMultipleOf) o; return this.of == that.of; } - + @Override public int hashCode() { return new Long(of).hashCode(); } - + @Override public String toString() { return "IsMultipleOf(" + of + ")"; diff --git a/parquet-column/src/test/java/org/apache/parquet/io/TestColumnIO.java b/parquet-column/src/test/java/org/apache/parquet/io/TestColumnIO.java index 281d501695..b1cac6a8cf 100644 --- a/parquet-column/src/test/java/org/apache/parquet/io/TestColumnIO.java +++ b/parquet-column/src/test/java/org/apache/parquet/io/TestColumnIO.java @@ -463,13 +463,14 @@ public void testQuotedColumnName() { MessageType schema = MessageTypeParser.parseMessageType( "message Document {\n" + " required group foo {\n" - + " required int64 `bar?@`;\n" + + " required int64 bar?@;\n" + + " required int64 `foo bar`;\n" + " }\n" + "}\n"); GroupFactory gf = new SimpleGroupFactory(schema); Group g1 = gf.newGroup(); - g1.addGroup("foo").append("bar?@", 2l); + g1.addGroup("foo").append("bar?@", 2l).append("foo bar", 3l); testSchema(schema, Arrays.asList(g1)); } diff --git a/parquet-column/src/test/java/org/apache/parquet/parser/TestParquetParser.java b/parquet-column/src/test/java/org/apache/parquet/parser/TestParquetParser.java index 92ea889a69..10cf8bc2b6 100644 --- a/parquet-column/src/test/java/org/apache/parquet/parser/TestParquetParser.java +++ b/parquet-column/src/test/java/org/apache/parquet/parser/TestParquetParser.java @@ -64,9 +64,9 @@ public void testPaperExample() throws Exception { new PrimitiveType(REPEATED, INT64, "Forward")), new GroupType(REPEATED, "Name", new GroupType(REPEATED, "Language", - new PrimitiveType(REQUIRED, BINARY, "Code"), - new PrimitiveType(REQUIRED, BINARY, "Country")), - new PrimitiveType(OPTIONAL, BINARY, "Url"))); + new PrimitiveType(REQUIRED, BINARY, "Code"), + new PrimitiveType(REQUIRED, BINARY, "Country")), + new PrimitiveType(OPTIONAL, BINARY, "Url"))); assertEquals(manuallyMade, parsed); MessageType parsedThenReparsed = parseMessageType(parsed.toString()); @@ -77,9 +77,9 @@ public void testPaperExample() throws Exception { manuallyMade = new MessageType("m", new GroupType(REQUIRED, "a", - new PrimitiveType(REQUIRED, BINARY, "b")), + new PrimitiveType(REQUIRED, BINARY, "b")), new GroupType(REQUIRED, "c", - new PrimitiveType(REQUIRED, INT64, "d"))); + new PrimitiveType(REQUIRED, INT64, "d"))); assertEquals(manuallyMade, parsed); @@ -99,8 +99,7 @@ public void testEachPrimitiveType() { schema.append(" required fixed_len_byte_array(3) fixed_;"); builder.required(FIXED_LEN_BYTE_ARRAY).length(3).named("fixed_"); } else { - schema.append(" required ").append(type) - .append(" ").append(type).append("_;\n"); + schema.append(" required ").append(type).append(" ").append(type).append("_;\n"); builder.required(type).named(type.toString() + "_"); } } @@ -315,9 +314,9 @@ public void testEmbeddedAnnotations() { public void testQuotedColumnNames() { String message = "message IntMessage {" + " required int32 `foo bar`;" + - " required int32 `foo.bar`;" + + " required int32 foo.bar;" + " required int32 `foo``bar`;" + - " required int64 `foo@#$bar`;" + + " required int64 foo@#$bar;" + "}\n"; MessageType parsed = MessageTypeParser.parseMessageType(message); diff --git a/parquet-common/src/main/java/org/apache/parquet/QuotedIdentifiers.java b/parquet-common/src/main/java/org/apache/parquet/QuotedIdentifiers.java new file mode 100644 index 0000000000..beaea3d7ab --- /dev/null +++ b/parquet-common/src/main/java/org/apache/parquet/QuotedIdentifiers.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet; + +import java.util.regex.Pattern; + +/** + * Utility for handling quoted identifiers. + * + * Created by stylesm on 20/02/17. + */ +public class QuotedIdentifiers { + + /** + * Pattern for matching column names that need to wrapped in backquotes. + */ + private static final Pattern pattern = Pattern.compile("[ ,;{}()`\\.\\n\\t=]"); + + /** + * @return the name of the type, possibly wrapped in backquotes. + */ + public static String getName(String name) { + return pattern.matcher(name).find() ? String.format("`%s`", name.replace("`", "``")) : name; + } + + public static String[] getParts(String path) { + String[] parts = path.split("\\.(?=([^`]*`[^`]*`)*[^`]*$)"); + for (int i = 0; i < parts.length; ++i) { + String part = parts[i]; + if (part.startsWith("`") && part.endsWith("`")) { + parts[i] = part.substring(1, part.length() - 1).replaceAll("``", "`"); + } + } + return parts; + } +} diff --git a/parquet-common/src/main/java/org/apache/parquet/hadoop/metadata/ColumnPath.java b/parquet-common/src/main/java/org/apache/parquet/hadoop/metadata/ColumnPath.java index 71b4cce53d..82bbd303b8 100644 --- a/parquet-common/src/main/java/org/apache/parquet/hadoop/metadata/ColumnPath.java +++ b/parquet-common/src/main/java/org/apache/parquet/hadoop/metadata/ColumnPath.java @@ -19,21 +19,15 @@ package org.apache.parquet.hadoop.metadata; import java.io.Serializable; -import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; -import java.util.List; -import java.util.regex.Pattern; -import java.util.regex.Matcher; -import org.apache.parquet.Strings; +import org.apache.parquet.QuotedIdentifiers; import static org.apache.parquet.Preconditions.checkNotNull; public final class ColumnPath implements Iterable, Serializable { - private static Pattern pattern = Pattern.compile("[^a-zA-Z\\d_-]"); - private static Canonicalizer paths = new Canonicalizer() { @Override protected ColumnPath toCanonical(ColumnPath value) { @@ -47,14 +41,7 @@ protected ColumnPath toCanonical(ColumnPath value) { public static ColumnPath fromDotString(String path) { checkNotNull(path, "path"); - String[] parts = path.split("\\.(?=([^`]*`[^`]*`)*[^`]*$)"); - for (int i = 0; i < parts.length; ++i) { - String part = parts[i]; - if (part.startsWith("`") && part.endsWith("`")) { - parts[i] = part.substring(1, part.length() - 1); - } - } - return get(parts); + return get(QuotedIdentifiers.getParts(path)); } public static ColumnPath get(String... path){ @@ -67,14 +54,6 @@ private ColumnPath(String[] path) { this.p = path; } - /** - * @return the name of the type, possibly wrapped in backquotes. - */ - private String getQuotedName(String name) { - Matcher matcher = pattern.matcher(name); - return matcher.find() ? "`" + name.replace("`", "``") + "`" : name; - } - @Override public boolean equals(Object obj) { if (obj instanceof ColumnPath) { @@ -94,7 +73,7 @@ public String toDotString() { if (i > 0) { sb.append("."); } - sb.append(getQuotedName(p[i])); + sb.append(QuotedIdentifiers.getName(p[i])); } return sb.toString(); } diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java index 3baec5d42d..6f207b0d9e 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java @@ -154,9 +154,9 @@ public void testQuotedColumns() throws Exception { + "required int64 `int64``field`; " + "required boolean `boolean,field`; " + "required float `!float field!`; " - + "required double `double@#$field`; " - + "required fixed_len_byte_array(3) `flba+& field`; " - + "required int96 `int96!`; " + + "required double double@#$field; " + + "required fixed_len_byte_array(3) flba+&field; " + + "required int96 int96!; " + "} "); GroupWriteSupport.setSchema(schema, conf); SimpleGroupFactory f = new SimpleGroupFactory(schema); @@ -181,7 +181,7 @@ public void testQuotedColumns() throws Exception { .append("boolean,field", true) .append("!float field!", 1.0f) .append("double@#$field", 2.0d) - .append("flba+& field", "foo") + .append("flba+&field", "foo") .append("int96!", Binary.fromConstantByteArray(new byte[12]))); } writer.close(); @@ -194,7 +194,7 @@ public void testQuotedColumns() throws Exception { assertEquals(true, group.getBoolean("boolean,field", 0)); assertEquals(1.0f, group.getFloat("!float field!", 0), 0.001); assertEquals(2.0d, group.getDouble("double@#$field", 0), 0.001); - assertEquals("foo", group.getBinary("flba+& field", 0).toStringUsingUTF8()); + assertEquals("foo", group.getBinary("flba+&field", 0).toStringUsingUTF8()); assertEquals(Binary.fromConstantByteArray(new byte[12]), group.getInt96("int96!",0)); }