Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -426,7 +426,6 @@ public void testAll() throws Exception {
assertEquals(emptyMap, nextRecord.get("myemptymap"));
assertEquals(genericFixed, nextRecord.get("myfixed"));
}

@Test
public void testAllUsingDefaultAvroSchema() throws Exception {
File tmp = File.createTempFile(getClass().getSimpleName(), ".tmp");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ public void writeToStringBuilder(StringBuilder sb, String indent) {
sb.append(indent)
.append(getRepetition().name().toLowerCase(Locale.ENGLISH))
.append(" group ")
.append(getName())
.append(getQuotedName())
.append(getOriginalType() == null ? "" : " (" + getOriginalType() +")")
.append(getId() == null ? "" : " = " + getId())
.append(" {\n");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ public void accept(TypeVisitor visitor) {
@Override
public void writeToStringBuilder(StringBuilder sb, String indent) {
sb.append("message ")
.append(getName())
.append(getQuotedName())
.append(getOriginalType() == null ? "" : " (" + getOriginalType() +")")
.append(" {\n");
membersDisplayString(sb, " ");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,18 +39,24 @@ public class MessageTypeParser {

private static class Tokenizer {

private StringTokenizer st;
private String[] st;
private int stpos = 0;

private int line = 0;
private StringBuffer currentLine = new StringBuffer();

public Tokenizer(String schemaString, String string) {
st = new StringTokenizer(schemaString, " ,;{}()\n\t=", true);
StringTokenizer tokenizer = new StringTokenizer(schemaString, " ,;{}()`\n\t=", true);
st = new String[tokenizer.countTokens()];
int i = 0;
while (tokenizer.hasMoreTokens()) {
st[i++] = tokenizer.nextToken();
}
}

public String nextToken() {
while (st.hasMoreTokens()) {
String t = st.nextToken();
while (stpos < st.length) {
String t = st[stpos++];
if (t.equals("\n")) {
++ line;
currentLine.setLength(0);
Expand All @@ -64,6 +70,43 @@ public String nextToken() {
throw new IllegalArgumentException("unexpected end of schema");
}

public String getName() {
while (stpos < st.length) {
String t = st[stpos++];
if (t.equals("\n")) {
++line;
currentLine.setLength(0);
} else {
currentLine.append(t);
}
if (t.equals("`")) {
StringBuilder sb = new StringBuilder();
while (stpos < st.length) {
t = st[stpos++];
if (t.equals("`")) {
if (stpos < st.length) {
t = st[stpos];
if (t.equals("`")) {
sb.append(t);
++stpos;
continue;
}
}
String name = sb.toString();
currentLine.append(name);
return name;
}
sb.append(t);
}
throw new IllegalArgumentException("unexpected end of schema");
}
if (!isWhitespace(t)) {
return t;
}
}
throw new IllegalArgumentException("unexpected end of schema");
}

private boolean isWhitespace(String t) {
return t.equals(" ") || t.equals("\t") || t.equals("\n");
}
Expand All @@ -85,7 +128,7 @@ public static MessageType parseMessageType(String input) {
}

private static MessageType parse(String schemaString) {
Tokenizer st = new Tokenizer(schemaString, " ;{}()\n\t");
Tokenizer st = new Tokenizer(schemaString, " ;{}()`\n\t");
Types.MessageTypeBuilder builder = Types.buildMessage();

String t = st.nextToken();
Expand Down Expand Up @@ -155,7 +198,7 @@ private static void addPrimitiveType(Tokenizer st, PrimitiveTypeName type, Repet
check(st.nextToken(), ")", "type length ended by )", st);
}

String name = st.nextToken();
String name = st.getName();

// Read annotation, if any.
t = st.nextToken();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.io.api.PrimitiveConverter;
import org.apache.parquet.io.api.RecordConsumer;
import java.util.regex.Pattern;
import java.util.regex.Matcher;



/**
Expand Down Expand Up @@ -401,7 +404,7 @@ public void writeToStringBuilder(StringBuilder sb, String indent) {
if (primitive == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
sb.append("(" + length + ")");
}
sb.append(" ").append(getName());
sb.append(" ").append(getQuotedName());
if (getOriginalType() != null) {
sb.append(" (").append(getOriginalType());
DecimalMetadata meta = getDecimalMetadata();
Expand Down
13 changes: 12 additions & 1 deletion parquet-column/src/main/java/org/apache/parquet/schema/Type.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

import java.util.List;

import org.apache.parquet.QuotedIdentifiers;
import org.apache.parquet.io.InvalidRecordException;

/**
Expand Down Expand Up @@ -140,8 +141,11 @@ public Type(String name, Repetition repetition, OriginalType originalType) {
*/
Type(String name, Repetition repetition, OriginalType originalType, ID id) {
super();
this.name = checkNotNull(name, "name");

name = checkNotNull(name, "name");

this.repetition = checkNotNull(repetition, "repetition");
this.name = name;
this.originalType = originalType;
this.id = id;
}
Expand All @@ -159,6 +163,13 @@ public String getName() {
return name;
}

/**
* @return the name of the type, possibly wrapped in backquotes.
*/
public String getQuotedName() {
return QuotedIdentifiers.getName(name);
}

/**
* @param rep
* @return if repetition of the type is rep
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,5 +48,9 @@ public void testComparesTo() throws Exception {
assertEquals(column("").compareTo(column("")), 0);
assertEquals(column("").compareTo(column("a")), -1);
assertEquals(column("a").compareTo(column("")), 1);

assertEquals(column("foo bar").compareTo(column("foo bar")), 0);
assertEquals(column("foo`bar").compareTo(column("foo`bar")), 0);
assertEquals(column("foo#@bar").compareTo(column("foo#@bar")), 0);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,19 @@ public void testToString() {
pred.toString());
}

@Test
public void testQuotedColumnNames() {
IntColumn c1 = intColumn("a.b.`foo bar`");
IntColumn c2 = intColumn("a.b.`foo.bar`");
IntColumn c3 = intColumn("`a.b.foo.bar`");
FilterPredicate pred1 = or(eq(c1, 10), or(eq(c2, 20), eq(c3, 30)));
assertEquals("or(eq(a.b.`foo bar`, 10), or(eq(a.b.`foo.bar`, 20), eq(`a.b.foo.bar`, 30)))", pred1.toString());

IntColumn c4 = intColumn("`a.b``foo.bar`");
FilterPredicate pred2 = eq(c4, 40);
assertEquals("eq(`a.b``foo.bar`, 40)", pred2.toString());
}

@Test
public void testUdp() {
FilterPredicate predicate = or(eq(doubleColumn, 12.0), userDefined(intColumn, DummyUdp.class));
Expand All @@ -111,7 +124,7 @@ public void testUdp() {
}

@Test
public void testSerializable() throws Exception {
public void testSerializable() throws Exception {
BinaryColumn binary = binaryColumn("foo");
FilterPredicate p = and(or(and(userDefined(intColumn, DummyUdp.class), predicate), eq(binary, Binary.fromString("hi"))), userDefined(longColumn, new IsMultipleOf(7)));
ByteArrayOutputStream baos = new ByteArrayOutputStream();
Expand All @@ -126,7 +139,7 @@ public void testSerializable() throws Exception {

public static class IsMultipleOf extends UserDefinedPredicate<Long> implements Serializable {

private long of;
private long of;

public IsMultipleOf(long of) {
this.of = of;
Expand All @@ -149,7 +162,7 @@ public boolean canDrop(Statistics<Long> statistics) {
public boolean inverseCanDrop(Statistics<Long> statistics) {
return false;
}

@Override
public boolean equals(Object o) {
if (this == o) return true;
Expand All @@ -158,12 +171,12 @@ public boolean equals(Object o) {
IsMultipleOf that = (IsMultipleOf) o;
return this.of == that.of;
}

@Override
public int hashCode() {
return new Long(of).hashCode();
}

@Override
public String toString() {
return "IsMultipleOf(" + of + ")";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ public void testUnsupportedType() {
assertTypeValid(invalidColumn, PrimitiveTypeName.INT32);
fail("This should throw!");
} catch (IllegalArgumentException e) {
assertEquals("Column invalid.column was declared as type: "
assertEquals("Column `invalid.column` was declared as type: "
+ "org.apache.parquet.filter2.predicate.TestValidTypeMap$InvalidColumnType which is not supported "
+ "in FilterPredicates. Supported types for this column are: [class java.lang.Integer]", e.getMessage());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,23 @@ public void testOptionalRequiredInteraction() {
}
}

@Test
public void testQuotedColumnName() {
MessageType schema = MessageTypeParser.parseMessageType(
"message Document {\n"
+ " required group foo {\n"
+ " required int64 bar?@;\n"
+ " required int64 `foo bar`;\n"
+ " }\n"
+ "}\n");

GroupFactory gf = new SimpleGroupFactory(schema);
Group g1 = gf.newGroup();
g1.addGroup("foo").append("bar?@", 2l).append("foo bar", 3l);

testSchema(schema, Arrays.asList(g1));
}

private void testSchema(MessageType messageSchema, List<Group> groups) {
MemPageStore memPageStore = new MemPageStore(groups.size());
ColumnWriteStoreV1 columns = newColumnWriteStore(memPageStore);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,11 @@ public void testPaperExample() throws Exception {
new GroupType(OPTIONAL, "Links",
new PrimitiveType(REPEATED, INT64, "Backward"),
new PrimitiveType(REPEATED, INT64, "Forward")),
new GroupType(REPEATED, "Name",
new GroupType(REPEATED, "Language",
new PrimitiveType(REQUIRED, BINARY, "Code"),
new PrimitiveType(REQUIRED, BINARY, "Country")),
new PrimitiveType(OPTIONAL, BINARY, "Url")));
new GroupType(REPEATED, "Name",
new GroupType(REPEATED, "Language",
new PrimitiveType(REQUIRED, BINARY, "Code"),
new PrimitiveType(REQUIRED, BINARY, "Country")),
new PrimitiveType(OPTIONAL, BINARY, "Url")));
assertEquals(manuallyMade, parsed);

MessageType parsedThenReparsed = parseMessageType(parsed.toString());
Expand All @@ -77,9 +77,9 @@ public void testPaperExample() throws Exception {
manuallyMade =
new MessageType("m",
new GroupType(REQUIRED, "a",
new PrimitiveType(REQUIRED, BINARY, "b")),
new GroupType(REQUIRED, "c",
new PrimitiveType(REQUIRED, INT64, "d")));
new PrimitiveType(REQUIRED, BINARY, "b")),
new GroupType(REQUIRED, "c",
new PrimitiveType(REQUIRED, INT64, "d")));

assertEquals(manuallyMade, parsed);

Expand All @@ -99,8 +99,7 @@ public void testEachPrimitiveType() {
schema.append(" required fixed_len_byte_array(3) fixed_;");
builder.required(FIXED_LEN_BYTE_ARRAY).length(3).named("fixed_");
} else {
schema.append(" required ").append(type)
.append(" ").append(type).append("_;\n");
schema.append(" required ").append(type).append(" ").append(type).append("_;\n");
builder.required(type).named(type.toString() + "_");
}
}
Expand Down Expand Up @@ -310,4 +309,26 @@ public void testEmbeddedAnnotations() {
MessageType reparsed = MessageTypeParser.parseMessageType(parsed.toString());
assertEquals(expected, reparsed);
}

@Test
public void testQuotedColumnNames() {
String message = "message IntMessage {" +
" required int32 `foo bar`;" +
" required int32 foo.bar;" +
" required int32 `foo``bar`;" +
" required int64 foo@#$bar;" +
"}\n";

MessageType parsed = MessageTypeParser.parseMessageType(message);
MessageType expected = Types.buildMessage()
.required(INT32).as(INT_8).named("foo bar")
.required(INT32).as(INT_16).named("foo.bar")
.required(INT32).as(INT_32).named("foo`bar")
.required(INT64).as(INT_64).named("foo@#$bar")
.named("IntMessage");

assertEquals(expected, parsed);
MessageType reparsed = MessageTypeParser.parseMessageType(parsed.toString());
assertEquals(expected, reparsed);
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should add checks that column names that worked before do not get quoted when generating the schema now.
Otherwise older versions of parquet won't be able to read those files.

}
Original file line number Diff line number Diff line change
Expand Up @@ -200,4 +200,16 @@ public void testIDs() throws Exception {
assertEquals(schema, schema2);
assertEquals(schema.toString(), schema2.toString());
}

@Test
public void testQuotedIdentifiers() throws Exception {
MessageType schema = new MessageType("test",
new PrimitiveType(REQUIRED, BINARY, "foo bar"),
new PrimitiveType(REQUIRED, BINARY, "foo()`bar"),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to support ` in names? Is it supported elsewhere, like in Hive?

It doesn't seem worth the added complexity in the tokenizer to me.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it is supported in Hive.

https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL

Have a look at the CREATE TABLE documentation.

new PrimitiveType(REQUIRED, BINARY, "foo@#$%^bar")
);
MessageType schema2 = MessageTypeParser.parseMessageType(schema.toString());
assertEquals(schema, schema2);
assertEquals(schema.toString(), schema2.toString());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,15 @@ public void testPrimitiveTypeConstruction() {
}
}

@Test
public void testPrimitiveTypeConstructionWithQuotedColumnName() {
String name = "Foo Bar";
Type.Repetition repetition = Type.Repetition.REQUIRED;
PrimitiveType expected = new PrimitiveType(repetition, INT32, name);
PrimitiveType built = Types.primitive(INT32, repetition).named(name);
Assert.assertEquals(expected, built);
}

@Test
public void testFixedTypeConstruction() {
String name = "fixed_";
Expand Down
Loading