Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -106,3 +106,67 @@ insert into serde_test4 values(1, "abc"),(2, "def");
insert into serde_test5 values(1, "abc"),(2, "def");
insert into serde_test6 values(1, "abc"),(2, "def");
insert into serde_test7 values(1, null),(2, "|||"),(3, "aaa"),(4, "\"null\"");

-- OpenCSVSerde table with NO explicit SERDEPROPERTIES: exercises the reader's
-- built-in defaults (separatorChar ",", quoteChar '"', escapeChar "\").
-- Note: OpenCSVSerde physically stores every column as text; typed columns here
-- are cast on read -- NOTE(review): confirm cast behavior matches the engine.
CREATE TABLE test_open_csv_default_prop (
id INT,
name STRING,
age INT,
salary DOUBLE,
is_active BOOLEAN,
hire_date DATE,
last_login TIMESTAMP,
rating FLOAT,
description STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
STORED AS TEXTFILE;

-- OpenCSVSerde table that EXPLICITLY sets the standard properties to the same
-- values as the serde defaults (",", '"', "\"): results must match the
-- default-property table above byte-for-byte.
CREATE TABLE test_open_csv_standard_prop (
id INT,
name STRING,
age INT,
salary DOUBLE,
is_active BOOLEAN,
hire_date DATE,
last_login TIMESTAMP,
rating FLOAT,
description STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
"separatorChar" = ",",
"quoteChar" = "\"",
"escapeChar" = "\\"
)
STORED AS TEXTFILE;

-- OpenCSVSerde table with NON-default properties: tab separator, single-quote
-- quoting, and pipe escape. Exercises the property-override path so the reader
-- must not fall back to the "," / '"' / "\" defaults.
CREATE TABLE test_open_csv_custom_prop (
id INT,
name STRING,
age INT,
salary DOUBLE,
is_active BOOLEAN,
hire_date DATE,
last_login TIMESTAMP,
rating FLOAT,
description STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
"separatorChar" = "\t",
"quoteChar" = "\'",
"escapeChar" = "|"
)
STORED AS TEXTFILE;

-- Identical two-row fixture for each of the three OpenCSV tables.
-- Row 2 deliberately stresses the CSV edge cases: NULLs in several typed
-- columns, an embedded separator in 'Jane,Smith', and escaped double quotes
-- around "Project Manager" (which must round-trip through quoteChar/escapeChar).
INSERT INTO TABLE test_open_csv_default_prop VALUES
(1, 'John Doe', 28, 50000.75, true, '2022-01-15', '2023-10-21 14:30:00', 4.5, 'Senior Developer'),
(2, 'Jane,Smith', NULL, NULL, false, '2020-05-20', NULL, NULL, '\"Project Manager\"');

INSERT INTO TABLE test_open_csv_standard_prop VALUES
(1, 'John Doe', 28, 50000.75, true, '2022-01-15', '2023-10-21 14:30:00', 4.5, 'Senior Developer'),
(2, 'Jane,Smith', NULL, NULL, false, '2020-05-20', NULL, NULL, '\"Project Manager\"');

INSERT INTO TABLE test_open_csv_custom_prop VALUES
(1, 'John Doe', 28, 50000.75, true, '2022-01-15', '2023-10-21 14:30:00', 4.5, 'Senior Developer'),
(2, 'Jane,Smith', NULL, NULL, false, '2020-05-20', NULL, NULL, '\"Project Manager\"');
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import com.google.common.collect.ImmutableSet;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.serde2.OpenCSVSerde;

import java.util.HashMap;
import java.util.Map;
Expand All @@ -27,15 +28,12 @@

public class HiveProperties {
public static final String PROP_FIELD_DELIMITER = "field.delim";
public static final String PROP_SEPARATOR_CHAR = "separatorChar";
public static final String PROP_SERIALIZATION_FORMAT = "serialization.format";
public static final String DEFAULT_FIELD_DELIMITER = "\1"; // "\x01"

public static final String PROP_LINE_DELIMITER = "line.delim";
public static final String DEFAULT_LINE_DELIMITER = "\n";

public static final String PROP_QUOTE_CHAR = "quoteChar";

public static final String PROP_COLLECTION_DELIMITER_HIVE2 = "colelction.delim";
public static final String PROP_COLLECTION_DELIMITER_HIVE3 = "collection.delim";
public static final String DEFAULT_COLLECTION_DELIMITER = "\2";
Expand All @@ -49,6 +47,14 @@ public class HiveProperties {
public static final String PROP_NULL_FORMAT = "serialization.null.format";
public static final String DEFAULT_NULL_FORMAT = "\\N";

// The following properties are used for OpenCsvSerde.
public static final String PROP_SEPARATOR_CHAR = OpenCSVSerde.SEPARATORCHAR;
public static final String DEFAULT_SEPARATOR_CHAR = ",";
public static final String PROP_QUOTE_CHAR = OpenCSVSerde.QUOTECHAR;
public static final String DEFAULT_QUOTE_CHAR = "\"";
public static final String PROP_ESCAPE_CHAR = OpenCSVSerde.ESCAPECHAR;
public static final String DEFAULT_ESCAPE_CHAR = "\\";

public static final Set<String> HIVE_SERDE_PROPERTIES = ImmutableSet.of(
PROP_FIELD_DELIMITER,
PROP_COLLECTION_DELIMITER_HIVE2,
Expand All @@ -59,37 +65,33 @@ public class HiveProperties {
PROP_QUOTE_CHAR,
PROP_MAP_KV_DELIMITER,
PROP_ESCAPE_DELIMITER,
PROP_NULL_FORMAT
);
PROP_ESCAPE_CHAR,
PROP_NULL_FORMAT);

// Field delimiter for text-format (LazySimpleSerDe) tables: prefers
// "field.delim", then "serialization.format", falling back to "\1";
// the chosen value is unescaped via getByte.
public static String getFieldDelimiter(Table table) {
// This method is used for text format (LazySimpleSerDe) only.
// For csv (OpenCSVSerde) tables, use `getSeparatorChar` instead.
Optional<String> fieldDelim = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_FIELD_DELIMITER);
Optional<String> serFormat = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SERIALIZATION_FORMAT);
return HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault(
DEFAULT_FIELD_DELIMITER, fieldDelim, serFormat));
}

public static String getColumnSeparator(Table table) {
Optional<String> fieldDelim = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_FIELD_DELIMITER);
Optional<String> columnSeparator = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SEPARATOR_CHAR);
Optional<String> serFormat = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SERIALIZATION_FORMAT);
return HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault(
DEFAULT_FIELD_DELIMITER, fieldDelim, columnSeparator, serFormat));
// Column separator for OpenCSVSerde tables: the "separatorChar" serde
// property when present, otherwise the "," default. Used verbatim (no
// getByte unescaping), unlike the text-format field delimiter.
public static String getSeparatorChar(Table table) {
    return HiveMetaStoreClientHelper.firstPresentOrDefault(
            DEFAULT_SEPARATOR_CHAR,
            HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SEPARATOR_CHAR));
}


public static String getLineDelimiter(Table table) {
Optional<String> lineDelim = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_LINE_DELIMITER);
return HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault(
DEFAULT_LINE_DELIMITER, lineDelim));
DEFAULT_LINE_DELIMITER, lineDelim));
}

public static String getMapKvDelimiter(Table table) {
Optional<String> mapkvDelim = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_MAP_KV_DELIMITER);
return HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault(
DEFAULT_MAP_KV_DELIMITER, mapkvDelim));
DEFAULT_MAP_KV_DELIMITER, mapkvDelim));
}

public static String getCollectionDelimiter(Table table) {
Expand All @@ -101,14 +103,6 @@ public static String getCollectionDelimiter(Table table) {
DEFAULT_COLLECTION_DELIMITER, collectionDelimHive2, collectionDelimHive3));
}

public static Optional<String> getQuoteChar(Table table) {
Map<String, String> serdeParams = table.getSd().getSerdeInfo().getParameters();
if (serdeParams.containsKey(PROP_QUOTE_CHAR)) {
return Optional.of(serdeParams.get(PROP_QUOTE_CHAR));
}
return Optional.empty();
}

public static Optional<String> getEscapeDelimiter(Table table) {
Optional<String> escapeDelim = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_ESCAPE_DELIMITER);
if (escapeDelim.isPresent()) {
Expand All @@ -127,6 +121,16 @@ public static String getNullFormat(Table table) {
return HiveMetaStoreClientHelper.firstPresentOrDefault(DEFAULT_NULL_FORMAT, nullFormat);
}

// Quote character for OpenCSVSerde tables: the "quoteChar" serde property
// when present, otherwise the double-quote default. Always non-empty, so
// callers may safely take getBytes()[0].
public static String getQuoteChar(Table table) {
    return HiveMetaStoreClientHelper.firstPresentOrDefault(
            DEFAULT_QUOTE_CHAR,
            HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_QUOTE_CHAR));
}

// Escape character for OpenCSVSerde tables: the "escapeChar" serde property
// when present, otherwise the backslash default. Always non-empty, so
// callers may safely take getBytes()[0].
public static String getEscapeChar(Table table) {
    return HiveMetaStoreClientHelper.firstPresentOrDefault(
            DEFAULT_ESCAPE_CHAR,
            HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_ESCAPE_CHAR));
}

// Set properties to table
public static void setTableProperties(Table table, Map<String, String> properties) {
HashMap<String, String> serdeProps = new HashMap<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -382,20 +382,36 @@ protected Map<String, String> getLocationProperties() throws UserException {
protected TFileAttributes getFileAttributes() throws UserException {
TFileTextScanRangeParams textParams = new TFileTextScanRangeParams();
Table table = hmsTable.getRemoteTable();
// 1. set column separator
textParams.setColumnSeparator(HiveProperties.getColumnSeparator(table));
// 2. set line delimiter
textParams.setLineDelimiter(HiveProperties.getLineDelimiter(table));
// 3. set mapkv delimiter
textParams.setMapkvDelimiter(HiveProperties.getMapKvDelimiter(table));
// 4. set collection delimiter
textParams.setCollectionDelimiter(HiveProperties.getCollectionDelimiter(table));
// 5. set quote char
HiveProperties.getQuoteChar(table).ifPresent(d -> textParams.setEnclose(d.getBytes()[0]));
// 6. set escape delimiter
HiveProperties.getEscapeDelimiter(table).ifPresent(d -> textParams.setEscape(d.getBytes()[0]));
// 7. set null format
textParams.setNullFormat(HiveProperties.getNullFormat(table));
// TODO: separate hive text table and OpenCsv table
String serDeLib = table.getSd().getSerdeInfo().getSerializationLib();
if (serDeLib.equals("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) {
// set properties of LazySimpleSerDe
// 1. set column separator
textParams.setColumnSeparator(HiveProperties.getFieldDelimiter(table));
// 2. set line delimiter
textParams.setLineDelimiter(HiveProperties.getLineDelimiter(table));
// 3. set mapkv delimiter
textParams.setMapkvDelimiter(HiveProperties.getMapKvDelimiter(table));
// 4. set collection delimiter
textParams.setCollectionDelimiter(HiveProperties.getCollectionDelimiter(table));
// 5. set escape delimiter
HiveProperties.getEscapeDelimiter(table).ifPresent(d -> textParams.setEscape(d.getBytes()[0]));
// 6. set null format
textParams.setNullFormat(HiveProperties.getNullFormat(table));
} else if (serDeLib.equals("org.apache.hadoop.hive.serde2.OpenCSVSerde")) {
// set properties of OpenCSVSerde
// 1. set column separator
textParams.setColumnSeparator(HiveProperties.getSeparatorChar(table));
// 2. set line delimiter
textParams.setLineDelimiter(HiveProperties.getLineDelimiter(table));
// 3. set enclose char
textParams.setEnclose(HiveProperties.getQuoteChar(table).getBytes()[0]);
// 4. set escape char
textParams.setEscape(HiveProperties.getEscapeChar(table).getBytes()[0]);
} else {
throw new UserException(
"unsupported hive table serde: " + serDeLib);
}

TFileAttributes fileAttributes = new TFileAttributes();
fileAttributes.setTextParams(textParams);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,18 @@ b 2.2
3 aaa
4 "null"

-- !test_open_csv_default_prop --
1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer
2 Jane,Smith FALSE 2020-05-20 ""Project Manager""

-- !test_open_csv_standard_prop --
1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer
2 Jane,Smith FALSE 2020-05-20 "Project Manager"

-- !test_open_csv_custom_prop --
1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer
2 Jane,Smith FALSE 2020-05-20 "Project Manager"

-- !1 --
a 1.1
b 2.2
Expand Down Expand Up @@ -79,3 +91,14 @@ b 2.2
3 aaa
4 "null"

-- !test_open_csv_default_prop --
1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer
2 Jane,Smith FALSE 2020-05-20 ""Project Manager""

-- !test_open_csv_standard_prop --
1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer
2 Jane,Smith FALSE 2020-05-20 "Project Manager"

-- !test_open_csv_custom_prop --
1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer
2 Jane,Smith FALSE 2020-05-20 "Project Manager"
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@ suite("test_hive_serde_prop", "external_docker,hive,external_docker_hive,p0,exte
hive_docker """truncate table regression.serde_test8;"""
sql """insert into ${catalog_name}.regression.serde_test8 select * from ${catalog_name}.regression.serde_test7;"""
qt_9 """select * from ${catalog_name}.regression.serde_test8 order by id;"""

qt_test_open_csv_default_prop """select * from ${catalog_name}.regression.test_open_csv_default_prop order by id;"""
qt_test_open_csv_standard_prop """select * from ${catalog_name}.regression.test_open_csv_standard_prop order by id;"""
qt_test_open_csv_custom_prop """select * from ${catalog_name}.regression.test_open_csv_custom_prop order by id;"""
}
}