Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import org.apache.doris.common.util.DebugUtil;
import org.apache.doris.datasource.property.constants.BosProperties;
import org.apache.doris.datasource.property.constants.S3Properties;
import org.apache.doris.datasource.property.fileformat.FileFormatProperties;
import org.apache.doris.load.loadv2.LoadTask.MergeType;
import org.apache.doris.qe.ConnectContext;
import org.apache.doris.qe.SessionVariable;
Expand Down Expand Up @@ -147,11 +148,11 @@ private void analyze(String user, String db, boolean checkAuth) throws AnalysisE
getOrigStmt() != null ? getOrigStmt().originStmt : "", copyFromParam.getFileColumns(),
copyFromParam.getColumnMappingList(), copyFromParam.getFileFilterExpr());
}
dataDescProperties.put(FileFormatProperties.PROP_COMPRESS_TYPE, copyIntoProperties.getCompression());
dataDescription = new DataDescription(tableName.getTbl(), null, Lists.newArrayList(filePath),
copyFromParam.getFileColumns(), separator, fileFormatStr, null, false,
copyFromParam.getColumnMappingList(), copyFromParam.getFileFilterExpr(), null, MergeType.APPEND, null,
null, dataDescProperties);
dataDescription.setCompressType(StageUtil.parseCompressType(copyIntoProperties.getCompression()));
if (!(copyFromParam.getColumnMappingList() == null
|| copyFromParam.getColumnMappingList().isEmpty())) {
dataDescription.setIgnoreCsvRedundantCol(true);
Expand Down
372 changes: 120 additions & 252 deletions fe/fe-core/src/main/java/org/apache/doris/analysis/DataDescription.java

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -226,9 +226,11 @@ public void createScanRangeLocations(FileLoadScanNode.ParamCreateContext context
// If any of the file is unsplittable, all files will be treated as unsplittable.
boolean isSplittable = true;
for (TBrokerFileStatus fileStatus : fileStatuses) {
TFileFormatType formatType = formatType(context.fileGroup.getFileFormat(), fileStatus.path);
TFileFormatType formatType = formatType(context.fileGroup.getFileFormatProperties().getFormatName(),
fileStatus.path);
TFileCompressType compressType =
Util.getOrInferCompressType(context.fileGroup.getCompressType(), fileStatus.path);
Util.getOrInferCompressType(context.fileGroup.getFileFormatProperties().getCompressionType(),
fileStatus.path);
// Now only support split plain text
if (compressType == TFileCompressType.PLAIN
&& ((formatType == TFileFormatType.FORMAT_CSV_PLAIN && fileStatus.isSplitable)
Expand Down Expand Up @@ -257,10 +259,12 @@ public void createScanRangeLocationsUnsplittable(FileLoadScanNode.ParamCreateCon
TScanRangeLocations locations = newLocations(context.params, brokerDesc, backendPolicy);
for (int i : group) {
TBrokerFileStatus fileStatus = fileStatuses.get(i);
TFileFormatType formatType = formatType(context.fileGroup.getFileFormat(), fileStatus.path);
TFileFormatType formatType = formatType(context.fileGroup.getFileFormatProperties().getFormatName(),
fileStatus.path);
context.params.setFormatType(formatType);
TFileCompressType compressType =
Util.getOrInferCompressType(context.fileGroup.getCompressType(), fileStatus.path);
Util.getOrInferCompressType(context.fileGroup.getFileFormatProperties().getCompressionType(),
fileStatus.path);
context.params.setCompressType(compressType);
List<String> columnsFromPath = BrokerUtil.parseColumnsFromPath(fileStatus.path,
context.fileGroup.getColumnNamesFromPath());
Expand Down Expand Up @@ -299,10 +303,12 @@ public void createScanRangeLocationsSplittable(FileLoadScanNode.ParamCreateConte
long leftBytes = fileStatus.size - curFileOffset;
long tmpBytes = curInstanceBytes + leftBytes;
// header_type
TFileFormatType formatType = formatType(context.fileGroup.getFileFormat(), fileStatus.path);
TFileFormatType formatType = formatType(context.fileGroup.getFileFormatProperties().getFormatName(),
fileStatus.path);
context.params.setFormatType(formatType);
TFileCompressType compressType =
Util.getOrInferCompressType(context.fileGroup.getCompressType(), fileStatus.path);
Util.getOrInferCompressType(context.fileGroup.getFileFormatProperties().getCompressionType(),
fileStatus.path);
context.params.setCompressType(compressType);
List<String> columnsFromPath = BrokerUtil.parseColumnsFromPath(fileStatus.path,
context.fileGroup.getColumnNamesFromPath());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
import org.apache.doris.common.MetaNotFoundException;
import org.apache.doris.common.UserException;
import org.apache.doris.common.util.FileFormatConstants;
import org.apache.doris.common.util.Util;
import org.apache.doris.datasource.property.fileformat.FileFormatProperties;
import org.apache.doris.load.BrokerFileGroup;
import org.apache.doris.load.Load;
import org.apache.doris.load.loadv2.LoadTask;
Expand All @@ -42,7 +42,6 @@
import org.apache.doris.thrift.TFileFormatType;
import org.apache.doris.thrift.TFileScanRangeParams;
import org.apache.doris.thrift.TFileScanSlotInfo;
import org.apache.doris.thrift.TFileTextScanRangeParams;
import org.apache.doris.thrift.TFileType;
import org.apache.doris.thrift.THdfsParams;
import org.apache.doris.thrift.TScanRangeLocations;
Expand Down Expand Up @@ -86,10 +85,11 @@ public FileLoadScanNode.ParamCreateContext createContext(Analyzer analyzer) thro
ctx.destTupleDescriptor = destTupleDesc;
ctx.fileGroup = fileGroupInfo.getFileGroup();
ctx.timezone = analyzer.getTimezone();
FileFormatProperties fileFormatProperties = fileGroupInfo.getFileGroup().getFileFormatProperties();

TFileScanRangeParams params = new TFileScanRangeParams();
params.setFormatType(formatType(fileGroupInfo.getFileGroup().getFileFormat()));
params.setCompressType(fileGroupInfo.getFileGroup().getCompressType());
params.setFormatType(fileFormatProperties.getFileFormatType());
params.setCompressType(fileFormatProperties.getCompressionType());
params.setStrictMode(fileGroupInfo.isStrictMode());
if (fileGroupInfo.getSequenceMapCol() != null) {
params.setSequenceMapCol(fileGroupInfo.getSequenceMapCol());
Expand All @@ -100,8 +100,7 @@ public FileLoadScanNode.ParamCreateContext createContext(Analyzer analyzer) thro
.getBackendConfigProperties());
params.setHdfsParams(tHdfsParams);
}
TFileAttributes fileAttributes = new TFileAttributes();
setFileAttributes(ctx.fileGroup, fileAttributes);
TFileAttributes fileAttributes = setFileAttributes(ctx.fileGroup);
params.setFileAttributes(fileAttributes);
params.setFileType(fileGroupInfo.getFileType());
ctx.params = params;
Expand All @@ -110,24 +109,11 @@ public FileLoadScanNode.ParamCreateContext createContext(Analyzer analyzer) thro
return ctx;
}

public void setFileAttributes(BrokerFileGroup fileGroup, TFileAttributes fileAttributes) {
TFileTextScanRangeParams textParams = new TFileTextScanRangeParams();
textParams.setColumnSeparator(fileGroup.getColumnSeparator());
textParams.setLineDelimiter(fileGroup.getLineDelimiter());
textParams.setEnclose(fileGroup.getEnclose());
textParams.setEscape(fileGroup.getEscape());
fileAttributes.setTextParams(textParams);
fileAttributes.setStripOuterArray(fileGroup.isStripOuterArray());
fileAttributes.setJsonpaths(fileGroup.getJsonPaths());
fileAttributes.setJsonRoot(fileGroup.getJsonRoot());
fileAttributes.setNumAsString(fileGroup.isNumAsString());
fileAttributes.setFuzzyParse(fileGroup.isFuzzyParse());
fileAttributes.setReadJsonByLine(fileGroup.isReadJsonByLine());
fileAttributes.setReadByColumnDef(true);
fileAttributes.setHeaderType(getHeaderType(fileGroup.getFileFormat()));
fileAttributes.setTrimDoubleQuotes(fileGroup.getTrimDoubleQuotes());
fileAttributes.setSkipLines(fileGroup.getSkipLines());
fileAttributes.setIgnoreCsvRedundantCol(fileGroup.getIgnoreCsvRedundantCol());
/**
 * Builds the thrift file attributes for the given file group.
 *
 * <p>The attributes are first produced by the group's file format properties; on top of
 * that, read-by-column-def is switched on and the group's ignore-csv-redundant-col flag
 * is copied over.
 *
 * @param fileGroup the file group whose format properties and flags are read
 * @return the resulting file attributes
 */
public TFileAttributes setFileAttributes(BrokerFileGroup fileGroup) {
    FileFormatProperties formatProperties = fileGroup.getFileFormatProperties();
    TFileAttributes attributes = formatProperties.toTFileAttributes();
    attributes.setReadByColumnDef(true);
    boolean ignoreRedundantCol = fileGroup.getIgnoreCsvRedundantCol();
    attributes.setIgnoreCsvRedundantCol(ignoreRedundantCol);
    return attributes;
}

private String getHeaderType(String formatType) {
Expand Down Expand Up @@ -219,7 +205,7 @@ private void initColumns(FileLoadScanNode.ParamCreateContext context, Analyzer a
List<Integer> srcSlotIds = Lists.newArrayList();
Load.initColumns(fileGroupInfo.getTargetTable(), columnDescs, context.fileGroup.getColumnToHadoopFunction(),
context.exprMap, analyzer, context.srcTupleDescriptor, context.srcSlotDescByName, srcSlotIds,
formatType(context.fileGroup.getFileFormat()), fileGroupInfo.getHiddenColumns(),
context.fileGroup.getFileFormatProperties().getFileFormatType(), fileGroupInfo.getHiddenColumns(),
fileGroupInfo.getUniqueKeyUpdateMode());

int columnCountFromPath = 0;
Expand Down Expand Up @@ -250,18 +236,6 @@ private boolean shouldAddSequenceColumn(LoadTaskInfo.ImportColumnDescs columnDes
.equalsIgnoreCase(Column.DELETE_SIGN);
}

/**
 * Resolves a file format name to its thrift format type.
 *
 * @param fileFormat the format name; may be null
 * @return {@code FORMAT_CSV_PLAIN} when {@code fileFormat} is null, otherwise the type
 *         returned by {@code Util.getFileFormatTypeFromName}
 * @throws UserException if the name resolves to {@code FORMAT_UNKNOWN}
 */
private TFileFormatType formatType(String fileFormat) throws UserException {
    // No explicit format given: default to plain CSV.
    if (fileFormat == null) {
        return TFileFormatType.FORMAT_CSV_PLAIN;
    }
    TFileFormatType resolved = Util.getFileFormatTypeFromName(fileFormat);
    if (resolved != TFileFormatType.FORMAT_UNKNOWN) {
        return resolved;
    }
    throw new UserException("Not supported file format: " + fileFormat);
}

/**
 * Returns the target table reported by {@code fileGroupInfo}.
 */
public TableIf getTargetTable() {
return fileGroupInfo.getTargetTable();
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

package org.apache.doris.datasource.property.fileformat;

import org.apache.doris.nereids.exceptions.AnalysisException;
import org.apache.doris.thrift.TFileAttributes;
import org.apache.doris.thrift.TFileFormatType;
import org.apache.doris.thrift.TFileTextScanRangeParams;
import org.apache.doris.thrift.TResultFileSinkOptions;

import java.util.Map;

/**
 * File format properties for the Arrow format.
 *
 * <p>The instance is bound to {@link TFileFormatType#FORMAT_ARROW}. This class defines no
 * format-specific options: both property analysis and result-sink option filling have
 * empty implementations.
 */
public class ArrowFileFormatProperties extends FileFormatProperties {

    /** Creates the properties object with the Arrow format type and format name. */
    public ArrowFileFormatProperties() {
        super(TFileFormatType.FORMAT_ARROW, FileFormatProperties.FORMAT_ARROW);
    }

    /** Empty implementation: {@code sinkOptions} is left untouched. */
    @Override
    public void fullTResultFileSinkOptions(TResultFileSinkOptions sinkOptions) {
        // nothing to fill in
    }

    /**
     * Builds the thrift file attributes.
     *
     * @return a new {@link TFileAttributes} whose text params are a freshly constructed
     *         {@link TFileTextScanRangeParams} with no fields set by this method
     */
    @Override
    public TFileAttributes toTFileAttributes() {
        TFileAttributes attributes = new TFileAttributes();
        attributes.setTextParams(new TFileTextScanRangeParams());
        return attributes;
    }

    /** Empty implementation: {@code formatProperties} is neither read nor modified. */
    @Override
    public void analyzeFileFormatProperties(Map<String, String> formatProperties, boolean isRemoveOriginProperty)
            throws AnalysisException {
        // nothing to analyze
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
package org.apache.doris.datasource.property.fileformat;

import org.apache.doris.analysis.Separator;
import org.apache.doris.catalog.Column;
import org.apache.doris.common.util.Util;
import org.apache.doris.nereids.exceptions.AnalysisException;
import org.apache.doris.qe.ConnectContext;
Expand All @@ -28,11 +27,9 @@
import org.apache.doris.thrift.TResultFileSinkOptions;

import com.google.common.base.Strings;
import com.google.common.collect.Lists;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.util.List;
import java.util.Map;

public class CsvFileFormatProperties extends FileFormatProperties {
Expand All @@ -53,18 +50,16 @@ public class CsvFileFormatProperties extends FileFormatProperties {
public static final String PROP_ENCLOSE = "enclose";
public static final String PROP_ESCAPE = "escape";

public static final String PROP_ENABLE_TEXT_VALIDATE_UTF8 = "enable_text_validate_utf8";

private String headerType = "";
private String columnSeparator = DEFAULT_COLUMN_SEPARATOR;
private String lineDelimiter = DEFAULT_LINE_DELIMITER;
private boolean trimDoubleQuotes;
private int skipLines;
private byte enclose;

private byte escape;

// used by tvf
// User specified csv columns, it will override columns got from file
private final List<Column> csvSchema = Lists.newArrayList();
private boolean enableTextValidateUTF8 = true;

String defaultColumnSeparator = DEFAULT_COLUMN_SEPARATOR;

Expand Down Expand Up @@ -103,9 +98,6 @@ public void analyzeFileFormatProperties(Map<String, String> formatProperties, bo
throw new AnalysisException("enclose should not be longer than one byte.");
}
enclose = (byte) enclosedString.charAt(0);
if (enclose == 0) {
throw new AnalysisException("enclose should not be byte [0].");
}
}

String escapeStr = getOrDefault(formatProperties, PROP_ESCAPE,
Expand All @@ -131,6 +123,18 @@ public void analyzeFileFormatProperties(Map<String, String> formatProperties, bo
PROP_COMPRESS_TYPE, "UNKNOWN", isRemoveOriginProperty);
compressionType = Util.getFileCompressType(compressTypeStr);

// get ENABLE_TEXT_VALIDATE_UTF8 from properties map first,
// if not exist, try getting from session variable,
// if connection context is null, use "true" as default value.
String validateUtf8 = getOrDefault(formatProperties, PROP_ENABLE_TEXT_VALIDATE_UTF8, "",
isRemoveOriginProperty);
if (Strings.isNullOrEmpty(validateUtf8)) {
enableTextValidateUTF8 = ConnectContext.get() == null ? true
: ConnectContext.get().getSessionVariable().enableTextValidateUtf8;
} else {
enableTextValidateUTF8 = Boolean.parseBoolean(validateUtf8);
}

} catch (org.apache.doris.common.AnalysisException e) {
throw new AnalysisException(e.getMessage());
}
Expand All @@ -149,15 +153,13 @@ public TFileAttributes toTFileAttributes() {
TFileTextScanRangeParams fileTextScanRangeParams = new TFileTextScanRangeParams();
fileTextScanRangeParams.setColumnSeparator(this.columnSeparator);
fileTextScanRangeParams.setLineDelimiter(this.lineDelimiter);
if (this.enclose != 0) {
fileTextScanRangeParams.setEnclose(this.enclose);
}
fileTextScanRangeParams.setEnclose(this.enclose);
fileTextScanRangeParams.setEscape(this.escape);
fileAttributes.setTextParams(fileTextScanRangeParams);
fileAttributes.setHeaderType(headerType);
fileAttributes.setTrimDoubleQuotes(trimDoubleQuotes);
fileAttributes.setSkipLines(skipLines);
fileAttributes.setEnableTextValidateUtf8(
ConnectContext.get().getSessionVariable().enableTextValidateUtf8);
fileAttributes.setEnableTextValidateUtf8(enableTextValidateUTF8);
return fileAttributes;
}

Expand Down Expand Up @@ -188,8 +190,5 @@ public byte getEnclose() {
/**
 * Returns the value of the {@code escape} field.
 */
public byte getEscape() {
return escape;
}

/**
 * Returns the {@code csvSchema} list itself (not a copy), i.e. the user-specified
 * CSV columns kept by this object.
 */
public List<Column> getCsvSchema() {
return csvSchema;
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,10 @@ public abstract void analyzeFileFormatProperties(
public abstract TFileAttributes toTFileAttributes();

public static FileFormatProperties createFileFormatProperties(String formatString) {
switch (formatString) {
if (formatString == null) {
throw new AnalysisException("formatString can not be null");
}
switch (formatString.toLowerCase()) {
case FORMAT_CSV:
return new CsvFileFormatProperties(formatString);
case FORMAT_HIVE_TEXT:
Expand All @@ -91,15 +94,16 @@ public static FileFormatProperties createFileFormatProperties(String formatStrin
return new AvroFileFormatProperties();
case FORMAT_WAL:
return new WalFileFormatProperties();
case FORMAT_ARROW:
return new ArrowFileFormatProperties();
default:
throw new AnalysisException("format:" + formatString + " is not supported.");
}
}

public static FileFormatProperties createFileFormatProperties(Map<String, String> formatProperties)
throws AnalysisException {
String formatString = formatProperties.getOrDefault(PROP_FORMAT, "")
.toLowerCase();
String formatString = formatProperties.getOrDefault(PROP_FORMAT, "csv");
return createFileFormatProperties(formatString);
}

Expand Down
Loading