Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@
import com.fasterxml.jackson.annotation.JsonSubTypes.Type;
import com.fasterxml.jackson.annotation.JsonTypeInfo;
import org.apache.druid.data.input.impl.CsvInputFormat;
import org.apache.druid.data.input.impl.DelimitedInputFormat;
import org.apache.druid.data.input.impl.JsonInputFormat;
import org.apache.druid.data.input.impl.NestedInputFormat;
import org.apache.druid.data.input.impl.RegexInputFormat;
import org.apache.druid.data.input.impl.SplittableInputSource;
import org.apache.druid.data.input.impl.TsvInputFormat;
import org.apache.druid.guice.annotations.UnstableApi;

import java.io.File;
Expand All @@ -38,7 +38,7 @@
* InputFormat abstracts the file format of input data.
* It creates a {@link InputEntityReader} to read data and parse it into {@link InputRow}.
* The created InputEntityReader is used by {@link InputSourceReader}.
*
* <p>
* See {@link NestedInputFormat} for nested input formats such as JSON.
*/
@UnstableApi
Expand All @@ -47,13 +47,13 @@
@Type(name = "csv", value = CsvInputFormat.class),
@Type(name = "json", value = JsonInputFormat.class),
@Type(name = "regex", value = RegexInputFormat.class),
@Type(name = "tsv", value = TsvInputFormat.class)
@Type(name = "tsv", value = DelimitedInputFormat.class)
})
public interface InputFormat
{
/**
* Trait to indicate that a file can be split into multiple {@link InputSplit}s.
*
* <p>
* This method is not being used anywhere for now, but should be considered
* in {@link SplittableInputSource#createSplits} in the future.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@

/**
* {@link InputEntityReader} that parses bytes into some intermediate rows first, and then into {@link InputRow}s.
* For example, {@link org.apache.druid.data.input.impl.CsvReader} parses bytes into string lines, and then parses
* For example, {@link org.apache.druid.data.input.impl.DelimitedValueReader} parses bytes into string lines, and then parses
* those lines into InputRows.
*
* @param <T> type of intermediate row. For example, it can be {@link String} for text formats.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
import javax.annotation.Nullable;
import java.util.List;

public class CsvInputFormat extends SeparateValueInputFormat
public class CsvInputFormat extends DelimitedInputFormat
{
@JsonCreator
public CsvInputFormat(
Expand All @@ -36,6 +36,6 @@ public CsvInputFormat(
@JsonProperty("skipHeaderRows") int skipHeaderRows
)
{
super(columns, listDelimiter, hasHeaderRow, findColumnsFromHeader, skipHeaderRows, Format.CSV);
super(columns, listDelimiter, ",", hasHeaderRow, findColumnsFromHeader, skipHeaderRows);
}
}
52 changes: 0 additions & 52 deletions core/src/main/java/org/apache/druid/data/input/impl/CsvReader.java

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -36,20 +36,19 @@
import java.util.Objects;

/**
* SeparateValueInputFormat abstracts the (Comma/Tab) Separate Value format of input data.
* It implements the common logic between {@link CsvInputFormat} and {@link TsvInputFormat}
* Should never be instantiated
* InputFormat for customized Delimitor Separate Value format of input data(default is TSV).
*/
public abstract class SeparateValueInputFormat implements InputFormat
public class DelimitedInputFormat implements InputFormat
{

public enum Format
{
CSV(',', "comma"),
TSV('\t', "tab");
TSV('\t', "tab"),
CustomizeSV('|', "");

private final char delimiter;
private final String literal;
private char delimiter;
private String literal;

Format(char delimiter, String literal)
{
Expand All @@ -62,6 +61,12 @@ public String getDelimiterAsString()
return String.valueOf(delimiter);
}

private void setDelimiter(String delimiter, String literal)
{
this.delimiter = (delimiter != null && delimiter.length() > 0) ? delimiter.charAt(0) : '\t';
this.literal = literal != null ? literal : "customize separator: " + delimiter;
}

public char getDelimiter()
{
return delimiter;
Expand All @@ -78,14 +83,15 @@ public String getLiteral()
private final boolean findColumnsFromHeader;
private final int skipHeaderRows;
private final Format format;

protected SeparateValueInputFormat(
@Nullable List<String> columns,
@Nullable String listDelimiter,
@Nullable Boolean hasHeaderRow,
@Nullable Boolean findColumnsFromHeader,
int skipHeaderRows,
Format format
private final String delimiter;

public DelimitedInputFormat(
@JsonProperty("columns") @Nullable List<String> columns,
@JsonProperty("listDelimiter") @Nullable String listDelimiter,
@JsonProperty("delimiter") @Nullable String delimiter,
@Deprecated @JsonProperty("hasHeaderRow") @Nullable Boolean hasHeaderRow,
@JsonProperty("findColumnsFromHeader") @Nullable Boolean findColumnsFromHeader,
@JsonProperty("skipHeaderRows") int skipHeaderRows
)
{
this.listDelimiter = listDelimiter;
Expand All @@ -98,8 +104,17 @@ protected SeparateValueInputFormat(
)
).getValue();
this.skipHeaderRows = skipHeaderRows;
this.format = format;

this.delimiter = delimiter == null ? "\t" : delimiter;
this.format = getFormat(this.delimiter);
Preconditions.checkArgument(
this.delimiter.length() == 1,
"The delimiter should be a single character"
);
Preconditions.checkArgument(
!this.delimiter.equals(listDelimiter),
"Cannot have same delimiter and list delimiter of [%s]",
this.delimiter
);
if (!this.columns.isEmpty()) {
for (String column : this.columns) {
Preconditions.checkArgument(
Expand All @@ -117,6 +132,18 @@ protected SeparateValueInputFormat(
}
}

private static Format getFormat(String delimiter)
{
if (",".equals(delimiter)) {
return Format.CSV;
} else if ("\t".equals(delimiter)) {
return Format.TSV;
} else {
Format.CustomizeSV.setDelimiter(delimiter, null);
return Format.CustomizeSV;
}
}


@JsonProperty
public List<String> getColumns()
Expand Down Expand Up @@ -151,22 +178,15 @@ public boolean isSplittable()
@Override
public InputEntityReader createReader(InputRowSchema inputRowSchema, InputEntity source, File temporaryDirectory)
{
return this.format == Format.TSV ? new TsvReader(
inputRowSchema,
source,
temporaryDirectory,
listDelimiter,
columns,
findColumnsFromHeader,
skipHeaderRows
) : new CsvReader(
return new DelimitedValueReader(
inputRowSchema,
source,
temporaryDirectory,
listDelimiter,
columns,
findColumnsFromHeader,
skipHeaderRows
skipHeaderRows,
this.format
);
}

Expand All @@ -179,7 +199,7 @@ public boolean equals(Object o)
if (o == null || getClass() != o.getClass()) {
return false;
}
SeparateValueInputFormat format = (SeparateValueInputFormat) o;
DelimitedInputFormat format = (DelimitedInputFormat) o;
return findColumnsFromHeader == format.findColumnsFromHeader &&
skipHeaderRows == format.skipHeaderRows &&
Objects.equals(listDelimiter, format.listDelimiter) &&
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import com.opencsv.RFC4180Parser;
import com.opencsv.RFC4180ParserBuilder;
import com.opencsv.enums.CSVReaderNullFieldIndicator;
import org.apache.druid.common.config.NullHandling;
import org.apache.druid.data.input.InputEntity;
import org.apache.druid.data.input.InputRow;
import org.apache.druid.data.input.InputRowSchema;
Expand All @@ -45,37 +46,38 @@
import java.util.Map;

/**
* SeparateValueReader abstracts the reader for (Comma/Tab) Separate Value format input data.
* It implements the common logic between {@link CsvReader} and {@link TsvReader}
* Should never be instantiated
* DelimitedValueReader is the reader for Delimitor Separate Value format input data(CSV/TSV).
*/
public abstract class SeparateValueReader extends TextReader
public class DelimitedValueReader extends TextReader
{
private final boolean findColumnsFromHeader;
private final int skipHeaderRows;
private final Function<String, Object> multiValueFunction;
@Nullable
private List<String> columns;
private final SeparateValueInputFormat.Format format;
private final RFC4180Parser parser;

public static RFC4180Parser createOpenCsvParser(char separator)
{
return new RFC4180ParserBuilder().withFieldAsNull(
return NullHandling.replaceWithDefault()
? new RFC4180ParserBuilder()
.withSeparator(separator)
.build()
: new RFC4180ParserBuilder().withFieldAsNull(
CSVReaderNullFieldIndicator.EMPTY_SEPARATORS)
.withSeparator(separator)
.build();
}

SeparateValueReader(
DelimitedValueReader(
InputRowSchema inputRowSchema,
InputEntity source,
File temporaryDirectory,
@Nullable String listDelimiter,
@Nullable List<String> columns,
boolean findColumnsFromHeader,
int skipHeaderRows,
SeparateValueInputFormat.Format format
DelimitedInputFormat.Format format
)
{
super(inputRowSchema, source, temporaryDirectory);
Expand All @@ -84,7 +86,6 @@ public static RFC4180Parser createOpenCsvParser(char separator)
final String finalListDelimeter = listDelimiter == null ? Parsers.DEFAULT_LIST_DELIMITER : listDelimiter;
this.multiValueFunction = ParserUtils.getMultiValueFunction(finalListDelimeter, Splitter.on(finalListDelimeter));
this.columns = findColumnsFromHeader ? null : columns; // columns will be overriden by header row
this.format = format;
this.parser = createOpenCsvParser(format.getDelimiter());

if (this.columns != null) {
Expand Down

This file was deleted.

Loading