Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@

import com.google.common.annotations.VisibleForTesting;
import com.opencsv.RFC4180Parser;
import com.opencsv.RFC4180ParserBuilder;
import com.opencsv.enums.CSVReaderNullFieldIndicator;
import org.apache.druid.common.config.NullHandling;

import javax.annotation.Nullable;
import java.io.IOException;
Expand All @@ -29,7 +32,10 @@

public class CSVParser extends AbstractFlatTextFormatParser
{
private final RFC4180Parser parser = new RFC4180Parser();
private final RFC4180Parser parser = NullHandling.replaceWithDefault()
? new RFC4180Parser()
: new RFC4180ParserBuilder().withFieldAsNull(
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if this could just be the all the time mode, since I think it wouldn't really change anything for the default mode either since the values would be coerced to ''.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The intention was to preserve backward compatibility but thinking about it, the empty separators flag might not impact the default mode. I'll do a few tests and raise a PR to make CSVReaderNullFieldIndicator.EMPTY_SEPARATORSthe default mode if it goes well.

CSVReaderNullFieldIndicator.EMPTY_SEPARATORS).build();

public CSVParser(
@Nullable final String listDelimiter,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import org.apache.druid.common.config.NullHandling;

import javax.annotation.Nullable;
import java.util.ArrayList;
Expand Down Expand Up @@ -94,7 +95,12 @@ private List<String> splitToList(String input)
List<String> result = new ArrayList<String>();

while (iterator.hasNext()) {
result.add(iterator.next());
String splitValue = iterator.next();
if (!NullHandling.replaceWithDefault() && splitValue.isEmpty()) {
result.add(null);
} else {
result.add(splitValue);
}
}

return Collections.unmodifiableList(result);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,20 @@ public void testWithoutStartFileFromBeginning()
parser.parseToMap(body[0]);
}

@Test
public void testWithNullValues()
{
final Parser<String, Object> parser = PARSER_FACTORY.get(format, true, 0);
parser.startFileFromBeginning();
final String[] body = new String[]{
concat(format, "time", "value1", "value2"),
concat(format, "hello", "world", "")
};
Assert.assertNull(parser.parseToMap(body[0]));
final Map<String, Object> jsonMap = parser.parseToMap(body[1]);
Assert.assertNull(jsonMap.get("value2"));
}

private static class FlatTextFormatParserFactory
{
public Parser<String, Object> get(FlatTextFormat format)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -814,9 +814,7 @@ private ParseSpec getParseSpec(TimestampSpec timestampSpec, DimensionsSpec dimen
private String getUnparseableTimestampString()
{
return ParserType.STR_CSV.equals(parserType)
? (USE_DEFAULT_VALUE_FOR_NULL
? "Unparseable timestamp found! Event: {t=bad_timestamp, dim1=foo, dim2=null, met1=6}"
: "Unparseable timestamp found! Event: {t=bad_timestamp, dim1=foo, dim2=, met1=6}")
? "Unparseable timestamp found! Event: {t=bad_timestamp, dim1=foo, dim2=null, met1=6}"
Copy link
Copy Markdown
Contributor Author

@a2l007 a2l007 Nov 1, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nishantmonu51 Please let me know if you think there is a better way to do this.
With the CSVParser fix in this PR, the way to specify empty values for csv files would be ""
One of the test data samples is bad_timestamp,foo,,6 and dim2 would be null regardless of whether null handling is enabled or not.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should be ok. @nishantmonu51 do you have something in your mind?

: "Unparseable timestamp found! Event: {t=bad_timestamp, dim1=foo, met1=6}";
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ public void setup() throws Exception
"2011-01-12T00:00:00.000Z,product_1,t1\tt2\tt3,u1\tu2",
"2011-01-13T00:00:00.000Z,product_2,t3\tt4\tt5,u3\tu4",
"2011-01-14T00:00:00.000Z,product_3,t5\tt6\tt7,u1\tu5",
"2011-01-14T00:00:00.000Z,product_4,,u2"
"2011-01-14T00:00:00.000Z,product_4,\"\",u2"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is this change for?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is to preserve the original test, since previously csv parser would always produce empty strings here, but after this change in null compatible mode it will produce a null

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh I see.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for clarifying this. I was too late 💤

};

for (String row : rows) {
Expand Down