Skip to content

Commit cc317dd

Browse files
authored
Add handling for ambiguous parsing options (#226)
GitHub: fix GH-225 With Ruby 3.0.2 and csv 3.2.1, the file ```ruby require "csv" File.open("example.tsv", "w") { |f| f.puts("foo\t\tbar") } CSV.read("example.tsv", col_sep: "\t", strip: true) ``` produces the error ``` lib/csv/parser.rb:935:in `parse_quotable_robust': TODO: Meaningful message in line 1. (CSV::MalformedCSVError) ``` However, the CSV in this example is not malformed; instead, ambiguous options were provided to the parser. It is not obvious (to me) whether the string should be parsed as - `["foo\t\tbar"]`, - `["foo", "bar"]`, - `["foo", "", "bar"]`, or - `["foo", nil, "bar"]`. This commit adds code that raises an exception when this situation is encountered. Specifically, it checks if the column separator either ends with or starts with the characters that would be stripped away. This commit also adds unit tests and updates the documentation.
1 parent 27c0b66 commit cc317dd

File tree

3 files changed

+59
-6
lines changed

3 files changed

+59
-6
lines changed

lib/csv.rb

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -341,14 +341,14 @@
341341
# liberal_parsing: false,
342342
# nil_value: nil,
343343
# empty_value: "",
344+
# strip: false,
344345
# # For generating.
345346
# write_headers: nil,
346347
# quote_empty: true,
347348
# force_quotes: false,
348349
# write_converters: nil,
349350
# write_nil_value: nil,
350351
# write_empty_value: "",
351-
# strip: false,
352352
# }
353353
#
354354
# ==== Options for Parsing
@@ -366,8 +366,9 @@
366366
# - +header_converters+: Specifies the header converters to be used.
367367
# - +skip_blanks+: Specifies whether blanks lines are to be ignored.
368368
# - +skip_lines+: Specifies how comments lines are to be recognized.
369-
# - +strip+: Specifies whether leading and trailing whitespace are
370-
# to be stripped from fields..
369+
# - +strip+: Specifies whether leading and trailing whitespace are to be
370+
# stripped from fields. This must be compatible with +col_sep+; if it is not,
371+
# then an +ArgumentError+ exception will be raised.
371372
# - +liberal_parsing+: Specifies whether \CSV should attempt to parse
372373
# non-compliant data.
373374
# - +nil_value+: Specifies the object that is to be substituted for each null (no-text) field.
@@ -946,14 +947,14 @@ def initialize(message, line_number)
946947
liberal_parsing: false,
947948
nil_value: nil,
948949
empty_value: "",
950+
strip: false,
949951
# For generating.
950952
write_headers: nil,
951953
quote_empty: true,
952954
force_quotes: false,
953955
write_converters: nil,
954956
write_nil_value: nil,
955957
write_empty_value: "",
956-
strip: false,
957958
}.freeze
958959

959960
class << self
@@ -1879,11 +1880,11 @@ def initialize(data,
18791880
encoding: nil,
18801881
nil_value: nil,
18811882
empty_value: "",
1883+
strip: false,
18821884
quote_empty: true,
18831885
write_converters: nil,
18841886
write_nil_value: nil,
1885-
write_empty_value: "",
1886-
strip: false)
1887+
write_empty_value: "")
18871888
raise ArgumentError.new("Cannot parse nil as CSV") if data.nil?
18881889

18891890
if data.is_a?(String)

lib/csv/parser.rb

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,7 @@ def prepare
361361
prepare_skip_lines
362362
prepare_strip
363363
prepare_separators
364+
validate_strip_and_col_sep_options
364365
prepare_quoted
365366
prepare_unquoted
366367
prepare_line
@@ -531,6 +532,28 @@ def prepare_separators
531532
@not_line_end = Regexp.new("[^\r\n]+".encode(@encoding))
532533
end
533534

535+
# This method verifies that there are no (obvious) ambiguities with the
536+
# provided +col_sep+ and +strip+ parsing options. For example, if +col_sep+
537+
# and +strip+ were both equal to +\t+, then there would be no clear way to
538+
# parse the input.
539+
def validate_strip_and_col_sep_options
540+
return unless @strip
541+
542+
if @strip.is_a?(String)
543+
if @column_separator.start_with?(@strip) || @column_separator.end_with?(@strip)
544+
raise ArgumentError,
545+
"The provided strip (#{@escaped_strip}) and " \
546+
"col_sep (#{@escaped_column_separator}) options are incompatible."
547+
end
548+
else
549+
if Regexp.new("\\A[#{@escaped_strip}]|[#{@escaped_strip}]\\z").match?(@column_separator)
550+
raise ArgumentError,
551+
"The provided strip (true) and " \
552+
"col_sep (#{@escaped_column_separator}) options are incompatible."
553+
end
554+
end
555+
end
556+
534557
def prepare_quoted
535558
if @quote_character
536559
@quotes = Regexp.new(@escaped_quote_character +

test/csv/parse/test_strip.rb

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,4 +80,33 @@ def test_do_not_strip_crlf
8080
%Q{"a" ,"b " \r\n},
8181
strip: true))
8282
end
83+
84+
def test_col_sep_incompatible_true
85+
message = "The provided strip (true) and " \
86+
"col_sep (\\t) options are incompatible."
87+
assert_raise_with_message(ArgumentError, message) do
88+
CSV.parse_line(%Q{"a"\t"b"\n},
89+
col_sep: "\t",
90+
strip: true)
91+
end
92+
end
93+
94+
def test_col_sep_incompatible_string
95+
message = "The provided strip (\\t) and " \
96+
"col_sep (\\t) options are incompatible."
97+
assert_raise_with_message(ArgumentError, message) do
98+
CSV.parse_line(%Q{"a"\t"b"\n},
99+
col_sep: "\t",
100+
strip: "\t")
101+
end
102+
end
103+
104+
def test_col_sep_compatible_string
105+
assert_equal(
106+
["a", "b"],
107+
CSV.parse_line(%Q{\va\tb\v\n},
108+
col_sep: "\t",
109+
strip: "\v")
110+
)
111+
end
83112
end

0 commit comments

Comments
 (0)