Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 21 additions & 3 deletions cpp/src/arrow/compute/api_scalar.cc
Original file line number Diff line number Diff line change
Expand Up @@ -245,8 +245,12 @@ static auto kMakeStructOptionsType = GetFunctionOptionsType<MakeStructOptions>(
DataMember("field_nullability", &MakeStructOptions::field_nullability),
DataMember("field_metadata", &MakeStructOptions::field_metadata));
static auto kDayOfWeekOptionsType = GetFunctionOptionsType<DayOfWeekOptions>(
DataMember("one_based_numbering", &DayOfWeekOptions::one_based_numbering),
DataMember("count_from_zero", &DayOfWeekOptions::count_from_zero),
DataMember("week_start", &DayOfWeekOptions::week_start));
// Options-type descriptor for WeekOptions. It lists the three boolean members
// (week_starts_monday, count_from_zero, first_week_is_fully_in_year) by name,
// each paired with a pointer to the corresponding WeekOptions data member.
static auto kWeekOptionsType = GetFunctionOptionsType<WeekOptions>(
DataMember("week_starts_monday", &WeekOptions::week_starts_monday),
DataMember("count_from_zero", &WeekOptions::count_from_zero),
DataMember("first_week_is_fully_in_year", &WeekOptions::first_week_is_fully_in_year));
static auto kNullOptionsType = GetFunctionOptionsType<NullOptions>(
DataMember("nan_is_null", &NullOptions::nan_is_null));
} // namespace
Expand Down Expand Up @@ -406,12 +410,20 @@ MakeStructOptions::MakeStructOptions(std::vector<std::string> n)
MakeStructOptions::MakeStructOptions() : MakeStructOptions(std::vector<std::string>()) {}
constexpr char MakeStructOptions::kTypeName[];

DayOfWeekOptions::DayOfWeekOptions(bool one_based_numbering, uint32_t week_start)
DayOfWeekOptions::DayOfWeekOptions(bool count_from_zero, uint32_t week_start)
: FunctionOptions(internal::kDayOfWeekOptionsType),
one_based_numbering(one_based_numbering),
count_from_zero(count_from_zero),
week_start(week_start) {}
constexpr char DayOfWeekOptions::kTypeName[];

// Construct WeekOptions: initializes the FunctionOptions base with
// internal::kWeekOptionsType and stores the three flags unchanged.
WeekOptions::WeekOptions(bool week_starts_monday, bool count_from_zero,
bool first_week_is_fully_in_year)
: FunctionOptions(internal::kWeekOptionsType),
week_starts_monday(week_starts_monday),
count_from_zero(count_from_zero),
first_week_is_fully_in_year(first_week_is_fully_in_year) {}
// Out-of-line definition of the static constexpr kTypeName member declared in the class.
constexpr char WeekOptions::kTypeName[];

NullOptions::NullOptions(bool nan_is_null)
: FunctionOptions(internal::kNullOptionsType), nan_is_null(nan_is_null) {}
constexpr char NullOptions::kTypeName[];
Expand All @@ -438,6 +450,7 @@ void RegisterScalarOptions(FunctionRegistry* registry) {
DCHECK_OK(registry->AddFunctionOptionsType(kSliceOptionsType));
DCHECK_OK(registry->AddFunctionOptionsType(kMakeStructOptionsType));
DCHECK_OK(registry->AddFunctionOptionsType(kDayOfWeekOptionsType));
DCHECK_OK(registry->AddFunctionOptionsType(kWeekOptionsType));
DCHECK_OK(registry->AddFunctionOptionsType(kNullOptionsType));
}
} // namespace internal
Expand Down Expand Up @@ -629,6 +642,7 @@ SCALAR_EAGER_UNARY(Day, "day")
SCALAR_EAGER_UNARY(DayOfYear, "day_of_year")
SCALAR_EAGER_UNARY(ISOYear, "iso_year")
SCALAR_EAGER_UNARY(ISOWeek, "iso_week")
SCALAR_EAGER_UNARY(USWeek, "us_week")
SCALAR_EAGER_UNARY(ISOCalendar, "iso_calendar")
SCALAR_EAGER_UNARY(Quarter, "quarter")
SCALAR_EAGER_UNARY(Hour, "hour")
Expand All @@ -648,6 +662,10 @@ Result<Datum> AssumeTimezone(const Datum& arg, AssumeTimezoneOptions options,
return CallFunction("assume_timezone", {arg}, &options, ctx);
}

// Eager helper: calls the "week" compute function on `arg`, passing `options`
// and the (optional) execution context through to CallFunction.
Result<Datum> Week(const Datum& arg, WeekOptions options, ExecContext* ctx) {
return CallFunction("week", {arg}, &options, ctx);
}

Result<Datum> Strftime(const Datum& arg, StrftimeOptions options, ExecContext* ctx) {
return CallFunction("strftime", {arg}, &options, ctx);
}
Expand Down
64 changes: 60 additions & 4 deletions cpp/src/arrow/compute/api_scalar.h
Original file line number Diff line number Diff line change
Expand Up @@ -317,12 +317,12 @@ class ARROW_EXPORT MakeStructOptions : public FunctionOptions {

struct ARROW_EXPORT DayOfWeekOptions : public FunctionOptions {
public:
explicit DayOfWeekOptions(bool one_based_numbering = false, uint32_t week_start = 1);
explicit DayOfWeekOptions(bool count_from_zero = true, uint32_t week_start = 1);
constexpr static char const kTypeName[] = "DayOfWeekOptions";
static DayOfWeekOptions Defaults() { return DayOfWeekOptions(); }

/// Number days from 1 if true and from 0 if false
bool one_based_numbering;
/// Number days from 0 if true and from 1 if false
bool count_from_zero;
/// What day does the week start with (Monday=1, Sunday=7)
uint32_t week_start;
};
Expand Down Expand Up @@ -361,6 +361,33 @@ struct ARROW_EXPORT AssumeTimezoneOptions : public FunctionOptions {
Nonexistent nonexistent;
};

struct ARROW_EXPORT WeekOptions : public FunctionOptions {
public:
explicit WeekOptions(bool week_starts_monday = true, bool count_from_zero = false,
bool first_week_is_fully_in_year = false);
constexpr static char const kTypeName[] = "WeekOptions";
/// Default options: same as a WeekOptions built with the constructor's default arguments.
static WeekOptions Defaults() { return WeekOptions{}; }
/// Options for ISO week numbering: week starts on Monday, numbering does not
/// count from zero, and the first week need not be fully in the year.
static WeekOptions ISODefaults() {
return WeekOptions{/*week_starts_monday=*/ true,
/*count_from_zero=*/false,
/*first_week_is_fully_in_year=*/false};
}
/// Options for US week numbering: week starts on Sunday, numbering does not
/// count from zero, and the first week need not be fully in the year.
static WeekOptions USDefaults() {
return WeekOptions{/*week_starts_monday=*/ false,
/*count_from_zero=*/false,
/*first_week_is_fully_in_year=*/false};
}

/// What day does the week start with (Monday=true, Sunday=false)
bool week_starts_monday;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm... did you deliberately choose a different option name and semantics compared with DayOfWeekOptions::week_start?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Week start is an int and can have values from 1 to 7. Here I'd like to limit it to two options, Monday or Sunday, hence the boolean and the different name.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why would you like to limit it? What is the rationale for that?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I haven't found an example of a Week function that offers options other than Monday or Sunday, so I'd rather not offer more options if possible.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a bit curious why it would be useful for DayOfWeek but not for Week. Ideally we should try to use similar idioms when similar concepts are involved.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would actually switch DayOfWeekOptions::week_start to bool DayOfWeekOptions::week_starts_monday if everyone's up for it. Then we could just use WeekOptions for both and it would be just slightly wrong.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that's reasonable.

/// Dates from the current year that fall into the last ISO week of the previous year
/// return 0 if true and 52 or 53 if false.
bool count_from_zero;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similar question here vs. DayOfWeekOptions::one_based_numbering.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's a difference between the two - one_based_numbering controls if Monday=1 or Monday=0.
count_from_zero defines what is the last week of previous year that is still in this year - 0 or 52.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Er... then what is the difference between count_from_zero and first_week_is_fully_in_year?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

first_week_is_fully_in_year controls if the whole week (from Sunday or Monday) is in new year or only most of it (4 days).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See here for all the possibilities: #11026 (comment)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While the implementation is reasonably simple, my main concern here is a head-scratching API. Supporting the MySQL modes does not sound like an interesting goal in itself, and exposing obscure parameters makes the API more difficult to learn and use.

We should look for actual cases. One actual use case is US week numbering (like epiweek in lubridate). Are there any others?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should look for actual cases. One actual use case is US week numbering (like epiweek in lubridate). Are there any others?

SQL server seems to support week (week 1 starts with 1st day of week in year) and iso_week.

It seems to me we can agree on two parameters already (first_week_is_fully_in_year, week_starts_monday) but maybe not on the third: count_from_zero (Dates from current year that fall into last ISO week of the previous year returns 0 if true and 52 or 53 if false.)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From MySQL docs about why the 8 modes:

If a date falls in the last week of the previous year, MySQL returns 0 if you do not use 2, 3, 6, or 7 as the optional mode argument.

One might argue that WEEK() should return 52 because the given date actually occurs in the 52nd week of 1999. WEEK() returns 0 instead so that the return value is “the week number in the given year.” This makes use of the WEEK() function reliable when combined with other functions that extract a date part from a date.

If you prefer a result evaluated with respect to the year that contains the first day of the week for the given date, use 0, 2, 5, or 7 as the optional mode argument.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ping

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the concern is primarily a confusing API, could we offer default options for 'common' modes like us_week and iso_week? Then the individual options could be left up to people who need/understand what they want.

/// Must the first week be fully in January (true), or is a week that begins on
/// December 29, 30, or 31 considered to be the first week of the new year (false)?
bool first_week_is_fully_in_year;
};

/// @}

/// \brief Get the absolute value of a value.
Expand Down Expand Up @@ -1008,7 +1035,8 @@ Result<Datum> ISOYear(const Datum& values, ExecContext* ctx = NULLPTR);

/// \brief ISOWeek returns ISO week of year number for each element of `values`.
/// First ISO week has the majority (4 or more) of its days in January.
/// Week of the year starts with 1 and can run up to 53.
/// ISO week starts on Monday. Year can have 52 or 53 weeks.
/// Week numbering starts with 1.
///
/// \param[in] values input to extract ISO week of year from
/// \param[in] ctx the function execution context, optional
Expand All @@ -1018,6 +1046,34 @@ Result<Datum> ISOYear(const Datum& values, ExecContext* ctx = NULLPTR);
/// \note API not yet finalized
ARROW_EXPORT Result<Datum> ISOWeek(const Datum& values, ExecContext* ctx = NULLPTR);

/// \brief USWeek returns US week of year number for each element of `values`.
/// First US week has the majority (4 or more) of its days in January.
/// US week starts on Sunday. Year can have 52 or 53 weeks.
/// Week numbering starts with 1.
///
/// \param[in] values input to extract US week of year from
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
///
/// \since 6.0.0
/// \note API not yet finalized
ARROW_EXPORT Result<Datum> USWeek(const Datum& values, ExecContext* ctx = NULLPTR);

/// \brief Week returns week of year number for each element of `values`.
/// First week has the majority (4 or more) of its days in January.
/// Year can have 52 or 53 weeks. Week numbering can start with 0 or 1
/// depending on WeekOptions.count_from_zero.
///
/// \param[in] values input to extract week of year from
/// \param[in] options for setting the week start day, numbering start and first-week rule
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
///
/// \since 6.0.0
/// \note API not yet finalized
ARROW_EXPORT Result<Datum> Week(const Datum& values, WeekOptions options = WeekOptions(),
ExecContext* ctx = NULLPTR);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also add a helper for USWeek?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It appears to be missing its counterpart in api_scalar.cc.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added.


/// \brief ISOCalendar returns a (ISO year, ISO week, ISO day of week) struct for
/// each element of `values`.
/// ISO week starts on Monday denoted by 1 and ends on Sunday denoted by 7.
Expand Down
1 change: 1 addition & 0 deletions cpp/src/arrow/compute/function_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ TEST(FunctionOptions, Equality) {
options.emplace_back(
new MakeStructOptions({"col1"}, {false}, {key_value_metadata({{"key", "val"}})}));
options.emplace_back(new DayOfWeekOptions(false, 1));
options.emplace_back(new WeekOptions(true, false, false));
options.emplace_back(new CastOptions(CastOptions::Safe(boolean())));
options.emplace_back(new CastOptions(CastOptions::Unsafe(int64())));
options.emplace_back(new FilterOptions());
Expand Down
121 changes: 105 additions & 16 deletions cpp/src/arrow/compute/kernels/scalar_temporal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,14 @@ using arrow_vendored::date::literals::dec;
using arrow_vendored::date::literals::jan;
using arrow_vendored::date::literals::last;
using arrow_vendored::date::literals::mon;
using arrow_vendored::date::literals::sun;
using arrow_vendored::date::literals::thu;
using arrow_vendored::date::literals::wed;
using internal::applicator::ScalarUnaryNotNull;
using internal::applicator::SimpleUnary;

using DayOfWeekState = OptionsWrapper<DayOfWeekOptions>;
using WeekState = OptionsWrapper<WeekOptions>;
using StrftimeState = OptionsWrapper<StrftimeOptions>;
using AssumeTimezoneState = OptionsWrapper<AssumeTimezoneOptions>;

Expand Down Expand Up @@ -230,6 +233,18 @@ struct AssumeTimezoneExtractor
}
};

// Kernel executor for week extraction. It derives from
// TemporalComponentExtractBase and supplies the WeekOptions that the kernel
// was initialized with.
template <template <typename...> class Op, typename Duration, typename InType,
typename OutType>
struct TemporalComponentExtractWeek
: public TemporalComponentExtractBase<Op, Duration, InType, OutType> {
using Base = TemporalComponentExtractBase<Op, Duration, InType, OutType>;

// Reads the WeekOptions from the kernel state (via WeekState::Get) and
// forwards them, together with the batch and output, to Base::ExecWithOptions.
static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const WeekOptions& options = WeekState::Get(ctx);
return Base::ExecWithOptions(ctx, &options, batch, out);
}
};

// ----------------------------------------------------------------------
// Extract year from temporal types
//
Expand Down Expand Up @@ -301,7 +316,7 @@ struct DayOfWeek {
for (int i = 0; i < 7; i++) {
lookup_table[i] = i + 8 - options->week_start;
lookup_table[i] = (lookup_table[i] > 6) ? lookup_table[i] - 7 : lookup_table[i];
lookup_table[i] += options->one_based_numbering;
lookup_table[i] += !options->count_from_zero;
}
}

Expand Down Expand Up @@ -362,31 +377,70 @@ struct ISOYear {
};

// ----------------------------------------------------------------------
// Extract ISO week from temporal types
// Extract week from temporal types
//
// First week of an ISO year has the majority (4 or more) of it's days in January.
// First week of an ISO year has the majority (4 or more) of its days in January.
// Last week of an ISO year has the year's last Thursday in it.
// Based on
// https://github.com/HowardHinnant/date/blob/6e921e1b1d21e84a5c82416ba7ecd98e33a436d0/include/date/iso_week.h#L1503

template <typename Duration, typename Localizer>
struct ISOWeek {
explicit ISOWeek(const FunctionOptions* options, Localizer&& localizer)
: localizer_(std::move(localizer)) {}
struct Week {
explicit Week(const WeekOptions* options, Localizer&& localizer)
: localizer_(std::move(localizer)),
count_from_zero_(options->count_from_zero),
first_week_is_fully_in_year_(options->first_week_is_fully_in_year) {
if (options->week_starts_monday) {
if (first_week_is_fully_in_year_) {
wd_ = mon;
} else {
wd_ = thu;
}
} else {
if (first_week_is_fully_in_year_) {
wd_ = sun;
} else {
wd_ = wed;
}
}
if (count_from_zero_) {
days_offset_ = days{0};
} else {
days_offset_ = days{3};
}
}

template <typename T, typename Arg0>
T Call(KernelContext*, Arg0 arg, Status*) const {
const auto t = floor<days>(localizer_.template ConvertTimePoint<Duration>(arg));
auto y = year_month_day{t + days{3}}.year();
auto start = localizer_.ConvertDays((y - years{1}) / dec / thu[last]) + (mon - thu);
if (t < start) {
--y;
start = localizer_.ConvertDays((y - years{1}) / dec / thu[last]) + (mon - thu);
auto y = year_month_day{t + days_offset_}.year();

if (first_week_is_fully_in_year_) {
auto start = localizer_.ConvertDays(y / jan / wd_[1]);
if (!count_from_zero_) {
if (t < start) {
--y;
start = localizer_.ConvertDays(y / jan / wd_[1]);
}
}
return static_cast<T>(floor<weeks>(t - start).count() + 1);
}
return static_cast<T>(trunc<weeks>(t - start).count() + 1);

auto start = localizer_.ConvertDays((y - years{1}) / dec / wd_[last]) + (mon - thu);
if (!count_from_zero_) {
if (t < start) {
--y;
start = localizer_.ConvertDays((y - years{1}) / dec / wd_[last]) + (mon - thu);
}
}
return static_cast<T>(floor<weeks>(t - start).count() + 1);
}

Localizer localizer_;
arrow_vendored::date::weekday wd_;
arrow_vendored::date::days days_offset_;
const bool count_from_zero_;
const bool first_week_is_fully_in_year_;
};

// ----------------------------------------------------------------------
Expand Down Expand Up @@ -979,7 +1033,7 @@ const FunctionDoc day_of_week_doc{
"represented by 6.\n"
"`DayOfWeekOptions.week_start` can be used to set another starting day using\n"
"the ISO numbering convention (1=start week on Monday, 7=start week on Sunday).\n"
"Day numbers can start at 0 or 1 based on `DayOfWeekOptions.one_based_numbering`.\n"
"Day numbers can start at 0 or 1 based on `DayOfWeekOptions.count_from_zero`.\n"
"Null values emit null.\n"
"An error is returned if the timestamps have a defined timezone but it\n"
"cannot be found in the timezone database."),
Expand All @@ -1004,13 +1058,34 @@ const FunctionDoc iso_year_doc{

const FunctionDoc iso_week_doc{
"Extract ISO week of year number",
("First ISO week has the majority (4 or more) of its days in January.\n"
("First ISO week has the majority (4 or more) of its days in January."
"ISO week starts on Monday.\n"
"Week of the year starts with 1 and can run up to 53.\n"
"Null values emit null.\n"
"An error is returned if the timestamps have a defined timezone but it\n"
"cannot be found in the timezone database."),
{"values"}};

// Documentation for the "us_week" compute function.
// Every sentence of the description ends with "\n" so that adjacent string
// literals do not run together when concatenated.
const FunctionDoc us_week_doc{
    "Extract US week of year number",
    ("First US week has the majority (4 or more) of its days in January.\n"
     "US week starts on Sunday.\n"
     "Week of the year starts with 1 and can run up to 53.\n"
     "Null values emit null.\n"
     "An error is returned if the timestamps have a defined timezone but it\n"
     "cannot be found in the timezone database."),
    {"values"}};

// Documentation for the "week" compute function, which is configured through
// WeekOptions (named as the options class in the last field).
const FunctionDoc week_doc{
    "Extract week of year number",
    ("First week has the majority (4 or more) of its days in January.\n"
     "Year can have 52 or 53 weeks. Week numbering can start with 0 or 1 using "
     "WeekOptions.count_from_zero.\n"
     "Null values emit null.\n"
     "An error is returned if the timestamps have a defined timezone but it\n"
     "cannot be found in the timezone database."),
    {"values"},
    "WeekOptions"};

const FunctionDoc iso_calendar_doc{
"Extract (ISO year, ISO week, ISO day of week) struct",
("ISO week starts on Monday denoted by 1 and ends on Sunday denoted by 7.\n"
Expand Down Expand Up @@ -1140,10 +1215,24 @@ void RegisterScalarTemporal(FunctionRegistry* registry) {
"iso_year", {WithDates, WithTimestamps}, int64(), &iso_year_doc);
DCHECK_OK(registry->AddFunction(std::move(iso_year)));

auto iso_week = MakeTemporal<ISOWeek, TemporalComponentExtract, Int64Type>(
"iso_week", {WithDates, WithTimestamps}, int64(), &iso_week_doc);
static const auto default_iso_week_options = WeekOptions::ISODefaults();
auto iso_week = MakeTemporal<Week, TemporalComponentExtractWeek, Int64Type>(
"iso_week", {WithDates, WithTimestamps}, int64(), &iso_week_doc,
&default_iso_week_options, WeekState::Init);
DCHECK_OK(registry->AddFunction(std::move(iso_week)));

static const auto default_us_week_options = WeekOptions::USDefaults();
auto us_week = MakeTemporal<Week, TemporalComponentExtractWeek, Int64Type>(
"us_week", {WithDates, WithTimestamps}, int64(), &us_week_doc,
&default_us_week_options, WeekState::Init);
DCHECK_OK(registry->AddFunction(std::move(us_week)));

static const auto default_week_options = WeekOptions();
auto week = MakeTemporal<Week, TemporalComponentExtractWeek, Int64Type>(
"week", {WithDates, WithTimestamps}, int64(), &week_doc, &default_week_options,
WeekState::Init);
DCHECK_OK(registry->AddFunction(std::move(week)));

auto iso_calendar = MakeSimpleUnaryTemporal<ISOCalendar>(
"iso_calendar", {WithDates, WithTimestamps}, IsoCalendarType(), &iso_calendar_doc);
DCHECK_OK(registry->AddFunction(std::move(iso_calendar)));
Expand Down
Loading