diff --git a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h index 7cb30254f6..6be4012073 100644 --- a/lib/kernels/include/kernels/reshape_kernels.h +++ b/lib/kernels/include/kernels/reshape_kernels.h @@ -3,25 +3,28 @@ #include "kernels/accessor.h" #include "kernels/device.h" +#include "utils/required_core.h" namespace FlexFlow { -class ReshapePerDeviceState : public PerDeviceOpState { -public: - ReshapePerDeviceState(FFHandler handler); - DataType data_type; +struct ReshapePerDeviceState { + req data_type; }; +FF_VISITABLE_STRUCT(ReshapePerDeviceState, data_type); + namespace Kernels { namespace Reshape { +ReshapePerDeviceState init_kernel(DataType data_type); + void forward_kernel(ffStream_t stream, - ReshapePerDeviceState const *m, + ReshapePerDeviceState const &per_device_state, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - ReshapePerDeviceState const *m, + ReshapePerDeviceState const &per_device_state, GenericTensorAccessorW const &input, GenericTensorAccessorR const &output); diff --git a/lib/kernels/src/cuda/reshape_kernels.cu b/lib/kernels/src/cuda/reshape_kernels.cu index 15ae0c7109..e935b0d0c2 100644 --- a/lib/kernels/src/cuda/reshape_kernels.cu +++ b/lib/kernels/src/cuda/reshape_kernels.cu @@ -19,8 +19,9 @@ namespace FlexFlow { -ReshapePerDeviceState::ReshapePerDeviceState(FFHandler handler) - : PerDeviceOpState(handler) {} +ReshapePerDeviceState init_kernel(DataType data_type) { + return ReshapePerDeviceState{data_type}; +} namespace Kernels { namespace Reshape { @@ -41,7 +42,6 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - ReshapePerDeviceState const *m, GenericTensorAccessorW const &input, GenericTensorAccessorR const &output) { float alpha = 1.0f; @@ -54,17 +54,17 @@ struct BackwardKernel { } void forward_kernel(cudaStream_t stream, - ReshapePerDeviceState const *m, + ReshapePerDeviceState const &m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - DataTypeDispatch1{}(m->data_type, stream, m, input, output); + DataTypeDispatch1{}(m.data_type, stream, input, output); } void backward_kernel(cudaStream_t stream, - ReshapePerDeviceState const *m, + ReshapePerDeviceState const &m, GenericTensorAccessorW const &input, GenericTensorAccessorR const &output) { - DataTypeDispatch1{}(m->data_type, stream, m, input, output); + DataTypeDispatch1{}(m.data_type, stream, input, output); } } // namespace Reshape diff --git a/lib/runtime/src/ops/reshape.cc b/lib/runtime/src/ops/reshape.cc index 71fb10bc9c..c9dc8cff8d 100644 --- a/lib/runtime/src/ops/reshape.cc +++ b/lib/runtime/src/ops/reshape.cc @@ -16,7 +16,6 @@ #include "reshape.h" #include "kernels/reshape_kernels.h" #include "legion/legion_utilities.h" -#include "utils/hash-utils.h" namespace FlexFlow { // declare Legion names @@ -37,427 +36,162 @@ using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Reshape; -/* Params */ -bool operator==(ReshapeParams const &lhs, ReshapeParams const &rhs) { - return lhs.shape == rhs.shape; -} +enum slots { INPUT, OUTPUT, ATTRS, PROFILING, PER_DEVICE_STATE }; -bool ReshapeParams::is_valid(ParallelTensorShape const &input) const { - return input.is_valid(); -} +OpTaskInvocation init(ReshapeAttrs const &attrs) { + OpTaskBinding binding; -Tensor FFModel::reshape(const Tensor input, - std::vector const &shape, - char const *name) { - Layer *reshape = new Layer(this, - OP_RESHAPE, - DT_FLOAT, - name, - 1 /*inputs*/, - 0 /*weights*/, - 1 /*outputs*/, - input); - int dims[MAX_TENSOR_DIM]; - int numdim = shape.size(); - for (int i = 0; i < numdim; i++) { - assert(shape[i] > 0); - dims[i] = shape[i]; - } - reshape->outputs[0] = create_tensor( - numdim, dims, input->data_type, reshape, 0, true /*create_grad*/); - reshape->add_int_vector_property("shape", shape); - layers.push_back(reshape); - return reshape->outputs[0]; -} + binding.bind_arg(ATTRS, attrs); -Op *Reshape::create_operator_from_layer( - FFModel &model, - Layer const *layer, - std::vector const &inputs) { - std::vector shape; - layer->get_int_vector_property("shape", shape); - return new Reshape(model, layer->layer_guid, inputs[0], shape, layer->name); + return {RESHAPE_INIT_TASK_ID, binding}; } -Reshape::Reshape(FFModel &model, - LayerID const &_layer_guid, - const ParallelTensor input, - std::vector const &_shape, - char const *name) - : Op(model, - OP_RESHAPE, - input->data_type, - name, - 1 /*inputs*/, - 0 /*weights*/, - 1 /*outputs*/, - input) { - layer_guid = _layer_guid; - shape_length = _shape.size(); - assert(shape_length <= MAX_TENSOR_DIM); - for (int i = 0; i < shape_length; i++) { - shape_array[i] = _shape[i]; - } - numOutputs = 1; - numWeights = 0; - int num_replica_dims = 0; - for (int i = 0; i < input->num_dims; i++) { - if (input->dims[i].is_replica_dim) { - num_replica_dims++; - } - } - // assert that all replica dims are leading dims - for (int i = 0; i < num_replica_dims; i++) { - assert(input->dims[input->num_dims - 1 - i].is_replica_dim); - } - int numdim = (int)_shape.size(); - ParallelDim dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdim; i++) { - dims[i].size = _shape[numdim - 1 - i]; - dims[i].degree = 1; - dims[i].parallel_idx = -1; - dims[i].is_replica_dim = false; - } - // copy all replica dims - for (int i = 0; i < num_replica_dims; i++) { - dims[i + numdim] = input->dims[input->num_dims - 1 - i]; - } - numdim += num_replica_dims; - for (int i = num_replica_dims; i < numdim && i < input->num_dims; i++) { - if (dims[numdim - 1 - i].size != - input->dims[input->num_dims - 1 - i].size) { - break; - } - dims[numdim - 1 - i] = input->dims[input->num_dims - 1 - i]; - } - outputs[0] = model.create_parallel_tensor_legion_ordering( - numdim, dims, input->data_type, this); - assert(outputs[0]->get_volume() == inputs[0]->get_volume()); -} +OpTaskInvocation forward(ReshapeAttrs const &attrs) { + OpTaskBinding binding; -Reshape::Reshape(FFModel &model, - ReshapeParams const ¶ms, - const ParallelTensor input, - char const *name) - : Reshape(model, params.layer_guid, input, params.shape, name) {} - -void Reshape::init(FFModel const &ff) { - assert(check_output_input_weight_same_parallel_is()); - parallel_is = outputs[0]->parallel_is; - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_init(ff, argmap); - IndexLauncher launcher(RESHAPE_INIT_TASK_ID, - parallel_is, - TaskArgument(this, sizeof(Reshape)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(1, FID_DATA); - FutureMap fm = runtime->execute_index_space(ctx, launcher); - fm.wait_all_results(); - set_opmeta_from_futuremap(ff, fm); + binding.bind_arg(PER_DEVICE_STATE, + per_device_op_state()); + binding.bind_arg(PROFILING, profiling_settings()); + + binding.bind(INPUT, input_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); + return {RESHAPE_FWD_TASK_ID, binding}; } -PerDeviceOpState *Reshape::init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - Reshape const *reshape = (Reshape *)task->args; - FFHandler handle = *((FFHandler const *)task->local_args); - ReshapeMeta *m = new ReshapeMeta(handle); - m->data_type = reshape->outputs[0]->data_type; - return m; +OpTaskInvocation backward(ReshapeAttrs const &attrs) { + OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); + + return {RESHAPE_BWD_TASK_ID, binding}; } -void Reshape::forward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_forward(ff, argmap); - IndexLauncher launcher(RESHAPE_FWD_TASK_ID, - parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); +static DeviceSpecific + init_task_impl(TaskArgumentAccessor const &acc) { + auto attrs = acc.get_argument(ATTRS); + + DeviceSpecific per_device_state = + acc.create_device_specific( + init_kernel(attrs.shape.data_type)); + return per_device_state; } -void Reshape::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - // const Reshape* reshape = (const Reshape*) task->args; - ReshapeMeta const *m = *((ReshapeMeta **)task->local_args); - Domain in_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - Domain out_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - assert(in_domain.get_volume() == out_domain.get_volume()); - - if (m->data_type == DT_FLOAT) { - float const *in_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - float *out_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - forward_kernel_wrapper(in_ptr, out_ptr, in_domain.get_volume()); - } else if (m->data_type == DT_DOUBLE) { - double const *in_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - double *out_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - forward_kernel_wrapper(in_ptr, out_ptr, in_domain.get_volume()); - } else if (m->data_type == DT_INT32) { - int32_t const *in_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - int32_t *out_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - forward_kernel_wrapper(in_ptr, out_ptr, in_domain.get_volume()); - } else if (m->data_type == DT_INT64) { - int64_t const *in_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - int64_t *out_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - forward_kernel_wrapper(in_ptr, out_ptr, in_domain.get_volume()); - } else { - assert(false && "Unsupported data type in Reshape forward"); - } +static DeviceSpecific + init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + TaskArgumentAccessor acc(task, regions, ctx, runtime); + return init_task_impl(acc); } -void Reshape::backward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_backward(ff, argmap); - IndexLauncher launcher(RESHAPE_BWD_TASK_ID, - parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - // regions[0](I): output_grad - launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - outputs[0]->region_grad)); - launcher.add_field(0, FID_DATA); - // regions[3](I/O): input0_grad - launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - inputs[0]->region_grad)); - launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); +static optional forward_task_impl(TaskArgumentAccessor const &acc) { + auto per_device_state = + acc.get_argument(PER_DEVICE_STATE); + Profiling profiling = acc.get_argument(PROFILING); + + auto input = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); + + return profile(forward_kernel, + profiling, + "[Reshape] forward time = %.2lfms\n", + per_device_state, + input, + output); } -ReshapeParams Reshape::get_params() const { - std::vector shape_vec; - for (size_t i = 0; i < shape_length; i++) { - shape_vec.push_back(shape_array[i]); - } - ReshapeParams params; - params.shape = shape_vec; - params.layer_guid = this->layer_guid; - return params; +static void forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + TaskArgumentAccessor acc(task, regions, ctx, runtime); + forward_task_impl(acc); } -void Reshape::backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - // const Reshape* reshape = (const Reshape*) task->args; - ReshapeMeta const *m = *((ReshapeMeta **)task->local_args); - Domain out_grad_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - Domain in_grad_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - assert(in_grad_domain.get_volume() == out_grad_domain.get_volume()); - - if (m->data_type == DT_FLOAT) { - float const *out_grad_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - float *in_grad_ptr = helperGetTensorPointerRW( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - backward_kernel_wrapper( - in_grad_ptr, out_grad_ptr, in_grad_domain.get_volume()); - } else if (m->data_type == DT_DOUBLE) { - double const *out_grad_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - double *in_grad_ptr = helperGetTensorPointerRW( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - backward_kernel_wrapper( - in_grad_ptr, out_grad_ptr, in_grad_domain.get_volume()); - } else if (m->data_type == DT_INT32) { - int32_t const *out_grad_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - int32_t *in_grad_ptr = helperGetTensorPointerRW( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - backward_kernel_wrapper( - in_grad_ptr, out_grad_ptr, in_grad_domain.get_volume()); - } else if (m->data_type == DT_INT64) { - int64_t const *out_grad_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - int64_t *in_grad_ptr = helperGetTensorPointerRW( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - backward_kernel_wrapper( - in_grad_ptr, out_grad_ptr, in_grad_domain.get_volume()); - } else { - assert(false && "Unsupported data type in Reshape backward"); - } +static optional backward_task_impl(TaskArgumentAccessor const &acc) { + auto per_device_state = + acc.get_argument(PER_DEVICE_STATE); + Profiling profiling = acc.get_argument(PROFILING); + + auto input_grad = acc.get_tensor_grad(INPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); + + return profile(backward_kernel, + profiling, + "[Reshape] backward time = %.2lfms\n", + per_device_state, + input_grad, + output_grad); } -bool Reshape::measure_operator_cost(Simulator *sim, - MachineView const &mv, - CostMetrics &cost_metrics) const { - ParallelTensorBase sub_input, sub_output; - if (!outputs[0]->get_sub_tensor(mv, sub_output)) { - return false; - } - if (!inputs[0]->get_sub_tensor(mv, sub_input)) { - return false; - } - - sim->free_all(); - float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); - assert(input_ptr != NULL); - cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - - float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); - assert(output_ptr != NULL); - cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - - assert(sub_output.get_volume() == sub_input.get_volume()); - size_t num_elements = sub_input.get_volume(); - - std::function forward, backward; - forward = [&] { - forward_kernel_wrapper(input_ptr, output_ptr, num_elements); - }; - if (sim->computationMode == COMP_MODE_TRAINING) { - float *input_grad_ptr = - (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); - assert(input_grad_ptr != NULL); - cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - - float *output_grad_ptr = - (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); - assert(output_grad_ptr != NULL); - cost_metrics.outputs_memory += - cost_metrics.total_mem_diff_from(sim->offset); - - backward = [&] { - backward_kernel_wrapper(input_grad_ptr, output_grad_ptr, num_elements); - }; - } - - inner_measure_operator_cost(sim, forward, backward, cost_metrics); - - if (sim->computationMode == COMP_MODE_TRAINING) { - printf( - "[Measure Reshape] name(%s) forward_time(%.4lf) backward_time(%.4lf)\n", - name, - cost_metrics.forward_time, - cost_metrics.backward_time); - } else { - printf("[Measure Reshape] name(%s) forward_time(%.4lf)\n", - name, - cost_metrics.forward_time); - } - return true; +static void backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + TaskArgumentAccessor acc(task, regions, ctx, runtime); + backward_task_impl(acc); } -void Reshape::serialize(Legion::Serializer &sez) const { - sez.serialize(this->shape_length); - for (size_t i = 0; i < this->shape_length; i++) { - sez.serialize(this->shape_array[i]); - } - sez.serialize(this->layer_guid.id); +CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, + ReshapeAttrs const &attrs, + InputParallelTensorDesc const &input, + ProfilingSettings const &settings, + MachineView const &machine_view) { + + SimTaskBinding init_binding; + init_binding.bind_arg(ATTRS, attrs); + auto init_accessor = + env.get_init_accessor(RESHAPE_INIT_TASK_ID, init_binding); + auto per_device_state = init_task_impl(init_accessor); + + SimTaskBinding fwd_binding; + ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); + fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state); + fwd_binding.bind_arg(PROFILING, settings); + fwd_binding.bind(INPUT, input.shape); + fwd_binding.bind(OUTPUT, output_shape); + + SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); + + auto fwd_accessor = env.get_fwd_accessor(RESHAPE_FWD_TASK_ID, fwd_binding); + auto bwd_accessor = env.get_bwd_accessor(RESHAPE_BWD_TASK_ID, bwd_binding); + + float forward_time = forward_task_impl(fwd_accessor).value(); + float backward_time = backward_task_impl(bwd_accessor).value(); + + float sync_time = default_estimate_sync_time(env); + return make_metrics(forward_time, backward_time, sync_time, env); } -using PCG::Node; - -Node Reshape::deserialize(FFModel &ff, - Legion::Deserializer &dez, - ParallelTensor inputs[], - int num_inputs) { - assert(num_inputs == 1); - size_t shape_length; - std::vector shape; - dez.deserialize(shape_length); - for (size_t i = 0; i < shape_length; i++) { - int value; - dez.deserialize(value); - shape.push_back(value); - } - size_t id; - dez.deserialize(id); - LayerID layer_guid(id); - - ReshapeParams params; - params.shape = shape; - params.layer_guid = layer_guid; - return ff.get_or_create_node(inputs[0], params); +template <> +void register_task() { + OpTaskSignature init(OpTaskType::INIT); + + init.add_arg_slot(ATTRS); + + init.add_return_value(PER_DEVICE_STATE); + + register_task(RESHAPE_INIT_TASK_ID, "Reshape Init", init, init_task); } -Op *Reshape::materialize(FFModel &ff, - ParallelTensor inputs[], - int num_inputs) const { - assert(num_inputs == 1); - std::vector shape; - for (size_t i = 0; i < this->shape_length; i++) { - shape.push_back(shape_array[i]); - } - return new Reshape(ff, this->layer_guid, inputs[0], shape, this->name); +template <> +void register_task() { + OpTaskSignature fwd(OpTaskType::FWD); + + fwd.add_arg_slot(PROFILING); + fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); + + fwd.add_input_slot(INPUT); + fwd.add_output_slot(OUTPUT); + + register_task(RESHAPE_FWD_TASK_ID, "Reshape Fwd", fwd, forward_task); } -}; // namespace FlexFlow +template <> +void register_task() { + OpTaskSignature bwd = + infer_bwd_binding(get_op_signature(RESHAPE_FWD_TASK_ID)); -namespace std { -size_t hash::operator()( - FlexFlow::ReshapeParams const ¶ms) const { - size_t key = 0; - hash_combine(key, params.shape.size()); - for (int n : params.shape) { - hash_combine(key, n); - } - hash_combine(key, params.layer_guid.id); - return key; + register_task(RESHAPE_BWD_TASK_ID, "Reshape Bwd", bwd, backward_task); } -}; // namespace std + +}; // namespace FlexFlow diff --git a/lib/runtime/src/ops/reshape.h b/lib/runtime/src/ops/reshape.h index 42bbefd9db..f044e3f057 100644 --- a/lib/runtime/src/ops/reshape.h +++ b/lib/runtime/src/ops/reshape.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_RESHAPE_H #include "op-attrs/ops/reshape.h" -#include "op_task_invocation.h" #include "sim_environment.h" +#include "task_spec/op_task_invocation.h" namespace FlexFlow { @@ -20,7 +20,7 @@ OpTaskInvocation backward(ReshapeAttrs const &); CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, ReshapeAttrs const &attrs, - ParallelTensorShape const &input_shape, + InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view);