Skip to content

Commit e151f0d

Browse files
ezhulenev and copybara-github
authored and committed
[stream_executor] NFC: Optimize kernel arguments packing
Remove unused packed arguments members that are mostly leftovers from passing shared memory arguments to OpenCL kernels

name                            old cpu/op    new cpu/op    delta
BM_PackDeviceMemoryArgs/4       26.8ns ± 3%   21.8ns ± 8%   -18.63%  (p=0.000 n=20+20)
BM_PackDeviceMemoryArgs/8       33.0ns ± 3%   30.9ns ± 4%    -6.21%  (p=0.000 n=20+19)
BM_PackDeviceMemoryArgs/32       105ns ± 3%     80ns ± 3%   -23.97%  (p=0.000 n=20+18)
BM_PackDeviceMemoryArgs/64       201ns ± 4%    156ns ± 4%   -22.43%  (p=0.000 n=19+20)
BM_PackDeviceMemoryArgs/128      520ns ± 4%    362ns ± 4%   -30.35%  (p=0.000 n=19+20)
BM_PackDeviceMemoryArgs/256      783ns ± 7%    586ns ± 4%   -25.21%  (p=0.000 n=20+20)
BM_PackDeviceMemoryArgs/512     1.58µs ± 4%   1.14µs ± 4%   -27.71%  (p=0.000 n=20+20)
BM_PackDeviceMemoryArgs/1024    3.48µs ± 4%   2.39µs ± 4%   -31.34%  (p=0.000 n=20+20)

PiperOrigin-RevId: 580343747
1 parent ddeb0a8 commit e151f0d

File tree

1 file changed

+3
-21
lines changed

1 file changed

+3
-21
lines changed

xla/stream_executor/kernel.h

Lines changed: 3 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ limitations under the License.
7070
#define XLA_STREAM_EXECUTOR_KERNEL_H_
7171

7272
#include <array>
73+
#include <cstddef>
7374
#include <cstdint>
7475
#include <cstring>
7576
#include <memory>
@@ -376,7 +377,6 @@ class KernelArgsPackedArray : public KernelArgsPackedArrayBase {
376377
std::memcpy(generic_arg_storage, &arg, sizeof(T));
377378

378379
argument_addresses_[number_of_argument_addresses_] = generic_arg_storage;
379-
argument_sizes_[number_of_argument_addresses_] = sizeof(arg);
380380
++number_of_argument_addresses_;
381381
}
382382

@@ -386,7 +386,6 @@ class KernelArgsPackedArray : public KernelArgsPackedArrayBase {
386386
&device_memory_opaque_pointers_[number_of_argument_addresses_];
387387
*copy_ptr = arg.opaque();
388388
argument_addresses_[number_of_argument_addresses_] = copy_ptr;
389-
argument_sizes_[number_of_argument_addresses_] = sizeof(void *);
390389
++number_of_argument_addresses_;
391390
}
392391

@@ -395,17 +394,13 @@ class KernelArgsPackedArray : public KernelArgsPackedArrayBase {
395394
// The only significant information about a shared argument is its size, so
396395
// that is the only parameter in this function.
397396
void add_shared_bytes(size_t number_of_bytes) {
398-
shared_memory_indices_[number_of_shared_memory_arguments_] =
399-
number_of_argument_addresses_ + number_of_shared_memory_arguments_;
400-
shared_memory_bytes_[number_of_shared_memory_arguments_] = number_of_bytes;
401-
++number_of_shared_memory_arguments_;
402397
total_shared_memory_bytes_ += number_of_bytes;
403398
}
404399

405400
// Gets the number of arguments added so far. Shared memory counts as a single
406401
// extra argument, and only when a non-zero number of shared bytes was added.
407402
size_t number_of_arguments() const override {
408-
return number_of_argument_addresses_ + number_of_shared_memory_arguments_;
403+
return number_of_argument_addresses_ + (total_shared_memory_bytes_ > 0);
409404
}
410405

411406
// Gets the total number of shared memory bytes added so far.
@@ -427,28 +422,15 @@ class KernelArgsPackedArray : public KernelArgsPackedArrayBase {
427422
std::array<const void *, kNumArgs> argument_addresses_;
428423

429424
// Storage for arguments of templated type.
430-
alignas(kMaxGenericArgSize)
425+
alignas(std::max_align_t)
431426
std::array<char, kNumArgs * kMaxGenericArgSize> generic_arguments_;
432427

433-
// Sizes for non-shared-memory arguments.
434-
std::array<size_t, kNumArgs> argument_sizes_;
435-
436-
// Size in bytes for each shared memory argument.
437-
std::array<size_t, kNumArgs> shared_memory_bytes_;
438-
439-
// Indices in the arguments array for shared memory arguments.
440-
std::array<size_t, kNumArgs> shared_memory_indices_;
441-
442428
// Total of all shared memory sizes.
443429
size_t total_shared_memory_bytes_ = 0;
444430

445431
// Number of significant entries in argument_addresses_.
446432
size_t number_of_argument_addresses_ = 0;
447433

448-
// Number of significant entries in shared_memory_bytes_ and
449-
// shared_memory_indices_.
450-
size_t number_of_shared_memory_arguments_ = 0;
451-
452434
// The number of generic arguments that have been added to generic_arguments_.
453435
size_t number_of_generic_arguments_ = 0;
454436
};

0 commit comments

Comments
 (0)