Skip to content

Commit 744fe3e

Browse files
ezhulenev authored and copybara-github committed
[xla:cpu] Micro-optimizations for BufferAllocations
name                                     old cpu/op           new cpu/op           delta
BM_SelectAndScatterF32/128/process_time  420µs ± 1%           401µs ± 1%           -4.67%
BM_SelectAndScatterF32/256/process_time  1.73ms ± 2%          1.65ms ± 3%          -4.48%
BM_SelectAndScatterF32/512/process_time  7.73ms ± 1%          7.41ms ± 2%          -4.14%

name                                     old time/op          new time/op          delta
BM_SelectAndScatterF32/128/process_time  421µs ± 1%           401µs ± 1%           -4.69%
BM_SelectAndScatterF32/256/process_time  1.73ms ± 2%          1.65ms ± 3%          -4.57%
BM_SelectAndScatterF32/512/process_time  7.34ms ± 1%          7.02ms ± 2%          -4.46%

name                                     old INSTRUCTIONS/op  new INSTRUCTIONS/op  delta
BM_SelectAndScatterF32/128/process_time  4.55M ± 0%           4.20M ± 0%           -7.51%
BM_SelectAndScatterF32/256/process_time  18.4M ± 0%           17.0M ± 0%           -7.54%
BM_SelectAndScatterF32/512/process_time  74.9M ± 0%           69.3M ± 0%           -7.48%

PiperOrigin-RevId: 657760541
1 parent 11fc7d9 commit 744fe3e

File tree

1 file changed

+12
-9
lines changed

1 file changed

+12
-9
lines changed

xla/service/cpu/runtime/buffer_allocations.h

Lines changed: 12 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,7 @@ class BufferAllocations {
4545
// Same as above, but also adjusts the returned address for the offset and
4646
// size contained in the given slice.
4747
absl::StatusOr<se::DeviceMemoryBase> GetDeviceAddress(
48-
const BufferAllocation::Slice& slice) const;
48+
BufferAllocation::Slice slice) const;
4949

5050
// Unchecked version of `GetDeviceAddress` that does not check the buffer
5151
// index and assumes it is valid.
@@ -55,16 +55,19 @@ class BufferAllocations {
5555
// Unchecked version of `GetDeviceAddress` that does not check the slice
5656
// buffer index, offset and size and assumes they all are valid.
5757
se::DeviceMemoryBase GetDeviceAddressUnchecked(
58-
const BufferAllocation::Slice& slice) const;
58+
BufferAllocation::Slice slice) const;
5959

6060
private:
6161
std::vector<se::DeviceMemoryBase> buffers_;
62+
se::DeviceMemoryBase* buffers_data_; // buffers_.data()
6263
size_t num_buffers_;
6364
};
6465

6566
inline BufferAllocations::BufferAllocations(
6667
absl::Span<const MaybeOwningDeviceMemory> buffers)
67-
: buffers_(buffers.size()), num_buffers_(buffers_.size()) {
68+
: buffers_(buffers.size()),
69+
buffers_data_(buffers_.data()),
70+
num_buffers_(buffers_.size()) {
6871
for (size_t i = 0; i < buffers.size(); ++i) {
6972
buffers_[i] = buffers[i].AsDeviceMemoryBase();
7073
}
@@ -82,8 +85,7 @@ BufferAllocations::GetDeviceAddress(BufferAllocation::Index index) const {
8285
}
8386

8487
inline ABSL_ATTRIBUTE_ALWAYS_INLINE absl::StatusOr<se::DeviceMemoryBase>
85-
BufferAllocations::GetDeviceAddress(
86-
const BufferAllocation::Slice& slice) const {
88+
BufferAllocations::GetDeviceAddress(BufferAllocation::Slice slice) const {
8789
// Handle empty slices explicitly and return a null pointer device memory to
8890
// guarantee that we do not accidentally write through the empty slice which
8991
// would hide a real bug in the code.
@@ -97,7 +99,7 @@ BufferAllocations::GetDeviceAddress(
9799
"Invalid buffer index %d. It must be in the range [0, %d)", index,
98100
num_buffers_);
99101
}
100-
const se::DeviceMemoryBase& base = buffers_[index];
102+
const se::DeviceMemoryBase& base = buffers_data_[index];
101103

102104
int64_t offset = slice.offset();
103105
int64_t extent = offset + slice.size();
@@ -125,15 +127,16 @@ BufferAllocations::GetDeviceAddress(
125127
inline ABSL_ATTRIBUTE_ALWAYS_INLINE se::DeviceMemoryBase
126128
BufferAllocations::GetDeviceAddressUnchecked(
127129
BufferAllocation::Index buffer_index) const {
128-
return buffers_[buffer_index];
130+
return buffers_data_[buffer_index];
129131
}
130132

131133
// Unchecked version of `GetDeviceAddress` that does not check the slice
132134
// buffer index, offset and size and assumes they are valid.
133135
inline ABSL_ATTRIBUTE_ALWAYS_INLINE se::DeviceMemoryBase
134136
BufferAllocations::GetDeviceAddressUnchecked(
135-
const BufferAllocation::Slice& slice) const {
136-
return buffers_[slice.index()].GetByteSlice(slice.offset(), slice.size());
137+
BufferAllocation::Slice slice) const {
138+
return buffers_data_[slice.index()].GetByteSlice(slice.offset(),
139+
slice.size());
137140
}
138141

139142
} // namespace xla::cpu

0 commit comments

Comments
 (0)