datafusion_expr_common/
accumulator.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Accumulator module contains the trait definition for aggregation function's accumulators.
19
20use arrow::array::ArrayRef;
21use datafusion_common::{internal_err, Result, ScalarValue};
22use std::fmt::Debug;
23
24/// Tracks an aggregate function's state.
25///
26/// `Accumulator`s are stateful objects that implement a single group. They
27/// aggregate values from multiple rows together into a final output aggregate.
28///
29/// [`GroupsAccumulator]` is an additional more performant (but also complex) API
30/// that manages state for multiple groups at once.
31///
32/// An accumulator knows how to:
33/// * update its state from inputs via [`update_batch`]
34///
35/// * compute the final value from its internal state via [`evaluate`]
36///
37/// * retract an update to its state from given inputs via
38///   [`retract_batch`] (when used as a window aggregate [window
39///   function])
40///
41/// * convert its internal state to a vector of aggregate values via
42///   [`state`] and combine the state from multiple accumulators
43///   via [`merge_batch`], as part of efficient multi-phase grouping.
44///
45/// [`update_batch`]: Self::update_batch
46/// [`retract_batch`]: Self::retract_batch
47/// [`state`]: Self::state
48/// [`evaluate`]: Self::evaluate
49/// [`merge_batch`]: Self::merge_batch
50/// [window function]: https://en.wikipedia.org/wiki/Window_function_(SQL)
51pub trait Accumulator: Send + Sync + Debug {
52    /// Updates the accumulator's state from its input.
53    ///
54    /// `values` contains the arguments to this aggregate function.
55    ///
56    /// For example, the `SUM` accumulator maintains a running sum,
57    /// and `update_batch` adds each of the input values to the
58    /// running sum.
59    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()>;
60
61    /// Returns the final aggregate value, consuming the internal state.
62    ///
63    /// For example, the `SUM` accumulator maintains a running sum,
64    /// and `evaluate` will produce that running sum as its output.
65    ///
66    /// This function should not be called twice, otherwise it will
67    /// result in potentially non-deterministic behavior.
68    ///
69    /// This function gets `&mut self` to allow for the accumulator to build
70    /// arrow-compatible internal state that can be returned without copying
71    /// when possible (for example distinct strings)
72    fn evaluate(&mut self) -> Result<ScalarValue>;
73
74    /// Returns the allocated size required for this accumulator, in
75    /// bytes, including `Self`.
76    ///
77    /// This value is used to calculate the memory used during
78    /// execution so DataFusion can stay within its allotted limit.
79    ///
80    /// "Allocated" means that for internal containers such as `Vec`,
81    /// the `capacity` should be used not the `len`.
82    fn size(&self) -> usize;
83
84    /// Returns the intermediate state of the accumulator, consuming the
85    /// intermediate state.
86    ///
87    /// This function should not be called twice, otherwise it will
88    /// result in potentially non-deterministic behavior.
89    ///
90    /// This function gets `&mut self` to allow for the accumulator to build
91    /// arrow-compatible internal state that can be returned without copying
92    /// when possible (for example distinct strings).
93    ///
94    /// Intermediate state is used for "multi-phase" grouping in
95    /// DataFusion, where an aggregate is computed in parallel with
96    /// multiple `Accumulator` instances, as described below:
97    ///
98    /// # Multi-Phase Grouping
99    ///
100    /// ```text
101    ///                               ▲
102    ///                               │                   evaluate() is called to
103    ///                               │                   produce the final aggregate
104    ///                               │                   value per group
105    ///                               │
106    ///                  ┌─────────────────────────┐
107    ///                  │GroupBy                  │
108    ///                  │(AggregateMode::Final)   │      state() is called for each
109    ///                  │                         │      group and the resulting
110    ///                  └─────────────────────────┘      RecordBatches passed to the
111    ///                                                   Final GroupBy via merge_batch()
112    ///                               ▲
113    ///                               │
114    ///              ┌────────────────┴───────────────┐
115    ///              │                                │
116    ///              │                                │
117    /// ┌─────────────────────────┐      ┌─────────────────────────┐
118    /// │        GroupBy          │      │        GroupBy          │
119    /// │(AggregateMode::Partial) │      │(AggregateMode::Partial) │
120    /// └─────────────────────────┘      └─────────────────────────┘
121    ///              ▲                                ▲
122    ///              │                                │    update_batch() is called for
123    ///              │                                │    each input RecordBatch
124    ///         .─────────.                      .─────────.
125    ///      ,─'           '─.                ,─'           '─.
126    ///     ;      Input      :              ;      Input      :
127    ///     :   Partition 0   ;              :   Partition 1   ;
128    ///      ╲               ╱                ╲               ╱
129    ///       '─.         ,─'                  '─.         ,─'
130    ///          `───────'                        `───────'
131    /// ```
132    ///
133    /// The partial state is serialized as `Arrays` and then combined
134    /// with other partial states from different instances of this
135    /// Accumulator (that ran on different partitions, for example).
136    ///
137    /// The state can be and often is a different type than the output
138    /// type of the [`Accumulator`] and needs different merge
139    /// operations (for example, the partial state for `COUNT` needs
140    /// to be summed together)
141    ///
142    /// Some accumulators can return multiple values for their
143    /// intermediate states. For example, the average accumulator
144    /// tracks `sum` and `n`, and this function should return a vector
145    /// of two values, sum and n.
146    ///
147    /// Note that [`ScalarValue::List`] can be used to pass multiple
148    /// values if the number of intermediate values is not known at
149    /// planning time (e.g. for `MEDIAN`)
150    ///
151    /// # Multi-phase repartitioned Grouping
152    ///
153    /// Many multi-phase grouping plans contain a Repartition operation
154    /// as well as shown below:
155    ///
156    /// ```text
157    ///                ▲                          ▲
158    ///                │                          │
159    ///                │                          │
160    ///                │                          │
161    ///                │                          │
162    ///                │                          │
163    ///    ┌───────────────────────┐  ┌───────────────────────┐       4. Each AggregateMode::Final
164    ///    │GroupBy                │  │GroupBy                │       GroupBy has an entry for its
165    ///    │(AggregateMode::Final) │  │(AggregateMode::Final) │       subset of groups (in this case
166    ///    │                       │  │                       │       that means half the entries)
167    ///    └───────────────────────┘  └───────────────────────┘
168    ///                ▲                          ▲
169    ///                │                          │
170    ///                └─────────────┬────────────┘
171    ///                              │
172    ///                              │
173    ///                              │
174    ///                 ┌─────────────────────────┐                   3. Repartitioning by hash(group
175    ///                 │       Repartition       │                   keys) ensures that each distinct
176    ///                 │         HASH(x)         │                   group key now appears in exactly
177    ///                 └─────────────────────────┘                   one partition
178    ///                              ▲
179    ///                              │
180    ///              ┌───────────────┴─────────────┐
181    ///              │                             │
182    ///              │                             │
183    /// ┌─────────────────────────┐  ┌──────────────────────────┐     2. Each AggregateMode::Partial
184    /// │        GroupBy          │  │       GroupBy            │     GroupBy has an entry for *all*
185    /// │(AggregateMode::Partial) │  │ (AggregateMode::Partial) │     the groups
186    /// └─────────────────────────┘  └──────────────────────────┘
187    ///              ▲                             ▲
188    ///              │                             │
189    ///              │                             │
190    ///         .─────────.                   .─────────.
191    ///      ,─'           '─.             ,─'           '─.
192    ///     ;      Input      :           ;      Input      :         1. Since input data is
193    ///     :   Partition 0   ;           :   Partition 1   ;         arbitrarily or RoundRobin
194    ///      ╲               ╱             ╲               ╱          distributed, each partition
195    ///       '─.         ,─'               '─.         ,─'           likely has all distinct
196    ///          `───────'                     `───────'
197    /// ```
198    ///
199    /// This structure is used so that the `AggregateMode::Partial` accumulators
200    /// reduces the cardinality of the input as soon as possible. Typically,
201    /// each partial accumulator sees all groups in the input as the group keys
202    /// are evenly distributed across the input.
203    ///
204    /// The final output is computed by repartitioning the result of
205    /// [`Self::state`] from each Partial aggregate and `hash(group keys)` so
206    /// that each distinct group key appears in exactly one of the
207    /// `AggregateMode::Final` GroupBy nodes. The outputs of the final nodes are
208    /// then unioned together to produce the overall final output.
209    ///
210    /// Here is an example that shows the distribution of groups in the
211    /// different phases
212    ///
213    /// ```text
214    ///               ┌─────┐                ┌─────┐
215    ///               │  1  │                │  3  │
216    ///               ├─────┤                ├─────┤
217    ///               │  2  │                │  4  │                After repartitioning by
218    ///               └─────┘                └─────┘                hash(group keys), each distinct
219    ///               ┌─────┐                ┌─────┐                group key now appears in exactly
220    ///               │  1  │                │  3  │                one partition
221    ///               ├─────┤                ├─────┤
222    ///               │  2  │                │  4  │
223    ///               └─────┘                └─────┘
224    ///
225    ///
226    /// ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
227    ///
228    ///               ┌─────┐                ┌─────┐
229    ///               │  2  │                │  2  │
230    ///               ├─────┤                ├─────┤
231    ///               │  1  │                │  2  │
232    ///               ├─────┤                ├─────┤
233    ///               │  3  │                │  3  │
234    ///               ├─────┤                ├─────┤
235    ///               │  4  │                │  1  │
236    ///               └─────┘                └─────┘                Input data is arbitrarily or
237    ///                 ...                    ...                  RoundRobin distributed, each
238    ///               ┌─────┐                ┌─────┐                partition likely has all
239    ///               │  1  │                │  4  │                distinct group keys
240    ///               ├─────┤                ├─────┤
241    ///               │  4  │                │  3  │
242    ///               ├─────┤                ├─────┤
243    ///               │  1  │                │  1  │
244    ///               ├─────┤                ├─────┤
245    ///               │  4  │                │  3  │
246    ///               └─────┘                └─────┘
247    ///
248    ///           group values           group values
249    ///           in partition 0         in partition 1
250    /// ```
251    fn state(&mut self) -> Result<Vec<ScalarValue>>;
252
253    /// Updates the accumulator's state from an `Array` containing one
254    /// or more intermediate values.
255    ///
256    /// For some aggregates (such as `SUM`), merge_batch is the same
257    /// as `update_batch`, but for some aggregates (such as `COUNT`)
258    /// the operations differ. See [`Self::state`] for more details on how
259    /// state is used and merged.
260    ///
261    /// The `states` array passed was formed by concatenating the
262    /// results of calling [`Self::state`] on zero or more other
263    /// `Accumulator` instances.
264    fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()>;
265
266    /// Retracts (removed) an update (caused by the given inputs) to
267    /// accumulator's state.
268    ///
269    /// This is the inverse operation of [`Self::update_batch`] and is used
270    /// to incrementally calculate window aggregates where the `OVER`
271    /// clause defines a bounded window.
272    ///
273    /// # Example
274    ///
275    /// For example, given the following input partition
276    ///
277    /// ```text
278    ///                     │      current      │
279    ///                            window
280    ///                     │                   │
281    ///                ┌────┬────┬────┬────┬────┬────┬────┬────┬────┐
282    ///     Input      │ A  │ B  │ C  │ D  │ E  │ F  │ G  │ H  │ I  │
283    ///   partition    └────┴────┴────┴────┼────┴────┴────┴────┼────┘
284    ///
285    ///                                    │         next      │
286    ///                                             window
287    /// ```
288    ///
289    /// First, [`Self::evaluate`] will be called to produce the output
290    /// for the current window.
291    ///
292    /// Then, to advance to the next window:
293    ///
294    /// First, [`Self::retract_batch`] will be called with the values
295    /// that are leaving the window, `[B, C, D]` and then
296    /// [`Self::update_batch`] will be called with the values that are
297    /// entering the window, `[F, G, H]`.
298    fn retract_batch(&mut self, _values: &[ArrayRef]) -> Result<()> {
299        // TODO add retract for all accumulators
300        internal_err!(
301            "Retract should be implemented for aggregate functions when used with custom window frame queries"
302        )
303    }
304
305    /// Does the accumulator support incrementally updating its value
306    /// by *removing* values.
307    ///
308    /// If this function returns true, [`Self::retract_batch`] will be
309    /// called for sliding window functions such as queries with an
310    /// `OVER (ROWS BETWEEN 1 PRECEDING AND 2 FOLLOWING)`
311    fn supports_retract_batch(&self) -> bool {
312        false
313    }
314}
datafusion_expr_common/accumulator.rs

datafusion_expr_common/
accumulator.rs