datafusion_expr_common/
columnar_value.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`ColumnarValue`] represents the result of evaluating an expression.
19
20use arrow::array::{Array, ArrayRef, NullArray};
21use arrow::compute::{kernels, CastOptions};
22use arrow::datatypes::DataType;
23use arrow::util::pretty::pretty_format_columns;
24use datafusion_common::format::DEFAULT_CAST_OPTIONS;
25use datafusion_common::{internal_err, Result, ScalarValue};
26use std::fmt;
27use std::sync::Arc;
28
29/// The result of evaluating an expression.
30///
31/// [`ColumnarValue::Scalar`] represents a single value repeated any number of
32/// times. This is an important performance optimization for handling values
33/// that do not change across rows.
34///
35/// [`ColumnarValue::Array`] represents a column of data, stored as an  Arrow
36/// [`ArrayRef`]
37///
38/// A slice of `ColumnarValue`s logically represents a table, with each column
39/// having the same number of rows. This means that all `Array`s are the same
40/// length.
41///
42/// # Example
43///
44/// A `ColumnarValue::Array` with an array of 5 elements and a
45/// `ColumnarValue::Scalar` with the value 100
46///
47/// ```text
48/// ┌──────────────┐
49/// │ ┌──────────┐ │
50/// │ │   "A"    │ │
51/// │ ├──────────┤ │
52/// │ │   "B"    │ │
53/// │ ├──────────┤ │
54/// │ │   "C"    │ │
55/// │ ├──────────┤ │
56/// │ │   "D"    │ │        ┌──────────────┐
57/// │ ├──────────┤ │        │ ┌──────────┐ │
58/// │ │   "E"    │ │        │ │   100    │ │
59/// │ └──────────┘ │        │ └──────────┘ │
60/// └──────────────┘        └──────────────┘
61///
62///  ColumnarValue::        ColumnarValue::
63///       Array                 Scalar
64/// ```
65///
66/// Logically represents the following table:
67///
68/// | Column 1| Column 2 |
69/// | ------- | -------- |
70/// | A | 100 |
71/// | B | 100 |
72/// | C | 100 |
73/// | D | 100 |
74/// | E | 100 |
75///
76/// # Performance Notes
77///
78/// When implementing functions or operators, it is important to consider the
79/// performance implications of handling scalar values.
80///
81/// Because all functions must handle [`ArrayRef`], it is
82/// convenient to convert [`ColumnarValue::Scalar`]s using
83/// [`Self::into_array`]. For example,  [`ColumnarValue::values_to_arrays`]
84/// converts multiple columnar values into arrays of the same length.
85///
86/// However, it is often much more performant to provide a different,
87/// implementation that handles scalar values differently
88#[derive(Clone, Debug)]
89pub enum ColumnarValue {
90    /// Array of values
91    Array(ArrayRef),
92    /// A single value
93    Scalar(ScalarValue),
94}
95
96impl From<ArrayRef> for ColumnarValue {
97    fn from(value: ArrayRef) -> Self {
98        ColumnarValue::Array(value)
99    }
100}
101
102impl From<ScalarValue> for ColumnarValue {
103    fn from(value: ScalarValue) -> Self {
104        ColumnarValue::Scalar(value)
105    }
106}
107
108impl ColumnarValue {
109    pub fn data_type(&self) -> DataType {
110        match self {
111            ColumnarValue::Array(array_value) => array_value.data_type().clone(),
112            ColumnarValue::Scalar(scalar_value) => scalar_value.data_type(),
113        }
114    }
115
116    /// Convert a columnar value into an Arrow [`ArrayRef`] with the specified
117    /// number of rows. [`Self::Scalar`] is converted by repeating the same
118    /// scalar multiple times which is not as efficient as handling the scalar
119    /// directly.
120    ///
121    /// See [`Self::values_to_arrays`] to convert multiple columnar values into
122    /// arrays of the same length.
123    ///
124    /// # Errors
125    ///
126    /// Errors if `self` is a Scalar that fails to be converted into an array of size
127    pub fn into_array(self, num_rows: usize) -> Result<ArrayRef> {
128        Ok(match self {
129            ColumnarValue::Array(array) => array,
130            ColumnarValue::Scalar(scalar) => scalar.to_array_of_size(num_rows)?,
131        })
132    }
133
134    /// Convert a columnar value into an Arrow [`ArrayRef`] with the specified
135    /// number of rows. [`Self::Scalar`] is converted by repeating the same
136    /// scalar multiple times which is not as efficient as handling the scalar
137    /// directly.
138    ///
139    /// See [`Self::values_to_arrays`] to convert multiple columnar values into
140    /// arrays of the same length.
141    ///
142    /// # Errors
143    ///
144    /// Errors if `self` is a Scalar that fails to be converted into an array of size
145    pub fn to_array(&self, num_rows: usize) -> Result<ArrayRef> {
146        Ok(match self {
147            ColumnarValue::Array(array) => Arc::clone(array),
148            ColumnarValue::Scalar(scalar) => scalar.to_array_of_size(num_rows)?,
149        })
150    }
151
152    /// Null columnar values are implemented as a null array in order to pass batch
153    /// num_rows
154    pub fn create_null_array(num_rows: usize) -> Self {
155        ColumnarValue::Array(Arc::new(NullArray::new(num_rows)))
156    }
157
158    /// Converts  [`ColumnarValue`]s to [`ArrayRef`]s with the same length.
159    ///
160    /// # Performance Note
161    ///
162    /// This function expands any [`ScalarValue`] to an array. This expansion
163    /// permits using a single function in terms of arrays, but it can be
164    /// inefficient compared to handling the scalar value directly.
165    ///
166    /// Thus, it is recommended to provide specialized implementations for
167    /// scalar values if performance is a concern.
168    ///
169    /// # Errors
170    ///
171    /// If there are multiple array arguments that have different lengths
172    pub fn values_to_arrays(args: &[ColumnarValue]) -> Result<Vec<ArrayRef>> {
173        if args.is_empty() {
174            return Ok(vec![]);
175        }
176
177        let mut array_len = None;
178        for arg in args {
179            array_len = match (arg, array_len) {
180                (ColumnarValue::Array(a), None) => Some(a.len()),
181                (ColumnarValue::Array(a), Some(array_len)) => {
182                    if array_len == a.len() {
183                        Some(array_len)
184                    } else {
185                        return internal_err!(
186                            "Arguments has mixed length. Expected length: {array_len}, found length: {}", a.len()
187                        );
188                    }
189                }
190                (ColumnarValue::Scalar(_), array_len) => array_len,
191            }
192        }
193
194        // If array_len is none, it means there are only scalars, so make a 1 element array
195        let inferred_length = array_len.unwrap_or(1);
196
197        let args = args
198            .iter()
199            .map(|arg| arg.to_array(inferred_length))
200            .collect::<Result<Vec<_>>>()?;
201
202        Ok(args)
203    }
204
205    /// Cast's this [ColumnarValue] to the specified `DataType`
206    pub fn cast_to(
207        &self,
208        cast_type: &DataType,
209        cast_options: Option<&CastOptions<'static>>,
210    ) -> Result<ColumnarValue> {
211        let cast_options = cast_options.cloned().unwrap_or(DEFAULT_CAST_OPTIONS);
212        match self {
213            ColumnarValue::Array(array) => Ok(ColumnarValue::Array(
214                kernels::cast::cast_with_options(array, cast_type, &cast_options)?,
215            )),
216            ColumnarValue::Scalar(scalar) => Ok(ColumnarValue::Scalar(
217                scalar.cast_to_with_options(cast_type, &cast_options)?,
218            )),
219        }
220    }
221}
222
223// Implement Display trait for ColumnarValue
224impl fmt::Display for ColumnarValue {
225    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
226        let formatted = match self {
227            ColumnarValue::Array(array) => {
228                pretty_format_columns("ColumnarValue(ArrayRef)", &[Arc::clone(array)])
229            }
230            ColumnarValue::Scalar(_) => {
231                if let Ok(array) = self.to_array(1) {
232                    pretty_format_columns("ColumnarValue(ScalarValue)", &[array])
233                } else {
234                    return write!(f, "Error formatting columnar value");
235                }
236            }
237        };
238
239        if let Ok(formatted) = formatted {
240            write!(f, "{formatted}")
241        } else {
242            write!(f, "Error formatting columnar value")
243        }
244    }
245}
246
#[cfg(test)]
mod tests {
    use super::*;
    use arrow::array::Int32Array;

    #[test]
    fn values_to_arrays() {
        // Each case pairs the columnar inputs with the arrays they should
        // expand to.
        let cases = vec![
            // empty
            TestCase {
                input: vec![],
                expected: vec![],
            },
            // one array of length 3
            TestCase {
                input: vec![ColumnarValue::Array(make_array(1, 3))],
                expected: vec![make_array(1, 3)],
            },
            // two arrays length 3
            TestCase {
                input: vec![
                    ColumnarValue::Array(make_array(1, 3)),
                    ColumnarValue::Array(make_array(2, 3)),
                ],
                expected: vec![make_array(1, 3), make_array(2, 3)],
            },
            // array and scalar
            TestCase {
                input: vec![
                    ColumnarValue::Array(make_array(1, 3)),
                    ColumnarValue::Scalar(ScalarValue::Int32(Some(100))),
                ],
                expected: vec![
                    make_array(1, 3),
                    make_array(100, 3), // scalar is expanded
                ],
            },
            // scalar and array
            TestCase {
                input: vec![
                    ColumnarValue::Scalar(ScalarValue::Int32(Some(100))),
                    ColumnarValue::Array(make_array(1, 3)),
                ],
                expected: vec![
                    make_array(100, 3), // scalar is expanded
                    make_array(1, 3),
                ],
            },
            // multiple scalars and array
            TestCase {
                input: vec![
                    ColumnarValue::Scalar(ScalarValue::Int32(Some(100))),
                    ColumnarValue::Array(make_array(1, 3)),
                    ColumnarValue::Scalar(ScalarValue::Int32(Some(200))),
                ],
                expected: vec![
                    make_array(100, 3), // scalar is expanded
                    make_array(1, 3),
                    make_array(200, 3), // scalar is expanded
                ],
            },
            // only scalars: no array supplies a length, so each scalar
            // becomes a 1 element array
            TestCase {
                input: vec![
                    ColumnarValue::Scalar(ScalarValue::Int32(Some(100))),
                    ColumnarValue::Scalar(ScalarValue::Int32(Some(200))),
                ],
                expected: vec![make_array(100, 1), make_array(200, 1)],
            },
        ];
        for case in cases {
            case.run();
        }
    }

    /// Two arrays of different lengths must be rejected.
    #[test]
    #[should_panic(
        expected = "Arguments has mixed length. Expected length: 3, found length: 4"
    )]
    fn values_to_arrays_mixed_length() {
        ColumnarValue::values_to_arrays(&[
            ColumnarValue::Array(make_array(1, 3)),
            ColumnarValue::Array(make_array(2, 4)),
        ])
        .unwrap();
    }

    /// A scalar between two arrays must not hide their length mismatch.
    #[test]
    #[should_panic(
        expected = "Arguments has mixed length. Expected length: 3, found length: 7"
    )]
    fn values_to_arrays_mixed_length_and_scalar() {
        ColumnarValue::values_to_arrays(&[
            ColumnarValue::Array(make_array(1, 3)),
            ColumnarValue::Scalar(ScalarValue::Int32(Some(100))),
            ColumnarValue::Array(make_array(2, 7)),
        ])
        .unwrap();
    }

    /// One `values_to_arrays` scenario: the columnar inputs and the arrays
    /// they are expected to be converted to.
    struct TestCase {
        input: Vec<ColumnarValue>,
        expected: Vec<ArrayRef>,
    }

    impl TestCase {
        /// Runs `ColumnarValue::values_to_arrays` on the input and asserts
        /// that the result equals the expected arrays.
        fn run(self) {
            let Self { input, expected } = self;

            assert_eq!(
                ColumnarValue::values_to_arrays(&input).unwrap(),
                expected,
                "\ninput: {input:?}\nexpected: {expected:?}"
            );
        }
    }

    /// Makes an array of length `len` with all elements set to `val`
    fn make_array(val: i32, len: usize) -> ArrayRef {
        Arc::new(Int32Array::from(vec![val; len]))
    }

    /// A scalar is displayed as a one row table.
    #[test]
    fn test_display_scalar() {
        let column = ColumnarValue::from(ScalarValue::from("foo"));
        assert_eq!(
            column.to_string(),
            concat!(
                "+----------------------------+\n",
                "| ColumnarValue(ScalarValue) |\n",
                "+----------------------------+\n",
                "| foo                        |\n",
                "+----------------------------+"
            )
        );
    }

    /// An array is displayed with one table row per element.
    #[test]
    fn test_display_array() {
        let array: ArrayRef = Arc::new(Int32Array::from_iter_values(vec![1, 2, 3]));
        let column = ColumnarValue::from(array);
        assert_eq!(
            column.to_string(),
            concat!(
                "+-------------------------+\n",
                "| ColumnarValue(ArrayRef) |\n",
                "+-------------------------+\n",
                "| 1                       |\n",
                "| 2                       |\n",
                "| 3                       |\n",
                "+-------------------------+"
            )
        );
    }
}