datafusion_expr_common/columnar_value.rs
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
17
18//! [`ColumnarValue`] represents the result of evaluating an expression.
19
20use arrow::array::{Array, ArrayRef, NullArray};
21use arrow::compute::{kernels, CastOptions};
22use arrow::datatypes::DataType;
23use arrow::util::pretty::pretty_format_columns;
24use datafusion_common::format::DEFAULT_CAST_OPTIONS;
25use datafusion_common::{internal_err, Result, ScalarValue};
26use std::fmt;
27use std::sync::Arc;
28
29/// The result of evaluating an expression.
30///
31/// [`ColumnarValue::Scalar`] represents a single value repeated any number of
32/// times. This is an important performance optimization for handling values
33/// that do not change across rows.
34///
35/// [`ColumnarValue::Array`] represents a column of data, stored as an Arrow
36/// [`ArrayRef`]
37///
38/// A slice of `ColumnarValue`s logically represents a table, with each column
39/// having the same number of rows. This means that all `Array`s are the same
40/// length.
41///
42/// # Example
43///
44/// A `ColumnarValue::Array` with an array of 5 elements and a
45/// `ColumnarValue::Scalar` with the value 100
46///
47/// ```text
48/// ┌──────────────┐
49/// │ ┌──────────┐ │
50/// │ │ "A" │ │
51/// │ ├──────────┤ │
52/// │ │ "B" │ │
53/// │ ├──────────┤ │
54/// │ │ "C" │ │
55/// │ ├──────────┤ │
56/// │ │ "D" │ │ ┌──────────────┐
57/// │ ├──────────┤ │ │ ┌──────────┐ │
58/// │ │ "E" │ │ │ │ 100 │ │
59/// │ └──────────┘ │ │ └──────────┘ │
60/// └──────────────┘ └──────────────┘
61///
62/// ColumnarValue:: ColumnarValue::
63/// Array Scalar
64/// ```
65///
66/// Logically represents the following table:
67///
68/// | Column 1| Column 2 |
69/// | ------- | -------- |
70/// | A | 100 |
71/// | B | 100 |
72/// | C | 100 |
73/// | D | 100 |
74/// | E | 100 |
75///
76/// # Performance Notes
77///
78/// When implementing functions or operators, it is important to consider the
79/// performance implications of handling scalar values.
80///
81/// Because all functions must handle [`ArrayRef`], it is
82/// convenient to convert [`ColumnarValue::Scalar`]s using
83/// [`Self::into_array`]. For example, [`ColumnarValue::values_to_arrays`]
84/// converts multiple columnar values into arrays of the same length.
85///
86/// However, it is often much more performant to provide a different,
87/// implementation that handles scalar values differently
88#[derive(Clone, Debug)]
89pub enum ColumnarValue {
90 /// Array of values
91 Array(ArrayRef),
92 /// A single value
93 Scalar(ScalarValue),
94}
95
96impl From<ArrayRef> for ColumnarValue {
97 fn from(value: ArrayRef) -> Self {
98 ColumnarValue::Array(value)
99 }
100}
101
102impl From<ScalarValue> for ColumnarValue {
103 fn from(value: ScalarValue) -> Self {
104 ColumnarValue::Scalar(value)
105 }
106}
107
108impl ColumnarValue {
109 pub fn data_type(&self) -> DataType {
110 match self {
111 ColumnarValue::Array(array_value) => array_value.data_type().clone(),
112 ColumnarValue::Scalar(scalar_value) => scalar_value.data_type(),
113 }
114 }
115
116 /// Convert a columnar value into an Arrow [`ArrayRef`] with the specified
117 /// number of rows. [`Self::Scalar`] is converted by repeating the same
118 /// scalar multiple times which is not as efficient as handling the scalar
119 /// directly.
120 ///
121 /// See [`Self::values_to_arrays`] to convert multiple columnar values into
122 /// arrays of the same length.
123 ///
124 /// # Errors
125 ///
126 /// Errors if `self` is a Scalar that fails to be converted into an array of size
127 pub fn into_array(self, num_rows: usize) -> Result<ArrayRef> {
128 Ok(match self {
129 ColumnarValue::Array(array) => array,
130 ColumnarValue::Scalar(scalar) => scalar.to_array_of_size(num_rows)?,
131 })
132 }
133
134 /// Convert a columnar value into an Arrow [`ArrayRef`] with the specified
135 /// number of rows. [`Self::Scalar`] is converted by repeating the same
136 /// scalar multiple times which is not as efficient as handling the scalar
137 /// directly.
138 ///
139 /// See [`Self::values_to_arrays`] to convert multiple columnar values into
140 /// arrays of the same length.
141 ///
142 /// # Errors
143 ///
144 /// Errors if `self` is a Scalar that fails to be converted into an array of size
145 pub fn to_array(&self, num_rows: usize) -> Result<ArrayRef> {
146 Ok(match self {
147 ColumnarValue::Array(array) => Arc::clone(array),
148 ColumnarValue::Scalar(scalar) => scalar.to_array_of_size(num_rows)?,
149 })
150 }
151
152 /// Null columnar values are implemented as a null array in order to pass batch
153 /// num_rows
154 pub fn create_null_array(num_rows: usize) -> Self {
155 ColumnarValue::Array(Arc::new(NullArray::new(num_rows)))
156 }
157
158 /// Converts [`ColumnarValue`]s to [`ArrayRef`]s with the same length.
159 ///
160 /// # Performance Note
161 ///
162 /// This function expands any [`ScalarValue`] to an array. This expansion
163 /// permits using a single function in terms of arrays, but it can be
164 /// inefficient compared to handling the scalar value directly.
165 ///
166 /// Thus, it is recommended to provide specialized implementations for
167 /// scalar values if performance is a concern.
168 ///
169 /// # Errors
170 ///
171 /// If there are multiple array arguments that have different lengths
172 pub fn values_to_arrays(args: &[ColumnarValue]) -> Result<Vec<ArrayRef>> {
173 if args.is_empty() {
174 return Ok(vec![]);
175 }
176
177 let mut array_len = None;
178 for arg in args {
179 array_len = match (arg, array_len) {
180 (ColumnarValue::Array(a), None) => Some(a.len()),
181 (ColumnarValue::Array(a), Some(array_len)) => {
182 if array_len == a.len() {
183 Some(array_len)
184 } else {
185 return internal_err!(
186 "Arguments has mixed length. Expected length: {array_len}, found length: {}", a.len()
187 );
188 }
189 }
190 (ColumnarValue::Scalar(_), array_len) => array_len,
191 }
192 }
193
194 // If array_len is none, it means there are only scalars, so make a 1 element array
195 let inferred_length = array_len.unwrap_or(1);
196
197 let args = args
198 .iter()
199 .map(|arg| arg.to_array(inferred_length))
200 .collect::<Result<Vec<_>>>()?;
201
202 Ok(args)
203 }
204
205 /// Cast's this [ColumnarValue] to the specified `DataType`
206 pub fn cast_to(
207 &self,
208 cast_type: &DataType,
209 cast_options: Option<&CastOptions<'static>>,
210 ) -> Result<ColumnarValue> {
211 let cast_options = cast_options.cloned().unwrap_or(DEFAULT_CAST_OPTIONS);
212 match self {
213 ColumnarValue::Array(array) => Ok(ColumnarValue::Array(
214 kernels::cast::cast_with_options(array, cast_type, &cast_options)?,
215 )),
216 ColumnarValue::Scalar(scalar) => Ok(ColumnarValue::Scalar(
217 scalar.cast_to_with_options(cast_type, &cast_options)?,
218 )),
219 }
220 }
221}
222
223// Implement Display trait for ColumnarValue
224impl fmt::Display for ColumnarValue {
225 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
226 let formatted = match self {
227 ColumnarValue::Array(array) => {
228 pretty_format_columns("ColumnarValue(ArrayRef)", &[Arc::clone(array)])
229 }
230 ColumnarValue::Scalar(_) => {
231 if let Ok(array) = self.to_array(1) {
232 pretty_format_columns("ColumnarValue(ScalarValue)", &[array])
233 } else {
234 return write!(f, "Error formatting columnar value");
235 }
236 }
237 };
238
239 if let Ok(formatted) = formatted {
240 write!(f, "{formatted}")
241 } else {
242 write!(f, "Error formatting columnar value")
243 }
244 }
245}
246
#[cfg(test)]
mod tests {
    use super::*;
    use arrow::array::Int32Array;

    /// `values_to_arrays` expands scalars to the length of the array
    /// arguments and leaves arrays untouched.
    #[test]
    fn values_to_arrays() {
        // (input, expected)
        let cases = vec![
            // empty
            TestCase {
                input: vec![],
                expected: vec![],
            },
            // one array of length 3
            TestCase {
                input: vec![ColumnarValue::Array(make_array(1, 3))],
                expected: vec![make_array(1, 3)],
            },
            // two arrays length 3
            TestCase {
                input: vec![
                    ColumnarValue::Array(make_array(1, 3)),
                    ColumnarValue::Array(make_array(2, 3)),
                ],
                expected: vec![make_array(1, 3), make_array(2, 3)],
            },
            // array and scalar
            TestCase {
                input: vec![
                    ColumnarValue::Array(make_array(1, 3)),
                    ColumnarValue::Scalar(ScalarValue::Int32(Some(100))),
                ],
                expected: vec![
                    make_array(1, 3),
                    make_array(100, 3), // scalar is expanded
                ],
            },
            // scalar and array
            TestCase {
                input: vec![
                    ColumnarValue::Scalar(ScalarValue::Int32(Some(100))),
                    ColumnarValue::Array(make_array(1, 3)),
                ],
                expected: vec![
                    make_array(100, 3), // scalar is expanded
                    make_array(1, 3),
                ],
            },
            // multiple scalars and array
            TestCase {
                input: vec![
                    ColumnarValue::Scalar(ScalarValue::Int32(Some(100))),
                    ColumnarValue::Array(make_array(1, 3)),
                    ColumnarValue::Scalar(ScalarValue::Int32(Some(200))),
                ],
                expected: vec![
                    make_array(100, 3), // scalar is expanded
                    make_array(1, 3),
                    make_array(200, 3), // scalar is expanded
                ],
            },
        ];
        for case in cases {
            case.run();
        }
    }

    /// Two arrays of different lengths must be rejected.
    #[test]
    #[should_panic(
        expected = "Arguments has mixed length. Expected length: 3, found length: 4"
    )]
    fn values_to_arrays_mixed_length() {
        ColumnarValue::values_to_arrays(&[
            ColumnarValue::Array(make_array(1, 3)),
            ColumnarValue::Array(make_array(2, 4)),
        ])
        .unwrap();
    }

    /// A scalar between two mismatched arrays must not hide the mismatch.
    #[test]
    #[should_panic(
        expected = "Arguments has mixed length. Expected length: 3, found length: 7"
    )]
    fn values_to_arrays_mixed_length_and_scalar() {
        ColumnarValue::values_to_arrays(&[
            ColumnarValue::Array(make_array(1, 3)),
            ColumnarValue::Scalar(ScalarValue::Int32(Some(100))),
            ColumnarValue::Array(make_array(2, 7)),
        ])
        .unwrap();
    }

    /// One `values_to_arrays` scenario: the arguments and the arrays they
    /// are expected to expand to.
    struct TestCase {
        input: Vec<ColumnarValue>,
        expected: Vec<ArrayRef>,
    }

    impl TestCase {
        /// Runs `values_to_arrays` on `input` and compares with `expected`.
        fn run(self) {
            let Self { input, expected } = self;

            assert_eq!(
                ColumnarValue::values_to_arrays(&input).unwrap(),
                expected,
                "\ninput: {input:?}\nexpected: {expected:?}"
            );
        }
    }

    /// Makes an array of length `len` with all elements set to `val`
    fn make_array(val: i32, len: usize) -> ArrayRef {
        Arc::new(Int32Array::from(vec![val; len]))
    }

    /// A scalar is displayed as a one-row table under the scalar header.
    #[test]
    fn test_display_scalar() {
        let column = ColumnarValue::from(ScalarValue::from("foo"));
        // Data rows are padded to the width of the header column, so every
        // line of the table has the same length as the border.
        assert_eq!(
            column.to_string(),
            concat!(
                "+----------------------------+\n",
                "| ColumnarValue(ScalarValue) |\n",
                "+----------------------------+\n",
                "| foo                        |\n",
                "+----------------------------+"
            )
        );
    }

    /// An array is displayed with one table row per element.
    #[test]
    fn test_display_array() {
        let array: ArrayRef = Arc::new(Int32Array::from_iter_values(vec![1, 2, 3]));
        let column = ColumnarValue::from(array);
        // Data rows are padded to the width of the header column, so every
        // line of the table has the same length as the border.
        assert_eq!(
            column.to_string(),
            concat!(
                "+-------------------------+\n",
                "| ColumnarValue(ArrayRef) |\n",
                "+-------------------------+\n",
                "| 1                       |\n",
                "| 2                       |\n",
                "| 3                       |\n",
                "+-------------------------+"
            )
        );
    }
}