arrow_array/array/
dictionary_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::builder::{PrimitiveDictionaryBuilder, StringDictionaryBuilder};
19use crate::cast::AsArray;
20use crate::iterator::ArrayIter;
21use crate::types::*;
22use crate::{
23    make_array, Array, ArrayAccessor, ArrayRef, ArrowNativeTypeOp, PrimitiveArray, Scalar,
24    StringArray,
25};
26use arrow_buffer::bit_util::set_bit;
27use arrow_buffer::buffer::NullBuffer;
28use arrow_buffer::{ArrowNativeType, BooleanBuffer, BooleanBufferBuilder};
29use arrow_data::ArrayData;
30use arrow_schema::{ArrowError, DataType};
31use std::any::Any;
32use std::sync::Arc;
33
/// A [`DictionaryArray`] indexed by `i8`
///
/// Keys are stored as [`Int8Type`]; valid (non-null) keys lie in
/// `0..=i8::MAX`, so at most 128 dictionary values can be referenced.
///
/// # Example: Using `collect`
/// ```
/// # use arrow_array::{Array, Int8DictionaryArray, Int8Array, StringArray};
/// # use std::sync::Arc;
///
/// let array: Int8DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
/// assert_eq!(array.keys(), &Int8Array::from(vec![0, 0, 1, 2]));
/// assert_eq!(array.values(), &values);
/// ```
///
/// See [`DictionaryArray`] for more information and examples
pub type Int8DictionaryArray = DictionaryArray<Int8Type>;
49
/// A [`DictionaryArray`] indexed by `i16`
///
/// Keys are stored as [`Int16Type`]; valid (non-null) keys lie in
/// `0..=i16::MAX`, so at most 32,768 dictionary values can be referenced.
///
/// # Example: Using `collect`
/// ```
/// # use arrow_array::{Array, Int16DictionaryArray, Int16Array, StringArray};
/// # use std::sync::Arc;
///
/// let array: Int16DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
/// assert_eq!(array.keys(), &Int16Array::from(vec![0, 0, 1, 2]));
/// assert_eq!(array.values(), &values);
/// ```
///
/// See [`DictionaryArray`] for more information and examples
pub type Int16DictionaryArray = DictionaryArray<Int16Type>;
65
/// A [`DictionaryArray`] indexed by `i32`
///
/// Keys are stored as [`Int32Type`]; valid (non-null) keys lie in
/// `0..=i32::MAX`.
///
/// # Example: Using `collect`
/// ```
/// # use arrow_array::{Array, Int32DictionaryArray, Int32Array, StringArray};
/// # use std::sync::Arc;
///
/// let array: Int32DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
/// assert_eq!(array.keys(), &Int32Array::from(vec![0, 0, 1, 2]));
/// assert_eq!(array.values(), &values);
/// ```
///
/// See [`DictionaryArray`] for more information and examples
pub type Int32DictionaryArray = DictionaryArray<Int32Type>;
81
/// A [`DictionaryArray`] indexed by `i64`
///
/// Keys are stored as [`Int64Type`]; valid (non-null) keys lie in
/// `0..=i64::MAX`.
///
/// # Example: Using `collect`
/// ```
/// # use arrow_array::{Array, Int64DictionaryArray, Int64Array, StringArray};
/// # use std::sync::Arc;
///
/// let array: Int64DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
/// assert_eq!(array.keys(), &Int64Array::from(vec![0, 0, 1, 2]));
/// assert_eq!(array.values(), &values);
/// ```
///
/// See [`DictionaryArray`] for more information and examples
pub type Int64DictionaryArray = DictionaryArray<Int64Type>;
97
/// A [`DictionaryArray`] indexed by `u8`
///
/// Keys are stored as [`UInt8Type`]; keys lie in `0..=u8::MAX`, so at
/// most 256 dictionary values can be referenced.
///
/// # Example: Using `collect`
/// ```
/// # use arrow_array::{Array, UInt8DictionaryArray, UInt8Array, StringArray};
/// # use std::sync::Arc;
///
/// let array: UInt8DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
/// assert_eq!(array.keys(), &UInt8Array::from(vec![0, 0, 1, 2]));
/// assert_eq!(array.values(), &values);
/// ```
///
/// See [`DictionaryArray`] for more information and examples
pub type UInt8DictionaryArray = DictionaryArray<UInt8Type>;
113
/// A [`DictionaryArray`] indexed by `u16`
///
/// Keys are stored as [`UInt16Type`]; keys lie in `0..=u16::MAX`, so at
/// most 65,536 dictionary values can be referenced.
///
/// # Example: Using `collect`
/// ```
/// # use arrow_array::{Array, UInt16DictionaryArray, UInt16Array, StringArray};
/// # use std::sync::Arc;
///
/// let array: UInt16DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
/// assert_eq!(array.keys(), &UInt16Array::from(vec![0, 0, 1, 2]));
/// assert_eq!(array.values(), &values);
/// ```
///
/// See [`DictionaryArray`] for more information and examples
pub type UInt16DictionaryArray = DictionaryArray<UInt16Type>;
129
/// A [`DictionaryArray`] indexed by `u32`
///
/// Keys are stored as [`UInt32Type`]; keys lie in `0..=u32::MAX`.
///
/// # Example: Using `collect`
/// ```
/// # use arrow_array::{Array, UInt32DictionaryArray, UInt32Array, StringArray};
/// # use std::sync::Arc;
///
/// let array: UInt32DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
/// assert_eq!(array.keys(), &UInt32Array::from(vec![0, 0, 1, 2]));
/// assert_eq!(array.values(), &values);
/// ```
///
/// See [`DictionaryArray`] for more information and examples
pub type UInt32DictionaryArray = DictionaryArray<UInt32Type>;
145
/// A [`DictionaryArray`] indexed by `u64`
///
/// Keys are stored as [`UInt64Type`]; keys lie in `0..=u64::MAX`.
///
/// # Example: Using `collect`
/// ```
/// # use arrow_array::{Array, UInt64DictionaryArray, UInt64Array, StringArray};
/// # use std::sync::Arc;
///
/// let array: UInt64DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
/// assert_eq!(array.keys(), &UInt64Array::from(vec![0, 0, 1, 2]));
/// assert_eq!(array.values(), &values);
/// ```
///
/// See [`DictionaryArray`] for more information and examples
pub type UInt64DictionaryArray = DictionaryArray<UInt64Type>;
161
/// An array of [dictionary encoded values](https://arrow.apache.org/docs/format/Columnar.html#dictionary-encoded-layout)
163///
164/// This is mostly used to represent strings or a limited set of primitive types as integers,
165/// for example when doing NLP analysis or representing chromosomes by name.
166///
167/// [`DictionaryArray`] are represented using a `keys` array and a
168/// `values` array, which may be different lengths. The `keys` array
169/// stores indexes in the `values` array which holds
170/// the corresponding logical value, as shown here:
171///
172/// ```text
173/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
174///   ┌─────────────────┐  ┌─────────┐ │     ┌─────────────────┐
175/// │ │        A        │  │    0    │       │        A        │     values[keys[0]]
176///   ├─────────────────┤  ├─────────┤ │     ├─────────────────┤
177/// │ │        D        │  │    2    │       │        B        │     values[keys[1]]
178///   ├─────────────────┤  ├─────────┤ │     ├─────────────────┤
179/// │ │        B        │  │    2    │       │        B        │     values[keys[2]]
180///   └─────────────────┘  ├─────────┤ │     ├─────────────────┤
181/// │                      │    1    │       │        D        │     values[keys[3]]
182///                        ├─────────┤ │     ├─────────────────┤
183/// │                      │    1    │       │        D        │     values[keys[4]]
184///                        ├─────────┤ │     ├─────────────────┤
185/// │                      │    0    │       │        A        │     values[keys[5]]
186///                        └─────────┘ │     └─────────────────┘
187/// │       values            keys
188///  ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘
189///                                             Logical array
190///                                                Contents
191///           DictionaryArray
192///              length = 6
193/// ```
194///
195/// # Example: From Nullable Data
196///
197/// ```
198/// # use arrow_array::{DictionaryArray, Int8Array, types::Int8Type};
199/// let test = vec!["a", "a", "b", "c"];
200/// let array : DictionaryArray<Int8Type> = test.iter().map(|&x| if x == "b" {None} else {Some(x)}).collect();
201/// assert_eq!(array.keys(), &Int8Array::from(vec![Some(0), Some(0), None, Some(1)]));
202/// ```
203///
204/// # Example: From Non-Nullable Data
205///
206/// ```
207/// # use arrow_array::{DictionaryArray, Int8Array, types::Int8Type};
208/// let test = vec!["a", "a", "b", "c"];
209/// let array : DictionaryArray<Int8Type> = test.into_iter().collect();
210/// assert_eq!(array.keys(), &Int8Array::from(vec![0, 0, 1, 2]));
211/// ```
212///
213/// # Example: From Existing Arrays
214///
215/// ```
216/// # use std::sync::Arc;
217/// # use arrow_array::{DictionaryArray, Int8Array, StringArray, types::Int8Type};
218/// // You can form your own DictionaryArray by providing the
219/// // values (dictionary) and keys (indexes into the dictionary):
220/// let values = StringArray::from_iter_values(["a", "b", "c"]);
221/// let keys = Int8Array::from_iter_values([0, 0, 1, 2]);
222/// let array = DictionaryArray::<Int8Type>::try_new(keys, Arc::new(values)).unwrap();
223/// let expected: DictionaryArray::<Int8Type> = vec!["a", "a", "b", "c"].into_iter().collect();
224/// assert_eq!(&array, &expected);
225/// ```
226///
227/// # Example: Using Builder
228///
229/// ```
230/// # use arrow_array::{Array, StringArray};
231/// # use arrow_array::builder::StringDictionaryBuilder;
232/// # use arrow_array::types::Int32Type;
233/// let mut builder = StringDictionaryBuilder::<Int32Type>::new();
234/// builder.append_value("a");
235/// builder.append_null();
236/// builder.append_value("a");
237/// builder.append_value("b");
238/// let array = builder.finish();
239///
240/// let values: Vec<_> = array.downcast_dict::<StringArray>().unwrap().into_iter().collect();
241/// assert_eq!(&values, &[Some("a"), None, Some("a"), Some("b")]);
242/// ```
pub struct DictionaryArray<K: ArrowDictionaryKeyType> {
    /// The [`DataType::Dictionary`] describing this array, combining the
    /// data type of `keys` with the data type of `values`.
    data_type: DataType,

    /// The keys of this dictionary: indexes into `values`. Note that
    /// these are not the logical values of this array. Rather, each
    /// key maps to the real value stored in `values`.
    keys: PrimitiveArray<K>,

    /// Array of dictionary values (can be any DataType).
    values: ArrayRef,

    /// Whether the dictionary values are ordered. `try_new` and
    /// `new_unchecked` initialise this to `false`.
    is_ordered: bool,
}
258
259impl<K: ArrowDictionaryKeyType> Clone for DictionaryArray<K> {
260    fn clone(&self) -> Self {
261        Self {
262            data_type: self.data_type.clone(),
263            keys: self.keys.clone(),
264            values: self.values.clone(),
265            is_ordered: self.is_ordered,
266        }
267    }
268}
269
270impl<K: ArrowDictionaryKeyType> DictionaryArray<K> {
271    /// Attempt to create a new DictionaryArray with a specified keys
272    /// (indexes into the dictionary) and values (dictionary)
273    /// array.
274    ///
275    /// # Panics
276    ///
277    /// Panics if [`Self::try_new`] returns an error
278    pub fn new(keys: PrimitiveArray<K>, values: ArrayRef) -> Self {
279        Self::try_new(keys, values).unwrap()
280    }
281
282    /// Attempt to create a new DictionaryArray with a specified keys
283    /// (indexes into the dictionary) and values (dictionary)
284    /// array.
285    ///
286    /// # Errors
287    ///
288    /// Returns an error if any `keys[i] >= values.len() || keys[i] < 0`
289    pub fn try_new(keys: PrimitiveArray<K>, values: ArrayRef) -> Result<Self, ArrowError> {
290        let data_type = DataType::Dictionary(
291            Box::new(keys.data_type().clone()),
292            Box::new(values.data_type().clone()),
293        );
294
295        let zero = K::Native::usize_as(0);
296        let values_len = values.len();
297
298        if let Some((idx, v)) =
299            keys.values().iter().enumerate().find(|(idx, v)| {
300                (v.is_lt(zero) || v.as_usize() >= values_len) && keys.is_valid(*idx)
301            })
302        {
303            return Err(ArrowError::InvalidArgumentError(format!(
304                "Invalid dictionary key {v:?} at index {idx}, expected 0 <= key < {values_len}",
305            )));
306        }
307
308        Ok(Self {
309            data_type,
310            keys,
311            values,
312            is_ordered: false,
313        })
314    }
315
316    /// Create a new [`Scalar`] from `value`
317    pub fn new_scalar<T: Array + 'static>(value: Scalar<T>) -> Scalar<Self> {
318        Scalar::new(Self::new(
319            PrimitiveArray::new(vec![K::Native::usize_as(0)].into(), None),
320            Arc::new(value.into_inner()),
321        ))
322    }
323
324    /// Create a new [`DictionaryArray`] without performing validation
325    ///
326    /// # Safety
327    ///
328    /// Safe provided [`Self::try_new`] would not return an error
329    pub unsafe fn new_unchecked(keys: PrimitiveArray<K>, values: ArrayRef) -> Self {
330        if cfg!(feature = "force_validate") {
331            return Self::new(keys, values);
332        }
333
334        let data_type = DataType::Dictionary(
335            Box::new(keys.data_type().clone()),
336            Box::new(values.data_type().clone()),
337        );
338
339        Self {
340            data_type,
341            keys,
342            values,
343            is_ordered: false,
344        }
345    }
346
347    /// Deconstruct this array into its constituent parts
348    pub fn into_parts(self) -> (PrimitiveArray<K>, ArrayRef) {
349        (self.keys, self.values)
350    }
351
352    /// Return an array view of the keys of this dictionary as a PrimitiveArray.
353    pub fn keys(&self) -> &PrimitiveArray<K> {
354        &self.keys
355    }
356
357    /// If `value` is present in `values` (aka the dictionary),
358    /// returns the corresponding key (index into the `values`
359    /// array). Otherwise returns `None`.
360    ///
361    /// Panics if `values` is not a [`StringArray`].
362    pub fn lookup_key(&self, value: &str) -> Option<K::Native> {
363        let rd_buf: &StringArray = self.values.as_any().downcast_ref::<StringArray>().unwrap();
364
365        (0..rd_buf.len())
366            .position(|i| rd_buf.value(i) == value)
367            .and_then(K::Native::from_usize)
368    }
369
370    /// Returns a reference to the dictionary values array
371    pub fn values(&self) -> &ArrayRef {
372        &self.values
373    }
374
375    /// Returns a clone of the value type of this list.
376    pub fn value_type(&self) -> DataType {
377        self.values.data_type().clone()
378    }
379
380    /// The length of the dictionary is the length of the keys array.
381    pub fn len(&self) -> usize {
382        self.keys.len()
383    }
384
385    /// Whether this dictionary is empty
386    pub fn is_empty(&self) -> bool {
387        self.keys.is_empty()
388    }
389
390    /// Currently exists for compatibility purposes with Arrow IPC.
391    pub fn is_ordered(&self) -> bool {
392        self.is_ordered
393    }
394
395    /// Return an iterator over the keys (indexes into the dictionary)
396    pub fn keys_iter(&self) -> impl Iterator<Item = Option<usize>> + '_ {
397        self.keys.iter().map(|key| key.map(|k| k.as_usize()))
398    }
399
400    /// Return the value of `keys` (the dictionary key) at index `i`,
401    /// cast to `usize`, `None` if the value at `i` is `NULL`.
402    pub fn key(&self, i: usize) -> Option<usize> {
403        self.keys.is_valid(i).then(|| self.keys.value(i).as_usize())
404    }
405
406    /// Returns a zero-copy slice of this array with the indicated offset and length.
407    pub fn slice(&self, offset: usize, length: usize) -> Self {
408        Self {
409            data_type: self.data_type.clone(),
410            keys: self.keys.slice(offset, length),
411            values: self.values.clone(),
412            is_ordered: self.is_ordered,
413        }
414    }
415
416    /// Downcast this dictionary to a [`TypedDictionaryArray`]
417    ///
418    /// ```
419    /// use arrow_array::{Array, ArrayAccessor, DictionaryArray, StringArray, types::Int32Type};
420    ///
421    /// let orig = [Some("a"), Some("b"), None];
422    /// let dictionary = DictionaryArray::<Int32Type>::from_iter(orig);
423    /// let typed = dictionary.downcast_dict::<StringArray>().unwrap();
424    /// assert_eq!(typed.value(0), "a");
425    /// assert_eq!(typed.value(1), "b");
426    /// assert!(typed.is_null(2));
427    /// ```
428    ///
429    pub fn downcast_dict<V: 'static>(&self) -> Option<TypedDictionaryArray<'_, K, V>> {
430        let values = self.values.as_any().downcast_ref()?;
431        Some(TypedDictionaryArray {
432            dictionary: self,
433            values,
434        })
435    }
436
437    /// Returns a new dictionary with the same keys as the current instance
438    /// but with a different set of dictionary values
439    ///
440    /// This can be used to perform an operation on the values of a dictionary
441    ///
442    /// # Panics
443    ///
444    /// Panics if `values` has a length less than the current values
445    ///
446    /// ```
447    /// # use std::sync::Arc;
448    /// # use arrow_array::builder::PrimitiveDictionaryBuilder;
449    /// # use arrow_array::{Int8Array, Int64Array, ArrayAccessor};
450    /// # use arrow_array::types::{Int32Type, Int8Type};
451    ///
452    /// // Construct a Dict(Int32, Int8)
453    /// let mut builder = PrimitiveDictionaryBuilder::<Int32Type, Int8Type>::with_capacity(2, 200);
454    /// for i in 0..100 {
455    ///     builder.append(i % 2).unwrap();
456    /// }
457    ///
458    /// let dictionary = builder.finish();
459    ///
460    /// // Perform a widening cast of dictionary values
461    /// let typed_dictionary = dictionary.downcast_dict::<Int8Array>().unwrap();
462    /// let values: Int64Array = typed_dictionary.values().unary(|x| x as i64);
463    ///
    /// // Create a Dict(Int32, Int64)
465    /// let new = dictionary.with_values(Arc::new(values));
466    ///
467    /// // Verify values are as expected
468    /// let new_typed = new.downcast_dict::<Int64Array>().unwrap();
469    /// for i in 0..100 {
470    ///     assert_eq!(new_typed.value(i), (i % 2) as i64)
471    /// }
472    /// ```
473    ///
474    pub fn with_values(&self, values: ArrayRef) -> Self {
475        assert!(values.len() >= self.values.len());