// datafusion_common/config.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Runtime configuration, via [`ConfigOptions`]
19
20use crate::error::_config_err;
21use crate::parsers::CompressionTypeVariant;
22use crate::utils::get_available_parallelism;
23use crate::{DataFusionError, Result};
24use std::any::Any;
25use std::collections::{BTreeMap, HashMap};
26use std::error::Error;
27use std::fmt::{self, Display};
28use std::str::FromStr;
29
30/// A macro that wraps a configuration struct and automatically derives
31/// [`Default`] and [`ConfigField`] for it, allowing it to be used
32/// in the [`ConfigOptions`] configuration tree.
33///
34/// `transform` is used to normalize values before parsing.
35///
36/// For example,
37///
38/// ```ignore
39/// config_namespace! {
40///    /// Amazing config
41///    pub struct MyConfig {
42///        /// Field 1 doc
43///        field1: String, transform = str::to_lowercase, default = "".to_string()
44///
45///        /// Field 2 doc
46///        field2: usize, default = 232
47///
48///        /// Field 3 doc
49///        field3: Option<usize>, default = None
50///    }
51///}
52/// ```
53///
54/// Will generate
55///
56/// ```ignore
57/// /// Amazing config
58/// #[derive(Debug, Clone)]
59/// #[non_exhaustive]
60/// pub struct MyConfig {
61///     /// Field 1 doc
62///     field1: String,
63///     /// Field 2 doc
64///     field2: usize,
65///     /// Field 3 doc
66///     field3: Option<usize>,
67/// }
68/// impl ConfigField for MyConfig {
69///     fn set(&mut self, key: &str, value: &str) -> Result<()> {
70///         let (key, rem) = key.split_once('.').unwrap_or((key, ""));
71///         match key {
72///             "field1" => {
73///                 let value = str::to_lowercase(value);
74///                 self.field1.set(rem, value.as_ref())
75///             },
76///             "field2" => self.field2.set(rem, value.as_ref()),
77///             "field3" => self.field3.set(rem, value.as_ref()),
78///             _ => _internal_err!(
79///                 "Config value \"{}\" not found on MyConfig",
80///                 key
81///             ),
82///         }
83///     }
84///
85///     fn visit<V: Visit>(&self, v: &mut V, key_prefix: &str, _description: &'static str) {
86///         let key = format!("{}.field1", key_prefix);
87///         let desc = "Field 1 doc";
88///         self.field1.visit(v, key.as_str(), desc);
89///         let key = format!("{}.field2", key_prefix);
90///         let desc = "Field 2 doc";
91///         self.field2.visit(v, key.as_str(), desc);
92///         let key = format!("{}.field3", key_prefix);
93///         let desc = "Field 3 doc";
94///         self.field3.visit(v, key.as_str(), desc);
95///     }
96/// }
97///
98/// impl Default for MyConfig {
99///     fn default() -> Self {
100///         Self {
101///             field1: "".to_string(),
102///             field2: 232,
103///             field3: None,
104///         }
105///     }
106/// }
107/// ```
108///
109/// NB: Misplaced commas may result in nonsensical errors
#[macro_export]
macro_rules! config_namespace {
    (
        $(#[doc = $struct_d:tt])* // Struct-level documentation attributes
        $(#[deprecated($($struct_depr:tt)*)])? // Optional struct-level deprecated attribute
        $(#[allow($($struct_de:tt)*)])? // Optional struct-level allow attribute
        $vis:vis struct $struct_name:ident {
            $(
                $(#[doc = $d:tt])* // Field-level documentation attributes
                $(#[deprecated($($field_depr:tt)*)])? // Optional field-level deprecated attribute
                $(#[allow($($field_de:tt)*)])? // Optional field-level allow attribute
                $field_vis:vis $field_name:ident : $field_type:ty,
                $(warn = $warn:expr,)? // Optional warning logged when a non-default value is set
                $(transform = $transform:expr,)? // Optional normalization applied to the value before parsing
                default = $default:expr
            )*$(,)*
        }
    ) => {
        $(#[doc = $struct_d])* // Apply struct documentation
        $(#[deprecated($($struct_depr)*)])? // Apply struct deprecation
        $(#[allow($($struct_de)*)])?
        #[derive(Debug, Clone, PartialEq)]
        $vis struct $struct_name {
            $(
                $(#[doc = $d])* // Apply field documentation
                $(#[deprecated($($field_depr)*)])? // Apply field deprecation
                $(#[allow($($field_de)*)])?
                $field_vis $field_name: $field_type,
            )*
        }

        impl $crate::config::ConfigField for $struct_name {
            // Sets `key` to `value`: the first `.`-separated segment selects a
            // field; the remainder (possibly empty) is delegated to that
            // field's own `ConfigField::set`. Unknown keys are a config error.
            fn set(&mut self, key: &str, value: &str) -> $crate::error::Result<()> {
                let (key, rem) = key.split_once('.').unwrap_or((key, ""));
                match key {
                    $(
                        stringify!($field_name) => {
                            // Blanket `#[allow(deprecated)]` so fields that are
                            // themselves marked deprecated can still be set here.
                            {
                                $(let value = $transform(value);)? // Apply transformation if specified
                                #[allow(deprecated)]
                                let ret = self.$field_name.set(rem, value.as_ref());

                                $(if !$warn.is_empty() {
                                    let default: $field_type = $default;
                                    #[allow(deprecated)]
                                    if default != self.$field_name {
                                        log::warn!($warn);
                                    }
                                })? // Log warning if specified, and the value is not the default
                                ret
                            }
                        },
                    )*
                    _ => return $crate::error::_config_err!(
                        "Config value \"{}\" not found on {}", key, stringify!($struct_name)
                    )
                }
            }

            // Visits every field as `<key_prefix>.<field_name>`; the field's
            // doc comments are concatenated to form the description string.
            fn visit<V: $crate::config::Visit>(&self, v: &mut V, key_prefix: &str, _description: &'static str) {
                $(
                    let key = format!(concat!("{}.", stringify!($field_name)), key_prefix);
                    let desc = concat!($($d),*).trim();
                    #[allow(deprecated)]
                    self.$field_name.visit(v, key.as_str(), desc);
                )*
            }
        }
        impl Default for $struct_name {
            // Each field's `default = ...` expression becomes its `Default` value.
            fn default() -> Self {
                #[allow(deprecated)]
                Self {
                    $($field_name: $default),*
                }
            }
        }
    }
}
190
// NOTE: the `///` comments below are consumed by `config_namespace!` and become
// the runtime description strings reported via `ConfigField::visit`, so they
// double as user-facing documentation for each `datafusion.catalog.*` key.
config_namespace! {
    /// Options related to catalog and directory scanning
    ///
    /// See also: [`SessionConfig`]
    ///
    /// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
    pub struct CatalogOptions {
        /// Whether the default catalog and schema should be created automatically.
        pub create_default_catalog_and_schema: bool, default = true

        /// The default catalog name - this impacts what SQL queries use if not specified
        pub default_catalog: String, default = "datafusion".to_string()

        /// The default schema name - this impacts what SQL queries use if not specified
        pub default_schema: String, default = "public".to_string()

        /// Should DataFusion provide access to `information_schema`
        /// virtual tables for displaying schema information
        pub information_schema: bool, default = false

        /// Location scanned to load tables for `default` schema
        pub location: Option<String>, default = None

        /// Type of `TableProvider` to use when loading `default` schema
        pub format: Option<String>, default = None

        /// Default value for `format.has_header` for `CREATE EXTERNAL TABLE`
        /// if not specified explicitly in the statement.
        pub has_header: bool, default = true

        /// Specifies whether newlines in (quoted) CSV values are supported.
        ///
        /// This is the default value for `format.newlines_in_values` for `CREATE EXTERNAL TABLE`
        /// if not specified explicitly in the statement.
        ///
        /// Parsing newlines in quoted values may be affected by execution behaviour such as
        /// parallel file scanning. Setting this to `true` ensures that newlines in values are
        /// parsed successfully, which may reduce performance.
        pub newlines_in_values: bool, default = false
    }
}
232
233config_namespace! {
234    /// Options related to SQL parser
235    ///
236    /// See also: [`SessionConfig`]
237    ///
238    /// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
239    pub struct SqlParserOptions {
240        /// When set to true, SQL parser will parse float as decimal type
241        pub parse_float_as_decimal: bool, default = false
242
243        /// When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted)
244        pub enable_ident_normalization: bool, default = true
245
246        /// When set to true, SQL parser will normalize options value (convert value to lowercase).
247        /// Note that this option is ignored and will be removed in the future. All case-insensitive values
248        /// are normalized automatically.
249        pub enable_options_value_normalization: bool, warn = "`enable_options_value_normalization` is deprecated and ignored", default = false
250
251        /// Configure the SQL dialect used by DataFusion's parser; supported values include: Generic,
252        /// MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB and Databricks.
253        pub dialect: String, default = "generic".to_string()
254        // no need to lowercase because `sqlparser::dialect_from_str`] is case-insensitive
255
256        /// If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but
257        /// ignore the length. If false, error if a `VARCHAR` with a length is
258        /// specified. The Arrow type system does not have a notion of maximum
259        /// string length and thus DataFusion can not enforce such limits.
260        pub support_varchar_with_length: bool, default = true
261
262       /// If true, `VARCHAR` is mapped to `Utf8View` during SQL planning.
263       /// If false, `VARCHAR` is mapped to `Utf8`  during SQL planning.
264       /// Default is false.
265        pub map_varchar_to_utf8view: bool, default = true
266
267        /// When set to true, the source locations relative to the original SQL
268        /// query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected
269        /// and recorded in the logical plan nodes.
270        pub collect_spans: bool, default = false
271
272        /// Specifies the recursion depth limit when parsing complex SQL Queries
273        pub recursion_limit: usize, default = 50
274    }
275}
276
277config_namespace! {
278    /// Options related to query execution
279    ///
280    /// See also: [`SessionConfig`]
281    ///
282    /// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
283    pub struct ExecutionOptions {
284        /// Default batch size while creating new batches, it's especially useful for
285        /// buffer-in-memory batches since creating tiny batches would result in too much
286        /// metadata memory consumption
287        pub batch_size: usize, default = 8192
288
289        /// When set to true, record batches will be examined between each operator and
290        /// small batches will be coalesced into larger batches. This is helpful when there
291        /// are highly selective filters or joins that could produce tiny output batches. The
292        /// target batch size is determined by the configuration setting
293        pub coalesce_batches: bool, default = true
294
295        /// Should DataFusion collect statistics when first creating a table.
296        /// Has no effect after the table is created. Applies to the default
297        /// `ListingTableProvider` in DataFusion. Defaults to true.
298        pub collect_statistics: bool, default = true
299
300        /// Number of partitions for query execution. Increasing partitions can increase
301        /// concurrency.
302        ///
303        /// Defaults to the number of CPU cores on the system
304        pub target_partitions: usize, transform = ExecutionOptions::normalized_parallelism, default = get_available_parallelism()
305
306        /// The default time zone
307        ///
308        /// Some functions, e.g. `EXTRACT(HOUR from SOME_TIME)`, shift the underlying datetime
309        /// according to this time zone, and then extract the hour
310        pub time_zone: Option<String>, default = Some("+00:00".into())
311
312        /// Parquet options
313        pub parquet: ParquetOptions, default = Default::default()
314
315        /// Fan-out during initial physical planning.
316        ///
317        /// This is mostly use to plan `UNION` children in parallel.
318        ///
319        /// Defaults to the number of CPU cores on the system
320        pub planning_concurrency: usize, transform = ExecutionOptions::normalized_parallelism, default = get_available_parallelism()
321
322        /// When set to true, skips verifying that the schema produced by
323        /// planning the input of `LogicalPlan::Aggregate` exactly matches the
324        /// schema of the input plan.
325        ///
326        /// When set to false, if the schema does not match exactly
327        /// (including nullability and metadata), a planning error will be raised.
328        ///
329        /// This is used to workaround bugs in the planner that are now caught by
330        /// the new schema verification step.
331        pub skip_physical_aggregate_schema_check: bool, default = false
332
333        /// Specifies the reserved memory for each spillable sort operation to
334        /// facilitate an in-memory merge.
335        ///
336        /// When a sort operation spills to disk, the in-memory data must be
337        /// sorted and merged before being written to a file. This setting reserves
338        /// a specific amount of memory for that in-memory sort/merge process.
339        ///
340        /// Note: This setting is irrelevant if the sort operation cannot spill
341        /// (i.e., if there's no `DiskManager` configured).
342        pub sort_spill_reservation_bytes: usize, default = 10 * 1024 * 1024
343
344        /// When sorting, below what size should data be concatenated
345        /// and sorted in a single RecordBatch rather than sorted in
346        /// batches and merged.
347        pub sort_in_place_threshold_bytes: usize, default = 1024 * 1024
348
349        /// Number of files to read in parallel when inferring schema and statistics
350        pub meta_fetch_concurrency: usize, default = 32
351
352        /// Guarantees a minimum level of output files running in parallel.
353        /// RecordBatches will be distributed in round robin fashion to each
354        /// parallel writer. Each writer is closed and a new file opened once
355        /// soft_max_rows_per_output_file is reached.
356        pub minimum_parallel_output_files: usize, default = 4
357
358        /// Target number of rows in output files when writing multiple.
359        /// This is a soft max, so it can be exceeded slightly. There also
360        /// will be one file smaller than the limit if the total
361        /// number of rows written is not roughly divisible by the soft max
362        pub soft_max_rows_per_output_file: usize, default = 50000000
363
364        /// This is the maximum number of RecordBatches buffered
365        /// for each output file being worked. Higher values can potentially
366        /// give faster write performance at the cost of higher peak
367        /// memory consumption
368        pub max_buffered_batches_per_output_file: usize, default = 2
369
370        /// Should sub directories be ignored when scanning directories for data
371        /// files. Defaults to true (ignores subdirectories), consistent with