datafusion_common/config.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Runtime configuration, via [`ConfigOptions`]
19
20use crate::error::_config_err;
21use crate::parsers::CompressionTypeVariant;
22use crate::utils::get_available_parallelism;
23use crate::{DataFusionError, Result};
24use std::any::Any;
25use std::collections::{BTreeMap, HashMap};
26use std::error::Error;
27use std::fmt::{self, Display};
28use std::str::FromStr;
29
30/// A macro that wraps a configuration struct and automatically derives
31/// [`Default`] and [`ConfigField`] for it, allowing it to be used
32/// in the [`ConfigOptions`] configuration tree.
33///
34/// `transform` is used to normalize values before parsing.
35///
36/// For example,
37///
38/// ```ignore
39/// config_namespace! {
40/// /// Amazing config
41/// pub struct MyConfig {
42/// /// Field 1 doc
43/// field1: String, transform = str::to_lowercase, default = "".to_string()
44///
45/// /// Field 2 doc
46/// field2: usize, default = 232
47///
48/// /// Field 3 doc
49/// field3: Option<usize>, default = None
50/// }
51///}
52/// ```
53///
54/// Will generate
55///
56/// ```ignore
57/// /// Amazing config
58/// #[derive(Debug, Clone)]
59/// #[non_exhaustive]
60/// pub struct MyConfig {
61/// /// Field 1 doc
62/// field1: String,
63/// /// Field 2 doc
64/// field2: usize,
65/// /// Field 3 doc
66/// field3: Option<usize>,
67/// }
68/// impl ConfigField for MyConfig {
69/// fn set(&mut self, key: &str, value: &str) -> Result<()> {
70/// let (key, rem) = key.split_once('.').unwrap_or((key, ""));
71/// match key {
72/// "field1" => {
73/// let value = str::to_lowercase(value);
74/// self.field1.set(rem, value.as_ref())
75/// },
76/// "field2" => self.field2.set(rem, value.as_ref()),
77/// "field3" => self.field3.set(rem, value.as_ref()),
78/// _ => _internal_err!(
79/// "Config value \"{}\" not found on MyConfig",
80/// key
81/// ),
82/// }
83/// }
84///
85/// fn visit<V: Visit>(&self, v: &mut V, key_prefix: &str, _description: &'static str) {
86/// let key = format!("{}.field1", key_prefix);
87/// let desc = "Field 1 doc";
88/// self.field1.visit(v, key.as_str(), desc);
89/// let key = format!("{}.field2", key_prefix);
90/// let desc = "Field 2 doc";
91/// self.field2.visit(v, key.as_str(), desc);
92/// let key = format!("{}.field3", key_prefix);
93/// let desc = "Field 3 doc";
94/// self.field3.visit(v, key.as_str(), desc);
95/// }
96/// }
97///
98/// impl Default for MyConfig {
99/// fn default() -> Self {
100/// Self {
101/// field1: "".to_string(),
102/// field2: 232,
103/// field3: None,
104/// }
105/// }
106/// }
107/// ```
108///
109/// NB: Misplaced commas may result in nonsensical errors
#[macro_export]
macro_rules! config_namespace {
    (
        $(#[doc = $struct_d:tt])* // Struct-level documentation attributes
        $(#[deprecated($($struct_depr:tt)*)])? // Optional struct-level deprecated attribute
        $(#[allow($($struct_de:tt)*)])?
        $vis:vis struct $struct_name:ident {
            $(
                $(#[doc = $d:tt])* // Field-level documentation attributes
                $(#[deprecated($($field_depr:tt)*)])? // Optional field-level deprecated attribute
                $(#[allow($($field_de:tt)*)])?
                $field_vis:vis $field_name:ident : $field_type:ty,
                $(warn = $warn:expr,)? // Optional: message logged when a deprecated field is set to a non-default value
                $(transform = $transform:expr,)? // Optional: normalization applied to the raw string before parsing
                default = $default:expr
            )*$(,)* // Fields are newline-separated; a trailing comma after the last field is tolerated
        }
    ) => {
        $(#[doc = $struct_d])* // Apply struct documentation
        $(#[deprecated($($struct_depr)*)])? // Apply struct deprecation
        $(#[allow($($struct_de)*)])?
        #[derive(Debug, Clone, PartialEq)]
        $vis struct $struct_name {
            $(
                $(#[doc = $d])* // Apply field documentation
                $(#[deprecated($($field_depr)*)])? // Apply field deprecation
                $(#[allow($($field_de)*)])?
                $field_vis $field_name: $field_type,
            )*
        }

        impl $crate::config::ConfigField for $struct_name {
            // Route a dotted key (e.g. "parquet.enabled") to the matching field:
            // the first segment selects the field, the remainder (possibly empty)
            // is delegated to that field's own `ConfigField::set`.
            fn set(&mut self, key: &str, value: &str) -> $crate::error::Result<()> {
                let (key, rem) = key.split_once('.').unwrap_or((key, ""));
                match key {
                    $(
                       stringify!($field_name) => {
                           // Safely apply deprecated attribute if present
                           // $(#[allow(deprecated)])?
                           {
                               $(let value = $transform(value);)? // Apply transformation if specified
                               #[allow(deprecated)]
                               let ret = self.$field_name.set(rem, value.as_ref());

                               $(if !$warn.is_empty() {
                                   let default: $field_type = $default;
                                   #[allow(deprecated)]
                                   if default != self.$field_name {
                                       log::warn!($warn);
                                   }
                               })? // Log warning if specified, and the value is not the default
                               ret
                           }
                       },
                    )*
                    _ => return $crate::error::_config_err!(
                        "Config value \"{}\" not found on {}", key, stringify!($struct_name)
                    )
                }
            }

            // Walk every field, reporting its fully-qualified key and a description
            // assembled by concatenating the field's `///` doc lines. Note this means
            // the doc comments on invocation sites are user-visible at runtime.
            fn visit<V: $crate::config::Visit>(&self, v: &mut V, key_prefix: &str, _description: &'static str) {
                $(
                    let key = format!(concat!("{}.", stringify!($field_name)), key_prefix);
                    let desc = concat!($($d),*).trim();
                    #[allow(deprecated)]
                    self.$field_name.visit(v, key.as_str(), desc);
                )*
            }
        }
        // `Default` is derived from the per-field `default = ...` expressions
        // rather than `#[derive(Default)]`, so non-trivial defaults are supported.
        impl Default for $struct_name {
            fn default() -> Self {
                #[allow(deprecated)]
                Self {
                    $($field_name: $default),*
                }
            }
        }
    }
}
190
// NOTE: the `///` doc comments below are not documentation only — the
// `config_namespace!` macro concatenates them into the runtime description
// string reported by `ConfigField::visit` (e.g. for `information_schema`
// style listings), so their exact wording is user-visible.
config_namespace! {
    /// Options related to catalog and directory scanning
    ///
    /// See also: [`SessionConfig`]
    ///
    /// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
    pub struct CatalogOptions {
        /// Whether the default catalog and schema should be created automatically.
        pub create_default_catalog_and_schema: bool, default = true

        /// The default catalog name - this impacts what SQL queries use if not specified
        pub default_catalog: String, default = "datafusion".to_string()

        /// The default schema name - this impacts what SQL queries use if not specified
        pub default_schema: String, default = "public".to_string()

        /// Should DataFusion provide access to `information_schema`
        /// virtual tables for displaying schema information
        pub information_schema: bool, default = false

        /// Location scanned to load tables for `default` schema
        pub location: Option<String>, default = None

        /// Type of `TableProvider` to use when loading `default` schema
        pub format: Option<String>, default = None

        /// Default value for `format.has_header` for `CREATE EXTERNAL TABLE`
        /// if not specified explicitly in the statement.
        pub has_header: bool, default = true

        /// Specifies whether newlines in (quoted) CSV values are supported.
        ///
        /// This is the default value for `format.newlines_in_values` for `CREATE EXTERNAL TABLE`
        /// if not specified explicitly in the statement.
        ///
        /// Parsing newlines in quoted values may be affected by execution behaviour such as
        /// parallel file scanning. Setting this to `true` ensures that newlines in values are
        /// parsed successfully, which may reduce performance.
        pub newlines_in_values: bool, default = false
    }
}
232
233config_namespace! {
234 /// Options related to SQL parser
235 ///
236 /// See also: [`SessionConfig`]
237 ///
238 /// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
239 pub struct SqlParserOptions {
240 /// When set to true, SQL parser will parse float as decimal type
241 pub parse_float_as_decimal: bool, default = false
242
243 /// When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted)
244 pub enable_ident_normalization: bool, default = true
245
246 /// When set to true, SQL parser will normalize options value (convert value to lowercase).
247 /// Note that this option is ignored and will be removed in the future. All case-insensitive values
248 /// are normalized automatically.
249 pub enable_options_value_normalization: bool, warn = "`enable_options_value_normalization` is deprecated and ignored", default = false
250
251 /// Configure the SQL dialect used by DataFusion's parser; supported values include: Generic,
252 /// MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB and Databricks.
253 pub dialect: String, default = "generic".to_string()
254 // no need to lowercase because `sqlparser::dialect_from_str`] is case-insensitive
255
256 /// If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but
257 /// ignore the length. If false, error if a `VARCHAR` with a length is
258 /// specified. The Arrow type system does not have a notion of maximum
259 /// string length and thus DataFusion can not enforce such limits.
260 pub support_varchar_with_length: bool, default = true
261
262 /// If true, `VARCHAR` is mapped to `Utf8View` during SQL planning.
263 /// If false, `VARCHAR` is mapped to `Utf8` during SQL planning.
264 /// Default is false.
265 pub map_varchar_to_utf8view: bool, default = true
266
267 /// When set to true, the source locations relative to the original SQL
268 /// query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected
269 /// and recorded in the logical plan nodes.
270 pub collect_spans: bool, default = false
271
272 /// Specifies the recursion depth limit when parsing complex SQL Queries
273 pub recursion_limit: usize, default = 50
274 }
275}
276
277config_namespace! {
278 /// Options related to query execution
279 ///
280 /// See also: [`SessionConfig`]
281 ///
282 /// [`SessionConfig`]: https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
283 pub struct ExecutionOptions {
284 /// Default batch size while creating new batches, it's especially useful for
285 /// buffer-in-memory batches since creating tiny batches would result in too much
286 /// metadata memory consumption
287 pub batch_size: usize, default = 8192
288
289 /// When set to true, record batches will be examined between each operator and
290 /// small batches will be coalesced into larger batches. This is helpful when there
291 /// are highly selective filters or joins that could produce tiny output batches. The
292 /// target batch size is determined by the configuration setting
293 pub coalesce_batches: bool, default = true
294
295 /// Should DataFusion collect statistics when first creating a table.
296 /// Has no effect after the table is created. Applies to the default
297 /// `ListingTableProvider` in DataFusion. Defaults to true.
298 pub collect_statistics: bool, default = true
299
300 /// Number of partitions for query execution. Increasing partitions can increase
301 /// concurrency.
302 ///
303 /// Defaults to the number of CPU cores on the system
304 pub target_partitions: usize, transform = ExecutionOptions::normalized_parallelism, default = get_available_parallelism()
305
306 /// The default time zone
307 ///
308 /// Some functions, e.g. `EXTRACT(HOUR from SOME_TIME)`, shift the underlying datetime
309 /// according to this time zone, and then extract the hour
310 pub time_zone: Option<String>, default = Some("+00:00".into())
311
312 /// Parquet options
313 pub parquet: ParquetOptions, default = Default::default()
314
315 /// Fan-out during initial physical planning.
316 ///
317 /// This is mostly use to plan `UNION` children in parallel.
318 ///
319 /// Defaults to the number of CPU cores on the system
320 pub planning_concurrency: usize, transform = ExecutionOptions::normalized_parallelism, default = get_available_parallelism()
321
322 /// When set to true, skips verifying that the schema produced by
323 /// planning the input of `LogicalPlan::Aggregate` exactly matches the
324 /// schema of the input plan.
325 ///
326 /// When set to false, if the schema does not match exactly
327 /// (including nullability and metadata), a planning error will be raised.
328 ///
329 /// This is used to workaround bugs in the planner that are now caught by
330 /// the new schema verification step.
331 pub skip_physical_aggregate_schema_check: bool, default = false
332
333 /// Specifies the reserved memory for each spillable sort operation to
334 /// facilitate an in-memory merge.
335 ///
336 /// When a sort operation spills to disk, the in-memory data must be
337 /// sorted and merged before being written to a file. This setting reserves
338 /// a specific amount of memory for that in-memory sort/merge process.
339 ///
340 /// Note: This setting is irrelevant if the sort operation cannot spill
341 /// (i.e., if there's no `DiskManager` configured).
342 pub sort_spill_reservation_bytes: usize, default = 10 * 1024 * 1024
343
344 /// When sorting, below what size should data be concatenated
345 /// and sorted in a single RecordBatch rather than sorted in
346 /// batches and merged.
347 pub sort_in_place_threshold_bytes: usize, default = 1024 * 1024
348
349 /// Number of files to read in parallel when inferring schema and statistics
350 pub meta_fetch_concurrency: usize, default = 32
351
352 /// Guarantees a minimum level of output files running in parallel.
353 /// RecordBatches will be distributed in round robin fashion to each
354 /// parallel writer. Each writer is closed and a new file opened once
355 /// soft_max_rows_per_output_file is reached.
356 pub minimum_parallel_output_files: usize, default = 4
357
358 /// Target number of rows in output files when writing multiple.
359 /// This is a soft max, so it can be exceeded slightly. There also
360 /// will be one file smaller than the limit if the total
361 /// number of rows written is not roughly divisible by the soft max
362 pub soft_max_rows_per_output_file: usize, default = 50000000
363
364 /// This is the maximum number of RecordBatches buffered
365 /// for each output file being worked. Higher values can potentially
366 /// give faster write performance at the cost of higher peak
367 /// memory consumption
368 pub max_buffered_batches_per_output_file: usize, default = 2
369
370 /// Should sub directories be ignored when scanning directories for data
371 /// files. Defaults to true (ignores subdirectories), consistent with