@@ -99,9 +99,12 @@ class ReaderBase {
99
99
uint32_t maxDefine,
100
100
uint32_t parentSchemaIdx,
101
101
uint32_t & schemaIdx,
102
- uint32_t & columnIdx) const ;
102
+ uint32_t & columnIdx,
103
+ const TypePtr& requestedType) const ;
103
104
104
- TypePtr convertType (const thrift::SchemaElement& schemaElement) const ;
105
+ TypePtr convertType (
106
+ const thrift::SchemaElement& schemaElement,
107
+ const TypePtr& requestedType) const ;
105
108
106
109
template <typename T>
107
110
static std::shared_ptr<const RowType> createRowType (
@@ -119,8 +122,6 @@ class ReaderBase {
119
122
RowTypePtr schema_;
120
123
std::shared_ptr<const dwio::common::TypeWithId> schemaWithId_;
121
124
122
- const bool binaryAsString = false ;
123
-
124
125
// Map from row group index to pre-created loading BufferedInput.
125
126
std::unordered_map<uint32_t , std::shared_ptr<dwio::common::BufferedInput>>
126
127
inputs_;
@@ -220,7 +221,13 @@ void ReaderBase::initializeSchema() {
220
221
// is the root itself. This is ok because it's never required to check the
221
222
// parent of the root in getParquetColumnInfo().
222
223
schemaWithId_ = getParquetColumnInfo (
223
- maxSchemaElementIdx, maxRepeat, maxDefine, 0 , schemaIdx, columnIdx);
224
+ maxSchemaElementIdx,
225
+ maxRepeat,
226
+ maxDefine,
227
+ 0 ,
228
+ schemaIdx,
229
+ columnIdx,
230
+ options_.fileSchema ());
224
231
schema_ = createRowType (
225
232
schemaWithId_->getChildren (), isFileColumnNamesReadAsLowerCase ());
226
233
}
@@ -231,7 +238,8 @@ std::unique_ptr<ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
231
238
uint32_t maxDefine,
232
239
uint32_t parentSchemaIdx,
233
240
uint32_t & schemaIdx,
234
- uint32_t & columnIdx) const {
241
+ uint32_t & columnIdx,
242
+ const TypePtr& requestedType) const {
235
243
VELOX_CHECK (fileMetaData_ != nullptr );
236
244
VELOX_CHECK_LT (schemaIdx, fileMetaData_->schema .size ());
237
245
@@ -265,18 +273,24 @@ std::unique_ptr<ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
265
273
VELOX_CHECK (
266
274
schemaElement.__isset .num_children && schemaElement.num_children > 0 ,
267
275
" Node has no children but should" );
276
+ VELOX_CHECK (!requestedType || requestedType->isRow ());
268
277
269
278
std::vector<std::unique_ptr<ParquetTypeWithId::TypeWithId>> children;
270
279
271
280
auto curSchemaIdx = schemaIdx;
272
281
for (int32_t i = 0 ; i < schemaElement.num_children ; i++) {
282
+ ++schemaIdx;
283
+ auto & childName = schema[schemaIdx].name ;
284
+ auto childRequestedType =
285
+ requestedType ? requestedType->asRow ().findChild (childName) : nullptr ;
273
286
auto child = getParquetColumnInfo (
274
287
maxSchemaElementIdx,
275
288
maxRepeat,
276
289
maxDefine,
277
290
curSchemaIdx,
278
- ++schemaIdx,
279
- columnIdx);
291
+ schemaIdx,
292
+ columnIdx,
293
+ childRequestedType);
280
294
children.push_back (std::move (child));
281
295
}
282
296
VELOX_CHECK (!children.empty ());
@@ -467,7 +481,7 @@ std::unique_ptr<ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
467
481
}
468
482
}
469
483
} else { // leaf node
470
- const auto veloxType = convertType (schemaElement);
484
+ const auto veloxType = convertType (schemaElement, requestedType );
471
485
int32_t precision =
472
486
schemaElement.__isset .precision ? schemaElement.precision : 0 ;
473
487
int32_t scale = schemaElement.__isset .scale ? schemaElement.scale : 0 ;
@@ -523,7 +537,8 @@ std::unique_ptr<ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
523
537
}
524
538
525
539
TypePtr ReaderBase::convertType (
526
- const thrift::SchemaElement& schemaElement) const {
540
+ const thrift::SchemaElement& schemaElement,
541
+ const TypePtr& requestedType) const {
527
542
VELOX_CHECK (schemaElement.__isset .type && schemaElement.num_children == 0 );
528
543
VELOX_CHECK (
529
544
schemaElement.type != thrift::Type::FIXED_LEN_BYTE_ARRAY ||
@@ -655,7 +670,7 @@ TypePtr ReaderBase::convertType(
655
670
return DOUBLE ();
656
671
case thrift::Type::type::BYTE_ARRAY:
657
672
case thrift::Type::type::FIXED_LEN_BYTE_ARRAY:
658
- if (binaryAsString ) {
673
+ if (requestedType && requestedType-> isVarchar () ) {
659
674
return VARCHAR ();
660
675
} else {
661
676
return VARBINARY ();
0 commit comments