Skip to content

Commit e47f2d4

Browse files
kevincmchen authored and
facebook-github-bot committed
support to read binary as string in parquet (facebookincubator#10399)
Summary: Some other Parquet-producing systems, in particular Impala, write string-type data as binary into Parquet files, and Velox's ParquetReader does not support binaryAsString. This PR resolves that issue. Issue resolved: [facebookincubator#10398] Pull Request resolved: facebookincubator#10399 Reviewed By: pedroerp Differential Revision: D60452520 Pulled By: Yuhta fbshipit-source-id: b582117d0d64074bd56fdc52995d80d5eb489222
1 parent d24035d commit e47f2d4

File tree

2 files changed

+56
-11
lines changed

2 files changed

+56
-11
lines changed

velox/dwio/parquet/reader/ParquetReader.cpp

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -99,9 +99,12 @@ class ReaderBase {
9999
uint32_t maxDefine,
100100
uint32_t parentSchemaIdx,
101101
uint32_t& schemaIdx,
102-
uint32_t& columnIdx) const;
102+
uint32_t& columnIdx,
103+
const TypePtr& requestedType) const;
103104

104-
TypePtr convertType(const thrift::SchemaElement& schemaElement) const;
105+
TypePtr convertType(
106+
const thrift::SchemaElement& schemaElement,
107+
const TypePtr& requestedType) const;
105108

106109
template <typename T>
107110
static std::shared_ptr<const RowType> createRowType(
@@ -119,8 +122,6 @@ class ReaderBase {
119122
RowTypePtr schema_;
120123
std::shared_ptr<const dwio::common::TypeWithId> schemaWithId_;
121124

122-
const bool binaryAsString = false;
123-
124125
// Map from row group index to pre-created loading BufferedInput.
125126
std::unordered_map<uint32_t, std::shared_ptr<dwio::common::BufferedInput>>
126127
inputs_;
@@ -220,7 +221,13 @@ void ReaderBase::initializeSchema() {
220221
// is the root itself. This is ok because it's never required to check the
221222
// parent of the root in getParquetColumnInfo().
222223
schemaWithId_ = getParquetColumnInfo(
223-
maxSchemaElementIdx, maxRepeat, maxDefine, 0, schemaIdx, columnIdx);
224+
maxSchemaElementIdx,
225+
maxRepeat,
226+
maxDefine,
227+
0,
228+
schemaIdx,
229+
columnIdx,
230+
options_.fileSchema());
224231
schema_ = createRowType(
225232
schemaWithId_->getChildren(), isFileColumnNamesReadAsLowerCase());
226233
}
@@ -231,7 +238,8 @@ std::unique_ptr<ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
231238
uint32_t maxDefine,
232239
uint32_t parentSchemaIdx,
233240
uint32_t& schemaIdx,
234-
uint32_t& columnIdx) const {
241+
uint32_t& columnIdx,
242+
const TypePtr& requestedType) const {
235243
VELOX_CHECK(fileMetaData_ != nullptr);
236244
VELOX_CHECK_LT(schemaIdx, fileMetaData_->schema.size());
237245

@@ -265,18 +273,24 @@ std::unique_ptr<ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
265273
VELOX_CHECK(
266274
schemaElement.__isset.num_children && schemaElement.num_children > 0,
267275
"Node has no children but should");
276+
VELOX_CHECK(!requestedType || requestedType->isRow());
268277

269278
std::vector<std::unique_ptr<ParquetTypeWithId::TypeWithId>> children;
270279

271280
auto curSchemaIdx = schemaIdx;
272281
for (int32_t i = 0; i < schemaElement.num_children; i++) {
282+
++schemaIdx;
283+
auto& childName = schema[schemaIdx].name;
284+
auto childRequestedType =
285+
requestedType ? requestedType->asRow().findChild(childName) : nullptr;
273286
auto child = getParquetColumnInfo(
274287
maxSchemaElementIdx,
275288
maxRepeat,
276289
maxDefine,
277290
curSchemaIdx,
278-
++schemaIdx,
279-
columnIdx);
291+
schemaIdx,
292+
columnIdx,
293+
childRequestedType);
280294
children.push_back(std::move(child));
281295
}
282296
VELOX_CHECK(!children.empty());
@@ -467,7 +481,7 @@ std::unique_ptr<ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
467481
}
468482
}
469483
} else { // leaf node
470-
const auto veloxType = convertType(schemaElement);
484+
const auto veloxType = convertType(schemaElement, requestedType);
471485
int32_t precision =
472486
schemaElement.__isset.precision ? schemaElement.precision : 0;
473487
int32_t scale = schemaElement.__isset.scale ? schemaElement.scale : 0;
@@ -523,7 +537,8 @@ std::unique_ptr<ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
523537
}
524538

525539
TypePtr ReaderBase::convertType(
526-
const thrift::SchemaElement& schemaElement) const {
540+
const thrift::SchemaElement& schemaElement,
541+
const TypePtr& requestedType) const {
527542
VELOX_CHECK(schemaElement.__isset.type && schemaElement.num_children == 0);
528543
VELOX_CHECK(
529544
schemaElement.type != thrift::Type::FIXED_LEN_BYTE_ARRAY ||
@@ -655,7 +670,7 @@ TypePtr ReaderBase::convertType(
655670
return DOUBLE();
656671
case thrift::Type::type::BYTE_ARRAY:
657672
case thrift::Type::type::FIXED_LEN_BYTE_ARRAY:
658-
if (binaryAsString) {
673+
if (requestedType && requestedType->isVarchar()) {
659674
return VARCHAR();
660675
} else {
661676
return VARBINARY();

velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1150,6 +1150,36 @@ TEST_F(ParquetReaderTest, readVarbinaryFromFLBA) {
11501150
0));
11511151
}
11521152

1153+
TEST_F(ParquetReaderTest, readBinaryAsStringFromNation) {
1154+
const std::string filename("nation.parquet");
1155+
const std::string sample(getExampleFilePath(filename));
1156+
1157+
dwio::common::ReaderOptions readerOptions{leafPool_.get()};
1158+
auto outputRowType =
1159+
ROW({"nationkey", "name", "regionkey", "comment"},
1160+
{BIGINT(), VARCHAR(), BIGINT(), VARCHAR()});
1161+
1162+
readerOptions.setFileSchema(outputRowType);
1163+
auto reader = createReader(sample, readerOptions);
1164+
EXPECT_EQ(reader->numberOfRows(), 25ULL);
1165+
auto rowType = reader->typeWithId();
1166+
EXPECT_EQ(rowType->type()->kind(), TypeKind::ROW);
1167+
EXPECT_EQ(rowType->size(), 4ULL);
1168+
EXPECT_EQ(rowType->childAt(1)->type()->kind(), TypeKind::VARCHAR);
1169+
1170+
auto rowReaderOpts = getReaderOpts(outputRowType);
1171+
rowReaderOpts.setScanSpec(makeScanSpec(outputRowType));
1172+
auto rowReader = reader->createRowReader(rowReaderOpts);
1173+
1174+
auto expected = std::string("ALGERIA");
1175+
VectorPtr result = BaseVector::create(outputRowType, 0, &(*leafPool_));
1176+
rowReader->next(1, result);
1177+
EXPECT_EQ(
1178+
expected,
1179+
result->as<RowVector>()->childAt(1)->asFlatVector<StringView>()->valueAt(
1180+
0));
1181+
}
1182+
11531183
TEST_F(ParquetReaderTest, testV2PageWithZeroMaxDefRep) {
11541184
// enum_type.parquet contains 1 column (ENUM) with 3 rows.
11551185
const std::string sample(getExampleFilePath("v2_page.parquet"));

0 commit comments

Comments
 (0)