Skip to content

Commit 4a76cf2

Browse files
Yuhtafacebook-github-bot
authored andcommitted
Optimize remaining filter to not eagerly materialize multi-referenced fields only directly referenced in AND clauses (facebookincubator#10645)
Summary: Pull Request resolved: facebookincubator#10645 When a field is referenced in both remaining filter and output vector, we eagerly materialize it to avoid missing rows loaded. The condition here can be relaxed: if the field is only referenced in AND clauses without any other conditional expressions, we should consider it safe to be lazy loaded because we will not need more rows than any of the AND clause needs. This helps us improving some queries from 181 hours to 7.53 hours, comparing to Java using 38.8 hours. Reviewed By: kevinwilfong Differential Revision: D60561537 fbshipit-source-id: 92e509f75f71c2baaf732d7c44cc64902eb350b6
1 parent 47bb048 commit 4a76cf2

File tree

4 files changed

+85
-12
lines changed

4 files changed

+85
-12
lines changed

velox/connectors/hive/HiveDataSource.cpp

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,30 @@ namespace facebook::velox::connector::hive {
3232
class HiveTableHandle;
3333
class HiveColumnHandle;
3434

35+
namespace {
36+
37+
bool isMember(
38+
const std::vector<exec::FieldReference*>& fields,
39+
const exec::FieldReference& field) {
40+
return std::find(fields.begin(), fields.end(), &field) != fields.end();
41+
}
42+
43+
bool shouldEagerlyMaterialize(
44+
const exec::Expr& remainingFilter,
45+
const exec::FieldReference& field) {
46+
if (!remainingFilter.evaluatesArgumentsOnNonIncreasingSelection()) {
47+
return true;
48+
}
49+
for (auto& input : remainingFilter.inputs()) {
50+
if (isMember(input->distinctFields(), field) && input->hasConditionals()) {
51+
return true;
52+
}
53+
}
54+
return false;
55+
}
56+
57+
} // namespace
58+
3559
HiveDataSource::HiveDataSource(
3660
const RowTypePtr& outputType,
3761
const std::shared_ptr<connector::ConnectorTableHandle>& tableHandle,
@@ -126,7 +150,9 @@ HiveDataSource::HiveDataSource(
126150
for (auto& input : remainingFilterExpr->distinctFields()) {
127151
auto it = columnNames.find(input->field());
128152
if (it != columnNames.end()) {
129-
multiReferencedFields_.push_back(it->second);
153+
if (shouldEagerlyMaterialize(*remainingFilterExpr, *input)) {
154+
multiReferencedFields_.push_back(it->second);
155+
}
130156
continue;
131157
}
132158
// Remaining filter may reference columns that are not used otherwise,

velox/exec/tests/TableScanTest.cpp

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3277,6 +3277,49 @@ TEST_F(TableScanTest, remainingFilterLazyWithMultiReferences) {
32773277
ASSERT_TRUE(waitForTaskCompletion(cursor->task().get()));
32783278
}
32793279

3280+
// When the multi-referenced fields are in AND clauses without any other
3281+
// conditionals, they should not be eagerly materialized.
3282+
TEST_F(
3283+
TableScanTest,
3284+
remainingFilterLazyWithMultiReferencesDirectlyInAndClause) {
3285+
constexpr int kSize = 10;
3286+
auto vector = makeRowVector({
3287+
makeFlatVector<int64_t>(kSize, folly::identity),
3288+
makeFlatVector<int64_t>(kSize, folly::identity),
3289+
});
3290+
auto schema = asRowType(vector->type());
3291+
auto file = TempFilePath::create();
3292+
writeToFile(file->getPath(), {vector});
3293+
CursorParameters params;
3294+
params.copyResult = false;
3295+
params.singleThreaded = true;
3296+
params.planNode = PlanBuilder()
3297+
.tableScan(schema, {}, "c0 % 7 == 0 AND c1 % 2 == 0")
3298+
.planNode();
3299+
auto cursor = TaskCursor::create(params);
3300+
cursor->task()->addSplit(
3301+
"0", exec::Split(makeHiveConnectorSplit(file->getPath())));
3302+
cursor->task()->noMoreSplits("0");
3303+
ASSERT_TRUE(cursor->moveNext());
3304+
auto* result = cursor->current()->asUnchecked<RowVector>();
3305+
ASSERT_EQ(result->size(), 1);
3306+
auto* c0 =
3307+
result->childAt(0)->loadedVector()->asUnchecked<SimpleVector<int64_t>>();
3308+
ASSERT_FALSE(c0->isNullAt(0));
3309+
ASSERT_EQ(c0->valueAt(0), 0);
3310+
auto* c1 = result->childAt(1)->loadedVector();
3311+
ASSERT_EQ(c1->encoding(), VectorEncoding::Simple::DICTIONARY);
3312+
auto* c1Dict = c1->asUnchecked<DictionaryVector<int64_t>>();
3313+
ASSERT_FALSE(c1Dict->isNullAt(0));
3314+
ASSERT_EQ(c1Dict->valueAt(0), 0);
3315+
ASSERT_EQ(
3316+
c1Dict->valueVector()->encoding(), VectorEncoding::Simple::DICTIONARY);
3317+
c1Dict = c1Dict->valueVector()->asUnchecked<DictionaryVector<int64_t>>();
3318+
ASSERT_EQ(c1Dict->valueVector()->size(), 2);
3319+
ASSERT_FALSE(cursor->moveNext());
3320+
ASSERT_TRUE(waitForTaskCompletion(cursor->task().get()));
3321+
}
3322+
32803323
TEST_F(TableScanTest, remainingFilterSkippedStrides) {
32813324
auto rowType = ROW({{"c0", BIGINT()}, {"c1", BIGINT()}});
32823325
std::vector<RowVectorPtr> vectors(3);

velox/expression/Expr.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -299,7 +299,7 @@ void Expr::computeMetadata() {
299299
}
300300

301301
// (5) Compute hasConditionals_.
302-
hasConditionals_ = hasConditionals(this);
302+
hasConditionals_ = exec::hasConditionals(this);
303303

304304
metaDataComputed_ = true;
305305
}

velox/expression/Expr.h

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,10 @@ class Expr {
279279
return false;
280280
}
281281

282+
bool hasConditionals() const {
283+
return hasConditionals_;
284+
}
285+
282286
bool isDeterministic() const {
283287
return deterministic_;
284288
}
@@ -297,6 +301,16 @@ class Expr {
297301
isMultiplyReferenced_ = true;
298302
}
299303

304+
/// True if this is a special form where the next argument will always be
305+
/// evaluated on a subset of the rows for which the previous one was
306+
/// evaluated. This is true of AND and no other at this time. This implies
307+
/// that lazies can be loaded on first use and not before starting evaluating
308+
/// the form. This is so because a subsequent use will never access rows that
309+
/// were not in scope for the previous one.
310+
virtual bool evaluatesArgumentsOnNonIncreasingSelection() const {
311+
return false;
312+
}
313+
300314
std::vector<common::Subfield> extractSubfields() const;
301315

302316
virtual void extractSubfieldsImpl(
@@ -556,16 +570,6 @@ class Expr {
556570
// referenced fields.
557571
virtual void computeDistinctFields();
558572

559-
// True if this is a spcial form where the next argument will always be
560-
// evaluated on a subset of the rows for which the previous one was evaluated.
561-
// This is true of AND and no other at this time. This implies that lazies
562-
// can be loaded on first use and not before starting evaluating the form.
563-
// This is so because a subsequent use will never access rows that were not in
564-
// scope for the previous one.
565-
virtual bool evaluatesArgumentsOnNonIncreasingSelection() const {
566-
return false;
567-
}
568-
569573
const TypePtr type_;
570574
const std::vector<std::shared_ptr<Expr>> inputs_;
571575
const std::string name_;

0 commit comments

Comments
 (0)