ESQL: CATEGORIZE as a BlockHash #114317
Changes from all commits
New file (changelog entry):
@@ -0,0 +1,5 @@
pr: 114317
summary: "ESQL: CATEGORIZE as a `BlockHash`"
area: ES|QL
type: enhancement
issues: []
New file: AbstractCategorizeBlockHash.java
@@ -0,0 +1,105 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

package org.elasticsearch.compute.aggregation.blockhash;

import org.apache.lucene.util.BytesRefBuilder;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.util.BitArray;
import org.elasticsearch.common.util.BytesRefHash;
import org.elasticsearch.compute.data.Block;
import org.elasticsearch.compute.data.BlockFactory;
import org.elasticsearch.compute.data.BytesRefVector;
import org.elasticsearch.compute.data.IntBlock;
import org.elasticsearch.compute.data.IntVector;
import org.elasticsearch.compute.data.Page;
import org.elasticsearch.core.ReleasableIterator;
import org.elasticsearch.xpack.ml.aggs.categorization.CategorizationBytesRefHash;
import org.elasticsearch.xpack.ml.aggs.categorization.CategorizationPartOfSpeechDictionary;
import org.elasticsearch.xpack.ml.aggs.categorization.SerializableTokenListCategory;
import org.elasticsearch.xpack.ml.aggs.categorization.TokenListCategorizer;

import java.io.IOException;

/**
 * Base BlockHash implementation for {@code Categorize} grouping function.
 */
public abstract class AbstractCategorizeBlockHash extends BlockHash {

    // TODO: this should probably also take an emitBatchSize

Review thread on the TODO:

> @nik9000 Some info on this?

> TLDR: It's probably not important for the single-element […]. So […]. This
> is much much less important for single element […]. When I first built this
> I thought I might apply this to single valued […].
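For context on the TODO: in other BlockHash implementations, emitBatchSize
caps how many group ids are handed to the aggregator at once, which matters
most when multivalued or multi-column keys can make a single Page explode
into many group combinations; the truncated reply above appears to say it
matters much less for a single-element hash like this one. A minimal sketch
of what honoring it could look like, with groupIdFor as a hypothetical helper
(illustrative, not this PR's code):

void addChunked(Page page, GroupingAggregatorFunction.AddInput addInput, int emitBatchSize) {
    int positionCount = page.getPositionCount();
    for (int start = 0; start < positionCount; start += emitBatchSize) {
        int end = Math.min(start + emitBatchSize, positionCount);
        try (IntBlock.Builder chunk = blockFactory.newIntBlockBuilder(end - start)) {
            for (int p = start; p < end; p++) {
                chunk.appendInt(groupIdFor(page, p)); // hypothetical: category id of row p
            }
            try (IntBlock groupIds = chunk.build()) {
                // The position offset tells the aggregator which rows this chunk
                // covers, so it holds at most emitBatchSize ids at a time.
                addInput.add(start, groupIds);
            }
        }
    }
}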

    private final int channel;
    private final boolean outputPartial;

    protected final TokenListCategorizer.CloseableTokenListCategorizer categorizer;

    AbstractCategorizeBlockHash(BlockFactory blockFactory, int channel, boolean outputPartial) {
        super(blockFactory);
        this.channel = channel;
        this.outputPartial = outputPartial;
        this.categorizer = new TokenListCategorizer.CloseableTokenListCategorizer(
            new CategorizationBytesRefHash(new BytesRefHash(2048, blockFactory.bigArrays())),
            CategorizationPartOfSpeechDictionary.getInstance(),
            0.70f
        );
    }

    protected int channel() {
        return channel;
    }

    @Override
    public Block[] getKeys() {
        return new Block[] { outputPartial ? buildIntermediateBlock() : buildFinalBlock() };
    }

    @Override
    public IntVector nonEmpty() {
        return IntVector.range(0, categorizer.getCategoryCount(), blockFactory);
    }

    @Override
    public BitArray seenGroupIds(BigArrays bigArrays) {
        throw new UnsupportedOperationException();
    }

    @Override
    public final ReleasableIterator<IntBlock> lookup(Page page, ByteSizeValue targetBlockSize) {
        throw new UnsupportedOperationException();
    }

Review thread on lines +66 to +71 (the seenGroupIds and lookup methods):

> nit: let's give these some useful error messages if we ever end up here -
> and maybe an explanation as a comment why it's fine that we do not support
> these.

> @nik9000 You may have more context about this. Is this something we didn't
> "want" to implement yet, or something that can't be done with this
> BlockHash?

> This is [what] powered hash joins. Originally when I built the
> infrastructure for hash joins I thought any […]. Except maybe the […].

    /**
     * Serializes the intermediate state into a single BytesRef block, or an empty Null block if there are no categories.
     */
    private Block buildIntermediateBlock() {
        if (categorizer.getCategoryCount() == 0) {
            return blockFactory.newConstantNullBlock(0);
        }
        try (BytesStreamOutput out = new BytesStreamOutput()) {
            // TODO be more careful here.

Review thread on the TODO:

> I totally hacked this thing together. I haven't a clue if it's properly
> correct. I mean, it works, but it's worth another set of eyes on it.

> I wasn't sure what you meant with "be more careful here", but it LGTM.

> Are we worried about memory usage with this? These are untracked bytes.
> We're tracking the memory usage on the agg itself, but not the
> serialization. But if it's super small we don't have to worry.

            out.writeVInt(categorizer.getCategoryCount());
            for (SerializableTokenListCategory category : categorizer.toCategoriesById()) {
                category.writeTo(out);
            }
            // We're returning a block with N positions just because the Page must have all blocks with the same position count!
            return blockFactory.newConstantBytesRefBlockWith(out.bytes().toBytesRef(), categorizer.getCategoryCount());

Review thread on lines +83 to +88 (the intermediate serialization):

> Do we really need to write the vInt and the Page positions hack? Can't we
> just write a position per category? To be more like ESQL.

> Not sure what you exactly mean. The number of categories is not equal to
> the number of input texts, meaning you still have a mismatch in number of
> positions.

> We're building here the intermediate state to pass to the […]. The current
> one: […]. Instead, do: […].

> My memory was that the state was one blob of bytes and not a blob per
> category. There's, like, shared state. But it's been a month since I
> thought a lot about this. And I'm wrong about lots of stuff.

        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
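The snippets behind "The current one:" and "Instead, do:" in that thread are
not shown above, but from the surrounding prose the two serialization shapes
being compared are roughly the following. A hedged sketch under that reading,
not code from the PR (the method names are made up):

// Current shape: one constant BytesRef repeated across N positions, holding
// a vInt category count followed by every serialized category.
Block currentShape(BytesStreamOutput out) {
    return blockFactory.newConstantBytesRefBlockWith(out.bytes().toBytesRef(), categorizer.getCategoryCount());
}

// Proposed shape: one position per category, each holding only that
// category's bytes; the position count itself replaces the vInt prefix.
Block onePositionPerCategory() throws IOException {
    try (BytesRefVector.Builder builder = blockFactory.newBytesRefVectorBuilder(categorizer.getCategoryCount())) {
        for (SerializableTokenListCategory category : categorizer.toCategoriesById()) {
            try (BytesStreamOutput out = new BytesStreamOutput()) {
                category.writeTo(out);
                builder.appendBytesRef(out.bytes().toBytesRef());
            }
        }
        return builder.build().asBlock();
    }
}

Either way the enclosing Page fixes the block's position count, which is what
the "positions hack" comment in the code refers to.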

    private Block buildFinalBlock() {
        try (BytesRefVector.Builder result = blockFactory.newBytesRefVectorBuilder(categorizer.getCategoryCount())) {
            BytesRefBuilder scratch = new BytesRefBuilder();
            for (SerializableTokenListCategory category : categorizer.toCategoriesById()) {
                scratch.copyChars(category.getRegex());
                result.appendBytesRef(scratch.get());
                scratch.clear();
            }
            return result.build().asBlock();
        }
    }
}
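One thing the diff leaves implicit: it only shows the writing side of the
intermediate state. Assuming SerializableTokenListCategory has the usual
Writeable-style StreamInput constructor mirroring writeTo, the consuming hash
would read the block back roughly like this sketch (readIntermediate and the
merge step are hypothetical; BytesArray and StreamInput imports assumed):

// Hedged sketch, not from this PR: the reverse of buildIntermediateBlock().
private void readIntermediate(BytesRef bytes) throws IOException {
    try (StreamInput in = new BytesArray(bytes).streamInput()) {
        int categoryCount = in.readVInt(); // mirrors out.writeVInt(...)
        for (int i = 0; i < categoryCount; i++) {
            // Assumption: a StreamInput constructor matching writeTo.
            SerializableTokenListCategory category = new SerializableTokenListCategory(in);
            // ...merge `category` into this node's categorizer (API assumed)...
        }
    }
}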
New file: CategorizeRawBlockHash.java
@@ -0,0 +1,137 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

package org.elasticsearch.compute.aggregation.blockhash;

import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.compute.aggregation.GroupingAggregatorFunction;
import org.elasticsearch.compute.data.Block;
import org.elasticsearch.compute.data.BlockFactory;
import org.elasticsearch.compute.data.BytesRefBlock;
import org.elasticsearch.compute.data.BytesRefVector;
import org.elasticsearch.compute.data.IntBlock;
import org.elasticsearch.compute.data.IntVector;
import org.elasticsearch.compute.data.Page;
import org.elasticsearch.core.Releasable;
import org.elasticsearch.core.Releasables;
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.CustomAnalyzer;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.xpack.ml.aggs.categorization.TokenListCategorizer;
import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzer;

/**
 * BlockHash implementation for {@code Categorize} grouping function.
 * <p>
 * This implementation expects rows, and can't deserialize intermediate states coming from other nodes.
 * </p>
 */
public class CategorizeRawBlockHash extends AbstractCategorizeBlockHash {
    private final CategorizeEvaluator evaluator;

    CategorizeRawBlockHash(int channel, BlockFactory blockFactory, boolean outputPartial) {
        super(blockFactory, channel, outputPartial);
        CategorizationAnalyzer analyzer = new CategorizationAnalyzer(
            // TODO: should be the same analyzer as used in Production
            new CustomAnalyzer(
                TokenizerFactory.newFactory("whitespace", WhitespaceTokenizer::new),
                new CharFilterFactory[0],
                new TokenFilterFactory[0]
            ),
            true
        );
        this.evaluator = new CategorizeEvaluator(analyzer, categorizer, blockFactory);
    }

    @Override
    public void add(Page page, GroupingAggregatorFunction.AddInput addInput) {
        try (IntBlock result = (IntBlock) evaluator.eval(page.getBlock(channel()))) {
            addInput.add(0, result);
        }
    }

    @Override
    public void close() {
        evaluator.close();
    }

    /**
     * Similar implementation to an Evaluator.
     */
    public static final class CategorizeEvaluator implements Releasable {
        private final CategorizationAnalyzer analyzer;

        private final TokenListCategorizer.CloseableTokenListCategorizer categorizer;

        private final BlockFactory blockFactory;

        public CategorizeEvaluator(
            CategorizationAnalyzer analyzer,
            TokenListCategorizer.CloseableTokenListCategorizer categorizer,
            BlockFactory blockFactory
        ) {
            this.analyzer = analyzer;
            this.categorizer = categorizer;
            this.blockFactory = blockFactory;
        }

        public Block eval(BytesRefBlock vBlock) {
            BytesRefVector vVector = vBlock.asVector();
            if (vVector == null) {
                return eval(vBlock.getPositionCount(), vBlock);
            }
            IntVector vector = eval(vBlock.getPositionCount(), vVector);
            return vector.asBlock();
        }

        public IntBlock eval(int positionCount, BytesRefBlock vBlock) {
            try (IntBlock.Builder result = blockFactory.newIntBlockBuilder(positionCount)) {
                BytesRef vScratch = new BytesRef();
                for (int p = 0; p < positionCount; p++) {
                    if (vBlock.isNull(p)) {
                        result.appendNull();
                        continue;
                    }
                    int first = vBlock.getFirstValueIndex(p);
                    int count = vBlock.getValueCount(p);
                    if (count == 1) {
                        result.appendInt(process(vBlock.getBytesRef(first, vScratch)));
                        continue;
                    }
                    int end = first + count;
                    result.beginPositionEntry();
                    for (int i = first; i < end; i++) {
                        result.appendInt(process(vBlock.getBytesRef(i, vScratch)));
                    }
                    result.endPositionEntry();
                }
                return result.build();
            }
        }

        public IntVector eval(int positionCount, BytesRefVector vVector) {
            try (IntVector.FixedBuilder result = blockFactory.newIntVectorFixedBuilder(positionCount)) {
                BytesRef vScratch = new BytesRef();
                for (int p = 0; p < positionCount; p++) {
                    result.appendInt(p, process(vVector.getBytesRef(p, vScratch)));
                }
                return result.build();
            }
        }

        private int process(BytesRef v) {
            return categorizer.computeCategory(v.utf8ToString(), analyzer).getId();
        }

        @Override
        public void close() {
            Releasables.closeExpectNoException(analyzer, categorizer);
        }
    }
}
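To make the flow concrete, here is a hedged usage sketch of the evaluator path
(the sample messages, the expected ids, and the newBytesRefBlockBuilder call
are illustrative, not from the PR; `evaluator` is a CategorizeEvaluator built
as in the constructor above):

try (BytesRefBlock.Builder messages = blockFactory.newBytesRefBlockBuilder(3)) {
    messages.appendBytesRef(new BytesRef("Connected to 10.1.0.1"));
    messages.appendBytesRef(new BytesRef("Connected to 10.1.0.2"));
    messages.appendBytesRef(new BytesRef("Disconnected"));
    try (BytesRefBlock block = messages.build(); Block ids = evaluator.eval(block)) {
        // ids might be [0, 0, 1]: the two "Connected to ..." rows share a
        // category while "Disconnected" gets its own; getKeys() later maps
        // each id back to the category's regex-like key.
    }
}

Because the categorizer is stateful across pages, the same message pattern
arriving in a later page maps to the same category id, which is what makes
this usable as a grouping BlockHash.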