Fix bug when use ZoneMap/BloomFiter on column with REPLACE/REPLACE_IF_NOT_NULL (#3288)

Now, column with REPLACE/REPLACE_IF_NOT_NULL can be filtered by ZoneMap/BloomFilter
when the rowset is a base rowset (version starts with zero). We always considered this an optimization.
However, in some cases it causes a bug.

create table test(
  k1 int,
  v1 int replace,
  v2 int sum
);
If I have two records on two different versions

1 2 2 on version [0-10]
1 3 1 on version 11
If I perform a query

select * from test where k1 = 1 and v1 = 3;
The result will be 1 3 1, which is not right, because the first record is filtered out.
The right answer is 1 3 3, the v2 should be summed.
Removing this optimization is necessary to make the result correct.
This commit is contained in:
lichaoyong
2020-04-10 10:22:21 +08:00
committed by GitHub
parent f39c8b156d
commit 3086790e06
7 changed files with 6 additions and 35 deletions

View File

@ -192,7 +192,6 @@ OLAPStatus BetaRowsetWriter::_create_segment_writer() {
DCHECK(wblock != nullptr);
segment_v2::SegmentWriterOptions writer_options;
writer_options.whether_to_filter_value = _context.version.first == 0;
_segment_writer.reset(new segment_v2::SegmentWriter(
wblock.get(), _num_segment, _context.tablet_schema, writer_options));
_wblocks.push_back(std::move(wblock));

View File

@ -491,10 +491,7 @@ OLAPStatus SegmentReader::_pick_row_groups(uint32_t first_block, uint32_t last_b
for (auto& i : _conditions->columns()) {
FieldAggregationMethod aggregation = _get_aggregation_by_index(i.first);
bool is_continue = (aggregation == OLAP_FIELD_AGGREGATION_NONE
|| ((aggregation == OLAP_FIELD_AGGREGATION_REPLACE ||
aggregation == OLAP_FIELD_AGGREGATION_REPLACE_IF_NOT_NULL)
&& _segment_group->version().first == 0));
bool is_continue = (aggregation == OLAP_FIELD_AGGREGATION_NONE);
if (!is_continue) {
continue;
}
@ -533,10 +530,7 @@ OLAPStatus SegmentReader::_pick_row_groups(uint32_t first_block, uint32_t last_b
for (uint32_t i : _load_bf_columns) {
FieldAggregationMethod aggregation = _get_aggregation_by_index(i);
bool is_continue = (aggregation == OLAP_FIELD_AGGREGATION_NONE
|| ((aggregation == OLAP_FIELD_AGGREGATION_REPLACE ||
aggregation == OLAP_FIELD_AGGREGATION_REPLACE_IF_NOT_NULL)
&& _segment_group->version().first == 0));
bool is_continue = (aggregation == OLAP_FIELD_AGGREGATION_NONE);
if (!is_continue) {
continue;
}

View File

@ -71,25 +71,9 @@ Status SegmentWriter::init(uint32_t write_mbytes_per_sec) {
}
if (column.is_bf_column()) {
opts.need_bloom_filter = true;
if ((column.aggregation() == OLAP_FIELD_AGGREGATION_REPLACE
|| column.aggregation() == OLAP_FIELD_AGGREGATION_REPLACE_IF_NOT_NULL)
&& !_opts.whether_to_filter_value) {
// if the column's Aggregation type is OLAP_FIELD_AGGREGATION_REPLACE or
// OLAP_FIELD_AGGREGATION_REPLACE_IF_NOT_NULL and the segment is not in base rowset,
// do not write the bloom filter index because it is useless
opts.need_bloom_filter = false;
}
}
if (column.has_bitmap_index()) {
opts.need_bitmap_index = true;
if ((column.aggregation() == OLAP_FIELD_AGGREGATION_REPLACE
|| column.aggregation() == OLAP_FIELD_AGGREGATION_REPLACE_IF_NOT_NULL)
&& !_opts.whether_to_filter_value) {
// if the column's Aggregation type is OLAP_FIELD_AGGREGATION_REPLACE or
// OLAP_FIELD_AGGREGATION_REPLACE_IF_NOT_NULL and the segment is not in base rowset,
// do not write the bitmap index because it is useless
opts.need_bitmap_index = false;
}
}
std::unique_ptr<ColumnWriter> writer(

View File

@ -46,8 +46,6 @@ extern const uint32_t k_segment_magic_length;
struct SegmentWriterOptions {
uint32_t num_rows_per_block = 1024;
// whether to filter value column against bloom filter/zone map
bool whether_to_filter_value = false;
};
class SegmentWriter {

View File

@ -1124,14 +1124,12 @@ TEST_F(SegmentReaderWriterTest, TestBloomFilterIndexUniqueModel) {
// for not base segment
SegmentWriterOptions opts1;
opts1.whether_to_filter_value = false;
shared_ptr<Segment> seg1;
build_segment(opts1, schema, schema, 100, DefaultIntGenerator, &seg1);
ASSERT_FALSE(column_contains_index(seg1->footer().columns(3), BLOOM_FILTER_INDEX));
ASSERT_TRUE(column_contains_index(seg1->footer().columns(3), BLOOM_FILTER_INDEX));
// for base segment
SegmentWriterOptions opts2;
opts2.whether_to_filter_value = true;
shared_ptr<Segment> seg2;
build_segment(opts2, schema, schema, 100, DefaultIntGenerator, &seg2);
ASSERT_TRUE(column_contains_index(seg2->footer().columns(3), BLOOM_FILTER_INDEX));

View File

@ -284,9 +284,7 @@ public class PropertyAnalyzer {
throw new AnalysisException(type + " is not supported in bloom filter index. "
+ "invalid column: " + bfColumn);
} else if (column.isKey()
|| column.getAggregationType() == AggregateType.NONE
|| column.getAggregationType() == AggregateType.REPLACE
|| column.getAggregationType() == AggregateType.REPLACE_IF_NOT_NULL) {
|| column.getAggregationType() == AggregateType.NONE) {
if (!bfColumnSet.add(bfColumn)) {
throw new AnalysisException("Reduplicated bloom filter column: " + bfColumn);
}

View File

@ -49,10 +49,10 @@ public class PropertyAnalyzerTest {
columns.get(1).setIsKey(true);
Map<String, String> properties = Maps.newHashMap();
properties.put(PropertyAnalyzer.PROPERTIES_BF_COLUMNS, "k1,v1");
properties.put(PropertyAnalyzer.PROPERTIES_BF_COLUMNS, "k1");
Set<String> bfColumns = PropertyAnalyzer.analyzeBloomFilterColumns(properties, columns);
Assert.assertEquals(Sets.newHashSet("k1", "v1"), bfColumns);
Assert.assertEquals(Sets.newHashSet("k1"), bfColumns);
}
@Test