[fix](storage) Fix query result error due to find code by bound (#8787)
Problem recurrence
SSB single table `lineorder_flat`, the query SQL is as follows:
```sql
SELECT
sum(LO_REVENUE),
(LO_ORDERDATE DIV 10000) AS year,
P_BRAND
FROM lineorder_flat
WHERE P_BRAND >= 'MFGR#22211111' AND P_BRAND <= 'MFGR#22281111' AND S_REGION = 'ASIA' and (LO_ORDERDATE DIV 10000) = 1992
GROUP BY
year,
P_BRAND
ORDER BY
year,
P_BRAND;
```
when `enable_low_cardinality_optimize=false`, query result:
```sql
+-------------------+------+-----------+
| sum(`LO_REVENUE`) | year | P_BRAND |
+-------------------+------+-----------+
| 65423264312 | 1992 | MFGR#2222 |
| 66936772687 | 1992 | MFGR#2223 |
| 64047191934 | 1992 | MFGR#2224 |
| 65744559138 | 1992 | MFGR#2225 |
| 66993045668 | 1992 | MFGR#2226 |
| 67411226147 | 1992 | MFGR#2227 |
| 69390885970 | 1992 | MFGR#2228 |
+-------------------+------+-----------+
```
when `enable_low_cardinality_optimize=true`, query result:
```sql
+-------------------+------+-----------+
| sum(`LO_REVENUE`) | year | P_BRAND |
+-------------------+------+-----------+
| 66936772687 | 1992 | MFGR#2223 |
| 64047191934 | 1992 | MFGR#2224 |
| 65744559138 | 1992 | MFGR#2225 |
| 66993045668 | 1992 | MFGR#2226 |
| 67411226147 | 1992 | MFGR#2227 |
| 69390885970 | 1992 | MFGR#2228 |
+-------------------+------+-----------+
```
One line less than the correct result.
The reason is that 'MFGR#22211111' is not in the dictionary, so get the boundary code (`find_code_by_bound` method), but there is a bug here.
This commit is contained in:
@ -520,8 +520,7 @@ COMPARISON_PRED_BITMAP_EVALUATE(GreaterEqualPredicate, >=)
|
||||
auto& dict_col = \
|
||||
reinterpret_cast<vectorized::ColumnDictionary<vectorized::Int32>&>( \
|
||||
*col_ptr); \
|
||||
auto code = dict_col.find_code(_value); \
|
||||
_dict_code = code; \
|
||||
_dict_code = dict_col.find_code(_value); \
|
||||
_dict_code_inited = true; \
|
||||
} \
|
||||
} \
|
||||
@ -530,6 +529,9 @@ COMPARISON_PRED_BITMAP_EVALUATE(GreaterEqualPredicate, >=)
|
||||
COMPARISON_PRED_SET_DICT_CODE(EqualPredicate)
|
||||
COMPARISON_PRED_SET_DICT_CODE(NotEqualPredicate)
|
||||
|
||||
// If 1 OP 0 returns true, it means the predicate is > or >=
|
||||
// If 1 OP 1 returns true, it means the predicate is >= or <=
|
||||
// by this way, avoid redundant code
|
||||
#define RAMGE_COMPARISON_PRED_SET_DICT_CODE(CLASS, OP) \
|
||||
template <class T> \
|
||||
void CLASS<T>::set_dict_code_if_necessary(vectorized::IColumn& column) { \
|
||||
@ -548,8 +550,7 @@ COMPARISON_PRED_SET_DICT_CODE(NotEqualPredicate)
|
||||
auto& dict_col = \
|
||||
reinterpret_cast<vectorized::ColumnDictionary<vectorized::Int32>&>( \
|
||||
*col_ptr); \
|
||||
auto code = dict_col.find_code_by_bound(_value, 0 OP 1, 1 OP 1); \
|
||||
_dict_code = code; \
|
||||
_dict_code = dict_col.find_code_by_bound(_value, 1 OP 0, 1 OP 1); \
|
||||
_dict_code_inited = true; \
|
||||
} \
|
||||
} \
|
||||
|
||||
@ -299,8 +299,7 @@ IN_LIST_PRED_BITMAP_EVALUATE(NotInListPredicate, -=)
|
||||
auto& dict_col = \
|
||||
reinterpret_cast<vectorized::ColumnDictionary<vectorized::Int32>&>( \
|
||||
*col_ptr); \
|
||||
auto code_set = dict_col.find_codes(_values); \
|
||||
_dict_codes = std::move(code_set); \
|
||||
_dict_codes = dict_col.find_codes(_values); \
|
||||
_dict_code_inited = true; \
|
||||
} \
|
||||
} \
|
||||
|
||||
@ -244,8 +244,8 @@ public:
|
||||
|
||||
int32_t find_code(const StringValue& value) const { return _dict.find_code(value); }
|
||||
|
||||
int32_t find_code_by_bound(const StringValue& value, bool lower, bool eq) const {
|
||||
return _dict.find_code_by_bound(value, lower, eq);
|
||||
int32_t find_code_by_bound(const StringValue& value, bool greater, bool eq) const {
|
||||
return _dict.find_code_by_bound(value, greater, eq);
|
||||
}
|
||||
|
||||
phmap::flat_hash_set<int32_t> find_codes(
|
||||
@ -294,19 +294,31 @@ public:
|
||||
return -1;
|
||||
}
|
||||
|
||||
inline int32_t find_code_by_bound(const StringValue& value, bool lower, bool eq) const {
|
||||
// For > , code takes upper_bound - 1; For >= , code takes upper_bound
|
||||
// For < , code takes upper_bound; For <=, code takes upper_bound - 1
|
||||
// For example a sorted dict: <'b',0> <'c',1> <'d',2>
|
||||
// Now the predicate value is 'ccc', 'ccc' is not in the dict, 'ccc' is between 'c' and 'd'.
|
||||
// std::upper_bound(..., 'ccc') - begin, will return the encoding of 'd', which is 2
|
||||
// If the predicate is col > 'ccc' and the value of upper_bound-1 is 1,
|
||||
// then evaluate code > 1 and the result is 'd'.
|
||||
// If the predicate is col < 'ccc' and the value of upper_bound is 2,
|
||||
// evaluate code < 2, and the return result is 'b'.
|
||||
// If the predicate is col >= 'ccc' and the value of upper_bound is 2,
|
||||
// evaluate code >= 2, and the return result is 'd'.
|
||||
// If the predicate is col <= 'ccc' and the value of upper_bound-1 is 1,
|
||||
// evaluate code <= 1, and the returned result is 'b'.
|
||||
// If the predicate is col < 'a', 'a' is also not in the dict, and 'a' is less than 'b',
|
||||
// so upper_bound is the code 0 of b, then evaluate code < 0 and returns empty
|
||||
// If the predicate is col <= 'a' and upper_bound-1 is -1,
|
||||
// then evaluate code <= -1 and returns empty
|
||||
inline int32_t find_code_by_bound(const StringValue& value, bool greater, bool eq) const {
|
||||
auto code = find_code(value);
|
||||
if (code >= 0) {
|
||||
return code;
|
||||
}
|
||||
|
||||
if (lower) {
|
||||
return std::lower_bound(_dict_data.begin(), _dict_data.end(), value) -
|
||||
_dict_data.begin() - eq;
|
||||
} else {
|
||||
return std::upper_bound(_dict_data.begin(), _dict_data.end(), value) -
|
||||
_dict_data.begin() + eq;
|
||||
}
|
||||
auto bound = std::upper_bound(_dict_data.begin(), _dict_data.end(), value) -
|
||||
_dict_data.begin();
|
||||
return greater ? bound - greater + eq : bound - eq;
|
||||
}
|
||||
|
||||
inline phmap::flat_hash_set<int32_t> find_codes(
|
||||
|
||||
Reference in New Issue
Block a user