Skip to content

Commit e6f043f

Browse files
authored
Merge branch 'master' into show_storage_vault
2 parents 06e6c8d + 5ea8025 commit e6f043f

File tree

401 files changed

+9246
-4160
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

401 files changed

+9246
-4160
lines changed

be/src/cloud/cloud_meta_mgr.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -600,7 +600,7 @@ Status CloudMetaMgr::sync_tablet_rowsets_unlocked(CloudTablet* tablet,
600600
resp.stats(), req.idx(), &delete_bitmap,
601601
options.full_sync, sync_stats);
602602
if (st.is<ErrorCode::ROWSETS_EXPIRED>() && tried++ < retry_times) {
603-
LOG_WARNING("rowset meta is expired, need to retry")
603+
LOG_INFO("rowset meta is expired, need to retry")
604604
.tag("tablet", tablet->tablet_id())
605605
.tag("tried", tried)
606606
.error(st);

be/src/exec/es/es_scroll_parser.cpp

Lines changed: 101 additions & 103 deletions
Large diffs are not rendered by default.

be/src/exec/olap_common.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,8 @@ std::string cast_to_string(T value, int scale) {
9191
template <PrimitiveType primitive_type>
9292
class ColumnValueRange {
9393
public:
94-
using CppType = typename PrimitiveTypeTraits<primitive_type>::CppType;
94+
using CppType = std::conditional_t<primitive_type == TYPE_HLL, StringRef,
95+
typename PrimitiveTypeTraits<primitive_type>::CppType>;
9596
using IteratorType = typename std::set<CppType>::iterator;
9697

9798
ColumnValueRange();
@@ -1014,7 +1015,8 @@ template <PrimitiveType primitive_type>
10141015
Status OlapScanKeys::extend_scan_key(ColumnValueRange<primitive_type>& range,
10151016
int32_t max_scan_key_num, bool* exact_value, bool* eos,
10161017
bool* should_break) {
1017-
using CppType = typename PrimitiveTypeTraits<primitive_type>::CppType;
1018+
using CppType = std::conditional_t<primitive_type == TYPE_HLL, StringRef,
1019+
typename PrimitiveTypeTraits<primitive_type>::CppType>;
10181020
using ConstIterator = typename std::set<CppType>::const_iterator;
10191021

10201022
// 1. clear ScanKey if some column range is empty

be/src/olap/in_list_predicate.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -405,8 +405,8 @@ class InListPredicateBase : public ColumnPredicate {
405405

406406
if (column->is_column_dictionary()) {
407407
if constexpr (std::is_same_v<T, StringRef>) {
408-
const auto* nested_col_ptr = vectorized::check_and_get_column<
409-
vectorized::ColumnDictionary<vectorized::Int32>>(column);
408+
const auto* nested_col_ptr =
409+
vectorized::check_and_get_column<vectorized::ColumnDictI32>(column);
410410
const auto& data_array = nested_col_ptr->get_data();
411411
auto segid = column->get_rowset_segment_id();
412412
DCHECK((segid.first.hi | segid.first.mi | segid.first.lo) != 0);
@@ -472,8 +472,8 @@ class InListPredicateBase : public ColumnPredicate {
472472
const uint16_t* sel, uint16_t size, bool* flags) const {
473473
if (column->is_column_dictionary()) {
474474
if constexpr (std::is_same_v<T, StringRef>) {
475-
const auto* nested_col_ptr = vectorized::check_and_get_column<
476-
vectorized::ColumnDictionary<vectorized::Int32>>(column);
475+
const auto* nested_col_ptr =
476+
vectorized::check_and_get_column<vectorized::ColumnDictI32>(column);
477477
const auto& data_array = nested_col_ptr->get_data();
478478
auto& value_in_dict_flags =
479479
_segment_id_to_value_in_dict_flags[column->get_rowset_segment_id()];

be/src/olap/like_column_predicate.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,8 @@ uint16_t LikeColumnPredicate<T>::_evaluate_inner(const vectorized::IColumn& colu
5959
auto& null_map_data = nullable_col->get_null_map_column().get_data();
6060
auto& nested_col = nullable_col->get_nested_column();
6161
if (nested_col.is_column_dictionary()) {
62-
auto* nested_col_ptr = vectorized::check_and_get_column<
63-
vectorized::ColumnDictionary<vectorized::Int32>>(nested_col);
62+
auto* nested_col_ptr =
63+
vectorized::check_and_get_column<vectorized::ColumnDictI32>(nested_col);
6464
auto& data_array = nested_col_ptr->get_data();
6565
if (!nullable_col->has_null()) {
6666
for (uint16_t i = 0; i != size; i++) {
@@ -124,8 +124,8 @@ uint16_t LikeColumnPredicate<T>::_evaluate_inner(const vectorized::IColumn& colu
124124
}
125125
} else {
126126
if (column.is_column_dictionary()) {
127-
auto* nested_col_ptr = vectorized::check_and_get_column<
128-
vectorized::ColumnDictionary<vectorized::Int32>>(column);
127+
auto* nested_col_ptr =
128+
vectorized::check_and_get_column<vectorized::ColumnDictI32>(column);
129129
auto& data_array = nested_col_ptr->get_data();
130130
for (uint16_t i = 0; i != size; i++) {
131131
uint16_t idx = sel[i];

be/src/olap/like_column_predicate.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -99,8 +99,8 @@ class LikeColumnPredicate : public ColumnPredicate {
9999
auto& null_map_data = nullable_col->get_null_map_column().get_data();
100100
auto& nested_col = nullable_col->get_nested_column();
101101
if (nested_col.is_column_dictionary()) {
102-
auto* nested_col_ptr = vectorized::check_and_get_column<
103-
vectorized::ColumnDictionary<vectorized::Int32>>(nested_col);
102+
auto* nested_col_ptr =
103+
vectorized::check_and_get_column<vectorized::ColumnDictI32>(nested_col);
104104
auto& data_array = nested_col_ptr->get_data();
105105
for (uint16_t i = 0; i < size; i++) {
106106
if (null_map_data[i]) {
@@ -133,8 +133,8 @@ class LikeColumnPredicate : public ColumnPredicate {
133133
}
134134
} else {
135135
if (column.is_column_dictionary()) {
136-
auto* nested_col_ptr = vectorized::check_and_get_column<
137-
vectorized::ColumnDictionary<vectorized::Int32>>(column);
136+
auto* nested_col_ptr =
137+
vectorized::check_and_get_column<vectorized::ColumnDictI32>(column);
138138
auto& data_array = nested_col_ptr->get_data();
139139
for (uint16_t i = 0; i < size; i++) {
140140
StringRef cell_value = nested_col_ptr->get_shrink_value(data_array[i]);

be/src/olap/partial_update_info.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -399,9 +399,9 @@ Status FixedReadPlan::fill_missing_columns(
399399
DCHECK(column.type() == FieldType::OLAP_FIELD_TYPE_BIGINT);
400400
auto* auto_inc_column =
401401
assert_cast<vectorized::ColumnInt64*, TypeCheckOnRelease::DISABLE>(missing_col.get());
402-
auto_inc_column->insert(
403-
(assert_cast<const vectorized::ColumnInt64*, TypeCheckOnRelease::DISABLE>(
404-
block->get_by_name(BeConsts::PARTIAL_UPDATE_AUTO_INC_COL).column.get()))->get_element(idx));
402+
auto_inc_column->insert(vectorized::Field::create_field<TYPE_BIGINT>(
403+
assert_cast<const vectorized::ColumnInt64*, TypeCheckOnRelease::DISABLE>(
404+
block->get_by_name(BeConsts::PARTIAL_UPDATE_AUTO_INC_COL).column.get())->get_element(idx)));
405405
} else {
406406
// If the control flow reaches this branch, the column neither has default value
407407
// nor is nullable. It means that the row's delete sign is marked, and the value

be/src/olap/push_handler.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -511,7 +511,9 @@ Status PushBrokerReader::_cast_to_input_block() {
511511
vectorized::ColumnsWithTypeAndName arguments {
512512
arg,
513513
{vectorized::DataTypeString().create_column_const(
514-
arg.column->size(), remove_nullable(return_type)->get_family_name()),
514+
arg.column->size(),
515+
vectorized::Field::create_field<TYPE_STRING>(
516+
remove_nullable(return_type)->get_family_name())),
515517
std::make_shared<vectorized::DataTypeString>(), ""}};
516518
auto func_cast = vectorized::SimpleFunctionFactory::instance().get_function(
517519
"CAST", arguments, return_type);

be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp

Lines changed: 95 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,11 @@
1818
#include "regexp_query.h"
1919

2020
#include <CLucene/config/repl_wchar.h>
21-
#include <hs/hs.h>
21+
22+
#include <optional>
2223

2324
#include "common/logging.h"
25+
#include "util/debug_points.h"
2426

2527
namespace doris::segment_v2 {
2628

@@ -36,6 +38,7 @@ void RegexpQuery::add(const InvertedIndexQueryInfo& query_info) {
3638
}
3739

3840
const std::string& pattern = query_info.terms[0];
41+
auto prefix = get_regex_prefix(pattern);
3942

4043
hs_database_t* database = nullptr;
4144
hs_compile_error_t* compile_err = nullptr;
@@ -54,63 +57,120 @@ void RegexpQuery::add(const InvertedIndexQueryInfo& query_info) {
5457
return;
5558
}
5659

60+
std::vector<std::string> terms;
61+
try {
62+
collect_matching_terms(query_info.field_name, terms, database, scratch, prefix);
63+
}
64+
_CLFINALLY({
65+
hs_free_scratch(scratch);
66+
hs_free_database(database);
67+
})
68+
69+
if (terms.empty()) {
70+
return;
71+
}
72+
73+
InvertedIndexQueryInfo new_query_info;
74+
new_query_info.field_name = query_info.field_name;
75+
new_query_info.terms.swap(terms);
76+
_query.add(new_query_info);
77+
}
78+
79+
void RegexpQuery::search(roaring::Roaring& roaring) {
80+
_query.search(roaring);
81+
}
82+
83+
std::optional<std::string> RegexpQuery::get_regex_prefix(const std::string& pattern) {
84+
DBUG_EXECUTE_IF("RegexpQuery.get_regex_prefix", { return std::nullopt; });
85+
86+
if (pattern.empty() || pattern[0] != '^') {
87+
return std::nullopt;
88+
}
89+
90+
re2::RE2 re(pattern);
91+
if (!re.ok()) {
92+
return std::nullopt;
93+
}
94+
95+
std::string min_prefix, max_prefix;
96+
if (!re.PossibleMatchRange(&min_prefix, &max_prefix, 256)) {
97+
return std::nullopt;
98+
}
99+
100+
if (min_prefix.empty() || max_prefix.empty() || min_prefix[0] != max_prefix[0]) {
101+
return std::nullopt;
102+
}
103+
104+
auto [it1, it2] = std::mismatch(min_prefix.begin(), min_prefix.end(), max_prefix.begin(),
105+
max_prefix.end());
106+
107+
const size_t common_len = std::distance(min_prefix.begin(), it1);
108+
if (common_len == 0) {
109+
return std::nullopt;
110+
}
111+
112+
return min_prefix.substr(0, common_len);
113+
}
114+
115+
void RegexpQuery::collect_matching_terms(const std::wstring& field_name,
116+
std::vector<std::string>& terms, hs_database_t* database,
117+
hs_scratch_t* scratch,
118+
const std::optional<std::string>& prefix) {
57119
auto on_match = [](unsigned int id, unsigned long long from, unsigned long long to,
58120
unsigned int flags, void* context) -> int {
59121
*((bool*)context) = true;
60122
return 0;
61123
};
62124

125+
int32_t count = 0;
63126
Term* term = nullptr;
64127
TermEnum* enumerator = nullptr;
65-
std::vector<std::string> terms;
66-
int32_t count = 0;
67-
68128
try {
69-
enumerator = _searcher->getReader()->terms();
70-
while (enumerator->next()) {
129+
if (prefix) {
130+
std::wstring ws_prefix = StringUtil::string_to_wstring(*prefix);
131+
Term prefix(field_name.c_str(), ws_prefix.c_str());
132+
enumerator = _searcher->getReader()->terms(&prefix);
133+
} else {
134+
enumerator = _searcher->getReader()->terms();
135+
enumerator->next();
136+
}
137+
do {
71138
term = enumerator->term();
72-
std::string input = lucene_wcstoutf8string(term->text(), term->textLength());
139+
if (term != nullptr) {
140+
std::string input = lucene_wcstoutf8string(term->text(), term->textLength());
73141

74-
bool is_match = false;
75-
if (hs_scan(database, input.data(), input.size(), 0, scratch, on_match,
76-
(void*)&is_match) != HS_SUCCESS) {
77-
LOG(ERROR) << "hyperscan match failed: " << input;
78-
break;
79-
}
142+
if (prefix) {
143+
if (!input.starts_with(*prefix)) {
144+
break;
145+
}
146+
}
80147

81-
if (is_match) {
82-
if (_max_expansions > 0 && count >= _max_expansions) {
148+
bool is_match = false;
149+
if (hs_scan(database, input.data(), input.size(), 0, scratch, on_match,
150+
(void*)&is_match) != HS_SUCCESS) {
151+
LOG(ERROR) << "hyperscan match failed: " << input;
83152
break;
84153
}
85154

86-
terms.emplace_back(std::move(input));
87-
count++;
88-
}
155+
if (is_match) {
156+
if (_max_expansions > 0 && count >= _max_expansions) {
157+
break;
158+
}
89159

160+
terms.emplace_back(std::move(input));
161+
count++;
162+
}
163+
} else {
164+
break;
165+
}
90166
_CLDECDELETE(term);
91-
}
167+
} while (enumerator->next());
92168
}
93169
_CLFINALLY({
94170
_CLDECDELETE(term);
95171
enumerator->close();
96172
_CLDELETE(enumerator);
97-
98-
hs_free_scratch(scratch);
99-
hs_free_database(database);
100173
})
101-
102-
if (terms.empty()) {
103-
return;
104-
}
105-
106-
InvertedIndexQueryInfo new_query_info;
107-
new_query_info.field_name = query_info.field_name;
108-
new_query_info.terms.swap(terms);
109-
_query.add(new_query_info);
110-
}
111-
112-
void RegexpQuery::search(roaring::Roaring& roaring) {
113-
_query.search(roaring);
114174
}
115175

116176
} // namespace doris::segment_v2

be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,11 @@
1717

1818
#pragma once
1919

20+
#include <hs/hs.h>
21+
#include <re2/re2.h>
22+
23+
#include <optional>
24+
2025
#include "olap/rowset/segment_v2/inverted_index/query/disjunction_query.h"
2126
#include "olap/rowset/segment_v2/inverted_index/query/query.h"
2227

@@ -35,10 +40,18 @@ class RegexpQuery : public Query {
3540
void search(roaring::Roaring& roaring) override;
3641

3742
private:
43+
static std::optional<std::string> get_regex_prefix(const std::string& pattern);
44+
45+
void collect_matching_terms(const std::wstring& field_name, std::vector<std::string>& terms,
46+
hs_database_t* database, hs_scratch_t* scratch,
47+
const std::optional<std::string>& prefix);
48+
3849
std::shared_ptr<lucene::search::IndexSearcher> _searcher;
3950

4051
int32_t _max_expansions = 50;
4152
DisjunctionQuery _query;
53+
54+
friend class RegexpQueryTest;
4255
};
4356

4457
} // namespace doris::segment_v2

be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,9 @@ void VerticalSegmentWriter::_init_column_meta(ColumnMetaPB* meta, uint32_t colum
168168
for (uint32_t i = 0; i < column.num_sparse_columns(); i++) {
169169
_init_column_meta(meta->add_sparse_columns(), -1, column.sparse_column_at(i));
170170
}
171+
meta->set_result_is_nullable(column.get_result_is_nullable());
172+
meta->set_function_name(column.get_aggregation_name());
173+
meta->set_be_exec_version(column.get_be_exec_version());
171174
}
172175

173176
Status VerticalSegmentWriter::_create_column_writer(uint32_t cid, const TabletColumn& column,

be/src/olap/schema.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ vectorized::IColumn::MutablePtr Schema::get_predicate_column_ptr(const FieldType
174174
break;
175175
case FieldType::OLAP_FIELD_TYPE_CHAR:
176176
if (config::enable_low_cardinality_optimize && reader_type == ReaderType::READER_QUERY) {
177-
ptr = doris::vectorized::ColumnDictionary<doris::vectorized::Int32>::create(type);
177+
ptr = doris::vectorized::ColumnDictI32::create(type);
178178
} else {
179179
ptr = doris::vectorized::PredicateColumnType<TYPE_CHAR>::create();
180180
}
@@ -183,7 +183,7 @@ vectorized::IColumn::MutablePtr Schema::get_predicate_column_ptr(const FieldType
183183
case FieldType::OLAP_FIELD_TYPE_STRING:
184184
case FieldType::OLAP_FIELD_TYPE_JSONB:
185185
if (config::enable_low_cardinality_optimize && reader_type == ReaderType::READER_QUERY) {
186-
ptr = doris::vectorized::ColumnDictionary<doris::vectorized::Int32>::create(type);
186+
ptr = doris::vectorized::ColumnDictI32::create(type);
187187
} else {
188188
ptr = doris::vectorized::PredicateColumnType<TYPE_STRING>::create();
189189
}

be/src/pipeline/exec/scan_operator.cpp

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -877,14 +877,20 @@ Status ScanLocalState<Derived>::_change_value_range(ColumnValueRange<PrimitiveTy
877877
reinterpret_cast<typename PrimitiveTypeTraits<PrimitiveType>::CppType*>(
878878
reinterpret_cast<char*>(value)));
879879
}
880+
} else if constexpr (PrimitiveType == TYPE_HLL) {
881+
if constexpr (IsFixed) {
882+
func(temp_range, reinterpret_cast<StringRef*>(value));
883+
} else {
884+
func(temp_range, to_olap_filter_type(fn_name, slot_ref_child),
885+
reinterpret_cast<StringRef*>(value));
886+
}
880887
} else if constexpr ((PrimitiveType == TYPE_DECIMALV2) || (PrimitiveType == TYPE_CHAR) ||
881-
(PrimitiveType == TYPE_VARCHAR) || (PrimitiveType == TYPE_HLL) ||
882-
(PrimitiveType == TYPE_DATETIMEV2) || (PrimitiveType == TYPE_TINYINT) ||
883-
(PrimitiveType == TYPE_SMALLINT) || (PrimitiveType == TYPE_INT) ||
884-
(PrimitiveType == TYPE_BIGINT) || (PrimitiveType == TYPE_LARGEINT) ||
885-
(PrimitiveType == TYPE_IPV4) || (PrimitiveType == TYPE_IPV6) ||
886-
(PrimitiveType == TYPE_DECIMAL32) || (PrimitiveType == TYPE_DECIMAL64) ||
887-
(PrimitiveType == TYPE_DECIMAL128I) ||
888+
(PrimitiveType == TYPE_VARCHAR) || (PrimitiveType == TYPE_DATETIMEV2) ||
889+
(PrimitiveType == TYPE_TINYINT) || (PrimitiveType == TYPE_SMALLINT) ||
890+
(PrimitiveType == TYPE_INT) || (PrimitiveType == TYPE_BIGINT) ||
891+
(PrimitiveType == TYPE_LARGEINT) || (PrimitiveType == TYPE_IPV4) ||
892+
(PrimitiveType == TYPE_IPV6) || (PrimitiveType == TYPE_DECIMAL32) ||
893+
(PrimitiveType == TYPE_DECIMAL64) || (PrimitiveType == TYPE_DECIMAL128I) ||
888894
(PrimitiveType == TYPE_DECIMAL256) || (PrimitiveType == TYPE_STRING) ||
889895
(PrimitiveType == TYPE_BOOLEAN) || (PrimitiveType == TYPE_DATEV2)) {
890896
if constexpr (IsFixed) {

be/src/runtime/descriptors.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,14 @@ vectorized::MutableColumnPtr SlotDescriptor::get_empty_mutable_column() const {
121121
return type()->create_column();
122122
}
123123

124+
bool SlotDescriptor::is_nullable() const {
125+
return _type->is_nullable();
126+
}
127+
128+
PrimitiveType SlotDescriptor::col_type() const {
129+
return _type->get_primitive_type();
130+
}
131+
124132
std::string SlotDescriptor::debug_string() const {
125133
std::stringstream out;
126134
out << "Slot(id=" << _id << " type=" << _type->get_name() << " col=" << _col_pos

0 commit comments

Comments
 (0)