Skip to content

Commit eca5573

Browse files
committed
1
1 parent adda628 commit eca5573

File tree

3 files changed

+37
-7
lines changed

3 files changed

+37
-7
lines changed

be/src/olap/rowset/segment_v2/segment_writer.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -792,11 +792,13 @@ Status SegmentWriter::append_block(const vectorized::Block* block, size_t row_po
792792
short_key_pos.push_back(_short_key_row_pos - _num_rows_written);
793793
}
794794
}
795+
int64_t total_data_size = 0;
795796

796797
// convert column data from engine format to storage layer format
797798
std::vector<vectorized::IOlapColumnDataAccessor*> key_columns;
798799
vectorized::IOlapColumnDataAccessor* seq_column = nullptr;
799800
for (size_t id = 0; id < _column_writers.size(); ++id) {
801+
int64_t column_data_size = 0;
800802
// olap data convertor alway start from id = 0
801803
auto converted_result = _olap_data_convertor->convert_column_data(id);
802804
if (!converted_result.first.ok()) {
@@ -811,7 +813,18 @@ Status SegmentWriter::append_block(const vectorized::Block* block, size_t row_po
811813
}
812814
RETURN_IF_ERROR(_column_writers[id]->append(converted_result.second->get_nullmap(),
813815
converted_result.second->get_data(), num_rows));
816+
817+
// estimate column data size for flush memtable, may be inaccurate at low cardinality
818+
column_data_size += _column_writers[id]->estimate_buffer_size();
819+
total_data_size += column_data_size;
820+
auto origin_column_data_size = _footer.columns(id).total_data_size();
821+
_footer.mutable_columns(id)->set_total_data_size(origin_column_data_size +
822+
column_data_size);
814823
}
824+
825+
auto origin_data_footprint = _footer.data_footprint();
826+
_footer.set_data_footprint(origin_data_footprint + total_data_size);
827+
815828
if (_has_key) {
816829
if (_is_mow_with_cluster_key()) {
817830
// for now we don't need to query short key index for CLUSTER BY feature,

be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@
5454
#include "olap/short_key_index.h"
5555
#include "olap/tablet_schema.h"
5656
#include "olap/utils.h"
57+
#include "runtime/define_primitive_type.h"
5758
#include "runtime/exec_env.h"
5859
#include "runtime/memory/mem_tracker.h"
5960
#include "service/point_query_executor.h"
@@ -62,6 +63,7 @@
6263
#include "util/debug_points.h"
6364
#include "util/faststring.h"
6465
#include "util/key_util.h"
66+
#include "vec/columns/column_map.h"
6567
#include "vec/columns/column_nullable.h"
6668
#include "vec/columns/column_vector.h"
6769
#include "vec/columns/columns_number.h"
@@ -1174,7 +1176,9 @@ Status VerticalSegmentWriter::write_batch() {
11741176
vectorized::IOlapColumnDataAccessor* seq_column = nullptr;
11751177
// the key is cluster key column unique id
11761178
std::map<uint32_t, vectorized::IOlapColumnDataAccessor*> cid_to_column;
1179+
int64_t total_data_size = 0;
11771180
for (uint32_t cid = 0; cid < _tablet_schema->num_columns(); ++cid) {
1181+
int64_t column_data_size = 0;
11781182
RETURN_IF_ERROR(_create_column_writer(cid, _tablet_schema->column(cid), _tablet_schema));
11791183
for (auto& data : _batched_blocks) {
11801184
RETURN_IF_ERROR(_olap_data_convertor->set_source_content_with_specifid_columns(
@@ -1200,17 +1204,28 @@ Status VerticalSegmentWriter::write_batch() {
12001204
}
12011205
RETURN_IF_ERROR(_column_writers[cid]->append(column->get_nullmap(), column->get_data(),
12021206
data.num_rows));
1207+
if (_data_dir != nullptr &&
1208+
_data_dir->reach_capacity_limit(_column_writers[cid]->estimate_buffer_size())) {
1209+
return Status::Error<DISK_REACH_CAPACITY_LIMIT>("disk {} exceed capacity limit.",
1210+
_data_dir->path_hash());
1211+
}
1212+
1213+
// estimate column data size for flush memtable, may be inaccurate at low cardinality
1214+
column_data_size += _column_writers[cid]->estimate_buffer_size();
1215+
total_data_size += column_data_size;
1216+
auto origin_data_size = _footer.columns(cid).total_data_size();
1217+
_footer.mutable_columns(cid)->set_total_data_size(origin_data_size + column_data_size);
1218+
1219+
RETURN_IF_ERROR(_column_writers[cid]->finish());
1220+
RETURN_IF_ERROR(_column_writers[cid]->write_data());
1221+
12031222
_olap_data_convertor->clear_source_content();
12041223
}
1205-
if (_data_dir != nullptr &&
1206-
_data_dir->reach_capacity_limit(_column_writers[cid]->estimate_buffer_size())) {
1207-
return Status::Error<DISK_REACH_CAPACITY_LIMIT>("disk {} exceed capacity limit.",
1208-
_data_dir->path_hash());
1209-
}
1210-
RETURN_IF_ERROR(_column_writers[cid]->finish());
1211-
RETURN_IF_ERROR(_column_writers[cid]->write_data());
12121224
}
12131225

1226+
auto origin_data_footprint = _footer.data_footprint();
1227+
_footer.set_data_footprint(origin_data_footprint + total_data_size);
1228+
12141229
for (auto& data : _batched_blocks) {
12151230
_olap_data_convertor->set_source_content(data.block, data.row_pos, data.num_rows);
12161231
RETURN_IF_ERROR(_generate_key_index(data, key_columns, seq_column, cid_to_column));

gensrc/proto/segment_v2.proto

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,8 @@ message ColumnMetaPB {
197197
optional bool result_is_nullable = 18; // used on agg_state type
198198
optional string function_name = 19; // used on agg_state type
199199
optional int32 be_exec_version = 20; // used on agg_state type
200+
201+
optional uint64 total_data_size = 21;
200202
}
201203

202204
message PrimaryKeyIndexMetaPB {

0 commit comments

Comments
 (0)