diff --git a/CMakeLists.txt b/CMakeLists.txt index be3cdd794e..4cb0749ce7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -247,8 +247,12 @@ if(ENABLE_CUDA) else() if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18.0") message(STATUS "CMake 3.18+, Setting CUDA_ARCHITECTURES.") - set(CMAKE_CUDA_ARCHITECTURES - 35-virtual + if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.0.0) + set(CMAKE_CUDA_ARCHITECTURES 35-virtual) + else() + set(CMAKE_CUDA_ARCHITECTURES "") + endif() + list(APPEND CMAKE_CUDA_ARCHITECTURES 50-virtual 60-virtual 70-virtual @@ -258,8 +262,12 @@ if(ENABLE_CUDA) message(STATUS "CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}") else() message(STATUS "CMake 3.17 or under, setting CUDA architecture flags manually.") - set(CUDA_COMPILATION_ARCH - -gencode=arch=compute_35,code=compute_35; + if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.0.0) + set(CUDA_COMPILATION_ARCH -gencode=arch=compute_35,code=compute_35;) + else() + set(CUDA_COMPILATION_ARCH "") + endif() + list(APPEND CUDA_COMPILATION_ARCH -gencode=arch=compute_50,code=compute_50; -gencode=arch=compute_60,code=compute_60; -gencode=arch=compute_70,code=compute_70; diff --git a/DataMgr/BufferMgr/BufferMgr.cpp b/DataMgr/BufferMgr/BufferMgr.cpp index 16d3fac7f6..d883ff949d 100644 --- a/DataMgr/BufferMgr/BufferMgr.cpp +++ b/DataMgr/BufferMgr/BufferMgr.cpp @@ -620,17 +620,11 @@ void BufferMgr::deleteBuffersWithPrefix(const ChunkKey& key_prefix, const bool) // reserveBuffer which needs segs_mutex_ and then // chunk_index_mutex_ std::lock_guard chunk_index_lock(chunk_index_mutex_); - auto startChunkIt = chunk_index_.lower_bound(key_prefix); - if (startChunkIt == chunk_index_.end()) { - return; - } - - auto buffer_it = startChunkIt; - while (buffer_it != chunk_index_.end() && - std::search(buffer_it->first.begin(), - buffer_it->first.begin() + key_prefix.size(), - key_prefix.begin(), - key_prefix.end()) != buffer_it->first.begin() + key_prefix.size()) { + auto prefix_upper_bound = key_prefix; + 
prefix_upper_bound.emplace_back(std::numeric_limits::max()); + for (auto buffer_it = chunk_index_.lower_bound(key_prefix), + end_chunk_it = chunk_index_.upper_bound(prefix_upper_bound); + buffer_it != end_chunk_it;) { auto seg_it = buffer_it->second; if (seg_it->buffer) { if (seg_it->buffer->getPinCount() != 0) { diff --git a/DataMgr/ForeignStorage/CsvFileBufferParser.cpp b/DataMgr/ForeignStorage/CsvFileBufferParser.cpp index 0937a4661f..58312016ee 100644 --- a/DataMgr/ForeignStorage/CsvFileBufferParser.cpp +++ b/DataMgr/ForeignStorage/CsvFileBufferParser.cpp @@ -168,6 +168,8 @@ ParseBufferResult CsvFileBufferParser::parseBuffer(ParseBufferRequest& request, std::vector> tmp_buffers; // holds string w/ removed escape chars, etc const char* line_start = p; + row_index_plus_one++; + bool incorrect_column_count = false; p = import_export::delimited_parser::get_row(p, thread_buf_end, buf_end, @@ -177,10 +179,6 @@ ParseBufferResult CsvFileBufferParser::parseBuffer(ParseBufferRequest& request, tmp_buffers, try_single_thread, !columns_are_pre_filtered); - - row_index_plus_one++; - - bool incorrect_column_count = false; try { validate_expected_column_count(row, num_cols, point_cols, file_path); } catch (const ForeignStorageException& e) { diff --git a/ImportExport/DelimitedParserUtils.cpp b/ImportExport/DelimitedParserUtils.cpp index 0231c12768..20cea36fd1 100644 --- a/ImportExport/DelimitedParserUtils.cpp +++ b/ImportExport/DelimitedParserUtils.cpp @@ -46,13 +46,18 @@ inline void trim_space(const char*& field_begin, const char*& field_end) { inline void trim_quotes(const char*& field_begin, const char*& field_end, const import_export::CopyParams& copy_params) { - if (copy_params.quoted && field_end - field_begin > 0 && - *field_begin == copy_params.quote) { - ++field_begin; + auto quote_begin = field_begin, quote_end = field_end; + if (copy_params.quoted) { + trim_space(quote_begin, quote_end); } - if (copy_params.quoted && field_end - field_begin > 0 && - *(field_end - 1) 
== copy_params.quote) { - --field_end; + if (copy_params.quoted && quote_end - quote_begin > 0) { + if (*quote_begin == copy_params.quote && *(quote_end - 1) == copy_params.quote) { + field_begin = ++quote_begin; + field_end = (quote_begin == quote_end) ? quote_end : --quote_end; + } else { + throw import_export::delimited_parser::DelimitedParserException( + "Unable to trim quotes."); + } } } } // namespace @@ -123,13 +128,26 @@ size_t find_end(const char* buffer, if (last_line_delim_pos <= 0) { size_t excerpt_length = std::min(50, size); std::string buffer_excerpt{buffer, buffer + excerpt_length}; - std::string error_message = - "Unable to find an end of line character after reading " + std::to_string(size) + - " characters. Please ensure that the correct \"line_delimiter\" option is " - "specified or update the \"buffer_size\" option appropriately. Row number: " + - std::to_string(buffer_first_row_index + 1) + - ". First few characters in row: " + buffer_excerpt; - throw InsufficientBufferSizeException{error_message}; + if (in_quote) { + std::string quote(1, copy_params.quote); + std::string error_message = + "Unable to find a matching end quote for the quote character '" + quote + + "' after reading " + std::to_string(size) + + " characters. Please ensure that all data fields are correctly formatted " + "or update the \"buffer_size\" option appropriately. Row number: " + + std::to_string(buffer_first_row_index + 1) + + ". First few characters in row: " + buffer_excerpt; + throw InsufficientBufferSizeException{error_message}; + } else { + std::string error_message = + "Unable to find an end of line character after reading " + + std::to_string(size) + + " characters. Please ensure that the correct \"line_delimiter\" option is " + "specified or update the \"buffer_size\" option appropriately. Row number: " + + std::to_string(buffer_first_row_index + 1) + + ". 
First few characters in row: " + buffer_excerpt; + throw InsufficientBufferSizeException{error_message}; + } } return last_line_delim_pos + 1; @@ -244,10 +262,10 @@ const char* get_row(const char* buf, } const char* field_begin = field_buf; const char* field_end = field_buf + j; + trim_quotes(field_begin, field_end, copy_params); if (copy_params.trim_spaces) { trim_space(field_begin, field_end); } - trim_quotes(field_begin, field_end, copy_params); row.emplace_back(field_begin, field_end - field_begin); } field = p + 1; diff --git a/ImportExport/DelimitedParserUtils.h b/ImportExport/DelimitedParserUtils.h index 032633d102..5aad86425f 100644 --- a/ImportExport/DelimitedParserUtils.h +++ b/ImportExport/DelimitedParserUtils.h @@ -37,6 +37,11 @@ class InsufficientBufferSizeException : public std::runtime_error { : std::runtime_error(message) {} }; +class DelimitedParserException : public std::runtime_error { + public: + DelimitedParserException(const std::string& message) : std::runtime_error(message) {} +}; + /** * @brief Finds the closest possible row beginning in the given buffer. 
* diff --git a/ImportExport/Importer.cpp b/ImportExport/Importer.cpp index 79f66ca0fa..f7029d1713 100644 --- a/ImportExport/Importer.cpp +++ b/ImportExport/Importer.cpp @@ -2020,6 +2020,7 @@ static ImportStatus import_thread_delimited( row.clear(); std::vector> tmp_buffers; // holds string w/ removed escape chars, etc + row_index_plus_one++; if (DEBUG_TIMING) { us = measure::execution([&]() { p = import_export::delimited_parser::get_row(p, @@ -2044,7 +2045,6 @@ static ImportStatus import_thread_delimited( try_single_thread, true); } - row_index_plus_one++; // Each POINT could consume two separate coords instead of a single WKT if (row.size() < num_cols || (num_cols + point_cols) < row.size()) { thread_import_status.rows_rejected++; diff --git a/QueryEngine/ColumnarResults.cpp b/QueryEngine/ColumnarResults.cpp index ae932777b7..e377bae564 100644 --- a/QueryEngine/ColumnarResults.cpp +++ b/QueryEngine/ColumnarResults.cpp @@ -147,9 +147,10 @@ int64_t countNumberOfValues(const ResultSet& rows, const size_t column_idx) { std::plus()); } -int64_t countNumberOfValuesGeoLineString(const ResultSet& rows, - const SQLTypeInfo& ti, - const size_t column_idx) { +template +int64_t countNumberOfValuesGeoType(const ResultSet& rows, + const SQLTypeInfo& ti, + const size_t column_idx) { return tbb::parallel_reduce( tbb::blocked_range(0, rows.rowCount()), static_cast(0), @@ -162,68 +163,30 @@ int64_t countNumberOfValuesGeoLineString(const ResultSet& rows, const auto s_ptr = boost::get(ns); if (s_ptr) { // We count the number of commas in WKT representation - // of a line string (e.g. LINESTRING(1 2,3 4)) to get - // the number of points it contains: + // (e.g. POLYGON ((0 0,4 0,4 4,0 4,0 0),(1 1,1 2,2 2,2 + // 1,1 1))) to get the number of points it contains. + // This method is usable for any geo type. 
running_count += std::count(s_ptr->begin(), s_ptr->end(), ',') + 1; } } else if (const auto tv = boost::get(&crt_row[column_idx])) { - const auto s = boost::get(tv); + const auto s = boost::get(tv); CHECK(s); VarlenDatum* d = s->coords_data.get(); - CHECK(d); - running_count += d->length / - (ti.get_compression() == kENCODING_GEOINT ? sizeof(int32_t) - : sizeof(double)) / - 2; + if (d != nullptr) { + running_count += + d->length / + (ti.get_compression() == kENCODING_GEOINT ? sizeof(int32_t) + : sizeof(double)) / + 2; + } // else s is NULL } else if (const auto tv = boost::get(&crt_row[column_idx])) { - const auto s = boost::get(tv->get()); - std::vector* d = s.coords.get(); - CHECK(d); - running_count += d->size(); - } else { - UNREACHABLE(); - } - } - return running_count; - }, - std::plus()); -} - -int64_t countNumberOfValuesGeoPolygon(const ResultSet& rows, - const SQLTypeInfo& ti, - const size_t column_idx) { - return tbb::parallel_reduce( - tbb::blocked_range(0, rows.rowCount()), - static_cast(0), - [&](tbb::blocked_range r, int64_t running_count) { - for (int i = r.begin(); i < r.end(); ++i) { - const auto crt_row = rows.getRowAtNoTranslations(i); - if (const auto tv = boost::get(&crt_row[column_idx])) { - const auto ns = boost::get(tv); - CHECK(ns); - const auto s_ptr = boost::get(ns); - if (s_ptr) { - // We count the number of commas and parenthesis in WKT representation - // of a polygon (e.g. POLYGON ((0 0,4 0,4 4,0 4,0 0),(1 1,1 2,2 2,2 1,1 1))) - // to get the number of points it contains: - running_count += std::count(s_ptr->begin(), s_ptr->end(), ',') + 1; - } - } else if (const auto tv = - boost::get(&crt_row[column_idx])) { - const auto s = boost::get(tv); - CHECK(s); - VarlenDatum* d = s->coords_data.get(); - CHECK(d); - running_count += d->length / - (ti.get_compression() == kENCODING_GEOINT ? 
sizeof(int32_t) - : sizeof(double)) / - 2; - } else if (const auto tv = boost::get(&crt_row[column_idx])) { - const auto s = boost::get(tv->get()); - std::vector* d = s.coords.get(); - CHECK(d); - running_count += d->size(); + if (tv->get_ptr() != nullptr) { + const auto s = boost::get(tv->get()); + std::vector* d = s.coords.get(); + CHECK(d); + running_count += d->size(); + } // else s is NULL } else { UNREACHABLE(); } @@ -278,15 +241,30 @@ ColumnarResults::ColumnarResults(std::shared_ptr row_set_mem_ values_count = num_rows_; break; case kLINESTRING: - values_count = countNumberOfValuesGeoLineString(rows, ti, i); + values_count = + countNumberOfValuesGeoType(rows, ti, i); break; case kPOLYGON: - values_count = countNumberOfValuesGeoPolygon(rows, ti, i); + values_count = + countNumberOfValuesGeoType( + rows, ti, i); + break; + case kMULTILINESTRING: + values_count = + countNumberOfValuesGeoType(rows, ti, i); + break; + case kMULTIPOLYGON: + values_count = + countNumberOfValuesGeoType(rows, ti, i); break; default: UNREACHABLE() << "count number of values not implemented for " << ti.toString(); } + // TODO: include sizes count to optimize flatbuffer size const int64_t flatbuffer_size = getFlatBufferSize(num_rows_, values_count, ti); column_buffers_[i] = row_set_mem_owner->allocate(flatbuffer_size, thread_idx_); FlatBufferManager m{column_buffers_[i]}; @@ -474,6 +452,172 @@ void ColumnarResults::materializeAllColumnsThroughIteration(const ResultSet& row rows.moveToBegin(); } +template +void TargetValueToNestedArray(int8_t* buf, + const int64_t index, + const SQLTypeInfo& ti, + const TargetValue& col_val, + std::mutex* write_mutex) { + FlatBufferManager m{buf}; + const SQLTypeInfoLite* ti_lite = + reinterpret_cast(m.get_user_data_buffer()); + if (ti_lite->is_geoint()) { + CHECK_EQ(ti.get_compression(), kENCODING_GEOINT); + } else { + CHECK_EQ(ti.get_compression(), kENCODING_NONE); + } + FlatBufferManager::Status status{}; + if (const auto tv = boost::get(&col_val)) { 
+ const auto ns = boost::get(tv); + CHECK(ns); + const auto s_ptr = boost::get(ns); + if (s_ptr == nullptr || *s_ptr == "NULL") { + auto lock_scope = + (write_mutex == nullptr ? std::unique_lock() + : std::unique_lock(*write_mutex)); + status = m.setNull(index); + } else { + std::vector coords; + std::vector bounds; + std::vector ring_sizes; + std::vector poly_rings; + int64_t approx_nof_coords = 2 * std::count(s_ptr->begin(), s_ptr->end(), ','); + coords.reserve(approx_nof_coords); + bounds.reserve(4); + const auto gdal_wkt_ls = GeospatialGeoType(*s_ptr); + if constexpr (NDIM == 1) { + gdal_wkt_ls.getColumns(coords, bounds); + } else if constexpr (NDIM == 2) { + int64_t approx_nof_rings = std::count(s_ptr->begin(), s_ptr->end(), '(') - 1; + ring_sizes.reserve(approx_nof_rings); + gdal_wkt_ls.getColumns(coords, ring_sizes, bounds); + } else if constexpr (NDIM == 3) { + int64_t approx_nof_rings = std::count(s_ptr->begin(), s_ptr->end(), '(') - 1; + ring_sizes.reserve(approx_nof_rings); + poly_rings.reserve(approx_nof_rings); + gdal_wkt_ls.getColumns(coords, ring_sizes, poly_rings, bounds); + } else { + UNREACHABLE(); + } + const std::vector compressed_coords = + Geospatial::compress_coords(coords, ti); + { + auto lock_scope = + (write_mutex == nullptr ? 
std::unique_lock() + : std::unique_lock(*write_mutex)); + if constexpr (NDIM == 1) { + status = m.setItem(index, compressed_coords); + } else if constexpr (NDIM == 2) { + status = m.setItem(index, compressed_coords, ring_sizes); + } else if constexpr (NDIM == 3) { + status = m.setItem(index, compressed_coords, ring_sizes, poly_rings); + } else { + UNREACHABLE(); + } + } + } + } else if (const auto tv = boost::get(&col_val)) { + const auto s = boost::get(tv); + CHECK(s); + if (s->coords_data == nullptr || s->coords_data->pointer == nullptr) { + status = m.setNull(index); + } else { + const VarlenDatum* d = s->coords_data.get(); + CHECK(d); + CHECK(d->pointer); + + int32_t nof_values = + d->length / (ti_lite->is_geoint() ? 2 * sizeof(int32_t) : 2 * sizeof(double)); + { + auto lock_scope = + (write_mutex == nullptr ? std::unique_lock() + : std::unique_lock(*write_mutex)); + if constexpr (NDIM == 1) { + status = m.setItem<0, false>(index, d->pointer, nof_values); + } else if constexpr (NDIM == 2) { + VarlenDatum* r = nullptr; + if constexpr (is_multi) { + r = s->linestring_sizes_data.get(); + } else { + r = s->ring_sizes_data.get(); + } + status = m.setItem<1, /*check_sizes=*/false>( + index, + d->pointer, + nof_values, + reinterpret_cast(r->pointer), + r->length / sizeof(int32_t)); + } else if constexpr (NDIM == 3) { + const VarlenDatum* r = s->ring_sizes_data.get(); + const VarlenDatum* p = s->poly_rings_data.get(); + status = m.setItem<2, /*check_sizes=*/false>( + index, + d->pointer, + nof_values, + reinterpret_cast(r->pointer), + r->length / sizeof(int32_t), + reinterpret_cast(p->pointer), + p->length / sizeof(int32_t)); + } else { + UNREACHABLE(); + } + } + } + } else if (const auto tv = boost::get(&col_val)) { + if (tv->get_ptr() == nullptr) { + auto lock_scope = + (write_mutex == nullptr ? 
std::unique_lock() + : std::unique_lock(*write_mutex)); + status = m.setNull(index); + } else { + const auto s = boost::get(tv->get()); + const std::vector* d = s.coords.get(); + const std::vector* r = nullptr; + const std::vector* p = nullptr; + if constexpr (NDIM == 1) { + CHECK(r == nullptr); + CHECK(p == nullptr); + } else if constexpr (NDIM == 2) { + if constexpr (is_multi) { + r = s.linestring_sizes.get(); + } else { + r = s.ring_sizes.get(); + } + CHECK(p == nullptr); + } else if constexpr (NDIM == 3) { + r = s.ring_sizes.get(); + p = s.poly_rings.get(); + } else { + UNREACHABLE(); + } + CHECK(d); + CHECK_NE(d->size(), 0); + std::vector compressed_coords = Geospatial::compress_coords(*d, ti); + { + auto lock_scope = + (write_mutex == nullptr ? std::unique_lock() + : std::unique_lock(*write_mutex)); + if constexpr (NDIM == 1) { + status = m.setItem(index, compressed_coords); + } else if constexpr (NDIM == 2) { + status = m.setItem(index, compressed_coords, *r); + } else if constexpr (NDIM == 3) { + status = m.setItem(index, compressed_coords, *r, *p); + } else { + UNREACHABLE(); + } + } + } + } else { + UNREACHABLE(); + } + CHECK_EQ(status, FlatBufferManager::Status::Success); +} + /* * This function processes and decodes its input TargetValue * and write it into its corresponding column buffer's cell (with corresponding @@ -544,7 +688,7 @@ inline void ColumnarResults::writeBackCell(const TargetValue& col_val, auto lock_scope = (write_mutex == nullptr ? std::unique_lock() : std::unique_lock(*write_mutex)); - status = m.setItem( + status = m.setItemOld( row_idx, reinterpret_cast(data.data()), data.size()); } CHECK_EQ(status, FlatBufferManager::Status::Success); @@ -559,7 +703,7 @@ inline void ColumnarResults::writeBackCell(const TargetValue& col_val, auto lock_scope = (write_mutex == nullptr ? 
std::unique_lock() : std::unique_lock(*write_mutex)); - status = m.setItem( + status = m.setItemOld( row_idx, reinterpret_cast(d->pointer), d->length); } CHECK_EQ(status, FlatBufferManager::Status::Success); @@ -575,7 +719,7 @@ inline void ColumnarResults::writeBackCell(const TargetValue& col_val, auto lock_scope = (write_mutex == nullptr ? std::unique_lock() : std::unique_lock(*write_mutex)); - status = m.setItem( + status = m.setItemOld( row_idx, reinterpret_cast(d->data()), m.dtypeSize()); } CHECK_EQ(d->size(), 2); @@ -587,169 +731,42 @@ inline void ColumnarResults::writeBackCell(const TargetValue& col_val, } case kLINESTRING: { CHECK(FlatBufferManager::isFlatBuffer(column_buffers_[column_idx])); - FlatBufferManager m{column_buffers_[column_idx]}; - if (const auto tv = boost::get(&col_val)) { - const auto ns = boost::get(tv); - CHECK(ns); - const auto s_ptr = boost::get(ns); - if (s_ptr == nullptr) { - auto lock_scope = - (write_mutex == nullptr ? std::unique_lock() - : std::unique_lock(*write_mutex)); - status = m.setNull(row_idx); - } else { - std::vector coords; - std::vector bounds; - int64_t approx_nof_coords = - 2 * (std::count(s_ptr->begin(), s_ptr->end(), ',') + 1); - coords.reserve(approx_nof_coords); - bounds.reserve(4); - const auto gdal_wkt_ls = Geospatial::GeoLineString(*s_ptr); - gdal_wkt_ls.getColumns(coords, bounds); - std::vector compressed_coords = - Geospatial::compress_coords(coords, type_info); - { - auto lock_scope = - (write_mutex == nullptr ? 
std::unique_lock() - : std::unique_lock(*write_mutex)); - status = - m.setItem(row_idx, - reinterpret_cast(compressed_coords.data()), - compressed_coords.size()); - } - } - CHECK_EQ(status, FlatBufferManager::Status::Success); - } else if (const auto tv = boost::get(&col_val)) { - const auto s = boost::get(tv); - CHECK(s); - VarlenDatum* d = s->coords_data.get(); - CHECK(d); - CHECK_EQ(type_info.get_compression() == kENCODING_GEOINT, - m.getGeoLineStringMetadata()->is_geoint); - { - auto lock_scope = - (write_mutex == nullptr ? std::unique_lock() - : std::unique_lock(*write_mutex)); - if (d->pointer == nullptr) { - status = m.setNull(row_idx); - } else { - status = m.setItem( - row_idx, reinterpret_cast(d->pointer), d->length); - } - } - CHECK_EQ(status, FlatBufferManager::Status::Success); - } else if (const auto tv = boost::get(&col_val)) { - /* - Warning: the following code fails for NULL row values - because of the failure to detect the nullness correctly. - */ - const auto s = boost::get(tv->get()); - std::vector* d = s.coords.get(); - CHECK(d); - std::vector compressed_coords = - Geospatial::compress_coords(*d, type_info); - { - auto lock_scope = - (write_mutex == nullptr ? 
std::unique_lock() - : std::unique_lock(*write_mutex)); - status = m.setItem(row_idx, - reinterpret_cast(compressed_coords.data()), - compressed_coords.size()); - } - CHECK_EQ(status, FlatBufferManager::Status::Success); - } else { - UNREACHABLE(); - } + TargetValueToNestedArray<1, + Geospatial::GeoLineString, + GeoLineStringTargetValue, + GeoLineStringTargetValuePtr, + /*is_multi=*/false>( + column_buffers_[column_idx], row_idx, type_info, col_val, write_mutex); break; } case kPOLYGON: { CHECK(FlatBufferManager::isFlatBuffer(column_buffers_[column_idx])); - FlatBufferManager m{column_buffers_[column_idx]}; - FlatBufferManager::Status status{}; - if (const auto tv = boost::get(&col_val)) { - const auto ns = boost::get(tv); - CHECK(ns); - const auto s_ptr = boost::get(ns); - if (s_ptr == nullptr) { - auto lock_scope = - (write_mutex == nullptr ? std::unique_lock() - : std::unique_lock(*write_mutex)); - status = m.setNull(row_idx); - } else { - std::vector coords; - std::vector ring_sizes; - std::vector bounds; - int64_t approx_nof_coords = 2 * std::count(s_ptr->begin(), s_ptr->end(), ','); - int64_t approx_nof_rings = std::count(s_ptr->begin(), s_ptr->end(), '(') - 1; - coords.reserve(approx_nof_coords); - ring_sizes.reserve(approx_nof_rings); - bounds.reserve(4); - const auto gdal_wkt_ls = Geospatial::GeoPolygon(*s_ptr); - gdal_wkt_ls.getColumns(coords, ring_sizes, bounds); - std::vector compressed_coords = - Geospatial::compress_coords(coords, type_info); - { - auto lock_scope = - (write_mutex == nullptr ? 
std::unique_lock() - : std::unique_lock(*write_mutex)); - status = m.setItemCountsAndData( - row_idx, - ring_sizes.data(), - ring_sizes.size(), - reinterpret_cast(compressed_coords.data()), - nullptr); - } - } - CHECK_EQ(status, FlatBufferManager::Status::Success); - } else if (const auto tv = boost::get(&col_val)) { - const auto s = boost::get(tv); - CHECK(s); - const VarlenDatum* d = s->coords_data.get(); - const VarlenDatum* r = s->ring_sizes_data.get(); - CHECK(d); - CHECK_EQ(type_info.get_compression() == kENCODING_GEOINT, - m.getGeoPolygonMetadata()->is_geoint); - { - auto lock_scope = - (write_mutex == nullptr ? std::unique_lock() - : std::unique_lock(*write_mutex)); - if (d->pointer == nullptr) { - status = m.setNull(row_idx); - } else { - status = - m.setItemCountsAndData(row_idx, - reinterpret_cast(r->pointer), - r->length / sizeof(int32_t), - reinterpret_cast(d->pointer)); - } - } - CHECK_EQ(status, FlatBufferManager::Status::Success); - } else if (const auto tv = boost::get(&col_val)) { - /* - Warning: the following code fails for NULL row values - because of the failure to detect the nullness correctly. - */ - const auto s = boost::get(tv->get()); - const std::vector* d = s.coords.get(); - const std::vector* r = s.ring_sizes.get(); - CHECK(d); - CHECK(r); - std::vector compressed_coords = - Geospatial::compress_coords(*d, type_info); - { - auto lock_scope = - (write_mutex == nullptr ? 
std::unique_lock() - : std::unique_lock(*write_mutex)); - status = m.setItemCountsAndData( - row_idx, - r->data(), - r->size(), - reinterpret_cast(compressed_coords.data())); - } - CHECK_EQ(status, FlatBufferManager::Status::Success); - } else { - UNREACHABLE(); - } + TargetValueToNestedArray<2, + Geospatial::GeoPolygon, + GeoPolyTargetValue, + GeoPolyTargetValuePtr, + /*is_multi=*/false>( + column_buffers_[column_idx], row_idx, type_info, col_val, write_mutex); + break; + } + case kMULTILINESTRING: { + CHECK(FlatBufferManager::isFlatBuffer(column_buffers_[column_idx])); + TargetValueToNestedArray<2, + Geospatial::GeoMultiLineString, + GeoMultiLineStringTargetValue, + GeoMultiLineStringTargetValuePtr, + /*is_multi=*/true>( + column_buffers_[column_idx], row_idx, type_info, col_val, write_mutex); + break; + } + case kMULTIPOLYGON: { + CHECK(FlatBufferManager::isFlatBuffer(column_buffers_[column_idx])); + TargetValueToNestedArray<3, + Geospatial::GeoMultiPolygon, + GeoMultiPolyTargetValue, + GeoMultiPolyTargetValuePtr, + /*is_true=*/false>( + column_buffers_[column_idx], row_idx, type_info, col_val, write_mutex); break; } default: diff --git a/QueryEngine/Descriptors/InputDescriptors.h b/QueryEngine/Descriptors/InputDescriptors.h index ceca307d2d..34aa935a5b 100644 --- a/QueryEngine/Descriptors/InputDescriptors.h +++ b/QueryEngine/Descriptors/InputDescriptors.h @@ -69,6 +69,14 @@ class InputColDescriptor final { const InputDescriptor& getScanDesc() const { return input_desc_; } + shared::TableKey getTableKey() const { + return shared::TableKey{input_desc_.getTableKey()}; + } + + shared::ColumnKey getColumnKey() const { + return shared::ColumnKey{getTableKey(), col_id_}; + } + size_t hash() const { return input_desc_.hash() ^ (static_cast(col_id_) << 16); } diff --git a/QueryEngine/ExpressionRewrite.cpp b/QueryEngine/ExpressionRewrite.cpp index 1aba6e4c71..a85c276c0f 100644 --- a/QueryEngine/ExpressionRewrite.cpp +++ b/QueryEngine/ExpressionRewrite.cpp @@ -870,11 
+870,9 @@ OverlapsJoinTranslationResult translate_overlaps_conjunction_with_reordering( // ordering for overlaps, the join builder will fail. std::set lhs_rte_idx; lhs->collect_rte_idx(lhs_rte_idx); - CHECK(!lhs_rte_idx.empty()); std::set rhs_rte_idx; rhs->collect_rte_idx(rhs_rte_idx); - CHECK(!rhs_rte_idx.empty()); - auto has_invalid_num_join_cols = lhs_rte_idx.size() > 1 || rhs_rte_idx.size() > 1; + auto has_invalid_num_join_cols = lhs_rte_idx.size() != 1 || rhs_rte_idx.size() != 1; auto has_invalid_rte_idx = lhs_rte_idx > rhs_rte_idx; return std::make_pair(has_invalid_num_join_cols || has_invalid_rte_idx, has_invalid_rte_idx); diff --git a/QueryEngine/GroupByRuntime.cpp b/QueryEngine/GroupByRuntime.cpp index c79e1d4fb3..33a0645415 100644 --- a/QueryEngine/GroupByRuntime.cpp +++ b/QueryEngine/GroupByRuntime.cpp @@ -272,7 +272,7 @@ bucketized_hash_join_idx(int64_t hash_buff, int64_t const max_key, const int64_t translated_null_val, int64_t bucket_normalization) { - if (key >= min_key && key <= max_key) { + if (hash_buff && key >= min_key && key <= max_key) { return *SUFFIX(get_bucketized_hash_slot)(reinterpret_cast(hash_buff), key, min_key / bucket_normalization, @@ -354,7 +354,7 @@ hash_join_idx_sharded(int64_t hash_buff, const uint32_t entry_count_per_shard, const uint32_t num_shards, const uint32_t device_count) { - if (key >= min_key && key <= max_key) { + if (hash_buff && key >= min_key && key <= max_key) { return *SUFFIX(get_hash_slot_sharded)(reinterpret_cast(hash_buff), key, min_key, diff --git a/QueryEngine/InputMetadata.cpp b/QueryEngine/InputMetadata.cpp index 23cf3f628a..59806eeb09 100644 --- a/QueryEngine/InputMetadata.cpp +++ b/QueryEngine/InputMetadata.cpp @@ -266,14 +266,26 @@ ChunkMetadataMap synthesize_metadata_table_function(const ResultSet* rows) { FlatBufferManager m{const_cast(columnar_buffer)}; chunk_metadata->numBytes = m.getBufferSize(); if (is_geometry) { - // a geometry value is a pair of coordinates but its element - // type value is a 
int or double, hence multiplication by 2: - values_count = m.get_nof_values() * 2; + switch (col_sql_type_info.get_type()) { + case kLINESTRING: + case kPOLYGON: + case kMULTILINESTRING: + case kMULTIPOLYGON: { + values_count = m.getValuesCount(); + values_buffer = m.get_values_buffer(); + } break; + default: { + // a geometry value is a pair of coordinates but its element + // type value is a int or double, hence multiplication by 2: + values_count = m.get_nof_values() * 2; + values_buffer = m.get_values(); + } + } } else { CHECK(is_array); values_count = m.get_nof_values(); + values_buffer = m.get_values(); } - values_buffer = m.get_values(); } else { chunk_metadata->numBytes = row_count * col_type_info.get_size(); values_count = row_count; diff --git a/QueryEngine/JoinHashTable/BaselineJoinHashTable.cpp b/QueryEngine/JoinHashTable/BaselineJoinHashTable.cpp index 188ea108cc..48d8917f0e 100644 --- a/QueryEngine/JoinHashTable/BaselineJoinHashTable.cpp +++ b/QueryEngine/JoinHashTable/BaselineJoinHashTable.cpp @@ -52,8 +52,15 @@ std::shared_ptr BaselineJoinHashTable::getInstance( const TableIdToNodeMap& table_id_to_node_map) { decltype(std::chrono::steady_clock::now()) ts1, ts2; + auto hash_type = preferred_hash_type; + if (query_hints.force_one_to_many_hash_join) { + LOG(INFO) << "A user's query hint forced the join operation to use OneToMany hash " + "join layout"; + hash_type = HashType::OneToMany; + } + if (VLOGGING(1)) { - VLOG(1) << "Building keyed hash table " << getHashTypeString(preferred_hash_type) + VLOG(1) << "Building keyed hash table " << getHashTypeString(hash_type) << " for qual: " << condition->toString(); ts1 = std::chrono::steady_clock::now(); } @@ -75,7 +82,7 @@ std::shared_ptr BaselineJoinHashTable::getInstance( hashtable_build_dag_map, table_id_to_node_map)); try { - join_hash_table->reify(preferred_hash_type); + join_hash_table->reify(hash_type); } catch (const TableMustBeReplicated& e) { // Throw a runtime error to abort the query 
join_hash_table->freeHashBufferMemory(); diff --git a/QueryEngine/JoinHashTable/HashJoin.cpp b/QueryEngine/JoinHashTable/HashJoin.cpp index 3ebdf8c637..b18e38f2ce 100644 --- a/QueryEngine/JoinHashTable/HashJoin.cpp +++ b/QueryEngine/JoinHashTable/HashJoin.cpp @@ -314,7 +314,12 @@ std::shared_ptr HashJoin::getInstance( query_hint, table_id_to_node_map); } else if (dynamic_cast( - qual_bin_oper->get_left_operand())) { + qual_bin_oper->get_left_operand()) || + query_hint.force_baseline_hash_join) { + if (query_hint.force_baseline_hash_join) { + LOG(INFO) << "A user's query hint forced the join operation to use the Baseline " + "hash join layout"; + } VLOG(1) << "Trying to build keyed hash table:"; join_hash_table = BaselineJoinHashTable::getInstance(qual_bin_oper, query_infos, diff --git a/QueryEngine/JoinHashTable/OverlapsJoinHashTable.cpp b/QueryEngine/JoinHashTable/OverlapsJoinHashTable.cpp index adc306eef3..e8a49a5198 100644 --- a/QueryEngine/JoinHashTable/OverlapsJoinHashTable.cpp +++ b/QueryEngine/JoinHashTable/OverlapsJoinHashTable.cpp @@ -46,9 +46,19 @@ std::shared_ptr OverlapsJoinHashTable::getInstance( const RegisteredQueryHint& query_hints, const TableIdToNodeMap& table_id_to_node_map) { decltype(std::chrono::steady_clock::now()) ts1, ts2; - + auto copied_query_hints = query_hints; + if (query_hints.force_one_to_many_hash_join) { + LOG(INFO) << "Ignoring query hint \'force_one_to_many_hash_join\' for the overlaps " + "or range hash " + "join operation"; + copied_query_hints.force_one_to_many_hash_join = false; + } + if (query_hints.force_baseline_hash_join) { + LOG(INFO) << "Ignoring query hint \'force_baseline_hash_join\' for the overlaps or " + "range hash join operation"; + copied_query_hints.force_baseline_hash_join = false; + } std::vector inner_outer_pairs; - if (const auto range_expr = dynamic_cast(condition->get_right_operand())) { return RangeJoinHashTable::getInstance(condition, @@ -60,7 +70,7 @@ std::shared_ptr OverlapsJoinHashTable::getInstance( 
column_cache, executor, hashtable_build_dag_map, - query_hints, + copied_query_hints, table_id_to_node_map); } else { inner_outer_pairs = @@ -115,7 +125,7 @@ std::shared_ptr OverlapsJoinHashTable::getInstance( executor, inner_outer_pairs, device_count, - query_hints, + copied_query_hints, hashtable_build_dag_map, table_id_to_node_map); try { diff --git a/QueryEngine/JoinHashTable/PerfectJoinHashTable.cpp b/QueryEngine/JoinHashTable/PerfectJoinHashTable.cpp index e2cb46d983..794dff125a 100644 --- a/QueryEngine/JoinHashTable/PerfectJoinHashTable.cpp +++ b/QueryEngine/JoinHashTable/PerfectJoinHashTable.cpp @@ -243,15 +243,6 @@ std::shared_ptr PerfectJoinHashTable::getInstance( throw TooManyHashEntries(oss.str()); } - auto const shard_count = get_shard_count(qual_bin_oper.get(), executor); - if (device_count > 1 && shard_count > 1) { - // use baseline hash join to compute this case until resolving related hash join logic - // todd(yoonmin): relax this after fixing related hashtable build/probe logic is fixed - throw TooManyHashEntries( - "Use baseline hash join: multiple GPUs process the input sharded table via " - "perfect hash can cause a wrong result"); - } - if (qual_bin_oper->get_optype() == kBW_EQ && col_range.getIntMax() >= std::numeric_limits::max()) { throw HashJoinFail("Cannot translate null value for kBW_EQ"); @@ -260,14 +251,19 @@ std::shared_ptr PerfectJoinHashTable::getInstance( if (VLOGGING(1)) { ts1 = std::chrono::steady_clock::now(); } - + auto hash_type = preferred_hash_type; + if (query_hints.force_one_to_many_hash_join) { + LOG(INFO) << "A user's query hint forced the join operation to use OneToMany hash " + "join layout"; + hash_type = HashType::OneToMany; + } auto join_hash_table = std::shared_ptr( new PerfectJoinHashTable(qual_bin_oper, inner_col, query_infos, memory_level, join_type, - preferred_hash_type, + hash_type, col_range, rhs_source_col_range, bucketized_entry_count_info, diff --git a/QueryEngine/PlanState.cpp b/QueryEngine/PlanState.cpp 
index 56d0ec6b48..aaf15b8956 100644 --- a/QueryEngine/PlanState.cpp +++ b/QueryEngine/PlanState.cpp @@ -67,17 +67,45 @@ void PlanState::allocateLocalColumnIds( int PlanState::getLocalColumnId(const Analyzer::ColumnVar* col_var, const bool fetch_column) { + // Previously, we considered `rte_idx` of `col_var` w/ its column key together + // to specify columns in the `global_to_local_col_ids_`. + // But there is a case when the same col has multiple 'rte_idx's + // For instance, the same geometry col is used not only as input col of the geo join op, + // but also included as input col of filter predicate + // In such a case, the same geometry col has two rte_idxs (the one defined by the filter + // predicate and the other determined by the geo join operator) + // The previous logic cannot cover this case b/c it allows only one `rte_idx` per col + // But it is safe to share `rte_idx` among all use cases of the same col CHECK(col_var); const auto& global_col_key = col_var->getColumnKey(); - const int scan_idx = col_var->get_rte_idx(); - InputColDescriptor scan_col_desc( - global_col_key.column_id, global_col_key.table_id, global_col_key.db_id, scan_idx); + InputColDescriptor scan_col_desc(global_col_key.column_id, + global_col_key.table_id, + global_col_key.db_id, + col_var->get_rte_idx()); + std::optional col_id{std::nullopt}; + // let's try to find col_id w/ considering `rte_idx` const auto it = global_to_local_col_ids_.find(scan_col_desc); - CHECK(it != global_to_local_col_ids_.end()) << "Expected to find " << scan_col_desc; - if (fetch_column) { - columns_to_fetch_.insert(global_col_key); + if (it != global_to_local_col_ids_.end()) { + // we have a valid col_id + col_id = it->second; + } else { + // otherwise, let's try to find col_id for the same col + // (but have different 'rte_idx') to share it w/ `col_var` + for (auto const& kv : global_to_local_col_ids_) { + if (kv.first.getColumnKey() == global_col_key) { + col_id = kv.second; + break; + } + } + } + if
(col_id && *col_id >= 0) { + if (fetch_column) { + columns_to_fetch_.insert(global_col_key); + } + return *col_id; } - return it->second; + CHECK(false) << "Expected to find " << global_col_key; + return {}; } void PlanState::addNonHashtableQualForLeftJoin(size_t idx, diff --git a/QueryEngine/QueryHint.h b/QueryEngine/QueryHint.h index 8d528ee793..4f6801d237 100644 --- a/QueryEngine/QueryHint.h +++ b/QueryEngine/QueryHint.h @@ -50,6 +50,8 @@ enum QueryHint { kDisableLoopJoin, kLoopJoinInnerTableMaxNumRows, kMaxJoinHashTableSize, + kforceBaselineHashJoin, + kforceOneToManyHashJoin, kHintCount, // should be at the last elem before INVALID enum value to count # // supported hints correctly kInvalidHint // this should be the last elem of this enum @@ -78,7 +80,9 @@ static const std::unordered_map SupportedQueryHints = { {"allow_loop_join", QueryHint::kAllowLoopJoin}, {"disable_loop_join", QueryHint::kDisableLoopJoin}, {"loop_join_inner_table_max_num_rows", QueryHint::kLoopJoinInnerTableMaxNumRows}, - {"max_join_hashtable_size", QueryHint::kMaxJoinHashTableSize}}; + {"max_join_hashtable_size", QueryHint::kMaxJoinHashTableSize}, + {"force_baseline_hash_join", QueryHint::kforceBaselineHashJoin}, + {"force_one_to_many_hash_join", QueryHint::kforceOneToManyHashJoin}}; struct HintIdentifier { bool global_hint; @@ -203,6 +207,8 @@ struct RegisteredQueryHint { , use_loop_join(std::nullopt) , loop_join_inner_table_max_num_rows(g_trivial_loop_join_threshold) , max_join_hash_table_size(std::numeric_limits::max()) + , force_baseline_hash_join(false) + , force_one_to_many_hash_join(false) , registered_hint(QueryHint::kHintCount, false) {} RegisteredQueryHint operator||(const RegisteredQueryHint& global_hints) const { @@ -286,6 +292,14 @@ struct RegisteredQueryHint { updated_query_hints.max_join_hash_table_size = global_hints.max_join_hash_table_size; break; + case QueryHint::kforceBaselineHashJoin: + updated_query_hints.force_baseline_hash_join = + 
global_hints.force_baseline_hash_join; + break; + case QueryHint::kforceOneToManyHashJoin: + updated_query_hints.force_one_to_many_hash_join = + global_hints.force_one_to_many_hash_join; + break; default: UNREACHABLE(); } @@ -323,6 +337,8 @@ struct RegisteredQueryHint { std::optional use_loop_join; size_t loop_join_inner_table_max_num_rows; size_t max_join_hash_table_size; + bool force_baseline_hash_join; + bool force_one_to_many_hash_join; std::vector registered_hint; diff --git a/QueryEngine/RelAlgDag.h b/QueryEngine/RelAlgDag.h index 5c2f7ce2cf..573ae6ab73 100644 --- a/QueryEngine/RelAlgDag.h +++ b/QueryEngine/RelAlgDag.h @@ -2877,6 +2877,24 @@ class RelAlgDag : public boost::noncopyable { } break; } + case QueryHint::kforceBaselineHashJoin: { + query_hint.registerHint(QueryHint::kforceBaselineHashJoin); + query_hint.force_baseline_hash_join = true; + if (target.isGlobalHint()) { + global_query_hint.registerHint(QueryHint::kforceBaselineHashJoin); + global_query_hint.force_baseline_hash_join = true; + } + break; + } + case QueryHint::kforceOneToManyHashJoin: { + query_hint.registerHint(QueryHint::kforceOneToManyHashJoin); + query_hint.force_one_to_many_hash_join = true; + if (target.isGlobalHint()) { + global_query_hint.registerHint(QueryHint::kforceOneToManyHashJoin); + global_query_hint.force_one_to_many_hash_join = true; + } + break; + } default: break; } diff --git a/QueryEngine/ResultSetIteration.cpp b/QueryEngine/ResultSetIteration.cpp index 909396c0a7..ad984a2c04 100644 --- a/QueryEngine/ResultSetIteration.cpp +++ b/QueryEngine/ResultSetIteration.cpp @@ -1503,6 +1503,138 @@ bool ResultSet::isGeoColOnGpu(const size_t col_idx) const { return device_type_ == ExecutorDeviceType::GPU; } +template +TargetValue NestedArrayToGeoTargetValue(const int8_t* buf, + const int64_t index, + const SQLTypeInfo& ti, + const ResultSet::GeoReturnType return_type) { + FlatBufferManager m{const_cast(buf)}; + const SQLTypeInfoLite* ti_lite = + 
reinterpret_cast(m.get_user_data_buffer()); + if (ti_lite->is_geoint()) { + CHECK_EQ(ti.get_compression(), kENCODING_GEOINT); + } else { + CHECK_EQ(ti.get_compression(), kENCODING_NONE); + } + FlatBufferManager::NestedArrayItem item; + auto status = m.getItem(index, item); + CHECK_EQ(status, FlatBufferManager::Status::Success); + if (!item.is_null) { + // to ensure we can access item.sizes_buffers[...] and item.sizes_lengths[...] + CHECK_EQ(item.nof_sizes, NDIM - 1); + } + switch (return_type) { + case ResultSet::GeoReturnType::WktString: { + if (item.is_null) { + return NullableString(nullptr); + } + std::vector coords; + if (ti_lite->is_geoint()) { + coords = *decompress_coords( + ti, item.values, 2 * item.nof_values * sizeof(int32_t)); + } else { + const double* values_buf = reinterpret_cast(item.values); + coords.insert(coords.end(), values_buf, values_buf + 2 * item.nof_values); + } + if constexpr (NDIM == 1) { + GeospatialGeoType obj(coords); + return NullableString(obj.getWktString()); + } else if constexpr (NDIM == 2) { + std::vector rings; + rings.insert(rings.end(), + item.sizes_buffers[0], + item.sizes_buffers[0] + item.sizes_lengths[0]); + GeospatialGeoType obj(coords, rings); + return NullableString(obj.getWktString()); + } else if constexpr (NDIM == 3) { + std::vector rings; + std::vector poly_rings; + poly_rings.insert(poly_rings.end(), + item.sizes_buffers[0], + item.sizes_buffers[0] + item.sizes_lengths[0]); + rings.insert(rings.end(), + item.sizes_buffers[1], + item.sizes_buffers[1] + item.sizes_lengths[1]); + GeospatialGeoType obj(coords, rings, poly_rings); + return NullableString(obj.getWktString()); + } else { + UNREACHABLE(); + } + } break; + case ResultSet::GeoReturnType::GeoTargetValue: { + if (item.is_null) { + return GeoTargetValue(); + } + std::vector coords; + if (ti_lite->is_geoint()) { + coords = *decompress_coords( + ti, item.values, 2 * item.nof_values * sizeof(int32_t)); + } else { + const double* values_buf = 
reinterpret_cast(item.values); + coords.insert(coords.end(), values_buf, values_buf + 2 * item.nof_values); + } + if constexpr (NDIM == 1) { + return GeoTargetValue(GeoTypeTargetValue(coords)); + } else if constexpr (NDIM == 2) { + std::vector rings; + rings.insert(rings.end(), + item.sizes_buffers[0], + item.sizes_buffers[0] + item.sizes_lengths[0]); + return GeoTargetValue(GeoTypeTargetValue(coords, rings)); + } else if constexpr (NDIM == 3) { + std::vector rings; + std::vector poly_rings; + poly_rings.insert(poly_rings.end(), + item.sizes_buffers[0], + item.sizes_buffers[0] + item.sizes_lengths[0]); + rings.insert(rings.end(), + item.sizes_buffers[1], + item.sizes_buffers[1] + item.sizes_lengths[1]); + return GeoTargetValue(GeoTypeTargetValue(coords, rings, poly_rings)); + } else { + UNREACHABLE(); + } + } break; + case ResultSet::GeoReturnType::GeoTargetValuePtr: + case ResultSet::GeoReturnType::GeoTargetValueGpuPtr: { + if (item.is_null) { + return GeoTypeTargetValuePtr(); + } + auto coords = std::make_shared( + item.nof_values * m.getValueSize(), item.values, false); + + if constexpr (NDIM == 1) { + return GeoTypeTargetValuePtr({std::move(coords)}); + } else if constexpr (NDIM == 2) { + auto rings = std::make_shared( + item.sizes_lengths[0] * sizeof(int32_t), + reinterpret_cast(item.sizes_buffers[0]), + false); + return GeoTypeTargetValuePtr({std::move(coords), std::move(rings)}); + } else if constexpr (NDIM == 3) { + auto poly_rings = std::make_shared( + item.sizes_lengths[0] * sizeof(int32_t), + reinterpret_cast(item.sizes_buffers[0]), + false); + auto rings = std::make_shared( + item.sizes_lengths[1] * sizeof(int32_t), + reinterpret_cast(item.sizes_buffers[1]), + false); + return GeoTypeTargetValuePtr( + {std::move(coords), std::move(rings), std::move(poly_rings)}); + } else { + UNREACHABLE(); + } + } break; + default: + UNREACHABLE(); + } + return TargetValue(nullptr); +} + // Reads a geo value from a series of ptrs to var len types // In Columnar format, 
geo_target_ptr is the geo column ptr (a pointer to the beginning // of that specific geo column) and should be appropriately adjusted with the @@ -1702,6 +1834,16 @@ TargetValue ResultSet::makeGeoTargetValue(const int8_t* geo_target_ptr, static_cast(varlen_buffer[getCoordsDataPtr(geo_target_ptr)].size())); } else if (col_lazy_fetch && col_lazy_fetch->is_lazily_fetched) { const auto& frag_col_buffers = getFragColBuffers(); + + auto ptr = frag_col_buffers[col_lazy_fetch->local_col_id]; + if (FlatBufferManager::isFlatBuffer(ptr)) { + int64_t index = getCoordsDataPtr(geo_target_ptr); + return NestedArrayToGeoTargetValue<1, + Geospatial::GeoLineString, + GeoLineStringTargetValue, + GeoLineStringTargetValuePtr>( + ptr, index, target_info.sql_type, geo_return_type_); + } return GeoTargetValueBuilder::build( target_info.sql_type, geo_return_type_, @@ -1741,6 +1883,16 @@ TargetValue ResultSet::makeGeoTargetValue(const int8_t* geo_target_ptr, } else if (col_lazy_fetch && col_lazy_fetch->is_lazily_fetched) { const auto& frag_col_buffers = getFragColBuffers(); + auto ptr = frag_col_buffers[col_lazy_fetch->local_col_id]; + if (FlatBufferManager::isFlatBuffer(ptr)) { + int64_t index = getCoordsDataPtr(geo_target_ptr); + return NestedArrayToGeoTargetValue<2, + Geospatial::GeoMultiLineString, + GeoMultiLineStringTargetValue, + GeoMultiLineStringTargetValuePtr>( + ptr, index, target_info.sql_type, geo_return_type_); + } + return GeoTargetValueBuilder::build( target_info.sql_type, geo_return_type_, @@ -1785,48 +1937,12 @@ TargetValue ResultSet::makeGeoTargetValue(const int8_t* geo_target_ptr, const auto& frag_col_buffers = getFragColBuffers(); auto ptr = frag_col_buffers[col_lazy_fetch->local_col_id]; if (FlatBufferManager::isFlatBuffer(ptr)) { - FlatBufferManager m{const_cast(ptr)}; - - int8_t* coords_buf = nullptr; - int64_t nof_counts{}; - bool is_null = false; - int32_t* counts = nullptr; - int64_t length{}; int64_t index = getCoordsDataPtr(geo_target_ptr); - 
m.getItemCountsAndData(index, counts, nof_counts, coords_buf, length, is_null); - - if (ResultSet::GeoReturnType::WktString == geo_return_type_) { - if (is_null) { - return NullableString("NULL"); - } - std::vector ring_sizes_vec; - ring_sizes_vec.reserve(nof_counts); - for (int i = 0; i < nof_counts; i++) { - ring_sizes_vec.push_back(counts[i]); - } - Geospatial::GeoPolygon poly(*decompress_coords( - target_info.sql_type, coords_buf, length), - ring_sizes_vec); - return NullableString(poly.getWktString()); - } else if (ResultSet::GeoReturnType::GeoTargetValuePtr == geo_return_type_) { - auto coords = std::make_shared(length, coords_buf, false); - auto rings = std::make_shared( - nof_counts * sizeof(int32_t), reinterpret_cast(counts), false); - return GeoPolyTargetValuePtr({std::move(coords), std::move(rings)}); - } else if (ResultSet::GeoReturnType::GeoTargetValue == geo_return_type_) { - std::shared_ptr> coords = - decompress_coords( - target_info.sql_type, coords_buf, length); - std::vector rings(counts, counts + nof_counts); - return GeoTargetValue(GeoPolyTargetValue(*coords, rings)); - } else if (ResultSet::GeoReturnType::GeoTargetValueGpuPtr == geo_return_type_) { - auto coords = std::make_shared(length, coords_buf, false); - auto rings = std::make_shared( - nof_counts * sizeof(int32_t), reinterpret_cast(counts), false); - return GeoPolyTargetValuePtr({std::move(coords), std::move(rings)}); - } else { - UNREACHABLE(); - } + return NestedArrayToGeoTargetValue<2, + Geospatial::GeoPolygon, + GeoPolyTargetValue, + GeoPolyTargetValuePtr>( + ptr, index, target_info.sql_type, geo_return_type_); } return GeoTargetValueBuilder::build( @@ -1875,6 +1991,15 @@ TargetValue ResultSet::makeGeoTargetValue(const int8_t* geo_target_ptr, varlen_buffer[getCoordsDataPtr(geo_target_ptr) + 2].size())); } else if (col_lazy_fetch && col_lazy_fetch->is_lazily_fetched) { const auto& frag_col_buffers = getFragColBuffers(); + auto ptr = frag_col_buffers[col_lazy_fetch->local_col_id]; + 
if (FlatBufferManager::isFlatBuffer(ptr)) { + int64_t index = getCoordsDataPtr(geo_target_ptr); + return NestedArrayToGeoTargetValue<3, + Geospatial::GeoMultiPolygon, + GeoMultiPolyTargetValue, + GeoMultiPolyTargetValuePtr>( + ptr, index, target_info.sql_type, geo_return_type_); + } return GeoTargetValueBuilder::build( target_info.sql_type, diff --git a/QueryEngine/TableFunctions/SystemFunctions/os/GDALTableFunctions.hpp b/QueryEngine/TableFunctions/SystemFunctions/os/GDALTableFunctions.hpp index 0779161dc0..62235f7dd1 100644 --- a/QueryEngine/TableFunctions/SystemFunctions/os/GDALTableFunctions.hpp +++ b/QueryEngine/TableFunctions/SystemFunctions/os/GDALTableFunctions.hpp @@ -354,9 +354,7 @@ int32_t tf_raster_contour_impl(TableFunctionManager& mgr, } // set output contour linestring - auto const* coords_int8_t = reinterpret_cast(coords.data()); - contour_features.setItem( - output_feature_index, coords_int8_t, sizeof(double) * coords.size()); + contour_features[output_feature_index].fromCoords(coords); // set output contour value contour_values[output_feature_index] = diff --git a/QueryEngine/TableFunctions/TableFunctionManager.h b/QueryEngine/TableFunctions/TableFunctionManager.h index be4116f46c..7fd738bda1 100644 --- a/QueryEngine/TableFunctions/TableFunctionManager.h +++ b/QueryEngine/TableFunctions/TableFunctionManager.h @@ -147,7 +147,10 @@ struct TableFunctionManager { switch (ti.get_type()) { case kARRAY: case kLINESTRING: - case kPOLYGON: { + case kPOLYGON: + case kMULTIPOINT: + case kMULTILINESTRING: + case kMULTIPOLYGON: { if (output_item_values_total_number_[i] == -1) { throw std::runtime_error("set_output_item_values_total_number(" + std::to_string(i) + @@ -221,7 +224,10 @@ struct TableFunctionManager { switch (ti.get_type()) { case kARRAY: case kLINESTRING: - case kPOLYGON: { + case kPOLYGON: + case kMULTIPOINT: + case kMULTILINESTRING: + case kMULTIPOLYGON: { total_number = output_item_values_total_number_[i]; break; } @@ -232,6 +238,10 @@ struct 
TableFunctionManager { << ti.toString(); } initializeFlatBuffer(m, output_num_rows_, total_number, ti); + CHECK(FlatBufferManager::isFlatBuffer(output_buffers_ptr)); + // Checks if the implementations of getFlatBufferSize and + // initializeFlatBuffer in sqltypes.h are in sync: + CHECK_EQ(m.getBufferSize(), query_mem_desc.getFlatBufferSize(i)); output_buffers_ptr = align_to_int64(output_buffers_ptr + m.getBufferSize()); } else { const size_t col_width = ti.get_size(); diff --git a/QueryEngine/TableFunctions/TableFunctionOps.cpp b/QueryEngine/TableFunctions/TableFunctionOps.cpp index 40a17ab462..420a3975cd 100644 --- a/QueryEngine/TableFunctions/TableFunctionOps.cpp +++ b/QueryEngine/TableFunctions/TableFunctionOps.cpp @@ -251,10 +251,10 @@ extern "C" DEVICE RUNTIME_EXPORT void ColumnArray_getItem(int8_t* flatbuffer, throw std::runtime_error("getItem failed: " + ::toString(status)); #endif } - status = m.setItem(index, - nullptr, - expected_numel * sizeof_T, - nullptr); // reserves a junk in array buffer + status = m.setItemOld(index, + nullptr, + expected_numel * sizeof_T, + nullptr); // reserves a junk in array buffer if (status != FlatBufferManager::Status::Success) { #ifndef __CUDACC__ throw std::runtime_error("getItem failed[setItem]: " + ::toString(status)); @@ -310,7 +310,7 @@ extern "C" DEVICE RUNTIME_EXPORT void ColumnArray_setItem(int8_t* flatbuffer, if (is_null) { status = m.setNull(index); } else { - status = m.setItem(index, ptr, size * sizeof_T); + status = m.setItemOld(index, ptr, size * sizeof_T); } #ifndef __CUDACC__ if (status != FlatBufferManager::Status::Success) { diff --git a/QueryEngine/TableFunctions/TestFunctions/GeoTestTableFunctions.cpp b/QueryEngine/TableFunctions/TestFunctions/GeoTestTableFunctions.cpp index e360745a35..7175578e3e 100644 --- a/QueryEngine/TableFunctions/TestFunctions/GeoTestTableFunctions.cpp +++ b/QueryEngine/TableFunctions/TestFunctions/GeoTestTableFunctions.cpp @@ -84,22 +84,43 @@ EXTENSION_NOINLINE int32_t 
ct_pointn__cpu_(TableFunctionManager& mgr, return size; } -EXTENSION_NOINLINE int32_t ct_copy__cpu_(TableFunctionManager& mgr, - const Column& linestrings, - Column& copied_linestrings) { - auto size = linestrings.size(); - mgr.set_output_item_values_total_number(0, linestrings.getNofValues()); +template +NEVER_INLINE HOST int32_t ct_copy__cpu_template(TableFunctionManager& mgr, + const Column& inputs, + Column& outputs) { + auto size = inputs.size(); + mgr.set_output_item_values_total_number(0, inputs.getNofValues()); mgr.set_output_row_size(size); for (int64_t i = 0; i < size; i++) { - if (linestrings.isNull(i)) { - copied_linestrings.setNull(i); + if (inputs.isNull(i)) { + outputs.setNull(i); } else { - copied_linestrings[i] = linestrings[i]; + outputs[i] = inputs[i]; } } return size; } +// explicit instantiations +template NEVER_INLINE HOST int32_t +ct_copy__cpu_template(TableFunctionManager& mgr, + const Column& inputs, + Column& outputs); + +template NEVER_INLINE HOST int32_t +ct_copy__cpu_template(TableFunctionManager& mgr, + const Column& inputs, + Column& outputs); + +template NEVER_INLINE HOST int32_t ct_copy__cpu_template(TableFunctionManager& mgr, + const Column& inputs, + Column& outputs); + +template NEVER_INLINE HOST int32_t +ct_copy__cpu_template(TableFunctionManager& mgr, + const Column& inputs, + Column& outputs); + EXTENSION_NOINLINE int32_t ct_linestringn__cpu_(TableFunctionManager& mgr, const Column& polygons, int64_t n, @@ -111,7 +132,14 @@ EXTENSION_NOINLINE int32_t ct_linestringn__cpu_(TableFunctionManager& mgr, if (polygons.isNull(i)) { linestrings.setNull(i); } else { - linestrings.setItem(i, polygons[i], n - 1); + int64_t sz = polygons[i].size(); + if (n < 1 || n > sz) { + linestrings.setNull(i); + } else { + const auto poly = polygons[i]; + const auto ring = poly[n - 1]; + linestrings.setItem(i, ring); + } } } return size; @@ -127,16 +155,23 @@ EXTENSION_NOINLINE int32_t ct_make_polygon3__cpu_(TableFunctionManager& mgr, 
mgr.set_output_item_values_total_number( 0, rings.getNofValues() + holes1.getNofValues() + holes2.getNofValues()); mgr.set_output_row_size(size); + // Initialize polygons + int count_nulls = 0; for (int64_t i = 0; i < size; i++) { if (rings.isNull(i)) { polygons.setNull(i); sizes.setNull(i); + count_nulls++; } else { std::vector> polygon_coords; polygon_coords.push_back(rings[i].toCoords()); - polygon_coords.push_back(holes1[i].toCoords()); - polygon_coords.push_back(holes2[i].toCoords()); + if (!holes1.isNull(i)) { + polygon_coords.push_back(holes1[i].toCoords()); + if (!holes2.isNull(i)) { + polygon_coords.push_back(holes2[i].toCoords()); + } + } auto polygon = polygons[i]; auto status = polygon.fromCoords(polygon_coords); @@ -145,12 +180,124 @@ EXTENSION_NOINLINE int32_t ct_make_polygon3__cpu_(TableFunctionManager& mgr, return mgr.ERROR_MESSAGE("fromCoords failed: " + ::toString(status)); } int nofpoints = 0; - for (int j = 0; j < polygon.size(); j++) { + for (size_t j = 0; j < polygon.size(); j++) { nofpoints += polygon.size(j); } sizes[i] = nofpoints; } } + + // Check polygons content + if (count_nulls == 0) { + return mgr.ERROR_MESSAGE("counting null test failed: count_nulls=" + + ::toString(count_nulls) + ", expected non-zero."); + } + + for (int64_t i = 0; i < size; i++) { + if (polygons.isNull(i)) { + count_nulls--; + } else { + std::vector> polygon_coords; + polygon_coords.push_back(rings[i].toCoords()); + if (!holes1.isNull(i)) { + polygon_coords.push_back(holes1[i].toCoords()); + } + if (!holes2.isNull(i)) { + polygon_coords.push_back(holes2[i].toCoords()); + } + + // polygons[i] is Geo::Polygon instances + // polygons[i][j] is Geo::LineString instances + // polygons[i][j][k] is Geo::Point2D instances + + auto nof_lines = polygons[i].size(); + + if (nof_lines != polygon_coords.size()) { + return mgr.ERROR_MESSAGE( + "polygon size test failed: nof_lines=" + ::toString(nof_lines) + + ", expected " + ::toString(polygon_coords.size()) + "."); + } + 
std::vector> poly_coords = polygons[i].toCoords(); + if (nof_lines != poly_coords.size()) { + return mgr.ERROR_MESSAGE( + "polygon toCoords size test failed: poly_coords.size()=" + + ::toString(poly_coords.size()) + ", expected " + ::toString(nof_lines) + "."); + } + + auto poly = polygons[i]; + + for (size_t j = 0; j < poly.size(); j++) { + Geo::LineString line = poly[j]; + std::vector line_coords = line.toCoords(); + auto nof_points = polygon_coords[j].size() / 2; + if (poly.size(j) != nof_points) { + return mgr.ERROR_MESSAGE("polygon linestring size test failed: poly.size(" + + ::toString(j) + ")=" + ::toString(poly.size(j)) + + ", expected " + ::toString(nof_points) + "."); + } + if (line.size() != nof_points) { + return mgr.ERROR_MESSAGE("polygon linestring size test failed: line.size()=" + + ::toString(line.size()) + ", expected " + + ::toString(nof_points) + "."); + } + if (poly_coords[j].size() != nof_points * 2) { + return mgr.ERROR_MESSAGE( + "polygon linestring coords size test failed: poly_coords[j].size()=" + + ::toString(poly_coords[j].size()) + ", expected " + + ::toString(nof_points * 2) + "."); + } + if (line_coords.size() != nof_points * 2) { + return mgr.ERROR_MESSAGE( + "polygon linestring coords size test failed: line_coords.size()=" + + ::toString(line_coords.size()) + ", expected " + + ::toString(nof_points * 2) + "."); + } + for (size_t k = 0; k < nof_points; k++) { + if (std::abs(polygon_coords[j][2 * k] - line_coords[2 * k]) > 1e-7) { + return mgr.ERROR_MESSAGE( + "polygon linestring X coord test failed: line_coords[2*k]=" + + ::toString(line_coords[2 * k]) + ", expected " + + ::toString(polygon_coords[j][2 * k]) + "."); + } + if (std::abs(polygon_coords[j][2 * k] - poly_coords[j][2 * k]) > 1e-7) { + return mgr.ERROR_MESSAGE("polygon X coord test failed: poly_coords[j][2*k]=" + + ::toString(poly_coords[j][2 * k]) + ", expected " + + ::toString(polygon_coords[j][2 * k]) + "."); + } + if (std::abs(polygon_coords[j][2 * k] - line[k].x) > 
1e-7) { + return mgr.ERROR_MESSAGE( + "polygon linestring X coord test failed: line[k].x=" + + ::toString(line[k].x) + ", expected " + + ::toString(polygon_coords[j][2 * k]) + "."); + } + if (std::abs(polygon_coords[j][2 * k + 1] - line_coords[2 * k + 1]) > 1e-7) { + return mgr.ERROR_MESSAGE( + "polygon linestring Y coord test failed: line_coords[2*k+1]=" + + ::toString(line_coords[2 * k + 1]) + ", expected " + + ::toString(polygon_coords[j][2 * k + 1]) + "."); + } + if (std::abs(polygon_coords[j][2 * k + 1] - poly_coords[j][2 * k + 1]) > 1e-7) { + return mgr.ERROR_MESSAGE( + "polygon Y coord test failed: poly_coords[j][2*k+1]=" + + ::toString(poly_coords[j][2 * k + 1]) + ", expected " + + ::toString(polygon_coords[j][2 * k + 1]) + "."); + } + if (std::abs(polygon_coords[j][2 * k + 1] - line[k].y) > 1e-7) { + return mgr.ERROR_MESSAGE( + "polygon linestring Y coord test failed: line[k].y=" + + ::toString(line[k].y) + ", expected " + + ::toString(polygon_coords[j][2 * k + 1]) + "."); + } + } + } + } + } + + if (count_nulls != 0) { + return mgr.ERROR_MESSAGE("counting null test failed: count_nulls=" + + ::toString(count_nulls) + ", expected 0."); + } + return size; } @@ -167,12 +314,236 @@ EXTENSION_NOINLINE int32_t ct_make_linestring2__cpu_(TableFunctionManager& mgr, if (x.isNull(i) || y.isNull(i)) { linestrings.setNull(i); } else { - double line[4] = {x[i], y[i], x[i] + dx, y[i] + dy}; - linestrings.setItem( - i, reinterpret_cast(&line[0]), 4 * sizeof(double)); + std::vector line{x[i], y[i], x[i] + dx, y[i] + dy}; + linestrings[i].fromCoords(line); + } + } + return size; +} + +EXTENSION_NOINLINE int32_t +ct_make_multipolygon__cpu_(TableFunctionManager& mgr, + const Column& polygons, + Column& mpolygons) { + auto size = polygons.size(); + mgr.set_output_item_values_total_number(0, polygons.getNofValues()); + mgr.set_output_row_size(size); + + // Initialize mpolygons + int count_nulls = 0; + for (int64_t i = 0; i < size; i++) { + if (polygons.isNull(i)) { +
mpolygons.setNull(i); + count_nulls++; + } else { + std::vector>> mpolygon_coords; + mpolygon_coords.reserve(1); + std::vector> polygon_coords = polygons[i].toCoords(); + mpolygon_coords.push_back(polygon_coords); + auto status = mpolygons[i].fromCoords(mpolygon_coords); + if (status != FlatBufferManager::Status::Success) { + return mgr.ERROR_MESSAGE("fromCoords failed: " + ::toString(status)); + } + } + } + + // Check mpolygons content + if (count_nulls == 0) { + return mgr.ERROR_MESSAGE("counting null test failed: count_nulls=" + + ::toString(count_nulls) + ", expected non-zero."); + } + + for (int64_t i = 0; i < size; i++) { + if (mpolygons.isNull(i)) { + count_nulls--; + } else { + std::vector> polygon_coords = polygons[i].toCoords(); + + // mpolygons[i] is Geo::MultiPolygon instances + // mpolygons[i][j] is Geo::Polygon instances + // mpolygons[i][j][k] is Geo::LineString instances + // mpolygons[i][j][k][l] is Geo::Point2D instances + + auto nof_polygons = mpolygons[i].size(); + if (nof_polygons != 1) { + return mgr.ERROR_MESSAGE("multipolygon size test failed: nof_polygons=" + + ::toString(nof_polygons) + ", expected 1."); + } + + std::vector>> mpolygon_coords = + mpolygons[i].toCoords(); + if (nof_polygons != mpolygon_coords.size()) { + return mgr.ERROR_MESSAGE( + "multipolygon toCoords size test failed: mpolygon_coords.size()=" + + ::toString(mpolygon_coords.size()) + ", expected " + + ::toString(nof_polygons) + "."); + } + + Geo::Polygon poly = mpolygons[i][0]; + std::vector> poly_coords = mpolygon_coords[0]; + if (poly.size() != polygon_coords.size()) { + return mgr.ERROR_MESSAGE("multipolygon polygon size test failed: poly.size()=" + + ::toString(poly.size()) + ", expected " + + ::toString(polygon_coords.size()) + "."); + } + + if (poly_coords.size() != polygon_coords.size()) { + return mgr.ERROR_MESSAGE( + "multipolygon polygon coords size test failed: poly_coords.size()=" + + ::toString(poly_coords.size()) + ", expected " + + 
::toString(polygon_coords.size()) + "."); + } + + for (size_t j = 0; j < poly.size(); j++) { + Geo::LineString line = poly[j]; + std::vector line_coords = line.toCoords(); + auto nof_points = polygon_coords[j].size() / 2; + if (poly.size(j) != nof_points) { + return mgr.ERROR_MESSAGE( + "multipolygon polygon linestring size test failed: poly.size(" + + ::toString(j) + ")=" + ::toString(poly.size(j)) + ", expected " + + ::toString(nof_points) + "."); + } + if (line.size() != nof_points) { + return mgr.ERROR_MESSAGE( + "multipolygon polygon linestring size test failed: line.size()=" + + ::toString(line.size()) + ", expected " + ::toString(nof_points) + "."); + } + if (poly_coords[j].size() != nof_points * 2) { + return mgr.ERROR_MESSAGE( + "multipolygon polygon linestring coords size test failed: " + "poly_coords[j].size()=" + + ::toString(poly_coords[j].size()) + ", expected " + + ::toString(nof_points * 2) + "."); + } + if (line_coords.size() != nof_points * 2) { + return mgr.ERROR_MESSAGE( + "multipolygon polygon linestring coords size test failed: " + "line_coords.size()=" + + ::toString(line_coords.size()) + ", expected " + + ::toString(nof_points * 2) + "."); + } + + for (size_t k = 0; k < nof_points; k++) { + if (std::abs(polygon_coords[j][2 * k] - line_coords[2 * k]) > 1e-7) { + return mgr.ERROR_MESSAGE( + "multipolygon polygon linestring X coord test failed: line_coords[2*k]=" + + ::toString(line_coords[2 * k]) + ", expected " + + ::toString(polygon_coords[j][2 * k]) + "."); + } + if (std::abs(polygon_coords[j][2 * k] - poly_coords[j][2 * k]) > 1e-7) { + return mgr.ERROR_MESSAGE( + "multipolygon polygon X coord test failed: poly_coords[j][2*k]=" + + ::toString(poly_coords[j][2 * k]) + ", expected " + + ::toString(polygon_coords[j][2 * k]) + "."); + } + if (std::abs(polygon_coords[j][2 * k] - line[k].x) > 1e-7) { + return mgr.ERROR_MESSAGE( + "multipolygon polygon linestring X coord test failed: line[k].x=" + + ::toString(line[k].x) + ", expected " + + 
::toString(polygon_coords[j][2 * k]) + "."); + } + if (std::abs(polygon_coords[j][2 * k + 1] - line_coords[2 * k + 1]) > 1e-7) { + return mgr.ERROR_MESSAGE( + "multipolygon polygon linestring Y coord test failed: " + "line_coords[2*k+1]=" + + ::toString(line_coords[2 * k + 1]) + ", expected " + + ::toString(polygon_coords[j][2 * k + 1]) + "."); + } + if (std::abs(polygon_coords[j][2 * k + 1] - poly_coords[j][2 * k + 1]) > 1e-7) { + return mgr.ERROR_MESSAGE( + "multipolygon polygon Y coord test failed: poly_coords[j][2*k+1]=" + + ::toString(poly_coords[j][2 * k + 1]) + ", expected " + + ::toString(polygon_coords[j][2 * k + 1]) + "."); + } + if (std::abs(polygon_coords[j][2 * k + 1] - line[k].y) > 1e-7) { + return mgr.ERROR_MESSAGE( + "multipolygon polygon linestring X coord test failed: line[k].y=" + + ::toString(line[k].y) + ", expected " + + ::toString(polygon_coords[j][2 * k + 1]) + "."); + } + } + } + } + } + + if (count_nulls != 0) { + return mgr.ERROR_MESSAGE("counting null test failed: count_nulls=" + + ::toString(count_nulls) + ", expected 0."); + } + + return size; +} + +EXTENSION_NOINLINE int32_t ct_polygonn__cpu_(TableFunctionManager& mgr, + const Column& mpolygons, + int64_t n, + Column& polygons) { + auto size = mpolygons.size(); + mgr.set_output_item_values_total_number(0, mpolygons.getNofValues()); + mgr.set_output_row_size(size); + for (int64_t i = 0; i < size; i++) { + if (mpolygons.isNull(i)) { + polygons.setNull(i); + } else { + polygons.setItem(i, mpolygons[i][n - 1]); + } + } + return size; +} + +EXTENSION_NOINLINE int32_t +ct_to_multilinestring__cpu_(TableFunctionManager& mgr, + const Column& polygons, + Column& mlinestrings) { + auto size = polygons.size(); + mgr.set_output_item_values_total_number(0, polygons.getNofValues()); + mgr.set_output_row_size(size); + // Initialize mlinestrings + int count_nulls = 0; + FlatBufferManager::Status status{}; + for (int64_t i = 0; i < size; i++) { + if (polygons.isNull(i)) { + mlinestrings.setNull(i); + 
count_nulls++; + } else { + std::vector> polygon_coords = polygons[i].toCoords(); + status = mlinestrings[i].fromCoords(polygon_coords); + if (status != FlatBufferManager::Status::Success) { + return mgr.ERROR_MESSAGE("fromCoords failed: " + ::toString(status)); + } + } + } + return size; +} + +EXTENSION_NOINLINE int32_t +ct_to_polygon__cpu_(TableFunctionManager& mgr, + const Column& mlinestrings, + Column& polygons) { + auto size = mlinestrings.size(); + mgr.set_output_item_values_total_number(0, mlinestrings.getNofValues()); + mgr.set_output_row_size(size); + // Initialize polygons + int count_nulls = 0; + FlatBufferManager::Status status{}; + for (int64_t i = 0; i < size; i++) { + if (mlinestrings.isNull(i)) { + polygons.setNull(i); + count_nulls++; + } else { + std::vector> polygon_coords; + status = mlinestrings[i].toCoords(polygon_coords); + if (status != FlatBufferManager::Status::Success) { + return mgr.ERROR_MESSAGE("toCoords failed: " + ::toString(status)); + } + status = polygons[i].fromCoords(polygon_coords); + if (status != FlatBufferManager::Status::Success) { + return mgr.ERROR_MESSAGE("fromCoords failed: " + ::toString(status)); + } } } return size; } -#endif // #ifndef __CUDACC__ \ No newline at end of file +#endif // #ifndef __CUDACC__ diff --git a/QueryEngine/TableFunctions/TestFunctions/TableFunctionsTesting.h b/QueryEngine/TableFunctions/TestFunctions/TableFunctionsTesting.h index c92e2f0c52..f79505aad9 100644 --- a/QueryEngine/TableFunctions/TestFunctions/TableFunctionsTesting.h +++ b/QueryEngine/TableFunctions/TestFunctions/TableFunctionsTesting.h @@ -1313,10 +1313,14 @@ EXTENSION_NOINLINE int32_t ct_require_range__cpu_(const Column& input1, UDTF: ct_coords__cpu_(TableFunctionManager, Column points) -> Column x, Column y UDTF: ct_shift__cpu_(TableFunctionManager, Column points, double x, double y) -> Column shifted UDTF: ct_pointn__cpu_(TableFunctionManager, Column linestrings, int64_t n) -> Column x, Column y - UDTF: 
ct_copy__cpu_(TableFunctionManager mgr, Column linestrings) -> Column copied_linestrings | input_id=args<0> + UDTF: ct_copy__cpu_template(TableFunctionManager mgr, Column inputs) -> Column outputs | input_id=args<0>, T=[GeoLineString, GeoMultiLineString, GeoPolygon, GeoMultiPolygon] UDTF: ct_linestringn__cpu_(TableFunctionManager, Column polygons, int64_t n) -> Column linestrings UDTF: ct_make_polygon3__cpu_(TableFunctionManager, Cursor rings, Column holes1, Column holes2>) -> Column polygons, Column sizes UDTF: ct_make_linestring2__cpu_(TableFunctionManager, Cursor x, Column y>, double dx, double dy) -> Column linestrings + UDTF: ct_make_multipolygon__cpu_(TableFunctionManager, Column polygons) -> Column mpolygons + UDTF: ct_polygonn__cpu_(TableFunctionManager, Column mpolygons, int64_t n) -> Column polygons + UDTF: ct_to_multilinestring__cpu_(TableFunctionManager, Column polygons) -> Column mlinestrings + UDTF: ct_to_polygon__cpu_(TableFunctionManager, Column mlinestrings) -> Column polygons */ // clang-format on @@ -1337,9 +1341,10 @@ EXTENSION_NOINLINE int32_t ct_pointn__cpu_(TableFunctionManager& mgr, Column& xcoords, Column& ycoords); -EXTENSION_NOINLINE int32_t ct_copy__cpu_(TableFunctionManager& mgr, - const Column& linestrings, - Column& copied_linestrings); +template +NEVER_INLINE HOST int32_t ct_copy__cpu_template(TableFunctionManager& mgr, + const Column& inputs, + Column& outputs); EXTENSION_NOINLINE int32_t ct_linestringn__cpu_(TableFunctionManager& mgr, const Column& polygons, @@ -1360,6 +1365,25 @@ EXTENSION_NOINLINE int32_t ct_make_linestring2__cpu_(TableFunctionManager& mgr, double dy, Column& linestrings); +EXTENSION_NOINLINE int32_t ct_make_multipolygon__cpu_(TableFunctionManager& mgr, + const Column& polygons, + Column& mpolygons); + +EXTENSION_NOINLINE int32_t ct_polygonn__cpu_(TableFunctionManager& mgr, + const Column& mpolygons, + int64_t n, + Column& polygons); + +EXTENSION_NOINLINE int32_t +ct_to_multilinestring__cpu_(TableFunctionManager& 
mgr, + const Column& polygons, + Column& mlinestrings); + +EXTENSION_NOINLINE int32_t +ct_to_polygon__cpu_(TableFunctionManager& mgr, + const Column& mlinestrings, + Column& polygons); + #endif // ifndef __CUDACC__ // clang-format off diff --git a/QueryEngine/TargetExprBuilder.cpp b/QueryEngine/TargetExprBuilder.cpp index 60dbca2091..b8d6381701 100644 --- a/QueryEngine/TargetExprBuilder.cpp +++ b/QueryEngine/TargetExprBuilder.cpp @@ -132,9 +132,15 @@ void TargetExprCodegen::codegen( auto agg_out_ptr_w_idx = agg_out_ptr_w_idx_in; const auto arg_expr = agg_arg(target_expr); const bool varlen_projection = is_varlen_projection(target_expr, target_info.sql_type); - /* TODO: find a better way to determine if target uses FlatBuffer storage or not. + /* TODO: find a better way to determine if target uses FlatBuffer + storage or not. Atm, we assume that target uses it whenever the + target sql type supports FlatBuffer but it may not be 100% + correct. Although, here the correctness does not matter much as + the effect of uses_flatbuffer being true is that it just disables + few checks and does not affect the functionality of codegen at + all. */ - const bool uses_flatbuffer = target_info.sql_type.get_type() == kPOLYGON; + const bool uses_flatbuffer = target_info.sql_type.supports_flatbuffer(); const auto agg_fn_names = agg_fn_base_names(target_info, varlen_projection); const auto window_func = dynamic_cast(target_expr); WindowProjectNodeContext::resetWindowFunctionContext(executor); diff --git a/QueryEngine/Utils/FlatBuffer.h b/QueryEngine/Utils/FlatBuffer.h index c1130b5ae1..56467dab37 100644 --- a/QueryEngine/Utils/FlatBuffer.h +++ b/QueryEngine/Utils/FlatBuffer.h @@ -50,7 +50,6 @@ interpretation depends on the format parameters specified above. The size of the raw data buffer depends on the format id and the user-specified parameters in the data format metadata. - All buffers above are aligned to the 64-bit boundaries. 
In summary, the memory layout of a flatbuffer is: @@ -89,86 +88,195 @@ if is_geoint is true, point coordinates are stored as integers, otherwise as double floating point numbers. - GeoLineString format specification - ---------------------------------- + NestedArray format specification + -------------------------------- - The GeoLineString format metadata and worker buffers memory layout - is described by GeoLineString and GeoLineStringWorker struct - definitions below. The raw data buffer memory layout is a follows: + NestedArray represents a storage for zero, one, two, and three + dimensional ragged arrays. The storage format consists of sizes and + values buffers (plus offset buffers to optimize accessing + items). The sizes buffer stores the sizes of ragged arrays at + various levels and the values buffer stores the values of ragged + arrays. - | | | | - = - |<-- (max nof values) * (is_geoint ? 4 : 8) bytes -->|<-- (num items + 1) * 8 bytes -->|<-- (num items) * 8 bytes-->| + The NestedArray storage is used as a uniform storage schema for + different types (variable-length arrays, geotypes, etc) with + variable dimensionality. For example, a GeoMultiPolygon - where stores points coordinates in a point-wise manner: - X0, Y0, X1, Y1, ... + GeoMultiPolygon([ + GeoPolygon([LineString([(x000, y000), (x001, y001), ...])], + LineString([(x010, y010), (x011, y011), ...])], + ...]), + GeoPolygon([LineString([(x100, y100), (x101, y101), ...])], + LineString([(x110, y110), (x111, y111), ...])], + ...]), + ... + ]) - if is_geoint is true, point coordinates are stored as integers, - otherwise as double floating point numbers. + is represented as a three dimensional ragged array where the sizes + buffer contains the number of polygons in the multi-polygon, all the + numbers of linestrings in polygons, all the numbers of points in + linestrings, and finally, the values buffer contains all the + coordinates. Note that a "value" is defined as a point with two + coordinates. 
- A line of n-th item consists of points defined by + The current implementation of NestedArray supports dimensionalities + up to 3 but the format can be extended to arbitrary dimensions. - i0 = compressed_indices[storage_indices[n]] - i1 = compressed_indices[storage_indices[n] + 1] - line_string = point_data[i0:i1] + NestedArray API + --------------- - GeoPolygon format specification - ---------------------------------- + To compute flatbuffer size required to represent a nested array with + the given dimensionsinality, total items count, estimated total + sizes and values counts, value type, and user data buffer size, + use:: - The GeoPolygon format metadata and worker buffers memory layout - is described by GeoPolygon and GeoPolygonWorker struct - definitions below. The raw data buffer memory layout is a follows: + int64_t compute_flatbuffer_size(ndims, + total_items_count, + total_sizes_count, + total_values_count, + value_type, + user_data_size) + + To initialize the provided buffer for nested array format, use:: + + Status .initialize(ndims, + total_items_count, + total_sizes_count, + total_values_count, + value_type, + null_value_ptr, + user_data_ptr, user_data_size) + + To test if the provided buffer contains an initialized FlatBuffer:: + + bool isFlatBuffer(buffer) + + To get the size of an initialized FlatBuffer:: + + int64_t getBufferSize(buffer) + int64_t .getBufferSize() + + To get the size of the values buffer:: + + size_t .getValuesBufferSize() + + To get the size of a value:: + + size_t .getValueSize() + + To get the number of specified values:: + + size_t .getValuesCount() + + To get the dimensionality of a nested array:: + + size_t .getNDims() + + To get various buffers:: + + int8_t* .get_user_data_buffer() + int8_t* .get_values_buffer() + sizes_t* .get_sizes_buffer() + offsets_t* .get_values_offsets() + offsets_t* .get_sizes_offsets() + int8_t* .getNullValuePtr() + + To test if the provided buffer contains null value:: + + bool 
.containsNullValue() + + To get the item (and subitems) of a nested array:: + + template + Status getItemWorker(const int64_t index[NDIM], + const size_t n, + int8_t*& values, + int32_t& nof_values, + int32_t* sizes_buffers[NDIM], + int32_t sizes_lengths[NDIM], + int32_t& nof_sizes, + bool& is_null) + + template + Status getItem(const int64_t index, NestedArrayItem& result) + + template + Status getItem(const int64_t index[NDIM], const size_t n, NestedArrayItem& result) + + template + Status getItem(const int64_t index, + std::vector& values, + std::vector& sizes, + bool& is_null) + + template + Status getItem(const int64_t index, + std::vector& values, + std::vector& sizes, + std::vector& sizes_of_sizes, + bool& is_null) - | | | | | | - = - |<-- (max nof points) * (is_geoint ? 4 : 8) bytes -->|<-- (max nof rings) * 4 bytes -->|<-- (max nof rings + 1) * 8 bytes -->|<-- (num items + 1) * 8 bytes -->|<-- (num items) * 8 bytes-->| - where - stores points coordinates in a point-wise manner: - X0, Y0, X1, Y1, ... If is_geoint is true, point coordinates are - stored as integers, otherwise as double floating point numbers. + To set the item of a nested array:: - contains ring sizes (in points). All entires are - non-negative. + template + Status setItemWorker(const int64_t index, + const int8_t* values, + const int32_t nof_values, + const int32_t* const sizes_buffers[NDIM], + const int32_t sizes_lengths[NDIM], + const int32_t nof_sizes) - contains the "cumulative sum" of all ring - sizes (in points). All entires are non-negative and sorted. + template + Status setItem(const int64_t index, + const int8_t* values_buf, + const int32_t nof_values) - contains the "cumulative sum" of item sizes - (in rings). Negative entries indicate null items. 
+ template + Status setItem(const int64_t index, + const int8_t* values_buf, + const int32_t nof_values, + const int32_t* sizes_buf, + const int32_t nof_sizes) - defines the order of specifying items in the flat - buffer + template + Status setItem(const int64_t index, + const int8_t* values_buf, + const int32_t nof_values, + const int32_t* sizes_buf, + const int32_t nof_sizes, + const int32_t* sizes_of_sizes_buf, + const int32_t nof_sizes_of_sizes) - Assuming that a ring consists of at least 3 points, we'll have + template + Status setItem(const int64_t index, const std::vector& arr) - * 3 <= + template + Status setItem(const int64_t index, const std::vector>& item) - A polygon of n-th item consists of rings defined by + template + Status setItem(const int64_t index, + const std::vector>>& item) - i0 = compressed_indices[storage_indices[n]] - i1 = compressed_indices[storage_indices[n] + 1] + template + Status setItem(const int64_t index, + const std::vector& values, + const std::vector& sizes) - for i in range(i0, i1): - j0 = compressed_indices2[i] - j1 = compressed_indices2[i + 1] - ring_points = values[j0:j1] + template + Status setItem(const int64_t index, + const std::vector& values, + const std::vector& sizes, + const std::vector& sizes_of_sizes) - For example, consider two polygons: + Status setNull(int64_t index) - Polygon([p0, p1 ,p2, p3], [p4, p5, p6], [p7, p8, p9]) - Polygon([r0, r1 ,r2], [r3, r4, r5], [r6, r7, r8]) + To test if the item is NULL:: - and storage_indices=[0, 1], then + Status isNull(index, bool& is_null) - compressed_indices = [0 3 6] - compressed_indices2 = [0 4 7 10 13 16 19] - values = [p0, ..., p9, r0, ..., r8] - Notice that compressed_indices2 describes the partitioning of points - into rings and compressed_indices describes the partitioning of - rings into polygons. 
FlatBuffer usage ---------------- @@ -188,26 +296,42 @@ */ // clang-format on +#ifdef FLATBUFFER_ERROR_ABORTS +#include "../../Shared/toString.h" +#define RETURN_ERROR(exc) \ + { \ + PRINT(exc); \ + abort(); \ + return (exc); \ + } +#else +#define RETURN_ERROR(exc) return (exc) +#endif + #include #ifdef HAVE_TOSTRING #include #include #endif +#include #include "../../Shared/funcannotations.h" +#define FLATBUFFER_UNREACHABLE() \ + { abort(); } + // Notice that the format value is used to recognize if a memory // buffer uses some flat buffer format or not. To minimize chances for // false positive test results, use a non-trival integer value when // introducing new formats. enum FlatBufferFormat { - VarlenArrayFormatId = 0x7661726c65634152, // hex repr of 'varlenAR' - GeoPointFormatId = 0x67656f706f696e74, // hex repr of 'geopoint' - GeoLineStringFormatId = 0x676c696e65737472, // hex repr of 'glinestr' - GeoPolygonFormatId = 0x67706f6c79676f6e, // hex repr of 'gpolygon' + VarlenArrayFormatId = 0x7661726c65634152, // hex repr of 'varlenAR' + GeoPointFormatId = 0x67656f706f696e74, // hex repr of 'geopoint' + // GeoLineStringFormatId = 0x676c696e65737472, // hex repr of 'glinestr' + // GeoPolygonFormatId = 0x67706f6c79676f6e, // hex repr of 'gpolygon' // GeoMultiPointFormatId = 0x47656f706f696e74, // hex repr of 'Geopoint' // GeoMultiLineStringFormatId = 0x476c696e65737472, // hex repr of 'Glinestr' - // GeoMultiPolygonFormatId = 0x47706f6c79676f6e, // hex repr of 'Gpolygon' + NestedArrayFormatId = 0x6e65737465644152 // hex repr of 'nestedAR' }; inline int64_t _align_to_int64(int64_t addr) { @@ -216,18 +340,80 @@ inline int64_t _align_to_int64(int64_t addr) { } struct FlatBufferManager { + enum ValueType { + Bool8, + Int8, + Int16, + Int32, + Int64, + UInt8, + UInt16, + UInt32, + UInt64, + Float32, + Float64, + PointInt32, + PointFloat64 + }; + +#ifdef HAVE_TOSTRING + static std::string toString(const ValueType& type); +#endif + + static size_t get_size(ValueType type) 
{ + switch (type) { + case Bool8: + case Int8: + case UInt8: + return 1; + case Int16: + case UInt16: + return 2; + case Int32: + case UInt32: + case Float32: + return 4; + case Int64: + case UInt64: + case Float64: + case PointInt32: + return 8; + case PointFloat64: + return 16; + } + FLATBUFFER_UNREACHABLE(); + return 0; + } + + /* + sizes_t is the type of a container size. Here we use int32_t + because Geospatial uses it as the type for the vector of ring and + polygon sizes. + + offsets_t is the type of offsets that is used to locate + sub-buffers within the FlatBuffer main buffer. Because NULL items + are encoded as negative offset values, the offsets type must be a + signed type. Hence, we define offsets_t as int64_t. + */ + + typedef int32_t sizes_t; + typedef int64_t offsets_t; + +#define FLATBUFFER_SIZES_T_VALUE_TYPE Int32 +#define FLATBUFFER_OFFSETS_T_VALUE_TYPE UInt64 + struct BaseWorker { - int64_t format_id; - int64_t flatbuffer_size; - int64_t format_metadata_offset; // the offset of the data format metadata buffer - int64_t format_worker_offset; // the offset of the data format worker buffer + FlatBufferFormat format_id; + offsets_t flatbuffer_size; + offsets_t format_metadata_offset; // the offset of the data format metadata buffer + offsets_t format_worker_offset; // the offset of the data format worker buffer #ifdef HAVE_TOSTRING std::string toString() const { std::string result = ::typeName(this) + "{"; result += "format_id=" + std::to_string(format_id); - result += ", flatbuffer_size=" + std::to_string(flatbuffer_size); - result += ", format_metadata_offset=" + std::to_string(format_metadata_offset); - result += ", format_worker_offset=" + std::to_string(format_worker_offset); + result += ",\n flatbuffer_size=" + std::to_string(flatbuffer_size); + result += ",\n format_metadata_offset=" + std::to_string(format_metadata_offset); + result += ",\n format_worker_offset=" + std::to_string(format_worker_offset); result += "}"; return result; } @@ 
-313,88 +499,50 @@ struct FlatBufferManager { #endif }; - struct GeoLineString { - int64_t total_items_count; // the total number of items - int64_t max_nof_values; // the maximum number of points in all items - int32_t input_srid; - int32_t output_srid; - bool is_geoint; -#ifdef HAVE_TOSTRING - std::string toString() const { - std::string result = ::typeName(this) + "{"; - result += "total_items_count=" + std::to_string(total_items_count); - result += ", max_nof_values=" + std::to_string(max_nof_values); - result += ", input_srid=" + std::to_string(output_srid); - result += ", output_srid=" + std::to_string(output_srid); - result += ", is_geoint=" + std::to_string(is_geoint); - result += "}"; - return result; - } -#endif - }; - - struct GeoLineStringWorker { - int64_t items_count; // the number of specified items - int64_t values_offset; - int64_t compressed_indices_offset; - int64_t storage_indices_offset; + struct NestedArrayWorker { + int64_t specified_items_count; + // all offsets are in bytes + offsets_t storage_indices_offset; + offsets_t sizes_offsets_offset; + offsets_t values_offsets_offset; + offsets_t sizes_buffer_offset; + offsets_t values_buffer_offset; + offsets_t user_data_buffer_offset; + size_t value_size; #ifdef HAVE_TOSTRING std::string toString() const { std::string result = ::typeName(this) + "{"; - result += "items_count=" + std::to_string(items_count); - result += ", values_offset=" + std::to_string(values_offset); + result += "specified_items_count=" + std::to_string(specified_items_count); + result += ",\n storage_indices_offset=" + std::to_string(storage_indices_offset); + result += ",\n sizes_offsets_offset=" + std::to_string(sizes_offsets_offset); + result += ",\n values_offsets_offset=" + std::to_string(values_offsets_offset); + result += ",\n sizes_buffer_offset=" + std::to_string(sizes_buffer_offset); + result += ",\n values_buffer_offset=" + std::to_string(values_buffer_offset); result += - ", compressed_indices_offset=" + 
std::to_string(compressed_indices_offset); - result += ", storage_indices_offset=" + std::to_string(storage_indices_offset); - result += "}"; - return result; - } -#endif - }; - - struct GeoPolygon { - int64_t total_items_count; // the total number of items - int64_t max_nof_values; // the maximum number of points in all items - int64_t max_nof_rings; // the maximum number of rings in all items - int32_t input_srid; - int32_t output_srid; - bool is_geoint; -#ifdef HAVE_TOSTRING - std::string toString() const { - std::string result = ::typeName(this) + "{"; - result += "total_items_count=" + std::to_string(total_items_count); - result += ", max_nof_values=" + std::to_string(max_nof_values); - result += ", max_nof_rings=" + std::to_string(max_nof_rings); - result += ", input_srid=" + std::to_string(output_srid); - result += ", output_srid=" + std::to_string(output_srid); - result += ", is_geoint=" + std::to_string(is_geoint); + ",\n user_data_buffer_offset=" + std::to_string(user_data_buffer_offset); + result += ",\n value_size=" + std::to_string(value_size); result += "}"; return result; } #endif }; - struct GeoPolygonWorker { - int64_t items_count; // the number of specified items - int64_t items2_count; // the number of specified rings - int64_t values_offset; - int64_t counts2_offset; - int64_t compressed_indices2_offset; - int64_t compressed_indices_offset; - int64_t storage_indices_offset; - + struct NestedArray { + size_t ndims; + int64_t total_items_count; + int64_t total_sizes_count; + int64_t total_values_count; + ValueType value_type; + size_t user_data_size; #ifdef HAVE_TOSTRING std::string toString() const { std::string result = ::typeName(this) + "{"; - result += "items_count=" + std::to_string(items_count); - result += ", items2_count=" + std::to_string(items2_count); - result += ", values_offset=" + std::to_string(values_offset); - result += ", counts2_offset=" + std::to_string(counts2_offset); - result += - ", compressed_indices2_offset=" + 
std::to_string(compressed_indices2_offset); - result += - ", compressed_indices_offset=" + std::to_string(compressed_indices_offset); - result += ", storage_indices_offset=" + std::to_string(storage_indices_offset); + result += "ndims=" + std::to_string(ndims); + result += ",\n total_items_count=" + std::to_string(total_items_count); + result += ",\n total_sizes_count=" + std::to_string(total_sizes_count); + result += ",\n total_values_count=" + std::to_string(total_values_count); + result += ",\n value_type=" + FlatBufferManager::toString(value_type); + result += ",\n user_data_size=" + std::to_string(user_data_size); result += "}"; return result; } @@ -406,17 +554,25 @@ struct FlatBufferManager { IndexError, SubIndexError, SizeError, + FlatbufferSizeError, ItemAlreadySpecifiedError, ItemUnspecifiedError, UnexpectedNullItemError, ValuesBufferTooSmallError, + SizesBufferTooSmallError, CompressedIndices2BufferTooSmallError, MemoryError, NotImplementedError, NotSupportedFormatError, + InvalidUserDataError, + DimensionalityError, + TypeError, + UserDataError, + InconsistentSizesError, UnknownFormatError }; + // FlatBuffer main buffer. It is the only member of the FlatBuffer struct. 
int8_t* buffer; // Check if a buffer contains FlatBuffer formatted data @@ -424,26 +580,28 @@ struct FlatBufferManager { if (buffer) { // warning: assume that buffer size is at least 8 bytes const auto* base = reinterpret_cast(buffer); - FlatBufferFormat header_format = static_cast(base->format_id); + FlatBufferFormat header_format = base->format_id; switch (header_format) { + case NestedArrayFormatId: case VarlenArrayFormatId: - case GeoPointFormatId: - case GeoLineStringFormatId: - case GeoPolygonFormatId: { + case GeoPointFormatId: { int64_t flatbuffer_size = base->flatbuffer_size; if (flatbuffer_size > 0) { FlatBufferFormat footer_format = static_cast( ((int64_t*)buffer)[flatbuffer_size / sizeof(int64_t) - 1]); return footer_format == header_format; } - } break; - default:; + break; + } + default: + break; } } return false; } // Return the allocation size of the the FlatBuffer storage, in bytes + // TODO?: return size_t value, 0 when not a flat buffer static int64_t getBufferSize(const void* buffer) { if (isFlatBuffer(buffer)) { return reinterpret_cast(buffer)->flatbuffer_size; @@ -453,14 +611,36 @@ struct FlatBufferManager { } // Return the allocation size of the the FlatBuffer storage, in bytes + // TODO?: int64_t -> size_t inline int64_t getBufferSize() const { return reinterpret_cast(buffer)->flatbuffer_size; } + inline bool isNestedArray() const { return format() == NestedArrayFormatId; } + + inline size_t getValueSize() const { return getNestedArrayWorker()->value_size; } + + inline size_t getValuesBufferSize() const { + const auto* metadata = getNestedArrayMetadata(); + const auto* worker = getNestedArrayWorker(); + return worker->value_size * metadata->total_values_count; + } + + inline size_t getValuesCount() const { + const auto* worker = getNestedArrayWorker(); + const auto* values_offsets = get_values_offsets(); + const auto storage_index = worker->specified_items_count; + const auto values_offset = values_offsets[storage_index]; + if (values_offset 
< 0) { + return -(values_offset + 1); + } + return values_offset; + } + // Return the format of FlatBuffer HOST DEVICE inline FlatBufferFormat format() const { const auto* base = reinterpret_cast(buffer); - return static_cast(base->format_id); + return base->format_id; } // Return the number of items @@ -470,44 +650,43 @@ struct FlatBufferManager { return getVarlenArrayMetadata()->total_items_count; case GeoPointFormatId: return getGeoPointMetadata()->total_items_count; - case GeoLineStringFormatId: - return getGeoLineStringMetadata()->total_items_count; - case GeoPolygonFormatId: - return getGeoPolygonMetadata()->total_items_count; + case NestedArrayFormatId: + return getNestedArrayMetadata()->total_items_count; + default: + break; } return -1; // invalid value } - HOST DEVICE inline int64_t items2Count() const { + // To be deprecated in favor of NestedArray format + HOST DEVICE inline int64_t valueByteSize() const { switch (format()) { case VarlenArrayFormatId: + return getVarlenArrayMetadata()->dtype_size; case GeoPointFormatId: - case GeoLineStringFormatId: + return 2 * (getGeoPointMetadata()->is_geoint ? sizeof(int32_t) : sizeof(double)); + default: break; - case GeoPolygonFormatId: - return getGeoPolygonMetadata()->max_nof_rings; } - return -1; // invalid value + return -1; } - HOST DEVICE inline int64_t dtypeSize() const { + // To be deprecated in favor of NestedArray format + HOST DEVICE inline int64_t dtypeSize() const { // TODO: use valueByteSize instead switch (format()) { case VarlenArrayFormatId: return getVarlenArrayMetadata()->dtype_size; case GeoPointFormatId: return 2 * (getGeoPointMetadata()->is_geoint ? sizeof(int32_t) : sizeof(double)); - case GeoLineStringFormatId: - return 2 * - (getGeoLineStringMetadata()->is_geoint ? sizeof(int32_t) : sizeof(double)); - case GeoPolygonFormatId: - return 2 * - (getGeoPolygonMetadata()->is_geoint ? 
sizeof(int32_t) : sizeof(double)); + default: + break; } return -1; } // VarlenArray support: + // To be deprecated in favor of NestedArray format static int64_t compute_flatbuffer_size(FlatBufferFormat format_id, const int8_t* format_metadata_ptr) { int64_t flatbuffer_size = _align_to_int64(sizeof(FlatBufferManager::BaseWorker)); @@ -539,46 +718,8 @@ struct FlatBufferManager { itemsize * format_metadata->total_items_count); // values buffer size break; } - case GeoLineStringFormatId: { - const auto format_metadata = - reinterpret_cast(format_metadata_ptr); - flatbuffer_size += _align_to_int64(sizeof(GeoLineString)); - flatbuffer_size += _align_to_int64(sizeof(GeoLineStringWorker)); - const auto itemsize = - 2 * (format_metadata->is_geoint ? sizeof(int32_t) : sizeof(double)); - flatbuffer_size += _align_to_int64( - itemsize * format_metadata->max_nof_values); // values buffer size - flatbuffer_size += - _align_to_int64(sizeof(int64_t) * (format_metadata->total_items_count + - 1)); // compressed_indices buffer size - flatbuffer_size += _align_to_int64( - sizeof(int64_t) * - (format_metadata->total_items_count)); // storage_indices buffer size - break; - } - case GeoPolygonFormatId: { - const auto format_metadata = - reinterpret_cast(format_metadata_ptr); - flatbuffer_size += _align_to_int64(sizeof(GeoPolygon)); - flatbuffer_size += _align_to_int64(sizeof(GeoPolygonWorker)); - const auto itemsize = - 2 * (format_metadata->is_geoint ? 
sizeof(int32_t) : sizeof(double)); - flatbuffer_size += _align_to_int64( - itemsize * format_metadata->max_nof_values); // values buffer size - flatbuffer_size += _align_to_int64( - sizeof(int32_t) * (format_metadata->max_nof_rings)); // counts2 buffer size - flatbuffer_size += - _align_to_int64(sizeof(int64_t) * (format_metadata->max_nof_rings + - 1)); // compressed_indices2 buffer size - flatbuffer_size += - _align_to_int64(sizeof(int64_t) * (format_metadata->total_items_count + - 1)); // compressed_indices buffer size - flatbuffer_size += _align_to_int64( - sizeof(int64_t) * - (format_metadata->total_items_count)); // storage_indices buffer size - break; - } - default:; + default: + FLATBUFFER_UNREACHABLE(); } flatbuffer_size += _align_to_int64(sizeof(int64_t)); // footer format id return flatbuffer_size; @@ -610,19 +751,162 @@ struct FlatBufferManager { return reinterpret_cast(buffer + base->format_metadata_offset); \ } + // To be deprecated in favor of NestedArray format FLATBUFFER_MANAGER_FORMAT_TOOLS(VarlenArray); FLATBUFFER_MANAGER_FORMAT_TOOLS(GeoPoint); - FLATBUFFER_MANAGER_FORMAT_TOOLS(GeoLineString); - FLATBUFFER_MANAGER_FORMAT_TOOLS(GeoPolygon); + +#define FLATBUFFER_MANAGER_FORMAT_TOOLS_NEW(TYPENAME) \ + HOST DEVICE inline NestedArrayWorker* get##TYPENAME##Worker() { \ + auto* base = getBaseWorker(); \ + return reinterpret_cast(buffer + base->format_worker_offset); \ + } \ + HOST DEVICE inline TYPENAME* get##TYPENAME##Metadata() { \ + auto* base = getBaseWorker(); \ + return reinterpret_cast(buffer + base->format_metadata_offset); \ + } \ + HOST DEVICE inline const NestedArrayWorker* get##TYPENAME##Worker() const { \ + const auto* base = getBaseWorker(); \ + return reinterpret_cast(buffer + \ + base->format_worker_offset); \ + } \ + HOST DEVICE inline const TYPENAME* get##TYPENAME##Metadata() const { \ + const auto* base = getBaseWorker(); \ + return reinterpret_cast(buffer + base->format_metadata_offset); \ + } + + 
FLATBUFFER_MANAGER_FORMAT_TOOLS(NestedArray); #undef FLATBUFFER_MANAGER_FORMAT_TOOLS +#undef FLATBUFFER_MANAGER_FORMAT_TOOLS_NEW + +#define FLATBUFFER_MANAGER_SET_OFFSET(OBJ, NAME, SIZE) \ + offset = OBJ->NAME##_offset = offset + _align_to_int64(previous_size); \ + previous_size = SIZE; + + static int64_t computeBufferSizeNestedArray(int64_t ndims, + int64_t total_items_count, + int64_t total_sizes_count, + int64_t total_values_count, + ValueType value_type, + size_t user_data_size) { + size_t value_size = get_size(value_type); + offsets_t flatbuffer_size = _align_to_int64(sizeof(FlatBufferManager::BaseWorker)); + flatbuffer_size += _align_to_int64(sizeof(NestedArray)); + flatbuffer_size += _align_to_int64(sizeof(NestedArrayWorker)); + flatbuffer_size += + _align_to_int64(value_size * (total_values_count + 1)); // values buffer + flatbuffer_size += + _align_to_int64(sizeof(sizes_t) * total_sizes_count); // sizes buffer + flatbuffer_size += + _align_to_int64(sizeof(offsets_t) * (total_items_count + 1)); // values offsets + flatbuffer_size += _align_to_int64(sizeof(offsets_t) * + (total_items_count * ndims + 1)); // sizes offsets + flatbuffer_size += _align_to_int64( + sizeof(sizes_t) * total_items_count); // storage indices, must use signed type + flatbuffer_size += _align_to_int64(user_data_size); // user data + flatbuffer_size += _align_to_int64(sizeof(int64_t)); // format id + return flatbuffer_size; + } + Status initializeNestedArray(int64_t ndims, + int64_t total_items_count, + int64_t total_sizes_count, + int64_t total_values_count, + ValueType value_type, + const int8_t* null_value_ptr, + const int8_t* user_data_ptr, + size_t user_data_size) { + auto* base = getBaseWorker(); + base->format_id = NestedArrayFormatId; + size_t value_size = get_size(value_type); + base->flatbuffer_size = computeBufferSizeNestedArray(ndims, + total_items_count, + total_sizes_count, + total_values_count, + value_type, + user_data_size); + offsets_t offset = 0; + size_t 
previous_size = sizeof(FlatBufferManager::BaseWorker); + FLATBUFFER_MANAGER_SET_OFFSET(base, format_metadata, sizeof(NestedArray)); + FLATBUFFER_MANAGER_SET_OFFSET(base, format_worker, sizeof(NestedArrayWorker)); + + auto* metadata = getNestedArrayMetadata(); + metadata->ndims = ndims; + metadata->total_items_count = total_items_count; + metadata->total_sizes_count = total_sizes_count; + metadata->total_values_count = total_values_count; + metadata->value_type = value_type; + metadata->user_data_size = user_data_size; + + auto* worker = getNestedArrayWorker(); + worker->specified_items_count = 0; + worker->value_size = value_size; + + FLATBUFFER_MANAGER_SET_OFFSET( + worker, values_buffer, value_size * (total_values_count + 1)); + FLATBUFFER_MANAGER_SET_OFFSET( + worker, sizes_buffer, sizeof(sizes_t) * total_sizes_count); + FLATBUFFER_MANAGER_SET_OFFSET( + worker, values_offsets, sizeof(offsets_t) * (total_items_count + 1)); + FLATBUFFER_MANAGER_SET_OFFSET( + worker, sizes_offsets, sizeof(offsets_t) * (total_items_count * ndims + 1)); + FLATBUFFER_MANAGER_SET_OFFSET( + worker, storage_indices, sizeof(sizes_t) * total_items_count); + FLATBUFFER_MANAGER_SET_OFFSET(worker, user_data_buffer, user_data_size); + + if (base->flatbuffer_size != + offset + _align_to_int64(previous_size) + _align_to_int64(sizeof(int64_t))) { + RETURN_ERROR(FlatbufferSizeError); + } + + offsets_t* values_offsets = get_values_offsets(); + offsets_t* sizes_offsets = get_sizes_offsets(); + values_offsets[0] = 0; + sizes_offsets[0] = 0; + sizes_t* storage_indices = get_storage_indices(); + for (int i = 0; i < total_items_count; i++) { + storage_indices[i] = -1; + } + + // the last value in values_buffer stores a null value: + int8_t* null_value_buffer = get_values_buffer() + value_size * total_values_count; + if (null_value_ptr != nullptr) { + if (memcpy(null_value_buffer, null_value_ptr, value_size) == nullptr) { + RETURN_ERROR(MemoryError); + } + } else { + if (memset(null_value_buffer, 0, 
value_size) == nullptr) { + RETURN_ERROR(MemoryError); + } + } + + if (user_data_size > 0 && user_data_ptr != nullptr) { + int8_t* user_data_buffer = get_user_data_buffer(); + if (memcpy(user_data_buffer, user_data_ptr, user_data_size) == nullptr) { + RETURN_ERROR(MemoryError); + } + } + + ((int64_t*)buffer)[base->flatbuffer_size / sizeof(int64_t) - 1] = + static_cast(base->format_id); + + if (isFlatBuffer(buffer)) { + // make sure that initialization leads to a valid FlatBuffer + return Success; + } + RETURN_ERROR(UnknownFormatError); + } + + // To be deprecated in favor of NestedArray format void initialize(FlatBufferFormat format_id, const int8_t* format_metadata_ptr) { auto* base = getBaseWorker(); base->format_id = format_id; base->flatbuffer_size = compute_flatbuffer_size(format_id, format_metadata_ptr); base->format_metadata_offset = _align_to_int64(sizeof(FlatBufferManager::BaseWorker)); switch (format_id) { + case NestedArrayFormatId: + FLATBUFFER_UNREACHABLE(); + break; case VarlenArrayFormatId: { base->format_worker_offset = base->format_metadata_offset + @@ -678,100 +962,6 @@ struct FlatBufferManager { base->format_worker_offset + _align_to_int64(sizeof(GeoPointWorker)); break; } - case GeoLineStringFormatId: { - base->format_worker_offset = - base->format_metadata_offset + _align_to_int64(sizeof(GeoLineString)); - - const auto* format_metadata = - reinterpret_cast(format_metadata_ptr); - auto* this_metadata = getGeoLineStringMetadata(); - this_metadata->total_items_count = format_metadata->total_items_count; - this_metadata->max_nof_values = format_metadata->max_nof_values; - this_metadata->input_srid = format_metadata->input_srid; - this_metadata->output_srid = format_metadata->output_srid; - this_metadata->is_geoint = format_metadata->is_geoint; - - const auto itemsize = - 2 * (this_metadata->is_geoint ? 
sizeof(int32_t) : sizeof(double)); - - auto* this_worker = getGeoLineStringWorker(); - this_worker->items_count = 0; - this_worker->values_offset = - base->format_worker_offset + _align_to_int64(sizeof(GeoLineStringWorker)); - this_worker->compressed_indices_offset = - this_worker->values_offset + - _align_to_int64(itemsize * this_metadata->max_nof_values); - ; - this_worker->storage_indices_offset = - this_worker->compressed_indices_offset + - _align_to_int64(sizeof(int64_t) * (this_metadata->total_items_count + 1)); - - int64_t* compressed_indices = - reinterpret_cast(buffer + this_worker->compressed_indices_offset); - int64_t* storage_indices = - reinterpret_cast(buffer + this_worker->storage_indices_offset); - for (int i = 0; i < this_metadata->total_items_count; i++) { - compressed_indices[i] = 0; - storage_indices[i] = -1; - } - compressed_indices[this_metadata->total_items_count] = 0; - break; - } - case GeoPolygonFormatId: { - base->format_worker_offset = - base->format_metadata_offset + _align_to_int64(sizeof(GeoPolygon)); - - const auto* format_metadata = - reinterpret_cast(format_metadata_ptr); - auto* this_metadata = getGeoPolygonMetadata(); - this_metadata->total_items_count = format_metadata->total_items_count; - this_metadata->max_nof_values = format_metadata->max_nof_values; - this_metadata->max_nof_rings = format_metadata->max_nof_rings; - this_metadata->input_srid = format_metadata->input_srid; - this_metadata->output_srid = format_metadata->output_srid; - this_metadata->is_geoint = format_metadata->is_geoint; - - const auto itemsize = - 2 * (this_metadata->is_geoint ? 
sizeof(int32_t) : sizeof(double)); - - auto* this_worker = getGeoPolygonWorker(); - this_worker->items_count = 0; - this_worker->items2_count = 0; - this_worker->values_offset = - base->format_worker_offset + _align_to_int64(sizeof(GeoPolygonWorker)); - - this_worker->counts2_offset = - this_worker->values_offset + - _align_to_int64(itemsize * this_metadata->max_nof_values); - - this_worker->compressed_indices2_offset = - this_worker->counts2_offset + - _align_to_int64(sizeof(int32_t) * this_metadata->max_nof_rings); - - this_worker->compressed_indices_offset = - this_worker->compressed_indices2_offset + - _align_to_int64(sizeof(int64_t) * (this_metadata->max_nof_rings + 1)); - ; - this_worker->storage_indices_offset = - this_worker->compressed_indices_offset + - _align_to_int64(sizeof(int64_t) * (this_metadata->total_items_count + 1)); - - int32_t* counts2 = get_counts2(); - int64_t* compressed_indices2 = get_compressed_indices2(); - int64_t* compressed_indices = get_compressed_indices(); - int64_t* storage_indices = get_storage_indices(); - for (int i = 0; i < this_metadata->max_nof_rings; i++) { - counts2[i] = 0; - compressed_indices2[i] = 0; - } - compressed_indices2[this_metadata->max_nof_rings] = 0; - for (int i = 0; i < this_metadata->total_items_count; i++) { - compressed_indices[i] = 0; - storage_indices[i] = -1; - } - compressed_indices[this_metadata->total_items_count] = 0; - break; - } } ((int64_t*)buffer)[base->flatbuffer_size / sizeof(int64_t) - 1] = static_cast(format_id); @@ -779,28 +969,35 @@ struct FlatBufferManager { // Low-level API + inline size_t getNDims() const { + if (isNestedArray()) { + return getNestedArrayMetadata()->ndims; + } + FLATBUFFER_UNREACHABLE(); + return 0; + } + // Return the upper bound to the total number of points in all items + // To be deprecated in favor of NestedArray format inline int64_t get_max_nof_values() const { switch (format()) { case VarlenArrayFormatId: return getVarlenArrayMetadata()->max_nof_values; case 
GeoPointFormatId: return getGeoPointMetadata()->total_items_count; - case GeoLineStringFormatId: - return getGeoLineStringMetadata()->max_nof_values; - case GeoPolygonFormatId: - return getGeoPolygonMetadata()->max_nof_values; + default: + break; } return -1; } // Return the total number of values in all specified items + // To be deprecated in favor of NestedArray format inline int64_t get_nof_values() const { switch (format()) { - case GeoPolygonFormatId: { - const int64_t storage_count2 = get_storage_count2(); - const int64_t* compressed_indices2 = get_compressed_indices2(); - return compressed_indices2[storage_count2]; + case NestedArrayFormatId: { + FLATBUFFER_UNREACHABLE(); + break; } default: { const int64_t storage_count = get_storage_count(); @@ -811,64 +1008,36 @@ struct FlatBufferManager { } // Return the number of specified items + // To be deprecated in favor of NestedArray format HOST DEVICE inline int64_t& get_storage_count() { switch (format()) { case VarlenArrayFormatId: return getVarlenArrayWorker()->items_count; case GeoPointFormatId: return getGeoPointMetadata()->total_items_count; - case GeoLineStringFormatId: - return getGeoLineStringWorker()->items_count; - case GeoPolygonFormatId: - return getGeoPolygonWorker()->items_count; + default: + break; } static int64_t dummy_storage_count = -1; return dummy_storage_count; } + // To be deprecated in favor of NestedArray format inline const int64_t& get_storage_count() const { switch (format()) { case VarlenArrayFormatId: return getVarlenArrayWorker()->items_count; case GeoPointFormatId: return getGeoPointMetadata()->total_items_count; - case GeoLineStringFormatId: - return getGeoLineStringWorker()->items_count; - case GeoPolygonFormatId: - return getGeoPolygonWorker()->items_count; + default: + break; } static int64_t dummy = -1; return dummy; } - // Return the number of specified blocks - HOST DEVICE inline int64_t& get_storage_count2() { - switch (format()) { - case GeoPolygonFormatId: - return 
getGeoPolygonWorker()->items2_count; - case VarlenArrayFormatId: - case GeoPointFormatId: - case GeoLineStringFormatId: - break; - } - static int64_t dummy_storage_count = -1; - return dummy_storage_count; - } - - inline const int64_t& get_storage_count2() const { - switch (format()) { - case GeoPolygonFormatId: - return getGeoPolygonWorker()->items2_count; - case VarlenArrayFormatId: - case GeoPointFormatId: - case GeoLineStringFormatId: - break; - } - static int64_t dummy_storage_count = -1; - return dummy_storage_count; - } - // Return the size of values buffer in bytes + // To be deprecated in favor of NestedArray format inline int64_t get_values_buffer_size() const { switch (format()) { case VarlenArrayFormatId: { @@ -881,36 +1050,15 @@ struct FlatBufferManager { 2 * (metadata->is_geoint ? sizeof(int32_t) : sizeof(double)); return _align_to_int64(itemsize * metadata->total_items_count); } - case GeoLineStringFormatId: { - const auto* worker = getGeoLineStringWorker(); - return worker->compressed_indices_offset - worker->values_offset; - } - case GeoPolygonFormatId: { - const auto* worker = getGeoPolygonWorker(); - return worker->compressed_indices2_offset - worker->values_offset; - } - } - static int64_t dummy = -1; - return dummy; - } - - // Return the size of compressed_indices2 buffer in bytes - inline int64_t get_compressed_indices2_buffer_size() const { - switch (format()) { - case GeoPolygonFormatId: { - const auto* worker = getGeoPolygonWorker(); - return worker->compressed_indices_offset - worker->compressed_indices2_offset; - } - case VarlenArrayFormatId: - case GeoPointFormatId: - case GeoLineStringFormatId: - break; + default: + break; } static int64_t dummy = -1; return dummy; } // Return the pointer to values buffer + // To be deprecated in favor of NestedArray format HOST DEVICE inline int8_t* get_values() { int64_t offset = 0; switch (format()) { @@ -920,18 +1068,13 @@ struct FlatBufferManager { case GeoPointFormatId: offset = 
getGeoPointWorker()->values_offset; break; - case GeoLineStringFormatId: - offset = getGeoLineStringWorker()->values_offset; - break; - case GeoPolygonFormatId: - offset = getGeoPolygonWorker()->values_offset; - break; default: return nullptr; } return buffer + offset; } + // To be deprecated in favor of NestedArray format inline const int8_t* get_values() const { int64_t offset = 0; switch (format()) { @@ -941,61 +1084,111 @@ struct FlatBufferManager { case GeoPointFormatId: offset = getGeoPointWorker()->values_offset; break; - case GeoLineStringFormatId: - offset = getGeoLineStringWorker()->values_offset; - break; - case GeoPolygonFormatId: - offset = getGeoPolygonWorker()->values_offset; - break; default: return nullptr; } return buffer + offset; } - // Return the pointer to counts2 buffer - HOST DEVICE inline int32_t* get_counts2() { + // Return the pointer to compressed indices buffer + // To be deprecated in favor of NestedArray format + HOST DEVICE inline int64_t* get_compressed_indices() { int64_t offset = 0; switch (format()) { - case GeoPolygonFormatId: - offset = getGeoPolygonWorker()->counts2_offset; + case VarlenArrayFormatId: + offset = getVarlenArrayWorker()->compressed_indices_offset; break; default: return nullptr; } - return reinterpret_cast(buffer + offset); + return reinterpret_cast(buffer + offset); } - inline const int32_t* get_counts2() const { + // To be deprecated in favor of NestedArray format + inline const int64_t* get_compressed_indices() const { int64_t offset = 0; switch (format()) { - case GeoPolygonFormatId: - offset = getGeoPolygonWorker()->counts2_offset; + case VarlenArrayFormatId: + offset = getVarlenArrayWorker()->compressed_indices_offset; break; default: return nullptr; } - return reinterpret_cast(buffer + offset); + return reinterpret_cast(buffer + offset); } - // Return the pointer to compressed indices2 buffer - HOST DEVICE inline int64_t* get_compressed_indices2() { - int64_t offset = 0; - switch (format()) { - case 
GeoPolygonFormatId: - offset = getGeoPolygonWorker()->compressed_indices2_offset; - break; - default: - return nullptr; +#define FLATBUFFER_GET_BUFFER_METHODS(BUFFERNAME, BUFFERTYPE) \ + HOST DEVICE inline BUFFERTYPE* get_##BUFFERNAME() { \ + int64_t offset = 0; \ + switch (format()) { \ + case NestedArrayFormatId: \ + offset = getNestedArrayWorker()->BUFFERNAME##_offset; \ + break; \ + default: \ + return nullptr; \ + } \ + return reinterpret_cast(buffer + offset); \ + } \ + inline const BUFFERTYPE* get_##BUFFERNAME() const { \ + int64_t offset = 0; \ + switch (format()) { \ + case NestedArrayFormatId: \ + offset = getNestedArrayWorker()->BUFFERNAME##_offset; \ + break; \ + default: \ + return nullptr; \ + } \ + return reinterpret_cast(buffer + offset); \ + } + + FLATBUFFER_GET_BUFFER_METHODS(user_data_buffer, int8_t); + FLATBUFFER_GET_BUFFER_METHODS(values_buffer, int8_t); + FLATBUFFER_GET_BUFFER_METHODS(sizes_buffer, sizes_t); + FLATBUFFER_GET_BUFFER_METHODS(values_offsets, offsets_t); + FLATBUFFER_GET_BUFFER_METHODS(sizes_offsets, offsets_t); + +#undef FLATBUFFER_GET_BUFFER_METHODS + + inline const int8_t* getNullValuePtr() const { + if (isNestedArray()) { + return get_values_buffer() + getValuesBufferSize(); } - return reinterpret_cast(buffer + offset); + return nullptr; + } + + inline bool containsNullValue(const int8_t* value_ptr) const { + const int8_t* null_value_ptr = getNullValuePtr(); + if (null_value_ptr != nullptr) { + switch (getValueSize()) { + case 1: + return *null_value_ptr == *value_ptr; + case 2: + return *reinterpret_cast(null_value_ptr) == + *reinterpret_cast(value_ptr); + case 4: + return *reinterpret_cast(null_value_ptr) == + *reinterpret_cast(value_ptr); + case 8: + return *reinterpret_cast(null_value_ptr) == + *reinterpret_cast(value_ptr); + case 16: + return (*reinterpret_cast(null_value_ptr) == + *reinterpret_cast(value_ptr) && + *(reinterpret_cast(null_value_ptr) + 1) == + *(reinterpret_cast(value_ptr) + 1)); + default: + break; + } + 
} + return false; } - inline const int64_t* get_compressed_indices2() const { + // To be deprecated in favor of NestedArray format + HOST DEVICE inline int64_t* get_storage_indices_old() { int64_t offset = 0; switch (format()) { - case GeoPolygonFormatId: - offset = getGeoPolygonWorker()->compressed_indices2_offset; + case VarlenArrayFormatId: + offset = getVarlenArrayWorker()->storage_indices_offset; break; default: return nullptr; @@ -1003,18 +1196,12 @@ struct FlatBufferManager { return reinterpret_cast(buffer + offset); } - // Return the pointer to compressed indices buffer - HOST DEVICE inline int64_t* get_compressed_indices() { + // To be deprecated in favor of NestedArray format + inline const int64_t* get_storage_indices_old() const { int64_t offset = 0; switch (format()) { case VarlenArrayFormatId: - offset = getVarlenArrayWorker()->compressed_indices_offset; - break; - case GeoLineStringFormatId: - offset = getGeoLineStringWorker()->compressed_indices_offset; - break; - case GeoPolygonFormatId: - offset = getGeoPolygonWorker()->compressed_indices_offset; + offset = getVarlenArrayWorker()->storage_indices_offset; break; default: return nullptr; @@ -1022,124 +1209,786 @@ struct FlatBufferManager { return reinterpret_cast(buffer + offset); } - inline const int64_t* get_compressed_indices() const { - int64_t offset = 0; + HOST DEVICE inline sizes_t* get_storage_indices() { + offsets_t offset = 0; switch (format()) { - case VarlenArrayFormatId: - offset = getVarlenArrayWorker()->compressed_indices_offset; - break; - case GeoLineStringFormatId: - offset = getGeoLineStringWorker()->compressed_indices_offset; - break; - case GeoPolygonFormatId: - offset = getGeoPolygonWorker()->compressed_indices_offset; + case NestedArrayFormatId: + offset = getNestedArrayWorker()->storage_indices_offset; break; default: return nullptr; } - return reinterpret_cast(buffer + offset); + return reinterpret_cast(buffer + offset); } - // Return the pointer to storage indices buffer - 
HOST DEVICE inline int64_t* get_storage_indices() { - int64_t offset = 0; + inline const sizes_t* get_storage_indices() const { + offsets_t offset = 0; switch (format()) { - case VarlenArrayFormatId: - offset = getVarlenArrayWorker()->storage_indices_offset; - break; - case GeoLineStringFormatId: - offset = getGeoLineStringWorker()->storage_indices_offset; - break; - case GeoPolygonFormatId: - offset = getGeoPolygonWorker()->storage_indices_offset; + case NestedArrayFormatId: + offset = getNestedArrayWorker()->storage_indices_offset; break; default: return nullptr; } - return reinterpret_cast(buffer + offset); + return reinterpret_cast(buffer + offset); } - inline const int64_t* get_storage_indices() const { - int64_t offset = 0; - switch (format()) { - case VarlenArrayFormatId: - offset = getVarlenArrayWorker()->storage_indices_offset; + inline sizes_t get_storage_index(const int64_t index) const { + return get_storage_indices()[index]; + } + + // High-level API + + // This getLength method is a worker method of accessing the + // flatbuffer content. 
+ template + Status getLength(const int64_t index[NDIM], const size_t n, size_t& length) const { + if (!isNestedArray()) { + RETURN_ERROR(NotSupportedFormatError); + } + const size_t ndims = getNDims(); + if (n == 0) { + length = itemsCount(); + return Success; + } + if (n > ndims + 1) { + RETURN_ERROR(DimensionalityError); + } + const auto storage_index = get_storage_index(index[0]); + const auto* values_offsets = get_values_offsets(); + const auto values_offset = values_offsets[storage_index]; + if (values_offset < 0) { // NULL item + length = 0; + return Success; + } + const auto* sizes_offsets = get_sizes_offsets(); + const auto* sizes_buffer = get_sizes_buffer(); + const auto sizes_offset = sizes_offsets[storage_index * ndims]; + switch (n) { + case 1: { + length = sizes_buffer[sizes_offset]; + } break; + case 2: { + const auto sizes2_offset = sizes_offsets[storage_index * ndims + 1]; + if (index[1] < 0 || index[1] >= sizes_buffer[sizes_offset]) { + RETURN_ERROR(SubIndexError); + } + length = sizes_buffer[sizes2_offset + index[1]]; + } break; + case 3: { + const auto sizes2_offset = sizes_offsets[storage_index * ndims + 1]; + const auto sizes3_offset = sizes_offsets[storage_index * ndims + 2]; + if (index[1] < 0 || index[1] >= sizes_buffer[sizes_offset]) { + RETURN_ERROR(SubIndexError); + } + if (index[2] < 0 || index[2] >= sizes_buffer[sizes2_offset + index[1]]) { + RETURN_ERROR(SubIndexError); + } + offsets_t soffset = 0; + for (int64_t i = 0; i < index[1]; i++) { + soffset += sizes_buffer[sizes2_offset + i]; + } + length = sizes_buffer[sizes3_offset + soffset + index[2]]; + } break; + default: + RETURN_ERROR(NotImplementedError); break; - case GeoLineStringFormatId: - offset = getGeoLineStringWorker()->storage_indices_offset; + } + return Success; + } + + // This getItem method is a worker method of accessing the + // flatbuffer content. 
+ template + Status getItemWorker(const int64_t index[NDIM], + const size_t n, + int8_t*& values, + int32_t& nof_values, + int32_t* sizes_buffers[NDIM], + int32_t sizes_lengths[NDIM], + int32_t& nof_sizes, + bool& is_null) { + if (format() != NestedArrayFormatId) { + RETURN_ERROR(NotSupportedFormatError); + } + + const size_t ndims = getNDims(); + if (n <= 0 || n > ndims + 1) { + RETURN_ERROR(DimensionalityError); + } + // clang-format off + /* + multipolygon (ndims == 3): + + n == 0 means return a column of multipolygons: flatbuffer, getLength returns + itemsCount() + + n == 1 means return a multipolygon: values, sizes(=sizes_buffers[1]), + sizes_of_sizes(=sizes_buffers[0]), getLength returns + len(sizes_of_sizes)(=sizes_lengths[0]) + + n == 2 means return a polygon: values, sizes, getLength + returns len(sizes) + + n == 3 means return a linestring: values, getLength returns + len(values) + + n == 4 means return a point: value, getLength returns 0 [NOTIMPL] + + polygon/multilinestring (ndims == 2): + + n == 0 means return a column of polygons/multilinestring: + flatbuffer, getLength returns itemsCount() + + n == 1 means return a polygon/multilinestring: values, sizes, + getLength returns len(sizes) + + n == 2 means return a linestring: values, getLength + returns len(values) + + n == 3 means return a point: value, getLength returns 0 [NOTIMPL] + + linestring/multipoint (ndims == 1): + + n == 0 means return a column of linestring/multipoint: + flatbuffer, getLength returns itemsCount() + + n == 1 means return a linestring: values, getLength returns + len(values) + + n == 2 means return a point: value, getLength returns 0 [NOTIMPL] + + */ + // clang-format on + const auto storage_index = get_storage_index(index[0]); + const auto* values_offsets = get_values_offsets(); + const auto values_offset = values_offsets[storage_index]; + if (values_offset < 0) { + values = nullptr; + nof_values = 0; + nof_sizes = 0; + is_null = true; + return Success; + } + is_null = false;
+ const auto* sizes_offsets = get_sizes_offsets(); + auto* sizes_buffer = get_sizes_buffer(); + auto* values_buffer = get_values_buffer(); + const auto valuesize = getValueSize(); + const auto next_values_offset = values_offsets[storage_index + 1]; + + const auto sizes_offset = sizes_offsets[storage_index * ndims]; + nof_sizes = ndims - n; + switch (n) { + case 1: { + if (next_values_offset < 0) { + nof_values = -(next_values_offset + 1) - values_offset; + } else { + nof_values = next_values_offset - values_offset; + } + values = values_buffer + values_offset * valuesize; + switch (ndims) { + case 3: { + const auto sizes2_offset = sizes_offsets[storage_index * ndims + 1]; + const auto sizes3_offset = sizes_offsets[storage_index * ndims + 2]; + sizes_buffers[0] = sizes_buffer + sizes2_offset; + sizes_buffers[1] = sizes_buffer + sizes3_offset; + sizes_lengths[0] = sizes_buffer[sizes_offset]; + sizes_lengths[1] = sizes_offsets[storage_index * ndims + 3] - sizes3_offset; + break; + } + case 2: { + const auto sizes2_offset = sizes_offsets[storage_index * ndims + 1]; + sizes_buffers[0] = sizes_buffer + sizes2_offset; + sizes_lengths[0] = sizes_buffer[sizes_offset]; + break; + } + case 1: + break; + default: + FLATBUFFER_UNREACHABLE(); + break; + } + break; + } + case 2: { + const auto sizes2_offset = sizes_offsets[storage_index * ndims + 1]; + if (index[1] < 0 || index[1] >= sizes_buffer[sizes_offset]) { + RETURN_ERROR(SubIndexError); + } + offsets_t soffset = 0; + for (int64_t i = 0; i < index[1]; i++) { + soffset += sizes_buffer[sizes2_offset + i]; + } + values = values_buffer + (values_offset + soffset) * valuesize; + switch (ndims) { + case 3: { + const auto sizes3_offset = sizes_offsets[storage_index * ndims + 2]; + const sizes_t nsizes = sizes_buffer[sizes2_offset + index[1]]; + auto sizes_buf = sizes_buffer + sizes3_offset + soffset; + sizes_buffers[0] = sizes_buf; + sizes_lengths[0] = nsizes; + nof_values = 0; + for (int64_t i = 0; i < nsizes; i++) { + nof_values 
+= sizes_buf[i]; + } + break; + } + case 2: { + nof_values = sizes_buffer[sizes2_offset + index[1]]; + break; + } + default: + FLATBUFFER_UNREACHABLE(); + break; + } break; - case GeoPolygonFormatId: - offset = getGeoPolygonWorker()->storage_indices_offset; + } + case 3: { + if (ndims != 3) { + RETURN_ERROR(NotImplementedError); + } + const auto sizes2_offset = sizes_offsets[storage_index * ndims + 1]; + const auto sizes3_offset = sizes_offsets[storage_index * ndims + 2]; + if (index[1] < 0 || index[1] >= sizes_buffer[sizes_offset]) { + RETURN_ERROR(SubIndexError); + } + if (index[2] < 0 || index[2] >= sizes_buffer[sizes2_offset + index[1]]) { + RETURN_ERROR(SubIndexError); + } + + int64_t i3 = 0; + int64_t soffset = 0; + int64_t voffset = 0; + for (int64_t i = 0; i < index[1]; i++) { + auto size2 = sizes_buffer[sizes2_offset + i]; + soffset += size2; + for (int64_t j = 0; j < size2; j++) { + voffset += sizes_buffer[sizes3_offset + i3]; + i3++; + } + } + for (int64_t j = 0; j < index[2]; j++) { + voffset += sizes_buffer[sizes3_offset + i3]; + i3++; + } + values = values_buffer + (values_offset + voffset) * valuesize; + nof_values = sizes_buffer[sizes3_offset + soffset + index[2]]; break; + } default: - return nullptr; + RETURN_ERROR(NotImplementedError); + break; } - return reinterpret_cast(buffer + offset); + return Success; } - // High-level API + template + struct NestedArrayItem { + int8_t* values; + int32_t nof_values; + int32_t* sizes_buffers[NDIM]; + int32_t sizes_lengths[NDIM]; + int32_t nof_sizes; + bool is_null; + }; + + template + Status getItem(const int64_t index, NestedArrayItem& result) { + const int64_t index_[NDIM] = {index}; + return getItem(index_, 1, result); + } + + template + Status getItem(const int64_t index[NDIM], const size_t n, NestedArrayItem& result) { + return getItemWorker(index, n, + result.values, + result.nof_values, + result.sizes_buffers, + result.sizes_lengths, + result.nof_sizes, + result.is_null); + } + + // This setItem 
method is a worker method of initializing the + // flatbuffer content. It can be called once per index value. + template + Status setItemWorker(const int64_t index, + const int8_t* values, + const int32_t nof_values, + const int32_t* const sizes_buffers[NDIM], + const int32_t sizes_lengths[NDIM], + const int32_t nof_sizes) { + if (format() != NestedArrayFormatId) { + RETURN_ERROR(NotSupportedFormatError); + } + if (index < 0 || index >= itemsCount()) { + RETURN_ERROR(IndexError); + } + const int32_t ndims = getNDims(); + if (nof_sizes + 1 != ndims) { + RETURN_ERROR(DimensionalityError); + } + + auto* storage_indices = get_storage_indices(); + if (storage_indices[index] >= 0) { + RETURN_ERROR(ItemAlreadySpecifiedError); + } + auto* worker = getNestedArrayWorker(); + const auto storage_index = worker->specified_items_count; + storage_indices[index] = storage_index; + worker->specified_items_count++; + + auto* values_offsets = get_values_offsets(); + auto* sizes_offsets = get_sizes_offsets(); + auto* sizes_buffer = get_sizes_buffer(); + auto* values_buffer = get_values_buffer(); + const auto* metadata = getNestedArrayMetadata(); + const auto valuesize = getValueSize(); + + auto values_offset = values_offsets[storage_index]; + const auto sizes_offset = sizes_offsets[storage_index * ndims]; + if (values_offset + nof_values > metadata->total_values_count) { + RETURN_ERROR(ValuesBufferTooSmallError); + } + + switch (ndims) { + case 1: { + sizes_buffer[sizes_offset] = nof_values; + sizes_offsets[storage_index * ndims + 1] = sizes_offset + 1; + } break; + case 2: { + const auto sizes2_offset = sizes_offset + 1; + if (sizes2_offset + sizes_lengths[0] > metadata->total_sizes_count) { + RETURN_ERROR(SizesBufferTooSmallError); + } + sizes_buffer[sizes_offset] = sizes_lengths[0]; + if constexpr (check_sizes) { + // check consistency of sizes and nof_values + int32_t sum_of_sizes = 0; + for (int32_t i=0; i < sizes_lengths[0]; i++) { + sum_of_sizes += sizes_buffers[0][i]; + } + if 
(nof_values != sum_of_sizes) { + RETURN_ERROR(InconsistentSizesError); + } + } + if (memcpy(sizes_buffer + sizes2_offset, + sizes_buffers[0], + sizes_lengths[0] * sizeof(sizes_t)) == nullptr) { + RETURN_ERROR(MemoryError); + } + sizes_offsets[storage_index * ndims + 1] = sizes2_offset; + sizes_offsets[storage_index * ndims + 2] = sizes2_offset + sizes_lengths[0]; + } break; + case 3: { + const auto sizes2_offset = sizes_offset + 1; + const auto sizes3_offset = sizes2_offset + sizes_lengths[0]; + if (sizes2_offset + sizes_lengths[0] + sizes_lengths[1] > + metadata->total_sizes_count) { + RETURN_ERROR(SizesBufferTooSmallError); + } + sizes_buffer[sizes_offset] = sizes_lengths[0]; + if constexpr (check_sizes) { + // check consistency of sizes of sizes and nof_sizes + int32_t sum_of_sizes_of_sizes = 0; + for (int32_t i=0; i < sizes_lengths[0]; i++) { + sum_of_sizes_of_sizes += sizes_buffers[0][i]; + } + if (sizes_lengths[1] != sum_of_sizes_of_sizes) { + RETURN_ERROR(InconsistentSizesError); + } + } + if (memcpy(sizes_buffer + sizes2_offset, + sizes_buffers[0], + sizes_lengths[0] * sizeof(sizes_t)) == nullptr) { + RETURN_ERROR(MemoryError); + } + if constexpr (check_sizes) { + // check consistency of sizes and nof_values + int32_t sum_of_sizes = 0; + for (int32_t i=0; i < sizes_lengths[1]; i++) { + sum_of_sizes += sizes_buffers[1][i]; + } + if (nof_values != sum_of_sizes) { + RETURN_ERROR(InconsistentSizesError); + } + } + if (memcpy(sizes_buffer + sizes3_offset, + sizes_buffers[1], + sizes_lengths[1] * sizeof(sizes_t)) == nullptr) { + RETURN_ERROR(MemoryError); + } + sizes_offsets[storage_index * ndims + 1] = sizes2_offset; + sizes_offsets[storage_index * ndims + 2] = sizes3_offset; + sizes_offsets[storage_index * ndims + 3] = sizes3_offset + sizes_lengths[1]; + } break; + default: + FLATBUFFER_UNREACHABLE(); + break; + } + if (values != nullptr) { + if (memcpy(values_buffer + values_offset * valuesize, + values, + nof_values * valuesize) == nullptr) { + 
RETURN_ERROR(MemoryError); + } + } + values_offsets[storage_index + 1] = values_offset + nof_values; + return Success; + } + + template + Status setItem(const int64_t index, + const int8_t* values_buf, + const int32_t nof_values) { + const int32_t* const sizes_buffers[1] = {nullptr}; + int32_t sizes_lengths[1] = {0}; + return setItemWorker<1, check_sizes>(index, + values_buf, + nof_values, + sizes_buffers, + sizes_lengths, + 0); + } + + template + Status setItem(const int64_t index, + const int8_t* values_buf, + const int32_t nof_values, + const int32_t* sizes_buf, + const int32_t nof_sizes) { + const int32_t* const sizes_buffers[NDIM] = {sizes_buf}; + int32_t sizes_lengths[NDIM] = {nof_sizes}; + return setItemWorker(index, + values_buf, + nof_values, + sizes_buffers, + sizes_lengths, + static_cast(NDIM)); + } + + template + Status setItem(const int64_t index, + const int8_t* values_buf, + const int32_t nof_values, + const int32_t* sizes_buf, + const int32_t nof_sizes, + const int32_t* sizes_of_sizes_buf, + const int32_t nof_sizes_of_sizes) { + const int32_t* const sizes_buffers[NDIM] = {sizes_of_sizes_buf, sizes_buf}; + int32_t sizes_lengths[NDIM] = {nof_sizes_of_sizes, nof_sizes}; + return setItemWorker(index, + values_buf, + nof_values, + sizes_buffers, + sizes_lengths, + static_cast(NDIM)); + } + + template + Status getItem(const int64_t index, + std::vector& values, + std::vector& sizes, + bool& is_null) { + if constexpr (!std::is_same::value) { + if constexpr (std::is_same::value) { + const auto* metadata = getNestedArrayMetadata(); + if (metadata->value_type != PointFloat64) { + RETURN_ERROR(TypeError); + } + } else if constexpr (std::is_same::value) { + const auto* metadata = getNestedArrayMetadata(); + if (metadata->value_type != PointInt32) { + RETURN_ERROR(TypeError); + } + } else { + RETURN_ERROR(NotImplementedError); + } + } + NestedArrayItem<2> item; + Status status = getItem(index, item); + if (status != Success) { + return status; + } + if 
(item.is_null) { + return Success; + } + if (item.nof_sizes != 1) { + RETURN_ERROR(InconsistentSizesError); + } + const auto valuesize = getValueSize(); + const auto values_count = item.nof_values * valuesize / sizeof(CT); + values.reserve(values_count); + values.insert(values.end(), + reinterpret_cast(item.values), + reinterpret_cast(item.values) + values_count); + + sizes.reserve(item.sizes_lengths[0]); + sizes.insert(sizes.end(), + reinterpret_cast(item.sizes_buffers[0]), + reinterpret_cast(item.sizes_buffers[0] + item.sizes_lengths[0] * sizeof(int32_t))); + return Success; + } + + template + Status getItem(const int64_t index, + std::vector& values, + std::vector& sizes, + std::vector& sizes_of_sizes, + bool& is_null) { + if constexpr (!std::is_same::value) { + if constexpr (std::is_same::value) { + const auto* metadata = getNestedArrayMetadata(); + if (metadata->value_type != PointFloat64) { + RETURN_ERROR(TypeError); + } + } else if constexpr (std::is_same::value) { + const auto* metadata = getNestedArrayMetadata(); + if (metadata->value_type != PointInt32) { + RETURN_ERROR(TypeError); + } + } else { + RETURN_ERROR(NotImplementedError); + } + } + NestedArrayItem<3> item; + Status status = getItem(index, item); + if (status != Success) { + return status; + } + if (item.is_null) { + return Success; + } + if (item.nof_sizes != 2) { + RETURN_ERROR(InconsistentSizesError); + } + const auto valuesize = getValueSize(); + const auto values_count = item.nof_values * valuesize / sizeof(CT); + values.reserve(values_count); + values.insert(values.end(), + reinterpret_cast(item.values), + reinterpret_cast(item.values) + values_count); + + sizes.reserve(item.sizes_lengths[1]); + sizes.insert(sizes.end(), + reinterpret_cast(item.sizes_buffers[1]), + reinterpret_cast(item.sizes_buffers[1] + item.sizes_lengths[1] * sizeof(int32_t))); + + sizes_of_sizes.reserve(item.sizes_lengths[0]); + sizes_of_sizes.insert(sizes_of_sizes.end(), + reinterpret_cast(item.sizes_buffers[0]), + 
reinterpret_cast(item.sizes_buffers[0] + item.sizes_lengths[0] * sizeof(int32_t))); + return Success; + + } + + template + Status setItem(const int64_t index, const std::vector& arr) { + if (getNDims() != NDIM + 1) { + RETURN_ERROR(DimensionalityError); + } + if constexpr (!std::is_same::value) { + if constexpr (std::is_same::value) { + const auto* metadata = getNestedArrayMetadata(); + if (metadata->value_type != PointFloat64) { + RETURN_ERROR(TypeError); + } + } else if constexpr (std::is_same::value) { + const auto* metadata = getNestedArrayMetadata(); + if (metadata->value_type != PointInt32) { + RETURN_ERROR(TypeError); + } + } else { + RETURN_ERROR(NotImplementedError); + } + } + const auto valuesize = getValueSize(); + auto sz = (arr.size() * sizeof(CT)) / valuesize; + const int32_t* const sizes_buffers[1] = {nullptr}; + int32_t sizes_lengths[1] = {0}; + return setItemWorker<1, false>(index, + reinterpret_cast(arr.data()), + sz, + sizes_buffers, + sizes_lengths, + 0); + } + + template + Status setItem(const int64_t index, const std::vector>& item) { + const auto valuesize = getValueSize(); + std::vector sizes; + sizes.reserve(item.size()); + int32_t nof_values = 0; + size_t nof_elements = 0; + for (const auto& subitem: item) { + const auto sz = (subitem.size() * sizeof(CT)) / valuesize; + sizes.push_back(sz); + nof_values += sz; + nof_elements += subitem.size(); + } + std::vector flatitem; + flatitem.reserve(nof_elements); + for (const auto& subitem: item) { + flatitem.insert(flatitem.end(), subitem.begin(), subitem.end()); + } + return setItem(index, flatitem, sizes); + } + + template + Status setItem(const int64_t index, + const std::vector>>& item) { + const auto valuesize = getValueSize(); + std::vector sizes_of_sizes; + std::vector sizes; + std::vector flatitem; + sizes_of_sizes.reserve(item.size()); + size_t nof_sizes_of_sizes = 0; + for (const auto& subitem: item) { + sizes_of_sizes.push_back(subitem.size()); + nof_sizes_of_sizes += subitem.size(); + 
} + sizes.reserve(nof_sizes_of_sizes); + int32_t nof_values = 0; + size_t nof_elements = 0; + for (const auto& subitem: item) { + for (const auto& subitem1: subitem) { + const auto sz = (subitem1.size() * sizeof(CT)) / valuesize; + sizes.push_back(sz); + nof_values += sz; + nof_elements += subitem1.size(); + } + } + flatitem.reserve(nof_elements); + for (const auto& subitem: item) { + for (const auto& subitem1: subitem) { + flatitem.insert(flatitem.end(), subitem1.begin(), subitem1.end()); + } + } + return setItem(index, flatitem, sizes, sizes_of_sizes); + } + + template + Status setItem(const int64_t index, + const std::vector& values, + const std::vector& sizes) { + if (getNDims() != NDIM + 1) { + RETURN_ERROR(DimensionalityError); + } + const auto* metadata = getNestedArrayMetadata(); + if constexpr (!std::is_same::value) { + if constexpr (std::is_same::value) { + if (metadata->value_type != PointFloat64) { + RETURN_ERROR(TypeError); + } + } else if constexpr (std::is_same::value) { + if (metadata->value_type != PointInt32) { + RETURN_ERROR(TypeError); + } + } else { + RETURN_ERROR(NotImplementedError); + } + } + const auto valuesize = getValueSize(); + const int32_t nof_values = (values.size() * sizeof(CT)) / valuesize; + return setItem(index, + reinterpret_cast(values.data()), + nof_values, + sizes.data(), + sizes.size()); + } + + template + Status setItem(const int64_t index, + const std::vector& values, + const std::vector& sizes, + const std::vector& sizes_of_sizes) { + if (getNDims() != NDIM + 1) { + RETURN_ERROR(DimensionalityError); + } + const auto* metadata = getNestedArrayMetadata(); + if constexpr (!std::is_same::value) { + if constexpr (std::is_same::value) { + if (metadata->value_type != PointFloat64) { + RETURN_ERROR(TypeError); + } + } else if constexpr (std::is_same::value) { + if (metadata->value_type != PointInt32) { + RETURN_ERROR(TypeError); + } + } else { + RETURN_ERROR(NotImplementedError); + } + } + const auto valuesize = getValueSize(); 
+ const auto nof_values = (values.size() * sizeof(CT)) / valuesize; + return setItem(index, + reinterpret_cast(values.data()), + nof_values, + sizes.data(), + sizes.size(), + sizes_of_sizes.data(), + sizes_of_sizes.size() + ); + } // Set a new item with index and size (in bytes) and initialize its // elements from source buffer. The item values will be // uninitialized when source buffer is nullptr. If dest != nullptr // then the item's buffer pointer will be stored in *dest. - Status setItem(const int64_t index, + // To be deprecated in favor of NestedArray format + Status setItemOld(const int64_t index, const int8_t* src, const int64_t size, int8_t** dest = nullptr) { if (index < 0 || index >= itemsCount()) { - return IndexError; + RETURN_ERROR(IndexError); } switch (format()) { - case VarlenArrayFormatId: - case GeoLineStringFormatId: { + case VarlenArrayFormatId: { int64_t& storage_count = get_storage_count(); int64_t* compressed_indices = get_compressed_indices(); - int64_t* storage_indices = get_storage_indices(); + int64_t* storage_indices = get_storage_indices_old(); const int64_t itemsize = dtypeSize(); if (size % itemsize != 0) { return SizeError; // size must be multiple of itemsize. Perhaps size is not in // bytes? 
} if (storage_indices[index] >= 0) { - return ItemAlreadySpecifiedError; + RETURN_ERROR(ItemAlreadySpecifiedError); } const int64_t cindex = compressed_indices[storage_count]; const int64_t values_buffer_size = get_values_buffer_size(); const int64_t csize = cindex * itemsize; if (csize + size > values_buffer_size) { - return ValuesBufferTooSmallError; + RETURN_ERROR(ValuesBufferTooSmallError); } break; } case GeoPointFormatId: { const int64_t itemsize = dtypeSize(); if (size != itemsize) { - return SizeError; + RETURN_ERROR(SizeError); } break; } - case GeoPolygonFormatId: { - const int64_t itemsize = dtypeSize(); - const int32_t counts = size / itemsize; - return setItemCountsAndData(index, &counts, 1, src, dest); - } default: - return UnknownFormatError; + RETURN_ERROR(UnknownFormatError); } return setItemNoValidation(index, src, size, dest); } // Same as setItem but performs no input validation + // To be deprecated in favor of NestedArray format Status setItemNoValidation(const int64_t index, const int8_t* src, const int64_t size, int8_t** dest) { switch (format()) { - case VarlenArrayFormatId: - case GeoLineStringFormatId: { + case VarlenArrayFormatId: { int64_t& storage_count = get_storage_count(); - int64_t* storage_indices = get_storage_indices(); + int64_t* storage_indices = get_storage_indices_old(); int64_t* compressed_indices = get_compressed_indices(); int8_t* values = get_values(); const int64_t itemsize = dtypeSize(); @@ -1149,7 +1998,7 @@ struct FlatBufferManager { storage_indices[index] = storage_count; compressed_indices[storage_count + 1] = cindex + values_count; if (size > 0 && src != nullptr && memcpy(values + csize, src, size) == nullptr) { - return MemoryError; + RETURN_ERROR(MemoryError); } if (dest != nullptr) { *dest = values + csize; @@ -1162,208 +2011,20 @@ struct FlatBufferManager { const int64_t itemsize = dtypeSize(); const int64_t csize = index * itemsize; if (src != nullptr && memcpy(values + csize, src, size) == nullptr) { - 
return MemoryError; + RETURN_ERROR(MemoryError); } if (dest != nullptr) { *dest = values + csize; } break; } - case GeoPolygonFormatId: { - const int64_t itemsize = dtypeSize(); - const int32_t counts = size / itemsize; - return setItemCountsAndDataNoValidation(index, &counts, 1, src, dest); - } default: - return UnknownFormatError; + RETURN_ERROR(UnknownFormatError); } return Success; } - Status setItemCountsAndData(const int64_t index, - const int32_t* counts, - const int64_t nof_counts, - const int8_t* src, - int8_t** dest = nullptr) { - if (index < 0 || index >= itemsCount()) { - return IndexError; - } - switch (format()) { - case VarlenArrayFormatId: - case GeoPointFormatId: - case GeoLineStringFormatId: - return NotSupportedFormatError; - case GeoPolygonFormatId: { - const int64_t& storage_count = get_storage_count(); - const int64_t& storage_count2 = get_storage_count2(); - const int64_t* compressed_indices = get_compressed_indices(); - const int64_t* compressed_indices2 = get_compressed_indices2(); - const int64_t* storage_indices = get_storage_indices(); - const int64_t valuesize = dtypeSize(); - if (storage_indices[index] >= 0) { - return ItemAlreadySpecifiedError; - } - - const int64_t compressed_indices2_buffer_size = - get_compressed_indices2_buffer_size(); - if (compressed_indices[storage_count] + nof_counts > - compressed_indices2_buffer_size) { - return CompressedIndices2BufferTooSmallError; - } - - const int64_t offset = compressed_indices2[storage_count2] * valuesize; - int64_t size = 0; - for (int i = 0; i < nof_counts; i++) { - size += valuesize * counts[i]; - } - const int64_t values_buffer_size = get_values_buffer_size(); - if (offset + size > values_buffer_size) { - return ValuesBufferTooSmallError; - } - break; - } - default: - return UnknownFormatError; - } - return setItemCountsAndDataNoValidation(index, counts, nof_counts, src, dest); - } - - // Same as setItem but performs no input validation - Status setItemCountsAndDataNoValidation( - 
const int64_t index, - const int32_t* counts, // counts of points in rings - const int64_t nof_counts, // nof rings - const int8_t* src, // coordinates of points - int8_t** dest) { - switch (format()) { - case VarlenArrayFormatId: - case GeoPointFormatId: - case GeoLineStringFormatId: - return NotSupportedFormatError; - case GeoPolygonFormatId: { - int64_t& storage_count = get_storage_count(); - int64_t& storage_count2 = get_storage_count2(); - int64_t* storage_indices = get_storage_indices(); - int64_t* compressed_indices = get_compressed_indices(); - int32_t* counts2 = get_counts2(); - int64_t* compressed_indices2 = get_compressed_indices2(); - int8_t* values = get_values(); - const int64_t valuesize = dtypeSize(); - storage_indices[index] = storage_count; - - compressed_indices[storage_count + 1] = - compressed_indices[storage_count] + nof_counts; - - int64_t cindex2 = compressed_indices2[storage_count2]; - const int64_t offset = cindex2 * valuesize; - int64_t size = 0; - for (int i = 0; i < nof_counts; i++) { - size += valuesize * counts[i]; - cindex2 += counts[i]; - counts2[storage_count2] = counts[i]; - storage_count2++; - compressed_indices2[storage_count2] = cindex2; - } - if (size > 0 && src != nullptr && memcpy(values + offset, src, size) == nullptr) { - return MemoryError; - } - if (dest != nullptr) { - *dest = values + offset; - } - storage_count++; - break; - } - default: - return UnknownFormatError; - } - - return Success; - } - - Status setSubItem(const int64_t index, - const int64_t subindex, - const int8_t* src, - const int64_t size, - int8_t** dest = nullptr) { - if (index < 0 || index >= itemsCount()) { - return IndexError; - } - switch (format()) { - case VarlenArrayFormatId: - case GeoPointFormatId: - case GeoLineStringFormatId: - return NotSupportedFormatError; - case GeoPolygonFormatId: { - const int64_t* storage_indices = get_storage_indices(); - const int64_t storage_index = storage_indices[index]; - if (storage_index < 0) { - return 
ItemUnspecifiedError; - } - int64_t* compressed_indices = get_compressed_indices(); - const int64_t cindex = compressed_indices[storage_index]; - if (cindex < 0) { - if (size > 0) { - return UnexpectedNullItemError; - } - } else { - const int64_t next_cindex = compressed_indices[storage_index + 1]; - const int64_t nof_counts = - (next_cindex < 0 ? -(next_cindex + 1) - cindex : next_cindex - cindex); - if (subindex < 0 || subindex >= nof_counts) { - return SubIndexError; - } - int64_t* compressed_indices2 = get_compressed_indices2(); - const int64_t valuesize = dtypeSize(); - const int64_t cindex2 = compressed_indices2[cindex + subindex]; - const int64_t next_cindex2 = compressed_indices2[cindex + subindex + 1]; - const int64_t expected_size = (next_cindex2 - cindex2) * valuesize; - if (expected_size != size) { - return SizeError; - } - } - break; - } - default: - return UnknownFormatError; - } - return setSubItemNoValidation(index, subindex, src, size, dest); - } - - Status setSubItemNoValidation(const int64_t index, - const int64_t subindex, - const int8_t* src, - const int64_t size, - int8_t** dest) { - switch (format()) { - case VarlenArrayFormatId: - case GeoPointFormatId: - case GeoLineStringFormatId: - return NotSupportedFormatError; - case GeoPolygonFormatId: { - const int64_t* storage_indices = get_storage_indices(); - const int64_t storage_index = storage_indices[index]; - int64_t* compressed_indices = get_compressed_indices(); - const int64_t cindex = compressed_indices[storage_index]; - int8_t* values = get_values(); - int64_t* compressed_indices2 = get_compressed_indices2(); - const int64_t valuesize = dtypeSize(); - const int64_t cindex2 = compressed_indices2[cindex + subindex]; - const int64_t offset = cindex2 * valuesize; - if (size > 0 && src != nullptr && memcpy(values + offset, src, size) == nullptr) { - return MemoryError; - } - if (dest != nullptr) { - *dest = values + offset; - } - break; - } - default: - return UnknownFormatError; - } - return 
Success; - } - // Set a new item with index and size but without initializing item // elements. The buffer pointer of the new item will be stored in // *dest if dest != nullptr. Inputs are not validated! @@ -1371,33 +2032,33 @@ struct FlatBufferManager { return setItemNoValidation(index, nullptr, size, dest); } + // To be deprecated in favor of NestedArray format Status concatItem(int64_t index, const int8_t* src, int64_t size) { if (index < 0 || index >= itemsCount()) { - return IndexError; + RETURN_ERROR(IndexError); } switch (format()) { - case VarlenArrayFormatId: - case GeoLineStringFormatId: { + case VarlenArrayFormatId: { int64_t next_storage_count = get_storage_count(); int64_t storage_count = next_storage_count - 1; int64_t* compressed_indices = get_compressed_indices(); - int64_t* storage_indices = get_storage_indices(); + int64_t* storage_indices = get_storage_indices_old(); int8_t* values = get_values(); int64_t itemsize = dtypeSize(); int64_t storage_index = storage_indices[index]; if (storage_index == -1) { // unspecified, so setting the item - return setItem(index, src, size, nullptr); + return setItemOld(index, src, size, nullptr); } if (size % itemsize != 0) { - return SizeError; + RETURN_ERROR(SizeError); } if (storage_index != storage_count) { - return IndexError; // index does not correspond to the last set - // item, only the last item can be - // concatenated + RETURN_ERROR(IndexError); // index does not correspond to the last set + // item, only the last item can be + // concatenated } if (compressed_indices[storage_index] < 0) { - return NotImplementedError; // todo: support concat to null when last + RETURN_ERROR(NotImplementedError); // todo: support concat to null when last } int64_t values_count = compressed_indices[next_storage_count] - compressed_indices[storage_index]; @@ -1406,25 +2067,51 @@ struct FlatBufferManager { int8_t* ptr = values + compressed_indices[storage_index] * itemsize; if (size > 0 && src != nullptr && memcpy(ptr + 
values_count * itemsize, src, size) == nullptr) { - return MemoryError; + RETURN_ERROR(MemoryError); } return Success; } default:; } - return UnknownFormatError; + RETURN_ERROR(UnknownFormatError); } // Set item with index as a null item Status setNull(int64_t index) { if (index < 0 || index >= itemsCount()) { - return IndexError; + RETURN_ERROR(IndexError); + } + if (isNestedArray()) { + auto* storage_indices = get_storage_indices(); + if (storage_indices[index] >= 0) { + RETURN_ERROR(ItemAlreadySpecifiedError); + } + auto* worker = getNestedArrayWorker(); + const auto storage_index = worker->specified_items_count; + worker->specified_items_count++; + storage_indices[index] = storage_index; + const size_t ndims = getNDims(); + auto* sizes_buffer = get_sizes_buffer(); + auto* values_offsets = get_values_offsets(); + auto* sizes_offsets = get_sizes_offsets(); + const auto values_offset = values_offsets[storage_index]; + const auto sizes_offset = sizes_offsets[storage_index * ndims]; + sizes_buffer[sizes_offset] = 0; + for (size_t i = 0; i < ndims; i++) { + sizes_offsets[storage_index * ndims + i + 1] = sizes_offset + 1; + } + values_offsets[storage_index] = -(values_offset + 1); + values_offsets[storage_index + 1] = values_offset; + return Success; + } + // To be deprecated in favor of NestedArray format: + if (index < 0 || index >= itemsCount()) { + RETURN_ERROR(IndexError); } + switch (format()) { - case VarlenArrayFormatId: - case GeoLineStringFormatId: - case GeoPolygonFormatId: { - int64_t* storage_indices = get_storage_indices(); + case VarlenArrayFormatId: { + int64_t* storage_indices = get_storage_indices_old(); if (storage_indices[index] >= 0) { return ItemAlreadySpecifiedError; } @@ -1433,18 +2120,19 @@ struct FlatBufferManager { case GeoPointFormatId: { return setNullNoValidation(index); } + default: + break; } - return UnknownFormatError; + RETURN_ERROR(UnknownFormatError); } // Same as setNull but performs no input validation + // To be deprecated in 
favor of NestedArray format Status setNullNoValidation(int64_t index) { switch (format()) { - case VarlenArrayFormatId: - case GeoLineStringFormatId: - case GeoPolygonFormatId: { + case VarlenArrayFormatId: { int64_t& storage_count = get_storage_count(); - int64_t* storage_indices = get_storage_indices(); + int64_t* storage_indices = get_storage_indices_old(); int64_t* compressed_indices = get_compressed_indices(); const int64_t cindex = compressed_indices[storage_count]; storage_indices[index] = storage_count; @@ -1471,7 +2159,7 @@ struct FlatBufferManager { break; } default: - return UnknownFormatError; + RETURN_ERROR(UnknownFormatError); } return Success; } @@ -1479,17 +2167,26 @@ struct FlatBufferManager { // Check if the item is unspecified or null. Status isNull(int64_t index, bool& is_null) const { if (index < 0 || index >= itemsCount()) { - return IndexError; + RETURN_ERROR(IndexError); } + if (isNestedArray()) { + const auto storage_index = get_storage_index(index); + const auto* values_offsets = get_values_offsets(); + const auto values_offset = values_offsets[storage_index]; + is_null = values_offset < 0; + return Success; + } + if (index < 0 || index >= itemsCount()) { + RETURN_ERROR(IndexError); + } + // To be deprecated in favor of NestedArray format: switch (format()) { - case VarlenArrayFormatId: - case GeoLineStringFormatId: - case GeoPolygonFormatId: { + case VarlenArrayFormatId: { const int64_t* compressed_indices = get_compressed_indices(); - const int64_t* storage_indices = get_storage_indices(); + const int64_t* storage_indices = get_storage_indices_old(); const int64_t storage_index = storage_indices[index]; if (storage_index < 0) { - return ItemUnspecifiedError; + RETURN_ERROR(ItemUnspecifiedError); } is_null = (compressed_indices[storage_index] < 0); return Success; @@ -1509,23 +2206,25 @@ struct FlatBufferManager { } return Success; } + default: + break; } - return UnknownFormatError; + RETURN_ERROR(UnknownFormatError); } // Get item at 
index by storing its size (in bytes), values buffer, // and nullity information to the corresponding pointer // arguments. + // To be deprecated in favor of NestedArray format HOST DEVICE Status getItem(int64_t index, int64_t& size, int8_t*& dest, bool& is_null) { if (index < 0 || index >= itemsCount()) { - return IndexError; + RETURN_ERROR(IndexError); } switch (format()) { - case VarlenArrayFormatId: - case GeoLineStringFormatId: { + case VarlenArrayFormatId: { int8_t* values = get_values(); const int64_t* compressed_indices = get_compressed_indices(); - const int64_t* storage_indices = get_storage_indices(); + const int64_t* storage_indices = get_storage_indices_old(); const int64_t storage_index = storage_indices[index]; if (storage_index < 0) { return ItemUnspecifiedError; @@ -1555,12 +2254,13 @@ struct FlatBufferManager { is_null = false; return Success; } - case GeoPolygonFormatId: + default: break; } - return UnknownFormatError; + RETURN_ERROR(UnknownFormatError); } + // To be deprecated in favor of NestedArray format HOST DEVICE Status getItem(int64_t index, size_t& size, int8_t*& dest, bool& is_null) { int64_t sz{0}; Status status = getItem(index, sz, dest, is_null); @@ -1568,115 +2268,20 @@ struct FlatBufferManager { return status; } - HOST DEVICE Status getItem2(int64_t index, - int64_t*& cumcounts, - int64_t& nof_counts, - int8_t*& dest, - bool& is_null) { - if (index < 0 || index >= itemsCount()) { - return IndexError; - } - switch (format()) { - case VarlenArrayFormatId: - case GeoPointFormatId: - case GeoLineStringFormatId: - break; - case GeoPolygonFormatId: { - const int64_t* storage_indices = get_storage_indices(); - int64_t* compressed_indices = get_compressed_indices(); - int64_t* compressed_indices2 = get_compressed_indices2(); - int8_t* values = get_values(); - - const int64_t storage_index = storage_indices[index]; - if (storage_index < 0) { - return ItemUnspecifiedError; - } - const int64_t cindex = compressed_indices[storage_index]; - if 
(cindex < 0) { - cumcounts = nullptr; - nof_counts = 0; - dest = nullptr; - is_null = true; - } else { - const int64_t next_cindex = compressed_indices[storage_index + 1]; - const int64_t valuesize = dtypeSize(); - const int64_t cindex2 = compressed_indices2[cindex]; - nof_counts = - (next_cindex < 0 ? -(next_cindex + 1) - cindex : next_cindex - cindex); - cumcounts = compressed_indices2 + cindex; - dest = values + cindex2 * valuesize; - is_null = false; - } - return Success; - } - } - return UnknownFormatError; - } - - HOST DEVICE Status getItemCountsAndData(const int64_t index, - int32_t*& counts, - int64_t& nof_counts, - int8_t*& dest, - int64_t& size, - bool& is_null) { - if (index < 0 || index >= itemsCount()) { - return IndexError; - } - switch (format()) { - case VarlenArrayFormatId: - case GeoPointFormatId: - case GeoLineStringFormatId: - break; - case GeoPolygonFormatId: { - const int64_t* storage_indices = get_storage_indices(); - int64_t* compressed_indices = get_compressed_indices(); - const int64_t* compressed_indices2 = get_compressed_indices2(); - int8_t* values = get_values(); - - const int64_t storage_index = storage_indices[index]; - if (storage_index < 0) { - return ItemUnspecifiedError; - } - const int64_t cindex = compressed_indices[storage_index]; - if (cindex < 0) { - counts = nullptr; - nof_counts = 0; - dest = nullptr; - size = 0; - is_null = true; - } else { - const int64_t next_cindex = compressed_indices[storage_index + 1]; - const int64_t valuesize = dtypeSize(); - const int64_t cindex2 = compressed_indices2[cindex]; - nof_counts = - (next_cindex < 0 ? 
-(next_cindex + 1) - cindex : next_cindex - cindex); - const int64_t* cumcounts = compressed_indices2 + cindex; - counts = get_counts2() + cindex; - dest = values + cindex2 * valuesize; - size = (cumcounts[nof_counts] - cumcounts[0]) * valuesize; - is_null = false; - } - return Success; - } - } - return UnknownFormatError; - } - + // To be deprecated in favor of NestedArray format Status getItemLength(const int64_t index, int64_t& length) const { if (index < 0 || index >= itemsCount()) { - return IndexError; + RETURN_ERROR(IndexError); } switch (format()) { case GeoPointFormatId: break; - case VarlenArrayFormatId: - case GeoLineStringFormatId: - case GeoPolygonFormatId: { + case VarlenArrayFormatId: { const int64_t* compressed_indices = get_compressed_indices(); - const int64_t* storage_indices = get_storage_indices(); + const int64_t* storage_indices = get_storage_indices_old(); const int64_t storage_index = storage_indices[index]; if (storage_index < 0) { - return ItemUnspecifiedError; + RETURN_ERROR(ItemUnspecifiedError); } const int64_t cindex = compressed_indices[storage_index]; if (cindex < 0) { @@ -1687,134 +2292,153 @@ struct FlatBufferManager { } return Success; } - } - return UnknownFormatError; - } - - Status getSubItemLength(const int64_t index, - const int64_t subindex, - int64_t& length) const { - if (index < 0 || index >= itemsCount()) { - return IndexError; - } - switch (format()) { - case VarlenArrayFormatId: - case GeoPointFormatId: - case GeoLineStringFormatId: + default: break; - case GeoPolygonFormatId: { - const int64_t* storage_indices = get_storage_indices(); - const int64_t* compressed_indices = get_compressed_indices(); - const int64_t storage_index = storage_indices[index]; - if (storage_index < 0) { - return ItemUnspecifiedError; - } - const int64_t cindex = compressed_indices[storage_index]; - if (cindex < 0) { - length = 0; - } else { - const int64_t next_cindex = compressed_indices[storage_index + 1]; - const int64_t nof_counts = - 
(next_cindex < 0 ? -(next_cindex + 1) - cindex : next_cindex - cindex); - if (subindex < 0 || subindex >= nof_counts) { - return SubIndexError; - } - const int64_t* compressed_indices2 = get_compressed_indices2(); - const int64_t cindex2 = compressed_indices2[cindex + subindex]; - const int64_t next_cindex2 = compressed_indices2[cindex + subindex + 1]; - length = next_cindex2 - cindex2; - } - return Success; - } } - return UnknownFormatError; + RETURN_ERROR(UnknownFormatError); } - // Get a subitem data of an item, e.g. a linestring within a polygon - HOST DEVICE Status getSubItem(int64_t index, - int64_t subindex, - int64_t& size, - int8_t*& dest, - bool& is_null) { - if (index < 0 || index >= itemsCount()) { - return IndexError; - } - switch (format()) { - case VarlenArrayFormatId: - case GeoPointFormatId: - case GeoLineStringFormatId: - return NotSupportedFormatError; - case GeoPolygonFormatId: { - const int64_t* storage_indices = get_storage_indices(); - const int64_t storage_index = storage_indices[index]; - if (storage_index < 0) { - return ItemUnspecifiedError; - } - int64_t* compressed_indices = get_compressed_indices(); - const int64_t cindex = compressed_indices[storage_index]; - if (cindex < 0) { - dest = nullptr; - size = 0; - is_null = true; - } else { - const int64_t next_cindex = compressed_indices[storage_index + 1]; - const int64_t nof_counts = - (next_cindex < 0 ? -(next_cindex + 1) - cindex : next_cindex - cindex); - if (subindex < 0 || subindex >= nof_counts) { - return SubIndexError; +#ifdef HAVE_TOSTRING +#define HAVE_FLATBUFFER_TOSTRING + std::string bufferToString(const int8_t* buffer, + const size_t size, + ValueType value_type) const { + size_t value_size = get_size(value_type); + size_t count = size / value_size; + std::string result = ""; + for (size_t i = 0; i < count; i++) { + if (i > 0) { + result += ", "; + } + switch (value_type) { + case Bool8: + result += (buffer[i] ? 
"true" : "false"); + break; + case Int8: + result += std::to_string(buffer[i]); + break; + case Int16: + result += std::to_string(reinterpret_cast(buffer)[i]); + break; + case Int32: + result += std::to_string(reinterpret_cast(buffer)[i]); + break; + case Int64: + result += std::to_string(reinterpret_cast(buffer)[i]); + break; + case UInt8: + result += std::to_string(reinterpret_cast(buffer)[i]); + break; + case UInt16: + result += std::to_string(reinterpret_cast(buffer)[i]); + break; + case UInt32: + result += std::to_string(reinterpret_cast(buffer)[i]); + break; + case UInt64: + result += std::to_string(reinterpret_cast(buffer)[i]); + break; + case Float32: + result += std::to_string(reinterpret_cast(buffer)[i]); + break; + case Float64: + result += std::to_string(reinterpret_cast(buffer)[i]); + break; + case PointInt32: + result += "("; + if (containsNullValue(buffer + 2 * i * sizeof(int32_t))) { + result += "NULL"; + } else { + result += std::to_string(reinterpret_cast(buffer)[2 * i]); + result += ", "; + result += std::to_string(reinterpret_cast(buffer)[2 * i + 1]); } - int8_t* values = get_values(); - int64_t* compressed_indices2 = get_compressed_indices2(); - const int64_t valuesize = dtypeSize(); - const int64_t cindex2 = compressed_indices2[cindex + subindex]; - const int64_t next_cindex2 = compressed_indices2[cindex + subindex + 1]; - dest = values + cindex2 * valuesize; - size = (next_cindex2 - cindex2) * valuesize; - is_null = false; - } - return Success; + result += ")"; + break; + case PointFloat64: + result += "("; + if (containsNullValue(buffer + 2 * i * sizeof(double))) { + result += "NULL"; + } else { + result += std::to_string(reinterpret_cast(buffer)[2 * i]); + result += ", "; + result += std::to_string(reinterpret_cast(buffer)[2 * i + 1]); + } + result += ")"; + break; } } - return UnknownFormatError; + return result; } -#ifdef HAVE_TOSTRING -#define HAVE_FLATBUFFER_TOSTRING std::string toString() const { if (buffer == nullptr) { return 
::typeName(this) + "[UNINITIALIZED]"; } - std::string result = typeName(this) + "("; + std::string result = typeName(this) + "@" + ::toString((void*)buffer) + "("; result += "" + getBaseWorker()->toString(); - const auto fmt = format(); + if (isNestedArray()) { + const auto* metadata = getNestedArrayMetadata(); + const auto* worker = getNestedArrayWorker(); + result += ",\n " + metadata->toString(); + result += ",\n " + worker->toString(); + result += ",\n values_buffer=[" + + bufferToString( + get_values_buffer(), getValuesBufferSize(), metadata->value_type) + + "]"; + result += ",\n sizes_buffer=[" + + bufferToString( + reinterpret_cast(get_sizes_buffer()), + metadata->total_sizes_count * get_size(FLATBUFFER_SIZES_T_VALUE_TYPE), + FLATBUFFER_SIZES_T_VALUE_TYPE) + + "]"; + result += ",\n values_offsets=[" + + bufferToString(reinterpret_cast(get_values_offsets()), + (metadata->total_items_count + 1) * + get_size(FLATBUFFER_OFFSETS_T_VALUE_TYPE), + FLATBUFFER_OFFSETS_T_VALUE_TYPE) + + "]"; + result += ",\n sizes_offsets=[" + + bufferToString(reinterpret_cast(get_sizes_offsets()), + (metadata->total_items_count * metadata->ndims + 1) * + get_size(FLATBUFFER_OFFSETS_T_VALUE_TYPE), + FLATBUFFER_OFFSETS_T_VALUE_TYPE) + + "]"; + result += ",\n storage_indices=[" + + bufferToString( + reinterpret_cast(get_storage_indices()), + metadata->total_items_count * get_size(FLATBUFFER_SIZES_T_VALUE_TYPE), + FLATBUFFER_SIZES_T_VALUE_TYPE) + + "]"; + result += ",\n user_data_buffer=[" + + bufferToString(get_user_data_buffer(), metadata->user_data_size, Int8) + + "]"; + result += ")"; + return result; + } + + // To be deprecated in favor of NestedArray format: + const FlatBufferFormat fmt = format(); + + std::cout << "fmt=" << static_cast(fmt) << ", " << sizeof(fmt) << std::endl; switch (fmt) { case VarlenArrayFormatId: { result += ", " + getVarlenArrayMetadata()->toString(); result += ", " + getVarlenArrayWorker()->toString(); break; } - case GeoLineStringFormatId: { - result += ", 
" + getGeoLineStringMetadata()->toString(); - result += ", " + getGeoLineStringWorker()->toString(); - break; - } case GeoPointFormatId: { result += ", " + getGeoPointMetadata()->toString(); result += ", " + getGeoPointWorker()->toString(); break; } - case GeoPolygonFormatId: { - result += ", " + getGeoPolygonMetadata()->toString(); - result += ", " + getGeoPolygonWorker()->toString(); + default: break; - } } switch (fmt) { - case VarlenArrayFormatId: - case GeoLineStringFormatId: - case GeoPolygonFormatId: { + case VarlenArrayFormatId: { result += ", values="; const int64_t numvalues = get_nof_values(); const int64_t itemsize = dtypeSize(); @@ -1835,38 +2459,13 @@ struct FlatBufferManager { result += ::toString(values); } break; case 8: { - if (fmt == GeoLineStringFormatId || fmt == GeoPolygonFormatId) { - const int32_t* values_buf = reinterpret_cast(get_values()); - std::vector values(values_buf, values_buf + numvalues * 2); - result += ::toString(values); - } else { - const int64_t* values_buf = reinterpret_cast(get_values()); - std::vector values(values_buf, values_buf + numvalues); - result += ::toString(values); - } + const int64_t* values_buf = reinterpret_cast(get_values()); + std::vector values(values_buf, values_buf + numvalues); + result += ::toString(values); } break; - case 16: { - if (fmt == GeoLineStringFormatId || fmt == GeoPolygonFormatId) { - const double* values_buf = reinterpret_cast(get_values()); - std::vector values(values_buf, values_buf + numvalues * 2); - result += ::toString(values); - break; - } - } default: result += "[UNEXPECTED ITEMSIZE:" + std::to_string(itemsize) + "]"; } - if (fmt == GeoPolygonFormatId) { - const int64_t numitems2 = items2Count(); - const int64_t* compressed_indices2_buf = get_compressed_indices2(); - std::vector compressed_indices2( - compressed_indices2_buf, compressed_indices2_buf + numitems2 + 1); - result += ", compressed_indices2=" + ::toString(compressed_indices2); - - const int32_t* counts2_buf = 
get_counts2(); - std::vector counts2(counts2_buf, counts2_buf + numitems2); - result += ", counts2=" + ::toString(counts2); - } const int64_t numitems = itemsCount(); const int64_t* compressed_indices_buf = get_compressed_indices(); @@ -1874,7 +2473,7 @@ struct FlatBufferManager { compressed_indices_buf + numitems + 1); result += ", compressed_indices=" + ::toString(compressed_indices); - const int64_t* storage_indices_buf = get_storage_indices(); + const int64_t* storage_indices_buf = get_storage_indices_old(); std::vector storage_indices(storage_indices_buf, storage_indices_buf + numitems); result += ", storage_indices=" + ::toString(storage_indices); @@ -1904,6 +2503,64 @@ struct FlatBufferManager { }; #ifdef HAVE_TOSTRING +inline std::ostream& operator<<(std::ostream& os, + FlatBufferManager::ValueType const type) { + switch (type) { + case FlatBufferManager::Bool8: + os << "Bool8"; + break; + case FlatBufferManager::Int8: + os << "Int8"; + break; + case FlatBufferManager::Int16: + os << "Int16"; + break; + case FlatBufferManager::Int32: + os << "Int32"; + break; + case FlatBufferManager::Int64: + os << "Int64"; + break; + case FlatBufferManager::UInt8: + os << "UInt8"; + break; + case FlatBufferManager::UInt16: + os << "UInt16"; + break; + case FlatBufferManager::UInt32: + os << "UInt32"; + break; + case FlatBufferManager::UInt64: + os << "UInt64"; + break; + case FlatBufferManager::Float32: + os << "Float32"; + break; + case FlatBufferManager::Float64: + os << "Float64"; + break; + case FlatBufferManager::PointInt32: + os << "PointInt32"; + break; + case FlatBufferManager::PointFloat64: + os << "PointFloat64"; + break; + } + return os; +} + +inline std::string FlatBufferManager::toString(const FlatBufferManager::ValueType& type) { + std::ostringstream ss; + ss << type; + return ss.str(); +} + +inline std::string toString(const FlatBufferManager::ValueType& type) { + std::ostringstream ss; + ss << type; + return ss.str(); +} + inline std::ostream& 
operator<<(std::ostream& os, FlatBufferManager::Status const status) { switch (status) { @@ -1919,6 +2576,9 @@ inline std::ostream& operator<<(std::ostream& os, case FlatBufferManager::SizeError: os << "SizeError"; break; + case FlatBufferManager::FlatbufferSizeError: + os << "FlatbufferSizeError"; + break; case FlatBufferManager::ItemAlreadySpecifiedError: os << "ItemAlreadySpecifiedError"; break; @@ -1931,6 +2591,9 @@ inline std::ostream& operator<<(std::ostream& os, case FlatBufferManager::ValuesBufferTooSmallError: os << "ValuesBufferTooSmallError"; break; + case FlatBufferManager::SizesBufferTooSmallError: + os << "SizesBufferTooSmallError"; + break; case FlatBufferManager::CompressedIndices2BufferTooSmallError: os << "CompressedIndices2BufferTooSmallError"; break; @@ -1946,6 +2609,21 @@ inline std::ostream& operator<<(std::ostream& os, case FlatBufferManager::NotImplementedError: os << "NotImplementedError"; break; + case FlatBufferManager::InvalidUserDataError: + os << "InvalidUserDataError"; + break; + case FlatBufferManager::DimensionalityError: + os << "DimensionalityError"; + break; + case FlatBufferManager::UserDataError: + os << "UserDataError"; + break; + case FlatBufferManager::TypeError: + os << "TypeError"; + break; + case FlatBufferManager::InconsistentSizesError: + os << "InconsistentSizesError"; + break; default: os << "[Unknown FlatBufferManager::Status value]"; } @@ -1958,3 +2636,5 @@ inline std::string toString(const FlatBufferManager::Status& status) { return ss.str(); } #endif + +#undef RETURN_ERROR diff --git a/QueryEngine/heavydbTypes.h b/QueryEngine/heavydbTypes.h index 9ae3dcd024..4bd56cac85 100644 --- a/QueryEngine/heavydbTypes.h +++ b/QueryEngine/heavydbTypes.h @@ -29,6 +29,8 @@ #include "DateAdd.h" +#include "../Shared/sqltypes_lite.h" + #if !(defined(__CUDACC__) || defined(NO_BOOST)) #include "../Shared/DateTimeParser.h" #endif @@ -678,7 +680,7 @@ struct GeoLineStringStruct { typedef struct GeoLineStringStruct GeoLineString; -struct 
GeoMultiLineString { +struct GeoMultiLineStringStruct { int8_t* ptr; int32_t sz; int8_t* linestring_sizes; @@ -702,6 +704,8 @@ struct GeoMultiLineString { DEVICE int32_t getOutputSrid() const { return output_srid; } }; +typedef struct GeoMultiLineStringStruct GeoMultiLineString; + struct GeoPolygonStruct { int8_t* ptr_coords; int32_t coords_size; @@ -725,7 +729,7 @@ struct GeoPolygonStruct { typedef struct GeoPolygonStruct GeoPolygon; -struct GeoMultiPolygon { +struct GeoMultiPolygonStruct { int8_t* ptr_coords; int32_t coords_size; int8_t* ring_sizes; @@ -752,6 +756,8 @@ struct GeoMultiPolygon { DEVICE int32_t getOutputSrid() const { return output_srid; } }; +typedef struct GeoMultiPolygonStruct GeoMultiPolygon; + // There are redundant #ifndef UDF_COMPILED inside // ifguard for StringDictionaryProxy to flag that // if we decide to adapt C++ UDF Compiler for table @@ -826,6 +832,154 @@ struct Column { #endif }; +// Defines the maximal dimensionality of nested array objects. +#define NESTED_ARRAY_NDIM 4 + +template +struct NestedArray { + /* + flatbuffer_ contains a NestedArray with dimensionality up to NDIM. + + index_ defines an indexed NestedArray with n_ number of indices. 
+ */ + + int8_t* flatbuffer_; + int64_t index_[NESTED_ARRAY_NDIM]; + size_t n_{0}; + + size_t size() const { + FlatBufferManager m{flatbuffer_}; + size_t length; + FlatBufferManager::Status status = FlatBufferManager::Status::NotImplementedError; + status = m.getLength(index_, n_, length); + if (status != FlatBufferManager::Status::Success) { +#ifndef __CUDACC__ + throw std::runtime_error("NestedArray size failed: " + ::toString(status)); +#endif + return 0; + } + return length; + } + + size_t size(const int64_t index) const { return getItem(index).size(); } + + // non-nested array items must implement its own getItem method + ItemType getItem(const int64_t index) { + if constexpr (!item_is_nested_array) { +#ifndef __CUDACC__ + throw std::runtime_error(::typeName(this) + " must implement getItem method"); +#endif + ItemType result{}; + return result; + } else { + ItemType result{{{flatbuffer_, {}, n_ + 1}}}; + for (size_t i = 0; i < n_; i++) { + result.index_[i] = index_[i]; + } + result.index_[n_] = index; + return result; + } + } + + // non-nested array items must implement its own getItem method + ItemType getItem(const int64_t index) const { + if constexpr (!item_is_nested_array) { +#ifndef __CUDACC__ + throw std::runtime_error("NestedArray.getItem is not defined for " + + ::typeName(this)); +#endif + } else { + ItemType result{{{flatbuffer_, {}, n_ + 1}}}; + for (size_t i = 0; i < n_; i++) { + result.index_[i] = index_[i]; + } + result.index_[n_] = index; + return result; + } + } + + // non-nested array items must implement its own operator[] + inline ItemType operator[](const unsigned int index) { + return getItem(static_cast(index)); + } + + inline const ItemType operator[](const unsigned int index) const { + return getItem(static_cast(index)); + } + + // check if the parent is NULL + inline bool isNull() const { + FlatBufferManager m{flatbuffer_}; + bool is_null = false; + auto status = m.isNull(index_[0], is_null); +#ifndef __CUDACC__ + if (status != 
FlatBufferManager::Status::Success) { + throw std::runtime_error("isNull failed: " + ::toString(status)); + } +#endif + return is_null; + } + + // copy other into self, can be called exactly once + NestedArray& operator=( + const NestedArray& other) { + if (n_ != 1) { +#ifndef __CUDACC__ + throw std::runtime_error( + "NestedArray operator= failed: expected single index, got " + ::toString(n_)); +#endif + } + FlatBufferManager other_m{other.flatbuffer_}; + int8_t* values; + int32_t nof_values; + int32_t* sizes[NESTED_ARRAY_NDIM]; + int32_t sizes_lengths[NESTED_ARRAY_NDIM]; + int32_t nof_sizes; + bool is_null; + FlatBufferManager::Status status; + status = other_m.getItemWorker(other.index_, + other.n_, + values, + nof_values, + sizes, + sizes_lengths, + nof_sizes, + is_null); + if (status != FlatBufferManager::Status::Success) { +#ifndef __CUDACC__ + throw std::runtime_error("NestedArray operator= failed: getItem raised " + + ::toString(status)); +#endif + } else { + FlatBufferManager this_m{flatbuffer_}; + if (is_null) { + status = this_m.setNull(index_[0]); + } else { + status = this_m.setItemWorker( + index_[0], values, nof_values, sizes, sizes_lengths, nof_sizes); + } + if (status != FlatBufferManager::Status::Success) { +#ifndef __CUDACC__ + throw std::runtime_error("NestedArray operator= failed: setItem raised " + + ::toString(status)); +#endif + } + } + return *this; + } + +#ifdef HAVE_TOSTRING + std::string toString() const { + std::string result = ::typeName(this) + "(..., {"; + for (size_t i = 0; i < n_; i++) { + result += std::to_string(index_[i]) + ", "; + } + result += "}, " + std::to_string(n_) + ")"; + return result; + } +#endif +}; + namespace Geo { struct Point2D { @@ -868,31 +1022,89 @@ DEVICE inline double decompress_y_coord(const int8_t* data, } } -DEVICE inline Point2D get_point(const int8_t* data, - const int64_t index, - const int32_t input_srid, - const int32_t output_srid, - const bool is_geoint) { - Point2D point{decompress_x_coord(data, 
index, is_geoint), - decompress_y_coord(data, index, is_geoint)}; +DEVICE inline double decompress_x_coord(const int8_t* data, + const int64_t index, + const bool is_geoint, + const int32_t input_srid, + const int32_t output_srid) { + double x = decompress_x_coord(data, index, is_geoint); if (input_srid == output_srid || output_srid == 0) { - return point; + return x; } else if (input_srid == 4326 && output_srid == 900913) { // WGS 84 --> Web Mercator - point.x *= 111319.490778; - point.y = 6378136.99911 * log(tan(.00872664626 * point.y + .785398163397)); - return point; + x *= 111319.490778; + } else { +#ifndef __CUDACC__ + throw std::runtime_error("decompress_x_coord: unhandled geo transformation from " + + std::to_string(input_srid) + " to " + + std::to_string(output_srid) + '.'); +#endif } -#ifdef __CUDACC__ - return {}; // (NaN,NaN) -#else - throw std::runtime_error("Unhandled geo transformation from " + - std::to_string(input_srid) + " to " + - std::to_string(output_srid) + '.'); + return x; +} + +DEVICE inline double decompress_y_coord(const int8_t* data, + const int64_t index, + const bool is_geoint, + const int32_t input_srid, + const int32_t output_srid) { + double y = decompress_y_coord(data, index, is_geoint); + if (input_srid == output_srid || output_srid == 0) { + return y; + } else if (input_srid == 4326 && output_srid == 900913) { + // WGS 84 --> Web Mercator + y = 6378136.99911 * log(tan(.00872664626 * y + .785398163397)); + } else { +#ifndef __CUDACC__ + throw std::runtime_error("decompress_y_coord: unhandled geo transformation from " + + std::to_string(input_srid) + " to " + + std::to_string(output_srid) + '.'); #endif + } + return y; +} + +DEVICE inline Point2D get_point(const int8_t* data, + const int64_t index, + const int32_t input_srid, + const int32_t output_srid, + const bool is_geoint) { + Point2D point{decompress_x_coord(data, index, is_geoint, input_srid, output_srid), + decompress_y_coord(data, index, is_geoint, input_srid, 
output_srid)}; + return point; } #ifndef __CUDACC__ + +template +inline void points_to_vector(const int8_t* points_buf, + const int64_t nof_points, + const bool is_geoint, + std::vector& result) { + result.reserve(2 * nof_points); + if (is_geoint) { + if constexpr (std::is_same::value) { + for (int64_t i = 0; i < nof_points; i++) { + result.push_back(decompress_x_coord(points_buf, 2 * i, is_geoint)); + result.push_back(decompress_y_coord(points_buf, 2 * i, is_geoint)); + } + } else { + const int32_t* buf = reinterpret_cast(points_buf); + result.assign(buf, buf + 2 * nof_points); + } + } else { + const double* buf = reinterpret_cast(points_buf); + if constexpr (std::is_same::value) { + result.assign(buf, buf + 2 * nof_points); + } else { + for (int64_t i = 0; i < nof_points; i++) { + result.push_back(compress_x_coord(buf, 2 * i)); + result.push_back(compress_y_coord(buf, 2 * i)); + } + } + } +} + inline std::vector compress_coords(const int8_t* data, const int64_t size, const bool is_geoint) { @@ -911,6 +1123,7 @@ inline std::vector compress_coords(const int8_t* data, } return result; } + inline std::vector decompress_coords(const int8_t* data, const int64_t size, const bool is_geoint) { @@ -923,15 +1136,77 @@ inline std::vector decompress_coords(const int8_t* data, } return result; } + +inline std::vector compress_coords(const std::vector& coords) { + std::vector result; + const size_t nofpoints = coords.size() / 2; + result.reserve(coords.size()); + const double* buf = coords.data(); + for (size_t i = 0; i < nofpoints; i++) { + result.push_back(compress_x_coord(buf, 2 * i)); + result.push_back(compress_y_coord(buf, 2 * i)); + } + return result; +} + +inline std::vector decompress_coords(const std::vector& coords) { + std::vector result; + const size_t nofpoints = coords.size() / 2; + result.reserve(coords.size()); + const int8_t* buf = reinterpret_cast(coords.data()); + for (size_t i = 0; i < nofpoints; i++) { + result.push_back(decompress_x_coord(buf, 2 * i, 
true)); + result.push_back(decompress_y_coord(buf, 2 * i, true)); + } + return result; +} + +inline std::vector> compress_coords( + const std::vector>& coords) { + std::vector> result; + result.reserve(coords.size()); + for (size_t i = 0; i < coords.size(); i++) { + result.push_back(compress_coords(coords[i])); + } + return result; +} + +inline std::vector> decompress_coords( + const std::vector>& coords) { + std::vector> result; + result.reserve(coords.size()); + for (size_t i = 0; i < coords.size(); i++) { + result.push_back(decompress_coords(coords[i])); + } + return result; +} + +inline std::vector>> compress_coords( + const std::vector>>& coords) { + std::vector>> result; + result.reserve(coords.size()); + for (size_t i = 0; i < coords.size(); i++) { + result.push_back(compress_coords(coords[i])); + } + return result; +} + +inline std::vector>> decompress_coords( + const std::vector>>& coords) { + std::vector>> result; + result.reserve(coords.size()); + for (size_t i = 0; i < coords.size(); i++) { + result.push_back(decompress_coords(coords[i])); + } + return result; +} + #endif +// to be deprecated inline bool get_is_geoint(const int8_t* flatbuffer) { FlatBufferManager m{const_cast(flatbuffer)}; switch (m.format()) { - case GeoPolygonFormatId: - return m.getGeoPolygonMetadata()->is_geoint; - case GeoLineStringFormatId: - return m.getGeoLineStringMetadata()->is_geoint; case GeoPointFormatId: return m.getGeoPointMetadata()->is_geoint; default: @@ -944,13 +1219,10 @@ inline bool get_is_geoint(const int8_t* flatbuffer) { } } +// to be deprecated inline int32_t get_input_srid(const int8_t* flatbuffer) { FlatBufferManager m{const_cast(flatbuffer)}; switch (m.format()) { - case GeoPolygonFormatId: - return m.getGeoPolygonMetadata()->input_srid; - case GeoLineStringFormatId: - return m.getGeoLineStringMetadata()->input_srid; case GeoPointFormatId: return m.getGeoPointMetadata()->input_srid; default: @@ -963,13 +1235,10 @@ inline int32_t get_input_srid(const int8_t* 
flatbuffer) { } } +// to be deprecated inline int32_t get_output_srid(const int8_t* flatbuffer) { FlatBufferManager m{const_cast(flatbuffer)}; switch (m.format()) { - case GeoPolygonFormatId: - return m.getGeoPolygonMetadata()->output_srid; - case GeoLineStringFormatId: - return m.getGeoLineStringMetadata()->output_srid; case GeoPointFormatId: return m.getGeoPointMetadata()->output_srid; default: @@ -982,343 +1251,326 @@ inline int32_t get_output_srid(const int8_t* flatbuffer) { } } -struct LineString { - int8_t* flatbuffer_; // FlatBuffer of GeoLineStrings or GeoPolygon - int64_t index_[3]; // line string index within a - // Column/Column/Column +template +struct GeoNestedArray : public NestedArray { + using NestedArray::flatbuffer_; + using NestedArray::index_; + using NestedArray::n_; + using NestedArray::size; + using NestedArray::getItem; - FlatBufferManager::Status getBuffer(int64_t& size, int8_t*& dest, bool& is_null) const { - FlatBufferManager m{flatbuffer_}; - FlatBufferManager::Status status{}; - switch (m.format()) { - case GeoLineStringFormatId: - status = m.getItem(index_[0], size, dest, is_null); - break; - case GeoPolygonFormatId: - status = m.getSubItem(index_[0], index_[1], size, dest, is_null); - break; - default: - status = FlatBufferManager::Status::NotImplementedError; +#ifndef __CUDACC__ + + template + FlatBufferManager::Status toCoordsWorker(std::vector& result) const { + if constexpr (std::is_same::value) { + FlatBufferManager m{flatbuffer_}; + const auto* ti = reinterpret_cast(m.get_user_data_buffer()); + if (ti == nullptr) { + throw std::runtime_error("NestedArray toCoords failed: unspecified type info"); + } + int8_t* points_buf; + int32_t nof_points; + int32_t* sizes[NESTED_ARRAY_NDIM]; + int32_t sizes_lengths[NESTED_ARRAY_NDIM]; + int32_t nof_sizes; + bool is_null; + auto status = m.getItemWorker( + index_, n_, points_buf, nof_points, sizes, sizes_lengths, nof_sizes, is_null); + if (nof_sizes != 0) { + // TODO: if we allow this, this 
corresponds to flattening of + // coordinates. Decide if we want this. + throw std::runtime_error("NestedArray toCoords expect nof_sizes be 0 but got " + + ::toString(nof_sizes)); + } + if (status == FlatBufferManager::Status::Success) { + points_to_vector(points_buf, nof_points, ti->is_geoint(), result); + } + return status; + } else { + auto sz = size(); + result.reserve(sz); + for (size_t i = 0; i < sz; i++) { + auto item = getItem(i); + VT ritem; + auto status = item.toCoords(ritem); + if (status != FlatBufferManager::Status::Success) { + return status; + } + result.push_back(ritem); + } + return FlatBufferManager::Status::Success; } - return status; } - // Get the index-th point of the line string - DEVICE Geo::Point2D getItem(const int64_t index, const int32_t output_srid = 0) const { - int8_t* ptr; - int64_t size; - bool is_null; - auto status = getBuffer(size, ptr, is_null); - if (status != FlatBufferManager::Status::Success) { -#ifndef __CUDACC__ - throw std::runtime_error("LineString.getItem failed: " + ::toString(status)); -#endif - } - bool is_geoint = get_is_geoint(flatbuffer_); - int32_t this_input_srid = get_input_srid(flatbuffer_); - int32_t this_output_srid = get_output_srid(flatbuffer_); - return Geo::get_point(ptr, - index, - this_input_srid, - (output_srid < 0 ? this_output_srid : output_srid), - is_geoint); + // Return coordinates as a vector of double or int32_t type + template + FlatBufferManager::Status toCoords(std::vector& result) const { + return toCoordsWorker(result); } - DEVICE inline Geo::Point2D operator[](const unsigned int index) const { - /* Use getItem(index, output_srid) to enable user-specified - transformation. 
*/ - return getItem(static_cast(index), /*output_srid=*/0); + template + FlatBufferManager::Status toCoords(std::vector>& result) const { + return toCoordsWorker>(result); } -#ifndef __CUDACC__ - std::vector toCoords() const { - int8_t* ptr; - int64_t size; - bool is_null; - auto status = getBuffer(size, ptr, is_null); + template + FlatBufferManager::Status toCoords( + std::vector>>& result) const { + return toCoordsWorker>>(result); + } - std::vector result; - if (status != FlatBufferManager::Status::Success) { - throw std::runtime_error("LineString.getBuffer failed: " + ::toString(status)); + template + FlatBufferManager::Status fromCoordsWorker(const VT& coords) { + FlatBufferManager m{flatbuffer_}; + const auto* ti = reinterpret_cast(m.get_user_data_buffer()); + if (ti == nullptr) { + return FlatBufferManager::UserDataError; } - if (is_null) { - return {}; + if (n_ != 1) { + throw std::runtime_error( + "NestedArray fromCoords failed: expected single index but got " + + ::toString(n_)); + } + if (ti->is_geoint()) { + if constexpr (std::is_same::value) { + const auto ccoords = compress_coords(coords); + return m.setItem(index_[0], ccoords); + } else { + return m.setItem(index_[0], coords); + } } else { - bool is_geoint = get_is_geoint(flatbuffer_); - return decompress_coords(ptr, size, is_geoint); + if constexpr (std::is_same::value) { + return m.setItem(index_[0], coords); + } else { + const auto dcoords = decompress_coords(coords); + return m.setItem(index_[0], dcoords); + } } } -#endif - // Return the number of points of the line string - int64_t size() const { - FlatBufferManager m{flatbuffer_}; - int64_t length = 0; - switch (m.format()) { - case GeoLineStringFormatId: - m.getItemLength(index_[0], length); - break; - default: -#ifndef __CUDACC__ - throw std::runtime_error("LineString::size: not implemented for format " + - ::toString(m.format())); -#else - ; -#endif - } - return length; + // Create row from a nested vector of coordinates either in double or 
int32_t type + template + FlatBufferManager::Status fromCoords(const std::vector& coords) { + return fromCoordsWorker>(coords); } - bool isNull() const { + template + FlatBufferManager::Status fromCoords(const std::vector>& coords) { + return fromCoordsWorker>>(coords); + } + + template + FlatBufferManager::Status fromCoords( + const std::vector>>& coords) { + return fromCoordsWorker>>>(coords); + } + +#endif +}; + +struct LineString : public GeoNestedArray { + using GeoNestedArray::flatbuffer_; + using GeoNestedArray::index_; + using GeoNestedArray::n_; + + DEVICE Point2D getItem(const int64_t index) { FlatBufferManager m{flatbuffer_}; - bool is_null = false; - switch (m.format()) { - case GeoLineStringFormatId: { - auto status = m.isNull(index_[0], is_null); - if (status != FlatBufferManager::Status::Success) { + const SQLTypeInfoLite* ti = + reinterpret_cast(m.get_user_data_buffer()); + int8_t* points; + int32_t nof_points; + int32_t nof_sizes; + int32_t* sizes[NESTED_ARRAY_NDIM]; + int32_t sizes_lengths[NESTED_ARRAY_NDIM]; + bool is_null; + auto status = m.getItemWorker( + index_, n_, points, nof_points, sizes, sizes_lengths, nof_sizes, is_null); + if (status != FlatBufferManager::Status::Success) { #ifndef __CUDACC__ - throw std::runtime_error("Geo::LineString::isNull failed: " + - ::toString(status)); + throw std::runtime_error("LineString.getItem failed: flatbuffer getItem raised " + + ::toString(status)); #endif - } - } break; - case GeoPolygonFormatId: - // Linestrins in a polygon is never NULL: - return false; - default: + } + if (is_null) { + Point2D result; + return result; + } + if (index < 0 || index >= nof_points) { #ifndef __CUDACC__ - throw std::runtime_error( - "Geo::LineString::isNull not implemented for the given format"); -#else - ; + throw std::runtime_error("LineString.getItem failed: index " + ::toString(index) + + " is out of range [0, " + ::toString(nof_points) + ")"); #endif } - return is_null; + return get_point( + points, 2 * index, 
ti->get_input_srid(), ti->get_output_srid(), ti->is_geoint()); } - void setBuffer(const int8_t* src, const int64_t sz = 0) { - FlatBufferManager m{flatbuffer_}; - FlatBufferManager::Status status; - if (src == nullptr) { - status = m.setNull(index_[0]); - } else { - status = m.setItem(index_[0], src, sz, nullptr); - } + DEVICE inline Point2D operator[](const unsigned int index) { + return getItem(static_cast(index)); + } + +#ifndef __CUDACC__ + + using GeoNestedArray::toCoords; + + template + std::vector toCoords() const { + std::vector result; + auto status = toCoords(result); if (status != FlatBufferManager::Status::Success) { #ifndef __CUDACC__ - throw std::runtime_error("LineString assignment failed: " + ::toString(status)); + throw std::runtime_error("LineString.toCoords failed: " + ::toString(status)); #endif } + return result; } - LineString& operator=(const LineString& other) { - FlatBufferManager other_m{other.flatbuffer_}; - int8_t* ptr; - int64_t size; - bool is_null; - switch (other_m.format()) { - case GeoLineStringFormatId: { - auto status = other_m.getItem(other.index_[0], size, ptr, is_null); - if (status != FlatBufferManager::Status::Success) { -#ifndef __CUDACC__ - throw std::runtime_error("other getItem failed: " + ::toString(status)); + std::vector toCoords() const { return toCoords(); } + #endif - } - if (is_null) { - setBuffer(nullptr); - } else { - FlatBufferManager m{flatbuffer_}; - const auto* metadata = m.getGeoLineStringMetadata(); - const auto* other_metadata = other_m.getGeoLineStringMetadata(); +}; + +struct MultiLineString : public GeoNestedArray { #ifndef __CUDACC__ - bool requires_conversion = - (metadata->input_srid != 0 && other_metadata->input_srid != 0 && - metadata->input_srid != other_metadata->input_srid) || - (metadata->output_srid != 0 && other_metadata->output_srid != 0 && - metadata->output_srid != other_metadata->output_srid) || - (metadata->is_geoint != other_metadata->is_geoint); - if (requires_conversion) { - // TODO: 
implement conversion - throw std::runtime_error( - "assignment failed: linestrings have different metadata"); - } + + using GeoNestedArray::toCoords; + + template + std::vector> toCoords() const { + std::vector> result; + auto status = toCoords(result); + if (status != FlatBufferManager::Status::Success) { + throw std::runtime_error("MultiLineString.toCoords failed: " + ::toString(status)); + } + return result; + } + + std::vector> toCoords() const { return toCoords(); } + #endif - setBuffer(ptr, size); - } - } break; - default: +}; + +struct Polygon : public GeoNestedArray { #ifndef __CUDACC__ - throw std::runtime_error( - "LineString::operator= not implemented on the given format"); -#else - ; -#endif + + using GeoNestedArray::toCoords; + + template + std::vector> toCoords() const { + std::vector> result; + auto status = toCoords(result); + if (status != FlatBufferManager::Status::Success) { + throw std::runtime_error("Polygon.toCoords failed: " + ::toString(status)); } - return *this; + return result; } -#ifdef HAVE_TOSTRING - std::string toString() const { - return ::typeName(this) + "(..., {" + std::to_string(index_[0]) + ", " + - std::to_string(index_[1]) + ", " + std::to_string(index_[2]) + "})"; - } + std::vector> toCoords() const { return toCoords(); } + #endif }; -struct Polygon { - int8_t* flatbuffer_; // FlatBuffer of GeoPolygons - int64_t index_[2]; // polygon index in a Column/Column +struct MultiPolygon : public GeoNestedArray { +#ifndef __CUDACC__ - // Return the index-th linestring - Geo::LineString getItem(const int64_t index) const { - Geo::LineString linestring{flatbuffer_, {index_[0], index, -1}}; - return linestring; + using GeoNestedArray::toCoords; + + template + std::vector>> toCoords() const { + std::vector>> result; + auto status = toCoords(result); + if (status != FlatBufferManager::Status::Success) { + throw std::runtime_error("MultiPolygon.toCoords failed: " + ::toString(status)); + } + return result; } - inline Geo::LineString 
operator[](const unsigned int index) const { - return getItem(static_cast(index)); + std::vector>> toCoords() const { + return toCoords(); } - // TODO: Return the coordinates buffer of all linestrings in a polygon - // FlatBufferManager::Status getBuffer(int64_t& size, int8_t*& dest, bool& is_null) - // const { ... } +#endif +}; - // Return the coordinates buffer of the index-th linestring in a polygon - FlatBufferManager::Status getBuffer(const int64_t index, - int64_t& size, - int8_t*& dest, - bool& is_null) const { - FlatBufferManager m{flatbuffer_}; - switch (m.format()) { - case GeoPolygonFormatId: - return m.getSubItem(index_[0], index, size, dest, is_null); - default: - return FlatBufferManager::Status::NotImplementedError; - } - } +} // namespace Geo - bool isNull() const { +template +struct ColumnFlatBuffer { + int8_t* flatbuffer_; + int64_t num_rows_; + + // Return true if index-th row is NULL. + DEVICE inline bool isNull(int64_t index) const { FlatBufferManager m{flatbuffer_}; bool is_null = false; - switch (m.format()) { - case GeoPolygonFormatId: { - auto status = m.isNull(index_[0], is_null); - if (status != FlatBufferManager::Status::Success) { -#ifndef __CUDACC__ - throw std::runtime_error("Geo::Polygon isNull failed: " + ::toString(status)); -#endif - } - } break; - default: + auto status = m.isNull(index, is_null); #ifndef __CUDACC__ - throw std::runtime_error( - "Geo::Polygon::isNull not implemented for the given format"); -#else - ; -#endif + if (status != FlatBufferManager::Status::Success) { + throw std::runtime_error("isNull failed: " + ::toString(status)); } +#endif return is_null; } - FlatBufferManager::Status setItems(const int8_t* src, - const int32_t* subitem_sizes, - const int64_t nof_subitems, - int8_t** dest = nullptr) { + // Return the number of rows. + DEVICE int64_t size() const { return num_rows_; } + + // Set the index-th row to NULL. Can be called once per row. 
+ DEVICE inline void setNull(int64_t index) { FlatBufferManager m{flatbuffer_}; - auto status = - m.setItemCountsAndData(index_[0], subitem_sizes, nof_subitems, src, dest); - if (status != FlatBufferManager::Status::Success) { + auto status = m.setNull(index); #ifndef __CUDACC__ - throw std::runtime_error("Geo::Polygon setItems failed: " + ::toString(status)); -#endif + if (status != FlatBufferManager::Status::Success) { + throw std::runtime_error("setNull failed: " + ::toString(status)); } - return status; +#endif } - FlatBufferManager::Status setItem(const int64_t index, - const int8_t* src, - const int64_t size) { - FlatBufferManager m{flatbuffer_}; - auto status = m.setSubItem(index_[0], index, src, size); - return status; + // Return row object. + DEVICE inline RowType getItem(const int64_t index) const { + RowType row{{{flatbuffer_, {index}, 1}}}; + return row; } - // Return the number of line strings in a polygon - int64_t size() const { - FlatBufferManager m{flatbuffer_}; - int64_t length = 0; - m.getItemLength(index_[0], length); - return length; + // Return row object via indexing. + DEVICE inline RowType operator[](const unsigned int index) const { + return getItem(static_cast(index)); } - // Return the number of points in the index-th line string - int64_t size(const int64_t index) const { - FlatBufferManager m{flatbuffer_}; - int64_t length = 0; - m.getSubItemLength(index_[0], index, length); - return length; + // Copy item into the index-th row. 
+ DEVICE inline void setItem(int64_t index, const RowType& item) { + RowType this_item = getItem(index); + this_item = item; } -#ifndef __CUDACC__ - // Construct a polygon from a vector of coordinates vector - template - FlatBufferManager::Status fromCoords(const std::vector>& coords) { - bool is_geoint = get_is_geoint(flatbuffer_); - std::vector sizes; - sizes.reserve(coords.size()); - for (const auto& coord_vec : coords) { - int32_t sz = coord_vec.size() / 2; - if (sz > 0) { - sizes.push_back(sz); - } - } - int8_t* coords_buf = nullptr; - auto status = setItems(nullptr, sizes.data(), sizes.size(), &coords_buf); - if (status != FlatBufferManager::Status::Success) { - return status; - } - - int64_t index = 0; - for (const auto& coord_vec : coords) { - int64_t sz = coord_vec.size() / 2; - if (sz == 0) { - continue; - } else if (is_geoint && std::is_same::value) { - std::vector ccoord_vec = - compress_coords(reinterpret_cast(coord_vec.data()), - sz * sizeof(CT) * 2, - false); - status = setItem(index++, - reinterpret_cast(ccoord_vec.data()), - ccoord_vec.size() * sizeof(int32_t)); - } else if (!is_geoint && std::is_same::value) { - std::vector dcoord_vec = decompress_coords( - reinterpret_cast(coord_vec.data()), sz * sizeof(CT) * 2, true); - status = setItem(index++, - reinterpret_cast(dcoord_vec.data()), - dcoord_vec.size() * sizeof(double)); - } else { - status = setItem(index++, - reinterpret_cast(coord_vec.data()), - sz * sizeof(CT) * 2); - } - if (status != FlatBufferManager::Status::Success) { - return status; - } - } - return status; + // Return the total number of values that the flatbuffer instance + // holds. 
+ inline int64_t getNofValues() const { + FlatBufferManager m{flatbuffer_}; + return m.getValuesCount(); } -#endif -#ifdef HAVE_TOSTRING +#ifdef HAVE_FLATBUFFER_TOSTRING std::string toString() const { - return ::typeName(this) + "(..., {" + std::to_string(index_[0]) + ", " + - std::to_string(index_[1]) + "})"; + FlatBufferManager m{flatbuffer_}; + return ::typeName(this) + "(" + m.toString() + + ", num_rows=" + std::to_string(num_rows_) + ")"; } #endif }; -} // namespace Geo +template <> +struct Column : public ColumnFlatBuffer {}; + +template <> +struct Column : public ColumnFlatBuffer {}; + +template <> +struct Column : public ColumnFlatBuffer {}; + +template <> +struct Column : public ColumnFlatBuffer {}; template <> struct Column { @@ -1388,7 +1640,7 @@ struct Column { const auto* metadata = m.getGeoPointMetadata(); int8_t* dest = nullptr; int64_t sz = 2 * (metadata->is_geoint ? sizeof(int32_t) : sizeof(double)); - FlatBufferManager::Status status = m.setItem(index, nullptr, sz, &dest); + FlatBufferManager::Status status = m.setItemOld(index, nullptr, sz, &dest); if (status != FlatBufferManager::Status::Success) { #ifndef __CUDACC__ throw std::runtime_error("setItem failed: " + ::toString(status)); @@ -1420,168 +1672,6 @@ struct Column { #endif }; -template <> -struct Column { - int8_t* flatbuffer_; - int64_t num_rows_; - - DEVICE Geo::LineString getItem(const int64_t index) const { - Geo::LineString linestring{flatbuffer_, {index, -1, -1}}; - return linestring; - } - - DEVICE inline Geo::LineString operator[](const unsigned int index) const { - return getItem(static_cast(index)); - } - - DEVICE int64_t size() const { return num_rows_; } - - DEVICE inline bool isNull(int64_t index) const { - FlatBufferManager m{flatbuffer_}; - bool is_null = false; - auto status = m.isNull(index, is_null); -#ifndef __CUDACC__ - if (status != FlatBufferManager::Status::Success) { - throw std::runtime_error("Column::isNull failed: " + - ::toString(status)); - } -#endif - return 
is_null; - } - - DEVICE inline void setNull(int64_t index) { - FlatBufferManager m{flatbuffer_}; - auto status = m.setNull(index); -#ifndef __CUDACC__ - if (status != FlatBufferManager::Status::Success) { - throw std::runtime_error("Column::setNull failed: " + - ::toString(status)); - } -#endif - } - - // Return the total number of points in a Column - inline int64_t getNofValues() const { - FlatBufferManager m{flatbuffer_}; - return m.get_nof_values(); - } - - // TODO: Implement the following as `linestrings[index] = polygon[ring_index];` if - // possible grab a ring from a polygon - void setItem(const int64_t index, - const Geo::Polygon& polygon, - const int64_t ring_index) { - FlatBufferManager m{flatbuffer_}; - int64_t size = 0; - int8_t* buf = nullptr; - bool is_null = true; - auto status = polygon.getBuffer(ring_index, size, buf, is_null); - if (status == FlatBufferManager::Status::SubIndexError) { - is_null = true; - } else if (status != FlatBufferManager::Status::Success) { -#ifndef __CUDACC__ - throw std::runtime_error( - "Column setItem failed in Polygon getItem: " + - ::toString(status)); -#else - return; -#endif - } - if (is_null) { - status = m.setNull(index); - } else { - status = m.setItem(index, buf, size, nullptr); - } -#ifndef __CUDACC__ - if (status != FlatBufferManager::Status::Success) { - throw std::runtime_error("Column setItem from polygon failed: " + - ::toString(status)); - } -#endif - } - - // set line from a buffer of point coordindates - void setItem(const int64_t index, - const int8_t* buf, - const int64_t size // in bytes - ) { - FlatBufferManager m{flatbuffer_}; - FlatBufferManager::Status status = FlatBufferManager::Status::UnknownFormatError; - if (buf == nullptr) { - status = m.setNull(index); - } else { - status = m.setItem(index, buf, size, nullptr); - } - if (status != FlatBufferManager::Status::Success) { -#ifndef __CUDACC__ - throw std::runtime_error("Column setItem from buffer failed: " + - ::toString(status)); -#endif - } - 
} - -#ifdef HAVE_FLATBUFFER_TOSTRING - std::string toString() const { - FlatBufferManager m{flatbuffer_}; - return ::typeName(this) + "(" + m.toString() + - ", num_rows=" + std::to_string(num_rows_) + ")"; - } -#endif -}; - -template <> -struct Column { - int8_t* flatbuffer_; - int64_t num_rows_; - - DEVICE Geo::Polygon getItem(const int64_t index) const { - Geo::Polygon polygon{flatbuffer_, {index, -1}}; - return polygon; - } - - DEVICE inline Geo::Polygon operator[](const unsigned int index) const { - return getItem(static_cast(index)); - } - - DEVICE int64_t size() const { return num_rows_; } - - DEVICE inline bool isNull(int64_t index) const { - FlatBufferManager m{flatbuffer_}; - bool is_null = false; - auto status = m.isNull(index, is_null); -#ifndef __CUDACC__ - if (status != FlatBufferManager::Status::Success) { - throw std::runtime_error("isNull failed: " + ::toString(status)); - } -#endif - return is_null; - } - - DEVICE inline void setNull(int64_t index) { - FlatBufferManager m{flatbuffer_}; - auto status = m.setNull(index); -#ifndef __CUDACC__ - if (status != FlatBufferManager::Status::Success) { - throw std::runtime_error("setNull failed: " + ::toString(status)); - } -#endif - } - - // Return the total number of points in a Column - inline int64_t getNofValues() const { - FlatBufferManager m{flatbuffer_}; - return m.get_nof_values(); - } - -#ifdef HAVE_FLATBUFFER_TOSTRING - std::string toString() const { - FlatBufferManager m{flatbuffer_}; - return ::typeName(this) + "(" + m.toString() + - ", num_rows=" + std::to_string(num_rows_) + ")"; - } -#endif -}; - template struct Column> { // A type for a column of variable length arrays diff --git a/QueryEngine/scripts/generate_TableFunctionsFactory_init.py b/QueryEngine/scripts/generate_TableFunctionsFactory_init.py index 5e3bcc836a..49dc5ef4c2 100644 --- a/QueryEngine/scripts/generate_TableFunctionsFactory_init.py +++ b/QueryEngine/scripts/generate_TableFunctionsFactory_init.py @@ -314,7 +314,8 @@ def 
get_cpp_type(self): ctype = name elif name == 'YearMonthTimeInterval': ctype = name - elif name in ['GeoPoint', 'GeoLineString', 'GeoPolygon']: + elif name in ['GeoPoint', 'GeoLineString', 'GeoPolygon', + 'GeoMultiPoint', 'GeoMultiLineString', 'GeoMultiPolygon']: ctype = name else: raise NotImplementedError(self) diff --git a/Shared/DebugOutputStream.h b/Shared/DebugOutputStream.h new file mode 100644 index 0000000000..c048433ec4 --- /dev/null +++ b/Shared/DebugOutputStream.h @@ -0,0 +1,24 @@ +/* + * Copyright 2022 HEAVY.AI, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +// Use if constexpr to ensure that log messages still compile, but are elided +#define DEBUG_OUTPUT_STREAM(enable_flag, stream) \ + if constexpr (!enable_flag) \ + ; \ + else \ + stream diff --git a/Shared/sqltypes.h b/Shared/sqltypes.h index 1e508843e8..c6959899df 100644 --- a/Shared/sqltypes.h +++ b/Shared/sqltypes.h @@ -29,6 +29,7 @@ #include "../Logger/Logger.h" #include "Datum.h" #include "funcannotations.h" +#include "sqltypes_lite.h" #include #include @@ -1015,12 +1016,120 @@ class SQLTypeInfo { case kPOINT: case kLINESTRING: case kPOLYGON: + case kMULTILINESTRING: + case kMULTIPOLYGON: return true; default:; } return false; } + SQLTypeInfoLite toLite() const { + SQLTypeInfoLite ti_lite; + switch (type) { + case kPOINT: + ti_lite.type = SQLTypeInfoLite::POINT; + ti_lite.subtype = SQLTypeInfoLite::UNSPECIFIED; + break; + case kLINESTRING: + ti_lite.type = SQLTypeInfoLite::LINESTRING; + ti_lite.subtype = SQLTypeInfoLite::UNSPECIFIED; + break; + case kPOLYGON: + ti_lite.type = SQLTypeInfoLite::POLYGON; + ti_lite.subtype = SQLTypeInfoLite::UNSPECIFIED; + break; + case kMULTIPOINT: + ti_lite.type = SQLTypeInfoLite::MULTIPOINT; + ti_lite.subtype = SQLTypeInfoLite::UNSPECIFIED; + break; + case kMULTILINESTRING: + ti_lite.type = SQLTypeInfoLite::MULTILINESTRING; + ti_lite.subtype = SQLTypeInfoLite::UNSPECIFIED; + break; + case kMULTIPOLYGON: + ti_lite.type = SQLTypeInfoLite::MULTIPOLYGON; + ti_lite.subtype = SQLTypeInfoLite::UNSPECIFIED; + break; + case kTEXT: + ti_lite.type = SQLTypeInfoLite::TEXT; + ti_lite.subtype = SQLTypeInfoLite::UNSPECIFIED; + break; + case kARRAY: + ti_lite.type = SQLTypeInfoLite::ARRAY; + switch (subtype) { + case kBOOLEAN: + ti_lite.subtype = SQLTypeInfoLite::BOOLEAN; + break; + case kTINYINT: + ti_lite.subtype = SQLTypeInfoLite::TINYINT; + break; + case kSMALLINT: + ti_lite.subtype = SQLTypeInfoLite::SMALLINT; + break; + case kINT: + ti_lite.subtype = SQLTypeInfoLite::INT; + break; + case kBIGINT: + ti_lite.subtype 
= SQLTypeInfoLite::BIGINT; + break; + case kFLOAT: + ti_lite.subtype = SQLTypeInfoLite::FLOAT; + break; + case kDOUBLE: + ti_lite.subtype = SQLTypeInfoLite::DOUBLE; + break; + case kTEXT: + ti_lite.subtype = SQLTypeInfoLite::TEXT; + break; + default: + UNREACHABLE(); + } + break; + default: + UNREACHABLE(); + } + if (is_geometry()) { + switch (get_compression()) { + case kENCODING_NONE: + ti_lite.compression = SQLTypeInfoLite::NONE; + break; + case kENCODING_GEOINT: + ti_lite.compression = SQLTypeInfoLite::GEOINT; + break; + default: + UNREACHABLE(); + } + ti_lite.dimension = get_input_srid(); + ti_lite.scale = get_output_srid(); + ti_lite.db_id = 0; // unused + ti_lite.dict_id = 0; // unused + } else if (type == kTEXT) { + switch (get_compression()) { + case kENCODING_NONE: + ti_lite.compression = SQLTypeInfoLite::NONE; + break; + case kENCODING_DICT: + ti_lite.compression = SQLTypeInfoLite::DICT; + break; + default: + UNREACHABLE(); + } + ti_lite.dimension = 0; // unused + ti_lite.scale = 0; // unused + ti_lite.db_id = dict_key_.db_id; + ti_lite.dict_id = dict_key_.dict_id; + } else if (type == kARRAY) { + ti_lite.dimension = 0; // unused + ti_lite.scale = 0; // unused + ti_lite.db_id = dict_key_.db_id; + ti_lite.dict_id = dict_key_.dict_id; + } else { + UNREACHABLE(); + } + return ti_lite; + } + private: SQLTypes type; // type id SQLTypes subtype; // element type of arrays or columns @@ -1491,6 +1600,73 @@ DEVICE inline void VarlenArray_get_nth(int8_t* buf, inline int64_t getFlatBufferSize(int64_t items_count, int64_t max_nof_values, const SQLTypeInfo& ti) { + size_t ndims = 0; + FlatBufferManager::ValueType value_type; + int64_t max_nof_sizes = 0; + switch (ti.get_type()) { + case kPOINT: + ndims = 0; + break; + case kLINESTRING: + case kMULTIPOINT: + case kARRAY: + ndims = 1; + max_nof_sizes = items_count + max_nof_values / 3; + break; + case kPOLYGON: + case kMULTILINESTRING: + ndims = 2; + max_nof_sizes = items_count + 2 * max_nof_values / 3; + break; + case 
kMULTIPOLYGON: + ndims = 3; + max_nof_sizes = items_count + max_nof_values; + break; + default: + UNREACHABLE(); + } + + if (ti.is_geometry()) { + if (ti.get_compression() == kENCODING_GEOINT) { + value_type = FlatBufferManager::PointInt32; + } else { + value_type = FlatBufferManager::PointFloat64; + } + } else if (ti.is_array()) { + switch (ti.get_subtype()) { + case kBOOLEAN: + value_type = FlatBufferManager::Bool8; + break; + case kTINYINT: + value_type = FlatBufferManager::Int8; + break; + case kSMALLINT: + value_type = FlatBufferManager::Int16; + break; + case kINT: + value_type = FlatBufferManager::Int32; + break; + case kTEXT: + CHECK_EQ(ti.get_compression(), kENCODING_DICT); + value_type = FlatBufferManager::Int32; + break; + case kBIGINT: + value_type = FlatBufferManager::Int64; + break; + case kFLOAT: + value_type = FlatBufferManager::Float32; + break; + case kDOUBLE: + value_type = FlatBufferManager::Float64; + break; + default: + UNREACHABLE(); + break; + } + } else { + UNREACHABLE(); + } + switch (ti.get_type()) { case kPOINT: { FlatBufferManager::GeoPoint metadata{items_count, @@ -1500,24 +1676,18 @@ inline int64_t getFlatBufferSize(int64_t items_count, return FlatBufferManager::compute_flatbuffer_size( GeoPointFormatId, reinterpret_cast(&metadata)); } - case kLINESTRING: { - FlatBufferManager::GeoLineString metadata{items_count, - max_nof_values, - ti.get_input_srid(), - ti.get_output_srid(), - ti.get_compression() == kENCODING_GEOINT}; - return FlatBufferManager::compute_flatbuffer_size( - GeoLineStringFormatId, reinterpret_cast(&metadata)); - } - case kPOLYGON: { - FlatBufferManager::GeoPolygon metadata{items_count, - max_nof_values, - max_nof_values / 3, - ti.get_input_srid(), - ti.get_output_srid(), - ti.get_compression() == kENCODING_GEOINT}; - return FlatBufferManager::compute_flatbuffer_size( - GeoPolygonFormatId, reinterpret_cast(&metadata)); + case kLINESTRING: + case kPOLYGON: + case kMULTIPOINT: + case kMULTILINESTRING: + case 
kMULTIPOLYGON: { + return FlatBufferManager::computeBufferSizeNestedArray( + /* ndims= */ ndims, + /* total_items_count= */ items_count, + /* total sizes count= */ max_nof_sizes, + /* total values count= */ max_nof_values, + value_type, + /* user data size= */ sizeof(SQLTypeInfoLite)); } case kARRAY: { const size_t array_item_size = ti.get_elem_type().get_size(); @@ -1539,6 +1709,75 @@ inline void initializeFlatBuffer(FlatBufferManager& m, int64_t items_count, int64_t max_nof_values, const SQLTypeInfo& ti) { + size_t ndims = 0; + FlatBufferManager::ValueType value_type; + int64_t max_nof_sizes = 0; + switch (ti.get_type()) { + case kPOINT: + ndims = 0; + break; + case kLINESTRING: + case kMULTIPOINT: + case kARRAY: + ndims = 1; + max_nof_sizes = items_count + max_nof_values / 3; + break; + case kPOLYGON: + case kMULTILINESTRING: + ndims = 2; + max_nof_sizes = items_count + 2 * max_nof_values / 3; + break; + case kMULTIPOLYGON: + ndims = 3; + max_nof_sizes = items_count + max_nof_values; + break; + default: + UNREACHABLE(); + } + + if (ti.is_geometry()) { + if (ti.get_compression() == kENCODING_GEOINT) { + value_type = FlatBufferManager::PointInt32; + } else { + value_type = FlatBufferManager::PointFloat64; + } + } else if (ti.is_array()) { + switch (ti.get_subtype()) { + case kBOOLEAN: + value_type = FlatBufferManager::Bool8; + break; + case kTINYINT: + value_type = FlatBufferManager::Int8; + break; + case kSMALLINT: + value_type = FlatBufferManager::Int16; + break; + case kINT: + value_type = FlatBufferManager::Int32; + break; + case kTEXT: + CHECK_EQ(ti.get_compression(), kENCODING_DICT); + value_type = FlatBufferManager::Int32; + break; + case kBIGINT: + value_type = FlatBufferManager::Int64; + break; + case kFLOAT: + value_type = FlatBufferManager::Float32; + break; + case kDOUBLE: + value_type = FlatBufferManager::Float64; + break; + default: + UNREACHABLE(); + break; + } + } else { + UNREACHABLE(); + } + + SQLTypeInfoLite ti_lite = ti.toLite(); + switch 
(ti.get_type()) { case kPOINT: { FlatBufferManager::GeoPoint metadata{items_count, @@ -1548,23 +1787,30 @@ inline void initializeFlatBuffer(FlatBufferManager& m, m.initialize(GeoPointFormatId, reinterpret_cast(&metadata)); break; } - case kLINESTRING: { - FlatBufferManager::GeoLineString metadata{items_count, - max_nof_values, - ti.get_input_srid(), - ti.get_output_srid(), - ti.get_compression() == kENCODING_GEOINT}; - m.initialize(GeoLineStringFormatId, reinterpret_cast(&metadata)); - break; - } - case kPOLYGON: { - FlatBufferManager::GeoPolygon metadata{items_count, - max_nof_values, - max_nof_values / 3, - ti.get_input_srid(), - ti.get_output_srid(), - ti.get_compression() == kENCODING_GEOINT}; - m.initialize(GeoPolygonFormatId, reinterpret_cast(&metadata)); + case kLINESTRING: + case kPOLYGON: + case kMULTIPOINT: + case kMULTILINESTRING: + case kMULTIPOLYGON: { + int8_t* null_value_ptr = nullptr; + uint32_t geoint_null_value[2] = {0x80000000U, 0x80000000U}; + double null_point[2] = {2 * DBL_MIN, 2 * DBL_MIN}; + if (ti.get_compression() == kENCODING_GEOINT) { + null_value_ptr = reinterpret_cast(geoint_null_value); + } else { + null_value_ptr = reinterpret_cast(null_point); + } + auto status = m.initializeNestedArray( + /* ndims= */ ndims, + /* total_items_count= */ items_count, + /* total_sizes_count= */ max_nof_sizes, + /* total_values_count= */ max_nof_values, + value_type, + /* null value buffer=*/null_value_ptr, // null value buffer size + // is defined by value type + /* user data buffer=*/reinterpret_cast(&ti_lite), + /* user data buffer size=*/sizeof(SQLTypeInfoLite)); + CHECK_EQ(status, FlatBufferManager::Success); break; } case kARRAY: { diff --git a/Shared/sqltypes_lite.h b/Shared/sqltypes_lite.h new file mode 100644 index 0000000000..529351361b --- /dev/null +++ b/Shared/sqltypes_lite.h @@ -0,0 +1,66 @@ +/* + * Copyright 2023 HEAVY.AI, Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + Provides a light-weight data structure SQLTypeInfoLite to serialize + SQLTypeInfo (from sqltypes.h) for the extension functions (in + heavydbTypes.h) by FlatBufferManager. + + Extend SQLTypeInfoLite struct as needed but keep it simple so that + both sqltypes.h and heavydbTypes.h are able to include it (recall, + the two header files cannot include each other). +*/ + +#pragma once + +#include + +struct SQLTypeInfoLite { + enum SQLTypes { + UNSPECIFIED = 0, + BOOLEAN, + TINYINT, + SMALLINT, + INT, + BIGINT, + FLOAT, + DOUBLE, + POINT, + LINESTRING, + POLYGON, + MULTIPOINT, + MULTILINESTRING, + MULTIPOLYGON, + TEXT, + ARRAY + }; + enum EncodingType { + NONE = 0, + DICT, // used by TEXT and ARRAY of TEXT + GEOINT // used by geotypes + }; + SQLTypes type; + SQLTypes subtype; // used by ARRAY + EncodingType compression; // used by geotypes and TEXT and ARRAY of TEXT + int32_t dimension; // input_srid + int32_t scale; // output_srid + int32_t db_id; // used by TEXT and ARRAY of TEXT + int32_t dict_id; // used by TEXT and ARRAY of TEXT + + inline bool is_geoint() const { return compression == GEOINT; } + inline int32_t get_input_srid() const { return dimension; } + inline int32_t get_output_srid() const { return scale; } +}; diff --git a/TableArchiver/TableArchiver.cpp b/TableArchiver/TableArchiver.cpp index 14afc7533d..3180099ace 100644 --- a/TableArchiver/TableArchiver.cpp +++ 
b/TableArchiver/TableArchiver.cpp @@ -54,9 +54,13 @@ extern bool g_cluster; extern std::string g_base_path; bool g_test_rollback_dump_restore{false}; +constexpr static int kDumpVersion = 1; +constexpr static int kDumpVersion_remove_render_group_columns = 1; + constexpr static char const* table_schema_filename = "_table.sql"; constexpr static char const* table_oldinfo_filename = "_table.oldinfo"; constexpr static char const* table_epoch_filename = "_table.epoch"; +constexpr static char const* table_dumpversion_filename = "_table.dumpversion"; #if BOOST_VERSION < 107300 namespace std { @@ -79,7 +83,9 @@ inline std::string abs_path(const File_Namespace::GlobalFileMgr* global_file_mgr return boost::filesystem::canonical(global_file_mgr->getBasePath()).string(); } -inline std::string run(const std::string& cmd, const std::string& chdir = "") { +inline std::string run(const std::string& cmd, + const std::string& chdir = "", + const bool log_failure = true) { VLOG(3) << "running cmd: " << cmd; int rcode; std::error_code ec; @@ -99,11 +105,13 @@ inline std::string run(const std::string& cmd, const std::string& chdir = "") { errors = ss_errors.str(); }); if (rcode || ec) { - LOG(ERROR) << "failed cmd: " << cmd; - LOG(ERROR) << "exit code: " << rcode; - LOG(ERROR) << "error code: " << ec.value() << " - " << ec.message(); - LOG(ERROR) << "stdout: " << output; - LOG(ERROR) << "stderr: " << errors; + if (log_failure) { + LOG(ERROR) << "failed cmd: " << cmd; + LOG(ERROR) << "exit code: " << rcode; + LOG(ERROR) << "error code: " << ec.value() << " - " << ec.message(); + LOG(ERROR) << "stdout: " << output; + LOG(ERROR) << "stderr: " << errors; + } #if defined(__APPLE__) // osx bsdtar options "--use-compress-program" and "--fast-read" together // run into pipe write error after tar extracts the first occurrence of a @@ -150,7 +158,8 @@ inline std::string run(const std::string& cmd, const std::string& chdir = "") { inline std::string simple_file_cat(const std::string& archive_path, 
const std::string& file_name, - const std::string& compression) { + const std::string& compression, + const bool log_failure = true) { ddl_utils::validate_allowed_file_path(archive_path, ddl_utils::DataTransferType::IMPORT); #if defined(__APPLE__) @@ -163,7 +172,8 @@ inline std::string simple_file_cat(const std::string& archive_path, boost::filesystem::create_directories(temp_dir); run("tar " + compression + " -xvf " + get_quoted_string(archive_path) + " " + opt_occurrence + " " + file_name, - temp_dir.string()); + temp_dir.string(), + log_failure); const auto output = run("cat " + (temp_dir / file_name).string()); boost::filesystem::remove_all(temp_dir); return output; @@ -180,10 +190,11 @@ inline std::string get_table_schema(const std::string& archive_path, // If a table was altered there may be a mapping from old column ids to new ones these // values need to be replaced in the page headers. -void rewrite_column_ids_in_page_headers( +void update_or_drop_column_ids_in_page_headers( const boost::filesystem::path& path, const std::unordered_map& column_ids_map, - const int32_t table_epoch) { + const int32_t table_epoch, + const bool drop_not_update) { const std::string file_path = path.string(); const std::string file_name = path.filename().string(); std::vector tokens; @@ -230,11 +241,31 @@ void rewrite_column_ids_in_page_headers( continue; } auto column_map_it = column_ids_map.find(col_id); - CHECK(column_map_it != column_ids_map.end()) << "could not find " << col_id; - // If a header contains a column id that is remapped to new location - // then write that change to the file. 
- if (const auto dest_col_id = column_map_it->second; col_id != dest_col_id) { - col_id = dest_col_id; + bool rewrite_header = false; + if (drop_not_update) { + // if the header contains a column ID that is a key of the map + // erase the entire header so that column is effectively dropped + // the value of the map is ignored, thus allowing us to use the + // same function for both operations + if (column_map_it != column_ids_map.end()) { + // clear the entire header + std::memset(header_info, 0, sizeof(header_info)); + rewrite_header = true; + } + } else { + if (column_map_it == column_ids_map.end()) { + throw std::runtime_error("Page " + std::to_string(page) + " in " + file_path + + " has unexpected Column ID " + std::to_string(col_id) + + ". Dump may be corrupt."); + } + // If a header contains a column id that is remapped to new location + // then write that change to the file. + if (const auto dest_col_id = column_map_it->second; col_id != dest_col_id) { + col_id = dest_col_id; + rewrite_header = true; + } + } + if (rewrite_header) { if (0 != std::fseek(fp.get(), page * page_size, SEEK_SET)) { throw std::runtime_error("Failed to seek to page# " + std::to_string(page) + file_path + " for write: " + std::strerror(errno)); @@ -248,12 +279,15 @@ void rewrite_column_ids_in_page_headers( } } -// Adjust column ids in chunk keys in a table's data files under a temp_data_dir, +// Rewrite column ids in chunk keys in a table's data files under a temp_data_dir, // including files of all shards of the table. Can be slow for big files but should // be scale faster than refragmentizing. Table altering should be rare for olap. -void adjust_altered_table_files(const int32_t table_epoch, - const std::string& temp_data_dir, - const std::unordered_map& column_ids_map) { +// Also used to erase page headers for columns that must be dropped completely. 
+void update_or_drop_column_ids_in_table_files( + const int32_t table_epoch, + const std::string& temp_data_dir, + const std::unordered_map& column_ids_map, + const bool drop_not_update) { boost::filesystem::path base_path(temp_data_dir); boost::filesystem::recursive_directory_iterator end_it; ThreadController_NS::SimpleThreadController<> thread_controller(cpu_threads()); @@ -261,8 +295,11 @@ void adjust_altered_table_files(const int32_t table_epoch, ++fit) { if (!boost::filesystem::is_symlink(fit->path()) && boost::filesystem::is_regular_file(fit->status())) { - thread_controller.startThread( - rewrite_column_ids_in_page_headers, fit->path(), column_ids_map, table_epoch); + thread_controller.startThread(update_or_drop_column_ids_in_page_headers, + fit->path(), + column_ids_map, + table_epoch, + drop_not_update); thread_controller.checkThreadsStatus(); } } @@ -330,6 +367,73 @@ void rename_table_directories(const File_Namespace::GlobalFileMgr* global_file_m } } +std::unordered_map find_render_group_columns( + const std::list& src_columns, + std::vector& src_oldinfo_strs, + const std::string& archive_path) { + // scan for poly or mpoly columns and collect their names + std::vector poly_column_names; + for (auto const& src_column : src_columns) { + auto const sqltype = src_column.columnType.get_type(); + if (sqltype == kPOLYGON || sqltype == kMULTIPOLYGON) { + poly_column_names.push_back(src_column.columnName); + } + } + + // remove any matching render group columns from the source list + // and capture their IDs in the keys of a map (value is ignored) + std::unordered_map column_ids_to_drop; + auto last_itr = std::remove_if( + src_oldinfo_strs.begin(), + src_oldinfo_strs.end(), + [&](const std::string& v) -> bool { + // tokenize + std::vector tokens; + boost::algorithm::split( + tokens, v, boost::is_any_of(":"), boost::token_compress_on); + // extract name and ID + if (tokens.size() < 2) { + throw std::runtime_error( + "Dump " + archive_path + + " has invalid oldinfo 
file contents. Dump may be corrupt."); + } + auto const& column_name = tokens[0]; + auto const column_id = std::stoi(tokens[1]); + for (auto const& poly_column_name : poly_column_names) { + // is it a render group column? + auto const render_group_column_name = poly_column_name + "_render_group"; + if (column_name == render_group_column_name) { + LOG(INFO) << "RESTORE TABLE dropping render group column '" + << render_group_column_name << "' from dump " << archive_path; + // add to "set" + column_ids_to_drop[column_id] = -1; + return true; + } + } + return false; + }); + src_oldinfo_strs.erase(last_itr, src_oldinfo_strs.end()); + + return column_ids_to_drop; +} + +void drop_render_group_columns( + const std::unordered_map& render_group_column_ids, + const std::string& archive_path, + const std::string& temp_data_dir, + const std::string& compression) { + // rewrite page files to drop the columns with IDs that are the keys of the map + if (render_group_column_ids.size()) { + const auto epoch = boost::lexical_cast( + simple_file_cat(archive_path, table_epoch_filename, compression)); + const auto time_ms = measure<>::execution([&]() { + update_or_drop_column_ids_in_table_files( + epoch, temp_data_dir, render_group_column_ids, true /* drop */); + }); + VLOG(3) << "drop render group columns: " << time_ms << " ms"; + } +} + } // namespace void TableArchiver::dumpTable(const TableDescriptor* td, @@ -387,6 +491,10 @@ void TableArchiver::dumpTable(const TableDescriptor* td, const auto table_name = td->tableName; { + // - gen dumpversion file + const auto dumpversion_str = std::to_string(kDumpVersion); + file_writer( + uuid_dir / table_dumpversion_filename, "table dumpversion", dumpversion_str); // - gen schema file const auto schema_str = cat_->dumpSchema(td); file_writer(uuid_dir / table_schema_filename, "table schema", schema_str); @@ -512,6 +620,36 @@ void TableArchiver::restoreTable(const Catalog_Namespace::SessionInfo& session, all_src_oldinfo_str, boost::is_any_of(" 
"), boost::token_compress_on); + + // fetch dump version + int dump_version = -1; + try { + // attempt to read file, do not log if fail to read + auto const dump_version_str = + simple_file_cat(archive_path, table_dumpversion_filename, compression, false); + dump_version = std::stoi(dump_version_str); + } catch (std::runtime_error& e) { + // no dump version file found + dump_version = 0; + } + LOG(INFO) << "Dump Version: " << dump_version; + + // version-specific behavior + const bool do_drop_render_group_columns = + (dump_version < kDumpVersion_remove_render_group_columns); + + // remove any render group columns from the source columns so that the list of + // source columns matches the already-created table, and the removed ones will + // not have an entry in column_ids_map, and hence will not have their data + // mapped later (effectively dropping them), and return their IDs for when + // they are actually dropped later + std::unordered_map render_group_column_ids; + if (do_drop_render_group_columns) { + render_group_column_ids = + find_render_group_columns(src_columns, src_oldinfo_strs, archive_path); + } + + // compare with the destination columns auto all_dst_columns = cat_->getAllColumnMetadataForTable(td->tableId, true, true, true); if (src_oldinfo_strs.size() != all_dst_columns.size()) { @@ -559,20 +697,30 @@ void TableArchiver::restoreTable(const Catalog_Namespace::SessionInfo& session, was_table_altered = was_table_altered || it.first != it.second; }); VLOG(3) << "was_table_altered = " << was_table_altered; + // extract all data files to a temp dir. will swap with dst table dir after all set, // otherwise will corrupt table in case any bad thing happens in the middle. 
run("rm -rf " + temp_data_dir.string()); run("mkdir -p " + temp_data_dir.string()); run("tar " + compression + " -xvf " + get_quoted_string(archive_path), temp_data_dir); + // drop the render group columns here + if (do_drop_render_group_columns) { + drop_render_group_columns( + render_group_column_ids, archive_path, temp_data_dir, compression); + } + // if table was ever altered after it was created, update column ids in chunk headers. if (was_table_altered) { const auto epoch = boost::lexical_cast( simple_file_cat(archive_path, table_epoch_filename, compression)); - const auto time_ms = measure<>::execution( - [&]() { adjust_altered_table_files(epoch, temp_data_dir, column_ids_map); }); - VLOG(3) << "adjust_altered_table_files: " << time_ms << " ms"; + const auto time_ms = measure<>::execution([&]() { + update_or_drop_column_ids_in_table_files( + epoch, temp_data_dir, column_ids_map, false /* update */); + }); + VLOG(3) << "update_column_ids_table_files: " << time_ms << " ms"; } + // finally,,, swap table data/dict dirs! 
const auto data_file_dirs = cat_->getTableDataDirectories(td); const auto dict_file_dirs = cat_->getTableDictDirectories(td); diff --git a/Tests/BufferMgrTest.cpp b/Tests/BufferMgrTest.cpp index ad8143e671..5c6a34b997 100644 --- a/Tests/BufferMgrTest.cpp +++ b/Tests/BufferMgrTest.cpp @@ -839,7 +839,7 @@ TEST_P(BufferMgrTest, DeleteBuffersWithPrefixPinnedBuffer) { TEST_P(BufferMgrTest, DeleteBuffersWithPrefixNoMatchingPrefix) { buffer_mgr_ = createBufferMgr(); - createPinnedBuffers(1); + createUnpinnedBuffers(1); assertSegmentCount(2); assertSegmentAttributes(0, 0, Buffer_Namespace::USED); @@ -856,6 +856,56 @@ TEST_P(BufferMgrTest, DeleteBuffersWithPrefixNoMatchingPrefix) { assertExpectedBufferMgrAttributes(); } +TEST_P(BufferMgrTest, DeleteBuffersWithLongerPrefixBeforeCachedChunkKey) { + buffer_mgr_ = createBufferMgr(); + createUnpinnedBuffers(1); + + assertSegmentCount(2); + assertSegmentAttributes(0, 0, Buffer_Namespace::USED); + assertSegmentAttributes(0, 1, Buffer_Namespace::FREE); + assertExpectedBufferMgrAttributes(); + + EXPECT_TRUE(buffer_mgr_->isBufferOnDevice(test_chunk_key_)); + + // Use a prefix that is longer than `test_chunk_key_` but occurs before it when sorted. 
+ auto chunk_prefix = test_chunk_key_; + chunk_prefix.emplace_back(1); + chunk_prefix[CHUNK_KEY_TABLE_IDX]--; + + buffer_mgr_->deleteBuffersWithPrefix(chunk_prefix); + EXPECT_TRUE(buffer_mgr_->isBufferOnDevice(test_chunk_key_)); + + assertSegmentCount(2); + assertSegmentAttributes(0, 0, Buffer_Namespace::USED); + assertSegmentAttributes(0, 1, Buffer_Namespace::FREE); + assertExpectedBufferMgrAttributes(); +} + +TEST_P(BufferMgrTest, DeleteBuffersWithLongerPrefixAfterCachedChunkKey) { + buffer_mgr_ = createBufferMgr(); + createUnpinnedBuffers(1); + + assertSegmentCount(2); + assertSegmentAttributes(0, 0, Buffer_Namespace::USED); + assertSegmentAttributes(0, 1, Buffer_Namespace::FREE); + assertExpectedBufferMgrAttributes(); + + EXPECT_TRUE(buffer_mgr_->isBufferOnDevice(test_chunk_key_)); + + // Use a prefix that is longer than `test_chunk_key_` but occurs after it when sorted. + auto chunk_prefix = test_chunk_key_; + chunk_prefix.emplace_back(1); + chunk_prefix[CHUNK_KEY_TABLE_IDX]++; + + buffer_mgr_->deleteBuffersWithPrefix(chunk_prefix); + EXPECT_TRUE(buffer_mgr_->isBufferOnDevice(test_chunk_key_)); + + assertSegmentCount(2); + assertSegmentAttributes(0, 0, Buffer_Namespace::USED); + assertSegmentAttributes(0, 1, Buffer_Namespace::FREE); + assertExpectedBufferMgrAttributes(); +} + TEST_P(BufferMgrTest, AllocAndFree) { buffer_mgr_ = createBufferMgr(); diff --git a/Tests/DumpRestoreTest.cpp b/Tests/DumpRestoreTest.cpp index 2d58baf924..ff793a893b 100644 --- a/Tests/DumpRestoreTest.cpp +++ b/Tests/DumpRestoreTest.cpp @@ -250,6 +250,7 @@ class DumpAndRestoreTest : public ::testing::Test { boost::filesystem::remove_all(tar_ball_path); run_ddl_statement("DROP TABLE IF EXISTS test_table;"); run_ddl_statement("DROP TABLE IF EXISTS test_table_2;"); + run_ddl_statement("DROP TABLE IF EXISTS render_groups;"); g_test_rollback_dump_restore = false; } @@ -257,6 +258,7 @@ class DumpAndRestoreTest : public ::testing::Test { boost::filesystem::remove_all(tar_ball_path); 
run_ddl_statement("DROP TABLE IF EXISTS test_table;"); run_ddl_statement("DROP TABLE IF EXISTS test_table_2;"); + run_ddl_statement("DROP TABLE IF EXISTS render_groups;"); } void sqlAndCompareResult(const std::string& sql, @@ -581,6 +583,30 @@ TEST_F(DumpAndRestoreTest, DumpAlteredTable) { sqlAndCompareResult("SELECT * FROM test_table;", std::vector{1}); } +TEST_F(DumpAndRestoreTest, DropRenderGroupColumns) { + static constexpr int kNullInt = std::numeric_limits::min(); + auto file_path = boost::filesystem::canonical( + "../../Tests/Export/TableDump/dump_with_render_groups_good.gz") + .string(); + EXPECT_NO_THROW(run_ddl_statement("RESTORE TABLE render_groups FROM '" + file_path + + "' WITH (compression='gzip');")); + sqlAndCompareResult("SELECT id FROM render_groups;", + std::vector{1, 2, 3, 4, 5}); + sqlAndCompareResult("SELECT ST_NPOINTS(poly) FROM render_groups;", + std::vector{4, kNullInt, 3, 3, 0}); + sqlAndCompareResult("SELECT ST_NPOINTS(multipoly) FROM render_groups;", + std::vector{6, kNullInt, 9, 9, 0}); +} + +TEST_F(DumpAndRestoreTest, DropRenderGroupColumnsBad) { + auto file_path = boost::filesystem::canonical( + "../../Tests/Export/TableDump/dump_with_render_groups_bad.gz") + .string(); + EXPECT_THROW(run_ddl_statement("RESTORE TABLE render_groups FROM '" + file_path + + "' WITH (compression='gzip');"), + std::runtime_error); +} + #ifdef HAVE_AWS_S3 class S3RestoreTest : public DumpAndRestoreTest { protected: diff --git a/Tests/ExecuteTest.cpp b/Tests/ExecuteTest.cpp index 7069bcb69c..d7e6f722fc 100644 --- a/Tests/ExecuteTest.cpp +++ b/Tests/ExecuteTest.cpp @@ -12820,6 +12820,45 @@ TEST_F(Select, Joins_Negative_ShardKey) { } } +TEST_F(Select, Joins_One_Shard_Is_Empty) { + SKIP_ALL_ON_AGGREGATOR(); + if (skip_tests(ExecutorDeviceType::GPU)) { + return; + } + size_t num_shards = choose_shard_count(); + if (num_shards == 1) { + return; + } + auto drop_tbls = [] { + run_ddl_statement("DROP TABLE IF EXISTS SJ1;"); + run_ddl_statement("DROP TABLE IF EXISTS 
SJ2;"); + }; + drop_tbls(); + for (auto tbl_name : {"SJ1", "SJ2"}) { + std::ostringstream oss; + oss << "CREATE TABLE " << tbl_name + << " (v INT, SHARD KEY(v)) WITH (SHARD_COUNT=" << num_shards << ");"; + run_ddl_statement(oss.str()); + } + for (auto v : {2, 10}) { + std::ostringstream oss; + oss << "INSERT INTO SJ1 VALUES(" << v << ");"; + run_multiple_agg(oss.str(), ExecutorDeviceType::CPU); + } + for (auto v : {30, 2, 2, 4, 1, 3}) { + std::ostringstream oss; + oss << "INSERT INTO SJ2 VALUES(" << v << ");"; + run_multiple_agg(oss.str(), ExecutorDeviceType::CPU); + } + ScopeGuard reset = [drop_tbls] { drop_tbls(); }; + for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { + SKIP_NO_GPU(); + EXPECT_EQ(v(run_simple_agg( + "SELECT COUNT(1) FROM SJ1 R, SJ2 S WHERE R.v = S.v;", dt)), + int64_t(2)); + } +} + TEST_F(Select, Joins_InnerJoin_AtLeastThreeTables) { for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { SKIP_NO_GPU(); diff --git a/Tests/Export/TableDump/dump_with_render_groups_bad.gz b/Tests/Export/TableDump/dump_with_render_groups_bad.gz new file mode 100644 index 0000000000..b5d74b34b7 Binary files /dev/null and b/Tests/Export/TableDump/dump_with_render_groups_bad.gz differ diff --git a/Tests/Export/TableDump/dump_with_render_groups_good.gz b/Tests/Export/TableDump/dump_with_render_groups_good.gz new file mode 100644 index 0000000000..33b4147a1f Binary files /dev/null and b/Tests/Export/TableDump/dump_with_render_groups_good.gz differ diff --git a/Tests/Export/TableDump/truncate_test_dump_file.py b/Tests/Export/TableDump/truncate_test_dump_file.py new file mode 100644 index 0000000000..1ffbac8c0e --- /dev/null +++ b/Tests/Export/TableDump/truncate_test_dump_file.py @@ -0,0 +1,58 @@ +import os +import re +import shutil +import struct +import tarfile + +from argparse import ArgumentParser + +output_file_suffix = "_truncated" +def truncate_dump_file(file_path: str): + extract_path = "./dump_file_extract" + with 
tarfile.open(file_path, "r") as old_dump_tar: + old_dump_tar.extractall(path=extract_path) + + for root, _, files in os.walk(extract_path): + for file in files: + match = re.match("\d+\\.(\d+)\\.data", file) + if match: + truncate_data_file(os.path.join(root, file), int(match.group(1))) + + file_name = os.path.basename(file_path) + with tarfile.open(f"{file_name}{output_file_suffix}", "w:gz") as new_dump_tar: + new_dump_tar.add(extract_path, arcname="") + + shutil.rmtree(extract_path) + +def truncate_data_file(file_path: str, page_size: int): + print(f"Truncating data file: {file_path}") + + file_size = os.path.getsize(file_path) + last_empty_header_offset = None + with open(file_path, "r+b") as file: + file_offet = 0 + while file_offet < file_size: + file.seek(file_offet) + header_size = struct.unpack('i', file.read(4))[0] + + # Capture the first free page that is followed by only free pages. + if header_size == 0 and last_empty_header_offset is None: + last_empty_header_offset = file_offet + elif header_size != 0 and last_empty_header_offset is not None: + last_empty_header_offset = None + file_offet += page_size + + if last_empty_header_offset is not None: + # Keep at least one free page in order to ensure that code branches that check for free pages are executed. + new_file_size = last_empty_header_offset + page_size + if new_file_size < file_size: + file.truncate(new_file_size) + +if __name__ == '__main__': + parser = ArgumentParser(description=f"""Truncate the dump file at the specified path for tests. Truncation involves + removal of excess free pages that do not add to test coverage. 
Output + dump file is created in the current directory with a file name that matches + the input file name along with a {output_file_suffix} suffix.""") + parser.add_argument("file_path", type=str, help="Path to file file") + args = parser.parse_args() + truncate_dump_file(file_path=args.file_path) diff --git a/Tests/Import/datafiles/geospatial.csv b/Tests/Import/datafiles/geospatial.csv index d965daa959..251427d752 100644 --- a/Tests/Import/datafiles/geospatial.csv +++ b/Tests/Import/datafiles/geospatial.csv @@ -1,11 +1,11 @@ "point", "linestring", "polygon", "multipolygon", "point2", "point3", "point4", "trip_distance" -"POINT(0 0)", "LINESTRING(0 0, 0 0)", "POLYGON((0 0, 1 0, 0 1, 0 0))", "MULTIPOLYGON(((0 0, 1 0, 0 1, 0 0)))", "POINT(0 0)", "POINT(0 0)", "POINT(0 0)", "0.0" -"POINT(1 1)", "LINESTRING(1 0, 2 2, 3 3)", "POLYGON((0 0, 2 0, 0 2, 0 0))", "MULTIPOLYGON(((0 0, 2 0, 0 2, 0 0)))", "POINT(1 1)", "POINT(1 1)", "POINT(1 1)", "1.0" -"POINT(2 2)", "LINESTRING(2 0, 4 4)", "POLYGON((0 0, 3 0, 0 3, 0 0))", "MULTIPOLYGON(((0 0, 3 0, 0 3, 0 0)))", "POINT(2 2)", "POINT(2 2)", "POINT(2 2)", "2.0" -"POINT(3 3)", "LINESTRING(3 0, 6 6, 7 7)", "POLYGON((0 0, 4 0, 0 4, 0 0))", "MULTIPOLYGON(((0 0, 4 0, 0 4, 0 0)))", "POINT(3 3)", "POINT(3 3)", "POINT(3 3)", "3.0" -"POINT(4 4)", "LINESTRING(4 0, 8 8)", "POLYGON((0 0, 5 0, 0 5, 0 0))", "MULTIPOLYGON(((0 0, 5 0, 0 5, 0 0)))", "POINT(4 4)", "POINT(4 4)", "POINT(4 4)", "4.0" -"POINT(5 5)", "LINESTRING(5 0, 10 10, 11 11)", "POLYGON((0 0, 6 0, 0 6, 0 0))", "MULTIPOLYGON(((0 0, 6 0, 0 6, 0 0)))", "POINT(5 5)", "POINT(5 5)", "POINT(5 5)", "5.0" -"POINT(6 6)", "LINESTRING(6 0, 12 12)", "POLYGON((0 0,7 0, 0 7, 0 0))", "MULTIPOLYGON(((0 0, 7 0, 0 7, 0 0)))", "POINT(6 6)", "POINT(6 6)", "POINT(6 6)", "6.0" -"POINT(7 7)", "LINESTRING(7 0, 14 14, 15 15)", "POLYGON((0 0, 8 0, 0 8, 0 0))", "MULTIPOLYGON(((0 0, 8 0, 0 8, 0 0)))", "POINT(7 7)", "POINT(7 7)", "POINT(7 7)", "7.0" -"POINT(8 8)", "LINESTRING(8 0, 16 16)", "POLYGON((0 0,9 0, 0 
9, 0 0))", "MULTIPOLYGON(((0 0, 9 0, 0 9, 0 0)))", "POINT(8 8)", "POINT(8 8)", "POINT(8 8)", "8.0" -"POINT(9 9)", "LINESTRING(9 0, 18 18, 19 19)", "POLYGON((0 0, 10 0, 0 10, 0 0))", "MULTIPOLYGON(((0 0, 10 0, 0 10, 0 0)))", "POINT(9 9)", "POINT(9 9)", "POINT(9 9)", "9.0" +"POINT(0 0)", "LINESTRING(0 0, 0 0)", "POLYGON((0 0, 1 0, 0 1, 0 0))", "MULTIPOLYGON(((0 0, 1 0, 0 1, 0 0)))", "POINT(0 0)", "POINT(0 0)", "POINT(0 0)", 0.0 +"POINT(1 1)", "LINESTRING(1 0, 2 2, 3 3)", "POLYGON((0 0, 2 0, 0 2, 0 0))", "MULTIPOLYGON(((0 0, 2 0, 0 2, 0 0)))", "POINT(1 1)", "POINT(1 1)", "POINT(1 1)", 1.0 +"POINT(2 2)", "LINESTRING(2 0, 4 4)", "POLYGON((0 0, 3 0, 0 3, 0 0))", "MULTIPOLYGON(((0 0, 3 0, 0 3, 0 0)))", "POINT(2 2)", "POINT(2 2)", "POINT(2 2)", 2.0 +"POINT(3 3)", "LINESTRING(3 0, 6 6, 7 7)", "POLYGON((0 0, 4 0, 0 4, 0 0))", "MULTIPOLYGON(((0 0, 4 0, 0 4, 0 0)))", "POINT(3 3)", "POINT(3 3)", "POINT(3 3)", 3.0 +"POINT(4 4)", "LINESTRING(4 0, 8 8)", "POLYGON((0 0, 5 0, 0 5, 0 0))", "MULTIPOLYGON(((0 0, 5 0, 0 5, 0 0)))", "POINT(4 4)", "POINT(4 4)", "POINT(4 4)", 4.0 +"POINT(5 5)", "LINESTRING(5 0, 10 10, 11 11)", "POLYGON((0 0, 6 0, 0 6, 0 0))", "MULTIPOLYGON(((0 0, 6 0, 0 6, 0 0)))", "POINT(5 5)", "POINT(5 5)", "POINT(5 5)", 5.0 +"POINT(6 6)", "LINESTRING(6 0, 12 12)", "POLYGON((0 0,7 0, 0 7, 0 0))", "MULTIPOLYGON(((0 0, 7 0, 0 7, 0 0)))", "POINT(6 6)", "POINT(6 6)", "POINT(6 6)", 6.0 +"POINT(7 7)", "LINESTRING(7 0, 14 14, 15 15)", "POLYGON((0 0, 8 0, 0 8, 0 0))", "MULTIPOLYGON(((0 0, 8 0, 0 8, 0 0)))", "POINT(7 7)", "POINT(7 7)", "POINT(7 7)", 7.0 +"POINT(8 8)", "LINESTRING(8 0, 16 16)", "POLYGON((0 0,9 0, 0 9, 0 0))", "MULTIPOLYGON(((0 0, 9 0, 0 9, 0 0)))", "POINT(8 8)", "POINT(8 8)", "POINT(8 8)", 8.0 +"POINT(9 9)", "LINESTRING(9 0, 18 18, 19 19)", "POLYGON((0 0, 10 0, 0 10, 0 0))", "MULTIPOLYGON(((0 0, 10 0, 0 10, 0 0)))", "POINT(9 9)", "POINT(9 9)", "POINT(9 9)", 9.0 diff --git a/Tests/Import/datafiles/random_strings_with_line_endings.7z 
b/Tests/Import/datafiles/random_strings_with_line_endings.7z index b2a6db7fbc..f307ab3499 100644 Binary files a/Tests/Import/datafiles/random_strings_with_line_endings.7z and b/Tests/Import/datafiles/random_strings_with_line_endings.7z differ diff --git a/Tests/Import/datafiles/with_quoted_fields_and_side_spaces.csv b/Tests/Import/datafiles/with_quoted_fields_and_side_spaces.csv new file mode 100644 index 0000000000..dd7c9f131b --- /dev/null +++ b/Tests/Import/datafiles/with_quoted_fields_and_side_spaces.csv @@ -0,0 +1,5 @@ +"id","str1","bool1","smallint1" +"1", " test1 ", "true" , "1" + "2","test2","false","2" +"3", "", "true", "3" +"4", test4 , "false", "4" \ No newline at end of file diff --git a/Tests/Import/datafiles/with_quoted_fields_unmatched_left_quote.csv b/Tests/Import/datafiles/with_quoted_fields_unmatched_left_quote.csv new file mode 100644 index 0000000000..9d3a623330 --- /dev/null +++ b/Tests/Import/datafiles/with_quoted_fields_unmatched_left_quote.csv @@ -0,0 +1,4 @@ +"id","str1","bool1","smallint1" +"1", " test1 ", "true" , "1" + "2","test2,"false","2" +"3", "", "true", "3" \ No newline at end of file diff --git a/Tests/Import/datafiles/with_quoted_fields_unmatched_right_quote.csv b/Tests/Import/datafiles/with_quoted_fields_unmatched_right_quote.csv new file mode 100644 index 0000000000..868b73573e --- /dev/null +++ b/Tests/Import/datafiles/with_quoted_fields_unmatched_right_quote.csv @@ -0,0 +1,4 @@ +"id","str1","bool1","smallint1" +"1", " test1 ", "true" , "1" + "2",test2","false","2" +"3", "", "true", "3" \ No newline at end of file diff --git a/Tests/ImportExportTest.cpp b/Tests/ImportExportTest.cpp index ccb802c933..3d23e3b3ec 100644 --- a/Tests/ImportExportTest.cpp +++ b/Tests/ImportExportTest.cpp @@ -2173,6 +2173,15 @@ const char* create_table_with_side_spaces = R"( ) WITH (FRAGMENT_SIZE=75000000); )"; +const char* create_table_with_quoted_fields_and_side_spaces = R"( + CREATE TABLE with_quoted_fields_and_side_spaces ( + id INTEGER, + str1 
TEXT, + bool1 BOOLEAN, + smallint1 SMALLINT + ) WITH (FRAGMENT_SIZE=75000000); + )"; + const char* create_table_with_side_spaced_array = R"( CREATE TABLE array_with_side_spaces ( id INTEGER, @@ -2223,6 +2232,8 @@ class ImportTest : public ImportExportTestBase { sql(create_table_with_null_text_arrays); sql("drop table if exists with_side_spaces;"); sql(create_table_with_side_spaces); + sql("drop table if exists with_quoted_fields_and_side_spaces;"); + sql(create_table_with_quoted_fields_and_side_spaces); sql("drop table if exists array_with_side_spaces;"); sql(create_table_with_side_spaced_array); sql("drop table if exists null_table;"); @@ -2331,6 +2342,25 @@ class ImportTest : public ImportExportTestBase { return true; } + void importTestWithQuotedFieldsAndSideSpaces(const std::string& filename, + const std::string& quoted, + const std::string& trim) { + sql("TRUNCATE TABLE with_quoted_fields_and_side_spaces;"); + string query_str = + "COPY with_quoted_fields_and_side_spaces FROM '../../Tests/Import/datafiles/" + + filename + "' WITH (quoted='" + quoted + "',trim_spaces='" + trim + "');"; + sql(query_str); + string select_query_str = + "SELECT * FROM with_quoted_fields_and_side_spaces ORDER BY id;"; + string test1 = (trim == "true" ? "test1" : " test1 "); + string test4 = (trim == "true" ? "test4" : " test4 "); + sqlAndCompareResult(select_query_str, + {{i(1), test1, True, i(1)}, + {i(2), "test2", False, i(2)}, + {i(3), NULL, True, i(3)}, + {i(4), test4, False, i(4)}}); + } + bool importTestArrayWithSideSpaces(const string& filename, const string& trim) { sql("TRUNCATE TABLE array_with_side_spaces;"); string query_str = "COPY array_with_side_spaces FROM '../../Tests/Import/datafiles/" + @@ -2698,12 +2728,37 @@ TEST_F(ImportTest, with_quoted_fields) { } } +TEST_F(ImportTest, with_quoted_fields_unmatched_quote) { + std::string error_msg{ + "Unable to find a matching end quote for the quote character '\"' after reading 60 " + "characters. 
Please ensure that all data fields are correctly formatted or update " + "the \"buffer_size\" option appropriately. Row number: 2. First few characters in " + "row: "}; + queryAndAssertException( + "COPY with_quoted_fields FROM " + "'../../Tests/Import/datafiles/with_quoted_fields_unmatched_left_quote.csv' WITH " + "(header='true', quoted='true');", + error_msg + "\"2\",\"test2,\"false\",\"2\" \n\"3\", \"\", \"t"); + queryAndAssertException( + "COPY with_quoted_fields FROM " + "'../../Tests/Import/datafiles/with_quoted_fields_unmatched_right_quote.csv' WITH " + "(header='true', quoted='true');", + error_msg + "\"2\",test2\",\"false\",\"2\" \n\"3\", \"\", \"t"); +} + TEST_F(ImportTest, with_side_spaces) { for (auto trim : {"false", "true"}) { EXPECT_NO_THROW(importTestWithSideSpaces("with_side_spaces.csv", trim)); } } +TEST_F(ImportTest, with_quoted_fields_and_side_spaces) { + for (auto trim : {"false", "true"}) { + EXPECT_NO_THROW(importTestWithQuotedFieldsAndSideSpaces( + "with_quoted_fields_and_side_spaces.csv", "true", trim)); + } +} + TEST_F(ImportTest, with_side_spaced_array) { for (auto trim : {"false", "true"}) { EXPECT_NO_THROW(importTestArrayWithSideSpaces("array_with_side_spaces.csv", trim)); @@ -3027,7 +3082,7 @@ TEST_F(ImportTestGeo, CSV_Import_Buffer_Size_Less_Than_Row_Size) { } TEST_F(ImportTestGeo, CSV_Import_Max_Buffer_Resize_Less_Than_Row_Size) { - import_export::delimited_parser::set_max_buffer_resize(170); + import_export::delimited_parser::set_max_buffer_resize(168); const auto file_path = boost::filesystem::path("../../Tests/Import/datafiles/geospatial.csv"); @@ -3036,9 +3091,9 @@ TEST_F(ImportTestGeo, CSV_Import_Max_Buffer_Resize_Less_Than_Row_Size) { // adapt value based on which importer we're testing as they have different buffer size // management heuristics if (g_enable_legacy_delimited_import) { - expected_error_message += "170"; + expected_error_message += "168"; } else { - expected_error_message += "169"; + expected_error_message += 
"167"; } expected_error_message += " characters. " diff --git a/Tests/OverlapsJoinTest.cpp b/Tests/OverlapsJoinTest.cpp index b12fddbb96..de1cc96b65 100644 --- a/Tests/OverlapsJoinTest.cpp +++ b/Tests/OverlapsJoinTest.cpp @@ -2338,6 +2338,36 @@ TEST_F(OverlapsJoinRewriteTest, ArgumentOrderingAfterTableReordering) { } } +TEST_F(OverlapsJoinRewriteTest, ArgumentReorderingNonPointCol) { + // test logic is different compared with the previous test, + // so we do not use performTest function here + QR::get()->clearCpuMemory(); + g_enable_distance_rangejoin = true; + ScopeGuard reset_flag = [orig = g_from_table_reordering] { + g_from_table_reordering = orig; + }; + auto q1 = + "SELECT COUNT(*) FROM TEST_GEOPT R, TEST_GEOPT2 S WHERE ST_DISTANCE( " + "ST_GeomFromText('POINT(-87.653800724 41.839365536)', 4326), R.pt4326 ) <= 1.0 AND " + "ST_DISTANCE(R.pt4326, S.pt4326) < 0.01;"; + auto q2 = + "SELECT COUNT(*) FROM TEST_GEOPT R, TEST_GEOPT2 S WHERE ST_DISTANCE( S.pt4326, " + "ST_GeomFromText('POINT(-87.653800724 41.839365536)', 4326) ) <= 1.0 AND " + "ST_DISTANCE(R.pt4326, S.pt4326) < 0.01;"; + auto q3 = + "SELECT COUNT(*) FROM TEST_GEOPT R, TEST_GEOPT2 S WHERE ST_DISTANCE( R.pt4326, " + "ST_GeomFromText('POINT(-87.653800724 41.839365536)', 4326) ) <= 1.0 AND " + "ST_DISTANCE( S.pt4326, ST_GeomFromText('POINT(-87.653800724 41.839365536)', 4326) " + ") <= 1.0;"; + for (bool const table_reordering : {true, false}) { + g_from_table_reordering = table_reordering; + // check whether the query finishes without a crash + EXPECT_EQ((int64_t)0, v(execSQL(q1, ExecutorDeviceType::CPU))); + EXPECT_EQ((int64_t)0, v(execSQL(q2, ExecutorDeviceType::CPU))); + EXPECT_EQ((int64_t)0, v(execSQL(q3, ExecutorDeviceType::CPU))); + } +} + TEST_F(OverlapsJoinRewriteTest, TemporaryTable) { QR::get()->runDDLStatement("DROP TABLE IF EXISTS tp1;"); QR::get()->runDDLStatement("DROP TABLE IF EXISTS tp2;"); diff --git a/Tests/SQLHintTest.cpp b/Tests/SQLHintTest.cpp index 65c79adbc9..f075edec54 100644 --- 
a/Tests/SQLHintTest.cpp +++ b/Tests/SQLHintTest.cpp @@ -1220,6 +1220,54 @@ TEST(QueryHint, Subquery) { } } +TEST(QueryHint, HashJoinSpec) { + // this join originally a target for perfect join hash table w/ OneToOne hash table + // layout + auto check_registered_hint = [](std::string const& query, + QueryHint query_hint, + bool is_global) { + auto const rel_alg_dag = QR::get()->getRelAlgDag(query); + if (is_global) { + EXPECT_TRUE(is_hint_globally_registered(rel_alg_dag->getGlobalHints(), query_hint)); + } else { + EXPECT_TRUE(is_hint_registered(rel_alg_dag->getQueryHints(), query_hint)); + } + }; + std::string q1 = + "select /*+ force_baseline_hash_join */ count(1) from JOIN_HINT_TEST R, " + "JOIN_HINT_TEST S where R.v = S.v;"; + std::string q2 = + "select /*+ force_one_to_many_hash_join */ count(1) from JOIN_HINT_TEST R, " + "JOIN_HINT_TEST S where R.v = S.v;"; + std::string q1_g = + "select /*+ g_force_baseline_hash_join */ count(1) from JOIN_HINT_TEST R, " + "JOIN_HINT_TEST S where R.v = S.v;"; + std::string q2_g = + "select /*+ g_force_one_to_many_hash_join */ count(1) from JOIN_HINT_TEST R, " + "JOIN_HINT_TEST S where R.v = S.v;"; + check_registered_hint(q1, QueryHint::kforceBaselineHashJoin, false); + check_registered_hint(q1_g, QueryHint::kforceBaselineHashJoin, true); + check_registered_hint(q2, QueryHint::kforceOneToManyHashJoin, false); + check_registered_hint(q2_g, QueryHint::kforceOneToManyHashJoin, true); + + std::set visited; + QR::get()->runSQL(q1, ExecutorDeviceType::CPU); + auto q1_ht_info = QR::get()->getCachedHashtableWithoutCacheKey( + visited, CacheItemType::BASELINE_HT, DataRecyclerUtil::CPU_DEVICE_IDENTIFIER); + auto cached_ht1 = std::get<1>(q1_ht_info); + auto expected_ht1 = dynamic_cast(cached_ht1.get()); + EXPECT_TRUE(expected_ht1); + + QR::get()->runSQL(q2, ExecutorDeviceType::CPU); + auto q2_ht_info = QR::get()->getCachedHashtableWithoutCacheKey( + visited, CacheItemType::PERFECT_HT, DataRecyclerUtil::CPU_DEVICE_IDENTIFIER); + auto 
cached_ht2 = std::get<1>(q2_ht_info); + auto expected_ht2 = dynamic_cast(cached_ht2.get()); + EXPECT_TRUE(expected_ht2); + EXPECT_TRUE(expected_ht2->getHashTableEntryInfo().getHashTableLayout() == + HashType::OneToMany); +} + int main(int argc, char** argv) { TestHelpers::init_logger_stderr_only(argc, argv); testing::InitGoogleTest(&argc, argv); diff --git a/Tests/TableFunctionsTest.cpp b/Tests/TableFunctionsTest.cpp index ec66f41edb..00d800be68 100644 --- a/Tests/TableFunctionsTest.cpp +++ b/Tests/TableFunctionsTest.cpp @@ -381,75 +381,112 @@ class TableFunctions : public ::testing::Test { run_ddl_statement( "CREATE TABLE geo_polygon_test(" "p1 POLYGON, " + "ml1 MULTILINESTRING, " "r1 LINESTRING, h1 LINESTRING, hh1 LINESTRING, " - "p2 GEOMETRY(POLYGON, 4326), " // uses geoint compression + "p2 GEOMETRY(POLYGON, 4326), " // uses geoint compression + "ml2 GEOMETRY(MULTILINESTRING, 4326), " // uses geoint compression "r2 GEOMETRY(LINESTRING, 4326), h2 GEOMETRY(LINESTRING, 4326), hh2 " "GEOMETRY(LINESTRING, 4326), " "p3 GEOMETRY(POLYGON, 4326) ENCODING NONE, " + "ml3 GEOMETRY(MULTILINESTRING, 4326) ENCODING NONE, " "r3 GEOMETRY(LINESTRING, 4326) ENCODING NONE, h3 GEOMETRY(LINESTRING, 4326) " "ENCODING NONE, " "hh3 GEOMETRY(LINESTRING, 4326) ENCODING NONE, " "p4 GEOMETRY(POLYGON, 900913)," + "ml4 GEOMETRY(MULTILINESTRING, 900913)," "r4 GEOMETRY(LINESTRING, 900913), h4 GEOMETRY(LINESTRING, 900913), hh4 " - "GEOMETRY(LINESTRING, 900913), sizes INT);"); + "GEOMETRY(LINESTRING, 900913), " + "mp1 MULTIPOLYGON, " + "mp2 GEOMETRY(MULTIPOLYGON, 4326), " + "mp3 GEOMETRY(MULTIPOLYGON, 4326) ENCODING NONE, " + "mp4 GEOMETRY(MULTIPOLYGON, 900913), " + "sizes INT);"); TestHelpers::ValuesGenerator gen("geo_polygon_test"); - run_multiple_agg(gen("'POLYGON((1 2,3 4,5 6,7 8,9 10),(2 3,3 4,1 2))'", - "'LINESTRING(1 2,3 4,5 6,7 8,9 10)'", - "'LINESTRING(2 3,3 4,1 2)'", - "'NULL'", - "'POLYGON((0 0,5 0,5 5,0 5,0 0),(2 2, 2 1,1 1,1 2,2 2))'", - "'LINESTRING(0 0,5 0,5 5,0 5)'", - 
"'LINESTRING(2 2,2 1,1 1,1 2)'", - "'NULL'", - "'POLYGON((0 0,6 0,6 6,0 6,0 0),(3 3,3 2,2 2,2 3,3 3))'", - "'LINESTRING(0 0,6 0,6 6,0 6))'", - "'LINESTRING(3 3,3 2,2 2,2 3)'", - "'NULL'", - "'POLYGON((0 0,7 0,7 7,0 7,0 0),(4 4,2 4, 2 3,4 2,4 4))'", - "'LINESTRING(0 0,7 0,7 7,0 7)'", - "'LINESTRING(4 4,4 2,2 3,2 4)'", - "'NULL'", - "8"), - ExecutorDeviceType::CPU); + run_multiple_agg( + gen("'POLYGON((1 2,3 4,5 6,7 8,9 10),(2 3,3 4,1 2))'", + "'MULTILINESTRING((1 2,3 4,5 6,7 8,9 10),(2 3,3 4,1 2))'", + "'LINESTRING(1 2,3 4,5 6,7 8,9 10)'", + "'LINESTRING(2 3,3 4,1 2)'", + "'NULL'", + "'POLYGON((0 0,5 0,5 5,0 5,0 0),(2 2, 2 1,1 1,1 2,2 2))'", + "'MULTILINESTRING((0 0,5 0,5 5,0 5),(2 2, 2 1,1 1,1 2))'", + "'LINESTRING(0 0,5 0,5 5,0 5)'", + "'LINESTRING(2 2,2 1,1 1,1 2)'", + "'NULL'", + "'POLYGON((0 0,6 0,6 6,0 6,0 0),(3 3,3 2,2 2,2 3,3 3))'", + "'MULTILINESTRING((0 0,6 0,6 6,0 6),(3 3,3 2,2 2,2 3))'", + "'LINESTRING(0 0,6 0,6 6,0 6))'", + "'LINESTRING(3 3,3 2,2 2,2 3)'", + "'NULL'", + "'POLYGON((0 0,7 0,7 7,0 7,0 0),(4 4,2 4, 2 3,4 2,4 4))'", + "'MULTILINESTRING((0 0,7 0,7 7,0 7),(4 4,2 4, 2 3,4 2))'", + "'LINESTRING(0 0,7 0,7 7,0 7)'", + "'LINESTRING(4 4,4 2,2 3,2 4)'", + "'NULL'", + "'MULTIPOLYGON(((1 2,3 4,5 6,7 8,9 10),(2 3,3 4,1 2)))'", + "'MULTIPOLYGON(((0 0,5 0,5 5,0 5,0 0),(2 2, 2 1,1 1,1 2,2 2)))'", + "'MULTIPOLYGON(((0 0,6 0,6 6,0 6,0 0),(3 3,3 2,2 2,2 3,3 3)))'", + "'MULTIPOLYGON(((0 0,7 0,7 7,0 7,0 0),(4 4,2 4, 2 3,4 2,4 4)))'", + "8"), + ExecutorDeviceType::CPU); run_multiple_agg(gen("'POLYGON((0 0,5 0,5 5,0 5,0 0))'", + "'MULTILINESTRING((0 0,5 0,5 5,0 5))'", "'LINESTRING(0 0,5 0,5 5,0 5)'", "'NULL'", "'NULL'", "'POLYGON((0 0,6 0,6 6,0 6,0 0))'", + "'MULTILINESTRING((0 0,6 0,6 6,0 6))'", "'LINESTRING(0 0,6 0,6 6,0 6)'", "'NULL'", "'NULL'", "'POLYGON((0 0,7 0,7 7,0 7,0 0))'", + "'MULTILINESTRING((0 0,7 0,7 7,0 7))'", "'LINESTRING(0 0,7 0,7 7,0 7)'", "'NULL'", "'NULL'", "'POLYGON((0 0,4 0,4 4,0 4,0 0))'", + "'MULTILINESTRING((0 0,4 0,4 4,0 4))'", 
"'LINESTRING(0 0,4 0,4 4,0 4)'", "'NULL'", "'NULL'", + "'MULTIPOLYGON(((0 0,5 0,5 5,0 5,0 0)))'", + "'MULTIPOLYGON(((0 0,6 0,6 6,0 6,0 0)))'", + "'MULTIPOLYGON(((0 0,7 0,7 7,0 7,0 0)))'", + "'MULTIPOLYGON(((0 0,4 0,4 4,0 4,0 0)))'", "4"), ExecutorDeviceType::CPU); run_multiple_agg( gen("'POLYGON((1 2,3 4,5 6,7 8,9 10),(3 4,1 2,2 3),(5 6,7 8,9 10))'", + "'MULTILINESTRING((1 2,3 4,5 6,7 8,9 10),(3 4,1 2,2 3),(5 6,7 8,9 10))'", "'LINESTRING(1 2,3 4,5 6,7 8,9 10)'", "'LINESTRING(2 3,3 4,1 2)'", "'LINESTRING(9 10,5 6,7 8)'", "'POLYGON((0 0,5 0,5 5,0 5,0 0),(2 2,2 1,1 1,1 2,2 2),(0 0,0 1,1 0))'", + "'MULTILINESTRING((0 0,5 0,5 5,0 5),(2 2,2 1,1 1,1 2),(0 0,0 1,1 0))'", "'LINESTRING(0 0,5 0,5 5,0 5)'", "'LINESTRING(2 2,2 1,1 1,1 2)'", "'LINESTRING(0 0,0 1,1 0)'", "'POLYGON((0 0,6 0,6 6,0 6,0 0),(3 3,3 2,2 2,2 3,3 3),(0 0,0 1,1 0))'", + "'MULTILINESTRING((0 0,6 0,6 6,0 6),(3 3,3 2,2 2,2 3),(0 0,0 1,1 0))'", "'LINESTRING(0 0,6 0,6 6,0 6))'", "'LINESTRING(3 3,3 2,2 2,2 3)'", "'LINESTRING(0 0,0 1,1 0)'", "'POLYGON((0 0,7 0,7 7,0 7,0 0),(4 4,2 4, 2 3,4 2,4 4),(0 0,0 1,1 0))'", + "'MULTILINESTRING((0 0,7 0,7 7,0 7),(4 4,2 4, 2 3,4 2),(0 0,0 1,1 0))'", "'LINESTRING(0 0,7 0,7 7,0 7)'", "'LINESTRING(4 4,4 2,2 3,2 4)'", "'LINESTRING(0 0,0 1,1 0)'", + "'MULTIPOLYGON(((1 2,3 4,5 6,7 8,9 10),(3 4,1 2,2 3),(5 6,7 8,9 10)))'", + "'MULTIPOLYGON(((0 0,5 0,5 5,0 5,0 0),(2 2,2 1,1 1,1 2,2 2),(0 0,0 1,1 " + "0)))'", + "'MULTIPOLYGON(((0 0,6 0,6 6,0 6,0 0),(3 3,3 2,2 2,2 3,3 3),(0 0,0 1,1 " + "0)))'", + "'MULTIPOLYGON(((0 0,7 0,7 7,0 7,0 0),(4 4,2 4, 2 3,4 2,4 4),(0 0,0 1,1 " + "0)))'", "11"), ExecutorDeviceType::CPU); run_multiple_agg(gen("'NULL'", @@ -468,6 +505,14 @@ class TableFunctions : public ::testing::Test { "'NULL'", "'NULL'", "'NULL'", + "'NULL'", + "'NULL'", + "'NULL'", + "'NULL'", + "'NULL'", + "'NULL'", + "'NULL'", + "'NULL'", "NULL"), ExecutorDeviceType::CPU); } @@ -2923,6 +2968,7 @@ void assert_equal(const TargetValue& val1, std::vector coords1, coords2; std::vector bounds1, 
bounds2; std::vector ring_sizes1, ring_sizes2; + std::vector poly_sizes1, poly_sizes2; switch (ti.get_type()) { case kLINESTRING: { const auto gdal_wkt_ls1 = Geospatial::GeoLineString(*s1); @@ -2940,9 +2986,27 @@ void assert_equal(const TargetValue& val1, gdal_wkt_poly2.getColumns(coords2, ring_sizes2, bounds2); break; } + case kMULTILINESTRING: { + const auto gdal_wkt_poly1 = Geospatial::GeoMultiLineString(*s1); + gdal_wkt_poly1.getColumns(coords1, ring_sizes1, bounds1); + const auto gdal_wkt_poly2 = Geospatial::GeoMultiLineString(*s2); + gdal_wkt_poly2.getColumns(coords2, ring_sizes2, bounds2); + break; + } + case kMULTIPOLYGON: { + const auto gdal_wkt_mpoly1 = Geospatial::GeoMultiPolygon(*s1); + gdal_wkt_mpoly1.getColumns(coords1, ring_sizes1, poly_sizes1, bounds1); + const auto gdal_wkt_mpoly2 = Geospatial::GeoMultiPolygon(*s2); + gdal_wkt_mpoly2.getColumns(coords2, ring_sizes2, poly_sizes2, bounds2); + break; + } default: UNREACHABLE() << "ti=" << ti.to_string(); } + ASSERT_EQ(poly_sizes1.size(), poly_sizes2.size()); + for (size_t i = 0; i < poly_sizes1.size(); i++) { + ASSERT_EQ(poly_sizes1[i], poly_sizes2[i]); + } ASSERT_EQ(ring_sizes1.size(), ring_sizes2.size()); ASSERT_EQ(coords1.size(), coords2.size()); int64_t k = 0; @@ -2963,8 +3027,8 @@ void assert_equal(const TargetValue& val1, for (int32_t j = 0; j < sz; j++) { Point p1 = points1[j]; Point p2 = points2[j]; - ASSERT_NEAR(p1.x, p2.x, 1e-7); - ASSERT_NEAR(p1.y, p2.y, 1e-7); + ASSERT_NEAR(p1.x, p2.x, 1e-6); + ASSERT_NEAR(p1.y, p2.y, 1e-6); } } } @@ -3509,7 +3573,7 @@ TEST_F(TableFunctions, ColumnGeoLineStringInput) { } } -TEST_F(TableFunctions, ColumnGeoLineStringInputOutput) { +TEST_F(TableFunctions, ColumnGeoLineStringCopy) { for (auto dt : {ExecutorDeviceType::CPU /*, ExecutorDeviceType::GPU*/}) { SKIP_NO_GPU(); for (int i = 1; i <= 4; i++) { @@ -3517,8 +3581,62 @@ TEST_F(TableFunctions, ColumnGeoLineStringInputOutput) { { // Test Column input and output std::string q1 = "SELECT " + col + " FROM 
geo_line_string_test;"; - std::string q2 = "SELECT copied_linestrings FROM TABLE(CT_COPY(CURSOR(SELECT " + - col + " FROM geo_line_string_test)))"; + std::string q2 = "SELECT outputs FROM TABLE(CT_COPY(CURSOR(SELECT " + col + + " FROM geo_line_string_test)))"; + const auto expected_rows = run_multiple_agg(q1, dt); + const auto rows = run_multiple_agg(q2, dt); + assert_equal(rows, expected_rows); + } + } + } +} + +TEST_F(TableFunctions, ColumnGeoMultiLineStringCopy) { + for (auto dt : {ExecutorDeviceType::CPU /*, ExecutorDeviceType::GPU*/}) { + SKIP_NO_GPU(); + for (int i = 1; i <= 4; i++) { + std::string col = "ml" + std::to_string(i); + { + // Test Column input and output + std::string q1 = "SELECT " + col + " FROM geo_polygon_test;"; + std::string q2 = "SELECT outputs FROM TABLE(CT_COPY(CURSOR(SELECT " + col + + " FROM geo_polygon_test)))"; + const auto expected_rows = run_multiple_agg(q1, dt); + const auto rows = run_multiple_agg(q2, dt); + assert_equal(rows, expected_rows); + } + } + } +} + +TEST_F(TableFunctions, ColumnGeoPolygonCopy) { + for (auto dt : {ExecutorDeviceType::CPU /*, ExecutorDeviceType::GPU*/}) { + SKIP_NO_GPU(); + for (int i = 1; i <= 4; i++) { + std::string col = "p" + std::to_string(i); + { + // Test Column input and output + std::string q1 = "SELECT " + col + " FROM geo_polygon_test;"; + std::string q2 = "SELECT outputs FROM TABLE(CT_COPY(CURSOR(SELECT " + col + + " FROM geo_polygon_test)))"; + const auto expected_rows = run_multiple_agg(q1, dt); + const auto rows = run_multiple_agg(q2, dt); + assert_equal(rows, expected_rows); + } + } + } +} + +TEST_F(TableFunctions, ColumnGeoMultiPolygonCopy) { + for (auto dt : {ExecutorDeviceType::CPU /*, ExecutorDeviceType::GPU*/}) { + SKIP_NO_GPU(); + for (int i = 1; i <= 4; i++) { + std::string col = "mp" + std::to_string(i); + { + // Test Column input and output + std::string q1 = "SELECT " + col + " FROM geo_polygon_test;"; + std::string q2 = "SELECT outputs FROM TABLE(CT_COPY(CURSOR(SELECT " + col + 
+ " FROM geo_polygon_test)))"; const auto expected_rows = run_multiple_agg(q1, dt); const auto rows = run_multiple_agg(q2, dt); assert_equal(rows, expected_rows); @@ -3604,6 +3722,142 @@ TEST_F(TableFunctions, ColumnGeoPolygonOutput) { } } +TEST_F(TableFunctions, ColumnGeoPolygonInOutput) { + for (auto dt : {ExecutorDeviceType::CPU /*, ExecutorDeviceType::GPU*/}) { + SKIP_NO_GPU(); + for (int i = 1; i <= 4; i++) { + std::string pcol = "p" + std::to_string(i); + std::string rcol = "r" + std::to_string(i); + std::string hcol = "h" + std::to_string(i); + std::string hhcol = "hh" + std::to_string(i); + { + std::string q1 = "SELECT " + rcol + " FROM geo_polygon_test;"; + std::string q2 = + "SELECT linestrings FROM TABLE(CT_LINESTRINGN(CURSOR(" + "SELECT polygons FROM TABLE(CT_MAKE_POLYGON3(CURSOR(SELECT " + + rcol + ", " + hcol + ", " + hhcol + " FROM geo_polygon_test)))), 1));"; + const auto expected_rows = run_multiple_agg(q1, dt); + const auto rows = run_multiple_agg(q2, dt); + assert_equal(rows, expected_rows); + } + } + } +} + +TEST_F(TableFunctions, ColumnGeoMultiLineStringOutput) { + for (auto dt : {ExecutorDeviceType::CPU /*, ExecutorDeviceType::GPU*/}) { + SKIP_NO_GPU(); + for (int i = 1; i <= 4; i++) { + std::string pcol = "p" + std::to_string(i); + std::string mlcol = "ml" + std::to_string(i); + { + std::string q1 = "SELECT " + mlcol + " FROM geo_polygon_test;"; + std::string q2 = + "SELECT mlinestrings FROM TABLE(CT_TO_MULTILINESTRING(CURSOR(SELECT " + pcol + + " FROM geo_polygon_test)));"; + + const auto expected_rows = run_multiple_agg(q1, dt); + const auto rows = run_multiple_agg(q2, dt); + assert_equal(rows, expected_rows); + } + } + } +} + +TEST_F(TableFunctions, ColumnGeoMultiLineStringInput) { + for (auto dt : {ExecutorDeviceType::CPU /*, ExecutorDeviceType::GPU*/}) { + SKIP_NO_GPU(); + for (int i = 1; i <= 4; i++) { + std::string pcol = "p" + std::to_string(i); + std::string mlcol = "ml" + std::to_string(i); + { + std::string q1 = "SELECT " + pcol + 
" FROM geo_polygon_test;"; + std::string q2 = "SELECT polygons FROM TABLE(CT_TO_POLYGON(CURSOR(SELECT " + + mlcol + " FROM geo_polygon_test)));"; + + const auto expected_rows = run_multiple_agg(q1, dt); + const auto rows = run_multiple_agg(q2, dt); + assert_equal(rows, expected_rows); + } + } + } +} + +TEST_F(TableFunctions, ColumnGeoMultiPolygonOutput) { + for (auto dt : {ExecutorDeviceType::CPU /*, ExecutorDeviceType::GPU*/}) { + SKIP_NO_GPU(); + for (int i = 1; i <= 4; i++) { + std::string pcol = "p" + std::to_string(i); + std::string mpcol = "mp" + std::to_string(i); + // Test Column output + { + std::string q1 = "SELECT " + mpcol + " FROM geo_polygon_test;"; + std::string q2 = + "SELECT mpolygons FROM TABLE(CT_MAKE_MULTIPOLYGON(CURSOR(SELECT " + pcol + + " FROM geo_polygon_test)));"; + const auto expected_rows = run_multiple_agg(q1, dt); + const auto rows = run_multiple_agg(q2, dt); + assert_equal(rows, expected_rows); + } + } + } +} + +TEST_F(TableFunctions, ColumnGeoMultiPolygonInput) { + for (auto dt : {ExecutorDeviceType::CPU /*, ExecutorDeviceType::GPU*/}) { + SKIP_NO_GPU(); + for (int i = 1; i <= 4; i++) { + std::string pcol = "p" + std::to_string(i); + std::string mpcol = "mp" + std::to_string(i); + // Test Column input + { + std::string q1 = "SELECT " + pcol + " FROM geo_polygon_test;"; + std::string q2 = "SELECT polygons FROM TABLE(CT_POLYGONN(CURSOR(SELECT " + mpcol + + " FROM geo_polygon_test), 1));"; + const auto expected_rows = run_multiple_agg(q1, dt); + const auto rows = run_multiple_agg(q2, dt); + assert_equal(rows, expected_rows); + } + } + } +} + +TEST_F(TableFunctions, ColumnGeoMultiPolygonInOutput) { + for (auto dt : {ExecutorDeviceType::CPU /*, ExecutorDeviceType::GPU*/}) { + SKIP_NO_GPU(); + for (int i = 1; i <= 4; i++) { + std::string pcol = "p" + std::to_string(i); + std::string mpcol = "mp" + std::to_string(i); + // Test Column input, Column + // output, Column input, Column + // output + { + std::string q1 = "SELECT " + pcol + " FROM 
geo_polygon_test;"; + std::string q2 = + "SELECT polygons FROM TABLE(CT_POLYGONN(CURSOR(SELECT" + " mpolygons FROM TABLE(CT_MAKE_MULTIPOLYGON(CURSOR(SELECT " + + pcol + " FROM geo_polygon_test)))), 1));"; + const auto expected_rows = run_multiple_agg(q1, dt); + const auto rows = run_multiple_agg(q2, dt); + assert_equal(rows, expected_rows); + } + // Test Column input, Column + // output, Column input, Column + // output: + { + std::string q1 = "SELECT " + mpcol + " FROM geo_polygon_test;"; + std::string q2 = + "SELECT mpolygons FROM TABLE(CT_MAKE_MULTIPOLYGON(" + "CURSOR(SELECT polygons FROM TABLE(CT_POLYGONN(CURSOR(SELECT " + + mpcol + " FROM geo_polygon_test), 1)))));"; + const auto expected_rows = run_multiple_agg(q1, dt); + const auto rows = run_multiple_agg(q2, dt); + assert_equal(rows, expected_rows); + } + } + } +} + TEST_F(TableFunctions, DefaultScalarValues) { for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { SKIP_NO_GPU(); diff --git a/ThriftHandler/CommandLineOptions.cpp b/ThriftHandler/CommandLineOptions.cpp index 5d06d450f5..5ec335da76 100644 --- a/ThriftHandler/CommandLineOptions.cpp +++ b/ThriftHandler/CommandLineOptions.cpp @@ -93,27 +93,28 @@ void CommandLineOptions::init_logging() { log_options_.set_base_path(base_path); logger::init(log_options_); } - void CommandLineOptions::fillOptions() { - help_desc.add_options()("help,h", "Show available options."); - help_desc.add_options()( + po::options_description& desc = help_desc_; + + desc.add_options()("help,h", "Show available options."); + desc.add_options()( "allow-cpu-retry", po::value(&g_allow_cpu_retry) ->default_value(g_allow_cpu_retry) ->implicit_value(true), R"(Allow the queries which failed on GPU to retry on CPU, even when watchdog is enabled.)"); - help_desc.add_options()("allow-loop-joins", - po::value(&allow_loop_joins) - ->default_value(allow_loop_joins) - ->implicit_value(true), - "Enable loop joins."); - help_desc.add_options()("bigint-count", - 
po::value(&g_bigint_count) - ->default_value(g_bigint_count) - ->implicit_value(true), - "Use 64-bit count."); - - help_desc.add_options()( + desc.add_options()("allow-loop-joins", + po::value(&allow_loop_joins) + ->default_value(allow_loop_joins) + ->implicit_value(true), + "Enable loop joins."); + desc.add_options()("bigint-count", + po::value(&g_bigint_count) + ->default_value(g_bigint_count) + ->implicit_value(true), + "Use 64-bit count."); + + desc.add_options()( "enable-executor-resource-mgr", po::value(&g_enable_executor_resource_mgr) ->default_value(g_enable_executor_resource_mgr) @@ -124,7 +125,7 @@ void CommandLineOptions::fillOptions() { // Note we allow executor-cpu-result-mem-ratio to have values > 0 to allow // oversubscription of memory when warranted, but user should be careful with this as // too high a value can cause OOM errors. - help_desc.add_options()( + desc.add_options()( "executor-cpu-result-mem-ratio", po::value(&g_executor_resource_mgr_cpu_result_mem_ratio) ->default_value(g_executor_resource_mgr_cpu_result_mem_ratio), @@ -134,7 +135,7 @@ void CommandLineOptions::fillOptions() { "warranted, but too high a value can cause out-of-memory errors. Requires " "--executor-resource-mgr to be set"); - help_desc.add_options()( + desc.add_options()( "executor-cpu-result-mem-bytes", po::value(&g_executor_resource_mgr_cpu_result_mem_bytes) ->default_value(g_executor_resource_mgr_cpu_result_mem_bytes), @@ -147,7 +148,7 @@ void CommandLineOptions::fillOptions() { // oversubscription of threads when warranted, given we may be overly pessimistic about // kernel core occupation for some classes of queries. Care should be taken however with // setting this value too high as thrashing and thread starvation can result. 
- help_desc.add_options()( + desc.add_options()( "executor-per-query-max-cpu-threads-ratio", po::value(&g_executor_resource_mgr_per_query_max_cpu_slots_ratio) ->default_value(g_executor_resource_mgr_per_query_max_cpu_slots_ratio), @@ -157,7 +158,7 @@ void CommandLineOptions::fillOptions() { // Note we allow executor-per-query-max-cpu-result-mem-ratio to have values > 0 to allow // oversubscription of memory when warranted, but user should be careful with this as // too high a value can cause OOM errors. - help_desc.add_options()( + desc.add_options()( "executor-per-query-max-cpu-result-mem-ratio", po::value(&g_executor_resource_mgr_per_query_max_cpu_result_mem_ratio) ->default_value(g_executor_resource_mgr_per_query_max_cpu_result_mem_ratio), @@ -165,7 +166,7 @@ void CommandLineOptions::fillOptions() { "that can be " "allocated for a single query. Requires --enable-executor-resource-mgr to be set."); - help_desc.add_options()( + desc.add_options()( "allow-cpu-kernel-concurrency", po::value(&g_executor_resource_mgr_allow_cpu_kernel_concurrency) ->default_value(g_executor_resource_mgr_allow_cpu_kernel_concurrency) @@ -173,7 +174,7 @@ void CommandLineOptions::fillOptions() { "Allow for multiple queries to run execution kernels concurrently on CPU. Requires " "--enable-executor-resource-mgr to be set."); - help_desc.add_options()( + desc.add_options()( "allow-cpu-gpu-kernel-concurrency", po::value(&g_executor_resource_mgr_allow_cpu_gpu_kernel_concurrency) ->default_value(g_executor_resource_mgr_allow_cpu_gpu_kernel_concurrency) @@ -185,7 +186,7 @@ void CommandLineOptions::fillOptions() { // CPU slots/threads Single query CPU slot oversubscription should be controlled with // --executor-per-query-max-cpu-threads-ratio (i.e. 
by setting it to > 1.0) - help_desc.add_options()( + desc.add_options()( "allow-cpu-thread-oversubscription-concurrency", po::value( &g_executor_resource_mgr_allow_cpu_slot_oversubscription_concurrency) @@ -202,7 +203,7 @@ void CommandLineOptions::fillOptions() { // controlled with // --executor-per-query-cpu-result-mem-ratio (i.e. by setting it to > 1.0) - help_desc.add_options()( + desc.add_options()( "allow-cpu-result-mem-oversubscription-concurrency", po::value( &g_executor_resource_mgr_allow_cpu_result_mem_oversubscription_concurrency) @@ -214,342 +215,336 @@ void CommandLineOptions::fillOptions() { "can lead to out-of-memory errors. Requires --enable-executor-resource-mgr to be " "set."); - help_desc.add_options()( + desc.add_options()( "executor-max-available-resource-use-ratio", po::value(&g_executor_resource_mgr_max_available_resource_use_ratio) ->default_value(g_executor_resource_mgr_max_available_resource_use_ratio), "Set max proportion (0 < ratio <= 1.0) of available resources that should be " "granted to a query. 
Requires --executor-resource-mgr to be set"); - help_desc.add_options()("calcite-max-mem", - po::value(&system_parameters.calcite_max_mem) - ->default_value(system_parameters.calcite_max_mem), - "Max memory available to calcite JVM."); + desc.add_options()("calcite-max-mem", + po::value(&system_parameters.calcite_max_mem) + ->default_value(system_parameters.calcite_max_mem), + "Max memory available to calcite JVM."); if (!dist_v5_) { - help_desc.add_options()("calcite-port", - po::value(&system_parameters.calcite_port) - ->default_value(system_parameters.calcite_port), - "Calcite port number."); + desc.add_options()("calcite-port", + po::value(&system_parameters.calcite_port) + ->default_value(system_parameters.calcite_port), + "Calcite port number."); } - help_desc.add_options()("config", - po::value(&system_parameters.config_file), - "Path to server configuration file."); - help_desc.add_options()("cpu-buffer-mem-bytes", - po::value(&system_parameters.cpu_buffer_mem_bytes) - ->default_value(system_parameters.cpu_buffer_mem_bytes), - "Size of memory reserved for CPU buffers, in bytes."); - - help_desc.add_options()("cpu-only", - po::value(&system_parameters.cpu_only) - ->default_value(system_parameters.cpu_only) - ->implicit_value(true), - "Run on CPU only, even if GPUs are available."); - help_desc.add_options()("cuda-block-size", - po::value(&system_parameters.cuda_block_size) - ->default_value(system_parameters.cuda_block_size), - "Size of block to use on NVIDIA GPU."); - help_desc.add_options()("cuda-grid-size", - po::value(&system_parameters.cuda_grid_size) - ->default_value(system_parameters.cuda_grid_size), - "Size of grid to use on NVIDIA GPU."); - help_desc.add_options()("optimize-cuda-block-and-grid-sizes", - po::value(&optimize_cuda_block_and_grid_sizes) - ->default_value(false) - ->implicit_value(true)); + desc.add_options()("config", + po::value(&system_parameters.config_file), + "Path to server configuration file."); + 
desc.add_options()("cpu-buffer-mem-bytes", + po::value(&system_parameters.cpu_buffer_mem_bytes) + ->default_value(system_parameters.cpu_buffer_mem_bytes), + "Size of memory reserved for CPU buffers, in bytes."); + + desc.add_options()("cpu-only", + po::value(&system_parameters.cpu_only) + ->default_value(system_parameters.cpu_only) + ->implicit_value(true), + "Run on CPU only, even if GPUs are available."); + desc.add_options()("cuda-block-size", + po::value(&system_parameters.cuda_block_size) + ->default_value(system_parameters.cuda_block_size), + "Size of block to use on NVIDIA GPU."); + desc.add_options()("cuda-grid-size", + po::value(&system_parameters.cuda_grid_size) + ->default_value(system_parameters.cuda_grid_size), + "Size of grid to use on NVIDIA GPU."); + desc.add_options()("optimize-cuda-block-and-grid-sizes", + po::value(&optimize_cuda_block_and_grid_sizes) + ->default_value(false) + ->implicit_value(true)); if (!dist_v5_) { - help_desc.add_options()( + desc.add_options()( "data", po::value(&base_path)->required()->default_value("storage"), "Directory path to HeavyDB data storage (catalogs, raw data, log files, etc)."); positional_options.add("data", 1); } - help_desc.add_options()("db-query-list", - po::value(&db_query_file), - "Path to file containing HeavyDB warmup queries."); - help_desc.add_options()( + desc.add_options()("db-query-list", + po::value(&db_query_file), + "Path to file containing HeavyDB warmup queries."); + desc.add_options()( "exit-after-warmup", po::value(&exit_after_warmup)->default_value(false)->implicit_value(true), "Exit after HeavyDB warmup queries."); - help_desc.add_options()("dynamic-watchdog-time-limit", - po::value(&dynamic_watchdog_time_limit) - ->default_value(dynamic_watchdog_time_limit) - ->implicit_value(10000), - "Dynamic watchdog time limit, in milliseconds."); - help_desc.add_options()("enable-data-recycler", - po::value(&enable_data_recycler) - ->default_value(enable_data_recycler) - ->implicit_value(true), - 
"Use data recycler."); - help_desc.add_options()("use-hashtable-cache", - po::value(&use_hashtable_cache) - ->default_value(use_hashtable_cache) - ->implicit_value(true), - "Use hashtable cache."); - help_desc.add_options()("use-query-resultset-cache", - po::value(&g_use_query_resultset_cache) - ->default_value(g_use_query_resultset_cache) - ->implicit_value(true), - "Use query resultset cache."); - help_desc.add_options()("use-chunk-metadata-cache", - po::value(&g_use_chunk_metadata_cache) - ->default_value(g_use_chunk_metadata_cache) - ->implicit_value(true), - "Use chunk metadata cache."); - help_desc.add_options()( + desc.add_options()("dynamic-watchdog-time-limit", + po::value(&dynamic_watchdog_time_limit) + ->default_value(dynamic_watchdog_time_limit) + ->implicit_value(10000), + "Dynamic watchdog time limit, in milliseconds."); + desc.add_options()("enable-data-recycler", + po::value(&enable_data_recycler) + ->default_value(enable_data_recycler) + ->implicit_value(true), + "Use data recycler."); + desc.add_options()("use-hashtable-cache", + po::value(&use_hashtable_cache) + ->default_value(use_hashtable_cache) + ->implicit_value(true), + "Use hashtable cache."); + desc.add_options()("use-query-resultset-cache", + po::value(&g_use_query_resultset_cache) + ->default_value(g_use_query_resultset_cache) + ->implicit_value(true), + "Use query resultset cache."); + desc.add_options()("use-chunk-metadata-cache", + po::value(&g_use_chunk_metadata_cache) + ->default_value(g_use_chunk_metadata_cache) + ->implicit_value(true), + "Use chunk metadata cache."); + desc.add_options()( "hashtable-cache-total-bytes", po::value(&hashtable_cache_total_bytes) ->default_value(hashtable_cache_total_bytes) ->implicit_value(4294967296), "Size of total memory space for hashtable cache, in bytes (default: 4GB)."); - help_desc.add_options()("max-cacheable-hashtable-size-bytes", - po::value(&max_cacheable_hashtable_size_bytes) - ->default_value(max_cacheable_hashtable_size_bytes) - 
->implicit_value(2147483648), - "The maximum size of hashtable that is available to cache, in " - "bytes (default: 2GB)."); - help_desc.add_options()( + desc.add_options()("max-cacheable-hashtable-size-bytes", + po::value(&max_cacheable_hashtable_size_bytes) + ->default_value(max_cacheable_hashtable_size_bytes) + ->implicit_value(2147483648), + "The maximum size of hashtable that is available to cache, in " + "bytes (default: 2GB)."); + desc.add_options()( "query-resultset-cache-total-bytes", po::value(&g_query_resultset_cache_total_bytes) ->default_value(g_query_resultset_cache_total_bytes), "Size of total memory space for query resultset cache, in bytes (default: 4GB)."); - help_desc.add_options()( - "max-query-resultset-size-bytes", - po::value(&g_max_cacheable_query_resultset_size_bytes) - ->default_value(g_max_cacheable_query_resultset_size_bytes), - "The maximum size of query resultset that is available to cache, in " - "bytes (default: 2GB)."); - help_desc.add_options()("allow-auto-query-resultset-caching", - po::value(&g_allow_auto_resultset_caching) - ->default_value(g_allow_auto_resultset_caching) - ->implicit_value(true), - "Allow automatic query resultset caching when the size of " - "query resultset is smaller or equal to the threshold defined " - "by `auto-resultset-caching-threshold-bytes`, in bytes (to " - "enable this, query resultset recycler " - "should be enabled, default: 1048576 bytes (or 1MB))."); - help_desc.add_options()( + desc.add_options()("max-query-resultset-size-bytes", + po::value(&g_max_cacheable_query_resultset_size_bytes) + ->default_value(g_max_cacheable_query_resultset_size_bytes), + "The maximum size of query resultset that is available to cache, in " + "bytes (default: 2GB)."); + desc.add_options()("allow-auto-query-resultset-caching", + po::value(&g_allow_auto_resultset_caching) + ->default_value(g_allow_auto_resultset_caching) + ->implicit_value(true), + "Allow automatic query resultset caching when the size of " + "query 
resultset is smaller or equal to the threshold defined " + "by `auto-resultset-caching-threshold-bytes`, in bytes (to " + "enable this, query resultset recycler " + "should be enabled, default: 1048576 bytes (or 1MB))."); + desc.add_options()( "auto-resultset-caching-threshold-bytes", po::value(&g_auto_resultset_caching_threshold) ->default_value(g_auto_resultset_caching_threshold), "A threshold that allows caching query resultset automatically if the size of " "resultset is less than it, in bytes (default: 1MB)."); - help_desc.add_options()("allow-query-step-skipping", - po::value(&g_allow_query_step_skipping) - ->default_value(g_allow_query_step_skipping) - ->implicit_value(true), - "Allow query step skipping when multi-step query has at least " - "one cached query resultset."); - help_desc.add_options()("enable-debug-timer", - po::value(&g_enable_debug_timer) - ->default_value(g_enable_debug_timer) - ->implicit_value(true), - "Enable debug timer logging."); - help_desc.add_options()("enable-dynamic-watchdog", - po::value(&enable_dynamic_watchdog) - ->default_value(enable_dynamic_watchdog) - ->implicit_value(true), - "Enable dynamic watchdog."); - help_desc.add_options()("enable-filter-push-down", - po::value(&g_enable_filter_push_down) - ->default_value(g_enable_filter_push_down) - ->implicit_value(true), - "Enable filter push down through joins."); - help_desc.add_options()("enable-overlaps-hashjoin", - po::value(&g_enable_overlaps_hashjoin) - ->default_value(g_enable_overlaps_hashjoin) - ->implicit_value(true), - "Enable the overlaps hash join framework allowing for range " - "join (e.g. spatial overlaps) computation using a hash table."); - help_desc.add_options()("enable-hashjoin-many-to-many", - po::value(&g_enable_hashjoin_many_to_many) - ->default_value(g_enable_hashjoin_many_to_many) - ->implicit_value(true), - "Enable the overlaps hash join framework allowing for range " - "join (e.g. 
spatial overlaps) computation using a hash table."); - help_desc.add_options()("enable-distance-rangejoin", - po::value(&g_enable_distance_rangejoin) - ->default_value(g_enable_distance_rangejoin) - ->implicit_value(true), - "Enable accelerating point distance joins with a hash table. " - "This rewrites ST_Distance when using an upperbound (<= X)."); - help_desc.add_options()("enable-runtime-query-interrupt", - po::value(&enable_runtime_query_interrupt) - ->default_value(enable_runtime_query_interrupt) - ->implicit_value(true), - "Enable runtime query interrupt."); - help_desc.add_options()("enable-non-kernel-time-query-interrupt", - po::value(&enable_non_kernel_time_query_interrupt) - ->default_value(enable_non_kernel_time_query_interrupt) - ->implicit_value(true), - "Enable non-kernel time query interrupt."); - help_desc.add_options()("pending-query-interrupt-freq", - po::value(&pending_query_interrupt_freq) - ->default_value(pending_query_interrupt_freq) - ->implicit_value(1000), - "A frequency of checking the request of pending query " - "interrupt from user (in millisecond)."); - help_desc.add_options()( - "running-query-interrupt-freq", - po::value(&running_query_interrupt_freq) - ->default_value(running_query_interrupt_freq) - ->implicit_value(0.5), - "A frequency of checking the request of running query " - "interrupt from user (0.0 (less frequent) ~ (more frequent) 1.0)."); - help_desc.add_options()("use-estimator-result-cache", - po::value(&use_estimator_result_cache) - ->default_value(use_estimator_result_cache) - ->implicit_value(true), - "Use estimator result cache."); + + desc.add_options()("allow-query-step-skipping", + po::value(&g_allow_query_step_skipping) + ->default_value(g_allow_query_step_skipping) + ->implicit_value(true), + "Allow query step skipping when multi-step query has at least " + "one cached query resultset."); + desc.add_options()("enable-debug-timer", + po::value(&g_enable_debug_timer) + ->default_value(g_enable_debug_timer) + 
->implicit_value(true), + "Enable debug timer logging."); + desc.add_options()("enable-dynamic-watchdog", + po::value(&enable_dynamic_watchdog) + ->default_value(enable_dynamic_watchdog) + ->implicit_value(true), + "Enable dynamic watchdog."); + desc.add_options()("enable-filter-push-down", + po::value(&g_enable_filter_push_down) + ->default_value(g_enable_filter_push_down) + ->implicit_value(true), + "Enable filter push down through joins."); + desc.add_options()("enable-overlaps-hashjoin", + po::value(&g_enable_overlaps_hashjoin) + ->default_value(g_enable_overlaps_hashjoin) + ->implicit_value(true), + "Enable the overlaps hash join framework allowing for range " + "join (e.g. spatial overlaps) computation using a hash table."); + desc.add_options()("enable-hashjoin-many-to-many", + po::value(&g_enable_hashjoin_many_to_many) + ->default_value(g_enable_hashjoin_many_to_many) + ->implicit_value(true), + "Enable the overlaps hash join framework allowing for range " + "join (e.g. spatial overlaps) computation using a hash table."); + desc.add_options()("enable-distance-rangejoin", + po::value(&g_enable_distance_rangejoin) + ->default_value(g_enable_distance_rangejoin) + ->implicit_value(true), + "Enable accelerating point distance joins with a hash table. 
" + "This rewrites ST_Distance when using an upperbound (<= X)."); + desc.add_options()("enable-runtime-query-interrupt", + po::value(&enable_runtime_query_interrupt) + ->default_value(enable_runtime_query_interrupt) + ->implicit_value(true), + "Enable runtime query interrupt."); + desc.add_options()("enable-non-kernel-time-query-interrupt", + po::value(&enable_non_kernel_time_query_interrupt) + ->default_value(enable_non_kernel_time_query_interrupt) + ->implicit_value(true), + "Enable non-kernel time query interrupt."); + desc.add_options()("pending-query-interrupt-freq", + po::value(&pending_query_interrupt_freq) + ->default_value(pending_query_interrupt_freq) + ->implicit_value(1000), + "A frequency of checking the request of pending query " + "interrupt from user (in millisecond)."); + desc.add_options()("running-query-interrupt-freq", + po::value(&running_query_interrupt_freq) + ->default_value(running_query_interrupt_freq) + ->implicit_value(0.5), + "A frequency of checking the request of running query " + "interrupt from user (0.0 (less frequent) ~ (more frequent) 1.0)."); + desc.add_options()("use-estimator-result-cache", + po::value(&use_estimator_result_cache) + ->default_value(use_estimator_result_cache) + ->implicit_value(true), + "Use estimator result cache."); + if (!dist_v5_) { - help_desc.add_options()( + desc.add_options()( "enable-string-dict-hash-cache", po::value(&g_cache_string_hash) ->default_value(g_cache_string_hash) ->implicit_value(true), "Cache string hash values in the string dictionary server during import."); } - help_desc.add_options()( - "enable-thrift-logs", - po::value(&g_enable_thrift_logs) - ->default_value(g_enable_thrift_logs) - ->implicit_value(true), - "Enable writing messages directly from thrift to stdout/stderr."); - help_desc.add_options()("enable-watchdog", - po::value(&enable_watchdog) - ->default_value(enable_watchdog) - ->implicit_value(true), - "Enable watchdog."); - help_desc.add_options()( - 
"watchdog-none-encoded-string-translation-limit", - po::value(&watchdog_none_encoded_string_translation_limit) - ->default_value(watchdog_none_encoded_string_translation_limit), - "Max number of none-encoded strings allowed to be translated " - "to dictionary-encoded with watchdog enabled"); - help_desc.add_options()( - "filter-push-down-low-frac", - po::value(&g_filter_push_down_low_frac) - ->default_value(g_filter_push_down_low_frac) - ->implicit_value(g_filter_push_down_low_frac), - "Lower threshold for selectivity of filters that are pushed down."); - help_desc.add_options()( - "filter-push-down-high-frac", - po::value(&g_filter_push_down_high_frac) - ->default_value(g_filter_push_down_high_frac) - ->implicit_value(g_filter_push_down_high_frac), - "Higher threshold for selectivity of filters that are pushed down."); - help_desc.add_options()("filter-push-down-passing-row-ubound", - po::value(&g_filter_push_down_passing_row_ubound) - ->default_value(g_filter_push_down_passing_row_ubound) - ->implicit_value(g_filter_push_down_passing_row_ubound), - "Upperbound on the number of rows that should pass the filter " - "if the selectivity is less than " - "the high fraction threshold."); - help_desc.add_options()("from-table-reordering", - po::value(&g_from_table_reordering) - ->default_value(g_from_table_reordering) - ->implicit_value(true), - "Enable automatic table reordering in FROM clause."); - help_desc.add_options()("gpu-buffer-mem-bytes", - po::value(&system_parameters.gpu_buffer_mem_bytes) - ->default_value(system_parameters.gpu_buffer_mem_bytes), - "Size of memory reserved for GPU buffers, in bytes, per GPU."); - help_desc.add_options()("gpu-input-mem-limit", - po::value(&system_parameters.gpu_input_mem_limit) - ->default_value(system_parameters.gpu_input_mem_limit), - "Force query to CPU when input data memory usage exceeds this " - "percentage of available GPU memory."); - help_desc.add_options()( + desc.add_options()("enable-thrift-logs", + 
po::value(&g_enable_thrift_logs) + ->default_value(g_enable_thrift_logs) + ->implicit_value(true), + "Enable writing messages directly from thrift to stdout/stderr."); + desc.add_options()("enable-watchdog", + po::value(&enable_watchdog) + ->default_value(enable_watchdog) + ->implicit_value(true), + "Enable watchdog."); + desc.add_options()("watchdog-none-encoded-string-translation-limit", + po::value(&watchdog_none_encoded_string_translation_limit) + ->default_value(watchdog_none_encoded_string_translation_limit), + "Max number of none-encoded strings allowed to be translated " + "to dictionary-encoded with watchdog enabled"); + desc.add_options()("filter-push-down-low-frac", + po::value(&g_filter_push_down_low_frac) + ->default_value(g_filter_push_down_low_frac) + ->implicit_value(g_filter_push_down_low_frac), + "Lower threshold for selectivity of filters that are pushed down."); + desc.add_options()("filter-push-down-high-frac", + po::value(&g_filter_push_down_high_frac) + ->default_value(g_filter_push_down_high_frac) + ->implicit_value(g_filter_push_down_high_frac), + "Higher threshold for selectivity of filters that are pushed down."); + desc.add_options()("filter-push-down-passing-row-ubound", + po::value(&g_filter_push_down_passing_row_ubound) + ->default_value(g_filter_push_down_passing_row_ubound) + ->implicit_value(g_filter_push_down_passing_row_ubound), + "Upperbound on the number of rows that should pass the filter " + "if the selectivity is less than " + "the high fraction threshold."); + desc.add_options()("from-table-reordering", + po::value(&g_from_table_reordering) + ->default_value(g_from_table_reordering) + ->implicit_value(true), + "Enable automatic table reordering in FROM clause."); + desc.add_options()("gpu-buffer-mem-bytes", + po::value(&system_parameters.gpu_buffer_mem_bytes) + ->default_value(system_parameters.gpu_buffer_mem_bytes), + "Size of memory reserved for GPU buffers, in bytes, per GPU."); + 
desc.add_options()("gpu-input-mem-limit", + po::value(&system_parameters.gpu_input_mem_limit) + ->default_value(system_parameters.gpu_input_mem_limit), + "Force query to CPU when input data memory usage exceeds this " + "percentage of available GPU memory."); + desc.add_options()( "hll-precision-bits", po::value(&g_hll_precision_bits) ->default_value(g_hll_precision_bits) ->implicit_value(g_hll_precision_bits), "Number of bits used from the hash value used to specify the bucket number."); if (!dist_v5_) { - help_desc.add_options()("http-port", - po::value(&http_port)->default_value(http_port), - "HTTP port number."); - help_desc.add_options()( - "http-binary-port", - po::value(&http_binary_port)->default_value(http_binary_port), - "HTTP binary port number."); + desc.add_options()("http-port", + po::value(&http_port)->default_value(http_port), + "HTTP port number."); + desc.add_options()("http-binary-port", + po::value(&http_binary_port)->default_value(http_binary_port), + "HTTP binary port number."); } - help_desc.add_options()( + desc.add_options()( "idle-session-duration", po::value(&idle_session_duration)->default_value(idle_session_duration), "Maximum duration of idle session."); - help_desc.add_options()("inner-join-fragment-skipping", - po::value(&g_inner_join_fragment_skipping) - ->default_value(g_inner_join_fragment_skipping) - ->implicit_value(true), - "Enable/disable inner join fragment skipping. This feature is " - "considered stable and is enabled by default. This " - "parameter will be removed in a future release."); - help_desc.add_options()( + desc.add_options()("inner-join-fragment-skipping", + po::value(&g_inner_join_fragment_skipping) + ->default_value(g_inner_join_fragment_skipping) + ->implicit_value(true), + "Enable/disable inner join fragment skipping. This feature is " + "considered stable and is enabled by default. 
This " + "parameter will be removed in a future release."); + desc.add_options()( "max-session-duration", po::value(&max_session_duration)->default_value(max_session_duration), "Maximum duration of active session."); - help_desc.add_options()("num-sessions", - po::value(&system_parameters.num_sessions) - ->default_value(system_parameters.num_sessions), - "Maximum number of active session."); - help_desc.add_options()( - "null-div-by-zero", - po::value(&g_null_div_by_zero) - ->default_value(g_null_div_by_zero) - ->implicit_value(true), - "Return null on division by zero instead of throwing an exception."); - help_desc.add_options()( + desc.add_options()("num-sessions", + po::value(&system_parameters.num_sessions) + ->default_value(system_parameters.num_sessions), + "Maximum number of active session."); + desc.add_options()("null-div-by-zero", + po::value(&g_null_div_by_zero) + ->default_value(g_null_div_by_zero) + ->implicit_value(true), + "Return null on division by zero instead of throwing an exception."); + desc.add_options()( "num-reader-threads", po::value(&num_reader_threads)->default_value(num_reader_threads), "Number of reader threads to use."); - help_desc.add_options()( + desc.add_options()( "max-import-threads", po::value(&g_max_import_threads)->default_value(g_max_import_threads), "Max number of default import threads to use (num hardware threads will be used " "instead if lower). 
Can be overriden with copy statement threads option)."); - help_desc.add_options()( + desc.add_options()( "overlaps-max-table-size-bytes", po::value(&g_overlaps_max_table_size_bytes) ->default_value(g_overlaps_max_table_size_bytes), "The maximum size in bytes of the hash table for an overlaps hash join."); - help_desc.add_options()("overlaps-target-entries-per-bin", - po::value(&g_overlaps_target_entries_per_bin) - ->default_value(g_overlaps_target_entries_per_bin), - "The target number of hash entries per bin for overlaps join"); + desc.add_options()("overlaps-target-entries-per-bin", + po::value(&g_overlaps_target_entries_per_bin) + ->default_value(g_overlaps_target_entries_per_bin), + "The target number of hash entries per bin for overlaps join"); if (!dist_v5_) { - help_desc.add_options()("port,p", - po::value(&system_parameters.omnisci_server_port) - ->default_value(system_parameters.omnisci_server_port), - "TCP Port number."); + desc.add_options()("port,p", + po::value(&system_parameters.omnisci_server_port) + ->default_value(system_parameters.omnisci_server_port), + "TCP Port number."); } - help_desc.add_options()("num-gpus", - po::value(&system_parameters.num_gpus) - ->default_value(system_parameters.num_gpus), - "Number of gpus to use."); - help_desc.add_options()( + desc.add_options()("num-gpus", + po::value(&system_parameters.num_gpus) + ->default_value(system_parameters.num_gpus), + "Number of gpus to use."); + desc.add_options()( "read-only", po::value(&read_only)->default_value(read_only)->implicit_value(true), "Enable read-only mode."); - help_desc.add_options()( + desc.add_options()( "res-gpu-mem", po::value(&reserved_gpu_mem)->default_value(reserved_gpu_mem), "Reduces GPU memory available to the HeavyDB allocator by this amount. 
Used for " "compiled code cache and ancillary GPU functions and other processes that may also " "be using the GPU concurrent with HeavyDB."); - help_desc.add_options()("start-gpu", - po::value(&system_parameters.start_gpu) - ->default_value(system_parameters.start_gpu), - "First gpu to use."); - help_desc.add_options()("trivial-loop-join-threshold", - po::value(&g_trivial_loop_join_threshold) - ->default_value(g_trivial_loop_join_threshold) - ->implicit_value(1000), - "The maximum number of rows in the inner table of a loop join " - "considered to be trivially small."); - help_desc.add_options()( + desc.add_options()("start-gpu", + po::value(&system_parameters.start_gpu) + ->default_value(system_parameters.start_gpu), + "First gpu to use."); + desc.add_options()("trivial-loop-join-threshold", + po::value(&g_trivial_loop_join_threshold) + ->default_value(g_trivial_loop_join_threshold) + ->implicit_value(1000), + "The maximum number of rows in the inner table of a loop join " + "considered to be trivially small."); + desc.add_options()( "uniform-request-ids-per-thrift-call", po::value(&g_uniform_request_ids_per_thrift_call) ->default_value(g_uniform_request_ids_per_thrift_call) @@ -557,19 +552,19 @@ void CommandLineOptions::fillOptions() { "If true (default) then assign the same request_id to thrift calls that were " "initiated by the same external thrift call. 
If false then assign different " "request_ids and log the parent/child relationships."); - help_desc.add_options()("verbose", - po::value(&verbose_logging) - ->default_value(verbose_logging) - ->implicit_value(true), - "Write additional debug log messages to server logs."); - help_desc.add_options()( + desc.add_options()("verbose", + po::value(&verbose_logging) + ->default_value(verbose_logging) + ->implicit_value(true), + "Write additional debug log messages to server logs."); + desc.add_options()( "enable-runtime-udf", po::value(&enable_runtime_udf) ->default_value(enable_runtime_udf) ->implicit_value(true), "DEPRECATED. Please use `enable-runtime-udfs` instead as this flag will be removed " "in the near future."); - help_desc.add_options()( + desc.add_options()( "enable-runtime-udfs", po::value(&enable_runtime_udfs) ->default_value(enable_runtime_udfs) @@ -577,70 +572,70 @@ void CommandLineOptions::fillOptions() { "Enable runtime UDF registration by passing signatures and corresponding LLVM IR " "to the `register_runtime_udf` endpoint. For use with the Python Remote Backend " "Compiler server, packaged separately."); - help_desc.add_options()("enable-udf-registration-for-all-users", + desc.add_options()("enable-udf-registration-for-all-users", po::value(&enable_udf_registration_for_all_users) ->default_value(enable_udf_registration_for_all_users) ->implicit_value(true), "Allow all users, not just superusers, to register runtime " "UDFs/UDTFs. 
Option only valid if " "`--enable-runtime-udfs` is set to true."); - help_desc.add_options()("version,v", "Print Version Number."); - help_desc.add_options()("enable-string-functions", + desc.add_options()("version,v", "Print Version Number."); + desc.add_options()("enable-string-functions", po::value(&g_enable_string_functions) ->default_value(g_enable_string_functions) ->implicit_value(true), "Enable experimental string functions."); - help_desc.add_options()("enable-experimental-string-functions", + desc.add_options()("enable-experimental-string-functions", po::value(&g_enable_string_functions) ->default_value(g_enable_string_functions) ->implicit_value(true), "DEPRECATED. String functions are now enabled by default, " "but can still be controlled with --enable-string-functions."); - help_desc.add_options()( + desc.add_options()( "enable-fsi", po::value(&g_enable_fsi)->default_value(g_enable_fsi)->implicit_value(true), "Enable foreign storage interface."); - help_desc.add_options()("enable-legacy-delimited-import", + desc.add_options()("enable-legacy-delimited-import", po::value(&g_enable_legacy_delimited_import) ->default_value(g_enable_legacy_delimited_import) ->implicit_value(true), "Use legacy importer for delimited sources."); #ifdef ENABLE_IMPORT_PARQUET - help_desc.add_options()("enable-legacy-parquet-import", + desc.add_options()("enable-legacy-parquet-import", po::value(&g_enable_legacy_parquet_import) ->default_value(g_enable_legacy_parquet_import) ->implicit_value(true), "Use legacy importer for parquet sources."); #endif - help_desc.add_options()("enable-fsi-regex-import", + desc.add_options()("enable-fsi-regex-import", po::value(&g_enable_fsi_regex_import) ->default_value(g_enable_fsi_regex_import) ->implicit_value(true), "Use FSI importer for regex parsed sources."); - help_desc.add_options()("enable-add-metadata-columns", + desc.add_options()("enable-add-metadata-columns", po::value(&g_enable_add_metadata_columns) 
->default_value(g_enable_add_metadata_columns) ->implicit_value(true), "Enable add_metadata_columns COPY FROM WITH option (Beta)."); - help_desc.add_options()("disk-cache-path", + desc.add_options()("disk-cache-path", po::value(&disk_cache_config.path), "Specify the path for the disk cache."); - help_desc.add_options()( + desc.add_options()( "disk-cache-level", po::value(&(disk_cache_level))->default_value("foreign_tables"), "Specify level of disk cache. Valid options are 'foreign_tables', " "'local_tables', 'none', and 'all'."); - help_desc.add_options()("disk-cache-size", + desc.add_options()("disk-cache-size", po::value(&(disk_cache_config.size_limit)), "Specify a maximum size for the disk cache in bytes."); #ifdef HAVE_AWS_S3 - help_desc.add_options()( + desc.add_options()( "allow-s3-server-privileges", po::value(&g_allow_s3_server_privileges) ->default_value(g_allow_s3_server_privileges) @@ -651,86 +646,86 @@ void CommandLineOptions::fillOptions() { "an AWS credentials file, or when running on an EC2 instance, with an IAM role " "that is attached to the instance."); #endif // defined(HAVE_AWS_S3) - help_desc.add_options()( + desc.add_options()( "enable-interoperability", po::value(&g_enable_interop) ->default_value(g_enable_interop) ->implicit_value(true), "Enable offloading of query portions to an external execution engine."); - help_desc.add_options()("enable-union", + desc.add_options()("enable-union", po::value(&g_enable_union) ->default_value(g_enable_union) ->implicit_value(true), "DEPRECATED. UNION ALL is enabled by default. Please remove " "use of this option, as it may be disabled in the future."); - help_desc.add_options()( + desc.add_options()( "calcite-service-timeout", po::value(&system_parameters.calcite_timeout) ->default_value(system_parameters.calcite_timeout), "Calcite server timeout (milliseconds). 
Increase this on systems with frequent " "schema changes or when running large numbers of parallel queries."); - help_desc.add_options()("calcite-service-keepalive", + desc.add_options()("calcite-service-keepalive", po::value(&system_parameters.calcite_keepalive) ->default_value(system_parameters.calcite_keepalive) ->implicit_value(true), "Enable keepalive on Calcite connections."); - help_desc.add_options()( + desc.add_options()( "stringdict-parallelizm", po::value(&g_enable_stringdict_parallel) ->default_value(g_enable_stringdict_parallel) ->implicit_value(true), "Allow StringDictionary to parallelize loads using multiple threads"); - help_desc.add_options()( + desc.add_options()( "log-user-id", po::value(&Catalog_Namespace::g_log_user_id) ->default_value(Catalog_Namespace::g_log_user_id) ->implicit_value(true), "Log userId integer in place of the userName (when available)."); - help_desc.add_options()("log-user-origin", + desc.add_options()("log-user-origin", po::value(&log_user_origin) ->default_value(log_user_origin) ->implicit_value(true), "Lookup the origin of inbound connections by IP address/DNS " "name, and print this information as part of stdlog."); - help_desc.add_options()( + desc.add_options()( "allowed-import-paths", po::value(&allowed_import_paths), "List of allowed root paths that can be used in import operations."); - help_desc.add_options()( + desc.add_options()( "allowed-export-paths", po::value(&allowed_export_paths), "List of allowed root paths that can be used in export operations."); - help_desc.add_options()("enable-system-tables", + desc.add_options()("enable-system-tables", po::value(&g_enable_system_tables) ->default_value(g_enable_system_tables) ->implicit_value(true), "Enable use of system tables."); - help_desc.add_options()("enable-table-functions", + desc.add_options()("enable-table-functions", po::value(&g_enable_table_functions) ->default_value(g_enable_table_functions) ->implicit_value(true), "Enable system table functions 
support."); - help_desc.add_options()("enable-logs-system-tables", + desc.add_options()("enable-logs-system-tables", po::value(&g_enable_logs_system_tables) ->default_value(g_enable_logs_system_tables) ->implicit_value(true), "Enable use of logs system tables."); - help_desc.add_options()( + desc.add_options()( "logs-system-tables-max-files-count", po::value(&g_logs_system_tables_max_files_count) ->default_value(g_logs_system_tables_max_files_count), "Maximum number of log files that will be processed by each logs system table."); #ifdef ENABLE_MEMKIND - help_desc.add_options()("enable-tiered-cpu-mem", + desc.add_options()("enable-tiered-cpu-mem", po::value(&g_enable_tiered_cpu_mem) ->default_value(g_enable_tiered_cpu_mem) ->implicit_value(true), "Enable additional tiers of CPU memory (PMEM, etc...)"); - help_desc.add_options()("pmem-size", po::value(&g_pmem_size)->default_value(0)); - help_desc.add_options()("pmem-path", po::value(&g_pmem_path)); + desc.add_options()("pmem-size", po::value(&g_pmem_size)->default_value(0)); + desc.add_options()("pmem-path", po::value(&g_pmem_path)); #endif - help_desc.add(log_options_.get_options()); + desc.add(log_options_.get_options()); } void CommandLineOptions::fillAdvancedOptions() { @@ -796,7 +791,7 @@ void CommandLineOptions::fillAdvancedOptions() { "We switch hash table layout when this condition and the condition related to " "\'col-range-to-num-hash-entries-threshold-switch-to-baseline\' are satisfied " "together."); - developer_desc.add_options()( + desc.add_options()( "ratio-num-hash-entry-to-num-tuple-switch-to-baseline", po::value(&g_ratio_num_hash_entry_to_num_tuple_switch_to_baseline) ->default_value(g_ratio_num_hash_entry_to_num_tuple_switch_to_baseline) @@ -928,12 +923,11 @@ void CommandLineOptions::fillAdvancedOptions() { "there is not enough free memory to accomodate the target slab size, smaller " "slabs will be allocated, down to the minimum size specified by " "min-cpu-slab-size."); - 
developer_desc.add_options()( - "min-gpu-slab-size", - po::value(&system_parameters.min_gpu_slab_size) - ->default_value(system_parameters.min_gpu_slab_size), - "Min slab size (size of memory allocations) for GPU buffer pools."); - developer_desc.add_options()( + desc.add_options()("min-gpu-slab-size", + po::value(&system_parameters.min_gpu_slab_size) + ->default_value(system_parameters.min_gpu_slab_size), + "Min slab size (size of memory allocations) for GPU buffer pools."); + desc.add_options()( "max-gpu-slab-size", po::value(&system_parameters.max_gpu_slab_size) ->default_value(system_parameters.max_gpu_slab_size), @@ -942,7 +936,7 @@ void CommandLineOptions::fillAdvancedOptions() { "slabs will be allocated, down to the minimum size speified by " "min-gpu-slab-size."); - developer_desc.add_options()( + desc.add_options()( "max-output-projection-allocation-bytes", po::value(&g_max_memory_allocation_size) ->default_value(g_max_memory_allocation_size), @@ -950,7 +944,7 @@ void CommandLineOptions::fillAdvancedOptions() { "queries with no pre-flight count. Default is the maximum slab size (sizes " "greater " "than the maximum slab size have no affect). Requires bump allocator."); - developer_desc.add_options()( + desc.add_options()( "min-output-projection-allocation-bytes", po::value(&g_min_memory_allocation_size) ->default_value(g_min_memory_allocation_size), @@ -1143,25 +1137,25 @@ void CommandLineOptions::fillAdvancedOptions() { "Threshold (in minimum number of rows) to prefer columnar output for projections. 
" "Requires --columnar-large-projections to be set."); - help_desc.add_options()( + desc.add_options()( "allow-query-step-cpu-retry", po::value(&g_allow_query_step_cpu_retry) ->default_value(g_allow_query_step_cpu_retry) ->implicit_value(true), R"(Allow certain query steps to retry on CPU, even when allow-cpu-retry is disabled)"); - help_desc.add_options()("enable-http-binary-server", + desc.add_options()("enable-http-binary-server", po::value(&g_enable_http_binary_server) ->default_value(g_enable_http_binary_server) ->implicit_value(true), "Enable binary over HTTP Thrift server"); - help_desc.add_options()("enable-query-engine-cuda-streams", + desc.add_options()("enable-query-engine-cuda-streams", po::value(&g_query_engine_cuda_streams) ->default_value(g_query_engine_cuda_streams) ->implicit_value(true), "Enable Query Engine CUDA streams"); - help_desc.add_options()( + desc.add_options()( "allow-invalid-literal-buffer-reads", po::value(&g_allow_invalid_literal_buffer_reads) ->default_value(g_allow_invalid_literal_buffer_reads) @@ -1607,7 +1601,7 @@ boost::optional CommandLineOptions::parse_command_line( char const* const* argv, const bool should_init_logging) { po::options_description all_desc("All options"); - all_desc.add(help_desc).add(developer_desc); + all_desc.add(help_desc_).add(developer_desc_); try { po::store(po::command_line_parser(argc, argv) @@ -1622,7 +1616,7 @@ boost::optional CommandLineOptions::parse_command_line( "[--http-port ] [--flush-log] [--version|-v]" << std::endl << std::endl; - std::cout << help_desc << std::endl; + std::cout << help_desc_ << std::endl; return 0; } if (vm.count("dev-options")) { @@ -1630,7 +1624,7 @@ boost::optional CommandLineOptions::parse_command_line( "[--http-port ] [--flush-log] [--version|-v]" << std::endl << std::endl; - std::cout << developer_desc << std::endl; + std::cout << developer_desc_ << std::endl; return 0; } if (vm.count("version")) { diff --git a/ThriftHandler/CommandLineOptions.h 
b/ThriftHandler/CommandLineOptions.h index 1b145d1e9e..8203ca572b 100644 --- a/ThriftHandler/CommandLineOptions.h +++ b/ThriftHandler/CommandLineOptions.h @@ -42,7 +42,7 @@ class CommandLineOptions { CommandLineOptions(char const* argv0, bool dist_v5_ = false) : log_options_(argv0), exe_name(argv0), dist_v5_(dist_v5_) { fillOptions(); - fillAdvancedOptions(); + fillDeveloperOptions(); } int http_port = 6278; int http_binary_port = 6276; @@ -126,12 +126,12 @@ class CommandLineOptions { bool enable_drop_render_group_columns_migration = false; void fillOptions(); - void fillAdvancedOptions(); + void fillDeveloperOptions(); std::string compressor = std::string(BLOSC_LZ4HC_COMPNAME); - po::options_description help_desc; - po::options_description developer_desc; + po::options_description help_desc_; + po::options_description developer_desc_; logger::LogOptions log_options_; std::string exe_name; po::positional_options_description positional_options; diff --git a/java/calcite/src/main/java/com/mapd/parser/hint/HeavyDBHintStrategyTable.java b/java/calcite/src/main/java/com/mapd/parser/hint/HeavyDBHintStrategyTable.java index 8df436b5a0..ec7b027432 100644 --- a/java/calcite/src/main/java/com/mapd/parser/hint/HeavyDBHintStrategyTable.java +++ b/java/calcite/src/main/java/com/mapd/parser/hint/HeavyDBHintStrategyTable.java @@ -37,6 +37,8 @@ static HintStrategyTable createHintStrategies(HintStrategyTable.Builder builder) supportedHints.add("disable_loop_join"); supportedHints.add("loop_join_inner_table_max_num_rows"); supportedHints.add("max_join_hashtable_size"); + supportedHints.add("force_baseline_hash_join"); + supportedHints.add("force_one_to_many_hash_join"); for (String hint_name : supportedHints) { // add local / global hints, e.., cpu_mode / g_cpu_mode