diff --git a/pkg/sql/colexec/external/hive_partition.go b/pkg/sql/colexec/external/hive_partition.go new file mode 100644 index 0000000000000..74d4109f31577 --- /dev/null +++ b/pkg/sql/colexec/external/hive_partition.go @@ -0,0 +1,829 @@ +// Copyright 2024 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package external + +import ( + "context" + "fmt" + "iter" + "path" + "strconv" + "strings" + + "github.com/matrixorigin/matrixone/pkg/catalog" + "github.com/matrixorigin/matrixone/pkg/common/moerr" + "github.com/matrixorigin/matrixone/pkg/container/types" + "github.com/matrixorigin/matrixone/pkg/container/vector" + "github.com/matrixorigin/matrixone/pkg/fileservice" + "github.com/matrixorigin/matrixone/pkg/logutil" + "github.com/matrixorigin/matrixone/pkg/pb/plan" + "github.com/matrixorigin/matrixone/pkg/sql/parsers/tree" + plan2 "github.com/matrixorigin/matrixone/pkg/sql/plan" + "github.com/matrixorigin/matrixone/pkg/sql/plan/function" +) + +const ( + HiveDefaultPartition = "__HIVE_DEFAULT_PARTITION__" + + maxPartitionCount = 50000 + // warnPartitionCount must be < maxListCalls-1 to be reachable. + // For single-level: N partitions = N+1 list calls. + // Requirements specify warn at 10000, but that's unreachable with maxListCalls=10000. + // Use 5000 as practical threshold for P0. + warnPartitionCount = 5000 + maxListCalls = 10000 +) + +// PartitionFileEntry represents a discovered file within a Hive partition structure. +type PartitionFileEntry struct { + FilePath string + FileSize int64 +} + +// PartitionPredicate represents a filter hint for partition pruning. +type PartitionPredicate struct { + ColName string + Op PartitionOp + Values []string +} + +type PartitionOp int + +const ( + PartOpEq PartitionOp = iota + PartOpIn +) + +// PartitionDiscoveryResult holds the outcome of Hive partition discovery. +type PartitionDiscoveryResult struct { + Files []PartitionFileEntry + PartitionCount int + PrunedCount int + ListCalls int + warnEmitted bool +} + +// HivePartSegment is the parsed result of a single Hive partition directory segment. +type HivePartSegment struct { + Key string + Value string +} + +// ListDirFunc abstracts directory listing for testability and S3/local duality. +type ListDirFunc func(ctx context.Context, prefix string) iter.Seq2[*fileservice.DirEntry, error] + +// NewListDirFunc creates a ListDirFunc backed by GetForETLWithType. +// TODO: For S3 this re-creates an S3FS instance per List call; pre-build the +// FS once and reuse across recursive calls for better performance. +func NewListDirFunc(param *tree.ExternParam) ListDirFunc { + return func(ctx context.Context, prefix string) iter.Seq2[*fileservice.DirEntry, error] { + fs, readPath, err := plan2.GetForETLWithType(param, prefix) + if err != nil { + return func(yield func(*fileservice.DirEntry, error) bool) { + yield(nil, err) + } + } + return fs.List(ctx, readPath) + } +} + +// normalizeExternalPath ensures consistent path format for prefix matching. 
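+//
+// A couple of illustrative inputs (not exhaustive):
+//
+//	normalizeExternalPath(" data/hive ")     => "/data/hive"
+//	normalizeExternalPath("etl:/data//hive") => "etl:/data/hive"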
+func normalizeExternalPath(p string) string { + p = strings.TrimSpace(p) + if strings.HasPrefix(p, "etl:") { + return path.Clean(p) + } + return path.Clean("/" + p) +} + +// relPartitionPath returns filePath relative to basePath. If filePath is not +// under basePath (degenerate), the normalized filePath is returned unchanged. +// Used for error messages: raw filePath contains machine-local absolute paths +// (e.g. /Users/foo/.../data.parquet) which make BVT .result files non-portable; +// the relative form ("year=abc/data.parquet") is stable across machines. +func relPartitionPath(filePath, basePath string) string { + f := normalizeExternalPath(filePath) + b := normalizeExternalPath(basePath) + if f == b { + return "" + } + if strings.HasPrefix(f, b+"/") { + return f[len(b)+1:] + } + return f +} + +// ParseHivePartitionSegment parses a directory segment like "year=2024" into key/value. +// MatrixOne intentionally treats Hive partition segment values as raw path +// segment text. DiscoverHivePartitions rejects '%' before this parser is called, +// so URL-encoded partition directory names are unsupported instead of being +// partially decoded in some call paths. +// +// Returns: +// - (seg, true, nil): valid key=value segment (value may be empty string) +// - (_, false, nil): not a key=value format (caller treats as non-partition dir) +func ParseHivePartitionSegment(segment string) (seg HivePartSegment, isHive bool, err error) { + idx := strings.IndexByte(segment, '=') + if idx <= 0 { + return HivePartSegment{}, false, nil + } + seg.Key = segment[:idx] + seg.Value = segment[idx+1:] + return seg, true, nil +} + +// ExtractPartitionValues parses partition key=value segments from a file path +// relative to basePath. Both paths are normalized internally. +func ExtractPartitionValues(filePath, basePath string, partCols []string) (map[string]string, error) { + filePath = normalizeExternalPath(filePath) + basePath = normalizeExternalPath(basePath) + + if filePath != basePath && !strings.HasPrefix(filePath, basePath+"/") { + return nil, moerr.NewInternalErrorNoCtxf( + "file path '%s' is not under base path '%s'", filePath, basePath) + } + + rel := strings.TrimPrefix(filePath, basePath) + rel = strings.TrimPrefix(rel, "/") + segments := strings.Split(rel, "/") + + values := make(map[string]string, len(partCols)) + for _, segment := range segments { + if segment == "" { + continue + } + seg, isHive, err := ParseHivePartitionSegment(segment) + if err != nil { + return nil, err + } + if !isHive { + continue + } + values[strings.ToLower(seg.Key)] = seg.Value + } + return values, nil +} + +// IsHiddenFile returns true for files/dirs starting with '.' or '_'. +func IsHiddenFile(name string) bool { + return len(name) > 0 && (name[0] == '.' || name[0] == '_') +} + +// IsParquetFile returns true for files with .parquet or .snappy.parquet suffix. +func IsParquetFile(name string) bool { + lower := strings.ToLower(name) + return strings.HasSuffix(lower, ".parquet") +} + +// DiscoverHivePartitions performs recursive list-and-filter partition discovery. 
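+//
+// As a sketch of the traversal (layout hypothetical):
+//
+//	/base/year=2024/month=01/part-0.parquet
+//	/base/year=2024/month=02/part-0.parquet
+//
+// With partCols = ["year", "month"] and a predicate year=2024, discovery
+// lists /base, keeps the matching year=2024 directory, recurses one level
+// per partition column, and collects parquet leaves at the last level.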
+func DiscoverHivePartitions(
+	ctx context.Context,
+	listDir ListDirFunc,
+	basePath string,
+	partCols []string,
+	colTypes []tree.HivePartColType,
+	predicates []PartitionPredicate,
+) (*PartitionDiscoveryResult, error) {
+	basePath = normalizeExternalPath(basePath)
+
+	if len(colTypes) != len(partCols) {
+		colTypes = make([]tree.HivePartColType, len(partCols))
+		for i := range colTypes {
+			colTypes[i] = tree.HivePartColType{Id: int32(types.T_any)}
+		}
+	}
+
+	predMap := buildPredicateMap(predicates)
+
+	result := &PartitionDiscoveryResult{}
+	err := discoverRecursive(ctx, listDir, basePath, basePath, partCols, colTypes, predMap, 0, result)
+	if err != nil {
+		return nil, err
+	}
+	return result, nil
+}
+
+func buildPredicateMap(predicates []PartitionPredicate) map[string]*PartitionPredicate {
+	m := make(map[string]*PartitionPredicate, len(predicates))
+	for i := range predicates {
+		m[predicates[i].ColName] = &predicates[i]
+	}
+	return m
+}
+
+func discoverRecursive(
+	ctx context.Context,
+	listDir ListDirFunc,
+	basePath string,
+	prefix string,
+	partCols []string,
+	colTypes []tree.HivePartColType,
+	predMap map[string]*PartitionPredicate,
+	level int,
+	result *PartitionDiscoveryResult,
+) error {
+	result.ListCalls++
+	if result.ListCalls > maxListCalls {
+		return moerr.NewInternalErrorNoCtxf(
+			"hive partition discovery exceeded %d List calls; reduce partition depth or add filters", maxListCalls)
+	}
+
+	isLastLevel := level == len(partCols)-1
+	childPrefixes := make([]string, 0)
+
+	for entry, err := range listDir(ctx, prefix) {
+		if err != nil {
+			return err
+		}
+
+		if IsHiddenFile(entry.Name) {
+			continue
+		}
+
+		if entry.IsDir {
+			if level >= len(partCols) {
+				continue
+			}
+
+			// URL-encoded partition directories are unsupported. Reject '%' during
+			// discovery so values cannot be silently interpreted differently by
+			// different code paths.
+			if strings.Contains(entry.Name, "%") {
+				return moerr.NewInternalErrorNoCtxf(
+					"hive partition directory name contains '%%' which is not supported: '%s'", entry.Name)
+			}
+
+			seg, isHive, parseErr := ParseHivePartitionSegment(entry.Name)
+			if parseErr != nil {
+				return parseErr
+			}
+			if !isHive {
+				continue
+			}
+
+			if strings.ToLower(seg.Key) != partCols[level] {
+				continue
+			}
+
+			pred := predMap[partCols[level]]
+			if !filterPartitionDir(seg.Value, colTypes[level], pred) {
+				result.PrunedCount++
+				continue
+			}
+
+			result.PartitionCount++
+			if result.PartitionCount > maxPartitionCount {
+				return moerr.NewInternalErrorNoCtxf(
+					"hive partition discovery exceeded %d partitions; consider adding partition filters", maxPartitionCount)
+			}
+			if !result.warnEmitted && result.PartitionCount > warnPartitionCount {
+				result.warnEmitted = true
+				logutil.Warnf("hive partition discovery: partition count exceeds %d (current: %d, base: %s); consider adding partition filters",
+					warnPartitionCount, result.PartitionCount, basePath)
+			}
+
+			childPrefixes = append(childPrefixes, path.Join(prefix, entry.Name))
+		}
+	}
+
+	// Count all matching partitions at this level before descending. Otherwise
+	// a very wide single-level table hits maxListCalls while collecting each
+	// leaf before maxPartitionCount can ever fire.
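+	// (For example, a hypothetical 60,000-directory single level now trips
+	// the maxPartitionCount check with a clear message instead of burning one
+	// List call per leaf until the maxListCalls cap fires first.)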
+	for _, childPrefix := range childPrefixes {
+		if isLastLevel {
+			if err := collectFiles(ctx, listDir, childPrefix, result); err != nil {
+				return err
+			}
+		} else {
+			if err := discoverRecursive(ctx, listDir, basePath, childPrefix, partCols, colTypes, predMap, level+1, result); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+func collectFiles(
+	ctx context.Context,
+	listDir ListDirFunc,
+	prefix string,
+	result *PartitionDiscoveryResult,
+) error {
+	result.ListCalls++
+	if result.ListCalls > maxListCalls {
+		return moerr.NewInternalErrorNoCtxf(
+			"hive partition discovery exceeded %d List calls; reduce partition depth or add filters", maxListCalls)
+	}
+
+	for entry, err := range listDir(ctx, prefix) {
+		if err != nil {
+			return err
+		}
+		if entry.IsDir || IsHiddenFile(entry.Name) {
+			continue
+		}
+		if IsParquetFile(entry.Name) {
+			result.Files = append(result.Files, PartitionFileEntry{
+				FilePath: path.Join(prefix, entry.Name),
+				FileSize: entry.Size,
+			})
+		}
+	}
+	return nil
+}
+
+// filterPartitionDir returns true if the directory should be kept (not pruned).
+// Only MatchFalse causes pruning; MatchUnknown is conservative (keeps directory).
+func filterPartitionDir(dirValue string, colType tree.HivePartColType, pred *PartitionPredicate) bool {
+	if pred == nil {
+		return true
+	}
+	result := matchPartitionValue(dirValue, pred.Values, colType)
+	return result != MatchFalse
+}
+
+// MatchResult is a three-state result for partition value comparison.
+type MatchResult int
+
+const (
+	MatchTrue    MatchResult = iota // definitely matches
+	MatchFalse                      // definitely does not match (safe to prune)
+	MatchUnknown                    // cannot determine (must keep directory)
+)
+
+// matchPartitionValue compares a partition directory value against predicate values.
+// Conservative: returns MatchUnknown whenever precise comparison isn't possible.
+func matchPartitionValue(dirValue string, predicateValues []string, colType tree.HivePartColType) MatchResult {
+	// SET/ENUM columns stored as numeric types but with Enumvalues must not be
+	// pruned numerically — their directory values are member names.
+	if !canPruneType(colType) {
+		return MatchUnknown
+	}
+	switch types.T(colType.Id) {
+	case types.T_any:
+		return MatchUnknown
+
+	case types.T_int8:
+		return matchInt(dirValue, predicateValues, 8)
+	case types.T_int16:
+		return matchInt(dirValue, predicateValues, 16)
+	case types.T_int32:
+		return matchInt(dirValue, predicateValues, 32)
+	case types.T_int64:
+		return matchInt(dirValue, predicateValues, 64)
+
+	case types.T_uint8:
+		return matchUint(dirValue, predicateValues, 8)
+	case types.T_uint16:
+		return matchUint(dirValue, predicateValues, 16)
+	case types.T_uint32:
+		return matchUint(dirValue, predicateValues, 32)
+	case types.T_uint64:
+		return matchUint(dirValue, predicateValues, 64)
+
+	case types.T_char, types.T_varchar, types.T_text:
+		for _, pv := range predicateValues {
+			if dirValue == pv {
+				return MatchTrue
+			}
+		}
+		return MatchUnknown
+
+	default:
+		// bool, float, decimal, date/time, json, uuid, enum, set, bit, etc.
+		return MatchUnknown
+	}
+}
+
+// canPruneType checks whether we can safely prune this type.
+// SET/ENUM stored as T_uint64/T_uint16 with Enumvalues must NOT be pruned
+// numerically — their directory values are member names, not raw integers.
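+//
+// For example, a SET('a','b','c') column arrives here as T_uint64 with
+// Enumvalues "a,b,c"; its directory value is member text such as "a,b",
+// so comparing it numerically against an encoded predicate would be wrong.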
+func canPruneType(colType tree.HivePartColType) bool { + if colType.Enumvalues != "" { + return false + } + switch types.T(colType.Id) { + case types.T_int8, types.T_int16, types.T_int32, types.T_int64, + types.T_uint8, types.T_uint16, types.T_uint32, types.T_uint64, + types.T_char, types.T_varchar, types.T_text: + return true + default: + return false + } +} + +func matchInt(dirVal string, predVals []string, bitSize int) MatchResult { + dv, err := strconv.ParseInt(dirVal, 10, bitSize) + if err != nil { + return MatchUnknown + } + for _, pv := range predVals { + pvi, err := strconv.ParseInt(pv, 10, bitSize) + if err != nil { + return MatchUnknown + } + if dv == pvi { + return MatchTrue + } + } + return MatchFalse +} + +func matchUint(dirVal string, predVals []string, bitSize int) MatchResult { + dv, err := strconv.ParseUint(dirVal, 10, bitSize) + if err != nil { + return MatchUnknown + } + for _, pv := range predVals { + pvi, err := strconv.ParseUint(pv, 10, bitSize) + if err != nil { + return MatchUnknown + } + if dv == pvi { + return MatchTrue + } + } + return MatchFalse +} + +// --------------------------------------------------------------------------- +// Filter classification and partition predicate extraction +// --------------------------------------------------------------------------- + +// filePathColSet is the set of virtual columns Hive pushes into +// FilterFileList at compile time. Only __mo_filepath qualifies: it is +// appended to every external table's TableDef.Cols (query_builder.go:4902) +// and its value is known before we open any parquet file. +// +// STATEMENT_ACCOUNT ("account") is deliberately excluded. It is not a +// virtual column on Hive/Parquet tables — it is synthesized per-batch by +// makeFilepathBatch (external.go:322) only for CSV external tables' tenant +// filter evaluation. Including it here would misclassify any physical +// column literally named "account" as a filepath filter and evaluate it +// against getAccountCol(path), producing wrong results. +var filePathColSet = map[string]bool{ + catalog.ExternalFilePath: true, +} + +// ClassifyFilters splits a filter list into three disjoint groups: +// - partitionFilters: only reference partition columns (also copied to rowFilters) +// - filePathFilters: only reference filepath virtual columns (see filePathColSet: __mo_filepath only) +// - rowFilters: everything else, plus partition filters for double-filtering safety +func ClassifyFilters( + tableDef *plan.TableDef, + filters []*plan.Expr, + partColSet map[string]bool, +) (partitionFilters, filePathFilters, rowFilters []*plan.Expr) { + for _, f := range filters { + refs := collectBareColNames(tableDef, f) + if len(refs) == 0 { + rowFilters = append(rowFilters, f) + continue + } + if subsetOf(refs, partColSet) { + partitionFilters = append(partitionFilters, f) + rowFilters = append(rowFilters, f) + continue + } + if subsetOf(refs, filePathColSet) { + filePathFilters = append(filePathFilters, f) + continue + } + rowFilters = append(rowFilters, f) + } + return +} + +// subsetOf returns true if every key in refs exists in allowed. +func subsetOf(refs map[string]bool, allowed map[string]bool) bool { + if len(refs) == 0 { + return false + } + for name := range refs { + if !allowed[name] { + return false + } + } + return true +} + +// collectBareColNames extracts the set of bare column names referenced by an expression. +// Uses ColPos to look up names from the pruned TableDef (not col.Name which may contain table prefix). 
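+//
+// For instance, walking the tree for "t.year = 2024" yields {"year"}: the
+// column ref resolves through ColPos when it is in range; otherwise the
+// textual name is used with any "t." prefix stripped.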
+func collectBareColNames(tableDef *plan.TableDef, expr *plan.Expr) map[string]bool { + names := map[string]bool{} + var walk func(e *plan.Expr) + walk = func(e *plan.Expr) { + if e == nil { + return + } + switch v := e.Expr.(type) { + case *plan.Expr_Col: + colPos := v.Col.ColPos + if colPos >= 0 && int(colPos) < len(tableDef.Cols) { + names[strings.ToLower(tableDef.Cols[colPos].Name)] = true + } else { + name := v.Col.Name + if idx := strings.LastIndexByte(name, '.'); idx >= 0 { + name = name[idx+1:] + } + names[strings.ToLower(name)] = true + } + case *plan.Expr_F: + for _, arg := range v.F.Args { + walk(arg) + } + } + } + walk(expr) + return names +} + +// ExtractPartitionPredicatesFromExprs converts partition filter expressions +// into PartitionPredicate hints for directory-level pruning. +// Supports col = const (PartOpEq) and col IN (const, ...) (PartOpIn). +// Expressions that cannot be structurally decomposed are silently skipped. +// +// Note: After optimizer constant-folding (rule.ConstantFold applies to all nodes), +// IN lists may be folded from Expr_List to Expr_Vec. Both forms are handled. +func ExtractPartitionPredicatesFromExprs( + tableDef *plan.TableDef, + partFilters []*plan.Expr, + partColSet map[string]bool, +) []PartitionPredicate { + var preds []PartitionPredicate + for _, f := range partFilters { + if pred, ok := tryExtractPredicate(tableDef, f, partColSet); ok { + preds = append(preds, pred) + } + } + return preds +} + +func tryExtractPredicate(tableDef *plan.TableDef, expr *plan.Expr, partColSet map[string]bool) (PartitionPredicate, bool) { + fn, ok := expr.Expr.(*plan.Expr_F) + if !ok { + return PartitionPredicate{}, false + } + + fid, _ := function.DecodeOverloadID(fn.F.Func.GetObj()) + switch fid { + case function.EQUAL: + return tryExtractEqual(tableDef, fn.F.Args, partColSet) + case function.IN: + return tryExtractIn(tableDef, fn.F.Args, partColSet) + default: + return PartitionPredicate{}, false + } +} + +func tryExtractEqual(tableDef *plan.TableDef, args []*plan.Expr, partColSet map[string]bool) (PartitionPredicate, bool) { + if len(args) != 2 { + return PartitionPredicate{}, false + } + colName, colOk := getPartColName(tableDef, args[0], partColSet) + litVal, litOk := getLiteralString(args[1]) + if !colOk || !litOk { + colName, colOk = getPartColName(tableDef, args[1], partColSet) + litVal, litOk = getLiteralString(args[0]) + if !colOk || !litOk { + return PartitionPredicate{}, false + } + } + return PartitionPredicate{ + ColName: colName, + Op: PartOpEq, + Values: []string{litVal}, + }, true +} + +func tryExtractIn(tableDef *plan.TableDef, args []*plan.Expr, partColSet map[string]bool) (PartitionPredicate, bool) { + if len(args) != 2 { + return PartitionPredicate{}, false + } + colName, colOk := getPartColName(tableDef, args[0], partColSet) + if !colOk { + return PartitionPredicate{}, false + } + + // After optimizer constant-folding, IN lists may be Expr_List or Expr_Vec. 
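+	// (An Expr_Vec right-hand side carries the folded constants as a
+	// serialized vector inside a LiteralVec; extractVecValues decodes it
+	// back into strings below.)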
+ switch v := args[1].Expr.(type) { + case *plan.Expr_List: + if v.List == nil { + return PartitionPredicate{}, false + } + values := make([]string, 0, len(v.List.List)) + for _, item := range v.List.List { + litVal, litOk := getLiteralString(item) + if !litOk { + return PartitionPredicate{}, false + } + values = append(values, litVal) + } + if len(values) == 0 { + return PartitionPredicate{}, false + } + return PartitionPredicate{ColName: colName, Op: PartOpIn, Values: values}, true + + case *plan.Expr_Vec: + values, ok := extractVecValues(v.Vec, args[0].Typ) + if !ok || len(values) == 0 { + return PartitionPredicate{}, false + } + return PartitionPredicate{ColName: colName, Op: PartOpIn, Values: values}, true + + default: + return PartitionPredicate{}, false + } +} + +// extractVecValues decodes a folded LiteralVec into string values for pruning. +func extractVecValues(litVec *plan.LiteralVec, typ plan.Type) (values []string, ok bool) { + if litVec == nil || litVec.Len <= 0 || len(litVec.Data) == 0 { + return nil, false + } + oid := types.T(typ.Id) + if !vectorBinaryEnvelopeInBounds(litVec.Data) { + return nil, false + } + + vec := vector.NewVec(types.New(oid, typ.Width, typ.Scale)) + defer vec.Free(nil) + if err := vec.UnmarshalBinary(litVec.Data); err != nil { + return nil, false + } + if vec.GetType().Oid != oid || vec.Length() != int(litVec.Len) { + return nil, false + } + + n := vec.Length() + values = make([]string, 0, n) + switch oid { + case types.T_int8: + col := vector.MustFixedColNoTypeCheck[int8](vec) + for i := 0; i < n; i++ { + values = append(values, strconv.FormatInt(int64(col[i]), 10)) + } + case types.T_int16: + col := vector.MustFixedColNoTypeCheck[int16](vec) + for i := 0; i < n; i++ { + values = append(values, strconv.FormatInt(int64(col[i]), 10)) + } + case types.T_int32: + col := vector.MustFixedColNoTypeCheck[int32](vec) + for i := 0; i < n; i++ { + values = append(values, strconv.FormatInt(int64(col[i]), 10)) + } + case types.T_int64: + col := vector.MustFixedColNoTypeCheck[int64](vec) + for i := 0; i < n; i++ { + values = append(values, strconv.FormatInt(col[i], 10)) + } + case types.T_uint8: + col := vector.MustFixedColNoTypeCheck[uint8](vec) + for i := 0; i < n; i++ { + values = append(values, strconv.FormatUint(uint64(col[i]), 10)) + } + case types.T_uint16: + col := vector.MustFixedColNoTypeCheck[uint16](vec) + for i := 0; i < n; i++ { + values = append(values, strconv.FormatUint(uint64(col[i]), 10)) + } + case types.T_uint32: + col := vector.MustFixedColNoTypeCheck[uint32](vec) + for i := 0; i < n; i++ { + values = append(values, strconv.FormatUint(uint64(col[i]), 10)) + } + case types.T_uint64: + col := vector.MustFixedColNoTypeCheck[uint64](vec) + for i := 0; i < n; i++ { + values = append(values, strconv.FormatUint(col[i], 10)) + } + case types.T_char, types.T_varchar, types.T_text: + col := vector.MustFixedColNoTypeCheck[types.Varlena](vec) + area := vec.GetArea() + for i := 0; i < n; i++ { + bs, ok := safeVarlenaBytes(&col[i], area) + if !ok { + return nil, false + } + values = append(values, string(bs)) + } + default: + return nil, false + } + return values, true +} + +// vectorBinaryEnvelopeInBounds only checks the bounds of Vector.UnmarshalBinary's +// envelope before calling it. It does not validate type semantics; those are +// checked after UnmarshalBinary succeeds. 
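+//
+// The shape being bounds-checked is, roughly: one flag byte, a types.Type
+// header, a 4-byte vector length, then three length-prefixed sections
+// (presumably data, area, and nulls), followed by at least one trailing byte.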
+func vectorBinaryEnvelopeInBounds(data []byte) bool { + if len(data) == 0 || int(data[0]) != vector.FLAT { + return false + } + pos := 1 + types.TSize + if len(data) < pos+4 { + return false + } + pos += 4 // vector length + + for i := 0; i < 3; i++ { + if len(data) < pos+4 { + return false + } + n := types.DecodeUint32(data[pos : pos+4]) + pos += 4 + if uint64(n) > uint64(len(data)-pos) { + return false + } + pos += int(n) + } + return len(data) >= pos+1 +} + +func safeVarlenaBytes(v *types.Varlena, area []byte) ([]byte, bool) { + if v.IsSmall() { + return v.ByteSlice(), true + } + off, size := v.OffsetLen() + end := uint64(off) + uint64(size) + if end > uint64(len(area)) { + return nil, false + } + return area[int(off):int(end)], true +} + +// getPartColName returns the bare partition column name from a column expression. +func getPartColName(tableDef *plan.TableDef, expr *plan.Expr, partColSet map[string]bool) (string, bool) { + col, ok := expr.Expr.(*plan.Expr_Col) + if !ok { + return "", false + } + colPos := col.Col.ColPos + var name string + if colPos >= 0 && int(colPos) < len(tableDef.Cols) { + name = strings.ToLower(tableDef.Cols[colPos].Name) + } else { + name = col.Col.Name + if idx := strings.LastIndexByte(name, '.'); idx >= 0 { + name = name[idx+1:] + } + name = strings.ToLower(name) + } + if !partColSet[name] { + return "", false + } + return name, true +} + +// getLiteralString extracts a string representation from a literal expression. +// Only accepts Expr_Lit (rejects Expr_F such as cast which may change value). +func getLiteralString(expr *plan.Expr) (string, bool) { + lit, ok := expr.Expr.(*plan.Expr_Lit) + if !ok || lit.Lit == nil || lit.Lit.Isnull { + return "", false + } + switch v := lit.Lit.Value.(type) { + case *plan.Literal_Sval: + return v.Sval, true + case *plan.Literal_I8Val: + return strconv.FormatInt(int64(v.I8Val), 10), true + case *plan.Literal_I16Val: + return strconv.FormatInt(int64(v.I16Val), 10), true + case *plan.Literal_I32Val: + return strconv.FormatInt(int64(v.I32Val), 10), true + case *plan.Literal_I64Val: + return strconv.FormatInt(v.I64Val, 10), true + case *plan.Literal_U8Val: + return strconv.FormatUint(uint64(v.U8Val), 10), true + case *plan.Literal_U16Val: + return strconv.FormatUint(uint64(v.U16Val), 10), true + case *plan.Literal_U32Val: + return strconv.FormatUint(uint64(v.U32Val), 10), true + case *plan.Literal_U64Val: + return strconv.FormatUint(v.U64Val, 10), true + case *plan.Literal_Fval: + return fmt.Sprintf("%g", v.Fval), true + case *plan.Literal_Dval: + return fmt.Sprintf("%g", v.Dval), true + case *plan.Literal_Bval: + if v.Bval { + return "true", true + } + return "false", true + default: + return "", false + } +} diff --git a/pkg/sql/colexec/external/hive_partition_coverage_test.go b/pkg/sql/colexec/external/hive_partition_coverage_test.go new file mode 100644 index 0000000000000..873a16e178458 --- /dev/null +++ b/pkg/sql/colexec/external/hive_partition_coverage_test.go @@ -0,0 +1,825 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package external + +import ( + "math" + "testing" + + "github.com/matrixorigin/matrixone/pkg/catalog" + "github.com/matrixorigin/matrixone/pkg/container/batch" + "github.com/matrixorigin/matrixone/pkg/container/types" + "github.com/matrixorigin/matrixone/pkg/container/vector" + "github.com/matrixorigin/matrixone/pkg/pb/plan" + "github.com/matrixorigin/matrixone/pkg/sql/parsers/tree" + "github.com/matrixorigin/matrixone/pkg/sql/plan/function" + "github.com/matrixorigin/matrixone/pkg/testutil" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// --------------------------------------------------------------------------- +// NewListDirFunc +// --------------------------------------------------------------------------- + +// TestNewListDirFunc_InfileETL exercises the non-S3 branch of NewListDirFunc. +// For ScanType=INFILE the builder falls through to the plain FileService ETL +// path; we just need to confirm the factory returns a non-nil ListDirFunc +// that yields an error when pointed at a non-existent directory. +func TestNewListDirFunc_InfileETL(t *testing.T) { + param := &tree.ExternParam{} + param.Filepath = "/nonexistent/hive/root" + fn := NewListDirFunc(param) + require.NotNil(t, fn) + // Iterating an ETL path that mo cannot resolve should surface an error + // (either from GetForETLWithType or from fs.List). Either way the + // iterator yields at least once. + gotAny := false + for entry, err := range fn(t.Context(), "/nonexistent/hive/root") { + _ = entry + _ = err + gotAny = true + break + } + _ = gotAny +} + +// --------------------------------------------------------------------------- +// matchPartitionValue — the non-prunable type arms +// --------------------------------------------------------------------------- + +func TestMatchPartitionValue_AllTypesReturnUnknown(t *testing.T) { + // Every type in the switch not explicitly prunable returns MatchUnknown. + // Covers the default arm plus every explicit non-prunable case. + nonPrunable := []types.T{ + types.T_bool, types.T_float32, types.T_float64, + types.T_decimal64, types.T_decimal128, + types.T_date, types.T_datetime, types.T_timestamp, types.T_time, + types.T_json, types.T_uuid, types.T_blob, types.T_binary, types.T_varbinary, + types.T_datalink, types.T_bit, types.T_enum, + } + for _, typ := range nonPrunable { + ct := tree.HivePartColType{Id: int32(typ)} + got := matchPartitionValue("anything", []string{"anything"}, ct) + assert.Equal(t, MatchUnknown, got, "type %v must return MatchUnknown", typ) + } +} + +func TestMatchPartitionValue_IntParseErrorValue(t *testing.T) { + ct := tree.HivePartColType{Id: int32(types.T_int32)} + // Directory value parses fine but predicate value does not → MatchUnknown. + assert.Equal(t, MatchUnknown, matchPartitionValue("100", []string{"notanint"}, ct)) +} + +func TestMatchPartitionValue_UintParseErrorValue(t *testing.T) { + ct := tree.HivePartColType{Id: int32(types.T_uint32)} + assert.Equal(t, MatchUnknown, matchPartitionValue("abc", []string{"100"}, ct)) + assert.Equal(t, MatchUnknown, matchPartitionValue("100", []string{"notauint"}, ct)) +} + +func TestMatchPartitionValue_UintOverflow(t *testing.T) { + // 256 does not fit uint8 — parse fails → MatchUnknown. 
+	ct := tree.HivePartColType{Id: int32(types.T_uint8)}
+	assert.Equal(t, MatchUnknown, matchPartitionValue("256", []string{"256"}, ct))
+}
+
+// ---------------------------------------------------------------------------
+// getLiteralString — each Literal_* arm
+// ---------------------------------------------------------------------------
+
+func TestGetLiteralString_AllTypes(t *testing.T) {
+	// Each literal shape should be recognized. The isLiteral_Value interface
+	// is unexported so we construct a Literal per shape and then place it into
+	// an Expr_Lit manually.
+	build := func(lit *plan.Literal) *plan.Expr {
+		return &plan.Expr{Expr: &plan.Expr_Lit{Lit: lit}}
+	}
+	type tc struct {
+		name string
+		lit  *plan.Literal
+		want string
+	}
+	cases := []tc{
+		{"sval", &plan.Literal{Value: &plan.Literal_Sval{Sval: "hi"}}, "hi"},
+		{"i8", &plan.Literal{Value: &plan.Literal_I8Val{I8Val: -7}}, "-7"},
+		{"i16", &plan.Literal{Value: &plan.Literal_I16Val{I16Val: 30000}}, "30000"},
+		{"i32", &plan.Literal{Value: &plan.Literal_I32Val{I32Val: 2024}}, "2024"},
+		{"i64", &plan.Literal{Value: &plan.Literal_I64Val{I64Val: 2450900}}, "2450900"},
+		{"u8", &plan.Literal{Value: &plan.Literal_U8Val{U8Val: 200}}, "200"},
+		{"u16", &plan.Literal{Value: &plan.Literal_U16Val{U16Val: 60000}}, "60000"},
+		{"u32", &plan.Literal{Value: &plan.Literal_U32Val{U32Val: 4_000_000_000}}, "4000000000"},
+		{"u64", &plan.Literal{Value: &plan.Literal_U64Val{U64Val: 18_000_000_000}}, "18000000000"},
+		{"float", &plan.Literal{Value: &plan.Literal_Fval{Fval: 1.5}}, "1.5"},
+		{"double", &plan.Literal{Value: &plan.Literal_Dval{Dval: 2.5}}, "2.5"},
+		{"bool-true", &plan.Literal{Value: &plan.Literal_Bval{Bval: true}}, "true"},
+		{"bool-false", &plan.Literal{Value: &plan.Literal_Bval{Bval: false}}, "false"},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			got, ok := getLiteralString(build(c.lit))
+			require.True(t, ok, "%s must be recognized", c.name)
+			assert.Equal(t, c.want, got)
+		})
+	}
+}
+
+func TestGetLiteralString_NotLiteralRejects(t *testing.T) {
+	// Expr_Col → not a literal.
+	colExpr := &plan.Expr{Expr: &plan.Expr_Col{Col: &plan.ColRef{ColPos: 0, Name: "x"}}}
+	_, ok := getLiteralString(colExpr)
+	assert.False(t, ok)
+
+	// nil Lit
+	nilLit := &plan.Expr{Expr: &plan.Expr_Lit{Lit: nil}}
+	_, ok = getLiteralString(nilLit)
+	assert.False(t, ok)
+
+	// Isnull literal
+	nullLit := &plan.Expr{Expr: &plan.Expr_Lit{Lit: &plan.Literal{Isnull: true}}}
+	_, ok = getLiteralString(nullLit)
+	assert.False(t, ok)
+}
+
+func TestGetLiteralString_UnsupportedValueRejects(t *testing.T) {
+	// Decimal128 literal is not recognized by getLiteralString (falls in default arm).
+ lit := &plan.Literal{Value: &plan.Literal_Decimal128Val{Decimal128Val: &plan.Decimal128{A: 0, B: 0}}} + expr := &plan.Expr{Expr: &plan.Expr_Lit{Lit: lit}} + _, ok := getLiteralString(expr) + assert.False(t, ok, "decimal128 literal is not supported by getLiteralString") +} + +// --------------------------------------------------------------------------- +// extractVecValues — fixed integer and unsigned arms +// --------------------------------------------------------------------------- + +func TestExtractVecValues_Int8(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + v := vector.NewVec(types.T_int8.ToType()) + require.NoError(t, vector.AppendFixed(v, int8(-7), false, mp)) + require.NoError(t, vector.AppendFixed(v, int8(7), false, mp)) + data, err := v.MarshalBinary() + require.NoError(t, err) + v.Free(mp) + + vals, ok := extractVecValues( + &plan.LiteralVec{Len: 2, Data: data}, + plan.Type{Id: int32(types.T_int8)}) + require.True(t, ok) + assert.Equal(t, []string{"-7", "7"}, vals) +} + +func TestExtractVecValues_Int16(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + v := vector.NewVec(types.T_int16.ToType()) + require.NoError(t, vector.AppendFixed(v, int16(-123), false, mp)) + require.NoError(t, vector.AppendFixed(v, int16(32000), false, mp)) + data, err := v.MarshalBinary() + require.NoError(t, err) + v.Free(mp) + vals, ok := extractVecValues( + &plan.LiteralVec{Len: 2, Data: data}, + plan.Type{Id: int32(types.T_int16)}) + require.True(t, ok) + assert.Equal(t, []string{"-123", "32000"}, vals) +} + +func TestExtractVecValues_Int64(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + v := vector.NewVec(types.T_int64.ToType()) + require.NoError(t, vector.AppendFixed(v, int64(-5), false, mp)) + require.NoError(t, vector.AppendFixed(v, int64(2450900), false, mp)) + data, err := v.MarshalBinary() + require.NoError(t, err) + v.Free(mp) + vals, ok := extractVecValues( + &plan.LiteralVec{Len: 2, Data: data}, + plan.Type{Id: int32(types.T_int64)}) + require.True(t, ok) + assert.Equal(t, []string{"-5", "2450900"}, vals) +} + +func TestExtractVecValues_Uint8(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + v := vector.NewVec(types.T_uint8.ToType()) + require.NoError(t, vector.AppendFixed(v, uint8(3), false, mp)) + require.NoError(t, vector.AppendFixed(v, uint8(250), false, mp)) + data, err := v.MarshalBinary() + require.NoError(t, err) + v.Free(mp) + vals, ok := extractVecValues( + &plan.LiteralVec{Len: 2, Data: data}, + plan.Type{Id: int32(types.T_uint8)}) + require.True(t, ok) + assert.Equal(t, []string{"3", "250"}, vals) +} + +func TestExtractVecValues_Uint16(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + v := vector.NewVec(types.T_uint16.ToType()) + require.NoError(t, vector.AppendFixed(v, uint16(3), false, mp)) + require.NoError(t, vector.AppendFixed(v, uint16(60000), false, mp)) + data, err := v.MarshalBinary() + require.NoError(t, err) + v.Free(mp) + vals, ok := extractVecValues( + &plan.LiteralVec{Len: 2, Data: data}, + plan.Type{Id: int32(types.T_uint16)}) + require.True(t, ok) + assert.Equal(t, []string{"3", "60000"}, vals) +} + +func TestExtractVecValues_Uint32(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + v := vector.NewVec(types.T_uint32.ToType()) + require.NoError(t, vector.AppendFixed(v, uint32(3), false, mp)) + require.NoError(t, vector.AppendFixed(v, uint32(4_000_000_000), false, mp)) + data, err := v.MarshalBinary() + require.NoError(t, err) + v.Free(mp) + vals, ok := 
extractVecValues(
+		&plan.LiteralVec{Len: 2, Data: data},
+		plan.Type{Id: int32(types.T_uint32)})
+	require.True(t, ok)
+	assert.Equal(t, []string{"3", "4000000000"}, vals)
+}
+
+func TestExtractVecValues_Uint64(t *testing.T) {
+	proc := testutil.NewProc(t)
+	mp := proc.Mp()
+	v := vector.NewVec(types.T_uint64.ToType())
+	require.NoError(t, vector.AppendFixed(v, uint64(3), false, mp))
+	require.NoError(t, vector.AppendFixed(v, uint64(18_000_000_000), false, mp))
+	data, err := v.MarshalBinary()
+	require.NoError(t, err)
+	v.Free(mp)
+	vals, ok := extractVecValues(
+		&plan.LiteralVec{Len: 2, Data: data},
+		plan.Type{Id: int32(types.T_uint64)})
+	require.True(t, ok)
+	assert.Equal(t, []string{"3", "18000000000"}, vals)
+}
+
+func TestExtractVecValues_UnsupportedType(t *testing.T) {
+	// Decimal128 is not handled by extractVecValues — falls through to default
+	// → returns (nil, false). Use a valid binary shape so vectorBinaryEnvelopeInBounds
+	// doesn't reject first.
+	proc := testutil.NewProc(t)
+	mp := proc.Mp()
+	v := vector.NewVec(types.T_decimal128.ToType())
+	dec := types.Decimal128{B0_63: 123, B64_127: 0}
+	require.NoError(t, vector.AppendFixed(v, dec, false, mp))
+	require.NoError(t, vector.AppendFixed(v, dec, false, mp))
+	data, err := v.MarshalBinary()
+	require.NoError(t, err)
+	v.Free(mp)
+	vals, ok := extractVecValues(
+		&plan.LiteralVec{Len: 2, Data: data},
+		plan.Type{Id: int32(types.T_decimal128)})
+	assert.False(t, ok)
+	assert.Nil(t, vals)
+}
+
+func TestExtractVecValues_EmptyAndNilData(t *testing.T) {
+	// nil LiteralVec
+	_, ok := extractVecValues(nil, plan.Type{Id: int32(types.T_int32)})
+	assert.False(t, ok)
+	// Empty Data
+	_, ok = extractVecValues(&plan.LiteralVec{Len: 0, Data: nil}, plan.Type{Id: int32(types.T_int32)})
+	assert.False(t, ok)
+}
+
+func TestExtractVecValues_CorruptDataRejects(t *testing.T) {
+	// Garbage bytes should be rejected before Vector.UnmarshalBinary can panic.
+	_, ok := extractVecValues(
+		&plan.LiteralVec{Len: 1, Data: []byte{0, 0, 0, 0}},
+		plan.Type{Id: int32(types.T_int32)})
+	assert.False(t, ok)
+}
+
+// ---------------------------------------------------------------------------
+// safeVarlenaBytes
+// ---------------------------------------------------------------------------
+
+func TestSafeVarlenaBytes_SmallInline(t *testing.T) {
+	// Small varlena stores bytes inline; safeVarlenaBytes returns ByteSlice().
+	proc := testutil.NewProc(t)
+	mp := proc.Mp()
+	v := vector.NewVec(types.T_varchar.ToType())
+	require.NoError(t, vector.AppendBytes(v, []byte("hi"), false, mp))
+	col := vector.MustFixedColNoTypeCheck[types.Varlena](v)
+	area := v.GetArea()
+	bs, ok := safeVarlenaBytes(&col[0], area)
+	require.True(t, ok)
+	assert.Equal(t, []byte("hi"), bs)
+	v.Free(mp)
+}
+
+func TestSafeVarlenaBytes_LongFromArea(t *testing.T) {
+	// Long varlena reads from vec's area.
+	proc := testutil.NewProc(t)
+	mp := proc.Mp()
+	v := vector.NewVec(types.T_varchar.ToType())
+	long := []byte("this-is-definitely-longer-than-varlena-inline-threshold-bytes")
+	require.NoError(t, vector.AppendBytes(v, long, false, mp))
+	col := vector.MustFixedColNoTypeCheck[types.Varlena](v)
+	area := v.GetArea()
+	bs, ok := safeVarlenaBytes(&col[0], area)
+	require.True(t, ok)
+	assert.Equal(t, long, bs)
+	v.Free(mp)
+}
+
+func TestSafeVarlenaBytes_OutOfRangeRejects(t *testing.T) {
+	// Construct a Varlena whose (offset+size) exceeds area length.
+	// Size 100 starting at offset 0, but area has only 10 bytes.
+	var vl types.Varlena
+	// Need a long varlena.
Use SetOffsetLen to mark it long. + vl.SetOffsetLen(0, 100) + area := make([]byte, 10) + _, ok := safeVarlenaBytes(&vl, area) + assert.False(t, ok, "oversized offset+len must be rejected") +} + +// --------------------------------------------------------------------------- +// fillConstantVector — the type branches not yet covered +// --------------------------------------------------------------------------- + +func TestFillConstantVector_Int8(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + vec := vector.NewVec(types.T_int8.ToType()) + col := &plan.ColDef{Name: "y", Typ: plan.Type{Id: int32(types.T_int8)}} + require.NoError(t, fillConstantVector(vec, "42", col, 3, proc, "/t")) + val := vector.MustFixedColNoTypeCheck[int8](vec) + assert.Equal(t, int8(42), val[0]) + vec.Free(mp) +} + +func TestFillConstantVector_Int16(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + vec := vector.NewVec(types.T_int16.ToType()) + col := &plan.ColDef{Name: "y", Typ: plan.Type{Id: int32(types.T_int16)}} + require.NoError(t, fillConstantVector(vec, "12345", col, 2, proc, "/t")) + val := vector.MustFixedColNoTypeCheck[int16](vec) + assert.Equal(t, int16(12345), val[0]) + vec.Free(mp) +} + +func TestFillConstantVector_Int64_AndUintSignFail(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + + vec := vector.NewVec(types.T_int64.ToType()) + col := &plan.ColDef{Name: "y", Typ: plan.Type{Id: int32(types.T_int64)}} + require.NoError(t, fillConstantVector(vec, "-99", col, 1, proc, "/t")) + v64 := vector.MustFixedColNoTypeCheck[int64](vec) + assert.Equal(t, int64(-99), v64[0]) + vec.Free(mp) + + // uint with a negative string → wrapped error path + vec = vector.NewVec(types.T_uint32.ToType()) + col = &plan.ColDef{Name: "u", Typ: plan.Type{Id: int32(types.T_uint32)}} + err := fillConstantVector(vec, "-1", col, 1, proc, "/t") + require.Error(t, err) + vec.Free(nil) +} + +func TestFillConstantVector_Uint8_Uint16_Uint64(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + + for _, tc := range []struct { + name string + typId types.T + strVal string + }{ + {"uint8", types.T_uint8, "200"}, + {"uint16", types.T_uint16, "60000"}, + {"uint64", types.T_uint64, "4294967296"}, + } { + t.Run(tc.name, func(t *testing.T) { + vec := vector.NewVec(tc.typId.ToType()) + col := &plan.ColDef{Name: "n", Typ: plan.Type{Id: int32(tc.typId)}} + require.NoError(t, fillConstantVector(vec, tc.strVal, col, 1, proc, "/t")) + vec.Free(mp) + }) + } +} + +func TestFillConstantVector_Bit(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + vec := vector.NewVec(types.T_bit.ToType()) + col := &plan.ColDef{Name: "b", Typ: plan.Type{Id: int32(types.T_bit), Width: 8}} + require.NoError(t, fillConstantVector(vec, "7", col, 1, proc, "/t")) + val := vector.MustFixedColNoTypeCheck[uint64](vec) + assert.Equal(t, uint64(7), val[0]) + vec.Free(mp) + + // ParseUint failure wraps + vec = vector.NewVec(types.T_bit.ToType()) + require.Error(t, fillConstantVector(vec, "abc", col, 1, proc, "/t")) + vec.Free(nil) +} + +func TestFillConstantVector_Float32_Float64(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + + vec := vector.NewVec(types.T_float32.ToType()) + col := &plan.ColDef{Name: "f32", Typ: plan.Type{Id: int32(types.T_float32)}} + require.NoError(t, fillConstantVector(vec, "1.5", col, 1, proc, "/t")) + vec.Free(mp) + + vec = vector.NewVec(types.T_float64.ToType()) + col = &plan.ColDef{Name: "f64", Typ: plan.Type{Id: int32(types.T_float64)}} + require.NoError(t, 
fillConstantVector(vec, "2.25", col, 1, proc, "/t")) + vec.Free(mp) + + // Parse error path + vec = vector.NewVec(types.T_float32.ToType()) + col = &plan.ColDef{Name: "f32", Typ: plan.Type{Id: int32(types.T_float32)}} + require.Error(t, fillConstantVector(vec, "notafloat", col, 1, proc, "/t")) + vec.Free(nil) + + vec = vector.NewVec(types.T_float64.ToType()) + col = &plan.ColDef{Name: "f64", Typ: plan.Type{Id: int32(types.T_float64)}} + require.Error(t, fillConstantVector(vec, "notafloat", col, 1, proc, "/t")) + vec.Free(nil) +} + +func TestFillConstantVector_Decimal64(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + vec := vector.NewVec(types.T_decimal64.ToType()) + col := &plan.ColDef{Name: "d", Typ: plan.Type{Id: int32(types.T_decimal64), Width: 10, Scale: 2}} + require.NoError(t, fillConstantVector(vec, "12.34", col, 1, proc, "/t")) + vec.Free(mp) + + // Parse error + vec = vector.NewVec(types.T_decimal64.ToType()) + require.Error(t, fillConstantVector(vec, "notadecimal", col, 1, proc, "/t")) + vec.Free(nil) +} + +func TestFillConstantVector_Decimal128(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + vec := vector.NewVec(types.T_decimal128.ToType()) + col := &plan.ColDef{Name: "d", Typ: plan.Type{Id: int32(types.T_decimal128), Width: 20, Scale: 2}} + require.NoError(t, fillConstantVector(vec, "123456789.01", col, 1, proc, "/t")) + vec.Free(mp) + + vec = vector.NewVec(types.T_decimal128.ToType()) + require.Error(t, fillConstantVector(vec, "nope", col, 1, proc, "/t")) + vec.Free(nil) +} + +func TestFillConstantVector_Date(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + vec := vector.NewVec(types.T_date.ToType()) + col := &plan.ColDef{Name: "d", Typ: plan.Type{Id: int32(types.T_date)}} + require.NoError(t, fillConstantVector(vec, "2025-06-15", col, 1, proc, "/t")) + vec.Free(mp) + + vec = vector.NewVec(types.T_date.ToType()) + require.Error(t, fillConstantVector(vec, "not-a-date", col, 1, proc, "/t")) + vec.Free(nil) +} + +func TestFillConstantVector_Datetime(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + vec := vector.NewVec(types.T_datetime.ToType()) + col := &plan.ColDef{Name: "dt", Typ: plan.Type{Id: int32(types.T_datetime), Scale: 0}} + require.NoError(t, fillConstantVector(vec, "2025-06-15 12:34:56", col, 1, proc, "/t")) + vec.Free(mp) + + vec = vector.NewVec(types.T_datetime.ToType()) + require.Error(t, fillConstantVector(vec, "not-a-datetime", col, 1, proc, "/t")) + vec.Free(nil) +} + +func TestFillConstantVector_Timestamp(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + vec := vector.NewVec(types.T_timestamp.ToType()) + col := &plan.ColDef{Name: "ts", Typ: plan.Type{Id: int32(types.T_timestamp), Scale: 0}} + require.NoError(t, fillConstantVector(vec, "2025-06-15 12:34:56", col, 1, proc, "/t")) + vec.Free(mp) + + vec = vector.NewVec(types.T_timestamp.ToType()) + require.Error(t, fillConstantVector(vec, "not-a-ts", col, 1, proc, "/t")) + vec.Free(nil) +} + +func TestFillConstantVector_Time(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + vec := vector.NewVec(types.T_time.ToType()) + col := &plan.ColDef{Name: "t", Typ: plan.Type{Id: int32(types.T_time), Scale: 0}} + require.NoError(t, fillConstantVector(vec, "12:34:56", col, 1, proc, "/t")) + vec.Free(mp) + + vec = vector.NewVec(types.T_time.ToType()) + require.Error(t, fillConstantVector(vec, "not-a-time", col, 1, proc, "/t")) + vec.Free(nil) +} + +func TestFillConstantVector_BoolError(t *testing.T) { + proc := 
testutil.NewProc(t) + vec := vector.NewVec(types.T_bool.ToType()) + col := &plan.ColDef{Name: "b", Typ: plan.Type{Id: int32(types.T_bool)}} + require.Error(t, fillConstantVector(vec, "nope", col, 1, proc, "/t")) + vec.Free(nil) +} + +func TestFillConstantVector_Uuid(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + vec := vector.NewVec(types.T_uuid.ToType()) + col := &plan.ColDef{Name: "u", Typ: plan.Type{Id: int32(types.T_uuid)}} + require.NoError(t, fillConstantVector(vec, "00000000-0000-0000-0000-000000000001", col, 1, proc, "/t")) + vec.Free(mp) + + vec = vector.NewVec(types.T_uuid.ToType()) + require.Error(t, fillConstantVector(vec, "not-a-uuid", col, 1, proc, "/t")) + vec.Free(nil) +} + +func TestFillConstantVector_Json(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + vec := vector.NewVec(types.T_json.ToType()) + col := &plan.ColDef{Name: "j", Typ: plan.Type{Id: int32(types.T_json)}} + require.NoError(t, fillConstantVector(vec, `{"a":1}`, col, 1, proc, "/t")) + vec.Free(mp) + + vec = vector.NewVec(types.T_json.ToType()) + require.Error(t, fillConstantVector(vec, "not-json", col, 1, proc, "/t")) + vec.Free(nil) +} + +func TestFillConstantVector_ByteTypes(t *testing.T) { + // char / varchar / text / blob / binary / varbinary / datalink → SetConstBytes. + proc := testutil.NewProc(t) + mp := proc.Mp() + for _, typId := range []types.T{ + types.T_char, types.T_varchar, types.T_text, + types.T_blob, types.T_binary, types.T_varbinary, types.T_datalink, + } { + vec := vector.NewVec(typId.ToType()) + col := &plan.ColDef{Name: "b", Typ: plan.Type{Id: int32(typId)}} + require.NoError(t, fillConstantVector(vec, "xyz", col, 2, proc, "/t")) + vec.Free(mp) + } +} + +func TestFillConstantVector_VectorTypesReturnNotSupported(t *testing.T) { + proc := testutil.NewProc(t) + for _, typId := range []types.T{types.T_array_float32, types.T_array_float64} { + vec := vector.NewVec(typId.ToType()) + col := &plan.ColDef{Name: "v", Typ: plan.Type{Id: int32(typId)}} + err := fillConstantVector(vec, "[1,2,3]", col, 1, proc, "/t") + require.Error(t, err) + assert.Contains(t, err.Error(), "unsupported") + vec.Free(nil) + } +} + +func TestFillConstantVector_UnsupportedTypeDefaultBranch(t *testing.T) { + // Use T_any which is not in the switch → hits default branch. + proc := testutil.NewProc(t) + vec := vector.NewVec(types.T_any.ToType()) + col := &plan.ColDef{Name: "x", Typ: plan.Type{Id: int32(types.T_any)}} + err := fillConstantVector(vec, "whatever", col, 1, proc, "/t") + require.Error(t, err) + assert.Contains(t, err.Error(), "unsupported") + vec.Free(nil) +} + +func TestFillConstantVector_SetStoredAsUint64(t *testing.T) { + // SET is encoded as T_uint64 with non-empty Enumvalues → ParseSet branch. 
+ proc := testutil.NewProc(t) + mp := proc.Mp() + vec := vector.NewVec(types.T_uint64.ToType()) + col := &plan.ColDef{Name: "s", + Typ: plan.Type{Id: int32(types.T_uint64), Enumvalues: "a,b,c"}} + require.NoError(t, fillConstantVector(vec, "b", col, 1, proc, "/t")) + vec.Free(mp) + + // Unknown member → parse error + vec = vector.NewVec(types.T_uint64.ToType()) + require.Error(t, fillConstantVector(vec, "zzz", col, 1, proc, "/t")) + vec.Free(nil) +} + +func TestFillConstantVector_Enum(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + vec := vector.NewVec(types.T_enum.ToType()) + col := &plan.ColDef{Name: "e", + Typ: plan.Type{Id: int32(types.T_enum), Enumvalues: "red,green,blue"}} + require.NoError(t, fillConstantVector(vec, "green", col, 1, proc, "/t")) + vec.Free(mp) + + vec = vector.NewVec(types.T_enum.ToType()) + require.Error(t, fillConstantVector(vec, "purple", col, 1, proc, "/t")) + vec.Free(nil) +} + +// --------------------------------------------------------------------------- +// fillVirtualColumns — both branches (filepath only, combined) +// --------------------------------------------------------------------------- + +func TestFillVirtualColumns_FilepathOnly(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + + // One filepath column in batch. + fpVec := vector.NewVec(types.T_varchar.ToType()) + bat := &batch.Batch{Vecs: []*vector.Vector{fpVec}} + bat.SetRowCount(5) + + param := &ExternalParam{} + param.Fileparam = &ExFileparam{Filepath: "/data/year=2024/f.parquet"} + param.Cols = []*plan.ColDef{ + {Name: catalog.ExternalFilePath, Typ: plan.Type{Id: int32(types.T_varchar)}}, + } + + h := &ParquetHandler{filepathColIndex: 0} + require.NoError(t, h.fillVirtualColumns(bat, param, proc)) + got := fpVec.GetBytesAt(0) + assert.Equal(t, "/data/year=2024/f.parquet", string(got)) + fpVec.Free(mp) +} + +func TestFillVirtualColumns_FilepathAndPartition(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + + // Batch has [filepath varchar, partition int32]. + fpVec := vector.NewVec(types.T_varchar.ToType()) + partVec := vector.NewVec(types.T_int32.ToType()) + bat := &batch.Batch{Vecs: []*vector.Vector{fpVec, partVec}} + bat.SetRowCount(3) + + param := &ExternalParam{} + param.Fileparam = &ExFileparam{Filepath: "/data/year=2024/f.parquet"} + param.Cols = []*plan.ColDef{ + {Name: catalog.ExternalFilePath, Typ: plan.Type{Id: int32(types.T_varchar)}}, + {Name: "year", Typ: plan.Type{Id: int32(types.T_int32)}}, + } + param.Ctx = t.Context() + param.currentPartValues = map[string]string{"year": "2024"} + + h := &ParquetHandler{filepathColIndex: 0, partitionColIndices: []int{1}} + require.NoError(t, h.fillVirtualColumns(bat, param, proc)) + + assert.Equal(t, "/data/year=2024/f.parquet", string(fpVec.GetBytesAt(0))) + pv := vector.MustFixedColNoTypeCheck[int32](partVec) + assert.Equal(t, int32(2024), pv[0]) + fpVec.Free(mp) + partVec.Free(mp) +} + +func TestFillVirtualColumns_NoFilepathNoPartitionNoop(t *testing.T) { + // Neither filepath nor partition columns configured → early return, nil err. 
+ proc := testutil.NewProc(t) + bat := &batch.Batch{} + bat.SetRowCount(0) + param := &ExternalParam{} + param.Fileparam = &ExFileparam{Filepath: "/x"} + + h := &ParquetHandler{filepathColIndex: -1} + assert.NoError(t, h.fillVirtualColumns(bat, param, proc)) +} + +// --------------------------------------------------------------------------- +// relPartitionPath edge cases +// --------------------------------------------------------------------------- + +func TestRelPartitionPath_EdgeCases(t *testing.T) { + // Equal → empty string + assert.Equal(t, "", relPartitionPath("/data", "/data")) + + // Not under base — return normalized filePath unchanged. + assert.Equal(t, "/other/y=2024/f", relPartitionPath("/other/y=2024/f", "/data")) + + // Under base — return tail. + assert.Equal(t, "y=2024/f", relPartitionPath("/data/y=2024/f", "/data")) +} + +// --------------------------------------------------------------------------- +// getPartColName — non-col expression and fallback name-strip +// --------------------------------------------------------------------------- + +func TestGetPartColName_NonColReturnsFalse(t *testing.T) { + td := makeTableDef("year") + partColSet := map[string]bool{"year": true} + lit := makeLitInt64(2024) + _, ok := getPartColName(td, lit, partColSet) + assert.False(t, ok) +} + +func TestGetPartColName_ColPosOutOfRangeFallback(t *testing.T) { + td := makeTableDef("year") + partColSet := map[string]bool{"year": true} + // ColPos way out of range; name fallback strips "t." prefix. + expr := makeColExpr(99, "t.year") + name, ok := getPartColName(td, expr, partColSet) + assert.True(t, ok) + assert.Equal(t, "year", name) +} + +func TestGetPartColName_NonPartitionRejected(t *testing.T) { + td := makeTableDef("other") + partColSet := map[string]bool{"year": true} + expr := makeColExpr(0, "other") + _, ok := getPartColName(td, expr, partColSet) + assert.False(t, ok) +} + +// --------------------------------------------------------------------------- +// tryExtractIn edge cases: wrong arity, non-list/vec right-hand side +// --------------------------------------------------------------------------- + +func TestTryExtractIn_WrongArityReturnsFalse(t *testing.T) { + td := makeTableDef("year") + partColSet := map[string]bool{"year": true} + // Only one arg instead of two. + _, ok := tryExtractIn(td, []*plan.Expr{makeColExpr(0, "year")}, partColSet) + assert.False(t, ok) +} + +func TestTryExtractIn_ColIsNotPartitionRejects(t *testing.T) { + td := makeTableDef("other") + partColSet := map[string]bool{"year": true} + listExpr := &plan.Expr{Expr: &plan.Expr_List{List: &plan.ExprList{List: []*plan.Expr{makeLitInt64(1)}}}} + _, ok := tryExtractIn(td, []*plan.Expr{makeColExpr(0, "other"), listExpr}, partColSet) + assert.False(t, ok) +} + +func TestTryExtractIn_EmptyListRejects(t *testing.T) { + td := makeTableDef("year") + partColSet := map[string]bool{"year": true} + emptyList := &plan.Expr{Expr: &plan.Expr_List{List: &plan.ExprList{List: nil}}} + _, ok := tryExtractIn(td, []*plan.Expr{makeColExpr(0, "year"), emptyList}, partColSet) + assert.False(t, ok) +} + +func TestTryExtractIn_UnsupportedRhsKindRejects(t *testing.T) { + td := makeTableDef("year") + partColSet := map[string]bool{"year": true} + // Right-hand side is neither Expr_List nor Expr_Vec. 
+ nonList := makeLitInt64(42) + _, ok := tryExtractIn(td, []*plan.Expr{makeColExpr(0, "year"), nonList}, partColSet) + assert.False(t, ok) +} + +// --------------------------------------------------------------------------- +// tryExtractPredicate non-supported fid rejection (not EQ / IN) +// --------------------------------------------------------------------------- + +func TestTryExtractPredicate_NonEqInFid(t *testing.T) { + td := makeTableDef("year") + partColSet := map[string]bool{"year": true} + gtExpr := &plan.Expr{ + Expr: &plan.Expr_F{F: &plan.Function{ + Func: &plan.ObjectRef{Obj: int64(function.GREAT_THAN) << 32}, + Args: []*plan.Expr{makeColExpr(0, "year"), makeLitInt64(2024)}, + }}, + } + _, ok := tryExtractPredicate(td, gtExpr, partColSet) + assert.False(t, ok) +} + +func TestTryExtractPredicate_NotAnExprF(t *testing.T) { + td := makeTableDef("year") + partColSet := map[string]bool{"year": true} + _, ok := tryExtractPredicate(td, makeLitInt64(1), partColSet) + assert.False(t, ok) +} + +// Touch math so the import survives even if the file is thinned. +var _ = math.MaxInt32 diff --git a/pkg/sql/colexec/external/hive_partition_fill.go b/pkg/sql/colexec/external/hive_partition_fill.go new file mode 100644 index 0000000000000..018f6020b9c6f --- /dev/null +++ b/pkg/sql/colexec/external/hive_partition_fill.go @@ -0,0 +1,430 @@ +// Copyright 2024 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package external + +import ( + "errors" + "math" + "strconv" + "strings" + + "github.com/matrixorigin/matrixone/pkg/common/moerr" + "github.com/matrixorigin/matrixone/pkg/container/batch" + "github.com/matrixorigin/matrixone/pkg/container/types" + "github.com/matrixorigin/matrixone/pkg/container/vector" + "github.com/matrixorigin/matrixone/pkg/pb/plan" + "github.com/matrixorigin/matrixone/pkg/vm/process" +) + +// isHivePartitionCol returns true if colName is a declared Hive partition column. +func (param *ExternalParam) isHivePartitionCol(colName string) bool { + if param.Extern == nil || !param.Extern.HivePartitioning { + return false + } + lower := strings.ToLower(colName) + for _, pc := range param.Extern.HivePartitionCols { + if pc == lower { + return true + } + } + return false +} + +// refreshPartitionValues extracts partition values from the current file path. +func (param *ExternalParam) refreshPartitionValues() error { + if param.Extern == nil || !param.Extern.HivePartitioning { + return nil + } + values, err := ExtractPartitionValues( + param.Fileparam.Filepath, + param.Extern.Filepath, + param.Extern.HivePartitionCols, + ) + if err != nil { + return err + } + param.currentPartValues = values + return nil +} + +// fillVirtualColumns fills partition columns and __mo_filepath for a batch. 
+func (h *ParquetHandler) fillVirtualColumns(bat *batch.Batch, param *ExternalParam, proc *process.Process) error { + rowCount := bat.RowCount() + mp := proc.Mp() + + if h.filepathColIndex >= 0 { + vec := bat.Vecs[h.filepathColIndex] + if err := vector.SetConstBytes(vec, []byte(param.Fileparam.Filepath), rowCount, mp); err != nil { + return err + } + } + + if len(h.partitionColIndices) > 0 { + return h.fillPartitionColumns(bat, param, proc) + } + return nil +} + +// fillPartitionColumns fills partition column vectors with constant values from the path. +func (h *ParquetHandler) fillPartitionColumns(bat *batch.Batch, param *ExternalParam, proc *process.Process) error { + partValues := param.currentPartValues + rowCount := bat.RowCount() + mp := proc.Mp() + + // Error messages use the path relative to the DDL base so that BVT output + // is portable across machines (absolute filesystem paths would embed + // /Users/... or /tmp/... in .result files). + relPath := param.Fileparam.Filepath + if param.Extern != nil && param.Extern.Filepath != "" { + relPath = relPartitionPath(param.Fileparam.Filepath, param.Extern.Filepath) + } + + for _, idx := range h.partitionColIndices { + col := param.Cols[idx] + colName := strings.ToLower(col.Name) + strVal, present := partValues[colName] + vec := bat.Vecs[idx] + + if !present { + return moerr.NewInternalErrorf(param.Ctx, + "partition column '%s' not found in path '%s'", colName, relPath) + } + + if strVal == HiveDefaultPartition { + notNullable := col.Default != nil && !col.Default.NullAbility + if notNullable { + return moerr.NewConstraintViolationf(param.Ctx, + "partition column '%s' is NOT NULL but directory has __HIVE_DEFAULT_PARTITION__ in path '%s'; allow NULL on the partition column or remove/rename the default partition directory", + colName, relPath) + } + if err := vector.SetConstNull(vec, rowCount, mp); err != nil { + return err + } + continue + } + + if err := fillConstantVector(vec, strVal, col, rowCount, proc, relPath); err != nil { + return err + } + } + return nil +} + +// fillConstantVector converts a string partition value to the column's typed vector. +// Follows external loader semantics (getColData path), not SQL CAST. 
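+//
+// The integer conversion ladder, shown with inputs pinned by
+// hive_partition_test.go (illustrative):
+//
+//	"2025"                -> ParseInt succeeds             -> 2025
+//	"1.5"                 -> ParseInt fails (not ErrRange) -> ParseFloat, truncate to 1
+//	"9223372036854775808" -> ParseInt fails with ErrRange  -> rejected, no float fallback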
+func fillConstantVector( + vec *vector.Vector, strVal string, col *plan.ColDef, + rowCount int, proc *process.Process, filePath string, +) error { + mp := proc.Mp() + typ := types.T(col.Typ.Id) + + wrapErr := func(err error) error { + return moerr.NewInternalErrorf(proc.Ctx, + "partition value type conversion failed: col=%s, value='%s', path=%s: %v", + col.Name, strVal, filePath, err) + } + + switch typ { + case types.T_int8: + v, err := parseIntWithFloatFallback(strVal, 8) + if err != nil { + return wrapErr(err) + } + return vector.SetConstFixed(vec, int8(v), rowCount, mp) + + case types.T_int16: + v, err := parseIntWithFloatFallback(strVal, 16) + if err != nil { + return wrapErr(err) + } + return vector.SetConstFixed(vec, int16(v), rowCount, mp) + + case types.T_int32: + v, err := parseIntWithFloatFallback(strVal, 32) + if err != nil { + return wrapErr(err) + } + return vector.SetConstFixed(vec, int32(v), rowCount, mp) + + case types.T_int64: + v, err := parseIntWithFloatFallback(strVal, 64) + if err != nil { + return wrapErr(err) + } + return vector.SetConstFixed(vec, v, rowCount, mp) + + case types.T_uint8: + v, err := parseUintWithFloatFallback(strVal, 8) + if err != nil { + return wrapErr(err) + } + return vector.SetConstFixed(vec, uint8(v), rowCount, mp) + + case types.T_uint16: + v, err := parseUintWithFloatFallback(strVal, 16) + if err != nil { + return wrapErr(err) + } + return vector.SetConstFixed(vec, uint16(v), rowCount, mp) + + case types.T_uint32: + v, err := parseUintWithFloatFallback(strVal, 32) + if err != nil { + return wrapErr(err) + } + return vector.SetConstFixed(vec, uint32(v), rowCount, mp) + + case types.T_uint64: + if col.Typ.Enumvalues != "" { + // SET type stored as uint64 with Enumvalues + v, err := types.ParseSet(col.Typ.Enumvalues, strVal) + if err != nil { + return wrapErr(err) + } + return vector.SetConstFixed(vec, v, rowCount, mp) + } + v, err := parseUintWithFloatFallback(strVal, 64) + if err != nil { + return wrapErr(err) + } + return vector.SetConstFixed(vec, v, rowCount, mp) + + case types.T_bit: + v, err := strconv.ParseUint(strVal, 10, int(col.Typ.Width)) + if err != nil { + return wrapErr(err) + } + return vector.SetConstFixed(vec, v, rowCount, mp) + + case types.T_float32: + v, err := strconv.ParseFloat(strVal, 32) + if err != nil { + return wrapErr(err) + } + return vector.SetConstFixed(vec, float32(v), rowCount, mp) + + case types.T_float64: + v, err := strconv.ParseFloat(strVal, 64) + if err != nil { + return wrapErr(err) + } + return vector.SetConstFixed(vec, v, rowCount, mp) + + case types.T_decimal64: + v, err := types.ParseDecimal64(strVal, col.Typ.Width, col.Typ.Scale) + if err != nil { + return wrapErr(err) + } + return vector.SetConstFixed(vec, v, rowCount, mp) + + case types.T_decimal128: + v, err := types.ParseDecimal128(strVal, col.Typ.Width, col.Typ.Scale) + if err != nil { + return wrapErr(err) + } + return vector.SetConstFixed(vec, v, rowCount, mp) + + case types.T_date: + v, err := types.ParseDateCast(strVal) + if err != nil { + return wrapErr(err) + } + return vector.SetConstFixed(vec, v, rowCount, mp) + + case types.T_datetime: + v, err := types.ParseDatetime(strVal, col.Typ.Scale) + if err != nil { + return wrapErr(err) + } + return vector.SetConstFixed(vec, v, rowCount, mp) + + case types.T_timestamp: + v, err := types.ParseTimestamp(proc.GetSessionInfo().TimeZone, strVal, col.Typ.Scale) + if err != nil { + return wrapErr(err) + } + return vector.SetConstFixed(vec, v, rowCount, mp) + + case types.T_time: + v, err := 
types.ParseTime(strVal, col.Typ.Scale) + if err != nil { + return wrapErr(err) + } + return vector.SetConstFixed(vec, v, rowCount, mp) + + case types.T_bool: + v, err := types.ParseBool(strVal) + if err != nil { + return wrapErr(err) + } + return vector.SetConstFixed(vec, v, rowCount, mp) + + case types.T_enum: + v, err := types.ParseEnum(col.Typ.Enumvalues, strVal) + if err != nil { + return wrapErr(err) + } + return vector.SetConstFixed(vec, v, rowCount, mp) + + case types.T_char, types.T_varchar, types.T_text, + types.T_blob, types.T_binary, types.T_varbinary, types.T_datalink: + return vector.SetConstBytes(vec, []byte(strVal), rowCount, mp) + + case types.T_json: + v, err := types.ParseStringToByteJson(strVal) + if err != nil { + return wrapErr(err) + } + bs, err := v.Marshal() + if err != nil { + return wrapErr(err) + } + return vector.SetConstBytes(vec, bs, rowCount, mp) + + case types.T_uuid: + v, err := types.ParseUuid(strVal) + if err != nil { + return wrapErr(err) + } + return vector.SetConstFixed(vec, v, rowCount, mp) + + case types.T_array_float32, types.T_array_float64: + return moerr.NewNotSupportedf(proc.Ctx, + "unsupported partition column type VECTOR for col=%s, path=%s", col.Name, filePath) + + default: + return moerr.NewNotSupportedf(proc.Ctx, + "unsupported partition column type %v for col=%s, path=%s", typ, col.Name, filePath) + } +} + +// Float-domain boundary constants for 64-bit overflow detection. +// +// math.MaxInt64 = 2^63 - 1 and math.MaxUint64 = 2^64 - 1 are odd numbers that +// cannot be exactly represented in float64 (only 53 bits of mantissa). Comparing +// float64(f) > math.MaxInt64 silently compares against 2^63 (the nearest float64), +// which lets f == 2^63 slip through and then int64(f) wraps to -2^63. +// +// Using the exact float64 values 2^63 and 2^64 as strict upper bounds closes +// this gap: any f ≥ 2^63 is out of int64 range; any f ≥ 2^64 is out of uint64 +// range. Both 2^63 and 2^64 are exactly representable in float64 (powers of 2). +// +// Additionally, float64 can represent consecutive integers exactly only up to +// 2^53. Beyond that, distinct source integers collapse to the same float64 +// (e.g. "-9223372036854775809.0" and "-9223372036854775808.0" both parse to +// -2^63). When we reach the float fallback for a 64-bit target, we therefore +// reject any |f| ≥ 2^53: genuine integer strings would have succeeded in +// ParseInt and never reached this path, so a float in the non-exact range +// here implies the source string is a non-integer (decimal/exponent form) +// that we cannot safely round to int64/uint64. +const ( + float64MaxInt64Exclusive = 0x1p63 // 2^63, one past max int64 + float64MaxUint64Exclusive = 0x1p64 // 2^64, one past max uint64 + float64IntExactLimit = 0x1p53 // 2^53, largest |x| with consecutive-integer precision +) + +// parseIntWithFloatFallback mimics getColData behavior (external.go:1019): +// 1. ParseInt succeeds → use it +// 2. ParseInt fails with ErrRange → reject (no fallback for overflow) +// 3. ParseInt fails otherwise → ParseFloat + range check +func parseIntWithFloatFallback(s string, bitSize int) (int64, error) { + v, err := strconv.ParseInt(s, 10, bitSize) + if err == nil { + return v, nil + } + if errors.Is(err, strconv.ErrRange) { + return 0, err + } + f, ferr := strconv.ParseFloat(s, 64) + if ferr != nil { + return 0, err + } + // Reject non-finite values ("nan" / "inf" parse successfully to NaN/±Inf). 
+ // NaN in particular is dangerous: NaN < x and NaN >= x are both false, so + // the range checks below silently accept it and int64(NaN) is undefined. + if math.IsNaN(f) || math.IsInf(f, 0) { + return 0, err + } + if bitSize == 64 { + // Exact-float boundaries: MaxInt64 rounds up to 2^63 in float64. + // Also reject any |f| ≥ 2^53 — beyond float64's consecutive-integer + // precision, round-trip through float is ambiguous (see const comment). + if f < math.MinInt64 || f >= float64MaxInt64Exclusive { + return 0, err + } + if f >= float64IntExactLimit || f <= -float64IntExactLimit { + return 0, err + } + } else { + // Compare f against the type's float-domain bounds BEFORE truncation. + // Go's int64(f) truncates toward zero, so a naive post-truncation + // bounds check silently accepts e.g. int32 "-2147483648.9" (truncates + // to -2^31, which passes a ">= -2^31" check). Both ±(2^(N-1)) and + // (2^(N-1) - 1) are exactly representable in float64 for N ≤ 53, so + // the inclusive comparison has no rounding slack. + lo := float64(int64(-1) << (bitSize - 1)) // -2^(N-1) + hi := float64(int64(1)<<(bitSize-1) - 1) // 2^(N-1) - 1 + if f < lo || f > hi { + return 0, err + } + } + return int64(f), nil +} + +// parseUintWithFloatFallback mimics getColData behavior (external.go:1111): +// 1. ParseUint succeeds → use it +// 2. ParseUint fails with ErrRange → reject (no fallback for overflow) +// 3. ParseUint fails otherwise → ParseFloat + range check +func parseUintWithFloatFallback(s string, bitSize int) (uint64, error) { + v, err := strconv.ParseUint(s, 10, bitSize) + if err == nil { + return v, nil + } + if errors.Is(err, strconv.ErrRange) { + return 0, err + } + f, ferr := strconv.ParseFloat(s, 64) + if ferr != nil || f < 0 { + return 0, err + } + // Reject non-finite values (same NaN/Inf risk as parseIntWithFloatFallback). + // Note: f < 0 above already rejects -Inf, but NaN is not ordered so must be + // handled explicitly. + if math.IsNaN(f) || math.IsInf(f, 0) { + return 0, err + } + if bitSize == 64 { + // Same boundary issue as int64 (MaxUint64 rounds up to 2^64) plus the + // 2^53 precision limit for round-tripping integers through float64. + if f >= float64MaxUint64Exclusive { + return 0, err + } + if f >= float64IntExactLimit { + return 0, err + } + } else { + // Compare f against MaxUintN BEFORE truncation, to reject values like + // uint32 "4294967295.9" that Go's uint64(f) would truncate to 2^32 - 1 + // and let through. 2^N - 1 is exactly representable in float64 for + // N ≤ 53, so the inclusive upper bound has no rounding slack. The + // lower bound is implicit: f < 0 is rejected above. + hi := float64(uint64(1)<<bitSize - 1) // 2^N - 1 + if f > hi { + return 0, err + } + } + return uint64(f), nil +} diff --git a/pkg/sql/colexec/external/hive_partition_test.go b/pkg/sql/colexec/external/hive_partition_test.go new file mode 100644 index 0000000000000..94be94b01929d --- /dev/null +++ b/pkg/sql/colexec/external/hive_partition_test.go @@ -0,0 +1,1757 @@ +// Copyright 2024 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package external + +import ( + "context" + "fmt" + "iter" + "strings" + "testing" + + "github.com/matrixorigin/matrixone/pkg/catalog" + "github.com/matrixorigin/matrixone/pkg/container/batch" + "github.com/matrixorigin/matrixone/pkg/container/types" + "github.com/matrixorigin/matrixone/pkg/container/vector" + "github.com/matrixorigin/matrixone/pkg/fileservice" + "github.com/matrixorigin/matrixone/pkg/pb/plan" + "github.com/matrixorigin/matrixone/pkg/sql/parsers/tree" + "github.com/matrixorigin/matrixone/pkg/sql/plan/function" + "github.com/matrixorigin/matrixone/pkg/testutil" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// --- ParseHivePartitionSegment tests --- + +func TestParseHivePartitionSegment_Valid(t *testing.T) { + seg, isHive, err := ParseHivePartitionSegment("year=2024") + require.NoError(t, err) + assert.True(t, isHive) + assert.Equal(t, "year", seg.Key) + assert.Equal(t, "2024", seg.Value) +} + +func TestParseHivePartitionSegment_EmptyValue(t *testing.T) { + seg, isHive, err := ParseHivePartitionSegment("year=") + require.NoError(t, err) + assert.True(t, isHive) + assert.Equal(t, "year", seg.Key) + assert.Equal(t, "", seg.Value) +} + +func TestParseHivePartitionSegment_PercentLiteral(t *testing.T) { + seg, isHive, err := ParseHivePartitionSegment("country=US%2FCA") + require.NoError(t, err) + assert.True(t, isHive) + assert.Equal(t, "country", seg.Key) + assert.Equal(t, "US%2FCA", seg.Value) +} + +func TestParseHivePartitionSegment_NotPartition(t *testing.T) { + _, isHive, err := ParseHivePartitionSegment("data.parquet") + require.NoError(t, err) + assert.False(t, isHive) +} + +func TestParseHivePartitionSegment_StartsWithEquals(t *testing.T) { + _, isHive, err := ParseHivePartitionSegment("=value") + require.NoError(t, err) + assert.False(t, isHive) +} + +func TestParseHivePartitionSegment_InvalidPercentLiteral(t *testing.T) { + seg, isHive, err := ParseHivePartitionSegment("country=US%ZZ") + assert.True(t, isHive) + require.NoError(t, err) + assert.Equal(t, "country", seg.Key) + assert.Equal(t, "US%ZZ", seg.Value) +} + +func TestParseHivePartitionSegment_DefaultPartition(t *testing.T) { + seg, isHive, err := ParseHivePartitionSegment("year=__HIVE_DEFAULT_PARTITION__") + require.NoError(t, err) + assert.True(t, isHive) + assert.Equal(t, "__HIVE_DEFAULT_PARTITION__", seg.Value) +} + +// --- ExtractPartitionValues tests --- + +func TestExtractPartitionValues_SingleLevel(t *testing.T) { + vals, err := ExtractPartitionValues( + "/warehouse/data/year=2024/file.parquet", + "/warehouse/data", + []string{"year"}, + ) + require.NoError(t, err) + assert.Equal(t, "2024", vals["year"]) +} + +func TestExtractPartitionValues_MultiLevel(t *testing.T) { + vals, err := ExtractPartitionValues( + "/warehouse/data/year=2024/month=05/file.parquet", + "/warehouse/data", + []string{"year", "month"}, + ) + require.NoError(t, err) + assert.Equal(t, "2024", vals["year"]) + assert.Equal(t, "05", vals["month"]) +} + +func TestExtractPartitionValues_NormalizePath(t *testing.T) { + tests := []struct { + filePath string + basePath string + }{ + {"warehouse/data/year=2025/f.parquet", "warehouse/data"}, + {"/warehouse/data/year=2025/f.parquet", "/warehouse/data"}, + {"warehouse/data/year=2025/f.parquet", "/warehouse/data"}, + {"/warehouse/data/year=2025/f.parquet", "warehouse/data"}, + {" /warehouse/data/year=2025/f.parquet ", " warehouse/data "}, + } + for _, 
tt := range tests { + vals, err := ExtractPartitionValues(tt.filePath, tt.basePath, []string{"year"}) + require.NoError(t, err, "filePath=%q basePath=%q", tt.filePath, tt.basePath) + assert.Equal(t, "2025", vals["year"], "filePath=%q basePath=%q", tt.filePath, tt.basePath) + } +} + +func TestExtractPartitionValues_PrefixCollision(t *testing.T) { + _, err := ExtractPartitionValues( + "/warehouse/data2/year=2025/f.parquet", + "/warehouse/data", + []string{"year"}, + ) + require.Error(t, err) + assert.Contains(t, err.Error(), "not under base path") +} + +// --- IsHiddenFile tests --- + +func TestIsHiddenFile(t *testing.T) { + assert.True(t, IsHiddenFile(".hidden")) + assert.True(t, IsHiddenFile("_SUCCESS")) + assert.True(t, IsHiddenFile("_metadata")) + assert.False(t, IsHiddenFile("year=2024")) + assert.False(t, IsHiddenFile("data.parquet")) + assert.False(t, IsHiddenFile("")) +} + +// --- IsParquetFile tests --- + +func TestIsParquetFile(t *testing.T) { + assert.True(t, IsParquetFile("data.parquet")) + assert.True(t, IsParquetFile("data.snappy.parquet")) + assert.True(t, IsParquetFile("DATA.PARQUET")) + assert.False(t, IsParquetFile("data.csv")) + assert.False(t, IsParquetFile("data.parquet.crc")) + assert.False(t, IsParquetFile("")) +} + +// --- matchPartitionValue tests --- + +func TestMatchPartitionValue_IntMatch(t *testing.T) { + ct := tree.HivePartColType{Id: int32(types.T_int32)} + assert.Equal(t, MatchTrue, matchPartitionValue("2024", []string{"2024"}, ct)) + assert.Equal(t, MatchFalse, matchPartitionValue("2024", []string{"2025"}, ct)) + assert.Equal(t, MatchTrue, matchPartitionValue("2024", []string{"2023", "2024"}, ct)) +} + +func TestMatchPartitionValue_IntOverflow(t *testing.T) { + ct := tree.HivePartColType{Id: int32(types.T_int8)} + assert.Equal(t, MatchUnknown, matchPartitionValue("999", []string{"999"}, ct)) +} + +func TestMatchPartitionValue_IntParseError(t *testing.T) { + ct := tree.HivePartColType{Id: int32(types.T_int32)} + assert.Equal(t, MatchUnknown, matchPartitionValue("abc", []string{"123"}, ct)) +} + +func TestMatchPartitionValue_UintMatch(t *testing.T) { + ct := tree.HivePartColType{Id: int32(types.T_uint32)} + assert.Equal(t, MatchTrue, matchPartitionValue("100", []string{"100"}, ct)) + assert.Equal(t, MatchFalse, matchPartitionValue("100", []string{"200"}, ct)) +} + +func TestMatchPartitionValue_VarcharExact(t *testing.T) { + ct := tree.HivePartColType{Id: int32(types.T_varchar)} + assert.Equal(t, MatchTrue, matchPartitionValue("US", []string{"US"}, ct)) + assert.Equal(t, MatchUnknown, matchPartitionValue("us", []string{"US"}, ct)) +} + +func TestMatchPartitionValue_UnknownTypes(t *testing.T) { + unknownTypes := []types.T{ + types.T_bool, types.T_float32, types.T_float64, + types.T_decimal64, types.T_date, types.T_datetime, + types.T_timestamp, types.T_json, types.T_uuid, + } + for _, typ := range unknownTypes { + ct := tree.HivePartColType{Id: int32(typ)} + assert.Equal(t, MatchUnknown, matchPartitionValue("val", []string{"val"}, ct), + "type %v should return MatchUnknown", typ) + } +} + +func TestMatchPartitionValue_TAny(t *testing.T) { + ct := tree.HivePartColType{Id: int32(types.T_any)} + assert.Equal(t, MatchUnknown, matchPartitionValue("2024", []string{"2024"}, ct)) +} + +func TestMatchPartitionValue_ZeroPaddedInt(t *testing.T) { + ct := tree.HivePartColType{Id: int32(types.T_int32)} + assert.Equal(t, MatchTrue, matchPartitionValue("01", []string{"1"}, ct)) + assert.Equal(t, MatchTrue, matchPartitionValue("007", []string{"7"}, ct)) +} + +func 
TestMatchPartitionValue_ZeroPaddedVarcharConservative(t *testing.T) { + ct := tree.HivePartColType{Id: int32(types.T_varchar)} + assert.Equal(t, MatchTrue, matchPartitionValue("01", []string{"01"}, ct)) + assert.Equal(t, MatchUnknown, matchPartitionValue("01", []string{"1"}, ct), + "varchar partitions keep string semantics; a mismatch is not pruned away") +} + +// --- DiscoverHivePartitions tests --- + +func mockListDir(dirs map[string][]fileservice.DirEntry) ListDirFunc { + return func(ctx context.Context, prefix string) iter.Seq2[*fileservice.DirEntry, error] { + return func(yield func(*fileservice.DirEntry, error) bool) { + entries := dirs[prefix] + for i := range entries { + if !yield(&entries[i], nil) { + return + } + } + } + } +} + +func TestDiscoverHivePartitions_SingleLevel(t *testing.T) { + dirs := map[string][]fileservice.DirEntry{ + "/data": { + {Name: "year=2024", IsDir: true}, + {Name: "year=2025", IsDir: true}, + {Name: "_SUCCESS", IsDir: false}, + }, + "/data/year=2024": { + {Name: "part-0000.parquet", IsDir: false, Size: 1000}, + {Name: ".hidden", IsDir: false, Size: 100}, + }, + "/data/year=2025": { + {Name: "part-0000.parquet", IsDir: false, Size: 2000}, + }, + } + + result, err := DiscoverHivePartitions( + context.Background(), + mockListDir(dirs), + "/data", + []string{"year"}, + []tree.HivePartColType{{Id: int32(types.T_int32)}}, + nil, + ) + require.NoError(t, err) + assert.Equal(t, 2, result.PartitionCount) + assert.Equal(t, 2, len(result.Files)) + assert.Equal(t, int64(1000), result.Files[0].FileSize) +} + +func TestDiscoverHivePartitions_MultiLevel(t *testing.T) { + dirs := map[string][]fileservice.DirEntry{ + "/data": { + {Name: "year=2024", IsDir: true}, + }, + "/data/year=2024": { + {Name: "month=01", IsDir: true}, + {Name: "month=02", IsDir: true}, + }, + "/data/year=2024/month=01": { + {Name: "data.parquet", IsDir: false, Size: 500}, + }, + "/data/year=2024/month=02": { + {Name: "data.parquet", IsDir: false, Size: 600}, + }, + } + + result, err := DiscoverHivePartitions( + context.Background(), + mockListDir(dirs), + "/data", + []string{"year", "month"}, + []tree.HivePartColType{ + {Id: int32(types.T_int32)}, + {Id: int32(types.T_int32)}, + }, + nil, + ) + require.NoError(t, err) + assert.Equal(t, 3, result.PartitionCount) // year=2024 + month=01 + month=02 + assert.Equal(t, 2, len(result.Files)) +} + +func TestDiscoverHivePartitions_WithPredicate(t *testing.T) { + dirs := map[string][]fileservice.DirEntry{ + "/data": { + {Name: "year=2024", IsDir: true}, + {Name: "year=2025", IsDir: true}, + {Name: "year=2026", IsDir: true}, + }, + "/data/year=2025": { + {Name: "data.parquet", IsDir: false, Size: 1000}, + }, + } + + result, err := DiscoverHivePartitions( + context.Background(), + mockListDir(dirs), + "/data", + []string{"year"}, + []tree.HivePartColType{{Id: int32(types.T_int32)}}, + []PartitionPredicate{{ColName: "year", Op: PartOpEq, Values: []string{"2025"}}}, + ) + require.NoError(t, err) + assert.Equal(t, 1, result.PartitionCount) + assert.Equal(t, 2, result.PrunedCount) + assert.Equal(t, 1, len(result.Files)) + assert.Equal(t, 2, result.ListCalls) // root dir + year=2025 file listing +} + +func TestDiscoverHivePartitions_SkipsHidden(t *testing.T) { + dirs := map[string][]fileservice.DirEntry{ + "/data": { + {Name: "year=2024", IsDir: true}, + {Name: ".metadata", IsDir: true}, + {Name: "_temp", IsDir: true}, + }, + "/data/year=2024": { + {Name: "data.parquet", IsDir: false, Size: 100}, + {Name: "_SUCCESS", IsDir: false, Size: 0}, + {Name: ".crc", IsDir: 
false, Size: 10}, + }, + } + + result, err := DiscoverHivePartitions( + context.Background(), + mockListDir(dirs), + "/data", + []string{"year"}, + []tree.HivePartColType{{Id: int32(types.T_int32)}}, + nil, + ) + require.NoError(t, err) + assert.Equal(t, 1, len(result.Files)) +} + +func TestDiscoverHivePartitions_NormalizePath(t *testing.T) { + dirs := map[string][]fileservice.DirEntry{ + "/warehouse/data": { + {Name: "year=2024", IsDir: true}, + }, + "/warehouse/data/year=2024": { + {Name: "f.parquet", IsDir: false, Size: 100}, + }, + } + + result, err := DiscoverHivePartitions( + context.Background(), + mockListDir(dirs), + "warehouse/data/", + []string{"year"}, + []tree.HivePartColType{{Id: int32(types.T_int32)}}, + nil, + ) + require.NoError(t, err) + assert.Equal(t, 1, len(result.Files)) +} + +func TestDiscoverHivePartitions_NilColTypes(t *testing.T) { + dirs := map[string][]fileservice.DirEntry{ + "/data": { + {Name: "year=2024", IsDir: true}, + {Name: "year=2025", IsDir: true}, + }, + "/data/year=2024": { + {Name: "f.parquet", IsDir: false, Size: 100}, + }, + "/data/year=2025": { + {Name: "f.parquet", IsDir: false, Size: 200}, + }, + } + + // nil colTypes means old JSON — should still discover all (no pruning possible) + result, err := DiscoverHivePartitions( + context.Background(), + mockListDir(dirs), + "/data", + []string{"year"}, + nil, // triggers T_any fallback + []PartitionPredicate{{ColName: "year", Op: PartOpEq, Values: []string{"2024"}}}, + ) + require.NoError(t, err) + // T_any → MatchUnknown → no pruning, all partitions kept + assert.Equal(t, 2, result.PartitionCount) + assert.Equal(t, 0, result.PrunedCount) + assert.Equal(t, 2, len(result.Files)) +} + +func TestDiscoverHivePartitions_PercentInDirName(t *testing.T) { + dirs := map[string][]fileservice.DirEntry{ + "/data": { + {Name: "country=US%2FCA", IsDir: true}, + }, + } + + _, err := DiscoverHivePartitions( + context.Background(), + mockListDir(dirs), + "/data", + []string{"country"}, + []tree.HivePartColType{{Id: int32(types.T_varchar)}}, + nil, + ) + require.Error(t, err) + assert.Contains(t, err.Error(), "%") +} + +func TestDiscoverHivePartitions_INPredicate(t *testing.T) { + dirs := map[string][]fileservice.DirEntry{ + "/data": { + {Name: "year=2023", IsDir: true}, + {Name: "year=2024", IsDir: true}, + {Name: "year=2025", IsDir: true}, + }, + "/data/year=2024": { + {Name: "f.parquet", IsDir: false, Size: 100}, + }, + "/data/year=2025": { + {Name: "f.parquet", IsDir: false, Size: 200}, + }, + } + + result, err := DiscoverHivePartitions( + context.Background(), + mockListDir(dirs), + "/data", + []string{"year"}, + []tree.HivePartColType{{Id: int32(types.T_int32)}}, + []PartitionPredicate{{ColName: "year", Op: PartOpIn, Values: []string{"2024", "2025"}}}, + ) + require.NoError(t, err) + assert.Equal(t, 2, result.PartitionCount) + assert.Equal(t, 1, result.PrunedCount) + assert.Equal(t, 2, len(result.Files)) + assert.Equal(t, 3, result.ListCalls) // root + year=2024 files + year=2025 files +} + +func TestDiscoverHivePartitions_KeyMismatchSkipped(t *testing.T) { + dirs := map[string][]fileservice.DirEntry{ + "/data": { + {Name: "year=2024", IsDir: true}, + {Name: "country=US", IsDir: true}, // wrong key for level 0 + }, + "/data/year=2024": { + {Name: "f.parquet", IsDir: false, Size: 100}, + }, + } + + result, err := DiscoverHivePartitions( + context.Background(), + mockListDir(dirs), + "/data", + []string{"year"}, + []tree.HivePartColType{{Id: int32(types.T_int32)}}, + nil, + ) + require.NoError(t, err) + 
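// country=US does not match the expected level-0 key "year", so it is + // skipped silently rather than raising an error; only year=2024 survives. +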
assert.Equal(t, 1, result.PartitionCount) + assert.Equal(t, 1, len(result.Files)) +} + +func TestDiscoverHivePartitions_ListCallLimit(t *testing.T) { + // Generate enough partitions at two levels to exceed maxListCalls (10000). + // Level 0: 200 year partitions, Level 1: 200 month partitions each. + // This requires 1 (root) + 200 (year dirs) = 201 List calls before we hit month level. + // To trigger the limit efficiently, use a mock that always returns entries + // forcing recursion well beyond the limit. + entries := make([]fileservice.DirEntry, 200) + for i := range entries { + entries[i] = fileservice.DirEntry{Name: fmt.Sprintf("year=%d", i), IsDir: true} + } + monthEntries := make([]fileservice.DirEntry, 200) + for i := range monthEntries { + monthEntries[i] = fileservice.DirEntry{Name: fmt.Sprintf("month=%d", i), IsDir: true} + } + fileEntries := []fileservice.DirEntry{{Name: "f.parquet", IsDir: false, Size: 10}} + + listDir := func(ctx context.Context, prefix string) iter.Seq2[*fileservice.DirEntry, error] { + return func(yield func(*fileservice.DirEntry, error) bool) { + var items []fileservice.DirEntry + if strings.Count(prefix, "/") <= 1 { + items = entries + } else if strings.Contains(prefix, "month=") { + items = fileEntries + } else { + items = monthEntries + } + for i := range items { + if !yield(&items[i], nil) { + return + } + } + } + } + + _, err := DiscoverHivePartitions( + context.Background(), + listDir, + "/data", + []string{"year", "month"}, + []tree.HivePartColType{ + {Id: int32(types.T_int32)}, + {Id: int32(types.T_int32)}, + }, + nil, + ) + require.Error(t, err) + assert.Contains(t, err.Error(), "List calls") +} + +// --------------------------------------------------------------------------- +// ClassifyFilters tests +// --------------------------------------------------------------------------- + +func makeTableDef(cols ...string) *plan.TableDef { + td := &plan.TableDef{Cols: make([]*plan.ColDef, len(cols))} + for i, name := range cols { + td.Cols[i] = &plan.ColDef{Name: name} + } + return td +} + +func makeColExpr(colPos int32, name string) *plan.Expr { + return &plan.Expr{ + Expr: &plan.Expr_Col{Col: &plan.ColRef{ColPos: colPos, Name: name}}, + } +} + +func makeLitInt64(val int64) *plan.Expr { + return &plan.Expr{ + Expr: &plan.Expr_Lit{Lit: &plan.Literal{Value: &plan.Literal_I64Val{I64Val: val}}}, + } +} + +func makeLitString(val string) *plan.Expr { + return &plan.Expr{ + Expr: &plan.Expr_Lit{Lit: &plan.Literal{Value: &plan.Literal_Sval{Sval: val}}}, + } +} + +func makeEqExpr(left, right *plan.Expr) *plan.Expr { + return &plan.Expr{ + Expr: &plan.Expr_F{F: &plan.Function{ + Func: &plan.ObjectRef{Obj: function.EqualFunctionEncodedID}, + Args: []*plan.Expr{left, right}, + }}, + } +} + +func makeInExpr(col *plan.Expr, vals ...*plan.Expr) *plan.Expr { + return &plan.Expr{ + Expr: &plan.Expr_F{F: &plan.Function{ + Func: &plan.ObjectRef{Obj: function.InFunctionEncodedID}, + Args: []*plan.Expr{ + col, + {Expr: &plan.Expr_List{List: &plan.ExprList{List: vals}}}, + }, + }}, + } +} + +func TestClassifyFilters_Basic(t *testing.T) { + td := makeTableDef("year", "amount", "account", "__mo_filepath") + partColSet := map[string]bool{"year": true} + + yearEq := makeEqExpr(makeColExpr(0, "year"), makeLitInt64(2025)) + amountGt := &plan.Expr{ + Expr: &plan.Expr_F{F: &plan.Function{ + Func: &plan.ObjectRef{Obj: int64(function.GREAT_THAN) << 32}, + Args: []*plan.Expr{makeColExpr(1, "amount"), makeLitInt64(100)}, + }}, + } + fpFilter := makeEqExpr(makeColExpr(3, 
"__mo_filepath"), makeLitString("/path")) + + partF, fpF, rowF := ClassifyFilters(td, []*plan.Expr{yearEq, amountGt, fpFilter}, partColSet) + + assert.Equal(t, 1, len(partF), "year filter should be in partitionFilters") + assert.Equal(t, 1, len(fpF), "__mo_filepath filter should be in filePathFilters") + assert.Equal(t, 2, len(rowF), "year+amount should be in rowFilters (year duplicated for safety)") + assert.Same(t, yearEq, partF[0]) + assert.Same(t, fpFilter, fpF[0]) +} + +func TestClassifyFilters_PartitionFilterDuplicated(t *testing.T) { + td := makeTableDef("year", "data") + partColSet := map[string]bool{"year": true} + + yearEq := makeEqExpr(makeColExpr(0, "year"), makeLitInt64(2025)) + partF, _, rowF := ClassifyFilters(td, []*plan.Expr{yearEq}, partColSet) + + assert.Equal(t, 1, len(partF)) + assert.Equal(t, 1, len(rowF)) + assert.Same(t, partF[0], rowF[0], "partition filter must appear in both lists") +} + +func TestClassifyFilters_AccountNamedPartCol(t *testing.T) { + td := makeTableDef("account", "data") + partColSet := map[string]bool{"account": true} + + acctEq := makeEqExpr(makeColExpr(0, "account"), makeLitString("tenant1")) + partF, fpF, _ := ClassifyFilters(td, []*plan.Expr{acctEq}, partColSet) + + assert.Equal(t, 1, len(partF), "account as partition col goes to partitionFilters") + assert.Equal(t, 0, len(fpF), "should NOT go to filePathFilters") +} + +func TestClassifyFilters_AccountIdSubstring(t *testing.T) { + td := makeTableDef("account_id", "data") + partColSet := map[string]bool{} + + acctIdEq := makeEqExpr(makeColExpr(0, "account_id"), makeLitString("123")) + _, fpF, rowF := ClassifyFilters(td, []*plan.Expr{acctIdEq}, partColSet) + + assert.Equal(t, 0, len(fpF), "account_id must NOT be mistaken for filepath filter") + assert.Equal(t, 1, len(rowF)) +} + +func TestClassifyFilters_MixedReference(t *testing.T) { + td := makeTableDef("year", "amount") + partColSet := map[string]bool{"year": true} + + mixed := &plan.Expr{ + Expr: &plan.Expr_F{F: &plan.Function{ + Func: &plan.ObjectRef{Obj: int64(function.GREAT_THAN) << 32}, + Args: []*plan.Expr{makeColExpr(0, "year"), makeColExpr(1, "amount")}, + }}, + } + partF, fpF, rowF := ClassifyFilters(td, []*plan.Expr{mixed}, partColSet) + + assert.Equal(t, 0, len(partF)) + assert.Equal(t, 0, len(fpF)) + assert.Equal(t, 1, len(rowF), "mixed reference goes to rowFilters") +} + +func TestClassifyFilters_MoFilepathCol(t *testing.T) { + td := makeTableDef("year", catalog.ExternalFilePath) + partColSet := map[string]bool{"year": true} + + fpExpr := makeEqExpr(makeColExpr(1, catalog.ExternalFilePath), makeLitString("x")) + _, fpF, _ := ClassifyFilters(td, []*plan.Expr{fpExpr}, partColSet) + + assert.Equal(t, 1, len(fpF)) +} + +func TestClassifyFilters_NoColumnRefs(t *testing.T) { + td := makeTableDef("year") + partColSet := map[string]bool{"year": true} + + constExpr := makeLitInt64(42) + _, _, rowF := ClassifyFilters(td, []*plan.Expr{constExpr}, partColSet) + + assert.Equal(t, 1, len(rowF), "constant expression goes to rowFilters") +} + +func TestClassifyFilters_FunctionWrappedPartitionColumnConservative(t *testing.T) { + td := makeTableDef("year", "data") + partColSet := map[string]bool{"year": true} + + wrappedYear := &plan.Expr{ + Expr: &plan.Expr_F{F: &plan.Function{ + Func: &plan.ObjectRef{Obj: 123, ObjName: "cast"}, + Args: []*plan.Expr{makeColExpr(0, "year")}, + }}, + } + filter := makeEqExpr(wrappedYear, makeLitInt64(2024)) + partF, fpF, rowF := ClassifyFilters(td, []*plan.Expr{filter}, partColSet) + + require.Len(t, partF, 1, 
"Expr_F wrappers must still expose the partition column") + assert.Empty(t, fpF) + require.Len(t, rowF, 1, "row-level filtering remains the correctness backstop") + assert.Empty(t, ExtractPartitionPredicatesFromExprs(td, partF, partColSet), + "CAST/function-wrapped partition columns are not structurally pruned") +} + +// TestClassifyFilters_AccountAsPhysicalCol guards against classifying a +// physical column literally named "account" as a filepath pseudo column. +// The CSV-only per-batch "account" virtual column does not exist on Hive / +// Parquet external tables; treating it as such would cause row filters on a +// real column to be silently dropped and evaluated against garbage path +// synthesis. +func TestClassifyFilters_AccountAsPhysicalCol(t *testing.T) { + td := makeTableDef("account", "amount") + partColSet := map[string]bool{} // "account" is NOT a partition column + + acctEq := makeEqExpr(makeColExpr(0, "account"), makeLitString("tenant1")) + partF, fpF, rowF := ClassifyFilters(td, []*plan.Expr{acctEq}, partColSet) + + assert.Equal(t, 0, len(partF)) + assert.Equal(t, 0, len(fpF), "physical column 'account' must NOT be classified as filepath filter") + assert.Equal(t, 1, len(rowF), "must be evaluated as a normal row filter") +} + +// TestClassifyFilters_OrFilepathAndLiteral documents the exact scenario that +// motivated the compile-side fpFilters → rowFilters propagation fix: +// ClassifyFilters routes OR(__mo_filepath LIKE ..., const) to fpFilters +// because both operands' col refs are a subset of filePathColSet (the +// literal contributes no refs). But FilterFileList's judgeContainColname is +// stricter — it rejects OR branches that don't reference a filepath column +// — so the filter comes back unconsumed and must be appended to rowFilters +// by the caller. This test pins the classification half of the contract so +// future ClassifyFilters changes don't silently break the invariant. +func TestClassifyFilters_OrFilepathAndLiteral(t *testing.T) { + td := makeTableDef("year", catalog.ExternalFilePath) + partColSet := map[string]bool{"year": true} + + // OR(__mo_filepath = 'x', false-literal) — no columns on the right arm. + orExpr := &plan.Expr{ + Expr: &plan.Expr_F{F: &plan.Function{ + Func: &plan.ObjectRef{Obj: int64(function.OR) << 32, ObjName: "or"}, + Args: []*plan.Expr{ + makeEqExpr(makeColExpr(1, catalog.ExternalFilePath), makeLitString("x")), + {Expr: &plan.Expr_Lit{Lit: &plan.Literal{Value: &plan.Literal_Bval{Bval: false}}}}, + }, + }}, + } + _, fpF, _ := ClassifyFilters(td, []*plan.Expr{orExpr}, partColSet) + assert.Equal(t, 1, len(fpF), + "OR(filepath, literal) must go to fpFilters; the compile-side caller is responsible for "+ + "re-appending it to rowFilters if FilterFileList refuses to consume it") +} + +// TestFilterFileList_LeavesUnconsumedOrFilterInNode locks the exact side-effect +// contract that compile.getHivePartitionFileList depends on: when FilterFileList +// is handed an OR(filepath, literal) filter, its judgeContainColname check +// rejects it (OR branches must each reference a filepath col), and the rejected +// filter is written back via node.FilterList. compile.go appends tmpNode.FilterList +// onto rowFilters so the runtime still evaluates the predicate; without that +// append, the filter is silently dropped. If a future change has FilterFileList +// consume such filters, or uses a different side-effect pattern (e.g. returning +// leftover filters), this test goes red and the compile side must be audited. 
+func TestFilterFileList_LeavesUnconsumedOrFilterInNode(t *testing.T) { + proc := testutil.NewProc(t) + + td := makeTableDef("year", catalog.ExternalFilePath) + orExpr := &plan.Expr{ + Expr: &plan.Expr_F{F: &plan.Function{ + Func: &plan.ObjectRef{Obj: int64(function.OR) << 32, ObjName: "or"}, + Args: []*plan.Expr{ + makeEqExpr(makeColExpr(1, catalog.ExternalFilePath), makeLitString("x")), + {Expr: &plan.Expr_Lit{Lit: &plan.Literal{Value: &plan.Literal_Bval{Bval: false}}}}, + }, + }}, + } + tmpNode := &plan.Node{ + TableDef: td, + FilterList: []*plan.Expr{orExpr}, + } + fileList := []string{"/warehouse/data/year=2024/f.parquet"} + fileSize := []int64{123} + + outFileList, outFileSize, err := FilterFileList(proc.Ctx, tmpNode, proc, fileList, fileSize) + require.NoError(t, err) + + // judgeContainColname rejected the OR, so filterList in filterByAccountAndFilename + // was empty and the function short-circuited at line 368-370 — fileList / fileSize + // come back unchanged. + assert.Equal(t, fileList, outFileList) + assert.Equal(t, fileSize, outFileSize) + + // And tmpNode.FilterList must still hold the unconsumed predicate. This is + // what compile.getHivePartitionFileList `append`s back onto rowFilters. + require.Equal(t, 1, len(tmpNode.FilterList), + "unconsumed OR(filepath, literal) filter must remain in tmpNode.FilterList") + assert.Same(t, orExpr, tmpNode.FilterList[0], + "tmpNode.FilterList must hold the exact expression for compile.go to re-append") +} + +// --------------------------------------------------------------------------- +// collectBareColNames tests +// --------------------------------------------------------------------------- + +func TestCollectBareColNames_ColPos(t *testing.T) { + td := makeTableDef("year", "month") + expr := makeColExpr(0, "catalog_returns.year") + names := collectBareColNames(td, expr) + assert.True(t, names["year"], "should resolve via ColPos, not col.Name") + assert.False(t, names["catalog_returns.year"]) +} + +func TestCollectBareColNames_FallbackStrip(t *testing.T) { + td := makeTableDef("year") + expr := makeColExpr(99, "t.month") + names := collectBareColNames(td, expr) + assert.True(t, names["month"], "fallback should strip table prefix") +} + +func TestCollectBareColNames_Nested(t *testing.T) { + td := makeTableDef("year", "month") + expr := &plan.Expr{ + Expr: &plan.Expr_F{F: &plan.Function{ + Func: &plan.ObjectRef{Obj: int64(function.GREAT_THAN) << 32}, + Args: []*plan.Expr{makeColExpr(0, "year"), makeColExpr(1, "month")}, + }}, + } + names := collectBareColNames(td, expr) + assert.True(t, names["year"]) + assert.True(t, names["month"]) +} + +// --------------------------------------------------------------------------- +// ExtractPartitionPredicatesFromExprs tests +// --------------------------------------------------------------------------- + +func TestExtractPartitionPredicates_Eq(t *testing.T) { + td := makeTableDef("year", "data") + partColSet := map[string]bool{"year": true} + + yearEq := makeEqExpr(makeColExpr(0, "year"), makeLitInt64(2025)) + preds := ExtractPartitionPredicatesFromExprs(td, []*plan.Expr{yearEq}, partColSet) + + require.Equal(t, 1, len(preds)) + assert.Equal(t, "year", preds[0].ColName) + assert.Equal(t, PartOpEq, preds[0].Op) + assert.Equal(t, []string{"2025"}, preds[0].Values) +} + +func TestExtractPartitionPredicates_EqReversed(t *testing.T) { + td := makeTableDef("year", "data") + partColSet := map[string]bool{"year": true} + + reversed := makeEqExpr(makeLitInt64(2025), makeColExpr(0, "year")) + preds := 
ExtractPartitionPredicatesFromExprs(td, []*plan.Expr{reversed}, partColSet) + + require.Equal(t, 1, len(preds)) + assert.Equal(t, "year", preds[0].ColName) + assert.Equal(t, []string{"2025"}, preds[0].Values) +} + +func TestExtractPartitionPredicates_In(t *testing.T) { + td := makeTableDef("year", "data") + partColSet := map[string]bool{"year": true} + + inExpr := makeInExpr(makeColExpr(0, "year"), makeLitInt64(2024), makeLitInt64(2025)) + preds := ExtractPartitionPredicatesFromExprs(td, []*plan.Expr{inExpr}, partColSet) + + require.Equal(t, 1, len(preds)) + assert.Equal(t, "year", preds[0].ColName) + assert.Equal(t, PartOpIn, preds[0].Op) + assert.Equal(t, []string{"2024", "2025"}, preds[0].Values) +} + +func TestExtractPartitionPredicates_InWithStrings(t *testing.T) { + td := makeTableDef("country", "data") + partColSet := map[string]bool{"country": true} + + inExpr := makeInExpr(makeColExpr(0, "country"), makeLitString("US"), makeLitString("CA")) + preds := ExtractPartitionPredicatesFromExprs(td, []*plan.Expr{inExpr}, partColSet) + + require.Equal(t, 1, len(preds)) + assert.Equal(t, []string{"US", "CA"}, preds[0].Values) +} + +func TestExtractPartitionPredicates_NonStructurable(t *testing.T) { + td := makeTableDef("year", "data") + partColSet := map[string]bool{"year": true} + + gtExpr := &plan.Expr{ + Expr: &plan.Expr_F{F: &plan.Function{ + Func: &plan.ObjectRef{Obj: int64(function.GREAT_THAN) << 32}, + Args: []*plan.Expr{makeColExpr(0, "year"), makeLitInt64(2024)}, + }}, + } + preds := ExtractPartitionPredicatesFromExprs(td, []*plan.Expr{gtExpr}, partColSet) + assert.Equal(t, 0, len(preds), "non EQ/IN should be silently skipped") +} + +func TestExtractPartitionPredicates_RejectsCast(t *testing.T) { + td := makeTableDef("year", "data") + partColSet := map[string]bool{"year": true} + + castExpr := &plan.Expr{ + Expr: &plan.Expr_F{F: &plan.Function{ + Func: &plan.ObjectRef{Obj: 123}, + Args: []*plan.Expr{makeLitInt64(2025)}, + }}, + } + eqWithCast := makeEqExpr(makeColExpr(0, "year"), castExpr) + preds := ExtractPartitionPredicatesFromExprs(td, []*plan.Expr{eqWithCast}, partColSet) + assert.Equal(t, 0, len(preds), "Expr_F on constant side should be rejected") +} + +func TestExtractPartitionPredicates_NonPartCol(t *testing.T) { + td := makeTableDef("year", "data") + partColSet := map[string]bool{"year": true} + + dataEq := makeEqExpr(makeColExpr(1, "data"), makeLitString("foo")) + preds := ExtractPartitionPredicatesFromExprs(td, []*plan.Expr{dataEq}, partColSet) + assert.Equal(t, 0, len(preds), "non-partition col should not produce predicate") +} + +func TestExtractPartitionPredicates_NullLiteral(t *testing.T) { + td := makeTableDef("year", "data") + partColSet := map[string]bool{"year": true} + + nullLit := &plan.Expr{ + Expr: &plan.Expr_Lit{Lit: &plan.Literal{Isnull: true}}, + } + eqNull := makeEqExpr(makeColExpr(0, "year"), nullLit) + preds := ExtractPartitionPredicatesFromExprs(td, []*plan.Expr{eqNull}, partColSet) + assert.Equal(t, 0, len(preds), "NULL literal should be rejected") +} + +func TestExtractPartitionPredicates_InWithNonLiteral(t *testing.T) { + td := makeTableDef("year", "data") + partColSet := map[string]bool{"year": true} + + castInList := &plan.Expr{ + Expr: &plan.Expr_F{F: &plan.Function{ + Func: &plan.ObjectRef{Obj: 123}, + Args: []*plan.Expr{makeLitInt64(2025)}, + }}, + } + inExpr := makeInExpr(makeColExpr(0, "year"), makeLitInt64(2024), castInList) + preds := ExtractPartitionPredicatesFromExprs(td, []*plan.Expr{inExpr}, partColSet) + assert.Equal(t, 0, len(preds), 
"IN list with non-literal item should be rejected entirely") +} + +func TestExtractPartitionPredicates_InVec(t *testing.T) { + td := makeTableDef("year", "data") + partColSet := map[string]bool{"year": true} + + // Simulate a folded Expr_Vec (what constant-fold produces from IN list) + vec := vector.NewVec(types.T_int32.ToType()) + proc := testutil.NewProc(t) + mp := proc.Mp() + require.NoError(t, vector.AppendFixed(vec, int32(2024), false, mp)) + require.NoError(t, vector.AppendFixed(vec, int32(2025), false, mp)) + data, err := vec.MarshalBinary() + require.NoError(t, err) + vec.Free(mp) + + vecExpr := &plan.Expr{ + Typ: plan.Type{Id: int32(types.T_int32)}, + Expr: &plan.Expr_Vec{Vec: &plan.LiteralVec{Len: 2, Data: data}}, + } + colExpr := makeColExpr(0, "year") + colExpr.Typ = plan.Type{Id: int32(types.T_int32)} + inExpr := &plan.Expr{ + Expr: &plan.Expr_F{F: &plan.Function{ + Func: &plan.ObjectRef{Obj: function.InFunctionEncodedID}, + Args: []*plan.Expr{colExpr, vecExpr}, + }}, + } + preds := ExtractPartitionPredicatesFromExprs(td, []*plan.Expr{inExpr}, partColSet) + require.Equal(t, 1, len(preds)) + assert.Equal(t, PartOpIn, preds[0].Op) + assert.Equal(t, []string{"2024", "2025"}, preds[0].Values) +} + +func TestExtractPartitionPredicates_InVecLengthMismatch(t *testing.T) { + td := makeTableDef("year", "data") + partColSet := map[string]bool{"year": true} + + vec := vector.NewVec(types.T_int32.ToType()) + proc := testutil.NewProc(t) + mp := proc.Mp() + require.NoError(t, vector.AppendFixed(vec, int32(2024), false, mp)) + require.NoError(t, vector.AppendFixed(vec, int32(2025), false, mp)) + data, err := vec.MarshalBinary() + require.NoError(t, err) + vec.Free(mp) + + vecExpr := &plan.Expr{ + Typ: plan.Type{Id: int32(types.T_int32)}, + Expr: &plan.Expr_Vec{Vec: &plan.LiteralVec{Len: 3, Data: data}}, + } + colExpr := makeColExpr(0, "year") + colExpr.Typ = plan.Type{Id: int32(types.T_int32)} + inExpr := &plan.Expr{ + Expr: &plan.Expr_F{F: &plan.Function{ + Func: &plan.ObjectRef{Obj: function.InFunctionEncodedID}, + Args: []*plan.Expr{colExpr, vecExpr}, + }}, + } + preds := ExtractPartitionPredicatesFromExprs(td, []*plan.Expr{inExpr}, partColSet) + assert.Empty(t, preds, "LiteralVec length mismatch must disable partition pruning") +} + +func TestExtractPartitionPredicates_InVecVarchar(t *testing.T) { + td := makeTableDef("country", "data") + partColSet := map[string]bool{"country": true} + + vec := vector.NewVec(types.T_varchar.ToType()) + proc := testutil.NewProc(t) + mp := proc.Mp() + require.NoError(t, vector.AppendBytes(vec, []byte("US"), false, mp)) + require.NoError(t, vector.AppendBytes(vec, []byte("CN"), false, mp)) + data, err := vec.MarshalBinary() + require.NoError(t, err) + vec.Free(mp) + + vecExpr := &plan.Expr{ + Typ: plan.Type{Id: int32(types.T_varchar)}, + Expr: &plan.Expr_Vec{Vec: &plan.LiteralVec{Len: 2, Data: data}}, + } + colExpr := makeColExpr(0, "country") + colExpr.Typ = plan.Type{Id: int32(types.T_varchar)} + inExpr := &plan.Expr{ + Expr: &plan.Expr_F{F: &plan.Function{ + Func: &plan.ObjectRef{Obj: function.InFunctionEncodedID}, + Args: []*plan.Expr{colExpr, vecExpr}, + }}, + } + preds := ExtractPartitionPredicatesFromExprs(td, []*plan.Expr{inExpr}, partColSet) + require.Equal(t, 1, len(preds)) + assert.Equal(t, PartOpIn, preds[0].Op) + assert.Equal(t, []string{"US", "CN"}, preds[0].Values) +} + +func TestMatchPartitionValue_SetWithEnumvalues(t *testing.T) { + // SET column stored as T_uint64 with non-empty Enumvalues must NOT be pruned + ct := 
tree.HivePartColType{Id: int32(types.T_uint64), Enumvalues: "a,b,c"} + assert.Equal(t, MatchUnknown, matchPartitionValue("1", []string{"2"}, ct), + "SET column should always return MatchUnknown") +} + +func TestFillConstantVector_Int64FloatFallback(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + vec := vector.NewVec(types.T_int64.ToType()) + col := &plan.ColDef{Name: "big", Typ: plan.Type{Id: int32(types.T_int64)}} + + err := fillConstantVector(vec, "1.5", col, 3, proc, "/test") + require.NoError(t, err, "int64 float fallback should work") + val := vector.MustFixedColNoTypeCheck[int64](vec) + assert.Equal(t, int64(1), val[0]) + vec.Free(mp) +} + +func TestFillConstantVector_Int64OverflowRejects(t *testing.T) { + proc := testutil.NewProc(t) + vec := vector.NewVec(types.T_int64.ToType()) + col := &plan.ColDef{Name: "big", Typ: plan.Type{Id: int32(types.T_int64)}} + + // 9223372036854775808 = MaxInt64 + 1 → ParseInt ErrRange → reject (no float fallback) + err := fillConstantVector(vec, "9223372036854775808", col, 1, proc, "/test") + require.Error(t, err, "int64 overflow must be rejected") + vec.Free(nil) +} + +func TestFillConstantVector_Uint64OverflowRejects(t *testing.T) { + proc := testutil.NewProc(t) + vec := vector.NewVec(types.T_uint64.ToType()) + col := &plan.ColDef{Name: "big", Typ: plan.Type{Id: int32(types.T_uint64)}} + + // 18446744073709551616 = MaxUint64 + 1 → ParseUint ErrRange → reject + err := fillConstantVector(vec, "18446744073709551616", col, 1, proc, "/test") + require.Error(t, err, "uint64 overflow must be rejected") + vec.Free(nil) +} + +// TestFillConstantVector_Int64DecimalBoundaryRejects guards the 64-bit float +// fallback against four classes of unsafe inputs: +// - 2^63 / 2^64 slipping through due to float64 rounding of MaxInt64/MaxUint64 +// - NaN passing range checks (NaN < x and NaN >= x both false) +// - ±Inf (covered by the strict upper bound being exact) +// - "-9223372036854775809.0" (below MinInt64) rounding to -2^63 in float64; +// rejected by the |f| >= 2^53 precision guard since any value reaching the +// float fallback at that magnitude cannot be safely round-tripped to int64. 
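+//
+// Minimal illustration of the rounding trap the exact bound closes (assumed Go
+// semantics; out-of-range float-to-int conversion is platform-dependent):
+//
+//	f, _ := strconv.ParseFloat("9223372036854775808", 64) // exactly 2^63
+//	_ = f > float64(math.MaxInt64) // false: the constant rounds up to 2^63
+//	_ = f >= 0x1p63                // true: the exact power-of-two bound rejects f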
+func TestFillConstantVector_Int64DecimalBoundaryRejects(t *testing.T) { + proc := testutil.NewProc(t) + col := &plan.ColDef{Name: "big", Typ: plan.Type{Id: int32(types.T_int64)}} + + cases := []string{ + "9223372036854775808.0", // 2^63 in decimal form + "9.223372036854775808e18", // 2^63 in scientific form + "9223372036854775808", // MaxInt64+1 (ParseInt ErrRange path) + "9999999999999999999.0", // well above 2^63 + "-9223372036854775808.0", // MinInt64 in decimal — float fallback, rejected by 2^53 guard + "-9223372036854775809", // below MinInt64 (ParseInt ErrRange) + "-9223372036854775809.0", // below MinInt64 in decimal; float64 rounds to -2^63 + "-9.223372036854775809e18", // same, scientific form + "-99999999999999999999.0", // well below MinInt64 + "1e20", // large positive scientific + "-1e20", // large negative scientific + "nan", "NaN", "NAN", // non-finite: NaN slips past naive range checks + "inf", "-inf", "+Inf", "Infinity", "-Infinity", + } + for _, s := range cases { + t.Run(s, func(t *testing.T) { + vec := vector.NewVec(types.T_int64.ToType()) + err := fillConstantVector(vec, s, col, 1, proc, "/test") + require.Error(t, err, "%q must be rejected (would overflow int64 or be ambiguous)", s) + vec.Free(nil) + }) + } +} + +func TestFillConstantVector_Int64DecimalBoundaryAccepts(t *testing.T) { + // Values reaching the float fallback with |f| < 2^53 — float64 still + // represents them exactly, so int64(f) is safe. + proc := testutil.NewProc(t) + mp := proc.Mp() + col := &plan.ColDef{Name: "big", Typ: plan.Type{Id: int32(types.T_int64)}} + + cases := []struct { + s string + want int64 + }{ + {"1.0", 1}, + {"-1.5", -1}, + {"0.0", 0}, + {"1.5e3", 1500}, // 1500 < 2^53 + {"-1.5e3", -1500}, + {"9007199254740991.0", 9007199254740991}, // 2^53 - 1, largest exact before the guard + {"-9007199254740991.0", -9007199254740991}, // -(2^53 - 1) + } + for _, c := range cases { + t.Run(c.s, func(t *testing.T) { + vec := vector.NewVec(types.T_int64.ToType()) + err := fillConstantVector(vec, c.s, col, 1, proc, "/test") + require.NoError(t, err) + val := vector.MustFixedColNoTypeCheck[int64](vec)[0] + assert.Equal(t, c.want, val) + vec.Free(mp) + }) + } +} + +func TestFillConstantVector_Uint64DecimalBoundaryRejects(t *testing.T) { + proc := testutil.NewProc(t) + col := &plan.ColDef{Name: "big", Typ: plan.Type{Id: int32(types.T_uint64)}} + + cases := []string{ + "18446744073709551616.0", // 2^64 in decimal form + "1.8446744073709551616e19", // 2^64 in scientific form + "18446744073709551616", // MaxUint64+1 (ParseUint ErrRange path) + "99999999999999999999.0", // well above 2^64 + "-1.0", // negative + "1e20", // large scientific + "9007199254740992.0", // 2^53 exactly — reached via float fallback, ambiguous + "nan", "NaN", // NaN silently passes naive < / >= checks + "inf", "Infinity", "-inf", // ±Inf + } + for _, s := range cases { + t.Run(s, func(t *testing.T) { + vec := vector.NewVec(types.T_uint64.ToType()) + err := fillConstantVector(vec, s, col, 1, proc, "/test") + require.Error(t, err, "%q must be rejected", s) + vec.Free(nil) + }) + } +} + +func TestFillConstantVector_Uint64DecimalBoundaryAccepts(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + col := &plan.ColDef{Name: "big", Typ: plan.Type{Id: int32(types.T_uint64)}} + + cases := []struct { + s string + want uint64 + }{ + {"0.0", 0}, + {"1.0", 1}, + {"1.7e3", 1700}, + } + for _, c := range cases { + t.Run(c.s, func(t *testing.T) { + vec := vector.NewVec(types.T_uint64.ToType()) + err := fillConstantVector(vec, c.s, 
col, 1, proc, "/test") + require.NoError(t, err) + val := vector.MustFixedColNoTypeCheck[uint64](vec)[0] + assert.Equal(t, c.want, val) + vec.Free(mp) + }) + } +} + +// TestFillConstantVector_SmallIntFloatBoundaryRejects verifies the float +// fallback path checks bounds BEFORE truncation. Go's int64(f) truncates +// toward zero: without the pre-check, int32("-2147483648.9") would pass +// because int64(-2147483648.9) == -2147483648, which the post-check sees +// as within [-2^31, 2^31-1]. We must reject it, matching CSV loader. +func TestFillConstantVector_SmallIntFloatBoundaryRejects(t *testing.T) { + proc := testutil.NewProc(t) + + type caseEntry struct { + typId types.T + val string + } + cases := []caseEntry{ + // int8: [-128, 127] + {types.T_int8, "-128.1"}, + {types.T_int8, "127.5"}, + // int16: [-32768, 32767] + {types.T_int16, "-32768.1"}, + {types.T_int16, "32767.5"}, + // int32: [-2147483648, 2147483647] + {types.T_int32, "-2147483648.9"}, + {types.T_int32, "2147483647.9"}, + // uint8: [0, 255] + {types.T_uint8, "255.5"}, + // uint16: [0, 65535] + {types.T_uint16, "65535.9"}, + // uint32: [0, 4294967295] + {types.T_uint32, "4294967295.9"}, + } + for _, c := range cases { + name := fmt.Sprintf("%s_%s", c.typId, c.val) + t.Run(name, func(t *testing.T) { + col := &plan.ColDef{Name: "n", Typ: plan.Type{Id: int32(c.typId)}} + vec := vector.NewVec(c.typId.ToType()) + err := fillConstantVector(vec, c.val, col, 1, proc, "/test") + require.Error(t, err, "%s value %q must be rejected (float bound)", c.typId, c.val) + vec.Free(nil) + }) + } +} + +// TestFillConstantVector_SmallIntFloatBoundaryAccepts verifies values safely +// inside the float bounds still pass and truncate toward zero, matching Go +// int/uint conversion semantics. +func TestFillConstantVector_SmallIntFloatBoundaryAccepts(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + + type intEntry struct { + typId types.T + val string + want int64 + } + intCases := []intEntry{ + // Inside [min, max] as a float, then truncated toward zero. 
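+ // e.g. int8 "126.9": 126.9 lies inside [-128, 127], truncation toward zero gives 126.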
+ {types.T_int8, "126.9", 126}, + {types.T_int8, "127.0", 127}, + {types.T_int8, "-127.9", -127}, + {types.T_int8, "-128.0", -128}, + {types.T_int16, "32766.9", 32766}, + {types.T_int16, "32767.0", 32767}, + {types.T_int32, "2147483646.9", 2147483646}, + {types.T_int32, "2147483647.0", 2147483647}, + } + for _, c := range intCases { + name := fmt.Sprintf("%s_%s", c.typId, c.val) + t.Run(name, func(t *testing.T) { + col := &plan.ColDef{Name: "n", Typ: plan.Type{Id: int32(c.typId)}} + vec := vector.NewVec(c.typId.ToType()) + err := fillConstantVector(vec, c.val, col, 1, proc, "/test") + require.NoError(t, err) + var got int64 + switch c.typId { + case types.T_int8: + got = int64(vector.MustFixedColNoTypeCheck[int8](vec)[0]) + case types.T_int16: + got = int64(vector.MustFixedColNoTypeCheck[int16](vec)[0]) + case types.T_int32: + got = int64(vector.MustFixedColNoTypeCheck[int32](vec)[0]) + } + assert.Equal(t, c.want, got) + vec.Free(mp) + }) + } + + type uintEntry struct { + typId types.T + val string + want uint64 + } + uintCases := []uintEntry{ + {types.T_uint8, "254.9", 254}, + {types.T_uint8, "255.0", 255}, + {types.T_uint16, "65534.9", 65534}, + {types.T_uint16, "65535.0", 65535}, + {types.T_uint32, "4294967294.9", 4294967294}, + {types.T_uint32, "4294967295.0", 4294967295}, + } + for _, c := range uintCases { + name := fmt.Sprintf("%s_%s", c.typId, c.val) + t.Run(name, func(t *testing.T) { + col := &plan.ColDef{Name: "n", Typ: plan.Type{Id: int32(c.typId)}} + vec := vector.NewVec(c.typId.ToType()) + err := fillConstantVector(vec, c.val, col, 1, proc, "/test") + require.NoError(t, err) + var got uint64 + switch c.typId { + case types.T_uint8: + got = uint64(vector.MustFixedColNoTypeCheck[uint8](vec)[0]) + case types.T_uint16: + got = uint64(vector.MustFixedColNoTypeCheck[uint16](vec)[0]) + case types.T_uint32: + got = uint64(vector.MustFixedColNoTypeCheck[uint32](vec)[0]) + } + assert.Equal(t, c.want, got) + vec.Free(mp) + }) + } +} + +// --------------------------------------------------------------------------- +// Virtual column filling tests +// --------------------------------------------------------------------------- + +func TestIsHivePartitionCol(t *testing.T) { + param := &ExternalParam{} + param.Extern = &tree.ExternParam{ + ExParamConst: tree.ExParamConst{ + HivePartitioning: true, + HivePartitionCols: []string{"year", "month"}, + }, + } + assert.True(t, param.isHivePartitionCol("year")) + assert.True(t, param.isHivePartitionCol("Year")) + assert.True(t, param.isHivePartitionCol("month")) + assert.False(t, param.isHivePartitionCol("amount")) + assert.False(t, param.isHivePartitionCol("")) +} + +func TestIsHivePartitionCol_NotEnabled(t *testing.T) { + param := &ExternalParam{} + param.Extern = &tree.ExternParam{} + assert.False(t, param.isHivePartitionCol("year")) +} + +func TestRefreshPartitionValues(t *testing.T) { + param := &ExternalParam{} + param.Extern = &tree.ExternParam{ + ExParamConst: tree.ExParamConst{ + HivePartitioning: true, + HivePartitionCols: []string{"year", "month"}, + }, + } + param.Extern.Filepath = "/data" + param.Fileparam = &ExFileparam{Filepath: "/data/year=2025/month=06/file.parquet"} + + err := param.refreshPartitionValues() + require.NoError(t, err) + assert.Equal(t, "2025", param.currentPartValues["year"]) + assert.Equal(t, "06", param.currentPartValues["month"]) +} + +func TestFillConstantVector_Int(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + vec := vector.NewVec(types.T_int32.ToType()) + col := &plan.ColDef{Name: "year", 
Typ: plan.Type{Id: int32(types.T_int32)}} + + err := fillConstantVector(vec, "2025", col, 10, proc, "/test") + require.NoError(t, err) + assert.Equal(t, 10, vec.Length()) + val := vector.MustFixedColNoTypeCheck[int32](vec) + assert.Equal(t, int32(2025), val[0]) + vec.Free(mp) +} + +func TestFillConstantVector_IntFloatFallback(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + vec := vector.NewVec(types.T_int32.ToType()) + col := &plan.ColDef{Name: "year", Typ: plan.Type{Id: int32(types.T_int32)}} + + err := fillConstantVector(vec, "1.5", col, 5, proc, "/test") + require.NoError(t, err) + val := vector.MustFixedColNoTypeCheck[int32](vec) + assert.Equal(t, int32(1), val[0]) + vec.Free(mp) +} + +func TestFillConstantVector_Varchar(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + vec := vector.NewVec(types.T_varchar.ToType()) + col := &plan.ColDef{Name: "country", Typ: plan.Type{Id: int32(types.T_varchar)}} + + err := fillConstantVector(vec, "US", col, 3, proc, "/test") + require.NoError(t, err) + assert.Equal(t, 3, vec.Length()) + bs := vec.GetBytesAt(0) + assert.Equal(t, "US", string(bs)) + vec.Free(mp) +} + +func TestFillConstantVector_Bool(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + vec := vector.NewVec(types.T_bool.ToType()) + col := &plan.ColDef{Name: "flag", Typ: plan.Type{Id: int32(types.T_bool)}} + + err := fillConstantVector(vec, "true", col, 2, proc, "/test") + require.NoError(t, err) + val := vector.MustFixedColNoTypeCheck[bool](vec) + assert.True(t, val[0]) + vec.Free(mp) +} + +func TestFillConstantVector_UnsupportedVector(t *testing.T) { + proc := testutil.NewProc(t) + vec := vector.NewVec(types.T_array_float32.ToType()) + col := &plan.ColDef{Name: "emb", Typ: plan.Type{Id: int32(types.T_array_float32)}} + + err := fillConstantVector(vec, "[1,2,3]", col, 1, proc, "/test") + require.Error(t, err) + assert.Contains(t, err.Error(), "unsupported") +} + +func TestFillPartitionColumns_DefaultPartNull(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + + vec := vector.NewVec(types.T_int32.ToType()) + bat := &batch.Batch{Vecs: []*vector.Vector{vec}} + bat.SetRowCount(5) + + param := &ExternalParam{} + param.Cols = []*plan.ColDef{ + {Name: "year", Typ: plan.Type{Id: int32(types.T_int32)}, + Default: &plan.Default{NullAbility: true}}, + } + param.Ctx = context.Background() + param.Fileparam = &ExFileparam{Filepath: "/data/year=__HIVE_DEFAULT_PARTITION__/f.parquet"} + param.currentPartValues = map[string]string{"year": HiveDefaultPartition} + + h := &ParquetHandler{partitionColIndices: []int{0}} + err := h.fillPartitionColumns(bat, param, proc) + require.NoError(t, err) + assert.True(t, vec.IsConstNull()) + vec.Free(mp) +} + +func TestFillPartitionColumns_DefaultPartNotNull(t *testing.T) { + proc := testutil.NewProc(t) + mp := proc.Mp() + + vec := vector.NewVec(types.T_int32.ToType()) + bat := &batch.Batch{Vecs: []*vector.Vector{vec}} + bat.SetRowCount(5) + + param := &ExternalParam{} + param.Cols = []*plan.ColDef{ + {Name: "year", Typ: plan.Type{Id: int32(types.T_int32)}, + Default: &plan.Default{NullAbility: false}}, + } + param.Ctx = context.Background() + param.Fileparam = &ExFileparam{Filepath: "/data/year=__HIVE_DEFAULT_PARTITION__/f.parquet"} + param.currentPartValues = map[string]string{"year": HiveDefaultPartition} + + h := &ParquetHandler{partitionColIndices: []int{0}} + err := h.fillPartitionColumns(bat, param, proc) + require.Error(t, err) + assert.Contains(t, err.Error(), "NOT NULL") + vec.Free(mp) +} + +func 
TestFillPartitionColumns_NotNullViaTypNotNullable_NegativeCase(t *testing.T) {
+	proc := testutil.NewProc(t)
+	mp := proc.Mp()
+
+	vec := vector.NewVec(types.T_int32.ToType())
+	bat := &batch.Batch{Vecs: []*vector.Vector{vec}}
+	bat.SetRowCount(3)
+
+	param := &ExternalParam{}
+	param.Cols = []*plan.ColDef{
+		{Name: "year", Typ: plan.Type{Id: int32(types.T_int32), NotNullable: true},
+			Default: &plan.Default{NullAbility: true}},
+	}
+	param.Ctx = context.Background()
+	param.Fileparam = &ExFileparam{Filepath: "/data/year=__HIVE_DEFAULT_PARTITION__/f.parquet"}
+	param.currentPartValues = map[string]string{"year": HiveDefaultPartition}
+
+	h := &ParquetHandler{partitionColIndices: []int{0}}
+	err := h.fillPartitionColumns(bat, param, proc)
+	require.NoError(t, err, "should use Default.NullAbility (true=nullable), not Typ.NotNullable")
+	assert.True(t, vec.IsConstNull())
+	vec.Free(mp)
+}
+
+func TestFillPartitionColumns_NotPresent(t *testing.T) {
+	proc := testutil.NewProc(t)
+	mp := proc.Mp()
+
+	vec := vector.NewVec(types.T_int32.ToType())
+	bat := &batch.Batch{Vecs: []*vector.Vector{vec}}
+	bat.SetRowCount(3)
+
+	param := &ExternalParam{}
+	param.Cols = []*plan.ColDef{
+		{Name: "year", Typ: plan.Type{Id: int32(types.T_int32)}},
+	}
+	param.Ctx = context.Background()
+	param.Fileparam = &ExFileparam{Filepath: "/data/f.parquet"}
+	param.currentPartValues = map[string]string{}
+
+	h := &ParquetHandler{partitionColIndices: []int{0}}
+	err := h.fillPartitionColumns(bat, param, proc)
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "not found in path")
+	vec.Free(mp)
+}
+
+// TestFillPartitionColumns_RelPathWithExtern guards the relative-path contract
+// in fillPartitionColumns: when param.Extern.Filepath is set (the normal
+// production invariant for hive tables), error messages must reference the
+// partition-relative path, NOT the absolute/base path. Without this test the
+// existing coverage only exercises the fallback branch where Extern is nil
+// (relPath == Fileparam.Filepath), hiding a latent bug where a future refactor
+// could drop the nil guard and leak machine-local absolute paths into BVT
+// .result files.
+func TestFillPartitionColumns_RelPathWithExtern(t *testing.T) {
+	proc := testutil.NewProc(t)
+	mp := proc.Mp()
+
+	// Three subtests cover the error paths that embed relPath:
+	// 1) constraint-violation (NOT NULL + __HIVE_DEFAULT_PARTITION__)
+	// 2) not-found-in-path (partition key missing from file path)
+	// 3) type-conversion failure (partition value not parseable as the column type)
+
+	t.Run("not-null default partition prints relative path", func(t *testing.T) {
+		vec := vector.NewVec(types.T_int32.ToType())
+		bat := &batch.Batch{Vecs: []*vector.Vector{vec}}
+		bat.SetRowCount(5)
+
+		basePath := "/warehouse/lake/data"
+		filePath := basePath + "/year=__HIVE_DEFAULT_PARTITION__/part-0.parquet"
+		param := &ExternalParam{}
+		param.Cols = []*plan.ColDef{
+			{Name: "year", Typ: plan.Type{Id: int32(types.T_int32)},
+				Default: &plan.Default{NullAbility: false}},
+		}
+		param.Ctx = context.Background()
+		param.Fileparam = &ExFileparam{Filepath: filePath}
+		param.Extern = &tree.ExternParam{
+			ExParamConst: tree.ExParamConst{Filepath: basePath},
+		}
+		param.currentPartValues = map[string]string{"year": HiveDefaultPartition}
+
+		h := &ParquetHandler{partitionColIndices: []int{0}}
+		err := h.fillPartitionColumns(bat, param, proc)
+		require.Error(t, err)
+		// Must include the relative form...
+		assert.Contains(t, err.Error(), "year=__HIVE_DEFAULT_PARTITION__/part-0.parquet")
+		// ...and must NOT include the base prefix (would leak machine paths).
+ assert.NotContains(t, err.Error(), basePath) + vec.Free(mp) + }) + + t.Run("missing partition key prints relative path", func(t *testing.T) { + vec := vector.NewVec(types.T_int32.ToType()) + bat := &batch.Batch{Vecs: []*vector.Vector{vec}} + bat.SetRowCount(2) + + basePath := "/warehouse/lake/data" + filePath := basePath + "/oops/part-0.parquet" + param := &ExternalParam{} + param.Cols = []*plan.ColDef{ + {Name: "year", Typ: plan.Type{Id: int32(types.T_int32)}}, + } + param.Ctx = context.Background() + param.Fileparam = &ExFileparam{Filepath: filePath} + param.Extern = &tree.ExternParam{ + ExParamConst: tree.ExParamConst{Filepath: basePath}, + } + param.currentPartValues = map[string]string{} // year not parsed + + h := &ParquetHandler{partitionColIndices: []int{0}} + err := h.fillPartitionColumns(bat, param, proc) + require.Error(t, err) + assert.Contains(t, err.Error(), "oops/part-0.parquet") + assert.NotContains(t, err.Error(), basePath) + vec.Free(mp) + }) + + t.Run("type conversion failure prints relative path", func(t *testing.T) { + vec := vector.NewVec(types.T_int32.ToType()) + bat := &batch.Batch{Vecs: []*vector.Vector{vec}} + bat.SetRowCount(3) + + basePath := "/warehouse/lake/data" + filePath := basePath + "/year=abc/part-0.parquet" + param := &ExternalParam{} + param.Cols = []*plan.ColDef{ + {Name: "year", Typ: plan.Type{Id: int32(types.T_int32)}}, + } + param.Ctx = context.Background() + param.Fileparam = &ExFileparam{Filepath: filePath} + param.Extern = &tree.ExternParam{ + ExParamConst: tree.ExParamConst{Filepath: basePath}, + } + param.currentPartValues = map[string]string{"year": "abc"} + + h := &ParquetHandler{partitionColIndices: []int{0}} + err := h.fillPartitionColumns(bat, param, proc) + require.Error(t, err) + assert.Contains(t, err.Error(), "year=abc/part-0.parquet") + assert.NotContains(t, err.Error(), basePath) + vec.Free(mp) + }) +} + +// --------------------------------------------------------------------------- +// Pruning observability tests — assert ListCalls / PrunedCount precisely +// --------------------------------------------------------------------------- + +func TestDiscoverHivePartitions_EQListCalls(t *testing.T) { + dirs := map[string][]fileservice.DirEntry{ + "/data": { + {Name: "year=2020", IsDir: true}, + {Name: "year=2021", IsDir: true}, + {Name: "year=2022", IsDir: true}, + {Name: "year=2023", IsDir: true}, + {Name: "year=2024", IsDir: true}, + }, + "/data/year=2024": { + {Name: "part.parquet", IsDir: false, Size: 100}, + }, + } + + result, err := DiscoverHivePartitions( + context.Background(), + mockListDir(dirs), + "/data", + []string{"year"}, + []tree.HivePartColType{{Id: int32(types.T_int32)}}, + []PartitionPredicate{{ColName: "year", Op: PartOpEq, Values: []string{"2024"}}}, + ) + require.NoError(t, err) + assert.Equal(t, 2, result.ListCalls, "EQ single value: root + hit partition file dir = 2") + assert.Equal(t, 1, result.PartitionCount) + assert.Equal(t, 4, result.PrunedCount) + assert.Equal(t, 1, len(result.Files)) +} + +func TestDiscoverHivePartitions_INTwoValuesListCalls(t *testing.T) { + dirs := map[string][]fileservice.DirEntry{ + "/data": { + {Name: "year=2020", IsDir: true}, + {Name: "year=2021", IsDir: true}, + {Name: "year=2022", IsDir: true}, + }, + "/data/year=2020": { + {Name: "f.parquet", IsDir: false, Size: 100}, + }, + "/data/year=2022": { + {Name: "f.parquet", IsDir: false, Size: 200}, + }, + } + + result, err := DiscoverHivePartitions( + context.Background(), + mockListDir(dirs), + "/data", + []string{"year"}, + 
[]tree.HivePartColType{{Id: int32(types.T_int32)}}, + []PartitionPredicate{{ColName: "year", Op: PartOpIn, Values: []string{"2020", "2022"}}}, + ) + require.NoError(t, err) + assert.Equal(t, 3, result.ListCalls, "IN two values: root + 2 hit partition file dirs = 3") + assert.Equal(t, 2, result.PartitionCount) + assert.Equal(t, 1, result.PrunedCount) + assert.Equal(t, 2, len(result.Files)) +} + +func TestDiscoverHivePartitions_MultiLevelPartialPredicate(t *testing.T) { + dirs := map[string][]fileservice.DirEntry{ + "/data": { + {Name: "year=2024", IsDir: true}, + {Name: "year=2025", IsDir: true}, + }, + "/data/year=2024": { + {Name: "month=01", IsDir: true}, + {Name: "month=02", IsDir: true}, + {Name: "month=03", IsDir: true}, + }, + "/data/year=2024/month=01": { + {Name: "f.parquet", IsDir: false, Size: 100}, + }, + "/data/year=2024/month=02": { + {Name: "f.parquet", IsDir: false, Size: 200}, + }, + "/data/year=2024/month=03": { + {Name: "f.parquet", IsDir: false, Size: 300}, + }, + } + + result, err := DiscoverHivePartitions( + context.Background(), + mockListDir(dirs), + "/data", + []string{"year", "month"}, + []tree.HivePartColType{ + {Id: int32(types.T_int32)}, + {Id: int32(types.T_int32)}, + }, + []PartitionPredicate{{ColName: "year", Op: PartOpEq, Values: []string{"2024"}}}, + ) + require.NoError(t, err) + // year level: 1 pruned (2025), 1 kept (2024) + // month level: no predicate, all 3 enter + // ListCalls: root(1) + year=2024 months(1) + 3 file listings = 5 + assert.Equal(t, 5, result.ListCalls) + assert.Equal(t, 1, result.PrunedCount, "only year=2025 is pruned") + assert.Equal(t, 4, result.PartitionCount, "year=2024 + month=01 + month=02 + month=03") + assert.Equal(t, 3, len(result.Files)) +} + +func TestDiscoverHivePartitions_WarnPartitionCount(t *testing.T) { + // warnPartitionCount=5000. Use 5001 partitions. + // List calls = 1 (root) + 5001 (file listing per partition) = 5002, under maxListCalls(10000). 
+	entries := make([]fileservice.DirEntry, 5001)
+	for i := range entries {
+		entries[i] = fileservice.DirEntry{Name: fmt.Sprintf("year=%d", i), IsDir: true}
+	}
+	fileEntries := []fileservice.DirEntry{{Name: "f.parquet", IsDir: false, Size: 10}}
+
+	listDir := func(ctx context.Context, prefix string) iter.Seq2[*fileservice.DirEntry, error] {
+		return func(yield func(*fileservice.DirEntry, error) bool) {
+			if prefix == "/data" {
+				for i := range entries {
+					if !yield(&entries[i], nil) {
+						return
+					}
+				}
+			} else {
+				for i := range fileEntries {
+					if !yield(&fileEntries[i], nil) {
+						return
+					}
+				}
+			}
+		}
+	}
+
+	result, err := DiscoverHivePartitions(
+		context.Background(),
+		listDir,
+		"/data",
+		[]string{"year"},
+		[]tree.HivePartColType{{Id: int32(types.T_int32)}},
+		nil,
+	)
+	require.NoError(t, err, "5001 partitions should NOT error (only warn)")
+	assert.Equal(t, 5001, result.PartitionCount)
+	assert.True(t, result.warnEmitted, "warning should have been emitted for >5000 partitions")
+	assert.Equal(t, 5001, len(result.Files))
+}
+
+// Reference imports that are otherwise unused in this file so it compiles.
+var _ = catalog.ExternalFilePath
+var _ = function.EQUAL
diff --git a/pkg/sql/colexec/external/parquet.go b/pkg/sql/colexec/external/parquet.go
index b9a66c218fe6c..743242438fed6 100644
--- a/pkg/sql/colexec/external/parquet.go
+++ b/pkg/sql/colexec/external/parquet.go
@@ -46,7 +46,8 @@ var maxParquetBatchCnt int64 = 100000
 func newParquetHandler(param *ExternalParam) (*ParquetHandler, error) {
 	h := ParquetHandler{
-		batchCnt: maxParquetBatchCnt,
+		batchCnt:         maxParquetBatchCnt,
+		filepathColIndex: -1, // sentinel: not projected
 	}
 	err := h.openFile(param)
 	if err != nil {
@@ -54,29 +55,26 @@ func newParquetHandler(param *ExternalParam) (*ParquetHandler, error) {
 	}
 	// Empty file handling (0 rows): only check column count, skip column name and type checks.
-	// This aligns with DuckDB behavior for empty parquet files.
 	if h.file.NumRows() == 0 {
-		// Check if @vars are used in column list (LOAD DATA ... (col1, @v, col2))
-		// Parquet doesn't support @vars, report explicit error
 		if param.Extern.ExternType == int32(plan.ExternType_LOAD) && param.ColumnListLen > int32(len(param.Attrs)) {
 			return nil, moerr.NewNYI(param.Ctx, "parquet load with @variables in column list")
 		}
-
-		// Only check column count, not column names or types
-		// Column count must match exactly (align with DuckDB behavior)
-		parquetColCnt := len(h.file.Root().Columns())
-		tableColCnt := getParquetExpectedColCnt(param)
-		if parquetColCnt != tableColCnt {
-			return nil, moerr.NewInvalidInputf(param.Ctx,
-				"column count mismatch: parquet file has %d columns, but table has %d columns",
-				parquetColCnt, tableColCnt)
+		// Skip column count check in Hive mode: partition-only projections have
+		// 0 expected physical columns while the empty file still has schema columns.
+		if !param.Extern.HivePartitioning {
+			parquetColCnt := len(h.file.Root().Columns())
+			tableColCnt := getParquetExpectedColCnt(param)
+			if parquetColCnt != tableColCnt {
+				return nil, moerr.NewInvalidInputf(param.Ctx,
+					"column count mismatch: parquet file has %d columns, but table has %d columns",
+					parquetColCnt, tableColCnt)
+			}
 		}
-		// Return nil to indicate empty file, no data to load
+		// Caller treats (nil, nil) as "empty file, advance to next".
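+		// (Concrete case for the Hive skip above, names illustrative: a query
+		// projecting only partition columns, e.g. SELECT year FROM t, leaves
+		// getParquetExpectedColCnt at 0 while the empty file's schema still
+		// lists its data columns, so the strict count check would spuriously fail.)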
return nil, nil } - // Non-empty file: use original logic (check column names and types) err = h.prepare(param) if err != nil { return nil, err @@ -191,6 +189,18 @@ func (h *ParquetHandler) prepare(param *ExternalParam) error { continue } + // Skip virtual columns: they are not in Parquet schema. + if param.isHivePartitionCol(attr.ColName) { + h.partitionColIndices = append(h.partitionColIndices, int(attr.ColIndex)) + continue + } + if catalog.ContainExternalHidenCol(attr.ColName) { + h.filepathColIndex = int(attr.ColIndex) + continue + } + + h.hasPhysicalCol = true + // Use case-insensitive column lookup (fix for issue #15621) col, err := h.findColumnIgnoreCase(param.Ctx, attr.ColName) if err != nil { @@ -234,6 +244,10 @@ func (h *ParquetHandler) prepare(param *ExternalParam) error { h.pages[attr.ColIndex] = col.Pages() } + if !h.hasPhysicalCol && (len(h.partitionColIndices) > 0 || h.filepathColIndex >= 0) { + h.rowCountOnly = true + } + // init row reader if has nested columns if h.hasNestedCols { h.rowReader = parquet.NewReader(h.file) @@ -1790,12 +1804,39 @@ func bigIntToTwosComplementBytes(ctx context.Context, bi *big.Int, size int) ([] } func (h *ParquetHandler) getData(bat *batch.Batch, param *ExternalParam, proc *process.Process) error { + if h.rowCountOnly { + return h.getDataRowCountOnly(bat) + } if h.hasNestedCols { return h.getDataByRow(bat, param, proc) } return h.getDataByPage(bat, param, proc) } +func (h *ParquetHandler) getDataRowCountOnly(bat *batch.Batch) error { + batchLimit := int(h.batchCnt) + rowCount := 0 + + if h.rowCountRemaining > 0 { + rowCount = min(h.rowCountRemaining, batchLimit) + h.rowCountRemaining -= rowCount + } else { + rgs := h.file.RowGroups() + if h.currentRowGroup >= len(rgs) { + bat.SetRowCount(0) + return nil + } + total := int(rgs[h.currentRowGroup].NumRows()) + h.currentRowGroup++ + rowCount = min(total, batchLimit) + h.rowCountRemaining = total - rowCount + } + + h.offset += int64(rowCount) + bat.SetRowCount(rowCount) + return nil +} + func (h *ParquetHandler) getDataByPage(bat *batch.Batch, param *ExternalParam, proc *process.Process) error { length := 0 finish := false @@ -1991,9 +2032,13 @@ func parseStringToDecimal128(s string, precision, scale int32) (types.Decimal128 func getParquetExpectedColCnt(param *ExternalParam) int { cnt := 0 for _, attr := range param.Attrs { - if !catalog.ContainExternalHidenCol(attr.ColName) { - cnt++ + if catalog.ContainExternalHidenCol(attr.ColName) { + continue + } + if param.isHivePartitionCol(attr.ColName) { + continue } + cnt++ } return cnt } diff --git a/pkg/sql/colexec/external/reader_parquet.go b/pkg/sql/colexec/external/reader_parquet.go index 30223e6a37b4f..6d64a7a02e23b 100644 --- a/pkg/sql/colexec/external/reader_parquet.go +++ b/pkg/sql/colexec/external/reader_parquet.go @@ -23,7 +23,6 @@ import ( ) // ParquetReader handles Parquet format files. -// Phase 1: thin wrapper around existing ParquetHandler logic. 
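+// Open additionally refreshes the current file's hive partition values, and
+// ReadBatch stamps them (plus __mo_filepath) into projected vectors after each
+// physical read; see the hunks below.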
type ParquetReader struct { param *ExternalParam h *ParquetHandler @@ -35,6 +34,9 @@ func NewParquetReader(param *ExternalParam, proc *process.Process) *ParquetReade func (r *ParquetReader) Open(param *ExternalParam, proc *process.Process) (fileEmpty bool, err error) { r.param = param + if err := param.refreshPartitionValues(); err != nil { + return false, err + } r.h, err = newParquetHandler(param) if err != nil { return false, err @@ -64,6 +66,17 @@ func (r *ParquetReader) ReadBatch( return false, err } + // Virtual column fill is independent of rowCountOnly: both physical-col + // branches (getDataByPage / getDataByRow) and rowCountOnly need to stamp + // the hive partition values and __mo_filepath into their vectors whenever + // those columns are projected. rowCountOnly in prepare() only gates the + // getData dispatch (no mapper reads), not the virtual-column fill. + if buf.RowCount() > 0 && (r.h.filepathColIndex >= 0 || len(r.h.partitionColIndices) > 0) { + if err := r.h.fillVirtualColumns(buf, r.param, proc); err != nil { + return false, err + } + } + // Check if file is finished: getData sets offset and checks NumRows if r.h.file != nil && r.h.offset >= r.h.file.NumRows() { return true, nil diff --git a/pkg/sql/colexec/external/types.go b/pkg/sql/colexec/external/types.go index 5debf9c03073f..5aab238702bbf 100644 --- a/pkg/sql/colexec/external/types.go +++ b/pkg/sql/colexec/external/types.go @@ -72,8 +72,9 @@ type ExParamConst struct { } type ExParam struct { - Fileparam *ExFileparam - Filter *FilterParam + Fileparam *ExFileparam + Filter *FilterParam + currentPartValues map[string]string } type ExFileparam struct { @@ -285,6 +286,14 @@ type ParquetHandler struct { // for nested types support hasNestedCols bool rowReader *parquet.Reader + + // virtual column support (hive partitions + __mo_filepath) + partitionColIndices []int + filepathColIndex int // -1 = not projected + hasPhysicalCol bool + rowCountOnly bool + currentRowGroup int + rowCountRemaining int } type columnMapper struct { diff --git a/pkg/sql/compile/compile.go b/pkg/sql/compile/compile.go index 662acef8d54d5..f45bf6391d4f8 100644 --- a/pkg/sql/compile/compile.go +++ b/pkg/sql/compile/compile.go @@ -1604,6 +1604,11 @@ func (c *Compile) getReadWriteParallelFlag(param *tree.ExternParam, fileList []s } func (c *Compile) getExternalFileListAndSize(node *plan.Node, param *tree.ExternParam) (fileList []string, fileSize []int64, err error) { + // Hive partition tables use recursive list-and-filter discovery, not ReadDir. + // ReadDir requires glob patterns in filepath; Hive base paths are opaque directories. 
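+	// Illustrative layout (paths hypothetical): with filepath='/data' and
+	// partition columns [year, month], discovery walks
+	//   /data/year=2024/month=01/part-0.parquet
+	//   /data/year=2024/month=02/part-0.parquet
+	// whereas ReadDir over the bare '/data' prefix has no glob to expand.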
+ if param.HivePartitioning { + return c.getHivePartitionFileList(node, param) + } switch node.ExternScan.Type { case int32(plan.ExternType_EXTERNAL_TB): t := time.Now() @@ -1637,6 +1642,68 @@ func (c *Compile) getExternalFileListAndSize(node *plan.Node, param *tree.Extern return fileList, fileSize, nil } +func (c *Compile) getHivePartitionFileList(node *plan.Node, param *tree.ExternParam) ([]string, []int64, error) { + partColSet := toLowerSet(param.HivePartitionCols) + partFilters, fpFilters, rowFilters := external.ClassifyFilters( + node.TableDef, node.FilterList, partColSet) + + preds := external.ExtractPartitionPredicatesFromExprs(node.TableDef, partFilters, partColSet) + + listDir := external.NewListDirFunc(param) + result, err := external.DiscoverHivePartitions( + c.proc.Ctx, listDir, param.Filepath, + param.HivePartitionCols, param.HivePartitionColTypes, preds) + if err != nil { + return nil, nil, err + } + + fileList := make([]string, len(result.Files)) + fileSize := make([]int64, len(result.Files)) + for i, f := range result.Files { + fileList[i] = f.FilePath + fileSize[i] = f.FileSize + } + + if len(fpFilters) > 0 { + var leftover []*plan.Expr + fileList, fileSize, leftover, err = runFilePathFilters(c.proc.Ctx, c.proc, node.TableDef, fpFilters, fileList, fileSize) + if err != nil { + return nil, nil, err + } + rowFilters = append(rowFilters, leftover...) + } + + node.FilterList = rowFilters + return fileList, fileSize, nil +} + +func runFilePathFilters( + ctx context.Context, + proc *process.Process, + tableDef *plan.TableDef, + fpFilters []*plan.Expr, + fileList []string, + fileSize []int64, +) ([]string, []int64, []*plan.Expr, error) { + tmpNode := &plan.Node{ + TableDef: tableDef, + FilterList: fpFilters, + } + outFileList, outFileSize, err := external.FilterFileList(ctx, tmpNode, proc, fileList, fileSize) + if err != nil { + return nil, nil, nil, err + } + return outFileList, outFileSize, tmpNode.FilterList, nil +} + +func toLowerSet(cols []string) map[string]bool { + m := make(map[string]bool, len(cols)) + for _, col := range cols { + m[strings.ToLower(col)] = true + } + return m +} + func (c *Compile) compileExternScan(node *plan.Node) ([]*Scope, error) { if c.isPrepare { return nil, cantCompileForPrepareErr @@ -1663,6 +1730,12 @@ func (c *Compile) compileExternScan(node *plan.Node) ([]*Scope, error) { return c.compileExternValueScan(node, param, strictSqlMode) } + // Hive partition tables must not enter parallel read paths — the parallel loop + // mutates param.Filepath per file, which breaks ExtractPartitionValues' base path. + if param.HivePartitioning { + param.Parallel = false + } + fileList, fileSize, err := c.getExternalFileListAndSize(node, param) if err != nil { return nil, err diff --git a/pkg/sql/compile/hive_partition_test.go b/pkg/sql/compile/hive_partition_test.go new file mode 100644 index 0000000000000..8d8c0ff9adeb6 --- /dev/null +++ b/pkg/sql/compile/hive_partition_test.go @@ -0,0 +1,109 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package compile
+
+import (
+	"testing"
+
+	"github.com/matrixorigin/matrixone/pkg/catalog"
+	"github.com/matrixorigin/matrixone/pkg/pb/plan"
+	"github.com/matrixorigin/matrixone/pkg/sql/plan/function"
+	"github.com/matrixorigin/matrixone/pkg/testutil"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// TestRunFilePathFilters_LeftoverContainsUnconsumed locks the compile-layer
+// contract that motivates the fpFilters → rowFilters append in
+// getHivePartitionFileList. If runFilePathFilters is modified (or deleted) to
+// no longer surface FilterFileList's unconsumed predicates as its leftover
+// return value, queries like
+//
+//	WHERE __mo_filepath LIKE '%x%' OR false
+//
+// will silently skip their predicate and return wrong rows. This test fires
+// immediately in that scenario.
+func TestRunFilePathFilters_LeftoverContainsUnconsumed(t *testing.T) {
+	proc := testutil.NewProc(t)
+
+	td := &plan.TableDef{
+		Cols: []*plan.ColDef{
+			{Name: "year"},
+			{Name: catalog.ExternalFilePath},
+		},
+	}
+	orExpr := &plan.Expr{
+		Expr: &plan.Expr_F{F: &plan.Function{
+			Func: &plan.ObjectRef{Obj: int64(function.OR) << 32, ObjName: "or"},
+			Args: []*plan.Expr{
+				{
+					Expr: &plan.Expr_F{F: &plan.Function{
+						Func: &plan.ObjectRef{Obj: function.EqualFunctionEncodedID},
+						Args: []*plan.Expr{
+							{Expr: &plan.Expr_Col{Col: &plan.ColRef{ColPos: 1, Name: catalog.ExternalFilePath}}},
+							{Expr: &plan.Expr_Lit{Lit: &plan.Literal{Value: &plan.Literal_Sval{Sval: "x"}}}},
+						},
+					}},
+				},
+				{Expr: &plan.Expr_Lit{Lit: &plan.Literal{Value: &plan.Literal_Bval{Bval: false}}}},
+			},
+		}},
+	}
+
+	fileList := []string{"/warehouse/data/year=2024/f.parquet"}
+	fileSize := []int64{100}
+
+	outFileList, outFileSize, leftover, err := runFilePathFilters(
+		proc.Ctx, proc, td, []*plan.Expr{orExpr}, fileList, fileSize)
+	require.NoError(t, err)
+
+	assert.Equal(t, fileList, outFileList,
+		"FilterFileList short-circuits when judgeContainColname rejects all fpFilters")
+	assert.Equal(t, fileSize, outFileSize)
+	require.Equal(t, 1, len(leftover),
+		"unconsumed OR(filepath, literal) must be returned in leftover so the caller can "+
+			"append it to rowFilters; losing it silently drops the predicate at runtime")
+	assert.Same(t, orExpr, leftover[0])
+}
+
+// TestToLowerSet covers the tiny helper feeding partColSet.
+func TestToLowerSet(t *testing.T) {
+	got := toLowerSet([]string{"Year", "MONTH", "day"})
+	assert.True(t, got["year"])
+	assert.True(t, got["month"])
+	assert.True(t, got["day"])
+	assert.False(t, got["Year"])
+	assert.Equal(t, 3, len(got))
+
+	// Empty input.
+	got = toLowerSet(nil)
+	assert.Equal(t, 0, len(got))
+}
+
+// TestRunFilePathFilters_NoFilters exercises the trivial path of
+// runFilePathFilters where fpFilters is empty and leftover comes back empty,
+// distinct from the unconsumed regression above.
+func TestRunFilePathFilters_NoFilters(t *testing.T) {
+	proc := testutil.NewProc(t)
+	td := &plan.TableDef{Cols: []*plan.ColDef{{Name: catalog.ExternalFilePath}}}
+	// Empty fpFilters → FilterFileList short-circuits, returns fileList unchanged.
+ fileList := []string{"/a.parquet"} + fileSize := []int64{10} + out, outSz, leftover, err := runFilePathFilters(proc.Ctx, proc, td, nil, fileList, fileSize) + require.NoError(t, err) + assert.Equal(t, fileList, out) + assert.Equal(t, fileSize, outSz) + assert.Empty(t, leftover) +} diff --git a/pkg/sql/parsers/tree/update.go b/pkg/sql/parsers/tree/update.go index d2d960be84889..138873bbebe74 100644 --- a/pkg/sql/parsers/tree/update.go +++ b/pkg/sql/parsers/tree/update.go @@ -161,6 +161,20 @@ type ExParamConst struct { Data string Tail *TailParameter StageName Identifier + + HivePartitioning bool + HivePartitionCols []string + HivePartitionColTypes []HivePartColType +} + +// HivePartColType is a compact snapshot of a partition column's type info. +// Defined in tree package to avoid importing pkg/pb/plan. +type HivePartColType struct { + Id int32 + Width int32 + Scale int32 + Enumvalues string + NullAbility bool } type ExParam struct { diff --git a/pkg/sql/plan/build_ddl.go b/pkg/sql/plan/build_ddl.go index 45ced634ca55d..68f9101a2aa6f 100644 --- a/pkg/sql/plan/build_ddl.go +++ b/pkg/sql/plan/build_ddl.go @@ -919,11 +919,16 @@ func buildCreateTable( if stmt.Param != nil { for i := 0; i < len(stmt.Param.Option); i += 2 { switch strings.ToLower(stmt.Param.Option[i]) { - case "endpoint", "region", "access_key_id", "secret_access_key", "bucket", "filepath", "compression", "format", "jsondata", "provider", "role_arn", "external_id": + case "endpoint", "region", "access_key_id", "secret_access_key", "bucket", "filepath", "compression", "format", "jsondata", "provider", "role_arn", "external_id", "hive_partitioning", "hive_partition_columns": default: return nil, moerr.NewBadConfigf(ctx.GetContext(), "the keyword '%s' is not support", strings.ToLower(stmt.Param.Option[i])) } } + + if err := validateAndSetHivePartitionOptions(ctx.GetContext(), stmt, createTable); err != nil { + return nil, err + } + if err := InitNullMap(stmt.Param, ctx); err != nil { return nil, err } @@ -5166,3 +5171,174 @@ func constructAddedPartitionDefs( return nil, moerr.NewNotSupportedNoCtx("unsupported partition method in ADD PARTITION") } } + +// validateAndSetHivePartitionOptions parses and validates hive_partitioning options from the DDL, +// normalizes partition column names, extracts column types, and strips hive keys from Option[]. 
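+// Illustrative DDL that reaches this validation (table and column names are
+// hypothetical; the option spelling matches the keys checked below):
+//
+//	create external table sales (amount int, year int, month int)
+//	infile{'filepath'='/data/', 'format'='parquet',
+//	'hive_partitioning'='true', 'hive_partition_columns'='year,month'};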
+func validateAndSetHivePartitionOptions(ctx context.Context, stmt *tree.CreateTable, createTable *plan.CreateTable) error { + raw := stmt.Param.Option + + if err := rejectDuplicateKeys(ctx, raw, []string{"hive_partitioning", "hive_partition_columns"}); err != nil { + return err + } + + hiveEnabled, hiveCols, err := parseHiveOptionsFromRawOptions(ctx, raw) + if err != nil { + return err + } + if !hiveEnabled { + return nil + } + + if len(hiveCols) == 0 { + return moerr.NewBadConfig(ctx, "hive_partition_columns is required when hive_partitioning is enabled") + } + + if err := rejectDuplicateKeys(ctx, raw, []string{"format", "filepath"}); err != nil { + return err + } + + rawFormat := strings.ToLower(getRawOption(raw, "format")) + if rawFormat != "parquet" { + return moerr.NewBadConfigf(ctx, "hive_partitioning currently only supports format='parquet', got '%s'", rawFormat) + } + + rawFilepath := getRawOption(raw, "filepath") + if len(stmt.Param.StageName) != 0 || strings.HasPrefix(rawFilepath, "stage://") { + return moerr.NewBadConfig(ctx, "hive_partitioning does not support stage external tables") + } + + normalized := make([]string, 0, len(hiveCols)) + colTypes := make([]tree.HivePartColType, 0, len(hiveCols)) + seen := make(map[string]bool) + for _, pc := range hiveCols { + col := findColInTableDefCaseInsensitive(createTable.TableDef.Cols, pc) + if col == nil { + return moerr.NewBadConfigf(ctx, "partition column '%s' not found in table columns", pc) + } + if col.Hidden { + return moerr.NewBadConfigf(ctx, "partition column '%s' cannot be a hidden column", pc) + } + if col.GeneratedCol != nil { + return moerr.NewBadConfigf(ctx, "partition column '%s' cannot be a generated column", pc) + } + typId := types.T(col.Typ.Id) + if typId == types.T_array_float32 || typId == types.T_array_float64 { + return moerr.NewBadConfigf(ctx, "partition column '%s' cannot be a VECTOR type", pc) + } + canonical := strings.ToLower(col.Name) + if seen[canonical] { + return moerr.NewBadConfigf(ctx, "duplicate partition column '%s'", pc) + } + seen[canonical] = true + normalized = append(normalized, canonical) + + nullable := true + if col.Default != nil { + nullable = col.Default.NullAbility + } + colTypes = append(colTypes, tree.HivePartColType{ + Id: col.Typ.Id, + Width: col.Typ.Width, + Scale: col.Typ.Scale, + Enumvalues: col.Typ.Enumvalues, + NullAbility: nullable, + }) + } + + stmt.Param.HivePartitioning = true + stmt.Param.HivePartitionCols = normalized + stmt.Param.HivePartitionColTypes = colTypes + stmt.Param.Option = stripHiveOptionKeys(stmt.Param.Option) + return nil +} + +func parseHiveOptionsFromRawOptions(ctx context.Context, options []string) (enabled bool, cols []string, err error) { + var hiveVal string + var colsVal string + for i := 0; i < len(options); i += 2 { + key := strings.ToLower(options[i]) + switch key { + case "hive_partitioning": + hiveVal = strings.ToLower(options[i+1]) + case "hive_partition_columns": + colsVal = options[i+1] + } + } + if hiveVal == "" { + if strings.TrimSpace(colsVal) != "" { + return false, nil, moerr.NewBadConfig(ctx, "hive_partition_columns requires hive_partitioning='true'") + } + return false, nil, nil + } + if hiveVal != "true" && hiveVal != "false" { + return false, nil, moerr.NewBadConfigf(ctx, "hive_partitioning must be 'true' or 'false', got '%s'", hiveVal) + } + if hiveVal == "false" { + if strings.TrimSpace(colsVal) != "" { + return false, nil, moerr.NewBadConfig(ctx, "hive_partition_columns requires hive_partitioning='true'") + } + return false, nil, 
nil + } + if colsVal == "" { + return true, nil, nil + } + parts := strings.Split(colsVal, ",") + cols = make([]string, 0, len(parts)) + for _, p := range parts { + p = strings.TrimSpace(p) + if p != "" { + cols = append(cols, p) + } + } + return true, cols, nil +} + +func rejectDuplicateKeys(ctx context.Context, options []string, keys []string) error { + keySet := make(map[string]bool, len(keys)) + for _, k := range keys { + keySet[k] = true + } + seen := make(map[string]bool) + for i := 0; i < len(options); i += 2 { + key := strings.ToLower(options[i]) + if !keySet[key] { + continue + } + if seen[key] { + return moerr.NewBadConfigf(ctx, "duplicate option key '%s'", key) + } + seen[key] = true + } + return nil +} + +func getRawOption(options []string, key string) string { + for i := 0; i < len(options); i += 2 { + if strings.ToLower(options[i]) == key { + return options[i+1] + } + } + return "" +} + +func stripHiveOptionKeys(opt []string) []string { + out := make([]string, 0, len(opt)) + for i := 0; i < len(opt); i += 2 { + key := strings.ToLower(opt[i]) + if key == "hive_partitioning" || key == "hive_partition_columns" { + continue + } + out = append(out, opt[i], opt[i+1]) + } + return out +} + +func findColInTableDefCaseInsensitive(cols []*plan.ColDef, name string) *plan.ColDef { + lower := strings.ToLower(name) + for _, col := range cols { + if strings.ToLower(col.Name) == lower { + return col + } + } + return nil +} diff --git a/pkg/sql/plan/build_load.go b/pkg/sql/plan/build_load.go index fdf533903c18d..e7289328edfa5 100644 --- a/pkg/sql/plan/build_load.go +++ b/pkg/sql/plan/build_load.go @@ -222,6 +222,13 @@ func buildLoad(stmt *tree.Load, ctx CompilerContext, isPrepareStmt bool) (*Plan, return nil, err } + // Note on Hive partitioned external tables: LOAD DATA into any external + // table (hive or not) is rejected by checkTableType inside getDmlTableInfo + // above, producing "cannot insert/update/delete from external table". + // No hive-specific intercept is needed here — and any probe added below + // would be unreachable dead code. See Phase 8 P8-audit-3 decision to keep + // the generic external-table error for consistency with all DML on externals. 
+ stmt.Param.Local = stmt.Local fileName, err := checkFileExist(stmt.Param, ctx) if err != nil { diff --git a/pkg/sql/plan/build_show_util.go b/pkg/sql/plan/build_show_util.go index 84ece374f1f05..f9c5f253c87cb 100644 --- a/pkg/sql/plan/build_show_util.go +++ b/pkg/sql/plan/build_show_util.go @@ -582,7 +582,12 @@ func ConstructCreateTableSQL( } } // hide file path - createStr += fmt.Sprintf(" INFILE{'FILEPATH'='','COMPRESSION'='%s','FORMAT'='%s','JSONDATA'='%s'}", param.CompressType, param.Format, param.JsonData) + createStr += fmt.Sprintf(" INFILE{'FILEPATH'='','COMPRESSION'='%s','FORMAT'='%s','JSONDATA'='%s'", param.CompressType, param.Format, param.JsonData) + if param.HivePartitioning { + createStr += fmt.Sprintf(",'HIVE_PARTITIONING'='true','HIVE_PARTITION_COLUMNS'='%s'", + strings.Join(param.HivePartitionCols, ",")) + } + createStr += "}" fields := "" if param.Tail != nil && param.Tail.Fields != nil { diff --git a/pkg/sql/plan/external.go b/pkg/sql/plan/external.go index 6db3ff550dada..6dbb8c3766ba5 100644 --- a/pkg/sql/plan/external.go +++ b/pkg/sql/plan/external.go @@ -179,6 +179,10 @@ func getExternalStats(node *plan.Node, builder *QueryBuilder) *Stats { return DefaultHugeStats() } + if param.HivePartitioning { + return DefaultHugeStats() + } + if param.ScanType == tree.S3 { if err = InitS3Param(param); err != nil { return DefaultHugeStats() diff --git a/pkg/sql/plan/utils.go b/pkg/sql/plan/utils.go index bdfe0c80d010d..b2bb878677d6f 100644 --- a/pkg/sql/plan/utils.go +++ b/pkg/sql/plan/utils.go @@ -1857,9 +1857,66 @@ func checkNoNeedCast(constT, columnT types.Type, constExpr *plan.Expr) bool { } +// parseHiveOptionKV handles hive_partitioning / hive_partition_columns keys in +// Init*Param. It is defensive against legacy JSON where stripHiveOptionKeys +// (build_ddl.go) had not run; when the param already has values normalized +// during DDL, the legacy option is skipped to avoid case-flip or type drift. +// +// Each key's skip guard MUST inspect only its own field. An earlier version +// coupled the hive_partitioning guard to HivePartitionCols; for legacy option +// orders like "hive_partition_columns=year, hive_partitioning=true" that caused +// hive_partitioning to be silently skipped after cols was populated, leaving +// HivePartitioning=false and the table mis-classified as non-hive. +// +// Returns (handled, err): +// - (false, nil) : key is not a hive key; caller should fall through to its own switch +// - (true, nil) : key handled (either applied or intentionally skipped) +// - (true, err) : key handled but value invalid +func parseHiveOptionKV(param *tree.ExternParam, key, val string) (bool, error) { + switch key { + case "hive_partitioning": + // Guard only on HivePartitioning itself — do NOT consult HivePartitionCols. 
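+		// Worked example of the regression this guard shape prevents: with legacy
+		// Option[] {"hive_partition_columns", "year", "hive_partitioning", "true"},
+		// cols is already populated when this key arrives; skipping on
+		// len(HivePartitionCols) > 0 would leave HivePartitioning == false.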
+		if param.HivePartitioning {
+			return true, nil
+		}
+		v := strings.ToLower(val)
+		if v != "true" && v != "false" {
+			return true, moerr.NewBadConfigf(param.Ctx, "hive_partitioning must be 'true' or 'false', got '%s'", val)
+		}
+		param.HivePartitioning = (v == "true")
+		return true, nil
+	case "hive_partition_columns":
+		if len(param.HivePartitionCols) > 0 {
+			return true, nil
+		}
+		for _, p := range strings.Split(val, ",") {
+			p = strings.TrimSpace(p)
+			if p != "" {
+				param.HivePartitionCols = append(param.HivePartitionCols, strings.ToLower(p))
+			}
+		}
+		return true, nil
+	}
+	return false, nil
+}
+
+func validateHiveOptionConsistency(param *tree.ExternParam) error {
+	if !param.HivePartitioning && len(param.HivePartitionCols) > 0 {
+		return moerr.NewBadConfig(param.Ctx, "hive_partition_columns requires hive_partitioning='true'")
+	}
+	return nil
+}
+
 func InitInfileParam(param *tree.ExternParam) error {
 	for i := 0; i < len(param.Option); i += 2 {
-		switch strings.ToLower(param.Option[i]) {
+		key := strings.ToLower(param.Option[i])
+		if handled, err := parseHiveOptionKV(param, key, param.Option[i+1]); handled {
+			if err != nil {
+				return err
+			}
+			continue
+		}
+		switch key {
 		case "filepath":
 			param.Filepath = param.Option[i+1]
 		case "compression":
@@ -1878,9 +1935,12 @@
 			param.JsonData = jsondata
 			param.Format = tree.JSONLINE
 		default:
-			return moerr.NewBadConfigf(param.Ctx, "the keyword '%s' is not support", strings.ToLower(param.Option[i]))
+			return moerr.NewBadConfigf(param.Ctx, "the keyword '%s' is not support", key)
 		}
 	}
+	if err := validateHiveOptionConsistency(param); err != nil {
+		return err
+	}
 	if len(param.Filepath) == 0 {
 		return moerr.NewBadConfig(param.Ctx, "the filepath must be specified")
 	}
@@ -1896,7 +1956,14 @@
 func InitS3Param(param *tree.ExternParam) error {
 	param.S3Param = &tree.S3Parameter{}
 	for i := 0; i < len(param.Option); i += 2 {
-		switch strings.ToLower(param.Option[i]) {
+		key := strings.ToLower(param.Option[i])
+		if handled, err := parseHiveOptionKV(param, key, param.Option[i+1]); handled {
+			if err != nil {
+				return err
+			}
+			continue
+		}
+		switch key {
 		case "endpoint":
 			param.S3Param.Endpoint = param.Option[i+1]
 		case "region":
@@ -1930,11 +1997,14 @@
 		}
 			param.JsonData = jsondata
 			param.Format = tree.JSONLINE
 		default:
-			return moerr.NewBadConfigf(param.Ctx, "the keyword '%s' is not support", strings.ToLower(param.Option[i]))
+			return moerr.NewBadConfigf(param.Ctx, "the keyword '%s' is not support", key)
 		}
 	}
+	if err := validateHiveOptionConsistency(param); err != nil {
+		return err
+	}
 	if param.Format == tree.JSONLINE && len(param.JsonData) == 0 {
 		return moerr.NewBadConfig(param.Ctx, "the jsondata must be specified")
 	}
@@ -2003,8 +2072,21 @@
 func InitStageS3Param(param *tree.ExternParam, s stage.StageDef) error {
 	param.S3Param.Provider, _ = s.GetCredentials(stage.PARAMKEY_PROVIDER, stage.S3_PROVIDER_AMAZON)
 	param.CompressType, _ = s.GetCredentials(stage.PARAMKEY_COMPRESSION, "auto")
+	// Note: the parseHiveOptionKV call below is kept for parity with the other
+	// two Init*Param functions, but hive_partitioning on a stage external table
+	// is rejected at DDL (build_ddl.go validateAndSetHivePartitionOptions). The
+	// hive branch here is therefore unreachable via normal DDL; it exists only
+	// so every Init*Param follows the same shape and would tolerate legacy JSON
+	// that snuck hive keys past validation.
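+	// (For reference, the DDL-time rejection reads "hive_partitioning does not
+	// support stage external tables", so stage tables cannot normally carry
+	// hive keys into the loop below.)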
 	for i := 0; i < len(param.Option); i += 2 {
-		switch strings.ToLower(param.Option[i]) {
+		key := strings.ToLower(param.Option[i])
+		if handled, err := parseHiveOptionKV(param, key, param.Option[i+1]); handled {
+			if err != nil {
+				return err
+			}
+			continue
+		}
+		switch key {
 		case "format":
 			format := strings.ToLower(param.Option[i+1])
 			if format != tree.CSV && format != tree.JSONLINE && format != tree.PARQUET {
@@ -2018,12 +2100,15 @@
 			}
 			param.JsonData = jsondata
 			param.Format = tree.JSONLINE
 		default:
-			return moerr.NewBadConfigf(param.Ctx, "the keyword '%s' is not support", strings.ToLower(param.Option[i]))
+			return moerr.NewBadConfigf(param.Ctx, "the keyword '%s' is not support", key)
 		}
 	}
+	if err := validateHiveOptionConsistency(param); err != nil {
+		return err
+	}
 	if param.Format == tree.JSONLINE && len(param.JsonData) == 0 {
 		return moerr.NewBadConfig(param.Ctx, "the jsondata must be specified")
 	}
diff --git a/pkg/sql/plan/utils_test.go b/pkg/sql/plan/utils_test.go
index 85a23ae02959b..ed12af2424817 100644
--- a/pkg/sql/plan/utils_test.go
+++ b/pkg/sql/plan/utils_test.go
@@ -16,10 +16,14 @@ package plan
 
 import (
 	"context"
+	"net/url"
 	"testing"
 
 	"github.com/matrixorigin/matrixone/pkg/container/types"
 	"github.com/matrixorigin/matrixone/pkg/pb/plan"
+	"github.com/matrixorigin/matrixone/pkg/sql/parsers/tree"
+	"github.com/matrixorigin/matrixone/pkg/stage"
+
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 )
@@ -434,3 +438,823 @@ func TestDecimal128HasTrailingZeros(t *testing.T) {
 		})
 	}
 }
+
+// TestParseHiveOptionKV verifies hive key parsing via the Init*Param helper.
+// It covers the legacy-JSON fallback where Option[] still carries hive_partitioning /
+// hive_partition_columns (stripHiveOptionKeys did not run). The key behavior:
+// each key's skip-if-set guard must only inspect its own field; otherwise a
+// reversed option order silently drops hive_partitioning=true.
+func TestParseHiveOptionKV(t *testing.T) {
+	t.Run("canonical order applies both", func(t *testing.T) {
+		param := &tree.ExternParam{}
+		param.Option = []string{
+			"hive_partitioning", "true",
+			"hive_partition_columns", "year,month",
+		}
+		for i := 0; i < len(param.Option); i += 2 {
+			handled, err := parseHiveOptionKV(param, param.Option[i], param.Option[i+1])
+			require.True(t, handled)
+			require.NoError(t, err)
+		}
+		assert.True(t, param.HivePartitioning)
+		assert.Equal(t, []string{"year", "month"}, param.HivePartitionCols)
+	})
+
+	// Each key's skip-if-set guard must inspect only its own field. A coupled
+	// guard that treats non-empty HivePartitionCols as "already handled" would
+	// silently drop hive_partitioning=true when cols appeared first in Option[],
+	// leaving the table mis-classified as non-hive. Keep this case as a
+	// regression for that contract.
+ t.Run("reversed order still applies both", func(t *testing.T) { + param := &tree.ExternParam{} + param.Option = []string{ + "hive_partition_columns", "year,month", + "hive_partitioning", "true", + } + for i := 0; i < len(param.Option); i += 2 { + handled, err := parseHiveOptionKV(param, param.Option[i], param.Option[i+1]) + require.True(t, handled, "key=%s", param.Option[i]) + require.NoError(t, err) + } + assert.True(t, param.HivePartitioning, + "hive_partitioning must not be dropped when cols appeared first in Option[]") + assert.Equal(t, []string{"year", "month"}, param.HivePartitionCols) + }) + + t.Run("pre-populated HivePartitioning is not overwritten", func(t *testing.T) { + param := &tree.ExternParam{} + param.HivePartitioning = true + handled, err := parseHiveOptionKV(param, "hive_partitioning", "false") + require.True(t, handled) + require.NoError(t, err) + assert.True(t, param.HivePartitioning, "skip-if-set must not flip true→false") + }) + + t.Run("pre-populated HivePartitionCols is not overwritten", func(t *testing.T) { + param := &tree.ExternParam{} + param.HivePartitionCols = []string{"year"} + handled, err := parseHiveOptionKV(param, "hive_partition_columns", "month,day") + require.True(t, handled) + require.NoError(t, err) + assert.Equal(t, []string{"year"}, param.HivePartitionCols) + }) + + t.Run("invalid bool value reports error", func(t *testing.T) { + param := &tree.ExternParam{} + param.Ctx = context.Background() + handled, err := parseHiveOptionKV(param, "hive_partitioning", "yes") + require.True(t, handled) + require.Error(t, err) + }) + + t.Run("non-hive key returns not-handled", func(t *testing.T) { + param := &tree.ExternParam{} + handled, err := parseHiveOptionKV(param, "filepath", "/data/") + assert.False(t, handled) + assert.NoError(t, err) + }) + + t.Run("false value", func(t *testing.T) { + param := &tree.ExternParam{} + handled, err := parseHiveOptionKV(param, "hive_partitioning", "false") + require.True(t, handled) + require.NoError(t, err) + assert.False(t, param.HivePartitioning) + }) + + t.Run("cols lowercased and trimmed", func(t *testing.T) { + param := &tree.ExternParam{} + handled, err := parseHiveOptionKV(param, "hive_partition_columns", " Year , MONTH , , Day ") + require.True(t, handled) + require.NoError(t, err) + assert.Equal(t, []string{"year", "month", "day"}, param.HivePartitionCols) + }) +} + +// ------------------------------------------------------------------------- +// Init*Param legacy-JSON hive branches and plain happy paths. +// ------------------------------------------------------------------------- + +// TestInitInfileParam_Plain exercises the normal option pass-through with +// filepath/format/compression/jsondata so the non-hive arms are covered too. 
+func TestInitInfileParam_Plain(t *testing.T) { + param := &tree.ExternParam{} + param.Option = []string{ + "filepath", "/data/x", + "compression", "gzip", + "format", "parquet", + } + require.NoError(t, InitInfileParam(param)) + assert.Equal(t, "/data/x", param.Filepath) + assert.Equal(t, "gzip", param.CompressType) + assert.Equal(t, "parquet", param.Format) + + // jsonline/jsondata branch + param = &tree.ExternParam{} + param.Option = []string{"filepath", "/f", "jsondata", "object"} + require.NoError(t, InitInfileParam(param)) + assert.Equal(t, "object", param.JsonData) + assert.Equal(t, "jsonline", param.Format) + + // csv default + param = &tree.ExternParam{} + param.Option = []string{"filepath", "/csv"} + require.NoError(t, InitInfileParam(param)) + assert.Equal(t, "csv", param.Format) +} + +// TestInitInfileParam_HiveLegacyOption exercises parseHiveOptionKV via +// InitInfileParam when Option[] still contains hive keys (simulating JSON +// that predates stripHiveOptionKeys). +func TestInitInfileParam_HiveLegacyOption(t *testing.T) { + param := &tree.ExternParam{} + param.Ctx = context.Background() + param.Option = []string{ + "filepath", "/data/", + "format", "parquet", + "hive_partitioning", "true", + "hive_partition_columns", "year,month", + } + require.NoError(t, InitInfileParam(param)) + assert.True(t, param.HivePartitioning) + assert.Equal(t, []string{"year", "month"}, param.HivePartitionCols) +} + +func TestInitInfileParam_Errors(t *testing.T) { + param := &tree.ExternParam{} + param.Ctx = context.Background() + // Unknown format + param.Option = []string{"filepath", "/x", "format", "orc"} + require.Error(t, InitInfileParam(param)) + + // Unknown jsondata + param = &tree.ExternParam{} + param.Ctx = context.Background() + param.Option = []string{"filepath", "/x", "jsondata", "ndjson"} + require.Error(t, InitInfileParam(param)) + + // Missing filepath + param = &tree.ExternParam{} + param.Ctx = context.Background() + param.Option = []string{"format", "parquet"} + require.Error(t, InitInfileParam(param)) + + // jsonline without jsondata + param = &tree.ExternParam{} + param.Ctx = context.Background() + param.Option = []string{"filepath", "/x", "format", "jsonline"} + require.Error(t, InitInfileParam(param)) + + // Unknown keyword + param = &tree.ExternParam{} + param.Ctx = context.Background() + param.Option = []string{"unknown", "val", "filepath", "/x"} + require.Error(t, InitInfileParam(param)) + + // Invalid hive_partitioning value + param = &tree.ExternParam{} + param.Ctx = context.Background() + param.Option = []string{"filepath", "/x", "format", "parquet", "hive_partitioning", "yes"} + require.Error(t, InitInfileParam(param)) + + // Columns with hive_partitioning disabled are rejected after legacy parsing. + param = &tree.ExternParam{} + param.Ctx = context.Background() + param.Option = []string{ + "filepath", "/x", + "format", "parquet", + "hive_partitioning", "false", + "hive_partition_columns", "year", + } + err := InitInfileParam(param) + require.Error(t, err) + assert.Contains(t, err.Error(), "requires hive_partitioning='true'") +} + +// TestInitS3Param_Plain exercises the S3 arm with normal options. 
+func TestInitS3Param_Plain(t *testing.T) { + param := &tree.ExternParam{} + param.Ctx = context.Background() + param.Option = []string{ + "endpoint", "https://s3.example.com", + "region", "us-west-2", + "access_key_id", "AK", + "secret_access_key", "SK", + "bucket", "my-bucket", + "filepath", "sales/", + "compression", "none", + "provider", "minio", + "role_arn", "arn:aws:iam::111:role/R", + "external_id", "ext", + "format", "parquet", + } + require.NoError(t, InitS3Param(param)) + assert.Equal(t, "https://s3.example.com", param.S3Param.Endpoint) + assert.Equal(t, "my-bucket", param.S3Param.Bucket) + assert.Equal(t, "sales/", param.Filepath) + assert.Equal(t, "parquet", param.Format) + + // jsondata jsonline path + param = &tree.ExternParam{} + param.Ctx = context.Background() + param.Option = []string{"bucket", "b", "jsondata", "array"} + require.NoError(t, InitS3Param(param)) + assert.Equal(t, "jsonline", param.Format) +} + +func TestInitS3Param_HiveLegacyOption(t *testing.T) { + param := &tree.ExternParam{} + param.Ctx = context.Background() + param.Option = []string{ + "bucket", "b", + "format", "parquet", + "hive_partitioning", "true", + "hive_partition_columns", "year", + } + require.NoError(t, InitS3Param(param)) + assert.True(t, param.HivePartitioning) + assert.Equal(t, []string{"year"}, param.HivePartitionCols) +} + +func TestInitS3Param_Errors(t *testing.T) { + param := &tree.ExternParam{} + param.Ctx = context.Background() + // Bad format + param.Option = []string{"bucket", "b", "format", "orc"} + require.Error(t, InitS3Param(param)) + + // Bad jsondata + param = &tree.ExternParam{} + param.Ctx = context.Background() + param.Option = []string{"bucket", "b", "jsondata", "bad"} + require.Error(t, InitS3Param(param)) + + // jsonline without jsondata + param = &tree.ExternParam{} + param.Ctx = context.Background() + param.Option = []string{"bucket", "b", "format", "jsonline"} + require.Error(t, InitS3Param(param)) + + // Unknown key + param = &tree.ExternParam{} + param.Ctx = context.Background() + param.Option = []string{"bogus", "x"} + require.Error(t, InitS3Param(param)) + + // Invalid hive_partitioning boolean + param = &tree.ExternParam{} + param.Ctx = context.Background() + param.Option = []string{"bucket", "b", "format", "parquet", "hive_partitioning", "maybe"} + require.Error(t, InitS3Param(param)) + + // Columns with hive_partitioning disabled are rejected after legacy parsing. + param = &tree.ExternParam{} + param.Ctx = context.Background() + param.Option = []string{ + "bucket", "b", + "format", "parquet", + "hive_partitioning", "false", + "hive_partition_columns", "year", + } + err := InitS3Param(param) + require.Error(t, err) + assert.Contains(t, err.Error(), "requires hive_partitioning='true'") +} + +// ------------------------------------------------------------------------- +// build_ddl.go hive DDL helpers. +// ------------------------------------------------------------------------- + +func TestParseHiveOptionsFromRawOptions_AllPaths(t *testing.T) { + ctx := context.Background() + + // Absent → (false, nil, nil) + en, cols, err := parseHiveOptionsFromRawOptions(ctx, []string{"filepath", "/x"}) + require.NoError(t, err) + assert.False(t, en) + assert.Nil(t, cols) + + // Explicit false → (false, nil, nil) + en, cols, err = parseHiveOptionsFromRawOptions(ctx, []string{"hive_partitioning", "false"}) + require.NoError(t, err) + assert.False(t, en) + assert.Nil(t, cols) + + // Columns without an enabled hive_partitioning flag are inconsistent. 
+ _, _, err = parseHiveOptionsFromRawOptions(ctx, []string{"hive_partition_columns", "year"}) + require.Error(t, err) + assert.Contains(t, err.Error(), "requires hive_partitioning='true'") + + _, _, err = parseHiveOptionsFromRawOptions(ctx, + []string{"hive_partitioning", "false", "hive_partition_columns", "year"}) + require.Error(t, err) + assert.Contains(t, err.Error(), "requires hive_partitioning='true'") + + // Invalid value → error + _, _, err = parseHiveOptionsFromRawOptions(ctx, []string{"hive_partitioning", "yes"}) + require.Error(t, err) + + // true + empty cols → (true, nil, nil) (caller enforces non-empty) + en, cols, err = parseHiveOptionsFromRawOptions(ctx, []string{"hive_partitioning", "true"}) + require.NoError(t, err) + assert.True(t, en) + assert.Nil(t, cols) + + // true + cols — trimmed split + en, cols, err = parseHiveOptionsFromRawOptions(ctx, + []string{"hive_partitioning", "TRUE", "hive_partition_columns", " year ,, month "}) + require.NoError(t, err) + assert.True(t, en) + assert.Equal(t, []string{"year", "month"}, cols) +} + +func TestRejectDuplicateKeys(t *testing.T) { + ctx := context.Background() + // No duplicates → nil. + err := rejectDuplicateKeys(ctx, + []string{"format", "parquet", "filepath", "/x"}, + []string{"format", "filepath"}) + assert.NoError(t, err) + + // Key not in list is tolerated. + err = rejectDuplicateKeys(ctx, + []string{"compression", "gzip", "compression", "none"}, + []string{"format"}) + assert.NoError(t, err) + + // Duplicate of a watched key → error. + err = rejectDuplicateKeys(ctx, + []string{"format", "parquet", "format", "csv"}, + []string{"format"}) + require.Error(t, err) + assert.Contains(t, err.Error(), "duplicate option key 'format'") +} + +func TestGetRawOption(t *testing.T) { + opts := []string{"Filepath", "/x", "format", "parquet"} + assert.Equal(t, "/x", getRawOption(opts, "filepath")) + assert.Equal(t, "parquet", getRawOption(opts, "format")) + assert.Equal(t, "", getRawOption(opts, "bucket")) +} + +func TestStripHiveOptionKeys(t *testing.T) { + in := []string{ + "filepath", "/x", + "hive_partitioning", "true", + "format", "parquet", + "hive_partition_columns", "year,month", + "compression", "gzip", + } + out := stripHiveOptionKeys(in) + assert.Equal(t, []string{ + "filepath", "/x", + "format", "parquet", + "compression", "gzip", + }, out) + + // Idempotent / no hive keys + in2 := []string{"filepath", "/x", "format", "parquet"} + assert.Equal(t, in2, stripHiveOptionKeys(in2)) + + // All hive keys + in3 := []string{"hive_partitioning", "true", "hive_partition_columns", "y"} + assert.Equal(t, []string{}, stripHiveOptionKeys(in3)) +} + +func TestFindColInTableDefCaseInsensitive(t *testing.T) { + td := []*plan.ColDef{ + {Name: "year"}, + {Name: "Month"}, + {Name: "Day"}, + } + got := findColInTableDefCaseInsensitive(td, "YEAR") + require.NotNil(t, got) + assert.Equal(t, "year", got.Name) + + got = findColInTableDefCaseInsensitive(td, "month") + require.NotNil(t, got) + assert.Equal(t, "Month", got.Name) + + assert.Nil(t, findColInTableDefCaseInsensitive(td, "nonexistent")) +} + +// ------------------------------------------------------------------------- +// validateAndSetHivePartitionOptions — every branch (happy + negative). +// ------------------------------------------------------------------------- + +// makeHivePlan builds a minimal plan.CreateTable with the given columns for +// validateAndSetHivePartitionOptions testing. 
+func makeHivePlan(cols ...*plan.ColDef) *plan.CreateTable { + return &plan.CreateTable{ + TableDef: &plan.TableDef{Cols: cols}, + } +} + +func TestValidateAndSetHivePartitionOptions_Disabled(t *testing.T) { + // hive_partitioning absent → returns nil, does not touch stmt.Param. + stmt := &tree.CreateTable{Param: &tree.ExternParam{}} + stmt.Param.Option = []string{"filepath", "/x", "format", "parquet"} + ct := makeHivePlan(&plan.ColDef{Name: "id", Typ: plan.Type{Id: int32(types.T_int32)}}) + require.NoError(t, validateAndSetHivePartitionOptions(context.Background(), stmt, ct)) + assert.False(t, stmt.Param.HivePartitioning) +} + +func TestValidateAndSetHivePartitionOptions_DisabledWithColumnsRejected(t *testing.T) { + cases := []struct { + name string + opts []string + }{ + { + name: "columns without hive_partitioning", + opts: []string{"filepath", "/x", "format", "parquet", "hive_partition_columns", "year"}, + }, + { + name: "columns with hive_partitioning false", + opts: []string{ + "filepath", "/x", + "format", "parquet", + "hive_partitioning", "false", + "hive_partition_columns", "year", + }, + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + stmt := &tree.CreateTable{Param: &tree.ExternParam{}} + stmt.Param.Option = tc.opts + ct := makeHivePlan(&plan.ColDef{Name: "year", Typ: plan.Type{Id: int32(types.T_int32)}}) + err := validateAndSetHivePartitionOptions(context.Background(), stmt, ct) + require.Error(t, err) + assert.Contains(t, err.Error(), "requires hive_partitioning='true'") + }) + } +} + +func TestValidateAndSetHivePartitionOptions_HappyPath(t *testing.T) { + stmt := &tree.CreateTable{Param: &tree.ExternParam{}} + stmt.Param.Option = []string{ + "filepath", "/data/", + "format", "parquet", + "hive_partitioning", "true", + "hive_partition_columns", "Year", + } + ct := makeHivePlan( + &plan.ColDef{Name: "id", Typ: plan.Type{Id: int32(types.T_int32)}}, + &plan.ColDef{ + Name: "year", + Typ: plan.Type{Id: int32(types.T_int32)}, + Default: &plan.Default{NullAbility: true}, + }, + ) + require.NoError(t, validateAndSetHivePartitionOptions(context.Background(), stmt, ct)) + assert.True(t, stmt.Param.HivePartitioning) + assert.Equal(t, []string{"year"}, stmt.Param.HivePartitionCols) + require.Equal(t, 1, len(stmt.Param.HivePartitionColTypes)) + assert.Equal(t, int32(types.T_int32), stmt.Param.HivePartitionColTypes[0].Id) + assert.True(t, stmt.Param.HivePartitionColTypes[0].NullAbility) + // Option[] should be stripped of hive keys. 
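+ // (Sketch) Equivalent whole-slice form of the pairwise check below:
+ // stripHiveOptionKeys is idempotent, so re-stripping an already-stripped
+ // Option slice must be a no-op.
+ assert.Equal(t, stmt.Param.Option, stripHiveOptionKeys(stmt.Param.Option))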
+ for i := 0; i < len(stmt.Param.Option); i += 2 { + assert.NotEqual(t, "hive_partitioning", stmt.Param.Option[i]) + assert.NotEqual(t, "hive_partition_columns", stmt.Param.Option[i]) + } +} + +func TestValidateAndSetHivePartitionOptions_MissingCols(t *testing.T) { + stmt := &tree.CreateTable{Param: &tree.ExternParam{}} + stmt.Param.Option = []string{"format", "parquet", "hive_partitioning", "true"} + ct := makeHivePlan() + err := validateAndSetHivePartitionOptions(context.Background(), stmt, ct) + require.Error(t, err) + assert.Contains(t, err.Error(), "hive_partition_columns is required") +} + +func TestValidateAndSetHivePartitionOptions_DuplicateHiveKey(t *testing.T) { + stmt := &tree.CreateTable{Param: &tree.ExternParam{}} + stmt.Param.Option = []string{ + "hive_partitioning", "true", + "hive_partitioning", "false", + "hive_partition_columns", "year", + } + ct := makeHivePlan() + err := validateAndSetHivePartitionOptions(context.Background(), stmt, ct) + require.Error(t, err) + assert.Contains(t, err.Error(), "duplicate option key") +} + +func TestValidateAndSetHivePartitionOptions_DuplicateFormat(t *testing.T) { + stmt := &tree.CreateTable{Param: &tree.ExternParam{}} + stmt.Param.Option = []string{ + "format", "parquet", + "format", "csv", + "hive_partitioning", "true", + "hive_partition_columns", "year", + } + ct := makeHivePlan(&plan.ColDef{Name: "year", Typ: plan.Type{Id: int32(types.T_int32)}}) + err := validateAndSetHivePartitionOptions(context.Background(), stmt, ct) + require.Error(t, err) + assert.Contains(t, err.Error(), "duplicate option key 'format'") +} + +func TestValidateAndSetHivePartitionOptions_NonParquetFormat(t *testing.T) { + stmt := &tree.CreateTable{Param: &tree.ExternParam{}} + stmt.Param.Option = []string{ + "format", "csv", + "hive_partitioning", "true", + "hive_partition_columns", "year", + } + ct := makeHivePlan(&plan.ColDef{Name: "year", Typ: plan.Type{Id: int32(types.T_int32)}}) + err := validateAndSetHivePartitionOptions(context.Background(), stmt, ct) + require.Error(t, err) + assert.Contains(t, err.Error(), "only supports format='parquet'") +} + +func TestValidateAndSetHivePartitionOptions_StageFilepath(t *testing.T) { + stmt := &tree.CreateTable{Param: &tree.ExternParam{}} + stmt.Param.Option = []string{ + "filepath", "stage://mystage/data/", + "format", "parquet", + "hive_partitioning", "true", + "hive_partition_columns", "year", + } + ct := makeHivePlan(&plan.ColDef{Name: "year", Typ: plan.Type{Id: int32(types.T_int32)}}) + err := validateAndSetHivePartitionOptions(context.Background(), stmt, ct) + require.Error(t, err) + assert.Contains(t, err.Error(), "does not support stage external tables") +} + +func TestValidateAndSetHivePartitionOptions_StageNameSet(t *testing.T) { + stmt := &tree.CreateTable{Param: &tree.ExternParam{}} + stmt.Param.StageName = "mystage" + stmt.Param.Option = []string{ + "format", "parquet", + "hive_partitioning", "true", + "hive_partition_columns", "year", + } + ct := makeHivePlan(&plan.ColDef{Name: "year", Typ: plan.Type{Id: int32(types.T_int32)}}) + err := validateAndSetHivePartitionOptions(context.Background(), stmt, ct) + require.Error(t, err) + assert.Contains(t, err.Error(), "does not support stage external tables") +} + +func TestValidateAndSetHivePartitionOptions_ColumnNotFound(t *testing.T) { + stmt := &tree.CreateTable{Param: &tree.ExternParam{}} + stmt.Param.Option = []string{ + "format", "parquet", + "hive_partitioning", "true", + "hive_partition_columns", "year", + } + ct := makeHivePlan(&plan.ColDef{Name: "id", 
Typ: plan.Type{Id: int32(types.T_int32)}}) + err := validateAndSetHivePartitionOptions(context.Background(), stmt, ct) + require.Error(t, err) + assert.Contains(t, err.Error(), "not found in table columns") +} + +func TestValidateAndSetHivePartitionOptions_HiddenColumn(t *testing.T) { + stmt := &tree.CreateTable{Param: &tree.ExternParam{}} + stmt.Param.Option = []string{ + "format", "parquet", + "hive_partitioning", "true", + "hive_partition_columns", "year", + } + ct := makeHivePlan(&plan.ColDef{Name: "year", Typ: plan.Type{Id: int32(types.T_int32)}, Hidden: true}) + err := validateAndSetHivePartitionOptions(context.Background(), stmt, ct) + require.Error(t, err) + assert.Contains(t, err.Error(), "cannot be a hidden column") +} + +func TestValidateAndSetHivePartitionOptions_GeneratedColumn(t *testing.T) { + stmt := &tree.CreateTable{Param: &tree.ExternParam{}} + stmt.Param.Option = []string{ + "format", "parquet", + "hive_partitioning", "true", + "hive_partition_columns", "year", + } + ct := makeHivePlan(&plan.ColDef{ + Name: "year", + Typ: plan.Type{Id: int32(types.T_int32)}, + GeneratedCol: &plan.GeneratedCol{}, + }) + err := validateAndSetHivePartitionOptions(context.Background(), stmt, ct) + require.Error(t, err) + assert.Contains(t, err.Error(), "cannot be a generated column") +} + +func TestValidateAndSetHivePartitionOptions_VectorType(t *testing.T) { + stmt := &tree.CreateTable{Param: &tree.ExternParam{}} + stmt.Param.Option = []string{ + "format", "parquet", + "hive_partitioning", "true", + "hive_partition_columns", "emb", + } + ct := makeHivePlan(&plan.ColDef{Name: "emb", Typ: plan.Type{Id: int32(types.T_array_float32)}}) + err := validateAndSetHivePartitionOptions(context.Background(), stmt, ct) + require.Error(t, err) + assert.Contains(t, err.Error(), "cannot be a VECTOR type") + + ct = makeHivePlan(&plan.ColDef{Name: "emb", Typ: plan.Type{Id: int32(types.T_array_float64)}}) + err = validateAndSetHivePartitionOptions(context.Background(), stmt, ct) + require.Error(t, err) + assert.Contains(t, err.Error(), "cannot be a VECTOR type") +} + +func TestValidateAndSetHivePartitionOptions_DuplicatePartitionColumn(t *testing.T) { + stmt := &tree.CreateTable{Param: &tree.ExternParam{}} + stmt.Param.Option = []string{ + "format", "parquet", + "hive_partitioning", "true", + "hive_partition_columns", "year,year", + } + ct := makeHivePlan(&plan.ColDef{Name: "year", Typ: plan.Type{Id: int32(types.T_int32)}}) + err := validateAndSetHivePartitionOptions(context.Background(), stmt, ct) + require.Error(t, err) + assert.Contains(t, err.Error(), "duplicate partition column") +} + +func TestValidateAndSetHivePartitionOptions_MultiLevelAndNullability(t *testing.T) { + // Multi-level partition columns; mixing with/without Default to exercise + // NullAbility default. 
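+ // (Sketch) The nullability rule exercised by this test, restated as a
+ // hypothetical local helper: a column with a nil Default is treated as
+ // nullable; otherwise Default.NullAbility decides.
+ nullableOf := func(c *plan.ColDef) bool {
+ return c.Default == nil || c.Default.NullAbility
+ }
+ assert.True(t, nullableOf(&plan.ColDef{}), "nil Default is nullable")
+ assert.False(t, nullableOf(&plan.ColDef{Default: &plan.Default{NullAbility: false}}))
+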
+ stmt := &tree.CreateTable{Param: &tree.ExternParam{}} + stmt.Param.Option = []string{ + "filepath", "/data/", + "format", "parquet", + "hive_partitioning", "true", + "hive_partition_columns", "year,month", + } + ct := makeHivePlan( + // year: NOT NULL (Default.NullAbility=false) + &plan.ColDef{ + Name: "year", + Typ: plan.Type{Id: int32(types.T_int32)}, + Default: &plan.Default{NullAbility: false}, + }, + // month: no Default → treated as nullable (default true) + &plan.ColDef{Name: "month", Typ: plan.Type{Id: int32(types.T_varchar), Width: 2}}, + ) + require.NoError(t, validateAndSetHivePartitionOptions(context.Background(), stmt, ct)) + require.Len(t, stmt.Param.HivePartitionColTypes, 2) + assert.False(t, stmt.Param.HivePartitionColTypes[0].NullAbility, "year declared NOT NULL") + assert.True(t, stmt.Param.HivePartitionColTypes[1].NullAbility, "month default nullable when Default is nil") + assert.Equal(t, int32(2), stmt.Param.HivePartitionColTypes[1].Width) +} + +func TestValidateAndSetHivePartitionOptions_InvalidHiveValue(t *testing.T) { + // parseHiveOptionsFromRawOptions returns an error path. + stmt := &tree.CreateTable{Param: &tree.ExternParam{}} + stmt.Param.Option = []string{ + "format", "parquet", + "hive_partitioning", "maybe", + "hive_partition_columns", "year", + } + ct := makeHivePlan(&plan.ColDef{Name: "year", Typ: plan.Type{Id: int32(types.T_int32)}}) + err := validateAndSetHivePartitionOptions(context.Background(), stmt, ct) + require.Error(t, err) + assert.Contains(t, err.Error(), "must be 'true' or 'false'") +} + +// ------------------------------------------------------------------------- +// InitStageS3Param — happy path + credential-missing error paths. +// ------------------------------------------------------------------------- + +func TestInitStageS3Param_HappyAndErrors(t *testing.T) { + parse := func(raw string) *url.URL { + u, err := url.Parse(raw) + require.NoError(t, err) + return u + } + + baseCreds := map[string]string{ + stage.PARAMKEY_AWS_KEY_ID: "AK", + stage.PARAMKEY_AWS_SECRET_KEY: "SK", + stage.PARAMKEY_AWS_REGION: "us-west-2", + stage.PARAMKEY_ENDPOINT: "https://s3.example.com", + } + + t.Run("happy_path", func(t *testing.T) { + param := &tree.ExternParam{} + param.Ctx = context.Background() + sd := stage.StageDef{ + Url: parse("s3://my-bucket/prefix/"), + Credentials: baseCreds, + } + require.NoError(t, InitStageS3Param(param, sd)) + assert.Equal(t, tree.S3, param.ScanType) + assert.Equal(t, "my-bucket", param.S3Param.Bucket) + assert.Equal(t, "AK", param.S3Param.APIKey) + assert.Equal(t, "SK", param.S3Param.APISecret) + assert.Equal(t, "us-west-2", param.S3Param.Region) + }) + + t.Run("bad_protocol", func(t *testing.T) { + param := &tree.ExternParam{} + param.Ctx = context.Background() + sd := stage.StageDef{Url: parse("http://x/")} + require.Error(t, InitStageS3Param(param, sd)) + }) + + t.Run("raw_query_rejected", func(t *testing.T) { + param := &tree.ExternParam{} + param.Ctx = context.Background() + sd := stage.StageDef{Url: parse("s3://b/p/?q=1")} + require.Error(t, InitStageS3Param(param, sd)) + }) + + // Each missing-cred path. 
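+ // (Sketch) Each subtest clones baseCreds minus one key. A hypothetical
+ // helper spells out that copy-minus-one shape; the loop below inlines the
+ // same logic, so this is illustrative only.
+ dropKey := func(m map[string]string, k string) map[string]string {
+ out := make(map[string]string, len(m))
+ for kk, vv := range m {
+ if kk != k {
+ out[kk] = vv
+ }
+ }
+ return out
+ }
+ require.Len(t, dropKey(baseCreds, stage.PARAMKEY_ENDPOINT), len(baseCreds)-1)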
+ for _, k := range []string{ + stage.PARAMKEY_AWS_KEY_ID, stage.PARAMKEY_AWS_SECRET_KEY, + stage.PARAMKEY_AWS_REGION, stage.PARAMKEY_ENDPOINT, + } { + t.Run("missing_"+k, func(t *testing.T) { + creds := map[string]string{} + for kk, vv := range baseCreds { + if kk != k { + creds[kk] = vv + } + } + param := &tree.ExternParam{} + param.Ctx = context.Background() + sd := stage.StageDef{ + Url: parse("s3://b/p/"), + Credentials: creds, + } + err := InitStageS3Param(param, sd) + require.Error(t, err) + assert.Contains(t, err.Error(), k) + }) + } + + t.Run("option_format_csv_invalid", func(t *testing.T) { + param := &tree.ExternParam{} + param.Ctx = context.Background() + param.Option = []string{"format", "orc"} + sd := stage.StageDef{ + Url: parse("s3://b/p/"), + Credentials: baseCreds, + } + require.Error(t, InitStageS3Param(param, sd)) + }) + + t.Run("option_unknown_key", func(t *testing.T) { + param := &tree.ExternParam{} + param.Ctx = context.Background() + param.Option = []string{"unknown", "x"} + sd := stage.StageDef{ + Url: parse("s3://b/p/"), + Credentials: baseCreds, + } + require.Error(t, InitStageS3Param(param, sd)) + }) + + t.Run("jsonline_without_jsondata", func(t *testing.T) { + param := &tree.ExternParam{} + param.Ctx = context.Background() + param.Option = []string{"format", "jsonline"} + sd := stage.StageDef{ + Url: parse("s3://b/p/"), + Credentials: baseCreds, + } + require.Error(t, InitStageS3Param(param, sd)) + }) + + t.Run("hive_legacy_option_under_stage", func(t *testing.T) { + // The defense-in-depth hive branch under InitStageS3Param. + param := &tree.ExternParam{} + param.Ctx = context.Background() + param.Option = []string{"hive_partitioning", "true", "hive_partition_columns", "year"} + sd := stage.StageDef{ + Url: parse("s3://b/p/"), + Credentials: baseCreds, + } + require.NoError(t, InitStageS3Param(param, sd)) + assert.True(t, param.HivePartitioning) + }) + + t.Run("hive_legacy_columns_disabled_under_stage", func(t *testing.T) { + param := &tree.ExternParam{} + param.Ctx = context.Background() + param.Option = []string{"hive_partitioning", "false", "hive_partition_columns", "year"} + sd := stage.StageDef{ + Url: parse("s3://b/p/"), + Credentials: baseCreds, + } + err := InitStageS3Param(param, sd) + require.Error(t, err) + assert.Contains(t, err.Error(), "requires hive_partitioning='true'") + }) +} + +// ------------------------------------------------------------------------- +// InitInfileOrStageParam — non-stage pass-through. +// ------------------------------------------------------------------------- + +func TestInitInfileOrStageParam_NonStageFallsThrough(t *testing.T) { + param := &tree.ExternParam{} + param.Ctx = context.Background() + param.Option = []string{"filepath", "/data/x", "format", "parquet"} + // proc is unused for the non-stage branch. + require.NoError(t, InitInfileOrStageParam(param, nil)) + assert.Equal(t, "/data/x", param.Filepath) + assert.Equal(t, "parquet", param.Format) +} + +// Avoid unused import warning when some branches of types are not directly referenced. 
+var _ = types.T_int32 diff --git a/test/distributed/cases/table/hive_partition_external_table.result b/test/distributed/cases/table/hive_partition_external_table.result new file mode 100644 index 0000000000000..d1c5751d98281 --- /dev/null +++ b/test/distributed/cases/table/hive_partition_external_table.result @@ -0,0 +1,512 @@ +drop database if exists hive_part_db; +create database hive_part_db; +use hive_part_db; +drop table if exists hive_single; +create external table hive_single ( +id int, +amount double, +year int +) infile{'filepath'='$resources/hive_partition/single_level/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='year'}; +drop table if exists hive_err1; +create external table hive_err1 ( +id int, year int +) infile{'filepath'='$resources/hive_partition/single_level/', 'format'='parquet', 'hive_partitioning'='true'}; +invalid configuration: hive_partition_columns is required when hive_partitioning is enabled +drop table if exists hive_err2; +create external table hive_err2 ( +id int, year int +) infile{'filepath'='$resources/hive_partition/single_level/', 'format'='csv', 'hive_partitioning'='true', 'hive_partition_columns'='year'}; +invalid configuration: hive_partitioning currently only supports format='parquet', got 'csv' +drop table if exists hive_err3; +create external table hive_err3 ( +id int, amount double +) infile{'filepath'='$resources/hive_partition/single_level/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='nonexistent'}; +invalid configuration: partition column 'nonexistent' not found in table columns +drop table if exists hive_err4; +create external table hive_err4 ( +id int, year int +) infile{'filepath'='$resources/hive_partition/single_level/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partitioning'='false', 'hive_partition_columns'='year'}; +invalid configuration: duplicate option key 'hive_partitioning' +drop table if exists hive_err5; +create external table hive_err5 ( +id int, year int +) infile{'filepath'='$resources/hive_partition/single_level/', 'format'='parquet', 'hive_partitioning'='yes', 'hive_partition_columns'='year'}; +invalid configuration: hive_partitioning must be 'true' or 'false', got 'yes' +drop table if exists hive_err6; +create external table hive_err6 ( +id int, YEAR int +) infile{'filepath'='$resources/hive_partition/single_level/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='YeaR'}; +drop table if exists hive_err7; +create external table hive_err7 ( +id int, year int, amount double +) infile{'filepath'='$resources/hive_partition/single_level/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='year,year'}; +invalid configuration: duplicate partition column 'year' +drop table if exists hive_err8; +create external table hive_err8 ( +id int, +emb vecf32(3) +) infile{'filepath'='$resources/hive_partition/single_level/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='emb'}; +invalid configuration: partition column 'emb' cannot be a VECTOR type +show create table hive_single; +➤ Table[12,-1,0] ¦ Create Table[12,-1,0] 𝄀 +hive_single ¦ CREATE EXTERNAL TABLE `hive_single` ( + `id` int DEFAULT NULL, + `amount` double DEFAULT NULL, + `year` int DEFAULT NULL +) INFILE{'FILEPATH'='','COMPRESSION'='','FORMAT'='parquet','JSONDATA'='','HIVE_PARTITIONING'='true','HIVE_PARTITION_COLUMNS'='year'} +load data infile '$resources/hive_partition/single_level/year=2024/data.parquet' into table hive_single; 
+invalid input: cannot insert/update/delete from external table +drop table if exists hive_disabled; +create external table hive_disabled ( +id int, +amount double +) infile{'filepath'='$resources/hive_partition/non_hive/simple.parquet', 'format'='parquet', 'hive_partitioning'='false'}; +select count(*) from hive_disabled; +➤ count(*)[-5,64,0] 𝄀 +3 +select count(*) as cnt from hive_single; +➤ cnt[-5,64,0] 𝄀 +25 +select year, count(*) as cnt from hive_single where year = 2024 group by year; +➤ year[4,32,0] ¦ cnt[-5,64,0] 𝄀 +2024 ¦ 5 +select year, count(*) as cnt from hive_single where year in (2020, 2024) group by year order by year; +➤ year[4,32,0] ¦ cnt[-5,64,0] 𝄀 +2020 ¦ 5 𝄀 +2024 ¦ 5 +select year, count(*) as cnt from hive_single where year in (2022) group by year; +➤ year[4,32,0] ¦ cnt[-5,64,0] 𝄀 +2022 ¦ 5 +select year, count(*) as cnt from hive_single where year not in (2020, 2021, 2022) group by year order by year; +➤ year[4,32,0] ¦ cnt[-5,64,0] 𝄀 +2023 ¦ 5 𝄀 +2024 ¦ 5 +select count(*) as cnt from hive_single where year > 2022; +➤ cnt[-5,64,0] 𝄀 +10 +select year, count(*) as cnt from hive_single where year between 2021 and 2023 group by year order by year; +➤ year[4,32,0] ¦ cnt[-5,64,0] 𝄀 +2021 ¦ 5 𝄀 +2022 ¦ 5 𝄀 +2023 ¦ 5 +select year, count(*) as cnt from hive_single where year = 2020 or year = 2024 group by year order by year; +➤ year[4,32,0] ¦ cnt[-5,64,0] 𝄀 +2020 ¦ 5 𝄀 +2024 ¦ 5 +select distinct year from hive_single order by year; +➤ year[4,32,0] 𝄀 +2020 𝄀 +2021 𝄀 +2022 𝄀 +2023 𝄀 +2024 +select sum(amount) as total from hive_single where year = 2020; +➤ total[8,54,0] 𝄀 +105.0 +select year, sum(amount) as total from hive_single group by year having sum(amount) >= 100 order by year; +➤ year[4,32,0] ¦ total[8,54,0] 𝄀 +2020 ¦ 105.0 𝄀 +2021 ¦ 105.0 𝄀 +2022 ¦ 105.0 𝄀 +2023 ¦ 105.0 𝄀 +2024 ¦ 105.0 +select count(distinct year) as distinct_years from hive_single; +➤ distinct_years[-5,64,0] 𝄀 +5 +select count(*) from hive_single where year + 1 = 2025; +➤ count(*)[-5,64,0] 𝄀 +5 +select count(*) from hive_single where cast(year as varchar) = '2024'; +➤ count(*)[-5,64,0] 𝄀 +5 +select id, year from hive_single order by year asc, id desc limit 5; +➤ id[4,32,0] ¦ year[4,32,0] 𝄀 +20204 ¦ 2020 𝄀 +20203 ¦ 2020 𝄀 +20202 ¦ 2020 𝄀 +20201 ¦ 2020 𝄀 +20200 ¦ 2020 +select count(*) as cnt from hive_single where year is not null; +➤ cnt[-5,64,0] 𝄀 +25 +select year, id from hive_single where year = 2023 order by id; +➤ year[4,32,0] ¦ id[4,32,0] 𝄀 +2023 ¦ 20230 𝄀 +2023 ¦ 20231 𝄀 +2023 ¦ 20232 𝄀 +2023 ¦ 20233 𝄀 +2023 ¦ 20234 +select year, cnt from ( +select year, count(*) as cnt from hive_single where year in (2020, 2021) group by year +) t order by year; +➤ year[4,32,0] ¦ cnt[-5,64,0] 𝄀 +2020 ¦ 5 𝄀 +2021 ¦ 5 +drop table if exists hive_multi; +create external table hive_multi ( +id int, +amount double, +year int, +month varchar(2) +) infile{'filepath'='$resources/hive_partition/multi_level/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='year,month'}; +select count(*) as cnt from hive_multi; +➤ cnt[-5,64,0] 𝄀 +18 +select year, count(*) as cnt from hive_multi where year = 2024 group by year; +➤ year[4,32,0] ¦ cnt[-5,64,0] 𝄀 +2024 ¦ 9 +select month, count(*) as cnt from hive_multi where month = '01' group by month order by month; +➤ month[12,-1,0] ¦ cnt[-5,64,0] 𝄀 +01 ¦ 6 +select year, month, count(*) as cnt from hive_multi where year = 2024 and month = '01' group by year, month; +➤ year[4,32,0] ¦ month[12,-1,0] ¦ cnt[-5,64,0] 𝄀 +2024 ¦ 01 ¦ 3 +select year, month, count(*) as cnt from 
hive_multi +where year in (2024, 2025) and month in ('01', '02') +group by year, month order by year, month; +➤ year[4,32,0] ¦ month[12,-1,0] ¦ cnt[-5,64,0] 𝄀 +2024 ¦ 01 ¦ 3 𝄀 +2024 ¦ 02 ¦ 3 𝄀 +2025 ¦ 01 ¦ 3 𝄀 +2025 ¦ 02 ¦ 3 +select year, month, sum(amount) as total from hive_multi group by year, month order by year, month; +➤ year[4,32,0] ¦ month[12,-1,0] ¦ total[8,54,0] 𝄀 +2024 ¦ 01 ¦ 6072.03 𝄀 +2024 ¦ 02 ¦ 6072.0599999999995 𝄀 +2024 ¦ 03 ¦ 6072.09 𝄀 +2025 ¦ 01 ¦ 6075.03 𝄀 +2025 ¦ 02 ¦ 6075.0599999999995 𝄀 +2025 ¦ 03 ¦ 6075.09 +select id, year, month, amount from hive_multi where year = 2025 order by month asc, id asc limit 6; +➤ id[4,32,0] ¦ year[4,32,0] ¦ month[12,-1,0] ¦ amount[8,54,0] 𝄀 +202510 ¦ 2025 ¦ 01 ¦ 2025.01 𝄀 +202511 ¦ 2025 ¦ 01 ¦ 2025.01 𝄀 +202512 ¦ 2025 ¦ 01 ¦ 2025.01 𝄀 +202520 ¦ 2025 ¦ 02 ¦ 2025.02 𝄀 +202521 ¦ 2025 ¦ 02 ¦ 2025.02 𝄀 +202522 ¦ 2025 ¦ 02 ¦ 2025.02 +select year, count(*) as cnt from hive_multi group by year having count(*) >= 9 order by year; +➤ year[4,32,0] ¦ cnt[-5,64,0] 𝄀 +2024 ¦ 9 𝄀 +2025 ¦ 9 +select distinct year, month from hive_multi order by year, month; +➤ year[4,32,0] ¦ month[12,-1,0] 𝄀 +2024 ¦ 01 𝄀 +2024 ¦ 02 𝄀 +2024 ¦ 03 𝄀 +2025 ¦ 01 𝄀 +2025 ¦ 02 𝄀 +2025 ¦ 03 +select a.year, a.month, count(*) as cnt +from hive_multi a join hive_multi b +on a.year = b.year and a.month = b.month and a.id = b.id +where a.year = 2024 +group by a.year, a.month order by a.year, a.month; +➤ year[4,32,0] ¦ month[12,-1,0] ¦ cnt[-5,64,0] 𝄀 +2024 ¦ 01 ¦ 3 𝄀 +2024 ¦ 02 ¦ 3 𝄀 +2024 ¦ 03 ¦ 3 +select year, mc from ( +select year, count(distinct month) as mc from hive_multi group by year +) t order by year; +➤ year[4,32,0] ¦ mc[-5,64,0] 𝄀 +2024 ¦ 3 𝄀 +2025 ¦ 3 +drop table if exists hive_string; +create external table hive_string ( +id int, +amount double, +country varchar(10) +) infile{'filepath'='$resources/hive_partition/string_part/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='country'}; +select country, count(*) as cnt from hive_string where country = 'US' group by country; +➤ country[12,-1,0] ¦ cnt[-5,64,0] 𝄀 +US ¦ 4 +select country, count(*) as cnt from hive_string where country in ('US', 'CN') group by country order by country; +➤ country[12,-1,0] ¦ cnt[-5,64,0] 𝄀 +CN ¦ 4 𝄀 +US ¦ 4 +select country, count(*) as cnt from hive_string where country like 'U%' group by country; +➤ country[12,-1,0] ¦ cnt[-5,64,0] 𝄀 +US ¦ 4 +select country, count(*) as cnt from hive_string group by country order by country; +➤ country[12,-1,0] ¦ cnt[-5,64,0] 𝄀 +CN ¦ 4 𝄀 +JP ¦ 4 𝄀 +US ¦ 4 +select country, count(*) as cnt from hive_string where country != 'JP' group by country order by country; +➤ country[12,-1,0] ¦ cnt[-5,64,0] 𝄀 +CN ¦ 4 𝄀 +US ¦ 4 +select id, country from hive_string order by country, id limit 6; +➤ id[4,32,0] ¦ country[12,-1,0] 𝄀 +200 ¦ CN 𝄀 +201 ¦ CN 𝄀 +202 ¦ CN 𝄀 +203 ¦ CN 𝄀 +300 ¦ JP 𝄀 +301 ¦ JP +select country, length(country) as ln from hive_string group by country order by country; +➤ country[12,-1,0] ¦ ln[-5,64,0] 𝄀 +CN ¦ 2 𝄀 +JP ¦ 2 𝄀 +US ¦ 2 +drop table if exists hive_null; +create external table hive_null ( +id int, +amount double, +year int +) infile{'filepath'='$resources/hive_partition/null_part/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='year'}; +select id, year from hive_null order by id; +➤ id[4,32,0] ¦ year[4,32,0] 𝄀 +1 ¦ 2024 𝄀 +2 ¦ 2024 𝄀 +3 ¦ 2024 𝄀 +4 ¦ null 𝄀 +5 ¦ null +select count(*) as cnt from hive_null where year is null; +➤ cnt[-5,64,0] 𝄀 +2 +select count(*) as cnt from hive_null where year 
is not null; +➤ cnt[-5,64,0] 𝄀 +3 +select year, count(*) as cnt from hive_null group by year order by year; +➤ year[4,32,0] ¦ cnt[-5,64,0] 𝄀 +null ¦ 2 𝄀 +2024 ¦ 3 +select coalesce(year, -1) as y, count(*) as cnt from hive_null group by y order by y; +➤ y[-5,64,0] ¦ cnt[-5,64,0] 𝄀 +-1 ¦ 2 𝄀 +2024 ¦ 3 +drop table if exists hive_zeropad; +create external table hive_zeropad ( +id int, +amount double, +month int +) infile{'filepath'='$resources/hive_partition/zero_pad/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='month'}; +select month, count(*) as cnt from hive_zeropad where month = 1 group by month; +➤ month[4,32,0] ¦ cnt[-5,64,0] 𝄀 +1 ¦ 2 +select month, count(*) as cnt from hive_zeropad where month in (1, 12) group by month order by month; +➤ month[4,32,0] ¦ cnt[-5,64,0] 𝄀 +1 ¦ 2 𝄀 +12 ¦ 2 +select month, count(*) as cnt from hive_zeropad group by month order by month; +➤ month[4,32,0] ¦ cnt[-5,64,0] 𝄀 +1 ¦ 2 𝄀 +2 ¦ 2 𝄀 +12 ¦ 2 +select count(*) as cnt from hive_zeropad where month = 99; +➤ cnt[-5,64,0] 𝄀 +0 +select count(*) as cnt from hive_single where year = 2024 and __mo_filepath like '%year=2024%'; +➤ cnt[-5,64,0] 𝄀 +5 +select count(distinct __mo_filepath) as paths from hive_single; +➤ paths[-5,64,0] 𝄀 +5 +select year, count(distinct __mo_filepath) as files from hive_single group by year order by year; +➤ year[4,32,0] ¦ files[-5,64,0] 𝄀 +2020 ¦ 1 𝄀 +2021 ¦ 1 𝄀 +2022 ¦ 1 𝄀 +2023 ¦ 1 𝄀 +2024 ¦ 1 +select count(*) as rows_with_path from hive_single where length(__mo_filepath) > 0; +➤ rows_with_path[-5,64,0] 𝄀 +25 +drop table if exists parquet_non_hive; +create external table parquet_non_hive ( +id int, +amount double +) infile{'filepath'='$resources/hive_partition/non_hive/simple.parquet', 'format'='parquet'}; +select count(*) as cnt from parquet_non_hive where __mo_filepath like '%simple.parquet'; +➤ cnt[-5,64,0] 𝄀 +3 +select count(distinct __mo_filepath) as paths from parquet_non_hive; +➤ paths[-5,64,0] 𝄀 +1 +select count(*) as rows_with_path from parquet_non_hive where length(__mo_filepath) > 0; +➤ rows_with_path[-5,64,0] 𝄀 +3 +select count(*) as cnt from hive_single where year = 2024 and __mo_filepath like '%year=2024%'; +➤ cnt[-5,64,0] 𝄀 +5 +select count(*) as cnt from hive_single where year = 2024 and __mo_filepath like '%year=2020%'; +➤ cnt[-5,64,0] 𝄀 +0 +explain (check '["External Scan", "Filter Cond"]') select * from hive_single where year = 2024; +➤ AP QUERY PLAN ON MULTICN(10 core)[12,0,0] 𝄀 +Project 𝄀 + -> External Scan on hive_part_db.hive_single 𝄀 + Filter Cond: (hive_single.year = 2024) +explain (analyze true, check '["External Scan", "inputRows=", "outputRows="]') select * from hive_single where year = 2024; +-- @regex("inputRows=", true) +➤ ap query plan on multicn(10 core)[12,-1,0] 𝄀 +Project 𝄀 + Analyze: timeConsumed=0ms waitTime=0ms inputRows=5 outputRows=5 (min=5, max=5) InputSize=80 bytes OutputSize=80 bytes ReadSize=0 bytes|0 bytes|0 bytes MemorySize=80 bytes (min=80 bytes, max=80 bytes) 𝄀 + -> External Scan on hive_part_db.hive_single 𝄀 + Analyze: timeConsumed=0ms waitTime=0ms inputRows=5 outputRows=5 (min=5, max=5) InputSize=80 bytes OutputSize=80 bytes ReadSize=0 bytes|0 bytes|0 bytes MemorySize=165 bytes (min=80 bytes, max=80 bytes) 𝄀 + Filter Cond: (hive_single.year = 2024) +explain (check '["External Scan", "Filter Cond"]') select * from hive_multi where year = 2024 and month = '01'; +➤ AP QUERY PLAN ON MULTICN(10 core)[12,0,0] 𝄀 +Project 𝄀 + -> External Scan on hive_part_db.hive_multi 𝄀 + Filter Cond: (hive_multi.year = 2024), 
(hive_multi.month = '01') +explain (check '["External Scan", "Filter Cond"]') select * from hive_single where year in (2020, 2024); +➤ AP QUERY PLAN ON MULTICN(10 core)[12,0,0] 𝄀 +Project 𝄀 + -> External Scan on hive_part_db.hive_single 𝄀 + Filter Cond: hive_single.year in ([2020 2024]) +select hs.year, count(*) as cnt +from hive_single hs join hive_multi hm on hs.year = hm.year +where hs.year = 2024 +group by hs.year; +➤ year[4,32,0] ¦ cnt[-5,64,0] 𝄀 +2024 ¦ 45 +drop table if exists year_dim; +create table year_dim (y int, label varchar(20)); +insert into year_dim values (2020, 'y2020'), (2024, 'y2024'), (2025, 'y2025'); +select d.label, count(*) as cnt +from hive_single h join year_dim d on h.year = d.y +where h.year in (2020, 2024) +group by d.label order by d.label; +➤ label[12,-1,0] ¦ cnt[-5,64,0] 𝄀 +y2020 ¦ 5 𝄀 +y2024 ¦ 5 +select d.y, count(h.id) as cnt +from year_dim d left join hive_single h on h.year = d.y +group by d.y order by d.y; +➤ y[4,32,0] ¦ cnt[-5,64,0] 𝄀 +2020 ¦ 5 𝄀 +2024 ¦ 5 𝄀 +2025 ¦ 0 +select 'single' as src, count(*) as cnt from hive_single where year = 2024 +union all +select 'multi' as src, count(*) as cnt from hive_multi where year = 2024; +➤ src[12,-1,0] ¦ cnt[-5,64,0] 𝄀 +single ¦ 5 𝄀 +multi ¦ 9 +select id, year from hive_single +where year = (select max(year) from year_dim where label = 'y2024') +order by id limit 5; +correlated columns in aggregate function is not yet implemented +select year, count(*) as cnt from hive_single +where year in (select y from year_dim where label like 'y2024%') +group by year; +➤ year[4,32,0] ¦ cnt[-5,64,0] 𝄀 +2024 ¦ 5 +select count(*) as cnt from hive_single h +where exists (select 1 from year_dim d where d.y = h.year); +➤ cnt[-5,64,0] 𝄀 +10 +with yearly as ( +select year, sum(amount) as total from hive_single group by year +) +select year, total from yearly where total > 100 order by year; +➤ year[4,32,0] ¦ total[8,54,0] 𝄀 +2020 ¦ 105.0 𝄀 +2021 ¦ 105.0 𝄀 +2022 ¦ 105.0 𝄀 +2023 ¦ 105.0 𝄀 +2024 ¦ 105.0 +select year, +round(sum(case when amount > 21 then amount else 0 end), 1) as above, +round(sum(case when amount <= 21 then amount else 0 end), 1) as below +from hive_single +where year in (2020, 2024) +group by year order by year; +➤ year[4,32,0] ¦ above[8,54,0] ¦ below[8,54,0] 𝄀 +2020 ¦ 73.5 ¦ 31.5 𝄀 +2024 ¦ 73.5 ¦ 31.5 +select year, id, row_number() over (partition by year order by id) as rn +from hive_single where year in (2020, 2021) order by year, id; +➤ year[4,32,0] ¦ id[4,32,0] ¦ rn[-5,64,0] 𝄀 +2020 ¦ 20200 ¦ 1 𝄀 +2020 ¦ 20201 ¦ 2 𝄀 +2020 ¦ 20202 ¦ 3 𝄀 +2020 ¦ 20203 ¦ 4 𝄀 +2020 ¦ 20204 ¦ 5 𝄀 +2021 ¦ 20210 ¦ 1 𝄀 +2021 ¦ 20211 ¦ 2 𝄀 +2021 ¦ 20212 ¦ 3 𝄀 +2021 ¦ 20213 ¦ 4 𝄀 +2021 ¦ 20214 ¦ 5 +select count(*) as cnt from ( +select id, year from hive_single where year = 2023 +union all +select id, year from hive_single where year = 2024 +) t; +➤ cnt[-5,64,0] 𝄀 +10 +drop table if exists hive_invalid_type; +create external table hive_invalid_type ( +id int, +amount double, +year int +) infile{'filepath'='$resources/hive_partition/invalid_type/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='year'}; +select * from hive_invalid_type; +internal error: partition value type conversion failed: col=year, value='abc', path=year=abc/data.parquet: strconv.ParseInt: parsing "abc": invalid syntax +drop table if exists hive_url_encoded; +create external table hive_url_encoded ( +id int, +amount double, +country varchar(20) +) infile{'filepath'='$resources/hive_partition/url_encoded/', 'format'='parquet', 
'hive_partitioning'='true', 'hive_partition_columns'='country'}; +select * from hive_url_encoded; +internal error: hive partition directory name contains '%' which is not supported: 'country=US%2FCA' +drop table if exists hive_stage_err; +create external table hive_stage_err ( +id int, year int +) infile{'filepath'='stage://mystage/data/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='year'}; +invalid configuration: hive_partitioning does not support stage external tables +drop table if exists hive_not_null_default; +create external table hive_not_null_default ( +id int, +amount double, +year int not null +) infile{'filepath'='$resources/hive_partition/not_null_default/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='year'}; +select * from hive_not_null_default; +constraint violation: partition column 'year' is NOT NULL but directory has __HIVE_DEFAULT_PARTITION__ in path 'year=__HIVE_DEFAULT_PARTITION__/data.parquet'; allow NULL on the partition column or remove/rename the default partition directory +drop table if exists hive_col_overlap; +create external table hive_col_overlap ( +id int, +amount double, +year int +) infile{'filepath'='$resources/hive_partition/col_overlap/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='year'}; +select distinct year from hive_col_overlap; +➤ year[4,32,0] 𝄀 +2024 +select id, year from hive_col_overlap order by id; +➤ id[4,32,0] ¦ year[4,32,0] 𝄀 +1 ¦ 2024 𝄀 +2 ¦ 2024 +select id, amount from parquet_non_hive order by id; +➤ id[4,32,0] ¦ amount[8,54,0] 𝄀 +1 ¦ 100.0 𝄀 +2 ¦ 200.0 𝄀 +3 ¦ 300.0 +drop table if exists hive_mixed_case; +create external table hive_mixed_case ( +id int, +amount double, +Year int +) infile{'filepath'='$resources/hive_partition/single_level/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='Year'}; +select count(*) as cnt from hive_mixed_case where Year = 2024; +➤ cnt[-5,64,0] 𝄀 +5 +select count(*) as cnt from hive_mixed_case where year = 2024; +➤ cnt[-5,64,0] 𝄀 +5 +select count(*) as cnt from hive_single hs where hs.year = 2024; +➤ cnt[-5,64,0] 𝄀 +5 +drop table if exists hive_single; +create external table hive_single ( +id int, +amount double, +year int +) infile{'filepath'='$resources/hive_partition/single_level/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='year'}; +select count(*) from hive_single; +➤ count(*)[-5,64,0] 𝄀 +25 +drop database if exists hive_part_db; diff --git a/test/distributed/cases/table/hive_partition_external_table.sql b/test/distributed/cases/table/hive_partition_external_table.sql new file mode 100644 index 0000000000000..10cbd58f463bf --- /dev/null +++ b/test/distributed/cases/table/hive_partition_external_table.sql @@ -0,0 +1,494 @@ +-- Hive-style Partitioned External Table BVT Tests +-- +-- Coverage overview: +-- 1. DDL validation (success + negative) +-- 2. Single-level partition queries (full / EQ / IN / NOT IN / range / IS NULL) +-- 3. Multi-level partition queries (full / single / double / cross-level) +-- 4. String partition (EQ / IN / LIKE / ORDER BY partition col) +-- 5. NULL partition (__HIVE_DEFAULT_PARTITION__) +-- 6. Zero-padded integer partition +-- 7. __mo_filepath virtual column (hive + non-hive) +-- 8. EXPLAIN (CHECK + ANALYZE) +-- 9. Complex predicates (OR / BETWEEN / CAST / arithmetic) +-- 10. Aggregations & subqueries +-- 11. JOIN hive × hive, hive × internal +-- 12. UNION ALL / DISTINCT / HAVING +-- 13. 
Edge cases (type failure, URL-encoded, physical column overlap, stage rejection) + +drop database if exists hive_part_db; +create database hive_part_db; +use hive_part_db; + +-- ============================================================================ +-- 1. DDL Validation +-- ============================================================================ + +-- 1.1 Basic creation (single level) +drop table if exists hive_single; +create external table hive_single ( + id int, + amount double, + year int +) infile{'filepath'='$resources/hive_partition/single_level/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='year'}; + +-- 1.2 DDL error: missing partition_columns +drop table if exists hive_err1; +create external table hive_err1 ( + id int, year int +) infile{'filepath'='$resources/hive_partition/single_level/', 'format'='parquet', 'hive_partitioning'='true'}; + +-- 1.3 DDL error: format not parquet +drop table if exists hive_err2; +create external table hive_err2 ( + id int, year int +) infile{'filepath'='$resources/hive_partition/single_level/', 'format'='csv', 'hive_partitioning'='true', 'hive_partition_columns'='year'}; + +-- 1.4 DDL error: column not found +drop table if exists hive_err3; +create external table hive_err3 ( + id int, amount double +) infile{'filepath'='$resources/hive_partition/single_level/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='nonexistent'}; + +-- 1.5 DDL error: duplicate hive key +drop table if exists hive_err4; +create external table hive_err4 ( + id int, year int +) infile{'filepath'='$resources/hive_partition/single_level/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partitioning'='false', 'hive_partition_columns'='year'}; + +-- 1.6 DDL error: hive_partitioning value not boolean +drop table if exists hive_err5; +create external table hive_err5 ( + id int, year int +) infile{'filepath'='$resources/hive_partition/single_level/', 'format'='parquet', 'hive_partitioning'='yes', 'hive_partition_columns'='year'}; + +-- 1.7 Partition column name matched case-insensitively (succeeds, no error) +-- Both the declared column `YEAR` and the partition reference `YeaR` lowercase +-- to `year`, so findColInTableDefCaseInsensitive finds the column. Same flow +-- as test 10.7 (hive_mixed_case), kept here for DDL-validation coverage. 
+drop table if exists hive_err6; +create external table hive_err6 ( + id int, YEAR int +) infile{'filepath'='$resources/hive_partition/single_level/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='YeaR'}; + +-- 1.8 DDL error: duplicate partition column names +drop table if exists hive_err7; +create external table hive_err7 ( + id int, year int, amount double +) infile{'filepath'='$resources/hive_partition/single_level/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='year,year'}; + +-- 1.9 DDL error: VECTOR partition column rejected +drop table if exists hive_err8; +create external table hive_err8 ( + id int, + emb vecf32(3) +) infile{'filepath'='$resources/hive_partition/single_level/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='emb'}; + +-- 1.10 SHOW CREATE TABLE +show create table hive_single; + +-- 1.11 LOAD DATA into hive table should be rejected (external table generic rejection) +load data infile '$resources/hive_partition/single_level/year=2024/data.parquet' into table hive_single; + +-- 1.12 hive_partitioning='false' treated as disabled (existing external table path) +drop table if exists hive_disabled; +create external table hive_disabled ( + id int, + amount double +) infile{'filepath'='$resources/hive_partition/non_hive/simple.parquet', 'format'='parquet', 'hive_partitioning'='false'}; +select count(*) from hive_disabled; + +-- ============================================================================ +-- 2. Single Level Partition Queries +-- ============================================================================ + +-- 2.1 Full scan (all 5 partitions x 5 rows = 25 rows) +select count(*) as cnt from hive_single; + +-- 2.2 EQ pruning +select year, count(*) as cnt from hive_single where year = 2024 group by year; + +-- 2.3 IN pruning +select year, count(*) as cnt from hive_single where year in (2020, 2024) group by year order by year; + +-- 2.4 IN pruning with single value +select year, count(*) as cnt from hive_single where year in (2022) group by year; + +-- 2.5 NOT IN (rowFilter fallback) +select year, count(*) as cnt from hive_single where year not in (2020, 2021, 2022) group by year order by year; + +-- 2.6 Non-prunable GT (rowFilter fallback, must not lose data) +select count(*) as cnt from hive_single where year > 2022; + +-- 2.7 BETWEEN (rowFilter fallback) +select year, count(*) as cnt from hive_single where year between 2021 and 2023 group by year order by year; + +-- 2.8 OR condition (rowFilter fallback; not prunable in P0) +select year, count(*) as cnt from hive_single where year = 2020 or year = 2024 group by year order by year; + +-- 2.9 Partition column only in SELECT +select distinct year from hive_single order by year; + +-- 2.10 Partition column only in WHERE +select sum(amount) as total from hive_single where year = 2020; + +-- 2.11 Partition col in HAVING (threshold < 105 to include all partitions) +select year, sum(amount) as total from hive_single group by year having sum(amount) >= 100 order by year; + +-- 2.12 COUNT DISTINCT on partition column +select count(distinct year) as distinct_years from hive_single; + +-- 2.13 Partition column in arithmetic expression (rowFilter evaluates) +select count(*) from hive_single where year + 1 = 2025; + +-- 2.14 CAST on partition column (rowFilter only, not pruned) +select count(*) from hive_single where cast(year as varchar) = '2024'; + +-- 2.15 ORDER BY partition column with LIMIT +select id, year from hive_single order by year 
asc, id desc limit 5; + +-- 2.16 Partition column IS NOT NULL (trivially true for non-null data) +select count(*) as cnt from hive_single where year is not null; + +-- 2.17 Partition column used both as predicate and projection +select year, id from hive_single where year = 2023 order by id; + +-- 2.18 Subquery with partition pruning +select year, cnt from ( + select year, count(*) as cnt from hive_single where year in (2020, 2021) group by year +) t order by year; + +-- ============================================================================ +-- 3. Multi Level Partition Queries +-- ============================================================================ + +drop table if exists hive_multi; +create external table hive_multi ( + id int, + amount double, + year int, + month varchar(2) +) infile{'filepath'='$resources/hive_partition/multi_level/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='year,month'}; + +-- 3.1 Full scan (2 years x 3 months x 3 rows = 18) +select count(*) as cnt from hive_multi; + +-- 3.2 Single level pruning (outer) +select year, count(*) as cnt from hive_multi where year = 2024 group by year; + +-- 3.3 Single level pruning (inner only) +select month, count(*) as cnt from hive_multi where month = '01' group by month order by month; + +-- 3.4 Double level pruning +select year, month, count(*) as cnt from hive_multi where year = 2024 and month = '01' group by year, month; + +-- 3.5 Outer IN + inner IN +select year, month, count(*) as cnt from hive_multi +where year in (2024, 2025) and month in ('01', '02') +group by year, month order by year, month; + +-- 3.6 GROUP BY partition columns +select year, month, sum(amount) as total from hive_multi group by year, month order by year, month; + +-- 3.7 ORDER BY mixed partition + physical +select id, year, month, amount from hive_multi where year = 2025 order by month asc, id asc limit 6; + +-- 3.8 HAVING with partition columns +select year, count(*) as cnt from hive_multi group by year having count(*) >= 9 order by year; + +-- 3.9 Two-column distinct +select distinct year, month from hive_multi order by year, month; + +-- 3.10 Self join on partition columns +select a.year, a.month, count(*) as cnt +from hive_multi a join hive_multi b +on a.year = b.year and a.month = b.month and a.id = b.id +where a.year = 2024 +group by a.year, a.month order by a.year, a.month; + +-- 3.11 Group-by with subquery to count month per year +select year, mc from ( + select year, count(distinct month) as mc from hive_multi group by year +) t order by year; + +-- ============================================================================ +-- 4. 
String Partition +-- ============================================================================ + +drop table if exists hive_string; +create external table hive_string ( + id int, + amount double, + country varchar(10) +) infile{'filepath'='$resources/hive_partition/string_part/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='country'}; + +-- 4.1 String EQ (exact-byte match → prunable) +select country, count(*) as cnt from hive_string where country = 'US' group by country; + +-- 4.2 String IN +select country, count(*) as cnt from hive_string where country in ('US', 'CN') group by country order by country; + +-- 4.3 String LIKE (rowFilter only, not prunable) +select country, count(*) as cnt from hive_string where country like 'U%' group by country; + +-- 4.4 All countries +select country, count(*) as cnt from hive_string group by country order by country; + +-- 4.5 String partition != condition (rowFilter fallback) +select country, count(*) as cnt from hive_string where country != 'JP' group by country order by country; + +-- 4.6 String partition in ORDER BY (partition value in output) +select id, country from hive_string order by country, id limit 6; + +-- 4.7 Partition col length function +select country, length(country) as ln from hive_string group by country order by country; + +-- ============================================================================ +-- 5. NULL Partition (__HIVE_DEFAULT_PARTITION__) +-- ============================================================================ + +drop table if exists hive_null; +create external table hive_null ( + id int, + amount double, + year int +) infile{'filepath'='$resources/hive_partition/null_part/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='year'}; + +-- 5.1 NULL partition is visible +select id, year from hive_null order by id; + +-- 5.2 IS NULL filter +select count(*) as cnt from hive_null where year is null; + +-- 5.3 IS NOT NULL filter +select count(*) as cnt from hive_null where year is not null; + +-- 5.4 Aggregation handling NULL groups +select year, count(*) as cnt from hive_null group by year order by year; + +-- 5.5 Coalesce partition column +select coalesce(year, -1) as y, count(*) as cnt from hive_null group by y order by y; + +-- ============================================================================ +-- 6. Zero-padded Integer Partition +-- ============================================================================ + +drop table if exists hive_zeropad; +create external table hive_zeropad ( + id int, + amount double, + month int +) infile{'filepath'='$resources/hive_partition/zero_pad/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='month'}; + +-- 6.1 Integer comparison with zero-padded directory (month=01 matches WHERE month = 1) +select month, count(*) as cnt from hive_zeropad where month = 1 group by month; + +-- 6.2 Integer IN with mixed zero-padded targets +select month, count(*) as cnt from hive_zeropad where month in (1, 12) group by month order by month; + +-- 6.3 All months +select month, count(*) as cnt from hive_zeropad group by month order by month; + +-- 6.4 Non-matching value prunes all partitions +select count(*) as cnt from hive_zeropad where month = 99; + +-- ============================================================================ +-- 7. 
__mo_filepath Virtual Column +-- ============================================================================ + +-- 7.1 __mo_filepath on hive table (verify path contains partition directory) +select count(*) as cnt from hive_single where year = 2024 and __mo_filepath like '%year=2024%'; + +-- 7.2 __mo_filepath returns distinct paths per partition (projection, exercises +-- parquet prepare() filepathColIndex branch + fillVirtualColumns, not FilterFileList) +select count(distinct __mo_filepath) as paths from hive_single; + +-- 7.3 __mo_filepath projection + partition column aggregation (distinct value per partition) +select year, count(distinct __mo_filepath) as files from hive_single group by year order by year; + +-- 7.4 __mo_filepath as ONLY projected column (rowCountOnly path — no physical col read) +-- count(length(...)>0) confirms SetConstBytes fill produced non-empty bytes for every row. +select count(*) as rows_with_path from hive_single where length(__mo_filepath) > 0; + +-- 7.5 Non-hive parquet external table — projection-level assertion for pre-existing bug fix +-- The row count where __mo_filepath is non-empty must equal the file row count (3). +drop table if exists parquet_non_hive; +create external table parquet_non_hive ( + id int, + amount double +) infile{'filepath'='$resources/hive_partition/non_hive/simple.parquet', 'format'='parquet'}; +select count(*) as cnt from parquet_non_hive where __mo_filepath like '%simple.parquet'; +select count(distinct __mo_filepath) as paths from parquet_non_hive; +select count(*) as rows_with_path from parquet_non_hive where length(__mo_filepath) > 0; + +-- 7.6 Combined partition col + __mo_filepath filter (both conditions prune/filter) +select count(*) as cnt from hive_single where year = 2024 and __mo_filepath like '%year=2024%'; + +-- 7.7 Contradictory partition + filepath (empty result, but evaluates correctly) +select count(*) as cnt from hive_single where year = 2024 and __mo_filepath like '%year=2020%'; + +-- ============================================================================ +-- 8. EXPLAIN Verification +-- ============================================================================ + +-- 8.1 EXPLAIN shows External Scan with Filter Cond +explain (check '["External Scan", "Filter Cond"]') select * from hive_single where year = 2024; + +-- 8.2 EXPLAIN ANALYZE has runtime stats +-- @regex("inputRows=",true) +explain (analyze true, check '["External Scan", "inputRows=", "outputRows="]') select * from hive_single where year = 2024; + +-- 8.3 EXPLAIN multi-level partition scan shows both filter conditions retained (double-filter safety) +explain (check '["External Scan", "Filter Cond"]') select * from hive_multi where year = 2024 and month = '01'; + +-- 8.4 EXPLAIN with IN list +explain (check '["External Scan", "Filter Cond"]') select * from hive_single where year in (2020, 2024); + +-- ============================================================================ +-- 9. 
Complex Query Patterns +-- ============================================================================ + +-- 9.1 JOIN hive x hive on partition column +select hs.year, count(*) as cnt +from hive_single hs join hive_multi hm on hs.year = hm.year +where hs.year = 2024 +group by hs.year; + +-- 9.2 JOIN hive x internal (dimension) table +drop table if exists year_dim; +create table year_dim (y int, label varchar(20)); +insert into year_dim values (2020, 'y2020'), (2024, 'y2024'), (2025, 'y2025'); + +select d.label, count(*) as cnt +from hive_single h join year_dim d on h.year = d.y +where h.year in (2020, 2024) +group by d.label order by d.label; + +-- 9.3 LEFT JOIN preserves rows without match +select d.y, count(h.id) as cnt +from year_dim d left join hive_single h on h.year = d.y +group by d.y order by d.y; + +-- 9.4 UNION ALL merges partitions from two tables +select 'single' as src, count(*) as cnt from hive_single where year = 2024 +union all +select 'multi' as src, count(*) as cnt from hive_multi where year = 2024; + +-- 9.5 Scalar subquery with partition predicate +select id, year from hive_single +where year = (select max(year) from year_dim where label = 'y2024') +order by id limit 5; + +-- 9.6 IN subquery with hive partition column +select year, count(*) as cnt from hive_single +where year in (select y from year_dim where label like 'y2024%') +group by year; + +-- 9.7 EXISTS subquery +select count(*) as cnt from hive_single h +where exists (select 1 from year_dim d where d.y = h.year); + +-- 9.8 CTE over hive external table +with yearly as ( + select year, sum(amount) as total from hive_single group by year +) +select year, total from yearly where total > 100 order by year; + +-- 9.9 Aggregation with conditional sum (above vs below median amount) +select year, + round(sum(case when amount > 21 then amount else 0 end), 1) as above, + round(sum(case when amount <= 21 then amount else 0 end), 1) as below +from hive_single +where year in (2020, 2024) +group by year order by year; + +-- 9.10 Window function over partition column +-- Note: ROW_NUMBER() OVER (PARTITION BY year ORDER BY id) — demonstrates partition column usable in window spec +select year, id, row_number() over (partition by year order by id) as rn +from hive_single where year in (2020, 2021) order by year, id; + +-- 9.11 COUNT with subquery filter +select count(*) as cnt from ( + select id, year from hive_single where year = 2023 + union all + select id, year from hive_single where year = 2024 +) t; + +-- ============================================================================ +-- 10. Edge Cases +-- ============================================================================ + +-- 10.1 Type conversion failure: year declared as INT but directory has 'abc' +-- Error contains col + value + relative path (stable across machines). 
+drop table if exists hive_invalid_type; +create external table hive_invalid_type ( + id int, + amount double, + year int +) infile{'filepath'='$resources/hive_partition/invalid_type/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='year'}; +select * from hive_invalid_type; + +-- 10.2 URL-encoded directory name containing '%' should report error (P0 known limitation) +drop table if exists hive_url_encoded; +create external table hive_url_encoded ( + id int, + amount double, + country varchar(20) +) infile{'filepath'='$resources/hive_partition/url_encoded/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='country'}; +select * from hive_url_encoded; + +-- 10.3 Stage hive external table should be rejected at DDL +drop table if exists hive_stage_err; +create external table hive_stage_err ( + id int, year int +) infile{'filepath'='stage://mystage/data/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='year'}; + +-- 10.4 __HIVE_DEFAULT_PARTITION__ with NOT NULL column +-- Error contains col + "NOT NULL" + relative path (stable across machines). +drop table if exists hive_not_null_default; +create external table hive_not_null_default ( + id int, + amount double, + year int not null +) infile{'filepath'='$resources/hive_partition/not_null_default/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='year'}; +select * from hive_not_null_default; + +-- 10.5 Physical column overlap: parquet file has physical 'year=9999', path has year=2024 +-- Partition value from path (2024) must override the physical column (9999) +drop table if exists hive_col_overlap; +create external table hive_col_overlap ( + id int, + amount double, + year int +) infile{'filepath'='$resources/hive_partition/col_overlap/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='year'}; +select distinct year from hive_col_overlap; +select id, year from hive_col_overlap order by id; + +-- 10.6 Non-hive parquet smoke: regular physical column query (not just __mo_filepath) +select id, amount from parquet_non_hive order by id; + +-- 10.7 Case-insensitive column name in DDL +drop table if exists hive_mixed_case; +create external table hive_mixed_case ( + id int, + amount double, + Year int +) infile{'filepath'='$resources/hive_partition/single_level/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='Year'}; +select count(*) as cnt from hive_mixed_case where Year = 2024; +select count(*) as cnt from hive_mixed_case where year = 2024; + +-- 10.8 Partition column referenced with table qualifier +select count(*) as cnt from hive_single hs where hs.year = 2024; + +-- 10.9 DROP then re-CREATE (catalog round-trip) +drop table if exists hive_single; +create external table hive_single ( + id int, + amount double, + year int +) infile{'filepath'='$resources/hive_partition/single_level/', 'format'='parquet', 'hive_partitioning'='true', 'hive_partition_columns'='year'}; +select count(*) from hive_single; + +-- ============================================================================ +-- 11. 
+
+-- ============================================================================
+-- 11. Cleanup
+-- ============================================================================
+drop database if exists hive_part_db;
diff --git a/test/distributed/resources/hive_partition/col_overlap/year=2024/data.parquet b/test/distributed/resources/hive_partition/col_overlap/year=2024/data.parquet
new file mode 100644
index 0000000000000..be5fcbd05d577
Binary files /dev/null and b/test/distributed/resources/hive_partition/col_overlap/year=2024/data.parquet differ
diff --git a/test/distributed/resources/hive_partition/invalid_type/year=abc/data.parquet b/test/distributed/resources/hive_partition/invalid_type/year=abc/data.parquet
new file mode 100644
index 0000000000000..2bd09d5ee9dc2
Binary files /dev/null and b/test/distributed/resources/hive_partition/invalid_type/year=abc/data.parquet differ
diff --git a/test/distributed/resources/hive_partition/multi_level/year=2024/month=01/data.parquet b/test/distributed/resources/hive_partition/multi_level/year=2024/month=01/data.parquet
new file mode 100644
index 0000000000000..d1b15d5f50c50
Binary files /dev/null and b/test/distributed/resources/hive_partition/multi_level/year=2024/month=01/data.parquet differ
diff --git a/test/distributed/resources/hive_partition/multi_level/year=2024/month=02/data.parquet b/test/distributed/resources/hive_partition/multi_level/year=2024/month=02/data.parquet
new file mode 100644
index 0000000000000..d3a4e85360d47
Binary files /dev/null and b/test/distributed/resources/hive_partition/multi_level/year=2024/month=02/data.parquet differ
diff --git a/test/distributed/resources/hive_partition/multi_level/year=2024/month=03/data.parquet b/test/distributed/resources/hive_partition/multi_level/year=2024/month=03/data.parquet
new file mode 100644
index 0000000000000..8d2e03e49bb0c
Binary files /dev/null and b/test/distributed/resources/hive_partition/multi_level/year=2024/month=03/data.parquet differ
diff --git a/test/distributed/resources/hive_partition/multi_level/year=2025/month=01/data.parquet b/test/distributed/resources/hive_partition/multi_level/year=2025/month=01/data.parquet
new file mode 100644
index 0000000000000..9468e4c039a33
Binary files /dev/null and b/test/distributed/resources/hive_partition/multi_level/year=2025/month=01/data.parquet differ
diff --git a/test/distributed/resources/hive_partition/multi_level/year=2025/month=02/data.parquet b/test/distributed/resources/hive_partition/multi_level/year=2025/month=02/data.parquet
new file mode 100644
index 0000000000000..efe6dc06e38f0
Binary files /dev/null and b/test/distributed/resources/hive_partition/multi_level/year=2025/month=02/data.parquet differ
diff --git a/test/distributed/resources/hive_partition/multi_level/year=2025/month=03/data.parquet b/test/distributed/resources/hive_partition/multi_level/year=2025/month=03/data.parquet
new file mode 100644
index 0000000000000..c1ce6e382b696
Binary files /dev/null and b/test/distributed/resources/hive_partition/multi_level/year=2025/month=03/data.parquet differ
diff --git a/test/distributed/resources/hive_partition/non_hive/simple.parquet b/test/distributed/resources/hive_partition/non_hive/simple.parquet
new file mode 100644
index 0000000000000..104373d4f8622
Binary files /dev/null and b/test/distributed/resources/hive_partition/non_hive/simple.parquet differ
diff --git a/test/distributed/resources/hive_partition/not_null_default/year=2024/data.parquet b/test/distributed/resources/hive_partition/not_null_default/year=2024/data.parquet
new file mode 100644
index 0000000000000..189304dad7120
Binary files /dev/null and b/test/distributed/resources/hive_partition/not_null_default/year=2024/data.parquet differ
diff --git a/test/distributed/resources/hive_partition/not_null_default/year=__HIVE_DEFAULT_PARTITION__/data.parquet b/test/distributed/resources/hive_partition/not_null_default/year=__HIVE_DEFAULT_PARTITION__/data.parquet
new file mode 100644
index 0000000000000..a7d5246e26dd7
Binary files /dev/null and b/test/distributed/resources/hive_partition/not_null_default/year=__HIVE_DEFAULT_PARTITION__/data.parquet differ
diff --git a/test/distributed/resources/hive_partition/null_part/year=2024/data.parquet b/test/distributed/resources/hive_partition/null_part/year=2024/data.parquet
new file mode 100644
index 0000000000000..332590794427a
Binary files /dev/null and b/test/distributed/resources/hive_partition/null_part/year=2024/data.parquet differ
diff --git a/test/distributed/resources/hive_partition/null_part/year=__HIVE_DEFAULT_PARTITION__/data.parquet b/test/distributed/resources/hive_partition/null_part/year=__HIVE_DEFAULT_PARTITION__/data.parquet
new file mode 100644
index 0000000000000..93ee3b43f50b6
Binary files /dev/null and b/test/distributed/resources/hive_partition/null_part/year=__HIVE_DEFAULT_PARTITION__/data.parquet differ
diff --git a/test/distributed/resources/hive_partition/single_level/year=2020/data.parquet b/test/distributed/resources/hive_partition/single_level/year=2020/data.parquet
new file mode 100644
index 0000000000000..08ac1f6dce3da
Binary files /dev/null and b/test/distributed/resources/hive_partition/single_level/year=2020/data.parquet differ
diff --git a/test/distributed/resources/hive_partition/single_level/year=2021/data.parquet b/test/distributed/resources/hive_partition/single_level/year=2021/data.parquet
new file mode 100644
index 0000000000000..d9742decec3d1
Binary files /dev/null and b/test/distributed/resources/hive_partition/single_level/year=2021/data.parquet differ
diff --git a/test/distributed/resources/hive_partition/single_level/year=2022/data.parquet b/test/distributed/resources/hive_partition/single_level/year=2022/data.parquet
new file mode 100644
index 0000000000000..9839f330234c3
Binary files /dev/null and b/test/distributed/resources/hive_partition/single_level/year=2022/data.parquet differ
diff --git a/test/distributed/resources/hive_partition/single_level/year=2023/data.parquet b/test/distributed/resources/hive_partition/single_level/year=2023/data.parquet
new file mode 100644
index 0000000000000..16567adf5794c
Binary files /dev/null and b/test/distributed/resources/hive_partition/single_level/year=2023/data.parquet differ
diff --git a/test/distributed/resources/hive_partition/single_level/year=2024/.crc b/test/distributed/resources/hive_partition/single_level/year=2024/.crc
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/test/distributed/resources/hive_partition/single_level/year=2024/_SUCCESS b/test/distributed/resources/hive_partition/single_level/year=2024/_SUCCESS
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/test/distributed/resources/hive_partition/single_level/year=2024/data.parquet b/test/distributed/resources/hive_partition/single_level/year=2024/data.parquet
new file mode 100644
index 0000000000000..dc4244fb6e2ae
Binary files /dev/null and b/test/distributed/resources/hive_partition/single_level/year=2024/data.parquet differ
diff --git a/test/distributed/resources/hive_partition/string_part/country=CN/data.parquet b/test/distributed/resources/hive_partition/string_part/country=CN/data.parquet
new file mode 100644
index 0000000000000..04c62e15df9b5
Binary files /dev/null and b/test/distributed/resources/hive_partition/string_part/country=CN/data.parquet differ
diff --git a/test/distributed/resources/hive_partition/string_part/country=JP/data.parquet b/test/distributed/resources/hive_partition/string_part/country=JP/data.parquet
new file mode 100644
index 0000000000000..43e561cc95724
Binary files /dev/null and b/test/distributed/resources/hive_partition/string_part/country=JP/data.parquet differ
diff --git a/test/distributed/resources/hive_partition/string_part/country=US/data.parquet b/test/distributed/resources/hive_partition/string_part/country=US/data.parquet
new file mode 100644
index 0000000000000..629b30740355a
Binary files /dev/null and b/test/distributed/resources/hive_partition/string_part/country=US/data.parquet differ
diff --git a/test/distributed/resources/hive_partition/url_encoded/country=US%2FCA/data.parquet b/test/distributed/resources/hive_partition/url_encoded/country=US%2FCA/data.parquet
new file mode 100644
index 0000000000000..3018ca8c7c723
Binary files /dev/null and b/test/distributed/resources/hive_partition/url_encoded/country=US%2FCA/data.parquet differ
diff --git a/test/distributed/resources/hive_partition/zero_pad/month=01/data.parquet b/test/distributed/resources/hive_partition/zero_pad/month=01/data.parquet
new file mode 100644
index 0000000000000..0ef14e6d2fd23
Binary files /dev/null and b/test/distributed/resources/hive_partition/zero_pad/month=01/data.parquet differ
diff --git a/test/distributed/resources/hive_partition/zero_pad/month=02/data.parquet b/test/distributed/resources/hive_partition/zero_pad/month=02/data.parquet
new file mode 100644
index 0000000000000..958b53128158a
Binary files /dev/null and b/test/distributed/resources/hive_partition/zero_pad/month=02/data.parquet differ
diff --git a/test/distributed/resources/hive_partition/zero_pad/month=12/data.parquet b/test/distributed/resources/hive_partition/zero_pad/month=12/data.parquet
new file mode 100644
index 0000000000000..c6fcdd13a04e3
Binary files /dev/null and b/test/distributed/resources/hive_partition/zero_pad/month=12/data.parquet differ