diff --git a/paimon-common/src/main/java/org/apache/paimon/fileindex/ngram/NgramFileIndex.java b/paimon-common/src/main/java/org/apache/paimon/fileindex/ngram/NgramFileIndex.java
new file mode 100644
index 000000000000..cd51ba9fa92f
--- /dev/null
+++ b/paimon-common/src/main/java/org/apache/paimon/fileindex/ngram/NgramFileIndex.java
@@ -0,0 +1,233 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.fileindex.ngram;
+
+import org.apache.paimon.data.BinaryString;
+import org.apache.paimon.fileindex.FileIndexReader;
+import org.apache.paimon.fileindex.FileIndexResult;
+import org.apache.paimon.fileindex.FileIndexWriter;
+import org.apache.paimon.fileindex.FileIndexer;
+import org.apache.paimon.fs.SeekableInputStream;
+import org.apache.paimon.options.Options;
+import org.apache.paimon.predicate.FieldRef;
+import org.apache.paimon.utils.IOUtils;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.apache.paimon.fileindex.FileIndexResult.REMAIN;
+import static org.apache.paimon.fileindex.FileIndexResult.SKIP;
+
+/**
+ * N-gram file index for string prefix/suffix/contains queries.
+ *
+ * <p>The writer stores every substring of {@code gram_size} chars found in the indexed values.
+ * A literal can only occur in a value if all n-grams of the literal are stored, so the reader
+ * answers {@code SKIP} when one of them is missing and {@code REMAIN} otherwise.
+ */
+public class NgramFileIndex implements FileIndexer {
+
+    private static final int DEFAULT_GRAM_SIZE = 2;
+    private static final String GRAM_SIZE = "gram_size";
+
+    /**
+     * Upper bound for {@code gram_size}. {@link DataOutputStream#writeUTF} stores at most 65535
+     * bytes per string and may need three bytes per char.
+     */
+    private static final int MAX_GRAM_SIZE = 0xFFFF / 3;
+
+    private final int gramSize;
+
+    /**
+     * Creates the indexer.
+     *
+     * @param options index options; {@code gram_size} defaults to 2 when absent
+     * @throws IllegalArgumentException if {@code gram_size} is not in [1, 21845]
+     */
+    public NgramFileIndex(Options options) {
+        int configured = options.getInteger(GRAM_SIZE, DEFAULT_GRAM_SIZE);
+        if (configured < 1 || configured > MAX_GRAM_SIZE) {
+            throw new IllegalArgumentException(
+                    String.format(
+                            "Option '%s' must be between 1 and %d, got: %d",
+                            GRAM_SIZE, MAX_GRAM_SIZE, configured));
+        }
+        this.gramSize = configured;
+    }
+
+    @Override
+    public FileIndexWriter createWriter() {
+        return new Writer(gramSize);
+    }
+
+    @Override
+    public FileIndexReader createReader(SeekableInputStream inputStream, int start, int length) {
+        try {
+            inputStream.seek(start);
+            byte[] serializedBytes = new byte[length];
+            IOUtils.readFully(inputStream, serializedBytes);
+            return new Reader(serializedBytes);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /** Writer for N-gram index. */
+    private static class Writer extends FileIndexWriter {
+
+        private final int gramSize;
+        private final Set<String> ngramSet = new HashSet<>();
+
+        Writer(int gramSize) {
+            this.gramSize = gramSize;
+        }
+
+        @Override
+        public void write(Object key) {
+            if (key == null) {
+                return;
+            }
+            addNgrams(((BinaryString) key).toString());
+        }
+
+        private void addNgrams(String value) {
+            if (value.length() < gramSize) {
+                // Too short to tokenize: keep the whole value as a single token.
+                ngramSet.add(value);
+                return;
+            }
+            for (int i = 0; i + gramSize <= value.length(); i++) {
+                ngramSet.add(value.substring(i, i + gramSize));
+            }
+        }
+
+        @Override
+        public byte[] serializedBytes() {
+            ByteArrayOutputStream byteOut = new ByteArrayOutputStream();
+            try (DataOutputStream dataOut = new DataOutputStream(byteOut)) {
+                dataOut.writeInt(gramSize);
+                dataOut.writeInt(ngramSet.size());
+                for (String token : ngramSet) {
+                    // writeUTF encodes char by char, so a token that ends in the middle of a
+                    // surrogate pair round-trips intact (String.getBytes would write '?').
+                    dataOut.writeUTF(token);
+                }
+            } catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+            return byteOut.toByteArray();
+        }
+    }
+
+    /** Reader for N-gram index. */
+    private static class Reader extends FileIndexReader {
+
+        private final int gramSize;
+        private final Set<String> ngramSet;
+
+        Reader(byte[] serializedBytes) {
+            try (DataInputStream dataIn =
+                    new DataInputStream(new ByteArrayInputStream(serializedBytes))) {
+                this.gramSize = dataIn.readInt();
+                int setSize = dataIn.readInt();
+                this.ngramSet = new HashSet<>(setSize);
+                for (int i = 0; i < setSize; i++) {
+                    ngramSet.add(dataIn.readUTF());
+                }
+            } catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+        }
+
+        @Override
+        public FileIndexResult visitEqual(FieldRef fieldRef, Object literal) {
+            return checkPattern(literalToString(literal));
+        }
+
+        @Override
+        public FileIndexResult visitStartsWith(FieldRef fieldRef, Object literal) {
+            return checkPattern(literalToString(literal));
+        }
+
+        @Override
+        public FileIndexResult visitEndsWith(FieldRef fieldRef, Object literal) {
+            return checkPattern(literalToString(literal));
+        }
+
+        @Override
+        public FileIndexResult visitContains(FieldRef fieldRef, Object literal) {
+            return checkPattern(literalToString(literal));
+        }
+
+        /**
+         * Probes every literal run of a LIKE pattern. The pattern is cut at {@code %}, {@code _}
+         * and backslash (assumed escape character), so no wildcard ends up in an n-gram.
+         */
+        @Override
+        public FileIndexResult visitLike(FieldRef fieldRef, Object literal) {
+            String pattern = literalToString(literal);
+            if (pattern == null) {
+                return REMAIN;
+            }
+            int segmentStart = 0;
+            for (int i = 0; i <= pattern.length(); i++) {
+                if (i < pattern.length() && !isLikeSpecial(pattern.charAt(i))) {
+                    continue;
+                }
+                if (!checkPattern(pattern.substring(segmentStart, i)).remain()) {
+                    return SKIP;
+                }
+                segmentStart = i + 1;
+            }
+            return REMAIN;
+        }
+
+        private static boolean isLikeSpecial(char c) {
+            return c == '%' || c == '_' || c == '\\';
+        }
+
+        /** Returns SKIP only when some n-gram of {@code pattern} is missing from the index. */
+        private FileIndexResult checkPattern(String pattern) {
+            if (pattern == null || pattern.isEmpty() || pattern.length() < gramSize) {
+                return REMAIN;
+            }
+            for (int i = 0; i + gramSize <= pattern.length(); i++) {
+                if (!ngramSet.contains(pattern.substring(i, i + gramSize))) {
+                    return SKIP;
+                }
+            }
+            return REMAIN;
+        }
+
+        private String literalToString(Object literal) {
+            if (literal == null) {
+                return null;
+            }
+            if (literal instanceof BinaryString) {
+                return ((BinaryString) literal).toString();
+            }
+            return literal.toString();
+        }
+    }
+}
diff --git a/paimon-common/src/main/java/org/apache/paimon/fileindex/ngram/NgramFileIndexFactory.java b/paimon-common/src/main/java/org/apache/paimon/fileindex/ngram/NgramFileIndexFactory.java
new file mode 100644
index 000000000000..8d5113175bfe
--- /dev/null
+++ b/paimon-common/src/main/java/org/apache/paimon/fileindex/ngram/NgramFileIndexFactory.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.fileindex.ngram;
+
+import org.apache.paimon.fileindex.FileIndexer;
+import org.apache.paimon.fileindex.FileIndexerFactory;
+import org.apache.paimon.options.Options;
+import org.apache.paimon.types.CharType;
+import org.apache.paimon.types.DataType;
+import org.apache.paimon.types.VarCharType;
+
+/** Factory to create N-gram file index. */
+public class NgramFileIndexFactory implements FileIndexerFactory {
+
+    public static final String NGRAM_INDEX = "ngram";
+
+    @Override
+    public String identifier() {
+        return NGRAM_INDEX;
+    }
+
+    @Override
+    public FileIndexer create(DataType dataType, Options options) {
+        if (!isStringType(dataType)) {
+            throw new IllegalArgumentException(
+                    "N-gram index only supports string types (VARCHAR, CHAR), got: " + dataType);
+        }
+        return new NgramFileIndex(options);
+    }
+
+    private boolean isStringType(DataType dataType) {
+        return dataType instanceof VarCharType || dataType instanceof CharType;
+    }
+}
diff --git a/paimon-common/src/main/resources/META-INF/services/org.apache.paimon.fileindex.FileIndexerFactory b/paimon-common/src/main/resources/META-INF/services/org.apache.paimon.fileindex.FileIndexerFactory
index 5f8ed20221d4..2f519c8af418 100644
--- a/paimon-common/src/main/resources/META-INF/services/org.apache.paimon.fileindex.FileIndexerFactory
+++ b/paimon-common/src/main/resources/META-INF/services/org.apache.paimon.fileindex.FileIndexerFactory
@@ -17,3 +17,4 @@ org.apache.paimon.fileindex.bloomfilter.BloomFilterFileIndexFactory
 org.apache.paimon.fileindex.bitmap.BitmapFileIndexFactory
 org.apache.paimon.fileindex.bsi.BitSliceIndexBitmapFileIndexFactory
 org.apache.paimon.fileindex.rangebitmap.RangeBitmapFileIndexFactory
+org.apache.paimon.fileindex.ngram.NgramFileIndexFactory
diff --git a/paimon-common/src/test/java/org/apache/paimon/fileindex/ngram/NgramFileIndexBenchmarkTest.java b/paimon-common/src/test/java/org/apache/paimon/fileindex/ngram/NgramFileIndexBenchmarkTest.java
new file mode 100644
index 000000000000..2b8faca80ab4
--- /dev/null
+++ b/paimon-common/src/test/java/org/apache/paimon/fileindex/ngram/NgramFileIndexBenchmarkTest.java
@@ -0,0 +1,229 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.fileindex.ngram;
+
+import org.apache.paimon.data.BinaryString;
+import org.apache.paimon.fileindex.FileIndexReader;
+import org.apache.paimon.fileindex.FileIndexWriter;
+import org.apache.paimon.fs.ByteArraySeekableStream;
+import org.apache.paimon.options.Options;
+
+import org.junit.jupiter.api.Test;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/** Micro-benchmarks for {@link NgramFileIndex}; results are printed to stdout. */
+class NgramFileIndexBenchmarkTest {
+
+    @Test
+    void benchmarkIndexBuildTime() {
+        int recordCount = 100_000;
+        List<String> testData = generateTestData(recordCount);
+
+        // Warmup
+        buildIndex(testData.subList(0, 1000));
+
+        // Benchmark
+        long startTime = System.nanoTime();
+        byte[] indexBytes = buildIndex(testData);
+        long buildTimeMs = (System.nanoTime() - startTime) / 1_000_000;
+
+        System.out.println(
+                String.format(
+                        "Index Build: %d records -> %d bytes, time: %d ms (%.2f records/ms)",
+                        recordCount,
+                        indexBytes.length,
+                        buildTimeMs,
+                        (double) recordCount / Math.max(1, buildTimeMs)));
+        assertThat(indexBytes).isNotEmpty();
+    }
+
+    @Test
+    void benchmarkIndexSize() {
+        int[] recordCounts = {1_000, 10_000, 100_000};
+
+        System.out.println("Index Size Benchmark:");
+        System.out.println("Records\t\tIndex Size\tCompression");
+        for (int count : recordCounts) {
+            List<String> data = generateTestData(count);
+            byte[] indexBytes = buildIndex(data);
+            double compressionRatio = (double) indexBytes.length / (count * 20);
+            System.out.println(
+                    String.format(
+                            "%d\t\t%d bytes\t%.2f%%",
+                            count, indexBytes.length, compressionRatio * 100));
+        }
+    }
+
+    @Test
+    void benchmarkFilteringPerformance() {
+        int recordCount = 50_000;
+        List<String> testData = generateTestData(recordCount);
+        byte[] indexBytes = buildIndex(testData);
+
+        String[] queries = {"ap", "ba", "xy", "zz", "test", "app", "ban"};
+
+        System.out.println("\nFiltering Performance:");
+        System.out.println("Query\t\tResult\t\tTime(µs)");
+        for (String query : queries) {
+            // The measured time includes deserializing the index into a fresh reader.
+            long startTime = System.nanoTime();
+            FileIndexReader tempReader = createReader(indexBytes);
+            boolean remain =
+                    tempReader.visitStartsWith(null, BinaryString.fromString(query)).remain();
+            long timeUs = (System.nanoTime() - startTime) / 1_000;
+            System.out.println(
+                    String.format("%s\t\t%s\t\t%d", query, remain ? "REMAIN" : "SKIP", timeUs));
+        }
+    }
+
+    @Test
+    void benchmarkSkipRateVsCardinality() {
+        System.out.println("\nSkip Rate vs Data Cardinality:");
+        System.out.println("Scenario\t\t\tRecords\t\tSkip Rate");
+
+        // Scenario 1: Low cardinality (repeating values)
+        List<String> lowCard = new ArrayList<>();
+        for (int i = 0; i < 10_000; i++) {
+            lowCard.add("apple_" + (i % 100));
+        }
+        byte[] indexLowCard = buildIndex(lowCard);
+        int skipsLowCard = countSkips(indexLowCard, new String[] {"xyz", "def", "ghi"});
+        System.out.println(
+                String.format(
+                        "Low Cardinality (100 unique)\t10000\t\t%.1f%%",
+                        (double) skipsLowCard * 100 / 3));
+
+        // Scenario 2: High cardinality (unique values)
+        List<String> highCard = new ArrayList<>();
+        for (int i = 0; i < 10_000; i++) {
+            highCard.add("prefix_" + i);
+        }
+        byte[] indexHighCard = buildIndex(highCard);
+        int skipsHighCard = countSkips(indexHighCard, new String[] {"xyz", "def", "ghi"});
+        System.out.println(
+                String.format(
+                        "High Cardinality (10000 unique)\t10000\t\t%.1f%%",
+                        (double) skipsHighCard * 100 / 3));
+
+        // Scenario 3: Domain-specific (emails)
+        List<String> emails = new ArrayList<>();
+        String[] domains = {"gmail.com", "yahoo.com", "outlook.com", "example.org"};
+        for (int i = 0; i < 10_000; i++) {
+            emails.add("user_" + i + "@" + domains[i % domains.length]);
+        }
+        byte[] indexEmails = buildIndex(emails);
+        int skipsEmails =
+                countSkips(indexEmails, new String[] {"@qq", "@sina", "@qq.com", "@sina.com.cn"});
+        System.out.println(
+                String.format(
+                        "Email Domain Pattern\t\t10000\t\t%.1f%%", (double) skipsEmails * 100 / 4));
+    }
+
+    @Test
+    void benchmarkMemoryUsage() {
+        int recordCount = 100_000;
+        List<String> testData = generateTestData(recordCount);
+
+        long memBefore = Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory();
+        byte[] indexBytes = buildIndex(testData);
+        long memAfter = Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory();
+
+        System.out.println("\nMemory Usage:");
+        System.out.println(
+                String.format(
+                        "Records: %d, Index Size: %d bytes, Overhead: %d MB",
+                        recordCount, indexBytes.length, (memAfter - memBefore) / (1024 * 1024)));
+    }
+
+    @Test
+    void benchmarkQueryPatternVariations() {
+        List<String> data = new ArrayList<>();
+        for (int i = 0; i < 5_000; i++) {
+            data.add("application_service_" + i + "_prod_server_" + (i % 100));
+        }
+        byte[] indexBytes = buildIndex(data);
+
+        System.out.println("\nPattern Variation Skip Rates:");
+        System.out.println("Pattern\t\t\t\tSkip?");
+
+        String[] patterns = {
+            "app",
+            "application",
+            "service",
+            "prod",
+            "server",
+            "xyz",
+            "notexist",
+            "ap",
+            "appl",
+            "applicat"
+        };
+
+        for (String pattern : patterns) {
+            FileIndexReader reader = createReader(indexBytes);
+            boolean remain =
+                    reader.visitStartsWith(null, BinaryString.fromString(pattern)).remain();
+            System.out.println(String.format("%-30s\t%s", pattern, remain ? "REMAIN" : "SKIP"));
+        }
+    }
+
+    private List<String> generateTestData(int count) {
+        List<String> data = new ArrayList<>();
+        String[] prefixes = {"apple", "banana", "application", "test", "production", "service"};
+        String[] suffixes = {"_prod", "_test", "_dev", "_staging", "_backup", ""};
+
+        for (int i = 0; i < count; i++) {
+            String value = prefixes[i % prefixes.length] + "_" + i + suffixes[i % suffixes.length];
+            data.add(value);
+        }
+        return data;
+    }
+
+    private byte[] buildIndex(List<String> data) {
+        Options options = new Options();
+        NgramFileIndex index = new NgramFileIndex(options);
+        FileIndexWriter writer = index.createWriter();
+        for (String value : data) {
+            writer.write(BinaryString.fromString(value));
+        }
+        return writer.serializedBytes();
+    }
+
+    private FileIndexReader createReader(byte[] indexBytes) {
+        Options options = new Options();
+        NgramFileIndex index = new NgramFileIndex(options);
+        ByteArraySeekableStream stream = new ByteArraySeekableStream(indexBytes);
+        return index.createReader(stream, 0, indexBytes.length);
+    }
+
+    private int countSkips(byte[] indexBytes, String[] queries) {
+        int skips = 0;
+        for (String query : queries) {
+            FileIndexReader reader = createReader(indexBytes);
+            if (!reader.visitStartsWith(null, BinaryString.fromString(query)).remain()) {
+                skips++;
+            }
+        }
+        return skips;
+    }
+}
diff --git a/paimon-common/src/test/java/org/apache/paimon/fileindex/ngram/NgramFileIndexSimpleTest.java b/paimon-common/src/test/java/org/apache/paimon/fileindex/ngram/NgramFileIndexSimpleTest.java
new file mode 100644
index 000000000000..6405e97d22df
--- /dev/null
+++ b/paimon-common/src/test/java/org/apache/paimon/fileindex/ngram/NgramFileIndexSimpleTest.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.fileindex.ngram;
+
+import org.apache.paimon.data.BinaryString;
+import org.apache.paimon.fileindex.FileIndexReader;
+import org.apache.paimon.fileindex.FileIndexWriter;
+import org.apache.paimon.fs.ByteArraySeekableStream;
+import org.apache.paimon.options.Options;
+
+import org.junit.jupiter.api.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/** Checks which n-grams end up in a serialized {@link NgramFileIndex}. */
+class NgramFileIndexSimpleTest {
+
+    @Test
+    void testNgramGeneration() {
+        Options options = new Options();
+        NgramFileIndex index = new NgramFileIndex(options);
+        FileIndexWriter writer = index.createWriter();
+
+        writer.write(BinaryString.fromString("hello"));
+        writer.write(BinaryString.fromString("world"));
+
+        byte[] bytes = writer.serializedBytes();
+        FileIndexReader reader =
+                index.createReader(new ByteArraySeekableStream(bytes), 0, bytes.length);
+
+        // Every bigram of the written values must be found in the deserialized index.
+        String[] expectedNgrams = {"he", "el", "ll", "lo", "wo", "or", "rl", "ld"};
+        for (String ngram : expectedNgrams) {
+            assertThat(reader.visitContains(null, BinaryString.fromString(ngram)).remain())
+                    .isTrue();
+        }
+
+        // Bigrams that were never written let the file be skipped.
+        assertThat(reader.visitContains(null, BinaryString.fromString("xy")).remain()).isFalse();
+        assertThat(reader.visitContains(null, BinaryString.fromString("yz")).remain()).isFalse();
+    }
+}
diff --git a/paimon-common/src/test/java/org/apache/paimon/fileindex/ngram/NgramFileIndexTest.java b/paimon-common/src/test/java/org/apache/paimon/fileindex/ngram/NgramFileIndexTest.java
new file mode 100644
index 000000000000..2e9a1e61982e
--- /dev/null
+++ b/paimon-common/src/test/java/org/apache/paimon/fileindex/ngram/NgramFileIndexTest.java
@@ -0,0 +1,216 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.fileindex.ngram;
+
+import org.apache.paimon.data.BinaryString;
+import org.apache.paimon.fileindex.FileIndexReader;
+import org.apache.paimon.fileindex.FileIndexWriter;
+import org.apache.paimon.fs.ByteArraySeekableStream;
+import org.apache.paimon.options.Options;
+import org.apache.paimon.types.DataTypes;
+
+import org.junit.jupiter.api.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+
+/** Tests for {@link NgramFileIndex} and {@link NgramFileIndexFactory}. */
+class NgramFileIndexTest {
+
+    @Test
+    void testStartsWithShouldSkip() {
+        NgramFileIndex index = new NgramFileIndex(new Options());
+        FileIndexWriter writer = index.createWriter();
+        writer.write(BinaryString.fromString("hello"));
+        writer.write(BinaryString.fromString("world"));
+
+        byte[] bytes = writer.serializedBytes();
+        FileIndexReader reader = createReader(index, bytes);
+
+        assertThat(reader.visitStartsWith(null, BinaryString.fromString("he")).remain()).isTrue();
+        assertThat(reader.visitStartsWith(null, BinaryString.fromString("wo")).remain()).isTrue();
+        assertThat(reader.visitStartsWith(null, BinaryString.fromString("xyz")).remain()).isFalse();
+    }
+
+    @Test
+    void testEndsWithShouldSkip() {
+        NgramFileIndex index = new NgramFileIndex(new Options());
+        FileIndexWriter writer = index.createWriter();
+        writer.write(BinaryString.fromString("hello"));
+        writer.write(BinaryString.fromString("world"));
+
+        byte[] bytes = writer.serializedBytes();
+        FileIndexReader reader = createReader(index, bytes);
+
+        assertThat(reader.visitEndsWith(null, BinaryString.fromString("lo")).remain()).isTrue();
+        assertThat(reader.visitEndsWith(null, BinaryString.fromString("ld")).remain()).isTrue();
+        assertThat(reader.visitEndsWith(null, BinaryString.fromString("xyz")).remain()).isFalse();
+    }
+
+    @Test
+    void testContainsShouldSkip() {
+        NgramFileIndex index = new NgramFileIndex(new Options());
+        FileIndexWriter writer = index.createWriter();
+        writer.write(BinaryString.fromString("hello"));
+        writer.write(BinaryString.fromString("world"));
+
+        byte[] bytes = writer.serializedBytes();
+        FileIndexReader reader = createReader(index, bytes);
+
+        assertThat(reader.visitContains(null, BinaryString.fromString("ll")).remain()).isTrue();
+        assertThat(reader.visitContains(null, BinaryString.fromString("or")).remain()).isTrue();
+        assertThat(reader.visitContains(null, BinaryString.fromString("xyz")).remain()).isFalse();
+    }
+
+    @Test
+    void testEqualShouldWork() {
+        NgramFileIndex index = new NgramFileIndex(new Options());
+        FileIndexWriter writer = index.createWriter();
+        writer.write(BinaryString.fromString("hello"));
+
+        byte[] bytes = writer.serializedBytes();
+        FileIndexReader reader = createReader(index, bytes);
+
+        assertThat(reader.visitEqual(null, BinaryString.fromString("hello")).remain()).isTrue();
+        assertThat(reader.visitEqual(null, BinaryString.fromString("world")).remain()).isFalse();
+    }
+
+    @Test
+    void testShortPatternReturnsRemain() {
+        NgramFileIndex index = new NgramFileIndex(new Options());
+        FileIndexWriter writer = index.createWriter();
+        writer.write(BinaryString.fromString("hello"));
+
+        byte[] bytes = writer.serializedBytes();
+        FileIndexReader reader = createReader(index, bytes);
+
+        assertThat(reader.visitStartsWith(null, BinaryString.fromString("h")).remain()).isTrue();
+        assertThat(reader.visitStartsWith(null, BinaryString.fromString("a")).remain()).isTrue();
+    }
+
+    @Test
+    void testDifferentGramSize() {
+        Options options = new Options();
+        options.set("gram_size", "3");
+        NgramFileIndex index = new NgramFileIndex(options);
+        FileIndexWriter writer = index.createWriter();
+        writer.write(BinaryString.fromString("hello"));
+
+        byte[] bytes = writer.serializedBytes();
+        FileIndexReader reader = createReader(index, bytes);
+
+        assertThat(reader.visitStartsWith(null, BinaryString.fromString("hel")).remain()).isTrue();
+        assertThat(reader.visitStartsWith(null, BinaryString.fromString("xyz")).remain()).isFalse();
+        assertThat(reader.visitStartsWith(null, BinaryString.fromString("he")).remain()).isTrue();
+    }
+
+    @Test
+    void testSerializationDeserialization() {
+        NgramFileIndex index = new NgramFileIndex(new Options());
+        FileIndexWriter writer = index.createWriter();
+        writer.write(BinaryString.fromString("apple"));
+        writer.write(BinaryString.fromString("application"));
+        writer.write(BinaryString.fromString("banana"));
+
+        byte[] bytes = writer.serializedBytes();
+        FileIndexReader reader = createReader(index, bytes);
+
+        assertThat(reader.visitStartsWith(null, BinaryString.fromString("ap")).remain()).isTrue();
+        assertThat(reader.visitStartsWith(null, BinaryString.fromString("ba")).remain()).isTrue();
+        assertThat(reader.visitStartsWith(null, BinaryString.fromString("xyz")).remain()).isFalse();
+    }
+
+    @Test
+    void testNullValueHandling() {
+        NgramFileIndex index = new NgramFileIndex(new Options());
+        FileIndexWriter writer = index.createWriter();
+        writer.write(BinaryString.fromString("hello"));
+        writer.write(null);
+        writer.write(BinaryString.fromString("world"));
+
+        byte[] bytes = writer.serializedBytes();
+        FileIndexReader reader = createReader(index, bytes);
+
+        assertThat(reader.visitStartsWith(null, BinaryString.fromString("he")).remain()).isTrue();
+        assertThat(reader.visitStartsWith(null, BinaryString.fromString("xyz")).remain()).isFalse();
+    }
+
+    @Test
+    void testLikePattern() {
+        NgramFileIndex index = new NgramFileIndex(new Options());
+        FileIndexWriter writer = index.createWriter();
+        writer.write(BinaryString.fromString("hello"));
+        writer.write(BinaryString.fromString("world"));
+
+        byte[] bytes = writer.serializedBytes();
+        FileIndexReader reader = createReader(index, bytes);
+
+        assertThat(reader.visitLike(null, BinaryString.fromString("h%o")).remain()).isTrue();
+        assertThat(reader.visitLike(null, BinaryString.fromString("w%d")).remain()).isTrue();
+    }
+
+    @Test
+    void testLikeUnderscoreIsWildcard() {
+        NgramFileIndex index = new NgramFileIndex(new Options());
+        FileIndexWriter writer = index.createWriter();
+        writer.write(BinaryString.fromString("hello"));
+
+        byte[] bytes = writer.serializedBytes();
+        FileIndexReader reader = createReader(index, bytes);
+
+        assertThat(reader.visitLike(null, BinaryString.fromString("h_llo")).remain()).isTrue();
+        assertThat(reader.visitLike(null, BinaryString.fromString("he%xyz")).remain()).isFalse();
+    }
+
+    @Test
+    void testSupplementaryCharacters() {
+        NgramFileIndex index = new NgramFileIndex(new Options());
+        FileIndexWriter writer = index.createWriter();
+        writer.write(BinaryString.fromString("a\uD83D\uDE00b"));
+
+        byte[] bytes = writer.serializedBytes();
+        FileIndexReader reader = createReader(index, bytes);
+
+        String prefix = "a\uD83D\uDE00";
+        assertThat(reader.visitStartsWith(null, BinaryString.fromString(prefix)).remain())
+                .isTrue();
+    }
+
+    @Test
+    void testNonStringTypeThrowsException() {
+        NgramFileIndexFactory factory = new NgramFileIndexFactory();
+        assertThatThrownBy(() -> factory.create(DataTypes.INT(), new Options()))
+                .isInstanceOf(IllegalArgumentException.class)
+                .hasMessageContaining("N-gram index only supports string types");
+    }
+
+    @Test
+    void testInvalidGramSizeThrowsException() {
+        Options options = new Options();
+        options.set("gram_size", "0");
+        assertThatThrownBy(() -> new NgramFileIndex(options))
+                .isInstanceOf(IllegalArgumentException.class)
+                .hasMessageContaining("gram_size");
+    }
+
+    private FileIndexReader createReader(NgramFileIndex index, byte[] bytes) {
+        ByteArraySeekableStream stream = new ByteArraySeekableStream(bytes);
+        return index.createReader(stream, 0, bytes.length);
+    }
+}