diff --git a/Gemfile.lock b/Gemfile.lock index 105fffd..857ccfb 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -93,6 +93,9 @@ GEM racc (~> 1.4) nokogiri (1.18.8-x86_64-linux-musl) racc (~> 1.4) + numo-linalg (0.1.7) + numo-narray (>= 0.9.1.4) + numo-narray (0.9.2.1) pp (0.6.2) prettyprint prettyprint (0.2.0) @@ -156,6 +159,8 @@ DEPENDENCIES concurrent-ruby (= 1.3.4) dotenv-rails neighbor + numo-linalg + numo-narray ruby-openai BUNDLED WITH diff --git a/app/jobs/issue_embedding_job.rb b/app/jobs/issue_embedding_job.rb index f1527fc..d623645 100644 --- a/app/jobs/issue_embedding_job.rb +++ b/app/jobs/issue_embedding_job.rb @@ -30,18 +30,34 @@ def embedding_needs_update?(embedding, new_content_hash) def update_embedding(issue, embedding, content_hash) embedding_service = EmbeddingService.new begin - content = embedding_service.prepare_issue_content(issue) - vector = embedding_service.generate_embedding(content) - - embedding.embedding_vector = vector - embedding.content_hash = content_hash - embedding.model_used = Setting.plugin_semantic_search["embedding_model"] - embedding.save! - + _generate_and_save_embedding(issue, embedding, content_hash, embedding_service) Rails.logger.info("=> [SEMANTIC_SEARCH] Successfully generated embedding for Issue ##{issue.id}") rescue StandardError => e Rails.logger.error("=> [SEMANTIC_SEARCH] Failed to generate embedding for Issue ##{issue.id}: #{e.message}") + Rails.logger.error("=> [SEMANTIC_SEARCH] Error details: #{e.backtrace.join("\n")}") raise e end end + + def _generate_and_save_embedding(issue, embedding, content_hash, embedding_service) + content = embedding_service.prepare_issue_content(issue) + vector, original_dimension = embedding_service.generate_embedding(content) + + log_msg = "=> [SEMANTIC_SEARCH] Generated embedding with dimension: #{vector.length}, " \ + "original: #{original_dimension}" + Rails.logger.info(log_msg) + + vector = DimensionReductionService.validate_vector_dimension( + vector, + EmbeddingService::TARGET_DIMENSION + ) + + Rails.logger.info("=> [SEMANTIC_SEARCH] Validated embedding dimension: #{vector.length}") + + embedding.embedding_vector = vector + embedding.content_hash = content_hash + embedding.model_used = Setting.plugin_semantic_search["embedding_model"] + embedding.original_dimension = original_dimension + embedding.save! + end end diff --git a/app/services/dimension_reduction_service.rb b/app/services/dimension_reduction_service.rb new file mode 100644 index 0000000..7c17c6c --- /dev/null +++ b/app/services/dimension_reduction_service.rb @@ -0,0 +1,70 @@ +class DimensionReductionService + def self.reduce_dimensions(vector, source_dimension, target_dimension) + padded_vector = pad_vector(vector, source_dimension) + + pca_like_reduction(padded_vector, target_dimension) + end + + def self.pad_vector(vector, target_size) + return vector if vector.nil? || vector.length >= target_size + + vector + Array.new(target_size - vector.length, 0.0) + end + + def self.validate_vector_dimension(vector, target_dimension) + return nil if vector.nil? + + if vector.length > target_dimension + vector.first(target_dimension) + elsif vector.length < target_dimension + pad_vector(vector, target_dimension) + else + vector + end + end + + def self.pca_like_reduction(vector, target_dimension) + return vector.first(target_dimension) if vector.length <= target_dimension + + top_indices = _select_top_indices(vector, target_dimension) + uniform_indices = _select_uniform_indices(vector, target_dimension, top_indices) + combined_indices = top_indices + uniform_indices + filled_indices = _fill_remaining_indices(vector, target_dimension, combined_indices) + + selected_indices = (top_indices + uniform_indices + filled_indices).sort.uniq + + selected_indices.first(target_dimension).map { |index| vector[index] } + end + + def self._select_top_indices(vector, target_dimension) + importance = vector.map(&:abs) + top_dimensions_count = [target_dimension / 5, 1].max + importance.each_with_index + .sort_by { |val, _| -val } + .first(top_dimensions_count) + .map { |_, idx| idx } + end + + def self._select_uniform_indices(vector, target_dimension, top_indices) + remaining_count = target_dimension - top_indices.length + return [] if remaining_count <= 0 + + step_size = vector.length.to_f / remaining_count + uniform_indices = (0...remaining_count).map { |i| (i * step_size).to_i } + uniform_indices.reject { |idx| top_indices.include?(idx) } + end + + def self._fill_remaining_indices(vector, target_dimension, existing_indices) + filled_indices = [] + current_idx = 0 + needed_count = target_dimension - existing_indices.length + + while filled_indices.length < needed_count && current_idx < vector.length + unless existing_indices.include?(current_idx) || filled_indices.include?(current_idx) + filled_indices << current_idx + end + current_idx += 1 + end + filled_indices + end +end diff --git a/app/services/embedding_service.rb b/app/services/embedding_service.rb index 40f7107..7c46d13 100644 --- a/app/services/embedding_service.rb +++ b/app/services/embedding_service.rb @@ -3,7 +3,8 @@ class EmbeddingService class EmbeddingError < StandardError; end - MAX_DIMENSION = 1536 + MAX_DIMENSION = 5500 + TARGET_DIMENSION = 2000 def initialize @client = OpenAI::Client.new(access_token: api_key, uri_base: base_url) @@ -11,6 +12,18 @@ def initialize def generate_embedding(text) Rails.logger.info("Generating embedding for text: #{text}") + raw_embedding = _fetch_embedding_from_api(text) + original_dimension = raw_embedding.length + Rails.logger.info("Generated embedding with original dimension: #{original_dimension}") + + processed_embedding = _process_embedding(raw_embedding) + [processed_embedding, original_dimension] + rescue Faraday::Error => e + Rails.logger.error("OpenAI API connection error: #{e.message}") + raise EmbeddingError, "Connection error while generating embedding: #{e.message}" + end + + def _fetch_embedding_from_api(text) response = @client.embeddings( parameters: { model: embedding_model, @@ -22,11 +35,24 @@ def generate_embedding(text) Rails.logger.error("OpenAI API error: #{response['error']}") raise EmbeddingError, "Failed to generate embedding: #{response['error']['message']}" end + response.dig("data", 0, "embedding") + end - pad_embedding(response.dig("data", 0, "embedding")) - rescue Faraday::Error => e - Rails.logger.error("OpenAI API connection error: #{e.message}") - raise EmbeddingError, "Connection error while generating embedding: #{e.message}" + def _process_embedding(embedding_vector) + padded_embedding = pad_embedding(embedding_vector) + + reduced_embedding = DimensionReductionService.reduce_dimensions( + padded_embedding, + MAX_DIMENSION, + TARGET_DIMENSION + ) + + Rails.logger.info("Reduced embedding to dimension: #{TARGET_DIMENSION}") + + DimensionReductionService.validate_vector_dimension( + reduced_embedding, + TARGET_DIMENSION + ) end def pad_embedding(vector) diff --git a/app/services/semantic_search_result_processor.rb b/app/services/semantic_search_result_processor.rb new file mode 100644 index 0000000..3fea5d5 --- /dev/null +++ b/app/services/semantic_search_result_processor.rb @@ -0,0 +1,44 @@ +module SemanticSearchResultProcessor + extend self + + def process_results(results) + results.map do |result| + result = process_author_info(result) + result = process_assignee_info(result) + result = calculate_similarity_score(result) + remove_temporary_fields(result) + end + end + + private + + def process_author_info(result) + result["author_name"] = [result["author_firstname"], result["author_lastname"]].join(" ").strip + result["author_name"] = result["author_login"] if result["author_name"].blank? + result + end + + def process_assignee_info(result) + if result["assigned_to_firstname"] || result["assigned_to_lastname"] || result["assigned_to_login"] + result["assigned_to_name"] = [result["assigned_to_firstname"], result["assigned_to_lastname"]].join(" ").strip + result["assigned_to_name"] = result["assigned_to_login"] if result["assigned_to_name"].blank? + else + result["assigned_to_name"] = nil + end + result + end + + def calculate_similarity_score(result) + distance = result["distance"].to_f + result["similarity_score"] = 1.0 / (1.0 + distance) + result + end + + def remove_temporary_fields(result) + %w[author_firstname author_lastname author_login assigned_to_firstname assigned_to_lastname assigned_to_login + distance].each do |key| + result.delete(key) + end + result + end +end diff --git a/app/services/semantic_search_service.rb b/app/services/semantic_search_service.rb index 1181e2c..1f77653 100644 --- a/app/services/semantic_search_service.rb +++ b/app/services/semantic_search_service.rb @@ -1,23 +1,72 @@ +require_relative "semantic_search_result_processor" + class SemanticSearchService + include SemanticSearchResultProcessor + def initialize @embedding_service = EmbeddingService.new end - def search(query, user, limit = 10) - query_embedding = @embedding_service.generate_embedding(query) + def search(query, user, limit = 10, _debug: false) + Rails.logger.info("Performing semantic search for query: #{query}") + + query_embedding, original_dimension = @embedding_service.generate_embedding(query) + + log_msg = "Query embedding generated with dim: #{query_embedding.length}, " \ + "original: #{original_dimension}" + Rails.logger.info(log_msg) + + query_embedding = DimensionReductionService.validate_vector_dimension( + query_embedding, + EmbeddingService::TARGET_DIMENSION + ) + + Rails.logger.info("Validated query embedding dimension: #{query_embedding.length}") + results = fetch_raw_results(query_embedding, limit) processed_results = process_results(results) - filter_by_visibility(processed_results, user) + + Rails.logger.info("Found #{processed_results.length} results before visibility filtering") + + filtered_results = filter_by_visibility(processed_results, user) + + Rails.logger.info("Returning #{filtered_results.length} visible results") + + filtered_results end private def fetch_raw_results(query_embedding, limit) sql = build_search_sql(query_embedding, limit) - ActiveRecord::Base.connection.execute(sql) + + sql.gsub(/ARRAY\[.*?\]::vector/, "ARRAY[...vector values...]::vector") + + begin + ActiveRecord::Base.connection.execute(sql) + rescue StandardError + begin + db_dim_query = <<~SQL.squish + SELECT pg_catalog.format_type(atttypid, atttypmod-4) + FROM pg_catalog.pg_attribute + WHERE attrelid = 'issue_embeddings'::regclass AND attname = 'embedding_vector' + SQL + db_dimension = ActiveRecord::Base.connection.execute(db_dim_query).first["format_type"] + + Rails.logger.error("Database column type: #{db_dimension}") + Rails.logger.error("Provided vector length: #{query_embedding.length}") + rescue StandardError => debug_error + Rails.logger.error("Error getting debug info: #{debug_error.message}") + end + + raise + end end + # rubocop:disable Metrics/MethodLength def build_search_sql(query_embedding, limit) + vector_string = query_embedding.join(",") + <<-SQL SELECT issue_embeddings.issue_id, issues.subject, @@ -35,7 +84,7 @@ def build_search_sql(query_embedding, limit) assigned_users.firstname AS assigned_to_firstname, assigned_users.lastname AS assigned_to_lastname, assigned_users.login AS assigned_to_login, - issue_embeddings.embedding_vector <-> ARRAY[#{query_embedding.join(',')}]::vector AS distance + issue_embeddings.embedding_vector <-> ARRAY[#{vector_string}]::vector(#{EmbeddingService::TARGET_DIMENSION}) AS distance FROM issue_embeddings INNER JOIN issues ON issues.id = issue_embeddings.issue_id INNER JOIN projects ON projects.id = issues.project_id @@ -49,45 +98,7 @@ def build_search_sql(query_embedding, limit) LIMIT #{limit} SQL end - - def process_results(results) - results.map do |result| - result = process_author_info(result) - result = process_assignee_info(result) - result = calculate_similarity_score(result) - remove_temporary_fields(result) - end - end - - def process_author_info(result) - result["author_name"] = [result["author_firstname"], result["author_lastname"]].join(" ").strip - result["author_name"] = result["author_login"] if result["author_name"].blank? - result - end - - def process_assignee_info(result) - if result["assigned_to_firstname"] || result["assigned_to_lastname"] || result["assigned_to_login"] - result["assigned_to_name"] = [result["assigned_to_firstname"], result["assigned_to_lastname"]].join(" ").strip - result["assigned_to_name"] = result["assigned_to_login"] if result["assigned_to_name"].blank? - else - result["assigned_to_name"] = nil - end - result - end - - def calculate_similarity_score(result) - distance = result["distance"].to_f - result["similarity_score"] = 1.0 / (1.0 + distance) - result - end - - def remove_temporary_fields(result) - %w[author_firstname author_lastname author_login assigned_to_firstname assigned_to_lastname assigned_to_login - distance].each do |key| - result.delete(key) - end - result - end + # rubocop:enable Metrics/MethodLength def filter_by_visibility(processed_results, user) issue_ids = processed_results.map { |r| r["issue_id"] } diff --git a/db/migrate/004_update_vector_dimension_for_pca.rb b/db/migrate/004_update_vector_dimension_for_pca.rb new file mode 100644 index 0000000..21a8865 --- /dev/null +++ b/db/migrate/004_update_vector_dimension_for_pca.rb @@ -0,0 +1,7 @@ +class UpdateVectorDimensionForPca < ActiveRecord::Migration[7.2] + def up + unless column_exists?(:issue_embeddings, :original_dimension) + add_column :issue_embeddings, :original_dimension, :integer, default: 1536 + end + end +end diff --git a/db/migrate/005_add_pca_support_to_issue_embeddings.rb b/db/migrate/005_add_pca_support_to_issue_embeddings.rb new file mode 100644 index 0000000..c9b2890 --- /dev/null +++ b/db/migrate/005_add_pca_support_to_issue_embeddings.rb @@ -0,0 +1,15 @@ +class AddPcaSupportToIssueEmbeddings < ActiveRecord::Migration[7.2] + def up + execute "DROP INDEX IF EXISTS issue_embeddings_vector_idx" + + execute "ALTER TABLE issue_embeddings ALTER COLUMN embedding_vector TYPE vector(2000)" + + execute "CREATE INDEX issue_embeddings_vector_idx ON issue_embeddings USING ivfflat (embedding_vector vector_l2_ops) WITH (lists = 100)" + end + + def down + execute "DROP INDEX IF EXISTS issue_embeddings_vector_idx" + + execute "CREATE INDEX issue_embeddings_vector_idx ON issue_embeddings USING ivfflat (embedding_vector vector_l2_ops) WITH (lists = 100)" + end +end diff --git a/db/migrate/006_update_existing_embeddings_for_pca.rb b/db/migrate/006_update_existing_embeddings_for_pca.rb new file mode 100644 index 0000000..cddb645 --- /dev/null +++ b/db/migrate/006_update_existing_embeddings_for_pca.rb @@ -0,0 +1,12 @@ +class UpdateExistingEmbeddingsForPca < ActiveRecord::Migration[7.2] + def up + return unless table_exists?(:issue_embeddings) && + column_exists?(:issue_embeddings, :embedding_vector) + + unless column_exists?(:issue_embeddings, :original_dimension) + add_column :issue_embeddings, :original_dimension, :integer, default: 1536 + end + + execute "SELECT id, issue_id FROM issue_embeddings WHERE embedding_vector IS NOT NULL" + end +end diff --git a/scripts/embedding_viewer.py b/scripts/embedding_viewer.py index 291ff58..5b58552 100644 --- a/scripts/embedding_viewer.py +++ b/scripts/embedding_viewer.py @@ -37,6 +37,7 @@ def fetch_embeddings(conn, limit=1000, project_id=None, issue_id=None): ie.id, ie.issue_id, ie.model_used, + ie.original_dimension, i.subject, p.name as project_name, t.name as tracker_name, diff --git a/test/application_system_test_case.rb b/test/application_system_test_case.rb index e7716ba..ece737b 100644 --- a/test/application_system_test_case.rb +++ b/test/application_system_test_case.rb @@ -14,13 +14,13 @@ def log_user(login, password) fill_in 'username', with: login fill_in 'password', with: password click_button 'Login', wait: 5 - assert_selector '#loggedas' + assert_selector '#loggedas', wait: 5 end def logout if has_link?(class: 'logout') - click_link(class: 'logout') + click_link(class: 'logout', wait: 5) end - assert_no_selector '#loggedas' + assert_no_selector '#loggedas', wait: 5 end end diff --git a/test/integration/semantic_search_test.rb b/test/integration/semantic_search_test.rb index 316836c..e2609cb 100644 --- a/test/integration/semantic_search_test.rb +++ b/test/integration/semantic_search_test.rb @@ -13,7 +13,7 @@ def setup @issue = Issue.find(1) @embedding = IssueEmbedding.create!( issue: @issue, - embedding_vector: [0.1] * 1536, + embedding_vector: [0.1] * 2000, content_hash: 'test_hash', model_used: 'text-embedding-ada-002' ) diff --git a/test/system/semantic_search_system_test.rb b/test/system/semantic_search_system_test.rb index 781514a..2c805a1 100644 --- a/test/system/semantic_search_system_test.rb +++ b/test/system/semantic_search_system_test.rb @@ -23,12 +23,12 @@ def setup @embedding = IssueEmbedding.create!( issue: @issue, - embedding_vector: [0.1] * 1536, + embedding_vector: [0.1] * 2000, content_hash: 'test_hash', model_used: 'text-embedding-ada-002' ) - EmbeddingService.any_instance.stubs(:generate_embedding).returns([0.1] * 1536) + EmbeddingService.any_instance.stubs(:generate_embedding).returns([0.1] * 2000) mock_result = [{ "issue_id" => @issue.id, diff --git a/test/test_helper.rb b/test/test_helper.rb index 39462d6..5468ea1 100644 --- a/test/test_helper.rb +++ b/test/test_helper.rb @@ -10,7 +10,13 @@ def initialize end def generate_embedding(text) - Array.new(1536) { 0.1 } + vector = Array.new(1536) { 0.1 } + reduced_vector = DimensionReductionService.reduce_dimensions( + DimensionReductionService.pad_vector(vector, 5500), + 5500, + 2000 + ) + return reduced_vector, 1536 end def prepare_issue_content(issue) diff --git a/test/unit/dimension_reduction_service_test.rb b/test/unit/dimension_reduction_service_test.rb new file mode 100644 index 0000000..48e5aba --- /dev/null +++ b/test/unit/dimension_reduction_service_test.rb @@ -0,0 +1,61 @@ +require File.expand_path('../../test_helper', __FILE__) + +class DimensionReductionServiceTest < ActiveSupport::TestCase + def test_reduce_dimensions + vector = Array.new(3000) { rand } + source_dimension = 5500 + target_dimension = 2000 + + padded_vector = DimensionReductionService.pad_vector(vector, source_dimension) + assert_equal source_dimension, padded_vector.length + + reduced_vector = DimensionReductionService.reduce_dimensions(padded_vector, source_dimension, target_dimension) + assert_equal target_dimension, reduced_vector.length + end + + def test_pca_like_reduction + vector_size = 5500 + test_vector = Array.new(vector_size, 0.1) + important_positions = [10, 100, 1000, 2000, 3000, 4000, 5000] + important_positions.each do |pos| + test_vector[pos] = 0.9 + end + + target_dimension = 20 + reduced_vector = DimensionReductionService.send(:pca_like_reduction, test_vector, target_dimension) + + assert_equal target_dimension, reduced_vector.length + + top_dimensions_count = [target_dimension / 5, 1].max + + preserved_important = reduced_vector.count { |val| val.abs > 0.8 } + + assert preserved_important > 0, "Should preserve at least one important dimension" + end + + def test_pad_vector + vector = Array.new(100) { 0.5 } + target_size = 200 + + padded_vector = DimensionReductionService.pad_vector(vector, target_size) + assert_equal target_size, padded_vector.length + + vector.each_with_index do |val, i| + assert_equal val, padded_vector[i] + end + + (vector.length...target_size).each do |i| + assert_equal 0.0, padded_vector[i] + end + end + + def test_pad_vector_no_padding_needed + vector = Array.new(100) { 0.5 } + + padded_vector = DimensionReductionService.pad_vector(vector, 100) + assert_equal vector, padded_vector + + padded_vector = DimensionReductionService.pad_vector(vector, 50) + assert_equal vector, padded_vector + end +end diff --git a/test/unit/embedding_service_test.rb b/test/unit/embedding_service_test.rb index ac0dc1a..6441c14 100644 --- a/test/unit/embedding_service_test.rb +++ b/test/unit/embedding_service_test.rb @@ -48,8 +48,9 @@ def test_generate_embedding } ).returns(mock_response) - result = @service.generate_embedding("Test text") - assert_equal mock_embedding, result + result, original_dimension = @service.generate_embedding("Test text") + assert_equal 2000, result.length + assert_equal 1536, original_dimension end def test_generate_embedding_handles_error_response @@ -77,6 +78,16 @@ def test_generate_embedding_handles_network_error end end + def test_pad_embedding + vector = Array.new(1000) { 0.1 } + result = @service.pad_embedding(vector) + assert_equal EmbeddingService::MAX_DIMENSION, result.length + + vector = Array.new(EmbeddingService::MAX_DIMENSION) { 0.1 } + result = @service.pad_embedding(vector) + assert_equal EmbeddingService::MAX_DIMENSION, result.length + end + def test_prepare_issue_content issue = Issue.find(1) diff --git a/test/unit/issue_embedding_job_test.rb b/test/unit/issue_embedding_job_test.rb index 480e386..f7319de 100644 --- a/test/unit/issue_embedding_job_test.rb +++ b/test/unit/issue_embedding_job_test.rb @@ -24,6 +24,7 @@ def test_job_creates_embedding_when_enabled embedding = IssueEmbedding.find_by(issue_id: @issue.id) assert_not_nil embedding assert_equal @issue.id, embedding.issue_id + assert_equal 1536, embedding.original_dimension end def test_job_does_nothing_when_disabled @@ -48,9 +49,10 @@ def test_job_does_not_update_unchanged_embedding content_hash = IssueEmbedding.calculate_content_hash(@issue) original_embedding = IssueEmbedding.create!( issue_id: @issue.id, - embedding_vector: [0.1] * 1536, + embedding_vector: Array.new(2000) { 0.1 }, content_hash: content_hash, - model_used: 'text-embedding-ada-002' + model_used: 'text-embedding-ada-002', + original_dimension: 1536 ) job = IssueEmbeddingJob.new diff --git a/test/unit/issue_embedding_test.rb b/test/unit/issue_embedding_test.rb index 5c4f4e7..117ae0b 100644 --- a/test/unit/issue_embedding_test.rb +++ b/test/unit/issue_embedding_test.rb @@ -7,7 +7,7 @@ def setup @issue = Issue.find(1) @embedding = IssueEmbedding.new( issue: @issue, - embedding_vector: [0.1] * 1536, + embedding_vector: [0.1] * 2000, content_hash: 'test_hash', model_used: 'text-embedding-ada-002' ) @@ -29,7 +29,7 @@ def test_validations assert_not @embedding.valid? assert_includes @embedding.errors[:embedding_vector], 'cannot be blank' - @embedding.embedding_vector = [0.1] * 1536 + @embedding.embedding_vector = [0.1] * 2000 @embedding.content_hash = nil assert_not @embedding.valid? assert_includes @embedding.errors[:content_hash], 'cannot be blank' @@ -93,7 +93,7 @@ def test_needs_update current_hash = IssueEmbedding.calculate_content_hash(issue) embedding = IssueEmbedding.create!( issue: issue, - embedding_vector: [0.1] * 1536, + embedding_vector: [0.1] * 2000, content_hash: current_hash, model_used: 'text-embedding-ada-002' ) diff --git a/test/unit/semantic_search_service_test.rb b/test/unit/semantic_search_service_test.rb index 1165ace..4de261a 100644 --- a/test/unit/semantic_search_service_test.rb +++ b/test/unit/semantic_search_service_test.rb @@ -10,18 +10,20 @@ def setup @service = SemanticSearchService.new @user = User.find(1) @query = "test search query" - @query_embedding = Array.new(1536) { rand } + @query_embedding = Array.new(2000) { rand } + @original_dimension = 1536 end def test_search - @mock_embedding_service.expects(:generate_embedding).with(@query).returns(@query_embedding) + @mock_embedding_service.expects(:generate_embedding).with(@query).returns([@query_embedding, @original_dimension]) issue = Issue.find(1) embedding = IssueEmbedding.new( issue: issue, - embedding_vector: Array.new(1536) { rand }, + embedding_vector: Array.new(2000) { rand }, content_hash: 'test_hash', - model_used: 'text-embedding-ada-002' + model_used: 'text-embedding-ada-002', + original_dimension: 1536 ) embedding.save! @@ -72,24 +74,26 @@ def test_search end def test_filter_by_visibility - @mock_embedding_service.expects(:generate_embedding).with(@query).returns(@query_embedding) + @mock_embedding_service.expects(:generate_embedding).with(@query).returns([@query_embedding, @original_dimension]) visible_issue = Issue.find(1) invisible_issue = Issue.find(2) visible_embedding = IssueEmbedding.new( issue: visible_issue, - embedding_vector: Array.new(1536) { rand }, + embedding_vector: Array.new(2000) { rand }, content_hash: 'visible_hash', - model_used: 'text-embedding-ada-002' + model_used: 'text-embedding-ada-002', + original_dimension: 1536 ) visible_embedding.save! invisible_embedding = IssueEmbedding.new( issue: invisible_issue, - embedding_vector: Array.new(1536) { rand }, + embedding_vector: Array.new(2000) { rand }, content_hash: 'invisible_hash', - model_used: 'text-embedding-ada-002' + model_used: 'text-embedding-ada-002', + original_dimension: 1536 ) invisible_embedding.save! @@ -149,7 +153,7 @@ def test_filter_by_visibility end def test_search_with_empty_results - @mock_embedding_service.expects(:generate_embedding).with(@query).returns(@query_embedding) + @mock_embedding_service.expects(:generate_embedding).with(@query).returns([@query_embedding, @original_dimension]) ActiveRecord::Base.connection.stubs(:execute).returns([]) @@ -160,7 +164,7 @@ def test_search_with_empty_results end def test_search_with_limit - @mock_embedding_service.expects(:generate_embedding).with(@query).returns(@query_embedding) + @mock_embedding_service.expects(:generate_embedding).with(@query).returns([@query_embedding, @original_dimension]) custom_limit = 5 @service.expects(:build_search_sql).with(@query_embedding, custom_limit).returns("SELECT 1") @@ -180,7 +184,7 @@ def test_search_handles_embedding_error end def test_search_handles_database_error - @mock_embedding_service.expects(:generate_embedding).with(@query).returns(@query_embedding) + @mock_embedding_service.expects(:generate_embedding).with(@query).returns([@query_embedding, @original_dimension]) @service.expects(:build_search_sql).with(@query_embedding, 10).returns("SELECT 1") ActiveRecord::Base.connection.stubs(:execute)