Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,9 @@ GEM
racc (~> 1.4)
nokogiri (1.18.8-x86_64-linux-musl)
racc (~> 1.4)
numo-linalg (0.1.7)
numo-narray (>= 0.9.1.4)
numo-narray (0.9.2.1)
pp (0.6.2)
prettyprint
prettyprint (0.2.0)
Expand Down Expand Up @@ -156,6 +159,8 @@ DEPENDENCIES
concurrent-ruby (= 1.3.4)
dotenv-rails
neighbor
numo-linalg
numo-narray
ruby-openai

BUNDLED WITH
Expand Down
32 changes: 24 additions & 8 deletions app/jobs/issue_embedding_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -30,18 +30,34 @@ def embedding_needs_update?(embedding, new_content_hash)
# Generates a fresh embedding for +issue+ and persists it on +embedding+.
# Diff-artifact cleanup: the pre-refactor inline body (prepare content,
# generate vector, assign fields, save) was still rendered alongside the
# extracted _generate_and_save_embedding call, so the embedding was built
# twice per update; only the delegated call is kept.
#
# Raises: re-raises any StandardError after logging, so the job framework
# can retry/report the failure.
def update_embedding(issue, embedding, content_hash)
  embedding_service = EmbeddingService.new
  begin
    _generate_and_save_embedding(issue, embedding, content_hash, embedding_service)
    Rails.logger.info("=> [SEMANTIC_SEARCH] Successfully generated embedding for Issue ##{issue.id}")
  rescue StandardError => e
    Rails.logger.error("=> [SEMANTIC_SEARCH] Failed to generate embedding for Issue ##{issue.id}: #{e.message}")
    Rails.logger.error("=> [SEMANTIC_SEARCH] Error details: #{e.backtrace.join("\n")}")
    # Bare raise re-raises the current exception (idiomatic; was `raise e`).
    raise
  end
end

# Builds the issue's searchable content, generates and normalizes its
# embedding, and writes everything onto the given embedding record.
def _generate_and_save_embedding(issue, embedding, content_hash, embedding_service)
  issue_content = embedding_service.prepare_issue_content(issue)
  raw_vector, source_dimension = embedding_service.generate_embedding(issue_content)

  Rails.logger.info(
    "=> [SEMANTIC_SEARCH] Generated embedding with dimension: #{raw_vector.length}, " \
    "original: #{source_dimension}"
  )

  # Clamp to the stored column size so pgvector accepts the value.
  normalized_vector = DimensionReductionService.validate_vector_dimension(
    raw_vector,
    EmbeddingService::TARGET_DIMENSION
  )

  Rails.logger.info("=> [SEMANTIC_SEARCH] Validated embedding dimension: #{normalized_vector.length}")

  embedding.embedding_vector = normalized_vector
  embedding.content_hash = content_hash
  embedding.model_used = Setting.plugin_semantic_search["embedding_model"]
  embedding.original_dimension = source_dimension
  embedding.save!
end
end
70 changes: 70 additions & 0 deletions app/services/dimension_reduction_service.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Reduces high-dimensional embedding vectors down to a fixed target size.
#
# NOTE(review): despite the "pca_like" name this is NOT PCA — it is a
# deterministic coordinate-subsampling heuristic (top-|value| coordinates
# plus uniformly spaced ones). Names kept for interface compatibility;
# confirm the naming with the team.
class DimensionReductionService
  # Pads +vector+ up to +source_dimension+ with zeros, then subsamples it
  # down to +target_dimension+.
  # Fix: a nil +vector+ previously crashed (pad_vector passes nil through
  # and pca_like_reduction called nil.length); nil now returns nil, matching
  # validate_vector_dimension's convention.
  def self.reduce_dimensions(vector, source_dimension, target_dimension)
    return nil if vector.nil?

    padded_vector = pad_vector(vector, source_dimension)

    pca_like_reduction(padded_vector, target_dimension)
  end

  # Right-pads with 0.0 until the vector has +target_size+ entries.
  # Vectors already long enough — and nil — are returned unchanged.
  def self.pad_vector(vector, target_size)
    return vector if vector.nil? || vector.length >= target_size

    vector + Array.new(target_size - vector.length, 0.0)
  end

  # Forces a vector to exactly +target_dimension+ entries by truncating or
  # zero-padding; nil passes through as nil.
  def self.validate_vector_dimension(vector, target_dimension)
    return nil if vector.nil?

    if vector.length > target_dimension
      vector.first(target_dimension)
    elsif vector.length < target_dimension
      pad_vector(vector, target_dimension)
    else
      vector
    end
  end

  # Selects +target_dimension+ coordinates: ~1/5 with the largest absolute
  # values, the rest uniformly spaced across the vector, then fills any
  # shortfall with the smallest unused indices. Selected indices are sorted,
  # so the relative order of surviving coordinates is preserved.
  def self.pca_like_reduction(vector, target_dimension)
    return vector.first(target_dimension) if vector.length <= target_dimension

    top_indices = _select_top_indices(vector, target_dimension)
    uniform_indices = _select_uniform_indices(vector, target_dimension, top_indices)
    combined_indices = top_indices + uniform_indices
    filled_indices = _fill_remaining_indices(vector, target_dimension, combined_indices)

    selected_indices = (top_indices + uniform_indices + filled_indices).sort.uniq

    selected_indices.first(target_dimension).map { |index| vector[index] }
  end

  # Indices of the top ~target_dimension/5 coordinates by absolute
  # magnitude (at least one).
  def self._select_top_indices(vector, target_dimension)
    importance = vector.map(&:abs)
    top_dimensions_count = [target_dimension / 5, 1].max
    importance.each_with_index
              .sort_by { |val, _| -val }
              .first(top_dimensions_count)
              .map { |_, idx| idx }
  end

  # Uniformly spaced indices across the vector, skipping indices already
  # chosen as top indices — so the result may be shorter than requested
  # (the shortfall is handled by _fill_remaining_indices).
  def self._select_uniform_indices(vector, target_dimension, top_indices)
    remaining_count = target_dimension - top_indices.length
    return [] if remaining_count <= 0

    step_size = vector.length.to_f / remaining_count
    uniform_indices = (0...remaining_count).map { |i| (i * step_size).to_i }
    uniform_indices.reject { |idx| top_indices.include?(idx) }
  end

  # Fills the selection up to +target_dimension+ indices with the smallest
  # indices not yet used.
  def self._fill_remaining_indices(vector, target_dimension, existing_indices)
    filled_indices = []
    current_idx = 0
    needed_count = target_dimension - existing_indices.length

    while filled_indices.length < needed_count && current_idx < vector.length
      unless existing_indices.include?(current_idx) || filled_indices.include?(current_idx)
        filled_indices << current_idx
      end
      current_idx += 1
    end
    filled_indices
  end
end
36 changes: 31 additions & 5 deletions app/services/embedding_service.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,27 @@
class EmbeddingService
class EmbeddingError < StandardError; end

# Dimensions for the padding/reduction pipeline: raw provider embeddings are
# padded up to MAX_DIMENSION, then reduced to TARGET_DIMENSION for storage.
# Diff-artifact cleanup: the superseded `MAX_DIMENSION = 1536` assignment was
# removed — re-assigning the constant raises an "already initialized" warning.
MAX_DIMENSION = 5500
TARGET_DIMENSION = 2000

def initialize
@client = OpenAI::Client.new(access_token: api_key, uri_base: base_url)
end

# Returns [processed_vector, original_dimension] for +text+.
# Transport failures are wrapped in EmbeddingError so callers rescue a
# single exception type.
def generate_embedding(text)
  Rails.logger.info("Generating embedding for text: #{text}")
  fetched_vector = _fetch_embedding_from_api(text)
  source_dimension = fetched_vector.length
  Rails.logger.info("Generated embedding with original dimension: #{source_dimension}")
  [_process_embedding(fetched_vector), source_dimension]
rescue Faraday::Error => e
  Rails.logger.error("OpenAI API connection error: #{e.message}")
  raise EmbeddingError, "Connection error while generating embedding: #{e.message}"
end

def _fetch_embedding_from_api(text)
response = @client.embeddings(
parameters: {
model: embedding_model,
Expand All @@ -22,11 +35,24 @@ def generate_embedding(text)
Rails.logger.error("OpenAI API error: #{response['error']}")
raise EmbeddingError, "Failed to generate embedding: #{response['error']['message']}"
end
response.dig("data", 0, "embedding")
end

pad_embedding(response.dig("data", 0, "embedding"))
rescue Faraday::Error => e
Rails.logger.error("OpenAI API connection error: #{e.message}")
raise EmbeddingError, "Connection error while generating embedding: #{e.message}"
# Pads the raw vector, reduces it to TARGET_DIMENSION, and clamps the
# result to exactly TARGET_DIMENSION entries before storage.
def _process_embedding(embedding_vector)
  padded = pad_embedding(embedding_vector)
  reduced = DimensionReductionService.reduce_dimensions(padded, MAX_DIMENSION, TARGET_DIMENSION)
  Rails.logger.info("Reduced embedding to dimension: #{TARGET_DIMENSION}")
  DimensionReductionService.validate_vector_dimension(reduced, TARGET_DIMENSION)
end

def pad_embedding(vector)
Expand Down
44 changes: 44 additions & 0 deletions app/services/semantic_search_result_processor.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Normalizes raw semantic-search result rows: resolves display names for
# author and assignee, converts pgvector distance into a similarity score,
# and strips the intermediate columns used to compute them.
module SemanticSearchResultProcessor
  extend self

  # Transforms every raw result row and returns the new array of rows.
  def process_results(results)
    results.map { |row| _normalize_row(row) }
  end

  private

  # Applies all per-row transformations in order; returns the cleaned row.
  def _normalize_row(row)
    row = process_author_info(row)
    row = process_assignee_info(row)
    row = calculate_similarity_score(row)
    remove_temporary_fields(row)
  end

  # Author display name: "first last", falling back to login when blank.
  def process_author_info(result)
    full_name = [result["author_firstname"], result["author_lastname"]].join(" ").strip
    result["author_name"] = full_name.blank? ? result["author_login"] : full_name
    result
  end

  # Assignee display name, or nil when the issue is unassigned.
  def process_assignee_info(result)
    has_assignee =
      result["assigned_to_firstname"] || result["assigned_to_lastname"] || result["assigned_to_login"]
    result["assigned_to_name"] =
      if has_assignee
        full_name = [result["assigned_to_firstname"], result["assigned_to_lastname"]].join(" ").strip
        full_name.blank? ? result["assigned_to_login"] : full_name
      end
    result
  end

  # Maps an L2 distance (>= 0) onto a similarity score in (0, 1].
  def calculate_similarity_score(result)
    result["similarity_score"] = 1.0 / (1.0 + result["distance"].to_f)
    result
  end

  # Drops the helper columns that were only needed for name/score building.
  def remove_temporary_fields(result)
    %w[author_firstname author_lastname author_login assigned_to_firstname assigned_to_lastname assigned_to_login
       distance].each { |key| result.delete(key) }
    result
  end
end
99 changes: 55 additions & 44 deletions app/services/semantic_search_service.rb
Original file line number Diff line number Diff line change
@@ -1,23 +1,72 @@
require_relative "semantic_search_result_processor"

class SemanticSearchService
include SemanticSearchResultProcessor

# Builds the service with a fresh EmbeddingService used to embed queries.
def initialize
@embedding_service = EmbeddingService.new
end

# Embeds +query+, clamps the embedding to TARGET_DIMENSION, runs the
# nearest-neighbour SQL, post-processes the rows, and filters them by issue
# visibility for +user+.
# Diff-artifact cleanup: the superseded `def search(query, user, limit = 10)`
# signature, its old first line, and the old trailing
# `filter_by_visibility(processed_results, user)` call (which duplicated the
# new one below) were removed.
#
# @param query [String] free-text search query
# @param user  [User] user whose issue visibility applies
# @param limit [Integer] maximum rows fetched from the database
# @param _debug [Boolean] reserved; currently unused
# @return [Array<Hash>] processed, visibility-filtered result rows
def search(query, user, limit = 10, _debug: false)
  Rails.logger.info("Performing semantic search for query: #{query}")

  query_embedding, original_dimension = @embedding_service.generate_embedding(query)

  log_msg = "Query embedding generated with dim: #{query_embedding.length}, " \
            "original: #{original_dimension}"
  Rails.logger.info(log_msg)

  # Clamp to the stored column dimension so the <-> comparison cannot fail.
  query_embedding = DimensionReductionService.validate_vector_dimension(
    query_embedding,
    EmbeddingService::TARGET_DIMENSION
  )

  Rails.logger.info("Validated query embedding dimension: #{query_embedding.length}")

  results = fetch_raw_results(query_embedding, limit)
  processed_results = process_results(results)

  Rails.logger.info("Found #{processed_results.length} results before visibility filtering")

  filtered_results = filter_by_visibility(processed_results, user)

  Rails.logger.info("Returning #{filtered_results.length} visible results")

  filtered_results
end

private

# Executes the nearest-neighbour SQL and returns the raw result rows.
# On failure, logs the stored column type and the supplied vector length
# (the usual cause is a dimension mismatch) before re-raising.
def fetch_raw_results(query_embedding, limit)
  sql = build_search_sql(query_embedding, limit)

  # Fix: the redacted SQL was previously built with a bare `sql.gsub(...)`
  # whose return value was silently discarded — log the redacted form
  # (the intent of that statement) at debug level instead.
  Rails.logger.debug(sql.gsub(/ARRAY\[.*?\]::vector/, "ARRAY[...vector values...]::vector"))

  begin
    ActiveRecord::Base.connection.execute(sql)
  rescue StandardError
    _log_vector_dimension_debug(query_embedding)
    raise
  end
end

# Best-effort diagnostics for vector-dimension mismatches; never raises
# (its own failures are logged and swallowed, preserving the original error).
def _log_vector_dimension_debug(query_embedding)
  db_dim_query = <<~SQL.squish
    SELECT pg_catalog.format_type(atttypid, atttypmod-4)
    FROM pg_catalog.pg_attribute
    WHERE attrelid = 'issue_embeddings'::regclass AND attname = 'embedding_vector'
  SQL
  db_dimension = ActiveRecord::Base.connection.execute(db_dim_query).first["format_type"]

  Rails.logger.error("Database column type: #{db_dimension}")
  Rails.logger.error("Provided vector length: #{query_embedding.length}")
rescue StandardError => debug_error
  Rails.logger.error("Error getting debug info: #{debug_error.message}")
end

# rubocop:disable Metrics/MethodLength
def build_search_sql(query_embedding, limit)
vector_string = query_embedding.join(",")

<<-SQL
SELECT issue_embeddings.issue_id,
issues.subject,
Expand All @@ -35,7 +84,7 @@ def build_search_sql(query_embedding, limit)
assigned_users.firstname AS assigned_to_firstname,
assigned_users.lastname AS assigned_to_lastname,
assigned_users.login AS assigned_to_login,
issue_embeddings.embedding_vector <-> ARRAY[#{query_embedding.join(',')}]::vector AS distance
issue_embeddings.embedding_vector <-> ARRAY[#{vector_string}]::vector(#{EmbeddingService::TARGET_DIMENSION}) AS distance
FROM issue_embeddings
INNER JOIN issues ON issues.id = issue_embeddings.issue_id
INNER JOIN projects ON projects.id = issues.project_id
Expand All @@ -49,45 +98,7 @@ def build_search_sql(query_embedding, limit)
LIMIT #{limit}
SQL
end

# NOTE(review): diff artifact — the methods below are the deletion side of
# this change. They were extracted verbatim into
# SemanticSearchResultProcessor and should not remain in this class, or the
# mixed-in versions would be shadowed by these duplicates. Confirm removal
# before merge.
def process_results(results)
results.map do |result|
result = process_author_info(result)
result = process_assignee_info(result)
result = calculate_similarity_score(result)
remove_temporary_fields(result)
end
end

# (moved to SemanticSearchResultProcessor#process_author_info)
def process_author_info(result)
result["author_name"] = [result["author_firstname"], result["author_lastname"]].join(" ").strip
result["author_name"] = result["author_login"] if result["author_name"].blank?
result
end

# (moved to SemanticSearchResultProcessor#process_assignee_info)
def process_assignee_info(result)
if result["assigned_to_firstname"] || result["assigned_to_lastname"] || result["assigned_to_login"]
result["assigned_to_name"] = [result["assigned_to_firstname"], result["assigned_to_lastname"]].join(" ").strip
result["assigned_to_name"] = result["assigned_to_login"] if result["assigned_to_name"].blank?
else
result["assigned_to_name"] = nil
end
result
end

# (moved to SemanticSearchResultProcessor#calculate_similarity_score)
def calculate_similarity_score(result)
distance = result["distance"].to_f
result["similarity_score"] = 1.0 / (1.0 + distance)
result
end

# (moved to SemanticSearchResultProcessor#remove_temporary_fields)
def remove_temporary_fields(result)
%w[author_firstname author_lastname author_login assigned_to_firstname assigned_to_lastname assigned_to_login
distance].each do |key|
result.delete(key)
end
result
end
# rubocop:enable Metrics/MethodLength

def filter_by_visibility(processed_results, user)
issue_ids = processed_results.map { |r| r["issue_id"] }
Expand Down
7 changes: 7 additions & 0 deletions db/migrate/004_update_vector_dimension_for_pca.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Adds `original_dimension` so each stored embedding remembers the size of
# the raw provider vector before padding/reduction. The default of 1536
# matches the previous MAX_DIMENSION used for already-stored rows.
class UpdateVectorDimensionForPca < ActiveRecord::Migration[7.2]
  def up
    unless column_exists?(:issue_embeddings, :original_dimension)
      add_column :issue_embeddings, :original_dimension, :integer, default: 1536
    end
  end

  # Fix: the migration defined no +down+, making it irreversible; drop the
  # column again on rollback (guarded, mirroring +up+).
  def down
    if column_exists?(:issue_embeddings, :original_dimension)
      remove_column :issue_embeddings, :original_dimension
    end
  end
end
15 changes: 15 additions & 0 deletions db/migrate/005_add_pca_support_to_issue_embeddings.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Re-types issue_embeddings.embedding_vector to vector(2000)
# (EmbeddingService::TARGET_DIMENSION) and rebuilds the ivfflat index.
# pgvector cannot re-type a column whose rows hold vectors of a different
# dimension, so existing embeddings are cleared first and must be
# regenerated by the embedding job.
# NOTE(review): confirm the job re-embeds rows whose vector is NULL even
# when content_hash is unchanged.
class AddPcaSupportToIssueEmbeddings < ActiveRecord::Migration[7.2]
  def up
    execute "DROP INDEX IF EXISTS issue_embeddings_vector_idx"

    # Fix: ALTER TYPE fails if any row still stores a vector of the old
    # dimension — clear them so the migration succeeds on populated tables.
    execute "UPDATE issue_embeddings SET embedding_vector = NULL"
    execute "ALTER TABLE issue_embeddings ALTER COLUMN embedding_vector TYPE vector(2000)"

    execute "CREATE INDEX issue_embeddings_vector_idx ON issue_embeddings USING ivfflat (embedding_vector vector_l2_ops) WITH (lists = 100)"
  end

  # Fix: the original +down+ only rebuilt the index and left the column at
  # vector(2000), so rollback never reverted the schema. Restore the
  # previous column type (1536, per migration 004's default — TODO confirm
  # against the original create migration), clearing vectors first as in +up+.
  def down
    execute "DROP INDEX IF EXISTS issue_embeddings_vector_idx"

    execute "UPDATE issue_embeddings SET embedding_vector = NULL"
    execute "ALTER TABLE issue_embeddings ALTER COLUMN embedding_vector TYPE vector(1536)"

    execute "CREATE INDEX issue_embeddings_vector_idx ON issue_embeddings USING ivfflat (embedding_vector vector_l2_ops) WITH (lists = 100)"
  end
end
Loading