diff --git a/app/forms/concerns/hyrax/redirects_field_behavior.rb b/app/forms/concerns/hyrax/redirects_field_behavior.rb index a43662ea64..4112776f16 100644 --- a/app/forms/concerns/hyrax/redirects_field_behavior.rb +++ b/app/forms/concerns/hyrax/redirects_field_behavior.rb @@ -53,8 +53,9 @@ def deserialize!(params) # Builds plain hashes (the persisted shape) from the submitted # `redirects_attributes` payload. Drops rows marked for destruction - # or with a blank path. Normalizes paths to the canonical form - # stored in the redirects table. + # or with a blank path. Normalizes paths up-front so the validator + # sees canonical form (so DSpace-style pasted URLs validate cleanly). + # or with a blank path. def redirects_attributes_populator(fragment:, **_options) return unless respond_to?(:redirects) return unless Hyrax.config.redirects_active? diff --git a/app/indexers/hyrax/indexers/redirects_indexer.rb b/app/indexers/hyrax/indexers/redirects_indexer.rb index d4191a8552..9d0036cf6b 100644 --- a/app/indexers/hyrax/indexers/redirects_indexer.rb +++ b/app/indexers/hyrax/indexers/redirects_indexer.rb @@ -17,8 +17,9 @@ def to_solr(*args) next document unless Hyrax.config.redirects_active? next document unless resource.respond_to?(:redirects) # Valkyrie's JSONValueMapper symbolizes hash keys on read; accept either. + # Paths are normalized at write time by Hyrax::RedirectsNormalization. document['redirects_path_ssim'] = Array(resource.redirects) - .map { |entry| Hyrax::RedirectPathNormalizer.call(entry['path'] || entry[:path]) } + .map { |entry| entry['path'] || entry[:path] } .reject(&:blank?) end end diff --git a/app/models/concerns/hyrax/redirects_normalization.rb b/app/models/concerns/hyrax/redirects_normalization.rb new file mode 100644 index 0000000000..9e1c9ed770 --- /dev/null +++ b/app/models/concerns/hyrax/redirects_normalization.rb @@ -0,0 +1,38 @@ +# frozen_string_literal: true + +module Hyrax + # Normalizes redirect path entries on assignment so every write path — + # form submissions, console writes, importers, change-set saves — + # produces canonical data. Read-side consumers can trust the persisted + # shape and skip their own normalization. + # + # Included next to Hyrax::Schema(:redirects) on Hyrax::Work and + # Hyrax::PcdmCollection. The override sits on Valkyrie's set_value + # primitive so it fires under both flex modes (the simple schema's + # generated setter and the m3 singleton-class setter both route through + # set_value). + # + # See documentation/redirects.md. + module RedirectsNormalization + extend ActiveSupport::Concern + + def set_value(key, value) + value = normalize_redirects(value) if key.to_sym == :redirects + super + end + + private + + def normalize_redirects(value) + Array(value).map { |entry| normalize_entry(entry) } + end + + def normalize_entry(entry) + return entry unless entry.is_a?(Hash) + normalized = entry.dup + path_key = normalized.key?(:path) ? :path : 'path' + normalized[path_key] = Hyrax::RedirectPathNormalizer.call(normalized[path_key]) + normalized + end + end +end diff --git a/app/models/hyrax/pcdm_collection.rb b/app/models/hyrax/pcdm_collection.rb index 124d076e90..b83fb8bfa8 100644 --- a/app/models/hyrax/pcdm_collection.rb +++ b/app/models/hyrax/pcdm_collection.rb @@ -43,6 +43,7 @@ class PcdmCollection < Hyrax::Resource include Hyrax::Schema(:core_metadata) if Hyrax.config.collection_include_metadata? # See documentation/redirects.md for the redirects feature. include Hyrax::Schema(:redirects) if Hyrax.config.collection_include_metadata? && Hyrax.config.redirects_enabled? + include Hyrax::RedirectsNormalization if Hyrax.config.redirects_enabled? attribute :collection_type_gid, Valkyrie::Types::String attribute :member_ids, Valkyrie::Types::Array.of(Valkyrie::Types::ID).meta(ordered: true) diff --git a/app/models/hyrax/work.rb b/app/models/hyrax/work.rb index 147bd3d266..bd833ed10a 100644 --- a/app/models/hyrax/work.rb +++ b/app/models/hyrax/work.rb @@ -97,6 +97,7 @@ class Work < Hyrax::Resource include Hyrax::Schema(:core_metadata) if Hyrax.config.work_default_metadata # See documentation/redirects.md for the redirects feature. include Hyrax::Schema(:redirects) if Hyrax.config.work_default_metadata && Hyrax.config.redirects_enabled? + include Hyrax::RedirectsNormalization if Hyrax.config.redirects_enabled? attribute :admin_set_id, Valkyrie::Types::ID attribute :member_ids, Valkyrie::Types::Array.of(Valkyrie::Types::ID).meta(ordered: true) diff --git a/documentation/redirects.md b/documentation/redirects.md index 7930784efb..0afb4d7892 100644 --- a/documentation/redirects.md +++ b/documentation/redirects.md @@ -124,21 +124,22 @@ When the config is off, the loader filters `redirects.yaml` out of the schema se ## Path normalization -Every redirect path in Hyrax — whether typed into a form, looked up by the resolver, written to the redirects table, or queried by the validator — passes through `Hyrax::RedirectPathNormalizer.call`. This is the single source of truth for "what does this path look like on disk?". Normalization rules: +Every redirect path in Hyrax passes through `Hyrax::RedirectPathNormalizer.call`. This is the single source of truth for "what does this path look like on disk?". Normalization rules: 1. If the input parses as a URL with a scheme and host (e.g. `https://old.example.edu/handle/12345/678`), keep only the path component. 2. Strip query strings (`?utm_source=foo`) and fragments (`#section`). 3. Ensure a leading slash (`handle/123` → `/handle/123`). 4. Strip trailing slashes (`/handle/123/` → `/handle/123`), with the exception that the bare path `/` is preserved. -The normalizer is idempotent — `normalize(normalize(x)) == normalize(x)` — and is wired in at four call sites so they all agree on the canonical form: +The normalizer is idempotent — `normalize(normalize(x)) == normalize(x)`. Normalization happens in two distinct contexts: -- `Hyrax::RedirectsFieldBehavior#redirects_attributes_populator` normalizes each entry's path on form submission, before validation. The normalized form is what the validator sees and what the resource persists. +**On write (resource layer).** `Hyrax::RedirectsNormalization` is included on `Hyrax::Work` and `Hyrax::PcdmCollection` and overrides `set_value` so any assignment to the `redirects` attribute — form save, console write, importer, change-set apply — normalizes each entry's path before persistence. Read-side consumers (`Hyrax::Indexers::RedirectsIndexer`, `Hyrax::Transactions::Steps::SyncRedirectPaths`) trust the persisted shape and do not re-normalize. + +**On input (boundary layer).** Three boundary points canonicalize input from outside the resource before consulting the persisted state: + +- `Hyrax::RedirectsFieldBehavior#redirects_attributes_populator` normalizes form entries before the form-level validator runs, so a user pasting a full URL (`https://old.example.edu/handle/123?utm=email`) is forgivingly accepted and the validator sees the canonical form. - `Hyrax::RedirectsController#show` (the resolver) normalizes the incoming request path before the Solr lookup, so `/foo/` and `/foo` both resolve. - `Hyrax::RedirectsLookup` normalizes its input on construction, so callers can pass any reasonable form. -- `Hyrax::Transactions::Steps::SyncRedirectPaths` normalizes paths before writing to the redirects table, as defense in depth. - -A user pasting a full URL from an old DSpace page (`https://old.example.edu/handle/123?utm=email`) sees the form quietly accept and persist `/handle/123`. ## Validation diff --git a/lib/hyrax/transactions/steps/sync_redirect_paths.rb b/lib/hyrax/transactions/steps/sync_redirect_paths.rb index d407397e11..4ca12791b7 100644 --- a/lib/hyrax/transactions/steps/sync_redirect_paths.rb +++ b/lib/hyrax/transactions/steps/sync_redirect_paths.rb @@ -40,8 +40,9 @@ def syncable?(object) def build_rows(object) # Valkyrie's JSONValueMapper symbolizes hash keys on read; accept either. + # Paths are normalized at write time by Hyrax::RedirectsNormalization. paths = Array(object.redirects) - .map { |entry| Hyrax::RedirectPathNormalizer.call(entry['path'] || entry[:path]) } + .map { |entry| entry['path'] || entry[:path] } .reject(&:blank?) .uniq now = Time.current diff --git a/spec/models/concerns/hyrax/redirects_normalization_spec.rb b/spec/models/concerns/hyrax/redirects_normalization_spec.rb new file mode 100644 index 0000000000..b22390f227 --- /dev/null +++ b/spec/models/concerns/hyrax/redirects_normalization_spec.rb @@ -0,0 +1,86 @@ +# frozen_string_literal: true + +RSpec.describe Hyrax::RedirectsNormalization do + let(:resource_class) do + Class.new(Valkyrie::Resource) do + attribute :redirects, Valkyrie::Types::Array.of(Valkyrie::Types::Hash) + include Hyrax::RedirectsNormalization + end + end + + let(:resource) { resource_class.new } + + describe 'normalization on write' do + it 'strips trailing slashes' do + resource.redirects = [{ 'path' => '/foo/' }] + expect(resource.redirects.first['path']).to eq('/foo') + end + + it 'adds a leading slash' do + resource.redirects = [{ 'path' => 'handle/123' }] + expect(resource.redirects.first['path']).to eq('/handle/123') + end + + it 'extracts path from a full URL with query string' do + resource.redirects = [{ 'path' => 'https://old.example.edu/handle/123?utm=email' }] + expect(resource.redirects.first['path']).to eq('/handle/123') + end + + it 'accepts symbol keys' do + resource.redirects = [{ path: '/foo/' }] + expect(resource.redirects.first[:path]).to eq('/foo') + end + + it 'preserves non-path keys on the entry' do + resource.redirects = [{ 'path' => '/foo/', 'canonical' => true, 'sequence' => 0 }] + entry = resource.redirects.first + expect(entry['path']).to eq('/foo') + expect(entry['canonical']).to be true + expect(entry['sequence']).to eq(0) + end + + it 'normalizes every entry in a multi-entry array' do + resource.redirects = [{ 'path' => '/foo/' }, { 'path' => 'bar' }] + expect(resource.redirects.map { |e| e['path'] }).to eq(['/foo', '/bar']) + end + + it 'is idempotent — re-assigning normalized values is a no-op' do + resource.redirects = [{ 'path' => '/foo/' }] + first = resource.redirects + resource.redirects = first + expect(resource.redirects).to eq(first) + end + + it 'leaves non-redirects attributes untouched' do + other_class = Class.new(Valkyrie::Resource) do + attribute :title, Valkyrie::Types::String + attribute :redirects, Valkyrie::Types::Array.of(Valkyrie::Types::Hash) + include Hyrax::RedirectsNormalization + end + r = other_class.new(title: 'A title that ends with /') + expect(r.title).to eq('A title that ends with /') + end + end + + describe 'flex-mode behavior (m3 singleton-class setter)' do + let(:flex_class) do + Class.new(Valkyrie::Resource) do + include Hyrax::Flexibility + attribute :redirects, Valkyrie::Types::Array.of(Valkyrie::Types::Hash) + include Hyrax::RedirectsNormalization + end + end + + it 'fires under the flex singleton-class setter path' do + # Mirrors how Hyrax::Flexibility.load attaches the schema to the + # singleton class before set_value is called. + resource = flex_class.allocate + resource.send(:initialize, {}) + resource.singleton_class.attributes(redirects: Valkyrie::Types::Array.of(Valkyrie::Types::Hash)) + + resource.set_value(:redirects, [{ 'path' => '/foo/' }]) + + expect(resource.redirects.first['path']).to eq('/foo') + end + end +end