diff --git a/changelog/69073.fixed.md b/changelog/69073.fixed.md new file mode 100644 index 000000000000..d610309edd7b --- /dev/null +++ b/changelog/69073.fixed.md @@ -0,0 +1 @@ +``LoadAuth.get_tok`` now distinguishes between corrupt token blobs (removed from the store) and transient backend errors such as Redis connection drops or NFS hangs (token kept, request treated as not-authenticated). Previously a single backend hiccup could log every authenticated user out by deleting valid tokens. diff --git a/salt/auth/__init__.py b/salt/auth/__init__.py index 019c98229632..2b51fb94aeac 100644 --- a/salt/auth/__init__.py +++ b/salt/auth/__init__.py @@ -239,30 +239,57 @@ def get_tok(self, tok): Return the name associated with the token, or False if the token is not valid """ - tdata = {} try: tdata = self.tokens["{}.get_token".format(self.opts["eauth_tokens"])]( self.opts, tok ) - except salt.exceptions.SaltDeserializationError: - log.warning("Failed to load token %r - removing broken/empty file.", tok) - rm_tok = True - else: - if not tdata: - return {} - rm_tok = False + except salt.exceptions.SaltDeserializationError as exc: + # The on-disk / in-store token blob is corrupt and cannot + # be parsed. Removing it is the right call -- a corrupt + # token can never authenticate anyway, and leaving it + # around makes every subsequent ``get_tok`` for the same + # id keep failing. ``%r`` on the exception gives the + # operator the class and message inline (e.g. msgpack + # format error, truncated file) without spamming a full + # traceback into a hot-path WARNING; the full traceback is + # available via the companion ``log.debug`` for deeper + # investigation. + log.warning( + "Token %r could not be deserialized (%r); removing it from the store.", + tok, + exc, + ) + log.debug("Token deserialization traceback:", exc_info=True) + self.rm_token(tok) + return {} + except OSError as exc: + # Transient backend error (Redis connection blip, NFS hang, + # hung disk). The token itself is fine; do NOT delete it -- + # that would log every authenticated user out on every + # backend hiccup. Return an empty dict so the caller treats + # this request as not-authenticated; the next request will + # retry against the backend and succeed once it recovers. + # Same logging pattern as above -- exception class + message + # at WARNING, full traceback at DEBUG so a flapping deploy + # stays diagnoseable without GB/hour of stack frames. + log.warning( + "Token store transient error reading %r (%r); treating as " + "not-authenticated for this request without removing the " + "token from the store.", + tok, + exc, + ) + log.debug("Token store transient-error traceback:", exc_info=True) + return {} + if not tdata: + return {} if tdata.get("expire", 0) < time.time(): - # If expire isn't present in the token it's invalid and needs - # to be removed. Also, if it's present and has expired - in - # other words, the expiration is before right now, it should - # be removed. - rm_tok = True - - if rm_tok: + # Expired token: drop it from the store. ``expire`` defaults + # to 0 if missing, so a malformed-but-deserializable token + # without an ``expire`` key falls into this branch too. self.rm_token(tok) return {} - return tdata def list_tokens(self): diff --git a/tests/pytests/unit/auth/test_auth.py b/tests/pytests/unit/auth/test_auth.py index 4fc3d836426f..683dd8fdd9b5 100644 --- a/tests/pytests/unit/auth/test_auth.py +++ b/tests/pytests/unit/auth/test_auth.py @@ -2,6 +2,8 @@ import salt.auth import salt.config +import salt.exceptions +from tests.support.mock import MagicMock def test_cve_2021_3244(tmp_path): @@ -31,3 +33,102 @@ def test_cve_2021_3244(tmp_path): t_data = auth.get_tok(t_data["token"]) assert not t_data assert not token_file.exists() + + +# --------------------------------------------------------------------------- +# ``LoadAuth.get_tok`` exception handling. +# +# A backend read failure must be classified by *cause*, not collapsed to +# "delete the token": +# +# * ``SaltDeserializationError`` — the stored blob cannot be parsed. The +# token is permanently unusable; remove it so subsequent reads do not +# keep failing on the same corrupt entry. +# * ``OSError`` / ``IOError`` — transient (Redis connection drop, NFS +# hang, full disk). The token itself is fine. Returning ``{}`` while +# leaving the token in place makes this request not-authenticated and +# lets the next request retry once the backend recovers. **Deleting on +# transient I/O errors logs every authenticated user out on every +# backend hiccup**, which is the regression these tests guard against. +# * Expired (deserialized successfully but past ``expire``) — remove. +# --------------------------------------------------------------------------- + + +def _make_auth(tmp_path, get_token_side_effect): + """Build a ``LoadAuth`` whose ``localfs.get_token`` is replaced by + ``get_token_side_effect``. ``rm_token`` is wrapped with a + ``MagicMock`` so the test can assert on whether the production code + chose to delete the token. The backing token directory is real so + ``rm_token`` would not blow up if it were called -- we are checking + *intent*, not state.""" + token_dir = tmp_path / "tokens" + token_dir.mkdir() + opts = { + "extension_modules": "", + "optimization_order": [0, 1, 2], + "token_expire": 60, + "keep_acl_in_token": False, + "eauth_tokens": "localfs", + "token_dir": str(token_dir), + "token_expire_user_override": False, + "external_auth": {"auto": {"foo": []}}, + } + auth = salt.auth.LoadAuth(opts) + + if callable(get_token_side_effect): + auth.tokens["localfs.get_token"] = get_token_side_effect + else: + auth.tokens["localfs.get_token"] = MagicMock(side_effect=get_token_side_effect) + + auth.tokens["localfs.rm_token"] = MagicMock() + return auth + + +def test_get_tok_returns_empty_and_keeps_token_on_io_error(tmp_path): + """Headline regression: a transient backend error (e.g. Redis + connection drop) must NOT cause the token to be deleted. The + previous implementation either propagated the exception or deleted + the token -- both wrong. Correct behaviour is to return ``{}`` and + leave the token alone so the next request can retry.""" + auth = _make_auth(tmp_path, OSError("redis connection reset")) + + result = auth.get_tok("any-token-id") + + assert result == {} + auth.tokens["localfs.rm_token"].assert_not_called() + + +def test_get_tok_removes_token_on_deserialization_error(tmp_path): + """A corrupt token blob is permanently unusable; removing it is + correct because every subsequent read would fail the same way.""" + auth = _make_auth( + tmp_path, + salt.exceptions.SaltDeserializationError("bad msgpack"), + ) + + result = auth.get_tok("corrupt-token-id") + + assert result == {} + auth.tokens["localfs.rm_token"].assert_called_once_with( + auth.opts, "corrupt-token-id" + ) + + +def test_get_tok_removes_expired_token(tmp_path): + """Expired tokens are deserializable but past their ``expire`` + timestamp. They must be removed so the store does not accumulate + dead entries.""" + expired_blob = { + "token": "expired-token-id", + "expire": time.time() - 60, + "name": "foo", + "eauth": "auto", + } + auth = _make_auth(tmp_path, lambda opts, tok: expired_blob) + + result = auth.get_tok("expired-token-id") + + assert result == {} + auth.tokens["localfs.rm_token"].assert_called_once_with( + auth.opts, "expired-token-id" + )