Skip to content
Closed
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@
let isRetrying = false;
let totalDeletedCount = 0;

const getExpiredTokensQuery = (dbClient: Knex | Knex.Transaction, nowTimestamp: Date) => {
// Query for revoked and exceeded usage tokens (these use indexes correctly)
const getRevokedAndExceededQuery = (dbClient: Knex | Knex.Transaction) => {
const revokedTokensQuery = dbClient(TableName.IdentityAccessToken)
.where({
isAccessTokenRevoked: true
Expand All @@ -65,45 +66,94 @@
)
.select("id");

const expiredTTLQuery = dbClient(TableName.IdentityAccessToken)
return dbClient
.select("id")
.from(revokedTokensQuery.unionAll(exceededUsageLimitQuery).as("revoked_and_exceeded"))
.distinct();
};

// Query for TTL-expired tokens - run separately with ORDER BY + LIMIT to force index usage.
// WHY ORDER BY: PostgreSQL's planner cannot accurately estimate selectivity for
// "(expression) < NOW()" because NOW() is volatile and there's no histogram for computed expressions.
// Adding ORDER BY on the indexed expression forces an Index Scan because the index is already sorted.
// The ORDER BY must be at the same level as LIMIT to work.
const getExpiredTTLQuery = (dbClient: Knex | Knex.Transaction, nowTimestamp: Date) => {
return dbClient(TableName.IdentityAccessToken)
.where("accessTokenTTL", ">", 0)
.andWhereRaw(
`
-- Check if the token's effective expiration time has passed.
-- The expiration time is calculated by adding its TTL to its last renewal/creation time.
(COALESCE(
"${TableName.IdentityAccessToken}"."accessTokenLastRenewedAt", -- Use last renewal time if available
"${TableName.IdentityAccessToken}"."createdAt" -- Otherwise, use creation time
) AT TIME ZONE 'UTC') -- Convert to UTC so that it can be an immutable function for our expression index
"${TableName.IdentityAccessToken}"."accessTokenLastRenewedAt",
"${TableName.IdentityAccessToken}"."createdAt"
) AT TIME ZONE 'UTC')
+ make_interval(
secs => LEAST(
"${TableName.IdentityAccessToken}"."accessTokenTTL", -- Token's specified TTL
? -- Capped by MAX_TTL (parameterized value)
"${TableName.IdentityAccessToken}"."accessTokenTTL",
?
)
)
< ?::timestamptz AT TIME ZONE 'UTC' -- Check if the calculated time is before now (cast to UTC timestamp for comparison)
< ?::timestamptz AT TIME ZONE 'UTC'
`,
[MAX_TTL, nowTimestamp]
)
.orderByRaw(
`(COALESCE(
"${TableName.IdentityAccessToken}"."accessTokenLastRenewedAt",
"${TableName.IdentityAccessToken}"."createdAt"
) AT TIME ZONE 'UTC')
+ make_interval(
secs => LEAST(
"${TableName.IdentityAccessToken}"."accessTokenTTL",
${MAX_TTL}
)
)`
)
.select("id");

// Notice: we broken down the query into multiple queries and union them to avoid index usage issues.
// each query got their own index for better performance, therefore, if you want to change
// the query, you need to update the indexes accordingly to avoid performance regressions.
return dbClient
.select("id")
.from(revokedTokensQuery.unionAll(exceededUsageLimitQuery).unionAll(expiredTTLQuery).as("all_expired_tokens"))
.distinct();
};

// Delete revoked and exceeded usage tokens first (these use indexes correctly)
do {
try {
const deleteBatch = async (dbClient: Knex | Knex.Transaction) => {
const idsToDeleteQuery = getRevokedAndExceededQuery(dbClient).limit(BATCH_SIZE);
return dbClient(TableName.IdentityAccessToken).whereIn("id", idsToDeleteQuery).del().returning("id");
};

if (tx) {
// eslint-disable-next-line no-await-in-loop
deletedTokenIds = await deleteBatch(tx);
} else {
// eslint-disable-next-line no-await-in-loop
deletedTokenIds = await db.transaction(async (trx) => {
await trx.raw(`SET LOCAL statement_timeout = ${QUERY_TIMEOUT_MS}`);
return deleteBatch(trx);
});
}

numberOfRetryOnFailure = 0;
totalDeletedCount += deletedTokenIds.length;
} catch (error) {
numberOfRetryOnFailure += 1;
logger.error(error, "Failed to delete revoked/exceeded tokens on pruning");
} finally {
// eslint-disable-next-line no-await-in-loop
await new Promise((resolve) => {
setTimeout(resolve, 500);
});
}
isRetrying = numberOfRetryOnFailure > 0;
} while (deletedTokenIds.length > 0 || (isRetrying && numberOfRetryOnFailure < MAX_RETRY_ON_FAILURE));

// Reset for TTL deletion
numberOfRetryOnFailure = 0;
isRetrying = false;

Check failure on line 149 in backend/src/services/identity-access-token/identity-access-token-dal.ts

View check run for this annotation

Claude / Claude Code Review

First-loop max-retry failure silently suppressed by reset

The final 'Pruning failed and stopped after N consecutive retries' summary error log only reflects the second loop (TTL-expired tokens), not the first loop (revoked/exceeded tokens). If the first loop exhausts all MAX_RETRY_ON_FAILURE retries, numberOfRetryOnFailure is reset to 0 before the second loop runs, silently dropping the critical failure signal. Fix: add a matching summary error log after the first loop exits, mirroring the check already present after the second loop.
Comment thread
sheensantoscapadngan marked this conversation as resolved.
Outdated
Comment thread
sheensantoscapadngan marked this conversation as resolved.
Outdated

// Delete TTL-expired tokens separately with ORDER BY + LIMIT to force index usage
do {

Check notice on line 152 in backend/src/services/identity-access-token/identity-access-token-dal.ts

View check run for this annotation

Claude / Claude Code Review

deletedTokenIds not reset between loops enables infinite retry bypass

The while-loop condition uses a pre-existing flaw where MAX_RETRY_ON_FAILURE can be completely bypassed: the catch block never clears deletedTokenIds, so a stale non-empty value from the last successful batch keeps the left-hand OR clause perpetually true, allowing infinite looping under persistent DB failure. This PR copies the same vulnerable pattern into a second loop and also fails to reset deletedTokenIds in the inter-loop reset block (lines 148-149), so loop 2 can inherit loop 1 stale stat
Comment thread
sheensantoscapadngan marked this conversation as resolved.
Outdated
try {
const deleteBatch = async (dbClient: Knex | Knex.Transaction) => {
// The default random_page_cost is 4.0, which is too high for this query.
// With SSD powered database, random access is way faster.
// We set it to 1.1 to make the query opt for random access and thus more likely to use the index.
await dbClient.raw(`SET LOCAL random_page_cost = 1.1`);
const idsToDeleteQuery = getExpiredTokensQuery(dbClient, now).limit(BATCH_SIZE);
// ORDER BY + LIMIT at the same level forces PostgreSQL to use the index
const idsToDeleteQuery = getExpiredTTLQuery(dbClient, now).limit(BATCH_SIZE);
return dbClient(TableName.IdentityAccessToken).whereIn("id", idsToDeleteQuery).del().returning("id");
};

Expand All @@ -118,15 +168,15 @@
});
}

numberOfRetryOnFailure = 0; // reset
numberOfRetryOnFailure = 0;
totalDeletedCount += deletedTokenIds.length;
} catch (error) {
numberOfRetryOnFailure += 1;
logger.error(error, "Failed to delete a batch of expired identity access tokens on pruning");
logger.error(error, "Failed to delete TTL-expired tokens on pruning");
} finally {
// eslint-disable-next-line no-await-in-loop
await new Promise((resolve) => {
setTimeout(resolve, 500); // time to breathe for db
setTimeout(resolve, 500);
});
}
isRetrying = numberOfRetryOnFailure > 0;
Expand Down
Loading