Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,12 @@ export const identityAccessTokenDALFactory = (db: TDbClient) => {
const nowResult = await dbConnection.raw<{ rows: Array<{ now: Date }> }>(`SELECT NOW() AT TIME ZONE 'UTC' as now`);
const { now } = nowResult.rows[0];

let deletedTokenIds: { id: string }[] = [];
let lastBatchDeletedCount = 0;
let numberOfRetryOnFailure = 0;
let isRetrying = false;
let totalDeletedCount = 0;

const getExpiredTokensQuery = (dbClient: Knex | Knex.Transaction, nowTimestamp: Date) => {
// Query for revoked and exceeded usage tokens (these use indexes correctly)
const getRevokedAndExceededQuery = (dbClient: Knex | Knex.Transaction) => {
const revokedTokensQuery = dbClient(TableName.IdentityAccessToken)
.where({
isAccessTokenRevoked: true
Expand All @@ -65,48 +65,106 @@ export const identityAccessTokenDALFactory = (db: TDbClient) => {
)
.select("id");

const expiredTTLQuery = dbClient(TableName.IdentityAccessToken)
return dbClient
.select("id")
.from(revokedTokensQuery.unionAll(exceededUsageLimitQuery).as("revoked_and_exceeded"))
.distinct();
};

// Query for TTL-expired tokens - run separately with ORDER BY + LIMIT to force index usage.
// WHY ORDER BY: PostgreSQL's planner cannot accurately estimate selectivity for
// "(expression) < NOW()" because NOW() is volatile and there's no histogram for computed expressions.
// Adding ORDER BY on the indexed expression forces an Index Scan because the index is already sorted.
// The ORDER BY must be at the same level as LIMIT to work.
const getExpiredTTLQuery = (dbClient: Knex | Knex.Transaction, nowTimestamp: Date) => {
return dbClient(TableName.IdentityAccessToken)
.where("accessTokenTTL", ">", 0)
.andWhereRaw(
`
-- Check if the token's effective expiration time has passed.
-- The expiration time is calculated by adding its TTL to its last renewal/creation time.
(COALESCE(
"${TableName.IdentityAccessToken}"."accessTokenLastRenewedAt", -- Use last renewal time if available
"${TableName.IdentityAccessToken}"."createdAt" -- Otherwise, use creation time
) AT TIME ZONE 'UTC') -- Convert to UTC so that it can be an immutable function for our expression index
"${TableName.IdentityAccessToken}"."accessTokenLastRenewedAt",
"${TableName.IdentityAccessToken}"."createdAt"
) AT TIME ZONE 'UTC')
+ make_interval(
secs => LEAST(
"${TableName.IdentityAccessToken}"."accessTokenTTL", -- Token's specified TTL
? -- Capped by MAX_TTL (parameterized value)
"${TableName.IdentityAccessToken}"."accessTokenTTL",
?
)
)
< ?::timestamptz AT TIME ZONE 'UTC' -- Check if the calculated time is before now (cast to UTC timestamp for comparison)
< ?::timestamptz AT TIME ZONE 'UTC'
`,
[MAX_TTL, nowTimestamp]
)
.orderByRaw(
`(COALESCE(
"${TableName.IdentityAccessToken}"."accessTokenLastRenewedAt",
"${TableName.IdentityAccessToken}"."createdAt"
) AT TIME ZONE 'UTC')
+ make_interval(
secs => LEAST(
"${TableName.IdentityAccessToken}"."accessTokenTTL",
${MAX_TTL}
)
)`
)
.select("id");

// Notice: we broken down the query into multiple queries and union them to avoid index usage issues.
// each query got their own index for better performance, therefore, if you want to change
// the query, you need to update the indexes accordingly to avoid performance regressions.
return dbClient
.select("id")
.from(revokedTokensQuery.unionAll(exceededUsageLimitQuery).unionAll(expiredTTLQuery).as("all_expired_tokens"))
.distinct();
};

// Continue only if the most recent attempt produced work, or we're still within the retry
// budget after a failure. Tracking these signals independently ensures a failed iteration
// cannot inherit a prior successful batch's count and bypass MAX_RETRY_ON_FAILURE.
const shouldContinue = () =>
lastBatchDeletedCount > 0 || (numberOfRetryOnFailure > 0 && numberOfRetryOnFailure < MAX_RETRY_ON_FAILURE);

// Delete revoked and exceeded usage tokens first (these use indexes correctly)
do {
try {
const deleteBatch = async (dbClient: Knex | Knex.Transaction) => {
const idsToDeleteQuery = getRevokedAndExceededQuery(dbClient).limit(BATCH_SIZE);
return dbClient(TableName.IdentityAccessToken).whereIn("id", idsToDeleteQuery).del().returning("id");
};

let deletedTokenIds: { id: string }[];
if (tx) {
// eslint-disable-next-line no-await-in-loop
deletedTokenIds = await deleteBatch(tx);
} else {
// eslint-disable-next-line no-await-in-loop
deletedTokenIds = await db.transaction(async (trx) => {
await trx.raw(`SET LOCAL statement_timeout = ${QUERY_TIMEOUT_MS}`);
return deleteBatch(trx);
});
}

lastBatchDeletedCount = deletedTokenIds.length;
totalDeletedCount += deletedTokenIds.length;
numberOfRetryOnFailure = 0;
} catch (error) {
lastBatchDeletedCount = 0;
numberOfRetryOnFailure += 1;
logger.error(error, "Failed to delete revoked/exceeded tokens on pruning");
} finally {
// eslint-disable-next-line no-await-in-loop
await new Promise((resolve) => {
setTimeout(resolve, 500);
});
}
} while (shouldContinue());

// Reset for TTL deletion
lastBatchDeletedCount = 0;
numberOfRetryOnFailure = 0;

// Delete TTL-expired tokens separately with ORDER BY + LIMIT to force index usage
do {
try {
const deleteBatch = async (dbClient: Knex | Knex.Transaction) => {
// The default random_page_cost is 4.0, which is too high for this query.
// With SSD powered database, random access is way faster.
// We set it to 1.1 to make the query opt for random access and thus more likely to use the index.
await dbClient.raw(`SET LOCAL random_page_cost = 1.1`);
const idsToDeleteQuery = getExpiredTokensQuery(dbClient, now).limit(BATCH_SIZE);
// ORDER BY + LIMIT at the same level forces PostgreSQL to use the index
const idsToDeleteQuery = getExpiredTTLQuery(dbClient, now).limit(BATCH_SIZE);
return dbClient(TableName.IdentityAccessToken).whereIn("id", idsToDeleteQuery).del().returning("id");
};

let deletedTokenIds: { id: string }[];
if (tx) {
// eslint-disable-next-line no-await-in-loop
deletedTokenIds = await deleteBatch(tx);
Expand All @@ -118,19 +176,20 @@ export const identityAccessTokenDALFactory = (db: TDbClient) => {
});
}

numberOfRetryOnFailure = 0; // reset
lastBatchDeletedCount = deletedTokenIds.length;
totalDeletedCount += deletedTokenIds.length;
numberOfRetryOnFailure = 0;
} catch (error) {
lastBatchDeletedCount = 0;
numberOfRetryOnFailure += 1;
logger.error(error, "Failed to delete a batch of expired identity access tokens on pruning");
logger.error(error, "Failed to delete TTL-expired tokens on pruning");
} finally {
// eslint-disable-next-line no-await-in-loop
await new Promise((resolve) => {
setTimeout(resolve, 500); // time to breathe for db
setTimeout(resolve, 500);
});
}
isRetrying = numberOfRetryOnFailure > 0;
} while (deletedTokenIds.length > 0 || (isRetrying && numberOfRetryOnFailure < MAX_RETRY_ON_FAILURE));
} while (shouldContinue());

if (numberOfRetryOnFailure >= MAX_RETRY_ON_FAILURE) {
logger.error(
Expand Down
Loading