Skip to content
Merged
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
00a67b4
Huge commit - restore uses installshard - big update in locking
HoustonPutman Jul 24, 2025
2d5e12b
Implement callingLock mirroring for distributed API Manager locking
HoustonPutman Aug 1, 2025
37cea68
Merge remote-tracking branch 'apache/main' into locking-update
HoustonPutman Sep 2, 2025
293f35d
Merge remote-tracking branch 'apache/main' into locking-update
HoustonPutman Dec 2, 2025
242235a
Changelog
HoustonPutman Dec 2, 2025
72ae4bf
Merge remote-tracking branch 'apache/main' into solr-18011-locking-up…
HoustonPutman Jan 5, 2026
b0452f4
Support distributed locks better
HoustonPutman Jan 8, 2026
72401ca
Start switching over to AdminCmdContext
HoustonPutman Jan 8, 2026
6dac593
Refactor how locking is passed through collections api commands.
HoustonPutman Jan 9, 2026
e368c48
Make some bug fixes
HoustonPutman Jan 10, 2026
97352f7
Some test 'fixes', need to rethink v2 tests still
HoustonPutman Jan 10, 2026
6f2d61a
Add real tests for DeleteAlias and DeleteNode v2 apis
HoustonPutman Jan 12, 2026
74176a4
Tidy
HoustonPutman Jan 12, 2026
df0888c
Fix broken tests
HoustonPutman Jan 13, 2026
3627d29
Add missing test utility class
HoustonPutman Jan 13, 2026
4a49aa9
Move over remaining APIs and tests
HoustonPutman Jan 13, 2026
021303a
Merge remote-tracking branch 'apache/main' into solr-18011-locking-up…
HoustonPutman Jan 13, 2026
e61279c
Merge remote-tracking branch 'apache/main' into solr-18011-locking-up…
HoustonPutman Jan 17, 2026
4c9a766
Fix changelog entry
HoustonPutman Jan 17, 2026
c7cfe19
Remove files that shouldn't be changed.
HoustonPutman Jan 17, 2026
08326cc
One more that shouldn't be changed
HoustonPutman Jan 17, 2026
2513cd4
Fixes for passing callingLockIds around
HoustonPutman Jan 22, 2026
7dc8a98
Tidy
HoustonPutman Jan 28, 2026
b0f4520
Merge remote-tracking branch 'apache/main' into solr-18011-locking-up…
HoustonPutman Jan 29, 2026
dbb01cd
Deserialize callingLockIds in context class
HoustonPutman Jan 29, 2026
e87052e
Make some fixes for LockTree, add tests to validate
HoustonPutman Jan 29, 2026
5a2b445
Improve tests, fix errors in locking
HoustonPutman Jan 30, 2026
1e4767a
Add tests for Distributed locking. Make fixes
HoustonPutman Jan 30, 2026
b7939e0
Merge remote-tracking branch 'apache/main' into solr-18011-locking-up…
HoustonPutman Mar 24, 2026
1bf192c
Fix precommit issues
HoustonPutman Mar 24, 2026
e750951
Merge remote-tracking branch 'apache/main' into solr-18011-locking-up…
HoustonPutman Apr 14, 2026
a9b4cdb
Make sure that locking must be top-down, and cannot introduce dead-locks
HoustonPutman Apr 14, 2026
76ea4fd
Only pass relevant lockId, not all lockIds
HoustonPutman Apr 14, 2026
629d9c4
One missed refactored name
HoustonPutman Apr 14, 2026
2687168
Some fixes
HoustonPutman Apr 14, 2026
edf4a12
Improve changelog entry
HoustonPutman Apr 15, 2026
50f97a2
Fix error scenario for overseer, add test
HoustonPutman Apr 20, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions changelog/unreleased/solr-18011-locking-update.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# See https://github.com/apache/solr/blob/main/dev-docs/changelog.adoc
title: Allow locked Admin APIs to call other locked AdminAPIs without deadlocking
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[0] If these deadlocks are a real thing that a user might've seen, then I think we should reword the changelog to highlight the benefit they get out of this change. Maybe something like:

Resolve locking-related deadlock in INSTALLSHARD and similar Admin API calls

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So currently no Admin APIs call other Admin APIs, they will run the Cmd themselves perhaps, but they won't submit it to the overseer or anything. Or they will call the coreAPIs themselves that the other collection API would do. But we currently get around this deadlock by just not doing it. I think we should be able to re-use code a bit better and have 1 entry point for a lot of these APIs, so this is a necessary part of that.

type: changed # added, changed, fixed, deprecated, removed, dependency_update, security, other
authors:
- name: Houston Putman
nick: HoustonPutman
links:
- name: SOLR-18011
url: https://issues.apache.org/jira/browse/SOLR-18011
6 changes: 6 additions & 0 deletions solr/core/src/java/org/apache/solr/api/V2HttpCall.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
package org.apache.solr.api;

import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP;
import static org.apache.solr.common.params.CollectionAdminParams.CALLING_LOCK_IDS_HEADER;
import static org.apache.solr.servlet.SolrDispatchFilter.Action.ADMIN;
import static org.apache.solr.servlet.SolrDispatchFilter.Action.ADMIN_OR_REMOTEPROXY;
import static org.apache.solr.servlet.SolrDispatchFilter.Action.PROCESS;
Expand Down Expand Up @@ -218,6 +219,11 @@ private void initAdminRequest(String path) throws Exception {
solrReq.getContext().put(CoreContainer.class.getName(), cores);
requestType = AuthorizationContext.RequestType.ADMIN;
action = ADMIN;

String callingLockIds = req.getHeader(CALLING_LOCK_IDS_HEADER);
if (callingLockIds != null && !callingLockIds.isBlank()) {
solrReq.getContext().put(CALLING_LOCK_IDS_HEADER, callingLockIds);
}
}

protected void parseRequest() throws Exception {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

package org.apache.solr.cloud;

import java.util.List;
import org.apache.solr.cloud.api.collections.CollectionApiLockFactory;
import org.apache.solr.common.params.CollectionParams;

Expand Down Expand Up @@ -62,5 +63,6 @@ DistributedLock createLock(
CollectionParams.LockLevel level,
String collName,
String shardId,
String replicaName);
String replicaName,
List<String> callingLockIds);
}
4 changes: 4 additions & 0 deletions solr/core/src/java/org/apache/solr/cloud/DistributedLock.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,8 @@ public interface DistributedLock {
void release();

boolean isAcquired();

String getLockId();

boolean isMirroringLock();
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import com.google.common.annotations.VisibleForTesting;
import java.lang.invoke.MethodHandles;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.solr.common.SolrException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand All @@ -46,7 +47,12 @@ public void waitUntilAcquired() {
for (DistributedLock lock : locks) {
log.debug("DistributedMultiLock.waitUntilAcquired. About to wait on lock {}", lock);
lock.waitUntilAcquired();
log.debug("DistributedMultiLock.waitUntilAcquired. Acquired lock {}", lock);
if (lock.isMirroringLock()) {
log.debug(
"DistributedMultiLock.waitUntilAcquired. Mirroring already-acquired lock {}", lock);
} else {
log.debug("DistributedMultiLock.waitUntilAcquired. Acquired lock {}", lock);
}
}
}

Expand All @@ -70,6 +76,10 @@ public boolean isAcquired() {
return true;
}

public String getLockId() {
return locks.stream().map(DistributedLock::getLockId).collect(Collectors.joining(","));
}

@VisibleForTesting
public int getCountInternalLocks() {
return locks.size();
Expand Down
87 changes: 79 additions & 8 deletions solr/core/src/java/org/apache/solr/cloud/LockTree.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import org.apache.solr.cloud.OverseerMessageHandler.Lock;
import org.apache.solr.common.params.CollectionParams;
import org.apache.solr.common.params.CollectionParams.LockLevel;
Expand All @@ -38,20 +39,35 @@ public class LockTree {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private final Node root = new Node(null, LockLevel.CLUSTER, null);

public final Map<String, Lock> allLocks = new HashMap<>();

private class LockImpl implements Lock {
final Node node;
final String id;

LockImpl(Node node) {
this.node = node;
this.id = UUID.randomUUID().toString();
}

@Override
public void unlock() {
synchronized (LockTree.this) {
node.unlock(this);
allLocks.remove(id);
}
}

@Override
public String id() {
return id;
}

@Override
public boolean validateSubpath(int lockLevel, List<String> path) {
return node.validateSubpath(lockLevel, path);
}

@Override
public String toString() {
return StrUtils.join(node.constructPath(new ArrayDeque<>()), '/');
Expand All @@ -71,12 +87,33 @@ public String toString() {
public class Session {
private SessionNode root = new SessionNode(LockLevel.CLUSTER);

public Lock lock(CollectionParams.CollectionAction action, List<String> path) {
public Lock lock(
CollectionParams.CollectionAction action, List<String> path, List<String> callingLockIds) {
if (action.lockLevel == LockLevel.NONE) return FREELOCK;
log.debug("Calling lock level: {}", callingLockIds);
Node startingNode = LockTree.this.root;
SessionNode startingSession = root;

// If a callingLockId was passed in, validate it with the current lock path, and only start
// locking below the calling lock
Lock callingLock = callingLockIds != null ? allLocks.get(callingLockIds.getLast()) : null;
boolean ignoreCallingLock = false;
if (callingLock != null && callingLock.validateSubpath(action.lockLevel.getHeight(), path)) {
startingNode = ((LockImpl) callingLock).node;
startingSession = startingSession.find(startingNode.level.getHeight(), path);
if (startingSession == null) {
startingSession = root;
}
ignoreCallingLock = true;
}
synchronized (LockTree.this) {
if (root.isBusy(action.lockLevel, path)) return null;
Lock lockObject = LockTree.this.root.lock(action.lockLevel, path);
if (lockObject == null) root.markBusy(action.lockLevel, path);
if (startingSession.isBusy(action.lockLevel, path)) return null;
Lock lockObject = startingNode.lock(action.lockLevel, path, ignoreCallingLock);
if (lockObject == null) {
startingSession.markBusy(action.lockLevel, path);
} else {
allLocks.put(lockObject.id(), lockObject);
}
return lockObject;
}
}
Expand Down Expand Up @@ -125,6 +162,18 @@ boolean isBusy(LockLevel lockLevel, List<String> path) {
return false;
}
}

SessionNode find(int lockLevel, List<String> path) {
if (level.getHeight() == lockLevel) {
return this;
} else if (level.getHeight() < lockLevel
&& kids != null
&& kids.containsKey(path.get(level.getHeight()))) {
return kids.get(path.get(level.getHeight())).find(lockLevel, path);
} else {
return null;
}
}
}

public Session getSession() {
Expand Down Expand Up @@ -158,8 +207,9 @@ void unlock(LockImpl lockObject) {
}
}

Lock lock(LockLevel lockLevel, List<String> path) {
if (myLock != null) return null; // I'm already locked. no need to go any further
Lock lock(LockLevel lockLevel, List<String> path, boolean ignoreCurrentLock) {
if (myLock != null && !ignoreCurrentLock)
return null; // I'm already locked. no need to go any further
if (lockLevel == level) {
// lock is supposed to be acquired at this level
// If I am locked or any of my children or grandchildren are locked
Expand All @@ -171,16 +221,37 @@ Lock lock(LockLevel lockLevel, List<String> path) {
Node child = children.get(childName);
if (child == null)
children.put(childName, child = new Node(childName, level.getChild(), this));
return child.lock(lockLevel, path);
return child.lock(lockLevel, path, false);
}
}

boolean validateSubpath(int lockLevel, List<String> path) {
return level.getHeight() < lockLevel
&& (level.getHeight() == 0 || name.equals(path.get(level.getHeight() - 1)))
&& (mom == null || mom.validateSubpath(lockLevel, path));
}

ArrayDeque<String> constructPath(ArrayDeque<String> collect) {
if (name != null) collect.addFirst(name);
if (mom != null) mom.constructPath(collect);
return collect;
}
}

static final Lock FREELOCK = () -> {};
static final String FREELOCK_ID = "-1";
static final Lock FREELOCK =
new Lock() {
@Override
public void unlock() {}

@Override
public String id() {
return FREELOCK_ID;
}

@Override
public boolean validateSubpath(int lockLevel, List<String> path) {
return false;
}
};
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import java.lang.invoke.MethodHandles;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
Expand Down Expand Up @@ -61,7 +62,7 @@ public OverseerConfigSetMessageHandler(ZkStateReader zkStateReader, CoreContaine
}

@Override
public OverseerSolrResponse processMessage(ZkNodeProps message, String operation) {
public OverseerSolrResponse processMessage(ZkNodeProps message, String operation, Lock lock) {
NamedList<Object> results = new NamedList<>();
try {
if (!operation.startsWith(CONFIGSETS_ACTION_PREFIX)) {
Expand Down Expand Up @@ -117,7 +118,22 @@ public Lock lockTask(ZkNodeProps message, long ignored) {
String configSetName = getTaskKey(message);
if (canExecute(configSetName, message)) {
markExclusiveTask(configSetName, message);
return () -> unmarkExclusiveTask(configSetName, message);
return new Lock() {
@Override
public void unlock() {
unmarkExclusiveTask(configSetName, message);
}

@Override
public String id() {
return configSetName;
}

@Override
public boolean validateSubpath(int lockLevel, List<String> path) {
return false;
}
};
}
return null;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
*/
package org.apache.solr.cloud;

import java.util.List;
import org.apache.solr.common.cloud.ZkNodeProps;

/** Interface for processing messages received by an {@link OverseerTaskProcessor} */
Expand All @@ -26,7 +27,7 @@ public interface OverseerMessageHandler {
* @param operation the operation to process
* @return response
*/
OverseerSolrResponse processMessage(ZkNodeProps message, String operation);
OverseerSolrResponse processMessage(ZkNodeProps message, String operation, Lock lock);

/**
* @return the name of the OverseerMessageHandler
Expand All @@ -41,6 +42,10 @@ public interface OverseerMessageHandler {

interface Lock {
void unlock();

String id();

boolean validateSubpath(int lockLevel, List<String> path);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -562,7 +562,7 @@ public void run() {
if (log.isDebugEnabled()) {
log.debug("Runner processing {}", head.getId());
}
response = messageHandler.processMessage(message, operation);
response = messageHandler.processMessage(message, operation, lock);
} finally {
timerContext.stop();
updateStats(statsName);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

package org.apache.solr.cloud;

import java.util.List;
import java.util.Objects;
import org.apache.solr.cloud.api.collections.DistributedCollectionConfigSetCommandRunner;
import org.apache.solr.common.SolrException;
Expand Down Expand Up @@ -44,7 +45,8 @@ public DistributedLock createLock(
CollectionParams.LockLevel level,
String collName,
String shardId,
String replicaName) {
String replicaName,
List<String> callingLockIds) {
Objects.requireNonNull(collName, "collName can't be null");
if (level != CollectionParams.LockLevel.COLLECTION) {
Objects.requireNonNull(
Expand All @@ -56,7 +58,8 @@ public DistributedLock createLock(
}

String lockPath = getLockPath(level, collName, shardId, replicaName);
return doCreateLock(isWriteLock, lockPath);

return doCreateLock(isWriteLock, lockPath, callingLockIds);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

package org.apache.solr.cloud;

import java.util.Collections;
import java.util.Objects;
import org.apache.solr.common.cloud.SolrZkClient;

Expand All @@ -40,7 +41,7 @@ public DistributedLock createLock(boolean isWriteLock, String configSetName) {
Objects.requireNonNull(configSetName, "configSetName can't be null");

String lockPath = getLockPath(configSetName);
return doCreateLock(isWriteLock, lockPath);
return doCreateLock(isWriteLock, lockPath, Collections.emptyList());
}

/**
Expand Down
Loading