Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 53 additions & 15 deletions cdlib/algorithms/internal/BIGCLAM.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,35 +57,73 @@ def gradient(F, A, i):
grad = sum_neigh - sum_nneigh
return grad

def gradient_fast(F, A, i):
r"""Fast implementation of the gradient function, considering
equation 4 of https://cs.stanford.edu/people/jure/pubs/bigclam-wsdm13.pdf

def train(A, C, iterations=100):
.. math::

\nabla l(F_u) =
\sum_{v \in N(u)} F_v \left(1 + \frac{e^{-F_u^T F_v}}{1-e^{-F_u^T F_v}}\right)
- \sum_v F_v + F_u

"""
_, C = F.shape
neighbours = np.where(A[i])[0]

grad = np.zeros((C,))
for nb in neighbours:
dotproduct = F[nb].dot(F[i])
grad += F[nb] * (1 + sigm(dotproduct))
grad -= np.sum(F, axis=0)
grad += F[i]
Comment on lines +74 to +79

Copilot AI Apr 21, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

gradient_fast() recomputes np.sum(F, axis=0) for every node update, which is O(N*C) per node and can erase the intended speedup on sparse graphs. Consider computing the global sum once per outer iteration (or maintaining it incrementally as F updates) and passing it into gradient_fast.

Copilot uses AI. Check for mistakes.
return grad

def get_embeddings(A, C, iterations=100, learning_rate=0.005, naive=False):
# initialize an F
N = A.shape[0]
F = np.random.rand(N, C)

for n in range(iterations):
for person in range(N):
grad = gradient(F, A, person)
if naive:
grad = gradient(F, A, person)
else:
grad = gradient_fast(F, A, person)

F[person] += 0.005 * grad
F[person] += learning_rate * grad

F[person] = np.maximum(0.001, F[person]) # F should be nonnegative
log_likelihood(F, A)
F[person] = np.maximum(0.00001, F[person]) # F should be nonnegative
# log_likelihood(F, A)
Comment on lines +94 to +97

Copilot AI Apr 21, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lowering the non-negativity floor from 0.001 to 1e-5 increases the chance that F_u^T F_v becomes extremely small, which makes sigm() blow up (division by ~0) and can produce inf/unstable gradients. Consider keeping the previous floor or making sigm() numerically safe (e.g., via expm1 and clipping) to prevent divergence.

Copilot uses AI. Check for mistakes.
return F


def big_Clam(graph, number_communities):
adj = nx.to_numpy_matrix(graph)
F = train(adj, number_communities)
F_argmax = np.argmax(F, 1)
dict_communities = {}
for i in range(0, number_communities):
dict_communities[i] = []
for node, com in zip(graph.nodes(), F_argmax):
dict_communities[com].append(node)
def get_communities(F, graph, number_communities, method='argmax'):
if method == 'argmax':
F_argmax = np.argmax(F, 1)
dict_communities = {com: [] for com in range(number_communities)}
for node, com in zip(graph.nodes(), F_argmax.tolist()):
dict_communities[com].append(node)
elif method == 'threshold':
n, m = graph.number_of_nodes(), graph.number_of_edges()
epsilon = 2 * m / (n * (n - 1))
Comment on lines +107 to +108

Copilot AI Apr 21, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the threshold affiliation path, epsilon = 2*m/(n*(n-1)) will divide by zero when n < 2, and epsilon can hit 1 (complete graph) leading to log(0) and delta=inf. Please guard small graphs and clamp epsilon to a safe range (or use nx.density() and cap to < 1.0) to avoid runtime errors/inf thresholds.

Suggested change
n, m = graph.number_of_nodes(), graph.number_of_edges()
epsilon = 2 * m / (n * (n - 1))
n = graph.number_of_nodes()
if n < 2:
epsilon = 0.0
else:
epsilon = nx.density(graph)
epsilon = min(max(epsilon, 0.0), np.nextafter(1.0, 0.0))

Copilot uses AI. Check for mistakes.
delta = np.sqrt(-np.log(1 - epsilon))
memberships = np.where(F >= delta, 1, 0)
# in this case, a node can belong to multiple communities
dict_communities = {com: [] for com in range(number_communities)}
for node, membership in zip(graph.nodes(), memberships):
for com in np.nonzero(membership)[0].tolist():
dict_communities[com].append(node)
else:
raise ValueError("Method not supported")

Copilot AI Apr 21, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The raised ValueError("Method not supported") is hard to act on and drops the invalid value. Include the provided method and the allowed values (e.g., 'argmax'/'threshold') so callers can quickly correct their input.

Suggested change
raise ValueError("Method not supported")
raise ValueError(
f"Method '{method}' not supported. Allowed values are: 'argmax', 'threshold'."
)

Copilot uses AI. Check for mistakes.

list_communities = []
for com in dict_communities:
list_communities.append(dict_communities[com])
Comment on lines 119 to 121

Copilot AI Apr 21, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

get_communities() currently returns a fixed-length list with potentially many empty communities (especially with argmax when some communities get no assignments, or with threshold when delta is high). Consider filtering out empty communities before returning to match other algorithms in this module and avoid downstream metrics having to handle empty clusters.

Suggested change
list_communities = []
for com in dict_communities:
list_communities.append(dict_communities[com])
list_communities = [members for members in dict_communities.values() if members]

Copilot uses AI. Check for mistakes.

return list_communities

def big_clam_communities(
    graph,
    number_communities,
    iterations=100,
    learning_rate=0.005,
    naive=False,
    affiliation_method='argmax',
):
    """Fit BigCLAM embeddings on ``graph`` and turn them into communities.

    The graph is first converted to an adjacency array, the node embeddings
    are then fitted by ``get_embeddings`` and finally mapped to communities
    by ``get_communities``.

    :param graph: graph object accepted by ``nx.to_numpy_array``
    :param number_communities: number of embedding columns, i.e. the number
        of candidate communities
    :param iterations: forwarded unchanged to ``get_embeddings``
    :param learning_rate: forwarded unchanged to ``get_embeddings``
    :param naive: forwarded unchanged to ``get_embeddings``
    :param affiliation_method: forwarded to ``get_communities`` as ``method``
    :return: the value returned by ``get_communities``
    """
    # weight=None: edge attributes are not used as matrix entries
    # (see the networkx documentation for to_numpy_array).
    adjacency = nx.to_numpy_array(graph, weight=None)

    embeddings = get_embeddings(
        adjacency,
        number_communities,
        iterations=iterations,
        learning_rate=learning_rate,
        naive=naive,
    )

    communities = get_communities(
        embeddings,
        graph,
        number_communities,
        method=affiliation_method,
    )
    return communities
133 changes: 66 additions & 67 deletions cdlib/algorithms/overlapping_partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from cdlib import NodeClustering
from cdlib.random import get_seed
from cdlib.utils import suppress_stdout, convert_graph_formats, nx_node_integer_mapping
from cdlib.algorithms.internal.BIGCLAM import big_clam_communities
from cdlib.algorithms.internal.CONGO import Congo_
from cdlib.algorithms.internal.CONGA import Conga_
from cdlib.algorithms.internal.LAIS2_nx import LAIS2
Expand Down Expand Up @@ -95,7 +96,7 @@
"lemon",
"slpa",
"multicom",
# "big_clam",
"big_clam",
# "danmf",
# "egonet_splitter",
# "nnsed",
Expand Down Expand Up @@ -875,72 +876,70 @@ def multicom(g_original: object, seed_node: object) -> NodeClustering:
)


# def big_clam(
# g_original: object,
# dimensions: int = 8,
# iterations: int = 50,
# learning_rate: float = 0.005,
# ) -> NodeClustering:
# """
# BigClam is an overlapping community detection method that scales to large networks.
# The procedure uses gradient ascent to create an embedding which is used for deciding the node-cluster affiliations.
#
#
# **Supported Graph Types**
#
# ========== ======== ========
# Undirected Directed Weighted
# ========== ======== ========
# Yes No No
# ========== ======== ========
#
# :param g_original: a networkx/igraph object
# :param dimensions: Number of embedding dimensions. Default 8.
# :param iterations: Number of training iterations. Default 50.
# :param learning_rate: Gradient ascent learning rate. Default is 0.005.
# :return: NodeClustering object
#
#
# :Example:
#
# >>> from cdlib import algorithms
# >>> import networkx as nx
# >>> G = nx.karate_club_graph()
# >>> coms = algorithms.big_clam(G)
#
# :References:
#
# Yang, Jaewon, and Jure Leskovec. "Overlapping community detection at scale: a nonnegative matrix factorization approach." Proceedings of the sixth ACM international conference on Web search and data mining. 2013.
#
# .. note:: Reference implementation: https://karateclub.readthedocs.io/
# """
# __try_load_karate()
# g = convert_graph_formats(g_original, nx.Graph)
#
# model = karateclub.BigClam(
# dimensions=dimensions, iterations=iterations, learning_rate=learning_rate
# )
# model.fit(g)
# members = model.get_memberships()
#
# # Reshaping the results
# coms_to_node = defaultdict(list)
# for n, c in members.items():
# coms_to_node[c].append(n)
#
# coms = [list(c) for c in coms_to_node.values()]
#
# return NodeClustering(
# coms,
# g_original,
# "BigClam",
# method_parameters={
# "dimensions": dimensions,
# "iterations": iterations,
# "learning_rate": learning_rate,
# },
# overlap=True,
# )
def big_clam(
g_original: object,
dimensions: int = 8,
iterations: int = 50,
learning_rate: float = 0.005,
naive: bool = False,
affiliation_method: str = "argmax",
) -> NodeClustering:
"""
BigClam is an overlapping community detection method that scales to large networks.
The procedure uses gradient ascent to create an embedding which is used for deciding the node-cluster affiliations.


**Supported Graph Types**

========== ======== ========
Undirected Directed Weighted
========== ======== ========
Yes No No
========== ======== ========

:param g_original: a networkx/igraph object
:param dimensions: Number of embedding dimensions. Default 8.
:param iterations: Number of training iterations. Default 50.
:param learning_rate: Gradient ascent learning rate. Default is 0.005.
:param naive: If False, the method uses a more efficient implementation for the gradient ascent step. Default is False.
:param affiliation_method: Method for deciding node-cluster affiliations. "argmax" assigns each node to the cluster with the highest affiliation score, while "threshold" assigns nodes to all clusters for which their affiliation score is above a certain threshold that is computed based on the graph structure (cf. Yang and Leskovec, 2013). In the latter case, communities can overlap. Default is "argmax".
:return: NodeClustering object


:Example:

>>> from cdlib import algorithms
>>> import networkx as nx
>>> G = nx.karate_club_graph()
>>> coms = algorithms.big_clam(G)

:References:

Yang, Jaewon, and Jure Leskovec. "Overlapping community detection at scale: a nonnegative matrix factorization approach." Proceedings of the sixth ACM international conference on Web search and data mining. 2013.
"""

coms = big_clam_communities(
g_original,
number_communities=dimensions,
iterations=iterations,
learning_rate=learning_rate,
naive=naive,
affiliation_method=affiliation_method,
)
Comment on lines +921 to +928

Copilot AI Apr 21, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

big_clam advertises support for networkx/igraph inputs, but it passes g_original directly into big_clam_communities(), which calls nx.to_numpy_array() and will fail for igraph graphs. Convert the input with convert_graph_formats(g_original, nx.Graph) (and use the converted graph for the internal call) to keep behavior consistent with the other algorithms in this module.

Copilot uses AI. Check for mistakes.

return NodeClustering(
coms,
g_original,
"BigClam",
method_parameters={
"dimensions": dimensions,
"iterations": iterations,
"learning_rate": learning_rate,
"naive": naive,
"affiliation_method": affiliation_method,
},
overlap=True,

Copilot AI Apr 21, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NodeClustering(overlap=True) is always set, but with affiliation_method="argmax" the output is a disjoint partition. This flag is used by downstream evaluation/serialization, so it should reflect the actual result (e.g., overlap=(affiliation_method == "threshold")).

Suggested change
overlap=True,
overlap=(affiliation_method == "threshold"),

Copilot uses AI. Check for mistakes.
)


# def danmf(
Expand Down
18 changes: 9 additions & 9 deletions cdlib/test/test_community_discovery_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,15 +365,15 @@ def test_markov_clustering(self):
if len(communities.communities[0]) > 0:
self.assertEqual(type(communities.communities[0][0]), int)

# def test_bigClam(self):
# if karateclub is None:
# return
# g = nx.karate_club_graph()
# coms = algorithms.big_clam(g)
# self.assertEqual(type(coms.communities), list)
# if len(coms.communities) > 0:
# self.assertEqual(type(coms.communities[0]), list)
# self.assertEqual(type(coms.communities[0][0]), int)
def test_bigClam(self):

Copilot AI Apr 21, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The test name test_bigClam is inconsistent with the rest of this file’s snake_case test naming (e.g., test_markov_clustering, test_multicom). Renaming to test_big_clam improves consistency and discoverability in test reports.

Suggested change
def test_bigClam(self):
def test_big_clam(self):

Copilot uses AI. Check for mistakes.
g = nx.karate_club_graph()
coms = algorithms.big_clam(g)
self.assertEqual(type(coms.communities), list)
if len(coms.communities) > 0:
for com in coms.communities:
self.assertEqual(type(com), list)
if len(com) > 0:
self.assertEqual(type(com[0]), int)

Copilot AI Apr 21, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

big_clam now has new behavior knobs (naive, affiliation_method incl. the overlapping 'threshold' mode), but the test only exercises the default settings and does not cover the threshold-based overlapping output or invalid affiliation_method values. Adding focused assertions for these branches would help prevent regressions.

Suggested change
threshold_coms = algorithms.big_clam(
g, naive=True, affiliation_method="threshold"
)
self.assertEqual(type(threshold_coms.communities), list)
if len(threshold_coms.communities) > 0:
for com in threshold_coms.communities:
self.assertEqual(type(com), list)
if len(com) > 0:
self.assertEqual(type(com[0]), int)
with self.assertRaises(ValueError):
algorithms.big_clam(g, affiliation_method="unsupported")

Copilot uses AI. Check for mistakes.
def test_lemon(self):
g = get_string_graph()
Expand Down
Loading