-
Notifications
You must be signed in to change notification settings - Fork 77
feat: restore BigClam compatibility with networkx>=3.0 (no karateclub library dependency) #257
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -57,35 +57,73 @@ def gradient(F, A, i): | |||||||||||||||||
| grad = sum_neigh - sum_nneigh | ||||||||||||||||||
| return grad | ||||||||||||||||||
|
|
||||||||||||||||||
| def gradient_fast(F, A, i): | ||||||||||||||||||
| r"""Fast implementation of the gradient function, considering | ||||||||||||||||||
| equation 4 of https://cs.stanford.edu/people/jure/pubs/bigclam-wsdm13.pdf | ||||||||||||||||||
|
|
||||||||||||||||||
| def train(A, C, iterations=100): | ||||||||||||||||||
| .. math:: | ||||||||||||||||||
|
|
||||||||||||||||||
| \nabla l(F_u) = | ||||||||||||||||||
| \sum_{v \in N(u)} F_v \left(1 + \frac{e^{-F_u^T F_v}}{1-e^{-F_u^T F_v}}\right) | ||||||||||||||||||
| - \sum_v F_v + F_u | ||||||||||||||||||
|
|
||||||||||||||||||
| """ | ||||||||||||||||||
| _, C = F.shape | ||||||||||||||||||
| neighbours = np.where(A[i])[0] | ||||||||||||||||||
|
|
||||||||||||||||||
| grad = np.zeros((C,)) | ||||||||||||||||||
| for nb in neighbours: | ||||||||||||||||||
| dotproduct = F[nb].dot(F[i]) | ||||||||||||||||||
| grad += F[nb] * (1 + sigm(dotproduct)) | ||||||||||||||||||
| grad -= np.sum(F, axis=0) | ||||||||||||||||||
| grad += F[i] | ||||||||||||||||||
| return grad | ||||||||||||||||||
|
|
||||||||||||||||||
| def get_embeddings(A, C, iterations=100, learning_rate=0.005, naive=False): | ||||||||||||||||||
| # initialize an F | ||||||||||||||||||
| N = A.shape[0] | ||||||||||||||||||
| F = np.random.rand(N, C) | ||||||||||||||||||
|
|
||||||||||||||||||
| for n in range(iterations): | ||||||||||||||||||
| for person in range(N): | ||||||||||||||||||
| grad = gradient(F, A, person) | ||||||||||||||||||
| if naive: | ||||||||||||||||||
| grad = gradient(F, A, person) | ||||||||||||||||||
| else: | ||||||||||||||||||
| grad = gradient_fast(F, A, person) | ||||||||||||||||||
|
|
||||||||||||||||||
| F[person] += 0.005 * grad | ||||||||||||||||||
| F[person] += learning_rate * grad | ||||||||||||||||||
|
|
||||||||||||||||||
| F[person] = np.maximum(0.001, F[person]) # F should be nonnegative | ||||||||||||||||||
| log_likelihood(F, A) | ||||||||||||||||||
| F[person] = np.maximum(0.00001, F[person]) # F should be nonnegative | ||||||||||||||||||
| # log_likelihood(F, A) | ||||||||||||||||||
|
Comment on lines
+94
to
+97
|
||||||||||||||||||
| return F | ||||||||||||||||||
|
|
||||||||||||||||||
|
|
||||||||||||||||||
| def big_Clam(graph, number_communities): | ||||||||||||||||||
| adj = nx.to_numpy_matrix(graph) | ||||||||||||||||||
| F = train(adj, number_communities) | ||||||||||||||||||
| F_argmax = np.argmax(F, 1) | ||||||||||||||||||
| dict_communities = {} | ||||||||||||||||||
| for i in range(0, number_communities): | ||||||||||||||||||
| dict_communities[i] = [] | ||||||||||||||||||
| for node, com in zip(graph.nodes(), F_argmax): | ||||||||||||||||||
| dict_communities[com].append(node) | ||||||||||||||||||
| def get_communities(F, graph, number_communities, method='argmax'): | ||||||||||||||||||
| if method == 'argmax': | ||||||||||||||||||
| F_argmax = np.argmax(F, 1) | ||||||||||||||||||
| dict_communities = {com: [] for com in range(number_communities)} | ||||||||||||||||||
| for node, com in zip(graph.nodes(), F_argmax.tolist()): | ||||||||||||||||||
| dict_communities[com].append(node) | ||||||||||||||||||
| elif method == 'threshold': | ||||||||||||||||||
| n, m = graph.number_of_nodes(), graph.number_of_edges() | ||||||||||||||||||
| epsilon = 2 * m / (n * (n - 1)) | ||||||||||||||||||
|
Comment on lines
+107
to
+108
|
||||||||||||||||||
| n, m = graph.number_of_nodes(), graph.number_of_edges() | |
| epsilon = 2 * m / (n * (n - 1)) | |
| n = graph.number_of_nodes() | |
| if n < 2: | |
| epsilon = 0.0 | |
| else: | |
| epsilon = nx.density(graph) | |
| epsilon = min(max(epsilon, 0.0), np.nextafter(1.0, 0.0)) |
Copilot
AI
Apr 21, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The raised ValueError("Method not supported") is hard to act on and drops the invalid value. Include the provided method and the allowed values (e.g., 'argmax'/'threshold') so callers can quickly correct their input.
| raise ValueError("Method not supported") | |
| raise ValueError( | |
| f"Method '{method}' not supported. Allowed values are: 'argmax', 'threshold'." | |
| ) |
Copilot
AI
Apr 21, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
get_communities() currently returns a fixed-length list with potentially many empty communities (especially with argmax when some communities get no assignments, or with threshold when delta is high). Consider filtering out empty communities before returning to match other algorithms in this module and avoid downstream metrics having to handle empty clusters.
| list_communities = [] | |
| for com in dict_communities: | |
| list_communities.append(dict_communities[com]) | |
| list_communities = [members for members in dict_communities.values() if members] |
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -8,6 +8,7 @@ | |||||
| from cdlib import NodeClustering | ||||||
| from cdlib.random import get_seed | ||||||
| from cdlib.utils import suppress_stdout, convert_graph_formats, nx_node_integer_mapping | ||||||
| from cdlib.algorithms.internal.BIGCLAM import big_clam_communities | ||||||
| from cdlib.algorithms.internal.CONGO import Congo_ | ||||||
| from cdlib.algorithms.internal.CONGA import Conga_ | ||||||
| from cdlib.algorithms.internal.LAIS2_nx import LAIS2 | ||||||
|
|
@@ -95,7 +96,7 @@ | |||||
| "lemon", | ||||||
| "slpa", | ||||||
| "multicom", | ||||||
| # "big_clam", | ||||||
| "big_clam", | ||||||
| # "danmf", | ||||||
| # "egonet_splitter", | ||||||
| # "nnsed", | ||||||
|
|
@@ -875,72 +876,70 @@ def multicom(g_original: object, seed_node: object) -> NodeClustering: | |||||
| ) | ||||||
|
|
||||||
|
|
||||||
| # def big_clam( | ||||||
| # g_original: object, | ||||||
| # dimensions: int = 8, | ||||||
| # iterations: int = 50, | ||||||
| # learning_rate: float = 0.005, | ||||||
| # ) -> NodeClustering: | ||||||
| # """ | ||||||
| # BigClam is an overlapping community detection method that scales to large networks. | ||||||
| # The procedure uses gradient ascent to create an embedding which is used for deciding the node-cluster affiliations. | ||||||
| # | ||||||
| # | ||||||
| # **Supported Graph Types** | ||||||
| # | ||||||
| # ========== ======== ======== | ||||||
| # Undirected Directed Weighted | ||||||
| # ========== ======== ======== | ||||||
| # Yes No No | ||||||
| # ========== ======== ======== | ||||||
| # | ||||||
| # :param g_original: a networkx/igraph object | ||||||
| # :param dimensions: Number of embedding dimensions. Default 8. | ||||||
| # :param iterations: Number of training iterations. Default 50. | ||||||
| # :param learning_rate: Gradient ascent learning rate. Default is 0.005. | ||||||
| # :return: NodeClustering object | ||||||
| # | ||||||
| # | ||||||
| # :Example: | ||||||
| # | ||||||
| # >>> from cdlib import algorithms | ||||||
| # >>> import networkx as nx | ||||||
| # >>> G = nx.karate_club_graph() | ||||||
| # >>> coms = algorithms.big_clam(G) | ||||||
| # | ||||||
| # :References: | ||||||
| # | ||||||
| # Yang, Jaewon, and Jure Leskovec. "Overlapping community detection at scale: a nonnegative matrix factorization approach." Proceedings of the sixth ACM international conference on Web search and data mining. 2013. | ||||||
| # | ||||||
| # .. note:: Reference implementation: https://karateclub.readthedocs.io/ | ||||||
| # """ | ||||||
| # __try_load_karate() | ||||||
| # g = convert_graph_formats(g_original, nx.Graph) | ||||||
| # | ||||||
| # model = karateclub.BigClam( | ||||||
| # dimensions=dimensions, iterations=iterations, learning_rate=learning_rate | ||||||
| # ) | ||||||
| # model.fit(g) | ||||||
| # members = model.get_memberships() | ||||||
| # | ||||||
| # # Reshaping the results | ||||||
| # coms_to_node = defaultdict(list) | ||||||
| # for n, c in members.items(): | ||||||
| # coms_to_node[c].append(n) | ||||||
| # | ||||||
| # coms = [list(c) for c in coms_to_node.values()] | ||||||
| # | ||||||
| # return NodeClustering( | ||||||
| # coms, | ||||||
| # g_original, | ||||||
| # "BigClam", | ||||||
| # method_parameters={ | ||||||
| # "dimensions": dimensions, | ||||||
| # "iterations": iterations, | ||||||
| # "learning_rate": learning_rate, | ||||||
| # }, | ||||||
| # overlap=True, | ||||||
| # ) | ||||||
| def big_clam( | ||||||
| g_original: object, | ||||||
| dimensions: int = 8, | ||||||
| iterations: int = 50, | ||||||
| learning_rate: float = 0.005, | ||||||
| naive: bool = False, | ||||||
| affiliation_method: str = "argmax", | ||||||
| ) -> NodeClustering: | ||||||
| """ | ||||||
| BigClam is an overlapping community detection method that scales to large networks. | ||||||
| The procedure uses gradient ascent to create an embedding which is used for deciding the node-cluster affiliations. | ||||||
|
|
||||||
|
|
||||||
| **Supported Graph Types** | ||||||
|
|
||||||
| ========== ======== ======== | ||||||
| Undirected Directed Weighted | ||||||
| ========== ======== ======== | ||||||
| Yes No No | ||||||
| ========== ======== ======== | ||||||
|
|
||||||
| :param g_original: a networkx/igraph object | ||||||
| :param dimensions: Number of embedding dimensions. Default 8. | ||||||
| :param iterations: Number of training iterations. Default 50. | ||||||
| :param learning_rate: Gradient ascent learning rate. Default is 0.005. | ||||||
| :param naive: If False, the method uses a more efficient implementation for the gradient ascent step. Default is False. | ||||||
| :param affiliation_method: Method for deciding node-cluster affiliations. "argmax" assigns each node to the cluster with the highest affiliation score, while "threshold" assigns nodes to all clusters for which their affiliation score is above a certain threshold that is computed based on the graph structure (cf. Yang and Leskovec, 2013). In the latter case, communities can overlap. Default is "argmax". | ||||||
| :return: NodeClustering object | ||||||
|
|
||||||
|
|
||||||
| :Example: | ||||||
|
|
||||||
| >>> from cdlib import algorithms | ||||||
| >>> import networkx as nx | ||||||
| >>> G = nx.karate_club_graph() | ||||||
| >>> coms = algorithms.big_clam(G) | ||||||
|
|
||||||
| :References: | ||||||
|
|
||||||
| Yang, Jaewon, and Jure Leskovec. "Overlapping community detection at scale: a nonnegative matrix factorization approach." Proceedings of the sixth ACM international conference on Web search and data mining. 2013. | ||||||
| """ | ||||||
|
|
||||||
| coms = big_clam_communities( | ||||||
| g_original, | ||||||
| number_communities=dimensions, | ||||||
| iterations=iterations, | ||||||
| learning_rate=learning_rate, | ||||||
| naive=naive, | ||||||
| affiliation_method=affiliation_method, | ||||||
| ) | ||||||
|
Comment on lines
+921
to
+928
|
||||||
|
|
||||||
| return NodeClustering( | ||||||
| coms, | ||||||
| g_original, | ||||||
| "BigClam", | ||||||
| method_parameters={ | ||||||
| "dimensions": dimensions, | ||||||
| "iterations": iterations, | ||||||
| "learning_rate": learning_rate, | ||||||
| "naive": naive, | ||||||
| "affiliation_method": affiliation_method, | ||||||
| }, | ||||||
| overlap=True, | ||||||
|
||||||
| overlap=True, | |
| overlap=(affiliation_method == "threshold"), |
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -365,15 +365,15 @@ def test_markov_clustering(self): | |||||||||||||||||||||||||||||
| if len(communities.communities[0]) > 0: | ||||||||||||||||||||||||||||||
| self.assertEqual(type(communities.communities[0][0]), int) | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| # def test_bigClam(self): | ||||||||||||||||||||||||||||||
| # if karateclub is None: | ||||||||||||||||||||||||||||||
| # return | ||||||||||||||||||||||||||||||
| # g = nx.karate_club_graph() | ||||||||||||||||||||||||||||||
| # coms = algorithms.big_clam(g) | ||||||||||||||||||||||||||||||
| # self.assertEqual(type(coms.communities), list) | ||||||||||||||||||||||||||||||
| # if len(coms.communities) > 0: | ||||||||||||||||||||||||||||||
| # self.assertEqual(type(coms.communities[0]), list) | ||||||||||||||||||||||||||||||
| # self.assertEqual(type(coms.communities[0][0]), int) | ||||||||||||||||||||||||||||||
| def test_bigClam(self): | ||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||
| def test_bigClam(self): | |
| def test_big_clam(self): |
Copilot
AI
Apr 21, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
big_clam now has new behavior knobs (naive, affiliation_method incl. the overlapping 'threshold' mode), but the test only exercises the default settings and does not cover the threshold-based overlapping output or invalid affiliation_method values. Adding focused assertions for these branches would help prevent regressions.
| threshold_coms = algorithms.big_clam( | |
| g, naive=True, affiliation_method="threshold" | |
| ) | |
| self.assertEqual(type(threshold_coms.communities), list) | |
| if len(threshold_coms.communities) > 0: | |
| for com in threshold_coms.communities: | |
| self.assertEqual(type(com), list) | |
| if len(com) > 0: | |
| self.assertEqual(type(com[0]), int) | |
| with self.assertRaises(ValueError): | |
| algorithms.big_clam(g, affiliation_method="unsupported") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`gradient_fast()` recomputes `np.sum(F, axis=0)` for every node update, which is O(N*C) per node and can erase the intended speedup on sparse graphs. Consider computing the global sum once per outer iteration (or maintaining it incrementally as F updates) and passing it into `gradient_fast`.