def gradient_fast(F, A, i, sum_F=None):
    r"""Fast implementation of the gradient function, considering
    equation 4 of https://cs.stanford.edu/people/jure/pubs/bigclam-wsdm13.pdf

    .. math::

        \nabla l(F_u) =
        \sum_{v \in N(u)} F_v \left(1 + \frac{e^{-F_u^T F_v}}{1-e^{-F_u^T F_v}}\right)
        - \sum_v F_v + F_u

    :param F: (N, C) array of node-community affiliation strengths.
    :param A: adjacency structure indexable by row; the non-zero entries of
        ``A[i]`` are taken as the neighbours of node ``i``.
    :param i: index of the node whose gradient is computed.
    :param sum_F: optional pre-computed column sum of ``F`` (``F.sum(axis=0)``).
        When omitted it is recomputed here, which costs O(N * C) per call;
        callers that sweep over all nodes should cache it and pass it in.
    :return: array of shape (C,) with the gradient for row ``i`` of ``F``.
    """
    _, C = F.shape
    neighbours = np.where(A[i])[0]

    grad = np.zeros((C,))
    for nb in neighbours:
        dotproduct = F[nb].dot(F[i])
        # `sigm` is defined elsewhere in this module; presumably it evaluates
        # e^{-x} / (1 - e^{-x}) as in the formula above -- confirm there.
        grad += F[nb] * (1 + sigm(dotproduct))
    grad -= np.sum(F, axis=0) if sum_F is None else sum_F
    grad += F[i]
    return grad


def get_embeddings(A, C, iterations=100, learning_rate=0.005, naive=False):
    """Learn a non-negative affiliation matrix by coordinate gradient ascent.

    :param A: (N, N) adjacency array.
    :param C: number of communities (columns of the embedding).
    :param iterations: number of full sweeps over all nodes.
    :param learning_rate: step size of each gradient-ascent update.
    :param naive: if True use ``gradient`` (the reference implementation),
        otherwise use ``gradient_fast``.
    :return: (N, C) array ``F``; every entry is at least ``0.00001``.
    """
    N = A.shape[0]
    # Random initialisation drawn from numpy's global random state.
    F = np.random.rand(N, C)

    for _ in range(iterations):
        # The column sum of F is shared by every node's gradient. It is
        # refreshed once per sweep (bounding floating-point drift) and then
        # updated incrementally instead of being recomputed for each node.
        column_sum = None if naive else F.sum(axis=0)
        for person in range(N):
            if naive:
                grad = gradient(F, A, person)
            else:
                grad = gradient_fast(F, A, person, sum_F=column_sum)

            # Clamp so that F stays strictly positive (F should be nonnegative).
            updated = np.maximum(0.00001, F[person] + learning_rate * grad)
            if column_sum is not None:
                column_sum += updated - F[person]
            F[person] = updated
    return F


def _membership_threshold(n_nodes, n_edges):
    """Return delta = sqrt(-log(1 - epsilon)), with epsilon the graph density.

    Degenerate inputs are handled explicitly: with fewer than two nodes the
    density is undefined and treated as 0; with density >= 1 the logarithm is
    not finite, so the threshold is infinite (no affiliation can reach it).
    """
    possible_pairs = n_nodes * (n_nodes - 1)
    epsilon = 2 * n_edges / possible_pairs if possible_pairs > 0 else 0.0
    if epsilon >= 1:
        return np.inf
    return np.sqrt(-np.log(1 - epsilon))


def get_communities(F, graph, number_communities, method='argmax'):
    """Turn an affiliation matrix into a list of communities.

    :param F: (N, C) affiliation matrix; rows follow the order of
        ``graph.nodes()``.
    :param graph: object exposing ``nodes()``, ``number_of_nodes()`` and
        ``number_of_edges()``.
    :param number_communities: number of communities (columns of ``F``).
    :param method: ``'argmax'`` puts each node in its strongest community;
        ``'threshold'`` puts a node in every community whose affiliation is
        at least ``sqrt(-log(1 - epsilon))``, where ``epsilon`` is the graph
        density, so communities may overlap.
    :return: list with ``number_communities`` lists of nodes, one per
        embedding column; some of them may be empty.
    :raises ValueError: if ``method`` is not one of the supported values.
    """
    dict_communities = {com: [] for com in range(number_communities)}

    if method == 'argmax':
        strongest = np.argmax(F, 1).tolist()
        for node, com in zip(graph.nodes(), strongest):
            dict_communities[com].append(node)
    elif method == 'threshold':
        delta = _membership_threshold(
            graph.number_of_nodes(), graph.number_of_edges()
        )
        # In this case a node can belong to multiple communities.
        for node, membership in zip(graph.nodes(), F >= delta):
            for com in np.flatnonzero(membership).tolist():
                dict_communities[com].append(node)
    else:
        raise ValueError("Method not supported")

    return list(dict_communities.values())


def big_clam_communities(graph, number_communities, iterations=100, learning_rate=0.005, naive=False, affiliation_method='argmax'):
    """Run BigClam on a networkx graph and return the detected communities.

    The graph is converted to an unweighted adjacency array, an embedding is
    trained with ``get_embeddings`` and communities are extracted from it with
    ``get_communities``.

    :param graph: networkx graph.
    :param number_communities: number of communities to look for.
    :param iterations: number of training sweeps.
    :param learning_rate: gradient-ascent step size.
    :param naive: use the reference gradient instead of the fast one.
    :param affiliation_method: ``'argmax'`` or ``'threshold'``.
    :return: list of communities, each a list of nodes.
    """
    # weight=None: every edge counts as 1, whatever its edge attributes.
    adj = nx.to_numpy_array(graph, weight=None)
    F = get_embeddings(adj, number_communities, iterations=iterations, learning_rate=learning_rate, naive=naive)

    return get_communities(F, graph, number_communities, method=affiliation_method)
def big_clam(
    g_original: object,
    dimensions: int = 8,
    iterations: int = 50,
    learning_rate: float = 0.005,
    naive: bool = False,
    affiliation_method: str = "argmax",
) -> NodeClustering:
    """
    BigClam is an overlapping community detection method that scales to large networks.
    The procedure uses gradient ascent to create an embedding which is used for deciding the node-cluster affiliations.


    **Supported Graph Types**

    ========== ======== ========
    Undirected Directed Weighted
    ========== ======== ========
    Yes        No       No
    ========== ======== ========

    :param g_original: a networkx/igraph object
    :param dimensions: Number of embedding dimensions. Default 8.
    :param iterations: Number of training iterations. Default 50.
    :param learning_rate: Gradient ascent learning rate. Default is 0.005.
    :param naive: If False, the method uses a more efficient implementation for the gradient ascent step. Default is False.
    :param affiliation_method: Method for deciding node-cluster affiliations. "argmax" assigns each node to the cluster with the highest affiliation score, while "threshold" assigns nodes to all clusters for which their affiliation score is above a certain threshold that is computed based on the graph structure (cf. Yang and Leskovec, 2013). In the latter case, communities can overlap. Default is "argmax".
    :return: NodeClustering object


    :Example:

    >>> from cdlib import algorithms
    >>> import networkx as nx
    >>> G = nx.karate_club_graph()
    >>> coms = algorithms.big_clam(G)

    :References:

    Yang, Jaewon, and Jure Leskovec. "Overlapping community detection at scale: a nonnegative matrix factorization approach." Proceedings of the sixth ACM international conference on Web search and data mining. 2013.
    """
    # The internal implementation works on networkx graphs only, so convert
    # first; this keeps the documented igraph support working.
    g = convert_graph_formats(g_original, nx.Graph)

    coms = big_clam_communities(
        g,
        number_communities=dimensions,
        iterations=iterations,
        learning_rate=learning_rate,
        naive=naive,
        affiliation_method=affiliation_method,
    )
    # One list is returned per embedding dimension; dimensions that attracted
    # no node are dropped so the clustering holds no empty communities.
    coms = [com for com in coms if com]

    return NodeClustering(
        coms,
        g_original,
        "BigClam",
        method_parameters={
            "dimensions": dimensions,
            "iterations": iterations,
            "learning_rate": learning_rate,
            "naive": naive,
            "affiliation_method": affiliation_method,
        },
        overlap=True,
    )
def test_lemon(self): g = get_string_graph()