Source code for util.text_similarity.max_independent_set_calc

import abc

import networkx as nx
from heapdict import heapdict
from igraph import Graph
from networkx.algorithms import approximation as approx


[docs] class MaxIndependentSetCalc(abc.ABC):
[docs] @abc.abstractmethod def find_max_set( self, num_texts: int, similar_pairs: list[tuple[int, int]] ) -> set[int]: pass
[docs] class OptimalIndependentSetCalc(MaxIndependentSetCalc):
[docs] def find_max_set( self, num_texts: int, similar_pairs: list[tuple[int, int]] ) -> set[int]: """ gets the maximum independent set of a graph :param num_texts: :param similar_pairs: :return: """ g = Graph() g.add_vertices(num_texts) g.add_edges(similar_pairs) independent_sets: list[tuple[int]] = g.largest_independent_vertex_sets() return set(sorted(independent_sets)[0]) if independent_sets else set()
[docs] class ApproximateIndependentSetCalc(MaxIndependentSetCalc):
[docs] def find_max_set( self, num_texts: int, similar_pairs: list[tuple[int, int]] ) -> set[int]: """ gets the approximate maximum independent set of a graph :param num_texts: :param similar_pairs: :return: """ graph = nx.Graph() graph.add_nodes_from(range(num_texts)) # Add edges representing similarity between texts graph.add_edges_from(similar_pairs) return set(approx.maximum_independent_set(graph))
[docs] class GreedyIndependentSetCalc(MaxIndependentSetCalc):
[docs] def find_max_set( self, num_texts: int, similar_pairs: list[tuple[int, int]] ) -> set[int]: """ Greedy algorithm to find an approximate maximum independent set by iteratively removing the node with the highest degree and its edges. :param num_texts: Number of vertices (texts) :param similar_pairs: List of edges representing pairs of similar texts :return: Set of vertices in the approximate maximum independent set """ graph = nx.Graph() graph.add_nodes_from(range(num_texts)) graph.add_edges_from(similar_pairs) independent_set = set() degree_heap = heapdict() # Initialize the heap with node degrees (negated for max-heap behavior) for node in graph.nodes: degree_heap[node] = -len(list(graph.neighbors(node))) while graph.edges: # Get the node with the most edges (highest degree) node = degree_heap.popitem()[0] neighbors = list(graph.neighbors(node)) # Remove node and its edges from the graph graph.remove_node(node) # noinspection PyTypeChecker for neighbor in neighbors: if neighbor in degree_heap: degree_heap[neighbor] += 1 for node in graph.nodes: independent_set.add(node) return independent_set