import dataclasses
from util.text_similarity.lsh_min_hash.lsh_min_hash import LSHMinHash
from util.text_similarity.max_independent_set_calc import MaxIndependentSetCalc
@dataclasses.dataclass(frozen=True)
class TextsSimilarityFilter:
    """Immutable pairing of a similar-pair detector with a set selector.

    The class holds no logic of its own beyond wiring the two collaborators
    together in :meth:`filter_similar_texts`.

    Attributes:
        lsh_min_hash: Collaborator providing ``get_similar_pairs(texts)``;
            its result is forwarded unchanged to ``max_diff_set_calc``.
        max_diff_set_calc: Collaborator providing
            ``find_max_set(count, pairs)``; its result is what
            :meth:`filter_similar_texts` returns.
    """

    lsh_min_hash: LSHMinHash
    max_diff_set_calc: MaxIndependentSetCalc

    def filter_similar_texts(self, texts: list[str]) -> set[int]:
        """Select a set of integers for ``texts`` based on pairwise similarity.

        Args:
            texts: The texts to examine. Only the list itself and its length
                are used here; the content is interpreted by
                ``lsh_min_hash``.

        Returns:
            Whatever ``max_diff_set_calc.find_max_set`` produces for
            ``len(texts)`` and the detected similar pairs.
            NOTE(review): presumably the indices into ``texts`` of a largest
            subset in which no two texts were reported as similar - confirm
            against ``MaxIndependentSetCalc``.
        """
        similar_pairs = self.lsh_min_hash.get_similar_pairs(texts)
        # The selector only needs the number of texts plus the similarity
        # pairs; the texts themselves are not passed on.
        return self.max_diff_set_calc.find_max_set(len(texts), similar_pairs)