# Source code for util.text_similarity.texts_similarity_filter

import dataclasses

from util.text_similarity.lsh_min_hash.lsh_min_hash import LSHMinHash
from util.text_similarity.max_independent_set_calc import MaxIndependentSetCalc


@dataclasses.dataclass(frozen=True)
class TextsSimilarityFilter:
    """Select a subset of texts based on their pairwise similarity.

    Combines two collaborators: one that reports which texts are similar
    to each other, and one that picks a set given those pairs.
    """

    # Supplies the similar pairs via ``get_similar_pairs(texts)``.
    lsh_min_hash: LSHMinHash
    # Chooses the resulting set via ``find_max_set(count, pairs)``.
    max_diff_set_calc: MaxIndependentSetCalc

    def filter_similar_texts(self, texts: list[str]) -> set[int]:
        """Return the set selected by ``max_diff_set_calc`` for ``texts``.

        Args:
            texts: The texts to compare against each other.

        Returns:
            Whatever ``find_max_set`` yields when given the number of texts
            and the similar pairs found among them -- presumably the
            indices of texts to keep so that no two kept texts are similar
            (confirm against ``MaxIndependentSetCalc``).
        """
        pairs = self.lsh_min_hash.get_similar_pairs(texts)
        text_count = len(texts)
        return self.max_diff_set_calc.find_max_set(text_count, pairs)