Source code for util.text_similarity.lsh_min_hash.shingles_generator

import abc
import dataclasses


@dataclasses.dataclass
class ShinglesGenerator(abc.ABC):
    """Abstract interface for objects that turn a text into a set of shingles.

    Concrete subclasses must implement :meth:`generate_shingles`; this base
    class cannot be instantiated directly.
    """

    @abc.abstractmethod
    def generate_shingles(self, text: str) -> set[str]:
        """Return the set of shingles extracted from *text*.

        Args:
            text: The string to split into shingles.

        Returns:
            The distinct shingles produced for ``text``.
        """
@dataclasses.dataclass
class SimpleShinglesGenerator(ShinglesGenerator):
    """Shingles generator that emits every substring of one fixed length."""

    # Length, in characters, of each shingle; must be at least 1.
    ngram_size: int

    def generate_shingles(self, text: str) -> set[str]:
        """Return the distinct ``ngram_size``-character substrings of *text*.

        A window of ``ngram_size`` characters is slid over ``text`` one
        position at a time and every window content is collected.

        Args:
            text: The string to slide the window over.

        Returns:
            The set of all contiguous substrings of length ``ngram_size``.
            The set is empty when ``text`` is shorter than ``ngram_size``.

        Raises:
            ValueError: If ``ngram_size`` is less than 1.
        """
        size = self.ngram_size
        # A zero size would yield only the empty string and a negative size
        # would produce end-relative slices of arbitrary length, so reject
        # both instead of silently returning meaningless shingles.
        if size < 1:
            raise ValueError(
                f"ngram_size must be a positive integer, got {size!r}"
            )
        # Number of window start positions; non-positive when the text is
        # shorter than the window, which makes the range (and result) empty.
        window_count = len(text) - size + 1
        return {text[start:start + size] for start in range(window_count)}
@dataclasses.dataclass
class MultipleShinglesGenerator(ShinglesGenerator):
    """Shingles generator that combines shingles of several lengths."""

    # Shingle lengths to generate; may be empty, in which case no shingles
    # are produced.
    ngram_sizes: list[int]

    def generate_shingles(self, text: str) -> set[str]:
        """Return the union of the shingles of *text* for every configured size.

        Each size in ``ngram_sizes`` is delegated to a
        ``SimpleShinglesGenerator`` and the resulting sets are merged.

        Args:
            text: The string to split into shingles.

        Returns:
            The set of all shingles of any configured length. An empty
            ``ngram_sizes`` yields an empty set.
        """
        per_size_shingles = (
            SimpleShinglesGenerator(ngram_size=size).generate_shingles(text)
            for size in self.ngram_sizes
        )
        # Fold into a fresh empty set rather than calling the unbound
        # ``set.union(*sets)``: the unbound form raises TypeError when there
        # are no sets to unpack (i.e. ``ngram_sizes`` is empty).
        return set().union(*per_size_shingles)