import abc
import dataclasses
[docs]
@dataclasses.dataclass
class ShinglesGenerator(abc.ABC):
    """Abstract interface for objects that turn a text into a set of shingles.

    The class declares no fields of its own; concrete subclasses add their
    configuration as dataclass fields and implement ``generate_shingles``.
    It cannot be instantiated directly because ``generate_shingles`` is
    abstract.
    """

    @abc.abstractmethod
    def generate_shingles(self, text: str) -> set[str]:
        """Return the set of shingles extracted from ``text``.

        Subclasses must override this method. The base implementation has no
        behavior and returns ``None`` if reached through ``super()``.
        """
[docs]
@dataclasses.dataclass
class SimpleShinglesGenerator(ShinglesGenerator):
    """Shingles generator that emits every substring of one fixed length."""

    # Length, in characters, of each shingle sliced from the text.
    ngram_size: int

    def generate_shingles(self, text: str) -> set[str]:
        """Return the distinct ``ngram_size``-character substrings of ``text``.

        A window of ``ngram_size`` characters is slid over ``text`` one
        position at a time and each slice is collected into a set, so
        duplicates collapse. When ``text`` is shorter than ``ngram_size`` the
        range of start positions is empty and an empty set is returned.
        """
        width = self.ngram_size
        shingles: set[str] = set()
        # Start offsets run from 0 up to and including len(text) - width.
        for start in range(len(text) - width + 1):
            shingles.add(text[start:start + width])
        return shingles
[docs]
@dataclasses.dataclass
class MultipleShinglesGenerator(ShinglesGenerator):
    """Shingles generator that combines shingles of several lengths."""

    # Shingle lengths to generate; one SimpleShinglesGenerator pass per entry.
    ngram_sizes: list[int]

    def generate_shingles(self, text: str) -> set[str]:
        """Return the union of the shingles of ``text`` for every configured size.

        For each entry of ``ngram_sizes`` a ``SimpleShinglesGenerator`` with
        that ``ngram_size`` is run over ``text`` and the results are merged
        into a single set. An empty ``ngram_sizes`` yields an empty set.
        """
        # Accumulate into a set *instance*: calling the unbound
        # ``set.union(*[])`` with no arguments raises TypeError, which made an
        # empty ``ngram_sizes`` crash instead of producing no shingles.
        shingles: set[str] = set()
        for size in self.ngram_sizes:
            shingles |= SimpleShinglesGenerator(ngram_size=size).generate_shingles(text)
        return shingles