import dataclasses
import random
import time
import numpy as np
from nltk.corpus import words
from util.text_similarity.lsh_min_hash.lsh_min_hash import LSHMinHash
word_list = words.words()
# --- Random text generation ---
@dataclasses.dataclass
class RandomTextsGenerator:
    """
    Generate random texts by sampling words from a fixed vocabulary.

    Attributes:
        average_words: int - average number of words in a text
        number_total_available_words: int - how many words are drawn from the
            module-level ``word_list`` to build the vocabulary; only used when
            ``available_words`` is not supplied
        available_words: list[str] - vocabulary the texts are built from; when
            left as ``None`` it is sampled from ``word_list``
    """

    average_words: int
    number_total_available_words: int = 100
    # Quoted so the union is never evaluated at class-creation time, which
    # keeps the annotation valid on interpreters without ``X | None`` support.
    available_words: "list[str] | None" = None

    def __post_init__(self):
        """Build the vocabulary unless the caller already provided one."""
        if self.available_words is None:
            # random.choices samples with replacement, so the vocabulary may
            # contain fewer distinct words than number_total_available_words.
            self.available_words = random.choices(
                word_list, k=self.number_total_available_words
            )

    def get_random_num_words(self) -> int:
        """
        Return a random text length.

        The value is drawn uniformly from the inclusive integer range
        ``[int(0.5 * average_words), int(1.5 * average_words)]``.
        """
        return random.randint(
            int(self.average_words * 0.5), int(self.average_words * 1.5)
        )

    def _generate_random_text(self, num_words: int) -> str:
        """Return ``num_words`` words from the vocabulary joined by spaces."""
        # Sample from the restricted vocabulary, not the full corpus;
        # otherwise available_words would have no effect on the texts.
        return " ".join(random.choices(self.available_words, k=num_words))

    def generate_random_texts(self, num_texts: int) -> list[str]:
        """Return ``num_texts`` random texts, each with its own random length."""
        return [
            self._generate_random_text(self.get_random_num_words())
            for _ in range(num_texts)
        ]
# --- Measurement grid parameters ---
@dataclasses.dataclass(frozen=True)
class MeasurementParams:
    """
    Immutable description of a geometric grid of measurement points.

    Attributes:
        start: int - value of the first non-zero point
        factor: float - multiplier applied between consecutive points
        num_points: int - number of non-zero points to generate
        iterations: int - stored alongside the grid; not used by get_xs
    """

    start: int
    factor: float
    num_points: int
    iterations: int

    def get_xs(self) -> list[int]:
        """
        Return the measurement points.

        The result always begins with 0, followed by
        ``int(start * factor ** i)`` for ``i`` in ``range(num_points)``.
        """
        points = [0]
        for exponent in range(self.num_points):
            scaled = self.start * self.factor ** exponent
            points.append(int(scaled))
        return points
# --- LSH timing measurements ---
@dataclasses.dataclass
class MeasureLSHTimeComplexity:
    """
    Time ``get_similar_pairs`` on random texts and fit a polynomial to it.

    Attributes:
        random_text_generator: RandomTextsGenerator - source of the input texts
        lsh: LSHMinHash - the instance whose ``get_similar_pairs`` is timed
    """

    random_text_generator: RandomTextsGenerator
    lsh: LSHMinHash

    def measure_time(self, num_texts: int) -> float:
        """
        Return the seconds taken by one ``get_similar_pairs`` call.

        Text generation happens before the clock starts, so only the
        similarity search itself is timed.
        """
        texts = self.random_text_generator.generate_random_texts(num_texts)
        # perf_counter is monotonic and high-resolution; time.time() can jump
        # when the system clock is adjusted.
        start = time.perf_counter()
        # Use the configured instance rather than a default-constructed one so
        # that the caller's LSH settings are the ones being measured.
        # NOTE(review): assumes get_similar_pairs keeps no state between
        # calls - confirm against LSHMinHash.
        self.lsh.get_similar_pairs(texts)
        return time.perf_counter() - start

    def measure_avg_time(self, num_texts: int, iterations: int) -> float:
        """
        Return the mean of ``iterations`` independent timings.

        Raises:
            ValueError: if ``iterations`` is not positive.
        """
        if iterations <= 0:
            raise ValueError(f"iterations must be positive, got {iterations}")
        total = sum(self.measure_time(num_texts) for _ in range(iterations))
        return total / iterations

    def generate_measurements(
        self, m_params: MeasurementParams
    ) -> tuple[list[int], list[float]]:
        """
        Measure the average time at every point of ``m_params.get_xs()``.

        Returns:
            ``(x, y)`` where ``x`` are the text counts and ``y`` the matching
            average timings in seconds.
        """
        x = m_params.get_xs()
        y = [self.measure_avg_time(num_texts, m_params.iterations) for num_texts in x]
        return x, y

    def get_poly_coeffs(self, measurement_params: MeasurementParams, degree=2):
        """
        Fit a polynomial of the given degree to fresh measurements.

        Returns:
            ``(coeffs, x, y)`` where ``coeffs`` are the ``np.polyfit``
            coefficients (highest power first) with negative values replaced
            by 0, and ``x``/``y`` are the measurements the fit was made on.
        """
        x, y = self.generate_measurements(measurement_params)
        coeffs = np.polyfit(x, y, degree)
        # Negative coefficients are clamped to zero after the fit.
        non_negative_coeffs = [max(0, coeff) for coeff in coeffs]
        return non_negative_coeffs, x, y