"""Time-complexity analysis utilities for util.text_similarity.lsh_min_hash."""

import dataclasses
import random
import time

import numpy as np
from nltk.corpus import words

from util.text_similarity.lsh_min_hash.lsh_min_hash import LSHMinHash

# Full NLTK English word list, loaded once at import time; generators sample
# their vocabularies from it. NOTE(review): requires the NLTK "words" corpus
# to be downloaded (nltk.download("words")) — confirm in deployment setup.
word_list = words.words()


[docs] @dataclasses.dataclass class RandomTextsGenerator: """ Class to generate random texts Attributes: average_words: int - average number of words in a text number_total_available_words: int - the number of words that are available to generate random texts available_words: list[str] - list of available words """ average_words: int number_total_available_words: int = 100 available_words: list[str] = None def __post_init__(self): self.available_words = random.choices( word_list, k=self.number_total_available_words )
[docs] def get_random_num_words(self): return random.randint( int(self.average_words * 0.5), int(self.average_words * 1.5) )
def _generate_random_text(self, num_words: int): random_text = " ".join(random.choices(word_list, k=num_words)) return random_text
[docs] def generate_random_texts(self, num_texts: int): return [ self._generate_random_text(self.get_random_num_words()) for _ in range(num_texts) ]
@dataclasses.dataclass(frozen=True)
class MeasurementParams:
    """Parameters controlling a geometric sweep of measurement input sizes.

    Attributes:
        start: first non-zero input size.
        factor: geometric growth factor between consecutive sizes.
        num_points: number of non-zero sizes to generate.
        iterations: repetitions used when averaging each timing.
    """

    start: int
    factor: float
    num_points: int
    iterations: int

    def get_xs(self) -> list[int]:
        """Return input sizes ``[0, start, start*factor, ...]`` (num_points + 1 values).

        The leading 0 anchors the later polynomial fit at the origin.
        """
        # Dropped the redundant list(...) wrapper around the comprehension.
        return [0] + [int(self.start * self.factor**i) for i in range(self.num_points)]
@dataclasses.dataclass
class MeasureLSHTimeComplexity:
    """Measure the empirical time complexity of LSH similar-pair search.

    Attributes:
        random_text_generator: source of random input texts.
        lsh: the LSHMinHash instance under measurement.
    """

    random_text_generator: RandomTextsGenerator
    lsh: LSHMinHash

    def measure_time(self, num_texts: int) -> float:
        """Return wall-clock seconds to find similar pairs among num_texts texts.

        Text generation happens before the timer starts, so only
        ``get_similar_pairs`` is measured.
        """
        # Bug fix: use the configured self.lsh instead of instantiating a
        # fresh LSHMinHash() here, which silently ignored the dataclass field.
        texts = self.random_text_generator.generate_random_texts(num_texts)
        start = time.time()
        self.lsh.get_similar_pairs(texts)
        return time.time() - start

    def measure_avg_time(self, num_texts: int, iterations: int) -> float:
        """Return the mean of `iterations` timing runs for num_texts texts."""
        return sum(self.measure_time(num_texts) for _ in range(iterations)) / iterations

    def generate_measurements(self, m_params: MeasurementParams):
        """Return (sizes, avg_times) for every size in m_params.get_xs()."""
        x = m_params.get_xs()
        # Dropped the redundant list(...) wrapper around the comprehension.
        y = [self.measure_avg_time(num_texts, m_params.iterations) for num_texts in x]
        return x, y

    def get_poly_coeffs(self, measurement_params: MeasurementParams, degree=2):
        """Fit a degree-`degree` polynomial to the timing curve.

        Returns:
            (non_negative_coeffs, x, y): clamped coefficients (highest power
            first, per np.polyfit), the input sizes, and the averaged times.
        """
        x, y = self.generate_measurements(measurement_params)
        coeffs = np.polyfit(x, y, degree)
        # Timing noise can yield slightly negative coefficients; clamp at 0.
        non_negative_coeffs = [max(0, coeff) for coeff in coeffs]
        return non_negative_coeffs, x, y