util.text_similarity.lsh_min_hash package

Submodules

util.text_similarity.lsh_min_hash.lsh_min_hash module

class util.text_similarity.lsh_min_hash.lsh_min_hash.TextMinHash(index: int, text: str, minhash: datasketch.minhash.MinHash)[source]

Bases: object

index: int
text: str
minhash: MinHash
__init__(index: int, text: str, minhash: MinHash) -> None
class util.text_similarity.lsh_min_hash.lsh_min_hash.LSHMinHash(shingles_generator: util.text_similarity.lsh_min_hash.shingles_generator.ShinglesGenerator = <factory>, threshold: float = 0.8, num_perm: int = 128)[source]

Bases: object

shingles_generator: ShinglesGenerator
threshold: float = 0.8
num_perm: int = 128
lsh: MinHashLSH | None = None
get_similar_pairs(texts: list[str]) -> list[tuple[int, int]][source]
__init__(shingles_generator: util.text_similarity.lsh_min_hash.shingles_generator.ShinglesGenerator = <factory>, threshold: float = 0.8, num_perm: int = 128) -> None

util.text_similarity.lsh_min_hash.lsh_min_hash_test module

util.text_similarity.lsh_min_hash.lsh_min_hash_test.test_get_similar_pairs(texts, expected_pairs)[source]
util.text_similarity.lsh_min_hash.lsh_min_hash_test.test_threshold_variation(threshold, texts, expected_pairs)[source]
util.text_similarity.lsh_min_hash.lsh_min_hash_test.test_large_text_set()[source]
util.text_similarity.lsh_min_hash.lsh_min_hash_test.test_absolute_time_should_be_small()[source]

util.text_similarity.lsh_min_hash.shingles_generator module

class util.text_similarity.lsh_min_hash.shingles_generator.ShinglesGenerator[source]

Bases: ABC

abstractmethod generate_shingles(text: str) -> set[str][source]
__init__() -> None
class util.text_similarity.lsh_min_hash.shingles_generator.SimpleShinglesGenerator(ngram_size: int)[source]

Bases: ShinglesGenerator

ngram_size: int
generate_shingles(text: str) -> set[str][source]
__init__(ngram_size: int) -> None
class util.text_similarity.lsh_min_hash.shingles_generator.MultipleShinglesGenerator(ngram_sizes: list[int])[source]

Bases: ShinglesGenerator

ngram_sizes: list[int]
generate_shingles(text: str) -> set[str][source]
__init__(ngram_sizes: list[int]) -> None

util.text_similarity.lsh_min_hash.time_analysis module

class util.text_similarity.lsh_min_hash.time_analysis.RandomTextsGenerator(average_words: int, number_total_available_words: int = 100, available_words: list[str] = None)[source]

Bases: object

Class to generate random texts

average_words

The average number of words in a text.

Type:

int

number_total_available_words

The number of words available for generating random texts.

Type:

int

available_words

The list of available words.

Type:

list[str]

average_words: int
number_total_available_words: int = 100
available_words: list[str] = None
get_random_num_words()[source]
generate_random_texts(num_texts: int)[source]
__init__(average_words: int, number_total_available_words: int = 100, available_words: list[str] = None) -> None
class util.text_similarity.lsh_min_hash.time_analysis.MeasurementParams(start: int, factor: float, num_points: int, iterations: int)[source]

Bases: object

start: int
factor: float
num_points: int
iterations: int
get_xs() -> list[int][source]
__init__(start: int, factor: float, num_points: int, iterations: int) -> None
class util.text_similarity.lsh_min_hash.time_analysis.MeasureLSHTimeComplexity(random_text_generator: util.text_similarity.lsh_min_hash.time_analysis.RandomTextsGenerator, lsh: util.text_similarity.lsh_min_hash.lsh_min_hash.LSHMinHash)[source]

Bases: object

random_text_generator: RandomTextsGenerator
lsh: LSHMinHash
measure_time(num_texts: int)[source]
measure_avg_time(num_texts: int, iterations: int) -> float[source]
generate_measurements(m_params: MeasurementParams)[source]
get_poly_coeffs(measurement_params: MeasurementParams, degree=2)[source]
__init__(random_text_generator: RandomTextsGenerator, lsh: LSHMinHash) -> None

Module contents