util.text_similarity.lsh_min_hash package
Submodules
util.text_similarity.lsh_min_hash.lsh_min_hash module
- class util.text_similarity.lsh_min_hash.lsh_min_hash.TextMinHash(index: int, text: str, minhash: datasketch.minhash.MinHash)[source]
Bases:
object
- index: int
- text: str
- minhash: MinHash
- __init__(index: int, text: str, minhash: MinHash) None
- class util.text_similarity.lsh_min_hash.lsh_min_hash.LSHMinHash(shingles_generator: util.text_similarity.lsh_min_hash.shingles_generator.ShinglesGenerator = <factory>, threshold: float = 0.8, num_perm: int = 128)[source]
Bases:
object
- shingles_generator: ShinglesGenerator
- threshold: float = 0.8
- num_perm: int = 128
- lsh: MinHashLSH | None = None
- __init__(shingles_generator: ~util.text_similarity.lsh_min_hash.shingles_generator.ShinglesGenerator = <factory>, threshold: float = 0.8, num_perm: int = 128) None
util.text_similarity.lsh_min_hash.lsh_min_hash_test module
- util.text_similarity.lsh_min_hash.lsh_min_hash_test.test_get_similar_pairs(texts, expected_pairs)[source]
util.text_similarity.lsh_min_hash.shingles_generator module
- class util.text_similarity.lsh_min_hash.shingles_generator.ShinglesGenerator[source]
Bases:
ABC
- __init__() None
- class util.text_similarity.lsh_min_hash.shingles_generator.SimpleShinglesGenerator(ngram_size: int)[source]
Bases:
ShinglesGenerator
- ngram_size: int
- __init__(ngram_size: int) None
util.text_similarity.lsh_min_hash.time_analysis module
- class util.text_similarity.lsh_min_hash.time_analysis.RandomTextsGenerator(average_words: int, number_total_available_words: int = 100, available_words: list[str] = None)[source]
Bases:
object
Generates random texts from a pool of available words.
- average_words
Average number of words in a generated text.
- Type:
int
- number_total_available_words
Number of words available for generating random texts.
- Type:
int
- available_words
List of the words available for generating random texts.
- Type:
list[str]
- average_words: int
- number_total_available_words: int = 100
- available_words: list[str] = None
- __init__(average_words: int, number_total_available_words: int = 100, available_words: list[str] = None) None
- class util.text_similarity.lsh_min_hash.time_analysis.MeasurementParams(start: int, factor: float, num_points: int, iterations: int)[source]
Bases:
object
- start: int
- factor: float
- num_points: int
- iterations: int
- __init__(start: int, factor: float, num_points: int, iterations: int) None
- class util.text_similarity.lsh_min_hash.time_analysis.MeasureLSHTimeComplexity(random_text_generator: util.text_similarity.lsh_min_hash.time_analysis.RandomTextsGenerator, lsh: util.text_similarity.lsh_min_hash.lsh_min_hash.LSHMinHash)[source]
Bases:
object
- random_text_generator: RandomTextsGenerator
- lsh: LSHMinHash
- generate_measurements(m_params: MeasurementParams)[source]
- get_poly_coeffs(measurement_params: MeasurementParams, degree=2)[source]
- __init__(random_text_generator: RandomTextsGenerator, lsh: LSHMinHash) None