"""Tests for util.text_similarity.lsh_min_hash.lsh_min_hash."""

from datetime import timedelta

import nltk
import pytest

from util.lin_reg_plot_helper import LinRegPredictor, plot_regression
from util.text_similarity.lsh_min_hash.lsh_min_hash import LSHMinHash
from util.text_similarity.lsh_min_hash.shingles_generator import (
    MultipleShinglesGenerator,
)
from util.text_similarity.lsh_min_hash.time_analysis import (
    MeasureLSHTimeComplexity,
    MeasurementParams,
    RandomTextsGenerator,
)

# Module-level side effect: fetch the NLTK "words" corpus before any test runs
# (no-op if already present). Presumably consumed by RandomTextsGenerator when
# it builds random texts — verify against its implementation.
nltk.download("words")


@pytest.mark.parametrize(
    "texts,expected_pairs",
    [
        # Completely identical texts
        (["abc abc abc", "abc abc abc"], [(0, 1)]),
        # Very similar texts with minor differences
        (["The quick brown fox jumps", "The quick brown fox leaps"], [(0, 1)]),
        # Very similar texts with minor differences
        (
            [
                "I like to learn for my university courses",
                "My university is big. I like to learn for my university courses",
            ],
            [(0, 1)],
        ),
        # Shared words diluted by a much longer text — below threshold
        (
            [
                "I like to learn for my university courses",
                "This text is much longer, but has many of the same words! My university is big. I like to learn for my university courses",
            ],
            [],
        ),
        # Completely different texts
        (["apple pie", "orange juice", "banana split"], []),
        # Multiple near duplicates
        (
            [
                "hello world",
                "hello world!",
                "hello world.",
                "this is different!",
            ],
            [(0, 1), (0, 2), (1, 2)],
        ),
        # Edge case - single text
        (["lorem ipsum dolor sit amet"], []),
        # Empty list
        ([], []),
    ],
)
def test_get_similar_pairs(texts, expected_pairs):
    """get_similar_pairs returns exactly the expected near-duplicate index pairs.

    Uses a fixed 0.6 threshold and 3/5-gram shingles; result order is not
    guaranteed, so both sides are sorted before comparison.
    """
    lsh = LSHMinHash(
        threshold=0.6,
        shingles_generator=MultipleShinglesGenerator(ngram_sizes=[3, 5]),
    )
    result = lsh.get_similar_pairs(texts)
    assert sorted(result) == sorted(expected_pairs)
@pytest.mark.parametrize(
    "threshold,texts,expected_pairs",
    [
        # Test with different thresholds: the same text pair is rejected at
        # 0.9 but accepted at 0.5.
        (0.9, ["The cat in the hat", "The cat in the bag"], []),
        (0.5, ["The cat in the hat", "The cat in the bag"], [(0, 1)]),
    ],
)
def test_threshold_variation(threshold, texts, expected_pairs):
    """The similarity threshold controls which pairs are reported."""
    lsh = LSHMinHash(threshold=threshold)
    result = lsh.get_similar_pairs(texts)
    assert sorted(result) == sorted(expected_pairs)
def test_large_text_set():
    """A deliberately duplicated text among random texts is found as a pair.

    Generates 30 random texts, copies index 15 onto index 10, and checks
    that (10, 15) appears in the reported similar pairs at threshold 0.9.
    """
    num_texts = 30
    same_index_1 = 10
    same_index_2 = 15
    random_texts = RandomTextsGenerator(average_words=10)
    texts = random_texts.generate_random_texts(num_texts)
    # Plant an exact duplicate so at least one similar pair must exist.
    texts[same_index_1] = texts[same_index_2]
    lsh = LSHMinHash(threshold=0.9)
    result = lsh.get_similar_pairs(texts)
    assert (same_index_1, same_index_2) in result
@pytest.mark.expensive
def test_absolute_time_should_be_small():
    """Extrapolated LSH runtime for 1e5 texts stays under six hours.

    Measures wall-clock time at two input sizes, fits a polynomial via
    get_poly_coeffs, then uses the linear-regression predictor to
    extrapolate to 100k texts. Marked expensive because it runs real
    timing measurements.
    """
    time_calc = MeasureLSHTimeComplexity(
        lsh=LSHMinHash(
            threshold=0.6,
            shingles_generator=MultipleShinglesGenerator(ngram_sizes=[3, 5]),
            num_perm=128,
        ),
        random_text_generator=RandomTextsGenerator(
            average_words=25, number_total_available_words=4
        ),
    )
    # noinspection PyArgumentEqualDefault
    coeffs, x, y = time_calc.get_poly_coeffs(
        MeasurementParams(start=10, factor=5, num_points=2, iterations=1)
    )
    print(f"coeffs: {[f'{coeff:.4f}' for coeff in coeffs]}")
    plot_regression(x, y, coeffs)
    prediction_x = 1e5
    prediction_y = LinRegPredictor(coeffs).predict(prediction_x)
    amount_time = timedelta(seconds=prediction_y)
    print(f"Prediction for {prediction_x} texts: {amount_time}")
    assert amount_time < timedelta(hours=6)