util.text_similarity package

Subpackages

Submodules

util.text_similarity.max_independent_set_calc module

class util.text_similarity.max_independent_set_calc.MaxIndependentSetCalc[source]

Bases: ABC

abstractmethod find_max_set(num_texts: int, similar_pairs: list[tuple[int, int]]) set[int][source]
class util.text_similarity.max_independent_set_calc.OptimalIndependentSetCalc[source]

Bases: MaxIndependentSetCalc

find_max_set(num_texts: int, similar_pairs: list[tuple[int, int]]) set[int][source]

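Gets the maximum independent set of a graph. :param num_texts: Number of vertices (texts) :param similar_pairs: List of edges representing pairs of similar texts :return: Set of vertices in the maximum independent set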

class util.text_similarity.max_independent_set_calc.ApproximateIndependentSetCalc[source]

Bases: MaxIndependentSetCalc

find_max_set(num_texts: int, similar_pairs: list[tuple[int, int]]) set[int][source]

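Gets the approximate maximum independent set of a graph. :param num_texts: Number of vertices (texts) :param similar_pairs: List of edges representing pairs of similar texts :return: Set of vertices in the approximate maximum independent set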

class util.text_similarity.max_independent_set_calc.GreedyIndependentSetCalc[source]

Bases: MaxIndependentSetCalc

find_max_set(num_texts: int, similar_pairs: list[tuple[int, int]]) set[int][source]

Greedy algorithm to find an approximate maximum independent set by iteratively removing the node with the highest degree and its edges. :param num_texts: Number of vertices (texts) :param similar_pairs: List of edges representing pairs of similar texts :return: Set of vertices in the approximate maximum independent set

util.text_similarity.max_independent_set_calc_test module

util.text_similarity.max_independent_set_calc_test.PAIRS_LIN_LOG_FUNC(x)
class util.text_similarity.max_independent_set_calc_test.CalcNumPairs[source]

Bases: object

get_num_pairs(num_texts) int[source]
class util.text_similarity.max_independent_set_calc_test.StaticCalcNumPairs(num_pairs)[source]

Bases: CalcNumPairs

__init__(num_pairs)[source]
get_num_pairs(num_texts)[source]
class util.text_similarity.max_independent_set_calc_test.LogCalcNumPairs[source]

Bases: CalcNumPairs

get_num_pairs(num_texts)[source]
class util.text_similarity.max_independent_set_calc_test.LinSquareRootCalcNumPairs[source]

Bases: CalcNumPairs

get_num_pairs(num_texts)[source]
class util.text_similarity.max_independent_set_calc_test.RandomGraphGenerator(_calc_num_pairs: util.text_similarity.max_independent_set_calc_test.CalcNumPairs = <util.text_similarity.max_independent_set_calc_test.LogCalcNumPairs object>)[source]

Bases: object

num_pairs(num_texts) int[source]
create_random_pairs(num_texts: int)[source]
__init__(_calc_num_pairs: ~util.text_similarity.max_independent_set_calc_test.CalcNumPairs = <util.text_similarity.max_independent_set_calc_test.LogCalcNumPairs object>) None
class util.text_similarity.max_independent_set_calc_test.MeasurementResult(num_texts: int, time: float, result_size: int)[source]

Bases: object

num_texts: int
time: float
result_size: int
static zero()[source]
__init__(num_texts: int, time: float, result_size: int) None
class util.text_similarity.max_independent_set_calc_test.MeasureIndependentSetCalc(calc: util.text_similarity.max_independent_set_calc.MaxIndependentSetCalc, graph_generator: util.text_similarity.max_independent_set_calc_test.RandomGraphGenerator)[source]

Bases: object

calc: MaxIndependentSetCalc
graph_generator: RandomGraphGenerator
analyze_calc_run(num_texts) MeasurementResult[source]
analyze_calc_runs(num_texts, iterations) MeasurementResult[source]
generate_measurements(list_num_texts, iterations)[source]
__init__(calc: MaxIndependentSetCalc, graph_generator: RandomGraphGenerator) None
util.text_similarity.max_independent_set_calc_test.greedy_calc()[source]
util.text_similarity.max_independent_set_calc_test.approx_calc()[source]
util.text_similarity.max_independent_set_calc_test.optimal_calc()[source]
util.text_similarity.max_independent_set_calc_test.random_graph_gen()[source]
util.text_similarity.max_independent_set_calc_test.dense_graph_gen()[source]
util.text_similarity.max_independent_set_calc_test.test_find_max_set(num_texts, similar_pairs, acceptable_solutions)[source]
util.text_similarity.max_independent_set_calc_test.test_time_complexity(greedy_calc, dense_graph_gen)[source]
class util.text_similarity.max_independent_set_calc_test.ScatterData(xs: array, ys: array, label: str)[source]

Bases: object

xs: array
ys: array
label: str
__init__(xs: array, ys: array, label: str) None
util.text_similarity.max_independent_set_calc_test.plot_algos(tile, x_label, y_label, scatter_data_list)[source]
util.text_similarity.max_independent_set_calc_test.test_greedy_accuracy(greedy_calc, optimal_calc, approx_calc, dense_graph_gen)[source]

util.text_similarity.texts_similarity_filter module

class util.text_similarity.texts_similarity_filter.TextsSimilarityFilter(lsh_min_hash: util.text_similarity.lsh_min_hash.lsh_min_hash.LSHMinHash, max_diff_set_calc: util.text_similarity.max_independent_set_calc.MaxIndependentSetCalc)[source]

Bases: object

lsh_min_hash: LSHMinHash
max_diff_set_calc: MaxIndependentSetCalc
filter_similar_texts(texts: list[str]) set[int][source]
__init__(lsh_min_hash: LSHMinHash, max_diff_set_calc: MaxIndependentSetCalc) None

util.text_similarity.texts_similarity_filter_test module

util.text_similarity.texts_similarity_filter_test.mock_lsh_min_hash()[source]
util.text_similarity.texts_similarity_filter_test.mock_max_set_calc()[source]
util.text_similarity.texts_similarity_filter_test.similarity_filter(mock_lsh_min_hash, mock_max_set_calc)[source]
util.text_similarity.texts_similarity_filter_test.test_filter_similar_texts(similarity_filter, mock_lsh_min_hash, mock_max_set_calc, texts, similar_pairs, expected_result)[source]

Module contents