similarity.py 1.08 KB
"""Cosine 相似度计算与去重判定。"""

from enum import Enum

import numpy as np

# A similarity score at or above this value is classified as DUPLICATE.
DUPLICATE_THRESHOLD = 0.95
# A similarity score at or above this value (but below DUPLICATE_THRESHOLD)
# is classified as SUSPECTED; anything lower is classified as NEW.
SUSPECTED_THRESHOLD = 0.85


class SimilarityDecision(Enum):
    """Possible outcomes of classifying a similarity score for deduplication."""

    # Score reached the duplicate threshold.
    DUPLICATE = "duplicate"
    # Score reached the suspected threshold but not the duplicate threshold.
    SUSPECTED = "suspected"
    # Score is below both thresholds.
    NEW = "new"


class CompositionSimilarity:
    """Stateless helpers for cosine similarity and duplicate classification."""

    @staticmethod
    def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
        """Return the cosine similarity of *a* and *b*.

        Both inputs are converted to float64 before any arithmetic. Without
        this, ``np.dot`` runs in the inputs' own dtype and silently wraps for
        narrow integer types (e.g. int8), while ``np.linalg.norm`` promotes
        integers to float, so numerator and denominator would disagree.

        Args:
            a: First vector. Presumably 1-D; its shape must be compatible
                with ``np.dot(a, b)``.
            b: Second vector.

        Returns:
            The cosine of the angle between the vectors, clamped to
            [-1.0, 1.0], or 0.0 if either vector has zero norm. NaN inputs
            propagate as a NaN result.

        Raises:
            ValueError: Propagated from ``np.dot`` when the shapes are not
                aligned.
        """
        vec_a = np.asarray(a, dtype=np.float64)
        vec_b = np.asarray(b, dtype=np.float64)
        norm_a = np.linalg.norm(vec_a)
        norm_b = np.linalg.norm(vec_b)
        # Cosine similarity is undefined for a zero vector; report 0.0.
        if norm_a == 0.0 or norm_b == 0.0:
            return 0.0
        cosine = np.dot(vec_a, vec_b) / (norm_a * norm_b)
        # Rounding can push the quotient marginally outside [-1, 1].
        # np.clip (unlike builtin min/max) leaves NaN as NaN.
        return float(np.clip(cosine, -1.0, 1.0))

    @staticmethod
    def classify_similarity(similarity: float) -> SimilarityDecision:
        """Map a similarity score to a deduplication decision.

        Args:
            similarity: Score to classify, typically the value returned by
                ``cosine_similarity``.

        Returns:
            ``DUPLICATE`` if the score is at least ``DUPLICATE_THRESHOLD``,
            ``SUSPECTED`` if it is at least ``SUSPECTED_THRESHOLD``,
            otherwise ``NEW``. A NaN score satisfies neither comparison and
            therefore yields ``NEW``.
        """
        if similarity >= DUPLICATE_THRESHOLD:
            return SimilarityDecision.DUPLICATE
        if similarity >= SUSPECTED_THRESHOLD:
            return SimilarityDecision.SUSPECTED
        return SimilarityDecision.NEW

    @staticmethod
    def compare(a: np.ndarray, b: np.ndarray) -> tuple[float, SimilarityDecision]:
        """Compute the cosine similarity of *a* and *b* and classify it.

        Args:
            a: First vector.
            b: Second vector.

        Returns:
            A ``(similarity, decision)`` tuple, where ``decision`` is the
            classification of ``similarity``.
        """
        sim = CompositionSimilarity.cosine_similarity(a, b)
        return sim, CompositionSimilarity.classify_similarity(sim)