similarity.py
1.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
"""Cosine 相似度计算与去重判定。"""
from enum import Enum
import numpy as np
# Similarity scores at or above this value are classified as
# SimilarityDecision.DUPLICATE by CompositionSimilarity.classify_similarity.
DUPLICATE_THRESHOLD = 0.95
# Scores at or above this value that did not reach DUPLICATE_THRESHOLD are
# classified as SimilarityDecision.SUSPECTED; anything lower is NEW.
SUSPECTED_THRESHOLD = 0.85
class SimilarityDecision(Enum):
    """Outcome of comparing a similarity score against the module thresholds.

    The members are produced by ``CompositionSimilarity.classify_similarity``;
    each member's value is its lowercase string label.
    """

    # Score reached DUPLICATE_THRESHOLD.
    DUPLICATE = "duplicate"
    # Score reached SUSPECTED_THRESHOLD but not DUPLICATE_THRESHOLD.
    SUSPECTED = "suspected"
    # Score is below both thresholds.
    NEW = "new"
class CompositionSimilarity:
    """Stateless helpers for scoring two vectors and classifying the score."""

    @staticmethod
    def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
        """Return ``dot(a, b) / (norm(a) * norm(b))`` as a Python float.

        Args:
            a: First vector.
            b: Second vector.

        Returns:
            The normalised dot product, or ``0.0`` when either input has a
            zero norm (which avoids dividing by zero).

        NOTE(review): assumes ``a`` and ``b`` are 1-D vectors of equal
        length; other shapes are passed to ``np.dot`` unchanged. Confirm
        against callers.
        """
        length_a, length_b = np.linalg.norm(a), np.linalg.norm(b)
        if length_a != 0.0 and length_b != 0.0:
            inner = np.dot(a, b)
            return float(inner / (length_a * length_b))
        # At least one vector has zero norm: the ratio is undefined, so
        # report "no similarity" instead.
        return 0.0

    @staticmethod
    def classify_similarity(similarity: float) -> SimilarityDecision:
        """Map a similarity score to a ``SimilarityDecision``.

        Args:
            similarity: Score to classify.

        Returns:
            ``DUPLICATE`` when the score is at least ``DUPLICATE_THRESHOLD``,
            ``SUSPECTED`` when it is at least ``SUSPECTED_THRESHOLD``,
            otherwise ``NEW``.
        """
        # Checked in order, so the first floor that is met wins.
        ladder = (
            (DUPLICATE_THRESHOLD, SimilarityDecision.DUPLICATE),
            (SUSPECTED_THRESHOLD, SimilarityDecision.SUSPECTED),
        )
        for floor, verdict in ladder:
            if similarity >= floor:
                return verdict
        return SimilarityDecision.NEW

    @staticmethod
    def compare(a: np.ndarray, b: np.ndarray) -> tuple[float, SimilarityDecision]:
        """Score two vectors and classify the result in one call.

        Args:
            a: First vector.
            b: Second vector.

        Returns:
            A ``(similarity, decision)`` pair, where ``similarity`` comes
            from ``cosine_similarity`` and ``decision`` from
            ``classify_similarity``.
        """
        score = CompositionSimilarity.cosine_similarity(a, b)
        verdict = CompositionSimilarity.classify_similarity(score)
        return score, verdict