refactoring distance functions

Sefik Ilkin Serengil 2024-10-06 21:00:07 +01:00
parent a93fb63c97
commit 53a96f635a


@@ -265,40 +265,34 @@ def find_cosine_distance(
     source_representation: Union[np.ndarray, list], test_representation: Union[np.ndarray, list]
 ) -> Union[np.float64, np.ndarray]:
     """
-    Find cosine distance between two given vectors
+    Find cosine distance between two given vectors or batches of vectors.
     Args:
-        source_representation (np.ndarray or list): 1st vector
-        test_representation (np.ndarray or list): 2nd vector
+        source_representation (np.ndarray or list): 1st vector or batch of vectors.
+        test_representation (np.ndarray or list): 2nd vector or batch of vectors.
     Returns
-        distance (np.float64 or np.ndarray): calculated cosine distance(s).
-            it is type of np.float64 for given single embeddings
-            or type of np.ndarray for given batch embeddings
+        np.float64 or np.ndarray: Calculated cosine distance(s).
+            It returns a np.float64 for single embeddings and np.ndarray for batch embeddings.
     """
-    if isinstance(source_representation, list):
-        source_representation = np.array(source_representation)
-
-    if isinstance(test_representation, list):
-        test_representation = np.array(test_representation)
-
-    if len(source_representation.shape) == 1 and len(test_representation.shape) == 1:
+    # Convert inputs to numpy arrays if necessary
+    source_representation = np.asarray(source_representation)
+    test_representation = np.asarray(test_representation)
+
+    if source_representation.ndim == 1 and test_representation.ndim == 1:
         # single embedding
-        a = np.dot(source_representation, test_representation)
-        b = np.linalg.norm(source_representation)
-        c = np.linalg.norm(test_representation)
-        distances = 1 - a / (b * c)
-    elif len(source_representation.shape) == 2 and len(test_representation.shape) == 2:
+        dot_product = np.dot(source_representation, test_representation)
+        source_norm = np.linalg.norm(source_representation)
+        test_norm = np.linalg.norm(test_representation)
+        distances = 1 - dot_product / (source_norm * test_norm)
+    elif source_representation.ndim == 2 and test_representation.ndim == 2:
         # list of embeddings (batch)
-        # source_representation's shape is (N, D)
-        # test_representation's shape is (M, D)
-        # distances' shape is (M, N)
-        source_embeddings_norm = l2_normalize(source_representation, axis=1)
-        test_embeddings_norm = l2_normalize(test_representation, axis=1)
-        cosine_similarities = np.dot(test_embeddings_norm, source_embeddings_norm.T)
+        source_normed = l2_normalize(source_representation, axis=1)  # (N, D)
+        test_normed = l2_normalize(test_representation, axis=1)  # (M, D)
+        cosine_similarities = np.dot(test_normed, source_normed.T)  # (M, N)
         distances = 1 - cosine_similarities
     else:
         raise ValueError(
-            "embeddings can either be 1 or 2 dimensional "
-            f"but it is {len(source_representation.shape)} & {len(test_representation.shape)}"
+            f"Embeddings must be 1D or 2D, but received "
+            f"source shape: {source_representation.shape}, test shape: {test_representation.shape}"
         )
     return distances
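
A quick usage sketch of the refactored function (not part of this commit), assuming find_cosine_distance is in scope and numpy is imported as np; the example values are made up:

    import numpy as np

    # single embeddings: returns a scalar (np.float64)
    a = np.array([0.1, 0.2, 0.3])
    b = np.array([0.3, 0.2, 0.1])
    print(find_cosine_distance(a, b))  # ~0.285714

    # batch embeddings: source is (N, D), test is (M, D), result is (M, N)
    source = np.random.rand(4, 128)  # N = 4
    test = np.random.rand(2, 128)    # M = 2
    print(find_cosine_distance(source, test).shape)  # (2, 4)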
@@ -307,36 +301,33 @@ def find_euclidean_distance(
     source_representation: Union[np.ndarray, list], test_representation: Union[np.ndarray, list]
 ) -> Union[np.float64, np.ndarray]:
     """
-    Find euclidean distance between two given vectors
+    Find Euclidean distance between two vectors or batches of vectors.
     Args:
-        source_representation (np.ndarray or list): 1st vector
-        test_representation (np.ndarray or list): 2nd vector
-    Returns
-        distance (np.float64 or np.ndarray): calculated euclidean distance(s).
-            it is type of np.float64 for given single embeddings
-            or type of np.ndarray for given batch embeddings
+        source_representation (np.ndarray or list): 1st vector or batch of vectors.
+        test_representation (np.ndarray or list): 2nd vector or batch of vectors.
+
+    Returns:
+        np.float64 or np.ndarray: Euclidean distance(s).
+            Returns a np.float64 for single embeddings and np.ndarray for batch embeddings.
     """
-    if isinstance(source_representation, list):
-        source_representation = np.array(source_representation)
-
-    if isinstance(test_representation, list):
-        test_representation = np.array(test_representation)
-
-    if len(source_representation.shape) == 1 and len(test_representation.shape) == 1:
-        # single embedding
-        diff = source_representation - test_representation
-        distances = np.linalg.norm(diff)
-    elif len(source_representation.shape) == 2 and len(test_representation.shape) == 2:
-        # list of embeddings (batch)
-        # source_representation's shape is (N, D)
-        # test_representation's shape is (M, D)
-        # distances' shape is (M, N)
-        diff = source_representation[None, :, :] - test_representation[:, None, :]
+    # Convert inputs to numpy arrays if necessary
+    source_representation = np.asarray(source_representation)
+    test_representation = np.asarray(test_representation)
+
+    # Single embedding case (1D arrays)
+    if source_representation.ndim == 1 and test_representation.ndim == 1:
+        distances = np.linalg.norm(source_representation - test_representation)
+    # Batch embeddings case (2D arrays)
+    elif source_representation.ndim == 2 and test_representation.ndim == 2:
+        diff = (
+            source_representation[None, :, :] - test_representation[:, None, :]
+        )  # (N, D) - (M, D) = (M, N, D)
         distances = np.linalg.norm(diff, axis=2)  # (M, N)
     else:
         raise ValueError(
-            "embeddings can either be 1 or 2 dimensional "
-            f"but it is {len(source_representation.shape)} & {len(test_representation.shape)}"
+            f"Embeddings must be 1D or 2D, but received "
+            f"source shape: {source_representation.shape}, test shape: {test_representation.shape}"
         )
     return distances
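
A similar sketch for the Euclidean variant (not part of this commit), again assuming the function above is in scope; it shows the broadcasting behaviour of the batch case:

    import numpy as np

    # single embeddings
    a = np.array([1.0, 2.0, 2.0])
    b = np.zeros(3)
    print(find_euclidean_distance(a, b))  # 3.0

    # batch embeddings: (N, D) against (M, D) gives pairwise distances of shape (M, N)
    source = np.zeros((3, 5))  # N = 3
    test = np.ones((2, 5))     # M = 2
    print(find_euclidean_distance(source, test).shape)  # (2, 3), each entry sqrt(5)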
@@ -352,8 +343,8 @@ def l2_normalize(
     Returns:
         np.ndarray: l2 normalized vector
     """
-    if isinstance(x, list):
-        x = np.array(x)
+    # Convert inputs to numpy arrays if necessary
+    x = np.asarray(x)
     norm = np.linalg.norm(x, axis=axis, keepdims=True)
     return x / (norm + epsilon)
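
A small sketch of how the axis argument behaves (not part of this commit); the default value of axis is not visible in this hunk, so it is passed explicitly here:

    import numpy as np

    v = np.array([3.0, 4.0])
    print(l2_normalize(v, axis=None))  # ~[0.6 0.8], up to the epsilon term

    batch = np.array([[3.0, 4.0], [0.0, 5.0]])
    print(l2_normalize(batch, axis=1))  # each row normalized independently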
@@ -364,40 +355,37 @@ def find_distance(
     distance_metric: str,
 ) -> Union[np.float64, np.ndarray]:
     """
-    Wrapper to find distance between vectors according to the given distance metric
-    Args:
-        source_representation (np.ndarray or list): 1st vector
-        test_representation (np.ndarray or list): 2nd vector
-    Returns
-        distance (np.float64 or np.ndarray): calculated cosine distance(s).
-            it is type of np.float64 for given single embeddings
-            or type of np.ndarray for given batch embeddings
-    """
-    if isinstance(alpha_embedding, list):
-        alpha_embedding = np.array(alpha_embedding)
-
-    if isinstance(beta_embedding, list):
-        beta_embedding = np.array(beta_embedding)
+    Wrapper to find the distance between vectors based on the specified distance metric.
+
+    Args:
+        alpha_embedding (np.ndarray or list): 1st vector or batch of vectors.
+        beta_embedding (np.ndarray or list): 2nd vector or batch of vectors.
+        distance_metric (str): The type of distance to compute
+            ('cosine', 'euclidean', or 'euclidean_l2').
+
+    Returns:
+        np.float64 or np.ndarray: The calculated distance(s).
+    """
+    # Convert inputs to numpy arrays if necessary
+    alpha_embedding = np.asarray(alpha_embedding)
+    beta_embedding = np.asarray(beta_embedding)
+
+    # Ensure that both embeddings are either 1D or 2D
+    if alpha_embedding.ndim != beta_embedding.ndim or alpha_embedding.ndim not in (1, 2):
+        raise ValueError(
+            f"Both embeddings must be either 1D or 2D, but received "
+            f"alpha shape: {alpha_embedding.shape}, beta shape: {beta_embedding.shape}"
+        )
 
     if distance_metric == "cosine":
         distance = find_cosine_distance(alpha_embedding, beta_embedding)
     elif distance_metric == "euclidean":
         distance = find_euclidean_distance(alpha_embedding, beta_embedding)
     elif distance_metric == "euclidean_l2":
-        if len(alpha_embedding.shape) == 1 and len(beta_embedding.shape) == 1:
-            # single embedding
-            axis = None
-        elif len(alpha_embedding.shape) == 2 and len(beta_embedding.shape) == 2:
-            # list of embeddings (batch)
-            axis = 1
-        else:
-            raise ValueError(
-                "embeddings can either be 1 or 2 dimensional "
-                f"but it is {len(alpha_embedding.shape)} & {len(beta_embedding.shape)}"
-            )
-        distance = find_euclidean_distance(
-            l2_normalize(alpha_embedding, axis=axis), l2_normalize(beta_embedding, axis=axis)
-        )
+        axis = None if alpha_embedding.ndim == 1 else 1
+        normalized_alpha = l2_normalize(alpha_embedding, axis=axis)
+        normalized_beta = l2_normalize(beta_embedding, axis=axis)
+        distance = find_euclidean_distance(normalized_alpha, normalized_beta)
     else:
         raise ValueError("Invalid distance_metric passed - ", distance_metric)
     return np.round(distance, 6)
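
Finally, a usage sketch of the wrapper (not part of this commit), assuming find_distance is in scope; the embedding values are random placeholders:

    import numpy as np

    emb1 = np.random.rand(128)
    emb2 = np.random.rand(128)
    for metric in ("cosine", "euclidean", "euclidean_l2"):
        print(metric, find_distance(emb1, emb2, metric))  # scalar, rounded to 6 decimals

    # batch against batch: each metric returns an (M, N) matrix
    gallery = np.random.rand(10, 128)  # N = 10 stored embeddings
    probes = np.random.rand(3, 128)    # M = 3 query embeddings
    print(find_distance(gallery, probes, "cosine").shape)  # (3, 10)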