Source code for nachos.similarity_functions.cosine

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from nachos.similarity_functions.abstract_similarity import AbstractSimilarity
from nachos.similarity_functions import register

@register('cosine')
class Cosine(AbstractSimilarity):
    '''
        Summary:
            Defines the (thresholded) cosine similarity between two points
            or two sets of points. Each point is expected to be a real
            valued vector (e.g., an ndarray). The cosine similarity is
            computed using the sklearn pairwise metrics package.

            An input may hold several points, i.e., be an N x d ndarray or
            a collection of N d-dimensional vectors. Using N > 1 is useful
            when defining similarities on sets: the value returned is the
            largest pairwise similarity between any elements of the two
            sets being compared, or 0 if that value does not exceed the
            threshold.
    '''
    @classmethod
    def build(cls, conf: dict):
        '''
            Summary:
                Alternate constructor that reads the threshold from a
                configuration dictionary.

            Inputs
            ---------------------------------------
            :param conf: configuration; must contain the key
                'cosine_thresh'
            :type conf: dict

            Returns
            --------------------------------------
            :return: a Cosine instance using conf['cosine_thresh'] as its
                threshold
            :rtype: Cosine
        '''
        return cls(conf['cosine_thresh'])

    def __init__(self, t: float):
        '''
            Summary:
                Stores the similarity threshold.

            Inputs
            ---------------------------------------
            :param t: similarities that are not strictly greater than this
                value are reported as 0
            :type t: float
        '''
        super().__init__()
        self.thresh = t

    @staticmethod
    def _as_matrix(points) -> np.ndarray:
        '''
            Summary:
                Casts a collection of vectors to a 2-D ndarray with one
                vector per row. A single 1-D vector becomes a 1 x d matrix
                and an empty collection becomes an array of size 0, so the
                result can always be inspected before being handed to
                sklearn (which rejects 1-D and empty inputs).
        '''
        return np.atleast_2d(np.array(list(points)))

    def __call__(self, f: set, g: set) -> float:
        '''
            Summary:
                Computes the thresholded cosine similarity between inputs
                f, g. f, g are assumed to be real valued vectors (or sets
                of such vectors), generally representing embeddings which
                have been whitened.

            Inputs
            ---------------------------------------
            :param f: a set of vectors (or a single vector) to compare
            :type f: set
            :param g: a set of vectors (or a single vector) to compare
            :type g: set

            Returns
            --------------------------------------
            :return: the largest pairwise cosine similarity between the
                elements of f and g if it is strictly greater than the
                threshold, otherwise 0. If either input is empty there is
                no pair to compare and 0 is returned.
            :rtype: float
        '''
        # Cast sets of arrays to 2-D ndarrays (one vector per row).
        f, g = self._as_matrix(f), self._as_matrix(g)

        # With no vectors on one side there are no pairs to score;
        # cosine_similarity would raise on such input.
        if f.size == 0 or g.size == 0:
            return 0.0

        # In the case of a set of vectors being compared to a single
        # vector, or another set, we use \sup_{x, y} d(x, y)
        sim = cosine_similarity(f, g).max()
        return float(sim if sim > self.thresh else 0)