Source code for pysad.models.rs_hash

from pysad.core.base_model import BaseModel
import numpy as np


class RSHash(BaseModel):
    """Subspace outlier detection in linear time with randomized hashing :cite:`sathe2016subspace`. This implementation is adapted from `cmuxstream-baselines <https://github.com/cmuxstream/cmuxstream-baselines/blob/master/Dynamic/RS_Hash/sparse_stream_RSHash.py>`_.

    Each of the ``num_components`` ensemble members picks a random grid width,
    a random subset of the non-constant features and a random shift per
    selected feature. An instance is mapped to a grid cell in every member and
    the (time-decayed) occupancy count of that cell drives the score.

    Args:
        feature_mins (np.float64 array of shape (num_features,)): Minimum boundary of the features.
        feature_maxes (np.float64 array of shape (num_features,)): Maximum boundary of the features.
        sampling_points (int): The number of sampling points (Default=1000).
        decay (float): The decay hyperparameter (Default=0.015).
        num_components (int): The number of ensemble components (Default=100).
        num_hash_fns (int): The number of hashing functions (Default=1).

    Raises:
        ValueError: If every feature has identical minimum and maximum
            boundaries, so that no feature can be sampled.
    """

    def __init__(
            self,
            feature_mins,
            feature_maxes,
            sampling_points=1000,
            decay=0.015,
            num_components=100,
            num_hash_fns=1):
        # Coerce the bounds to arrays: comparing two plain lists with ``!=``
        # yields a single bool rather than an element-wise mask, which would
        # silently restrict the sampled subspaces to feature 0.
        self.minimum = np.asarray(feature_mins)
        self.maximum = np.asarray(feature_maxes)
        self.m = num_components
        self.w = num_hash_fns
        self.s = sampling_points
        self.dim = len(self.minimum)
        self.decay = decay
        self.scores = []
        self.num_hash = num_hash_fns

        # Effective sample size; never below 1000.
        self.effS = max(1000, 1.0 / (1 - np.power(2, -self.decay)))

        # One grid width per ensemble component.
        inv_sqrt_eff = 1.0 / np.sqrt(self.effS)
        self.f = np.random.uniform(
            low=inv_sqrt_eff, high=1 - inv_sqrt_eff, size=self.m)

        # One dictionary-backed sketch per hash function.
        self.cmsketches = [{} for _ in range(self.num_hash)]

        # The shifts depend on the sampled dimensions, so the order matters.
        self._sample_dims()
        self.alpha = self._sample_shifts()

        # Running time stamp; starts at 1 - sampling_points.
        self.index = 1 - self.s
        self.last_score = None

    def fit_partial(self, X, y=None):
        """Fits the model to next instance.

        The score of the instance is computed from the sketch state *before*
        the instance is inserted, stored in ``self.last_score``, and then the
        sketches are updated.

        Args:
            X (np.float64 array of shape (num_features,)): The instance to fit.
            y (int): Ignored since the model is unsupervised (Default=None).

        Returns:
            object: Returns the self.
        """
        # NOTE(review): X is hashed as given; the feature boundaries are not
        # used to rescale it here. Confirm callers pass data on the scale the
        # grid widths in ``self.f`` (drawn from within (0, 1)) assume.
        X = np.asarray(X)

        score_total = 0
        for r in range(self.m):
            dims = np.asarray(self.V[r])
            shifts = np.asarray(self.alpha[r])

            # Grid-cell coordinates of X in the r-th random subspace.
            cell = np.floor((X[dims] + shifts) / float(self.f[r]))

            # Prefix the component id so cells of different components never
            # collide inside the shared sketches.
            key = tuple(np.insert(cell, 0, r).astype(np.int32))

            decayed_counts = []
            for sketch in self.cmsketches:
                # Unseen cells behave as weight 0 stamped at the current time.
                tstamp, weight = sketch.get(key, (self.index, 0))

                # Decay the stored weight by the time elapsed since its update.
                decayed = weight * np.power(
                    2, -self.decay * (self.index - tstamp))
                decayed_counts.append(decayed)

                # Insert the current instance into the cell.
                sketch[key] = (self.index, decayed + 1)

            score_total = score_total + np.log(1 + min(decayed_counts))

        self.last_score = score_total / self.m
        self.index += 1

        return self

    def score_partial(self, X):
        """Scores the anomalousness of the next instance. Outputs the last score. Note that this method must be called after the fit_partial

        Args:
            X (any): Ignored.

        Returns:
            float: The anomalousness score of the last fitted instance.
        """
        return self.last_score

    def _sample_shifts(self):
        """Draws one random shift per selected feature for every component.

        Returns:
            list: For each component ``r``, an array of ``len(self.V[r])``
                values drawn uniformly from ``[0, self.f[r])``.
        """
        return [
            np.random.uniform(low=0, high=self.f[r], size=len(self.V[r]))
            for r in range(self.m)
        ]

    def _sample_dims(self):
        """Samples the feature subset used by every ensemble component.

        Populates ``self.r`` (subspace size per component) and ``self.V``
        (selected feature indices per component). Only features whose minimum
        and maximum boundaries differ are eligible.

        Raises:
            ValueError: If components are requested but no feature has
                differing minimum and maximum boundaries.
        """
        max_term = np.maximum(2.0, 1.0 / self.f)
        common_term = np.log(self.effS) / np.log(max_term)
        low_value = 1 + 0.5 * common_term
        high_value = common_term

        # The eligible features do not depend on the component, so compute
        # them once instead of on every loop iteration.
        minimum = np.asarray(self.minimum)
        maximum = np.asarray(self.maximum)
        all_feats = np.arange(self.dim, dtype=np.int32)
        choice_feats = all_feats[np.where(minimum != maximum)]
        num_choices = len(choice_feats)

        if self.m > 0 and num_choices == 0:
            raise ValueError(
                "RSHash needs at least one feature whose minimum and maximum "
                "boundaries differ.")

        self.r = np.empty([self.m, ], dtype=int)
        self.V = []
        for i in range(self.m):
            if np.floor(low_value[i]) == np.floor(high_value[i]):
                num_dims = 1
            else:
                # Explicit truncation to int; the bounds are positive floats.
                num_dims = np.random.randint(
                    low=int(low_value[i]), high=int(high_value[i]))

            # Sampling is without replacement, so the subspace cannot be
            # larger than the number of eligible (non-constant) features.
            self.r[i] = min(num_dims, self.dim, num_choices)

            self.V.append(
                np.random.choice(choice_feats, size=self.r[i], replace=False))