Source code for pysad.utils.data

import os
import numpy as np
from pysad.utils.array_streamer import ArrayStreamer


class Data:
    """A helper class to load various data.

    Args:
        data_base_path (str): Base path that contains the data files.
    """

    def __init__(self, data_base_path="data"):
        self.data_base_path = data_base_path

    def _get_data_files(self):
        """Helper method to return the names of the data files.

        Returns:
            file_names (list[str]): List of data file names.
        """
        return [
            "arrhythmia.mat",
            "cardio.mat",
            "glass.mat",
            "ionosphere.mat",
            "letter.mat",
            "lympho.mat",
            "mnist.mat",
            "musk.mat",
            "optdigits.mat",
            "pendigits.mat",
            "pima.mat",
            "satellite.mat",
            "satimage-2.mat",
            "shuttle.mat",
            "vertebral.mat",
            "vowels.mat",
            "wbc.mat",
            "gisette_sampled.txt",
            "isolet_sampled.txt",
            "madelon_sampled.txt",
            "pima-indians_sampled.txt",
            "magic-telescope_sampled.txt",
        ]

    def _load_via_txt(self, path):
        """Loads a comma-separated numeric table from a .txt file.

        Args:
            path (str): The path of data.

        Returns:
            X (np.float64 array of shape (num_rows, num_columns)): The parsed
                table, always two-dimensional. ``get_data`` treats the last
                column as the labels.
        """
        # ndmin=2 keeps a single-row (or single-column) file two-dimensional,
        # so the column slicing done in get_data never hits a 1-D array.
        return np.loadtxt(path, delimiter=",", ndmin=2)

    def get_data(self, data_file):
        """Loads the data given the path.

        Files ending in ``.mat`` are read with ``scipy.io.loadmat`` and must
        contain the variables ``X`` and ``y``. Any other file is parsed as
        comma-separated text whose last column holds the labels.

        Args:
            data_file (str): Path of the data, relative to
                ``data_base_path``.

        Returns:
            X (np.array of shape (num_instances, num_features)): Feature
                vectors.
            y (np.array of shape (num_instances,)): Labels.
        """
        data_path = os.path.join(self.data_base_path, data_file)

        # Decide on the file extension rather than a substring match, so that
        # a text file such as "foo.matrix.txt" is not handed to loadmat.
        if data_file.endswith(".mat"):
            # Imported lazily so SciPy is only needed for .mat files.
            from scipy.io import loadmat

            contents = loadmat(data_path)
            X = contents["X"]
            y = contents["y"].ravel()
        else:
            table = self._load_via_txt(data_path)
            y = table[:, -1].ravel()
            X = table[:, :-1]

        return X, y

    def get_iterator(self, data_file, shuffle=True, seed=None):
        """The iterator function.

        Note:
            When ``seed`` is given, NumPy's *global* random state is reseeded
            via ``np.random.seed``; this affects every later consumer of
            ``np.random`` in the process, not only this iterator.

        Args:
            data_file (str): Path of data.
            shuffle (bool): Whether to shuffle (Default=True).
            seed (int): Random seed (Default=None).

        Returns:
            iterator (The iterator):
                pysad.utils.array_streamer.ArrayStreamer.iter method applied
                with (X, y), where X is the variable containing feature
                vectors and y is the variable containing labels.
        """
        if seed is not None:
            np.random.seed(seed)

        streamer = ArrayStreamer(shuffle=shuffle)
        X, y = self.get_data(data_file)

        return streamer.iter(X, y)