Source code for preprocess

# -*- coding: utf-8 -*-

# LIBTwinSVM: A Library for Twin Support Vector Machines
# Developers: Mir, A. and Mahdi Rahbar
# License: GNU General Public License v3.0

"""
In this module, functions for reading and processing datasets are defined.
"""


from os.path import splitext, split
from sklearn.datasets import load_svmlight_file
from libtsvm.model import DataInfo
import numpy as np
import pandas as pd


class DataReader():
    """
    It handles data-related tasks like reading, etc.

    Parameters
    ----------
    file_path : str
        Path to the dataset file.

    sep : str
        Separator character

    header : boolean
        whether the dataset has header names or not.

    Attributes
    ----------
    X_train : array-like, shape (n_samples, n_features)
        Training samples in NumPy array.

    y_train :  array-like, shape(n_samples,)
        Class labels of training samples.

    hdr_names : list
        Header names of datasets.

    filename : str
        dataset's filename
    """

    def __init__(self, file_path, sep, header):

        self.file_path = file_path
        self.sep = sep
        self.header = header

    @staticmethod
    def _encode_binary_labels(y):
        """
        It maps the labels of a two-class problem to +1 and -1.

        Labels are left untouched when the problem is not binary or when the
        two classes are already +1 and -1. Otherwise the label that appears
        first becomes +1 and the other one becomes -1.

        Parameters
        ----------
        y : array-like, shape(n_samples,)
            Class labels.

        Returns
        -------
        array-like, shape(n_samples,)
            Class labels, relabelled if needed.
        """

        # pd.unique keeps the order of appearance (np.unique would sort).
        uniq = pd.unique(y)

        if uniq.size != 2 or (1 in uniq and -1 in uniq):

            return y

        # Both classes are assigned from a single mask. Overwriting the labels
        # one class after the other would collapse every sample into -1
        # whenever the second label is 1 (e.g. labels {0, 1}).
        return np.where(y == uniq[0], 1, -1)

    def load_data(self, shuffle, normalize):
        """
        It reads a CSV or LIBSVM file into pandas DataFrame and stores the
        samples, the class labels (first column) and the dataset's name.

        Parameters
        ----------
        shuffle : boolean
            Whether to shuffle the dataset or not.

        normalize : boolean
            Whether to normalize the dataset or not.

        Raises
        ------
        ValueError
            If the file extension is neither ``.csv`` nor ``.libsvm``.
        """

        f_name, f_ext = splitext(self.file_path)

        if f_ext == '.csv':

            # Without header names, the first row is a sample and must not be
            # consumed as column names.
            df = pd.read_csv(self.file_path, sep=self.sep,
                             header=0 if self.header else None)
            self.hdr_names = list(df.columns.values)[1:] if self.header else []

        elif f_ext == '.libsvm':

            X, y, _ = read_libsvm(self.file_path)

            # Check that the labels of binary problems are +1 and -1.
            y = self._encode_binary_labels(y)

            df = pd.DataFrame(np.hstack((y.reshape(X.shape[0], 1), X)))
            self.hdr_names = []

        else:

            raise ValueError("Dataset format is not supported: %s" % f_ext)

        if shuffle:

            df = df.sample(frac=1).reset_index(drop=True)

        # extract class labels (first column); the rest are feature values
        self.y_train = df.iloc[:, 0].values
        df = df.iloc[:, 1:]

        if normalize:

            # Standardize each feature: zero mean and unit (sample) std.
            # NOTE: a constant feature has zero std and yields NaN values.
            df = (df - df.mean()) / df.std()

        self.X_train = df.values  # Feature values

        # f_name has already lost its extension; stripping it again would
        # cut dotted names such as "data.v1.csv" down to "data".
        self.filename = split(f_name)[-1]

    def _is_loaded(self):
        """
        It checks whether load_data() method has already been run.

        Returns
        -------
        boolean
            True if the samples, labels and filename are available.
        """

        return all(hasattr(self, attr) for attr in ('X_train', 'y_train',
                                                    'filename'))

    def get_data(self):
        """
        It returns processed dataset.

        Returns
        -------
        array-like
            Training samples in NumPy array.

        array-like
            Class labels of training samples.

        str
            The dataset's filename

        Raises
        ------
        AttributeError
            If the dataset has not been loaded yet.
        """

        if not self._is_loaded():

            raise AttributeError("The dataset has not been loaded yet!"
                                 "Run load_data() method.")

        return self.X_train, self.y_train, self.filename

    def get_data_info(self):
        """
        It returns data characteristics from dataset.

        Returns
        -------
        object
            data characteristics (number of samples, number of features,
            number of classes, class labels and header names)

        Raises
        ------
        AttributeError
            If the dataset has not been loaded yet.
        """

        if not self._is_loaded():

            raise AttributeError("The dataset has not been loaded yet!"
                                 "Run load_data() method.")

        unq_cls_lables = np.unique(self.y_train)

        return DataInfo(self.X_train.shape[0], self.X_train.shape[1],
                        unq_cls_lables.size, unq_cls_lables, self.hdr_names)


# def conv_str_fl(data):
#    """
#    It converts string data to float for computation.
#
#    Parameters
#    ----------
#    data : array-like, shape (n_samples, n_features)
#        Training samples, where n_samples is the number of samples
#        and n_features is the number of features.
#
#    Returns
#    -------
#    array-like
#        A numerical dataset which is suitable for further computation.
#    """
#
#    temp_data = np.zeros(data.shape)
#
#    # Read rows
#    for i in range(data.shape[0]):
#
#        # Read columns
#        for j in range(data.shape[1]):
#
#            temp_data[i][j] = float(data[i][j])
#
#    return temp_data

# def read_data(filename, header=True):
#
#    """
#    It converts a CSV dataset to NumPy arrays for further operations
#    like training the TwinSVM classifier.
#
#    Parameters
#    ----------
#    filename : str
#        Path to the dataset file.
#
#    header : boolean, optional (default=True)
#        Ignores first row of dataset which contains header names.
#
#    Returns
#    -------
#    data_train : array-like, shape (n_samples, n_features) 
#        Training samples in NumPy array.
#
#    data_labels : array-like, shape(n_samples,) 
#        Class labels of training samples.
#
#    file_name : str
#        Dataset's filename.
#    """
#
#    data = open(filename, 'r')
#
#    data_csv = csv.reader(data, delimiter=',')
#
#    # Ignore header names
#    if not header:
#
#        data_array = np.array(list(data_csv))
#       
#    else:
#  
#        data_array = np.array(list(data_csv)[1:]) # [1:] for removing headers
#   
#    data.close()
#   
#    # Shuffle data
#    #np.random.shuffle(data_array)                        
#   
#    # Converts string data to float
#    data_train = conv_str_fl(data_array[:, 1:])                     
#                         
#    data_labels = np.array([int(i) for i in data_array[:, 0]])
#    
#    file_name = splitext(split(filename)[-1])[0]
#    
#    return data_train, data_labels, file_name 


def read_libsvm(filename):
    """
    It reads `LIBSVM <https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/>`_
    data files for doing classification using the TwinSVM model.

    Parameters
    ----------
    filename : str
        Path to the LIBSVM data file.

    Returns
    -------
    array-like
        Training samples as a dense NumPy array.

    array-like
        Class labels of training samples, cast to integers.

    str
        Dataset's filename (without directory and extension)
    """

    libsvm_data = load_svmlight_file(filename)
    file_name = splitext(split(filename)[-1])[0]

    # Converting sparse CSR matrix to NumPy array.
    # The builtin int replaces the np.int alias, which was removed from NumPy.
    return libsvm_data[0].toarray(), libsvm_data[1].astype(int), file_name