Source code for preprocess

# -*- coding: utf-8 -*-

# LIBTwinSVM: A Library for Twin Support Vector Machines
# Developers: Mir, A. and Mahdi Rahbar
# License: GNU General Public License v3.0

"""
In this module, functions for reading and processing datasets are defined.
"""


from os.path import splitext, split
from sklearn.datasets import load_svmlight_file
from libtsvm.model import DataInfo
import numpy as np
import pandas as pd


class DataReader():
    """
    It handles data-related tasks like reading, etc.

    Parameters
    ----------
    file_path : str
        Path to the dataset file.

    sep : str
        Separator character

    header : boolean
        whether the dataset has header names or not.

    Attributes
    ----------
    X_train : array-like, shape (n_samples, n_features)
        Training samples in NumPy array.

    y_train : array-like, shape(n_samples,)
        Class labels of training samples.

    hdr_names : list
        Header names of datasets.

    filename : str
        dataset's filename
    """

    def __init__(self, file_path, sep, header):

        self.file_path = file_path
        self.sep = sep
        self.header = header

    def load_data(self, shuffle, normalize):
        """
        It reads a dataset file (``.csv`` or ``.libsvm``) into a pandas
        DataFrame and stores the samples, labels, header names and filename
        on the instance. The first column is taken as the class label.

        Parameters
        ----------
        shuffle : boolean
            Whether to shuffle the dataset or not.

        normalize : boolean
            Whether to normalize the dataset or not.

        Raises
        ------
        ValueError
            If the file extension is neither ``.csv`` nor ``.libsvm``.
        """

        f_name, f_ext = splitext(self.file_path)

        if f_ext == '.csv':

            # Only treat the first row as column names when the caller said
            # the file has a header. Without an explicit ``header`` argument
            # pandas always consumes the first row as names, which silently
            # dropped the first sample of header-less files.
            df = pd.read_csv(self.file_path, sep=self.sep,
                             header=0 if self.header else None)

            self.hdr_names = list(df.columns.values)[1:] if self.header \
                else []

        elif f_ext == '.libsvm':

            X, y, _ = read_libsvm(self.file_path)

            # Labels go in the first column so that both formats share the
            # same layout from here on.
            df = pd.DataFrame(np.hstack((y.reshape(X.shape[0], 1), X)))
            self.hdr_names = []

            # Check that the labels of binary problems are +1 and -1.
            class_label = df.iloc[:, 0].unique()

            if class_label.size == 2 and \
                    not (1 in class_label and -1 in class_label):

                # Compute the new labels in one pass from the ORIGINAL
                # values. Relabelling in two sequential steps collapses both
                # classes into -1 whenever the second label equals 1
                # (e.g. labels {0, 1}), and chained assignment is not
                # guaranteed to write back to the DataFrame.
                old_labels = df.iloc[:, 0].to_numpy()
                df.iloc[:, 0] = np.where(old_labels == class_label[0],
                                         1.0, -1.0)

        else:

            raise ValueError("Dataset format is not supported: %s" % f_ext)

        if shuffle:

            df = df.sample(frac=1).reset_index(drop=True)

        # Extract class labels (first column); the rest are feature values.
        self.y_train = df.iloc[:, 0].values
        features = df.iloc[:, 1:]

        if normalize:

            # Column-wise standardization (zero mean, unit sample std).
            features = (features - features.mean()) / features.std()

        self.X_train = features.values

        # f_name already has its extension removed; the extra splitext also
        # drops any further dotted suffix left in the base name.
        self.filename = splitext(split(f_name)[-1])[0]

    def get_data(self):
        """
        It returns processed dataset.

        Returns
        -------
        array-like
            Training samples in NumPy array.

        array-like
            Class labels of training samples.

        str
            The dataset's filename

        Raises
        ------
        AttributeError
            If the dataset has not been loaded with ``load_data`` yet.
        """

        loaded_attrs = ('X_train', 'y_train', 'filename')

        if not all(hasattr(self, attr) for attr in loaded_attrs):

            raise AttributeError("The dataset has not been loaded yet! "
                                 "Run load_data() method.")

        return self.X_train, self.y_train, self.filename

    def get_data_info(self):
        """
        It returns data characteristics from dataset.

        Returns
        -------
        object
            data characteristics
        """

        unq_cls_lables = np.unique(self.y_train)

        return DataInfo(self.X_train.shape[0], self.X_train.shape[1],
                        unq_cls_lables.size, unq_cls_lables, self.hdr_names)
# def conv_str_fl(data):
#     """
#     It converts string data to float for computation.
#
#     Parameters
#     ----------
#     data : array-like, shape (n_samples, n_features)
#         Training samples, where n_samples is the number of samples
#         and n_features is the number of features.
#
#     Returns
#     -------
#     array-like
#         A numerical dataset which is suitable for further computation.
#     """
#
#     temp_data = np.zeros(data.shape)
#
#     # Read rows
#     for i in range(data.shape[0]):
#
#         # Read columns
#         for j in range(data.shape[1]):
#
#             temp_data[i][j] = float(data[i][j])
#
#     return temp_data


# def read_data(filename, header=True):
#
#     """
#     It converts a CSV dataset to NumPy arrays for further operations
#     like training the TwinSVM classifier.
#
#     Parameters
#     ----------
#     filename : str
#         Path to the dataset file.
#
#     header : boolean, optional (default=True)
#         Ignores first row of dataset which contains header names.
#
#     Returns
#     -------
#     data_train : array-like, shape (n_samples, n_features)
#         Training samples in NumPy array.
#
#     data_labels : array-like, shape(n_samples,)
#         Class labels of training samples.
#
#     file_name : str
#         Dataset's filename.
#     """
#
#     data = open(filename, 'r')
#
#     data_csv = csv.reader(data, delimiter=',')
#
#     # Ignore header names
#     if not header:
#
#         data_array = np.array(list(data_csv))
#
#     else:
#
#         data_array = np.array(list(data_csv)[1:])  # [1:] for removing headers
#
#     data.close()
#
#     # Shuffle data
#     #np.random.shuffle(data_array)
#
#     # Converts string data to float
#     data_train = conv_str_fl(data_array[:, 1:])
#
#     data_labels = np.array([int(i) for i in data_array[:, 0]])
#
#     file_name = splitext(split(filename)[-1])[0]
#
#     return data_train, data_labels, file_name
def read_libsvm(filename):
    """
    It reads `LIBSVM <https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/>`_
    data files for doing classification using the TwinSVM model.

    Parameters
    ----------
    filename : str
        Path to the LIBSVM data file.

    Returns
    -------
    array-like
        Training samples.

    array-like
        Class labels of training samples.

    str
        Dataset's filename
    """

    libsvm_data = load_svmlight_file(filename)

    # Base name of the file without its directory and extension.
    file_name = splitext(split(filename)[-1])[0]

    # Converting sparse CSR matrix to NumPy array.
    # Labels are cast with the builtin ``int``: the ``np.int`` alias it
    # replaces was removed from NumPy and raises AttributeError there.
    return libsvm_data[0].toarray(), libsvm_data[1].astype(int), file_name