Source code for skmultilearn.dataset

import arff
import bz2
import pickle
from scipy import sparse
import hashlib

import os
import requests
import shutil
from collections import defaultdict


def get_data_home(data_home=None, subdirectory=''):
    """Return the path of the scikit-multilearn data dir.

    This folder is used by some large dataset loaders to avoid downloading the
    data several times.

    By default the :code:`data_home` is set to a folder named
    :code:`'scikit_ml_learn_data'` in the user home folder.

    Alternatively, it can be set by the :code:`'SCIKIT_ML_LEARN_DATA'`
    environment variable or programmatically by giving an explicit folder path.
    The :code:`'~'` symbol is expanded to the user home folder.

    If the folder does not already exist, it is automatically created.

    Parameters
    ----------
    data_home : str (default is None)
        the path to the directory in which scikit-multilearn data sets should be stored,
        if None the path is generated as stated above

    subdirectory : str, default ''
        subdirectory to return under :code:`data_home` if it is passed, or under
        the default data home otherwise

    Returns
    -------
    str
        the path to the data home
    """
    if data_home is None:
        if len(subdirectory) > 0:
            data_home = os.environ.get('SCIKIT_ML_LEARN_DATA',
                                       os.path.join('~', 'scikit_ml_learn_data', subdirectory))
        else:
            data_home = os.environ.get('SCIKIT_ML_LEARN_DATA',
                                       os.path.join('~', 'scikit_ml_learn_data'))
    data_home = os.path.expanduser(data_home)

    if not os.path.exists(data_home):
        os.makedirs(data_home)

    return data_home
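A minimal usage sketch of the data home resolution, assuming the :code:`SCIKIT_ML_LEARN_DATA` environment variable is unset; the subdirectory name :code:`'arff_cache'` is an arbitrary example:

from skmultilearn.dataset import get_data_home

# with no arguments the default cache directory is created if missing and its
# path returned, e.g. ~/scikit_ml_learn_data
print(get_data_home())

# a subdirectory can be requested under the default location
print(get_data_home(subdirectory='arff_cache'))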
def clear_data_home(data_home=None):
    """Delete all the content of the data home cache.

    Parameters
    ----------
    data_home : str (default is None)
        the path to the directory in which scikit-multilearn data sets should be stored.
    """
    data_home = get_data_home(data_home)
    shutil.rmtree(data_home)
def _get_download_base_url():
    """Returns base URL for data sets."""
    return 'http://scikit.ml/datasets/'
def available_data_sets():
    """Lists available data sets and their variants

    Returns
    -------
    dict[(set_name, variant_name)] -> [md5, file_name]
        available datasets and their variants with the key pertaining to the
        :code:`(set_name, variant_name)` and values include md5 and file name on server
    """
    r = requests.get(_get_download_base_url() + 'data.list')

    if r.status_code != 200:
        r.raise_for_status()
    else:
        raw_data_list = r.text

        variant_information = defaultdict(list)
        for row in raw_data_list.split('\n'):
            md5, file_name = row.split(';')
            set_name, variant = file_name.split('.')[0].split('-')
            if (set_name, variant) in variant_information:
                raise Exception('Data file broken, files doubled, please file bug report.')
            variant_information[(set_name, variant)] = [md5, file_name]

        return variant_information
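An illustrative sketch of iterating over the server's data set list; the concrete set names depend on what scikit.ml currently serves:

from skmultilearn.dataset import available_data_sets

# keys are (set_name, variant) tuples, values are [md5, file_name] pairs
for (set_name, variant), (md5, file_name) in available_data_sets().items():
    print(set_name, variant, file_name, md5)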
def download_dataset(set_name, variant, data_home=None):
    """Downloads a data set

    Parameters
    ----------
    set_name : str
        name of set from :func:`available_data_sets`
    variant : str
        variant of the data set from :func:`available_data_sets`
    data_home : str (default is None)
        custom base folder for data, if None, default is used

    Returns
    -------
    str
        path to the downloaded data set file on disk
    """
    data_sets = available_data_sets()
    if (set_name, variant) not in data_sets:
        raise ValueError('The set {} in variant {} does not exist on server.'.format(set_name, variant))

    md5, name = data_sets[set_name, variant]

    if data_home is None:
        target_name = os.path.join(get_data_home(), name)
    else:
        target_name = os.path.join(data_home, name)

    if os.path.exists(target_name):
        if md5 == _get_md5(target_name):
            print("{}:{} - exists, not redownloading".format(set_name, variant))
            return target_name
        else:
            print("{}:{} - exists, but MD5 sum mismatch - redownloading".format(set_name, variant))
    else:
        print("{}:{} - does not exist, downloading".format(set_name, variant))

    # not found or broken md5
    _download_single_file(name, target_name)
    found_md5 = _get_md5(target_name)
    if md5 != found_md5:
        raise Exception(
            "{}: MD5 mismatch {} vs {} - possible download error".format(name, md5, found_md5))

    print("Downloaded {}-{}".format(set_name, variant))

    return target_name
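A usage sketch, assuming the 'emotions' set in its 'undivided' variant is listed by :func:`available_data_sets`:

from skmultilearn.dataset import download_dataset

# downloads into the default data home, or reuses a cached copy with a matching MD5
path = download_dataset('emotions', 'undivided')
print(path)  # path to the downloaded dump inside the data home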
def load_dataset(set_name, variant, data_home=None):
    """Loads a selected variant of the given data set

    Parameters
    ----------
    set_name : str
        name of set from :func:`available_data_sets`
    variant : str
        variant of the data set
    data_home : str (default is None)
        custom base folder for data, if None, default is used

    Returns
    -------
    the loaded multilabel data set variant in the scikit-multilearn format,
    i.e. the :code:`X, y, feature_names, label_names` tuple returned by
    :func:`load_dataset_dump`
    """
    path = download_dataset(set_name, variant, data_home)
    if path is not None:
        return load_dataset_dump(path)

    return None
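A minimal end-to-end sketch, assuming the 'emotions' set with the 'undivided' variant exists on the server:

from skmultilearn.dataset import load_dataset

X, y, feature_names, label_names = load_dataset('emotions', 'undivided')
print(X.shape, y.shape)   # sparse input and label matrices
print(label_names)        # label attribute definitions as stored in the dump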
[docs]def load_from_arff(filename, label_count, label_location="end", input_feature_type='float', encode_nominal=True, load_sparse=False, return_attribute_definitions=False): """Method for loading ARFF files as numpy array Parameters ---------- filename : str path to ARFF file labelcount: integer number of labels in the ARFF file endian: str {"big", "little"} (default is "big") whether the ARFF file contains labels at the beginning of the attributes list ("start", MEKA format) or at the end ("end", MULAN format) input_feature_type: numpy.type as string (default is "float") the desire type of the contents of the return 'X' array-likes, default 'i8', should be a numpy type, see http://docs.scipy.org/doc/numpy/user/basics.types.html encode_nominal: bool (default is True) whether convert categorical data into numeric factors - required for some scikit classifiers that can't handle non-numeric input features. load_sparse: boolean (default is False) whether to read arff file as a sparse file format, liac-arff breaks if sparse reading is enabled for non-sparse ARFFs. return_attribute_definitions: boolean (default is False) whether to return the definitions for each attribute in the dataset Returns ------- X : :mod:`scipy.sparse.lil_matrix` of `input_feature_type`, shape=(n_samples, n_features) input feature matrix y : :mod:`scipy.sparse.lil_matrix` of `{0, 1}`, shape=(n_samples, n_labels) binary indicator matrix with label assignments names of attributes : List[str] list of attribute names from ARFF file """ if not load_sparse: arff_frame = arff.load( open(filename, 'r'), encode_nominal=encode_nominal, return_type=arff.DENSE ) matrix = sparse.csr_matrix( arff_frame['data'], dtype=input_feature_type ) else: arff_frame = arff.load( open(filename, 'r'), encode_nominal=encode_nominal, return_type=arff.COO ) data = arff_frame['data'][0] row = arff_frame['data'][1] col = arff_frame['data'][2] matrix = sparse.coo_matrix( (data, (row, col)), shape=(max(row) + 1, max(col) + 1) ) if label_location == "start": X, y = matrix.tocsc()[:, label_count:].tolil(), matrix.tocsc()[:, :label_count].astype(int).tolil() feature_names = arff_frame['attributes'][label_count:] label_names = arff_frame['attributes'][:label_count] elif label_location == "end": X, y = matrix.tocsc()[:, :-label_count].tolil(), matrix.tocsc()[:, -label_count:].astype(int).tolil() feature_names = arff_frame['attributes'][:-label_count] label_names = arff_frame['attributes'][-label_count:] else: # unknown endian return None if return_attribute_definitions: return X, y, feature_names, label_names else: return X, y
[docs]def save_to_arff(X, y, label_location="end", save_sparse=True, filename=None): """Method for dumping data to ARFF files Parameters ---------- X : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix, shape=(n_samples, n_features) input feature matrix y : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix of `{0, 1}`, shape=(n_samples, n_labels) binary indicator matrix with label assignments label_location: string {"start", "end"} (default is "end") whether the ARFF file will contain labels at the beginning of the attributes list ("start", MEKA format) or at the end ("end", MULAN format) save_sparse: boolean Whether to save in ARFF's sparse dictionary-like format instead of listing all zeroes within file, very useful in multi-label classification. filename : str or None Path to ARFF file, if None, the ARFF representation is returned as string Returns ------- str or None the ARFF dump string, if filename is None """ X = X.todok() y = y.todok() x_prefix = 0 y_prefix = 0 x_attributes = [(u'X{}'.format(i), u'NUMERIC') for i in range(X.shape[1])] y_attributes = [(u'y{}'.format(i), [str(0), str(1)]) for i in range(y.shape[1])] if label_location == "end": y_prefix = X.shape[1] relation_sign = -1 attributes = x_attributes + y_attributes elif label_location == "start": x_prefix = y.shape[1] relation_sign = 1 attributes = y_attributes + x_attributes else: raise ValueError("Label location not in {start, end}") if save_sparse: data = [{} for r in range(X.shape[0])] else: data = [[0 for c in range(X.shape[1] + y.shape[1])] for r in range(X.shape[0])] for keys, value in list(X.items()): data[keys[0]][x_prefix + keys[1]] = value for keys, value in list(y.items()): data[keys[0]][y_prefix + keys[1]] = value dataset = { u'description': u'traindata', u'relation': u'traindata: -C {}'.format(y.shape[1] * relation_sign), u'attributes': attributes, u'data': data } arff_data = arff.dumps(dataset) if filename is None: return arff_data with open(filename, 'w') as fp: fp.write(arff_data)
def save_dataset_dump(input_space, labels, feature_names, label_names, filename=None):
    """Saves a compressed data set dump

    Parameters
    ----------
    input_space : array-like of array-likes
        Input space array-like of input feature vectors
    labels : array-like of binary label vectors
        Array-like of labels assigned to each input vector, as a binary
        indicator vector (i.e. if 5th position has value 1
        then the input vector has label no. 5)
    feature_names : array-like, optional
        names of features
    label_names : array-like, optional
        names of labels
    filename : str, optional
        Path to dump file, if without .bz2 ending, the .bz2 extension will be appended.
        If None, the data is returned as a dict instead of being written to disk.
    """
    data = {'X': input_space, 'y': labels, 'features': feature_names, 'labels': label_names}

    if filename is not None:
        if filename[-4:] != '.bz2':
            filename += ".bz2"

        with bz2.BZ2File(filename, "wb") as file_handle:
            pickle.dump(data, file_handle)
    else:
        return data
def load_dataset_dump(filename):
    """Loads a compressed data set dump

    Parameters
    ----------
    filename : str
        path to dump file, if without .bz2 ending, the .bz2 extension will be appended

    Returns
    -------
    X : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix, shape=(n_samples, n_features)
        input feature matrix
    y : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix of `{0, 1}`, shape=(n_samples, n_labels)
        binary indicator matrix with label assignments
    names of attributes : List[str]
        list of attribute names for `X` columns
    names of labels : List[str]
        list of label names for `y` columns
    """
    if not os.path.exists(filename):
        raise IOError("File {} does not exist, use load_dataset to download file".format(filename))

    if filename[-4:] != '.bz2':
        filename += ".bz2"

    with bz2.BZ2File(filename, "r") as file_handle:
        data = pickle.load(file_handle)

    return data['X'], data['y'], data['features'], data['labels']
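A round-trip sketch for the compressed dump format; the matrices, attribute definitions and the file name 'my_dataset' are arbitrary examples:

from scipy import sparse
from skmultilearn.dataset import save_dataset_dump, load_dataset_dump

X = sparse.lil_matrix([[1.0, 0.0], [0.5, 2.0]])
y = sparse.lil_matrix([[0, 1], [1, 1]])
feature_names = [('X0', 'NUMERIC'), ('X1', 'NUMERIC')]
label_names = [('y0', ['0', '1']), ('y1', ['0', '1'])]

# persist as a bz2-compressed pickle; '.bz2' is appended if missing
save_dataset_dump(X, y, feature_names, label_names, filename='my_dataset')

# read it back later (note: the existing file is located before the extension is appended)
X2, y2, feature_names2, label_names2 = load_dataset_dump('my_dataset.bz2')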
def _download_single_file(data_file_name, target_file_name, base_url=None):
    base_url = base_url or _get_download_base_url()
    r = requests.get(base_url + data_file_name, stream=True)
    if r.status_code == 200:
        with open(target_file_name, 'wb') as f:
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)
    else:
        r.raise_for_status()


def _get_md5(file_name):
    hash_md5 = hashlib.md5()
    with open(file_name, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()