Source code for skmultilearn.ext.meka

import os
import shlex
import subprocess
import sys
import tempfile
import zipfile
from builtins import filter
from builtins import map
from builtins import range
from builtins import str

import scipy.sparse as sparse

from ..base import MLClassifierBase
from ..dataset import save_to_arff, get_data_home, _download_single_file, _get_md5

try:
    from shlex import quote as cmd_quote
except ImportError:
    from pipes import quote as cmd_quote

# MEKA release that download_meka() fetches when no explicit version is given.
SUPPORTED_VERSION = '1.9.2'
# MD5 digest that download_meka() compares a freshly downloaded archive against.
SUPPORTED_VERSION_MD5 = 'e909044b39513bbad451b8d71098b22c'


def download_meka(version=None):
    """Downloads a given version of the MEKA library and returns its classpath

    The release archive is stored in (and unpacked into) the ``meka``
    subdirectory of the scikit-multilearn data home. An archive that is
    already present is not downloaded again.

    Parameters
    ----------
    version : str
        the MEKA version to download, default falls back to currently
        supported version 1.9.2

    Returns
    -------
    string
        meka class path string for installed version

    Raises
    ------
    IOError
        if unpacking the meka release file does not provide a proper setup
    Exception
        if MD5 mismatch happens after a download error
    """
    version = version or SUPPORTED_VERSION
    meka_release_string = "meka-release-{}".format(version)
    file_name = meka_release_string + '-bin.zip'
    meka_path = get_data_home(subdirectory='meka')
    target_path = os.path.join(meka_path, file_name)
    path_to_lib = os.path.join(meka_path, meka_release_string, 'lib')

    if os.path.exists(target_path):
        print("MEKA {} found, not downloading".format(version))
    else:
        print("MEKA {} not found, downloading".format(version))
        release_url = "http://downloads.sourceforge.net/project/meka/meka-{}/".format(version)
        _download_single_file(file_name, target_path, release_url)

        # The only known checksum belongs to SUPPORTED_VERSION; comparing any
        # other release against it would always fail, so verify only when the
        # versions match.
        if version == SUPPORTED_VERSION:
            found_md5 = _get_md5(target_path)
            if SUPPORTED_VERSION_MD5 != found_md5:
                # Do not leave the corrupt archive behind: the next call would
                # treat it as "found" and try to unpack it.
                if os.path.exists(target_path):
                    os.remove(target_path)
                raise Exception("MD5 mismatch - possible MEKA download error")

    if not os.path.exists(path_to_lib):
        with zipfile.ZipFile(target_path, 'r') as meka_zip:
            print("Unzipping MEKA {} to {}".format(version, meka_path + os.path.sep))
            meka_zip.extractall(path=meka_path + os.path.sep)

    if not os.path.exists(os.path.join(path_to_lib, 'meka-{}.jar'.format(version))):
        raise IOError("Something went wrong, MEKA files missing, please file a bug report")

    return path_to_lib + os.path.sep
[docs]class Meka(MLClassifierBase): """Wrapper for the MEKA classifier Allows using MEKA, WEKA and some of MULAN classifiers from scikit-compatible API. For more information on how to use this class see the tutorial: :doc:`../meka` Parameters ---------- meka_classifier : str The MEKA classifier string and parameters from the MEKA API, such as :code:`meka.classifiers.multilabel.MULAN -S RAkEL2` weka_classifier : str The WEKA classifier string and parameters from the WEKA API, such as :code:`weka.classifiers.trees.J48` java_command : str Path to test the java command meka_classpath: str Path to the MEKA class path folder, usually the folder lib in the directory MEKA was extracted into Attributes ---------- output_ : str the full text output of MEKA command References ---------- If you use this wrapper please also cite: .. code-block :: latex @article{MEKA, author = {Read, Jesse and Reutemann, Peter and Pfahringer, Bernhard and Holmes, Geoff}, title = {{MEKA}: A Multi-label/Multi-target Extension to {Weka}}, journal = {Journal of Machine Learning Research}, year = {2016}, volume = {17}, number = {21}, pages = {1--5}, url = {http://jmlr.org/papers/v17/12-164.html}, } @article{Hall:2009:WDM:1656274.1656278, author = {Hall, Mark and Frank, Eibe and Holmes, Geoffrey and Pfahringer, Bernhard and Reutemann, Peter and Witten, Ian H.}, title = {The WEKA Data Mining Software: An Update}, journal = {SIGKDD Explor. Newsl.}, issue_date = {June 2009}, volume = {11}, number = {1}, month = nov, year = {2009}, issn = {1931-0145}, pages = {10--18}, numpages = {9}, url = {http://doi.acm.org/10.1145/1656274.1656278}, doi = {10.1145/1656274.1656278}, acmid = {1656278}, publisher = {ACM}, address = {New York, NY, USA}, } Examples -------- Here's an example of performing Label Powerset classification using MEKA with a WEKA Naive Bayes classifier. .. 
code-block:: python from skmultilearn.ext import Meka, download_meka meka = Meka( meka_classifier = "meka.classifiers.multilabel.LC", weka_classifier = "weka.classifiers.bayes.NaiveBayes", meka_classpath = download_meka(), java_command = '/usr/bin/java') meka.fit(X_train, y_train) predictions = meka.predict(X_test) """ def __init__(self, meka_classifier=None, weka_classifier=None, java_command=None, meka_classpath=None): super(Meka, self).__init__() self.java_command = java_command if self.java_command is None: # TODO: this will not be needed once we're python 3 ready - we will # use it only in python 2.7 cases from whichcraft import which self.java_command = which("java") if self.java_command is None: raise ValueError("Java not found") self.meka_classpath = meka_classpath if self.meka_classpath is None: self.meka_classpath = os.environ.get('MEKA_CLASSPATH') if self.meka_classpath is None: raise ValueError("No meka classpath defined") self.meka_classifier = meka_classifier self.weka_classifier = weka_classifier self.copyable_attrs = [ 'meka_classifier', 'weka_classifier', 'java_command', 'meka_classpath' ] self.output_ = None self._verbosity = 5 self._warnings = None self.require_dense = [False, False] self._clean() def _clean(self): """Sets various attributes to :code:`None`""" self._results = None self._statistics = None self.output_ = None self._error = None self._label_count = None self._instance_count = None def _remove_temporary_files(self, temporary_files): """Internal function for cleaning temporary files""" for file_object in temporary_files: file_name = file_object.name file_object.close() if os.path.exists(file_name): os.remove(file_name) arff_file_name = file_name + '.arff' if os.path.exists(arff_file_name): os.remove(arff_file_name)
[docs] def fit(self, X, y): """Fits classifier to training data Internally this method dumps X and y to temporary arff files and runs MEKA with relevant arguments using :meth:`_run`. It uses a sparse DOK representation (:class:`scipy.sparse.dok_matrix`) of the X matrix. Parameters ---------- X : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix, shape=(n_samples, n_features) input feature matrix y : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix of `{0, 1}`, shape=(n_samples, n_labels) binary indicator matrix with label assignments Returns ------- self fitted instance of self """ self._clean() X = self._ensure_input_format( X, sparse_format='dok', enforce_sparse=True) y = self._ensure_output_format( y, sparse_format='dok', enforce_sparse=True) self._label_count = y.shape[1] # we need this in case threshold needs to be recalibrated in meka self.train_data_ = save_to_arff(X, y) train_arff = tempfile.NamedTemporaryFile(delete=False) classifier_dump_file = tempfile.NamedTemporaryFile(delete=False) try: with open(train_arff.name + '.arff', 'w') as fp: fp.write(self.train_data_) input_args = [ '-verbosity', "0", '-split-percentage', "100", '-t', '"{}"'.format(train_arff.name + '.arff'), '-d', '"{}"'.format(classifier_dump_file.name), ] self._run_meka_command(input_args) self.classifier_dump = None with open(classifier_dump_file.name, 'rb') as fp: self.classifier_dump = fp.read() finally: self._remove_temporary_files([train_arff, classifier_dump_file]) return self
[docs] def predict(self, X): """Predict label assignments for X Internally this method dumps X to temporary arff files and runs MEKA with relevant arguments using :func:`_run`. It uses a sparse DOK representation (:class:`scipy.sparse.dok_matrix`) of the X matrix. Parameters ---------- X : numpy.ndarray or scipy.sparse input features of shape :code:`(n_samples, n_features)` Returns ------- scipy.sparse of int sparse matrix of integers with shape :code:`(n_samples, n_features)` """ X = self._ensure_input_format( X, sparse_format='dok', enforce_sparse=True) self._instance_count = X.shape[0] if self.classifier_dump is None: raise Exception('Not classified') sparse_y = sparse.coo_matrix((X.shape[0], self._label_count), dtype=int) try: train_arff = tempfile.NamedTemporaryFile(delete=False) test_arff = tempfile.NamedTemporaryFile(delete=False) classifier_dump_file = tempfile.NamedTemporaryFile(delete=False) with open(train_arff.name + '.arff', 'w') as fp: fp.write(self.train_data_) with open(classifier_dump_file.name, 'wb') as fp: fp.write(self.classifier_dump) with open(test_arff.name + '.arff', 'w') as fp: fp.write(save_to_arff(X, sparse_y)) args = [ '-l', '"{}"'.format(classifier_dump_file.name) ] self._run(train_arff.name + '.arff', test_arff.name + '.arff', args) self._parse_output() finally: self._remove_temporary_files( [train_arff, test_arff, classifier_dump_file] ) return self._results
def _run(self, train_file, test_file, additional_arguments=[]): """Runs the meka classifiers Parameters ---------- train_file : str path to train :code:`.arff` file in meka format (big endian, labels first in attributes list). test_file : str path to test :code:`.arff` file in meka format (big endian, labels first in attributes list). Returns ------- predictions: sparse binary indicator matrix [n_test_samples, n_labels] array of binary label vectors including label predictions of shape :code:`(n_test_samples, n_labels)` """ self.output_ = None self._warnings = None # meka_command_string = 'java -cp "/home/niedakh/pwr/old/meka-1.5/lib/*" meka.classifiers.multilabel.MULAN -S RAkEL2 # -threshold 0 -t {train} -T {test} -verbosity {verbosity} -W weka.classifiers.bayes.NaiveBayes' # meka.classifiers.multilabel.LC, weka.classifiers.bayes.NaiveBayes args = [ '-t', '"{}"'.format(train_file), '-T', '"{}"'.format(test_file), '-verbosity', str(5), ] + additional_arguments self._run_meka_command(args) return self def _parse_output(self): """Internal function for parsing MEKA output.""" if self.output_ is None: self._results = None self._statistics = None return None predictions_split_head = '==== PREDICTIONS' predictions_split_foot = '|===========' if self._label_count is None: self._label_count = map(lambda y: int(y.split(')')[1].strip()), [ x for x in self.output_.split('\n') if 'Number of labels' in x])[0] if self._instance_count is None: self._instance_count = int(float(filter(lambda x: '==== PREDICTIONS (N=' in x, self.output_.split( '\n'))[0].split('(')[1].split('=')[1].split(')')[0])) predictions = self.output_.split(predictions_split_head)[1].split( predictions_split_foot)[0].split('\n')[1:-1] predictions = [y.split(']')[0] for y in [x.split('] [')[1] for x in predictions]] predictions = [[a for a in [f.strip() for f in z.split(',')] if len(a) > 0] for z in predictions] predictions = [[int(a) for a in z] for z in predictions] assert self._verbosity == 5 self._results = 
sparse.lil_matrix( (self._instance_count, self._label_count), dtype='int') for row in range(self._instance_count): for label in predictions[row]: self._results[row, label] = 1 statistics = [x for x in self.output_.split( '== Evaluation Info')[1].split('\n') if len(x) > 0 and '==' not in x] statistics = [y for y in [z.strip() for z in statistics] if ' ' in y] array_data = [z for z in statistics if '[' in z] non_array_data = [z for z in statistics if '[' not in z] self._statistics = {} for row in non_array_data: r = row.strip().split(' ') r = [z for z in r if len(z) > 0] r = [z.strip() for z in r] if len(r) < 2: continue try: test_value = float(r[1]) except ValueError: test_value = r[1] r[1] = test_value self._statistics[r[0]] = r[1] for row in array_data: r = row.strip().split('[') r = [z.strip() for z in r] r[1] = r[1].replace(', ', ' ').replace( ',', '.').replace(']', '').split(' ') r[1] = [x for x in r[1] if len(x) > 0] self._statistics[r[0]] = r[1] def _run_meka_command(self, args): command_args = [ self.java_command, '-cp', '"{}*"'.format(self.meka_classpath), self.meka_classifier, ] if self.weka_classifier is not None: command_args += ['-W', self.weka_classifier] command_args += args meka_command = " ".join(command_args) if sys.platform != 'win32': meka_command = shlex.split(meka_command) pipes = subprocess.Popen(meka_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) self.output_, self._error = pipes.communicate() if type(self.output_) == bytes: self.output_ = self.output_.decode(sys.stdout.encoding) if type(self._error) == bytes: self._error = self._error.decode(sys.stdout.encoding) if pipes.returncode != 0: raise Exception(self.output_ + self._error)