import os
import shlex
import subprocess
import sys
import tempfile
import zipfile
from builtins import filter
from builtins import map
from builtins import range
from builtins import str
import scipy.sparse as sparse
from ..base import MLClassifierBase
from ..dataset import save_to_arff, get_data_home, _download_single_file, _get_md5
try:
from shlex import quote as cmd_quote
except ImportError:
from pipes import quote as cmd_quote
# MEKA release that download_meka() falls back to when no version is requested.
SUPPORTED_VERSION = '1.9.2'
# MD5 digest that download_meka() compares against the freshly downloaded release archive.
SUPPORTED_VERSION_MD5 = 'e909044b39513bbad451b8d71098b22c'
def download_meka(version=None):
    """Downloads a given version of the MEKA library and returns its classpath

    The release archive is stored in the ``meka`` subdirectory of the data
    home and is only downloaded when it is not already present. The archive
    is unpacked when the release's ``lib`` folder does not exist yet.

    Parameters
    ----------
    version : str
        the MEKA version to download, default falls back to currently
        supported version 1.9.2

    Returns
    -------
    string
        meka class path string for installed version (the ``lib`` folder
        followed by a path separator)

    Raises
    ------
    IOError
        if unpacking the meka release file does not provide a proper setup
    Exception
        if the freshly downloaded archive of the supported version does not
        match the expected MD5 checksum; the broken archive is removed so
        that the next call downloads it again
    """
    version = version or SUPPORTED_VERSION
    meka_release_string = "meka-release-{}".format(version)
    file_name = meka_release_string + '-bin.zip'
    meka_path = get_data_home(subdirectory='meka')
    target_path = os.path.join(meka_path, file_name)
    path_to_lib = os.path.join(meka_path, meka_release_string, 'lib')

    if os.path.exists(target_path):
        print("MEKA {} found, not downloading".format(version))
    else:
        print("MEKA {} not found, downloading".format(version))
        release_url = "http://downloads.sourceforge.net/project/meka/meka-{}/".format(version)
        _download_single_file(file_name, target_path, release_url)

        if version == SUPPORTED_VERSION:
            found_md5 = _get_md5(target_path)
            if SUPPORTED_VERSION_MD5 != found_md5:
                # Do not leave a corrupt archive behind: a later call would
                # find it on disk, skip the download and use it unverified.
                if os.path.exists(target_path):
                    os.remove(target_path)
                raise Exception("MD5 mismatch - possible MEKA download error")
        else:
            # The only reference checksum available belongs to
            # SUPPORTED_VERSION; comparing any other release against it
            # could never succeed.
            print("No reference MD5 for MEKA {}, skipping checksum verification".format(version))

    if not os.path.exists(path_to_lib):
        with zipfile.ZipFile(target_path, 'r') as meka_zip:
            print("Unzipping MEKA {} to {}".format(version, meka_path + os.path.sep))
            meka_zip.extractall(path=meka_path + os.path.sep)

    if not os.path.exists(os.path.join(path_to_lib, 'meka-{}.jar'.format(version))):
        raise IOError("Something went wrong, MEKA files missing, please file a bug report")

    return path_to_lib + os.path.sep
class Meka(MLClassifierBase):
    """Wrapper for the MEKA classifier

    Allows using MEKA, WEKA and some of MULAN classifiers from scikit-compatible API. For more information on
    how to use this class see the tutorial: :doc:`../meka`

    Parameters
    ----------
    meka_classifier : str
        The MEKA classifier string and parameters from the MEKA API,
        such as :code:`meka.classifiers.multilabel.MULAN -S RAkEL2`
    weka_classifier : str
        The WEKA classifier string and parameters from the WEKA API,
        such as :code:`weka.classifiers.trees.J48`
    java_command : str
        Path to the java command; when not given, ``java`` is looked up
        on the executable search path
    meka_classpath: str
        Path to the MEKA class path folder, usually the folder lib
        in the directory MEKA was extracted into; when not given, the
        ``MEKA_CLASSPATH`` environment variable is used

    Attributes
    ----------
    output_ : str
        the full text output of MEKA command

    Raises
    ------
    ValueError
        if no java command or no MEKA class path can be determined

    References
    ----------
    If you use this wrapper please also cite:

    .. code-block :: latex

        @article{MEKA,
            author = {Read, Jesse and Reutemann, Peter and Pfahringer, Bernhard and Holmes, Geoff},
            title = {{MEKA}: A Multi-label/Multi-target Extension to {Weka}},
            journal = {Journal of Machine Learning Research},
            year = {2016},
            volume = {17},
            number = {21},
            pages = {1--5},
            url = {http://jmlr.org/papers/v17/12-164.html},
        }

        @article{Hall:2009:WDM:1656274.1656278,
            author = {Hall, Mark and Frank, Eibe and Holmes, Geoffrey and Pfahringer, Bernhard and Reutemann, Peter and Witten, Ian H.},
            title = {The WEKA Data Mining Software: An Update},
            journal = {SIGKDD Explor. Newsl.},
            issue_date = {June 2009},
            volume = {11},
            number = {1},
            month = nov,
            year = {2009},
            issn = {1931-0145},
            pages = {10--18},
            numpages = {9},
            url = {http://doi.acm.org/10.1145/1656274.1656278},
            doi = {10.1145/1656274.1656278},
            acmid = {1656278},
            publisher = {ACM},
            address = {New York, NY, USA},
        }

    Examples
    --------
    Here's an example of performing Label Powerset classification using MEKA with a WEKA Naive Bayes classifier.

    .. code-block:: python

        from skmultilearn.ext import Meka, download_meka

        meka = Meka(
            meka_classifier = "meka.classifiers.multilabel.LC",
            weka_classifier = "weka.classifiers.bayes.NaiveBayes",
            meka_classpath = download_meka(),
            java_command = '/usr/bin/java')

        meka.fit(X_train, y_train)
        predictions = meka.predict(X_test)
    """

    def __init__(self, meka_classifier=None, weka_classifier=None,
                 java_command=None, meka_classpath=None):
        super(Meka, self).__init__()

        self.java_command = java_command
        if self.java_command is None:
            # shutil.which is part of the standard library on Python 3;
            # whichcraft is only needed as a fallback on Python 2.7.
            try:
                from shutil import which
            except ImportError:
                from whichcraft import which

            self.java_command = which("java")

            if self.java_command is None:
                raise ValueError("Java not found")

        self.meka_classpath = meka_classpath
        if self.meka_classpath is None:
            self.meka_classpath = os.environ.get('MEKA_CLASSPATH')

            if self.meka_classpath is None:
                raise ValueError("No meka classpath defined")

        self.meka_classifier = meka_classifier
        self.weka_classifier = weka_classifier

        self.copyable_attrs = [
            'meka_classifier',
            'weka_classifier',
            'java_command',
            'meka_classpath'
        ]
        self.output_ = None
        self._verbosity = 5
        self._warnings = None
        self.require_dense = [False, False]
        self._clean()

    def _clean(self):
        """Sets various attributes to :code:`None`"""
        self._results = None
        self._statistics = None
        self.output_ = None
        self._error = None
        self._label_count = None
        self._instance_count = None
        # Reset the serialized model as well, so that predict() reports an
        # unfitted classifier cleanly and a failed re-fit cannot leave a
        # stale model next to new training data.
        self.classifier_dump = None

    def _remove_temporary_files(self, temporary_files):
        """Internal function for cleaning temporary files

        Closes every given file object and removes both the file itself and
        the sibling file with an added ``.arff`` suffix, when they exist.
        """
        for file_object in temporary_files:
            file_name = file_object.name
            file_object.close()
            if os.path.exists(file_name):
                os.remove(file_name)

            arff_file_name = file_name + '.arff'
            if os.path.exists(arff_file_name):
                os.remove(arff_file_name)

    def fit(self, X, y):
        """Fits classifier to training data

        Internally this method dumps X and y to temporary arff files and
        runs MEKA with relevant arguments using :meth:`_run_meka_command`.
        It uses a sparse DOK representation (:class:`scipy.sparse.dok_matrix`)
        of the X matrix.

        Parameters
        ----------
        X : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix, shape=(n_samples, n_features)
            input feature matrix
        y : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix of `{0, 1}`, shape=(n_samples, n_labels)
            binary indicator matrix with label assignments

        Returns
        -------
        self
            fitted instance of self
        """
        self._clean()
        X = self._ensure_input_format(
            X, sparse_format='dok', enforce_sparse=True)
        y = self._ensure_output_format(
            y, sparse_format='dok', enforce_sparse=True)
        self._label_count = y.shape[1]

        # we need this in case threshold needs to be recalibrated in meka
        self.train_data_ = save_to_arff(X, y)

        # Files are registered as soon as they exist so that the cleanup in
        # ``finally`` never refers to a name that was not created.
        temporary_files = []
        try:
            train_arff = tempfile.NamedTemporaryFile(delete=False)
            temporary_files.append(train_arff)
            classifier_dump_file = tempfile.NamedTemporaryFile(delete=False)
            temporary_files.append(classifier_dump_file)

            with open(train_arff.name + '.arff', 'w') as fp:
                fp.write(self.train_data_)

            input_args = [
                '-verbosity', "0",
                '-split-percentage', "100",
                '-t', '"{}"'.format(train_arff.name + '.arff'),
                '-d', '"{}"'.format(classifier_dump_file.name),
            ]

            self._run_meka_command(input_args)

            # keep the trained model in memory, it is written back to a
            # temporary file on every predict() call
            with open(classifier_dump_file.name, 'rb') as fp:
                self.classifier_dump = fp.read()
        finally:
            self._remove_temporary_files(temporary_files)

        return self

    def predict(self, X):
        """Predict label assignments for X

        Internally this method dumps X to temporary arff files and
        runs MEKA with relevant arguments using :func:`_run`. It uses a
        sparse DOK representation (:class:`scipy.sparse.dok_matrix`)
        of the X matrix.

        Parameters
        ----------
        X : numpy.ndarray or scipy.sparse
            input features of shape :code:`(n_samples, n_features)`

        Returns
        -------
        scipy.sparse of int
            sparse matrix of integers with shape :code:`(n_samples, n_labels)`

        Raises
        ------
        Exception
            if the classifier has not been fitted yet
        """
        X = self._ensure_input_format(
            X, sparse_format='dok', enforce_sparse=True)
        self._instance_count = X.shape[0]

        # getattr keeps this check working for instances that never had the
        # attribute assigned
        if getattr(self, 'classifier_dump', None) is None:
            raise Exception('Not classified')

        # empty label matrix, only needed to write a well-formed test arff
        sparse_y = sparse.coo_matrix((X.shape[0], self._label_count), dtype=int)

        temporary_files = []
        try:
            train_arff = tempfile.NamedTemporaryFile(delete=False)
            temporary_files.append(train_arff)
            test_arff = tempfile.NamedTemporaryFile(delete=False)
            temporary_files.append(test_arff)
            classifier_dump_file = tempfile.NamedTemporaryFile(delete=False)
            temporary_files.append(classifier_dump_file)

            with open(train_arff.name + '.arff', 'w') as fp:
                fp.write(self.train_data_)

            with open(classifier_dump_file.name, 'wb') as fp:
                fp.write(self.classifier_dump)

            with open(test_arff.name + '.arff', 'w') as fp:
                fp.write(save_to_arff(X, sparse_y))

            args = [
                '-l', '"{}"'.format(classifier_dump_file.name)
            ]

            self._run(train_arff.name + '.arff', test_arff.name + '.arff', args)
            self._parse_output()
        finally:
            self._remove_temporary_files(temporary_files)

        return self._results

    def _run(self, train_file, test_file, additional_arguments=None):
        """Runs the meka classifiers

        The raw MEKA output is stored in :code:`output_`; use
        :meth:`_parse_output` to turn it into predictions.

        Parameters
        ----------
        train_file : str
            path to train :code:`.arff` file in meka format
            (big endian, labels first in attributes list).
        test_file : str
            path to test :code:`.arff` file in meka format
            (big endian, labels first in attributes list).
        additional_arguments : list of str, optional
            extra command line arguments appended to the MEKA call

        Returns
        -------
        self
            the instance, with :code:`output_` holding the MEKA output
        """
        self.output_ = None
        self._warnings = None

        # the resulting call looks like:
        # java -cp "<meka lib>/*" meka.classifiers.multilabel.LC
        #     -W weka.classifiers.bayes.NaiveBayes -t {train} -T {test} -verbosity 5
        args = [
            '-t', '"{}"'.format(train_file),
            '-T', '"{}"'.format(test_file),
            '-verbosity', str(self._verbosity),
        ]
        if additional_arguments is not None:
            args = args + list(additional_arguments)

        self._run_meka_command(args)
        return self

    def _parse_output(self):
        """Internal function for parsing MEKA output.

        Reads :code:`output_` and fills :code:`_results` with a sparse
        indicator matrix of the predicted labels and :code:`_statistics`
        with the entries listed after the evaluation info header. When there
        is no output both are set to :code:`None`.
        """
        if self.output_ is None:
            self._results = None
            self._statistics = None
            return None

        predictions_split_head = '==== PREDICTIONS'
        predictions_split_foot = '|==========='

        output_lines = self.output_.split('\n')

        if self._label_count is None:
            # builtins.map returns an iterator, which cannot be subscripted,
            # hence the explicit list of matching lines
            label_lines = [
                line for line in output_lines if 'Number of labels' in line]
            self._label_count = int(label_lines[0].split(')')[1].strip())

        if self._instance_count is None:
            # same as above for builtins.filter
            header_lines = [
                line for line in output_lines if '==== PREDICTIONS (N=' in line]
            self._instance_count = int(float(
                header_lines[0].split('(')[1].split('=')[1].split(')')[0]))

        # rows between the predictions header and the closing ruler
        predictions = self.output_.split(predictions_split_head)[1].split(
            predictions_split_foot)[0].split('\n')[1:-1]

        # each row carries two bracketed lists, the second one holds the
        # predicted label indices
        predictions = [y.split(']')[0]
                       for y in [x.split('] [')[1] for x in predictions]]
        predictions = [[a for a in [f.strip() for f in z.split(',')] if len(a) > 0]
                       for z in predictions]
        predictions = [[int(a) for a in z] for z in predictions]

        # the parsing above relies on the output produced with verbosity 5
        assert self._verbosity == 5

        self._results = sparse.lil_matrix(
            (self._instance_count, self._label_count), dtype='int')
        for row in range(self._instance_count):
            for label in predictions[row]:
                self._results[row, label] = 1

        statistics = [x for x in self.output_.split(
            '== Evaluation Info')[1].split('\n') if len(x) > 0 and '==' not in x]
        statistics = [y for y in [z.strip() for z in statistics] if ' ' in y]
        array_data = [z for z in statistics if '[' in z]
        non_array_data = [z for z in statistics if '[' not in z]

        self._statistics = {}
        for row in non_array_data:
            r = row.strip().split(' ')
            r = [z for z in r if len(z) > 0]
            r = [z.strip() for z in r]
            if len(r) < 2:
                continue
            # numeric values are stored as floats, anything else verbatim
            try:
                test_value = float(r[1])
            except ValueError:
                test_value = r[1]

            r[1] = test_value
            self._statistics[r[0]] = r[1]

        for row in array_data:
            r = row.strip().split('[')
            r = [z.strip() for z in r]
            r[1] = r[1].replace(', ', ' ').replace(
                ',', '.').replace(']', '').split(' ')
            r[1] = [x for x in r[1] if len(x) > 0]
            self._statistics[r[0]] = r[1]

    def _run_meka_command(self, args):
        """Runs java with the configured MEKA/WEKA classifier and ``args``

        The output streams of the process are stored in :code:`output_`
        and :code:`_error`.

        Parameters
        ----------
        args : list of str
            command line arguments appended after the classifier names;
            paths are expected to be wrapped in double quotes already

        Raises
        ------
        Exception
            if the process exits with a non-zero return code; the message
            contains the collected output and error streams
        """
        command_args = [
            self.java_command,
            '-cp', '"{}*"'.format(self.meka_classpath),
            self.meka_classifier,
        ]

        if self.weka_classifier is not None:
            command_args += ['-W', self.weka_classifier]

        command_args += args

        # classifier strings may carry their own options, so the command is
        # joined first and tokenized afterwards; on Windows the command
        # string is handed over as is
        meka_command = " ".join(command_args)

        if sys.platform != 'win32':
            meka_command = shlex.split(meka_command)

        process = subprocess.Popen(meka_command,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   universal_newlines=True)
        self.output_, self._error = process.communicate()

        # sys.stdout may be missing or report no encoding (e.g. when
        # redirected), in which case decode() would fail on None
        encoding = getattr(sys.stdout, 'encoding', None) or 'utf-8'
        if isinstance(self.output_, bytes):
            self.output_ = self.output_.decode(encoding)
        if isinstance(self._error, bytes):
            self._error = self._error.decode(encoding)

        if process.returncode != 0:
            raise Exception(self.output_ + self._error)