Source code for fastsk.old_utils

import os
import os.path as osp
import subprocess
import numpy as np
from fastsk import FastSK
from sklearn.svm import LinearSVC
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn import metrics
import time
import multiprocessing
import subprocess


[docs]def time_fastsk(
    g,
    m,
    t,
    data_location,
    prefix,
    approx=False,
    max_iters=None,
    timeout=None,
    skip_variance=False,
):
    """Run FastSK kernel computation. If a timeout is provided,
    it'll run as a subprocess, which will be killed when the timeout is
    reached.
    """
    fastsk = FastskRunner(prefix, data_location)

    start = time.time()
    if timeout:
        if max_iters:
            args = {
                "t": t,
                "approx": approx,
                "skip_variance": skip_variance,
                "I": max_iters,
            }
        else:
            args = {"t": t, "approx": approx, "skip_variance": skip_variance}
        p = multiprocessing.Process(
            target=fastsk.compute_train_kernel,
            name="TimeFastSK",
            args=(g, m),
            kwargs=args,
        )
        p.start()
        p.join(timeout)
        if p.is_alive():
            p.terminate()
            p.join()
    else:
        if max_iters:
            fastsk.compute_train_kernel(
                g, m, t=t, approx=approx, I=max_iters, skip_variance=skip_variance
            )
        else:
            fastsk.compute_train_kernel(
                g, m, t=t, approx=approx, skip_variance=skip_variance
            )

    end = time.time()

    return end - start


[docs]def fastsk_wrap(dataset, g, m, t, approx, I, delta, skip_variance, C, return_dict):
    fastsk = FastskRunner(dataset)
    acc, auc = fastsk.train_and_test(g, m, t, I, approx, skip_variance, C)
    return_dict["acc"] = acc
    return_dict["auc"] = auc


[docs]def train_and_test_fastsk(
    dataset, g, m, t, approx, I=50, delta=0.025, skip_variance=False, C=1, timeout=None
):
    start = time.time()
    if timeout:
        manager = multiprocessing.Manager()
        return_dict = manager.dict()
        p = multiprocessing.Process(
            target=fastsk_wrap,
            name="Train and test FastSK",
            args=(dataset, g, m, t, approx, I, delta, skip_variance, C, return_dict),
        )
        p.start()
        p.join(timeout)
        if p.is_alive():
            p.terminate()
            p.join()

        if return_dict.values() == []:
            acc, auc = 0, 0
        else:
            acc, auc = return_dict["acc"], return_dict["auc"]
    else:
        acc, auc = fastsk.train_and_test(
            g, m, t=t, I=I, approx=approx, skip_variance=skip_variance, C=C
        )
    end = time.time()

    return acc, auc, end - start


[docs]def gkm_wrap(
    g, m, t, prefix, gkm_data, gkm_exec, approx, timeout, alphabet, return_dict
):
    k = g - m
    gkm = GkmRunner(gkm_exec, gkm_data, prefix, g, k, approx, alphabet, "./temp")
    acc, auc = gkm.train_and_test(t)
    return_dict["acc"] = acc
    return_dict["auc"] = auc


[docs]def train_and_test_gkm(
    g, m, t, prefix, gkm_data, gkm_exec, approx=False, timeout=None, alphabet=None
):
    start = time.time()
    if timeout:
        manager = multiprocessing.Manager()
        return_dict = manager.dict()
        p = multiprocessing.Process(
            target=gkm_wrap,
            name="Train and test Gkm",
            args=(
                g,
                m,
                t,
                prefix,
                gkm_data,
                gkm_exec,
                approx,
                timeout,
                alphabet,
                return_dict,
            ),
        )
        p.start()
        p.join(timeout)
        if p.is_alive():
            p.terminate()
            p.join()
        if return_dict.values() == []:
            acc, auc = 0, 0
        else:
            acc, auc = return_dict["acc"], return_dict["auc"]
    else:
        k = g - m
        gkm = GkmRunner(gkm_exec, gkm_data, prefix, g, k, approx, alphabet, "./temp")
        acc, auc = gkm.train_and_test(t)
    end = time.time()
    return acc, auc, end - start


[docs]def time_gkm(
    g, m, t, prefix, gkm_data, gkm_exec, approx=False, timeout=None, alphabet=None
):
    """Run gkm-SVM2.0 kernel computation. If a timeout is provided,
    it'll be run as a subprocess, which will be killed when the timeout is
    reached.
    """
    k = g - m
    gkm = GkmRunner(gkm_exec, gkm_data, prefix, g, k, approx, alphabet, "./temp")

    start = time.time()
    if timeout:
        p = multiprocessing.Process(
            target=gkm.compute_train_kernel, name="TimeGkm", args=(t,)
        )
        p.start()
        p.join(timeout)
        if p.is_alive():
            p.terminate()
            p.join()
    else:
        gkm.compute_train_kernel(t)

    end = time.time()

    return end - start


[docs]def time_gakco(g, m, type_, prefix, timeout=None):
    gakco_exec = "./baselines/GaKCo-SVM/bin/GaKCo"
    data = "./data/"
    gakco = GaKCoRunner(gakco_exec, data, type_, prefix)

    start = time.time()
    gakco.compute_kernel(g, m, mode="train")
    end = time.time()

    return end - start


[docs]def time_blended(k1, k2, prefix, timeout=None):
    blended_exec = "./baselines/String_Kernels_Package/code/"
    data = "../data/"
    blended = BlendedSpectrumRunner(blended_exec, data, prefix)
    start = time.time()
    blended.compute_kernel(k1, k2, mode="train")
    end = time.time()

    return end - start


[docs]class Vocabulary(object):
    """A class for storing the vocabulary of a
    sequence dataset. Maps words or characters to indexes in the
    vocabulary.
    """

    def __init__(self):
        self._token2idx = {}
        self._token2idx[0] = 0
        self._size = len(self._token2idx)

[docs]    def add(self, token):
        """
        Add a token to the vocabulary.
        Args:
            token: a letter (for char-level model) or word (for word-level model)
            for which to create a mapping to an integer (the idx).
        Return:
            the index of the word. If it's already present, return its
            index. Otherwise, add it before returning the index.
        """
        if token not in self._token2idx:
            self._token2idx[token] = self._size
            self._size += 1
        return self._token2idx.get(token)

[docs]    def size(self):
        """Return the number tokens in the vocabulary."""
        return self._size

    def __str__(self):
        return str(self._token2idx)


[docs]class FastaUtility:
    def __init__(self, vocab=None):
        r"""
        Initialize a helper object for parsing datasets in FASTA-like format.

        Parameters
        ----------
        vocab :
        """
        self._vocab = Vocabulary() if vocab is None else vocab

[docs]    def read_data(self, data_file, vocab="inferred", regression=False):
        r"""Read a file with the FASTA-like format of alternating
        labels lines followed by sequences. For example:
            >1
            >AAAGAT
            >1
            >AAAAAGAT
            >0
            >AGTC

        Parameters
        ----------
        data_file : string
            The path to the sequences.
        vocab : string

        Returns
        ----------
        X : list
            list of sequences where characters have been mapped to numbers.
        Y : list
            list of labels
        """
        assert vocab.lower() in ["dna", "protein", "inferred"]
        X, Y = [], []
        with open(data_file, "r") as f:
            label_line = True
            for line in f:
                line = line.strip().lower()
                if label_line:
                    split = line.split(">")
                    assert len(split) == 2
                    if regression:
                        label = split[1]
                    else:
                        label = int(split[1])
                        assert label in [-1, 0, 1]
                    Y.append(label)
                    label_line = False
                else:
                    seq = list(line)
                    seq = [self._vocab.add(token) for token in seq]
                    X.append(seq)
                    label_line = True
            assert len(X) == len(Y)

        return X, Y

[docs]    def shortest_seq(self, data_file):
        X, Y = self.read_data(data_file)
        shortest = len(X[0])
        for x in X:
            if len(x) < shortest:
                shortest = len(x)
        return shortest


[docs]class ArabicUtility:
    def __init__(self, vocab=None):
        r"""
        Initialize a helper object for parsing datasets in the MADAR Arabic
        Dialect Identification task format.
        https://www.aclweb.org/anthology/L18-1535/

        There are 26 dialects in one of the tasks, which is
        too many for us to handle right now. Instead, we just the
        following 6 cities:
            RAB - Rabat
            BEI - Beirut
            DOH - Doha
            CAI - Cairo
            TUN - Tunis
            MSA - Modern Standard Arabic

        Parameters
        ----------
        vocab : a Vocabulary object
        """
        self._vocab = Vocabulary() if vocab is None else vocab
        self._classes = Vocabulary()
        self._labels_to_use = ["RAB", "BEI", "DOH", "CAI", "TUN", "MSA"]

[docs]    def read_data(self, data_file, vocab="inferred"):
        r"""Read a file with the following format:
            بالمناسبة ، اسمي هيروش إيجيما . MSA
            مش قادر نرقد كويس في الليل .    CAI

            That is, a sequence of Arabic characters, a tab,
            and a three-letter label/city code.

        Parameters
        ----------
        data_file : string
            The path to the sequences.
        vocab : string

        Returns
        ----------
        X : list
            list of sequences where characters have been mapped to numbers.
        Y : list
            list of numerical labels (not one-hot)
        """
        assert vocab.lower() in ["dna", "protein", "arabic", "inferred"]
        X, Y = [], []
        with open(data_file, "r", encoding="utf-8") as f:
            for line in f:
                seq, label = line.rstrip().split("\t")
                assert len(label) == 3
                if label in self._labels_to_use:
                    if len(seq) < 10:
                        continue
                    seq = list(seq)
                    seq = [self._vocab.add(token) for token in seq]
                    X.append(seq)
                    Y.append(self._classes.add(label))
            assert len(X) == len(Y)

        return X, Y


[docs]class DslUtility:
    def __init__(self, vocab=None):
        self._vocab = Vocabulary() if vocab is None else vocab
        self._classes = Vocabulary()

[docs]    def read_data(self, data_file, vocab="inferred"):
        assert vocab.lower() in ["dna", "protein", "arabic", "inferred"]
        X, Y = [], []
        with open(data_file, "r", encoding="utf-8") as f:
            for line in f:
                seq, label = line.rstrip().split("\t")
                if len(seq) < 10:
                    continue
                seq = list(seq)
                seq = [self._vocab.add(token) for token in seq]
                X.append(seq)
                Y.append(self._classes.add(label))
            assert len(X) == len(Y)

        return X, Y


[docs]class FastskRunner:
    def __init__(self, prefix, data_location="../data"):
        self.prefix = prefix
        self.train_file = osp.join(data_location, prefix + ".train.fasta")
        self.test_file = osp.join(data_location, prefix + ".test.fasta")

        reader = FastaUtility()
        self.train_seq, self.Ytrain = reader.read_data(self.train_file)
        self.test_seq, Ytest = reader.read_data(self.test_file)
        Ytest = np.array(Ytest).reshape(-1, 1)
        self.Ytest = Ytest

[docs]    def compute_train_kernel(
        self, g, m, t=20, approx=True, I=100, delta=0.025, skip_variance=False
    ):
        kernel = FastSK(
            g=g,
            m=m,
            t=t,
            approx=approx,
            max_iters=I,
            delta=delta,
            skip_variance=skip_variance,
        )
        kernel.compute_train(self.train_seq)

[docs]    def train_and_test(
        self, g, m, t, approx, I=100, delta=0.025, skip_variance=False, C=1
    ):
        kernel = FastSK(
            g=g,
            m=m,
            t=t,
            approx=approx,
            max_iters=I,
            delta=delta,
            skip_variance=skip_variance,
        )

        kernel.compute_kernel(self.train_seq, self.test_seq)
        self.Xtrain = kernel.get_train_kernel()
        self.Xtest = kernel.get_test_kernel()
        self.stdevs = kernel.get_stdevs()
        svm = LinearSVC(C=C, class_weight="balanced")
        self.clf = CalibratedClassifierCV(svm, cv=5).fit(self.Xtrain, self.Ytrain)
        acc, auc = self.evaluate_clf()
        return acc, auc

[docs]    def evaluate_clf(self):
        acc = self.clf.score(self.Xtest, self.Ytest)
        probs = self.clf.predict_proba(self.Xtest)[:, 1]
        auc = metrics.roc_auc_score(self.Ytest, probs)
        return acc, auc


[docs]class FastskRegressor:
    def __init__(self, dataset, data_location="../data"):
        self.dataset = dataset
        self.train_file = osp.join(data_location, prefix + ".train.fasta")
        self.test_file = osp.join(data_location, prefix + ".test.fasta")

        reader = FastaUtility()
        self.train_seq, self.Ytrain = reader.read_data(train_file, regression=True)
        self.test_seq, self.Ytest = reader.read_data(test_file, regression=True)
        self.Ytrain = np.array(self.Ytrain).astype(np.float)
        self.Ytest = np.array(self.Ytest).astype(np.float)

[docs]    def compute_train_kernel(
        self, g, m, t=20, approx=True, I=100, delta=0.025, skip_variance=False
    ):
        kernel = FastSK(
            g=g,
            m=m,
            t=t,
            approx=approx,
            max_iters=I,
            delta=delta,
            skip_variance=skip_variance,
        )
        kernel.compute_train(self.train_seq)

[docs]    def train_and_test(self, g, m, t, approx, I=100, delta=0.025, skip_variance=False):
        kernel = FastSK(
            g=g,
            m=m,
            t=t,
            approx=approx,
            max_iters=I,
            delta=delta,
            skip_variance=skip_variance,
        )

        kernel.compute_kernel(self.train_seq, self.test_seq)
        self.Xtest = kernel.get_test_kernel()
        self.Xtest = np.array(self.Xtest).reshape(len(self.Xtest), -1)
        self.Xtrain = kernel.get_train_kernel()
        self.Xtrain = np.array(self.Xtrain).reshape(len(self.Xtrain), -1)

        # Can replace Lasso with alternative regression approaches such as SVR
        model = LassoCV(cv=5, n_jobs=t, random_state=293).fit(self.Xtrain, self.Ytrain)
        r2 = model.score(self.Xtest, self.Ytest)
        return r2


[docs]class GkmRunner:
    def __init__(
        self,
        exec_location,
        data_locaton,
        dataset,
        g,
        k,
        approx=False,
        alphabet=None,
        outdir="./temp",
    ):
        self.exec_location = exec_location
        self.dir = data_locaton
        self.dataset = dataset
        self.outdir = outdir
        self.g, self.k, self.alphabet = g, k, alphabet

        """Important note:
        gkmSVM's -d parameter (max_m) is *not* the same as our
        m = g - k parameter. It's actually the upper bound of the
        summation shown in equation 3 in the
        2014 gkmSVM paper (ghandi2014enhanced)."""
        if approx:
            """By default, their approximation algorithm truncates the
            summation from eq. 3 to a value of 3 mismatches.
            """
            self.max_m = 3
        else:
            """If using the exact algo, the summation runs from
            0 to l (their l is our g)
            """
            self.max_m = self.g

        ## Data files
        self.train_pos_file = osp.join(self.dir, self.dataset + ".train.pos.fasta")
        self.train_neg_file = osp.join(self.dir, self.dataset + ".train.neg.fasta")
        self.test_pos_file = osp.join(self.dir, self.dataset + ".test.pos.fasta")
        self.test_neg_file = osp.join(self.dir, self.dataset + ".test.neg.fasta")
        self.train_test_pos_file = osp.join(
            self.outdir, self.dataset + ".train_test.pos.fasta"
        )
        self.train_test_neg_file = osp.join(
            self.outdir, self.dataset + ".train_test.neg.fasta"
        )

        ## Temp files that gkm creates
        if not osp.exists(self.outdir):
            os.makedirs(self.outdir)
        self.kernel_file = osp.join(self.outdir, self.dataset + "_kernel.out")
        self.svm_file_prefix = osp.join(self.outdir, "svmtrain")
        self.svmalpha = self.svm_file_prefix + "_svalpha.out"
        self.svseq = self.svm_file_prefix + "_svseq.fa"
        self.pos_pred_file = osp.join(self.outdir, self.dataset + ".preds.pos.out")
        self.neg_pred_file = osp.join(self.outdir, self.dataset + ".preds.neg.out")

[docs]    def compute_train_kernel(self, t):
        execute = osp.join(self.exec_location, "gkmsvm_kernel")
        command = [
            execute,
            "-a",
            str(2),
            "-l",
            str(self.g),
            "-k",
            str(self.k),
            "-d",
            str(self.max_m),
            "-T",
            str(t),
            "-R",
        ]
        if self.alphabet is not None:
            command += ["-A", self.alphabet]
        command += [self.train_pos_file, self.train_neg_file, self.kernel_file]
        print(" ".join(command))
        output = subprocess.check_output(command)

[docs]    def train_svm(self):
        execute = osp.join(self.exec_location, "gkmsvm_train")
        command = [
            execute,
            self.kernel_file,
            self.train_pos_file,
            self.train_neg_file,
            self.svm_file_prefix,
        ]
        print(" ".join(command))
        output = subprocess.check_output(command)

[docs]    def classify(self):
        ## pos predictions
        execute = osp.join(self.exec_location, "gkmsvm_classify")
        command = [
            execute,
            "-l",
            str(self.g),
            "-k",
            str(self.k),
            "-d",
            str(self.max_m),
            "-R",
        ]
        if self.alphabet is not None:
            command += ["-A", self.alphabet]
        command += [self.test_pos_file, self.svseq, self.svmalpha, self.pos_pred_file]
        print(" ".join(command))
        subprocess.check_output(command)

        # get neg preds
        command = [
            execute,
            "-l",
            str(self.g),
            "-k",
            str(self.k),
            "-d",
            str(self.max_m),
            "-R",
        ]
        if self.alphabet is not None:
            command += ["-A", self.alphabet]

        command += [self.test_neg_file, self.svseq, self.svmalpha, self.neg_pred_file]
        print(" ".join(command))
        subprocess.check_output(command)

[docs]    def evaluate(self):
        pos_preds = self.read_preds(self.pos_pred_file)
        neg_preds = self.read_preds(self.neg_pred_file)

        print("Computing accuracy...")
        acc = self.get_accuracy(pos_preds, neg_preds)
        print("Computing AUC...")
        auc = self.get_auc(pos_preds, neg_preds)
        print("Accuracy = {}, AUC = {}".format(acc, auc))
        return acc, auc

[docs]    def train_and_test(self, t=20):
        self.compute_train_kernel(t)
        self.train_svm()
        self.classify()
        acc, auc = self.evaluate()
        return acc, auc

[docs]    def read_preds(self, file):
        preds = []
        with open(file, "r") as f:
            for line in f:
                line = line.split()
                assert len(line) == 2
                preds.append(float(line[1]))
        return preds

[docs]    def get_accuracy(self, pos_preds, neg_preds):
        accuracy = 0
        num_correct = 0
        num_pred = len(pos_preds) + len(neg_preds)
        for pred in pos_preds:
            if pred > 0:
                num_correct += 1
        for pred in neg_preds:
            if pred <= 0:
                num_correct += 1
        return num_correct / num_pred

[docs]    def get_auc(self, pos_preds, neg_preds):
        ytrue = [1 for _ in pos_preds] + [-1 for _ in neg_preds]
        yscore = [score for score in pos_preds] + [score for score in neg_preds]
        auc = metrics.roc_auc_score(ytrue, yscore)
        return auc


[docs]class GaKCoRunner:
    def __init__(self, exec_location, data_locaton, type_, prefix, outdir="./temp"):
        self.exec_location = exec_location
        self.data_locaton = data_locaton
        self.train_file = osp.join("../data", prefix + ".train.fasta")
        self.test_file = osp.join("../data", prefix + ".test.fasta")
        self.train_test_file = osp.join(outdir, prefix + "_train_test.fasta")
        assert type_ in ["dna", "protein"]
        if type_ == "protein":
            self.dict_file = osp.join(data_locaton, "full_prot.dict.txt")
        else:
            self.dict_file = osp.join(data_locaton, "dna.dictionary.txt")
        self.labels_file = osp.join(outdir, "labels.txt")
        self.kernel_file = osp.join(outdir, "kernel.txt")
        self.num_train, self.num_test = 0, 0

[docs]    def compute_kernel(self, g, m, mode="train", t=1):
        self.g = g
        self.m = m
        self.k = g - m

        assert mode in ["train", "test", "train_test"]
        if mode == "train":
            data_file = self.train_file
        elif mode == "test":
            data_file = self.test_file
        else:
            data_file = self.train_test_file

        command = [
            self.exec_location,
            "-g",
            str(self.g),
            "-k",
            str(self.k),
            data_file,
            self.dict_file,
            self.labels_file,
            self.kernel_file,
        ]

        output = subprocess.check_output(command)

[docs]    def train_and_test(self, g, m, C=1):
        self.combine_train_and_test()
        self.compute_kernel(g, m, mode="train_test")

        self.Xtrain, self.Xtest = self.read_kernel()
        self.Ytrain, self.Ytest = self.read_labels()

        svm = LinearSVC(C=C)
        self.clf = CalibratedClassifierCV(svm, cv=5).fit(self.Xtrain, self.Ytrain)
        acc, auc = self.evaluate_clf()
        return acc, auc

[docs]    def evaluate_clf(self):
        acc = self.clf.score(self.Xtest, self.Ytest)
        probs = self.clf.predict_proba(self.Xtest)[:, 1]
        auc = metrics.roc_auc_score(self.Ytest, probs)
        return acc, auc

[docs]    def combine_train_and_test(self):
        lines = []
        with open(self.train_file, "r") as f:
            for line in f:
                if line[0] == ">":
                    self.num_train += 1
                lines.append(line)
        with open(self.test_file, "r") as f:
            for line in f:
                if line[0] == ">":
                    self.num_test += 1
                lines.append(line)
        with open(self.train_test_file, "w+") as f:
            f.writelines(lines)

[docs]    def read_labels(self):
        Ytrain, Ytest = [], []
        with open(self.train_file, "r") as f:
            for line in f:
                if line[0] == ">":
                    Ytrain.append(line.rstrip().split(">")[1])
        with open(self.test_file, "r") as f:
            for line in f:
                if line[0] == ">":
                    Ytest.append(line.rstrip().split(">")[1])

        return Ytrain, Ytest

[docs]    def read_kernel(self):
        Xtrain, Xtest = [], []
        with open(self.kernel_file, "r") as f:
            count = 0
            for line in f:
                x = [float(item.split(":")[1]) for item in line.rstrip().split(" ")][
                    : self.num_train
                ]
                if count < self.num_train:
                    Xtrain.append(x)
                else:
                    Xtest.append(x)
                count += 1

        return Xtrain, Xtest

[docs]    def get_labels(self):
        pass


[docs]class BlendedSpectrumRunner:
    def __init__(self, exec_dir, data_locaton, prefix, outdir="./temp"):
        self.exec_dir = exec_dir
        self.train_fasta = osp.join(data_locaton, prefix + ".train.fasta")
        self.test_fasta = osp.join(data_locaton, prefix + ".test.fasta")
        self.outdir = outdir
        if not osp.exists(self.outdir):
            os.makedirs(self.outdir)
        self.train_seq = osp.join(self.outdir, prefix + "_spectrum.train.txt")
        self.test_seq = osp.join(self.outdir, prefix + "_spectrum.test.txt")
        self.train_and_test_seq = osp.join(
            self.outdir, prefix + "_.train-tst.spectrum.txt"
        )
        self.num_train, self.num_test = 0, 0
        self.write_seq(self.train_fasta, mode="train")

        self.kernel_file = osp.join(outdir, "kernel.txt")

[docs]    def combine_train_and_test(self):
        Xtrain, Xtest, self.Ytrain, self.Ytest = [], [], [], []
        with open(self.train_fasta, "r") as f:
            label_line = True
            for line in f:
                line = line.rstrip()
                if label_line:
                    self.num_train += 1
                    label = line.split(">")[1]
                    self.Ytrain.append(label)
                    label_line = False
                else:
                    Xtrain.append(line.lower())
                    label_line = True
        with open(self.test_fasta, "r") as f:
            label_line = True
            for line in f:
                line = line.rstrip()
                if label_line:
                    self.num_test += 1
                    label = line.split(">")[1]
                    self.Ytest.append(label)
                    label_line = False
                else:
                    Xtest.append(line.lower())
                    label_line = True
        X = Xtrain + Xtest
        with open(self.train_and_test, "w+") as f:
            for x in X:
                f.write(x + "\n")

[docs]    def write_seq(self, datafile, mode="train"):
        assert mode in ["train", "test"]
        if mode == "train":
            outfile = self.train_seq
        else:
            outfile = self.test_seq
        X, Y = [], []
        with open(datafile, "r") as f:
            label_line = True
            for line in f:
                line = line.rstrip()
                if label_line:
                    label = line.split(">")[1]
                    Y.append(label)
                    label_line = False
                else:
                    X.append(line.lower())
                    label_line = True

        with open(outfile, "w+") as f:
            for x in X:
                f.write(x + "\n")

[docs]    def compute_kernel(self, k1=3, k2=5, mode="train_and_test"):
        self.k1, self.k2 = k1, k2

        datafile = self.train_seq
        assert mode in ["train", "test", "train_and_test"]
        if mode == "train":
            datafile = self.train_seq
        elif mode == "test":
            datafile = self.test_seq
        else:
            datafile = self.train_and_test_seq

        command = [
            "java",
            "-cp",
            self.exec_dir,
            "ComputeStringKernel",
            "spectrum",
            str(self.k1),
            str(self.k2),
            datafile,
            self.kernel_file,
        ]
        output = subprocess.check_output(command)

[docs]    def read_kernel(self):
        Xtrain, Xtest = [], []
        with open(self.kernel_file, "r") as f:
            count = 0
            for line in f:
                x = [float(item) for item in line.rstrip().split(" ")][: self.num_train]
                if count < self.num_train:
                    Xtrain.append(x)
                else:
                    Xtest.append(x)
                count += 1

        return Xtrain, Xtest

[docs]    def train_and_test(self, k1=3, k2=5, C=1):
        self.combine_train_and_test()
        self.compute_kernel(k1, k2, mode="train_and_test")

        self.Xtrain, self.Xtest = self.read_kernel()

        svm = LinearSVC(C=C, class_weight="balanced", max_iter=3000)
        self.clf = CalibratedClassifierCV(svm, cv=5).fit(self.Xtrain, self.Ytrain)
        acc, auc = self.evaluate_clf()
        return acc, auc

[docs]    def evaluate_clf(self):
        acc = self.clf.score(self.Xtest, self.Ytest)
        probs = self.clf.predict_proba(self.Xtest)[:, 1]
        auc = metrics.roc_auc_score(self.Ytest, probs)
        return acc, auc