Source code for fastsk.old_utils

import os
import os.path as osp
import subprocess
import numpy as np
from fastsk import FastSK
from sklearn.svm import LinearSVC
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn import metrics
import time
import multiprocessing


def time_fastsk(
    g,
    m,
    t,
    data_location,
    prefix,
    approx=False,
    max_iters=None,
    timeout=None,
    skip_variance=False,
):
    """Run FastSK kernel computation. If a timeout is provided,
    it'll run as a subprocess, which will be killed when the
    timeout is reached.
    """
    fastsk = FastskRunner(prefix, data_location)
    start = time.time()
    if timeout:
        if max_iters:
            args = {
                "t": t,
                "approx": approx,
                "skip_variance": skip_variance,
                "I": max_iters,
            }
        else:
            args = {"t": t, "approx": approx, "skip_variance": skip_variance}
        p = multiprocessing.Process(
            target=fastsk.compute_train_kernel,
            name="TimeFastSK",
            args=(g, m),
            kwargs=args,
        )
        p.start()
        p.join(timeout)
        if p.is_alive():
            p.terminate()
            p.join()
    else:
        if max_iters:
            fastsk.compute_train_kernel(
                g, m, t=t, approx=approx, I=max_iters, skip_variance=skip_variance
            )
        else:
            fastsk.compute_train_kernel(
                g, m, t=t, approx=approx, skip_variance=skip_variance
            )
    end = time.time()
    return end - start
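# Example (a hedged sketch, not part of the original module; the dataset name
# "EP300" and the ./data location are assumptions):
#
#     elapsed = time_fastsk(
#         g=10, m=6, t=20, data_location="./data", prefix="EP300",
#         approx=False, timeout=1800,
#     )
#     print("exact kernel time: {:.1f}s".format(elapsed))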
def fastsk_wrap(dataset, g, m, t, approx, I, delta, skip_variance, C, return_dict):
    fastsk = FastskRunner(dataset)
    # Pass keyword arguments so they line up with train_and_test's signature
    # (g, m, t, approx, I, delta, skip_variance, C).
    acc, auc = fastsk.train_and_test(
        g, m, t, approx, I=I, delta=delta, skip_variance=skip_variance, C=C
    )
    return_dict["acc"] = acc
    return_dict["auc"] = auc
def train_and_test_fastsk(
    dataset, g, m, t, approx, I=50, delta=0.025, skip_variance=False, C=1, timeout=None
):
    start = time.time()
    if timeout:
        manager = multiprocessing.Manager()
        return_dict = manager.dict()
        p = multiprocessing.Process(
            target=fastsk_wrap,
            name="Train and test FastSK",
            args=(dataset, g, m, t, approx, I, delta, skip_variance, C, return_dict),
        )
        p.start()
        p.join(timeout)
        if p.is_alive():
            p.terminate()
            p.join()
        if not return_dict.values():
            acc, auc = 0, 0
        else:
            acc, auc = return_dict["acc"], return_dict["auc"]
    else:
        fastsk = FastskRunner(dataset)
        acc, auc = fastsk.train_and_test(
            g, m, t=t, I=I, approx=approx, skip_variance=skip_variance, C=C
        )
    end = time.time()
    return acc, auc, end - start
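# Example (sketch; the dataset name is hypothetical). Returns accuracy, AUC,
# and wall-clock time; acc and auc come back as 0 if the timeout was hit:
#
#     acc, auc, elapsed = train_and_test_fastsk(
#         "EP300", g=10, m=6, t=20, approx=True, I=50, timeout=3600,
#     )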
def gkm_wrap(
    g, m, t, prefix, gkm_data, gkm_exec, approx, timeout, alphabet, return_dict
):
    k = g - m
    gkm = GkmRunner(gkm_exec, gkm_data, prefix, g, k, approx, alphabet, "./temp")
    acc, auc = gkm.train_and_test(t)
    return_dict["acc"] = acc
    return_dict["auc"] = auc
def train_and_test_gkm(
    g, m, t, prefix, gkm_data, gkm_exec, approx=False, timeout=None, alphabet=None
):
    start = time.time()
    if timeout:
        manager = multiprocessing.Manager()
        return_dict = manager.dict()
        p = multiprocessing.Process(
            target=gkm_wrap,
            name="Train and test Gkm",
            args=(
                g,
                m,
                t,
                prefix,
                gkm_data,
                gkm_exec,
                approx,
                timeout,
                alphabet,
                return_dict,
            ),
        )
        p.start()
        p.join(timeout)
        if p.is_alive():
            p.terminate()
            p.join()
        if not return_dict.values():
            acc, auc = 0, 0
        else:
            acc, auc = return_dict["acc"], return_dict["auc"]
    else:
        k = g - m
        gkm = GkmRunner(gkm_exec, gkm_data, prefix, g, k, approx, alphabet, "./temp")
        acc, auc = gkm.train_and_test(t)
    end = time.time()
    return acc, auc, end - start
def time_gkm(
    g, m, t, prefix, gkm_data, gkm_exec, approx=False, timeout=None, alphabet=None
):
    """Run gkm-SVM2.0 kernel computation. If a timeout is provided,
    it'll be run as a subprocess, which will be killed when the
    timeout is reached.
    """
    k = g - m
    gkm = GkmRunner(gkm_exec, gkm_data, prefix, g, k, approx, alphabet, "./temp")
    start = time.time()
    if timeout:
        p = multiprocessing.Process(
            target=gkm.compute_train_kernel, name="TimeGkm", args=(t,)
        )
        p.start()
        p.join(timeout)
        if p.is_alive():
            p.terminate()
            p.join()
    else:
        gkm.compute_train_kernel(t)
    end = time.time()
    return end - start
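# Example (sketch; the gkm_exec and gkm_data paths are assumptions and must
# point at a compiled gkm-SVM 2.0 build and its .pos/.neg FASTA splits):
#
#     elapsed = time_gkm(
#         g=10, m=6, t=20, prefix="EP300",
#         gkm_data="./data/gkm", gkm_exec="./baselines/gkmsvm",
#         approx=True, timeout=1800,
#     )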
def time_gakco(g, m, type_, prefix, timeout=None):
    gakco_exec = "./baselines/GaKCo-SVM/bin/GaKCo"
    data = "./data/"
    gakco = GaKCoRunner(gakco_exec, data, type_, prefix)
    start = time.time()
    gakco.compute_kernel(g, m, mode="train")
    end = time.time()
    return end - start
def time_blended(k1, k2, prefix, timeout=None):
    blended_exec = "./baselines/String_Kernels_Package/code/"
    data = "../data/"
    blended = BlendedSpectrumRunner(blended_exec, data, prefix)
    start = time.time()
    blended.compute_kernel(k1, k2, mode="train")
    end = time.time()
    return end - start
class Vocabulary(object):
    """A class for storing the vocabulary of a sequence dataset.
    Maps words or characters to indexes in the vocabulary.
    """

    def __init__(self):
        self._token2idx = {}
        self._token2idx[0] = 0  # index 0 is reserved; the token 0 maps to itself
        self._size = len(self._token2idx)

    def add(self, token):
        """Add a token to the vocabulary.

        Args:
            token: a letter (for char-level model) or word (for word-level
                model) for which to create a mapping to an integer (the idx).

        Return:
            the index of the token. If it's already present, return its
            index. Otherwise, add it before returning the index.
        """
        if token not in self._token2idx:
            self._token2idx[token] = self._size
            self._size += 1
        return self._token2idx.get(token)

    def size(self):
        """Return the number of tokens in the vocabulary."""
        return self._size

    def __str__(self):
        return str(self._token2idx)
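# Example (sketch): indexes are handed out in first-seen order, after the
# reserved index 0.
#
#     vocab = Vocabulary()
#     vocab.add("a")  # -> 1
#     vocab.add("g")  # -> 2
#     vocab.add("a")  # -> 1 (already present)
#     vocab.size()    # -> 3 (counts the reserved entry)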
class FastaUtility:
    def __init__(self, vocab=None):
        r"""Initialize a helper object for parsing datasets in FASTA-like
        format.

        Parameters
        ----------
        vocab : Vocabulary, optional
            An existing Vocabulary to reuse; a fresh one is created if None.
        """
        self._vocab = Vocabulary() if vocab is None else vocab

    def read_data(self, data_file, vocab="inferred", regression=False):
        r"""Read a file with the FASTA-like format of alternating label
        lines followed by sequences. For example:

            >1
            >AAAGAT
            >1
            >AAAAAGAT
            >0
            >AGTC

        Parameters
        ----------
        data_file : string
            The path to the sequences.
        vocab : string
            One of 'dna', 'protein', or 'inferred'.

        Returns
        ----------
        X : list
            list of sequences where characters have been mapped to numbers.
        Y : list
            list of labels
        """
        assert vocab.lower() in ["dna", "protein", "inferred"]
        X, Y = [], []
        with open(data_file, "r") as f:
            label_line = True
            for line in f:
                line = line.strip().lower()
                if label_line:
                    split = line.split(">")
                    assert len(split) == 2
                    if regression:
                        label = split[1]
                    else:
                        label = int(split[1])
                        assert label in [-1, 0, 1]
                    Y.append(label)
                    label_line = False
                else:
                    seq = list(line)
                    seq = [self._vocab.add(token) for token in seq]
                    X.append(seq)
                    label_line = True
        assert len(X) == len(Y)
        return X, Y

    def shortest_seq(self, data_file):
        X, Y = self.read_data(data_file)
        shortest = len(X[0])
        for x in X:
            if len(x) < shortest:
                shortest = len(x)
        return shortest
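# Example (sketch; the path is hypothetical):
#
#     reader = FastaUtility()
#     X, Y = reader.read_data("./data/EP300.train.fasta")
#     # X[i] is a list of integer-encoded characters, Y[i] its integer label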
class ArabicUtility:
    def __init__(self, vocab=None):
        r"""Initialize a helper object for parsing datasets in the MADAR
        Arabic Dialect Identification task format.
        https://www.aclweb.org/anthology/L18-1535/

        There are 26 dialects in one of the tasks, which is too many for us
        to handle right now. Instead, we just use the following 6 cities:

            RAB - Rabat
            BEI - Beirut
            DOH - Doha
            CAI - Cairo
            TUN - Tunis
            MSA - Modern Standard Arabic

        Parameters
        ----------
        vocab : a Vocabulary object
        """
        self._vocab = Vocabulary() if vocab is None else vocab
        self._classes = Vocabulary()
        self._labels_to_use = ["RAB", "BEI", "DOH", "CAI", "TUN", "MSA"]

    def read_data(self, data_file, vocab="inferred"):
        r"""Read a file with the following format:

            بالمناسبة ، اسمي هيروش إيجيما . MSA
            مش قادر نرقد كويس في الليل . CAI

        That is, a sequence of Arabic characters, a tab, and a three-letter
        label/city code.

        Parameters
        ----------
        data_file : string
            The path to the sequences.
        vocab : string
            One of 'dna', 'protein', 'arabic', or 'inferred'.

        Returns
        ----------
        X : list
            list of sequences where characters have been mapped to numbers.
        Y : list
            list of numerical labels (not one-hot)
        """
        assert vocab.lower() in ["dna", "protein", "arabic", "inferred"]
        X, Y = [], []
        with open(data_file, "r", encoding="utf-8") as f:
            for line in f:
                seq, label = line.rstrip().split("\t")
                assert len(label) == 3
                if label in self._labels_to_use:
                    if len(seq) < 10:
                        continue
                    seq = list(seq)
                    seq = [self._vocab.add(token) for token in seq]
                    X.append(seq)
                    Y.append(self._classes.add(label))
        assert len(X) == len(Y)
        return X, Y
class DslUtility:
    def __init__(self, vocab=None):
        self._vocab = Vocabulary() if vocab is None else vocab
        self._classes = Vocabulary()

    def read_data(self, data_file, vocab="inferred"):
        assert vocab.lower() in ["dna", "protein", "arabic", "inferred"]
        X, Y = [], []
        with open(data_file, "r", encoding="utf-8") as f:
            for line in f:
                seq, label = line.rstrip().split("\t")
                if len(seq) < 10:
                    continue
                seq = list(seq)
                seq = [self._vocab.add(token) for token in seq]
                X.append(seq)
                Y.append(self._classes.add(label))
        assert len(X) == len(Y)
        return X, Y
class FastskRunner:
    def __init__(self, prefix, data_location="../data"):
        self.prefix = prefix
        self.train_file = osp.join(data_location, prefix + ".train.fasta")
        self.test_file = osp.join(data_location, prefix + ".test.fasta")

        reader = FastaUtility()
        self.train_seq, self.Ytrain = reader.read_data(self.train_file)
        self.test_seq, Ytest = reader.read_data(self.test_file)
        self.Ytest = np.array(Ytest).reshape(-1, 1)

    def compute_train_kernel(
        self, g, m, t=20, approx=True, I=100, delta=0.025, skip_variance=False
    ):
        kernel = FastSK(
            g=g,
            m=m,
            t=t,
            approx=approx,
            max_iters=I,
            delta=delta,
            skip_variance=skip_variance,
        )
        kernel.compute_train(self.train_seq)

    def train_and_test(
        self, g, m, t, approx, I=100, delta=0.025, skip_variance=False, C=1
    ):
        kernel = FastSK(
            g=g,
            m=m,
            t=t,
            approx=approx,
            max_iters=I,
            delta=delta,
            skip_variance=skip_variance,
        )
        kernel.compute_kernel(self.train_seq, self.test_seq)
        self.Xtrain = kernel.get_train_kernel()
        self.Xtest = kernel.get_test_kernel()
        self.stdevs = kernel.get_stdevs()
        svm = LinearSVC(C=C, class_weight="balanced")
        self.clf = CalibratedClassifierCV(svm, cv=5).fit(self.Xtrain, self.Ytrain)
        acc, auc = self.evaluate_clf()
        return acc, auc

    def evaluate_clf(self):
        acc = self.clf.score(self.Xtest, self.Ytest)
        probs = self.clf.predict_proba(self.Xtest)[:, 1]
        auc = metrics.roc_auc_score(self.Ytest, probs)
        return acc, auc
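# Example (sketch; assumes ../data/EP300.train.fasta and
# ../data/EP300.test.fasta exist):
#
#     runner = FastskRunner("EP300", data_location="../data")
#     acc, auc = runner.train_and_test(g=10, m=6, t=20, approx=True)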
class FastskRegressor:
    def __init__(self, dataset, data_location="../data"):
        self.dataset = dataset
        self.train_file = osp.join(data_location, dataset + ".train.fasta")
        self.test_file = osp.join(data_location, dataset + ".test.fasta")

        reader = FastaUtility()
        self.train_seq, self.Ytrain = reader.read_data(
            self.train_file, regression=True
        )
        self.test_seq, self.Ytest = reader.read_data(self.test_file, regression=True)
        self.Ytrain = np.array(self.Ytrain).astype(float)
        self.Ytest = np.array(self.Ytest).astype(float)

    def compute_train_kernel(
        self, g, m, t=20, approx=True, I=100, delta=0.025, skip_variance=False
    ):
        kernel = FastSK(
            g=g,
            m=m,
            t=t,
            approx=approx,
            max_iters=I,
            delta=delta,
            skip_variance=skip_variance,
        )
        kernel.compute_train(self.train_seq)

    def train_and_test(self, g, m, t, approx, I=100, delta=0.025, skip_variance=False):
        kernel = FastSK(
            g=g,
            m=m,
            t=t,
            approx=approx,
            max_iters=I,
            delta=delta,
            skip_variance=skip_variance,
        )
        kernel.compute_kernel(self.train_seq, self.test_seq)
        self.Xtest = kernel.get_test_kernel()
        self.Xtest = np.array(self.Xtest).reshape(len(self.Xtest), -1)
        self.Xtrain = kernel.get_train_kernel()
        self.Xtrain = np.array(self.Xtrain).reshape(len(self.Xtrain), -1)

        # Can replace Lasso with alternative regression approaches such as SVR
        model = LassoCV(cv=5, n_jobs=t, random_state=293).fit(self.Xtrain, self.Ytrain)
        r2 = model.score(self.Xtest, self.Ytest)
        return r2
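# Example (sketch; "solubility" is a hypothetical dataset whose label lines
# hold real values, read with regression=True):
#
#     reg = FastskRegressor("solubility", data_location="../data")
#     r2 = reg.train_and_test(g=8, m=4, t=20, approx=True)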
class GkmRunner:
    def __init__(
        self,
        exec_location,
        data_location,
        dataset,
        g,
        k,
        approx=False,
        alphabet=None,
        outdir="./temp",
    ):
        self.exec_location = exec_location
        self.dir = data_location
        self.dataset = dataset
        self.outdir = outdir
        self.g, self.k, self.alphabet = g, k, alphabet

        # Important note: gkmSVM's -d parameter (max_m) is *not* the same as
        # our m = g - k parameter. It's actually the upper bound of the
        # summation shown in equation 3 of the 2014 gkmSVM paper
        # (ghandi2014enhanced).
        if approx:
            # By default, their approximation algorithm truncates the
            # summation from eq. 3 at 3 mismatches.
            self.max_m = 3
        else:
            # If using the exact algo, the summation runs from 0 to l
            # (their l is our g).
            self.max_m = self.g

        ## Data files
        self.train_pos_file = osp.join(self.dir, self.dataset + ".train.pos.fasta")
        self.train_neg_file = osp.join(self.dir, self.dataset + ".train.neg.fasta")
        self.test_pos_file = osp.join(self.dir, self.dataset + ".test.pos.fasta")
        self.test_neg_file = osp.join(self.dir, self.dataset + ".test.neg.fasta")
        self.train_test_pos_file = osp.join(
            self.outdir, self.dataset + ".train_test.pos.fasta"
        )
        self.train_test_neg_file = osp.join(
            self.outdir, self.dataset + ".train_test.neg.fasta"
        )

        ## Temp files that gkm creates
        if not osp.exists(self.outdir):
            os.makedirs(self.outdir)
        self.kernel_file = osp.join(self.outdir, self.dataset + "_kernel.out")
        self.svm_file_prefix = osp.join(self.outdir, "svmtrain")
        self.svmalpha = self.svm_file_prefix + "_svalpha.out"
        self.svseq = self.svm_file_prefix + "_svseq.fa"
        self.pos_pred_file = osp.join(self.outdir, self.dataset + ".preds.pos.out")
        self.neg_pred_file = osp.join(self.outdir, self.dataset + ".preds.neg.out")

    def compute_train_kernel(self, t):
        execute = osp.join(self.exec_location, "gkmsvm_kernel")
        command = [
            execute,
            "-a",
            str(2),
            "-l",
            str(self.g),
            "-k",
            str(self.k),
            "-d",
            str(self.max_m),
            "-T",
            str(t),
            "-R",
        ]
        if self.alphabet is not None:
            command += ["-A", self.alphabet]
        command += [self.train_pos_file, self.train_neg_file, self.kernel_file]
        print(" ".join(command))
        output = subprocess.check_output(command)

    def train_svm(self):
        execute = osp.join(self.exec_location, "gkmsvm_train")
        command = [
            execute,
            self.kernel_file,
            self.train_pos_file,
            self.train_neg_file,
            self.svm_file_prefix,
        ]
        print(" ".join(command))
        output = subprocess.check_output(command)

    def classify(self):
        ## pos predictions
        execute = osp.join(self.exec_location, "gkmsvm_classify")
        command = [
            execute,
            "-l",
            str(self.g),
            "-k",
            str(self.k),
            "-d",
            str(self.max_m),
            "-R",
        ]
        if self.alphabet is not None:
            command += ["-A", self.alphabet]
        command += [self.test_pos_file, self.svseq, self.svmalpha, self.pos_pred_file]
        print(" ".join(command))
        subprocess.check_output(command)

        # get neg preds
        command = [
            execute,
            "-l",
            str(self.g),
            "-k",
            str(self.k),
            "-d",
            str(self.max_m),
            "-R",
        ]
        if self.alphabet is not None:
            command += ["-A", self.alphabet]
        command += [self.test_neg_file, self.svseq, self.svmalpha, self.neg_pred_file]
        print(" ".join(command))
        subprocess.check_output(command)

    def evaluate(self):
        pos_preds = self.read_preds(self.pos_pred_file)
        neg_preds = self.read_preds(self.neg_pred_file)
        print("Computing accuracy...")
        acc = self.get_accuracy(pos_preds, neg_preds)
        print("Computing AUC...")
        auc = self.get_auc(pos_preds, neg_preds)
        print("Accuracy = {}, AUC = {}".format(acc, auc))
        return acc, auc

    def train_and_test(self, t=20):
        self.compute_train_kernel(t)
        self.train_svm()
        self.classify()
        acc, auc = self.evaluate()
        return acc, auc

    def read_preds(self, file):
        preds = []
        with open(file, "r") as f:
            for line in f:
                line = line.split()
                assert len(line) == 2
                preds.append(float(line[1]))
        return preds

    def get_accuracy(self, pos_preds, neg_preds):
        num_correct = 0
        num_pred = len(pos_preds) + len(neg_preds)
        for pred in pos_preds:
            if pred > 0:
                num_correct += 1
        for pred in neg_preds:
            if pred <= 0:
                num_correct += 1
        return num_correct / num_pred

    def get_auc(self, pos_preds, neg_preds):
        ytrue = [1 for _ in pos_preds] + [-1 for _ in neg_preds]
        yscore = pos_preds + neg_preds
        auc = metrics.roc_auc_score(ytrue, yscore)
        return auc
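# Example (sketch; exec and data paths are assumptions; the exec directory
# must contain the compiled gkmsvm_kernel, gkmsvm_train, and gkmsvm_classify
# binaries):
#
#     gkm = GkmRunner(
#         "./baselines/gkmsvm", "./data/gkm", "EP300", g=10, k=6, approx=False
#     )
#     acc, auc = gkm.train_and_test(t=20)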
class GaKCoRunner:
    def __init__(self, exec_location, data_location, type_, prefix, outdir="./temp"):
        self.exec_location = exec_location
        self.data_location = data_location
        # Note: the train/test FASTA paths are hardcoded relative to ../data.
        self.train_file = osp.join("../data", prefix + ".train.fasta")
        self.test_file = osp.join("../data", prefix + ".test.fasta")
        self.train_test_file = osp.join(outdir, prefix + "_train_test.fasta")
        assert type_ in ["dna", "protein"]
        if type_ == "protein":
            self.dict_file = osp.join(data_location, "full_prot.dict.txt")
        else:
            self.dict_file = osp.join(data_location, "dna.dictionary.txt")
        self.labels_file = osp.join(outdir, "labels.txt")
        self.kernel_file = osp.join(outdir, "kernel.txt")
        self.num_train, self.num_test = 0, 0

    def compute_kernel(self, g, m, mode="train", t=1):
        self.g = g
        self.m = m
        self.k = g - m
        assert mode in ["train", "test", "train_test"]
        if mode == "train":
            data_file = self.train_file
        elif mode == "test":
            data_file = self.test_file
        else:
            data_file = self.train_test_file
        command = [
            self.exec_location,
            "-g",
            str(self.g),
            "-k",
            str(self.k),
            data_file,
            self.dict_file,
            self.labels_file,
            self.kernel_file,
        ]
        output = subprocess.check_output(command)

    def train_and_test(self, g, m, C=1):
        self.combine_train_and_test()
        self.compute_kernel(g, m, mode="train_test")
        self.Xtrain, self.Xtest = self.read_kernel()
        self.Ytrain, self.Ytest = self.read_labels()
        svm = LinearSVC(C=C)
        self.clf = CalibratedClassifierCV(svm, cv=5).fit(self.Xtrain, self.Ytrain)
        acc, auc = self.evaluate_clf()
        return acc, auc

    def evaluate_clf(self):
        acc = self.clf.score(self.Xtest, self.Ytest)
        probs = self.clf.predict_proba(self.Xtest)[:, 1]
        auc = metrics.roc_auc_score(self.Ytest, probs)
        return acc, auc

    def combine_train_and_test(self):
        lines = []
        with open(self.train_file, "r") as f:
            for line in f:
                if line[0] == ">":
                    self.num_train += 1
                lines.append(line)
        with open(self.test_file, "r") as f:
            for line in f:
                if line[0] == ">":
                    self.num_test += 1
                lines.append(line)
        with open(self.train_test_file, "w+") as f:
            f.writelines(lines)

    def read_labels(self):
        # Parse labels as ints so roc_auc_score can handle them directly.
        Ytrain, Ytest = [], []
        with open(self.train_file, "r") as f:
            for line in f:
                if line[0] == ">":
                    Ytrain.append(int(line.rstrip().split(">")[1]))
        with open(self.test_file, "r") as f:
            for line in f:
                if line[0] == ">":
                    Ytest.append(int(line.rstrip().split(">")[1]))
        return Ytrain, Ytest

    def read_kernel(self):
        Xtrain, Xtest = [], []
        with open(self.kernel_file, "r") as f:
            count = 0
            for line in f:
                x = [float(item.split(":")[1]) for item in line.rstrip().split(" ")][
                    : self.num_train
                ]
                if count < self.num_train:
                    Xtrain.append(x)
                else:
                    Xtest.append(x)
                count += 1
        return Xtrain, Xtest

    def get_labels(self):
        pass
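# Example (sketch; paths mirror the defaults in time_gakco above, and the
# prefix is hypothetical; the dictionary files must exist under the data
# directory):
#
#     gakco = GaKCoRunner(
#         "./baselines/GaKCo-SVM/bin/GaKCo", "./data/", "protein", "EP300"
#     )
#     acc, auc = gakco.train_and_test(g=7, m=2)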
class BlendedSpectrumRunner:
    def __init__(self, exec_dir, data_location, prefix, outdir="./temp"):
        self.exec_dir = exec_dir
        self.train_fasta = osp.join(data_location, prefix + ".train.fasta")
        self.test_fasta = osp.join(data_location, prefix + ".test.fasta")
        self.outdir = outdir
        if not osp.exists(self.outdir):
            os.makedirs(self.outdir)
        self.train_seq = osp.join(self.outdir, prefix + "_spectrum.train.txt")
        self.test_seq = osp.join(self.outdir, prefix + "_spectrum.test.txt")
        self.train_and_test_seq = osp.join(
            self.outdir, prefix + "_.train-tst.spectrum.txt"
        )
        self.num_train, self.num_test = 0, 0
        self.write_seq(self.train_fasta, mode="train")
        self.kernel_file = osp.join(outdir, "kernel.txt")

    def combine_train_and_test(self):
        Xtrain, Xtest, self.Ytrain, self.Ytest = [], [], [], []
        with open(self.train_fasta, "r") as f:
            label_line = True
            for line in f:
                line = line.rstrip()
                if label_line:
                    self.num_train += 1
                    label = int(line.split(">")[1])
                    self.Ytrain.append(label)
                    label_line = False
                else:
                    Xtrain.append(line.lower())
                    label_line = True
        with open(self.test_fasta, "r") as f:
            label_line = True
            for line in f:
                line = line.rstrip()
                if label_line:
                    self.num_test += 1
                    label = int(line.split(">")[1])
                    self.Ytest.append(label)
                    label_line = False
                else:
                    Xtest.append(line.lower())
                    label_line = True
        X = Xtrain + Xtest
        with open(self.train_and_test_seq, "w+") as f:
            for x in X:
                f.write(x + "\n")

    def write_seq(self, datafile, mode="train"):
        assert mode in ["train", "test"]
        if mode == "train":
            outfile = self.train_seq
        else:
            outfile = self.test_seq
        X, Y = [], []
        with open(datafile, "r") as f:
            label_line = True
            for line in f:
                line = line.rstrip()
                if label_line:
                    label = line.split(">")[1]
                    Y.append(label)
                    label_line = False
                else:
                    X.append(line.lower())
                    label_line = True
        with open(outfile, "w+") as f:
            for x in X:
                f.write(x + "\n")

    def compute_kernel(self, k1=3, k2=5, mode="train_and_test"):
        self.k1, self.k2 = k1, k2
        assert mode in ["train", "test", "train_and_test"]
        if mode == "train":
            datafile = self.train_seq
        elif mode == "test":
            datafile = self.test_seq
        else:
            datafile = self.train_and_test_seq
        command = [
            "java",
            "-cp",
            self.exec_dir,
            "ComputeStringKernel",
            "spectrum",
            str(self.k1),
            str(self.k2),
            datafile,
            self.kernel_file,
        ]
        output = subprocess.check_output(command)

    def read_kernel(self):
        Xtrain, Xtest = [], []
        with open(self.kernel_file, "r") as f:
            count = 0
            for line in f:
                x = [float(item) for item in line.rstrip().split(" ")][
                    : self.num_train
                ]
                if count < self.num_train:
                    Xtrain.append(x)
                else:
                    Xtest.append(x)
                count += 1
        return Xtrain, Xtest

    def train_and_test(self, k1=3, k2=5, C=1):
        self.combine_train_and_test()
        self.compute_kernel(k1, k2, mode="train_and_test")
        self.Xtrain, self.Xtest = self.read_kernel()
        svm = LinearSVC(C=C, class_weight="balanced", max_iter=3000)
        self.clf = CalibratedClassifierCV(svm, cv=5).fit(self.Xtrain, self.Ytrain)
        acc, auc = self.evaluate_clf()
        return acc, auc

    def evaluate_clf(self):
        acc = self.clf.score(self.Xtest, self.Ytest)
        probs = self.clf.predict_proba(self.Xtest)[:, 1]
        auc = metrics.roc_auc_score(self.Ytest, probs)
        return acc, auc
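# Example (sketch; the classpath directory is the default used by
# time_blended above, and the prefix is hypothetical; requires a Java
# runtime with the compiled ComputeStringKernel class on that path):
#
#     blended = BlendedSpectrumRunner(
#         "./baselines/String_Kernels_Package/code/", "../data", "EP300"
#     )
#     acc, auc = blended.train_and_test(k1=3, k2=5)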