Source code for fastsk.utils

"""Utils for reading fasta files
"""


[docs]class Vocabulary(object): """A class for storing the vocabulary of a sequence dataset. Maps words or characters to indexes in the vocabulary. """ def __init__(self): self._token2idx = {} self._token2idx[0] = 0 self._size = len(self._token2idx)
[docs] def add(self, token): """ Add a token to the vocabulary. Args: token: a letter (for char-level model) or word (for word-level model) for which to create a mapping to an integer (the idx). Return: the index of the word. If it's already present, return its index. Otherwise, add it before returning the index. """ if token not in self._token2idx: self._token2idx[token] = self._size self._size += 1 return self._token2idx.get(token)
[docs] def size(self): """Return the number tokens in the vocabulary.""" return self._size
def __str__(self): return str(self._token2idx)
[docs]class FastaUtility: def __init__(self, vocab=None): r""" Initialize a helper object for parsing datasets in FASTA-like format. Parameters ---------- vocab : """ self._vocab = Vocabulary() if vocab is None else vocab
[docs] def read_data(self, data_file, vocab="inferred", regression=False): r"""Read a file with the FASTA-like format of alternating labels lines followed by sequences. For example: >1 >AAAGAT >1 >AAAAAGAT >0 >AGTC Parameters ---------- data_file : string The path to the sequences. vocab : string Returns ---------- X : list list of sequences where characters have been mapped to numbers. Y : list list of labels """ assert vocab.lower() in ["dna", "protein", "inferred"] X, Y = [], [] with open(data_file, "r") as f: label_line = True for line in f: line = line.strip().lower() if label_line: split = line.split(">") assert len(split) == 2 if regression: label = split[1] else: label = int(split[1]) assert label in [-1, 0, 1] Y.append(label) label_line = False else: seq = list(line) seq = [self._vocab.add(token) for token in seq] X.append(seq) label_line = True assert len(X) == len(Y) return X, Y
[docs] def shortest_seq(self, data_file): X, Y = self.read_data(data_file) shortest = len(X[0]) for x in X: if len(x) < shortest: shortest = len(x) return shortest