forgi 2.0.0 documentation

Contents

Source code for forgi.graph.sequence

from __future__ import absolute_import, unicode_literals, division
from __future__ import print_function
from builtins import (ascii, bytes, chr, dict, filter, hex, input, # pylint: disable=W0611
                      map, next, oct, pow, range, round, # pylint: disable=W0611
                      str, super, zip) # pylint: disable=W0611

try:
    from collections.abc import Sequence as SequenceABC
except ImportError:
    from collections import Sequence as SequenceABC

import logging
from collections import defaultdict, Counter
from string import ascii_lowercase, ascii_uppercase
from functools import partial
import inspect

from logging_exceptions import log_to_exception
from . import residue as fgr
log = logging.getLogger(__name__)

VALID_CHAINIDS = ascii_uppercase + ascii_lowercase


[docs]class MissingResidue(object): def __init__(self, resid, res_name): """ :param resid: A fgr.RESID instance of a string. :param resname: A string. Probably one of "AUGC". """ if not isinstance(resid, fgr.RESID): log.debug("Type of %s is %s", resid, type(resid).__name__) resid = fgr.resid_from_str(resid) self.resid = resid self.res_name = res_name
[docs] def to_bg_string(self): return "missing {} {}".format(fgr.resid_to_str(self.resid), self.res_name)
[docs] @classmethod def from_bg_fields(cls, parts): """ Used during loading of the bg_file. :param parts: A list of strings. If the first is not "missing", None is returned :returns: Either a MissingResidue instance or None """ if parts[0] != "missing": return None return cls(fgr.resid_from_str(parts[1]), parts[2])
class _Smallest(object): """ Smaller than everything, regardless of type """ def __lt__(self, other): return True class _Biggest(object): """ Bigger than everything, regardless of type. """ def __gt__(self, other): return True
[docs]def to_0_based(key): log.debug("Converting key %s", key) if isinstance(key, slice): # Step step = key.step if step is None: step = 1 if abs(step) != 1: raise IndexError("Slices with step != 1 are not supported!") # Start start = key.start if start is None: start = None elif start > 0: start = start - 1 else: raise IndexError("Slices with start <1 are currently not defined.") # Stop if step < 0: if key.stop is None: stop = None elif key.stop < -1: raise IndexError( "Slices with negative stop and negative step are currently not defined.") else: stop = key.stop - 2 else: stop = key.stop key = slice(start, stop, step) elif isinstance(key, int): if key == 0: raise IndexError("Index 0 not allowed in 1-based indexing") elif key > 0: key = key - 1 else: raise TypeError("Invalid index type {}".format(type(key).__name__)) log.debug("Returning key %s", key) return key
def _insert_breakpoints_simple(seq, breakpoints, start=0, reverse=False): """ :param seq: A sequence without any missing residues :param breakpoints: 0-based break points :param start: The coordinate "on the forward strand", i.e. the lower number of start and stop. """ log.debug("Inserting breakpoints %s into %s", seq, breakpoints) breakpoints = sorted(breakpoints) if not breakpoints: return seq if start is None: start = 0 out = [] if reverse: seq = seq[::-1] log.debug("Inserting breakpoints into %s, start=%s, " "reversed=%s", seq, start, reverse) oldbp = 0 for bp in breakpoints: log.debug("BP=%s", bp) bp -= start log.debug("bp-=start-->>>%s", bp) out.append(seq[oldbp:bp + 1]) oldbp = bp + 1 out.append(seq[bp + 1:]) # pylint: disable=undefined-loop-variable out = [o for o in out if o] seq = "&".join(out) if reverse: seq = seq[::-1] log.debug("seq with break points: %s", seq) return seq def _resid_key(x): if x.resid[2] is None: return (x.resid[1], " ") else: # Convert to newstring on python2 See: https://github.com/PythonCharmers/python-future/issues/110 return (x.resid[1], str(x.resid[2])) def _sorted_missing_residues(list_of_dicts): """ :param list_of_dicts: A list of dicts, as in my PR to biopython or with the two keys "RESID" and "res_name", """ chain_to_residues = defaultdict(list) resid_to_nucleotide = {} for res_dict in list_of_dicts: log.debug("Processing %s", res_dict) if isinstance(res_dict, MissingResidue): resid = res_dict.resid chain = res_dict.resid.chain res_name = res_dict.res_name elif "RESID" in res_dict: resid = res_dict["RESID"] if not isinstance(resid, fgr.RESID): resid = fgr.resid_from_str(resid) chain = resid.chain res_name = res_dict["res_name"] else: if res_dict["model"] not in [None, 1, "A"]: log.info("Invalid missing residue %s", res_dict) continue if res_dict["insertion"] is None: insertion = " " else: insertion = res_dict["insertion"] chain = res_dict["chain"] resid = fgr.RESID(chain, (' ', res_dict["ssseq"], insertion)) res_name = res_dict["res_name"] chain_to_residues[chain].append(resid) resid_to_nucleotide[resid] = res_name for reslist in chain_to_residues.values(): log.debug("Sorting %s", reslist) reslist.sort(key=_resid_key) return chain_to_residues, resid_to_nucleotide class _IndexHelper(object): # pylint: disable=protected-access @property def flag(self): raise NotImplementedError("Must be set in subclass") def __init__(self, parent): self.parent = parent def __getitem__(self, key): log.debug("%s.__getitem__ called. %s=True", type(self).__name__, self.flag) return self.parent._getitem(key, **{self.flag: True}) def _getitem(self, key, include_missing=False, show_modifications=False): kwargs = {"include_missing": include_missing, "show_modifications": show_modifications} kwargs.update({self.flag: True}) log.debug("%s._getitem called. flags: %s", type(self).__name__, kwargs) return self.parent._getitem(key, **kwargs) def __getattr__(self, attr): """ If attr is a method that supports **kwargs or has an attribute with the same name as the value of self.flag, then we set this attributre to True """ log.debug("Getattr called for %s", attr) f = getattr(self.parent, attr) if callable(f): argspec = inspect.getargspec(f) log.debug("For function %s: args are %s", f, argspec.args) if argspec.keywords or self.flag in argspec.args: log.debug("setting flag %s", self.flag) kwargs = {self.flag: True} f = partial(f, **kwargs) return f def __str__(self): return str(self[:]) class _WMIndexer(_IndexHelper): # pylint: disable=protected-access flag = "include_missing" def __len__(self): return len(self.parent) + len(self.parent._missing_nts) def update_dotbracket(self, struct): """ Given a dotbracket_string of the same length as the sequence, update it with '-' for missing residues and '&' for breakpoints. :returns: A string """ return self.parent._missing_into_db(struct) def export_missing(self): """ Return the missing residues in a format which can be passed to the constructor of a new Sequence object. """ return self.parent._export_missing() @property def with_modifications(self): return _MODIndexer(self) def define_length(self, d): val = 0 for i in range(0, len(d), 2): if d[i+1]>=d[i]: val += sum(1 for _ in self.iter_resids( self.to_resid(d[i]), self.to_resid(d[i + 1]))) log.debug("Define length of %s with missing incremented to %s", d, val) log.debug("Define length of %s with missing is finally %s", d, val) return val class _MODIndexer(_IndexHelper): flag = "show_modifications" @property def with_missing(self): return _WMIndexer(self)
[docs]class SeqidList(SequenceABC): def __init__(self, arg): self._list = list(arg) self._lookup = {resid: i for i, resid in enumerate(self._list)} if len(self._lookup) != len(self._list): # duplicate seqids c = Counter(self._list) for k, amount in c.most_common(): if amount > 1: log.error("Seq_id %s occurs %s times!", k, amount) else: break if c.most_common()[0][1] == 1: raise ValueError("len %{} ({}) != ({}) len {}".format( self._lookup, len(self._lookup), len(self._list), self._list)) raise ValueError( "Duplicate Seq_id encountered: {}".format(c.most_common()[0][0])) def __getitem__(self, i): return self._list[i] def __len__(self): return len(self._list)
[docs] def index(self, elem): try: return self._lookup[elem] except KeyError: raise ValueError("{} not in list".format(elem))
def __ne__(self, other): if not isinstance(other, SeqidList): return NotImplemented return not self==other def __eq__(self, other): if isinstance(other, list): return self._list == other elif not isinstance(other, SeqidList): return NotImplemented return self._list == other._list def __hash__(self): return hash(self._list) def __repr__(self): return "SeqidList({})".format(repr(self._list))
[docs]class Sequence(object): """ A class holding the RNA sequence. It supports 2 indexing conventions (see below), missing residues and modified nucleotids. The two indexing conventions are: * 1-based indexing into the sequence for which structure information is available This uses indices of type integer. Integer based indexing can not address missing residues! * Indexing using PDB-style numbers (They can start at any positive or negative number and may contain insertion codes or gaps.) This uses indices of type fgr.RESID PDB-Style indices may address missing residues (Reidues for which only sequence but no structure information is present). """ def __init__(self, seq, seqids, missing_residues=None, modifications=None): """ :param seq: A string :param missing_residues: A list of dictionaries with the following keys: "res_name", "chain", "ssseq", "insertion" """ log.debug("Sequence initialized with %s, %s, %s", seq, list( map(fgr.resid_to_str, seqids)), missing_residues) # Uses 0-based indexing i = 0 self._breaks_after = [] remaining = seq while remaining: c, remaining = remaining[0], remaining[1:] if c == '&': self._breaks_after.append(i - 1) else: i += 1 log.debug("Break-points for seq %s are: %s", seq, self._breaks_after) self._seq = seq.replace('&', '') self._seqids = SeqidList(seqids) self._missing_residues = defaultdict(list) self._missing_nts = {} if missing_residues: self._set_missing_residues(missing_residues) # A dict seq_id:nt # resid : modified res_name self._modifications = {} if modifications: self._modifications.update(modifications) def _set_missing_residues(self, missing_residues): mr, mnts = _sorted_missing_residues(missing_residues) log.debug("Setting missing residues to: %s, %s", mr, mnts) self._missing_residues = mr self._missing_nts = mnts # A dict seq_id:nt def __str__(self): return self[:] def __repr__(self): try: qname = type(self).__qualname__ except AttributeError: qname = type(self).__name__ return "<{} object with ._seq={}>".format(qname, str(self)) def __bool__(self): return bool(self._seq) __nonzero__ = __bool__ @property def backbone_breaks_after(self): """ Breakpoints as 1-based indices """ return [bp + 1 for bp in self._breaks_after]
[docs] def is_valid(self): wrong_chars = set(self._seq) - set("AUGCaugc") if wrong_chars: log.info("Illegal characters are {}".format(wrong_chars)) return False return True
def __getitem__(self, key): """ Never return missing residues! """ return self._getitem(key, False) def __len__(self): return len(self._seq) def __hash__(self): raise AttributeError() def __eq__(self, other): log.debug("{} =?= {}".format(repr(self), repr(other))) log.debug("type of other=%s, type of string literal in this module is %s", type( other).__name__, type("").__name__) if isinstance(other, type(self)): if self._seq != other._seq: log.debug("Sequence different!") return False if self._breaks_after != other._breaks_after: log.debug("Breakpoints different!") return False if self._seqids != other._seqids: log.debug("seq_ids different: %s != %s", self._seqids, other._seqids) return False if self._missing_nts != other._missing_nts: if log.isEnabledFor(logging.DEBUG): for key, val in self._missing_nts.items(): if key in other._missing_nts: if self._missing_nts[key] != other._missing_nts[key]: log.debug("nt changed for %s from %s to %s", key, self._missing_nts[key], other._missing_nts[key]) else: log.debug("%s missing in other", key) for key in other._missing_nts: if key not in self._missing_nts: log.debug("%s extra in other", key) return False log.debug("They are equal") return True return str(self) == other def _getitem(self, key, include_missing=False, show_modifications=False): log.debug("_getitem called for %s, include_missing=%s, show_modifications=%s", key, include_missing, show_modifications) if isinstance(key, int): key = to_0_based(key) if show_modifications and self._seqids[key] in self._modifications: return self._modifications[self._seqids[key]] else: return self._seq[key] elif isinstance(key, fgr.RESID): try: i = self._seqids.index(key) except ValueError: if key in self._missing_nts: nt = self._missing_nts[key] if include_missing == False: raise IndexError("No structure available for nucleotide '{}'." "For look-up including missing residues, use" "`.with_missing[key]`".format(key)) if show_modifications and key in self._modifications: return self._modifications[key] return nt error = IndexError( "Nucleotide {} is not part of this RNA".format(key)) with log_to_exception(log, error): log.error("self._missing_nts = %s", self._missing_nts) raise error else: if show_modifications and key in self._modifications: return self._modifications[key] return self._seq[i] elif isinstance(key, slice): return self._getslice(key, include_missing, show_modifications) else: raise TypeError("Wrong index type: {}".format(type(key).__name__)) def _getslice(self, key, include_missing=False, show_modifications=False): """ Do type checks and delegate to self._resid_slice or self._integer_slice """ if key.step is not None and key.step not in [1, -1]: raise IndexError("The step in the slice has to be 1 or -1") if not isinstance(key.start, (type(None), int, fgr.RESID)): raise TypeError( "Slice-start must be either an integer, a RESID or None") if not isinstance(key.stop, (type(None), int, fgr.RESID)): raise TypeError( "Slice-stop must be either an integer, a RESID or None") if isinstance(key.start, fgr.RESID) or (key.start is None and isinstance(key.stop, fgr.RESID)): if not isinstance(key.stop, (fgr.RESID, type(None))): raise TypeError( "Mixing integers and RESID for slicing is not allowed") return self._resid_slice(key, include_missing, show_modifications) return self._integer_slice(to_0_based(key), include_missing, show_modifications) def _integer_slice(self, key, include_missing=False, show_modifications=False): """ This provides an implementation for include_missing=False and delegates to resid_slice otherwise. :param key: Already 0-based, no None in slice """ if ((include_missing == False or not self._missing_nts) and (show_modifications == False or not self._modifications)): seq = self._seq[key] if key.step == -1: start = key.stop if start is not None: start += 1 else: start = key.start return _insert_breakpoints_simple(seq, self._breaks_after, start, key.step == -1) if key.start is None: startRES = None else: startRES = self._seqids[key.start] if key.stop is None: stopRES = None else: if key.step == -1: stopRES = self._seqids[key.stop + 1] else: if key.stop > 0: stopRES = self._seqids[key.stop - 1] else: stopRES = self._seqids[key.stop] key2 = slice(startRES, stopRES, key.step) log.debug("Converted integer_slice %s to resid-slice %s", key, key2) return self._resid_slice(key2, include_missing, show_modifications) def _resid_slice_to_int_slice(self, key): if key.step == -1: default_start = len(self._seqids) default_stop = None else: default_start = 0 default_stop = len(self._seqids) start_i, _ = self._resid_to_index(key.start, default_start, False) stop_i, _ = self._resid_to_index(key.stop, default_stop, False) if key.step == -1: if stop_i is not None: stop_i -= 1 else: stop_i += 1 key = slice(start_i, stop_i, key.step) log.debug("Resid-slice mapped to integer-slice {}".format(key)) return self._integer_slice(key, False) def _resid_to_index(self, resid, default, default_on_error=False): """ :returns: a tuple index, error_raised """ if resid is None: return default, False try: return self._seqids.index(resid), False except ValueError: if default_on_error: return default, True else: raise IndexError("RESID {} is not present or no structure" "is available for it.".format(resid)) def _resid_slice(self, key, include_missing, show_modifications): """ This provides an implementation for inclue_missing=True and delegates to integer_slice otherwise. :param key: A validated resid slice. """ if include_missing or show_modifications: return self._resid_slice_main(key, include_missing, show_modifications) else: return self._resid_slice_to_int_slice(key) def _resid_slice_main(self, key, with_missing, show_modifications): if key.step == -1: start, stop = key.stop, key.start else: start, stop = key.start, key.stop seqids = list(self.iter_resids(start, stop, with_missing)) if key.step == -1: seqids.reverse() seqs = [[]] seq = seqs[-1] # a reference for seqid in seqids: if seqid == "&": if show_modifications: seqs.append([]) seq = seqs[-1] else: seq.append("&") else: if show_modifications and seqid in self._modifications: seq.append(self._modifications[seqid]) else: try: seq.append(self._seq[self._seqids.index(seqid)]) except ValueError: seq.append(self._missing_nts[seqid]) if not show_modifications: assert seqs == [seq] seqs = "".join(seq) return seqs def _missing_into_db(self, dotbracket): db = "" check_i = 0 for seqid in self.iter_resids(None, None, True): if seqid == "&": db += "&" else: try: i = self._seqids.index(seqid) assert i == check_i except ValueError: db += "-" else: db += dotbracket[i] check_i += 1 return "".join(db)
[docs] def iter_resids(self, start, stop, include_missing): left_res = None start_i, start_is_missing = self._resid_to_index(start, 0, True) stop_i, stop_is_missing = self._resid_to_index( stop, len(self._seqids), True) if (start_is_missing or stop_is_missing) and not include_missing: raise IndexError( "Start or stop are missing residues, but indexing without missing residues was requested.") if not start_is_missing and start is not None: left_res = start # Flag used later if missing_residues == True found_start = not start_is_missing seq = "" if left_res is not None: yield left_res # seq+=self._seq[self._seqids.index(left_res)] #log.debug("Added first residue %s: %s", left_res, seq) start_i += 1 log.debug("Iterating over sequence %s-%s", start_i, stop_i + 1) mr_keys = None for i in range(start_i, stop_i + 1): try: right_res = self._seqids[i] except IndexError: right_res = None # Some consistency checks: if left_res is not None and right_res is not None and left_res.chain != right_res.chain: if i - 1 not in self._breaks_after: raise ValueError("Inconsist break-points discovered." "Missing breakpoint at position {} " "between chain {} and chain {}. " "Break points are {}".format(i - 1, left_res.chain, right_res.chain, self._breaks_after)) if i - 1 in self._breaks_after: if left_res is not None and right_res is not None and left_res.chain == right_res.chain: raise ValueError("Inconsist break-points discovered." "Breakpoint at position {} " "in the middle of chain {}".format(i - 1, left_res.chain)) if include_missing: mr, mr_keys = self._missing_residues_between( left_res, right_res) old_r = left_res for j, r in enumerate(mr_keys): if not found_start and r == start: log.debug("Found start for i=%d, %s", i, r) found_start = True old_r = None log.debug("Now processing missing residue %s", r) if found_start: if old_r is not None and old_r.chain != r.chain: yield "&" log.debug("Breakpoint inserted!") log.debug("Adding missing residue %s", mr[j]) yield r log.debug("Comparing %s and stop=%s", r, stop) if r == stop: log.debug("Stop found! Seq=%s", seq) return old_r = r # If key.start and key.stop resp. are not modified residues, # start_i and stop_i are already accurrate, making # additional checks here unneccessary. if found_start and i < len(self._seq): log.debug("Adding regular residue %s", self._seq[i]) if mr_keys: # No missing residues if r.chain != right_res.chain: yield "&" log.debug("Breakpoint inserted!") else: if left_res is not None and right_res is not None and left_res.chain != right_res.chain: yield "&" log.debug("Breakpoint inserted!") yield right_res left_res = right_res log.debug("seq now is %s", seq) # If start or stop are not part of sequence, # make sure they were seen in missing residues if not found_start or stop_is_missing: raise IndexError("At least one of the residues {} and {} " "is not part of this RNA".format(start, stop)) return
def _missing_residues_between(self, from_resid, to_resid): log.debug("Searching missing residues between %s and %s", from_resid, to_resid) if from_resid is not None and to_resid is not None: if from_resid.chain != to_resid.chain: left = self._missing_residues_between(from_resid, None) right = self._missing_residues_between(None, to_resid) return left[0] + right[0], left[1] + right[1] if from_resid is not None: from_key = _resid_key(from_resid) chain = from_resid.chain else: from_key = _Smallest() if to_resid is not None: to_key = _resid_key(to_resid) chain = to_resid.chain else: to_key = _Biggest() seq = "" res_ids = [] for res1 in self._missing_residues[chain]: if _resid_key(res1) > from_key and _resid_key(res1) < to_key: seq += self._missing_nts[res1] res_ids.append(res1) return seq, res_ids
[docs] def to_resid(self, i): return self._seqids[i - 1]
[docs] def to_integer(self, resid): return self._seqids.index(resid) + 1
[docs] def iter_modifications(self): """ Iter over tuples seq_id, modified residue code """ for k in sorted(self._modifications): yield k, self._modifications[k]
@property def with_missing(self): """ Allows slices with missing residues! """ if self._seqids: return _WMIndexer(self) else: # For performance reasons return self @property def with_modifications(self): if self._modifications: return _MODIndexer(self) else: return self
[docs] def get_bg_str(self): """ Used during bg-file creation """ out = [] out.append("seq {}".format(str(self))) out.append("seq_ids {}".format( " ".join(map(fgr.resid_to_str, self._seqids)))) for resid, nt in self._missing_nts.items(): out.append(MissingResidue(resid, nt).to_bg_string()) for resid, label in self._modifications.items(): out.append("modification {} {}".format( fgr.resid_to_str(resid), label)) return "\n".join(out) + "\n"
[docs] def define_length(self, d): val = 0 for i in range(0, len(d), 2): val += d[i + 1] - d[i] + 1 log.debug("Define length of %s without missing is %s", d, val) return val
def _export_missing(self): """ Implementation for Sequence().with_missing.export_missing """ out = [] for resid, nt in self._missing_nts: out.append({"RESID": resid, "res_name": nt}) return out def __add__(self, other): #print("Sequence add called") if isinstance(other, Sequence): raise NotImplementedError( "This is not yet implemented (but really should be)") elif isinstance(other, str): return str(self) + other else: return NotImplemented def __radd__(self, other): #print("Sequence radd called, other is ", repr(other), type(other).__name__, str.__name__) #print(isinstance(other, str)) if isinstance(other, Sequence): raise NotImplementedError( "This is not yet implemented (but really should be)") elif isinstance(other, str): #print("Other is str") return other + str(self) else: return NotImplemented
[docs]class SequenceLoader: """ Load the sequence-related part during loading of bg-files """ def __init__(self): self.mod = {} self.mr = [] self.seq = None self.seq_ids = []
[docs] def consume_fields(self, parts): """ Read one line from the forgi file. Returns True, if the line was used/ understood, False otherwise """ mr = MissingResidue.from_bg_fields(parts) if mr is not None: self.mr.append(mr) return True elif parts[0] == "seq": if self.seq is not None: raise ValueError("More than one seq-line encountered.") self.seq = parts[1] return True elif parts[0] == "seq_ids": if self.seq_ids: raise ValueError("More than one seq-ids line encountered.") self.seq_ids = list(map(fgr.resid_from_str, parts[1:])) return True elif parts[0] == "modification": self.mod[fgr.resid_from_str(parts[1])] = " ".join(parts[2:]) return True return False
@property def sequence(self): if self.seq is None and not self.seq_ids: # The breakpoints are only persisted at the seq and seq_id level. raise ValueError("Parsing incomplete: No seq found.") elif self.seq is None: old_chain = None seq = "" for resid in self.seq_ids: if old_chain is not None and resid.chain != old_chain: seq += "&" seq += "N" old_chain = resid.chain self.seq = seq elif not self.seq_ids: self.seq_ids = _seq_ids_from_seq_str(self.seq) return Sequence(self.seq, self.seq_ids, self.mr, self.mod)
def _seq_ids_from_seq_str(seq): """ Get a list of seq_ids with the same length as the sequence, respecting cutpoints. We start with seq_id A:1. If a '&' is encountered in the sequence, a new chain-letter is used. :param seq: A string, optionally containing '&' characters. """ seq_strs = seq.split('&') seq_ids = [] for i, seq_str in enumerate(seq_strs): for j, s in enumerate(seq_str): seq_ids += [fgr.resid_from_str( "{}:{}".format(VALID_CHAINIDS[i], j + 1))] return seq_ids

Contents