forgi 2.0.0 documentation

Contents

Source code for forgi.threedee.utilities.pdb

from __future__ import print_function, unicode_literals

from builtins import zip
from builtins import range
import sys
import re
import warnings
import itertools
import math
import numpy as np
import Bio.PDB as bpdb
from collections import defaultdict

import forgi.utilities.debug as fud
import forgi.threedee.utilities.vector as ftuv
from forgi.threedee.utilities.modified_res import to_4_letter_alphabeth
import forgi.graph.residue as fgr

from logging_exceptions import log_to_exception

import logging
log = logging.getLogger(__name__)


[docs]class AtomName(str): """ Like a string, but "C1'" and "C1*" compare equal """ def __eq__(self, other): if self.endswith("*"): self = self[:-1] + "'" if other.endswith("*"): other = other[:-1] + "'" return str(self) == str(other) def __hash__(self): if self.endswith("*"): self = self[:-1] + "'" return hash(str(self))
backbone_atoms = list(map(AtomName, ['P', "O5'", "C5'", "C4'", "C3'", "O3'"])) ring_atoms = list(map(AtomName, ["C4'", "C3'", "C2'", "C1'", "O4'"])) nonsidechain_atoms = backbone_atoms + ring_atoms chi_torsion_atoms = dict() chi_torsion_atoms['A'] = chi_torsion_atoms['G'] = list( map(AtomName, ["O4'", "C1'", "N9", "C4"])) chi_torsion_atoms['C'] = chi_torsion_atoms['U'] = list( map(AtomName, ["O4'", "C1'", "N1", "C2"])) side_chain_atoms = dict() side_chain_atoms['U'] = list( map(AtomName, ['N1', 'C2', 'O2', 'N3', 'C4', 'O4', 'C5', 'C6'])) side_chain_atoms['C'] = list( map(AtomName, ['N1', 'C2', 'O2', 'N3', 'C4', 'N4', 'C5', 'C6'])) side_chain_atoms['A'] = list( map(AtomName, ['N1', 'C2', 'N3', 'C4', 'C5', 'C6', 'N6', 'N7', 'C8', 'N9'])) side_chain_atoms['G'] = list( map(AtomName, ['N1', 'C2', 'N2', 'N3', 'C4', 'C5', 'C6', 'O6', 'N7', 'C8', 'N9'])) all_side_chains = set( side_chain_atoms['U'] + side_chain_atoms['C'] + side_chain_atoms['A'] + side_chain_atoms['G']) all_rna_atoms = set(nonsidechain_atoms) | all_side_chains RNA_RESIDUES = ["A", "U", "G", "C", 'rA', 'rC', 'rG', 'rU', 'DU'] interactions = [(AtomName(a), AtomName(b)) for a, b in map(sorted, [('P', "O5'"), ('P', 'OP1'), ('P', 'O1P'), ('P', 'OP2'), ('P', 'O2P'), ("C2'", "O2'"), ("O5'", "C5'"), ("C5'", "C4'"), ("C4'", "O4'"), ("C4'", "C3'"), ("O4'", "C1'"), ("C3'", "C2'"), ("C3'", "O3'"), ("C2'", "C1'"), ("C1'", "N1"), ('N1', 'C2'), ('N1', 'C6'), ('C6', 'C5'), ('C5', 'C4'), ('C4', 'O4'), ('C4', 'N4'), ('C4', 'N3'), ('N3', 'C2'), ('C2', 'O2'), ('C2', 'N2'), ("C1'", "N9"), ('N9', 'C8'), ('N9', 'C4'), ('C8', 'N7'), ('N7', 'C5'), ('C6', 'O6'), ('C6', 'N6')])]
[docs]def trim_chain_between(chain, start_res, end_res): ''' Remove all nucleotides between start_res and end_res, inclusive. The chain is modified in place so there is no return value. ''' to_detach = [] for res in chain: if start_res <= res.id[1] and res.id[1] <= end_res: to_detach += [res] for res in to_detach: chain.detach_child(res.id)
[docs]def extract_subchains_from_seq_ids(all_chains, seq_ids): ''' Extract a portion of one or more pdb chains. Creates a list of new chains which contain only the specified residues copied from the original chain. The chain ids are not modified. :param all_chains: A dictionary {chainid:chains}. :param seq_ids: An iterable of complete RESIDS. :returns: A dictionary chain-id:Bio.PDB.Chain.Chain objects ''' new_chains = {} assert isinstance(all_chains, dict) for r in seq_ids: if r.chain in new_chains: chain = new_chains[r.chain] else: chain = new_chains[r.chain] = bpdb.Chain.Chain(r.chain) try: chain.add(all_chains[r.chain][r.resid].copy()) except KeyError: log.info(list(sorted(all_chains[r.chain].child_dict.keys()))) raise return new_chains
[docs]def is_covalent(contact): ''' Determine if a particular contact is covalent. This does not look at the geometric distance but only at the atom names. :param contact: A pair of two Atom objects :return: `True` if they are covalently bonded `False` otherwise ''' r1 = contact[0].parent r2 = contact[1].parent r1a = (r1, contact[0]) r2a = (r2, contact[1]) if contact[0].name.find('H') >= 0 or contact[1].name.find('H') >= 0: return True ((r1, c1), (r2, c2)) = sorted((r1a, r2a), key=lambda x: x[0].id[1]) if r1.id == r2.id: if tuple(sorted((c1.name, c2.name))) in interactions: return True if r2.id[1] - r1.id[1] == 1: # neighboring residues if c1.name == 'O3*' and c2.name == 'P': return True return False
[docs]def num_noncovalent_clashes(chain): ''' Check if a chain has non-covalent clashes. Non-covalent clashes are found when two atoms that aren't covalently linked are within 1.8 A of each other. :param chain: The chain to evaluate :param return: The number of non-covalent clashes. ''' all_atoms = bpdb.Selection.unfold_entities(chain, 'A') ns = bpdb.NeighborSearch(all_atoms) contacts = ns.search_all(1.9) return len([c for c in contacts if not is_covalent(c)])
[docs]def noncovalent_distances(chain, cutoff=0.3): ''' Print out the distances between all non-covalently bonded atoms which are closer than cutoff to each other. :param chain: The Bio.PDB chain. :param cutoff: The maximum distance ''' all_atoms = bpdb.Selection.unfold_entities(chain, 'A') ns = bpdb.NeighborSearch(all_atoms) contacts = ns.search_all(cutoff) return [ftuv.magnitude(c[1] - c[0]) for c in contacts if not is_covalent(c)]
[docs]def pdb_rmsd(c1, c2, sidechains=False, superimpose=True ): ''' Calculate the all-atom rmsd between two RNA chains. :param c1: A Bio.PDB.Chain :param c2: Another Bio.PDB.Chain :return: The rmsd between the locations of all the atoms in the chains. ''' c1_list = [cr for cr in c1.get_list() if cr.resname.strip() in RNA_RESIDUES] c2_list = [cr for cr in c2.get_list() if cr.resname.strip() in RNA_RESIDUES] return residuelist_rmsd(c1_list, c2_list, sidechains, superimpose )
[docs]def residuelist_rmsd(c1_list, c2_list, sidechains=False, superimpose=True): import forgi.threedee.model.similarity as ftms if len(c1_list) != len(c2_list): raise Exception( "Chains of different length. (Maybe an RNA-DNA hybrid?)") #c1_list.sort(key=lambda x: x.id[1]) #c2_list.sort(key=lambda x: x.id[1]) to_residues = [] crds1 = [] crds2 = [] all_atoms1 = [] all_atoms2 = [] for r1, r2 in zip(c1_list, c2_list): if sidechains: anames = nonsidechain_atoms + \ side_chain_atoms[r1.resname.strip()] else: anames = nonsidechain_atoms #anames = a_5_names + a_3_names for a in anames: try: at1 = r1[a] at2 = r2[a] except: continue else: all_atoms1.append(at1) all_atoms2.append(at2) crds1.append(at1.coord) crds2.append(at2.coord) to_residues.append(r1) diff_vecs = ftms._pointwise_deviation(crds1, crds2) dev_per_res = defaultdict(list) for i, res in enumerate(to_residues): dev_per_res[res].append(diff_vecs[i]) if superimpose: sup = bpdb.Superimposer() sup.set_atoms(all_atoms1, all_atoms2) return (len(all_atoms1), sup.rms, sup.rotran, dev_per_res) else: return (len(all_atoms1), ftuv._vector_set_rmsd(crds1, crds2), None, dev_per_res)
[docs]def get_first_chain(filename): ''' Load a PDB file using the Bio.PDB module and return the first chain. :param filename: The path to the pdb file ''' with warnings.catch_warnings(): warnings.simplefilter("ignore") s = bpdb.PDBParser(PERMISSIVE=False).get_structure('t', filename) return list(s.get_chains())[0]
[docs]def pdb_file_rmsd(fn1, fn2): ''' Calculate the RMSD of all the atoms in two pdb structures. :param fn1: The first filename. :param fn2: The second filename. :return: The rmsd between the two structures. ''' with warnings.catch_warnings(): warnings.simplefilter("ignore") s1 = bpdb.PDBParser().get_structure('t', fn1) s2 = bpdb.PDBParser().get_structure('t', fn2) c1, _, _ = get_biggest_chain(fn1) c2, _, _ = get_biggest_chain(fn2) rmsd = pdb_rmsd(c1, c2) return rmsd
[docs]def renumber_chain(chain, resids=None): ''' Renumber all the residues in this chain so that they start at 1 and end at len(chain) :param chain: A Bio.PDB.Chain object :return: The same chain, but with renamed nucleotides ''' counter = 1 if resids is None: resids = [(' ', i + 1, ' ') for i in range(len(chain))] new_child_dict = dict() new_child_list = [] for res, r_new in zip(chain, resids): res.id = r_new new_child_dict[res.id] = res new_child_list.append(res) chain.child_dict = new_child_dict chain.child_list = new_child_list return chain
[docs]def output_chain(chain, filename, fr=None, to=None): ''' Dump a chain to an output file. Remove the hydrogen atoms. :param chain: The Bio.PDB.Chain to dump. :param filename: The place to dump it. ''' class HSelect(bpdb.Select): def accept_atom(self, atom): if atom.name.find('H') >= 0: return False else: return True m = bpdb.Model.Model(' ') s = bpdb.Structure.Structure(' ') m.add(chain) s.add(m) io = bpdb.PDBIO() io.set_structure(s) io.save(filename, HSelect())
[docs]def output_multiple_chains(chains, filename, file_type="pdb"): ''' Dump multiple chains to an output file. Remove the hydrogen atoms. :param chains: An iterable of Bio.PDB.Chain to dump. :param filename: The place to dump it. ''' class HSelect(bpdb.Select): def accept_atom(self, atom): if atom.name.find('H') >= 0: return False else: return True m = bpdb.Model.Model(0) s = bpdb.Structure.Structure('stru') for chain in chains: log.debug("Adding chain %s with %s residues", chain.id, len(chain)) m.add(chain) if file_type=="pdb" and len(chain.id)!=1: raise ValueError("Cannot save chain with name %s (not a single character) " "in PDB format. Use cif format instead!") s.add(m) if file_type == "pdb": io = bpdb.PDBIO() else: io = bpdb.MMCIFIO() io.set_structure(s) try: io.save(filename, HSelect()) except Exception as e: with log_to_exception(log, e): log.error("Could not output PDB with chains and residues:") for chain in s[0]: log.error("%s: %s", chain.id, [r.id for r in chain]) raise
[docs]def get_particular_chain(in_filename, chain_id, parser=None): ''' Load a PDB file and return a particular chain. :param in_filename: The name of the pdb file. :param chain_id: The id of the chain. :return: A Bio.PDB.Chain object containing that particular chain. ''' chains, mr, ir = get_all_chains(in_filename, parser) chain, = [c for c in chains if c.id == chain_id] return chain, mr, ir
[docs]def get_biggest_chain(in_filename, parser=None): ''' Load the PDB file located at filename, select the longest chain and return it. :param in_filename: The location of the original file. :return: A Bio.PDB chain structure corresponding to the longest chain in the structure stored in in_filename ''' chains, mr, ir = get_all_chains(in_filename, parser) biggest = 0 biggest_len = 0 for i in range(len(chains)): c = chains[i] # Only count RNA residues num_residues = 0 for res in c: if (res.resname.strip() == 'A' or res.resname.strip() == 'C' or res.resname.strip() == 'G' or res.resname.strip() == 'U'): num_residues += 1 if num_residues > biggest_len: biggest = i biggest_len = num_residues # sys.exit(1) orig_chain = chains[biggest] return orig_chain, mr, ir
def _extract_symmetrymatrices_from_cif_dict(cif_dict): """ Code originally by G. Entzian, modified Extract matrices for symmetry operations from the cif-dict to convert the assymetric unit to the biological unit :returns: A tuple matrices, vectors. matrices is a dictionary {operation_id: rot_matrix} vectors a dictionary {operation_id: vector} """ symmetry_ids = cif_dict["_pdbx_struct_oper_list.id"] matrices = defaultdict(lambda: [[0.0 for x in range(3)] for y in range(3)]) vectors = defaultdict(lambda: [0.0 for x in range(3)]) for i in range(1,4): v_key = '_pdbx_struct_oper_list.vector['+str(i)+']' value_vectors = cif_dict[v_key] if not isinstance(value_vectors, list): value_vectors = [value_vectors] for k, v in enumerate(value_vectors): vectors[symmetry_ids[k]][i-1] = float(v) for j in range(1,4): key = '_pdbx_struct_oper_list.matrix['+str(i)+']['+str(j)+']' value_per_matrix = cif_dict[key] if not isinstance(value_per_matrix, list): value_per_matrix = [value_per_matrix] for k, v in enumerate(value_per_matrix): matrices[symmetry_ids[k]][i-1][j-1] = float(v) return matrices, vectors def _convert_cif_operation_id_expression(expression): """ By Gregor Entzian """ tmp_expr = expression op_id_list = [] match = re.findall("\([\w\-\,\d]+\)", tmp_expr)#("(\d+)\-(\d+)", id) if match: for m in match: match_s = re.search("\((\d+)\-(\d+)\)", m) if match_s: f = match_s.group(1) t = match_s.group(2) f = int(f) t = int(t) for n in range(f,t+1): op_id_list.append(str(n)) tmp_expr = tmp_expr.replace(m,",") continue else: #split ',' in case of more entries. In case of one entry use the same code. l = m[1:-1] # remove parentheses parts = l.split(',') for p in parts: op_id_list.append(str(p)) tmp_expr = tmp_expr.replace(m,",") continue if tmp_expr != None and tmp_expr != "": tmp_expr = tmp_expr.replace("(","").replace(")","") parts = tmp_expr.split(',') for p in parts: if ',' not in p and p != '': match_s = re.search("(\d+)\-(\d+)", p) if match_s: f = match_s.group(1) t = match_s.group(2) f = int(f) t = int(t) for n in range(f,t+1): op_id_list.append(str(n)) continue op_id_list.append(str(p)) return tuple(op_id_list) def _extract_assembly_gen(cif_dict): """ Extracts the information, how assemblies are generated out of the chains and symmetry operations, from the cifdict. :returns: A dictionary {assembly_id: [(chainid, operationids)...]} Where operation_ids is a list """ assembly_ids = cif_dict['_pdbx_struct_assembly_gen.assembly_id'] #1 operation_ids = cif_dict['_pdbx_struct_assembly_gen.oper_expression'] #1,2 chain_id_lists = cif_dict['_pdbx_struct_assembly_gen.asym_id_list'] #A,B if not isinstance(assembly_ids, list): assembly_ids=[assembly_ids] assert not isinstance(operation_ids, list) operation_ids=[operation_ids] assert not isinstance(chain_id_lists, list) chain_id_lists=[chain_id_lists] assembly_components = defaultdict(lambda:defaultdict(list)) # assembly: operation-lists: chains for i, aid in enumerate(assembly_ids): op_ids=_convert_cif_operation_id_expression(operation_ids[i]) chain_ids=chain_id_lists[i].split() assembly_components[0][operation_ids].extend(chain_id_lists) return assembly_components def _get_new_chainid(chain_id, op_id, taken_ids): new_id= chain_id+"op"+str(op_id) while new_id in taken_ids: new_id+="a" return new_id def _extract_asym_auth_id_map(cif_dict): label_ids = cif_dict['_atom_site.label_asym_id'] auth_ids = cif_dict['_atom_site.auth_asym_id'] l2a=defaultdict(lambda: None) a2l=defaultdict(list) for i, lid in enumerate(label_ids): if lid in l2a: if l2a[lid]!=auth_ids[i]: log.error("lid %s, authid %s but before %s", lid, auth_ids[i], l2a[lid]) raise ValueError("Inconsistent cif.") else: l2a[lid]=auth_ids[i] a2l[auth_ids[i]].append(lid) log.debug(l2a) return l2a, a2l def _get_assemblies(chains, cifdict): assemblies = _extract_assembly_gen(cifdict) id2chainid, chainid2ids = _extract_asym_auth_id_map(cifdict) if assembly_nr is None: assembly_nr = list(sorted(assemblies.keys()))[0] assembly_gen = assemblies[assembly_nr] old_chains = { c.id:c for c in chains} new_chains = {} log.debug("Original chains are %s. Now performing symmetry " "for assembly %s", old_chains, assembly_nr) log.debug("AG: %s",assembly_gen) for operations, labelids in assembly_gen[0].items(): chainids = set(id2chainid[lid] for lid in labelids) lids_back = set( x for x in a2l for cid in chainids for a2l in chainid2ids[cid]) if lids_back!=set(labelids): log.error("%s, %s, extra: %s", lids_back, labelids, lids_back-set(labelids)) raise ValueError("Operation on part of an author designated assymetric unit " "(auth_asym_id) nt supported.") for chain_id in chainids: if chain_id not in old_chains: log.debug ("Skipping chain %s: not RNA? chains: %s", chain_id, old_chains.keys()) continue for op_id in op_ids: log.debug("Applying op %s to %s", op_id, chain_id) if chain_id in new_chains: chain = old_chains[chain_id].copy() newid = _get_new_chainid(chain_id, op_id, set(old_chains.keys())| set(new_chains.keys())) log.debug("Setting id %s", newid) chain.id = newid chain.transform(operation_mat[op_id], operation_vec[op_id]) new_chains[chain.id]=chain log.debug("new_chains: %s", new_chains) chains = list(new_chains.values()) return chains
[docs]def get_all_chains(in_filename, parser=None, no_annotation=False, assembly_nr=None): ''' Load the PDB file located at filename, read all chains and return them. :param in_filename: The location of the original file. :param assembly_nr: Which assembly to return. Default: The first. :return: a tuple chains, missing_residues * chains: A list of Bio.PDB chain structures corresponding to all RNA structures stored in in_filename * missing_residues: A list of dictionaries, describing the missing residues. * interacting residues: A list of residues ''' if parser is None: if in_filename.endswith(".pdb"): parser = bpdb.PDBParser() elif in_filename.endswith(".cif"): parser = bpdb.MMCIFParser() else: # Cannot determine filetype by extention. Try to read first line. with open(in_filename) as pdbfile: line = pdbfile.readline(20) # According to # page 10 of ftp://ftp.wwpdb.org/pub/pdb/doc/format_descriptions/Format_v33_A4.pdf # a HEADER entry is mandatory. Biopython sometime starts directly with ATOM if line.startswith("HEADER") or line.startswith("ATOM"): parser = bpdb.PDBParser() else: parser = bpdb.MMCIFParser() with warnings.catch_warnings(): warnings.simplefilter("ignore") s = parser.get_structure('temp', in_filename) if len(s) > 1: warnings.warn("Multiple models in file. Using only the first model") # Let's detach all H2O, to speed up processing. for chain in s[0]: log.debug("Before detaching water from %s: chain has %s residues", chain.id, len(chain)) for r in chain.child_list[:]: # We need a copy here, because we are modifying it during iteration if r.resname.strip() == "HOH": chain.detach_child(r.id) log.debug("After detaching water from %s: chain has %s residues", chain.id, len(s[0][chain.id])) # Rename residues from other programs for chain in s[0]: for r in chain: # rename rosetta-generated structures if r.resname == ' rA': r.resname = ' A' elif r.resname == ' rC': r.resname = ' C' elif r.resname == ' rG': r.resname = ' G' elif r.resname == ' rU': r.resname = ' U' # rename iFoldRNA-generated structures if r.resname == 'ADE': r.resname = ' A' elif r.resname == 'CYT': r.resname = ' C' elif r.resname == 'GUA': r.resname = ' G' elif r.resname == 'URI': r.resname = ' U' # Now search for protein interactions. if not no_annotation: interacting_residues = enumerate_interactions_kdtree(s[0]) else: interacting_residues = set() # The chains containing RNA chains = list(chain for chain in s[0] if contains_rna(chain)) try: log.debug("PDB header %s", parser.header) mr = parser.header["missing_residues"] if assembly_nr is not None: warnings.warn("Getting an assembly is not supported for the old PDB format.") except AttributeError: # A mmCIF parser cifdict = parser._mmcif_dict # We read a private attribute here, because parsing the mmcif dictionary a second time would cause a performance penalty. # Generate an assembly try: operation_mat, operation_vec = _extract_symmetrymatrices_from_cif_dict(cifdict) except KeyError: pass else: if False: # Still experimental and not working correctly. chains = _get_assemblies(chains, cifdict) mr = [] try: mask = np.array( cifdict["_pdbx_poly_seq_scheme.pdb_mon_id"], dtype=str) == "?" int_seq_ids = np.array( cifdict["_pdbx_poly_seq_scheme.pdb_seq_num"], dtype=int)[mask] cs = np.array( cifdict["_pdbx_poly_seq_scheme.pdb_strand_id"], dtype=str)[mask] insertions = np.array( cifdict["_pdbx_poly_seq_scheme.pdb_ins_code"], dtype=str)[mask] insertions[insertions == "."] = " " symbol = np.array( cifdict["_pdbx_poly_seq_scheme.mon_id"], dtype=str)[mask] except KeyError: pass else: if not no_annotation: for i, sseq in enumerate(int_seq_ids): mr.append({ "model": None, "res_name": symbol[i], "chain": cs[i], "ssseq": sseq, "insertion": insertions[i] }) except KeyError: mr = [] with open(in_filename) as f: for wholeline in f: if wholeline.startswith("REMARK 465"): line = wholeline[10:].strip() mr_info = _parse_remark_465(line) if mr_info is not None: mr.append(mr_info) else: continue else: if mr: log.info("This PDB has missing residues") elif not no_annotation: log.info("This PDB has no missing residues") '''for res1, res2 in itertools.combinations(s[0].get_residues(), 2): rna_res=None other_res=None if res1.resname.strip() in RNA_RESIDUES: rna_res=res1 else: other_res=res1 if res2.resname.strip() in RNA_RESIDUES: rna_res=res2 else: other_res=res2 if rna_res is None or other_res is None: continue if other_res.resname.strip()=="HOH": continue if residues_interact(rna_res, other_res): log.error("%s and %s interact", rna_res, other_res) interacting_residues.add(rna_res)''' log.debug("LOADING DONE: chains %s, mr %s, ir: %s", chains, mr, interacting_residues) return chains, mr, interacting_residues
def _parse_remark_465(line): """Parse missing residue remarks. Returns a dictionary describing the missing residue. The specification for REMARK 465 at http://www.wwpdb.org/documentation/file-format-content/format33/remarks2.html#REMARK%20465 only gives templates, but does not say they have to be followed. So we assume that not all pdb-files with a REMARK 465 can be understood. Returns a dictionary with the following keys: "model", "res_name", "chain", "ssseq", "insertion" """ if line: # Note that line has been stripped. assert line[0] != " " and line[-1] not in "\n ", "line has to be stripped" pattern = (r""" (\d+\s[\sA-Z][\sA-Z][A-Z] | # Either model number + residue name [A-Z]?[A-Z]?[A-Z]) # Or only residue name with # 1 (RNA) to 3 letters \s ([A-Za-z0-9]) # A single character chain \s+(\d+[A-Za-z]?)$ # Residue number: A digit followed # by an optional insertion code # (Hetero-flags make no sense in # context with missing res) """) match = re.match(pattern, line, re.VERBOSE) if match is not None: residue = {} if " " in match.group(1): model, residue["res_name"] = match.group(1).split(" ") residue["model"] = int(model) else: residue["model"] = None residue["res_name"] = match.group(1) residue["chain"] = match.group(2) try: residue["ssseq"] = int(match.group(3)) except ValueError: residue["insertion"] = match.group(3)[-1] residue["ssseq"] = int(match.group(3)[:-1]) else: residue["insertion"] = None return residue return None
[docs]def enumerate_interactions_kdtree(model): relevant_atoms = [a for a in model.get_atoms() if a.name[0] in [ "C", "N", "O"]] if not relevant_atoms: return set() kdtree = bpdb.NeighborSearch(relevant_atoms) pairs = kdtree.search_all(6, "A") res_pair_list = set() for a1, a2 in pairs: if a1.name not in all_side_chains and a2.name not in all_side_chains: continue p1 = a1.get_parent() p2 = a2.get_parent() if p1.id == p2.id: continue elif p1 < p2: res_pair_list.add((p1, p2)) else: res_pair_list.add((p2, p1)) interacting_residues = set() for res1, res2 in res_pair_list: rna_res = None other_res = None if res1.resname.strip() in RNA_RESIDUES and not res1.id[0].startswith("H_"): rna_res = res1 else: other_res = res1 if res2.resname.strip() in RNA_RESIDUES and not res2.id[0].startswith("H_"): rna_res = res2 else: other_res = res2 if rna_res is None or other_res is None: continue log.debug("%s(chain %s) and %s(chain %s, resname %s) are close", rna_res,rna_res.parent.id,other_res, other_res.parent.id, other_res.resname.strip()) # Only consider C and N. So no ions etc if any(a.name[0] in ["C", "N"] for a in other_res.get_atoms()): interacting_residues.add(rna_res) else: log.debug("but %s has wrong atoms %s", other_res, list(a.name for a in other_res.get_atoms())) log.debug("Interacting: %s", interacting_residues) return interacting_residues
"""def residues_interact(rna_res, other_res): for rna_atom in rna_res: if rna_atom.get_name() in all_side_chains: for other_atom in other_res: atom_symbol="".join(s for s in other_atom.get_name() if not s.isdigit()) if atom_symbol in ["C", "N"]: d=ftuv.vec_distance(rna_atom.coord, other_atom.coord) if d<6: return True return False""" HBOND_CUTOFF = 4.5 # 4.5 and 0.9 are values optimized against DSSR for 5T5H_A-B-C OOP_CUTOFF = 0.9 def _get_points(res1, res2): labels = {res1.resname.strip(), res2.resname.strip()} if labels == {"A", "U"}: return _points_AU(res1, res2) elif labels == {"G", "C"}: return _points_GC(res1, res2) elif labels == {"G", "U"}: return _points_GC(res1, res2) else: return None def _points_AU(res1, res2): if res1.resname.strip() == "A": resA = res1 resU = res2 else: resA = res2 resU = res1 a = resA["N6"].coord b = resU["O4"].coord c = resA["N1"].coord d = resU["N3"].coord return (resA["C8"].coord, resU["C6"].coord), (a, b), (c, d) def _points_GU(res1, res2): if res1.resname.strip() == "G": resG = res1 resU = res2 else: resG = res2 resU = res1 a = resG["O6"].coord b = resU["N3"].coord c = resG["N1"].coord d = resU["O2"].coord return (resG["C8"].coord, resU["C6"].coord), (a, b), (c, d) def _points_GC(res1, res2): if res1.resname.strip() == "G": resG = res1 resC = res2 else: resG = res2 resC = res1 a = resG["O6"].coord c = resG["N1"].coord e = resG["N2"].coord b = resC["N4"].coord d = resC["N3"].coord f = resC["O2"].coord return (resC["C6"].coord, resG["C8"].coord), (a, b), (c, d), (e, f)
[docs]def is_basepair_pair(res1, res2): pairs = _get_points(res1, res2) if not pairs: return False for pair in pairs[1:]: # pairs[0] is only for coplanarity]] d = ftuv.vec_distance(pair[0], pair[1]) if d >= HBOND_CUTOFF: return False if is_almost_coplanar(*[point for pair in pairs for point in pair]): return True return False
def _coplanar_point_indices(*points): """ Thanks to https://stackoverflow.com/a/18968498""" from numpy.linalg import svd points = np.array(points).T assert points.shape[0] <= points.shape[1], "There are only {} points in {} dimensions.".format( points.shape[1], points.shape[0]) ctr = points.mean(axis=1) x = points - ctr[:, np.newaxis] M = np.dot(x, x.T) # Could also use np.cov(x) here. normal = svd(M)[0][:, -1] out = [] for i, p in enumerate(points.T): w = p - ctr oop_distance = ftuv.magnitude( np.dot(w, normal)) / ftuv.magnitude(normal) if oop_distance <= OOP_CUTOFF: out.append(i) return out, ctr, normal
[docs]def is_almost_coplanar(*points): indices, c, n = _coplanar_point_indices(*points) return len(indices) == len(points)
[docs]def annotate_fallback(chain_list): """ If neither DSSR nor MC-Annotate are available, we use an ad-hoc implementation of canonical basepair detection as fallback. This does not work well for missing atoms or modified residues. """ kdtree = bpdb.NeighborSearch( [atom for chain in chain_list for atom in chain.get_atoms()]) pairs = kdtree.search_all(10, "R") basepairs = {} # Sorted, so conflicting basepairs are deterministically solved for res1, res2 in sorted(pairs): if res1.resname.strip() not in RNA_RESIDUES or res1.id[0].startswith("H_"): continue if res2.resname.strip() not in RNA_RESIDUES or res2.id[0].startswith("H_"): continue labels = {res1.resname.strip(), res2.resname.strip()} try: is_bp = is_basepair_pair(res1, res2) if is_bp: res1_id = fgr.resid_from_biopython(res1) res2_id = fgr.resid_from_biopython(res2) if res1_id in basepairs: warnings.warn("More than one basepair detected for {}." " Ignoring {}-{} because {}-{} is already" " part of the structure".format(res1_id, res1_id, res2_id, res1_id, basepairs[res1_id])) continue if res2_id in basepairs: warnings.warn("More than one basepair detected for {}." " Ignoring {}-{} because {}-{} is already" " part of the structure".format(res2_id, res2_id, res1_id, res2_id, basepairs[res2_id])) continue basepairs[res1_id] = res2_id basepairs[res2_id] = res1_id except KeyError as e: log.debug("Missing atom %s. %s has atoms %s, %s has atoms %s", e, res1, res1.child_dict, res2, res2.child_dict) pass seq_ids = [] for chain in sorted(chain_list, key=lambda x: x.id): for residue in chain: seq_ids.append(fgr.resid_from_biopython(residue)) bpseq = "" chain_dict = {c.id: c for c in chain_list} for i, seqid in enumerate(seq_ids): if seqid in basepairs: bp = seq_ids.index(basepairs[seqid]) + 1 else: bp = 0 bpseq += "{} {} {}\n".format(i + 1, chain_dict[seqid.chain][seqid.resid].resname.strip( ), bp) return bpseq, seq_ids
[docs]def rename_rosetta_atoms(chain): ''' Rosetta names all the backbone atoms with an asterisk rather than an apostrophe. All that needs to be reversed. :param chain. A Bio.PDB.Chain structure generated by Rosetta :return: The same chain with renamed atoms ''' for a in bpdb.Selection.unfold_entities(chain, 'A'): oldid = a.id a.name = a.name.replace('*', "'") a.fullname = a.name.replace('*', "'") a.id = a.id.replace('*', "'") #: Not needed with newer biopython versions #: Seems to be needed again? del a.parent.child_dict[oldid] a.parent.child_dict[a.id] = a # log.debug("Replaced rosetta atoms. \n%s\n%s", # chain.child_list[0].child_list, # chain.child_list[0].child_dict # ) return chain
[docs]def remove_disordered(chain): for i, residue in enumerate(chain): if hasattr(residue, "selected_child"): new_res = residue.selected_child chain.detach_child(residue.id) chain.insert(i, new_res) residue = new_res for j, atom in enumerate(residue): if hasattr(atom, "selected_child"): new_atom = atom.selected_child new_atom.altloc = " " new_atom.occupancy = 1.0 new_atom.disordered_flag = 0 residue.detach_child(atom.id) residue.insert(j, new_atom) return chain
[docs]def load_structure(pdb_filename): ''' Load a Bio.PDB.Structure object and return the largest chain. This chain will be modified so that all hetatms are removed, modified residues will be renamed to regular residues, etc... ''' chain, mr, ir = get_biggest_chain(pdb_filename) return clean_chain(chain, True)[0]
[docs]def clean_chain(chain, query_PDBeChem=False): """ Clean a pdb chain for further use with forgi. It will be modified so that all hetatms are removed, modified residues will be renamed to regular residues, residue ids will be positive integers, ... :param chain: A Bio.PDB.Chain object :param query_PDBeChem: If true, query the PDBeChem database whenever a modified residue with unknown 3-letter code is encountered. :returns: A modified version of this chain """ chain, modifications = to_4_letter_alphabeth(chain, query_PDBeChem) chain = rename_rosetta_atoms(chain) chain = remove_disordered(chain) return chain, modifications
[docs]def interchain_contacts(struct): all_atoms = bpdb.Selection.unfold_entities(struct, 'A') ns = bpdb.NeighborSearch(all_atoms) pairs = ns.search_all(2.8) ic_pairs = [] for (a1, a2) in pairs: if a1.parent.parent != a2.parent.parent: ic_pairs += [(a1, a2)] return ic_pairs
[docs]def contains_rna(chain): ''' Determine if a Bio.PDB.Chain structure corresponds to an RNA molecule. :param chain: A Bio.PDB.Chain molecule :return: True if it is an RNA molecule, False if at least one residue is not an RNA. ''' for res in chain: if res.resname.strip() in RNA_RESIDUES: return True return False
[docs]def is_protein(chain): ''' Determine if a Bio.PDB.Chain structure corresponds to an protein molecule. :param chain: A Bio.PDB.Chain molecule :return: True if it is a protein molecule, False otherwise ''' for res in chain: if res.resname in ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'HIS', 'ILE', 'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL']: return True return False

Contents