"""
A module for dealing with modified residues.
For now the only functionality is to find out the corresponding
unmodified residue.
"""
from __future__ import print_function
from builtins import str
from builtins import object
try:
from urllib.request import urlopen # python3
except ImportError:
from urllib import urlopen # python2
import re
import logging
import warnings
import sys
from pprint import pprint
from ...graph import residue as fgr
#from .modified_res_lookup import RESIDUE_DICT
log = logging.getLogger(__name__)
def _parse_table_row(tr):
tds = [td for td in tr.contents if td.name == "td"]
if len(tds) == 2:
key = tds[0].h3.text.strip()
value = tds[1].text.strip()
return key, value
else:
return None
def _html_to_info_dict(html):
try:
from bs4 import BeautifulSoup
except ImportError:
warnings.warn("Could not query PDBeChem, because BeautifulSoup is not installed"
"(install with `pip install forgi[pdbechem]` or `pip install forgi[all]`)")
return {}
with warnings.catch_warnings():
# We don't care what xml parser beautiful soup uses
warnings.simplefilter("ignore")
parsed_html = BeautifulSoup(html)
for h1 in parsed_html.body.find_all("h1"):
if h1.text == "wwPDB Information":
table = h1.parent.parent.parent # (h1_>td->tr->tbody)
rows = table.find_all("tr")
info = {}
for row in rows:
data = _parse_table_row(row)
if data:
info[data[0]] = data[1]
return info
return {}
[docs]def query_PDBeChem(three_letter_code):
"""
Query the PDBeChem and return a dictionary with key-value pairs.
Unfortunately, this information does not seem to be provided in any
machine-readable format. We have to parse the html.
"""
if len(three_letter_code) > 3:
raise ValueError("Illegal 3-letter code")
if not re.match("^[A-Z0-9][A-Z0-9][A-Z0-9]$", three_letter_code):
raise ValueError("Illegal 3-letter code")
html = urlopen(
"http://www.ebi.ac.uk/pdbe-srv/pdbechem/chemicalCompound/show/{}".format(three_letter_code))
residue_info = _html_to_info_dict(html)
return residue_info
[docs]def change_residue_id(residue, new_id):
chain = residue.parent
if chain is not None and new_id in chain:
raise ValueError("Cannot change id {old} to {new}. {new} already exists".format(
old=residue.id, new=new_id))
old_id = residue.id
residue.id = new_id
if chain is not None:
try:
del chain.child_dict[old_id]
except KeyError:
pass # New version of biopython. Id is a property
else:
chain.child_dict[new_id] = residue
[docs]def to_4_letter_alphabeth(chain, query_db=False):
'''
Rename the modified residues so that they have the same
names as unmodified residues.
If a residue's name is unknown, query PDBeChem.
If the residue is not a ribonucleotide, remove it from the chain.
If it is a modified ribonucleotde, replace it by the unmodified "standard parent"
:param chain: A Bio.PDB.Chain structure
:param query_db: If true, query PDBeChem whenever a 3-letter code is unknown.
:return: The same chain, but with only AUGC residues.
'''
modifications = {}
i = 0
while i < len(chain):
r = chain.child_list[i]
# Non-standard residues
if r.resname.strip() == "I":
# "I" has no standart parent (AUGC) to replace it with.
warnings.warn("Inosine will be replaced by G")
r.resname = "G"
modifications[fgr.RESID(chain=chain.id, resid=(
" ", r.id[1], r.id[2]))] = r.resname.strip()
else:
if r.resname.strip() not in "AUGC":
res_info = ModifiedResidueLookup(query_db)[r.resname]
if not res_info:
# Unknown code. Remove residue
log.info(
"Detaching %s, because we do not understand the code %s.", r, r.resname)
chain.detach_child(r.id)
continue # Continue with same i (now different residue)
if res_info["Polymer type"] != "Ribonucleotide":
# DNA/ Amino Acid. Remove residue
chain.detach_child(r.id)
continue # Continue with same i (now different residue)
if res_info["Type description"] == "NON-POLYMER":
# e.g. GTP in 3DIR
log.warning(
"Detaching %s, because %s is classified as non-polymeric.", r, r.resname)
chain.detach_child(r.id)
continue # Continue with same i (now different residue)
if res_info["Standard parent"] not in ["A", "U", "G", "C"]:
log.warning("Detaching %s, because %s has standard parent '%s'.",
r, r.resname, res_info["Standard parent"])
chain.detach_child(r.id)
continue # Continue with same i (now different residue)
modifications[fgr.RESID(chain=chain.id, resid=(
" ", r.id[1], r.id[2]))] = r.resname.strip()
r.resname = res_info["Standard parent"]
# rename modified residues
if r.id[0].strip():
log.debug("Modified residue: %s", r.id)
# The following was added because of 1NYI.pdb. It includes a single G-residue denoted as HETATOM in chain A.
# It forms a base-pair, but is probably not connected to the backbone.
# Instead of removing it/ in addition to removing it,
# introducing cutpoint whereever the PDB Chain
# contains gaps would be another option (TODO).
# A plain AUGC as a HETATOM means it is a ligand.
# 1Y26 has H_ADE instead of H_A
if r.id[0].strip() in ["H_A", "H_U", "H_G", "H_C", "H_ G", "H_ C", "H_ A", "H_ U", "H_ADE", "H_CYT", "H_GUA", "H_URI"]:
log.debug("Detaching %s, because it is a ligand", r.id)
chain.detach_child(r.id)
continue # Continue with same i (now different residue)
change_residue_id(r, (' ', r.id[1], r.id[2]))
i += 1 # Go to next residue.
return chain, modifications
[docs]class ModifiedResidueLookup(object):
"""
Convenience wrapper to access modified_res_lookup.RESIDUE_DICT.
If a key does not exist, query the database to add it."""
def __init__(self, query_db=False):
"""
:param query_db: If true, query PDBeChem whenever a 3-letter code is not understood.
"""
from . import modified_res_lookup
self._dict = modified_res_lookup.RESIDUE_DICT
self._query_db= query_db
def __getitem__(self, key):
key = key.strip()
if key not in self._dict:
if not self._query_db:
return None
try:
self._dict[key] = query_PDBeChem(key)
log.debug("Successfully looked up %s", key)
except:
log.warning("Could not look-up modified residue key %s", key)
self._dict[key] = None
return self._dict[key]
[docs] def clean_failed(self):
for k, v in self._dict.items():
if v is None:
del self._dict[k]
def __str__(self):
return str(self._dict)
[docs]def dict_from_pdbs(filenames):
# Just look at the SEQRES headers.
ignore = set("AUGC")
out_dict = {}
for filename in filenames:
with open(filename) as f:
log.info("Opened file %s", filename)
for line in f:
if line.startswith("SEQRES"):
codes = line[20:].split()
for code in codes:
code = code.strip()
if code not in out_dict and code not in ignore:
try:
res_info = query_PDBeChem(code)
out_dict[code] = res_info
except (ValueError, LookupError) as e:
log.warning(
"3-letter code '%s' not found: %s", code, e)
out_dict[code] = None
return {k: v for k, v in out_dict.items()}
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
print("# Created by `python {}`".format(" ".join(sys.argv)))
print("RESIDUE_DICT = ", end="")
d = dict_from_pdbs(sys.argv[1:])
for ion in ["MG", "NA", "K", "CL"]:
if ion not in d:
d[ion] = None
pprint({k: v for k, v in d.items() if v is not None})