From 48ee58f850eb8848ed4d616f7e036ac4ba7d50c9 Mon Sep 17 00:00:00 2001 From: James Krieger Date: Tue, 10 Sep 2024 10:20:01 +0200 Subject: [PATCH 1/5] add threeLetter to pdb header and atomic --- prody/atomic/atomic.py | 12 ++++++++++-- prody/atomic/chain.py | 8 ++++++-- prody/proteins/header.py | 22 ++++++++++++++++------ 3 files changed, 32 insertions(+), 10 deletions(-) diff --git a/prody/atomic/atomic.py b/prody/atomic/atomic.py index b4a6700f4..fcebf30a7 100644 --- a/prody/atomic/atomic.py +++ b/prody/atomic/atomic.py @@ -247,12 +247,20 @@ def getSequence(self, **kwargs): residues (e.g. water molecules) in the chain and **X** will be used for non-standard residue names.""" + threeLetter = kwargs.get('threeLetter', False) + get = AAMAP.get if hasattr(self, 'getResnames'): - seq = ''.join([get(res, 'X') for res in self.getResnames()]) + if threeLetter: + seq = ' '.join(self.getResnames()) + else: + seq = ''.join([get(res, 'X') for res in self.getResnames()]) else: res = self.getResname() - seq = get(res, 'X') + if threeLetter: + seq = res + else: + seq = get(res, 'X') return seq diff --git a/prody/atomic/chain.py b/prody/atomic/chain.py index d5534f783..ea5f73518 100644 --- a/prody/atomic/chain.py +++ b/prody/atomic/chain.py @@ -8,10 +8,14 @@ __all__ = ['Chain'] -def getSequence(resnames): +def getSequence(resnames, **kwargs): """Returns polypeptide sequence as from list of *resnames* (residue name abbreviations).""" + threeLetter = kwargs.get('threeLetter', False) + if threeLetter: + return ' '.join(resnames) + get = AAMAP.get return ''.join([get(rn, 'X') for rn in resnames]) @@ -140,7 +144,7 @@ def getSequence(self, **kwargs): else: calpha = self.calpha if calpha: - seq = getSequence(calpha.getResnames()) + seq = getSequence(calpha.getResnames(), **kwargs) else: seq = '' self._seq = seq diff --git a/prody/proteins/header.py b/prody/proteins/header.py index 1a64132c6..b22e6483b 100644 --- a/prody/proteins/header.py +++ b/prody/proteins/header.py @@ -235,7 +235,7 @@ def cleanString(string, nows=False): return ' '.join(string.strip().split()) -def parsePDBHeader(pdb, *keys): +def parsePDBHeader(pdb, *keys, **kwargs): """Returns header data dictionary for *pdb*. This function is equivalent to ``parsePDB(pdb, header=True, model=0, meta=False)``, likewise *pdb* may be an identifier or a filename. @@ -297,12 +297,12 @@ def parsePDBHeader(pdb, *keys): raise IOError('{0} is not a valid filename or a valid PDB ' 'identifier.'.format(pdb)) pdb = openFile(pdb, 'rt') - header, _ = getHeaderDict(pdb, *keys) + header, _ = getHeaderDict(pdb, *keys, **kwargs) pdb.close() return header -def getHeaderDict(stream, *keys): +def getHeaderDict(stream, *keys, **kwargs): """Returns header data in a dictionary. *stream* may be a list of PDB lines or a stream.""" @@ -325,7 +325,10 @@ def getHeaderDict(stream, *keys): keys = list(keys) for k, key in enumerate(keys): if key in _PDB_HEADER_MAP: - value = _PDB_HEADER_MAP[key](lines) + if key == 'polymers': + value = _PDB_HEADER_MAP[key](lines, **kwargs) + else: + value = _PDB_HEADER_MAP[key](lines) keys[k] = value else: raise KeyError('{0} is not a valid header data identifier' @@ -555,7 +558,7 @@ def _getReference(lines): return ref -def _getPolymers(lines): +def _getPolymers(lines, **kwargs): """Returns list of polymers (macromolecules).""" pdbid = lines['pdbid'] @@ -564,7 +567,14 @@ def _getPolymers(lines): ch = line[11] poly = polymers.get(ch, Polymer(ch)) polymers[ch] = poly - poly.sequence += ''.join(getSequence(line[19:].split())) + + threeLetter = kwargs.get('threeLetter', False) + if threeLetter: + if poly.sequence != '': + poly.sequence += ' ' + poly.sequence += getSequence(line[19:].split(), **kwargs) + else: + poly.sequence += ''.join(getSequence(line[19:].split(), **kwargs)) for i, line in lines['DBREF ']: i += 1 From 6066666ae7191dc675c0f51b338866efc2926aed Mon Sep 17 00:00:00 2001 From: James Krieger Date: Tue, 10 Sep 2024 10:42:20 +0200 Subject: [PATCH 2/5] threeLetter seqres cif --- prody/atomic/atomic.py | 9 ++++++++ prody/proteins/cifheader.py | 45 +++++++++++++++++++++++++++++-------- 2 files changed, 45 insertions(+), 9 deletions(-) diff --git a/prody/atomic/atomic.py b/prody/atomic/atomic.py index fcebf30a7..bc5153253 100644 --- a/prody/atomic/atomic.py +++ b/prody/atomic/atomic.py @@ -24,6 +24,15 @@ except: continue +CORE_AAMAP = AAMAP = { + 'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C', 'GLN': 'Q', + 'GLU': 'E', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K', + 'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S', 'THR': 'T', 'TRP': 'W', + 'TYR': 'Y', 'VAL': 'V' +} + +invAAMAP = dict((v, k) for k, v in CORE_AAMAP.items()) + AAMAP = { 'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C', 'GLN': 'Q', 'GLU': 'E', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K', diff --git a/prody/proteins/cifheader.py b/prody/proteins/cifheader.py index 9a31112da..34823b098 100644 --- a/prody/proteins/cifheader.py +++ b/prody/proteins/cifheader.py @@ -7,7 +7,8 @@ from prody import LOGGER from prody.atomic import flags, AAMAP -from prody.utilities import openFile, alignBioPairwise, GAP_PENALTY, GAP_EXT_PENALTY +from prody.atomic.atomic import invAAMAP +from prody.utilities import openFile, alignBioPairwise, GAP_EXT_PENALTY from .localpdb import fetchPDB from .header import (Chemical, Polymer, DBRef, _PDB_DBREF, @@ -57,7 +58,7 @@ def _natomsFromFormulaPart(part): return 1 return int("".join(digits)) -def parseCIFHeader(pdb, *keys): +def parseCIFHeader(pdb, *keys, **kwargs): """Returns header data dictionary for *pdb*. This function is equivalent to ``parsePDB(pdb, header=True, model=0, meta=False)``, likewise *pdb* may be an identifier or a filename. @@ -119,12 +120,12 @@ def parseCIFHeader(pdb, *keys): raise IOError('{0} is not a valid filename or a valid PDB ' 'identifier.'.format(pdb)) pdb = openFile(pdb, 'rt') - header = getCIFHeaderDict(pdb, *keys) + header = getCIFHeaderDict(pdb, *keys, **kwargs) pdb.close() return header -def getCIFHeaderDict(stream, *keys): +def getCIFHeaderDict(stream, *keys, **kwargs): """Returns header data in a dictionary. *stream* may be a list of PDB lines or a stream.""" @@ -139,11 +140,17 @@ def getCIFHeaderDict(stream, *keys): keys = list(keys) for k, key in enumerate(keys): if key in _PDB_HEADER_MAP: - value = _PDB_HEADER_MAP[key](lines) + if key == 'polymers': + value = _PDB_HEADER_MAP[key](lines, **kwargs) + else: + value = _PDB_HEADER_MAP[key](lines) keys[k] = value else: try: - value = _PDB_HEADER_MAP['others'](lines, key) + if key == 'polymers': + value = _PDB_HEADER_MAP[key](lines, **kwargs) + else: + value = _PDB_HEADER_MAP[key](lines) keys[k] = value except: raise KeyError('{0} is not a valid header data identifier' @@ -758,7 +765,7 @@ def _getReference(lines): return ref -def _getPolymers(lines): +def _getPolymers(lines, **kwargs): """Returns list of polymers (macromolecules).""" pdbid = _PDB_HEADER_MAP['identifier'](lines) @@ -777,8 +784,28 @@ def _getPolymers(lines): entities[entity].append(ch) poly = polymers.get(ch, Polymer(ch)) polymers[ch] = poly - poly.sequence += ''.join(item[ - '_entity_poly.pdbx_seq_one_letter_code_can'].replace(';', '').split()) + + threeLetter = kwargs.get('threeLetter', False) + if threeLetter: + poly.sequence += ''.join(item[ + '_entity_poly.pdbx_seq_one_letter_code'].replace(';', '').split()) + else: + poly.sequence += ''.join(item[ + '_entity_poly.pdbx_seq_one_letter_code_can'].replace(';', '').split()) + + if threeLetter: + for poly in polymers.values(): + seq = poly.sequence + resnames = [] + for item in seq.split('('): + if item.find(')') != -1: + resnames.append(item[:item.find(')')]) + letters = list(item[item.find(')')+1:]) + else: + letters = list(item) + resnames.extend([invAAMAP[letter] for letter in letters]) + + poly.sequence = ' '.join(resnames) # DBREF block 1 items2 = parseSTARSection(lines, '_struct_ref', report=False) From 0441b564eb6737fed818b27fdf131f83aa25c228 Mon Sep 17 00:00:00 2001 From: James Krieger Date: Tue, 10 Sep 2024 10:46:33 +0200 Subject: [PATCH 3/5] add to allres block --- prody/atomic/chain.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/prody/atomic/chain.py b/prody/atomic/chain.py index ea5f73518..0b0b17094 100644 --- a/prody/atomic/chain.py +++ b/prody/atomic/chain.py @@ -138,7 +138,10 @@ def getSequence(self, **kwargs): if kwargs.get('allres', False): get = AAMAP.get - seq = ''.join([get(res.getResname(), 'X') for res in self]) + if kwargs.get('threeLetter', False): + seq = ' '.join([res.getResname() for res in self]) + else: + seq = ''.join([get(res.getResname(), 'X') for res in self]) elif self._seq: seq = self._seq else: From 13850c3fc58f13589cf3d6e5cc1f30b9b6e07d86 Mon Sep 17 00:00:00 2001 From: James Krieger Date: Thu, 12 Sep 2024 14:47:38 +0200 Subject: [PATCH 4/5] add docs --- prody/atomic/atomic.py | 8 ++++---- prody/atomic/chain.py | 11 ++++++----- prody/proteins/cifheader.py | 25 ++++++++++++++++++------- prody/proteins/header.py | 14 ++++++++++---- prody/utilities/catchall.py | 2 +- 5 files changed, 39 insertions(+), 21 deletions(-) diff --git a/prody/atomic/atomic.py b/prody/atomic/atomic.py index bc5153253..1541c1993 100644 --- a/prody/atomic/atomic.py +++ b/prody/atomic/atomic.py @@ -251,22 +251,22 @@ def getTitle(self): return ag._title def getSequence(self, **kwargs): - """Returns one-letter sequence string for amino acids. + """Returns one-letter sequence string for amino acids, unless *longSeq* is **True**. When *allres* keyword argument is **True**, sequence will include all residues (e.g. water molecules) in the chain and **X** will be used for non-standard residue names.""" - threeLetter = kwargs.get('threeLetter', False) + longSeq = kwargs.get('longSeq', False) get = AAMAP.get if hasattr(self, 'getResnames'): - if threeLetter: + if longSeq: seq = ' '.join(self.getResnames()) else: seq = ''.join([get(res, 'X') for res in self.getResnames()]) else: res = self.getResname() - if threeLetter: + if longSeq: seq = res else: seq = get(res, 'X') diff --git a/prody/atomic/chain.py b/prody/atomic/chain.py index 0b0b17094..db74231c9 100644 --- a/prody/atomic/chain.py +++ b/prody/atomic/chain.py @@ -9,11 +9,12 @@ __all__ = ['Chain'] def getSequence(resnames, **kwargs): - """Returns polypeptide sequence as from list of *resnames* (residue - name abbreviations).""" + """Returns polypeptide sequence from a list of *resnames* using one-letter residue + name abbreviations by default, or long (usually three letter) abbrevations + if *longSeq* is **True**.""" - threeLetter = kwargs.get('threeLetter', False) - if threeLetter: + longSeq = kwargs.get('longSeq', False) + if longSeq: return ' '.join(resnames) get = AAMAP.get @@ -138,7 +139,7 @@ def getSequence(self, **kwargs): if kwargs.get('allres', False): get = AAMAP.get - if kwargs.get('threeLetter', False): + if kwargs.get('longSeq', False): seq = ' '.join([res.getResname() for res in self]) else: seq = ''.join([get(res.getResname(), 'X') for res in self]) diff --git a/prody/proteins/cifheader.py b/prody/proteins/cifheader.py index 34823b098..71c634486 100644 --- a/prody/proteins/cifheader.py +++ b/prody/proteins/cifheader.py @@ -127,7 +127,11 @@ def parseCIFHeader(pdb, *keys, **kwargs): def getCIFHeaderDict(stream, *keys, **kwargs): """Returns header data in a dictionary. *stream* may be a list of PDB lines - or a stream.""" + or a stream. + + Polymers have sequences that usually use one-letter residue name abbreviations by default. + To obtain long (usually three letter) abbrevations, set *longSeq* to **True**. + """ try: lines = stream.readlines() @@ -766,7 +770,10 @@ def _getReference(lines): def _getPolymers(lines, **kwargs): - """Returns list of polymers (macromolecules).""" + """Returns list of polymers (macromolecules). + + Sequence is usually one-letter abbreviations, but can be long + abbreviations (usually three letters) if *longSeq* is **True**""" pdbid = _PDB_HEADER_MAP['identifier'](lines) polymers = dict() @@ -785,15 +792,15 @@ def _getPolymers(lines, **kwargs): poly = polymers.get(ch, Polymer(ch)) polymers[ch] = poly - threeLetter = kwargs.get('threeLetter', False) - if threeLetter: + longSeq = kwargs.get('longSeq', False) + if longSeq: poly.sequence += ''.join(item[ '_entity_poly.pdbx_seq_one_letter_code'].replace(';', '').split()) else: poly.sequence += ''.join(item[ '_entity_poly.pdbx_seq_one_letter_code_can'].replace(';', '').split()) - if threeLetter: + if longSeq: for poly in polymers.values(): seq = poly.sequence resnames = [] @@ -1264,13 +1271,17 @@ def _getOther(lines, key=None): return data -def _getUnobservedSeq(lines): +def _getUnobservedSeq(lines, **kwargs): + """Get sequence of unobserved residues. + + This sequence is usually using one-letter residue name abbreviations by default. + To obtain long (usually three letter) abbrevations, set *longSeq* to **True**.""" key_unobs = '_pdbx_unobs_or_zero_occ_residues' try: unobs = parseSTARSection(lines, key_unobs, report=False) - polymers = _getPolymers(lines) + polymers = _getPolymers(lines, **kwargs) except: pass diff --git a/prody/proteins/header.py b/prody/proteins/header.py index b22e6483b..2de4e7123 100644 --- a/prody/proteins/header.py +++ b/prody/proteins/header.py @@ -304,7 +304,10 @@ def parsePDBHeader(pdb, *keys, **kwargs): def getHeaderDict(stream, *keys, **kwargs): """Returns header data in a dictionary. *stream* may be a list of PDB lines - or a stream.""" + or a stream. + + Polymers have sequences that usually use one-letter residue name abbreviations by default. + To obtain long (usually three letter) abbrevations, set *longSeq* to **True**.""" lines = defaultdict(list) loc = 0 @@ -559,7 +562,10 @@ def _getReference(lines): def _getPolymers(lines, **kwargs): - """Returns list of polymers (macromolecules).""" + """Returns list of polymers (macromolecules). + + Polymers have sequences that usually use one-letter residue name abbreviations by default. + To obtain long (usually three letter) abbrevations, set *longSeq* to **True**.""" pdbid = lines['pdbid'] polymers = dict() @@ -568,8 +574,8 @@ def _getPolymers(lines, **kwargs): poly = polymers.get(ch, Polymer(ch)) polymers[ch] = poly - threeLetter = kwargs.get('threeLetter', False) - if threeLetter: + longSeq = kwargs.get('longSeq', False) + if longSeq: if poly.sequence != '': poly.sequence += ' ' poly.sequence += getSequence(line[19:].split(), **kwargs) diff --git a/prody/utilities/catchall.py b/prody/utilities/catchall.py index 33d166dcc..d88aeee70 100644 --- a/prody/utilities/catchall.py +++ b/prody/utilities/catchall.py @@ -314,7 +314,7 @@ def calcTree(names, distance_matrix, method='upgma', linkage=False): method = method.lower().strip() - if method in ['ward', 'single', 'average', 'weighted', 'centroid', 'median']: + if method in ['ward', 'single', 'average', 'weighted', 'centroid', 'median', 'complete']: from scipy.cluster.hierarchy import linkage as hlinkage from scipy.spatial.distance import squareform From 43895d2b55fa237e2fdfc1979cecc4726fa99fd9 Mon Sep 17 00:00:00 2001 From: James Krieger Date: Thu, 12 Sep 2024 16:11:45 +0200 Subject: [PATCH 5/5] add more nonstandard to flags --- prody/atomic/flags.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/prody/atomic/flags.py b/prody/atomic/flags.py index cee3c466a..56d9c4701 100644 --- a/prody/atomic/flags.py +++ b/prody/atomic/flags.py @@ -67,6 +67,8 @@ NONSTANDARD = { 'ASX': set(['acyclic', 'surface', 'polar', 'medium']), 'GLX': set(['acyclic', 'surface', 'large', 'polar']), + 'ASH': set(['acyclic', 'acidic', 'surface', 'polar', 'medium']), + 'GLH': set(['acyclic', 'acidic', 'surface', 'large', 'polar']), 'CSO': set(['acyclic', 'neutral', 'surface', 'medium', 'polar']), 'CYX': set(['acyclic', 'neutral', 'buried', 'medium', 'polar']), 'HIP': set(['cyclic', 'basic', 'surface', 'large', 'polar']), @@ -75,6 +77,12 @@ 'HSD': set(['cyclic', 'basic', 'surface', 'large', 'polar']), 'HSE': set(['cyclic', 'basic', 'surface', 'large', 'polar']), 'HSP': set(['cyclic', 'acidic', 'surface', 'large', 'polar']), + 'HISD': set(['cyclic', 'basic', 'surface', 'large', 'polar']), + 'HISE': set(['cyclic', 'basic', 'surface', 'large', 'polar']), + 'HISP': set(['cyclic', 'acidic', 'surface', 'large', 'polar']), + 'LYN': set(['acyclic', 'neutral', 'surface', 'large', 'polar']), + 'TYM': set(['cyclic', 'aromatic', 'surface', 'basic', 'large', 'polar']), + 'ARN': set(['acyclic', 'neutral', 'surface', 'large', 'polar']), 'MSE': set(['acyclic', 'neutral', 'buried', 'large']), 'CME': set(['acyclic', 'neutral', 'buried', 'large']), 'SEC': set(['acyclic', 'neutral', 'buried', 'polar', 'medium']),