Return non sanitised mol if the sanitisation fails (#24)

* added example for scaffold identification * added examples of fragments * added image of codeine * added SanitisationResult model * return original mol object if sanitisation fails * use SanitisationResult object * updated fragments and scaffolds examples * bumped version * added sanitisation cleanup flag * linting and formatting * fixed the version of rdkit * corrected typo --------- Co-authored-by: “roshan” <“[email protected]”>
PDBeurope · Oct 30, 2023 · 5754952 · 5754952
1 parent 3c15c4e
commit 5754952
Show file tree

Hide file tree

Showing 12 changed files with 714 additions and 23 deletions.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -20,7 +20,7 @@ jobs:
           python-version: "3.10"
 
       - run: |
-          pip install rdkit
+          pip install rdkit==2023.3.3
           pip install -e ".[tests]"
           pip install pre-commit
           pre-commit install && pre-commit run --all

diff --git a/doc/_static/codeine.svg b/doc/_static/codeine.svg
diff --git a/doc/_static/fragment_example.svg b/doc/_static/fragment_example.svg
diff --git a/doc/_static/scaffold_example.svg b/doc/_static/scaffold_example.svg
diff --git a/doc/guide/fragments.md b/doc/guide/fragments.md
@@ -14,26 +14,44 @@ Alternativelly fragments can be supplied in an external library (*.tsv) provided
 | phenanthrene | SMARTS | [#6]1:[#6]:[#6]:[#6]2:[#6](:[#6]:1):[#6]:[#6]:[#6]1:[#6]:2:[#6]:[#6]:[#6]:[#6]:1 | | unchecked | | PDBe |
 
 
-## Basic use case
+## Identifying fragments of a chemical component
 
 ```python
 from pdbeccdutils.core import ccd_reader
 from pdbeccdutils.core.fragment_library import FragmentLibrary
 
-component = ccd_reader.read_pdb_cif_file('HEM.cif').component
+component = ccd_reader.read_pdb_cif_file('HEL.cif').component
 fragment_library = FragmentLibrary()
 
-matches = component.library_search(library)
+matches = component.library_search(fragment_library)
 print(f'Matches found in the fragment library {matches}.')
 
-for fragment in component.fragments:
-    print(f'Fragment name {fragment.name} from source {fragment.source}')
+fragment_mols = [Chem.MolFromSmiles(fragment.smiles) for fragment in component.fragments]
+img = Draw.MolsToGridImage(fragment_mols, legends = [fragment.name for fragment in component.fragments])
+img
 ```
+<img src='../_static/fragment_example.svg' style="display:block; margin-bottom:5px" />  
 
+
+## Identifying all chemical components with penicillin fragment
+
+```python
+fragment_library = FragmentLibrary()
+ccd_dict = ccd_reader.read_pdb_components_file('components.cif')
+ccd_with_penicillin_fragment = []
+for ccd_id in ccd_dict.keys():
+    component = ccd_dict[ccd_id].component
+    frag_matches = component.library_search(fragment_library)
+    for fragment in component.fragments:
+        if fragment.name == 'penicillin':
+            ccd_with_penicillin_fragment.append(ccd_id)
+
+ccd_with_penicillin_fragment
+
+['0RN', 'AIC', 'APV', 'CXN', 'HEL', 'IP1', 'MII', 'NFN', 'PN1', 'PNN', 'PNV', 'SOX', 'TAZ', 'WPP', 'X1E']
+```
 ## PDBe supplied fragments
 
 Below you can find the actual fragment structures that come with the pdbeccdutils `FragmentLibrary`, sourced from the PDBe resource:
 
-<div align='center'>
-    <img src='../_static/pdbe_fragments.svg' />  
-</div>
+<img src='../_static/pdbe_fragments.svg' style="display:block"/>
diff --git a/doc/guide/scaffolds.md b/doc/guide/scaffolds.md
@@ -7,7 +7,17 @@
 
 ```python
 from pdbeccdutils.core import ccd_reader
+from pdbeccdutils.core.models import ScaffoldingMethod
+from rdkit.Chem import Draw
 
-component = ccd_reader.read_pdb_cif_file('HEM.cif').component
-component.get_scaffolds()
+component = ccd_reader.read_pdb_cif_file('CVV.cif').component
+scaffolds = component.get_scaffolds(scaffolding_method=ScaffoldingMethod.Brics)
+img = Draw.MolsToGridImage(scaffolds, legends = [f"scaffold {i}" for i in range(1, len(scaffolds)+1)])
+img
 ```
+<p float="left">
+    <img src="../_static/scaffold_example.svg" width="49%" />
+    <img src="../_static/codeine.svg" width="49%" />
+</p>
+
+The figure shows the scaffolds identified by pdbeccdutils using the BRICS fragmentation rule for the chemical component CVV when bound to the human kappa opioid receptor (PDB entry 6b73). Interestingly, scaffold 3 is an exact match to the scaffold of Codeine (ChEMBL485), a known analgesic that targets various opioid receptors, and its biological activity is well-documented in ChEMBL. Although the PDB does not contain the structure of Codeine, the shared scaffold between Codeine and CCD component CVV suggests that Codeine may interact with the human kappa opioid receptor in a similar manner to CVV.
diff --git a/pdbeccdutils/__init__.py b/pdbeccdutils/__init__.py
@@ -1 +1 @@
-__version__ = "0.8.3"
+__version__ = "0.8.4"
diff --git a/pdbeccdutils/core/ccd_reader.py b/pdbeccdutils/core/ccd_reader.py
@@ -169,7 +169,8 @@ def _parse_pdb_mmcif(cif_block, sanitize=True):
     _handle_implicit_hydrogens(mol)
 
     if sanitize:
-        sanitized = mol_tools.sanitize(mol)
+        sanitized_result = mol_tools.sanitize(mol)
+        mol, sanitized = sanitized_result.mol, sanitized_result.status
 
     descriptors = _parse_pdb_descriptors(
         cif_block, "_pdbx_chem_comp_descriptor.", "descriptor"

diff --git a/pdbeccdutils/core/clc_reader.py b/pdbeccdutils/core/clc_reader.py
@@ -112,7 +112,8 @@ def infer_multiple_chem_comp(path_to_cif, bm, bm_id, sanitize=True):
     (mol, warnings, errors) = _parse_pdb_mmcif(cif_block, bm.graph)
     sanitized = False
     if sanitize:
-        sanitized = mol_tools.sanitize(mol)
+        sanitized_result = mol_tools.sanitize(mol)
+        mol, sanitized = sanitized_result.mol, sanitized_result.status
 
     inchi_result = mol_tools.inchi_from_mol(mol)
     if inchi_result.warnings:

diff --git a/pdbeccdutils/core/models.py b/pdbeccdutils/core/models.py
@@ -149,6 +149,19 @@ class DepictionResult(NamedTuple):
     score: float
 
 
+class SanitisationResult(NamedTuple):
+    """
+    Sanitisation result details.
+
+    Args:
+        mol: rdkit.Chem.rdchem.RWMol
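+        status: Whether sanitisation succeeded. `mol_tools.sanitize` passes False on failure, so this is used as a boolean flag despite the `str` annotation.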
+    """
+
+    mol: Chem.rdchem.Mol
+    status: str
+
+
 class Descriptor(NamedTuple):
     """
     Descriptor obtained from the cif file. This is essentially

diff --git a/pdbeccdutils/core/prd_reader.py b/pdbeccdutils/core/prd_reader.py
@@ -137,7 +137,8 @@ def _parse_pdb_mmcif(cif_block, sanitize=True):
     ccd_reader._handle_implicit_hydrogens(mol)
 
     if sanitize:
-        sanitized = mol_tools.sanitize(mol)
+        sanitized_result = mol_tools.sanitize(mol)
+        mol, sanitized = sanitized_result.mol, sanitized_result.status
 
     descriptors = ccd_reader._parse_pdb_descriptors(
         cif_block, "_pdbx_chem_comp_descriptor.", "descriptor"

diff --git a/pdbeccdutils/helpers/mol_tools.py b/pdbeccdutils/helpers/mol_tools.py
@@ -22,7 +22,12 @@
 import re
 import sys
 from io import StringIO
-from pdbeccdutils.core.models import InChIFromRDKit, MolFromRDKit, ConformerType
+from pdbeccdutils.core.models import (
+    InChIFromRDKit,
+    MolFromRDKit,
+    ConformerType,
+    SanitisationResult,
+)
 from contextlib import redirect_stderr
 
 import numpy as np
@@ -79,12 +84,16 @@ def sanitize(rwmol):
     success = False
 
     try:
-        success = fix_molecule(rwmol)
+        mol_copy = rdkit.Chem.RWMol(rwmol)
+        success = fix_molecule(mol_copy)
 
         if not success:
-            return False
+            rdkit.Chem.SanitizeMol(
+                rwmol, sanitizeOps=rdkit.Chem.SanitizeFlags.SANITIZE_CLEANUP
+            )
+            return SanitisationResult(mol=rwmol, status=False)
 
-        rdkit.Chem.Kekulize(rwmol)
+        rdkit.Chem.Kekulize(mol_copy)
         # rdkit.Chem.rdmolops.AssignAtomChiralTagsFromStructure(rwmol, confId=0)
 
         # find correct conformer to assign stereochemistry
@@ -93,7 +102,7 @@ def sanitize(rwmol):
         conformer_id = -1
         conformer_types = [ConformerType.Ideal, ConformerType.Model]
         for conf_type in conformer_types:
-            conformer = get_conformer(rwmol, conf_type)
+            conformer = get_conformer(mol_copy, conf_type)
             if not is_degenerate_conformer(conformer):
                 conformer_id = conformer.GetId()
 
@@ -103,13 +112,16 @@ def sanitize(rwmol):
         # else:
         #     conformer_id = conformers[0].GetId()
 
-        rdkit.Chem.rdmolops.AssignStereochemistryFrom3D(rwmol, conformer_id)
+        rdkit.Chem.rdmolops.AssignStereochemistryFrom3D(mol_copy, conformer_id)
 
     except Exception as e:
         print(e, file=sys.stderr)
-        return False
+        rdkit.Chem.SanitizeMol(
+            rwmol, sanitizeOps=rdkit.Chem.SanitizeFlags.SANITIZE_CLEANUP
+        )
+        return SanitisationResult(mol=rwmol, status=False)
 
-    return success
+    return SanitisationResult(mol=mol_copy, status=success)
 
 
 def get_conformer(rwmol, c_type):