Skip to content

Commit

Permalink
Merge pull request #317 from debbiemarkslab/pdb_and_setup_fixes
Browse files Browse the repository at this point in the history
Pdb and setup fixes
  • Loading branch information
thomashopf authored Nov 5, 2024
2 parents 7568749 + f272939 commit fd0572e
Show file tree
Hide file tree
Showing 8 changed files with 103 additions and 153 deletions.
10 changes: 5 additions & 5 deletions .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,12 @@ jobs:
conda info -a
conda create -q -n test-environment python=${{ matrix.python-version }} numpy scipy numba pandas matplotlib
source activate test-environment
- name: Run setup.py
- name: Install Hatch
uses: pypa/hatch@install
- name: Build and install package
run: |
pip install build
python setup.py sdist --formats=zip -k
python -m build
find ./dist -iname "*.zip" -print0 | xargs -0 pip install
hatch build
find ./dist -iname "*.tar.gz" -print0 | xargs -0 pip install
pip install codecov
- name: Download test files
run: |
Expand Down
18 changes: 7 additions & 11 deletions .github/workflows/build_test_and_push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,12 @@ jobs:
conda info -a
conda create -q -n test-environment python=${{ matrix.python-version }} numpy scipy numba pandas matplotlib
source activate test-environment
- name: Run setup.py
- name: Install Hatch
uses: pypa/hatch@install
- name: Build and install package
run: |
python setup.py sdist --formats=zip -k
find ./dist -iname "*.zip" -print0 | xargs -0 pip install
hatch build
find ./dist -iname "*.tar.gz" -print0 | xargs -0 pip install
pip install codecov
- name: Download test files
run: |
Expand All @@ -42,14 +44,8 @@ jobs:
with:
run: coverage run -m unittest discover -s test -p "Test*.py"
working-directory: ./ #optional
- name: Publish evcouplings to test PyPI
if: startsWith(github.ref, 'refs/tags')
uses: pypa/gh-action-pypi-publish@master
with:
password: ${{ secrets.PYPI_ACCESS_TOKEN_TEST }}
repository_url: https://test.pypi.org/legacy/
- name: Publish evcouplings to PyPI
if: startsWith(github.ref, 'refs/tags')
uses: pypa/gh-action-pypi-publish@master
uses: pypa/gh-action-pypi-publish@release/v1  # NOTE(review): ref garbled by page scraping ("…@install.com"); release/v1 is the documented pinned ref — confirm against the actual workflow file
with:
user: __token__
password: ${{ secrets.PYPI_ACCESS_TOKEN }}
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ __pycache__
*.ipynb_checkpoints*
notebooks_dev/*
evcouplings.egg-info/*
/dist/
3 changes: 0 additions & 3 deletions MANIFEST.in

This file was deleted.

38 changes: 26 additions & 12 deletions evcouplings/compare/pdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,9 @@ def __init__(self, filehandle, keep_full_data=False):
"_atom_site.pdbx_formal_charge": "charge",
}

HELIX_TARGET_COLS = {
# full list of conf types: https://mmcif.wwpdb.org/dictionaries/mmcif_ma.dic/Items/_struct_conf_type.id.html;
# mapping between file types: https://manpages.debian.org/unstable/dssp/mkdssp.1.en.html
CONF_TARGET_COLS = {
"_struct_conf.conf_type_id": "conformation_type",
"_struct_conf.id": "id",
# label_asym_id and label_seq_id are sufficient for merging to atom table;
Expand Down Expand Up @@ -508,11 +510,15 @@ def __init__(self, filehandle, keep_full_data=False):
# decode information into dataframe with BioPython helper method; note this section may not be
# present if no helices exist in the structure
try:
self.helix_table = pd.DataFrame({
name: _decode(data[source_column]) for source_column, name in HELIX_TARGET_COLS.items()
})
self.conf_table = pd.DataFrame({
name: _decode(data[source_column]) for source_column, name in CONF_TARGET_COLS.items()
}).query(
# there are a handful of PDB entries that have (probably wrong) secondary structure assignments
# extending over more than one segment (e.g. 2bp7, 2wjv), drop these rather than raising an error
"beg_label_asym_id == end_label_asym_id"
)
except KeyError:
self.helix_table = None
self.conf_table = None

# decode information into dataframe with BioPython helper method; note this section may not be
# present if no sheets exist in the structure
Expand All @@ -526,16 +532,23 @@ def __init__(self, filehandle, keep_full_data=False):
# create secondary structure table for merging to chain tables
# (will only contain helix/H and strand/E, coil/C will need to be filled in)
sse_raw = []
for sse_type, sse_table in [
("H", self.helix_table),
("E", self.sheet_table)
for sse_type, sse_table, sse_filter in [
("H", self.conf_table, "HELX"),
("E", self.sheet_table, None),
# also retrieve beta strands/bridges from conf_table if available
("E", self.conf_table, "STRN"),
]:
# skip if secondary structure element not present in PDB file at all
if sse_table is None:
continue

# filter table down to relevant entries for current secondary structure type
if sse_filter is not None:
sse_table = sse_table.query(
f"conformation_type.str.startswith('{sse_filter}')"
)

for _, row in sse_table.iterrows():
assert row.beg_label_asym_id == row.end_label_asym_id
for seq_id in range(row.beg_label_seq_id, row.end_label_seq_id + 1):
sse_raw.append({
"label_asym_id": row.beg_label_asym_id,
Expand Down Expand Up @@ -694,7 +707,7 @@ def get_chain(self, chain, model=0, is_author_id=True):
# create coordinate ID from author residue ID + insertion code
# (this should be unique and circumvents issues from 0 seqres values if selecting based on author chain ID)
coord_id=lambda df: df.auth_seq_id.astype(str) + df.insertion_code,
seqres_id=lambda df: df.label_seq_id.astype(str).replace("0", np.nan),
seqres_id=lambda df: df.label_seq_id.astype(str).replace("0", pd.NA).replace("", pd.NA),
one_letter_code=lambda df: df.label_comp_id.map(AA3_to_AA1, na_action="ignore"),
# note that MSE will now be labeled as HETATM, which was not the case with MMTF
hetatm=lambda df: df.record_type == "HETATM",
Expand All @@ -720,12 +733,13 @@ def get_chain(self, chain, model=0, is_author_id=True):
how="left"
)
else:
# initialize to pd.NA instead of np.nan or warning about assigning str to float64 column appears
res_sse = res.assign(
sec_struct_3state=np.nan
sec_struct_3state=pd.NA
)

res_sse.loc[
res_sse.sec_struct_3state.isnull() & (res_sse.label_seq_id > 0),
res_sse.sec_struct_3state.isnull() & res_sse.seqres_id.notnull(),
"sec_struct_3state"
] = "C"

Expand Down
64 changes: 64 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Build configuration for the evcouplings package (PEP 621 metadata, hatchling backend).
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "evcouplings"
# The version is read dynamically from evcouplings/__init__.py via the
# [tool.hatch.version] table below. Hatchling refuses to build when a static
# `version` field is set alongside `tool.hatch.version`, so the field must be
# declared dynamic here instead of hard-coding "0.2.1".
dynamic = ["version"]
description = "A Framework for evolutionary couplings analysis"
readme = "README.md"
license = "MIT"
authors = [
    { name = "Thomas Hopf", email = "[email protected]" },
]
keywords = [
    "analysis",
    "couplings",
    "evolutionary",
]
classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Topic :: Scientific/Engineering :: Bio-Informatics",
]
# Runtime dependencies; versions are left open except where a floor/ceiling is
# known to matter (biopython API, ruamel.yaml 0.18 breaking changes).
dependencies = [
    "billiard",
    "biopython>=1.84",
    "bokeh",
    "click",
    "filelock",
    "jinja2",
    "matplotlib",
    "msgpack",
    "numba",
    "numpy",
    "pandas",
    "psutil",
    "requests",
    "ruamel.yaml<0.18",
    "scikit-learn",
    "scipy",
    "seaborn",
    "setuptools>=18.2",
]

# Console entry points exposed on install.
[project.scripts]
evcouplings = "evcouplings.utils.app:app"
evcouplings_dbupdate = "evcouplings.utils.update_database:app"
evcouplings_runcfg = "evcouplings.utils.pipeline:app"
evcouplings_summarize = "evcouplings.utils.summarize:app"

[project.urls]
Homepage = "https://github.com/debbiemarkslab/EVcouplings"

# Single source of truth for the package version (__version__ attribute).
[tool.hatch.version]
path = "evcouplings/__init__.py"

# Limit the sdist to the package tree itself.
[tool.hatch.build.targets.sdist]
include = [
    "/evcouplings",
]
17 changes: 0 additions & 17 deletions requirements.txt

This file was deleted.

105 changes: 0 additions & 105 deletions setup.py

This file was deleted.

0 comments on commit fd0572e

Please sign in to comment.