diff --git a/depthcharge/data/parsers.py b/depthcharge/data/parsers.py index 79ba27a..76b2f99 100644 --- a/depthcharge/data/parsers.py +++ b/depthcharge/data/parsers.py @@ -206,15 +206,15 @@ def iter_batches(self, batch_size: int | None) -> pa.RecordBatch: "intensity_array": parsed.intensity, } + # Parse custom fields: + entry.update(self.parse_custom_fields(spectrum)) + self._update_batch(entry) + except (IndexError, KeyError, ValueError) as exc: last_exc = exc n_skipped += 1 continue - # Parse custom fields: - entry.update(self.parse_custom_fields(spectrum)) - self._update_batch(entry) - # Update the batch: if len(self._batch["scan_id"]) == batch_size: yield self._yield_batch() @@ -225,8 +225,8 @@ def iter_batches(self, batch_size: int | None) -> pa.RecordBatch: if n_skipped: warnings.warn( - f"Skipped {n_skipped} spectra with invalid information." - f"Last error was: \n {str(last_exc)}" + f"Skipped {n_skipped} spectra with invalid information. " + f"Last error was:\n{str(last_exc)}" ) def _update_batch(self, entry: dict) -> None: diff --git a/tests/unit_tests/test_data/test_parsers.py b/tests/unit_tests/test_data/test_parsers.py index b730a77..b06f4eb 100644 --- a/tests/unit_tests/test_data/test_parsers.py +++ b/tests/unit_tests/test_data/test_parsers.py @@ -195,15 +195,36 @@ def test_custom_fields(mgf_small): expected = pl.Series("seq", ["LESLIEK", "EDITHR"]) assert_series_equal(parsed["seq"], expected) - with pytest.raises(KeyError): - pl.from_arrow( - MgfParser( - mgf_small, - custom_fields=CustomField( - "seq", lambda x: x["params"]["bar"], pa.string() - ), - ).iter_batches(None) - ) + # Test that spectra with invalid custom fields are skipped. + # We don't like the amino acid "D". + def seq_no_d(spec): + if "D" in (seq := spec["params"]["seq"]): + raise ValueError(f"Invalid sequence: {seq}") + return seq + + parsed = pl.from_arrow( + MgfParser( + mgf_small, + custom_fields=CustomField("seq", seq_no_d, pa.string()), + ).iter_batches(None) + ) + + assert len(parsed) == 1 + assert_series_equal(parsed["seq"], pl.Series("seq", ["LESLIEK"])) + + # Invalid custom fields will cause all spectra to get skipped. + parser = MgfParser( + mgf_small, + custom_fields=CustomField( + "seq", lambda x: x["params"]["bar"], pa.string() + ), + ) + + with pytest.warns( + UserWarning, match=r"Skipped 2 spectra with invalid information.*" + ): + spectra = list(parser.iter_batches(None)) + assert len(spectra) == 0 def test_invalid_file(tmp_path):