diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 751660293..df540aed4 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -49,7 +49,6 @@ jobs: pip install --user typing # needed for Python 2 in some cases pip install --user pytest - pip install --user nose pip install --user dm-tree h5py typing pip install --user --progress-bar=off "numpy<2" @@ -138,9 +137,6 @@ jobs: pip install --user pytest - # Install nose anyway because we currently use some of its helper functions. - pip install --user nose - pip install --user --progress-bar=off -r requirements.txt - name: Test Python/Numpy/TF versions. @@ -225,9 +221,6 @@ jobs: pip install --user pytest - # Install nose anyway because we currently use some of its helper functions. - pip install --user nose - if [[ "${{matrix.tf-version}}" == 2.[0123].* || "${{matrix.tf-version}}" == 1.* ]]; then # Older TF needs older NumPy version. # https://github.com/rwth-i6/returnn/pull/1160#issuecomment-1284537803 @@ -309,9 +302,6 @@ jobs: pip install --user pytest - # Install nose anyway because we currently use some of its helper functions. - pip install --user nose - pip install --user dm-tree h5py typing pip install --user --progress-bar=off "numpy<2" pip install --user --progress-bar=off scipy # for some tests @@ -400,9 +390,6 @@ jobs: pip install --user pytest - # Install nose anyway because we currently use some of its helper functions. - pip install --user nose - pip install --user dm-tree h5py typing pip install --user --progress-bar=off "numpy<2" pip install --user --progress-bar=off scipy # for some tests diff --git a/docs/advanced/test_suite.rst b/docs/advanced/test_suite.rst index b7e9a63de..56581c7ee 100644 --- a/docs/advanced/test_suite.rst +++ b/docs/advanced/test_suite.rst @@ -10,7 +10,7 @@ by GitHub Actions defined `here `__. 
-We use nosetests but the tests can also be run manually like:: +We use pytest but the tests can also be run manually like:: python3 tests/test_TFEngine.py diff --git a/docs/getting_started/installation.rst b/docs/getting_started/installation.rst index b70ab4f4e..94552f17a 100644 --- a/docs/getting_started/installation.rst +++ b/docs/getting_started/installation.rst @@ -21,7 +21,7 @@ and for PyTorch, use :code:`pip install torch torchaudio`. For some specific datasets or special layers, additional dependencies might be needed, such as ``librosa``. -For running the tests, you need ``pytest`` and ``nose``. +For running the tests, you need ``pytest``. You can also install RETURNN as a framework, via ``pip`` (`PyPI entry `__), like:: diff --git a/docs/requirements.txt b/docs/requirements.txt index 60efc2462..087abce84 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,7 +1,6 @@ numpy<2 scipy h5py -nose pytest dm-tree tensorflow==2.12.1 diff --git a/requirements-dev b/requirements-dev index 5adc880f5..fbbe5ed50 100644 --- a/requirements-dev +++ b/requirements-dev @@ -1,3 +1,2 @@ black==22.3.0 -nose pytest diff --git a/tests/test_Config.py b/tests/test_Config.py index 221c73102..f350095a3 100644 --- a/tests/test_Config.py +++ b/tests/test_Config.py @@ -1,7 +1,6 @@ import sys import _setup_test_env # noqa import unittest -from nose.tools import assert_equal, assert_is_instance, assert_in, assert_greater, assert_true, assert_false from pprint import pprint from returnn.config import * from returnn.util import better_exchook @@ -21,16 +20,16 @@ def test_old_format(): ) ) - assert_true(config.has("num_inputs")) - assert_true(config.has("hidden_type")) - assert_equal(config.int("num_inputs", -1), 3) - assert_equal(config.value("hidden_type", "x"), "forward,lstm") - assert_equal(config.value("hidden_type", "x", index=0), "forward") - assert_equal(config.value("hidden_type", "x", index=1), "lstm") - assert_equal(config.list("hidden_type", ["x"]), ["forward", 
"lstm"]) + assert config.has("num_inputs") is True + assert config.has("hidden_type") is True + assert config.int("num_inputs", -1) == 3 + assert config.value("hidden_type", "x") == "forward,lstm" + assert config.value("hidden_type", "x", index=0) == "forward" + assert config.value("hidden_type", "x", index=1) == "lstm" + assert config.list("hidden_type", ["x"]) == ["forward", "lstm"] - assert_false(config.is_typed("num_inputs")) - assert_false(config.is_typed("hidden_type")) + assert config.is_typed("num_inputs") is False + assert config.is_typed("hidden_type") is False def test_json_format(): @@ -47,19 +46,19 @@ def test_json_format(): ) ) - assert_true(config.has("num_inputs")) - assert_true(config.has("hidden_type")) - assert_equal(config.int("num_inputs", -1), 3) - assert_equal(config.value("hidden_type", "x"), "forward,lstm") - assert_equal(config.value("hidden_type", "x", index=0), "forward") - assert_equal(config.value("hidden_type", "x", index=1), "lstm") - assert_equal(config.list("hidden_type", ["x"]), ["forward", "lstm"]) + assert config.has("num_inputs") is True + assert config.has("hidden_type") is True + assert config.int("num_inputs", -1) == 3 + assert config.value("hidden_type", "x") == "forward,lstm" + assert config.value("hidden_type", "x", index=0) == "forward" + assert config.value("hidden_type", "x", index=1) == "lstm" + assert config.list("hidden_type", ["x"]) == ["forward", "lstm"] - assert_true(config.is_typed("num_inputs")) - assert_true(config.is_typed("hidden_type")) - assert_is_instance(config.typed_value("num_inputs"), int) - assert_is_instance(config.typed_value("hidden_type"), list) - assert_equal(config.typed_value("hidden_type"), ["forward", "lstm"]) + assert config.is_typed("num_inputs") is True + assert config.is_typed("hidden_type") is True + assert isinstance(config.typed_value("num_inputs"), int) + assert isinstance(config.typed_value("hidden_type"), list) + assert config.typed_value("hidden_type") == ["forward", "lstm"] def 
test_py_config(): @@ -74,19 +73,19 @@ def test_py_config(): ) ) - assert_true(config.has("num_inputs")) - assert_true(config.has("hidden_type")) - assert_equal(config.int("num_inputs", -1), 3) - assert_equal(config.value("hidden_type", "x"), "forward,lstm") - assert_equal(config.value("hidden_type", "x", index=0), "forward") - assert_equal(config.value("hidden_type", "x", index=1), "lstm") - assert_equal(config.list("hidden_type", ["x"]), ["forward", "lstm"]) + assert config.has("num_inputs") is True + assert config.has("hidden_type") is True + assert config.int("num_inputs", -1) == 3 + assert config.value("hidden_type", "x") == "forward,lstm" + assert config.value("hidden_type", "x", index=0) == "forward" + assert config.value("hidden_type", "x", index=1) == "lstm" + assert config.list("hidden_type", ["x"]) == ["forward", "lstm"] - assert_true(config.is_typed("num_inputs")) - assert_true(config.is_typed("hidden_type")) - assert_is_instance(config.typed_value("num_inputs"), int) - assert_is_instance(config.typed_value("hidden_type"), list) - assert_equal(config.typed_value("hidden_type"), ["forward", "lstm"]) + assert config.is_typed("num_inputs") is True + assert config.is_typed("hidden_type") is True + assert isinstance(config.typed_value("num_inputs"), int) + assert isinstance(config.typed_value("hidden_type"), list) + assert config.typed_value("hidden_type") == ["forward", "lstm"] def test_rnn_init_config_py_global_var(): @@ -115,9 +114,9 @@ def test_func(): assert rnn.config.has("task") assert rnn.config.has("test_value") assert rnn.config.has("test_func") - assert_equal(rnn.config.value("task", None), "search") + assert rnn.config.value("task", None) == "search" assert rnn.config.is_typed("test_value") - assert_equal(rnn.config.typed_value("test_value"), 42) + assert rnn.config.typed_value("test_value") == 42 assert rnn.config.is_typed("test_func") # So far it's fine. # Now something a bit strange. 
@@ -125,7 +124,7 @@ def test_func(): assert rnn.config.is_typed("task") test_func = rnn.config.typed_dict["test_func"] assert callable(test_func) - assert_equal(test_func(), "search") + assert test_func() == "search" def test_rnn_init_config_py_cmd_type(): @@ -152,7 +151,7 @@ def test_func(): assert rnn.config.is_typed("test_func") test_func = rnn.config.typed_dict["test_func"] assert callable(test_func) - assert_equal(test_func(), 0) + assert test_func() == 0 def test_config_py_ext(): @@ -176,9 +175,9 @@ def test_func(): assert config.is_typed("test_func") test_func = config.typed_dict["test_func"] assert callable(test_func) - assert_equal(test_func(), "train") + assert test_func() == "train" config.set("task", "search") - assert_equal(test_func(), "search") + assert test_func() == "search" def test_config_py_old_returnn_imports(): diff --git a/tests/test_Dataset.py b/tests/test_Dataset.py index ed6986708..de6626f68 100644 --- a/tests/test_Dataset.py +++ b/tests/test_Dataset.py @@ -10,7 +10,6 @@ import numpy import tempfile import contextlib -from nose.tools import assert_equal, assert_is_instance, assert_in, assert_not_in, assert_true, assert_false from returnn.datasets.generating import Task12AXDataset, DummyDataset, DummyDatasetMultipleSequenceLength from returnn.engine.batch import Batch from returnn.datasets.basic import Dataset, DatasetSeq, init_dataset @@ -116,9 +115,9 @@ def test_iterate_seqs_no_chunking_1(): dataset.chunk_size = 0 dataset.init_seq_order(1) seqs = list(dataset.iterate_seqs()) - assert_equal(len(seqs), 2) - assert_equal(seqs[0], (0, 0, 11)) # seq-idx, start-frame, end-frame - assert_equal(seqs[1], (1, 0, 11)) + assert len(seqs) == 2 + assert seqs[0] == (0, 0, 11) # seq-idx, start-frame, end-frame + assert seqs[1] == (1, 0, 11) def test_iterate_seqs_chunking_1(): @@ -129,13 +128,13 @@ def test_iterate_seqs_chunking_1(): seqs = list(dataset.iterate_seqs()) for s in seqs: print(s) - assert_equal(len(seqs), 6) - assert_equal(seqs[0], (0, 0, 
10)) # seq-idx, start-frame, end-frame - assert_equal(seqs[1], (0, 5, 11)) - assert_equal(seqs[2], (0, 10, 11)) - assert_equal(seqs[3], (1, 0, 10)) - assert_equal(seqs[4], (1, 5, 11)) - assert_equal(seqs[5], (1, 10, 11)) + assert len(seqs) == 6 + assert seqs[0] == (0, 0, 10) # seq-idx, start-frame, end-frame + assert seqs[1] == (0, 5, 11) + assert seqs[2] == (0, 10, 11) + assert seqs[3] == (1, 0, 10) + assert seqs[4] == (1, 5, 11) + assert seqs[5] == (1, 10, 11) def test_iterate_seqs_chunking_varying_sequence_length(): @@ -148,15 +147,15 @@ def test_iterate_seqs_chunking_varying_sequence_length(): seqs = list(dataset.iterate_seqs()) for s in seqs: print(s) - assert_equal(len(seqs), 8) - assert_equal(seqs[0], (0, NumbersDict({"data": 0, "classes": 0}), NumbersDict({"data": 12, "classes": 6}))) - assert_equal(seqs[1], (0, NumbersDict({"data": 6, "classes": 3}), NumbersDict({"data": 18, "classes": 9}))) - assert_equal(seqs[2], (0, NumbersDict({"data": 12, "classes": 6}), NumbersDict({"data": 24, "classes": 12}))) - assert_equal(seqs[3], (0, NumbersDict({"data": 18, "classes": 9}), NumbersDict({"data": 24, "classes": 12}))) - assert_equal(seqs[4], (1, NumbersDict({"data": 0, "classes": 0}), NumbersDict({"data": 12, "classes": 6}))) - assert_equal(seqs[5], (1, NumbersDict({"data": 6, "classes": 3}), NumbersDict({"data": 18, "classes": 9}))) - assert_equal(seqs[6], (1, NumbersDict({"data": 12, "classes": 6}), NumbersDict({"data": 24, "classes": 12}))) - assert_equal(seqs[7], (1, NumbersDict({"data": 18, "classes": 9}), NumbersDict({"data": 24, "classes": 12}))) + assert len(seqs) == 8 + assert seqs[0] == (0, NumbersDict({"data": 0, "classes": 0}), NumbersDict({"data": 12, "classes": 6})) + assert seqs[1] == (0, NumbersDict({"data": 6, "classes": 3}), NumbersDict({"data": 18, "classes": 9})) + assert seqs[2] == (0, NumbersDict({"data": 12, "classes": 6}), NumbersDict({"data": 24, "classes": 12})) + assert seqs[3] == (0, NumbersDict({"data": 18, "classes": 9}), 
NumbersDict({"data": 24, "classes": 12})) + assert seqs[4] == (1, NumbersDict({"data": 0, "classes": 0}), NumbersDict({"data": 12, "classes": 6})) + assert seqs[5] == (1, NumbersDict({"data": 6, "classes": 3}), NumbersDict({"data": 18, "classes": 9})) + assert seqs[6] == (1, NumbersDict({"data": 12, "classes": 6}), NumbersDict({"data": 24, "classes": 12})) + assert seqs[7] == (1, NumbersDict({"data": 18, "classes": 9}), NumbersDict({"data": 24, "classes": 12})) def test_iterate_seqs_custom_chunking(): @@ -183,13 +182,13 @@ def _custom_chunking_func(dataset, seq_idx_start, **_kwargs): seqs = list(dataset.iterate_seqs()) for s in seqs: print(s) - assert_equal(len(seqs), 6) - assert_equal(seqs[0], (0, 0, 10)) # seq-idx, start-frame, end-frame - assert_equal(seqs[1], (0, 5, 11)) - assert_equal(seqs[2], (0, 10, 11)) - assert_equal(seqs[3], (1, 0, 10)) - assert_equal(seqs[4], (1, 5, 11)) - assert_equal(seqs[5], (1, 10, 11)) + assert len(seqs) == 6 + assert seqs[0] == (0, 0, 10) # seq-idx, start-frame, end-frame + assert seqs[1] == (0, 5, 11) + assert seqs[2] == (0, 10, 11) + assert seqs[3] == (1, 0, 10) + assert seqs[4] == (1, 5, 11) + assert seqs[5] == (1, 10, 11) def test_batches_recurrent_1(): @@ -202,7 +201,7 @@ def test_batches_recurrent_1(): " :type: list[Batch] " while batch_gen.has_more(): (batch,) = batch_gen.peek_next_n(1) - assert_is_instance(batch, Batch) + assert isinstance(batch, Batch) print("batch:", batch) print("batch seqs:", batch.seqs) all_batches.append(batch) @@ -211,47 +210,47 @@ def test_batches_recurrent_1(): # Each batch will have 1 batch-slice (max_seqs) and up to 10 frames (chunk_size). # For each seq, we get 3 chunks (chunk_step 5 for 11 frames). # Thus, 6 batches. 
- assert_equal(len(all_batches), 6) - - assert_equal(all_batches[0].start_seq, 0) - assert_equal(all_batches[0].end_seq, 1) # exclusive - assert_equal(len(all_batches[0].seqs), 1) # 1 BatchSeqCopyPart - assert_equal(all_batches[0].seqs[0].seq_idx, 0) - assert_equal(all_batches[0].seqs[0].seq_start_frame, 0) - assert_equal(all_batches[0].seqs[0].seq_end_frame, 10) - assert_equal(all_batches[0].seqs[0].frame_length, 10) - assert_equal(all_batches[0].seqs[0].batch_slice, 0) - assert_equal(all_batches[0].seqs[0].batch_frame_offset, 0) - - assert_equal(all_batches[1].start_seq, 0) - assert_equal(all_batches[1].end_seq, 1) # exclusive - assert_equal(len(all_batches[1].seqs), 1) # 1 BatchSeqCopyPart - assert_equal(all_batches[1].seqs[0].seq_idx, 0) - assert_equal(all_batches[1].seqs[0].seq_start_frame, 5) - assert_equal(all_batches[1].seqs[0].seq_end_frame, 11) - assert_equal(all_batches[1].seqs[0].frame_length, 6) - assert_equal(all_batches[1].seqs[0].batch_slice, 0) - assert_equal(all_batches[1].seqs[0].batch_frame_offset, 0) - - assert_equal(all_batches[2].start_seq, 0) - assert_equal(all_batches[2].end_seq, 1) # exclusive - assert_equal(len(all_batches[2].seqs), 1) # 1 BatchSeqCopyPart - assert_equal(all_batches[2].seqs[0].seq_idx, 0) - assert_equal(all_batches[2].seqs[0].seq_start_frame, 10) - assert_equal(all_batches[2].seqs[0].seq_end_frame, 11) - assert_equal(all_batches[2].seqs[0].frame_length, 1) - assert_equal(all_batches[2].seqs[0].batch_slice, 0) - assert_equal(all_batches[2].seqs[0].batch_frame_offset, 0) - - assert_equal(all_batches[3].start_seq, 1) - assert_equal(all_batches[3].end_seq, 2) # exclusive - assert_equal(len(all_batches[3].seqs), 1) # 1 BatchSeqCopyPart - assert_equal(all_batches[3].seqs[0].seq_idx, 1) - assert_equal(all_batches[3].seqs[0].seq_start_frame, 0) - assert_equal(all_batches[3].seqs[0].seq_end_frame, 10) - assert_equal(all_batches[3].seqs[0].frame_length, 10) - assert_equal(all_batches[3].seqs[0].batch_slice, 0) - 
assert_equal(all_batches[3].seqs[0].batch_frame_offset, 0) + assert len(all_batches) == 6 + + assert all_batches[0].start_seq == 0 + assert all_batches[0].end_seq == 1 # exclusive + assert len(all_batches[0].seqs) == 1 # 1 BatchSeqCopyPart + assert all_batches[0].seqs[0].seq_idx == 0 + assert all_batches[0].seqs[0].seq_start_frame == 0 + assert all_batches[0].seqs[0].seq_end_frame == 10 + assert all_batches[0].seqs[0].frame_length == 10 + assert all_batches[0].seqs[0].batch_slice == 0 + assert all_batches[0].seqs[0].batch_frame_offset == 0 + + assert all_batches[1].start_seq == 0 + assert all_batches[1].end_seq == 1 # exclusive + assert len(all_batches[1].seqs) == 1 # 1 BatchSeqCopyPart + assert all_batches[1].seqs[0].seq_idx == 0 + assert all_batches[1].seqs[0].seq_start_frame == 5 + assert all_batches[1].seqs[0].seq_end_frame == 11 + assert all_batches[1].seqs[0].frame_length == 6 + assert all_batches[1].seqs[0].batch_slice == 0 + assert all_batches[1].seqs[0].batch_frame_offset == 0 + + assert all_batches[2].start_seq == 0 + assert all_batches[2].end_seq == 1 # exclusive + assert len(all_batches[2].seqs) == 1 # 1 BatchSeqCopyPart + assert all_batches[2].seqs[0].seq_idx == 0 + assert all_batches[2].seqs[0].seq_start_frame == 10 + assert all_batches[2].seqs[0].seq_end_frame == 11 + assert all_batches[2].seqs[0].frame_length == 1 + assert all_batches[2].seqs[0].batch_slice == 0 + assert all_batches[2].seqs[0].batch_frame_offset == 0 + + assert all_batches[3].start_seq == 1 + assert all_batches[3].end_seq == 2 # exclusive + assert len(all_batches[3].seqs) == 1 # 1 BatchSeqCopyPart + assert all_batches[3].seqs[0].seq_idx == 1 + assert all_batches[3].seqs[0].seq_start_frame == 0 + assert all_batches[3].seqs[0].seq_end_frame == 10 + assert all_batches[3].seqs[0].frame_length == 10 + assert all_batches[3].seqs[0].batch_slice == 0 + assert all_batches[3].seqs[0].batch_frame_offset == 0 # ... 
@@ -263,7 +262,7 @@ def test_batches_non_recurrent_1(): all_batches = [] # type: list[Batch] while batch_gen.has_more(): (batch,) = batch_gen.peek_next_n(1) - assert_is_instance(batch, Batch) + assert isinstance(batch, Batch) print("batch:", batch) print("batch seqs:", batch.seqs) all_batches.append(batch) @@ -271,63 +270,63 @@ def test_batches_non_recurrent_1(): # Each batch will have 5 frames (batch_size), not more, i.e. a single seq. # There are 2 * 11 frames in total, so 5 batches, because we concat the 2 seqs, in the non-recurrent case. - assert_equal(len(all_batches), 5) - - assert_equal(all_batches[0].start_seq, 0) - assert_equal(all_batches[0].end_seq, 1) # exclusive - assert_equal(len(all_batches[0].seqs), 1) # 1 BatchSeqCopyPart - assert_equal(all_batches[0].seqs[0].seq_idx, 0) - assert_equal(all_batches[0].seqs[0].seq_start_frame, 0) - assert_equal(all_batches[0].seqs[0].seq_end_frame, 5) - assert_equal(all_batches[0].seqs[0].frame_length, 5) - assert_equal(all_batches[0].seqs[0].batch_slice, 0) - assert_equal(all_batches[0].seqs[0].batch_frame_offset, 0) - - assert_equal(all_batches[1].start_seq, 0) - assert_equal(all_batches[1].end_seq, 1) # exclusive - assert_equal(len(all_batches[1].seqs), 1) # 1 BatchSeqCopyPart - assert_equal(all_batches[1].seqs[0].seq_idx, 0) - assert_equal(all_batches[1].seqs[0].seq_start_frame, 5) - assert_equal(all_batches[1].seqs[0].seq_end_frame, 10) - assert_equal(all_batches[1].seqs[0].frame_length, 5) - assert_equal(all_batches[1].seqs[0].batch_slice, 0) - assert_equal(all_batches[1].seqs[0].batch_frame_offset, 0) - - assert_equal(all_batches[2].start_seq, 0) - assert_equal(all_batches[2].end_seq, 2) # exclusive. 
now both seq 0 and 1 - assert_equal(len(all_batches[2].seqs), 2) # two copies, BatchSeqCopyPart - assert_equal(all_batches[2].seqs[0].seq_idx, 0) - assert_equal(all_batches[2].seqs[0].seq_start_frame, 10) - assert_equal(all_batches[2].seqs[0].seq_end_frame, 11) - assert_equal(all_batches[2].seqs[0].frame_length, 1) - assert_equal(all_batches[2].seqs[0].batch_slice, 0) - assert_equal(all_batches[2].seqs[0].batch_frame_offset, 0) - assert_equal(all_batches[2].seqs[1].seq_idx, 1) - assert_equal(all_batches[2].seqs[1].seq_start_frame, 0) - assert_equal(all_batches[2].seqs[1].seq_end_frame, 4) - assert_equal(all_batches[2].seqs[1].frame_length, 4) - assert_equal(all_batches[2].seqs[1].batch_slice, 0) - assert_equal(all_batches[2].seqs[1].batch_frame_offset, 1) - - assert_equal(all_batches[3].start_seq, 1) - assert_equal(all_batches[3].end_seq, 2) # exclusive - assert_equal(len(all_batches[3].seqs), 1) # 1 BatchSeqCopyPart - assert_equal(all_batches[3].seqs[0].seq_idx, 1) - assert_equal(all_batches[3].seqs[0].seq_start_frame, 4) - assert_equal(all_batches[3].seqs[0].seq_end_frame, 9) - assert_equal(all_batches[3].seqs[0].frame_length, 5) - assert_equal(all_batches[3].seqs[0].batch_slice, 0) - assert_equal(all_batches[3].seqs[0].batch_frame_offset, 0) - - assert_equal(all_batches[4].start_seq, 1) - assert_equal(all_batches[4].end_seq, 2) # exclusive - assert_equal(len(all_batches[4].seqs), 1) # 1 BatchSeqCopyPart - assert_equal(all_batches[4].seqs[0].seq_idx, 1) - assert_equal(all_batches[4].seqs[0].seq_start_frame, 9) - assert_equal(all_batches[4].seqs[0].seq_end_frame, 11) - assert_equal(all_batches[4].seqs[0].frame_length, 2) - assert_equal(all_batches[4].seqs[0].batch_slice, 0) - assert_equal(all_batches[4].seqs[0].batch_frame_offset, 0) + assert len(all_batches) == 5 + + assert all_batches[0].start_seq == 0 + assert all_batches[0].end_seq == 1 # exclusive + assert len(all_batches[0].seqs) == 1 # 1 BatchSeqCopyPart + assert all_batches[0].seqs[0].seq_idx == 0 + assert 
all_batches[0].seqs[0].seq_start_frame == 0 + assert all_batches[0].seqs[0].seq_end_frame == 5 + assert all_batches[0].seqs[0].frame_length == 5 + assert all_batches[0].seqs[0].batch_slice == 0 + assert all_batches[0].seqs[0].batch_frame_offset == 0 + + assert all_batches[1].start_seq == 0 + assert all_batches[1].end_seq == 1 # exclusive + assert len(all_batches[1].seqs) == 1 # 1 BatchSeqCopyPart + assert all_batches[1].seqs[0].seq_idx == 0 + assert all_batches[1].seqs[0].seq_start_frame == 5 + assert all_batches[1].seqs[0].seq_end_frame == 10 + assert all_batches[1].seqs[0].frame_length == 5 + assert all_batches[1].seqs[0].batch_slice == 0 + assert all_batches[1].seqs[0].batch_frame_offset == 0 + + assert all_batches[2].start_seq == 0 + assert all_batches[2].end_seq == 2 # exclusive. now both seq 0 and 1 + assert len(all_batches[2].seqs) == 2 # two copies, BatchSeqCopyPart + assert all_batches[2].seqs[0].seq_idx == 0 + assert all_batches[2].seqs[0].seq_start_frame == 10 + assert all_batches[2].seqs[0].seq_end_frame == 11 + assert all_batches[2].seqs[0].frame_length == 1 + assert all_batches[2].seqs[0].batch_slice == 0 + assert all_batches[2].seqs[0].batch_frame_offset == 0 + assert all_batches[2].seqs[1].seq_idx == 1 + assert all_batches[2].seqs[1].seq_start_frame == 0 + assert all_batches[2].seqs[1].seq_end_frame == 4 + assert all_batches[2].seqs[1].frame_length == 4 + assert all_batches[2].seqs[1].batch_slice == 0 + assert all_batches[2].seqs[1].batch_frame_offset == 1 + + assert all_batches[3].start_seq == 1 + assert all_batches[3].end_seq == 2 # exclusive + assert len(all_batches[3].seqs) == 1 # 1 BatchSeqCopyPart + assert all_batches[3].seqs[0].seq_idx == 1 + assert all_batches[3].seqs[0].seq_start_frame == 4 + assert all_batches[3].seqs[0].seq_end_frame == 9 + assert all_batches[3].seqs[0].frame_length == 5 + assert all_batches[3].seqs[0].batch_slice == 0 + assert all_batches[3].seqs[0].batch_frame_offset == 0 + + assert all_batches[4].start_seq == 1 + 
assert all_batches[4].end_seq == 2 # exclusive + assert len(all_batches[4].seqs) == 1 # 1 BatchSeqCopyPart + assert all_batches[4].seqs[0].seq_idx == 1 + assert all_batches[4].seqs[0].seq_start_frame == 9 + assert all_batches[4].seqs[0].seq_end_frame == 11 + assert all_batches[4].seqs[0].frame_length == 2 + assert all_batches[4].seqs[0].batch_slice == 0 + assert all_batches[4].seqs[0].batch_frame_offset == 0 def test_batches_context_window(): @@ -344,7 +343,7 @@ def test_batches_context_window(): all_batches = [] # type: list[Batch] while batch_gen.has_more(): (batch,) = batch_gen.peek_next_n(1) - assert_is_instance(batch, Batch) + assert isinstance(batch, Batch) print("batch:", batch) print("batch seqs:", batch.seqs) all_batches.append(batch) @@ -353,50 +352,50 @@ def test_batches_context_window(): # Each batch will have 1 batch-slice (max_seqs) and up to 10 frames (chunk_size). # For each seq, we get 3 chunks (chunk_step 5 for 11 frames). # Thus, 3 batches. - assert_equal(len(all_batches), 3) + assert len(all_batches) == 3 b0, b1, b2 = all_batches assert isinstance(b0, Batch) assert isinstance(b1, Batch) assert isinstance(b2, Batch) - assert_equal(b0.start_seq, 0) - assert_equal(b0.end_seq, 1) # exclusive - assert_equal(len(b0.seqs), 1) # 1 BatchSeqCopyPart - assert_equal(b0.seqs[0].seq_idx, 0) - assert_equal(b0.seqs[0].seq_start_frame["classes"], 0) - assert_equal(b0.seqs[0].seq_end_frame["classes"], 5) - assert_equal(b0.seqs[0].frame_length["classes"], 5) - assert_equal(b0.seqs[0].seq_start_frame["data"], 0 - ctx_left) - assert_equal(b0.seqs[0].seq_end_frame["data"], 5 + ctx_right) - assert_equal(b0.seqs[0].frame_length["data"], 5 + ctx_lr) - assert_equal(b0.seqs[0].batch_slice, 0) - assert_equal(b0.seqs[0].batch_frame_offset, 0) - - assert_equal(b1.start_seq, 0) - assert_equal(b1.end_seq, 1) # exclusive - assert_equal(len(b1.seqs), 1) # 1 BatchSeqCopyPart - assert_equal(b1.seqs[0].seq_idx, 0) - assert_equal(b1.seqs[0].seq_start_frame["classes"], 5) - 
assert_equal(b1.seqs[0].seq_end_frame["classes"], 10) - assert_equal(b1.seqs[0].frame_length["classes"], 5) - assert_equal(b1.seqs[0].seq_start_frame["data"], 5 - ctx_left) - assert_equal(b1.seqs[0].seq_end_frame["data"], 10 + ctx_right) - assert_equal(b1.seqs[0].frame_length["data"], 5 + ctx_lr) - assert_equal(b1.seqs[0].batch_slice, 0) - assert_equal(b1.seqs[0].batch_frame_offset, 0) - - assert_equal(b2.start_seq, 0) - assert_equal(b2.end_seq, 1) # exclusive - assert_equal(len(b2.seqs), 1) # 1 BatchSeqCopyPart - assert_equal(b2.seqs[0].seq_idx, 0) - assert_equal(b2.seqs[0].seq_start_frame["classes"], 10) - assert_equal(b2.seqs[0].seq_end_frame["classes"], 11) - assert_equal(b2.seqs[0].frame_length["classes"], 1) - assert_equal(b2.seqs[0].seq_start_frame["data"], 10 - ctx_left) - assert_equal(b2.seqs[0].seq_end_frame["data"], 11 + ctx_right) - assert_equal(b2.seqs[0].frame_length["data"], 1 + ctx_lr) - assert_equal(b2.seqs[0].batch_slice, 0) - assert_equal(b2.seqs[0].batch_frame_offset, 0) + assert b0.start_seq == 0 + assert b0.end_seq == 1 # exclusive + assert len(b0.seqs) == 1 # 1 BatchSeqCopyPart + assert b0.seqs[0].seq_idx == 0 + assert b0.seqs[0].seq_start_frame["classes"] == 0 + assert b0.seqs[0].seq_end_frame["classes"] == 5 + assert b0.seqs[0].frame_length["classes"] == 5 + assert b0.seqs[0].seq_start_frame["data"] == 0 - ctx_left + assert b0.seqs[0].seq_end_frame["data"] == 5 + ctx_right + assert b0.seqs[0].frame_length["data"] == 5 + ctx_lr + assert b0.seqs[0].batch_slice == 0 + assert b0.seqs[0].batch_frame_offset == 0 + + assert b1.start_seq == 0 + assert b1.end_seq == 1 # exclusive + assert len(b1.seqs) == 1 # 1 BatchSeqCopyPart + assert b1.seqs[0].seq_idx == 0 + assert b1.seqs[0].seq_start_frame["classes"] == 5 + assert b1.seqs[0].seq_end_frame["classes"] == 10 + assert b1.seqs[0].frame_length["classes"] == 5 + assert b1.seqs[0].seq_start_frame["data"] == 5 - ctx_left + assert b1.seqs[0].seq_end_frame["data"] == 10 + ctx_right + assert 
b1.seqs[0].frame_length["data"] == 5 + ctx_lr + assert b1.seqs[0].batch_slice == 0 + assert b1.seqs[0].batch_frame_offset == 0 + + assert b2.start_seq == 0 + assert b2.end_seq == 1 # exclusive + assert len(b2.seqs) == 1 # 1 BatchSeqCopyPart + assert b2.seqs[0].seq_idx == 0 + assert b2.seqs[0].seq_start_frame["classes"] == 10 + assert b2.seqs[0].seq_end_frame["classes"] == 11 + assert b2.seqs[0].frame_length["classes"] == 1 + assert b2.seqs[0].seq_start_frame["data"] == 10 - ctx_left + assert b2.seqs[0].seq_end_frame["data"] == 11 + ctx_right + assert b2.seqs[0].frame_length["data"] == 1 + ctx_lr + assert b2.seqs[0].batch_slice == 0 + assert b2.seqs[0].batch_frame_offset == 0 def test_task12ax_window(): @@ -413,13 +412,13 @@ def test_task12ax_window(): dataset2.init_seq_order(epoch=1) dataset1.load_seqs(0, 1) dataset2.load_seqs(0, 1) - assert_equal(dataset1.get_data_dim("data"), input_dim) - assert_equal(dataset2.get_data_dim("data"), input_dim * window) + assert dataset1.get_data_dim("data") == input_dim + assert dataset2.get_data_dim("data") == input_dim * window data1 = dataset1.get_data(0, "data") data2 = dataset2.get_data(0, "data") seq_len = data1.shape[0] - assert_equal(data1.shape, (seq_len, input_dim)) - assert_equal(data2.shape, (seq_len, window * input_dim)) + assert data1.shape == (seq_len, input_dim) + assert data2.shape == (seq_len, window * input_dim) data2a = data2.reshape(seq_len, window, input_dim) print("data1:") print(data1) @@ -431,13 +430,13 @@ def test_task12ax_window(): print(data2[0]) print("data2a[0,0]:") print(data2a[0, 0]) - assert_equal(list(data2a[0, 0]), [0] * input_dim) # zero-padded left - assert_equal(list(data2a[0, 1]), list(data1[0])) - assert_equal(list(data2a[0, 2]), list(data1[1])) - assert_equal(list(data2a[1, 0]), list(data1[0])) - assert_equal(list(data2a[1, 1]), list(data1[1])) - assert_equal(list(data2a[1, 2]), list(data1[2])) - assert_equal(list(data2a[-1, 2]), [0] * input_dim) # zero-padded right + assert list(data2a[0, 
0]) == [0] * input_dim # zero-padded left + assert list(data2a[0, 1]) == list(data1[0]) + assert list(data2a[0, 2]) == list(data1[1]) + assert list(data2a[1, 0]) == list(data1[0]) + assert list(data2a[1, 1]) == list(data1[1]) + assert list(data2a[1, 2]) == list(data1[2]) + assert list(data2a[-1, 2]) == [0] * input_dim # zero-padded right def test_get_seq_order(): diff --git a/tests/test_GeneratingDataset.py b/tests/test_GeneratingDataset.py index 3ebe5470c..8753b6678 100644 --- a/tests/test_GeneratingDataset.py +++ b/tests/test_GeneratingDataset.py @@ -4,7 +4,6 @@ import _setup_test_env # noqa import unittest -from nose.tools import assert_equal, assert_is_instance, assert_in, assert_not_in, assert_true, assert_false from returnn.datasets.generating import * from returnn.datasets.basic import DatasetSeq from returnn.util.basic import PY3, unicode @@ -20,9 +19,9 @@ def test_init(): dataset = DummyDataset(input_dim=2, output_dim=3, num_seqs=4) - assert_equal(dataset.num_inputs, 2) - assert_equal(dataset.num_outputs, {"classes": (3, 1), "data": (2, 2)}) - assert_equal(dataset.num_seqs, 4) + assert dataset.num_inputs == 2 + assert dataset.num_outputs == {"classes": (3, 1), "data": (2, 2)} + assert dataset.num_seqs == 4 def test_load_seqs(): @@ -60,12 +59,12 @@ def test_StaticDataset_custom_keys(): dataset = StaticDataset([{"source": numpy.array([1, 2, 3]), "target": numpy.array([3, 4, 5, 6, 7])}]) dataset.init_seq_order(epoch=1) assert dataset.num_seqs == 1 - assert_equal(dataset.get_data_keys(), ["source", "target"]) - assert_equal(dataset.num_outputs["source"][1], 1) - assert_equal(dataset.num_outputs["target"][1], 1) + assert dataset.get_data_keys() == ["source", "target"] + assert dataset.num_outputs["source"][1] == 1 + assert dataset.num_outputs["target"][1] == 1 dataset.load_seqs(0, 1) - assert_equal(list(dataset.get_data(0, "source")), [1, 2, 3]) - assert_equal(list(dataset.get_data(0, "target")), [3, 4, 5, 6, 7]) + assert list(dataset.get_data(0, "source")) == 
[1, 2, 3] + assert list(dataset.get_data(0, "target")) == [3, 4, 5, 6, 7] def test_StaticDataset_custom_keys_with_dims(): @@ -75,12 +74,12 @@ def test_StaticDataset_custom_keys_with_dims(): ) dataset.init_seq_order(epoch=1) assert dataset.num_seqs == 1 - assert_equal(dataset.get_data_keys(), ["source", "target"]) - assert_equal(dataset.num_outputs["source"][1], 1) - assert_equal(dataset.num_outputs["target"][1], 1) + assert dataset.get_data_keys() == ["source", "target"] + assert dataset.num_outputs["source"][1] == 1 + assert dataset.num_outputs["target"][1] == 1 dataset.load_seqs(0, 1) - assert_equal(list(dataset.get_data(0, "source")), [1, 2, 3]) - assert_equal(list(dataset.get_data(0, "target")), [3, 4, 5, 6, 7]) + assert list(dataset.get_data(0, "source")) == [1, 2, 3] + assert list(dataset.get_data(0, "target")) == [3, 4, 5, 6, 7] def test_StaticDataset_utf8(): @@ -96,18 +95,18 @@ def test_StaticDataset_utf8(): print("utf8 byte list:", s_byte_list) assert len(s_byte_list) == 4 > 3 raw = numpy.array(s_byte_list, dtype="uint8") - assert_equal(raw.tolist(), [119, 195, 171, 114]) + assert raw.tolist() == [119, 195, 171, 114] data = StaticDataset([{"data": raw}], output_dim={"data": (255, 1)}) if "data" not in data.labels: data.labels["data"] = [chr(i) for i in range(255)] # like in SprintDataset data.init_seq_order(epoch=1) data.load_seqs(0, 1) raw_ = data.get_data(seq_idx=0, key="data") - assert_equal(raw.tolist(), raw_.tolist()) + assert raw.tolist() == raw_.tolist() assert data.can_serialize_data(key="data") s_serialized = data.serialize_data(key="data", data=raw) print("serialized:", s_serialized, "repr:", repr(s_serialized), "type:", type(s_serialized)) - assert_equal(s, s_serialized) + assert s == s_serialized # might be moved to a separate test_MetaDataset ... 
@@ -135,9 +134,9 @@ def test_ConcatSeqsDataset(): concat_num_seqs = num_seqs + num_seqs // 2 dataset.load_seqs(0, concat_num_seqs) assert dataset.num_seqs == concat_num_seqs == 3 - assert_equal(dataset.get_data(0, "data").tolist(), [1, 2, 3]) - assert_equal(dataset.get_data(1, "data").tolist(), [1, 2, 3]) - assert_equal(dataset.get_data(2, "data").tolist(), [1, 2, 3, 1, 2, 3]) + assert dataset.get_data(0, "data").tolist() == [1, 2, 3] + assert dataset.get_data(1, "data").tolist() == [1, 2, 3] + assert dataset.get_data(2, "data").tolist() == [1, 2, 3, 1, 2, 3] # might be moved to a separate test_MetaDataset ... @@ -166,9 +165,9 @@ def test_ConcatSeqsDataset_repeat_in_between_last_frame_up_to_multiple_of(): dataset.init_seq_order(epoch=1) dataset.load_seqs(0, concat_num_seqs) assert dataset.num_seqs == concat_num_seqs == 3 - assert_equal(dataset.get_data(0, "data").tolist(), [1, 2]) - assert_equal(dataset.get_data(1, "data").tolist(), [1, 2, 3]) - assert_equal(dataset.get_data(2, "data").tolist(), [1, 2, 2, 2, 2, 1, 2, 3, 3, 3, 1, 2, 3]) + assert dataset.get_data(0, "data").tolist() == [1, 2] + assert dataset.get_data(1, "data").tolist() == [1, 2, 3] + assert dataset.get_data(2, "data").tolist() == [1, 2, 2, 2, 2, 1, 2, 3, 3, 3, 1, 2, 3] def test_BytePairEncoding_unicode(): @@ -177,13 +176,13 @@ def test_BytePairEncoding_unicode(): vocab_file="%s/bpe-unicode-demo.vocab" % my_dir, unknown_label="", ) - assert_equal(bpe.num_labels, 189) - assert_equal(bpe.id_to_label(5), "z") - assert_equal(bpe.label_to_id("z"), 5) - assert_equal(bpe.bpe._bpe_codes[("n", "d")], 1) - assert_equal(bpe.id_to_label(6), "å") - assert_equal(bpe.label_to_id("å"), 6) - assert_equal(bpe.bpe._bpe_codes[("à", "nd")], 2) + assert bpe.num_labels == 189 + assert bpe.id_to_label(5) == "z" + assert bpe.label_to_id("z") == 5 + assert bpe.bpe._bpe_codes[("n", "d")] == 1 + assert bpe.id_to_label(6) == "å" + assert bpe.label_to_id("å") == 6 + assert bpe.bpe._bpe_codes[("à", "nd")] == 2 def 
get_bpe_seq(text): """ @@ -195,12 +194,12 @@ def get_bpe_seq(text): print("%r -> %r" % (text, res)) return res - assert_equal(get_bpe_seq("kod"), "k@@ o@@ d") # str - assert_equal(get_bpe_seq("kod"), "k@@ o@@ d") # unicode - assert_equal(get_bpe_seq("råt"), "råt") - assert_equal( - get_bpe_seq("råt råt iz ďër iz ďër ám àn iz ďër ë låk ë kod áv dres wër yù wêk dù ďë àsk"), - "råt råt iz ďër iz ďër ám à@@ n iz ďër ë låk ë k@@ o@@ d áv d@@ r@@ e@@ s w@@ ër yù w@@ ê@@ k dù ďë à@@ s@@ k", + assert get_bpe_seq("kod") == "k@@ o@@ d" # str + assert get_bpe_seq("kod") == "k@@ o@@ d" # unicode + assert get_bpe_seq("råt") == "råt" + assert ( + get_bpe_seq("råt råt iz ďër iz ďër ám àn iz ďër ë låk ë kod áv dres wër yù wêk dù ďë àsk") + == "råt råt iz ďër iz ďër ám à@@ n iz ďër ë låk ë k@@ o@@ d áv d@@ r@@ e@@ s w@@ ër yù w@@ ê@@ k dù ďë à@@ s@@ k" ) diff --git a/tests/test_HDFDataset.py b/tests/test_HDFDataset.py index a042b6cc6..4595daa49 100644 --- a/tests/test_HDFDataset.py +++ b/tests/test_HDFDataset.py @@ -11,10 +11,6 @@ from returnn.datasets import Dataset from returnn.datasets.hdf import * -from nose.tools import assert_equal -from nose.tools import assert_not_equal -from nose.tools import assert_raises -from nose.tools import raises import returnn.util.basic as util import h5py import numpy as np @@ -31,9 +27,9 @@ def test_HDFDataset_init(): This method tests initialization of the HDFDataset class """ toy_dataset = HDFDataset() - assert_equal(toy_dataset.file_start, [0], "self.file_start init problem, should be [0]") - assert_equal(toy_dataset.files, [], "self.files init problem, should be []") - assert_equal(toy_dataset.file_seq_start, [], "self.file_seq_start init problem, should be []") + assert toy_dataset.file_start == [0], "self.file_start init problem, should be [0]" + assert toy_dataset.files == [], "self.files init problem, should be []" + assert toy_dataset.file_seq_start == [], "self.file_seq_start init problem, should be []" return toy_dataset @@ -204,10 
+200,10 @@ def test_hdf_dump_not_frame_synced(): assert hdf_reader.num_seqs == orig_reader.num_seqs == num_seqs for seq_idx in range(num_seqs): # Not synced, i.e. different lengths: - assert_not_equal(orig_reader.seq_lens[seq_idx]["data"], orig_reader.seq_lens[seq_idx]["classes"]) + assert orig_reader.seq_lens[seq_idx]["data"] != orig_reader.seq_lens[seq_idx]["classes"] for key in orig_reader.data_keys: - assert_equal(hdf_reader.seq_lens[seq_idx][key], orig_reader.seq_lens[seq_idx][key]) - assert_equal(hdf_reader.data[key][seq_idx].tolist(), orig_reader.data[key][seq_idx].tolist()) + assert hdf_reader.seq_lens[seq_idx][key] == orig_reader.seq_lens[seq_idx][key] + assert hdf_reader.data[key][seq_idx].tolist() == orig_reader.data[key][seq_idx].tolist() def test_HDFDataset_partition_epoch(): @@ -227,10 +223,10 @@ def test_HDFDataset_partition_epoch(): assert hdf_reader.num_seqs == orig_reader.num_seqs == num_seqs for seq_idx in range(num_seqs): # Not synced, i.e. different lengths: - assert_not_equal(orig_reader.seq_lens[seq_idx]["data"], orig_reader.seq_lens[seq_idx]["classes"]) + assert orig_reader.seq_lens[seq_idx]["data"] != orig_reader.seq_lens[seq_idx]["classes"] for key in orig_reader.data_keys: - assert_equal(hdf_reader.seq_lens[seq_idx][key], orig_reader.seq_lens[seq_idx][key]) - assert_equal(hdf_reader.data[key][seq_idx].tolist(), orig_reader.data[key][seq_idx].tolist()) + assert hdf_reader.seq_lens[seq_idx][key] == orig_reader.seq_lens[seq_idx][key] + assert hdf_reader.data[key][seq_idx].tolist() == orig_reader.data[key][seq_idx].tolist() def test_SimpleHDFWriter(): @@ -266,7 +262,7 @@ def test_SimpleHDFWriter(): for i, seq_len in enumerate(seq_lens): assert reader.seq_lens[i]["data"] == seq_len print("tags:", reader.seq_tags) - assert_equal(reader.seq_tags, ["seq-%i" % i for i in range(reader.num_seqs)]) + assert reader.seq_tags == ["seq-%i" % i for i in range(reader.num_seqs)] assert isinstance(reader.seq_tags[0], str) @@ -353,7 +349,7 @@ def 
test_SimpleHDFWriter_empty_extra(): b = reader.data["test-extra"][i][k] assert numpy.allclose(a, b), f"i={i}" - assert_equal(reader.seq_tags, ["seq-%i" % i for i in range(reader.num_seqs)]) + assert reader.seq_tags == ["seq-%i" % i for i in range(reader.num_seqs)] assert isinstance(reader.seq_tags[0], str) @@ -404,7 +400,7 @@ def test_read_simple_hdf(): reader.read_all() print("tags:", reader.seq_tags) assert len(seq_lens) == reader.num_seqs - assert_equal(reader.seq_tags, ["seq-0", "seq-1"]) + assert reader.seq_tags == ["seq-0", "seq-1"] for i, seq_len in enumerate(seq_lens): assert reader.seq_lens[i]["data"] == seq_len assert "data" in reader.data_keys # "classes" might be in there as well, although not really correct/existing diff --git a/tests/test_LearningRateControl.py b/tests/test_LearningRateControl.py index 2c902c64a..a1b59e470 100644 --- a/tests/test_LearningRateControl.py +++ b/tests/test_LearningRateControl.py @@ -4,7 +4,6 @@ import _setup_test_env # noqa from returnn.config import Config from returnn.learning_rate_control import * -from nose.tools import assert_equal import numpy import unittest @@ -98,7 +97,7 @@ def test_init_error_old(): assert "train_score" in error assert "dev_score" in error assert "dev_error" in error - assert_equal(lrc.get_error_key(1), "dev_score") + assert lrc.get_error_key(1) == "dev_score" lrc.get_learning_rate_for_epoch(2) lrc.set_epoch_error(2, {"train_score": 1.8}) lrc.set_epoch_error(2, {"dev_score": 1.9, "dev_error": 0.5}) @@ -117,7 +116,7 @@ def test_init_error_new(): assert "train_score" in error assert "dev_score" in error assert "dev_error" in error - assert_equal(lrc.get_error_key(1), "dev_score") + assert lrc.get_error_key(1) == "dev_score" lrc.get_learning_rate_for_epoch(2) lrc.set_epoch_error(2, {"train_score": {"cost:output": 1.8}}) lrc.set_epoch_error(2, {"dev_score": {"cost:output": 1.9}, "dev_error": {"error:output": 0.5}}) @@ -142,7 +141,7 @@ def test_init_error_muliple_out(): assert "dev_score_out2" in 
error assert "dev_error_output" in error assert "dev_error_out2" in error - assert_equal(lrc.get_error_key(1), "dev_score_output") + assert lrc.get_error_key(1) == "dev_score_output" lrc.get_learning_rate_for_epoch(2) lrc.set_epoch_error(2, {"train_score": {"cost:output": 1.8, "cost:out2": 2.8}}) lrc.set_epoch_error( @@ -157,15 +156,15 @@ def test_newbob(): config.update({"learning_rate_control": "newbob", "learning_rate": lr}) lrc = load_learning_rate_control_from_config(config) assert isinstance(lrc, NewbobRelative) - assert_equal(lrc.get_learning_rate_for_epoch(1), lr) + assert lrc.get_learning_rate_for_epoch(1) == lr lrc.set_epoch_error(1, {"train_score": {"cost:output": 1.9344199658230012}}) lrc.set_epoch_error(1, {"dev_score": {"cost:output": 1.99}, "dev_error": {"error:output": 0.6}}) error = lrc.get_epoch_error_dict(1) assert "train_score" in error assert "dev_score" in error assert "dev_error" in error - assert_equal(lrc.get_error_key(1), "dev_score") - assert_equal(lrc.get_learning_rate_for_epoch(2), lr) # epoch 2 cannot be a different lr yet + assert lrc.get_error_key(1) == "dev_score" + assert lrc.get_learning_rate_for_epoch(2) == lr # epoch 2 cannot be a different lr yet lrc.set_epoch_error(2, {"train_score": {"cost:output": 1.8}}) lrc.set_epoch_error(2, {"dev_score": {"cost:output": 1.9}, "dev_error": {"error:output": 0.5}}) lrc.get_learning_rate_for_epoch(3) @@ -185,7 +184,7 @@ def test_newbob_multi_epoch(): ) lrc = load_learning_rate_control_from_config(config) assert isinstance(lrc, NewbobMultiEpoch) - assert_equal(lrc.get_learning_rate_for_epoch(1), lr) + assert lrc.get_learning_rate_for_epoch(1) == lr lrc.set_epoch_error( 1, { @@ -194,7 +193,7 @@ def test_newbob_multi_epoch(): "train_score": 3.095824052426714, }, ) - assert_equal(lrc.get_learning_rate_for_epoch(2), lr) # epoch 2 cannot be a different lr yet + assert lrc.get_learning_rate_for_epoch(2) == lr # epoch 2 cannot be a different lr yet def test_later_default_lr(): diff --git 
a/tests/test_Log.py b/tests/test_Log.py index d33705979..68b987f30 100644 --- a/tests/test_Log.py +++ b/tests/test_Log.py @@ -11,7 +11,6 @@ import sys from pprint import pprint import unittest -from nose.tools import assert_less, assert_in, assert_equal from returnn.util import better_exchook @@ -114,7 +113,7 @@ def test_filter_out(): ls = filter(None, s.splitlines()) ls = filter_out(ls) pprint(ls) - assert_equal(len(ls), 5) + assert len(ls) == 5 def test_returnn_startup(): @@ -124,9 +123,9 @@ def test_returnn_startup(): if not 3 <= len(ls) <= 10: # not fixed because might change print("output:\n%s\n\nNum lines: %i" % ("\n".join(ls), len(ls))) raise Exception("unexpected output number of lines") - assert_equal(count_start_with(ls, "RETURNN starting up, version "), 1) - assert_equal(count_start_with(ls, "TensorFlow: "), 1) - assert_in("Task: No-operation", ls) + assert count_start_with(ls, "RETURNN starting up, version ") == 1 + assert count_start_with(ls, "TensorFlow: ") == 1 + assert "Task: No-operation" in ls def test_returnn_startup_verbose(): @@ -136,11 +135,11 @@ def test_returnn_startup_verbose(): if not 3 <= len(ls) <= 15: # not fixed because might change print("output:\n%s\n\nNum lines: %i" % ("\n".join(ls), len(ls))) raise Exception("unexpected output number of lines") - assert_equal(count_start_with(ls, "RETURNN starting up, version "), 1) - assert_equal(count_start_with(ls, "RETURNN command line options: "), 1) - assert_equal(count_start_with(ls, "TensorFlow: "), 1) - assert_in("Task: No-operation", ls) - assert_in("Quitting", ls) + assert count_start_with(ls, "RETURNN starting up, version ") == 1 + assert count_start_with(ls, "RETURNN command line options: ") == 1 + assert count_start_with(ls, "TensorFlow: ") == 1 + assert "Task: No-operation" in ls + assert "Quitting" in ls def test_simple_log(): @@ -154,40 +153,40 @@ def test_simple_log(): print("hello log 2", file=log.v3) """ out = run([py], input=code.encode("utf8")) - assert_equal(out.splitlines(), 
["hello stdout 1", "hello stdout 2", "hello log 1", "hello log 2"]) + assert out.splitlines() == ["hello stdout 1", "hello stdout 2", "hello log 1", "hello log 2"] def test_StreamIO(): import io buf = io.StringIO() - assert_equal(buf.getvalue(), "") + assert buf.getvalue() == "" print("buf: %r" % buf.getvalue()) buf.write("hello") print("buf: %r" % buf.getvalue()) - assert_equal(buf.getvalue(), "hello") + assert buf.getvalue() == "hello" buf.truncate(0) # should not change the position, thus the buffer is empty but position is len("hello") print("buf: %r" % buf.getvalue()) - assert_equal(buf.getvalue(), "") + assert buf.getvalue() == "" buf.write("hello") print("buf: %r" % buf.getvalue()) if PY3: # This behavior is not correct in Python 2.7. https://bugs.python.org/issue30250 - assert_equal(buf.getvalue(), "\x00\x00\x00\x00\x00hello") # zero-filled + assert buf.getvalue() == "\x00\x00\x00\x00\x00hello" # zero-filled buf.truncate(0) buf.seek(0) print("buf: %r" % buf.getvalue()) - assert_equal(buf.getvalue(), "") + assert buf.getvalue() == "" buf.write("hello") print("buf: %r" % buf.getvalue()) - assert_equal(buf.getvalue(), "hello") + assert buf.getvalue() == "hello" buf.truncate(0) buf.seek(0) print("buf: %r" % buf.getvalue()) - assert_equal(buf.getvalue(), "") + assert buf.getvalue() == "" if __name__ == "__main__": diff --git a/tests/test_Pretrain.py b/tests/test_Pretrain.py index 39f1dac03..21723e7ca 100644 --- a/tests/test_Pretrain.py +++ b/tests/test_Pretrain.py @@ -1,6 +1,5 @@ from __future__ import annotations import _setup_test_env # noqa -from nose.tools import assert_equal, assert_in, assert_not_in from returnn.pretrain import pretrain_from_config from returnn.config import Config @@ -63,15 +62,15 @@ def test_config_net_dict1(): config.update(config_dict) config.typed_dict["network"] = net_dict pretrain = pretrain_from_config(config) - assert_equal(pretrain.get_train_num_epochs(), 2) + assert pretrain.get_train_num_epochs() == 2 net1_json = 
pretrain.get_network_json_for_epoch(1) net2_json = pretrain.get_network_json_for_epoch(2) net3_json = pretrain.get_network_json_for_epoch(3) - assert_in("hidden_0", net1_json) - assert_not_in("hidden_1", net1_json) - assert_in("hidden_0", net2_json) - assert_in("hidden_1", net2_json) - assert_equal(net2_json, net3_json) + assert "hidden_0" in net1_json + assert "hidden_1" not in net1_json + assert "hidden_0" in net2_json + assert "hidden_1" in net2_json + assert net2_json == net3_json def test_config_net_dict2(): @@ -79,4 +78,4 @@ def test_config_net_dict2(): config.update(config_dict) config.typed_dict["network"] = net_dict2 pretrain = pretrain_from_config(config) - assert_equal(pretrain.get_train_num_epochs(), 3) + assert pretrain.get_train_num_epochs() == 3 diff --git a/tests/test_ResNet.py b/tests/test_ResNet.py index ee3eb32af..1ea0a265f 100644 --- a/tests/test_ResNet.py +++ b/tests/test_ResNet.py @@ -2,7 +2,6 @@ import tensorflow as tf import sys import os -from nose.tools import assert_equal, assert_is_instance import contextlib import unittest import numpy.testing diff --git a/tests/test_SprintDataset.py b/tests/test_SprintDataset.py index f6fc6e318..6face63cc 100644 --- a/tests/test_SprintDataset.py +++ b/tests/test_SprintDataset.py @@ -1,5 +1,4 @@ import _setup_test_env # noqa -from nose.tools import assert_equal, assert_true from returnn.engine.batch import Batch from returnn.config import Config import returnn.util.basic as util @@ -63,11 +62,11 @@ def test_assign_dev_data(): "--*.feature-dimension=2 --*.trainer-output-dimension=3 --*.crnn-dataset=DummyDataset(2,3,num_seqs=4,seq_len=10)", ) dataset.init_seq_order(epoch=1) - assert_true(dataset.is_less_than_num_seqs(0)) + assert dataset.is_less_than_num_seqs(0) recurrent = False batch_generator = dataset.generate_batches(recurrent_net=recurrent, batch_size=5) batches = batch_generator.peek_next_n(2) - assert_equal(len(batches), 2) + assert len(batches) == 2 def test_window(): @@ -94,12 +93,12 @@ 
def test_window(): dataset2.init_seq_order(epoch=1) dataset1.load_seqs(0, 1) dataset2.load_seqs(0, 1) - assert_equal(dataset1.get_data_dim("data"), input_dim) - assert_equal(dataset2.get_data_dim("data"), input_dim * window) + assert dataset1.get_data_dim("data") == input_dim + assert dataset2.get_data_dim("data") == input_dim * window data1 = dataset1.get_data(0, "data") data2 = dataset2.get_data(0, "data") - assert_equal(data1.shape, (seq_len, input_dim)) - assert_equal(data2.shape, (seq_len, window * input_dim)) + assert data1.shape == (seq_len, input_dim) + assert data2.shape == (seq_len, window * input_dim) data2a = data2.reshape(seq_len, window, input_dim) print("data1:") print(data1) @@ -111,13 +110,13 @@ def test_window(): print(data2[0]) print("data2a[0,0]:") print(data2a[0, 0]) - assert_equal(list(data2a[0, 0]), [0] * input_dim) # zero-padded left - assert_equal(list(data2a[0, 1]), list(data1[0])) - assert_equal(list(data2a[0, 2]), list(data1[1])) - assert_equal(list(data2a[1, 0]), list(data1[0])) - assert_equal(list(data2a[1, 1]), list(data1[1])) - assert_equal(list(data2a[1, 2]), list(data1[2])) - assert_equal(list(data2a[-1, 2]), [0] * input_dim) # zero-padded right + assert list(data2a[0, 0]) == [0] * input_dim # zero-padded left + assert list(data2a[0, 1]) == list(data1[0]) + assert list(data2a[0, 2]) == list(data1[1]) + assert list(data2a[1, 0]) == list(data1[0]) + assert list(data2a[1, 1]) == list(data1[1]) + assert list(data2a[1, 2]) == list(data1[2]) + assert list(data2a[-1, 2]) == [0] * input_dim # zero-padded right finally: dataset1._exit_handler() dataset2._exit_handler() diff --git a/tests/test_SprintInterface.py b/tests/test_SprintInterface.py index 47c3cfed1..863565adb 100644 --- a/tests/test_SprintInterface.py +++ b/tests/test_SprintInterface.py @@ -4,7 +4,6 @@ import os import _setup_test_env # noqa -from nose.tools import assert_equal import returnn.sprint.interface as SprintAPI from returnn.tf.engine import Engine from tempfile import 
mkdtemp @@ -101,7 +100,7 @@ def test_forward(): features = numpy.array([[0.1, 0.2], [0.2, 0.3], [0.3, 0.4], [0.4, 0.5]]) seq_len = features.shape[0] posteriors = SprintAPI._forward("segment1", features.T).T - assert_equal(posteriors.shape, (seq_len, outputDim)) + assert posteriors.shape == (seq_len, outputDim) SprintAPI.exit() diff --git a/tests/test_TFEngine.py b/tests/test_TFEngine.py index e4ba51a40..63a7c5aca 100644 --- a/tests/test_TFEngine.py +++ b/tests/test_TFEngine.py @@ -13,11 +13,11 @@ from returnn.tf.util.data import SpatialDim from returnn.tf.network import ExternData from returnn.config import Config -from nose.tools import assert_equal, assert_not_equal, assert_is_instance, assert_raises import unittest import numpy import numpy.testing from pprint import pprint +import pytest import contextlib from returnn.util import better_exchook @@ -122,7 +122,7 @@ def test_FeedDictDataProvider(): for seq_idx in range(num_seqs): feed_dict, meta = data_provider.get_feed_dict(single_threaded=True) print(feed_dict, meta) - assert_is_instance(feed_dict, dict) + assert isinstance(feed_dict, dict) assert extern_data.data["data"].placeholder in feed_dict assert extern_data.data["data"].size_placeholder[0] in feed_dict assert extern_data.data["classes"].placeholder in feed_dict @@ -131,21 +131,21 @@ def test_FeedDictDataProvider(): data_size = feed_dict[extern_data.data["data"].size_placeholder[0]] classes = feed_dict[extern_data.data["classes"].placeholder] classes_size = feed_dict[extern_data.data["classes"].size_placeholder[0]] - assert_is_instance(data, numpy.ndarray) - assert_is_instance(data_size, numpy.ndarray) - assert_is_instance(classes, numpy.ndarray) - assert_is_instance(classes_size, numpy.ndarray) - assert_equal(data.shape, (n_batch, seq_len, n_data_dim)) - assert_equal(data_size.shape, (n_batch,)) - assert_equal(classes.shape, (n_batch, seq_len)) - assert_equal(classes_size.shape, (n_batch,)) - assert_equal(list(data_size), [seq_len]) - 
assert_equal(list(classes_size), [seq_len]) + assert isinstance(data, numpy.ndarray) + assert isinstance(data_size, numpy.ndarray) + assert isinstance(classes, numpy.ndarray) + assert isinstance(classes_size, numpy.ndarray) + assert data.shape == (n_batch, seq_len, n_data_dim) + assert data_size.shape == (n_batch,) + assert classes.shape == (n_batch, seq_len) + assert classes_size.shape == (n_batch,) + assert list(data_size) == [seq_len] + assert list(classes_size) == [seq_len] numpy.testing.assert_almost_equal(list(data[0, 0]), expected_first_data[seq_idx]) numpy.testing.assert_almost_equal(list(data[0, -1]), expected_last_data[seq_idx]) - assert_equal(classes.tolist(), [expected_classes[seq_idx]]) + assert classes.tolist() == [expected_classes[seq_idx]] - with assert_raises(AssertionError): # assert that there are batches left should fail + with pytest.raises(AssertionError): # assert that there are batches left should fail feed_dict, meta = data_provider.get_feed_dict(single_threaded=True) @@ -190,19 +190,19 @@ def test_DatasetDataProvider(): ] ) - assert_is_instance(data, numpy.ndarray) - assert_is_instance(data_size, numpy.ndarray) - assert_is_instance(classes, numpy.ndarray) - assert_is_instance(classes_size, numpy.ndarray) - assert_equal(data.shape, (n_batch, seq_len, n_data_dim)) - assert_equal(data_size.shape, (n_batch,)) - assert_equal(classes.shape, (n_batch, seq_len)) - assert_equal(classes_size.shape, (n_batch,)) - assert_equal(list(data_size), [seq_len] * n_batch) - assert_equal(list(classes_size), [seq_len] * n_batch) + assert isinstance(data, numpy.ndarray) + assert isinstance(data_size, numpy.ndarray) + assert isinstance(classes, numpy.ndarray) + assert isinstance(classes_size, numpy.ndarray) + assert data.shape == (n_batch, seq_len, n_data_dim) + assert data_size.shape == (n_batch,) + assert classes.shape == (n_batch, seq_len) + assert classes_size.shape == (n_batch,) + assert list(data_size) == [seq_len] * n_batch + assert list(classes_size) == 
[seq_len] * n_batch numpy.testing.assert_almost_equal(list(data[0, 0]), [-0.5, -0.4]) numpy.testing.assert_almost_equal(list(data[0, -1]), [0.3, 0.4]) - assert_equal(classes[0].tolist(), [1, 2, 0, 1, 2]) + assert classes[0].tolist() == [1, 2, 0, 1, 2] step = 1 # step 0 was above while True: @@ -889,9 +889,9 @@ def test_engine_forward_to_hdf(): ds = HDFDataset() ds.add_file(output_file) - assert_equal(ds.num_inputs, n_classes_dim) # forwarded input is network output - assert_equal(ds.get_num_timesteps(), seq_len * num_seqs) - assert_equal(ds.get_total_num_seqs(), num_seqs) + assert ds.num_inputs == n_classes_dim # forwarded input is network output + assert ds.get_num_timesteps() == seq_len * num_seqs + assert ds.get_total_num_seqs() == num_seqs os.remove(output_file) @@ -1297,9 +1297,9 @@ def test_engine_rec_subnet_count(): engine.init_train_from_config(config=config, train_data=dataset, dev_data=None, eval_data=None) out = engine.forward_single(dataset=dataset, seq_idx=0) - assert_equal(out.shape, (seq_len,)) - assert_equal(out.dtype, numpy.int32) - assert_equal(list(out[:]), list(range(1, seq_len + 1))) + assert out.shape == (seq_len,) + assert out.dtype == numpy.int32 + assert list(out[:]) == list(range(1, seq_len + 1)) engine.finalize() @@ -1383,9 +1383,9 @@ def test_engine_end_layer(extra_rec_kwargs=None): rec_layer = engine.network.layers["output"] assert isinstance(rec_layer, RecLayer) assert isinstance(rec_layer.cell, _SubnetworkRecCell) - assert_equal(set(rec_layer.cell.input_layers_moved_out), set()) - assert_equal(set(rec_layer.cell.output_layers_moved_out), {"stop_token"}) - assert_equal(set(rec_layer.cell.layers_in_loop), {"output"}) + assert set(rec_layer.cell.input_layers_moved_out) == set() + assert set(rec_layer.cell.output_layers_moved_out) == {"stop_token"} + assert set(rec_layer.cell.layers_in_loop) == {"output"} # Now reinit for search. 
assert not engine.use_search_flag @@ -1476,13 +1476,11 @@ def check_engine_search(extra_rec_kwargs=None): assert isinstance(rec_layer, RecLayer) assert isinstance(rec_layer.cell, _SubnetworkRecCell) if rec_layer._optimize_move_layers_out: - assert_equal(set(rec_layer.cell.input_layers_moved_out), set()) - assert_equal(set(rec_layer.cell.output_layers_moved_out), {"output", "embed", "prob"}) - assert_equal(set(rec_layer.cell.layers_in_loop), set()) + assert set(rec_layer.cell.input_layers_moved_out) == set() + assert set(rec_layer.cell.output_layers_moved_out) == {"output", "embed", "prob"} + assert set(rec_layer.cell.layers_in_loop) == set() else: - assert_equal( - set(rec_layer.cell.layers_in_loop).difference({"data:classes"}), {"embed", "prob", "output", "end"} - ) + assert set(rec_layer.cell.layers_in_loop).difference({"data:classes"}) == {"embed", "prob", "output", "end"} # Now reinit for search. assert not engine.use_search_flag @@ -1593,17 +1591,13 @@ def check_engine_search_attention(extra_rec_kwargs=None): assert isinstance(rec_layer, RecLayer) assert isinstance(rec_layer.cell, _SubnetworkRecCell) if rec_layer._optimize_move_layers_out: - assert_equal(set(rec_layer.cell.input_layers_moved_out), set()) - assert_equal(set(rec_layer.cell.output_layers_moved_out), set()) - assert_equal( - set(rec_layer.cell.layers_in_loop), {"end", "output", "output_prob", "c", "c_in", "orth_embed", "s"} - ) + assert set(rec_layer.cell.input_layers_moved_out) == set() + assert set(rec_layer.cell.output_layers_moved_out) == set() + assert set(rec_layer.cell.layers_in_loop) == {"end", "output", "output_prob", "c", "c_in", "orth_embed", "s"} else: assert not rec_layer.cell.input_layers_moved_out assert not rec_layer.cell.output_layers_moved_out - assert_equal( - set(rec_layer.cell.layers_in_loop), {"end", "output", "output_prob", "c", "c_in", "orth_embed", "s"} - ) + assert set(rec_layer.cell.layers_in_loop) == {"end", "output", "output_prob", "c", "c_in", "orth_embed", "s"} 
print("Search...") engine.search(dataset=dataset) @@ -2815,9 +2809,9 @@ def test_rec_optim_all_out(): assert isinstance(rec_layer.cell, _SubnetworkRecCell) assert rec_layer._optimize_move_layers_out # Now it was initialized and optimized for training. - assert_equal(set(rec_layer.cell.input_layers_moved_out), set()) - assert_equal(set(rec_layer.cell.output_layers_moved_out), {"output", "prob", "embed"}) - assert_equal(set(rec_layer.cell.layers_in_loop), set()) + assert set(rec_layer.cell.input_layers_moved_out) == set() + assert set(rec_layer.cell.output_layers_moved_out) == {"output", "prob", "embed"} + assert set(rec_layer.cell.layers_in_loop) == set() # Now reinit for search. assert not engine.use_search_flag @@ -2831,9 +2825,9 @@ def test_rec_optim_all_out(): assert isinstance(rec_layer.cell, _SubnetworkRecCell) assert rec_layer._optimize_move_layers_out # Now it was initialized and optimized for search. - assert_equal(set(rec_layer.cell.input_layers_moved_out), set()) - assert_equal(set(rec_layer.cell.output_layers_moved_out), set()) - assert_equal(set(rec_layer.cell.layers_in_loop), {"prob", "output", "end", "embed"}) + assert set(rec_layer.cell.input_layers_moved_out) == set() + assert set(rec_layer.cell.output_layers_moved_out) == set() + assert set(rec_layer.cell.layers_in_loop) == {"prob", "output", "end", "embed"} engine.search(dataset=dataset) print("error keys:") @@ -3348,8 +3342,8 @@ def run(run_idx, optimize_move_layers_out): assert isinstance(rec_layer, RecLayer) assert isinstance(rec_layer.cell, _SubnetworkRecCell) if optimize_move_layers_out: - assert_equal(set(rec_layer.cell.input_layers_moved_out), {"output", "orth_embed"}) - assert_equal(set(rec_layer.cell.output_layers_moved_out), {"output_prob", "att"}) + assert set(rec_layer.cell.input_layers_moved_out) == {"output", "orth_embed"} + assert set(rec_layer.cell.output_layers_moved_out) == {"output_prob", "att"} else: assert not rec_layer.cell.input_layers_moved_out assert not 
rec_layer.cell.output_layers_moved_out @@ -3658,8 +3652,8 @@ def test_rec_subnet_construct_2(): assert isinstance(rec_layer, RecLayer) assert isinstance(rec_layer.cell, _SubnetworkRecCell) - assert_equal(set(rec_layer.cell.input_layers_moved_out), {"output", "target_embed"}) - assert_equal(set(rec_layer.cell.output_layers_moved_out), {"output_prob", "readout", "readout_in", "s2"}) + assert set(rec_layer.cell.input_layers_moved_out) == {"output", "target_embed"} + assert set(rec_layer.cell.output_layers_moved_out) == {"output_prob", "readout", "readout_in", "s2"} print("Construct search net") search_net = TFNetwork(extern_data=extern_data, train_flag=False, eval_flag=True, search_flag=True) search_net.construct_from_dict(net_dict) @@ -3797,8 +3791,8 @@ def test_rec_subnet_construct_3(): assert isinstance(rec_layer, RecLayer) assert isinstance(rec_layer.cell, _SubnetworkRecCell) - assert_equal(set(rec_layer.cell.input_layers_moved_out), {"output", "target_embed"}) - assert_equal(set(rec_layer.cell.output_layers_moved_out), {"output_prob", "readout", "readout_in", "s2"}) + assert set(rec_layer.cell.input_layers_moved_out) == {"output", "target_embed"} + assert set(rec_layer.cell.output_layers_moved_out) == {"output_prob", "readout", "readout_in", "s2"} print("Construct search net") search_net = TFNetwork(extern_data=extern_data, train_flag=False, eval_flag=True, search_flag=True) search_net.construct_from_dict(net_dict) @@ -4674,7 +4668,7 @@ def test_preload_from_files(): numpy.testing.assert_array_equal(param_orig, param_clone_main) main = engine.network.layers["main_" + layer_name] - assert_equal(set(main.params.keys()), {"W", "b"}) + assert set(main.params.keys()) == {"W", "b"} engine.finalize() @@ -4799,8 +4793,8 @@ def test_preload_from_files_with_reuse(): main = engine.network.layers["main_" + layer_name] clone = engine.network.layers["clone_" + layer_name] - assert_equal(set(main.params.keys()), {"W", "b"}) - assert_equal(set(clone.params.keys()), set()) + 
assert set(main.params.keys()) == {"W", "b"} + assert set(clone.params.keys()) == set() engine.finalize() @@ -4890,7 +4884,7 @@ def test_preload_from_files_ignore_missing(): numpy.testing.assert_array_equal(param_orig, param_clone_main) main = engine.network.layers[layer_name] - assert_equal(set(main.params.keys()), {"W", "b"}) + assert set(main.params.keys()) == {"W", "b"} engine.finalize() @@ -5927,8 +5921,8 @@ def make_net_dict(dim): dim = FeatureDim("out", dim) return {"output": {"class": "linear", "activation": "sigmoid", "out_dim": dim}} - assert_equal(Engine._net_dict_diff(make_net_dict(13), make_net_dict(13)), []) - assert_not_equal(Engine._net_dict_diff(make_net_dict(13), make_net_dict(17)), []) + assert Engine._net_dict_diff(make_net_dict(13), make_net_dict(13)) == [] + assert Engine._net_dict_diff(make_net_dict(13), make_net_dict(17)) != [] if __name__ == "__main__": diff --git a/tests/test_TFNativeOp.py b/tests/test_TFNativeOp.py index 3f2af2ae2..d820976e1 100644 --- a/tests/test_TFNativeOp.py +++ b/tests/test_TFNativeOp.py @@ -17,7 +17,6 @@ import returnn.native_op as native_op from returnn.util.basic import unicode import unittest -from nose.tools import assert_equal, assert_is_instance import numpy import numpy.testing from numpy.testing import assert_almost_equal, assert_allclose @@ -277,9 +276,9 @@ def test_LstmLowMem_fwd_simple_1(): print("vY:", vY) print("vC:", vC) print("vd:", vd) - assert_equal(vY.shape, (n_time, n_batch, n_cells)) - assert_equal(vC.shape, (n_time, n_batch, n_cells)) - assert_equal(d.shape, (n_batch, n_cells)) + assert vY.shape == (n_time, n_batch, n_cells) + assert vC.shape == (n_time, n_batch, n_cells) + assert d.shape == (n_batch, n_cells) vintern = vX + vy0 + vb vcellIn = numpy.tanh(vintern) vgates = 1.0 / (1.0 + numpy.exp(-vintern)) # sigmoid @@ -331,11 +330,11 @@ def test_LstmLowMem_bwd_simple_1(): print("op vDb:", vDb) print("op vDh:", vDh) print("op vDc:", vDc) - assert_equal(vDX.shape, (n_time, n_batch, n_in)) - 
assert_equal(vDW.shape, (n_in + n_cells, n_cells * 4)) - assert_equal(vDb.shape, (n_cells * 4,)) - assert_equal(vDh.shape, (n_batch, n_cells)) - assert_equal(vDc.shape, (n_batch, n_cells)) + assert vDX.shape == (n_time, n_batch, n_in) + assert vDW.shape == (n_in + n_cells, n_cells * 4) + assert vDb.shape == (n_cells * 4,) + assert vDh.shape == (n_batch, n_cells) + assert vDc.shape == (n_batch, n_cells) vDh1 = vDY vgc = numpy.tanh(vc) vDoutGate_in = (1.0 - vgates) * vgates * vgc * vDh1 @@ -345,10 +344,10 @@ def test_LstmLowMem_bwd_simple_1(): vDfgtGate_in = (1.0 - vgates) * vgates * vc0 * vDc2 vDintern = numpy.array([vDcellIn_in, vDinpGate_in, vDfgtGate_in, vDoutGate_in]) vDb_ = vDintern - assert_equal(vDb_.shape, vDb.shape) + assert vDb_.shape == vDb.shape assert_allclose(vDb, vDb_) vDW_ = numpy.array([vX * vDintern, vy0 * vDintern]) - assert_equal(vDW_.shape, vDW.shape) + assert vDW_.shape == vDW.shape assert_allclose(vDW, vDW_) vDx1 = numpy.sum(vDintern) assert_allclose(vDX, vDx1) @@ -395,7 +394,7 @@ def test_NativeLstm2_shape_inference_normal(): out, _, _, final_cell_state = op(inputs, weights, y0, c0, index, start, step) print("out:", out) assert isinstance(out, tf.Tensor) - assert_equal(out.shape.as_list(), [n_time, n_batch, n_hidden]) + assert out.shape.as_list() == [n_time, n_batch, n_hidden] def test_NativeLstm2_shape_inference_unknown_batchnlen(): @@ -416,7 +415,7 @@ def test_NativeLstm2_shape_inference_unknown_batchnlen(): out, _, _, final_cell_state = op(inputs, weights, y0, c0, index, start, step) print("out:", out) assert isinstance(out, tf.Tensor) - assert_equal(out.shape.as_list(), [None, None, n_hidden]) + assert out.shape.as_list() == [None, None, n_hidden] def test_NativeLstm2_shape_inference_unknown_rank(): @@ -435,7 +434,7 @@ def test_NativeLstm2_shape_inference_unknown_rank(): out, _, _, final_cell_state = op(inputs, weights, y0, c0, index, start, step) print("out:", out) assert isinstance(out, tf.Tensor) - assert_equal(out.shape.as_list(), 
[None, None, n_hidden]) + assert out.shape.as_list() == [None, None, n_hidden] def test_NativeLstm2_0len_run(): @@ -1620,11 +1619,11 @@ def test_py_baum_welch(): print("Done.") print("score:") print(repr(obs_scores)) - assert_equal(obs_scores.shape, (seq_len, n_batch)) + assert obs_scores.shape == (seq_len, n_batch) bw = numpy.exp(-fwdbwd) print("Baum-Welch soft alignment:") print(repr(bw)) - assert_equal(bw.shape, (seq_len, n_batch, n_classes)) + assert bw.shape == (seq_len, n_batch, n_classes) from numpy import array, float32 if seq_len == n_classes: @@ -1646,9 +1645,9 @@ def test_py_baum_welch(): ], dtype=float32, ) - assert_equal(ref_align.shape, (seq_len, 1, n_classes)) + assert ref_align.shape == (seq_len, 1, n_classes) ref_align = numpy.tile(ref_align, (1, n_batch, 1)) - assert_equal(ref_align.shape, bw.shape) + assert ref_align.shape == bw.shape # print("Reference alignment:") # print(repr(ref_align)) print("mean square diff:", numpy.mean(numpy.square(ref_align - bw))) @@ -1738,11 +1737,11 @@ def test_fast_bw_uniform(): fwdbwd, score = session.run([fwdbwd, obs_scores]) print("score:") print(repr(score)) - assert_equal(score.shape, (seq_len, n_batch)) + assert score.shape == (seq_len, n_batch) bw = numpy.exp(-fwdbwd) print("Baum-Welch soft alignment:") print(repr(bw)) - assert_equal(bw.shape, (seq_len, n_batch, n_classes)) + assert bw.shape == (seq_len, n_batch, n_classes) from numpy import array, float32 if seq_len == n_classes: @@ -1764,9 +1763,9 @@ def test_fast_bw_uniform(): ], dtype=float32, ) - assert_equal(ref_align.shape, (seq_len, 1, n_classes)) + assert ref_align.shape == (seq_len, 1, n_classes) ref_align = numpy.tile(ref_align, (1, n_batch, 1)) - assert_equal(ref_align.shape, bw.shape) + assert ref_align.shape == bw.shape # print("Reference alignment:") # print(repr(ref_align)) print("mean square diff:", numpy.mean(numpy.square(ref_align - bw))) @@ -2233,10 +2232,10 @@ def test_py_viterbi(): print("Done.") print("score:") print(repr(obs_scores)) - 
assert_equal(obs_scores.shape, (n_batch,)) + assert obs_scores.shape == (n_batch,) print("Hard alignment:") print(repr(alignment)) - assert_equal(alignment.shape, (seq_len, n_batch)) + assert alignment.shape == (seq_len, n_batch) if seq_len == n_classes: print("Extra check identity...") for i in range(n_batch): @@ -2246,7 +2245,7 @@ def test_py_viterbi(): print("Extra check ref_align (7,5)...") assert_allclose(obs_scores, -1.6218603, rtol=1e-5) # should be the same everywhere for i in range(n_batch): - assert_equal(alignment[:, i].tolist(), [0, 1, 1, 2, 3, 3, 4]) + assert alignment[:, i].tolist() == [0, 1, 1, 2, 3, 3, 4] print("Done.") @@ -2287,10 +2286,10 @@ def test_fast_viterbi(): print("Done.") print("score:") print(repr(obs_scores)) - assert_equal(obs_scores.shape, (n_batch,)) + assert obs_scores.shape == (n_batch,) print("Hard alignment:") print(repr(alignment)) - assert_equal(alignment.shape, (seq_len, n_batch)) + assert alignment.shape == (seq_len, n_batch) if seq_len == n_classes: print("Extra check identity...") for i in range(n_batch): @@ -2300,7 +2299,7 @@ def test_fast_viterbi(): print("Extra check ref_align (7,5)...") assert_allclose(obs_scores, -1.6218603, rtol=1e-5) # should be the same everywhere for i in range(n_batch): - assert_equal(alignment[:, i].tolist(), [0, 1, 1, 2, 3, 3, 4]) + assert alignment[:, i].tolist() == [0, 1, 1, 2, 3, 3, 4] print("Done.") @@ -2329,10 +2328,10 @@ def test_fast_viterbi_rnd(): ) print("ref score:") print(repr(ref_scores)) - assert_equal(ref_scores.shape, (n_batch,)) + assert ref_scores.shape == (n_batch,) print("ref hard alignment:") print(repr(ref_alignment)) - assert_equal(ref_alignment.shape, (seq_len, n_batch)) + assert ref_alignment.shape == (seq_len, n_batch) print("Construct fast_viterbi call...") alignment, scores = fast_viterbi( am_scores=tf.constant(am_scores), @@ -2345,10 +2344,10 @@ def test_fast_viterbi_rnd(): print("Done.") print("score:") print(repr(scores)) - assert_equal(scores.shape, (n_batch,)) + 
assert scores.shape == (n_batch,) print("Hard alignment:") print(repr(alignment)) - assert_equal(alignment.shape, (seq_len, n_batch)) + assert alignment.shape == (seq_len, n_batch) assert_allclose(scores, ref_scores, rtol=1e-5) assert_allclose(alignment, ref_alignment, rtol=1e-5) print("Done.") @@ -2574,13 +2573,13 @@ def _wrap_tf_edit_distance(a, b): def test_wrap_tf_edit_distance(): - assert_equal(_wrap_tf_edit_distance([1], []), 1) - assert_equal(_wrap_tf_edit_distance([1, 2], [1]), 1) - assert_equal(_wrap_tf_edit_distance([2, 2], [1]), 2) - assert_equal(_wrap_tf_edit_distance([2, 1], [1]), 1) - assert_equal(_wrap_tf_edit_distance([2, 1], [1, 1]), 1) - assert_equal(_wrap_tf_edit_distance([2, 1], [1, 1, 1]), 2) - assert_equal(_wrap_tf_edit_distance([2, 1], [2, 1, 1]), 1) + assert _wrap_tf_edit_distance([1], []) == 1 + assert _wrap_tf_edit_distance([1, 2], [1]) == 1 + assert _wrap_tf_edit_distance([2, 2], [1]) == 2 + assert _wrap_tf_edit_distance([2, 1], [1]) == 1 + assert _wrap_tf_edit_distance([2, 1], [1, 1]) == 1 + assert _wrap_tf_edit_distance([2, 1], [1, 1, 1]) == 2 + assert _wrap_tf_edit_distance([2, 1], [2, 1, 1]) == 1 def _naive_optimal_completion_edit_distance(a, b): @@ -3212,7 +3211,7 @@ def _run_rnnt(acts, labels, input_lengths, label_lengths, expected_costs, expect if not is_checked_out(): raise unittest.SkipTest("HawkAaronWarpTransducer not checked out?") - assert_equal(acts.shape, expected_grads.shape) + assert acts.shape == expected_grads.shape acts_t = tf.constant(acts) labels_t = tf.constant(labels) input_lengths_t = tf.constant(input_lengths) diff --git a/tests/test_TFNetworkLayer.py b/tests/test_TFNetworkLayer.py index 7e2822e98..539da7750 100644 --- a/tests/test_TFNetworkLayer.py +++ b/tests/test_TFNetworkLayer.py @@ -3,7 +3,6 @@ import _setup_test_env # noqa import tensorflow as tf -from nose.tools import assert_equal, assert_not_equal, assert_is_instance import unittest import numpy.testing import tempfile @@ -731,7 +730,7 @@ def 
test_concat_sources_dim1(): config.update(dict(num_inputs=4, num_outputs=9)) network = TFNetwork(config=config, train_flag=True) network.construct_from_dict(net_dict) - assert_equal(network.get_layer("concat").output.shape, (None, 6)) + assert network.get_layer("concat").output.shape == (None, 6) out = network.get_default_output_layer() assert out.output.shape == (None, 9) feed_dict = make_feed_dict(network.extern_data.data.values(), same_time=True) @@ -794,7 +793,7 @@ def test_ConcatLayer(): network = TFNetwork(config=config) network.construct_from_dict(net_dict) out = network.get_default_output_layer() - assert_equal(out.output.shape, (None, 10)) + assert out.output.shape == (None, 10) feed_dict = make_feed_dict(network.extern_data, same_time=True) session.run(tf_compat.v1.global_variables_initializer()) session.run(out.output.placeholder, feed_dict=feed_dict) @@ -810,10 +809,10 @@ def test_ConcatLayer_range_dyn(): network = TFNetwork(config=config) network.construct_from_dict(net_dict) out = network.get_default_output_layer().output - assert_equal(out.batch_shape, (None,)) + assert out.batch_shape == (None,) feed_dict = make_feed_dict(network.extern_data, n_time=7) session.run(out.placeholder, feed_dict=feed_dict) - assert_equal(session.run(out.dim_tags[0].get_dim_value(), feed_dict=feed_dict), 14) + assert session.run(out.dim_tags[0].get_dim_value(), feed_dict=feed_dict) == 14 def test_LinearLayer_batch_feature_major(): @@ -883,16 +882,20 @@ def test_batch_norm_vars(): pprint(layer.params) assert layer.use_batch_norm bn_prefix = "batch_norm/v2_" - assert_equal( - set(layer.params.keys()), - {"W", "b", bn_prefix + "beta", bn_prefix + "mean", bn_prefix + "gamma", bn_prefix + "variance"}, - ) - assert_equal(layer.params["W"].get_shape().as_list(), [n_in, n_out]) - assert_equal(layer.params["b"].get_shape().as_list(), [n_out]) - assert_equal(layer.params[bn_prefix + "beta"].get_shape().as_list(), [n_out]) - assert_equal(layer.params[bn_prefix + 
"gamma"].get_shape().as_list(), [n_out]) - assert_equal(layer.params[bn_prefix + "mean"].get_shape().as_list(), [n_out]) - assert_equal(layer.params[bn_prefix + "variance"].get_shape().as_list(), [n_out]) + assert set(layer.params.keys()) == { + "W", + "b", + bn_prefix + "beta", + bn_prefix + "mean", + bn_prefix + "gamma", + bn_prefix + "variance", + } + assert layer.params["W"].get_shape().as_list() == [n_in, n_out] + assert layer.params["b"].get_shape().as_list() == [n_out] + assert layer.params[bn_prefix + "beta"].get_shape().as_list() == [n_out] + assert layer.params[bn_prefix + "gamma"].get_shape().as_list() == [n_out] + assert layer.params[bn_prefix + "mean"].get_shape().as_list() == [n_out] + assert layer.params[bn_prefix + "variance"].get_shape().as_list() == [n_out] def _test_batch_norm_param_old_to_new_import(old_version, new_version): @@ -1298,12 +1301,10 @@ def test_activation_layer_net_construct(): [[[0, 0], [-1, -1], [2, 2]]], dtype="float32" ) } - assert_equal( - feed[network.extern_data.get_default_input_data().placeholder].shape, (n_batch, seq_len, num_inputs) - ) + assert feed[network.extern_data.get_default_input_data().placeholder].shape == (n_batch, seq_len, num_inputs) v = session.run(out, feed_dict=feed) - assert_equal(v.shape, (n_batch, seq_len, num_inputs)) - assert_equal(v.tolist(), [[[0, 0], [0, 0], [2, 2]]]) + assert v.shape == (n_batch, seq_len, num_inputs) + assert v.tolist() == [[[0, 0], [0, 0], [2, 2]]] def test_activation_layer_abs_for_stft(): @@ -1343,11 +1344,9 @@ def test_activation_layer_abs_for_stft(): [[[0], [0], [2], [1], [4], [3]]], dtype=numpy.float32 ) } - assert_equal( - feed[network.extern_data.get_default_input_data().placeholder].shape, (n_batch, seq_len, num_inputs) - ) + assert feed[network.extern_data.get_default_input_data().placeholder].shape == (n_batch, seq_len, num_inputs) v = session.run(out, feed_dict=feed) - assert_equal(v.shape, (n_batch, seq_len - (frame_size - 1), fft_size // 2 + 1)) + assert v.shape == 
(n_batch, seq_len - (frame_size - 1), fft_size // 2 + 1) input_stft = tf.signal.stft( numpy.array([[0, 0, 2, 1, 4, 3]], dtype=numpy.float32), @@ -1357,7 +1356,7 @@ def test_activation_layer_abs_for_stft(): window_fn=tf.signal.hann_window, ) exp_output = tf.math.abs(input_stft) - assert_equal(v.tolist(), exp_output.eval().tolist()) + assert v.tolist() == exp_output.eval().tolist() def test_activation_layer_net_construct_two_out(): @@ -1392,12 +1391,10 @@ def test_activation_layer_net_construct_two_out(): [[[0, 0], [-1, -1], [2, 2]]], dtype="float32" ) } - assert_equal( - feed[network.extern_data.get_default_input_data().placeholder].shape, (n_batch, seq_len, num_inputs) - ) + assert feed[network.extern_data.get_default_input_data().placeholder].shape == (n_batch, seq_len, num_inputs) v, v2 = session.run([out, out2], feed_dict=feed) - assert_equal(v.shape, (n_batch, seq_len, num_inputs)) - assert_equal(v.tolist(), [[[0, 0], [0, 0], [2, 2]]]) + assert v.shape == (n_batch, seq_len, num_inputs) + assert v.tolist() == [[[0, 0], [0, 0], [2, 2]]] def _test_simple_eval_func(s): @@ -1575,7 +1572,7 @@ def test_CombineLayer_broadcast(): config.update(dict(num_inputs=4, num_outputs=9)) network = TFNetwork(config=config, train_flag=True) network.construct_from_dict(net_dict) - assert_equal(network.get_layer("combine").output.shape, (None, 5)) + assert network.get_layer("combine").output.shape == (None, 5) out = network.get_default_output_layer() assert out.output.shape == (None, 9) feed_dict = make_feed_dict(network.extern_data.data.values(), same_time=True) @@ -1595,7 +1592,7 @@ def test_CombineLayer_broadcast_multiple(): config.update(dict(num_inputs=4, num_outputs=9)) network = TFNetwork(config=config, train_flag=True) network.construct_from_dict(net_dict) - assert_equal(network.get_layer("combine").output.batch_shape, (5, 5, 3)) + assert network.get_layer("combine").output.batch_shape == (5, 5, 3) out = network.get_default_output_layer() assert out.output.batch_shape == (5, 
5, 9) and not out.output.have_batch_axis() feed_dict = make_feed_dict(network.extern_data.data.values(), same_time=True) @@ -2017,7 +2014,7 @@ def test_CombineLayer_time_broadcast(): network = TFNetwork(config=config, train_flag=True) network.construct_from_dict(net_dict) out = network.get_default_output_layer() - assert_equal(out.output.batch_shape, (None, n_features, None)) + assert out.output.batch_shape == (None, n_features, None) feed_dict = make_feed_dict(network.extern_data, n_batch=n_batch, n_time=n_time) session.run(tf_compat.v1.global_variables_initializer()) out_v = session.run(out.output.placeholder, feed_dict=feed_dict) @@ -2047,7 +2044,7 @@ def test_CombineLayer_time_broadcast_swapped(): network = TFNetwork(config=config, train_flag=True) network.construct_from_dict(net_dict) out = network.get_default_output_layer() - assert_equal(out.output.batch_shape, (None, n_features, None)) + assert out.output.batch_shape == (None, n_features, None) feed_dict = make_feed_dict(network.extern_data, n_batch=n_batch, n_time=n_time) session.run(tf_compat.v1.global_variables_initializer()) out_v = session.run(out.output.placeholder, feed_dict=feed_dict) @@ -2460,7 +2457,7 @@ def test_subnetwork_layer_net_construct(): config.update(dict(num_inputs=4, num_outputs=3)) network = TFNetwork(config=config, train_flag=True) network.construct_from_dict(net_dict) - assert_equal(network.layers["sub"].output.dim, 2) + assert network.layers["sub"].output.dim == 2 sub_layer = network.layers["sub"] assert isinstance(sub_layer, SubnetworkLayer) sub_layer_deps = sub_layer.get_dep_layers() @@ -2541,8 +2538,8 @@ def test_constant_layer(): network.construct_from_dict(config.typed_dict["network"]) out = network.get_default_output_layer(must_exist=True) v = session.run(out.output.placeholder) - assert_equal(v.shape, ()) # (batch,), where batch==1 for broadcasting - assert_equal(v, 42) + assert v.shape == () # (batch,), where batch==1 for broadcasting + assert v == 42 def 
test_compare_layer(): @@ -2563,9 +2560,9 @@ def test_compare_layer(): network.construct_from_dict(config.typed_dict["network"]) out = network.get_default_output_layer(must_exist=True) v = session.run(out.output.placeholder) - assert_equal(v.shape, ()) # (batch,), where batch==1 for broadcasting - assert_equal(v.dtype, numpy.dtype("bool")) - assert_equal(v, True) + assert v.shape == () # (batch,), where batch==1 for broadcasting + assert v.dtype == numpy.dtype("bool") + assert v == True def test_ShiftAxisLayer(): @@ -2601,10 +2598,10 @@ def test_ShiftAxisLayer(): feed_dict = {network.layers["data"].output.placeholder: input_np} v = session.run(out.output.placeholder, feed_dict) - assert_equal(v.shape, (batch_size, time_size, feat_size)) - assert_equal(np.equal(v[0, shift_amount:, 0], np.arange(time_size - shift_amount)).all(), True) - assert_equal((v[:, :shift_amount, :] == 0).all(), True) # padding - assert_equal((v[1:, shift_amount:, :] == 1).all(), True) + assert v.shape == (batch_size, time_size, feat_size) + assert np.equal(v[0, shift_amount:, 0], np.arange(time_size - shift_amount)).all() == True + assert (v[:, :shift_amount, :] == 0).all() == True # padding + assert (v[1:, shift_amount:, :] == 1).all() == True def test_ShiftAxisLayer_small_time(): @@ -2639,8 +2636,8 @@ def test_ShiftAxisLayer_small_time(): feed_dict = {network.layers["data"].output.placeholder: input_np} v = session.run(out.output.placeholder, feed_dict) - assert_equal(v.shape, (batch_size, time_size, feat_size)) - assert_equal((v == 0).all(), True) # padding + assert v.shape == (batch_size, time_size, feat_size) + assert (v == 0).all() == True # padding def test_ReinterpretDataLayer_change_batch_to_spatial(): @@ -2812,9 +2809,9 @@ def test_SoftmaxOverSpatialLayer_start(): out_data = SoftmaxOverSpatialLayer.get_out_data_from_opts(**opts) print("output:", out_data) out_data.sanity_check(ignore_placeholder=True) # placeholder might be overwritten later - assert_equal(out_data.shape, (n_dim, 
None)) # layer moves time-dim to back + assert out_data.shape == (n_dim, None) # layer moves time-dim to back layer = SoftmaxOverSpatialLayer(output=out_data, **opts) - assert_equal(layer.output.shape, (n_dim, None)) + assert layer.output.shape == (n_dim, None) try: out_np = session.run(layer.output.placeholder, feed_dict={net.extern_data.get_batch_info().dim: n_batch}) except Exception as exc: @@ -2822,13 +2819,13 @@ def test_SoftmaxOverSpatialLayer_start(): help_on_tf_exception(session=session, exception=exc, fetches=layer.output.placeholder) raise - assert_equal(out_np.shape, (n_batch, n_dim, n_time)) + assert out_np.shape == (n_batch, n_dim, n_time) # check if masking worked range_idxs = numpy.ones_like(start_idxs) * numpy.expand_dims(numpy.arange(n_time), axis=0) cond = range_idxs < numpy.broadcast_to(start_idxs, [n_batch, n_time]) # (B, T) cond = numpy.expand_dims(cond, axis=1) cond = numpy.broadcast_to(cond, [n_batch, n_dim, n_time]) # (B, D, T) - assert_equal(cond.sum(), n_dim * start_idxs.sum()) # check num of conds + assert cond.sum() == n_dim * start_idxs.sum() # check num of conds numpy.testing.assert_array_equal(out_np[cond], 0) @@ -2864,12 +2861,12 @@ def test_SoftmaxOverSpatialLayer_window(): out_data = SoftmaxOverSpatialLayer.get_out_data_from_opts(**opts) print("output:", out_data) out_data.sanity_check(ignore_placeholder=True) # placeholder might be overwritten later - assert_equal(out_data.shape, (n_dim, None)) # layer moves time-dim to back + assert out_data.shape == (n_dim, None) # layer moves time-dim to back layer = SoftmaxOverSpatialLayer(output=out_data, **opts) layer.output.sanity_check() - assert_equal(layer.output.shape, (n_dim, None)) + assert layer.output.shape == (n_dim, None) out_np = session.run(layer.output.placeholder, feed_dict=make_feed_dict(net.extern_data, n_batch=n_batch)) - assert_equal(out_np.shape, (n_batch, n_dim, n_time)) + assert out_np.shape == (n_batch, n_dim, n_time) # check if window masking worked: # handle edge 
cases correctly: (start is 0-based) # 1. if the energy time-dim is less than `window_size`, we adjust the window size. @@ -3000,10 +2997,9 @@ def test_SplitDimsLayer_simple_time(): with make_scope() as session: net = TFNetwork(config=config) net.construct_from_dict({"output": {"class": "split_dims", "axis": "t", "dims": (-1, 1), "from": "data:data"}}) - assert_equal( - net.get_default_output_layer().output.get_dim_tag(1), - net.extern_data.get_default_input_data().get_dim_tag(1), - ) + assert net.get_default_output_layer().output.get_dim_tag( + 1 + ) == net.extern_data.get_default_input_data().get_dim_tag(1) out_t = net.get_default_output_layer().output.placeholder assert out_t.shape.as_list() == [None, None, 1, 20] in_v = numpy.arange(0, n_batch * n_time * n_in).astype("float32").reshape((n_batch, n_time, n_in)) @@ -3028,7 +3024,7 @@ def test_SplitDimsLayer_simple_time2(): out = net.get_default_output_layer().output print(in_) print(out) - assert_equal(out.get_dim_tag(2), in_.get_dim_tag(1)) + assert out.get_dim_tag(2) == in_.get_dim_tag(1) assert out.time_dim_axis == 2 out_t = out.placeholder assert out_t.shape.as_list() == [None, 1, None, 20] @@ -3040,25 +3036,29 @@ def test_SplitDimsLayer_simple_time2(): def test_SplitDimsLayer_resolve_dims(): - assert_equal(SplitDimsLayer._resolve_dims(old_dim=3 * 5, new_dims=(3, -1)), (3, 5)) - assert_equal(SplitDimsLayer._resolve_dims(old_dim=3 * 5, new_dims=(3, 5)), (3, 5)) - assert_equal(SplitDimsLayer._resolve_dims(old_dim=3 * 5, new_dims=(5, -1)), (5, 3)) - assert_equal(SplitDimsLayer._resolve_dims(old_dim=2 * 3 * 5, new_dims=(-1, 3, 5)), (2, 3, 5)) - assert_equal(SplitDimsLayer._resolve_dims(old_dim=2 * 3 * 5, new_dims=(2, -1, 5)), (2, 3, 5)) - assert_equal(SplitDimsLayer._resolve_dims(old_dim=2 * 3 * 5, new_dims=(2, 3, -1)), (2, 3, 5)) - assert_equal(SplitDimsLayer._resolve_dims(old_dim=2 * 3 * 5, new_dims=(2, 3, -1, 1)), (2, 3, 5, 1)) - - assert_equal(SplitDimsLayer._resolve_dims(old_dim=3 * 5, new_dims=(3, -1), 
pad_to_multiples=True), (3, 5)) - assert_equal(SplitDimsLayer._resolve_dims(old_dim=3 * 5 + 1, new_dims=(3, -1), pad_to_multiples=True), (3, 6)) - assert_equal(SplitDimsLayer._resolve_dims(old_dim=2 * 3 * 5, new_dims=(2, 3, -1), pad_to_multiples=True), (2, 3, 5)) - assert_equal( - SplitDimsLayer._resolve_dims(old_dim=2 * 3 * 5, new_dims=(2, 3, -1, 1), pad_to_multiples=True), (2, 3, 5, 1) + assert SplitDimsLayer._resolve_dims(old_dim=3 * 5, new_dims=(3, -1)) == (3, 5) + assert SplitDimsLayer._resolve_dims(old_dim=3 * 5, new_dims=(3, 5)) == (3, 5) + assert SplitDimsLayer._resolve_dims(old_dim=3 * 5, new_dims=(5, -1)) == (5, 3) + assert SplitDimsLayer._resolve_dims(old_dim=2 * 3 * 5, new_dims=(-1, 3, 5)) == (2, 3, 5) + assert SplitDimsLayer._resolve_dims(old_dim=2 * 3 * 5, new_dims=(2, -1, 5)) == (2, 3, 5) + assert SplitDimsLayer._resolve_dims(old_dim=2 * 3 * 5, new_dims=(2, 3, -1)) == (2, 3, 5) + assert SplitDimsLayer._resolve_dims(old_dim=2 * 3 * 5, new_dims=(2, 3, -1, 1)) == (2, 3, 5, 1) + + assert SplitDimsLayer._resolve_dims(old_dim=3 * 5, new_dims=(3, -1), pad_to_multiples=True) == (3, 5) + assert SplitDimsLayer._resolve_dims(old_dim=3 * 5 + 1, new_dims=(3, -1), pad_to_multiples=True) == (3, 6) + assert SplitDimsLayer._resolve_dims(old_dim=2 * 3 * 5, new_dims=(2, 3, -1), pad_to_multiples=True) == (2, 3, 5) + assert SplitDimsLayer._resolve_dims(old_dim=2 * 3 * 5, new_dims=(2, 3, -1, 1), pad_to_multiples=True) == ( + 2, + 3, + 5, + 1, ) - assert_equal( - SplitDimsLayer._resolve_dims(old_dim=2 * 3 * 5 + 2, new_dims=(2, 3, -1), pad_to_multiples=True), (2, 3, 6) - ) - assert_equal( - SplitDimsLayer._resolve_dims(old_dim=2 * 3 * 5 + 2, new_dims=(2, 3, -1, 1), pad_to_multiples=True), (2, 3, 6, 1) + assert SplitDimsLayer._resolve_dims(old_dim=2 * 3 * 5 + 2, new_dims=(2, 3, -1), pad_to_multiples=True) == (2, 3, 6) + assert SplitDimsLayer._resolve_dims(old_dim=2 * 3 * 5 + 2, new_dims=(2, 3, -1, 1), pad_to_multiples=True) == ( + 2, + 3, + 6, + 1, ) @@ -3074,7 +3074,7 @@ 
def test_SplitDimsLayer_batch_feature_major_keep_feature(): {"output": {"class": "split_dims", "from": "data", "axis": "T", "dims": [-1, 1]}} # [B,D,T,1] ) out = net.get_default_output_layer().output - assert_equal(out.get_dim_tag(2), net.extern_data.get_default_input_data().get_time_dim_tag()) + assert out.get_dim_tag(2) == net.extern_data.get_default_input_data().get_time_dim_tag() assert out.dim_tags[1].dimension == n_in and out.dim_tags[3].dimension == 1 assert out.placeholder.shape.as_list() == [None, n_in, None, 1] assert out.feature_dim_axis == 1 # https://github.com/rwth-i6/returnn/issues/596 @@ -3347,15 +3347,15 @@ def _check_MergeDimsLayer( opts.update({"network": net, "name": "merge_dims_test", "sources": [src]}) out_data = MergeDimsLayer.get_out_data_from_opts(**opts) out_data.sanity_check(ignore_placeholder=True) # placeholder might be overwritten later - assert_equal(out_data.shape, out_data_shape) + assert out_data.shape == out_data_shape layer = MergeDimsLayer(output=out_data, **opts) - assert_equal(layer.output.shape, out_data_shape) + assert layer.output.shape == out_data_shape out_np, size_placeholder = session.run([layer.output.placeholder, layer.output.size_placeholder.as_dict()]) print("output:", out_data) - assert_equal(out_np.shape, out_static_shape) + assert out_np.shape == out_static_shape if out_sizes: - assert_equal(sorted(size_placeholder.keys()), sorted(out_sizes)) + assert sorted(size_placeholder.keys()) == sorted(out_sizes) for k in size_placeholder.keys(): numpy.testing.assert_array_equal(size_placeholder[k], out_sizes[k]) @@ -3811,7 +3811,7 @@ def test_FlattenBatchLayer(): out_v = session.run(out_t, feed_dict={in_data.placeholder: in_v, in_data.size_placeholder[0]: in_seq_lens}) assert isinstance(out_v, numpy.ndarray) assert out_v.shape == (sum(in_seq_lens), n_in) - assert_equal(out_v.tolist(), [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], [12, 13], [14, 15], [18, 19]]) + assert out_v.tolist() == [[0, 1], [2, 3], [4, 5], [6, 
7], [8, 9], [10, 11], [12, 13], [14, 15], [18, 19]] def test_UnflattenBatchLayer(): @@ -6429,8 +6429,8 @@ def test_CondLayer_multiple_outputs(): net = TFNetwork(config=config) net.construct_from_dict(net_dict) out_t = net.get_default_output_layer().output.placeholder - assert_equal(session.run(out_t, feed_dict={net.extern_data.data["cond"].placeholder: True}), 6) - assert_equal(session.run(out_t, feed_dict={net.extern_data.data["cond"].placeholder: False}), 35) + assert session.run(out_t, feed_dict={net.extern_data.data["cond"].placeholder: True}) == 6 + assert session.run(out_t, feed_dict={net.extern_data.data["cond"].placeholder: False}) == 35 def test_ScatterNdLayer_RangeLayer(): @@ -6811,23 +6811,23 @@ def test_ScatterNdLayer_pos_batch_last_dim(): scatter = ScatterNdLayer(output=scatter_out_template, **scatter_opts) print("scatter out dim tags:") pprint(scatter.output.get_batch_shape_dim_tags()) - assert_equal(scatter.output.get_size_dim_tag(0), pos.output.get_time_dim_tag()) - assert_equal(scatter.output.get_size_dim_tag(1), data.output.get_time_dim_tag()) + assert scatter.output.get_size_dim_tag(0) == pos.output.get_time_dim_tag() + assert scatter.output.get_size_dim_tag(1) == data.output.get_time_dim_tag() session.run(scatter.output.placeholder, feed_dict=make_feed_dict([data.output, pos.output, val.output])) def test_ConvLayer_get_valid_out_dim(): - assert_equal(ConvLayer.calc_out_dim(in_dim=10, stride=1, filter_size=2, padding="same"), 10) - assert_equal(ConvLayer.calc_out_dim(in_dim=10, stride=1, filter_size=3, padding="same"), 10) - assert_equal(ConvLayer.calc_out_dim(in_dim=10, stride=1, filter_size=2, padding="valid"), 9) - assert_equal(ConvLayer.calc_out_dim(in_dim=10, stride=1, filter_size=3, padding="valid"), 8) - assert_equal(ConvLayer.calc_out_dim(in_dim=10, stride=2, filter_size=2, padding="valid"), 5) - assert_equal(ConvLayer.calc_out_dim(in_dim=10, stride=3, filter_size=2, padding="valid"), 3) - assert_equal(ConvLayer.calc_out_dim(in_dim=10, 
stride=3, filter_size=1, padding="valid"), 4) - assert_equal(ConvLayer.calc_out_dim(in_dim=10, stride=3, filter_size=2, padding="same"), 4) - assert_equal(ConvLayer.calc_out_dim(in_dim=41, stride=1, filter_size=2, padding="valid"), 40) - assert_equal(ConvLayer.calc_out_dim(in_dim=40, stride=2, filter_size=2, padding="valid"), 20) - assert_equal(ConvLayer.calc_out_dim(in_dim=2, stride=1, filter_size=3, padding="valid"), 0) + assert ConvLayer.calc_out_dim(in_dim=10, stride=1, filter_size=2, padding="same") == 10 + assert ConvLayer.calc_out_dim(in_dim=10, stride=1, filter_size=3, padding="same") == 10 + assert ConvLayer.calc_out_dim(in_dim=10, stride=1, filter_size=2, padding="valid") == 9 + assert ConvLayer.calc_out_dim(in_dim=10, stride=1, filter_size=3, padding="valid") == 8 + assert ConvLayer.calc_out_dim(in_dim=10, stride=2, filter_size=2, padding="valid") == 5 + assert ConvLayer.calc_out_dim(in_dim=10, stride=3, filter_size=2, padding="valid") == 3 + assert ConvLayer.calc_out_dim(in_dim=10, stride=3, filter_size=1, padding="valid") == 4 + assert ConvLayer.calc_out_dim(in_dim=10, stride=3, filter_size=2, padding="same") == 4 + assert ConvLayer.calc_out_dim(in_dim=41, stride=1, filter_size=2, padding="valid") == 40 + assert ConvLayer.calc_out_dim(in_dim=40, stride=2, filter_size=2, padding="valid") == 20 + assert ConvLayer.calc_out_dim(in_dim=2, stride=1, filter_size=3, padding="valid") == 0 def test_LengthLayer(): @@ -6934,7 +6934,7 @@ def test_RandIntLayer(): out = net.layers["output"].output.placeholder v = session.run(out, feed_dict=feed) - assert_equal(v.shape, (5, n_batch, max(input_len), 3)) + assert v.shape == (5, n_batch, max(input_len), 3) def test_rand_indices(): @@ -6960,8 +6960,8 @@ def test_rand_indices(): (net.layers["indices_flat"].output.placeholder, net.layers["output"].output.placeholder), feed_dict=make_feed_dict(net.extern_data, n_batch=n_batch, n_time=n_time), ) - assert_equal(indices_flat.shape, (n_batch, n_time, sz[-1].dimension)) - 
assert_equal(output.shape, (n_batch, n_time, sz[-1].dimension, feature_dim.dimension)) + assert indices_flat.shape == (n_batch, n_time, sz[-1].dimension) + assert output.shape == (n_batch, n_time, sz[-1].dimension, feature_dim.dimension) def test_RandomLayer(): @@ -7153,7 +7153,7 @@ def test_untrainable_params(): network.construct_from_dict(config.typed_dict["network"]) l1 = network.layers["l1"] l2 = network.layers["output"] - assert_equal(set(network.get_trainable_params()), {l1.params["W"], l1.params["b"]}) + assert set(network.get_trainable_params()) == {l1.params["W"], l1.params["b"]} def test_reuse_params(): @@ -7180,9 +7180,9 @@ def test_reuse_params(): network.construct_from_dict(config.typed_dict["network"]) l1 = network.layers["l1"] l2 = network.layers["output"] - assert_equal(set(l1.params.keys()), {"W", "b"}) - assert_equal(set(l2.params.keys()), set()) - assert_equal(set(network.get_trainable_params()), {l1.params["W"], l1.params["b"]}) + assert set(l1.params.keys()) == {"W", "b"} + assert set(l2.params.keys()) == set() + assert set(network.get_trainable_params()) == {l1.params["W"], l1.params["b"]} def test_reuse_params_map_custom(): @@ -7224,9 +7224,9 @@ def test_reuse_params_map_custom(): network.construct_from_dict(config.typed_dict["network"]) l1 = network.layers["l1"] l2 = network.layers["output"] - assert_equal(set(l1.params.keys()), {"W"}) - assert_equal(set(l2.params.keys()), {"b"}) - assert_equal(set(network.get_trainable_params()), {l1.params["W"], l2.params["b"]}) + assert set(l1.params.keys()) == {"W"} + assert set(l2.params.keys()) == {"b"} + assert set(network.get_trainable_params()) == {l1.params["W"], l2.params["b"]} def test_reuse_params_map_custom_rev(): @@ -7268,9 +7268,9 @@ def test_reuse_params_map_custom_rev(): network.construct_from_dict(config.typed_dict["network"]) l1 = network.layers["l1"] l2 = network.layers["output"] - assert_equal(set(l1.params.keys()), {"b"}) - assert_equal(set(l2.params.keys()), {"W"}) - 
assert_equal(set(network.get_trainable_params()), {l2.params["W"], l1.params["b"]}) + assert set(l1.params.keys()) == {"b"} + assert set(l2.params.keys()) == {"W"} + assert set(network.get_trainable_params()) == {l2.params["W"], l1.params["b"]} def test_reuse_params_map_custom_dep_loop(): @@ -7393,10 +7393,10 @@ def test_reuse_params_map_custom_dep_loop(): train_rec_layer = train_net.layers["output"] assert isinstance(train_rec_layer, RecLayer) assert isinstance(train_rec_layer.cell, _SubnetworkRecCell) - assert_equal(set(train_rec_layer.cell.input_layers_moved_out), {"output", "target_embed"}) - assert_equal(set(train_rec_layer.cell.output_layers_moved_out), {"output_prob", "readout", "readout_in"}) + assert set(train_rec_layer.cell.input_layers_moved_out) == {"output", "target_embed"} + assert set(train_rec_layer.cell.output_layers_moved_out) == {"output_prob", "readout", "readout_in"} assert isinstance(train_rec_layer.cell.output_layers_net, TFNetwork) - assert_equal(set(train_rec_layer.cell.output_layers_net.layers["output_prob"].params.keys()), {"b"}) + assert set(train_rec_layer.cell.output_layers_net.layers["output_prob"].params.keys()) == {"b"} with make_scope() as session: print("Construct for search") search_net = TFNetwork(config=config, train_flag=False, eval_flag=True, search_flag=True) @@ -7513,11 +7513,11 @@ def test_name_scope_share_params(): network.construct_from_dict(net_dict) l1 = network.layers["layer1"] l2 = network.layers["output"] - assert_equal(set(l1.params.keys()), {"W", "b"}) - assert_equal(set(l2.params.keys()), {"W", "b"}) + assert set(l1.params.keys()) == {"W", "b"} + assert set(l2.params.keys()) == {"W", "b"} assert l1.params["W"] is l2.params["W"] assert l1.params["b"] is l2.params["b"] - assert_equal(set(network.get_trainable_params()), {l1.params["W"], l1.params["b"]}) + assert set(network.get_trainable_params()) == {l1.params["W"], l1.params["b"]} def test_SliceLayer_output_placeholder(): @@ -7542,8 +7542,8 @@ def 
test_SliceLayer_output_placeholder(): print(seq_lens) assert isinstance(out, numpy.ndarray) assert isinstance(seq_lens, numpy.ndarray) - assert_equal(out.tolist(), [[2, 4], [7, 9], [12, 14]]) - assert_equal(seq_lens.tolist(), [2, 1, 1]) + assert out.tolist() == [[2, 4], [7, 9], [12, 14]] + assert seq_lens.tolist() == [2, 1, 1] def test_SliceLayer_NCHW(): @@ -7656,7 +7656,7 @@ def test_pad_conv_slice(): ) out = net.get_default_output_layer().output in_ = net.extern_data.get_default_input_data() - assert_not_equal(in_.get_time_dim_tag(), out.get_time_dim_tag()) + assert in_.get_time_dim_tag() != out.get_time_dim_tag() net.initialize_params(session) session.run((out.placeholder, out.get_sequence_lengths()), feed_dict=make_feed_dict(net.extern_data)) @@ -8237,15 +8237,12 @@ def test_WindowLayer_output_placeholder(): assert isinstance(out, numpy.ndarray) assert isinstance(seq_lens, numpy.ndarray) out = out.transpose([2, 1, 0]) # [W, T', B] -> [B, T', W] - assert_equal( - out.tolist(), - [ - [[1, 2, 3], [2, 3, 4], [3, 4, 5]], - [[6, 7, 8], [7, 8, 9], [8, 9, 10]], - [[11, 12, 13], [12, 13, 14], [13, 14, 15]], - ], - ) - assert_equal(seq_lens.tolist(), [3, 1, 0]) + assert out.tolist() == [ + [[1, 2, 3], [2, 3, 4], [3, 4, 5]], + [[6, 7, 8], [7, 8, 9], [8, 9, 10]], + [[11, 12, 13], [12, 13, 14], [13, 14, 15]], + ] + assert seq_lens.tolist() == [3, 1, 0] def test_FoldLayer_unchunk(): @@ -8289,8 +8286,8 @@ def test_FoldLayer_unchunk(): out, seq_lens = session.run([output.raw_tensor, output.dims[1].dyn_size_ext.raw_tensor]) print(out) print(seq_lens) - assert_equal(out.tolist(), [[1, 2, 3, 4, 5], [6, 7, 8, 9, 0], [11, 12, 13, 0, 0]]) - assert_equal(seq_lens.tolist(), [5, 4, 3]) + assert out.tolist() == [[1, 2, 3, 4, 5], [6, 7, 8, 9, 0], [11, 12, 13, 0, 0]] + assert seq_lens.tolist() == [5, 4, 3] def test_conv_window_merge_dims(): @@ -8736,7 +8733,7 @@ def test_conv_layer_NCHW(): }, ) print(out.shape) - assert_equal(out.shape, (10, 6, 6, 64)) + assert out.shape == (10, 6, 6, 64) 
print(seq_lens) time_dim_axis = 1 if tf_util.is_gpu_available() else 0 out, seq_lens = session.run( @@ -8748,9 +8745,9 @@ def test_conv_layer_NCHW(): ) print(out.shape) if time_dim_axis == 1: - assert_equal(out.shape, (10, 64, 6, 6)) + assert out.shape == (10, 64, 6, 6) else: - assert_equal(out.shape, (10, 6, 6, 64)) + assert out.shape == (10, 6, 6, 64) print(seq_lens) if tf_util.is_gpu_available(): out, seq_lens = session.run( @@ -8761,7 +8758,7 @@ def test_conv_layer_NCHW(): }, ) print(out.shape) - assert_equal(out.shape, (10, 64, 6, 6)) + assert out.shape == (10, 64, 6, 6) print(seq_lens) @@ -8786,7 +8783,7 @@ def test_ConvLayer_empty_out(): print(seq_lens) assert isinstance(out, numpy.ndarray) assert isinstance(seq_lens, numpy.ndarray) - assert_equal(seq_lens.tolist(), [0]) + assert seq_lens.tolist() == [0] assert out.shape == (1, 0, 7) @@ -8977,7 +8974,7 @@ def test_pool_layer_NCHW(): }, ) print(out.shape) - assert_equal(out.shape, (10, 7, 6, 17)) + assert out.shape == (10, 7, 6, 17) print(seq_lens) out, seq_lens = session.run( [pool_nchw_from_nhwc.output.placeholder, pool_nchw_from_nhwc.output.get_sequence_lengths()], @@ -8988,9 +8985,9 @@ def test_pool_layer_NCHW(): ) print(pool_nchw_from_nhwc.output, out.shape) if pool_nchw_from_nhwc.output.feature_dim_axis == 1: - assert_equal(out.shape, (10, 17, 7, 6)) + assert out.shape == (10, 17, 7, 6) else: - assert_equal(out.shape, (10, 7, 6, 17)) + assert out.shape == (10, 7, 6, 17) print(seq_lens) if tf_util.is_gpu_available(): out, seq_lens = session.run( @@ -9001,7 +8998,7 @@ def test_pool_layer_NCHW(): }, ) print(out.shape) - assert_equal(out.shape, (10, 17, 7, 6)) + assert out.shape == (10, 17, 7, 6) print(seq_lens) @@ -9060,7 +9057,7 @@ def test_TransposedConvLayer_2d_simple(): ) out = net.get_default_output_layer().output.copy_as_batch_feature_major() assert out.batch_shape == (None, 13, None, 2) - assert_equal(out.get_dim_tag(2), net.extern_data.get_default_input_data().get_time_dim_tag()) + assert 
out.get_dim_tag(2) == net.extern_data.get_default_input_data().get_time_dim_tag() assert out.dim_tags[1].dimension == n_out and out.dim_tags[3].dimension == 2 in_v = numpy.arange(0, n_batch * n_time * n_in).astype("float32").reshape((n_batch, n_in, n_time)) session.run(tf_compat.v1.global_variables_initializer()) @@ -9097,7 +9094,7 @@ def test_TransposedConvLayer_2d_2x2(): ) out = net.get_default_output_layer().output.copy_as_batch_feature_major() assert out.batch_shape == (None, 13, None, 2) - assert_not_equal(out.get_dim_tag(2), net.extern_data.get_default_input_data().get_time_dim_tag()) + assert out.get_dim_tag(2) != net.extern_data.get_default_input_data().get_time_dim_tag() assert out.dim_tags[1].dimension == n_out and out.dim_tags[3].dimension == 2 in_v = numpy.arange(0, n_batch * n_time * n_in).astype("float32").reshape((n_batch, n_in, n_time)) session.run(tf_compat.v1.global_variables_initializer()) @@ -9218,8 +9215,8 @@ def test_ReduceLayer_NCHW(): src_nchw.output.size_placeholder[1]: np.full(shape=(10,), fill_value=11), }, ) - assert_equal(out1.shape, (10, 11, 16)) - assert_equal(out2.shape, (16, 11, 16)) + assert out1.shape == (10, 11, 16) + assert out2.shape == (16, 11, 16) assert reduce1.output.time_dim_axis == 1 assert reduce2.output.feature_dim_axis == 0 and reduce2.output.dim == 16 assert reduce2.output.batch_dim_axis is None @@ -9318,30 +9315,27 @@ def test_ResizeLayer_fill_value(): print(seq_lens) assert isinstance(out, numpy.ndarray) assert isinstance(seq_lens, numpy.ndarray) - assert_equal( - out.tolist(), + assert out.tolist() == [ [ - [ - 1, - 19, - 19, - 2, - 19, - 19, - 3, - 19, - 19, - 4, - 19, - 19, - 5, - 19, - 19, - ], - [6, 19, 19, 7, 19, 19, 8, 19, 19, 9, 19, 19, 10, 19, 19], + 1, + 19, + 19, + 2, + 19, + 19, + 3, + 19, + 19, + 4, + 19, + 19, + 5, + 19, + 19, ], - ) - assert_equal(seq_lens.tolist(), [15, 9]) + [6, 19, 19, 7, 19, 19, 8, 19, 19, 9, 19, 19, 10, 19, 19], + ] + assert seq_lens.tolist() == [15, 9] def 
test_ResizeLayer_fill_dropout(): @@ -9382,7 +9376,7 @@ def test_ResizeLayer_fill_dropout(): # Non-deterministic output. But we can check some constraints. for i in range(len(src_seq_lens)): assert src_seq_lens[i] <= seq_lens[i] <= src_seq_lens[i] * factor - assert_equal([s for s in out[i] if s != fill_value], src_seqs[i]) + assert [s for s in out[i] if s != fill_value] == src_seqs[i] def test_ResizeLayer_BFT(): @@ -9528,8 +9522,8 @@ def test_DotLayer(): print(seq_lens) assert isinstance(out, numpy.ndarray) assert isinstance(seq_lens, numpy.ndarray) - assert_equal(seq_lens.tolist(), a_seq_lens) - assert_equal(out.shape, (B, H, max(a_seq_lens))) + assert seq_lens.tolist() == a_seq_lens + assert out.shape == (B, H, max(a_seq_lens)) def test_DotLayer2(): @@ -9571,7 +9565,7 @@ def test_DotLayer2(): assert layer.output.dim == V out = session.run(layer.output.placeholder) assert isinstance(out, numpy.ndarray) - assert_equal(out.shape, (S1, S2, B, V)) + assert out.shape == (S1, S2, B, V) def test_DotLayer_linear_square_matrix(): @@ -11378,10 +11372,10 @@ def test_VariableLayer_split_info(): network.construct_from_dict(net_dict) layer = network.layers["output"] assert isinstance(layer, VariableLayer) - assert_equal( - tf_util.get_param_axes_split_info(layer.output.placeholder), - [2 * [feat1.dimension] + [feat2.dimension], 3 * [feat1.dimension]], - ) + assert tf_util.get_param_axes_split_info(layer.output.placeholder) == [ + 2 * [feat1.dimension] + [feat2.dimension], + 3 * [feat1.dimension], + ] def test_VariableLayer_init_by_layer(): @@ -11421,8 +11415,8 @@ def test_VariableLayer_init_by_layer(): tf_compat.v1.set_random_seed(tf_rnd_seed) net = TFNetwork(config=config) net.construct_from_dict(net_dict) - assert_equal(net.layers["random"].params, {}) - assert_equal(net.get_params_list(), [next(iter(net.layers["var"].params.values()))]) + assert net.layers["random"].params == {} + assert net.get_params_list() == [next(iter(net.layers["var"].params.values()))] 
net.initialize_params(session) var_v = session.run(net.layers["var"].output.placeholder) # Run again to check that it is deterministic. @@ -11796,7 +11790,7 @@ def test_HDFDumpLayer(): network.extern_data.data["data"].size_placeholder[0]: seq_lens, network.extern_data.data["seq_tag"].placeholder: input_tags, } - assert_equal(feed[network.extern_data.get_default_input_data().placeholder].shape, (n_batch, seq_len, n_in)) + assert feed[network.extern_data.get_default_input_data().placeholder].shape == (n_batch, seq_len, n_in) session.run([out, network.get_post_control_dependencies()], feed_dict=feed) network.call_graph_reset_callbacks() @@ -11806,8 +11800,8 @@ def test_HDFDumpLayer(): reader.read_all() assert reader.num_seqs == 1 assert reader.seq_tags == ["seq-0"] - assert_equal(reader.seq_lens[0]["data"], seq_lens[0]) - assert_equal(reader.data["data"][0].shape, (seq_lens[0], n_out)) + assert reader.seq_lens[0]["data"] == seq_lens[0] + assert reader.data["data"][0].shape == (seq_lens[0], n_out) def test_HDFDumpLayer_sparse(): @@ -11857,10 +11851,10 @@ def test_HDFDumpLayer_sparse(): reader.read_all() assert reader.num_seqs == 1 assert reader.seq_tags == ["seq-0"] - assert_equal(reader.seq_lens[0]["data"], classes_seq_lens[0]) - assert_equal(reader.data["data"][0].shape, (classes_seq_lens[0],)) - assert_equal(reader.data_sparse["data"], True) - assert_equal(reader.dataset.get_data_dim("data"), n_out) + assert reader.seq_lens[0]["data"] == classes_seq_lens[0] + assert reader.data["data"][0].shape == (classes_seq_lens[0],) + assert reader.data_sparse["data"] == True + assert reader.dataset.get_data_dim("data") == n_out def test_HDFDumpLayer_fixed_length(): @@ -11913,8 +11907,8 @@ def test_HDFDumpLayer_fixed_length(): reader.read_all() assert reader.num_seqs == 1 assert reader.seq_tags == ["seq-0"] - assert_equal(reader.seq_lens[0]["data"], 1) - assert_equal(reader.data["data"][0].shape, (1, n_out)) + assert reader.seq_lens[0]["data"] == 1 + assert 
reader.data["data"][0].shape == (1, n_out) def test_HDFDumpLayer_extra(): @@ -11990,10 +11984,10 @@ def test_HDFDumpLayer_extra(): reader.read_all() assert reader.num_seqs == 1 assert reader.seq_tags == ["seq-0"] - assert_equal(reader.seq_lens[0]["data"], input_seq_lens[0]) - assert_equal(reader.data["data"][0].shape, (input_seq_lens[0], n_in)) - assert_equal(reader.data["classes1"][0].shape, (classes1_seq_lens[0],)) - assert_equal(reader.data["classes2"][0].shape, (1,)) + assert reader.seq_lens[0]["data"] == input_seq_lens[0] + assert reader.data["data"][0].shape == (input_seq_lens[0], n_in) + assert reader.data["classes1"][0].shape == (classes1_seq_lens[0],) + assert reader.data["classes2"][0].shape == (1,) numpy.testing.assert_almost_equal(reader.data["data"][0], input_data[0]) numpy.testing.assert_equal(reader.data["classes1"][0], classes1_data[0]) numpy.testing.assert_equal(reader.data["classes2"][0], [classes2_data[0]]) @@ -12070,10 +12064,10 @@ def test_HDFDumpLayer_dump_whole_batch_extra_sm(): reader.read_all() assert reader.num_seqs == 1 assert reader.seq_tags == ["seq-0"] - assert_equal(reader.seq_lens[0]["data"], input_seq_lens[0]) - assert_equal(reader.data["data"][0].shape, (input_seq_lens[0], n_in)) + assert reader.seq_lens[0]["data"] == input_seq_lens[0] + assert reader.data["data"][0].shape == (input_seq_lens[0], n_in) numpy.testing.assert_almost_equal(reader.data["data"][0], input_data[0]) - assert_equal(reader.data["sm"][0].shape, (sm_seq_lens1[0] * sm_seq_lens2[0],)) + assert reader.data["sm"][0].shape == (sm_seq_lens1[0] * sm_seq_lens2[0],) numpy.testing.assert_equal(numpy.reshape(reader.data["sm"][0], sm_data[0].shape), sm_data[0]) @@ -12148,9 +12142,9 @@ def test_HDFDumpLayer_dump_whole_batch_extra_sm1(): reader.read_all() assert reader.num_seqs == 1 assert reader.seq_tags == ["seq-0"] - assert_equal(reader.data["data"][0].shape, (input_seq_lens[0], n_in)) + assert reader.data["data"][0].shape == (input_seq_lens[0], n_in) 
numpy.testing.assert_almost_equal(reader.data["data"][0], input_data[0]) - assert_equal(reader.data["sm"][0].shape, (sm_seq_lens1[0] * sm_seq_lens2[0],)) + assert reader.data["sm"][0].shape == (sm_seq_lens1[0] * sm_seq_lens2[0],) sm_data_ = numpy.transpose(sm_data, (1, 0, 3, 2)) numpy.testing.assert_equal(numpy.reshape(reader.data["sm"][0], sm_data_[0].shape), sm_data_[0]) @@ -12517,7 +12511,7 @@ def test_automatic_seq_lengths(): session.run(tf_compat.v1.global_variables_initializer()) in_data = net.extern_data.get_default_input_data() out_data = net.layers["output"].output.copy_as_batch_spatial_major() - assert_equal(out_data.shape, in_data.shape) + assert out_data.shape == in_data.shape n_batch = 3 max_seq_len = 10 feed = make_feed_dict([in_data], n_batch=n_batch, n_time=max_seq_len) @@ -12525,8 +12519,8 @@ def test_automatic_seq_lengths(): out_v, out_lens_v = session.run((out_data.placeholder, out_lens), feed_dict=feed) in_v = feed[in_data.placeholder] in_lens_v = feed[in_data.size_placeholder[0]] - assert_equal(in_v.shape, out_v.shape) - assert_equal(in_lens_v.tolist(), out_lens_v.tolist()) + assert in_v.shape == out_v.shape + assert in_lens_v.tolist() == out_lens_v.tolist() # So far, everything should always be true, unless we have messed some op really up. # Now we want to do the main test, i.e. whether we get the same tensor. 
from returnn.tf.util.basic import print_graph_output @@ -12564,7 +12558,7 @@ def test_automatic_seq_lengths2(): session.run(tf_compat.v1.global_variables_initializer()) in_data = net.extern_data.get_default_input_data() out_data = net.layers["output"].output.copy_as_batch_spatial_major() - assert_equal(out_data.shape, in_data.shape) + assert out_data.shape == in_data.shape n_batch = 3 max_seq_len = 10 feed = make_feed_dict([in_data], n_batch=n_batch, n_time=max_seq_len) @@ -12572,8 +12566,8 @@ def test_automatic_seq_lengths2(): out_v, out_lens_v = session.run((out_data.placeholder, out_lens), feed_dict=feed) in_v = feed[in_data.placeholder] in_lens_v = feed[in_data.size_placeholder[0]] - assert_equal(in_v.shape, out_v.shape) - assert_equal(in_lens_v.tolist(), out_lens_v.tolist()) + assert in_v.shape == out_v.shape + assert in_lens_v.tolist() == out_lens_v.tolist() # So far, everything should always be true, unless we have messed some op really up. # Now we want to do the main test, i.e. whether we get the same tensor. 
from returnn.tf.util.basic import print_graph_output diff --git a/tests/test_TFNetworkRecLayer.py b/tests/test_TFNetworkRecLayer.py index b789d83ea..2f6d99302 100644 --- a/tests/test_TFNetworkRecLayer.py +++ b/tests/test_TFNetworkRecLayer.py @@ -6,7 +6,6 @@ import tensorflow as tf import sys import os -from nose.tools import assert_equal, assert_not_equal, assert_is_instance from numpy.testing import assert_almost_equal, assert_allclose import unittest import numpy.testing @@ -743,7 +742,7 @@ def check( ) cu_size = cudnn_rnn_params_size(T=T, S=S, **common_kwargs)[0] my_size = RecLayer._get_cudnn_param_size(**common_kwargs) - assert_equal(cu_size.eval(), my_size) + assert cu_size.eval() == my_size with tf_compat.v1.Session(): check(rnn_mode="lstm", num_units=5, input_size=3) @@ -840,7 +839,7 @@ def test_cudnn_save_restore(): network1.extern_data.data["data"].size_placeholder[0]: seq_lens, }, ) - assert_equal(output_data1.shape, (seq_lens[0], 1, num_outputs)) # (time, batch, dim) + assert output_data1.shape == (seq_lens[0], 1, num_outputs) # (time, batch, dim) print("Saveable params:") pprint(network1.get_saveable_params_list()) network1.save_params_to_file(filename=model_filename, session=session) @@ -861,7 +860,7 @@ def test_cudnn_save_restore(): print(" param %r: %r" % (param_name, param1)) param1old = params[layer_name][param_name] param1new = param1.eval(session) - assert_equal(param1old.shape, param1new.shape) + assert param1old.shape == param1new.shape # Unfortunately, this doesn't seem to be the case. # Also, doesn't need to be, because they have two biases, so it's not unique. 
# assert param1old.ndim == 1 @@ -944,7 +943,7 @@ def check(**kwargs): print("bias:", p, "shape:", tf.shape(p).eval()) s2 = sum([tf.reduce_prod(tf.shape(p)).eval() for p in weights + biases]) print("summed up size:", s2) - assert_equal(s1, s2) + assert s1 == s2 check(num_layers=1, num_units=5, input_size=3, direction="unidirectional") check(num_layers=1, num_units=5, input_size=3, direction="bidirectional") # fails in TF 1.2.0 @@ -997,7 +996,7 @@ def make_feed_dict(seq_len=10): network.initialize_params(session=session) print("Test run...") output_data1 = session.run(network.get_default_output_layer().output.placeholder, feed_dict=make_feed_dict(5)) - assert_equal(output_data1.shape, (5, 1, num_outputs)) # (time, batch, dim) + assert output_data1.shape == (5, 1, num_outputs) # (time, batch, dim) layer = network.layers["output"] loss_t = network.get_total_loss() * layer.loss.get_normalization_factor() @@ -1447,7 +1446,7 @@ def test_rec_RecStepInfoLayer(): ) assert isinstance(out_v, numpy.ndarray) assert out_v.shape == (n_time,) - assert_equal(out_v.tolist(), [0, 1, 2]) + assert out_v.tolist() == [0, 1, 2] def test_rec_RecStepInfoLayer_broadcast_moved_out(): @@ -1666,11 +1665,11 @@ def test_RecUnstackLayer_rec_no_input_explicit_axis(): from returnn.tf.layers.rec import _SubnetworkRecCell assert isinstance(cell, _SubnetworkRecCell) - assert_equal(cell.input_layers_moved_out, ["input"]) - assert_equal(cell.layers_in_loop, ["output"]) + assert cell.input_layers_moved_out == ["input"] + assert cell.layers_in_loop == ["output"] in_data = net.extern_data.get_default_input_data() out_data = rec_layer.output - assert_equal(in_data.get_time_dim_tag(), out_data.get_time_dim_tag()) + assert in_data.get_time_dim_tag() == out_data.get_time_dim_tag() from test_TFNetworkLayer import make_feed_dict session.run(out_data.placeholder, feed_dict=make_feed_dict([in_data])) @@ -1701,11 +1700,11 @@ def test_RecUnstackLayer_rec_no_input_declare_rec_time(): from returnn.tf.layers.rec import 
_SubnetworkRecCell assert isinstance(cell, _SubnetworkRecCell) - assert_equal(cell.input_layers_moved_out, ["input"]) - assert_equal(cell.layers_in_loop, ["output"]) + assert cell.input_layers_moved_out == ["input"] + assert cell.layers_in_loop == ["output"] in_data = net.extern_data.get_default_input_data() out_data = rec_layer.output - assert_equal(in_data.get_time_dim_tag(), out_data.get_time_dim_tag()) + assert in_data.get_time_dim_tag() == out_data.get_time_dim_tag() from test_TFNetworkLayer import make_feed_dict session.run(out_data.placeholder, feed_dict=make_feed_dict([in_data])) @@ -1733,10 +1732,10 @@ def test_search_no_rec_explicit(): assert len(expected_final_seqs) == len(expected_debug_out) == beam_size n_time = 3 n_classes = 4 - assert_equal(logits.shape, (n_time, n_classes)) + assert logits.shape == (n_time, n_classes) n_batch = 1 logits = numpy.expand_dims(logits, axis=0) - assert_equal(logits.shape, (n_batch, n_time, n_classes)) + assert logits.shape == (n_batch, n_time, n_classes) print("logits:") print(logits) @@ -1764,27 +1763,27 @@ def test_search_no_rec_explicit(): ) net = TFNetwork(extern_data=extern_data, search_flag=True, train_flag=False, eval_flag=False) net.construct_from_dict(net_dict) - assert_equal(net.used_data_keys, {"data"}) # not classes + assert net.used_data_keys == {"data"} # not classes rec_layer = net.layers["output"] assert isinstance(rec_layer, RecLayer) subnet = rec_layer.cell assert isinstance(subnet, _SubnetworkRecCell) - assert_equal(subnet.layers_in_loop, ["output"]) + assert subnet.layers_in_loop == ["output"] sub_layer = subnet.net.layers["output"] assert isinstance(sub_layer, ChoiceLayer) - assert_equal(sub_layer.output.beam.beam_size, beam_size) - assert_equal(rec_layer.output.beam.beam_size, beam_size) + assert sub_layer.output.beam.beam_size == beam_size + assert rec_layer.output.beam.beam_size == beam_size input_search_choices = net.get_search_choices(sources=rec_layer.sources) assert not input_search_choices 
assert rec_layer.output.is_time_major - assert_equal(rec_layer.get_search_beam_size(), beam_size) + assert rec_layer.get_search_beam_size() == beam_size feed_dict = { net.extern_data.get_batch_info().dim: 1, net.extern_data.data["data"].placeholder: logits, net.extern_data.data["data"].size_placeholder[0]: [n_time], } with tf_compat.v1.Session() as session: - assert_equal(session.run(net.get_data_batch_dim(), feed_dict=feed_dict), n_batch) + assert session.run(net.get_data_batch_dim(), feed_dict=feed_dict) == n_batch out, out_sizes = session.run( (rec_layer.output.placeholder, rec_layer.output.get_sequence_lengths()), feed_dict=feed_dict ) @@ -1793,9 +1792,9 @@ def test_search_no_rec_explicit(): print(out) assert isinstance(out_sizes, numpy.ndarray) assert isinstance(out, numpy.ndarray) - assert_equal(out_sizes.shape, (n_batch * beam_size,)) - assert_equal(out.shape, (n_time, n_batch * beam_size)) - assert_equal(out_sizes.tolist(), [n_time] * beam_size) + assert out_sizes.shape == (n_batch * beam_size,) + assert out.shape == (n_time, n_batch * beam_size) + assert out_sizes.tolist() == [n_time] * beam_size out = numpy.reshape(out, (n_time, n_batch, beam_size)) print("Debug out:") @@ -1808,7 +1807,7 @@ def test_search_no_rec_explicit(): out_seq = out[:, 0, beam].tolist() expected_seq = expected_final_seqs[beam] print("beam %i, out seq %r, expected seq %r" % (beam, out_seq, expected_seq)) - assert_equal(out_seq, expected_final_seqs[beam]) + assert out_seq == expected_final_seqs[beam] assert len(debug_out) == n_time # Could be that it is not in order (because of parallel execution of the loop). 
@@ -1821,12 +1820,12 @@ def test_search_no_rec_explicit(): assert k in debug_t out_v = debug_t[k] if isinstance(v, int): - assert_equal(v, out_v) + assert v == out_v else: assert isinstance(out_v, numpy.ndarray) assert out_v.shape[0] == n_batch, "t %i, k %r, v %r" % (t, k, v) out_v = out_v[0] - assert_equal(v, out_v.tolist(), "t %i, k %r" % (t, k)) + assert v == out_v.tolist(), "t %i, k %r" % (t, k) print("Seems fine.") @@ -1856,10 +1855,10 @@ def test_search_no_rec_explicit_dyn_len(): assert len(expected_final_seqs) == len(expected_debug_out) == beam_size n_time = 3 n_classes = 4 - assert_equal(logits.shape, (n_time, n_classes)) + assert logits.shape == (n_time, n_classes) n_batch = 1 logits = numpy.expand_dims(logits, axis=0) - assert_equal(logits.shape, (n_batch, n_time, n_classes)) + assert logits.shape == (n_batch, n_time, n_classes) print("logits:") print(logits) @@ -1890,35 +1889,35 @@ def test_search_no_rec_explicit_dyn_len(): ) net = TFNetwork(extern_data=extern_data, search_flag=True, train_flag=False, eval_flag=False) net.construct_from_dict(net_dict) - assert_equal(net.used_data_keys, {"data"}) # not classes + assert net.used_data_keys == {"data"} # not classes rec_layer = net.layers["output"] assert isinstance(rec_layer, RecLayer) subnet = rec_layer.cell assert isinstance(subnet, _SubnetworkRecCell) - assert_equal(set(subnet.layers_in_loop), {"output", "end"}) + assert set(subnet.layers_in_loop) == {"output", "end"} sub_layer = subnet.net.layers["output"] assert isinstance(sub_layer, ChoiceLayer) - assert_equal(sub_layer.output.beam.beam_size, beam_size) - assert_equal(rec_layer.output.beam.beam_size, beam_size) + assert sub_layer.output.beam.beam_size == beam_size + assert rec_layer.output.beam.beam_size == beam_size input_search_choices = net.get_search_choices(sources=rec_layer.sources) assert not input_search_choices assert rec_layer.output.is_time_major - assert_equal(rec_layer.get_search_beam_size(), beam_size) + assert 
rec_layer.get_search_beam_size() == beam_size feed_dict = { net.extern_data.get_batch_info().dim: 1, net.extern_data.data["data"].placeholder: logits, net.extern_data.data["data"].size_placeholder[0]: [n_time], } with tf_compat.v1.Session() as session: - assert_equal(session.run(net.get_data_batch_dim(), feed_dict=feed_dict), n_batch) + assert session.run(net.get_data_batch_dim(), feed_dict=feed_dict) == n_batch out, out_sizes = session.run( (rec_layer.output.placeholder, rec_layer.output.get_sequence_lengths()), feed_dict=feed_dict ) print("output seq lens:", out_sizes) assert isinstance(out_sizes, numpy.ndarray) assert isinstance(out, numpy.ndarray) - assert_equal(out_sizes.shape, (n_batch * beam_size,)) - assert_equal(out.shape, (n_time, n_batch * beam_size)) + assert out_sizes.shape == (n_batch * beam_size,) + assert out.shape == (n_time, n_batch * beam_size) out = numpy.reshape(out, (n_time, n_batch, beam_size)) print("output:") print(out) @@ -1928,14 +1927,14 @@ def test_search_no_rec_explicit_dyn_len(): ChoiceLayer._debug_out = [] pprint(debug_out) - assert_equal(out_sizes.tolist(), expected_final_seq_lens) + assert out_sizes.tolist() == expected_final_seq_lens # Assume that beams are sorted by score. See above. for beam in range(beam_size): out_seq = out[:, 0, beam].tolist() expected_seq = expected_final_seqs[beam] print("beam %i, out seq %r, expected seq %r" % (beam, out_seq, expected_seq)) - assert_equal(out_seq, expected_final_seqs[beam]) + assert out_seq == expected_final_seqs[beam] assert len(debug_out) == n_time # Could be that it is not in order (because of parallel execution of the loop). 
@@ -1948,7 +1947,7 @@ def test_search_no_rec_explicit_dyn_len(): assert k in debug_t out_v = debug_t[k] if isinstance(v, int): - assert_equal(v, out_v) + assert v == out_v else: assert isinstance(out_v, numpy.ndarray) assert out_v.shape[0] == n_batch, "t %i, k %r, v %r" % (t, k, v) @@ -2202,8 +2201,8 @@ def test_search_multi_choice(): % (b, i, s[i][1][0], s[i][1][1], -s[i][0]) ) numpy.testing.assert_allclose(-s[i][0], raw_beam_scores[b, i], rtol=1e-5) - assert_equal(s[i][1][0], raw_src_beams[b, i]) - assert_equal(s[i][1][1], raw_choices[b, i]) + assert s[i][1][0] == raw_src_beams[b, i] + assert s[i][1][1] == raw_choices[b, i] # Select src beams. assert lin_values.shape == (n_batch, cur_beam_size, num_choices, n_time, n_hidden) @@ -2213,12 +2212,12 @@ def test_search_multi_choice(): choices_values = numpy.array([choices_values[b, raw_src_beams[b]] for b in range(n_batch)]) assert choices_values.shape == (n_batch, beam_size, num_choices, n_time) for c in range(num_choices): - assert_equal(prob_values[c].shape, (n_batch, cur_beam_size, n_time, [dim1, dim2][c])) + assert prob_values[c].shape == (n_batch, cur_beam_size, n_time, [dim1, dim2][c]) prob_values = [ numpy.array([prob_values[c][b, raw_src_beams[b]] for b in range(n_batch)]) for c in range(num_choices) ] for c in range(num_choices): - assert_equal(prob_values[c].shape, (n_batch, beam_size, n_time, [dim1, dim2][c])) + assert prob_values[c].shape == (n_batch, beam_size, n_time, [dim1, dim2][c]) # Ok. Update the beam. scores_base = raw_beam_scores @@ -2491,8 +2490,8 @@ def test_search_multi_choice_simple_keep_beams(): % (b, i, s[i][1][0], s[i][1][1], -s[i][0]) ) numpy.testing.assert_allclose(-s[i][0], raw_beam_scores[b, i], rtol=1e-5) - assert_equal(s[i][1][0], raw_src_beams[b, i]) - assert_equal(s[i][1][1], raw_choices[b, i]) + assert s[i][1][0] == raw_src_beams[b, i] + assert s[i][1][1] == raw_choices[b, i] # Select src beams. 
assert lin_values.shape == (n_batch, cur_beam_size, num_choices, n_time, n_hidden) @@ -3496,8 +3495,8 @@ def train(net): train_out_layer = train_net.layers["output"] assert isinstance(train_out_layer, RecLayer) assert isinstance(train_out_layer.cell, _SubnetworkRecCell) - assert_equal(set(train_out_layer.cell.input_layers_moved_out), {"output", "target_embed"}) - assert_equal(set(train_out_layer.cell.output_layers_moved_out), {"output_prob", "readout_in", "readout"}) + assert set(train_out_layer.cell.input_layers_moved_out) == {"output", "target_embed"} + assert set(train_out_layer.cell.output_layers_moved_out) == {"output_prob", "readout_in", "readout"} train(train_net) print("=" * 40) @@ -3690,9 +3689,9 @@ def get_net_dict(l2_target_embed=0.0, l2_readout_in=0.0): train_out_layer = train_net.layers["output"] assert isinstance(train_out_layer, RecLayer) assert isinstance(train_out_layer.cell, _SubnetworkRecCell) - assert_equal(set(train_out_layer.cell.input_layers_moved_out), {"output", "target_embed"}) - assert_equal(set(train_out_layer.cell.output_layers_moved_out), {"output_prob", "readout_in", "readout"}) - assert_equal(train_net.get_total_constraints(), 0) + assert set(train_out_layer.cell.input_layers_moved_out) == {"output", "target_embed"} + assert set(train_out_layer.cell.output_layers_moved_out) == {"output_prob", "readout_in", "readout"} + assert train_net.get_total_constraints() == 0 print("Constructing train network with L2 norm on moved out input layer") tf_compat.v1.reset_default_graph() @@ -3710,9 +3709,9 @@ def get_net_dict(l2_target_embed=0.0, l2_readout_in=0.0): train_out_layer = train_net.layers["output"] assert isinstance(train_out_layer, RecLayer) assert isinstance(train_out_layer.cell, _SubnetworkRecCell) - assert_equal(set(train_out_layer.cell.input_layers_moved_out), {"output", "target_embed"}) - assert_equal(set(train_out_layer.cell.output_layers_moved_out), {"output_prob", "readout_in", "readout"}) - 
assert_not_equal(train_net.get_total_constraints(), 0) + assert set(train_out_layer.cell.input_layers_moved_out) == {"output", "target_embed"} + assert set(train_out_layer.cell.output_layers_moved_out) == {"output_prob", "readout_in", "readout"} + assert train_net.get_total_constraints() != 0 print("Constructing train network with L2 norm on moved out output layer") tf_compat.v1.reset_default_graph() @@ -3730,9 +3729,9 @@ def get_net_dict(l2_target_embed=0.0, l2_readout_in=0.0): train_out_layer = train_net.layers["output"] assert isinstance(train_out_layer, RecLayer) assert isinstance(train_out_layer.cell, _SubnetworkRecCell) - assert_equal(set(train_out_layer.cell.input_layers_moved_out), {"output", "target_embed"}) - assert_equal(set(train_out_layer.cell.output_layers_moved_out), {"output_prob", "readout_in", "readout"}) - assert_not_equal(train_net.get_total_constraints(), 0) + assert set(train_out_layer.cell.input_layers_moved_out) == {"output", "target_embed"} + assert set(train_out_layer.cell.output_layers_moved_out) == {"output_prob", "readout_in", "readout"} + assert train_net.get_total_constraints() != 0 def test_rec_layer_move_out_of_loop_ref_att_generic_att(): @@ -3994,7 +3993,7 @@ def create_rnd_flat_att_weights(dec_t, enc_t): train_out_layer = train_net.layers["output"] assert isinstance(train_out_layer, RecLayer) assert isinstance(train_out_layer.cell, _SubnetworkRecCell) - assert_equal(train_out_layer.cell.layers_in_loop, []) # all moved out :) + assert train_out_layer.cell.layers_in_loop == [] # all moved out :) rec_subnet = train_out_layer.cell.output_layers_net assert isinstance(rec_subnet, TFNetwork) att_layer = rec_subnet.layers["att"] @@ -4865,22 +4864,20 @@ def get_net_dict(): prev_loop_out_layer = loop_net.layers["prev:output"] assert prev_out_choice == prev_loop_out_layer.search_choices assert RecLayer.is_prev_step_layer(prev_out_choice.owner) - assert_equal(loop_net.layers["end"].get_search_choices(), cur_out_choice) - 
assert_equal(loop_net.layers["target_embed"].get_search_choices(), cur_out_choice) - assert_equal(loop_net.layers["prev:target_embed"].get_search_choices(), prev_out_choice) - assert_equal(loop_net.layers["accum_att_weights"].get_search_choices(), prev_out_choice) - assert_equal( - loop_net.layers["prev:accum_att_weights"].get_search_choices(), prev_out_choice - ) # will be transformed - assert_equal(loop_net.layers["weight_feedback"].get_search_choices(), prev_out_choice) + assert loop_net.layers["end"].get_search_choices() == cur_out_choice + assert loop_net.layers["target_embed"].get_search_choices() == cur_out_choice + assert loop_net.layers["prev:target_embed"].get_search_choices() == prev_out_choice + assert loop_net.layers["accum_att_weights"].get_search_choices() == prev_out_choice + assert loop_net.layers["prev:accum_att_weights"].get_search_choices() == prev_out_choice # will be transformed + assert loop_net.layers["weight_feedback"].get_search_choices() == prev_out_choice loop_net.debug_search_choices(loop_net.layers["s"]) - assert_equal(loop_net.layers["s"].get_search_choices(), cur_out_choice) - assert_equal(loop_net.layers["prev:s"].get_search_choices(), prev_out_choice) - assert_equal(loop_net.layers["prev_s_state"].get_search_choices(), prev_out_choice) - assert_equal(loop_net.layers["energy_in"].get_search_choices(), prev_out_choice) - assert_equal(loop_net.layers["att_weights"].get_search_choices(), prev_out_choice) - assert_equal(loop_net.layers["att"].get_search_choices(), prev_out_choice) - assert_equal(loop_net.layers["output_prob"].get_search_choices(), prev_out_choice) + assert loop_net.layers["s"].get_search_choices() == cur_out_choice + assert loop_net.layers["prev:s"].get_search_choices() == prev_out_choice + assert loop_net.layers["prev_s_state"].get_search_choices() == prev_out_choice + assert loop_net.layers["energy_in"].get_search_choices() == prev_out_choice + assert loop_net.layers["att_weights"].get_search_choices() == 
prev_out_choice + assert loop_net.layers["att"].get_search_choices() == prev_out_choice + assert loop_net.layers["output_prob"].get_search_choices() == prev_out_choice out = search_net.get_layer("decision").output search_net.initialize_params(session) @@ -5210,7 +5207,7 @@ def test_RnnCellLayer_with_time(): assert l2.output.batch_dim_axis == 1 assert l2.output.dim == 10 assert l2.output.shape == (None, 10) - assert_equal(set(l1.params.keys()), set(l2.params.keys())) + assert set(l1.params.keys()) == set(l2.params.keys()) for key in l1.params.keys(): assert l1.params[key].shape == l2.params[key].shape @@ -5256,7 +5253,7 @@ def test_rec_subnet_simple_rnn(): assert cell_sub_layer_out.batch_shape == (None, n_out) network.initialize_params(session) weights_var = network.layers["output"].params["output/W"] - assert_equal(weights_var.get_shape().as_list(), [n_out + n_in, n_out]) + assert weights_var.get_shape().as_list() == [n_out + n_in, n_out] weights_np = (numpy.arange(0, (n_out + n_in) * n_out) - (n_out + n_in) * n_out * 0.5) * 0.1 weights_np = weights_np.reshape((n_out + n_in, n_out)) network.get_var_assigner(weights_var).assign(value=weights_np, session=session) @@ -5264,7 +5261,7 @@ def test_rec_subnet_simple_rnn(): input_np = numpy.array(input_np, dtype="float32") input_seq_lens = [3, 2] n_batch = len(input_seq_lens) - assert_equal(input_np.shape, (n_batch, max(input_seq_lens), n_in)) + assert input_np.shape == (n_batch, max(input_seq_lens), n_in) input_placeholder = network.extern_data.data["data"].placeholder input_seq_lens_placeholder = network.extern_data.data["data"].size_placeholder[0] output_np, output_seq_lens = session.run( @@ -5275,15 +5272,15 @@ def test_rec_subnet_simple_rnn(): input_seq_lens_placeholder: input_seq_lens, }, ) - assert_equal(list(output_seq_lens), input_seq_lens) - assert_equal(output_np.shape, (n_batch, max(input_seq_lens), n_out)) + assert list(output_seq_lens) == input_seq_lens + assert output_np.shape == (n_batch, 
max(input_seq_lens), n_out) output_last_np = numpy.zeros((n_batch, n_out), dtype="float32") output_calc_np = numpy.zeros((n_batch, max(input_seq_lens), n_out), dtype="float32") for t in range(max(input_seq_lens)): _in = numpy.concatenate([output_last_np, input_np[:, t]], axis=1) - assert_equal(_in.shape, (n_batch, n_out + n_in)) + assert _in.shape == (n_batch, n_out + n_in) _out = numpy.dot(_in, weights_np) - assert_equal(_out.shape, (n_batch, n_out)) + assert _out.shape == (n_batch, n_out) _out = numpy.maximum(_out, 0.0) # relu output_last_np = _out output_calc_np[:, t] = _out @@ -5333,7 +5330,7 @@ def test_rec_subnet_simple_rnn(): network.initialize_params(session) output_layer = network.layers["output"] weights_var = output_layer.params["output/output/rec/basic_rnn_cell/kernel"] - assert_equal(weights_var.get_shape().as_list(), [n_out + n_in, n_out]) + assert weights_var.get_shape().as_list() == [n_out + n_in, n_out] # BasicRNNCell expects it as [inputs, state], but we have it as [state, inputs]. 
weights_conv_np = numpy.concatenate([weights_np[n_out:], weights_np[:n_out]]) network.get_var_assigner(weights_var).assign(value=weights_conv_np, session=session) @@ -5347,8 +5344,8 @@ def test_rec_subnet_simple_rnn(): input_seq_lens_placeholder: input_seq_lens, }, ) - assert_equal(list(output_seq_lens), input_seq_lens) - assert_equal(output_np.shape, (n_batch, max(input_seq_lens), n_out)) + assert list(output_seq_lens) == input_seq_lens + assert output_np.shape == (n_batch, max(input_seq_lens), n_out) print("rnn_cell subnet output:") print(output_np) assert_almost_equal(output_np, output_calc_np) @@ -5432,18 +5429,17 @@ def check_reclayer_optimize_out( assert isinstance(net2_reclayer, RecLayer) net2_subnet = net2_reclayer.cell assert isinstance(net2_subnet, _SubnetworkRecCell) - assert_equal(set(net1_subnet.input_layers_moved_out), set()) - assert_equal(set(net2_subnet.input_layers_moved_out), set()) - assert_equal(set(net1_subnet.output_layers_moved_out), set()) + assert set(net1_subnet.input_layers_moved_out) == set() + assert set(net2_subnet.input_layers_moved_out) == set() + assert set(net1_subnet.output_layers_moved_out) == set() # output_layers_moved_out will contain sublayers if present output_root_layers_moved_out = [ name for name in net2_subnet.output_layers_moved_out if "/" not in name and name != ":i" ] - assert_equal(set(output_root_layers_moved_out), {"output"}.union(set(other_subnet_layers or []))) - assert_equal( - [v.name.split("/")[1:] for v in net1.get_params_list()], - [v.name.split("/")[1:] for v in net2.get_params_list()], - ) + assert set(output_root_layers_moved_out) == {"output"}.union(set(other_subnet_layers or [])) + assert [v.name.split("/")[1:] for v in net1.get_params_list()] == [ + v.name.split("/")[1:] for v in net2.get_params_list() + ] net1.initialize_params(session=session) net1_params = net1.layers["output_not_opt"].get_param_values_dict(session=session) print("params:", list(net1_params.keys())) @@ -6680,9 +6676,9 @@ def 
_subnet_base_eval_func(source, **_kwargs): assert isinstance(rec_layer, RecLayer) rec_cell = rec_layer.cell assert isinstance(rec_cell, _SubnetworkRecCell) - assert_equal(set(rec_cell.input_layers_moved_out), {"outside", "subnet/a", "subnet_a"}) - assert_equal(set(rec_cell.layers_in_loop), {"inside", "subnet", "subnet/b", "subnet/output", "subnet_b"}) - assert_equal(set(rec_cell.output_layers_moved_out), {"output"}) + assert set(rec_cell.input_layers_moved_out) == {"outside", "subnet/a", "subnet_a"} + assert set(rec_cell.layers_in_loop) == {"inside", "subnet", "subnet/b", "subnet/output", "subnet_b"} + assert set(rec_cell.output_layers_moved_out) == {"output"} session.run(tf_compat.v1.global_variables_initializer()) from test_TFNetworkLayer import make_feed_dict @@ -7402,9 +7398,9 @@ def get_out(optimize_out): assert isinstance(rec_layer, RecLayer) cell = rec_layer.cell assert isinstance(cell, _SubnetworkRecCell) - assert_equal(cell.input_layers_moved_out, []) + assert cell.input_layers_moved_out == [] if optimize_out: - assert_equal(cell.layers_in_loop, []) # all moved out + assert cell.layers_in_loop == [] # all moved out out = net.get_layer("output/output_prob").output.copy_as_batch_major() assert out.batch_ndim == 3 and out.shape == (None, n_tgt_dim) feed_dict = get_feed_dict(extern_data=net.extern_data) @@ -7501,11 +7497,14 @@ def make_extern_data(): assert isinstance(rec_layer, RecLayer) cell = rec_layer.cell assert isinstance(cell, _SubnetworkRecCell) - assert_equal(cell.input_layers_moved_out, []) - assert_equal( - cell.output_layers_moved_out, - ["output_prob", "target_embed_raw", "output", "encoder_reduced", "encoder_int"], - ) + assert cell.input_layers_moved_out == [] + assert cell.output_layers_moved_out == [ + "output_prob", + "target_embed_raw", + "output", + "encoder_reduced", + "encoder_int", + ] print("Constructing search network.") with make_scope(): @@ -7676,7 +7675,7 @@ def test_subnet_load_on_init_rec(): input_np = numpy.array(input_np, 
dtype="float32") input_seq_lens = [3, 2] n_batch = len(input_seq_lens) - assert_equal(input_np.shape, (n_batch, max(input_seq_lens), n_in)) + assert input_np.shape == (n_batch, max(input_seq_lens), n_in) input_placeholder = network.extern_data.data["data"].placeholder input_seq_lens_placeholder = network.extern_data.data["data"].size_placeholder[0] output_layer = network.get_default_output_layer(must_exist=True) @@ -7688,8 +7687,8 @@ def test_subnet_load_on_init_rec(): input_seq_lens_placeholder: input_seq_lens, }, ) - assert_equal(list(output_seq_lens), input_seq_lens) - assert_equal(output_orig_np.shape, (n_batch, max(input_seq_lens), n_out)) + assert list(output_seq_lens) == input_seq_lens + assert output_orig_np.shape == (n_batch, max(input_seq_lens), n_out) for t in range(max(output_seq_lens)): for b in range(n_batch): if t >= output_seq_lens[b]: @@ -7777,8 +7776,8 @@ def test_subnet_load_on_init_rec(): input_seq_lens_placeholder: input_seq_lens, }, ) - assert_equal(list(output_seq_lens), input_seq_lens) - assert_equal(output_np.shape, (n_batch, max(input_seq_lens), n_out)) + assert list(output_seq_lens) == input_seq_lens + assert output_np.shape == (n_batch, max(input_seq_lens), n_out) for t in range(max(output_seq_lens)): for b in range(n_batch): if t >= output_seq_lens[b]: @@ -8854,7 +8853,7 @@ def test_reclayer_time_sync_target_diff(): assert isinstance(rec_layer, RecLayer) cell = rec_layer.cell assert isinstance(cell, _SubnetworkRecCell) - assert_equal(cell.layers_in_loop, []) + assert cell.layers_in_loop == [] loss = net.get_total_loss() from test_TFNetworkLayer import make_feed_dict @@ -8910,7 +8909,7 @@ def make_config(lstm_unit): input_np = numpy.array(input_np, dtype="float32") input_seq_lens = [3, 2] n_batch = len(input_seq_lens) - assert_equal(input_np.shape, (n_batch, max(input_seq_lens), n_in)) + assert input_np.shape == (n_batch, max(input_seq_lens), n_in) def construct_load_save_forward(lstm_unit, prev_lstm_unit=None, output_ref=None): """ @@ 
-8941,8 +8940,8 @@ def construct_load_save_forward(lstm_unit, prev_lstm_unit=None, output_ref=None) input_seq_lens_placeholder: input_seq_lens, }, ) - assert_equal(list(output_seq_lens), input_seq_lens) - assert_equal(output_np.shape, (n_batch, max(input_seq_lens), n_out)) + assert list(output_seq_lens) == input_seq_lens + assert output_np.shape == (n_batch, max(input_seq_lens), n_out) for t in range(max(output_seq_lens)): for b in range(n_batch): if t >= output_seq_lens[b]: @@ -9057,7 +9056,7 @@ def test_KenLmStateLayer(): word_seq_so_far = ["" if "@@" in w else w for w in word_seq_so_far] res2 = session.run(tf_ref_score, feed_dict={ref_score_str_placeholder: " ".join(word_seq_so_far)}) print(" word seq so far: %r" % (word_seq_so_far,), "score:", res2) - assert_equal(res2, abs_score) + assert res2 == abs_score assert_almost_equal(abs_score, ref_score) print("Scores are as expected.") @@ -9170,7 +9169,7 @@ def test_KenLmStateLayer_dense(): word_seq_so_far = ["" if "@@" in w else w for w in word_seq_so_far] res2 = session.run(tf_ref_score, feed_dict={ref_score_str_placeholder: " ".join(word_seq_so_far)}) print(" word seq so far: %r" % (word_seq_so_far,), "score:", res2) - assert_equal(res2, abs_score) + assert res2 == abs_score assert_almost_equal(abs_score, ref_score) print("Scores are as expected.") @@ -9453,21 +9452,19 @@ def get_net_dict(): prev_loop_out_layer = loop_net.layers["prev:output"] assert prev_out_choice == prev_loop_out_layer.search_choices assert RecLayer.is_prev_step_layer(prev_out_choice.owner) - assert_equal(loop_net.layers["end"].get_search_choices(), cur_out_choice) - assert_equal(loop_net.layers["target_embed"].get_search_choices(), cur_out_choice) - assert_equal(loop_net.layers["prev:target_embed"].get_search_choices(), prev_out_choice) - assert_equal(loop_net.layers["accum_att_weights"].get_search_choices(), prev_out_choice) - assert_equal( - loop_net.layers["prev:accum_att_weights"].get_search_choices(), prev_out_choice - ) # will be 
transformed - assert_equal(loop_net.layers["weight_feedback"].get_search_choices(), prev_out_choice) - assert_equal(loop_net.layers["s"].get_search_choices(), cur_out_choice) - assert_equal(loop_net.layers["prev:s"].get_search_choices(), prev_out_choice) - assert_equal(loop_net.layers["prev_s_state"].get_search_choices(), prev_out_choice) - assert_equal(loop_net.layers["energy_in"].get_search_choices(), prev_out_choice) - assert_equal(loop_net.layers["att_weights"].get_search_choices(), prev_out_choice) - assert_equal(loop_net.layers["att"].get_search_choices(), prev_out_choice) - assert_equal(loop_net.layers["output_prob"].get_search_choices(), prev_out_choice) + assert loop_net.layers["end"].get_search_choices() == cur_out_choice + assert loop_net.layers["target_embed"].get_search_choices() == cur_out_choice + assert loop_net.layers["prev:target_embed"].get_search_choices() == prev_out_choice + assert loop_net.layers["accum_att_weights"].get_search_choices() == prev_out_choice + assert loop_net.layers["prev:accum_att_weights"].get_search_choices() == prev_out_choice # will be transformed + assert loop_net.layers["weight_feedback"].get_search_choices() == prev_out_choice + assert loop_net.layers["s"].get_search_choices() == cur_out_choice + assert loop_net.layers["prev:s"].get_search_choices() == prev_out_choice + assert loop_net.layers["prev_s_state"].get_search_choices() == prev_out_choice + assert loop_net.layers["energy_in"].get_search_choices() == prev_out_choice + assert loop_net.layers["att_weights"].get_search_choices() == prev_out_choice + assert loop_net.layers["att"].get_search_choices() == prev_out_choice + assert loop_net.layers["output_prob"].get_search_choices() == prev_out_choice def test_onlineblstm(): @@ -9947,7 +9944,7 @@ def test_MaskedComputationLayer_UnmaskLayer_in_loop(): in_v, out_v = session.run((in_data.placeholder, out_data.placeholder), feed_dict=feed_dict) print(in_v) print(out_v) - assert_equal(in_v.shape, out_v.shape) + assert 
in_v.shape == out_v.shape for b in range(in_v.shape[0]): x = 0.0 for t in range(in_v.shape[1]): @@ -10027,7 +10024,7 @@ def test_MaskedComputationLayer_UnmaskLayer_in_loop_opt(): print(out_v) print("seq lens:", out_seq_lens_v) pprint(extra_v) - assert_equal(in_v.shape, out_v.shape) + assert in_v.shape == out_v.shape for b in range(in_v.shape[0]): x = 0.0 for t in range(in_v.shape[1]): @@ -10096,7 +10093,7 @@ def test_MaskedComputationLayer_in_loop_auto_unmask(): print(in_v) print(out_v) print("seq lens:", out_seq_lens_v) - assert_equal(in_v.shape, out_v.shape) + assert in_v.shape == out_v.shape for b in range(in_v.shape[0]): x = 1 for t in range(in_v.shape[1]): @@ -10183,7 +10180,7 @@ def test_MaskedComputationLayer_sub_layers(): print(in_v) print(out_v) print("seq lens:", out_seq_lens_v) - assert_equal(in_v.shape, out_v.shape) + assert in_v.shape == out_v.shape for b in range(in_v.shape[0]): x = 0 for t in range(in_v.shape[1]): @@ -10890,7 +10887,7 @@ def test_MaskedComputationLayer_UnmaskLayer_masked_outside(): in_v, out_v = session.run((in_data.placeholder, out_data.placeholder), feed_dict=feed_dict) print(in_v) print(out_v) - assert_equal(in_v.shape, out_v.shape) + assert in_v.shape == out_v.shape for b in range(in_v.shape[0]): x = 0.0 for t in range(in_v.shape[1]): @@ -10963,9 +10960,9 @@ def test_MaskedComputationLayer_outside(): break if not in_mask_v[b, t]: continue - assert_equal(in_v[b, t], out_v[b, t_]) + assert in_v[b, t] == out_v[b, t_] t_ += 1 - assert_equal(t_, out_lens[b]) + assert t_ == out_lens[b] assert out_v.shape == (num_batch, max(out_lens)) @@ -11001,7 +10998,7 @@ def test_MaskedComputationLayer_name_scope(): params = net.get_params_list() print(params) assert len(params) == 2 - assert_equal(set(p.name for p in params), {"output/W:0", "output/b:0"}) + assert set(p.name for p in params) == {"output/W:0", "output/b:0"} def test_MaskedComputationLayer_rec_name_scope(): @@ -11036,7 +11033,7 @@ def test_MaskedComputationLayer_rec_name_scope(): 
params = net.get_params_list() print(params) assert len(params) == 3 - assert_equal(set(p.name for p in params), {"output/rec/W:0", "output/rec/W_re:0", "output/rec/b:0"}) + assert set(p.name for p in params) == {"output/rec/W:0", "output/rec/W_re:0", "output/rec/b:0"} def test_MaskedComputationLayer_subnet_name_scope(): @@ -11078,7 +11075,7 @@ def test_MaskedComputationLayer_subnet_name_scope(): params = net.get_params_list() print(params) assert len(params) == 2 - assert_equal(set(p.name for p in params), {"output/linear/W:0", "output/linear/b:0"}) + assert set(p.name for p in params) == {"output/linear/W:0", "output/linear/b:0"} def test_MaskedComputationLayer_rec_subnet_name_scope(): @@ -11138,7 +11135,7 @@ def test_MaskedComputationLayer_rec_subnet_name_scope(): params = net.get_params_list() print(params) assert len(params) == 2 - assert_equal(set(p.name for p in params), {"linear/W:0", "linear/b:0"}) + assert set(p.name for p in params) == {"linear/W:0", "linear/b:0"} def test_MaskedComputationLayer_dyn_size_none(): @@ -13955,11 +13952,11 @@ def test_CumConcatLayer_self_attention_equal_to_SelfAttentionLayer(): single_weights = single_layer.params["QKV"] multi_weights = network.get_layer("multi_layer_qkv0").params["W"] - assert_equal(single_layer.output.batch_shape, (None, None, num_heads * value_dim)) - assert_equal(multi_layer.output.batch_shape, (None, None, num_heads * value_dim)) + assert single_layer.output.batch_shape == (None, None, num_heads * value_dim) + assert multi_layer.output.batch_shape == (None, None, num_heads * value_dim) # set weights equal. 
- assert_equal(single_weights.shape, multi_weights.shape) + assert single_weights.shape == multi_weights.shape weights = numpy.random.rand(*single_weights.shape) session.run(tf.compat.v1.assign(single_weights, weights)) session.run(tf.compat.v1.assign(multi_weights, weights)) diff --git a/tests/test_TFUpdater.py b/tests/test_TFUpdater.py index a96068250..185e14fd1 100644 --- a/tests/test_TFUpdater.py +++ b/tests/test_TFUpdater.py @@ -12,7 +12,6 @@ from returnn.tf.layers.base import LayerBase, Loss import returnn.tf.compat as tf_compat from returnn.log import log -from nose.tools import assert_equal, assert_is_instance, assert_is, assert_in from numpy.testing import assert_almost_equal import unittest import numpy.testing @@ -118,9 +117,9 @@ def test_add_check_numerics_ops(): check = add_check_numerics_ops([y]) session.run(check) z1 = tf_compat.v1.log(x - 3, name="z1") - assert_equal(str(session.run(z1)), "-inf") + assert str(session.run(z1)) == "-inf" z2 = tf_compat.v1.log(x - 4, name="z2") - assert_equal(str(session.run(z2)), "nan") + assert str(session.run(z2)) == "nan" check1 = add_check_numerics_ops([z1]) try: session.run(check1) @@ -145,8 +144,8 @@ def test_grad_add_check_numerics_ops(): y = 1.0 / x grad_x = tf.gradients(y, x)[0] print("grad_x:", grad_x.eval()) - assert_equal(str(float("-inf")), "-inf") - assert_equal(str(grad_x.eval()), "-inf") + assert str(float("-inf")) == "-inf" + assert str(grad_x.eval()) == "-inf" session.run(x.assign(1.0)) opt = tf_compat.v1.train.GradientDescentOptimizer(learning_rate=1.0) @@ -316,7 +315,7 @@ def test_Updater_decouple_constraints_simple_graph(): ) network.initialize_params(session=session) var = network.get_layer("var").output.placeholder - assert_equal(session.run(var), 1.0) + assert session.run(var) == 1.0 updater = Updater(config=config, network=network, initial_learning_rate=1.0) updater.set_learning_rate(1.0, session=session) @@ -361,7 +360,7 @@ def test_Updater_decouple_constraints_simple_graph_grad_accum(): ) 
network.initialize_params(session=session) var = network.get_layer("var").output.placeholder - assert_equal(session.run(var), 1.0) + assert session.run(var) == 1.0 updater = Updater(config=config, network=network, initial_learning_rate=1.0) updater.set_learning_rate(1.0, session=session) @@ -372,7 +371,7 @@ def test_Updater_decouple_constraints_simple_graph_grad_accum(): tf_util.print_graph_output(update_op) - assert_equal(session.run(updater.global_train_step), 0) + assert session.run(updater.global_train_step) == 0 expected_var = 1.0 for i in range(10): print("Run step", i) @@ -380,7 +379,7 @@ def test_Updater_decouple_constraints_simple_graph_grad_accum(): update_op, feed_dict={extern_data.data["data"].placeholder: [0.0, 0.0, 0.0], extern_data.get_batch_info().dim: 3}, ) - assert_equal(session.run(updater.global_train_step), i + 1) + assert session.run(updater.global_train_step) == i + 1 var_value = session.run(var) print("var:", var_value) if i % 2 == 1: diff --git a/tests/test_TFUtil.py b/tests/test_TFUtil.py index f67834a78..ce6417d32 100644 --- a/tests/test_TFUtil.py +++ b/tests/test_TFUtil.py @@ -7,7 +7,6 @@ from returnn.tf.util.basic import * from returnn.tf.util.data import SpatialDim, FeatureDim import returnn.tf.compat as tf_compat -from nose.tools import assert_equal, assert_not_equal, assert_is_instance, assert_is, assert_in, assert_true from numpy.testing import assert_almost_equal, assert_allclose from pprint import pprint import contextlib @@ -28,53 +27,53 @@ def test_tf_version_tuple(): def test_Data(): data = Data(name="my_data", shape=(None, 13)) - assert_equal(data.name, "my_data") - assert_equal(data.dim, 13) - assert_equal(data.batch_dim_axis, 0) - assert_equal(data.time_dim_axis, 1) - assert_equal(data.feature_dim_axis, 2) - assert_equal(data.batch_ndim, 3) - assert_equal(data.batch_shape, (None, None, 13)) - assert_equal(data.dtype, "float32") - assert_equal(data.sparse, False) + assert data.name == "my_data" + assert data.dim == 13 + 
assert data.batch_dim_axis == 0 + assert data.time_dim_axis == 1 + assert data.feature_dim_axis == 2 + assert data.batch_ndim == 3 + assert data.batch_shape == (None, None, 13) + assert data.dtype == "float32" + assert data.sparse == False def test_Data_dim(): data = Data(name="my_data", dim=13) - assert_equal(data.name, "my_data") - assert_equal(data.dim, 13) - assert_equal(data.batch_dim_axis, 0) - assert_equal(data.time_dim_axis, 1) - assert_equal(data.feature_dim_axis, 2) - assert_equal(data.batch_ndim, 3) - assert_equal(data.batch_shape, (None, None, 13)) - assert_equal(data.dtype, "float32") - assert_equal(data.sparse, False) + assert data.name == "my_data" + assert data.dim == 13 + assert data.batch_dim_axis == 0 + assert data.time_dim_axis == 1 + assert data.feature_dim_axis == 2 + assert data.batch_ndim == 3 + assert data.batch_shape == (None, None, 13) + assert data.dtype == "float32" + assert data.sparse == False def test_Data_dim_none(): data = Data(name="my_data", dim=None) - assert_equal(data.dim, None) - assert_equal(data.batch_dim_axis, 0) - assert_equal(data.time_dim_axis, 1) - assert_equal(data.feature_dim_axis, 2) - assert_equal(data.batch_ndim, 3) - assert_equal(data.batch_shape, (None, None, None)) - assert_equal(data.dtype, "float32") - assert_equal(data.sparse, False) + assert data.dim == None + assert data.batch_dim_axis == 0 + assert data.time_dim_axis == 1 + assert data.feature_dim_axis == 2 + assert data.batch_ndim == 3 + assert data.batch_shape == (None, None, None) + assert data.dtype == "float32" + assert data.sparse == False def test_Data_dim_none_auto_create_placeholders(): data = Data(name="my_data", dim=None, auto_create_placeholders=True) - assert_equal(data.dim, None) - assert_equal(data.batch_dim_axis, 0) - assert_equal(data.time_dim_axis, 1) - assert_equal(data.feature_dim_axis, 2) + assert data.dim == None + assert data.batch_dim_axis == 0 + assert data.time_dim_axis == 1 + assert data.feature_dim_axis == 2 data_ = 
Data(name="my_data", dim=None) - assert_equal(data.batch_ndim, 3) - assert_equal(data.batch_shape, (None, None, None)) - assert_equal(data.dtype, "float32") - assert_equal(data.sparse, False) + assert data.batch_ndim == 3 + assert data.batch_shape == (None, None, None) + assert data.dtype == "float32" + assert data.sparse == False assert (data.batch_dim_axis, data.time_dim_axis, data.feature_dim_axis) == ( data_.batch_dim_axis, data_.time_dim_axis, @@ -90,37 +89,37 @@ def test_Data_default_time_no_time(): def test_Data_copy_time_major(): data = Data(name="my_data", dim=13) - assert_equal(data.batch_dim_axis, 0) - assert_equal(data.time_dim_axis, 1) - assert_equal(data.feature_dim_axis, 2) - assert_equal(data.batch_ndim, 3) + assert data.batch_dim_axis == 0 + assert data.time_dim_axis == 1 + assert data.feature_dim_axis == 2 + assert data.batch_ndim == 3 data2 = data.copy_as_time_major() - assert_equal(data2.time_dim_axis, 0) - assert_equal(data2.batch_dim_axis, 1) - assert_equal(data2.feature_dim_axis, 2) - assert_equal(data2.batch_ndim, 3) + assert data2.time_dim_axis == 0 + assert data2.batch_dim_axis == 1 + assert data2.feature_dim_axis == 2 + assert data2.batch_ndim == 3 def test_Data_copy_batch_major(): data = Data(name="my_data", dim=13, time_dim_axis=0, batch_dim_axis=1) - assert_equal(data.time_dim_axis, 0) - assert_equal(data.batch_dim_axis, 1) - assert_equal(data.feature_dim_axis, 2) - assert_equal(data.batch_ndim, 3) + assert data.time_dim_axis == 0 + assert data.batch_dim_axis == 1 + assert data.feature_dim_axis == 2 + assert data.batch_ndim == 3 data2 = data.copy_as_batch_major() - assert_equal(data2.batch_dim_axis, 0) - assert_equal(data2.time_dim_axis, 1) - assert_equal(data2.feature_dim_axis, 2) - assert_equal(data2.batch_ndim, 3) + assert data2.batch_dim_axis == 0 + assert data2.time_dim_axis == 1 + assert data2.feature_dim_axis == 2 + assert data2.batch_ndim == 3 def test_Data_copy_as_batch_major_no_extra_feat(): data = 
Data(name="att_weights_output", shape=(None,), batch_dim_axis=1) print("data", data, "feat axis:", data.feature_dim_axis_or_unspecified, data.feature_dim_axis) - assert_equal(data.time_dim_axis, 0) + assert data.time_dim_axis == 0 data2 = data.copy_as_batch_major() - assert_equal(data2.batch_dim_axis, 0) - assert_equal(data2.time_dim_axis, 1) + assert data2.batch_dim_axis == 0 + assert data2.time_dim_axis == 1 # No check for feature_dim_axis, as this behavior does not matter here. @@ -129,41 +128,41 @@ def test_Data_spatial_batch_axes(): d2 = Data(name="ff_out_output", shape=(None, 9001), dtype="float32") spatial_axes1 = d1.get_spatial_batch_axes() spatial_axes2 = d2.get_spatial_batch_axes() - assert_equal(len(spatial_axes1), len(spatial_axes2)) + assert len(spatial_axes1) == len(spatial_axes2) spatial_axes1 = d1.get_spatial_axes() spatial_axes2 = d2.get_spatial_axes() - assert_equal(len(spatial_axes1), len(d1.get_spatial_batch_axes())) - assert_equal(spatial_axes1, spatial_axes2) + assert len(spatial_axes1) == len(d1.get_spatial_batch_axes()) + assert spatial_axes1 == spatial_axes2 def test_Data_spatial_batch_axes_2(): d = Data(name="data", shape=(None, 9000)) - assert_equal(d.get_spatial_batch_axes(), [1]) + assert d.get_spatial_batch_axes() == [1] d = Data(name="data", shape=(13, 9000)) - assert_equal(d.get_spatial_batch_axes(), [1]) + assert d.get_spatial_batch_axes() == [1] d = Data(name="data", shape=(None, 13, 9000)) - assert_equal(d.get_spatial_batch_axes(), [1, 2]) + assert d.get_spatial_batch_axes() == [1, 2] def test_Data_get_bc_spatial_batch_shape(): d = Data(name="data", shape=(None, 9000)) - assert_equal(d.get_bc_spatial_batch_shape(), (1, 1, 9000)) + assert d.get_bc_spatial_batch_shape() == (1, 1, 9000) d = Data(name="data", shape=(13, 9000)) - assert_equal(d.get_bc_spatial_batch_shape(), (1, 1, 9000)) + assert d.get_bc_spatial_batch_shape() == (1, 1, 9000) d = Data(name="data", shape=(None, 13, 9000)) - assert_equal(d.get_bc_spatial_batch_shape(), 
(1, 1, 1, 9000)) + assert d.get_bc_spatial_batch_shape() == (1, 1, 1, 9000) def test_Data_get_bc_shape(): d = Data(name="data", shape=(None, 9000)) - assert_equal(d.get_bc_shape(), (1, 1, 9000)) + assert d.get_bc_shape() == (1, 1, 9000) d = Data(name="data", shape=(13, 9000)) - assert_equal(d.get_bc_shape(), (1, 1, 9000)) + assert d.get_bc_shape() == (1, 1, 9000) d = Data(name="data", shape=(None, 13, 9000)) - assert_equal(d.get_bc_shape(), (1, 1, 1, 9000)) + assert d.get_bc_shape() == (1, 1, 1, 9000) d = Data(name="data", shape=(None, 13, 9000)) - assert_equal(d.get_bc_shape({"*": None}), (None, None, 13, 9000)) - assert_equal(d.get_bc_shape({("B", "dim:13"): None}), (None, 1, 13, 9000)) + assert d.get_bc_shape({"*": None}) == (None, None, 13, 9000) + assert d.get_bc_shape({("B", "dim:13"): None}) == (None, 1, 13, 9000) def test_Data_copy_template_adding_time_dim_no_feature(): @@ -380,17 +379,17 @@ def test_Data_find_matching_dim_map_different_static_dims(): # without broadcast_matches=False should not match is_equal_opts = dict(allow_same_feature_dim=True, allow_same_spatial_dim=True, treat_feature_as_spatial=True) - assert_equal(d1.find_matching_dims(d2.get_dim_tag(0), is_equal_opts=is_equal_opts), [0, 1]) - assert_equal(d1.find_matching_dims(d2.get_dim_tag(1), is_equal_opts=is_equal_opts), []) - assert_equal(d1.find_matching_dims(d2.get_dim_tag(2), is_equal_opts=is_equal_opts), []) + assert d1.find_matching_dims(d2.get_dim_tag(0), is_equal_opts=is_equal_opts) == [0, 1] + assert d1.find_matching_dims(d2.get_dim_tag(1), is_equal_opts=is_equal_opts) == [] + assert d1.find_matching_dims(d2.get_dim_tag(2), is_equal_opts=is_equal_opts) == [] # with broadcast_matches=True should match is_equal_opts = dict( allow_same_feature_dim=True, allow_same_spatial_dim=True, treat_feature_as_spatial=True, broadcast_matches=True ) - assert_equal(d1.find_matching_dims(d2.get_dim_tag(0), is_equal_opts=is_equal_opts), [0, 1]) - assert_equal(d1.find_matching_dims(d2.get_dim_tag(1), 
is_equal_opts=is_equal_opts), [0, 1, 2]) - assert_equal(d1.find_matching_dims(d2.get_dim_tag(2), is_equal_opts=is_equal_opts), [0, 1, 2]) + assert d1.find_matching_dims(d2.get_dim_tag(0), is_equal_opts=is_equal_opts) == [0, 1] + assert d1.find_matching_dims(d2.get_dim_tag(1), is_equal_opts=is_equal_opts) == [0, 1, 2] + assert d1.find_matching_dims(d2.get_dim_tag(2), is_equal_opts=is_equal_opts) == [0, 1, 2] mapping = d1.find_matching_dim_map(d2, list(range(d2.batch_ndim))) # maps d2 -> d1 assert len(mapping.values()) == d2.batch_ndim @@ -410,15 +409,15 @@ def test_Data_find_matching_dim_map_broadcast_matches(): # default should not match is_equal_opts = dict(allow_same_feature_dim=True, allow_same_spatial_dim=True, treat_feature_as_spatial=True) - assert_equal(d1.find_matching_dims(d2.get_dim_tag(0), is_equal_opts=is_equal_opts), [1]) - assert_equal(d1.find_matching_dims(d2.get_dim_tag(1), is_equal_opts=is_equal_opts), []) + assert d1.find_matching_dims(d2.get_dim_tag(0), is_equal_opts=is_equal_opts) == [1] + assert d1.find_matching_dims(d2.get_dim_tag(1), is_equal_opts=is_equal_opts) == [] # with broadcast_matches=True should match is_equal_opts_match = dict( allow_same_feature_dim=True, allow_same_spatial_dim=True, treat_feature_as_spatial=True, broadcast_matches=True ) - assert_equal(d1.find_matching_dims(d2.get_dim_tag(0), is_equal_opts=is_equal_opts_match), [1]) - assert_equal(d1.find_matching_dims(d2.get_dim_tag(1), is_equal_opts=is_equal_opts_match), [1, 2]) + assert d1.find_matching_dims(d2.get_dim_tag(0), is_equal_opts=is_equal_opts_match) == [1] + assert d1.find_matching_dims(d2.get_dim_tag(1), is_equal_opts=is_equal_opts_match) == [1, 2] mapping = d1.find_matching_dim_map(d2, list(range(d2.batch_ndim)), is_equal_opts) # maps d2 -> d1 assert mapping[0] == 1 and mapping[1] == 2 @@ -702,14 +701,14 @@ def test_Data_copy_compatible_to_batch_feature_is_dynamic(): start = Data(name="t_start_output", shape=(None,), dtype="int32", sparse=True, dim=None, 
batch_dim_axis=1) start.size_placeholder = {0: dec.size_placeholder[0]} print("start:", start) - assert_equal(start.get_time_dim_tag(), dec.get_time_dim_tag()) + assert start.get_time_dim_tag() == dec.get_time_dim_tag() # energy: batch_shape_meta=[F|'time-with-postfix:0_data_target0',B,T|'time-with-postfix:encoder'] energy = Data(name="energy2_output", shape=(None, None), batch_dim_axis=1, time_dim_axis=2, feature_dim_axis=0) energy.size_placeholder = {0: dec.size_placeholder[0], 1: enc.size_placeholder[0]} print("energy:", energy) - assert_equal(energy.get_size_dim_tag(0), dec.get_time_dim_tag()) - assert_equal(energy.get_size_dim_tag(1), enc.get_time_dim_tag()) - assert_equal(energy.get_time_dim_tag(), enc.get_time_dim_tag()) + assert energy.get_size_dim_tag(0) == dec.get_time_dim_tag() + assert energy.get_size_dim_tag(1) == enc.get_time_dim_tag() + assert energy.get_time_dim_tag() == enc.get_time_dim_tag() t = start.copy_compatible_to(energy, check_sparse=False, check_dtype=False) print("t:", t) assert t.shape == (None, 1) and t.time_dim_axis == energy.time_dim_axis @@ -717,7 +716,7 @@ def test_Data_copy_compatible_to_batch_feature_is_dynamic(): assert t.sparse and t.feature_dim_axis is None # because it is sparse assert set(t.size_placeholder.keys()) == {0} assert t.size_placeholder[0] is dec.size_placeholder[0] - assert_equal(t.get_size_dim_tag(0), dec.get_time_dim_tag()) + assert t.get_size_dim_tag(0) == dec.get_time_dim_tag() def test_Data_copy_compatible_to_bias_to_batch_time_spatial_feature(): @@ -917,12 +916,12 @@ def test_Data_get_common_data_one_undefined_time(): c.size_placeholder = b.size_placeholder.copy() print("c:", c) c.sanity_check() - assert_equal(b.get_time_dim_tag(), c.get_time_dim_tag()) + assert b.get_time_dim_tag() == c.get_time_dim_tag() out = Data.get_common_data([a, b, c]) print("out:", out) assert out.shape == (None, 1) and out.batch_dim_axis == 0 - assert_equal(out.get_time_dim_tag(), b.get_time_dim_tag()) + assert 
out.get_time_dim_tag() == b.get_time_dim_tag() def test_Data_get_common_data_copy_compatible_to_different_time_dim(): @@ -933,8 +932,8 @@ def test_Data_get_common_data_copy_compatible_to_different_time_dim(): common_data = Data.get_common_data([a, b], allow_broadcast_all_sources=True) print("common:", common_data) assert common_data.shape == (None, None, 3, 5) and common_data.batch_dim_axis == 0 - assert_equal(common_data.get_size_dim_tag(0), a.get_time_dim_tag()) - assert_equal(common_data.get_size_dim_tag(1), b.get_time_dim_tag()) + assert common_data.get_size_dim_tag(0) == a.get_time_dim_tag() + assert common_data.get_size_dim_tag(1) == b.get_time_dim_tag() aa = a.copy_compatible_to(common_data) bb = b.copy_compatible_to(common_data) print("aa:", aa) @@ -946,9 +945,9 @@ def test_Data_get_common_data_copy_compatible_to_different_time_dim(): if d1 == 1 or d2 == 1: continue # it's fine, that will broadcast assert d1 == d2, "mismatch in axis %i" % i - assert_equal(aa.get_dim_tag(axis=1), a.get_time_dim_tag()) + assert aa.get_dim_tag(axis=1) == a.get_time_dim_tag() assert aa.batch_shape[2] == 1 - assert_equal(bb.get_dim_tag(axis=2), b.get_time_dim_tag()) + assert bb.get_dim_tag(axis=2) == b.get_time_dim_tag() assert bb.batch_shape[1] == 1 x = aa.placeholder + bb.placeholder session.run( @@ -965,19 +964,19 @@ def test_Data_get_common_data_copy_compatible_to_different_time_dim_different_st b = Data(name="b", shape=(3, None, 5), auto_create_placeholders=True) print("a:", a) print("b:", b) - assert_not_equal(a.get_time_dim_tag(), b.get_time_dim_tag()) + assert a.get_time_dim_tag() != b.get_time_dim_tag() common_data = Data.get_common_data([a, b], allow_broadcast_all_sources=True) print("common:", common_data) assert common_data.shape.count(None) == 2 and 3 in common_data.shape and 5 in common_data.shape assert common_data.batch_ndim == 5 - assert_equal(common_data.get_size_dim_tag(0), a.get_time_dim_tag()) - assert_equal(common_data.get_size_dim_tag(1), 
b.get_time_dim_tag()) + assert common_data.get_size_dim_tag(0) == a.get_time_dim_tag() + assert common_data.get_size_dim_tag(1) == b.get_time_dim_tag() common_tags, _ = Dim.get_all_dimension_tags([common_data]) print("common dim tags:") pprint(common_tags) assert len(common_tags) == common_data.batch_ndim # all unique - assert_in(a.get_time_dim_tag(), common_tags) - assert_in(b.get_time_dim_tag(), common_tags) + assert a.get_time_dim_tag() in common_tags + assert b.get_time_dim_tag() in common_tags aa = a.copy_compatible_to(common_data) bb = b.copy_compatible_to(common_data) print("aa:", aa) @@ -989,8 +988,8 @@ def test_Data_get_common_data_copy_compatible_to_different_time_dim_different_st if d1 == 1 or d2 == 1: continue # it's fine, that will broadcast assert d1 == d2, "mismatch in axis %i" % i - assert_equal(aa.get_size_dim_tag(0), a.get_time_dim_tag()) - assert_equal(bb.get_size_dim_tag(0), b.get_time_dim_tag()) + assert aa.get_size_dim_tag(0) == a.get_time_dim_tag() + assert bb.get_size_dim_tag(0) == b.get_time_dim_tag() x = aa.placeholder + bb.placeholder session.run( x, @@ -1106,11 +1105,11 @@ def test_Data_copy_compatible_to_move_spatial_axes(): a = Data(name="a", shape=(3, None, 5)) a.size_placeholder = {1: common.size_placeholder[0]} print("a:", a) - assert_equal(common.get_time_dim_tag(), a.get_time_dim_tag()) + assert common.get_time_dim_tag() == a.get_time_dim_tag() b = a.copy_compatible_to(common) print("b:", b) assert b.shape == common.shape - assert_equal(b.get_time_dim_tag(), a.get_time_dim_tag()) + assert b.get_time_dim_tag() == a.get_time_dim_tag() def test_Data_copy_add_spatial_dim_added_time_at_end(): @@ -1384,7 +1383,7 @@ def test_Data_copy_add_spatial_dim_most_right(): print(d1, "spatial axes:", d1.get_spatial_batch_axes()) d2 = d1.copy_add_spatial_dim(1) print(d2, "spatial axes:", d2.get_spatial_batch_axes()) - assert_equal(d2.get_spatial_batch_axes(), [1]) + assert d2.get_spatial_batch_axes() == [1] def 
test_Data_copy_add_spatial_dim_no_batch_end(): @@ -1520,7 +1519,7 @@ def test_Data_auto_create_placeholders_same_dim_tags_as_existing(): assert seq_len is data.get_sequence_lengths() is classes.get_sequence_lengths() assert seq_len.op.type == "Placeholder" placeholder_ops = [op for op in graph.get_operations() if op.type == "Placeholder"] - assert_equal(set(placeholder_ops), {data.placeholder.op, classes.placeholder.op, time_tag.dyn_size.op}) + assert set(placeholder_ops) == {data.placeholder.op, classes.placeholder.op, time_tag.dyn_size.op} def test_Data_copy_masked_0(): @@ -1586,7 +1585,7 @@ def test_Dim_MarkedDim_sorted(): print(ls) print(sorted(ls)) # Test current order, but the order itself doesn't really matter for anything. - assert_equal(sorted(ls), [a, b, a_implicit2, a_implicit, b_implicit]) + assert sorted(ls) == [a, b, a_implicit2, a_implicit, b_implicit] def test_Dim_find_matching_dim_map_match_priority(): @@ -1598,7 +1597,7 @@ def test_Dim_find_matching_dim_map_match_priority(): filter_feat_dim_map = filter_.find_matching_dim_map( other=Data("dummy", [filter_in_dim, out_dim], dtype="float32"), other_axes=[0, 1] ) - assert_equal(filter_feat_dim_map, {0: 1, 1: 0}) + assert filter_feat_dim_map == {0: 1, 1: 0} def test_ExternData_ext_Data_batch_info(): @@ -1965,9 +1964,9 @@ def test_sequence_mask_len_via_loop(): mask = sequence_mask_time_major(seq_len) seq_len_v, mask_v = session.run((seq_len, mask)) print(seq_len_v) - assert_equal(seq_len_v.tolist(), [2, 3]) + assert seq_len_v.tolist() == [2, 3] print(mask_v) - assert_equal(mask_v.tolist(), [[True, True], [True, True], [False, True]]) + assert mask_v.tolist() == [[True, True], [True, True], [False, True]] def test_get_initializer_zero(): @@ -2002,35 +2001,35 @@ def test_get_initializer_xavier(): shape = (2, 3) initializer = get_initializer("xavier") v = initializer(shape) - assert_equal(session.run(v).shape, shape) # returns some random matrix + assert session.run(v).shape == shape # returns some random 
matrix def test_get_initializer_glorot_uniform(): shape = (2, 3) initializer = get_initializer("glorot_uniform") v = initializer(shape) - assert_equal(session.run(v).shape, shape) # returns some random matrix + assert session.run(v).shape == shape # returns some random matrix def test_get_initializer_glorot_normal_with_scale(): shape = (2, 3) initializer = get_initializer('VarianceScaling(scale=6.0, mode="fan_avg", distribution="normal")') v = initializer(shape) - assert_equal(session.run(v).shape, shape) # returns some random matrix + assert session.run(v).shape == shape # returns some random matrix def test_get_initializer_uniform(): shape = (2, 3) initializer = get_initializer("RandomUniform(-0.01, 0.01)") v = initializer(shape) - assert_equal(session.run(v).shape, shape) # returns some random matrix + assert session.run(v).shape == shape # returns some random matrix def test_get_initializer_gauss(): shape = (2, 3) initializer = get_initializer("RandomNormal(0.0, 0.01)") v = initializer(shape) - assert_equal(session.run(v).shape, shape) # returns some random matrix + assert session.run(v).shape == shape # returns some random matrix def test_wrap_distribution_non_zero(): @@ -2048,7 +2047,7 @@ def count_event_logger_threads(): tmp_dir = tempfile.mkdtemp() writer = tf_compat.v1.summary.FileWriter(tmp_dir) - assert_equal(count_event_logger_threads(), 1) + assert count_event_logger_threads() == 1 assert isinstance(writer.event_writer, EventFileWriter) assert isinstance(writer.event_writer._worker, _EventLoggerThread) writer.close() @@ -2056,33 +2055,33 @@ def count_event_logger_threads(): # https://github.com/tensorflow/tensorflow/issues/4820 # The _EventLoggerThread is still running (at least in TF 1.1.0). 
stop_event_writer_thread(writer) - assert_equal(count_event_logger_threads(), 0) + assert count_event_logger_threads() == 0 def test_single_strided_slice(): x = tf.expand_dims(tf.range(10), axis=0) - assert_equal(list(tf.shape(x).eval()), [1, 10]) - assert_equal(list(single_strided_slice(x, axis=1, begin=3, end=6, step=2)[0].eval()), [3, 5]) - assert_equal(list(single_strided_slice(x, axis=1, begin=4)[0].eval()), list(range(4, 10))) - assert_equal(list(single_strided_slice(x, axis=1, end=3)[0].eval()), [0, 1, 2]) - assert_equal(list(single_strided_slice(x, axis=tf.constant(1), end=3)[0].eval()), [0, 1, 2]) - assert_equal(list(single_strided_slice(x, axis=tf.constant(-1), end=3)[0].eval()), [0, 1, 2]) + assert list(tf.shape(x).eval()) == [1, 10] + assert list(single_strided_slice(x, axis=1, begin=3, end=6, step=2)[0].eval()) == [3, 5] + assert list(single_strided_slice(x, axis=1, begin=4)[0].eval()) == list(range(4, 10)) + assert list(single_strided_slice(x, axis=1, end=3)[0].eval()) == [0, 1, 2] + assert list(single_strided_slice(x, axis=tf.constant(1), end=3)[0].eval()) == [0, 1, 2] + assert list(single_strided_slice(x, axis=tf.constant(-1), end=3)[0].eval()) == [0, 1, 2] x2 = tf.reshape(tf.range(9), (3, 3)) - assert_equal(list(x2[0].eval()), [0, 1, 2]) - assert_equal(list(tf.squeeze(single_strided_slice(x2, axis=tf.constant(0), end=1), axis=0).eval()), [0, 1, 2]) + assert list(x2[0].eval()) == [0, 1, 2] + assert list(tf.squeeze(single_strided_slice(x2, axis=tf.constant(0), end=1), axis=0).eval()) == [0, 1, 2] def test_slice_pad_zeros(): x = tf.constant([1, 2, 3, 4]) - assert_equal(list(slice_pad_zeros(x, begin=1, end=3).eval()), [2, 3]) - assert_equal(list(slice_pad_zeros(x, begin=-2, end=2).eval()), [0, 0, 1, 2]) - assert_equal(list(slice_pad_zeros(x, begin=-2, end=6).eval()), [0, 0, 1, 2, 3, 4, 0, 0]) - assert_equal(list(slice_pad_zeros(x, begin=2, end=6).eval()), [3, 4, 0, 0]) + assert list(slice_pad_zeros(x, begin=1, end=3).eval()) == [2, 3] + assert 
list(slice_pad_zeros(x, begin=-2, end=2).eval()) == [0, 0, 1, 2] + assert list(slice_pad_zeros(x, begin=-2, end=6).eval()) == [0, 0, 1, 2, 3, 4, 0, 0] + assert list(slice_pad_zeros(x, begin=2, end=6).eval()) == [3, 4, 0, 0] def test_circular_pad(): x = tf.reshape(tf.range(9), (3, 3)) - assert_equal(list(x[0].eval()), [0, 1, 2]) + assert list(x[0].eval()) == [0, 1, 2] x_ref = numpy.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]]) numpy.testing.assert_equal(x.eval(), x_ref) y = circular_pad(x, paddings=1) @@ -2090,7 +2089,7 @@ def test_circular_pad(): numpy.testing.assert_equal(y.eval(), y_ref) x = tf.expand_dims(tf.reshape(tf.range(9), (3, 3)), axis=2) - assert_equal(list(x[0, :, 0].eval()), [0, 1, 2]) + assert list(x[0, :, 0].eval()) == [0, 1, 2] x_ref = numpy.array([[[0], [1], [2]], [[3], [4], [5]], [[6], [7], [8]]]) numpy.testing.assert_equal(x.eval(), x_ref) y = circular_pad(x, paddings=1, axes=(0, 1)) @@ -2108,17 +2107,17 @@ def test_circular_pad(): def test_reuse_name_scope_double(): with reuse_name_scope("double"): - assert_equal(tf_compat.v1.get_default_graph()._name_stack, "double") + assert tf_compat.v1.get_default_graph()._name_stack == "double" with reuse_name_scope("sub"): - assert_equal(tf_compat.v1.get_default_graph()._name_stack, "double/sub") - assert_equal(get_current_name_scope(), "double/sub") + assert tf_compat.v1.get_default_graph()._name_stack == "double/sub" + assert get_current_name_scope() == "double/sub" def test_reuse_name_scope_mix1(): with reuse_name_scope("mix1"): - assert_equal(tf_compat.v1.get_default_graph()._name_stack, "mix1") + assert tf_compat.v1.get_default_graph()._name_stack == "mix1" with tf.name_scope("sub"): - assert_equal(tf_compat.v1.get_default_graph()._name_stack, "mix1/sub") + assert tf_compat.v1.get_default_graph()._name_stack == "mix1/sub" # The following is not true because get_current_name_scope is only var-scope: # assert_equal(get_current_name_scope(), "mix1/sub") @@ -2126,7 +2125,7 @@ def test_reuse_name_scope_mix1(): 
def test_reuse_name_scope_mix2(): with tf.name_scope("mix2"): with reuse_name_scope("sub"): - assert_equal(tf_compat.v1.get_default_graph()._name_stack, "mix2/sub") + assert tf_compat.v1.get_default_graph()._name_stack == "mix2/sub" # The following is not true because get_current_name_scope is only var-scope: # assert_equal(get_current_name_scope(), "mix2/sub") @@ -2134,42 +2133,42 @@ def test_reuse_name_scope_mix2(): def test_reuse_name_scope_mix3(): with reuse_name_scope("mix3"): with tf_compat.v1.variable_scope("sub"): - assert_equal(get_current_name_scope(), "mix3/sub") + assert get_current_name_scope() == "mix3/sub" def test_reuse_name_scope_mix4(): with tf_compat.v1.variable_scope("mix4"): with reuse_name_scope("sub"): - assert_equal(get_current_name_scope(), "mix4/sub") + assert get_current_name_scope() == "mix4/sub" def test_reuse_name_scope_2(): with reuse_name_scope("lstm2"): with reuse_name_scope("rec") as scope: - assert_is_instance(scope, tf_compat.v1.VariableScope) - assert_equal(scope.name, "lstm2/rec") - assert_equal(get_current_name_scope(), "lstm2/rec") + assert isinstance(scope, tf_compat.v1.VariableScope) + assert scope.name == "lstm2/rec" + assert get_current_name_scope() == "lstm2/rec" with tf.name_scope("sub"): - assert_equal(get_current_name_scope(), "lstm2/rec/sub") + assert get_current_name_scope() == "lstm2/rec/sub" def test_reuse_name_scope(): with reuse_name_scope("lstm0"): with tf_compat.v1.variable_scope("rec"): a = tf_compat.v1.get_variable("a", shape=(3, 4)) - assert_is_instance(a, tf.Variable) - assert_equal(a.name, "lstm0/rec/a:0") + assert isinstance(a, tf.Variable) + assert a.name == "lstm0/rec/a:0" b = tf.Variable(name="b", initial_value=tf.zeros((2,))) - assert_equal(b.name, "lstm0/rec/b:0") + assert b.name == "lstm0/rec/b:0" with reuse_name_scope("lstm0"): with reuse_name_scope("rec"): c = tf.Variable(name="c", initial_value=tf.zeros((2,))) - assert_equal(c.name, "lstm0/rec/c:0") + assert c.name == "lstm0/rec/c:0" c2 = 
tf.Variable(name="c", initial_value=tf.zeros((2,))) - assert_equal(c2.name, "lstm0/rec/c_1:0") + assert c2.name == "lstm0/rec/c_1:0" def test_reuse_name_scope_root(): @@ -2179,56 +2178,56 @@ def test_reuse_name_scope_root(): def test_reuse_var_scope(): with tf_compat.v1.variable_scope("v1"): - assert_equal(get_current_var_scope_name(), "v1") - assert_equal(get_current_name_scope(), "v1") + assert get_current_var_scope_name() == "v1" + assert get_current_name_scope() == "v1" with tf_compat.v1.variable_scope("v2") as scope: - assert_equal(get_current_var_scope_name(), "v1/v2") - assert_equal(get_current_name_scope(), "v1/v2") + assert get_current_var_scope_name() == "v1/v2" + assert get_current_name_scope() == "v1/v2" with tf.name_scope("v3"): - assert_equal(get_current_name_scope(), "v1/v2/v3") - assert_equal(get_current_var_scope_name(), "v1/v2") - assert_equal(scope.name, "v1/v2") + assert get_current_name_scope() == "v1/v2/v3" + assert get_current_var_scope_name() == "v1/v2" + assert scope.name == "v1/v2" # Note: tf.compat.v1.variable_scope(scope) is broken here. 
with reuse_name_scope(scope): - assert_equal(get_current_var_scope_name(), "v1/v2") - assert_equal(get_current_name_scope(), "v1/v2") + assert get_current_var_scope_name() == "v1/v2" + assert get_current_name_scope() == "v1/v2" def test_name_var_scope_mixing(): with tf_compat.v1.variable_scope("mv1"): - assert_equal(get_current_var_scope_name(), "mv1") - assert_equal(get_current_name_scope(), "mv1") + assert get_current_var_scope_name() == "mv1" + assert get_current_name_scope() == "mv1" with tf_compat.v1.variable_scope("v2") as scope: - assert_equal(get_current_var_scope_name(), "mv1/v2") - assert_equal(get_current_name_scope(), "mv1/v2") + assert get_current_var_scope_name() == "mv1/v2" + assert get_current_name_scope() == "mv1/v2" with tf.name_scope("v3"): - assert_equal(get_current_name_scope(), "mv1/v2/v3") - assert_equal(get_current_var_scope_name(), "mv1/v2") - assert_equal(scope.name, "mv1/v2") + assert get_current_name_scope() == "mv1/v2/v3" + assert get_current_var_scope_name() == "mv1/v2" + assert scope.name == "mv1/v2" # Note: tf.compat.v1.variable_scope("v4") is broken here. 
with reuse_name_scope("v4"): - assert_equal(get_current_var_scope_name(), "mv1/v2/v3/v4") - assert_equal(get_current_name_scope(), "mv1/v2/v3/v4") + assert get_current_var_scope_name() == "mv1/v2/v3/v4" + assert get_current_name_scope() == "mv1/v2/v3/v4" with reuse_name_scope(scope): - assert_equal(get_current_var_scope_name(), "mv1/v2") - assert_equal(get_current_name_scope(), "mv1/v2") + assert get_current_var_scope_name() == "mv1/v2" + assert get_current_name_scope() == "mv1/v2" def test_reuse_name_scope_of_tensor(): with tf.name_scope("scope1") as scope1: x = tf.constant(42) with tf.name_scope("scope2") as scope2: - assert_equal(get_current_name_scope() + "/", scope2) + assert get_current_name_scope() + "/" == scope2 with reuse_name_scope_of_tensor(x): - assert_equal(get_current_name_scope() + "/", scope1) + assert get_current_name_scope() + "/" == scope1 def test_reuse_name_scope_of_tensor_root(): x = tf.constant(42) with tf.name_scope("scope2") as scope2: - assert_equal(get_current_name_scope() + "/", scope2) + assert get_current_name_scope() + "/" == scope2 with reuse_name_scope_of_tensor(x): - assert_equal(get_current_name_scope(), "") + assert get_current_name_scope() == "" def test_loop_var_creation(): @@ -2296,7 +2295,7 @@ def test_gather_nd_grad(): # Thus K == 2. gather_nd out will be idxs_exp.shape[:2] + params.shape[2:] = (beam,batch,n_in). 
gathered = tf.gather_nd(base, idxs_exp) # (beam,batch,n_in) gathered_shape, _ = session.run([tf.shape(gathered), gathered]) - assert_equal(list(gathered_shape), [n_beam, n_batch, n_in]) + assert list(gathered_shape) == [n_beam, n_batch, n_in] base_grad = tf.gradients(gathered, base) assert base_grad is not None @@ -2381,30 +2380,30 @@ def rel_embed(x, v, t): def test_dimshuffle(): x = tf.zeros((2, 3, 5)) - assert_equal(list(session.run(tf.shape(x))), [2, 3, 5]) - assert_equal(list(session.run(tf.shape(dimshuffle(x, (1, 2, 0))))), [3, 5, 2]) - assert_equal(list(session.run(tf.shape(dimshuffle(x, ("x", 1, 2, 0))))), [1, 3, 5, 2]) - assert_equal(list(session.run(tf.shape(dimshuffle(x, ("x", 1, "x", 2, "x", 0, "x"))))), [1, 3, 1, 5, 1, 2, 1]) + assert list(session.run(tf.shape(x))) == [2, 3, 5] + assert list(session.run(tf.shape(dimshuffle(x, (1, 2, 0))))) == [3, 5, 2] + assert list(session.run(tf.shape(dimshuffle(x, ("x", 1, 2, 0))))) == [1, 3, 5, 2] + assert list(session.run(tf.shape(dimshuffle(x, ("x", 1, "x", 2, "x", 0, "x"))))) == [1, 3, 1, 5, 1, 2, 1] x = tf.zeros((2, 1, 3)) - assert_equal(list(session.run(tf.shape(dimshuffle(x, (2, 0))))), [3, 2]) - assert_equal(list(session.run(tf.shape(dimshuffle(x, (2, "x", "x", 0))))), [3, 1, 1, 2]) + assert list(session.run(tf.shape(dimshuffle(x, (2, 0))))) == [3, 2] + assert list(session.run(tf.shape(dimshuffle(x, (2, "x", "x", 0))))) == [3, 1, 1, 2] def test_expand_multiple_dims(): x = tf.zeros((2, 3, 5)) - assert_equal(list(session.run(tf.shape(x))), [2, 3, 5]) - assert_equal(list(session.run(tf.shape(expand_multiple_dims(x, (1, 2))))), [2, 1, 1, 3, 5]) - assert_equal(list(session.run(tf.shape(expand_multiple_dims(x, (1, 4))))), [2, 1, 3, 5, 1]) - assert_equal(list(session.run(tf.shape(expand_multiple_dims(x, (1, 3, 5))))), [2, 1, 3, 1, 5, 1]) + assert list(session.run(tf.shape(x))) == [2, 3, 5] + assert list(session.run(tf.shape(expand_multiple_dims(x, (1, 2))))) == [2, 1, 1, 3, 5] + assert 
list(session.run(tf.shape(expand_multiple_dims(x, (1, 4))))) == [2, 1, 3, 5, 1] + assert list(session.run(tf.shape(expand_multiple_dims(x, (1, 3, 5))))) == [2, 1, 3, 1, 5, 1] def test_move_axis(): x = tf.zeros((2, 3, 5)) - assert_equal(list(session.run(tf.shape(x))), [2, 3, 5]) - assert_equal(list(session.run(tf.shape(move_axis(x, old_axis=0, new_axis=1)))), [3, 2, 5]) - assert_equal(list(session.run(tf.shape(move_axis(x, old_axis=0, new_axis=2)))), [3, 5, 2]) - assert_equal(list(session.run(tf.shape(move_axis(x, old_axis=2, new_axis=0)))), [5, 2, 3]) - assert_equal(list(session.run(tf.shape(move_axis(x, old_axis=2, new_axis=1)))), [2, 5, 3]) + assert list(session.run(tf.shape(x))) == [2, 3, 5] + assert list(session.run(tf.shape(move_axis(x, old_axis=0, new_axis=1)))) == [3, 2, 5] + assert list(session.run(tf.shape(move_axis(x, old_axis=0, new_axis=2)))) == [3, 5, 2] + assert list(session.run(tf.shape(move_axis(x, old_axis=2, new_axis=0)))) == [5, 2, 3] + assert list(session.run(tf.shape(move_axis(x, old_axis=2, new_axis=1)))) == [2, 5, 3] def test_flatten_with_seq_len_mask(): @@ -2414,42 +2413,38 @@ def test_flatten_with_seq_len_mask(): assert x.shape.ndims == 3 print("x (time-major):", x.eval().tolist()) print("x (batch-major):", x.eval().transpose(1, 0, 2).tolist()) - assert_equal(x.eval()[0].tolist(), [[0, 1], [2, 3], [4, 5]]) - assert_equal(x.eval()[:3, 0].tolist(), [[0, 1], [6, 7], [12, 13]]) + assert x.eval()[0].tolist() == [[0, 1], [2, 3], [4, 5]] + assert x.eval()[:3, 0].tolist() == [[0, 1], [6, 7], [12, 13]] flat_bm = flatten_with_seq_len_mask(x, seq_lens=seq_lens, batch_dim_axis=1, time_dim_axis=0) assert flat_bm.shape.ndims == 2 print("flat (batch-major):", flat_bm.eval().tolist()) - assert_equal( - flat_bm.eval().tolist(), [[0, 1], [6, 7], [12, 13], [18, 19], [2, 3], [8, 9], [14, 15], [4, 5], [10, 11]] - ) + assert flat_bm.eval().tolist() == [[0, 1], [6, 7], [12, 13], [18, 19], [2, 3], [8, 9], [14, 15], [4, 5], [10, 11]] flat_tm = 
flatten_with_seq_len_mask_time_major(x, seq_lens=seq_lens, batch_dim_axis=1, time_dim_axis=0) assert flat_tm.shape.ndims == 2 print("flat (time-major):", flat_tm.eval().tolist()) - assert_equal( - flat_tm.eval().tolist(), [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], [12, 13], [14, 15], [18, 19]] - ) + assert flat_tm.eval().tolist() == [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], [12, 13], [14, 15], [18, 19]] def test_constant_with_shape(): x = session.run(constant_with_shape(3, [2, 3])) - assert_equal(x.shape, (2, 3)) - assert_equal(x.dtype, numpy.int32) - assert_equal(x.flatten().tolist(), [3] * 2 * 3) + assert x.shape == (2, 3) + assert x.dtype == numpy.int32 + assert x.flatten().tolist() == [3] * 2 * 3 x = session.run(constant_with_shape(7.0, [2, 3])) - assert_equal(x.shape, (2, 3)) - assert_equal(x.dtype, numpy.float32) - assert_equal(x.flatten().tolist(), [7.0] * 2 * 3) + assert x.shape == (2, 3) + assert x.dtype == numpy.float32 + assert x.flatten().tolist() == [7.0] * 2 * 3 x = session.run(constant_with_shape(False, [2, 3])) - assert_equal(x.shape, (2, 3)) - assert_equal(x.dtype, numpy.bool_) - assert_equal(x.flatten().tolist(), [False] * 2 * 3) + assert x.shape == (2, 3) + assert x.dtype == numpy.bool_ + assert x.flatten().tolist() == [False] * 2 * 3 x = session.run(constant_with_shape(True, [2, 3])) - assert_equal(x.shape, (2, 3)) - assert_equal(x.dtype, numpy.bool_) - assert_equal(x.flatten().tolist(), [True] * 2 * 3) + assert x.shape == (2, 3) + assert x.dtype == numpy.bool_ + assert x.flatten().tolist() == [True] * 2 * 3 def naive_windowed_batch(source, window, padding="same"): @@ -2607,7 +2602,7 @@ def check(): y = custom_gradient.generic_loss_and_error_signal(loss=1.0, x=x, grad_x=3.0) assert y.graph is graph (grad_y,) = tf.gradients(y, x) - assert_equal(session.run([y, x, grad_y]), [1.0, 2.0, 3.0]) + assert session.run([y, x, grad_y]) == [1.0, 2.0, 3.0] check() check() @@ -2623,7 +2618,7 @@ def 
test_CustomGradient_generic_loss_and_error_signal_post_func(): z = 2.0 * y assert y.graph is graph (grad_z,) = tf.gradients(z, x) - assert_equal(session.run([z, x, grad_z]), [4.0, 5.0, 6.0]) + assert session.run([z, x, grad_z]) == [4.0, 5.0, 6.0] def test_global_tensor(): @@ -2637,21 +2632,21 @@ def f(): x = global_tensor(f, name="hello") x2 = global_tensor(f, name="hello") x3 = global_tensor(f, name="hello") - assert_equal(C.i, 1) - assert_is(x, x2) - assert_is(x, x3) - assert_equal(x.eval(), 42) + assert C.i == 1 + assert x is x2 + assert x is x3 + assert x.eval() == 42 def test_encode_raw_direct(): raw = tf_compat.v1.decode_raw(tf.constant("ABC"), tf.uint8) - assert_equal(list(raw.eval()), [65, 66, 67]) + assert list(raw.eval()) == [65, 66, 67] def test_encode_raw_simple(): raw = tf_compat.v1.decode_raw(tf.constant("hello"), tf.uint8) back = encode_raw(raw) - assert_equal(back.eval(), b"hello") + assert back.eval() == b"hello" def test_encode_raw_seq_lens(): @@ -2660,7 +2655,7 @@ def test_encode_raw_seq_lens(): raw = tf_compat.v1.decode_raw(tf.constant(strs), tf.uint8) seq_lens = tf.constant([len(s) for s in strs_stripped]) back = encode_raw(raw, seq_lens=seq_lens) - assert_equal(list(back.eval()), [s.encode("utf8") for s in strs_stripped]) + assert list(back.eval()) == [s.encode("utf8") for s in strs_stripped] @unittest.skip("broken? https://github.com/tensorflow/tensorflow/issues/11240") @@ -2670,7 +2665,7 @@ def test_sequential_control_dependencies(): [lambda: v.initializer, lambda: tf_compat.v1.assign(v, 3), lambda: tf_compat.v1.assign(v, v.read_value() + 5)] ): x = v.read_value() - assert_equal(x.eval(), 3 + 5) + assert x.eval() == 3 + 5 @unittest.skip("broken? 
https://github.com/tensorflow/tensorflow/issues/11240") @@ -2679,7 +2674,7 @@ def test_var_init(): v = tf.Variable(initial_value=2, trainable=False, name="test_var_init") with tf.control_dependencies([v.initializer]): x = v.read_value() - assert_equal(x.eval(), 2) + assert x.eval() == 2 def test_resource_var_init(): @@ -2694,16 +2689,16 @@ def test_resource_var_init(): ) with tf.control_dependencies([v.initializer]): x = v.read_value() - assert_equal(x.eval(), 2) + assert x.eval() == 2 @unittest.skip("broken? see also test_var_init") # TODO... def test_true_once(): x = true_once() - assert_equal(x.eval(), True) - assert_equal(x.eval(), False) - assert_equal(x.eval(), False) - assert_equal(x.eval(), False) + assert x.eval() == True + assert x.eval() == False + assert x.eval() == False + assert x.eval() == False @unittest.skip("broken?") # TODO... @@ -2731,7 +2726,7 @@ def test_enforce_copy(): x = tf.add(0, [a, b, v.read_value()]) x_eval = list(x.eval()) assert len(x_eval) == 3 - assert_equal(x_eval[1:], [2, 3]) + assert x_eval[1:] == [2, 3] # x[0] might depend on the implementation, and TF version. # In TF 1, it is 3. In TF 2, it is 2. (2 is actually probably more correct...) 
assert x_eval[0] in [2, 3] @@ -2754,21 +2749,21 @@ def test_TensorArray(): f = 0 f = session.run(write, feed_dict={index: 0, value: 1, flow: f}) f = session.run(write, feed_dict={index: 1, value: 2, flow: f}) - assert_equal(session.run(read, feed_dict={index: 0, flow: f}), 1) - assert_equal(session.run(read, feed_dict={index: 1, flow: f}), 2) + assert session.run(read, feed_dict={index: 0, flow: f}) == 1 + assert session.run(read, feed_dict={index: 1, flow: f}) == 2 def test_tfconv1d_evensize(): filters = tf.constant([[[2.0]], [[3.0]]]) # [filter_width, in_channels, out_channels] assert isinstance(filters, tf.Tensor) - assert_equal(filters.get_shape().as_list(), [2, 1, 1]) + assert filters.get_shape().as_list() == [2, 1, 1] value = tf.constant([[[5.0], [7.0]]]) # (batch, time, dim) assert isinstance(value, tf.Tensor) - assert_equal(value.get_shape().as_list(), [1, 2, 1]) + assert value.get_shape().as_list() == [1, 2, 1] res = tf.nn.conv1d(value, filters=filters, stride=1, padding="SAME", data_format="NHWC") resv = res.eval() assert isinstance(resv, numpy.ndarray) - assert_equal(resv.shape, (1, 2, 1)) # (batch, time, dim) + assert resv.shape == (1, 2, 1) # (batch, time, dim) # Tests that the kernel-size of 2 is applied on current-frame + right-frame. # Note that in the Dataset with context_window = 2, it will do the corresponding thing, # i.e. 
adds one right-frame and no left-frame, such that if you use padding="VALID", @@ -2784,13 +2779,13 @@ def test_tf_tile(): v2 = tf.tile(v, [beam_size]) # (beam*batch,) v2.set_shape((beam_size * batch_size,)) print(v2.eval()) - assert_equal(list(v2.eval()), [1, 2, 3] * 5) + assert list(v2.eval()) == [1, 2, 3] * 5 v3 = tf.reshape(v2, [beam_size, batch_size]) # (beam,batch) r = v3.eval() print(r) assert isinstance(r, numpy.ndarray) for beam in range(beam_size): - assert_equal(list(r[beam]), [1, 2, 3]) + assert list(r[beam]) == [1, 2, 3] def test_tile_transposed(): @@ -2801,13 +2796,13 @@ def test_tile_transposed(): v2 = tile_transposed(v, axis=0, multiples=beam_size) # (batch*beam,) v2.set_shape((batch_size * beam_size,)) print(v2.eval()) - assert_equal(list(v2.eval()), [1] * 5 + [2] * 5 + [3] * 5) + assert list(v2.eval()) == [1] * 5 + [2] * 5 + [3] * 5 v3 = tf.reshape(v2, [batch_size, beam_size]) # (batch,beam) r = v3.eval() print(r) assert isinstance(r, numpy.ndarray) for beam in range(beam_size): - assert_equal(list(r[:, beam]), [1, 2, 3]) + assert list(r[:, beam]) == [1, 2, 3] def test_expand_dims_unbroadcast_instead_of_tf_tile(): @@ -2821,7 +2816,7 @@ def test_expand_dims_unbroadcast_instead_of_tf_tile(): print(r) assert isinstance(r, numpy.ndarray) for beam in range(beam_size): - assert_equal(list(r[:, beam]), [1, 2, 3]) + assert list(r[:, beam]) == [1, 2, 3] def test_expand_dims_unbroadcast_negative_axis(): @@ -2834,7 +2829,7 @@ def test_expand_dims_unbroadcast_negative_axis(): r = v2.eval() print(r) assert isinstance(r, numpy.ndarray) - assert_equal(r.shape, (batch_size, n_time, expand_dim, n_dim)) # (batch, time, dim) + assert r.shape == (batch_size, n_time, expand_dim, n_dim) # (batch, time, dim) def test_where_nan(): @@ -2849,11 +2844,11 @@ def test_where_nan(): # SelectOp, https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/cwise_op_select.cc # We later check for nan. assert_equal does not work as-is because (nan == nan) is False. 
# Thus, we resort to this check: - assert_equal(str(float("nan")), "nan") + assert str(float("nan")) == "nan" where_0_nan = tf.where(True, 0.0, float("nan")) print("where_0_nan:", where_0_nan.eval()) - assert_equal(where_0_nan.eval(), 0.0) + assert where_0_nan.eval() == 0.0 x = tf.constant(0.0) x_equal_0 = tf.equal(x, 0.0) @@ -2862,19 +2857,19 @@ def test_where_nan(): print("grad_x:", grad_x.eval()) # nan? or 0? # This is expected when you look at the resulting computation graph for the gradient. # You will have grad(1./x, x) * 0.0 in the graph in the back-propagation of the gradient, which is nan. - assert_equal(str(grad_x.eval()), "nan") + assert str(grad_x.eval()) == "nan" safe_x = tf.where(x_equal_0, 2.0, x) grad_safe_x = tf.where(x_equal_0, 0.0, 1.0 / safe_x) print("grad_safe_x:", grad_safe_x.eval()) # nan? ln(2)? 0? # This works, because at no time, there is nan in the back-propagation. - assert_equal(grad_safe_x.eval(), 0.0) + assert grad_safe_x.eval() == 0.0 f = tf.cond(x_equal_0, lambda: 0.0, lambda: 1.0 / x) grad_cond_x = tf.gradients(f, x)[0] print("grad_cond_x:", grad_cond_x.eval()) # nan? or 0? # This is different than tf.where because really only one branch will go into the gradient. 
- assert_equal(grad_cond_x.eval(), 0.0) + assert grad_cond_x.eval() == 0.0 def test_variable_summaries(): @@ -2897,10 +2892,10 @@ def test_get_variable_from_tensor(): def test_VariableAssigner(): v = tf.Variable(initial_value=1.0) session.run(v.initializer) - assert_equal(session.run(v), 1.0) + assert session.run(v) == 1.0 assigner = VariableAssigner(v) assigner.assign(value=2.0, session=session) - assert_equal(session.run(v), 2.0) + assert session.run(v) == 2.0 def test_VariableAssigner_ResourceVariable(): @@ -2911,17 +2906,17 @@ def test_VariableAssigner_ResourceVariable(): use_resource=True, ) session.run(v.initializer) - assert_equal(session.run(v), 1.0) + assert session.run(v) == 1.0 assigner = VariableAssigner(v) assigner.assign(value=2.0, session=session) - assert_equal(session.run(v), 2.0) + assert session.run(v) == 2.0 def test_map_labels(): x = tf.constant([0, 1, 2, 3, 2, 1, 0]) label_map = {0: 1, 1: 2, 2: 3, 3: 0} y = map_labels(x, label_map=label_map) - assert_equal(session.run(y).tolist(), [1, 2, 3, 0, 3, 2, 1]) + assert session.run(y).tolist() == [1, 2, 3, 0, 3, 2, 1] def test_map_labels_SparseTensor(): @@ -2935,7 +2930,7 @@ def test_map_labels_SparseTensor(): assert isinstance(y, tf.SparseTensor) y_eval = session.run(y) assert isinstance(y_eval, tf_compat.v1.SparseTensorValue) - assert_equal(y_eval.values.tolist(), [1, 2, 3, 0]) + assert y_eval.values.tolist() == [1, 2, 3, 0] def test_sparse_labels(): @@ -2947,9 +2942,9 @@ def test_sparse_labels(): assert isinstance(y_eval.indices, numpy.ndarray) assert isinstance(y_eval.values, numpy.ndarray) assert isinstance(y_eval.dense_shape, numpy.ndarray) - assert_equal(y_eval.indices.tolist(), [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 1]]) - assert_equal(y_eval.values.tolist(), [0, 1, 2, 3, 4, 5]) - assert_equal(y_eval.dense_shape.tolist(), [2, 4]) + assert y_eval.indices.tolist() == [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 1]] + assert y_eval.values.tolist() == [0, 1, 2, 3, 4, 5] + assert 
y_eval.dense_shape.tolist() == [2, 4] def test_remove_labels(): @@ -2966,9 +2961,9 @@ def test_remove_labels(): assert isinstance(y_eval.indices, numpy.ndarray) assert isinstance(y_eval.values, numpy.ndarray) assert isinstance(y_eval.dense_shape, numpy.ndarray) - assert_equal(y_eval.indices.tolist(), [[0, 0], [0, 1], [1, 0]]) - assert_equal(y_eval.values.tolist(), [0, 2, 3]) - assert_equal(y_eval.dense_shape.tolist(), [3, 2]) + assert y_eval.indices.tolist() == [[0, 0], [0, 1], [1, 0]] + assert y_eval.values.tolist() == [0, 2, 3] + assert y_eval.dense_shape.tolist() == [3, 2] def test_ctc_greedy_decode(): @@ -3003,9 +2998,9 @@ def test_ctc_greedy_decode(): assert isinstance(y1_eval.dense_shape, numpy.ndarray) print("y indices:", y1_eval.indices.tolist()) print("y values:", y1_eval.values.tolist()) - assert_equal(y2_eval.indices.tolist(), y1_eval.indices.tolist()) - assert_equal(y2_eval.values.tolist(), y1_eval.values.tolist()) - assert_equal(y2_eval.dense_shape.tolist(), y1_eval.dense_shape.tolist()) + assert y2_eval.indices.tolist() == y1_eval.indices.tolist() + assert y2_eval.values.tolist() == y1_eval.values.tolist() + assert y2_eval.dense_shape.tolist() == y1_eval.dense_shape.tolist() def test_supported_devices_for_op(): @@ -3182,12 +3177,12 @@ def test_clip_by_value_with_identity_grad(): x = numpy.array(x, dtype="float32") y, err_x, err2_x = session.run([y_t, err_x_t, err2_x_t], feed_dict={x_t: x}) print("x:", x, "y:", y, "err_x:", err_x, "err2_x:", err2_x) - assert_equal(err_x, err_y) + assert err_x == err_y assert -limit <= y <= limit if abs(x) > limit: - assert_equal(err2_x, 0.0) + assert err2_x == 0.0 if abs(x) < limit: - assert_equal(err2_x, err_y) + assert err2_x == err_y def test_safe_log_and_grad(): @@ -3315,7 +3310,7 @@ def test_string_merge(): print(res) res = [s.decode("utf8") for s in res] print(res) - assert_equal(res, ["sub@@ word test", "hel@@ lo wo@@ r@@ ld", "foo"]) + assert res == ["sub@@ word test", "hel@@ lo wo@@ r@@ ld", "foo"] def 
test_vocab_string_merge(): @@ -3331,7 +3326,7 @@ def test_vocab_string_merge(): print(res) res = [s.decode("utf8") for s in res] print(res) - assert_equal(res, ["sub@@ word test ", "hel@@ lo wo@@ r@@ ld ", "foo "]) + assert res == ["sub@@ word test ", "hel@@ lo wo@@ r@@ ld ", "foo "] def test_string_replace(): @@ -3346,7 +3341,7 @@ def test_string_replace(): print(res) res = [s.decode("utf8") for s in res] print(res) - assert_equal(res, ["subword test", "hello world", "foo"]) + assert res == ["subword test", "hello world", "foo"] def test_words_split_get_sparse_tensor_length(): @@ -3371,16 +3366,13 @@ def test_words_split_get_sparse_tensor_length(): assert num_words.shape == (len(strings),) dense_words = dense_words.tolist() print(dense_words) - assert_equal( - dense_words, - [ - [b"subword", b"test", b"", b""], - [b"a", b"b", b"c", b"d"], - [b"hello", b"world", b"", b""], - [b"foo", b"", b"", b""], - ], - ) - assert_equal(num_words.tolist(), word_lens) + assert dense_words == [ + [b"subword", b"test", b"", b""], + [b"a", b"b", b"c", b"d"], + [b"hello", b"world", b"", b""], + [b"foo", b"", b"", b""], + ] + assert num_words.tolist() == word_lens def test_string_words_calc_wer(): @@ -3393,8 +3385,8 @@ def test_string_words_calc_wer(): print(wer, ref_num_words) assert isinstance(wer, numpy.ndarray) assert isinstance(ref_num_words, numpy.ndarray) - assert_equal(wer.tolist(), [1, 2, 1, 0]) - assert_equal(ref_num_words.tolist(), [3, 4, 3, 1]) + assert wer.tolist() == [1, 2, 1, 0] + assert ref_num_words.tolist() == [3, 4, 3, 1] def test_kenlm(): @@ -3440,10 +3432,10 @@ def test_kenlm_bpe(): print("input strings:", input_strings) print("output scores:", output_scores) assert isinstance(output_scores, numpy.ndarray) - assert_equal(output_scores.shape, (len(input_strings),)) + assert output_scores.shape == (len(input_strings),) assert_almost_equal(output_scores[0], -9.251298) # example from above - assert_equal(output_scores[0], output_scores[1]) - 
assert_equal(output_scores[2], output_scores[3]) + assert output_scores[0] == output_scores[1] + assert output_scores[2] == output_scores[3] print("Scores are as expected.") @@ -3506,11 +3498,11 @@ def transition(state, input): next_states, out_labels, weights = transitions([state], [input]) return next_states[0], out_labels[0], weights[0] - assert_equal(transition(0, "Mars "), (0, [output_symbols["Mars"]], 0.0)) - assert_equal(transition(0, "Martian "), (0, [output_symbols["Martian"]], 0.0)) - assert_equal(transition(0, "Mar"), (5, [], 0.0)) - assert_equal(transition(5, "s"), (6, [output_symbols["Mars"]], 0.0)) - assert_equal(transition(0, "Unknown "), (-1, [], float("-inf"))) + assert transition(0, "Mars ") == (0, [output_symbols["Mars"]], 0.0) + assert transition(0, "Martian ") == (0, [output_symbols["Martian"]], 0.0) + assert transition(0, "Mar") == (5, [], 0.0) + assert transition(5, "s") == (6, [output_symbols["Mars"]], 0.0) + assert transition(0, "Unknown ") == (-1, [], float("-inf")) def test_layer_norms(): @@ -3563,29 +3555,29 @@ def test_layer_norms(): def test_transform_param_axes_split_info_to_new_shape(): - assert_equal(transform_param_axes_split_info_to_new_shape([[7], [7] * 4], [7 * 2, 7 * 8]), [[7 * 2], [7 * 2] * 4]) - assert_equal( - transform_param_axes_split_info_to_new_shape([[3, 7], [7] * 4], [3 + 7 * 2, 7 * 8]), [[3, 7 * 2], [7 * 2] * 4] - ) - assert_equal( - transform_param_axes_split_info_to_new_shape([[3, 7], [7] * 4], [1 + 7 * 2, 7 * 8]), [[1, 7 * 2], [7 * 2] * 4] - ) - assert_equal( - transform_param_axes_split_info_to_new_shape([[7, 7], [7] * 4], [3 + 7 * 2, 7 * 8]), [[3, 7 * 2], [7 * 2] * 4] - ) - assert_equal( - transform_param_axes_split_info_to_new_shape([[7, 7], [7] * 4], [7 * 2 + 7 * 2, 7 * 8]), - [[7 * 2, 7 * 2], [7 * 2] * 4], - ) - assert_equal(transform_param_axes_split_info_to_new_shape([[7], [7] * 4], [7, 7 * 8]), [[7], [7 * 2] * 4]) - assert_equal( - transform_param_axes_split_info_to_new_shape([[1000, 621, 1280], [1000]], 
(2645, 1000)), - [[1000, 621, 1024], [1000]], - ) - assert_equal( - transform_param_axes_split_info_to_new_shape([[512, 128, 32], [544]], (512, 544)), - [[512, 0, 0], [544]], - ) + assert transform_param_axes_split_info_to_new_shape([[7], [7] * 4], [7 * 2, 7 * 8]) == [[7 * 2], [7 * 2] * 4] + assert transform_param_axes_split_info_to_new_shape([[3, 7], [7] * 4], [3 + 7 * 2, 7 * 8]) == [ + [3, 7 * 2], + [7 * 2] * 4, + ] + assert transform_param_axes_split_info_to_new_shape([[3, 7], [7] * 4], [1 + 7 * 2, 7 * 8]) == [ + [1, 7 * 2], + [7 * 2] * 4, + ] + assert transform_param_axes_split_info_to_new_shape([[7, 7], [7] * 4], [3 + 7 * 2, 7 * 8]) == [ + [3, 7 * 2], + [7 * 2] * 4, + ] + assert transform_param_axes_split_info_to_new_shape([[7, 7], [7] * 4], [7 * 2 + 7 * 2, 7 * 8]) == [ + [7 * 2, 7 * 2], + [7 * 2] * 4, + ] + assert transform_param_axes_split_info_to_new_shape([[7], [7] * 4], [7, 7 * 8]) == [[7], [7 * 2] * 4] + assert transform_param_axes_split_info_to_new_shape([[1000, 621, 1280], [1000]], (2645, 1000)) == [ + [1000, 621, 1024], + [1000], + ] + assert transform_param_axes_split_info_to_new_shape([[512, 128, 32], [544]], (512, 544)) == [[512, 0, 0], [544]] def test_get_op_attrib_keys(): @@ -3593,13 +3585,13 @@ def test_get_op_attrib_keys(): assert isinstance(x, tf.Tensor) assert isinstance(x.op, tf.Operation) print("x op:", x.op.type) - assert_in(x.op.type, ["BatchMatMul", "BatchMatMulV2"]) - assert_equal(x.get_shape().as_list(), [3, 4, 7]) + assert x.op.type in ["BatchMatMul", "BatchMatMulV2"] + assert x.get_shape().as_list() == [3, 4, 7] attrib_keys = get_op_attrib_keys(x) print("matmul attrib keys:", attrib_keys) - assert_equal(sorted(attrib_keys), ["T", "adj_x", "adj_y"]) + assert sorted(attrib_keys) == ["T", "adj_x", "adj_y"] dtype = x.op.get_attr("T") - assert_equal(dtype, tf.float32) + assert dtype == tf.float32 def test_get_op_input_names_MatMul(): @@ -3607,10 +3599,10 @@ def test_get_op_input_names_MatMul(): assert isinstance(x, tf.Tensor) assert 
isinstance(x.op, tf.Operation) print("x op:", x.op.type) - assert_in(x.op.type, ["BatchMatMul", "BatchMatMulV2"]) + assert x.op.type in ["BatchMatMul", "BatchMatMulV2"] input_names = get_op_input_names(x.op) print("matmul input names:", input_names) - assert_equal(sorted(input_names), ["x", "y"]) + assert sorted(input_names) == ["x", "y"] def test_get_op_input_names_Constant(): @@ -3618,10 +3610,10 @@ def test_get_op_input_names_Constant(): assert isinstance(x, tf.Tensor) assert isinstance(x.op, tf.Operation) print("x op:", x.op.type) - assert_equal(x.op.type, "Const") + assert x.op.type == "Const" input_names = get_op_input_names(x.op) print("constant input names:", input_names) - assert_equal(sorted(input_names), []) + assert sorted(input_names) == [] def test_get_op_attrib_keys__is_variable_initialized(): @@ -3647,7 +3639,7 @@ def test_print_graph_output(): def test_get_var_ops(): with tf_compat.v1.variable_scope("test_get_var_ops"): v = tf_compat.v1.get_variable("v", ()) - assert_equal(find_ops_with_tensor_input(v), [v.initializer]) + assert find_ops_with_tensor_input(v) == [v.initializer] def test_find_ops_with_tensor_input(): @@ -3659,11 +3651,11 @@ def test_find_ops_with_tensor_input(): x1b = tf.add(x1a, v2, name="x1b") x2a = tf.multiply(v1, v2, name="x2a") x2b = tf.multiply(x2a, x0, name="x2b") - assert_equal(find_ops_with_tensor_input(x0), [x1a.op, x2b.op]) + assert find_ops_with_tensor_input(x0) == [x1a.op, x2b.op] print("v1 usages:", find_ops_with_tensor_input(v1)) - assert_equal(find_ops_with_tensor_input(v1), [v1.initializer, x1a.op, x2a.op]) - assert_equal(find_ops_with_tensor_input(v2), [v2.initializer, x1b.op, x2a.op]) - assert_equal(find_ops_with_tensor_input(v2, fetches=[x2b]), [x2a.op]) + assert find_ops_with_tensor_input(v1) == [v1.initializer, x1a.op, x2a.op] + assert find_ops_with_tensor_input(v2) == [v2.initializer, x1b.op, x2a.op] + assert find_ops_with_tensor_input(v2, fetches=[x2b]) == [x2a.op] def test_get_var_update_ops(): @@ -3734,11 
+3726,11 @@ def test_get_variable_grad_from_update_ops(): print("update op inputs by name:", get_op_input_names(update_ops[0])) session.run(var.initializer) # reset session.run(tf_compat.v1.global_variables_initializer()) # from Adam or so - assert_equal(session.run(var), 0.0) + assert session.run(var) == 0.0 grad = get_variable_grad_from_update_ops(var, update_ops) print("grad:", grad) _, grad_np = session.run([minimize_op, grad]) - assert_equal(grad_np, -2.0) + assert grad_np == -2.0 def test_get_variable_grad_from_update_ops_mix_sparse_dense(): @@ -3806,29 +3798,29 @@ def test_mixed_dense_sparse_grad(): var_np = session.run(var) print("var:") print(var_np) - assert_equal(var_np[0, 0], var_np[2, 0]) - assert_not_equal(var_np[0, 0], var_np[1, 0]) + assert var_np[0, 0] == var_np[2, 0] + assert var_np[0, 0] != var_np[1, 0] def test_tensor_array_is_dynamic_size(): ta1 = tf.TensorArray(tf.float32, size=0, dynamic_size=True) - assert_equal(tensor_array_is_dynamic_size(ta1), True) + assert tensor_array_is_dynamic_size(ta1) == True ta2 = tf.TensorArray(tf.float32, size=0, dynamic_size=False) - assert_equal(tensor_array_is_dynamic_size(ta2), False) + assert tensor_array_is_dynamic_size(ta2) == False def test_tensor_array_like(): ta1 = tf.TensorArray(tf.float32, size=0, dynamic_size=True) ta1 = tensor_array_like(ta1) - assert_equal(tensor_array_is_dynamic_size(ta1), True) + assert tensor_array_is_dynamic_size(ta1) == True def test_tensor_array_like_elem_shape(): ta1 = tf.TensorArray(tf.float32, size=0, dynamic_size=True, element_shape=tf.TensorShape([None, 13])) ta2 = tensor_array_like(ta1) - assert_equal(tensor_array_is_dynamic_size(ta2), True) - assert_equal(tensor_array_element_shape(ta1).as_list(), [None, 13]) - assert_equal(tensor_array_element_shape(ta2).as_list(), [None, 13]) + assert tensor_array_is_dynamic_size(ta2) == True + assert tensor_array_element_shape(ta1).as_list() == [None, 13] + assert tensor_array_element_shape(ta2).as_list() == [None, 13] def 
test_copy_with_new_split_axes(): @@ -4648,28 +4640,18 @@ def test_get_linear_alignment_out_to_in_indices(): # * input_len=7, output_len=3, resulting indices [1,3,5]. # * input_len=3, output_len=3, resulting indices [0,1,2]. # * input_len=2, output_len=4, resulting indices [0,0,1,1]. - assert_equal( - session.run(get_linear_alignment_out_to_in_indices(input_lens=[7], output_lens=[3])).tolist(), [[1, 3, 5]] - ) - assert_equal( - session.run(get_linear_alignment_out_to_in_indices(input_lens=[3], output_lens=[3])).tolist(), [[0, 1, 2]] - ) - assert_equal( - session.run(get_linear_alignment_out_to_in_indices(input_lens=[2], output_lens=[4])).tolist(), [[0, 0, 1, 1]] - ) - assert_equal( - session.run(get_linear_alignment_out_to_in_indices(input_lens=[7, 3, 1], output_lens=[3, 3, 3])).tolist(), - [[1, 3, 5], [0, 1, 2], [0, 0, 0]], - ) - assert_equal( - session.run( - get_linear_alignment_out_to_in_indices(input_lens=[7, 4, 2, 1], output_lens=[3, 4, 4, 2], pad_value=-1) - ).tolist(), - [[1, 3, 5, -1], [0, 1, 2, 3], [0, 0, 1, 1], [0, 0, -1, -1]], - ) - assert_equal( - session.run(get_linear_alignment_out_to_in_indices(input_lens=[2], output_lens=[3])).tolist(), [[0, 1, 1]] - ) + assert session.run(get_linear_alignment_out_to_in_indices(input_lens=[7], output_lens=[3])).tolist() == [[1, 3, 5]] + assert session.run(get_linear_alignment_out_to_in_indices(input_lens=[3], output_lens=[3])).tolist() == [[0, 1, 2]] + assert session.run(get_linear_alignment_out_to_in_indices(input_lens=[2], output_lens=[4])).tolist() == [ + [0, 0, 1, 1] + ] + assert session.run( + get_linear_alignment_out_to_in_indices(input_lens=[7, 3, 1], output_lens=[3, 3, 3]) + ).tolist() == [[1, 3, 5], [0, 1, 2], [0, 0, 0]] + assert session.run( + get_linear_alignment_out_to_in_indices(input_lens=[7, 4, 2, 1], output_lens=[3, 4, 4, 2], pad_value=-1) + ).tolist() == [[1, 3, 5, -1], [0, 1, 2, 3], [0, 0, 1, 1], [0, 0, -1, -1]] + assert session.run(get_linear_alignment_out_to_in_indices(input_lens=[2], 
output_lens=[3])).tolist() == [[0, 1, 1]] def test_get_rnnt_linear_aligned_output(): @@ -4678,74 +4660,47 @@ def test_get_rnnt_linear_aligned_output(): # * input_len=0, targets=[a,b,c] (len 3), output=[a,b,c] (len 3). # * input_len=4, targets=[a] (len 1), output=[B,B,a,B,B] (len 5). # * input_len=3, targets=[a,b] (len 2), output=[B,a,B,b,B] (len 5) - assert_equal( - session.run( - get_rnnt_linear_aligned_output(input_lens=[4], targets=[[1, 2, 3]], target_lens=[3], blank_label_idx=4)[0] - ).tolist(), - [[4, 1, 4, 2, 4, 3, 4]], - ) - assert_equal( - session.run( - get_rnnt_linear_aligned_output(input_lens=[0], targets=[[1, 2, 3]], target_lens=[3], blank_label_idx=4)[0] - ).tolist(), - [[1, 2, 3]], - ) - assert_equal( - session.run( - get_rnnt_linear_aligned_output(input_lens=[4], targets=[[1]], target_lens=[1], blank_label_idx=4)[0] - ).tolist(), - [[4, 4, 1, 4, 4]], - ) - assert_equal( - session.run( - get_rnnt_linear_aligned_output(input_lens=[3], targets=[[1, 2]], target_lens=[2], blank_label_idx=4)[0] - ).tolist(), - [[4, 1, 4, 2, 4]], - ) - assert_equal( - session.run( - get_rnnt_linear_aligned_output( - input_lens=[2], targets=tf.zeros((1, 0), dtype=tf.int32), target_lens=[0], blank_label_idx=4 - )[0] - ).tolist(), - [[4, 4]], - ) - assert_equal( - session.run( - get_rnnt_linear_aligned_output( - input_lens=[4, 3, 2, 0], - targets=[[1, 2, 3], [1, 2, -1], [-1, -1, -1], [1, 2, 3]], - target_lens=[3, 2, 0, 3], - blank_label_idx=4, - )[0] - ).tolist(), - [[4, 1, 4, 2, 4, 3, 4], [4, 1, 4, 2, 4, 0, 0], [4, 4, 0, 0, 0, 0, 0], [1, 2, 3, 0, 0, 0, 0]], - ) + assert session.run( + get_rnnt_linear_aligned_output(input_lens=[4], targets=[[1, 2, 3]], target_lens=[3], blank_label_idx=4)[0] + ).tolist() == [[4, 1, 4, 2, 4, 3, 4]] + assert session.run( + get_rnnt_linear_aligned_output(input_lens=[0], targets=[[1, 2, 3]], target_lens=[3], blank_label_idx=4)[0] + ).tolist() == [[1, 2, 3]] + assert session.run( + get_rnnt_linear_aligned_output(input_lens=[4], targets=[[1]], 
target_lens=[1], blank_label_idx=4)[0] + ).tolist() == [[4, 4, 1, 4, 4]] + assert session.run( + get_rnnt_linear_aligned_output(input_lens=[3], targets=[[1, 2]], target_lens=[2], blank_label_idx=4)[0] + ).tolist() == [[4, 1, 4, 2, 4]] + assert session.run( + get_rnnt_linear_aligned_output( + input_lens=[2], targets=tf.zeros((1, 0), dtype=tf.int32), target_lens=[0], blank_label_idx=4 + )[0] + ).tolist() == [[4, 4]] + assert session.run( + get_rnnt_linear_aligned_output( + input_lens=[4, 3, 2, 0], + targets=[[1, 2, 3], [1, 2, -1], [-1, -1, -1], [1, 2, 3]], + target_lens=[3, 2, 0, 3], + blank_label_idx=4, + )[0] + ).tolist() == [[4, 1, 4, 2, 4, 3, 4], [4, 1, 4, 2, 4, 0, 0], [4, 4, 0, 0, 0, 0, 0], [1, 2, 3, 0, 0, 0, 0]] # RNA test - assert_equal( - session.run( - get_rnnt_linear_aligned_output( - input_lens=[7], targets=[[1, 2, 3]], target_lens=[3], blank_label_idx=4, targets_consume_time=True - )[0] - ).tolist(), - [[4, 1, 4, 2, 4, 3, 4]], - ) - assert_equal( - session.run( - get_rnnt_linear_aligned_output( - input_lens=[3], targets=[[1, 2, 3]], target_lens=[3], blank_label_idx=4, targets_consume_time=True - )[0] - ).tolist(), - [[1, 2, 3]], - ) - assert_equal( - session.run( - get_rnnt_linear_aligned_output( - input_lens=[2], targets=[[1, 2, 3]], target_lens=[3], blank_label_idx=4, targets_consume_time=True - )[0] - ).tolist(), - [[1, 2]], - ) + assert session.run( + get_rnnt_linear_aligned_output( + input_lens=[7], targets=[[1, 2, 3]], target_lens=[3], blank_label_idx=4, targets_consume_time=True + )[0] + ).tolist() == [[4, 1, 4, 2, 4, 3, 4]] + assert session.run( + get_rnnt_linear_aligned_output( + input_lens=[3], targets=[[1, 2, 3]], target_lens=[3], blank_label_idx=4, targets_consume_time=True + )[0] + ).tolist() == [[1, 2, 3]] + assert session.run( + get_rnnt_linear_aligned_output( + input_lens=[2], targets=[[1, 2, 3]], target_lens=[3], blank_label_idx=4, targets_consume_time=True + )[0] + ).tolist() == [[1, 2]] if __name__ == "__main__": diff --git 
a/tests/test_TaskSystem.py b/tests/test_TaskSystem.py index bd6e1fe0e..051d3f419 100644 --- a/tests/test_TaskSystem.py +++ b/tests/test_TaskSystem.py @@ -6,7 +6,6 @@ from returnn.util.task_system import * import inspect import unittest -from nose.tools import assert_equal, assert_is_instance from returnn.util import better_exchook better_exchook.replace_traceback_format_tb() @@ -101,11 +100,11 @@ def test_pickle(): obj = DemoClass() s = pickle_dumps(obj.method) inst = pickle_loads(s) - assert_equal(inst(), 42) + assert inst() == 42 def test_pickle_unicode_str(): - assert_equal(pickle_loads(pickle_dumps("â")), "â") + assert pickle_loads(pickle_dumps("â")) == "â" if __name__ == "__main__": diff --git a/tests/test_TranslationDataset.py b/tests/test_TranslationDataset.py index ee4ab88aa..1c24440ec 100644 --- a/tests/test_TranslationDataset.py +++ b/tests/test_TranslationDataset.py @@ -9,11 +9,6 @@ import gzip import pickle -from nose.tools import assert_equal -from nose.tools import assert_not_equal -from nose.tools import assert_raises -from nose.tools import raises - import _setup_test_env # noqa from returnn.util import better_exchook from returnn.datasets.lm import TranslationDataset, TranslationFactorsDataset @@ -108,17 +103,17 @@ def test_translation_dataset(): translation_dataset.load_seqs(0, 10) num_seqs = len(dummy_source_text.splitlines()) - assert_equal(translation_dataset.num_seqs, num_seqs) + assert translation_dataset.num_seqs == num_seqs # Reconstruct the sentences from the word ids and compare with input. 
for sequence_index in range(num_seqs): source_word_ids = translation_dataset.get_data(sequence_index, "data") source_sentence = word_ids_to_sentence(source_word_ids, inverse_source_vocabulary) - assert_equal(source_sentence, dummy_source_text.splitlines()[sequence_index] + postfix) + assert source_sentence == dummy_source_text.splitlines()[sequence_index] + postfix target_word_ids = translation_dataset.get_data(sequence_index, "classes") target_sentence = word_ids_to_sentence(target_word_ids, inverse_target_vocabulary) - assert_equal(target_sentence, dummy_target_text_with_unk.splitlines()[sequence_index] + postfix) + assert target_sentence == dummy_target_text_with_unk.splitlines()[sequence_index] + postfix shutil.rmtree(dummy_dataset) @@ -189,7 +184,7 @@ def test_translation_factors_dataset(): translation_dataset.load_seqs(0, 10) num_seqs = len(dummy_target_text_factored_format.splitlines()) - assert_equal(translation_dataset.num_seqs, num_seqs) + assert translation_dataset.num_seqs == num_seqs # Reconstruct the sentences from the word ids for all factors and compare with input. 
data_keys = source_data_keys + target_data_keys @@ -198,7 +193,7 @@ def test_translation_factors_dataset(): for sequence_index in range(num_seqs): word_ids = translation_dataset.get_data(sequence_index, data_keys[index]) sentence = word_ids_to_sentence(word_ids, inverse_vocabularies[index]) - assert_equal(sentence, text.splitlines()[sequence_index] + postfix) + assert sentence == text.splitlines()[sequence_index] + postfix shutil.rmtree(dummy_dataset) diff --git a/tests/test_Util.py b/tests/test_Util.py index 690b403c1..92f6edebb 100644 --- a/tests/test_Util.py +++ b/tests/test_Util.py @@ -1,13 +1,13 @@ # -*- coding: utf8 -*- import _setup_test_env # noqa -from nose.tools import assert_equal, assert_not_equal, assert_raises, assert_true, assert_is from numpy.testing import assert_almost_equal from returnn.util.basic import * import sys import os import numpy as np import numpy +import pytest import unittest import textwrap import signal @@ -55,34 +55,35 @@ def _get_tmp_dir() -> str: def test_cmd_true(): r = sys_cmd_out_lines("true") - assert_equal(r, []) + assert r == [] def test_cmd_false(): - assert_raises(CalledProcessError, lambda: sys_cmd_out_lines("false")) + with pytest.raises(CalledProcessError): + sys_cmd_out_lines("false") def test_cmd_stdout(): r = sys_cmd_out_lines("echo 1; echo 2;") - assert_equal(r, ["1", "2"]) + assert r == ["1", "2"] def test_cmd_stderr(): r = sys_cmd_out_lines("echo x >&2") - assert_equal(r, [], "cmd() output should only cover stdout") + assert r == [], "cmd() output should only cover stdout" def test_hms(): - assert_equal(hms(5), "0:00:05") - assert_equal(hms(65), "0:01:05") - assert_equal(hms(65 + 60 * 60), "1:01:05") + assert hms(5) == "0:00:05" + assert hms(65) == "0:01:05" + assert hms(65 + 60 * 60) == "1:01:05" def test_hms_fraction(): - assert_equal(hms_fraction(0, decimals=3), "0:00:00.000") - assert_equal(hms_fraction(5, decimals=3), "0:00:05.000") - assert_equal(hms_fraction(5.345, decimals=3), "0:00:05.345") - 
assert_equal(hms_fraction(65.345, decimals=3), "0:01:05.345") + assert hms_fraction(0, decimals=3) == "0:00:00.000" + assert hms_fraction(5, decimals=3) == "0:00:05.000" + assert hms_fraction(5.345, decimals=3) == "0:00:05.345" + assert hms_fraction(65.345, decimals=3) == "0:01:05.345" def test_uniq(): @@ -90,10 +91,10 @@ def test_uniq(): def test_slice_pad_zeros(): - assert_equal(list(slice_pad_zeros(np.array([1, 2, 3, 4]), begin=1, end=3)), [2, 3]) - assert_equal(list(slice_pad_zeros(np.array([1, 2, 3, 4]), begin=-2, end=2)), [0, 0, 1, 2]) - assert_equal(list(slice_pad_zeros(np.array([1, 2, 3, 4]), begin=-2, end=6)), [0, 0, 1, 2, 3, 4, 0, 0]) - assert_equal(list(slice_pad_zeros(np.array([1, 2, 3, 4]), begin=2, end=6)), [3, 4, 0, 0]) + assert list(slice_pad_zeros(np.array([1, 2, 3, 4]), begin=1, end=3)) == [2, 3] + assert list(slice_pad_zeros(np.array([1, 2, 3, 4]), begin=-2, end=2)) == [0, 0, 1, 2] + assert list(slice_pad_zeros(np.array([1, 2, 3, 4]), begin=-2, end=6)) == [0, 0, 1, 2, 3, 4, 0, 0] + assert list(slice_pad_zeros(np.array([1, 2, 3, 4]), begin=2, end=6)) == [3, 4, 0, 0] def test_math_PiecewiseLinear(): @@ -102,19 +103,19 @@ def test_math_PiecewiseLinear(): eps = 1e-5 f = PiecewiseLinear({1: 2, 3: 4, 5: 1}) assert str(f) == "PiecewiseLinear({1: 2, 3: 4, 5: 1})" - assert_equal(f(0), 2) - assert_equal(f(1 - eps), 2) - assert_equal(f(1), 2) + assert f(0) == 2 + assert f(1 - eps) == 2 + assert f(1) == 2 assert_almost_equal(f(1 + eps), 2, decimal=4) - assert_equal(f(2), 3) + assert f(2) == 3 assert_almost_equal(f(3 - eps), 4, decimal=4) - assert_equal(f(3), 4) + assert f(3) == 4 assert_almost_equal(f(3 + eps), 4, decimal=4) - assert_equal(f(4), 2.5) + assert f(4) == 2.5 assert_almost_equal(f(5 - eps), 1, decimal=4) - assert_equal(f(5), 1) - assert_equal(f(5 + eps), 1) - assert_equal(f(6), 1) + assert f(5) == 1 + assert f(5 + eps) == 1 + assert f(6) == 1 def test_math_PiecewiseLinear_kwargs(): @@ -159,14 +160,14 @@ def test_math_StepFunction_kwargs(): def 
test_parse_orthography_into_symbols(): - assert_equal(list("hi"), parse_orthography_into_symbols("hi")) - assert_equal(list(" hello "), parse_orthography_into_symbols(" hello ")) - assert_equal(list(" "), parse_orthography_into_symbols(" ")) - assert_equal(list("hello ") + ["[FOO]"] + list(" bar "), parse_orthography_into_symbols("hello [FOO] bar ")) + assert list("hi") == parse_orthography_into_symbols("hi") + assert list(" hello ") == parse_orthography_into_symbols(" hello ") + assert list(" ") == parse_orthography_into_symbols(" ") + assert list("hello ") + ["[FOO]"] + list(" bar ") == parse_orthography_into_symbols("hello [FOO] bar ") def test_parse_orthography(): - assert_equal(list("hi ") + ["[HES]"] + list(" there") + ["[END]"], parse_orthography("hi [HES] there ")) + assert list("hi ") + ["[HES]"] + list(" there") + ["[END]"] == parse_orthography("hi [HES] there ") def test_NumbersDict_minus_1(): @@ -174,7 +175,7 @@ def test_NumbersDict_minus_1(): b = NumbersDict(10) r = a - b print(a, b, r) - assert_equal(r, NumbersDict(numbers_dict={"classes": 1, "data": 1}, broadcast_value=-10)) + assert r == NumbersDict(numbers_dict={"classes": 1, "data": 1}, broadcast_value=-10) def test_NumbersDict_eq_1(): @@ -184,19 +185,19 @@ def test_NumbersDict_eq_1(): r2 = a.elem_eq(b, result_with_default=True) r2a = a == b print(a, b, r1, r2, r2a) - assert_is(all(r2.values()), r2a) - assert_is(r1.value, None) - assert_equal(r1.dict, {"classes": True, "data": True}) - assert_equal(r1, NumbersDict({"classes": True, "data": True})) - assert_is(r2.value, None) - assert_equal(r2.dict, {"classes": True, "data": True}) - assert_true(r2a) + assert all(r2.values()) is r2a + assert r1.value is None + assert r1.dict == {"classes": True, "data": True} + assert r1 == NumbersDict({"classes": True, "data": True}) + assert r2.value is None + assert r2.dict == {"classes": True, "data": True} + assert r2a is True def test_NumbersDict_eq_2(): a = NumbersDict(10) - assert_equal(a, 10) - 
assert_not_equal(a, 5) + assert a == 10 + assert a != 5 def test_NumbersDict_mul(): @@ -204,7 +205,7 @@ def test_NumbersDict_mul(): b = a * 2 assert isinstance(b, NumbersDict) assert b.value == 2 - assert_equal(b.dict, {"data": 6, "classes": 4}) + assert b.dict == {"data": 6, "classes": 4} def test_NumbersDict_float_div(): @@ -212,7 +213,7 @@ def test_NumbersDict_float_div(): b = a / 2.0 assert isinstance(b, NumbersDict) assert_almost_equal(b.value, 0.5) - assert_equal(list(sorted(b.dict.keys())), ["classes", "data"]) + assert list(sorted(b.dict.keys())) == ["classes", "data"] assert_almost_equal(b.dict["data"], 1.5) assert_almost_equal(b.dict["classes"], 1.0) @@ -221,10 +222,10 @@ def test_NumbersDict_int_floordiv(): a = NumbersDict(numbers_dict={"data": 3, "classes": 2}, broadcast_value=1) b = a // 2 assert isinstance(b, NumbersDict) - assert_equal(b.value, 0) - assert_equal(list(sorted(b.dict.keys())), ["classes", "data"]) - assert_equal(b.dict["data"], 1) - assert_equal(b.dict["classes"], 1) + assert b.value == 0 + assert list(sorted(b.dict.keys())) == ["classes", "data"] + assert b.dict["data"] == 1 + assert b.dict["classes"] == 1 def test_NumbersDict_to_dict(): @@ -252,7 +253,7 @@ def __init__(self, b, c, **kwargs): kwargs = collect_class_init_kwargs(C) print(kwargs) - assert_equal(sorted(kwargs), ["a", "b", "c"]) + assert sorted(kwargs) == ["a", "b", "c"] def test_terminal_size(): @@ -263,11 +264,11 @@ def test_try_get_caller_name(): def sub(): return try_get_caller_name() - assert_equal(sub(), "test_try_get_caller_name") + assert sub() == "test_try_get_caller_name" def test_camel_case_to_snake_case(): - assert_equal(camel_case_to_snake_case("CamelCaseOp"), "camel_case_op") + assert camel_case_to_snake_case("CamelCaseOp") == "camel_case_op" def test_NativeCodeCompiler(): @@ -291,9 +292,9 @@ def test_NativeCodeCompiler(): lib.get_magic.restype = ctypes.c_int lib.get_magic.argtypes = () - assert_equal(lib.get_magic(), 13) + assert lib.get_magic() == 13 
lib.set_magic(42) - assert_equal(lib.get_magic(), 42) + assert lib.get_magic() == 42 def test_PyExtModCompiler(): @@ -341,10 +342,10 @@ def test_PyExtModCompiler(): print("lib_filename:", lib_filename) mod = native.load_py_module() - assert_equal(mod.func(0), 42) - assert_equal(mod.func(1), 43) - assert_equal(mod.func(2), 44) - assert_equal(mod.func(-2), 40) + assert mod.func(0) == 42 + assert mod.func(1) == 43 + assert mod.func(2) == 44 + assert mod.func(-2) == 40 def test_Stats(): @@ -423,7 +424,7 @@ def test_get_func_kwargs(): def dummy_func(net, var, update_ops): pass - assert_equal(list(getargspec(dummy_func).args), ["net", "var", "update_ops"]) + assert list(getargspec(dummy_func).args) == ["net", "var", "update_ops"] def test_next_type_attrib_in_mro_chain(): @@ -438,9 +439,9 @@ def method(self): class Bar(Foo): pass - assert_equal(type_attrib_mro_chain(Foo, "method"), [Foo.method, Base.method]) - assert_equal(type_attrib_mro_chain(Bar, "method"), [Foo.method, Base.method]) - assert_equal(next_type_attrib_in_mro_chain(Bar, "method", Foo.method), Base.method) + assert type_attrib_mro_chain(Foo, "method") == [Foo.method, Base.method] + assert type_attrib_mro_chain(Bar, "method") == [Foo.method, Base.method] + assert next_type_attrib_in_mro_chain(Bar, "method", Foo.method) == Base.method def test_simple_obj_repr(): @@ -454,15 +455,15 @@ def __init__(self, a, b=13): x = X(a=42) x_repr = repr(x) - assert_equal(x_repr, "X(a=42, b=13)") + assert x_repr == "X(a=42, b=13)" def test_obj_diff_str(): - assert_equal(obj_diff_str({"a": 1, "b": 2}, {"a": 1, "b": 3}), "dict diff:\n['b'] self: 2 != other: 3") + assert obj_diff_str({"a": 1, "b": 2}, {"a": 1, "b": 3}) == "dict diff:\n['b'] self: 2 != other: 3" def test_obj_diff_str_non_str_key(): - assert_equal(obj_diff_str({1: 1, 2: 2}, {1: 1, 2: 3}), "dict diff:\n[2] self: 2 != other: 3") + assert obj_diff_str({1: 1, 2: 2}, {1: 1, 2: 3}) == "dict diff:\n[2] self: 2 != other: 3" def test_obj_diff_list_allowed_mapping(): @@ 
-487,23 +488,20 @@ def _allowed_mapping(a_, b_): assert ac_diff for line in ac_diff: print(line) - assert_equal( - ac_diff, - [ - "dict diff:", - "['b'] dict diff:", - "['b'] key 'A:b' not in other", - "['b'] key 'B:b' not in self", - ], - ) + assert ac_diff == [ + "dict diff:", + "['b'] dict diff:", + "['b'] key 'A:b' not in other", + "['b'] key 'B:b' not in self", + ] @unittest.skipIf(PY3, "only for Python 2") def test_py2_utf8_str_to_unicode(): - assert_equal(py2_utf8_str_to_unicode("a"), "a") - assert_is(type(py2_utf8_str_to_unicode("a")), str) - assert_equal(py2_utf8_str_to_unicode("äöü"), "äöü") - assert_is(type(py2_utf8_str_to_unicode("äöü")), unicode) + assert py2_utf8_str_to_unicode("a") == "a" + assert type(py2_utf8_str_to_unicode("a")) is str + assert py2_utf8_str_to_unicode("äöü") == "äöü" + assert type(py2_utf8_str_to_unicode("äöü")) is unicode def test_CollectionReadCheckCovered(): @@ -537,7 +535,7 @@ def test_import_(): mod = import_("github.com/rwth-i6/returnn-experiments", "common/test.py", "20210302-01094bef2761") print("Loaded mod %s, name %s, file %s" % (mod, mod.__name__, mod.__file__)) - assert_equal(mod.hello(), "hello world") + assert mod.hello() == "hello world" def test_import_root_repo_mod(): @@ -545,7 +543,7 @@ def test_import_root_repo_mod(): mod = import_("github.com/rwth-i6/returnn_common", "test.py", "20210602-1bc6822") print("Loaded mod %s, name %s, file %s" % (mod, mod.__name__, mod.__file__)) - assert_equal(mod.hello(), "hello world") + assert mod.hello() == "hello world" def test_import_root_repo_pkg(): @@ -555,7 +553,7 @@ def test_import_root_repo_pkg(): print("Loaded mod %s, name %s, file %s" % (mod, mod.__name__, mod.__file__)) from returnn_import.github_com.rwth_i6.returnn_common.v20210602162042_1bc6822b2fd1 import test - assert_equal(test.hello(), "hello world") + assert test.hello() == "hello world" def test_import_root_repo_sub_mod(): @@ -563,7 +561,7 @@ def test_import_root_repo_sub_mod(): mod = 
import_("github.com/rwth-i6/returnn_common", "test/hello.py", "20210603-3752d77") print("Loaded mod %s, name %s, file %s" % (mod, mod.__name__, mod.__file__)) - assert_equal(mod.hello(), "hello world") + assert mod.hello() == "hello world" def test_import_pkg_py_import(): @@ -624,7 +622,7 @@ def check(s): print("check:", s) a = ast.literal_eval(s) b = literal_py_to_pickle.literal_eval(s) - assert_equal(a, b) + assert a == b checks = [ "0", @@ -701,7 +699,7 @@ def test_NonDaemonicSpawnProcess_hang_1514(): def test_expand_env_vars(): os.environ["TMPA"] = "/testA" os.environ["TMPB"] = "testB" - assert_equal(expand_env_vars("$TMPA/$TMPB/returnn/file_cache"), "/testA/testB/returnn/file_cache") + assert expand_env_vars("$TMPA/$TMPB/returnn/file_cache") == "/testA/testB/returnn/file_cache" def test_bpe_PrefixTree(): @@ -771,28 +769,28 @@ def test_bpe_DepthFirstSearch(): tree.add("he@@") dfs = DepthFirstSearch(tree, "hello") - assert_equal(dfs.search(), ["he@@", "llo"]) + assert dfs.search() == ["he@@", "llo"] dfs = DepthFirstSearch(tree, "helo") - assert_equal(dfs.search(), None) + assert dfs.search() == None dfs = DepthFirstSearch(tree, "x") - assert_equal(dfs.search(), None) + assert dfs.search() == None dfs = DepthFirstSearch(tree, "llo") - assert_equal(dfs.search(), ["llo"]) + assert dfs.search() == ["llo"] tree.add("hello") dfs = DepthFirstSearch(tree, "hello") - assert_equal(dfs.search(), ["hello"]) + assert dfs.search() == ["hello"] dfs = DepthFirstSearch(tree, "hello", sampler=lambda: True) - assert_equal(dfs.search(), ["he@@", "llo"]) + assert dfs.search() == ["he@@", "llo"] tree.add("hel@@") tree.add("lo") dfs = DepthFirstSearch(tree, "hello") - assert_equal(dfs.search(), ["hello"]) + assert dfs.search() == ["hello"] dfs = DepthFirstSearch(tree, "hello", sampler=lambda: True) - assert_equal(dfs.search(), ["he@@", "llo"]) + assert dfs.search() == ["he@@", "llo"] dfs = DepthFirstSearch(tree, "hello", sampler=lambda _it=itertools.count(): next(_it) in {3}) - 
assert_equal(dfs.search(), ["hel@@", "lo"]) + assert dfs.search() == ["hel@@", "lo"] def test_bpe_DepthFirstSearch_word_prefix(): @@ -805,16 +803,16 @@ def test_bpe_DepthFirstSearch_word_prefix(): tree.add("lo") search = DepthFirstSearch(tree, "hello") - assert_equal(search.search(), ["▁hello"]) + assert search.search() == ["▁hello"] search = DepthFirstSearch(tree, "hello", sampler=lambda: True) - assert_equal(search.search(), ["▁hel", "lo"]) + assert search.search() == ["▁hel", "lo"] tree.add("▁he") tree.add("llo") search = DepthFirstSearch(tree, "hello", sampler=lambda: True) - assert_equal(search.search(), ["▁he", "llo"]) + assert search.search() == ["▁he", "llo"] search = DepthFirstSearch(tree, "hello", sampler=lambda _it=itertools.count(): next(_it) in {3}) - assert_equal(search.search(), ["▁hel", "lo"]) + assert search.search() == ["▁hel", "lo"] def test_bpe_CharSyncSearch(): @@ -826,17 +824,17 @@ def test_bpe_CharSyncSearch(): tree.add("he@@") search = CharSyncSearch(tree, "hello") - assert_equal(search.search(), [["he@@", "llo"]]) + assert search.search() == [["he@@", "llo"]] search = CharSyncSearch(tree, "helo") - assert_equal(search.search(), []) + assert search.search() == [] search = CharSyncSearch(tree, "x") - assert_equal(search.search(), []) + assert search.search() == [] search = CharSyncSearch(tree, "llo") - assert_equal(search.search(), [["llo"]]) + assert search.search() == [["llo"]] tree.add("hello") search = CharSyncSearch(tree, "hello") - assert_equal(search.search(), [["he@@", "llo"], ["hello"]]) + assert search.search() == [["he@@", "llo"], ["hello"]] def test_bpe_CharSyncSearch_word_prefix(): @@ -849,15 +847,15 @@ def test_bpe_CharSyncSearch_word_prefix(): tree.add("▁hi") search = CharSyncSearch(tree, "hello") - assert_equal(search.search(), [["▁hel", "lo"], ["▁hello"]]) + assert search.search() == [["▁hel", "lo"], ["▁hello"]] search = CharSyncSearch(tree, "helo") - assert_equal(search.search(), []) + assert search.search() == [] search = 
CharSyncSearch(tree, "x") - assert_equal(search.search(), []) + assert search.search() == [] search = CharSyncSearch(tree, "lo") - assert_equal(search.search(), []) + assert search.search() == [] search = CharSyncSearch(tree, "hi") - assert_equal(search.search(), [["▁hi"]]) + assert search.search() == [["▁hi"]] def test_file_cache(): diff --git a/tests/test_demos.py b/tests/test_demos.py index d38d54c21..f38543f22 100644 --- a/tests/test_demos.py +++ b/tests/test_demos.py @@ -8,7 +8,6 @@ from glob import glob import shutil import unittest -from nose.tools import assert_less, assert_in from returnn.util import better_exchook from returnn.util.basic import which_pip @@ -147,7 +146,7 @@ def test_demo_tf_task12ax(): # this seems not to be correct anymore, at least in the GitHub CI env. # On my local machine (Mac M1), I actually get it quite a bit lower, like 0.00127. # I'm not 100% sure that there is maybe sth wrong or not quite optimal... - assert_less(fer, 0.015) + assert fer < 0.015 @unittest.skipIf(not tf, "no TF") @@ -210,7 +209,7 @@ def test_demo_torch_task12ax(): out = run(py, "rnn.py", "demos/demo-torch.config", print_stdout=True) # Also see test_demo_tf_task12ax above. fer = parse_last_fer(out) - assert_less(fer, 0.02) + assert fer < 0.02 def _test_torch_export_to_onnx(cfg_filename: str) -> str: @@ -300,7 +299,7 @@ def test_demo_rf_torch_task12ax(): out = run(py, "rnn.py", "demos/demo-rf.config", print_stdout=True) # Also see test_demo_tf_task12ax above. fer = parse_last_fer(out) - assert_less(fer, 0.02) + assert fer < 0.02 @unittest.skipIf(not tf, "no TF") @@ -309,7 +308,7 @@ def test_demo_rf_tf_task12ax(): out = run(py, "rnn.py", "demos/demo-rf.config", "++backend", "tensorflow-net-dict", print_stdout=True) # Also see test_demo_tf_task12ax above. 
fer = parse_last_fer(out) - assert_less(fer, 0.02) + assert fer < 0.02 def test_demo_iter_dataset_task12ax(): @@ -317,7 +316,7 @@ def test_demo_iter_dataset_task12ax(): cleanup_tmp_models("demos/demo-tf-vanilla-lstm.12ax.config") # pick any 12ax config for the dataset test out = run(py, "demos/demo-iter-dataset.py", "demos/demo-tf-vanilla-lstm.12ax.config") - assert_in("Epoch 5.", out.splitlines()) + assert "Epoch 5." in out.splitlines() @unittest.skipIf(not tf, "no TF") diff --git a/tests/test_fork_exec.py b/tests/test_fork_exec.py index 345c60258..81b3e08ae 100644 --- a/tests/test_fork_exec.py +++ b/tests/test_fork_exec.py @@ -16,7 +16,6 @@ import _setup_test_env # noqa import os import sys -from nose.tools import assert_equal, assert_is_instance from pprint import pprint @@ -164,15 +163,12 @@ def test_demo_hello_from_fork(): pprint(ls) ls = filter_demo_output(ls) pprint(ls) - assert_equal( - set(ls), - { - "Hello from child after fork.", - "Hello from child atfork, magic number 3.", - "Hello from atfork prepare, magic number 3.", - "Hello from parent after fork.", - }, - ) + assert set(ls) == { + "Hello from child after fork.", + "Hello from child atfork, magic number 3.", + "Hello from atfork prepare, magic number 3.", + "Hello from parent after fork.", + } def test_demo_start_subprocess(): @@ -205,7 +201,7 @@ def patched_check_demo_start_subprocess(): """ Just like test_demo_start_subprocess(), but here we assert that no atfork handlers are executed. """ - assert_equal(os.environ.get("__RETURNN_ATFORK_PATCHED"), "1") + assert os.environ.get("__RETURNN_ATFORK_PATCHED") == "1" ls = run_demo_check_output("demo_start_subprocess") pprint(ls) ls = filter_demo_output(ls) @@ -213,7 +209,7 @@ def patched_check_demo_start_subprocess(): assert "Hello from subprocess." 
in ls ls = [l for l in ls if l not in ["Ignoring pthread_atfork call!", "Ignoring __register_atfork call!"]] pprint(ls) - assert_equal(ls, ["Hello from subprocess."]) + assert ls == ["Hello from subprocess."] def test_demo_start_subprocess_patched(): diff --git a/tools/import-blocks-mt-model.py b/tools/import-blocks-mt-model.py index e3227aed1..7f09dd1c5 100755 --- a/tools/import-blocks-mt-model.py +++ b/tools/import-blocks-mt-model.py @@ -59,8 +59,7 @@ import numpy import re from pprint import pprint -from nose.tools import assert_equal, assert_is_instance -from numpy.testing import assert_almost_equal, assert_allclose +from numpy.testing import assert_almost_equal import tensorflow as tf import pickle @@ -211,7 +210,7 @@ def import_var(our_var, blocks_param): if isinstance(blocks_param, str): blocks_param = load_blocks_var(blocks_param) assert isinstance(blocks_param, numpy.ndarray) - assert_equal(tuple(our_var.shape.as_list()), blocks_param.shape) + assert tuple(our_var.shape.as_list()) == blocks_param.shape our_loaded_params.add(our_var.name[:-2]) our_var.load(blocks_param, session=rnn.engine.tf_session) @@ -242,7 +241,7 @@ def load_blocks_var(blocks_param_name): expected_enc_entries = ["EncoderLookUp0.W"] + [ "EncoderBidirectionalLSTM%i" % i for i in range(1, num_encoder_layers + 1) ] - assert_equal(set(expected_enc_entries), set(blocks_params_hierarchy[enc_name].keys())) + assert set(expected_enc_entries) == set(blocks_params_hierarchy[enc_name].keys()) our_input_layer = find_our_input_embed_layer() assert our_input_layer.input_data.dim == blocks_input_dim @@ -252,7 +251,7 @@ def load_blocks_var(blocks_param_name): dec_name = "decoder/sequencegenerator" dec_hierarchy_base = get_in_hierarchy(dec_name, blocks_params_hierarchy) - assert_equal(set(dec_hierarchy_base.keys()), {"att_trans", "readout"}) + assert set(dec_hierarchy_base.keys()) == {"att_trans", "readout"} dec_embed_name = "readout/lookupfeedbackwmt15/lookuptable.W" get_in_hierarchy(dec_embed_name, 
dec_hierarchy_base) # check @@ -556,13 +555,12 @@ def load_blocks_var(blocks_param_name): ] assert blocks_energy_sum_tanh.shape == (seq_len, beam_size, energy_sum.shape[-1]) assert_almost_equal(blocks_energy_sum_tanh[:, 0], numpy.tanh(energy_sum), decimal=5) - assert_equal( - our_dec_frame_outputs["weight_feedback.output"].shape, - (beam_size, seq_len if dec_step > 0 else 1, blocks_enc_ctx_out.shape[-1]), - ) - assert_equal( - our_dec_frame_outputs["prev_s_transformed.output"].shape, (beam_size, blocks_enc_ctx_out.shape[-1]) + assert our_dec_frame_outputs["weight_feedback.output"].shape == ( + beam_size, + seq_len if dec_step > 0 else 1, + blocks_enc_ctx_out.shape[-1], ) + assert our_dec_frame_outputs["prev_s_transformed.output"].shape == (beam_size, blocks_enc_ctx_out.shape[-1]) our_energy_sum = our_dec_frame_outputs["energy_in.output"] assert our_energy_sum.shape == (beam_size, seq_len, blocks_enc_ctx_out.shape[-1]) assert_almost_equal(our_energy_sum[0], energy_sum, decimal=4)