From a5d353dcae24778aa3f737b3b3adab19aee36788 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Mon, 9 Dec 2024 10:53:24 +0100 Subject: [PATCH] cleanup, more --- .../exp2024_04_23_baselines/claix2023.py | 53 ++----------------- 1 file changed, 5 insertions(+), 48 deletions(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/claix2023.py b/users/zeyer/experiments/exp2024_04_23_baselines/claix2023.py index e49c1b3d6..351355648 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/claix2023.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/claix2023.py @@ -210,10 +210,6 @@ def py(): for cr_ctc in [None, {"cr_loss_scale": 0.2}, {"cr_loss_scale": 0.5}, {"cr_loss_scale": 1.0}]: # TODO also adapt specaug for CR... use_cr_ctc = cr_ctc is not None - if use_cr_ctc: - cr_ctc: Dict[str, Any] - cr_ctc = cr_ctc.copy() - cr_ctc["use_fixed_ctc_grad"] = "v2" name = f"crLoss{cr_ctc['cr_loss_scale']}-" if use_cr_ctc else "" if opts.get("time_downsampling"): name += f"time{opts['time_downsampling']}-" @@ -266,7 +262,7 @@ def py(): # purely used for training "aux_attention_decoder": rf.build_dict(TransformerDecoder, num_layers=6), **(cr_ctc if use_cr_ctc else {}), - **({"aed_loss_bug_fix": True} if use_cr_ctc else {}), + **({"use_fixed_ctc_grad": "v2", "aed_loss_bug_fix": True} if use_cr_ctc else {}), "max_seq_length_default_target": None, # Note on max seq len stats: Before, when we used max_seq_length_default_target=75 with bpe10k, # out of 281241 seqs in train, we removed only 71 seqs. @@ -349,48 +345,6 @@ def py(): "num_enc_layers": 12, "out_blank_separated": True, }, - config_updates={ - **_get_cfg_lrlin_oclr_by_bs_nep_v3(150_000, 100, batch_size_factor=_batch_size_factor), - "optimizer.weight_decay": 1e-2, - "max_seq_length_default_target": None, - # Note on max seq len stats: Before, when we used max_seq_length_default_target=75 with bpe10k, - # out of 281241 seqs in train, we removed only 71 seqs. - # With max seq len 19.5 secs on the audio, we also remove exactly 71 seqs. - "max_seq_length_default_input": 19.5 * _raw_sample_rate, - "__train_audio_preprocess": speed_pert_librosa_config, - "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], - "aux_attention_decoder": rf.build_dict(TransformerDecoder, num_layers=6), # purely used for training - }, - post_config_updates={"log_grad_norm": True, "__multi_proc_dataset_opts": {"num_workers": 25}}, - vocab="spm512", - train_vocab_opts={"other_opts": {"class": "SamplingBytePairEncoding", "breadth_prob": 0.01}}, - dataset_train_opts={"train_epoch_split": 1, "train_epoch_wise_filter": None}, - # avoid OOM - env_updates={"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"}, - ) - - ctc_train_exp( - "time4-n12-spm512-blankSep-auxAED-b150k-ctcFixGrad", - config_96gb_bf16_accgrad1, - model_config={ - "enc_input_layer": rf.build_dict( - ConformerConvSubsample, - out_dims=[32, 64, 64], - filter_sizes=[(3, 3), (3, 3), (3, 3)], - pool_sizes=[(1, 2)], - strides=[(1, 1), (2, 1), (2, 1)], - ), - "enc_conformer_layer": rf.build_dict( - ConformerEncoderLayer, - ff=rf.build_dict( - ConformerPositionwiseFeedForward, activation=rf.build_dict(rf.relu_square), with_bias=False - ), - num_heads=8, - ), - "feature_batch_norm": True, - "num_enc_layers": 12, - "out_blank_separated": True, - }, config_updates={ **_get_cfg_lrlin_oclr_by_bs_nep_v3(150_000, 100, batch_size_factor=_batch_size_factor), "optimizer.weight_decay": 1e-2, @@ -506,6 +460,7 @@ def py(): "ctc_am_scale": am_scale, "ctc_prior_scale": prior_scale, "ctc_prior_type": prior_type, + "use_fixed_ctc_grad": "v2", }, post_config_updates={"log_grad_norm": True, "__multi_proc_dataset_opts": {"num_workers": 25}}, vocab="spm512", @@ -589,6 +544,7 @@ def py(): "__train_audio_preprocess": speed_pert_librosa_config, "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], "aux_attention_decoder": rf.build_dict(TransformerDecoder, num_layers=6), # purely used for training + "use_fixed_ctc_grad": "v2", }, post_config_updates={"log_grad_norm": True, "__multi_proc_dataset_opts": {"num_workers": 25}}, vocab="spm10k", @@ -640,6 +596,7 @@ def py(): "ctc_am_scale": am_scale, "ctc_prior_scale": prior_scale, "ctc_prior_type": prior_type, + "use_fixed_ctc_grad": "v2", }, post_config_updates={"log_grad_norm": True, "__multi_proc_dataset_opts": {"num_workers": 25}}, vocab="spm512", @@ -878,7 +835,7 @@ def py(): "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], "aux_attention_decoder": rf.build_dict(TransformerDecoder, num_layers=6), # purely used for training **(cr_ctc if use_cr_ctc else {}), - **({"aed_loss_bug_fix": True} if use_cr_ctc else {}), + **({"use_fixed_ctc_grad": "v2", "aed_loss_bug_fix": True} if use_cr_ctc else {}), }, config_deletes=["aux_loss_layers"], post_config_updates={"log_grad_norm": True, "__multi_proc_dataset_opts": {"num_workers": 25}},