ICASSP 2025 paper information

rwth-i6 · Jan 8, 2025 · 5cb7309 · 5cb7309
1 parent 6fdb181
commit 5cb7309
Show file tree

Hide file tree

Showing 9 changed files with 3,944 additions and 0 deletions.
diff --git a/2025-factored-fullsum-rightcontext/README b/2025-factored-fullsum-rightcontext/README
@@ -0,0 +1,9 @@
+* Training uses tensorflow backend, RASR for data preparation, and extended CUDA kernel
+** config files for data preparation
+** returnn config, containing both network definition and the RASR settings for the alignment FSA
+** a copy of the additional layer and CUDA code for running factored loss
+
+* decoding uses master branch of RASR 
+** recognition config
+
+
diff --git a/2025-factored-fullsum-rightcontext/decoding/feature.flow b/2025-factored-fullsum-rightcontext/decoding/feature.flow
@@ -0,0 +1,10 @@
+<?xml version="1.0" ?>
+<network name="network">
+  <out name="features"/>
+  <param name="TASK"/>
+  <param name="id"/>
+  <node filter="generic-cache" id="$(id)" name="cache" path="`cf /u/raissi/setups/librispeech/960h/work/i6_core/features/extraction/FeatureExtractionJob.Gammatone.oDJO8MMhhCEx/output/gt.cache.$(TASK)`"/>
+  <node filter="tensorflow-forward" id="$(id)" name="tf-fwd"/>
+  <link from="cache" to="tf-fwd:features"/>
+  <link from="tf-fwd:posteriors" to="network:features"/>
+</network>
diff --git a/2025-factored-fullsum-rightcontext/decoding/recognition.config b/2025-factored-fullsum-rightcontext/decoding/recognition.config
@@ -0,0 +1,174 @@
+[*]
+configuration.channel    = output-channel
+dot.channel              = nil
+encoding                 = UTF-8
+error.channel            = output-channel, stderr
+log.channel              = output-channel
+progress.channel         = output-channel
+real-time-factor.channel = output-channel
+statistics.channel       = output-channel
+system-info.channel      = output-channel
+time.channel             = output-channel
+version.channel          = output-channel
+warning.channel          = output-channel, stderr
+
+[*.output-channel]
+append     = no
+compressed = no
+file       = $(LOGFILE)
+unbuffered = no
+
+[*.session]
+inter-op-parallelism-threads = 1
+intra-op-parallelism-threads = 2
+
+[flf-lattice-tool.corpus]
+capitalize-transcriptions      = no
+file                           = /u/raissi/setups/librispeech/960h/work/i6_core/audio/encoding/BlissChangeEncodingJob.vUdgDkgc97ZK/output/corpus.xml.gz
+progress-indication            = global
+segments.file                  = /u/raissi/setups/librispeech/960h/work/i6_core/corpus/segments/SegmentCorpusJob.UKZC0F09VgYu/output/segments.$(TASK)
+warn-about-unexpected-elements = yes
+
+[flf-lattice-tool.global-cache]
+file      = `cf /u/raissi/setups/librispeech/960h/work/i6_core/recognition/advanced_tree_search/AdvancedTreeSearchLmImageAndGlobalCacheJob.Gbj19O0SC51m/output/global.cache`
+read-only = yes
+
+[flf-lattice-tool.lexicon]
+file                    = /u/raissi/setups/librispeech/960h/work/i6_core/lexicon/modification/MergeLexiconJob.z54fVoMlr0md/output/lexicon.xml.gz
+normalize-pronunciation = no
+
+[flf-lattice-tool.network]
+initial-nodes = segment
+
+[flf-lattice-tool.network.archive-writer]
+format = flf
+info   = yes
+links  = sink:1
+path   = lattice.cache.$(TASK)
+type   = archive-writer
+
+[flf-lattice-tool.network.evaluator]
+best-in-lattice = yes
+links           = sink:0
+single-best     = yes
+type            = evaluator
+word-errors     = yes
+
+[flf-lattice-tool.network.evaluator.edit-distance]
+allow-broken-words = no
+format             = bliss
+
+[flf-lattice-tool.network.expand]
+links = evaluator archive-writer
+type  = expand-transits
+
+[flf-lattice-tool.network.recognizer]
+add-confidence-score          = no
+apply-non-word-closure-filter = no
+apply-posterior-pruning       = no
+links                         = expand
+search-type                   = advanced-tree-search
+type                          = recognizer
+
+[flf-lattice-tool.network.recognizer.acoustic-model.allophones]
+add-all          = yes
+add-from-lexicon = no
+
+[flf-lattice-tool.network.recognizer.acoustic-model.hmm]
+across-word-model   = yes
+early-recombination = no
+state-repetitions   = 1
+states-per-phone    = 1
+
+[flf-lattice-tool.network.recognizer.acoustic-model.mixture-set]
+feature-scorer-type       = nn-precomputed-hybrid
+file                      = /u/raissi/setups/librispeech/960h/work/i6_core/mm/mixtures/CreateDummyMixturesJob.WXJMtY5OeGyg/output/dummy.mix
+normalize-mixture-weights = no
+prior-file                = /u/raissi/setups/librispeech/960h/work/i6_experiments/users/raissi/setups/common/helpers/priors/smoothen/SmoothenPriorsJob.oKBFK7dIraCk/output/priors.xml
+priori-scale              = 0.4
+scale                     = 1.0
+
+[flf-lattice-tool.network.recognizer.acoustic-model.state-tying]
+type                 = monophone-dense
+use-boundary-classes = no
+use-word-end-classes = yes
+
+[flf-lattice-tool.network.recognizer.acoustic-model.tdp]
+entry-m1.loop  = infinity
+entry-m2.loop  = infinity
+nonword-phones = [UNKNOWN]
+scale          = 0.1
+tying-type     = global-and-nonword
+
+[flf-lattice-tool.network.recognizer.acoustic-model.tdp.*]
+exit    = 0.0
+forward = 0.0
+loop    = 5.0
+skip    = infinity
+
+[flf-lattice-tool.network.recognizer.acoustic-model.tdp.nonword-0]
+exit    = 20.0
+forward = 0.0
+loop    = 11.0
+skip    = infinity
+
+[flf-lattice-tool.network.recognizer.acoustic-model.tdp.nonword-1]
+exit    = 20.0
+forward = 0.0
+loop    = 11.0
+skip    = infinity
+
+[flf-lattice-tool.network.recognizer.acoustic-model.tdp.silence]
+exit    = 15.0
+forward = 0.0
+loop    = 13.0
+skip    = infinity
+
+[flf-lattice-tool.network.recognizer.feature-extraction]
+file = feature.flow
+
+[flf-lattice-tool.network.recognizer.feature-extraction.tf-fwd.input-map.info-0]
+param-name             = features
+seq-length-tensor-name = extern_data/placeholders/data/data_dim0_size
+tensor-name            = extern_data/placeholders/data/data
+
+[flf-lattice-tool.network.recognizer.feature-extraction.tf-fwd.loader]
+meta-graph-file  = /u/raissi/setups/librispeech/960h/work/i6_core/returnn/compile/CompileTFGraphJob.L1VLSMvSu4PB/output/graph.meta
+saved-model-file = /u/raissi/setups/librispeech/960h/work/i6_experiments/users/raissi/costum/returnn/rasr_returnn_bw/ReturnnRasrTrainingBWJob.qckoKdPLkSdj/output/models/epoch.492
+type             = meta
+
+[flf-lattice-tool.network.recognizer.feature-extraction.tf-fwd.output-map.info-0]
+param-name  = posteriors
+tensor-name = center__output/output_batch_major
+
+[flf-lattice-tool.network.recognizer.lm]
+file  = /u/raissi/setups/librispeech/960h/work/i6_core/tools/download/DownloadJob.6ij8dDC1z4zK/output/4-gram.arpa.gz
+image = `cf /u/raissi/setups/librispeech/960h/work/i6_core/recognition/advanced_tree_search/AdvancedTreeSearchLmImageAndGlobalCacheJob.Gbj19O0SC51m/output/lm-1.image`
+scale = 0.9
+type  = ARPA
+
+[flf-lattice-tool.network.recognizer.recognizer]
+beam-pruning           = 22.0
+beam-pruning-limit     = 500000
+create-lattice         = yes
+lm-lookahead           = yes
+lm-lookahead-laziness  = 15
+optimize-lattice       = simple
+word-end-pruning       = 0.5
+word-end-pruning-limit = 10000
+
+[flf-lattice-tool.network.recognizer.recognizer.lm-lookahead]
+cache-size-high        = 3000
+cache-size-low         = 2000
+history-limit          = 1
+minimum-representation = 1
+tree-cutoff            = 30
+
+[flf-lattice-tool.network.segment]
+links = 1->recognizer:1 0->archive-writer:1 0->evaluator:1
+type  = speech-segment
+
+[flf-lattice-tool.network.sink]
+error-on-empty-lattice = no
+type                   = sink
+warn-on-empty-lattice  = yes
diff --git a/2025-factored-fullsum-rightcontext/decoding/run.sh b/2025-factored-fullsum-rightcontext/decoding/run.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+set -ueo pipefail
+
+if [[ $# -gt 0 ]]; then
+  TASK=$1;
+  shift;
+else
+  echo "No TASK-id given";
+  exit 1;
+fi
+
+if [ $# -gt 0 ]; then
+  LOGFILE=$1;
+  shift;
+else
+  LOGFILE=rasr.log
+fi
+
+export OMP_NUM_THREADS=2
+export TF_DEVICE='cpu'
+
+/work/tools/users/raissi/rasr/rasr_tf2/arch/linux-x86_64-standard/flf-tool.linux-x86_64-standard --config=recognition.config --*.TASK=$TASK --*.LOGFILE=$LOGFILE  $@