universa-wavlm_base_urgent24_multi-metric_audioref

2
license:cc-by-4.0
by
espnet
Audio Model
OTHER
New
2 downloads
Early-stage
Edge AI:
Mobile
Laptop
Server
Unknown
Mobile
Laptop
Server
Quick Summary

`espnet/universa-wavlm_base_urgent24_multi-metric_audioref`: This model was trained by ftshijt using the urgent24 recipe in ESPnet.

Code Examples

universa config (text)
config: conf/train_universa_wavlm_ref_audio.yaml
print_config: false
log_level: INFO
drop_last_iter: false
dry_run: false
iterator_type: sequence
valid_iterator_type: null
output_dir: update_exp/universa_train_universa_wavlm_ref_audio_raw_fs16000
ngpu: 1
seed: 777
num_workers: 1
num_att_plot: 0
dist_backend: nccl
dist_init_method: env://
dist_world_size: null
dist_rank: null
local_rank: 0
dist_master_addr: null
dist_master_port: null
dist_launcher: null
multiprocessing_distributed: false
unused_parameters: false
sharded_ddp: false
use_deepspeed: false
deepspeed_config: null
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: false
use_tf32: false
collect_stats: false
write_collected_feats: false
max_epoch: 100
patience: null
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
best_model_criterion:
-   - train
    - loss
    - min
-   - valid
    - loss
    - min
-   - train
    - acc
    - max
-   - valid
    - acc
    - max
keep_nbest_models: 5
nbest_averaging_interval: 0
grad_clip: -1
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
log_interval: 50
use_matplotlib: true
use_tensorboard: true
create_graph_in_tensorboard: false
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1
detect_anomaly: false
use_adapter: false
adapter: lora
save_strategy: all
adapter_conf: {}
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param:
- frontend.upstream
num_iters_per_epoch: null
batch_size: 16
valid_batch_size: null
batch_bins: 1000000
valid_batch_bins: null
category_sample_size: 10
train_shape_file:
- update_exp/universa_stats_raw/train/audio_shape
- update_exp/universa_stats_raw/train/ref_audio_shape
- update_exp/universa_stats_raw/train/ref_text_shape
valid_shape_file:
- update_exp/universa_stats_raw/valid/audio_shape
- update_exp/universa_stats_raw/valid/ref_audio_shape
- update_exp/universa_stats_raw/valid/ref_text_shape
batch_type: sorted
valid_batch_type: null
fold_length:
- 256000
sort_in_batch: descending
shuffle_within_batch: false
sort_batch: descending
multiple_iterator: false
chunk_length: 500
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
chunk_excluded_key_prefixes: []
chunk_default_fs: null
chunk_max_abs_length: null
chunk_discard_short_samples: true
train_data_path_and_name_and_type:
-   - dump_ark/raw/train_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/train_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/train_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/train_update/text
    - ref_text
    - text
valid_data_path_and_name_and_type:
-   - dump_ark/raw/dev_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/dev_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/text
    - ref_text
    - text
multi_task_dataset: false
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
allow_multi_rates: false
valid_max_cache_size: null
exclude_weight_decay: false
exclude_weight_decay_conf: {}
optim: adamw
optim_conf:
    lr: 0.001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
metric2id: dump_ark/raw/train_update/metric2id
metric2type: null
metric_pad_value: -100
token_list:
- <blank>
- <unk>
- s
- ▁
- t
- e
- ▁the
- i
- a
- o
- ▁a
- r
- ▁to
- d
- ▁and
- ''''
- m
- n
- ing
- u
- y
- p
- c
- ▁of
- l
- ed
- ▁I
- ▁in
- er
- re
- ▁it
- ▁you
- ar
- ▁f
- ▁is
- ▁that
- ','
- .
- in
- al
- g
- 'on'
- ▁b
- b
- or
- ▁c
- ▁s
- f
- h
- ▁we
- an
- en
- ▁for
- le
- ▁p
- ly
- es
- w
- ▁re
- ▁on
- ▁m
- ▁be
- ic
- ll
- th
- ▁he
- k
- ur
- ve
- ▁with
- ▁so
- ▁from
- ▁was
- v
- ch
- st
- ▁w
- ▁i
- ▁this
- ▁de
- ▁like
- ▁do
- ce
- at
- il
- ck
- ▁A
- ▁have
- ▁not
- ad
- ▁st
- ow
- ro
- ne
- ▁me
- ▁my
- ▁but
- ation
- ▁at
- ▁or
- '-'
- ter
- ent
- ▁B
- ▁n
- ▁know
- ▁t
- out
- ▁are
- nd
- ▁one
- ▁li
- ▁g
- ▁The
- ol
- ion
- te
- ▁go
- ut
- ▁as
- ▁just
- as
- ▁sh
- ▁they
- is
- ▁C
- et
- ▁h
- ▁an
- ▁there
- ▁up
- ▁S
- ▁M
- ▁she
- ▁by
- ▁su
- om
- ▁can
- us
- ▁your
- ng
- ▁con
- el
- ▁us
- ment
- z
- ▁see
- ▁ab
- ▁what
- ▁out
- ▁her
- me
- ate
- ▁all
- ▁th
- ▁if
- ▁right
- ▁his
- ▁ma
- ▁lo
- ▁which
- ide
- ▁P
- ▁more
- ▁then
- ul
- ast
- x
- ight
- ill
- ▁So
- ▁sp
- ▁going
- ▁some
- ure
- ▁their
- ig
- ▁no
- ▁ro
- ▁think
- ▁who
- ▁pro
- ver
- ive
- est
- ▁co
- ▁di
- '0'
- ist
- ▁k
- age
- ▁d
- ▁time
- ▁L
- ies
- ▁will
- ▁man
- ▁when
- ▁D
- les
- ▁F
- ▁want
- ff
- ity
- ▁un
- '?'
- ▁start
- ▁G
- ▁uh
- ▁get
- ok
- ▁take
- ▁po
- li
- ▁ho
- ▁way
- ▁don
- ▁yeah
- ▁really
- ▁say
- ▁look
- ▁good
- ▁ra
- ▁pr
- ▁had
- ttle
- ▁comp
- ort
- ish
- ▁ex
- ally
- ▁sa
- ▁how
- end
- ant
- ▁O
- ▁um
- way
- ance
- ▁other
- ▁two
- ine
- ever
- able
- ▁com
- other
- ▁first
- ▁back
- ▁al
- ers
- ions
- ▁now
- ▁off
- ning
- ▁down
- ▁has
- ▁than
- ▁car
- ▁Th
- very
- ice
- ▁dr
- ▁been
- ▁him
- ▁here
- ated
- '5'
- ▁hand
- ▁day
- ▁hear
- each
- ▁would
- ▁over
- ▁oh
- ▁cha
- ood
- ▁did
- ugh
- ▁per
- ▁let
- ▁str
- ▁tra
- ▁got
- ext
- '1'
- ▁We
- ▁Shields
- ▁come
- ▁should
- ▁could
- light
- '2'
- ▁people
- ▁again
- ▁year
- ▁app
- ▁into
- ▁any
- ▁N
- ▁mean
- ▁o
- ▁mus
- ▁lot
- ▁said
- ▁long
- ▁these
- ▁lea
- sh
- ▁vi
- ▁part
- ▁every
- ▁our
- ▁You
- ious
- ▁fight
- ▁Ch
- ark
- ▁may
- ▁Hammer
- ▁because
- ▁most
- ▁came
- ▁four
- ful
- ▁No
- ize
- ▁where
- ▁okay
- ▁much
- ▁ask
- ▁through
- ▁before
- ▁work
- ▁even
- ▁three
- mber
- ▁win
- ▁flight
- ake
- K
- ▁place
- ▁play
- ▁though
- ▁pound
- ▁bit
- land
- ▁va
- ▁talk
- ▁kind
- ▁Line
- ▁make
- hap
- ▁big
- ▁leav
- ▁something
- ▁game
- ▁under
- ▁feel
- self
- ▁give
- ▁includ
- U
- ▁twenty
- ▁guard
- ▁left
- ▁round
- ▁great
- body
- ▁gra
- ress
- lso
- '3'
- ▁everything
- ▁those
- ▁after
- ▁tell
- ▁need
- ▁yes
- qua
- ham
- ▁minutes
- ▁question
- ▁around
- ▁punch
- ▁course
- ▁gonna
- ▁person
- ▁move
- ▁plan
- ▁ear
- ept
- ▁Airport
- ▁Okay
- ▁found
- ▁seven
- ▁help
- que
- ▁qui
- ▁keep
- ▁guys
- ▁house
- ▁run
- ▁turn
- ▁better
- ▁stop
- ward
- ddle
- ▁second
- ground
- ▁world
- ▁high
- ▁point
- ▁hold
- ▁call
- '6'
- ▁actually
- ▁probably
- ▁heaven
- ▁speci
- ▁everyone
- ▁why
- ▁presen
- ▁thir
- lright
- ▁eye
- eath
- ▁Tak
- '!'
- '"'
- '4'
- ▁hundred
- ▁answer
- ▁small
- ▁wait
- ▁nothing
- q
- '8'
- V
- ▁countr
- ▁problem
- ▁continu
- ▁close
- ▁priva
- ▁20
- ▁pleas
- ▁walk
- ▁open
- ▁lay
- ▁Station
- ▁moment
- ▁Yeah
- ▁public
- possibl
- ▁happen
- together
- ▁while
- asically
- ▁money
- ▁wrong
- B
- ▁puzzle
- '7'
- ▁journ
- ▁rainbow
- ▁thousand
- I
- '9'
- S
- P
- '%'
- A
- D
- L
- F
- ’
- O
- G
- N
- á
- C
- $
- Z
- Y
- R
- E
- J
- W
- M
- H
- j
- –
- ;
- Q
- X
- ']'
- −
- '&'
- T
- '['
- <sos/eos>
init: xavier_uniform
model_conf: {}
use_ref_audio: true
use_ref_text: false
use_preprocessor: true
token_type: bpe
bpemodel: data/token_list/bpe_unigram500/bpe.model
non_linguistic_symbols: null
cleaner: null
g2p: null
frontend: s3prl
frontend_conf:
    frontend_conf:
        upstream: wavlm_large
    download_dir: ./hub
    multilayer_feature: true
universa: base
universa_conf:
    embedding_dim: 256
    audio_encoder_type: transformer
    audio_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: conv2d
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    text_encoder_type: transformer
    text_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: linear
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    cross_attention_type: multihead
    cross_attention_params:
        n_head: 4
        dropout_rate: 0.1
    pooling_type: mean
    projector_type: linear
    multi_branch: true
required:
- output_dir
- metric2id
version: '202412'
distributed: false
universa config (text)
config: conf/train_universa_wavlm_ref_audio.yaml
print_config: false
log_level: INFO
drop_last_iter: false
dry_run: false
iterator_type: sequence
valid_iterator_type: null
output_dir: update_exp/universa_train_universa_wavlm_ref_audio_raw_fs16000
ngpu: 1
seed: 777
num_workers: 1
num_att_plot: 0
dist_backend: nccl
dist_init_method: env://
dist_world_size: null
dist_rank: null
local_rank: 0
dist_master_addr: null
dist_master_port: null
dist_launcher: null
multiprocessing_distributed: false
unused_parameters: false
sharded_ddp: false
use_deepspeed: false
deepspeed_config: null
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: false
use_tf32: false
collect_stats: false
write_collected_feats: false
max_epoch: 100
patience: null
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
best_model_criterion:
-   - train
    - loss
    - min
-   - valid
    - loss
    - min
-   - train
    - acc
    - max
-   - valid
    - acc
    - max
keep_nbest_models: 5
nbest_averaging_interval: 0
grad_clip: -1
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
log_interval: 50
use_matplotlib: true
use_tensorboard: true
create_graph_in_tensorboard: false
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1
detect_anomaly: false
use_adapter: false
adapter: lora
save_strategy: all
adapter_conf: {}
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param:
- frontend.upstream
num_iters_per_epoch: null
batch_size: 16
valid_batch_size: null
batch_bins: 1000000
valid_batch_bins: null
category_sample_size: 10
train_shape_file:
- update_exp/universa_stats_raw/train/audio_shape
- update_exp/universa_stats_raw/train/ref_audio_shape
- update_exp/universa_stats_raw/train/ref_text_shape
valid_shape_file:
- update_exp/universa_stats_raw/valid/audio_shape
- update_exp/universa_stats_raw/valid/ref_audio_shape
- update_exp/universa_stats_raw/valid/ref_text_shape
batch_type: sorted
valid_batch_type: null
fold_length:
- 256000
sort_in_batch: descending
shuffle_within_batch: false
sort_batch: descending
multiple_iterator: false
chunk_length: 500
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
chunk_excluded_key_prefixes: []
chunk_default_fs: null
chunk_max_abs_length: null
chunk_discard_short_samples: true
train_data_path_and_name_and_type:
-   - dump_ark/raw/train_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/train_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/train_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/train_update/text
    - ref_text
    - text
valid_data_path_and_name_and_type:
-   - dump_ark/raw/dev_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/dev_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/text
    - ref_text
    - text
multi_task_dataset: false
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
allow_multi_rates: false
valid_max_cache_size: null
exclude_weight_decay: false
exclude_weight_decay_conf: {}
optim: adamw
optim_conf:
    lr: 0.001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
metric2id: dump_ark/raw/train_update/metric2id
metric2type: null
metric_pad_value: -100
token_list:
- <blank>
- <unk>
- s
- ▁
- t
- e
- ▁the
- i
- a
- o
- ▁a
- r
- ▁to
- d
- ▁and
- ''''
- m
- n
- ing
- u
- y
- p
- c
- ▁of
- l
- ed
- ▁I
- ▁in
- er
- re
- ▁it
- ▁you
- ar
- ▁f
- ▁is
- ▁that
- ','
- .
- in
- al
- g
- 'on'
- ▁b
- b
- or
- ▁c
- ▁s
- f
- h
- ▁we
- an
- en
- ▁for
- le
- ▁p
- ly
- es
- w
- ▁re
- ▁on
- ▁m
- ▁be
- ic
- ll
- th
- ▁he
- k
- ur
- ve
- ▁with
- ▁so
- ▁from
- ▁was
- v
- ch
- st
- ▁w
- ▁i
- ▁this
- ▁de
- ▁like
- ▁do
- ce
- at
- il
- ck
- ▁A
- ▁have
- ▁not
- ad
- ▁st
- ow
- ro
- ne
- ▁me
- ▁my
- ▁but
- ation
- ▁at
- ▁or
- '-'
- ter
- ent
- ▁B
- ▁n
- ▁know
- ▁t
- out
- ▁are
- nd
- ▁one
- ▁li
- ▁g
- ▁The
- ol
- ion
- te
- ▁go
- ut
- ▁as
- ▁just
- as
- ▁sh
- ▁they
- is
- ▁C
- et
- ▁h
- ▁an
- ▁there
- ▁up
- ▁S
- ▁M
- ▁she
- ▁by
- ▁su
- om
- ▁can
- us
- ▁your
- ng
- ▁con
- el
- ▁us
- ment
- z
- ▁see
- ▁ab
- ▁what
- ▁out
- ▁her
- me
- ate
- ▁all
- ▁th
- ▁if
- ▁right
- ▁his
- ▁ma
- ▁lo
- ▁which
- ide
- ▁P
- ▁more
- ▁then
- ul
- ast
- x
- ight
- ill
- ▁So
- ▁sp
- ▁going
- ▁some
- ure
- ▁their
- ig
- ▁no
- ▁ro
- ▁think
- ▁who
- ▁pro
- ver
- ive
- est
- ▁co
- ▁di
- '0'
- ist
- ▁k
- age
- ▁d
- ▁time
- ▁L
- ies
- ▁will
- ▁man
- ▁when
- ▁D
- les
- ▁F
- ▁want
- ff
- ity
- ▁un
- '?'
- ▁start
- ▁G
- ▁uh
- ▁get
- ok
- ▁take
- ▁po
- li
- ▁ho
- ▁way
- ▁don
- ▁yeah
- ▁really
- ▁say
- ▁look
- ▁good
- ▁ra
- ▁pr
- ▁had
- ttle
- ▁comp
- ort
- ish
- ▁ex
- ally
- ▁sa
- ▁how
- end
- ant
- ▁O
- ▁um
- way
- ance
- ▁other
- ▁two
- ine
- ever
- able
- ▁com
- other
- ▁first
- ▁back
- ▁al
- ers
- ions
- ▁now
- ▁off
- ning
- ▁down
- ▁has
- ▁than
- ▁car
- ▁Th
- very
- ice
- ▁dr
- ▁been
- ▁him
- ▁here
- ated
- '5'
- ▁hand
- ▁day
- ▁hear
- each
- ▁would
- ▁over
- ▁oh
- ▁cha
- ood
- ▁did
- ugh
- ▁per
- ▁let
- ▁str
- ▁tra
- ▁got
- ext
- '1'
- ▁We
- ▁Shields
- ▁come
- ▁should
- ▁could
- light
- '2'
- ▁people
- ▁again
- ▁year
- ▁app
- ▁into
- ▁any
- ▁N
- ▁mean
- ▁o
- ▁mus
- ▁lot
- ▁said
- ▁long
- ▁these
- ▁lea
- sh
- ▁vi
- ▁part
- ▁every
- ▁our
- ▁You
- ious
- ▁fight
- ▁Ch
- ark
- ▁may
- ▁Hammer
- ▁because
- ▁most
- ▁came
- ▁four
- ful
- ▁No
- ize
- ▁where
- ▁okay
- ▁much
- ▁ask
- ▁through
- ▁before
- ▁work
- ▁even
- ▁three
- mber
- ▁win
- ▁flight
- ake
- K
- ▁place
- ▁play
- ▁though
- ▁pound
- ▁bit
- land
- ▁va
- ▁talk
- ▁kind
- ▁Line
- ▁make
- hap
- ▁big
- ▁leav
- ▁something
- ▁game
- ▁under
- ▁feel
- self
- ▁give
- ▁includ
- U
- ▁twenty
- ▁guard
- ▁left
- ▁round
- ▁great
- body
- ▁gra
- ress
- lso
- '3'
- ▁everything
- ▁those
- ▁after
- ▁tell
- ▁need
- ▁yes
- qua
- ham
- ▁minutes
- ▁question
- ▁around
- ▁punch
- ▁course
- ▁gonna
- ▁person
- ▁move
- ▁plan
- ▁ear
- ept
- ▁Airport
- ▁Okay
- ▁found
- ▁seven
- ▁help
- que
- ▁qui
- ▁keep
- ▁guys
- ▁house
- ▁run
- ▁turn
- ▁better
- ▁stop
- ward
- ddle
- ▁second
- ground
- ▁world
- ▁high
- ▁point
- ▁hold
- ▁call
- '6'
- ▁actually
- ▁probably
- ▁heaven
- ▁speci
- ▁everyone
- ▁why
- ▁presen
- ▁thir
- lright
- ▁eye
- eath
- ▁Tak
- '!'
- '"'
- '4'
- ▁hundred
- ▁answer
- ▁small
- ▁wait
- ▁nothing
- q
- '8'
- V
- ▁countr
- ▁problem
- ▁continu
- ▁close
- ▁priva
- ▁20
- ▁pleas
- ▁walk
- ▁open
- ▁lay
- ▁Station
- ▁moment
- ▁Yeah
- ▁public
- possibl
- ▁happen
- together
- ▁while
- asically
- ▁money
- ▁wrong
- B
- ▁puzzle
- '7'
- ▁journ
- ▁rainbow
- ▁thousand
- I
- '9'
- S
- P
- '%'
- A
- D
- L
- F
- ’
- O
- G
- N
- á
- C
- $
- Z
- Y
- R
- E
- J
- W
- M
- H
- j
- –
- ;
- Q
- X
- ']'
- −
- '&'
- T
- '['
- <sos/eos>
init: xavier_uniform
model_conf: {}
use_ref_audio: true
use_ref_text: false
use_preprocessor: true
token_type: bpe
bpemodel: data/token_list/bpe_unigram500/bpe.model
non_linguistic_symbols: null
cleaner: null
g2p: null
frontend: s3prl
frontend_conf:
    frontend_conf:
        upstream: wavlm_large
    download_dir: ./hub
    multilayer_feature: true
universa: base
universa_conf:
    embedding_dim: 256
    audio_encoder_type: transformer
    audio_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: conv2d
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    text_encoder_type: transformer
    text_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: linear
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    cross_attention_type: multihead
    cross_attention_params:
        n_head: 4
        dropout_rate: 0.1
    pooling_type: mean
    projector_type: linear
    multi_branch: true
required:
- output_dir
- metric2id
version: '202412'
distributed: false
universa config (text)
config: conf/train_universa_wavlm_ref_audio.yaml
print_config: false
log_level: INFO
drop_last_iter: false
dry_run: false
iterator_type: sequence
valid_iterator_type: null
output_dir: update_exp/universa_train_universa_wavlm_ref_audio_raw_fs16000
ngpu: 1
seed: 777
num_workers: 1
num_att_plot: 0
dist_backend: nccl
dist_init_method: env://
dist_world_size: null
dist_rank: null
local_rank: 0
dist_master_addr: null
dist_master_port: null
dist_launcher: null
multiprocessing_distributed: false
unused_parameters: false
sharded_ddp: false
use_deepspeed: false
deepspeed_config: null
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: false
use_tf32: false
collect_stats: false
write_collected_feats: false
max_epoch: 100
patience: null
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
best_model_criterion:
-   - train
    - loss
    - min
-   - valid
    - loss
    - min
-   - train
    - acc
    - max
-   - valid
    - acc
    - max
keep_nbest_models: 5
nbest_averaging_interval: 0
grad_clip: -1
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
log_interval: 50
use_matplotlib: true
use_tensorboard: true
create_graph_in_tensorboard: false
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1
detect_anomaly: false
use_adapter: false
adapter: lora
save_strategy: all
adapter_conf: {}
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param:
- frontend.upstream
num_iters_per_epoch: null
batch_size: 16
valid_batch_size: null
batch_bins: 1000000
valid_batch_bins: null
category_sample_size: 10
train_shape_file:
- update_exp/universa_stats_raw/train/audio_shape
- update_exp/universa_stats_raw/train/ref_audio_shape
- update_exp/universa_stats_raw/train/ref_text_shape
valid_shape_file:
- update_exp/universa_stats_raw/valid/audio_shape
- update_exp/universa_stats_raw/valid/ref_audio_shape
- update_exp/universa_stats_raw/valid/ref_text_shape
batch_type: sorted
valid_batch_type: null
fold_length:
- 256000
sort_in_batch: descending
shuffle_within_batch: false
sort_batch: descending
multiple_iterator: false
chunk_length: 500
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
chunk_excluded_key_prefixes: []
chunk_default_fs: null
chunk_max_abs_length: null
chunk_discard_short_samples: true
train_data_path_and_name_and_type:
-   - dump_ark/raw/train_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/train_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/train_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/train_update/text
    - ref_text
    - text
valid_data_path_and_name_and_type:
-   - dump_ark/raw/dev_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/dev_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/text
    - ref_text
    - text
multi_task_dataset: false
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
allow_multi_rates: false
valid_max_cache_size: null
exclude_weight_decay: false
exclude_weight_decay_conf: {}
optim: adamw
optim_conf:
    lr: 0.001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
metric2id: dump_ark/raw/train_update/metric2id
metric2type: null
metric_pad_value: -100
token_list:
- <blank>
- <unk>
- s
- ▁
- t
- e
- ▁the
- i
- a
- o
- ▁a
- r
- ▁to
- d
- ▁and
- ''''
- m
- n
- ing
- u
- y
- p
- c
- ▁of
- l
- ed
- ▁I
- ▁in
- er
- re
- ▁it
- ▁you
- ar
- ▁f
- ▁is
- ▁that
- ','
- .
- in
- al
- g
- 'on'
- ▁b
- b
- or
- ▁c
- ▁s
- f
- h
- ▁we
- an
- en
- ▁for
- le
- ▁p
- ly
- es
- w
- ▁re
- ▁on
- ▁m
- ▁be
- ic
- ll
- th
- ▁he
- k
- ur
- ve
- ▁with
- ▁so
- ▁from
- ▁was
- v
- ch
- st
- ▁w
- ▁i
- ▁this
- ▁de
- ▁like
- ▁do
- ce
- at
- il
- ck
- ▁A
- ▁have
- ▁not
- ad
- ▁st
- ow
- ro
- ne
- ▁me
- ▁my
- ▁but
- ation
- ▁at
- ▁or
- '-'
- ter
- ent
- ▁B
- ▁n
- ▁know
- ▁t
- out
- ▁are
- nd
- ▁one
- ▁li
- ▁g
- ▁The
- ol
- ion
- te
- ▁go
- ut
- ▁as
- ▁just
- as
- ▁sh
- ▁they
- is
- ▁C
- et
- ▁h
- ▁an
- ▁there
- ▁up
- ▁S
- ▁M
- ▁she
- ▁by
- ▁su
- om
- ▁can
- us
- ▁your
- ng
- ▁con
- el
- ▁us
- ment
- z
- ▁see
- ▁ab
- ▁what
- ▁out
- ▁her
- me
- ate
- ▁all
- ▁th
- ▁if
- ▁right
- ▁his
- ▁ma
- ▁lo
- ▁which
- ide
- ▁P
- ▁more
- ▁then
- ul
- ast
- x
- ight
- ill
- ▁So
- ▁sp
- ▁going
- ▁some
- ure
- ▁their
- ig
- ▁no
- ▁ro
- ▁think
- ▁who
- ▁pro
- ver
- ive
- est
- ▁co
- ▁di
- '0'
- ist
- ▁k
- age
- ▁d
- ▁time
- ▁L
- ies
- ▁will
- ▁man
- ▁when
- ▁D
- les
- ▁F
- ▁want
- ff
- ity
- ▁un
- '?'
- ▁start
- ▁G
- ▁uh
- ▁get
- ok
- ▁take
- ▁po
- li
- ▁ho
- ▁way
- ▁don
- ▁yeah
- ▁really
- ▁say
- ▁look
- ▁good
- ▁ra
- ▁pr
- ▁had
- ttle
- ▁comp
- ort
- ish
- ▁ex
- ally
- ▁sa
- ▁how
- end
- ant
- ▁O
- ▁um
- way
- ance
- ▁other
- ▁two
- ine
- ever
- able
- ▁com
- other
- ▁first
- ▁back
- ▁al
- ers
- ions
- ▁now
- ▁off
- ning
- ▁down
- ▁has
- ▁than
- ▁car
- ▁Th
- very
- ice
- ▁dr
- ▁been
- ▁him
- ▁here
- ated
- '5'
- ▁hand
- ▁day
- ▁hear
- each
- ▁would
- ▁over
- ▁oh
- ▁cha
- ood
- ▁did
- ugh
- ▁per
- ▁let
- ▁str
- ▁tra
- ▁got
- ext
- '1'
- ▁We
- ▁Shields
- ▁come
- ▁should
- ▁could
- light
- '2'
- ▁people
- ▁again
- ▁year
- ▁app
- ▁into
- ▁any
- ▁N
- ▁mean
- ▁o
- ▁mus
- ▁lot
- ▁said
- ▁long
- ▁these
- ▁lea
- sh
- ▁vi
- ▁part
- ▁every
- ▁our
- ▁You
- ious
- ▁fight
- ▁Ch
- ark
- ▁may
- ▁Hammer
- ▁because
- ▁most
- ▁came
- ▁four
- ful
- ▁No
- ize
- ▁where
- ▁okay
- ▁much
- ▁ask
- ▁through
- ▁before
- ▁work
- ▁even
- ▁three
- mber
- ▁win
- ▁flight
- ake
- K
- ▁place
- ▁play
- ▁though
- ▁pound
- ▁bit
- land
- ▁va
- ▁talk
- ▁kind
- ▁Line
- ▁make
- hap
- ▁big
- ▁leav
- ▁something
- ▁game
- ▁under
- ▁feel
- self
- ▁give
- ▁includ
- U
- ▁twenty
- ▁guard
- ▁left
- ▁round
- ▁great
- body
- ▁gra
- ress
- lso
- '3'
- ▁everything
- ▁those
- ▁after
- ▁tell
- ▁need
- ▁yes
- qua
- ham
- ▁minutes
- ▁question
- ▁around
- ▁punch
- ▁course
- ▁gonna
- ▁person
- ▁move
- ▁plan
- ▁ear
- ept
- ▁Airport
- ▁Okay
- ▁found
- ▁seven
- ▁help
- que
- ▁qui
- ▁keep
- ▁guys
- ▁house
- ▁run
- ▁turn
- ▁better
- ▁stop
- ward
- ddle
- ▁second
- ground
- ▁world
- ▁high
- ▁point
- ▁hold
- ▁call
- '6'
- ▁actually
- ▁probably
- ▁heaven
- ▁speci
- ▁everyone
- ▁why
- ▁presen
- ▁thir
- lright
- ▁eye
- eath
- ▁Tak
- '!'
- '"'
- '4'
- ▁hundred
- ▁answer
- ▁small
- ▁wait
- ▁nothing
- q
- '8'
- V
- ▁countr
- ▁problem
- ▁continu
- ▁close
- ▁priva
- ▁20
- ▁pleas
- ▁walk
- ▁open
- ▁lay
- ▁Station
- ▁moment
- ▁Yeah
- ▁public
- possibl
- ▁happen
- together
- ▁while
- asically
- ▁money
- ▁wrong
- B
- ▁puzzle
- '7'
- ▁journ
- ▁rainbow
- ▁thousand
- I
- '9'
- S
- P
- '%'
- A
- D
- L
- F
- ’
- O
- G
- N
- á
- C
- $
- Z
- Y
- R
- E
- J
- W
- M
- H
- j
- –
- ;
- Q
- X
- ']'
- −
- '&'
- T
- '['
- <sos/eos>
init: xavier_uniform
model_conf: {}
use_ref_audio: true
use_ref_text: false
use_preprocessor: true
token_type: bpe
bpemodel: data/token_list/bpe_unigram500/bpe.model
non_linguistic_symbols: null
cleaner: null
g2p: null
frontend: s3prl
frontend_conf:
    frontend_conf:
        upstream: wavlm_large
    download_dir: ./hub
    multilayer_feature: true
universa: base
universa_conf:
    embedding_dim: 256
    audio_encoder_type: transformer
    audio_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: conv2d
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    text_encoder_type: transformer
    text_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: linear
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    cross_attention_type: multihead
    cross_attention_params:
        n_head: 4
        dropout_rate: 0.1
    pooling_type: mean
    projector_type: linear
    multi_branch: true
required:
- output_dir
- metric2id
version: '202412'
distributed: false
universa config (text)
config: conf/train_universa_wavlm_ref_audio.yaml
print_config: false
log_level: INFO
drop_last_iter: false
dry_run: false
iterator_type: sequence
valid_iterator_type: null
output_dir: update_exp/universa_train_universa_wavlm_ref_audio_raw_fs16000
ngpu: 1
seed: 777
num_workers: 1
num_att_plot: 0
dist_backend: nccl
dist_init_method: env://
dist_world_size: null
dist_rank: null
local_rank: 0
dist_master_addr: null
dist_master_port: null
dist_launcher: null
multiprocessing_distributed: false
unused_parameters: false
sharded_ddp: false
use_deepspeed: false
deepspeed_config: null
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: false
use_tf32: false
collect_stats: false
write_collected_feats: false
max_epoch: 100
patience: null
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
best_model_criterion:
-   - train
    - loss
    - min
-   - valid
    - loss
    - min
-   - train
    - acc
    - max
-   - valid
    - acc
    - max
keep_nbest_models: 5
nbest_averaging_interval: 0
grad_clip: -1
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
log_interval: 50
use_matplotlib: true
use_tensorboard: true
create_graph_in_tensorboard: false
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1
detect_anomaly: false
use_adapter: false
adapter: lora
save_strategy: all
adapter_conf: {}
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param:
- frontend.upstream
num_iters_per_epoch: null
batch_size: 16
valid_batch_size: null
batch_bins: 1000000
valid_batch_bins: null
category_sample_size: 10
train_shape_file:
- update_exp/universa_stats_raw/train/audio_shape
- update_exp/universa_stats_raw/train/ref_audio_shape
- update_exp/universa_stats_raw/train/ref_text_shape
valid_shape_file:
- update_exp/universa_stats_raw/valid/audio_shape
- update_exp/universa_stats_raw/valid/ref_audio_shape
- update_exp/universa_stats_raw/valid/ref_text_shape
batch_type: sorted
valid_batch_type: null
fold_length:
- 256000
sort_in_batch: descending
shuffle_within_batch: false
sort_batch: descending
multiple_iterator: false
chunk_length: 500
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
chunk_excluded_key_prefixes: []
chunk_default_fs: null
chunk_max_abs_length: null
chunk_discard_short_samples: true
train_data_path_and_name_and_type:
-   - dump_ark/raw/train_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/train_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/train_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/train_update/text
    - ref_text
    - text
valid_data_path_and_name_and_type:
-   - dump_ark/raw/dev_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/dev_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/text
    - ref_text
    - text
multi_task_dataset: false
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
allow_multi_rates: false
valid_max_cache_size: null
exclude_weight_decay: false
exclude_weight_decay_conf: {}
optim: adamw
optim_conf:
    lr: 0.001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
metric2id: dump_ark/raw/train_update/metric2id
metric2type: null
metric_pad_value: -100
token_list:
- <blank>
- <unk>
- s
- ▁
- t
- e
- ▁the
- i
- a
- o
- ▁a
- r
- ▁to
- d
- ▁and
- ''''
- m
- n
- ing
- u
- y
- p
- c
- ▁of
- l
- ed
- ▁I
- ▁in
- er
- re
- ▁it
- ▁you
- ar
- ▁f
- ▁is
- ▁that
- ','
- .
- in
- al
- g
- 'on'
- ▁b
- b
- or
- ▁c
- ▁s
- f
- h
- ▁we
- an
- en
- ▁for
- le
- ▁p
- ly
- es
- w
- ▁re
- ▁on
- ▁m
- ▁be
- ic
- ll
- th
- ▁he
- k
- ur
- ve
- ▁with
- ▁so
- ▁from
- ▁was
- v
- ch
- st
- ▁w
- ▁i
- ▁this
- ▁de
- ▁like
- ▁do
- ce
- at
- il
- ck
- ▁A
- ▁have
- ▁not
- ad
- ▁st
- ow
- ro
- ne
- ▁me
- ▁my
- ▁but
- ation
- ▁at
- ▁or
- '-'
- ter
- ent
- ▁B
- ▁n
- ▁know
- ▁t
- out
- ▁are
- nd
- ▁one
- ▁li
- ▁g
- ▁The
- ol
- ion
- te
- ▁go
- ut
- ▁as
- ▁just
- as
- ▁sh
- ▁they
- is
- ▁C
- et
- ▁h
- ▁an
- ▁there
- ▁up
- ▁S
- ▁M
- ▁she
- ▁by
- ▁su
- om
- ▁can
- us
- ▁your
- ng
- ▁con
- el
- ▁us
- ment
- z
- ▁see
- ▁ab
- ▁what
- ▁out
- ▁her
- me
- ate
- ▁all
- ▁th
- ▁if
- ▁right
- ▁his
- ▁ma
- ▁lo
- ▁which
- ide
- ▁P
- ▁more
- ▁then
- ul
- ast
- x
- ight
- ill
- ▁So
- ▁sp
- ▁going
- ▁some
- ure
- ▁their
- ig
- ▁no
- ▁ro
- ▁think
- ▁who
- ▁pro
- ver
- ive
- est
- ▁co
- ▁di
- '0'
- ist
- ▁k
- age
- ▁d
- ▁time
- ▁L
- ies
- ▁will
- ▁man
- ▁when
- ▁D
- les
- ▁F
- ▁want
- ff
- ity
- ▁un
- '?'
- ▁start
- ▁G
- ▁uh
- ▁get
- ok
- ▁take
- ▁po
- li
- ▁ho
- ▁way
- ▁don
- ▁yeah
- ▁really
- ▁say
- ▁look
- ▁good
- ▁ra
- ▁pr
- ▁had
- ttle
- ▁comp
- ort
- ish
- ▁ex
- ally
- ▁sa
- ▁how
- end
- ant
- ▁O
- ▁um
- way
- ance
- ▁other
- ▁two
- ine
- ever
- able
- ▁com
- other
- ▁first
- ▁back
- ▁al
- ers
- ions
- ▁now
- ▁off
- ning
- ▁down
- ▁has
- ▁than
- ▁car
- ▁Th
- very
- ice
- ▁dr
- ▁been
- ▁him
- ▁here
- ated
- '5'
- ▁hand
- ▁day
- ▁hear
- each
- ▁would
- ▁over
- ▁oh
- ▁cha
- ood
- ▁did
- ugh
- ▁per
- ▁let
- ▁str
- ▁tra
- ▁got
- ext
- '1'
- ▁We
- ▁Shields
- ▁come
- ▁should
- ▁could
- light
- '2'
- ▁people
- ▁again
- ▁year
- ▁app
- ▁into
- ▁any
- ▁N
- ▁mean
- ▁o
- ▁mus
- ▁lot
- ▁said
- ▁long
- ▁these
- ▁lea
- sh
- ▁vi
- ▁part
- ▁every
- ▁our
- ▁You
- ious
- ▁fight
- ▁Ch
- ark
- ▁may
- ▁Hammer
- ▁because
- ▁most
- ▁came
- ▁four
- ful
- ▁No
- ize
- ▁where
- ▁okay
- ▁much
- ▁ask
- ▁through
- ▁before
- ▁work
- ▁even
- ▁three
- mber
- ▁win
- ▁flight
- ake
- K
- ▁place
- ▁play
- ▁though
- ▁pound
- ▁bit
- land
- ▁va
- ▁talk
- ▁kind
- ▁Line
- ▁make
- hap
- ▁big
- ▁leav
- ▁something
- ▁game
- ▁under
- ▁feel
- self
- ▁give
- ▁includ
- U
- ▁twenty
- ▁guard
- ▁left
- ▁round
- ▁great
- body
- ▁gra
- ress
- lso
- '3'
- ▁everything
- ▁those
- ▁after
- ▁tell
- ▁need
- ▁yes
- qua
- ham
- ▁minutes
- ▁question
- ▁around
- ▁punch
- ▁course
- ▁gonna
- ▁person
- ▁move
- ▁plan
- ▁ear
- ept
- ▁Airport
- ▁Okay
- ▁found
- ▁seven
- ▁help
- que
- ▁qui
- ▁keep
- ▁guys
- ▁house
- ▁run
- ▁turn
- ▁better
- ▁stop
- ward
- ddle
- ▁second
- ground
- ▁world
- ▁high
- ▁point
- ▁hold
- ▁call
- '6'
- ▁actually
- ▁probably
- ▁heaven
- ▁speci
- ▁everyone
- ▁why
- ▁presen
- ▁thir
- lright
- ▁eye
- eath
- ▁Tak
- '!'
- '"'
- '4'
- ▁hundred
- ▁answer
- ▁small
- ▁wait
- ▁nothing
- q
- '8'
- V
- ▁countr
- ▁problem
- ▁continu
- ▁close
- ▁priva
- ▁20
- ▁pleas
- ▁walk
- ▁open
- ▁lay
- ▁Station
- ▁moment
- ▁Yeah
- ▁public
- possibl
- ▁happen
- together
- ▁while
- asically
- ▁money
- ▁wrong
- B
- ▁puzzle
- '7'
- ▁journ
- ▁rainbow
- ▁thousand
- I
- '9'
- S
- P
- '%'
- A
- D
- L
- F
- ’
- O
- G
- N
- á
- C
- $
- Z
- Y
- R
- E
- J
- W
- M
- H
- j
- –
- ;
- Q
- X
- ']'
- −
- '&'
- T
- '['
- <sos/eos>
init: xavier_uniform
model_conf: {}
use_ref_audio: true
use_ref_text: false
use_preprocessor: true
token_type: bpe
bpemodel: data/token_list/bpe_unigram500/bpe.model
non_linguistic_symbols: null
cleaner: null
g2p: null
frontend: s3prl
frontend_conf:
    frontend_conf:
        upstream: wavlm_large
    download_dir: ./hub
    multilayer_feature: true
universa: base
universa_conf:
    embedding_dim: 256
    audio_encoder_type: transformer
    audio_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: conv2d
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    text_encoder_type: transformer
    text_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: linear
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    cross_attention_type: multihead
    cross_attention_params:
        n_head: 4
        dropout_rate: 0.1
    pooling_type: mean
    projector_type: linear
    multi_branch: true
required:
- output_dir
- metric2id
version: '202412'
distributed: false
universa configtext
config: conf/train_universa_wavlm_ref_audio.yaml
print_config: false
log_level: INFO
drop_last_iter: false
dry_run: false
iterator_type: sequence
valid_iterator_type: null
output_dir: update_exp/universa_train_universa_wavlm_ref_audio_raw_fs16000
ngpu: 1
seed: 777
num_workers: 1
num_att_plot: 0
dist_backend: nccl
dist_init_method: env://
dist_world_size: null
dist_rank: null
local_rank: 0
dist_master_addr: null
dist_master_port: null
dist_launcher: null
multiprocessing_distributed: false
unused_parameters: false
sharded_ddp: false
use_deepspeed: false
deepspeed_config: null
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: false
use_tf32: false
collect_stats: false
write_collected_feats: false
max_epoch: 100
patience: null
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
best_model_criterion:
-   - train
    - loss
    - min
-   - valid
    - loss
    - min
-   - train
    - acc
    - max
-   - valid
    - acc
    - max
keep_nbest_models: 5
nbest_averaging_interval: 0
grad_clip: -1
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
log_interval: 50
use_matplotlib: true
use_tensorboard: true
create_graph_in_tensorboard: false
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1
detect_anomaly: false
use_adapter: false
adapter: lora
save_strategy: all
adapter_conf: {}
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param:
- frontend.upstream
num_iters_per_epoch: null
batch_size: 16
valid_batch_size: null
batch_bins: 1000000
valid_batch_bins: null
category_sample_size: 10
train_shape_file:
- update_exp/universa_stats_raw/train/audio_shape
- update_exp/universa_stats_raw/train/ref_audio_shape
- update_exp/universa_stats_raw/train/ref_text_shape
valid_shape_file:
- update_exp/universa_stats_raw/valid/audio_shape
- update_exp/universa_stats_raw/valid/ref_audio_shape
- update_exp/universa_stats_raw/valid/ref_text_shape
batch_type: sorted
valid_batch_type: null
fold_length:
- 256000
sort_in_batch: descending
shuffle_within_batch: false
sort_batch: descending
multiple_iterator: false
chunk_length: 500
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
chunk_excluded_key_prefixes: []
chunk_default_fs: null
chunk_max_abs_length: null
chunk_discard_short_samples: true
train_data_path_and_name_and_type:
-   - dump_ark/raw/train_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/train_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/train_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/train_update/text
    - ref_text
    - text
valid_data_path_and_name_and_type:
-   - dump_ark/raw/dev_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/dev_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/text
    - ref_text
    - text
multi_task_dataset: false
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
allow_multi_rates: false
valid_max_cache_size: null
exclude_weight_decay: false
exclude_weight_decay_conf: {}
optim: adamw
optim_conf:
    lr: 0.001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
metric2id: dump_ark/raw/train_update/metric2id
metric2type: null
metric_pad_value: -100
token_list:
- <blank>
- <unk>
- s
- ▁
- t
- e
- ▁the
- i
- a
- o
- ▁a
- r
- ▁to
- d
- ▁and
- ''''
- m
- n
- ing
- u
- y
- p
- c
- ▁of
- l
- ed
- ▁I
- ▁in
- er
- re
- ▁it
- ▁you
- ar
- ▁f
- ▁is
- ▁that
- ','
- .
- in
- al
- g
- 'on'
- ▁b
- b
- or
- ▁c
- ▁s
- f
- h
- ▁we
- an
- en
- ▁for
- le
- ▁p
- ly
- es
- w
- ▁re
- ▁on
- ▁m
- ▁be
- ic
- ll
- th
- ▁he
- k
- ur
- ve
- ▁with
- ▁so
- ▁from
- ▁was
- v
- ch
- st
- ▁w
- ▁i
- ▁this
- ▁de
- ▁like
- ▁do
- ce
- at
- il
- ck
- ▁A
- ▁have
- ▁not
- ad
- ▁st
- ow
- ro
- ne
- ▁me
- ▁my
- ▁but
- ation
- ▁at
- ▁or
- '-'
- ter
- ent
- ▁B
- ▁n
- ▁know
- ▁t
- out
- ▁are
- nd
- ▁one
- ▁li
- ▁g
- ▁The
- ol
- ion
- te
- ▁go
- ut
- ▁as
- ▁just
- as
- ▁sh
- ▁they
- is
- ▁C
- et
- ▁h
- ▁an
- ▁there
- ▁up
- ▁S
- ▁M
- ▁she
- ▁by
- ▁su
- om
- ▁can
- us
- ▁your
- ng
- ▁con
- el
- ▁us
- ment
- z
- ▁see
- ▁ab
- ▁what
- ▁out
- ▁her
- me
- ate
- ▁all
- ▁th
- ▁if
- ▁right
- ▁his
- ▁ma
- ▁lo
- ▁which
- ide
- ▁P
- ▁more
- ▁then
- ul
- ast
- x
- ight
- ill
- ▁So
- ▁sp
- ▁going
- ▁some
- ure
- ▁their
- ig
- ▁no
- ▁ro
- ▁think
- ▁who
- ▁pro
- ver
- ive
- est
- ▁co
- ▁di
- '0'
- ist
- ▁k
- age
- ▁d
- ▁time
- ▁L
- ies
- ▁will
- ▁man
- ▁when
- ▁D
- les
- ▁F
- ▁want
- ff
- ity
- ▁un
- '?'
- ▁start
- ▁G
- ▁uh
- ▁get
- ok
- ▁take
- ▁po
- li
- ▁ho
- ▁way
- ▁don
- ▁yeah
- ▁really
- ▁say
- ▁look
- ▁good
- ▁ra
- ▁pr
- ▁had
- ttle
- ▁comp
- ort
- ish
- ▁ex
- ally
- ▁sa
- ▁how
- end
- ant
- ▁O
- ▁um
- way
- ance
- ▁other
- ▁two
- ine
- ever
- able
- ▁com
- other
- ▁first
- ▁back
- ▁al
- ers
- ions
- ▁now
- ▁off
- ning
- ▁down
- ▁has
- ▁than
- ▁car
- ▁Th
- very
- ice
- ▁dr
- ▁been
- ▁him
- ▁here
- ated
- '5'
- ▁hand
- ▁day
- ▁hear
- each
- ▁would
- ▁over
- ▁oh
- ▁cha
- ood
- ▁did
- ugh
- ▁per
- ▁let
- ▁str
- ▁tra
- ▁got
- ext
- '1'
- ▁We
- ▁Shields
- ▁come
- ▁should
- ▁could
- light
- '2'
- ▁people
- ▁again
- ▁year
- ▁app
- ▁into
- ▁any
- ▁N
- ▁mean
- ▁o
- ▁mus
- ▁lot
- ▁said
- ▁long
- ▁these
- ▁lea
- sh
- ▁vi
- ▁part
- ▁every
- ▁our
- ▁You
- ious
- ▁fight
- ▁Ch
- ark
- ▁may
- ▁Hammer
- ▁because
- ▁most
- ▁came
- ▁four
- ful
- ▁No
- ize
- ▁where
- ▁okay
- ▁much
- ▁ask
- ▁through
- ▁before
- ▁work
- ▁even
- ▁three
- mber
- ▁win
- ▁flight
- ake
- K
- ▁place
- ▁play
- ▁though
- ▁pound
- ▁bit
- land
- ▁va
- ▁talk
- ▁kind
- ▁Line
- ▁make
- hap
- ▁big
- ▁leav
- ▁something
- ▁game
- ▁under
- ▁feel
- self
- ▁give
- ▁includ
- U
- ▁twenty
- ▁guard
- ▁left
- ▁round
- ▁great
- body
- ▁gra
- ress
- lso
- '3'
- ▁everything
- ▁those
- ▁after
- ▁tell
- ▁need
- ▁yes
- qua
- ham
- ▁minutes
- ▁question
- ▁around
- ▁punch
- ▁course
- ▁gonna
- ▁person
- ▁move
- ▁plan
- ▁ear
- ept
- ▁Airport
- ▁Okay
- ▁found
- ▁seven
- ▁help
- que
- ▁qui
- ▁keep
- ▁guys
- ▁house
- ▁run
- ▁turn
- ▁better
- ▁stop
- ward
- ddle
- ▁second
- ground
- ▁world
- ▁high
- ▁point
- ▁hold
- ▁call
- '6'
- ▁actually
- ▁probably
- ▁heaven
- ▁speci
- ▁everyone
- ▁why
- ▁presen
- ▁thir
- lright
- ▁eye
- eath
- ▁Tak
- '!'
- '"'
- '4'
- ▁hundred
- ▁answer
- ▁small
- ▁wait
- ▁nothing
- q
- '8'
- V
- ▁countr
- ▁problem
- ▁continu
- ▁close
- ▁priva
- ▁20
- ▁pleas
- ▁walk
- ▁open
- ▁lay
- ▁Station
- ▁moment
- ▁Yeah
- ▁public
- possibl
- ▁happen
- together
- ▁while
- asically
- ▁money
- ▁wrong
- B
- ▁puzzle
- '7'
- ▁journ
- ▁rainbow
- ▁thousand
- I
- '9'
- S
- P
- '%'
- A
- D
- L
- F
- ’
- O
- G
- N
- á
- C
- $
- Z
- Y
- R
- E
- J
- W
- M
- H
- j
- –
- ;
- Q
- X
- ']'
- −
- '&'
- T
- '['
- <sos/eos>
init: xavier_uniform
model_conf: {}
use_ref_audio: true
use_ref_text: false
use_preprocessor: true
token_type: bpe
bpemodel: data/token_list/bpe_unigram500/bpe.model
non_linguistic_symbols: null
cleaner: null
g2p: null
frontend: s3prl
frontend_conf:
    frontend_conf:
        upstream: wavlm_large
    download_dir: ./hub
    multilayer_feature: true
universa: base
universa_conf:
    embedding_dim: 256
    audio_encoder_type: transformer
    audio_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: conv2d
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    text_encoder_type: transformer
    text_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: linear
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    cross_attention_type: multihead
    cross_attention_params:
        n_head: 4
        dropout_rate: 0.1
    pooling_type: mean
    projector_type: linear
    multi_branch: true
required:
- output_dir
- metric2id
version: '202412'
distributed: false
universa configtext
config: conf/train_universa_wavlm_ref_audio.yaml
print_config: false
log_level: INFO
drop_last_iter: false
dry_run: false
iterator_type: sequence
valid_iterator_type: null
output_dir: update_exp/universa_train_universa_wavlm_ref_audio_raw_fs16000
ngpu: 1
seed: 777
num_workers: 1
num_att_plot: 0
dist_backend: nccl
dist_init_method: env://
dist_world_size: null
dist_rank: null
local_rank: 0
dist_master_addr: null
dist_master_port: null
dist_launcher: null
multiprocessing_distributed: false
unused_parameters: false
sharded_ddp: false
use_deepspeed: false
deepspeed_config: null
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: false
use_tf32: false
collect_stats: false
write_collected_feats: false
max_epoch: 100
patience: null
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
best_model_criterion:
-   - train
    - loss
    - min
-   - valid
    - loss
    - min
-   - train
    - acc
    - max
-   - valid
    - acc
    - max
keep_nbest_models: 5
nbest_averaging_interval: 0
grad_clip: -1
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
log_interval: 50
use_matplotlib: true
use_tensorboard: true
create_graph_in_tensorboard: false
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1
detect_anomaly: false
use_adapter: false
adapter: lora
save_strategy: all
adapter_conf: {}
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param:
- frontend.upstream
num_iters_per_epoch: null
batch_size: 16
valid_batch_size: null
batch_bins: 1000000
valid_batch_bins: null
category_sample_size: 10
train_shape_file:
- update_exp/universa_stats_raw/train/audio_shape
- update_exp/universa_stats_raw/train/ref_audio_shape
- update_exp/universa_stats_raw/train/ref_text_shape
valid_shape_file:
- update_exp/universa_stats_raw/valid/audio_shape
- update_exp/universa_stats_raw/valid/ref_audio_shape
- update_exp/universa_stats_raw/valid/ref_text_shape
batch_type: sorted
valid_batch_type: null
fold_length:
- 256000
sort_in_batch: descending
shuffle_within_batch: false
sort_batch: descending
multiple_iterator: false
chunk_length: 500
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
chunk_excluded_key_prefixes: []
chunk_default_fs: null
chunk_max_abs_length: null
chunk_discard_short_samples: true
train_data_path_and_name_and_type:
-   - dump_ark/raw/train_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/train_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/train_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/train_update/text
    - ref_text
    - text
valid_data_path_and_name_and_type:
-   - dump_ark/raw/dev_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/dev_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/text
    - ref_text
    - text
multi_task_dataset: false
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
allow_multi_rates: false
valid_max_cache_size: null
exclude_weight_decay: false
exclude_weight_decay_conf: {}
optim: adamw
optim_conf:
    lr: 0.001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
metric2id: dump_ark/raw/train_update/metric2id
metric2type: null
metric_pad_value: -100
token_list:
- <blank>
- <unk>
- s
- ▁
- t
- e
- ▁the
- i
- a
- o
- ▁a
- r
- ▁to
- d
- ▁and
- ''''
- m
- n
- ing
- u
- y
- p
- c
- ▁of
- l
- ed
- ▁I
- ▁in
- er
- re
- ▁it
- ▁you
- ar
- ▁f
- ▁is
- ▁that
- ','
- .
- in
- al
- g
- 'on'
- ▁b
- b
- or
- ▁c
- ▁s
- f
- h
- ▁we
- an
- en
- ▁for
- le
- ▁p
- ly
- es
- w
- ▁re
- ▁on
- ▁m
- ▁be
- ic
- ll
- th
- ▁he
- k
- ur
- ve
- ▁with
- ▁so
- ▁from
- ▁was
- v
- ch
- st
- ▁w
- ▁i
- ▁this
- ▁de
- ▁like
- ▁do
- ce
- at
- il
- ck
- ▁A
- ▁have
- ▁not
- ad
- ▁st
- ow
- ro
- ne
- ▁me
- ▁my
- ▁but
- ation
- ▁at
- ▁or
- '-'
- ter
- ent
- ▁B
- ▁n
- ▁know
- ▁t
- out
- ▁are
- nd
- ▁one
- ▁li
- ▁g
- ▁The
- ol
- ion
- te
- ▁go
- ut
- ▁as
- ▁just
- as
- ▁sh
- ▁they
- is
- ▁C
- et
- ▁h
- ▁an
- ▁there
- ▁up
- ▁S
- ▁M
- ▁she
- ▁by
- ▁su
- om
- ▁can
- us
- ▁your
- ng
- ▁con
- el
- ▁us
- ment
- z
- ▁see
- ▁ab
- ▁what
- ▁out
- ▁her
- me
- ate
- ▁all
- ▁th
- ▁if
- ▁right
- ▁his
- ▁ma
- ▁lo
- ▁which
- ide
- ▁P
- ▁more
- ▁then
- ul
- ast
- x
- ight
- ill
- ▁So
- ▁sp
- ▁going
- ▁some
- ure
- ▁their
- ig
- ▁no
- ▁ro
- ▁think
- ▁who
- ▁pro
- ver
- ive
- est
- ▁co
- ▁di
- '0'
- ist
- ▁k
- age
- ▁d
- ▁time
- ▁L
- ies
- ▁will
- ▁man
- ▁when
- ▁D
- les
- ▁F
- ▁want
- ff
- ity
- ▁un
- '?'
- ▁start
- ▁G
- ▁uh
- ▁get
- ok
- ▁take
- ▁po
- li
- ▁ho
- ▁way
- ▁don
- ▁yeah
- ▁really
- ▁say
- ▁look
- ▁good
- ▁ra
- ▁pr
- ▁had
- ttle
- ▁comp
- ort
- ish
- ▁ex
- ally
- ▁sa
- ▁how
- end
- ant
- ▁O
- ▁um
- way
- ance
- ▁other
- ▁two
- ine
- ever
- able
- ▁com
- other
- ▁first
- ▁back
- ▁al
- ers
- ions
- ▁now
- ▁off
- ning
- ▁down
- ▁has
- ▁than
- ▁car
- ▁Th
- very
- ice
- ▁dr
- ▁been
- ▁him
- ▁here
- ated
- '5'
- ▁hand
- ▁day
- ▁hear
- each
- ▁would
- ▁over
- ▁oh
- ▁cha
- ood
- ▁did
- ugh
- ▁per
- ▁let
- ▁str
- ▁tra
- ▁got
- ext
- '1'
- ▁We
- ▁Shields
- ▁come
- ▁should
- ▁could
- light
- '2'
- ▁people
- ▁again
- ▁year
- ▁app
- ▁into
- ▁any
- ▁N
- ▁mean
- ▁o
- ▁mus
- ▁lot
- ▁said
- ▁long
- ▁these
- ▁lea
- sh
- ▁vi
- ▁part
- ▁every
- ▁our
- ▁You
- ious
- ▁fight
- ▁Ch
- ark
- ▁may
- ▁Hammer
- ▁because
- ▁most
- ▁came
- ▁four
- ful
- ▁No
- ize
- ▁where
- ▁okay
- ▁much
- ▁ask
- ▁through
- ▁before
- ▁work
- ▁even
- ▁three
- mber
- ▁win
- ▁flight
- ake
- K
- ▁place
- ▁play
- ▁though
- ▁pound
- ▁bit
- land
- ▁va
- ▁talk
- ▁kind
- ▁Line
- ▁make
- hap
- ▁big
- ▁leav
- ▁something
- ▁game
- ▁under
- ▁feel
- self
- ▁give
- ▁includ
- U
- ▁twenty
- ▁guard
- ▁left
- ▁round
- ▁great
- body
- ▁gra
- ress
- lso
- '3'
- ▁everything
- ▁those
- ▁after
- ▁tell
- ▁need
- ▁yes
- qua
- ham
- ▁minutes
- ▁question
- ▁around
- ▁punch
- ▁course
- ▁gonna
- ▁person
- ▁move
- ▁plan
- ▁ear
- ept
- ▁Airport
- ▁Okay
- ▁found
- ▁seven
- ▁help
- que
- ▁qui
- ▁keep
- ▁guys
- ▁house
- ▁run
- ▁turn
- ▁better
- ▁stop
- ward
- ddle
- ▁second
- ground
- ▁world
- ▁high
- ▁point
- ▁hold
- ▁call
- '6'
- ▁actually
- ▁probably
- ▁heaven
- ▁speci
- ▁everyone
- ▁why
- ▁presen
- ▁thir
- lright
- ▁eye
- eath
- ▁Tak
- '!'
- '"'
- '4'
- ▁hundred
- ▁answer
- ▁small
- ▁wait
- ▁nothing
- q
- '8'
- V
- ▁countr
- ▁problem
- ▁continu
- ▁close
- ▁priva
- ▁20
- ▁pleas
- ▁walk
- ▁open
- ▁lay
- ▁Station
- ▁moment
- ▁Yeah
- ▁public
- possibl
- ▁happen
- together
- ▁while
- asically
- ▁money
- ▁wrong
- B
- ▁puzzle
- '7'
- ▁journ
- ▁rainbow
- ▁thousand
- I
- '9'
- S
- P
- '%'
- A
- D
- L
- F
- ’
- O
- G
- N
- á
- C
- $
- Z
- Y
- R
- E
- J
- W
- M
- H
- j
- –
- ;
- Q
- X
- ']'
- −
- '&'
- T
- '['
- <sos/eos>
init: xavier_uniform
model_conf: {}
use_ref_audio: true
use_ref_text: false
use_preprocessor: true
token_type: bpe
bpemodel: data/token_list/bpe_unigram500/bpe.model
non_linguistic_symbols: null
cleaner: null
g2p: null
frontend: s3prl
frontend_conf:
    frontend_conf:
        upstream: wavlm_large
    download_dir: ./hub
    multilayer_feature: true
universa: base
universa_conf:
    embedding_dim: 256
    audio_encoder_type: transformer
    audio_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: conv2d
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    text_encoder_type: transformer
    text_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: linear
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    cross_attention_type: multihead
    cross_attention_params:
        n_head: 4
        dropout_rate: 0.1
    pooling_type: mean
    projector_type: linear
    multi_branch: true
required:
- output_dir
- metric2id
version: '202412'
distributed: false
universa configtext
config: conf/train_universa_wavlm_ref_audio.yaml
print_config: false
log_level: INFO
drop_last_iter: false
dry_run: false
iterator_type: sequence
valid_iterator_type: null
output_dir: update_exp/universa_train_universa_wavlm_ref_audio_raw_fs16000
ngpu: 1
seed: 777
num_workers: 1
num_att_plot: 0
dist_backend: nccl
dist_init_method: env://
dist_world_size: null
dist_rank: null
local_rank: 0
dist_master_addr: null
dist_master_port: null
dist_launcher: null
multiprocessing_distributed: false
unused_parameters: false
sharded_ddp: false
use_deepspeed: false
deepspeed_config: null
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: false
use_tf32: false
collect_stats: false
write_collected_feats: false
max_epoch: 100
patience: null
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
best_model_criterion:
-   - train
    - loss
    - min
-   - valid
    - loss
    - min
-   - train
    - acc
    - max
-   - valid
    - acc
    - max
keep_nbest_models: 5
nbest_averaging_interval: 0
grad_clip: -1
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
log_interval: 50
use_matplotlib: true
use_tensorboard: true
create_graph_in_tensorboard: false
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1
detect_anomaly: false
use_adapter: false
adapter: lora
save_strategy: all
adapter_conf: {}
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param:
- frontend.upstream
num_iters_per_epoch: null
batch_size: 16
valid_batch_size: null
batch_bins: 1000000
valid_batch_bins: null
category_sample_size: 10
train_shape_file:
- update_exp/universa_stats_raw/train/audio_shape
- update_exp/universa_stats_raw/train/ref_audio_shape
- update_exp/universa_stats_raw/train/ref_text_shape
valid_shape_file:
- update_exp/universa_stats_raw/valid/audio_shape
- update_exp/universa_stats_raw/valid/ref_audio_shape
- update_exp/universa_stats_raw/valid/ref_text_shape
batch_type: sorted
valid_batch_type: null
fold_length:
- 256000
sort_in_batch: descending
shuffle_within_batch: false
sort_batch: descending
multiple_iterator: false
chunk_length: 500
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
chunk_excluded_key_prefixes: []
chunk_default_fs: null
chunk_max_abs_length: null
chunk_discard_short_samples: true
train_data_path_and_name_and_type:
-   - dump_ark/raw/train_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/train_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/train_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/train_update/text
    - ref_text
    - text
valid_data_path_and_name_and_type:
-   - dump_ark/raw/dev_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/dev_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/text
    - ref_text
    - text
multi_task_dataset: false
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
allow_multi_rates: false
valid_max_cache_size: null
exclude_weight_decay: false
exclude_weight_decay_conf: {}
optim: adamw
optim_conf:
    lr: 0.001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
metric2id: dump_ark/raw/train_update/metric2id
metric2type: null
metric_pad_value: -100
token_list:
- <blank>
- <unk>
- s
- ▁
- t
- e
- ▁the
- i
- a
- o
- ▁a
- r
- ▁to
- d
- ▁and
- ''''
- m
- n
- ing
- u
- y
- p
- c
- ▁of
- l
- ed
- ▁I
- ▁in
- er
- re
- ▁it
- ▁you
- ar
- ▁f
- ▁is
- ▁that
- ','
- .
- in
- al
- g
- 'on'
- ▁b
- b
- or
- ▁c
- ▁s
- f
- h
- ▁we
- an
- en
- ▁for
- le
- ▁p
- ly
- es
- w
- ▁re
- ▁on
- ▁m
- ▁be
- ic
- ll
- th
- ▁he
- k
- ur
- ve
- ▁with
- ▁so
- ▁from
- ▁was
- v
- ch
- st
- ▁w
- ▁i
- ▁this
- ▁de
- ▁like
- ▁do
- ce
- at
- il
- ck
- ▁A
- ▁have
- ▁not
- ad
- ▁st
- ow
- ro
- ne
- ▁me
- ▁my
- ▁but
- ation
- ▁at
- ▁or
- '-'
- ter
- ent
- ▁B
- ▁n
- ▁know
- ▁t
- out
- ▁are
- nd
- ▁one
- ▁li
- ▁g
- ▁The
- ol
- ion
- te
- ▁go
- ut
- ▁as
- ▁just
- as
- ▁sh
- ▁they
- is
- ▁C
- et
- ▁h
- ▁an
- ▁there
- ▁up
- ▁S
- ▁M
- ▁she
- ▁by
- ▁su
- om
- ▁can
- us
- ▁your
- ng
- ▁con
- el
- ▁us
- ment
- z
- ▁see
- ▁ab
- ▁what
- ▁out
- ▁her
- me
- ate
- ▁all
- ▁th
- ▁if
- ▁right
- ▁his
- ▁ma
- ▁lo
- ▁which
- ide
- ▁P
- ▁more
- ▁then
- ul
- ast
- x
- ight
- ill
- ▁So
- ▁sp
- ▁going
- ▁some
- ure
- ▁their
- ig
- ▁no
- ▁ro
- ▁think
- ▁who
- ▁pro
- ver
- ive
- est
- ▁co
- ▁di
- '0'
- ist
- ▁k
- age
- ▁d
- ▁time
- ▁L
- ies
- ▁will
- ▁man
- ▁when
- ▁D
- les
- ▁F
- ▁want
- ff
- ity
- ▁un
- '?'
- ▁start
- ▁G
- ▁uh
- ▁get
- ok
- ▁take
- ▁po
- li
- ▁ho
- ▁way
- ▁don
- ▁yeah
- ▁really
- ▁say
- ▁look
- ▁good
- ▁ra
- ▁pr
- ▁had
- ttle
- ▁comp
- ort
- ish
- ▁ex
- ally
- ▁sa
- ▁how
- end
- ant
- ▁O
- ▁um
- way
- ance
- ▁other
- ▁two
- ine
- ever
- able
- ▁com
- other
- ▁first
- ▁back
- ▁al
- ers
- ions
- ▁now
- ▁off
- ning
- ▁down
- ▁has
- ▁than
- ▁car
- ▁Th
- very
- ice
- ▁dr
- ▁been
- ▁him
- ▁here
- ated
- '5'
- ▁hand
- ▁day
- ▁hear
- each
- ▁would
- ▁over
- ▁oh
- ▁cha
- ood
- ▁did
- ugh
- ▁per
- ▁let
- ▁str
- ▁tra
- ▁got
- ext
- '1'
- ▁We
- ▁Shields
- ▁come
- ▁should
- ▁could
- light
- '2'
- ▁people
- ▁again
- ▁year
- ▁app
- ▁into
- ▁any
- ▁N
- ▁mean
- ▁o
- ▁mus
- ▁lot
- ▁said
- ▁long
- ▁these
- ▁lea
- sh
- ▁vi
- ▁part
- ▁every
- ▁our
- ▁You
- ious
- ▁fight
- ▁Ch
- ark
- ▁may
- ▁Hammer
- ▁because
- ▁most
- ▁came
- ▁four
- ful
- ▁No
- ize
- ▁where
- ▁okay
- ▁much
- ▁ask
- ▁through
- ▁before
- ▁work
- ▁even
- ▁three
- mber
- ▁win
- ▁flight
- ake
- K
- ▁place
- ▁play
- ▁though
- ▁pound
- ▁bit
- land
- ▁va
- ▁talk
- ▁kind
- ▁Line
- ▁make
- hap
- ▁big
- ▁leav
- ▁something
- ▁game
- ▁under
- ▁feel
- self
- ▁give
- ▁includ
- U
- ▁twenty
- ▁guard
- ▁left
- ▁round
- ▁great
- body
- ▁gra
- ress
- lso
- '3'
- ▁everything
- ▁those
- ▁after
- ▁tell
- ▁need
- ▁yes
- qua
- ham
- ▁minutes
- ▁question
- ▁around
- ▁punch
- ▁course
- ▁gonna
- ▁person
- ▁move
- ▁plan
- ▁ear
- ept
- ▁Airport
- ▁Okay
- ▁found
- ▁seven
- ▁help
- que
- ▁qui
- ▁keep
- ▁guys
- ▁house
- ▁run
- ▁turn
- ▁better
- ▁stop
- ward
- ddle
- ▁second
- ground
- ▁world
- ▁high
- ▁point
- ▁hold
- ▁call
- '6'
- ▁actually
- ▁probably
- ▁heaven
- ▁speci
- ▁everyone
- ▁why
- ▁presen
- ▁thir
- lright
- ▁eye
- eath
- ▁Tak
- '!'
- '"'
- '4'
- ▁hundred
- ▁answer
- ▁small
- ▁wait
- ▁nothing
- q
- '8'
- V
- ▁countr
- ▁problem
- ▁continu
- ▁close
- ▁priva
- ▁20
- ▁pleas
- ▁walk
- ▁open
- ▁lay
- ▁Station
- ▁moment
- ▁Yeah
- ▁public
- possibl
- ▁happen
- together
- ▁while
- asically
- ▁money
- ▁wrong
- B
- ▁puzzle
- '7'
- ▁journ
- ▁rainbow
- ▁thousand
- I
- '9'
- S
- P
- '%'
- A
- D
- L
- F
- ’
- O
- G
- N
- á
- C
- $
- Z
- Y
- R
- E
- J
- W
- M
- H
- j
- –
- ;
- Q
- X
- ']'
- −
- '&'
- T
- '['
- <sos/eos>
init: xavier_uniform
model_conf: {}
use_ref_audio: true
use_ref_text: false
use_preprocessor: true
token_type: bpe
bpemodel: data/token_list/bpe_unigram500/bpe.model
non_linguistic_symbols: null
cleaner: null
g2p: null
frontend: s3prl
frontend_conf:
    frontend_conf:
        upstream: wavlm_large
    download_dir: ./hub
    multilayer_feature: true
universa: base
universa_conf:
    embedding_dim: 256
    audio_encoder_type: transformer
    audio_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: conv2d
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    text_encoder_type: transformer
    text_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: linear
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    cross_attention_type: multihead
    cross_attention_params:
        n_head: 4
        dropout_rate: 0.1
    pooling_type: mean
    projector_type: linear
    multi_branch: true
required:
- output_dir
- metric2id
version: '202412'
distributed: false
universa configtext
config: conf/train_universa_wavlm_ref_audio.yaml
print_config: false
log_level: INFO
drop_last_iter: false
dry_run: false
iterator_type: sequence
valid_iterator_type: null
output_dir: update_exp/universa_train_universa_wavlm_ref_audio_raw_fs16000
ngpu: 1
seed: 777
num_workers: 1
num_att_plot: 0
dist_backend: nccl
dist_init_method: env://
dist_world_size: null
dist_rank: null
local_rank: 0
dist_master_addr: null
dist_master_port: null
dist_launcher: null
multiprocessing_distributed: false
unused_parameters: false
sharded_ddp: false
use_deepspeed: false
deepspeed_config: null
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: false
use_tf32: false
collect_stats: false
write_collected_feats: false
max_epoch: 100
patience: null
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
best_model_criterion:
-   - train
    - loss
    - min
-   - valid
    - loss
    - min
-   - train
    - acc
    - max
-   - valid
    - acc
    - max
keep_nbest_models: 5
nbest_averaging_interval: 0
grad_clip: -1
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
log_interval: 50
use_matplotlib: true
use_tensorboard: true
create_graph_in_tensorboard: false
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1
detect_anomaly: false
use_adapter: false
adapter: lora
save_strategy: all
adapter_conf: {}
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param:
- frontend.upstream
num_iters_per_epoch: null
batch_size: 16
valid_batch_size: null
batch_bins: 1000000
valid_batch_bins: null
category_sample_size: 10
train_shape_file:
- update_exp/universa_stats_raw/train/audio_shape
- update_exp/universa_stats_raw/train/ref_audio_shape
- update_exp/universa_stats_raw/train/ref_text_shape
valid_shape_file:
- update_exp/universa_stats_raw/valid/audio_shape
- update_exp/universa_stats_raw/valid/ref_audio_shape
- update_exp/universa_stats_raw/valid/ref_text_shape
batch_type: sorted
valid_batch_type: null
fold_length:
- 256000
sort_in_batch: descending
shuffle_within_batch: false
sort_batch: descending
multiple_iterator: false
chunk_length: 500
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
chunk_excluded_key_prefixes: []
chunk_default_fs: null
chunk_max_abs_length: null
chunk_discard_short_samples: true
train_data_path_and_name_and_type:
-   - dump_ark/raw/train_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/train_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/train_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/train_update/text
    - ref_text
    - text
valid_data_path_and_name_and_type:
-   - dump_ark/raw/dev_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/dev_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/text
    - ref_text
    - text
multi_task_dataset: false
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
allow_multi_rates: false
valid_max_cache_size: null
exclude_weight_decay: false
exclude_weight_decay_conf: {}
optim: adamw
optim_conf:
    lr: 0.001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
metric2id: dump_ark/raw/train_update/metric2id
metric2type: null
metric_pad_value: -100
token_list:
- <blank>
- <unk>
- s
- ▁
- t
- e
- ▁the
- i
- a
- o
- ▁a
- r
- ▁to
- d
- ▁and
- ''''
- m
- n
- ing
- u
- y
- p
- c
- ▁of
- l
- ed
- ▁I
- ▁in
- er
- re
- ▁it
- ▁you
- ar
- ▁f
- ▁is
- ▁that
- ','
- .
- in
- al
- g
- 'on'
- ▁b
- b
- or
- ▁c
- ▁s
- f
- h
- ▁we
- an
- en
- ▁for
- le
- ▁p
- ly
- es
- w
- ▁re
- ▁on
- ▁m
- ▁be
- ic
- ll
- th
- ▁he
- k
- ur
- ve
- ▁with
- ▁so
- ▁from
- ▁was
- v
- ch
- st
- ▁w
- ▁i
- ▁this
- ▁de
- ▁like
- ▁do
- ce
- at
- il
- ck
- ▁A
- ▁have
- ▁not
- ad
- ▁st
- ow
- ro
- ne
- ▁me
- ▁my
- ▁but
- ation
- ▁at
- ▁or
- '-'
- ter
- ent
- ▁B
- ▁n
- ▁know
- ▁t
- out
- ▁are
- nd
- ▁one
- ▁li
- ▁g
- ▁The
- ol
- ion
- te
- ▁go
- ut
- ▁as
- ▁just
- as
- ▁sh
- ▁they
- is
- ▁C
- et
- ▁h
- ▁an
- ▁there
- ▁up
- ▁S
- ▁M
- ▁she
- ▁by
- ▁su
- om
- ▁can
- us
- ▁your
- ng
- ▁con
- el
- ▁us
- ment
- z
- ▁see
- ▁ab
- ▁what
- ▁out
- ▁her
- me
- ate
- ▁all
- ▁th
- ▁if
- ▁right
- ▁his
- ▁ma
- ▁lo
- ▁which
- ide
- ▁P
- ▁more
- ▁then
- ul
- ast
- x
- ight
- ill
- ▁So
- ▁sp
- ▁going
- ▁some
- ure
- ▁their
- ig
- ▁no
- ▁ro
- ▁think
- ▁who
- ▁pro
- ver
- ive
- est
- ▁co
- ▁di
- '0'
- ist
- ▁k
- age
- ▁d
- ▁time
- ▁L
- ies
- ▁will
- ▁man
- ▁when
- ▁D
- les
- ▁F
- ▁want
- ff
- ity
- ▁un
- '?'
- ▁start
- ▁G
- ▁uh
- ▁get
- ok
- ▁take
- ▁po
- li
- ▁ho
- ▁way
- ▁don
- ▁yeah
- ▁really
- ▁say
- ▁look
- ▁good
- ▁ra
- ▁pr
- ▁had
- ttle
- ▁comp
- ort
- ish
- ▁ex
- ally
- ▁sa
- ▁how
- end
- ant
- ▁O
- ▁um
- way
- ance
- ▁other
- ▁two
- ine
- ever
- able
- ▁com
- other
- ▁first
- ▁back
- ▁al
- ers
- ions
- ▁now
- ▁off
- ning
- ▁down
- ▁has
- ▁than
- ▁car
- ▁Th
- very
- ice
- ▁dr
- ▁been
- ▁him
- ▁here
- ated
- '5'
- ▁hand
- ▁day
- ▁hear
- each
- ▁would
- ▁over
- ▁oh
- ▁cha
- ood
- ▁did
- ugh
- ▁per
- ▁let
- ▁str
- ▁tra
- ▁got
- ext
- '1'
- ▁We
- ▁Shields
- ▁come
- ▁should
- ▁could
- light
- '2'
- ▁people
- ▁again
- ▁year
- ▁app
- ▁into
- ▁any
- ▁N
- ▁mean
- ▁o
- ▁mus
- ▁lot
- ▁said
- ▁long
- ▁these
- ▁lea
- sh
- ▁vi
- ▁part
- ▁every
- ▁our
- ▁You
- ious
- ▁fight
- ▁Ch
- ark
- ▁may
- ▁Hammer
- ▁because
- ▁most
- ▁came
- ▁four
- ful
- ▁No
- ize
- ▁where
- ▁okay
- ▁much
- ▁ask
- ▁through
- ▁before
- ▁work
- ▁even
- ▁three
- mber
- ▁win
- ▁flight
- ake
- K
- ▁place
- ▁play
- ▁though
- ▁pound
- ▁bit
- land
- ▁va
- ▁talk
- ▁kind
- ▁Line
- ▁make
- hap
- ▁big
- ▁leav
- ▁something
- ▁game
- ▁under
- ▁feel
- self
- ▁give
- ▁includ
- U
- ▁twenty
- ▁guard
- ▁left
- ▁round
- ▁great
- body
- ▁gra
- ress
- lso
- '3'
- ▁everything
- ▁those
- ▁after
- ▁tell
- ▁need
- ▁yes
- qua
- ham
- ▁minutes
- ▁question
- ▁around
- ▁punch
- ▁course
- ▁gonna
- ▁person
- ▁move
- ▁plan
- ▁ear
- ept
- ▁Airport
- ▁Okay
- ▁found
- ▁seven
- ▁help
- que
- ▁qui
- ▁keep
- ▁guys
- ▁house
- ▁run
- ▁turn
- ▁better
- ▁stop
- ward
- ddle
- ▁second
- ground
- ▁world
- ▁high
- ▁point
- ▁hold
- ▁call
- '6'
- ▁actually
- ▁probably
- ▁heaven
- ▁speci
- ▁everyone
- ▁why
- ▁presen
- ▁thir
- lright
- ▁eye
- eath
- ▁Tak
- '!'
- '"'
- '4'
- ▁hundred
- ▁answer
- ▁small
- ▁wait
- ▁nothing
- q
- '8'
- V
- ▁countr
- ▁problem
- ▁continu
- ▁close
- ▁priva
- ▁20
- ▁pleas
- ▁walk
- ▁open
- ▁lay
- ▁Station
- ▁moment
- ▁Yeah
- ▁public
- possibl
- ▁happen
- together
- ▁while
- asically
- ▁money
- ▁wrong
- B
- ▁puzzle
- '7'
- ▁journ
- ▁rainbow
- ▁thousand
- I
- '9'
- S
- P
- '%'
- A
- D
- L
- F
- ’
- O
- G
- N
- á
- C
- $
- Z
- Y
- R
- E
- J
- W
- M
- H
- j
- –
- ;
- Q
- X
- ']'
- −
- '&'
- T
- '['
- <sos/eos>
init: xavier_uniform
model_conf: {}
use_ref_audio: true
use_ref_text: false
use_preprocessor: true
token_type: bpe
bpemodel: data/token_list/bpe_unigram500/bpe.model
non_linguistic_symbols: null
cleaner: null
g2p: null
frontend: s3prl
frontend_conf:
    frontend_conf:
        upstream: wavlm_large
    download_dir: ./hub
    multilayer_feature: true
universa: base
universa_conf:
    embedding_dim: 256
    audio_encoder_type: transformer
    audio_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: conv2d
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    text_encoder_type: transformer
    text_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: linear
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    cross_attention_type: multihead
    cross_attention_params:
        n_head: 4
        dropout_rate: 0.1
    pooling_type: mean
    projector_type: linear
    multi_branch: true
required:
- output_dir
- metric2id
version: '202412'
distributed: false
universa configtext
config: conf/train_universa_wavlm_ref_audio.yaml
print_config: false
log_level: INFO
drop_last_iter: false
dry_run: false
iterator_type: sequence
valid_iterator_type: null
output_dir: update_exp/universa_train_universa_wavlm_ref_audio_raw_fs16000
ngpu: 1
seed: 777
num_workers: 1
num_att_plot: 0
dist_backend: nccl
dist_init_method: env://
dist_world_size: null
dist_rank: null
local_rank: 0
dist_master_addr: null
dist_master_port: null
dist_launcher: null
multiprocessing_distributed: false
unused_parameters: false
sharded_ddp: false
use_deepspeed: false
deepspeed_config: null
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: false
use_tf32: false
collect_stats: false
write_collected_feats: false
max_epoch: 100
patience: null
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
best_model_criterion:
-   - train
    - loss
    - min
-   - valid
    - loss
    - min
-   - train
    - acc
    - max
-   - valid
    - acc
    - max
keep_nbest_models: 5
nbest_averaging_interval: 0
grad_clip: -1
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
log_interval: 50
use_matplotlib: true
use_tensorboard: true
create_graph_in_tensorboard: false
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1
detect_anomaly: false
use_adapter: false
adapter: lora
save_strategy: all
adapter_conf: {}
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param:
- frontend.upstream
num_iters_per_epoch: null
batch_size: 16
valid_batch_size: null
batch_bins: 1000000
valid_batch_bins: null
category_sample_size: 10
train_shape_file:
- update_exp/universa_stats_raw/train/audio_shape
- update_exp/universa_stats_raw/train/ref_audio_shape
- update_exp/universa_stats_raw/train/ref_text_shape
valid_shape_file:
- update_exp/universa_stats_raw/valid/audio_shape
- update_exp/universa_stats_raw/valid/ref_audio_shape
- update_exp/universa_stats_raw/valid/ref_text_shape
batch_type: sorted
valid_batch_type: null
fold_length:
- 256000
sort_in_batch: descending
shuffle_within_batch: false
sort_batch: descending
multiple_iterator: false
chunk_length: 500
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
chunk_excluded_key_prefixes: []
chunk_default_fs: null
chunk_max_abs_length: null
chunk_discard_short_samples: true
train_data_path_and_name_and_type:
-   - dump_ark/raw/train_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/train_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/train_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/train_update/text
    - ref_text
    - text
valid_data_path_and_name_and_type:
-   - dump_ark/raw/dev_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/dev_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/text
    - ref_text
    - text
multi_task_dataset: false
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
allow_multi_rates: false
valid_max_cache_size: null
exclude_weight_decay: false
exclude_weight_decay_conf: {}
optim: adamw
optim_conf:
    lr: 0.001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
metric2id: dump_ark/raw/train_update/metric2id
metric2type: null
metric_pad_value: -100
token_list:
- <blank>
- <unk>
- s
- ▁
- t
- e
- ▁the
- i
- a
- o
- ▁a
- r
- ▁to
- d
- ▁and
- ''''
- m
- n
- ing
- u
- y
- p
- c
- ▁of
- l
- ed
- ▁I
- ▁in
- er
- re
- ▁it
- ▁you
- ar
- ▁f
- ▁is
- ▁that
- ','
- .
- in
- al
- g
- 'on'
- ▁b
- b
- or
- ▁c
- ▁s
- f
- h
- ▁we
- an
- en
- ▁for
- le
- ▁p
- ly
- es
- w
- ▁re
- ▁on
- ▁m
- ▁be
- ic
- ll
- th
- ▁he
- k
- ur
- ve
- ▁with
- ▁so
- ▁from
- ▁was
- v
- ch
- st
- ▁w
- ▁i
- ▁this
- ▁de
- ▁like
- ▁do
- ce
- at
- il
- ck
- ▁A
- ▁have
- ▁not
- ad
- ▁st
- ow
- ro
- ne
- ▁me
- ▁my
- ▁but
- ation
- ▁at
- ▁or
- '-'
- ter
- ent
- ▁B
- ▁n
- ▁know
- ▁t
- out
- ▁are
- nd
- ▁one
- ▁li
- ▁g
- ▁The
- ol
- ion
- te
- ▁go
- ut
- ▁as
- ▁just
- as
- ▁sh
- ▁they
- is
- ▁C
- et
- ▁h
- ▁an
- ▁there
- ▁up
- ▁S
- ▁M
- ▁she
- ▁by
- ▁su
- om
- ▁can
- us
- ▁your
- ng
- ▁con
- el
- ▁us
- ment
- z
- ▁see
- ▁ab
- ▁what
- ▁out
- ▁her
- me
- ate
- ▁all
- ▁th
- ▁if
- ▁right
- ▁his
- ▁ma
- ▁lo
- ▁which
- ide
- ▁P
- ▁more
- ▁then
- ul
- ast
- x
- ight
- ill
- ▁So
- ▁sp
- ▁going
- ▁some
- ure
- ▁their
- ig
- ▁no
- ▁ro
- ▁think
- ▁who
- ▁pro
- ver
- ive
- est
- ▁co
- ▁di
- '0'
- ist
- ▁k
- age
- ▁d
- ▁time
- ▁L
- ies
- ▁will
- ▁man
- ▁when
- ▁D
- les
- ▁F
- ▁want
- ff
- ity
- ▁un
- '?'
- ▁start
- ▁G
- ▁uh
- ▁get
- ok
- ▁take
- ▁po
- li
- ▁ho
- ▁way
- ▁don
- ▁yeah
- ▁really
- ▁say
- ▁look
- ▁good
- ▁ra
- ▁pr
- ▁had
- ttle
- ▁comp
- ort
- ish
- ▁ex
- ally
- ▁sa
- ▁how
- end
- ant
- ▁O
- ▁um
- way
- ance
- ▁other
- ▁two
- ine
- ever
- able
- ▁com
- other
- ▁first
- ▁back
- ▁al
- ers
- ions
- ▁now
- ▁off
- ning
- ▁down
- ▁has
- ▁than
- ▁car
- ▁Th
- very
- ice
- ▁dr
- ▁been
- ▁him
- ▁here
- ated
- '5'
- ▁hand
- ▁day
- ▁hear
- each
- ▁would
- ▁over
- ▁oh
- ▁cha
- ood
- ▁did
- ugh
- ▁per
- ▁let
- ▁str
- ▁tra
- ▁got
- ext
- '1'
- ▁We
- ▁Shields
- ▁come
- ▁should
- ▁could
- light
- '2'
- ▁people
- ▁again
- ▁year
- ▁app
- ▁into
- ▁any
- ▁N
- ▁mean
- ▁o
- ▁mus
- ▁lot
- ▁said
- ▁long
- ▁these
- ▁lea
- sh
- ▁vi
- ▁part
- ▁every
- ▁our
- ▁You
- ious
- ▁fight
- ▁Ch
- ark
- ▁may
- ▁Hammer
- ▁because
- ▁most
- ▁came
- ▁four
- ful
- ▁No
- ize
- ▁where
- ▁okay
- ▁much
- ▁ask
- ▁through
- ▁before
- ▁work
- ▁even
- ▁three
- mber
- ▁win
- ▁flight
- ake
- K
- ▁place
- ▁play
- ▁though
- ▁pound
- ▁bit
- land
- ▁va
- ▁talk
- ▁kind
- ▁Line
- ▁make
- hap
- ▁big
- ▁leav
- ▁something
- ▁game
- ▁under
- ▁feel
- self
- ▁give
- ▁includ
- U
- ▁twenty
- ▁guard
- ▁left
- ▁round
- ▁great
- body
- ▁gra
- ress
- lso
- '3'
- ▁everything
- ▁those
- ▁after
- ▁tell
- ▁need
- ▁yes
- qua
- ham
- ▁minutes
- ▁question
- ▁around
- ▁punch
- ▁course
- ▁gonna
- ▁person
- ▁move
- ▁plan
- ▁ear
- ept
- ▁Airport
- ▁Okay
- ▁found
- ▁seven
- ▁help
- que
- ▁qui
- ▁keep
- ▁guys
- ▁house
- ▁run
- ▁turn
- ▁better
- ▁stop
- ward
- ddle
- ▁second
- ground
- ▁world
- ▁high
- ▁point
- ▁hold
- ▁call
- '6'
- ▁actually
- ▁probably
- ▁heaven
- ▁speci
- ▁everyone
- ▁why
- ▁presen
- ▁thir
- lright
- ▁eye
- eath
- ▁Tak
- '!'
- '"'
- '4'
- ▁hundred
- ▁answer
- ▁small
- ▁wait
- ▁nothing
- q
- '8'
- V
- ▁countr
- ▁problem
- ▁continu
- ▁close
- ▁priva
- ▁20
- ▁pleas
- ▁walk
- ▁open
- ▁lay
- ▁Station
- ▁moment
- ▁Yeah
- ▁public
- possibl
- ▁happen
- together
- ▁while
- asically
- ▁money
- ▁wrong
- B
- ▁puzzle
- '7'
- ▁journ
- ▁rainbow
- ▁thousand
- I
- '9'
- S
- P
- '%'
- A
- D
- L
- F
- ’
- O
- G
- N
- á
- C
- $
- Z
- Y
- R
- E
- J
- W
- M
- H
- j
- –
- ;
- Q
- X
- ']'
- −
- '&'
- T
- '['
- <sos/eos>
init: xavier_uniform
model_conf: {}
use_ref_audio: true
use_ref_text: false
use_preprocessor: true
token_type: bpe
bpemodel: data/token_list/bpe_unigram500/bpe.model
non_linguistic_symbols: null
cleaner: null
g2p: null
frontend: s3prl
frontend_conf:
    frontend_conf:
        upstream: wavlm_large
    download_dir: ./hub
    multilayer_feature: true
universa: base
universa_conf:
    embedding_dim: 256
    audio_encoder_type: transformer
    audio_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: conv2d
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    text_encoder_type: transformer
    text_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: linear
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    cross_attention_type: multihead
    cross_attention_params:
        n_head: 4
        dropout_rate: 0.1
    pooling_type: mean
    projector_type: linear
    multi_branch: true
required:
- output_dir
- metric2id
version: '202412'
distributed: false
universa configtext
config: conf/train_universa_wavlm_ref_audio.yaml
print_config: false
log_level: INFO
drop_last_iter: false
dry_run: false
iterator_type: sequence
valid_iterator_type: null
output_dir: update_exp/universa_train_universa_wavlm_ref_audio_raw_fs16000
ngpu: 1
seed: 777
num_workers: 1
num_att_plot: 0
dist_backend: nccl
dist_init_method: env://
dist_world_size: null
dist_rank: null
local_rank: 0
dist_master_addr: null
dist_master_port: null
dist_launcher: null
multiprocessing_distributed: false
unused_parameters: false
sharded_ddp: false
use_deepspeed: false
deepspeed_config: null
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: false
use_tf32: false
collect_stats: false
write_collected_feats: false
max_epoch: 100
patience: null
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
best_model_criterion:
-   - train
    - loss
    - min
-   - valid
    - loss
    - min
-   - train
    - acc
    - max
-   - valid
    - acc
    - max
keep_nbest_models: 5
nbest_averaging_interval: 0
grad_clip: -1
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
log_interval: 50
use_matplotlib: true
use_tensorboard: true
create_graph_in_tensorboard: false
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1
detect_anomaly: false
use_adapter: false
adapter: lora
save_strategy: all
adapter_conf: {}
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param:
- frontend.upstream
num_iters_per_epoch: null
batch_size: 16
valid_batch_size: null
batch_bins: 1000000
valid_batch_bins: null
category_sample_size: 10
train_shape_file:
- update_exp/universa_stats_raw/train/audio_shape
- update_exp/universa_stats_raw/train/ref_audio_shape
- update_exp/universa_stats_raw/train/ref_text_shape
valid_shape_file:
- update_exp/universa_stats_raw/valid/audio_shape
- update_exp/universa_stats_raw/valid/ref_audio_shape
- update_exp/universa_stats_raw/valid/ref_text_shape
batch_type: sorted
valid_batch_type: null
fold_length:
- 256000
sort_in_batch: descending
shuffle_within_batch: false
sort_batch: descending
multiple_iterator: false
chunk_length: 500
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
chunk_excluded_key_prefixes: []
chunk_default_fs: null
chunk_max_abs_length: null
chunk_discard_short_samples: true
train_data_path_and_name_and_type:
-   - dump_ark/raw/train_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/train_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/train_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/train_update/text
    - ref_text
    - text
valid_data_path_and_name_and_type:
-   - dump_ark/raw/dev_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/dev_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/text
    - ref_text
    - text
multi_task_dataset: false
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
allow_multi_rates: false
valid_max_cache_size: null
exclude_weight_decay: false
exclude_weight_decay_conf: {}
optim: adamw
optim_conf:
    lr: 0.001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
metric2id: dump_ark/raw/train_update/metric2id
metric2type: null
metric_pad_value: -100
token_list:
- <blank>
- <unk>
- s
- ▁
- t
- e
- ▁the
- i
- a
- o
- ▁a
- r
- ▁to
- d
- ▁and
- ''''
- m
- n
- ing
- u
- y
- p
- c
- ▁of
- l
- ed
- ▁I
- ▁in
- er
- re
- ▁it
- ▁you
- ar
- ▁f
- ▁is
- ▁that
- ','
- .
- in
- al
- g
- 'on'
- ▁b
- b
- or
- ▁c
- ▁s
- f
- h
- ▁we
- an
- en
- ▁for
- le
- ▁p
- ly
- es
- w
- ▁re
- ▁on
- ▁m
- ▁be
- ic
- ll
- th
- ▁he
- k
- ur
- ve
- ▁with
- ▁so
- ▁from
- ▁was
- v
- ch
- st
- ▁w
- ▁i
- ▁this
- ▁de
- ▁like
- ▁do
- ce
- at
- il
- ck
- ▁A
- ▁have
- ▁not
- ad
- ▁st
- ow
- ro
- ne
- ▁me
- ▁my
- ▁but
- ation
- ▁at
- ▁or
- '-'
- ter
- ent
- ▁B
- ▁n
- ▁know
- ▁t
- out
- ▁are
- nd
- ▁one
- ▁li
- ▁g
- ▁The
- ol
- ion
- te
- ▁go
- ut
- ▁as
- ▁just
- as
- ▁sh
- ▁they
- is
- ▁C
- et
- ▁h
- ▁an
- ▁there
- ▁up
- ▁S
- ▁M
- ▁she
- ▁by
- ▁su
- om
- ▁can
- us
- ▁your
- ng
- ▁con
- el
- ▁us
- ment
- z
- ▁see
- ▁ab
- ▁what
- ▁out
- ▁her
- me
- ate
- ▁all
- ▁th
- ▁if
- ▁right
- ▁his
- ▁ma
- ▁lo
- ▁which
- ide
- ▁P
- ▁more
- ▁then
- ul
- ast
- x
- ight
- ill
- ▁So
- ▁sp
- ▁going
- ▁some
- ure
- ▁their
- ig
- ▁no
- ▁ro
- ▁think
- ▁who
- ▁pro
- ver
- ive
- est
- ▁co
- ▁di
- '0'
- ist
- ▁k
- age
- ▁d
- ▁time
- ▁L
- ies
- ▁will
- ▁man
- ▁when
- ▁D
- les
- ▁F
- ▁want
- ff
- ity
- ▁un
- '?'
- ▁start
- ▁G
- ▁uh
- ▁get
- ok
- ▁take
- ▁po
- li
- ▁ho
- ▁way
- ▁don
- ▁yeah
- ▁really
- ▁say
- ▁look
- ▁good
- ▁ra
- ▁pr
- ▁had
- ttle
- ▁comp
- ort
- ish
- ▁ex
- ally
- ▁sa
- ▁how
- end
- ant
- ▁O
- ▁um
- way
- ance
- ▁other
- ▁two
- ine
- ever
- able
- ▁com
- other
- ▁first
- ▁back
- ▁al
- ers
- ions
- ▁now
- ▁off
- ning
- ▁down
- ▁has
- ▁than
- ▁car
- ▁Th
- very
- ice
- ▁dr
- ▁been
- ▁him
- ▁here
- ated
- '5'
- ▁hand
- ▁day
- ▁hear
- each
- ▁would
- ▁over
- ▁oh
- ▁cha
- ood
- ▁did
- ugh
- ▁per
- ▁let
- ▁str
- ▁tra
- ▁got
- ext
- '1'
- ▁We
- ▁Shields
- ▁come
- ▁should
- ▁could
- light
- '2'
- ▁people
- ▁again
- ▁year
- ▁app
- ▁into
- ▁any
- ▁N
- ▁mean
- ▁o
- ▁mus
- ▁lot
- ▁said
- ▁long
- ▁these
- ▁lea
- sh
- ▁vi
- ▁part
- ▁every
- ▁our
- ▁You
- ious
- ▁fight
- ▁Ch
- ark
- ▁may
- ▁Hammer
- ▁because
- ▁most
- ▁came
- ▁four
- ful
- ▁No
- ize
- ▁where
- ▁okay
- ▁much
- ▁ask
- ▁through
- ▁before
- ▁work
- ▁even
- ▁three
- mber
- ▁win
- ▁flight
- ake
- K
- ▁place
- ▁play
- ▁though
- ▁pound
- ▁bit
- land
- ▁va
- ▁talk
- ▁kind
- ▁Line
- ▁make
- hap
- ▁big
- ▁leav
- ▁something
- ▁game
- ▁under
- ▁feel
- self
- ▁give
- ▁includ
- U
- ▁twenty
- ▁guard
- ▁left
- ▁round
- ▁great
- body
- ▁gra
- ress
- lso
- '3'
- ▁everything
- ▁those
- ▁after
- ▁tell
- ▁need
- ▁yes
- qua
- ham
- ▁minutes
- ▁question
- ▁around
- ▁punch
- ▁course
- ▁gonna
- ▁person
- ▁move
- ▁plan
- ▁ear
- ept
- ▁Airport
- ▁Okay
- ▁found
- ▁seven
- ▁help
- que
- ▁qui
- ▁keep
- ▁guys
- ▁house
- ▁run
- ▁turn
- ▁better
- ▁stop
- ward
- ddle
- ▁second
- ground
- ▁world
- ▁high
- ▁point
- ▁hold
- ▁call
- '6'
- ▁actually
- ▁probably
- ▁heaven
- ▁speci
- ▁everyone
- ▁why
- ▁presen
- ▁thir
- lright
- ▁eye
- eath
- ▁Tak
- '!'
- '"'
- '4'
- ▁hundred
- ▁answer
- ▁small
- ▁wait
- ▁nothing
- q
- '8'
- V
- ▁countr
- ▁problem
- ▁continu
- ▁close
- ▁priva
- ▁20
- ▁pleas
- ▁walk
- ▁open
- ▁lay
- ▁Station
- ▁moment
- ▁Yeah
- ▁public
- possibl
- ▁happen
- together
- ▁while
- asically
- ▁money
- ▁wrong
- B
- ▁puzzle
- '7'
- ▁journ
- ▁rainbow
- ▁thousand
- I
- '9'
- S
- P
- '%'
- A
- D
- L
- F
- ’
- O
- G
- N
- á
- C
- $
- Z
- Y
- R
- E
- J
- W
- M
- H
- j
- –
- ;
- Q
- X
- ']'
- −
- '&'
- T
- '['
- <sos/eos>
init: xavier_uniform
model_conf: {}
use_ref_audio: true
use_ref_text: false
use_preprocessor: true
token_type: bpe
bpemodel: data/token_list/bpe_unigram500/bpe.model
non_linguistic_symbols: null
cleaner: null
g2p: null
frontend: s3prl
frontend_conf:
    frontend_conf:
        upstream: wavlm_large
    download_dir: ./hub
    multilayer_feature: true
universa: base
universa_conf:
    embedding_dim: 256
    audio_encoder_type: transformer
    audio_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: conv2d
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    text_encoder_type: transformer
    text_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: linear
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    cross_attention_type: multihead
    cross_attention_params:
        n_head: 4
        dropout_rate: 0.1
    pooling_type: mean
    projector_type: linear
    multi_branch: true
required:
- output_dir
- metric2id
version: '202412'
distributed: false
universa configtext
config: conf/train_universa_wavlm_ref_audio.yaml
print_config: false
log_level: INFO
drop_last_iter: false
dry_run: false
iterator_type: sequence
valid_iterator_type: null
output_dir: update_exp/universa_train_universa_wavlm_ref_audio_raw_fs16000
ngpu: 1
seed: 777
num_workers: 1
num_att_plot: 0
dist_backend: nccl
dist_init_method: env://
dist_world_size: null
dist_rank: null
local_rank: 0
dist_master_addr: null
dist_master_port: null
dist_launcher: null
multiprocessing_distributed: false
unused_parameters: false
sharded_ddp: false
use_deepspeed: false
deepspeed_config: null
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: false
use_tf32: false
collect_stats: false
write_collected_feats: false
max_epoch: 100
patience: null
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
best_model_criterion:
-   - train
    - loss
    - min
-   - valid
    - loss
    - min
-   - train
    - acc
    - max
-   - valid
    - acc
    - max
keep_nbest_models: 5
nbest_averaging_interval: 0
grad_clip: -1
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
log_interval: 50
use_matplotlib: true
use_tensorboard: true
create_graph_in_tensorboard: false
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1
detect_anomaly: false
use_adapter: false
adapter: lora
save_strategy: all
adapter_conf: {}
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param:
- frontend.upstream
num_iters_per_epoch: null
batch_size: 16
valid_batch_size: null
batch_bins: 1000000
valid_batch_bins: null
category_sample_size: 10
train_shape_file:
- update_exp/universa_stats_raw/train/audio_shape
- update_exp/universa_stats_raw/train/ref_audio_shape
- update_exp/universa_stats_raw/train/ref_text_shape
valid_shape_file:
- update_exp/universa_stats_raw/valid/audio_shape
- update_exp/universa_stats_raw/valid/ref_audio_shape
- update_exp/universa_stats_raw/valid/ref_text_shape
batch_type: sorted
valid_batch_type: null
fold_length:
- 256000
sort_in_batch: descending
shuffle_within_batch: false
sort_batch: descending
multiple_iterator: false
chunk_length: 500
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
chunk_excluded_key_prefixes: []
chunk_default_fs: null
chunk_max_abs_length: null
chunk_discard_short_samples: true
train_data_path_and_name_and_type:
-   - dump_ark/raw/train_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/train_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/train_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/train_update/text
    - ref_text
    - text
valid_data_path_and_name_and_type:
-   - dump_ark/raw/dev_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/dev_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/text
    - ref_text
    - text
multi_task_dataset: false
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
allow_multi_rates: false
valid_max_cache_size: null
exclude_weight_decay: false
exclude_weight_decay_conf: {}
optim: adamw
optim_conf:
    lr: 0.001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
metric2id: dump_ark/raw/train_update/metric2id
metric2type: null
metric_pad_value: -100
token_list:
- <blank>
- <unk>
- s
- ▁
- t
- e
- ▁the
- i
- a
- o
- ▁a
- r
- ▁to
- d
- ▁and
- ''''
- m
- n
- ing
- u
- y
- p
- c
- ▁of
- l
- ed
- ▁I
- ▁in
- er
- re
- ▁it
- ▁you
- ar
- ▁f
- ▁is
- ▁that
- ','
- .
- in
- al
- g
- 'on'
- ▁b
- b
- or
- ▁c
- ▁s
- f
- h
- ▁we
- an
- en
- ▁for
- le
- ▁p
- ly
- es
- w
- ▁re
- ▁on
- ▁m
- ▁be
- ic
- ll
- th
- ▁he
- k
- ur
- ve
- ▁with
- ▁so
- ▁from
- ▁was
- v
- ch
- st
- ▁w
- ▁i
- ▁this
- ▁de
- ▁like
- ▁do
- ce
- at
- il
- ck
- ▁A
- ▁have
- ▁not
- ad
- ▁st
- ow
- ro
- ne
- ▁me
- ▁my
- ▁but
- ation
- ▁at
- ▁or
- '-'
- ter
- ent
- ▁B
- ▁n
- ▁know
- ▁t
- out
- ▁are
- nd
- ▁one
- ▁li
- ▁g
- ▁The
- ol
- ion
- te
- ▁go
- ut
- ▁as
- ▁just
- as
- ▁sh
- ▁they
- is
- ▁C
- et
- ▁h
- ▁an
- ▁there
- ▁up
- ▁S
- ▁M
- ▁she
- ▁by
- ▁su
- om
- ▁can
- us
- ▁your
- ng
- ▁con
- el
- ▁us
- ment
- z
- ▁see
- ▁ab
- ▁what
- ▁out
- ▁her
- me
- ate
- ▁all
- ▁th
- ▁if
- ▁right
- ▁his
- ▁ma
- ▁lo
- ▁which
- ide
- ▁P
- ▁more
- ▁then
- ul
- ast
- x
- ight
- ill
- ▁So
- ▁sp
- ▁going
- ▁some
- ure
- ▁their
- ig
- ▁no
- ▁ro
- ▁think
- ▁who
- ▁pro
- ver
- ive
- est
- ▁co
- ▁di
- '0'
- ist
- ▁k
- age
- ▁d
- ▁time
- ▁L
- ies
- ▁will
- ▁man
- ▁when
- ▁D
- les
- ▁F
- ▁want
- ff
- ity
- ▁un
- '?'
- ▁start
- ▁G
- ▁uh
- ▁get
- ok
- ▁take
- ▁po
- li
- ▁ho
- ▁way
- ▁don
- ▁yeah
- ▁really
- ▁say
- ▁look
- ▁good
- ▁ra
- ▁pr
- ▁had
- ttle
- ▁comp
- ort
- ish
- ▁ex
- ally
- ▁sa
- ▁how
- end
- ant
- ▁O
- ▁um
- way
- ance
- ▁other
- ▁two
- ine
- ever
- able
- ▁com
- other
- ▁first
- ▁back
- ▁al
- ers
- ions
- ▁now
- ▁off
- ning
- ▁down
- ▁has
- ▁than
- ▁car
- ▁Th
- very
- ice
- ▁dr
- ▁been
- ▁him
- ▁here
- ated
- '5'
- ▁hand
- ▁day
- ▁hear
- each
- ▁would
- ▁over
- ▁oh
- ▁cha
- ood
- ▁did
- ugh
- ▁per
- ▁let
- ▁str
- ▁tra
- ▁got
- ext
- '1'
- ▁We
- ▁Shields
- ▁come
- ▁should
- ▁could
- light
- '2'
- ▁people
- ▁again
- ▁year
- ▁app
- ▁into
- ▁any
- ▁N
- ▁mean
- ▁o
- ▁mus
- ▁lot
- ▁said
- ▁long
- ▁these
- ▁lea
- sh
- ▁vi
- ▁part
- ▁every
- ▁our
- ▁You
- ious
- ▁fight
- ▁Ch
- ark
- ▁may
- ▁Hammer
- ▁because
- ▁most
- ▁came
- ▁four
- ful
- ▁No
- ize
- ▁where
- ▁okay
- ▁much
- ▁ask
- ▁through
- ▁before
- ▁work
- ▁even
- ▁three
- mber
- ▁win
- ▁flight
- ake
- K
- ▁place
- ▁play
- ▁though
- ▁pound
- ▁bit
- land
- ▁va
- ▁talk
- ▁kind
- ▁Line
- ▁make
- hap
- ▁big
- ▁leav
- ▁something
- ▁game
- ▁under
- ▁feel
- self
- ▁give
- ▁includ
- U
- ▁twenty
- ▁guard
- ▁left
- ▁round
- ▁great
- body
- ▁gra
- ress
- lso
- '3'
- ▁everything
- ▁those
- ▁after
- ▁tell
- ▁need
- ▁yes
- qua
- ham
- ▁minutes
- ▁question
- ▁around
- ▁punch
- ▁course
- ▁gonna
- ▁person
- ▁move
- ▁plan
- ▁ear
- ept
- ▁Airport
- ▁Okay
- ▁found
- ▁seven
- ▁help
- que
- ▁qui
- ▁keep
- ▁guys
- ▁house
- ▁run
- ▁turn
- ▁better
- ▁stop
- ward
- ddle
- ▁second
- ground
- ▁world
- ▁high
- ▁point
- ▁hold
- ▁call
- '6'
- ▁actually
- ▁probably
- ▁heaven
- ▁speci
- ▁everyone
- ▁why
- ▁presen
- ▁thir
- lright
- ▁eye
- eath
- ▁Tak
- '!'
- '"'
- '4'
- ▁hundred
- ▁answer
- ▁small
- ▁wait
- ▁nothing
- q
- '8'
- V
- ▁countr
- ▁problem
- ▁continu
- ▁close
- ▁priva
- ▁20
- ▁pleas
- ▁walk
- ▁open
- ▁lay
- ▁Station
- ▁moment
- ▁Yeah
- ▁public
- possibl
- ▁happen
- together
- ▁while
- asically
- ▁money
- ▁wrong
- B
- ▁puzzle
- '7'
- ▁journ
- ▁rainbow
- ▁thousand
- I
- '9'
- S
- P
- '%'
- A
- D
- L
- F
- ’
- O
- G
- N
- á
- C
- $
- Z
- Y
- R
- E
- J
- W
- M
- H
- j
- –
- ;
- Q
- X
- ']'
- −
- '&'
- T
- '['
- <sos/eos>
init: xavier_uniform
model_conf: {}
use_ref_audio: true
use_ref_text: false
use_preprocessor: true
token_type: bpe
bpemodel: data/token_list/bpe_unigram500/bpe.model
non_linguistic_symbols: null
cleaner: null
g2p: null
frontend: s3prl
frontend_conf:
    frontend_conf:
        upstream: wavlm_large
    download_dir: ./hub
    multilayer_feature: true
universa: base
universa_conf:
    embedding_dim: 256
    audio_encoder_type: transformer
    audio_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: conv2d
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    text_encoder_type: transformer
    text_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: linear
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    cross_attention_type: multihead
    cross_attention_params:
        n_head: 4
        dropout_rate: 0.1
    pooling_type: mean
    projector_type: linear
    multi_branch: true
required:
- output_dir
- metric2id
version: '202412'
distributed: false
universa configtext
config: conf/train_universa_wavlm_ref_audio.yaml
print_config: false
log_level: INFO
drop_last_iter: false
dry_run: false
iterator_type: sequence
valid_iterator_type: null
output_dir: update_exp/universa_train_universa_wavlm_ref_audio_raw_fs16000
ngpu: 1
seed: 777
num_workers: 1
num_att_plot: 0
dist_backend: nccl
dist_init_method: env://
dist_world_size: null
dist_rank: null
local_rank: 0
dist_master_addr: null
dist_master_port: null
dist_launcher: null
multiprocessing_distributed: false
unused_parameters: false
sharded_ddp: false
use_deepspeed: false
deepspeed_config: null
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: false
use_tf32: false
collect_stats: false
write_collected_feats: false
max_epoch: 100
patience: null
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
best_model_criterion:
-   - train
    - loss
    - min
-   - valid
    - loss
    - min
-   - train
    - acc
    - max
-   - valid
    - acc
    - max
keep_nbest_models: 5
nbest_averaging_interval: 0
grad_clip: -1
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
log_interval: 50
use_matplotlib: true
use_tensorboard: true
create_graph_in_tensorboard: false
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1
detect_anomaly: false
use_adapter: false
adapter: lora
save_strategy: all
adapter_conf: {}
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param:
- frontend.upstream
num_iters_per_epoch: null
batch_size: 16
valid_batch_size: null
batch_bins: 1000000
valid_batch_bins: null
category_sample_size: 10
train_shape_file:
- update_exp/universa_stats_raw/train/audio_shape
- update_exp/universa_stats_raw/train/ref_audio_shape
- update_exp/universa_stats_raw/train/ref_text_shape
valid_shape_file:
- update_exp/universa_stats_raw/valid/audio_shape
- update_exp/universa_stats_raw/valid/ref_audio_shape
- update_exp/universa_stats_raw/valid/ref_text_shape
batch_type: sorted
valid_batch_type: null
fold_length:
- 256000
sort_in_batch: descending
shuffle_within_batch: false
sort_batch: descending
multiple_iterator: false
chunk_length: 500
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
chunk_excluded_key_prefixes: []
chunk_default_fs: null
chunk_max_abs_length: null
chunk_discard_short_samples: true
train_data_path_and_name_and_type:
-   - dump_ark/raw/train_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/train_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/train_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/train_update/text
    - ref_text
    - text
valid_data_path_and_name_and_type:
-   - dump_ark/raw/dev_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/dev_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/text
    - ref_text
    - text
multi_task_dataset: false
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
allow_multi_rates: false
valid_max_cache_size: null
exclude_weight_decay: false
exclude_weight_decay_conf: {}
optim: adamw
optim_conf:
    lr: 0.001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
metric2id: dump_ark/raw/train_update/metric2id
metric2type: null
metric_pad_value: -100
token_list:
- <blank>
- <unk>
- s
- ▁
- t
- e
- ▁the
- i
- a
- o
- ▁a
- r
- ▁to
- d
- ▁and
- ''''
- m
- n
- ing
- u
- y
- p
- c
- ▁of
- l
- ed
- ▁I
- ▁in
- er
- re
- ▁it
- ▁you
- ar
- ▁f
- ▁is
- ▁that
- ','
- .
- in
- al
- g
- 'on'
- ▁b
- b
- or
- ▁c
- ▁s
- f
- h
- ▁we
- an
- en
- ▁for
- le
- ▁p
- ly
- es
- w
- ▁re
- ▁on
- ▁m
- ▁be
- ic
- ll
- th
- ▁he
- k
- ur
- ve
- ▁with
- ▁so
- ▁from
- ▁was
- v
- ch
- st
- ▁w
- ▁i
- ▁this
- ▁de
- ▁like
- ▁do
- ce
- at
- il
- ck
- ▁A
- ▁have
- ▁not
- ad
- ▁st
- ow
- ro
- ne
- ▁me
- ▁my
- ▁but
- ation
- ▁at
- ▁or
- '-'
- ter
- ent
- ▁B
- ▁n
- ▁know
- ▁t
- out
- ▁are
- nd
- ▁one
- ▁li
- ▁g
- ▁The
- ol
- ion
- te
- ▁go
- ut
- ▁as
- ▁just
- as
- ▁sh
- ▁they
- is
- ▁C
- et
- ▁h
- ▁an
- ▁there
- ▁up
- ▁S
- ▁M
- ▁she
- ▁by
- ▁su
- om
- ▁can
- us
- ▁your
- ng
- ▁con
- el
- ▁us
- ment
- z
- ▁see
- ▁ab
- ▁what
- ▁out
- ▁her
- me
- ate
- ▁all
- ▁th
- ▁if
- ▁right
- ▁his
- ▁ma
- ▁lo
- ▁which
- ide
- ▁P
- ▁more
- ▁then
- ul
- ast
- x
- ight
- ill
- ▁So
- ▁sp
- ▁going
- ▁some
- ure
- ▁their
- ig
- ▁no
- ▁ro
- ▁think
- ▁who
- ▁pro
- ver
- ive
- est
- ▁co
- ▁di
- '0'
- ist
- ▁k
- age
- ▁d
- ▁time
- ▁L
- ies
- ▁will
- ▁man
- ▁when
- ▁D
- les
- ▁F
- ▁want
- ff
- ity
- ▁un
- '?'
- ▁start
- ▁G
- ▁uh
- ▁get
- ok
- ▁take
- ▁po
- li
- ▁ho
- ▁way
- ▁don
- ▁yeah
- ▁really
- ▁say
- ▁look
- ▁good
- ▁ra
- ▁pr
- ▁had
- ttle
- ▁comp
- ort
- ish
- ▁ex
- ally
- ▁sa
- ▁how
- end
- ant
- ▁O
- ▁um
- way
- ance
- ▁other
- ▁two
- ine
- ever
- able
- ▁com
- other
- ▁first
- ▁back
- ▁al
- ers
- ions
- ▁now
- ▁off
- ning
- ▁down
- ▁has
- ▁than
- ▁car
- ▁Th
- very
- ice
- ▁dr
- ▁been
- ▁him
- ▁here
- ated
- '5'
- ▁hand
- ▁day
- ▁hear
- each
- ▁would
- ▁over
- ▁oh
- ▁cha
- ood
- ▁did
- ugh
- ▁per
- ▁let
- ▁str
- ▁tra
- ▁got
- ext
- '1'
- ▁We
- ▁Shields
- ▁come
- ▁should
- ▁could
- light
- '2'
- ▁people
- ▁again
- ▁year
- ▁app
- ▁into
- ▁any
- ▁N
- ▁mean
- ▁o
- ▁mus
- ▁lot
- ▁said
- ▁long
- ▁these
- ▁lea
- sh
- ▁vi
- ▁part
- ▁every
- ▁our
- ▁You
- ious
- ▁fight
- ▁Ch
- ark
- ▁may
- ▁Hammer
- ▁because
- ▁most
- ▁came
- ▁four
- ful
- ▁No
- ize
- ▁where
- ▁okay
- ▁much
- ▁ask
- ▁through
- ▁before
- ▁work
- ▁even
- ▁three
- mber
- ▁win
- ▁flight
- ake
- K
- ▁place
- ▁play
- ▁though
- ▁pound
- ▁bit
- land
- ▁va
- ▁talk
- ▁kind
- ▁Line
- ▁make
- hap
- ▁big
- ▁leav
- ▁something
- ▁game
- ▁under
- ▁feel
- self
- ▁give
- ▁includ
- U
- ▁twenty
- ▁guard
- ▁left
- ▁round
- ▁great
- body
- ▁gra
- ress
- lso
- '3'
- ▁everything
- ▁those
- ▁after
- ▁tell
- ▁need
- ▁yes
- qua
- ham
- ▁minutes
- ▁question
- ▁around
- ▁punch
- ▁course
- ▁gonna
- ▁person
- ▁move
- ▁plan
- ▁ear
- ept
- ▁Airport
- ▁Okay
- ▁found
- ▁seven
- ▁help
- que
- ▁qui
- ▁keep
- ▁guys
- ▁house
- ▁run
- ▁turn
- ▁better
- ▁stop
- ward
- ddle
- ▁second
- ground
- ▁world
- ▁high
- ▁point
- ▁hold
- ▁call
- '6'
- ▁actually
- ▁probably
- ▁heaven
- ▁speci
- ▁everyone
- ▁why
- ▁presen
- ▁thir
- lright
- ▁eye
- eath
- ▁Tak
- '!'
- '"'
- '4'
- ▁hundred
- ▁answer
- ▁small
- ▁wait
- ▁nothing
- q
- '8'
- V
- ▁countr
- ▁problem
- ▁continu
- ▁close
- ▁priva
- ▁20
- ▁pleas
- ▁walk
- ▁open
- ▁lay
- ▁Station
- ▁moment
- ▁Yeah
- ▁public
- possibl
- ▁happen
- together
- ▁while
- asically
- ▁money
- ▁wrong
- B
- ▁puzzle
- '7'
- ▁journ
- ▁rainbow
- ▁thousand
- I
- '9'
- S
- P
- '%'
- A
- D
- L
- F
- ’
- O
- G
- N
- á
- C
- $
- Z
- Y
- R
- E
- J
- W
- M
- H
- j
- –
- ;
- Q
- X
- ']'
- −
- '&'
- T
- '['
- <sos/eos>
init: xavier_uniform
model_conf: {}
use_ref_audio: true
use_ref_text: false
use_preprocessor: true
token_type: bpe
bpemodel: data/token_list/bpe_unigram500/bpe.model
non_linguistic_symbols: null
cleaner: null
g2p: null
frontend: s3prl
frontend_conf:
    frontend_conf:
        upstream: wavlm_large
    download_dir: ./hub
    multilayer_feature: true
universa: base
universa_conf:
    embedding_dim: 256
    audio_encoder_type: transformer
    audio_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: conv2d
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    text_encoder_type: transformer
    text_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: linear
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    cross_attention_type: multihead
    cross_attention_params:
        n_head: 4
        dropout_rate: 0.1
    pooling_type: mean
    projector_type: linear
    multi_branch: true
required:
- output_dir
- metric2id
version: '202412'
distributed: false
universa configtext
config: conf/train_universa_wavlm_ref_audio.yaml
print_config: false
log_level: INFO
drop_last_iter: false
dry_run: false
iterator_type: sequence
valid_iterator_type: null
output_dir: update_exp/universa_train_universa_wavlm_ref_audio_raw_fs16000
ngpu: 1
seed: 777
num_workers: 1
num_att_plot: 0
dist_backend: nccl
dist_init_method: env://
dist_world_size: null
dist_rank: null
local_rank: 0
dist_master_addr: null
dist_master_port: null
dist_launcher: null
multiprocessing_distributed: false
unused_parameters: false
sharded_ddp: false
use_deepspeed: false
deepspeed_config: null
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: false
use_tf32: false
collect_stats: false
write_collected_feats: false
max_epoch: 100
patience: null
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
best_model_criterion:
-   - train
    - loss
    - min
-   - valid
    - loss
    - min
-   - train
    - acc
    - max
-   - valid
    - acc
    - max
keep_nbest_models: 5
nbest_averaging_interval: 0
grad_clip: -1
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
log_interval: 50
use_matplotlib: true
use_tensorboard: true
create_graph_in_tensorboard: false
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1
detect_anomaly: false
use_adapter: false
adapter: lora
save_strategy: all
adapter_conf: {}
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param:
- frontend.upstream
num_iters_per_epoch: null
batch_size: 16
valid_batch_size: null
batch_bins: 1000000
valid_batch_bins: null
category_sample_size: 10
train_shape_file:
- update_exp/universa_stats_raw/train/audio_shape
- update_exp/universa_stats_raw/train/ref_audio_shape
- update_exp/universa_stats_raw/train/ref_text_shape
valid_shape_file:
- update_exp/universa_stats_raw/valid/audio_shape
- update_exp/universa_stats_raw/valid/ref_audio_shape
- update_exp/universa_stats_raw/valid/ref_text_shape
batch_type: sorted
valid_batch_type: null
fold_length:
- 256000
sort_in_batch: descending
shuffle_within_batch: false
sort_batch: descending
multiple_iterator: false
chunk_length: 500
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
chunk_excluded_key_prefixes: []
chunk_default_fs: null
chunk_max_abs_length: null
chunk_discard_short_samples: true
train_data_path_and_name_and_type:
-   - dump_ark/raw/train_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/train_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/train_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/train_update/text
    - ref_text
    - text
valid_data_path_and_name_and_type:
-   - dump_ark/raw/dev_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/dev_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/text
    - ref_text
    - text
multi_task_dataset: false
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
allow_multi_rates: false
valid_max_cache_size: null
exclude_weight_decay: false
exclude_weight_decay_conf: {}
optim: adamw
optim_conf:
    lr: 0.001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
metric2id: dump_ark/raw/train_update/metric2id
metric2type: null
metric_pad_value: -100
token_list:
- <blank>
- <unk>
- s
- ▁
- t
- e
- ▁the
- i
- a
- o
- ▁a
- r
- ▁to
- d
- ▁and
- ''''
- m
- n
- ing
- u
- y
- p
- c
- ▁of
- l
- ed
- ▁I
- ▁in
- er
- re
- ▁it
- ▁you
- ar
- ▁f
- ▁is
- ▁that
- ','
- .
- in
- al
- g
- 'on'
- ▁b
- b
- or
- ▁c
- ▁s
- f
- h
- ▁we
- an
- en
- ▁for
- le
- ▁p
- ly
- es
- w
- ▁re
- ▁on
- ▁m
- ▁be
- ic
- ll
- th
- ▁he
- k
- ur
- ve
- ▁with
- ▁so
- ▁from
- ▁was
- v
- ch
- st
- ▁w
- ▁i
- ▁this
- ▁de
- ▁like
- ▁do
- ce
- at
- il
- ck
- ▁A
- ▁have
- ▁not
- ad
- ▁st
- ow
- ro
- ne
- ▁me
- ▁my
- ▁but
- ation
- ▁at
- ▁or
- '-'
- ter
- ent
- ▁B
- ▁n
- ▁know
- ▁t
- out
- ▁are
- nd
- ▁one
- ▁li
- ▁g
- ▁The
- ol
- ion
- te
- ▁go
- ut
- ▁as
- ▁just
- as
- ▁sh
- ▁they
- is
- ▁C
- et
- ▁h
- ▁an
- ▁there
- ▁up
- ▁S
- ▁M
- ▁she
- ▁by
- ▁su
- om
- ▁can
- us
- ▁your
- ng
- ▁con
- el
- ▁us
- ment
- z
- ▁see
- ▁ab
- ▁what
- ▁out
- ▁her
- me
- ate
- ▁all
- ▁th
- ▁if
- ▁right
- ▁his
- ▁ma
- ▁lo
- ▁which
- ide
- ▁P
- ▁more
- ▁then
- ul
- ast
- x
- ight
- ill
- ▁So
- ▁sp
- ▁going
- ▁some
- ure
- ▁their
- ig
- ▁no
- ▁ro
- ▁think
- ▁who
- ▁pro
- ver
- ive
- est
- ▁co
- ▁di
- '0'
- ist
- ▁k
- age
- ▁d
- ▁time
- ▁L
- ies
- ▁will
- ▁man
- ▁when
- ▁D
- les
- ▁F
- ▁want
- ff
- ity
- ▁un
- '?'
- ▁start
- ▁G
- ▁uh
- ▁get
- ok
- ▁take
- ▁po
- li
- ▁ho
- ▁way
- ▁don
- ▁yeah
- ▁really
- ▁say
- ▁look
- ▁good
- ▁ra
- ▁pr
- ▁had
- ttle
- ▁comp
- ort
- ish
- ▁ex
- ally
- ▁sa
- ▁how
- end
- ant
- ▁O
- ▁um
- way
- ance
- ▁other
- ▁two
- ine
- ever
- able
- ▁com
- other
- ▁first
- ▁back
- ▁al
- ers
- ions
- ▁now
- ▁off
- ning
- ▁down
- ▁has
- ▁than
- ▁car
- ▁Th
- very
- ice
- ▁dr
- ▁been
- ▁him
- ▁here
- ated
- '5'
- ▁hand
- ▁day
- ▁hear
- each
- ▁would
- ▁over
- ▁oh
- ▁cha
- ood
- ▁did
- ugh
- ▁per
- ▁let
- ▁str
- ▁tra
- ▁got
- ext
- '1'
- ▁We
- ▁Shields
- ▁come
- ▁should
- ▁could
- light
- '2'
- ▁people
- ▁again
- ▁year
- ▁app
- ▁into
- ▁any
- ▁N
- ▁mean
- ▁o
- ▁mus
- ▁lot
- ▁said
- ▁long
- ▁these
- ▁lea
- sh
- ▁vi
- ▁part
- ▁every
- ▁our
- ▁You
- ious
- ▁fight
- ▁Ch
- ark
- ▁may
- ▁Hammer
- ▁because
- ▁most
- ▁came
- ▁four
- ful
- ▁No
- ize
- ▁where
- ▁okay
- ▁much
- ▁ask
- ▁through
- ▁before
- ▁work
- ▁even
- ▁three
- mber
- ▁win
- ▁flight
- ake
- K
- ▁place
- ▁play
- ▁though
- ▁pound
- ▁bit
- land
- ▁va
- ▁talk
- ▁kind
- ▁Line
- ▁make
- hap
- ▁big
- ▁leav
- ▁something
- ▁game
- ▁under
- ▁feel
- self
- ▁give
- ▁includ
- U
- ▁twenty
- ▁guard
- ▁left
- ▁round
- ▁great
- body
- ▁gra
- ress
- lso
- '3'
- ▁everything
- ▁those
- ▁after
- ▁tell
- ▁need
- ▁yes
- qua
- ham
- ▁minutes
- ▁question
- ▁around
- ▁punch
- ▁course
- ▁gonna
- ▁person
- ▁move
- ▁plan
- ▁ear
- ept
- ▁Airport
- ▁Okay
- ▁found
- ▁seven
- ▁help
- que
- ▁qui
- ▁keep
- ▁guys
- ▁house
- ▁run
- ▁turn
- ▁better
- ▁stop
- ward
- ddle
- ▁second
- ground
- ▁world
- ▁high
- ▁point
- ▁hold
- ▁call
- '6'
- ▁actually
- ▁probably
- ▁heaven
- ▁speci
- ▁everyone
- ▁why
- ▁presen
- ▁thir
- lright
- ▁eye
- eath
- ▁Tak
- '!'
- '"'
- '4'
- ▁hundred
- ▁answer
- ▁small
- ▁wait
- ▁nothing
- q
- '8'
- V
- ▁countr
- ▁problem
- ▁continu
- ▁close
- ▁priva
- ▁20
- ▁pleas
- ▁walk
- ▁open
- ▁lay
- ▁Station
- ▁moment
- ▁Yeah
- ▁public
- possibl
- ▁happen
- together
- ▁while
- asically
- ▁money
- ▁wrong
- B
- ▁puzzle
- '7'
- ▁journ
- ▁rainbow
- ▁thousand
- I
- '9'
- S
- P
- '%'
- A
- D
- L
- F
- ’
- O
- G
- N
- á
- C
- $
- Z
- Y
- R
- E
- J
- W
- M
- H
- j
- –
- ;
- Q
- X
- ']'
- −
- '&'
- T
- '['
- <sos/eos>
init: xavier_uniform
model_conf: {}
use_ref_audio: true
use_ref_text: false
use_preprocessor: true
token_type: bpe
bpemodel: data/token_list/bpe_unigram500/bpe.model
non_linguistic_symbols: null
cleaner: null
g2p: null
frontend: s3prl
frontend_conf:
    frontend_conf:
        upstream: wavlm_large
    download_dir: ./hub
    multilayer_feature: true
universa: base
universa_conf:
    embedding_dim: 256
    audio_encoder_type: transformer
    audio_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: conv2d
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    text_encoder_type: transformer
    text_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: linear
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    cross_attention_type: multihead
    cross_attention_params:
        n_head: 4
        dropout_rate: 0.1
    pooling_type: mean
    projector_type: linear
    multi_branch: true
required:
- output_dir
- metric2id
version: '202412'
distributed: false
universa configtext
config: conf/train_universa_wavlm_ref_audio.yaml
print_config: false
log_level: INFO
drop_last_iter: false
dry_run: false
iterator_type: sequence
valid_iterator_type: null
output_dir: update_exp/universa_train_universa_wavlm_ref_audio_raw_fs16000
ngpu: 1
seed: 777
num_workers: 1
num_att_plot: 0
dist_backend: nccl
dist_init_method: env://
dist_world_size: null
dist_rank: null
local_rank: 0
dist_master_addr: null
dist_master_port: null
dist_launcher: null
multiprocessing_distributed: false
unused_parameters: false
sharded_ddp: false
use_deepspeed: false
deepspeed_config: null
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: false
use_tf32: false
collect_stats: false
write_collected_feats: false
max_epoch: 100
patience: null
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
best_model_criterion:
-   - train
    - loss
    - min
-   - valid
    - loss
    - min
-   - train
    - acc
    - max
-   - valid
    - acc
    - max
keep_nbest_models: 5
nbest_averaging_interval: 0
grad_clip: -1
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
log_interval: 50
use_matplotlib: true
use_tensorboard: true
create_graph_in_tensorboard: false
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1
detect_anomaly: false
use_adapter: false
adapter: lora
save_strategy: all
adapter_conf: {}
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param:
- frontend.upstream
num_iters_per_epoch: null
batch_size: 16
valid_batch_size: null
batch_bins: 1000000
valid_batch_bins: null
category_sample_size: 10
train_shape_file:
- update_exp/universa_stats_raw/train/audio_shape
- update_exp/universa_stats_raw/train/ref_audio_shape
- update_exp/universa_stats_raw/train/ref_text_shape
valid_shape_file:
- update_exp/universa_stats_raw/valid/audio_shape
- update_exp/universa_stats_raw/valid/ref_audio_shape
- update_exp/universa_stats_raw/valid/ref_text_shape
batch_type: sorted
valid_batch_type: null
fold_length:
- 256000
sort_in_batch: descending
shuffle_within_batch: false
sort_batch: descending
multiple_iterator: false
chunk_length: 500
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
chunk_excluded_key_prefixes: []
chunk_default_fs: null
chunk_max_abs_length: null
chunk_discard_short_samples: true
train_data_path_and_name_and_type:
-   - dump_ark/raw/train_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/train_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/train_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/train_update/text
    - ref_text
    - text
valid_data_path_and_name_and_type:
-   - dump_ark/raw/dev_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/dev_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/text
    - ref_text
    - text
multi_task_dataset: false
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
allow_multi_rates: false
valid_max_cache_size: null
exclude_weight_decay: false
exclude_weight_decay_conf: {}
optim: adamw
optim_conf:
    lr: 0.001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
metric2id: dump_ark/raw/train_update/metric2id
metric2type: null
metric_pad_value: -100
token_list:
- <blank>
- <unk>
- s
- ▁
- t
- e
- ▁the
- i
- a
- o
- ▁a
- r
- ▁to
- d
- ▁and
- ''''
- m
- n
- ing
- u
- y
- p
- c
- ▁of
- l
- ed
- ▁I
- ▁in
- er
- re
- ▁it
- ▁you
- ar
- ▁f
- ▁is
- ▁that
- ','
- .
- in
- al
- g
- 'on'
- ▁b
- b
- or
- ▁c
- ▁s
- f
- h
- ▁we
- an
- en
- ▁for
- le
- ▁p
- ly
- es
- w
- ▁re
- ▁on
- ▁m
- ▁be
- ic
- ll
- th
- ▁he
- k
- ur
- ve
- ▁with
- ▁so
- ▁from
- ▁was
- v
- ch
- st
- ▁w
- ▁i
- ▁this
- ▁de
- ▁like
- ▁do
- ce
- at
- il
- ck
- ▁A
- ▁have
- ▁not
- ad
- ▁st
- ow
- ro
- ne
- ▁me
- ▁my
- ▁but
- ation
- ▁at
- ▁or
- '-'
- ter
- ent
- ▁B
- ▁n
- ▁know
- ▁t
- out
- ▁are
- nd
- ▁one
- ▁li
- ▁g
- ▁The
- ol
- ion
- te
- ▁go
- ut
- ▁as
- ▁just
- as
- ▁sh
- ▁they
- is
- ▁C
- et
- ▁h
- ▁an
- ▁there
- ▁up
- ▁S
- ▁M
- ▁she
- ▁by
- ▁su
- om
- ▁can
- us
- ▁your
- ng
- ▁con
- el
- ▁us
- ment
- z
- ▁see
- ▁ab
- ▁what
- ▁out
- ▁her
- me
- ate
- ▁all
- ▁th
- ▁if
- ▁right
- ▁his
- ▁ma
- ▁lo
- ▁which
- ide
- ▁P
- ▁more
- ▁then
- ul
- ast
- x
- ight
- ill
- ▁So
- ▁sp
- ▁going
- ▁some
- ure
- ▁their
- ig
- ▁no
- ▁ro
- ▁think
- ▁who
- ▁pro
- ver
- ive
- est
- ▁co
- ▁di
- '0'
- ist
- ▁k
- age
- ▁d
- ▁time
- ▁L
- ies
- ▁will
- ▁man
- ▁when
- ▁D
- les
- ▁F
- ▁want
- ff
- ity
- ▁un
- '?'
- ▁start
- ▁G
- ▁uh
- ▁get
- ok
- ▁take
- ▁po
- li
- ▁ho
- ▁way
- ▁don
- ▁yeah
- ▁really
- ▁say
- ▁look
- ▁good
- ▁ra
- ▁pr
- ▁had
- ttle
- ▁comp
- ort
- ish
- ▁ex
- ally
- ▁sa
- ▁how
- end
- ant
- ▁O
- ▁um
- way
- ance
- ▁other
- ▁two
- ine
- ever
- able
- ▁com
- other
- ▁first
- ▁back
- ▁al
- ers
- ions
- ▁now
- ▁off
- ning
- ▁down
- ▁has
- ▁than
- ▁car
- ▁Th
- very
- ice
- ▁dr
- ▁been
- ▁him
- ▁here
- ated
- '5'
- ▁hand
- ▁day
- ▁hear
- each
- ▁would
- ▁over
- ▁oh
- ▁cha
- ood
- ▁did
- ugh
- ▁per
- ▁let
- ▁str
- ▁tra
- ▁got
- ext
- '1'
- ▁We
- ▁Shields
- ▁come
- ▁should
- ▁could
- light
- '2'
- ▁people
- ▁again
- ▁year
- ▁app
- ▁into
- ▁any
- ▁N
- ▁mean
- ▁o
- ▁mus
- ▁lot
- ▁said
- ▁long
- ▁these
- ▁lea
- sh
- ▁vi
- ▁part
- ▁every
- ▁our
- ▁You
- ious
- ▁fight
- ▁Ch
- ark
- ▁may
- ▁Hammer
- ▁because
- ▁most
- ▁came
- ▁four
- ful
- ▁No
- ize
- ▁where
- ▁okay
- ▁much
- ▁ask
- ▁through
- ▁before
- ▁work
- ▁even
- ▁three
- mber
- ▁win
- ▁flight
- ake
- K
- ▁place
- ▁play
- ▁though
- ▁pound
- ▁bit
- land
- ▁va
- ▁talk
- ▁kind
- ▁Line
- ▁make
- hap
- ▁big
- ▁leav
- ▁something
- ▁game
- ▁under
- ▁feel
- self
- ▁give
- ▁includ
- U
- ▁twenty
- ▁guard
- ▁left
- ▁round
- ▁great
- body
- ▁gra
- ress
- lso
- '3'
- ▁everything
- ▁those
- ▁after
- ▁tell
- ▁need
- ▁yes
- qua
- ham
- ▁minutes
- ▁question
- ▁around
- ▁punch
- ▁course
- ▁gonna
- ▁person
- ▁move
- ▁plan
- ▁ear
- ept
- ▁Airport
- ▁Okay
- ▁found
- ▁seven
- ▁help
- que
- ▁qui
- ▁keep
- ▁guys
- ▁house
- ▁run
- ▁turn
- ▁better
- ▁stop
- ward
- ddle
- ▁second
- ground
- ▁world
- ▁high
- ▁point
- ▁hold
- ▁call
- '6'
- ▁actually
- ▁probably
- ▁heaven
- ▁speci
- ▁everyone
- ▁why
- ▁presen
- ▁thir
- lright
- ▁eye
- eath
- ▁Tak
- '!'
- '"'
- '4'
- ▁hundred
- ▁answer
- ▁small
- ▁wait
- ▁nothing
- q
- '8'
- V
- ▁countr
- ▁problem
- ▁continu
- ▁close
- ▁priva
- ▁20
- ▁pleas
- ▁walk
- ▁open
- ▁lay
- ▁Station
- ▁moment
- ▁Yeah
- ▁public
- possibl
- ▁happen
- together
- ▁while
- asically
- ▁money
- ▁wrong
- B
- ▁puzzle
- '7'
- ▁journ
- ▁rainbow
- ▁thousand
- I
- '9'
- S
- P
- '%'
- A
- D
- L
- F
- ’
- O
- G
- N
- á
- C
- $
- Z
- Y
- R
- E
- J
- W
- M
- H
- j
- –
- ;
- Q
- X
- ']'
- −
- '&'
- T
- '['
- <sos/eos>
init: xavier_uniform
model_conf: {}
use_ref_audio: true
use_ref_text: false
use_preprocessor: true
token_type: bpe
bpemodel: data/token_list/bpe_unigram500/bpe.model
non_linguistic_symbols: null
cleaner: null
g2p: null
frontend: s3prl
frontend_conf:
    frontend_conf:
        upstream: wavlm_large
    download_dir: ./hub
    multilayer_feature: true
universa: base
universa_conf:
    embedding_dim: 256
    audio_encoder_type: transformer
    audio_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: conv2d
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    text_encoder_type: transformer
    text_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: linear
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    cross_attention_type: multihead
    cross_attention_params:
        n_head: 4
        dropout_rate: 0.1
    pooling_type: mean
    projector_type: linear
    multi_branch: true
required:
- output_dir
- metric2id
version: '202412'
distributed: false
universa configtext
config: conf/train_universa_wavlm_ref_audio.yaml
print_config: false
log_level: INFO
drop_last_iter: false
dry_run: false
iterator_type: sequence
valid_iterator_type: null
output_dir: update_exp/universa_train_universa_wavlm_ref_audio_raw_fs16000
ngpu: 1
seed: 777
num_workers: 1
num_att_plot: 0
dist_backend: nccl
dist_init_method: env://
dist_world_size: null
dist_rank: null
local_rank: 0
dist_master_addr: null
dist_master_port: null
dist_launcher: null
multiprocessing_distributed: false
unused_parameters: false
sharded_ddp: false
use_deepspeed: false
deepspeed_config: null
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: false
use_tf32: false
collect_stats: false
write_collected_feats: false
max_epoch: 100
patience: null
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
best_model_criterion:
-   - train
    - loss
    - min
-   - valid
    - loss
    - min
-   - train
    - acc
    - max
-   - valid
    - acc
    - max
keep_nbest_models: 5
nbest_averaging_interval: 0
grad_clip: -1
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
log_interval: 50
use_matplotlib: true
use_tensorboard: true
create_graph_in_tensorboard: false
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1
detect_anomaly: false
use_adapter: false
adapter: lora
save_strategy: all
adapter_conf: {}
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param:
- frontend.upstream
num_iters_per_epoch: null
batch_size: 16
valid_batch_size: null
batch_bins: 1000000
valid_batch_bins: null
category_sample_size: 10
train_shape_file:
- update_exp/universa_stats_raw/train/audio_shape
- update_exp/universa_stats_raw/train/ref_audio_shape
- update_exp/universa_stats_raw/train/ref_text_shape
valid_shape_file:
- update_exp/universa_stats_raw/valid/audio_shape
- update_exp/universa_stats_raw/valid/ref_audio_shape
- update_exp/universa_stats_raw/valid/ref_text_shape
batch_type: sorted
valid_batch_type: null
fold_length:
- 256000
sort_in_batch: descending
shuffle_within_batch: false
sort_batch: descending
multiple_iterator: false
chunk_length: 500
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
chunk_excluded_key_prefixes: []
chunk_default_fs: null
chunk_max_abs_length: null
chunk_discard_short_samples: true
train_data_path_and_name_and_type:
-   - dump_ark/raw/train_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/train_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/train_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/train_update/text
    - ref_text
    - text
valid_data_path_and_name_and_type:
-   - dump_ark/raw/dev_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/dev_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/text
    - ref_text
    - text
multi_task_dataset: false
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
allow_multi_rates: false
valid_max_cache_size: null
exclude_weight_decay: false
exclude_weight_decay_conf: {}
optim: adamw
optim_conf:
    lr: 0.001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
metric2id: dump_ark/raw/train_update/metric2id
metric2type: null
metric_pad_value: -100
token_list:
- <blank>
- <unk>
- s
- ▁
- t
- e
- ▁the
- i
- a
- o
- ▁a
- r
- ▁to
- d
- ▁and
- ''''
- m
- n
- ing
- u
- y
- p
- c
- ▁of
- l
- ed
- ▁I
- ▁in
- er
- re
- ▁it
- ▁you
- ar
- ▁f
- ▁is
- ▁that
- ','
- .
- in
- al
- g
- 'on'
- ▁b
- b
- or
- ▁c
- ▁s
- f
- h
- ▁we
- an
- en
- ▁for
- le
- ▁p
- ly
- es
- w
- ▁re
- ▁on
- ▁m
- ▁be
- ic
- ll
- th
- ▁he
- k
- ur
- ve
- ▁with
- ▁so
- ▁from
- ▁was
- v
- ch
- st
- ▁w
- ▁i
- ▁this
- ▁de
- ▁like
- ▁do
- ce
- at
- il
- ck
- ▁A
- ▁have
- ▁not
- ad
- ▁st
- ow
- ro
- ne
- ▁me
- ▁my
- ▁but
- ation
- ▁at
- ▁or
- '-'
- ter
- ent
- ▁B
- ▁n
- ▁know
- ▁t
- out
- ▁are
- nd
- ▁one
- ▁li
- ▁g
- ▁The
- ol
- ion
- te
- ▁go
- ut
- ▁as
- ▁just
- as
- ▁sh
- ▁they
- is
- ▁C
- et
- ▁h
- ▁an
- ▁there
- ▁up
- ▁S
- ▁M
- ▁she
- ▁by
- ▁su
- om
- ▁can
- us
- ▁your
- ng
- ▁con
- el
- ▁us
- ment
- z
- ▁see
- ▁ab
- ▁what
- ▁out
- ▁her
- me
- ate
- ▁all
- ▁th
- ▁if
- ▁right
- ▁his
- ▁ma
- ▁lo
- ▁which
- ide
- ▁P
- ▁more
- ▁then
- ul
- ast
- x
- ight
- ill
- ▁So
- ▁sp
- ▁going
- ▁some
- ure
- ▁their
- ig
- ▁no
- ▁ro
- ▁think
- ▁who
- ▁pro
- ver
- ive
- est
- ▁co
- ▁di
- '0'
- ist
- ▁k
- age
- ▁d
- ▁time
- ▁L
- ies
- ▁will
- ▁man
- ▁when
- ▁D
- les
- ▁F
- ▁want
- ff
- ity
- ▁un
- '?'
- ▁start
- ▁G
- ▁uh
- ▁get
- ok
- ▁take
- ▁po
- li
- ▁ho
- ▁way
- ▁don
- ▁yeah
- ▁really
- ▁say
- ▁look
- ▁good
- ▁ra
- ▁pr
- ▁had
- ttle
- ▁comp
- ort
- ish
- ▁ex
- ally
- ▁sa
- ▁how
- end
- ant
- ▁O
- ▁um
- way
- ance
- ▁other
- ▁two
- ine
- ever
- able
- ▁com
- other
- ▁first
- ▁back
- ▁al
- ers
- ions
- ▁now
- ▁off
- ning
- ▁down
- ▁has
- ▁than
- ▁car
- ▁Th
- very
- ice
- ▁dr
- ▁been
- ▁him
- ▁here
- ated
- '5'
- ▁hand
- ▁day
- ▁hear
- each
- ▁would
- ▁over
- ▁oh
- ▁cha
- ood
- ▁did
- ugh
- ▁per
- ▁let
- ▁str
- ▁tra
- ▁got
- ext
- '1'
- ▁We
- ▁Shields
- ▁come
- ▁should
- ▁could
- light
- '2'
- ▁people
- ▁again
- ▁year
- ▁app
- ▁into
- ▁any
- ▁N
- ▁mean
- ▁o
- ▁mus
- ▁lot
- ▁said
- ▁long
- ▁these
- ▁lea
- sh
- ▁vi
- ▁part
- ▁every
- ▁our
- ▁You
- ious
- ▁fight
- ▁Ch
- ark
- ▁may
- ▁Hammer
- ▁because
- ▁most
- ▁came
- ▁four
- ful
- ▁No
- ize
- ▁where
- ▁okay
- ▁much
- ▁ask
- ▁through
- ▁before
- ▁work
- ▁even
- ▁three
- mber
- ▁win
- ▁flight
- ake
- K
- ▁place
- ▁play
- ▁though
- ▁pound
- ▁bit
- land
- ▁va
- ▁talk
- ▁kind
- ▁Line
- ▁make
- hap
- ▁big
- ▁leav
- ▁something
- ▁game
- ▁under
- ▁feel
- self
- ▁give
- ▁includ
- U
- ▁twenty
- ▁guard
- ▁left
- ▁round
- ▁great
- body
- ▁gra
- ress
- lso
- '3'
- ▁everything
- ▁those
- ▁after
- ▁tell
- ▁need
- ▁yes
- qua
- ham
- ▁minutes
- ▁question
- ▁around
- ▁punch
- ▁course
- ▁gonna
- ▁person
- ▁move
- ▁plan
- ▁ear
- ept
- ▁Airport
- ▁Okay
- ▁found
- ▁seven
- ▁help
- que
- ▁qui
- ▁keep
- ▁guys
- ▁house
- ▁run
- ▁turn
- ▁better
- ▁stop
- ward
- ddle
- ▁second
- ground
- ▁world
- ▁high
- ▁point
- ▁hold
- ▁call
- '6'
- ▁actually
- ▁probably
- ▁heaven
- ▁speci
- ▁everyone
- ▁why
- ▁presen
- ▁thir
- lright
- ▁eye
- eath
- ▁Tak
- '!'
- '"'
- '4'
- ▁hundred
- ▁answer
- ▁small
- ▁wait
- ▁nothing
- q
- '8'
- V
- ▁countr
- ▁problem
- ▁continu
- ▁close
- ▁priva
- ▁20
- ▁pleas
- ▁walk
- ▁open
- ▁lay
- ▁Station
- ▁moment
- ▁Yeah
- ▁public
- possibl
- ▁happen
- together
- ▁while
- asically
- ▁money
- ▁wrong
- B
- ▁puzzle
- '7'
- ▁journ
- ▁rainbow
- ▁thousand
- I
- '9'
- S
- P
- '%'
- A
- D
- L
- F
- ’
- O
- G
- N
- á
- C
- $
- Z
- Y
- R
- E
- J
- W
- M
- H
- j
- –
- ;
- Q
- X
- ']'
- −
- '&'
- T
- '['
- <sos/eos>
init: xavier_uniform
model_conf: {}
use_ref_audio: true
use_ref_text: false
use_preprocessor: true
token_type: bpe
bpemodel: data/token_list/bpe_unigram500/bpe.model
non_linguistic_symbols: null
cleaner: null
g2p: null
frontend: s3prl
frontend_conf:
    frontend_conf:
        upstream: wavlm_large
    download_dir: ./hub
    multilayer_feature: true
universa: base
universa_conf:
    embedding_dim: 256
    audio_encoder_type: transformer
    audio_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: conv2d
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    text_encoder_type: transformer
    text_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: linear
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    cross_attention_type: multihead
    cross_attention_params:
        n_head: 4
        dropout_rate: 0.1
    pooling_type: mean
    projector_type: linear
    multi_branch: true
required:
- output_dir
- metric2id
version: '202412'
distributed: false
universa configtext
# Run-level settings: the `config` key points at the recipe YAML (presumably
# the file this dump was expanded from), followed by logging, the experiment
# output directory, GPU count and random seed.
config: conf/train_universa_wavlm_ref_audio.yaml
print_config: false
log_level: INFO
drop_last_iter: false
dry_run: false
iterator_type: sequence
valid_iterator_type: null
output_dir: update_exp/universa_train_universa_wavlm_ref_audio_raw_fs16000
ngpu: 1
seed: 777
num_workers: 1
num_att_plot: 0
# Distributed-training fields. World size, rank, master address/port and
# launcher are all null and multiprocessing_distributed is false, which is
# consistent with the single-GPU setting (`ngpu: 1`) above.
dist_backend: nccl
dist_init_method: env://
dist_world_size: null
dist_rank: null
local_rank: 0
dist_master_addr: null
dist_master_port: null
dist_launcher: null
multiprocessing_distributed: false
unused_parameters: false
sharded_ddp: false
use_deepspeed: false
deepspeed_config: null
# cuDNN / TF32 backend toggles.
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: false
use_tf32: false
collect_stats: false
write_collected_feats: false
# Training length: at most 100 epochs. `patience: null` presumably disables
# early stopping -- TODO confirm against the trainer.
max_epoch: 100
patience: null
# Selection criteria. Entries read as [phase, metric] for the scheduler and
# [phase, metric, mode] for the others, with mode being min or max.
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
# Four criteria are listed for "best" checkpoints: train/valid x loss/acc.
best_model_criterion:
-   - train
    - loss
    - min
-   - valid
    - loss
    - min
-   - train
    - acc
    - max
-   - valid
    - acc
    - max
# NOTE(review): presumably the number of best checkpoints retained per
# criterion -- confirm.
keep_nbest_models: 5
nbest_averaging_interval: 0
# Gradient handling and numeric precision: float32 training with AMP off.
# NOTE(review): grad_clip: -1 presumably disables gradient clipping and
# grad_clip_type: 2.0 presumably selects the norm order -- confirm.
grad_clip: -1
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
# Logging and experiment tracking: matplotlib and TensorBoard are enabled;
# Weights & Biases is off and every wandb_* identifier is left null.
log_interval: 50
use_matplotlib: true
use_tensorboard: true
create_graph_in_tensorboard: false
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1
detect_anomaly: false
# Adapter fine-tuning is switched off (use_adapter: false), so `adapter: lora`
# and the empty adapter_conf are presumably unused defaults -- confirm.
use_adapter: false
adapter: lora
save_strategy: all
adapter_conf: {}
# Initialisation and freezing: no pretrained checkpoint or init_param entries
# are given, and `frontend.upstream` is the only entry in freeze_param.
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param:
- frontend.upstream
# Batching: batch_type is `sorted` (set below) with batch_size 16.
# NOTE(review): batch_bins presumably applies only to bin-based batch types
# and would be unused here -- confirm.
num_iters_per_epoch: null
batch_size: 16
valid_batch_size: null
batch_bins: 1000000
valid_batch_bins: null
category_sample_size: 10
# Shape files for the audio, ref_audio and ref_text streams (train / valid).
train_shape_file:
- update_exp/universa_stats_raw/train/audio_shape
- update_exp/universa_stats_raw/train/ref_audio_shape
- update_exp/universa_stats_raw/train/ref_text_shape
valid_shape_file:
- update_exp/universa_stats_raw/valid/audio_shape
- update_exp/universa_stats_raw/valid/ref_audio_shape
- update_exp/universa_stats_raw/valid/ref_text_shape
batch_type: sorted
valid_batch_type: null
fold_length:
- 256000
sort_in_batch: descending
shuffle_within_batch: false
sort_batch: descending
multiple_iterator: false
# chunk_* settings. NOTE(review): iterator_type in this config is `sequence`,
# so these presumably only matter for a chunk iterator -- confirm.
chunk_length: 500
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
chunk_excluded_key_prefixes: []
chunk_default_fs: null
chunk_max_abs_length: null
chunk_discard_short_samples: true
# Training data: each entry is a [path, name, type] triple. Four streams are
# declared: audio and ref_audio (kaldi_ark), metrics (metric), ref_text (text).
# NOTE(review): ref_text is declared here although use_ref_text is false
# further down this config -- confirm whether the text stream is consumed.
train_data_path_and_name_and_type:
-   - dump_ark/raw/train_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/train_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/train_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/train_update/text
    - ref_text
    - text
# Validation data mirrors the training entries, reading from dev_update
# instead of train_update.
valid_data_path_and_name_and_type:
-   - dump_ark/raw/dev_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/dev_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/text
    - ref_text
    - text
multi_task_dataset: false
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
allow_multi_rates: false
valid_max_cache_size: null
exclude_weight_decay: false
exclude_weight_decay_conf: {}
# Optimizer and LR schedule: adamw with lr 0.001 and a warmuplr scheduler
# configured with 25000 warm-up steps.
optim: adamw
optim_conf:
    lr: 0.001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
# Metric bookkeeping: metric2id is a path under the training dump directory.
# NOTE(review): metric_pad_value -100 is presumably the fill value for
# missing metrics -- confirm.
metric2id: dump_ark/raw/train_update/metric2id
metric2type: null
metric_pad_value: -100
token_list:
- <blank>
- <unk>
- s
- ▁
- t
- e
- ▁the
- i
- a
- o
- ▁a
- r
- ▁to
- d
- ▁and
- ''''
- m
# 'n' and 'y' are quoted because plain n/y resolve to booleans under YAML 1.1
# rules (the same reason 'on' is quoted further down this list).
- 'n'
- ing
- u
- 'y'
- p
- c
- ▁of
- l
- ed
- ▁I
- ▁in
- er
- re
- ▁it
- ▁you
- ar
- ▁f
- ▁is
- ▁that
- ','
- .
- in
- al
- g
- 'on'
- ▁b
- b
- or
- ▁c
- ▁s
- f
- h
- ▁we
- an
- en
- ▁for
- le
- ▁p
- ly
- es
- w
- ▁re
- ▁on
- ▁m
- ▁be
- ic
- ll
- th
- ▁he
- k
- ur
- ve
- ▁with
- ▁so
- ▁from
- ▁was
- v
- ch
- st
- ▁w
- ▁i
- ▁this
- ▁de
- ▁like
- ▁do
- ce
- at
- il
- ck
- ▁A
- ▁have
- ▁not
- ad
- ▁st
- ow
- ro
- ne
- ▁me
- ▁my
- ▁but
- ation
- ▁at
- ▁or
- '-'
- ter
- ent
- ▁B
- ▁n
- ▁know
- ▁t
- out
- ▁are
- nd
- ▁one
- ▁li
- ▁g
- ▁The
- ol
- ion
- te
- ▁go
- ut
- ▁as
- ▁just
- as
- ▁sh
- ▁they
- is
- ▁C
- et
- ▁h
- ▁an
- ▁there
- ▁up
- ▁S
- ▁M
- ▁she
- ▁by
- ▁su
- om
- ▁can
- us
- ▁your
- ng
- ▁con
- el
- ▁us
- ment
- z
- ▁see
- ▁ab
- ▁what
- ▁out
- ▁her
- me
- ate
- ▁all
- ▁th
- ▁if
- ▁right
- ▁his
- ▁ma
- ▁lo
- ▁which
- ide
- ▁P
- ▁more
- ▁then
- ul
- ast
- x
- ight
- ill
- ▁So
- ▁sp
- ▁going
- ▁some
- ure
- ▁their
- ig
- ▁no
- ▁ro
- ▁think
- ▁who
- ▁pro
- ver
- ive
- est
- ▁co
- ▁di
- '0'
- ist
- ▁k
- age
- ▁d
- ▁time
- ▁L
- ies
- ▁will
- ▁man
- ▁when
- ▁D
- les
- ▁F
- ▁want
- ff
- ity
- ▁un
- '?'
- ▁start
- ▁G
- ▁uh
- ▁get
- ok
- ▁take
- ▁po
- li
- ▁ho
- ▁way
- ▁don
- ▁yeah
- ▁really
- ▁say
- ▁look
- ▁good
- ▁ra
- ▁pr
- ▁had
- ttle
- ▁comp
- ort
- ish
- ▁ex
- ally
- ▁sa
- ▁how
- end
- ant
- ▁O
- ▁um
- way
- ance
- ▁other
- ▁two
- ine
- ever
- able
- ▁com
- other
- ▁first
- ▁back
- ▁al
- ers
- ions
- ▁now
- ▁off
- ning
- ▁down
- ▁has
- ▁than
- ▁car
- ▁Th
- very
- ice
- ▁dr
- ▁been
- ▁him
- ▁here
- ated
- '5'
- ▁hand
- ▁day
- ▁hear
- each
- ▁would
- ▁over
- ▁oh
- ▁cha
- ood
- ▁did
- ugh
- ▁per
- ▁let
- ▁str
- ▁tra
- ▁got
- ext
- '1'
- ▁We
- ▁Shields
- ▁come
- ▁should
- ▁could
- light
- '2'
- ▁people
- ▁again
- ▁year
- ▁app
- ▁into
- ▁any
- ▁N
- ▁mean
- ▁o
- ▁mus
- ▁lot
- ▁said
- ▁long
- ▁these
- ▁lea
- sh
- ▁vi
- ▁part
- ▁every
- ▁our
- ▁You
- ious
- ▁fight
- ▁Ch
- ark
- ▁may
- ▁Hammer
- ▁because
- ▁most
- ▁came
- ▁four
- ful
- ▁No
- ize
- ▁where
- ▁okay
- ▁much
- ▁ask
- ▁through
- ▁before
- ▁work
- ▁even
- ▁three
- mber
- ▁win
- ▁flight
- ake
- K
- ▁place
- ▁play
- ▁though
- ▁pound
- ▁bit
- land
- ▁va
- ▁talk
- ▁kind
- ▁Line
- ▁make
- hap
- ▁big
- ▁leav
- ▁something
- ▁game
- ▁under
- ▁feel
- self
- ▁give
- ▁includ
- U
- ▁twenty
- ▁guard
- ▁left
- ▁round
- ▁great
- body
- ▁gra
- ress
- lso
- '3'
- ▁everything
- ▁those
- ▁after
- ▁tell
- ▁need
- ▁yes
- qua
- ham
- ▁minutes
- ▁question
- ▁around
- ▁punch
- ▁course
- ▁gonna
- ▁person
- ▁move
- ▁plan
- ▁ear
- ept
- ▁Airport
- ▁Okay
- ▁found
- ▁seven
- ▁help
- que
- ▁qui
- ▁keep
- ▁guys
- ▁house
- ▁run
- ▁turn
- ▁better
- ▁stop
- ward
- ddle
- ▁second
- ground
- ▁world
- ▁high
- ▁point
- ▁hold
- ▁call
- '6'
- ▁actually
- ▁probably
- ▁heaven
- ▁speci
- ▁everyone
- ▁why
- ▁presen
- ▁thir
- lright
- ▁eye
- eath
- ▁Tak
- '!'
- '"'
- '4'
- ▁hundred
- ▁answer
- ▁small
- ▁wait
- ▁nothing
- q
- '8'
- V
- ▁countr
- ▁problem
- ▁continu
- ▁close
- ▁priva
- ▁20
- ▁pleas
- ▁walk
- ▁open
- ▁lay
- ▁Station
- ▁moment
- ▁Yeah
- ▁public
- possibl
- ▁happen
- together
- ▁while
- asically
- ▁money
- ▁wrong
- B
- ▁puzzle
- '7'
- ▁journ
- ▁rainbow
- ▁thousand
- I
- '9'
- S
- P
- '%'
- A
- D
- L
- F
- ’
- O
- G
# 'N' and 'Y' are quoted because plain N/Y resolve to booleans under YAML 1.1
# rules; the token strings themselves are unchanged.
- 'N'
- á
- C
- $
- Z
- 'Y'
- R
- E
- J
- W
- M
- H
- j
- –
- ;
- Q
- X
- ']'
- −
- '&'
- T
- '['
- <sos/eos>
# Weight initialisation and reference inputs: reference audio is enabled,
# reference text is disabled.
init: xavier_uniform
model_conf: {}
use_ref_audio: true
use_ref_text: false
# Text preprocessing: BPE tokens from the bpe_unigram500 model; no cleaner,
# g2p or non-linguistic symbol list is configured.
use_preprocessor: true
token_type: bpe
bpemodel: data/token_list/bpe_unigram500/bpe.model
non_linguistic_symbols: null
cleaner: null
g2p: null
# Frontend: s3prl with multilayer_feature enabled.
# NOTE(review): upstream is `wavlm_large`, while the model name in the page
# title says wavlm_base -- confirm which one is accurate.
frontend: s3prl
frontend_conf:
    frontend_conf:
        upstream: wavlm_large
    download_dir: ./hub
    multilayer_feature: true
# Model choice and hyper-parameters. `universa: base` presumably selects the
# model variant that universa_conf configures -- confirm.
universa: base
universa_conf:
    embedding_dim: 256
    # Audio and text encoders share every setting below except input_layer
    # (conv2d for audio, linear for text).
    audio_encoder_type: transformer
    audio_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: conv2d
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    # NOTE(review): use_ref_text is false in this config, so the text encoder
    # settings are presumably inactive -- confirm.
    text_encoder_type: transformer
    text_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: linear
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    # Multi-head cross-attention with 4 heads, mean pooling, a linear
    # projector, and multi_branch enabled.
    cross_attention_type: multihead
    cross_attention_params:
        n_head: 4
        dropout_rate: 0.1
    pooling_type: mean
    projector_type: linear
    multi_branch: true
# Keys listed as required; both output_dir and metric2id are set earlier in
# this config.
required:
- output_dir
- metric2id
# The version is quoted so it stays a string rather than an integer.
version: '202412'
distributed: false
universa config (text)
# Run-level settings: the `config` key points at the recipe YAML (presumably
# the file this dump was expanded from), followed by logging, the experiment
# output directory, GPU count and random seed.
config: conf/train_universa_wavlm_ref_audio.yaml
print_config: false
log_level: INFO
drop_last_iter: false
dry_run: false
iterator_type: sequence
valid_iterator_type: null
output_dir: update_exp/universa_train_universa_wavlm_ref_audio_raw_fs16000
ngpu: 1
seed: 777
num_workers: 1
num_att_plot: 0
# Distributed-training fields. World size, rank, master address/port and
# launcher are all null and multiprocessing_distributed is false, which is
# consistent with the single-GPU setting (`ngpu: 1`) above.
dist_backend: nccl
dist_init_method: env://
dist_world_size: null
dist_rank: null
local_rank: 0
dist_master_addr: null
dist_master_port: null
dist_launcher: null
multiprocessing_distributed: false
unused_parameters: false
sharded_ddp: false
use_deepspeed: false
deepspeed_config: null
# cuDNN / TF32 backend toggles.
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: false
use_tf32: false
collect_stats: false
write_collected_feats: false
# Training length: at most 100 epochs. `patience: null` presumably disables
# early stopping -- TODO confirm against the trainer.
max_epoch: 100
patience: null
# Selection criteria. Entries read as [phase, metric] for the scheduler and
# [phase, metric, mode] for the others, with mode being min or max.
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
# Four criteria are listed for "best" checkpoints: train/valid x loss/acc.
best_model_criterion:
-   - train
    - loss
    - min
-   - valid
    - loss
    - min
-   - train
    - acc
    - max
-   - valid
    - acc
    - max
# NOTE(review): presumably the number of best checkpoints retained per
# criterion -- confirm.
keep_nbest_models: 5
nbest_averaging_interval: 0
# Gradient handling and numeric precision: float32 training with AMP off.
# NOTE(review): grad_clip: -1 presumably disables gradient clipping and
# grad_clip_type: 2.0 presumably selects the norm order -- confirm.
grad_clip: -1
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
# Logging and experiment tracking: matplotlib and TensorBoard are enabled;
# Weights & Biases is off and every wandb_* identifier is left null.
log_interval: 50
use_matplotlib: true
use_tensorboard: true
create_graph_in_tensorboard: false
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1
detect_anomaly: false
# Adapter fine-tuning is switched off (use_adapter: false), so `adapter: lora`
# and the empty adapter_conf are presumably unused defaults -- confirm.
use_adapter: false
adapter: lora
save_strategy: all
adapter_conf: {}
# Initialisation and freezing: no pretrained checkpoint or init_param entries
# are given, and `frontend.upstream` is the only entry in freeze_param.
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param:
- frontend.upstream
# Batching: batch_type is `sorted` (set below) with batch_size 16.
# NOTE(review): batch_bins presumably applies only to bin-based batch types
# and would be unused here -- confirm.
num_iters_per_epoch: null
batch_size: 16
valid_batch_size: null
batch_bins: 1000000
valid_batch_bins: null
category_sample_size: 10
# Shape files for the audio, ref_audio and ref_text streams (train / valid).
train_shape_file:
- update_exp/universa_stats_raw/train/audio_shape
- update_exp/universa_stats_raw/train/ref_audio_shape
- update_exp/universa_stats_raw/train/ref_text_shape
valid_shape_file:
- update_exp/universa_stats_raw/valid/audio_shape
- update_exp/universa_stats_raw/valid/ref_audio_shape
- update_exp/universa_stats_raw/valid/ref_text_shape
batch_type: sorted
valid_batch_type: null
fold_length:
- 256000
sort_in_batch: descending
shuffle_within_batch: false
sort_batch: descending
multiple_iterator: false
# chunk_* settings. NOTE(review): iterator_type in this config is `sequence`,
# so these presumably only matter for a chunk iterator -- confirm.
chunk_length: 500
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
chunk_excluded_key_prefixes: []
chunk_default_fs: null
chunk_max_abs_length: null
chunk_discard_short_samples: true
# Training data: each entry is a [path, name, type] triple. Four streams are
# declared: audio and ref_audio (kaldi_ark), metrics (metric), ref_text (text).
# NOTE(review): ref_text is declared here although use_ref_text is false
# further down this config -- confirm whether the text stream is consumed.
train_data_path_and_name_and_type:
-   - dump_ark/raw/train_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/train_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/train_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/train_update/text
    - ref_text
    - text
# Validation data mirrors the training entries, reading from dev_update
# instead of train_update.
valid_data_path_and_name_and_type:
-   - dump_ark/raw/dev_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/dev_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/text
    - ref_text
    - text
multi_task_dataset: false
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
allow_multi_rates: false
valid_max_cache_size: null
exclude_weight_decay: false
exclude_weight_decay_conf: {}
# Optimizer and LR schedule: adamw with lr 0.001 and a warmuplr scheduler
# configured with 25000 warm-up steps.
optim: adamw
optim_conf:
    lr: 0.001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
# Metric bookkeeping: metric2id is a path under the training dump directory.
# NOTE(review): metric_pad_value -100 is presumably the fill value for
# missing metrics -- confirm.
metric2id: dump_ark/raw/train_update/metric2id
metric2type: null
metric_pad_value: -100
token_list:
- <blank>
- <unk>
- s
- ▁
- t
- e
- ▁the
- i
- a
- o
- ▁a
- r
- ▁to
- d
- ▁and
- ''''
- m
# 'n' and 'y' are quoted because plain n/y resolve to booleans under YAML 1.1
# rules (the same reason 'on' is quoted further down this list).
- 'n'
- ing
- u
- 'y'
- p
- c
- ▁of
- l
- ed
- ▁I
- ▁in
- er
- re
- ▁it
- ▁you
- ar
- ▁f
- ▁is
- ▁that
- ','
- .
- in
- al
- g
- 'on'
- ▁b
- b
- or
- ▁c
- ▁s
- f
- h
- ▁we
- an
- en
- ▁for
- le
- ▁p
- ly
- es
- w
- ▁re
- ▁on
- ▁m
- ▁be
- ic
- ll
- th
- ▁he
- k
- ur
- ve
- ▁with
- ▁so
- ▁from
- ▁was
- v
- ch
- st
- ▁w
- ▁i
- ▁this
- ▁de
- ▁like
- ▁do
- ce
- at
- il
- ck
- ▁A
- ▁have
- ▁not
- ad
- ▁st
- ow
- ro
- ne
- ▁me
- ▁my
- ▁but
- ation
- ▁at
- ▁or
- '-'
- ter
- ent
- ▁B
- ▁n
- ▁know
- ▁t
- out
- ▁are
- nd
- ▁one
- ▁li
- ▁g
- ▁The
- ol
- ion
- te
- ▁go
- ut
- ▁as
- ▁just
- as
- ▁sh
- ▁they
- is
- ▁C
- et
- ▁h
- ▁an
- ▁there
- ▁up
- ▁S
- ▁M
- ▁she
- ▁by
- ▁su
- om
- ▁can
- us
- ▁your
- ng
- ▁con
- el
- ▁us
- ment
- z
- ▁see
- ▁ab
- ▁what
- ▁out
- ▁her
- me
- ate
- ▁all
- ▁th
- ▁if
- ▁right
- ▁his
- ▁ma
- ▁lo
- ▁which
- ide
- ▁P
- ▁more
- ▁then
- ul
- ast
- x
- ight
- ill
- ▁So
- ▁sp
- ▁going
- ▁some
- ure
- ▁their
- ig
- ▁no
- ▁ro
- ▁think
- ▁who
- ▁pro
- ver
- ive
- est
- ▁co
- ▁di
- '0'
- ist
- ▁k
- age
- ▁d
- ▁time
- ▁L
- ies
- ▁will
- ▁man
- ▁when
- ▁D
- les
- ▁F
- ▁want
- ff
- ity
- ▁un
- '?'
- ▁start
- ▁G
- ▁uh
- ▁get
- ok
- ▁take
- ▁po
- li
- ▁ho
- ▁way
- ▁don
- ▁yeah
- ▁really
- ▁say
- ▁look
- ▁good
- ▁ra
- ▁pr
- ▁had
- ttle
- ▁comp
- ort
- ish
- ▁ex
- ally
- ▁sa
- ▁how
- end
- ant
- ▁O
- ▁um
- way
- ance
- ▁other
- ▁two
- ine
- ever
- able
- ▁com
- other
- ▁first
- ▁back
- ▁al
- ers
- ions
- ▁now
- ▁off
- ning
- ▁down
- ▁has
- ▁than
- ▁car
- ▁Th
- very
- ice
- ▁dr
- ▁been
- ▁him
- ▁here
- ated
- '5'
- ▁hand
- ▁day
- ▁hear
- each
- ▁would
- ▁over
- ▁oh
- ▁cha
- ood
- ▁did
- ugh
- ▁per
- ▁let
- ▁str
- ▁tra
- ▁got
- ext
- '1'
- ▁We
- ▁Shields
- ▁come
- ▁should
- ▁could
- light
- '2'
- ▁people
- ▁again
- ▁year
- ▁app
- ▁into
- ▁any
- ▁N
- ▁mean
- ▁o
- ▁mus
- ▁lot
- ▁said
- ▁long
- ▁these
- ▁lea
- sh
- ▁vi
- ▁part
- ▁every
- ▁our
- ▁You
- ious
- ▁fight
- ▁Ch
- ark
- ▁may
- ▁Hammer
- ▁because
- ▁most
- ▁came
- ▁four
- ful
- ▁No
- ize
- ▁where
- ▁okay
- ▁much
- ▁ask
- ▁through
- ▁before
- ▁work
- ▁even
- ▁three
- mber
- ▁win
- ▁flight
- ake
- K
- ▁place
- ▁play
- ▁though
- ▁pound
- ▁bit
- land
- ▁va
- ▁talk
- ▁kind
- ▁Line
- ▁make
- hap
- ▁big
- ▁leav
- ▁something
- ▁game
- ▁under
- ▁feel
- self
- ▁give
- ▁includ
- U
- ▁twenty
- ▁guard
- ▁left
- ▁round
- ▁great
- body
- ▁gra
- ress
- lso
- '3'
- ▁everything
- ▁those
- ▁after
- ▁tell
- ▁need
- ▁yes
- qua
- ham
- ▁minutes
- ▁question
- ▁around
- ▁punch
- ▁course
- ▁gonna
- ▁person
- ▁move
- ▁plan
- ▁ear
- ept
- ▁Airport
- ▁Okay
- ▁found
- ▁seven
- ▁help
- que
- ▁qui
- ▁keep
- ▁guys
- ▁house
- ▁run
- ▁turn
- ▁better
- ▁stop
- ward
- ddle
- ▁second
- ground
- ▁world
- ▁high
- ▁point
- ▁hold
- ▁call
- '6'
- ▁actually
- ▁probably
- ▁heaven
- ▁speci
- ▁everyone
- ▁why
- ▁presen
- ▁thir
- lright
- ▁eye
- eath
- ▁Tak
- '!'
- '"'
- '4'
- ▁hundred
- ▁answer
- ▁small
- ▁wait
- ▁nothing
- q
- '8'
- V
- ▁countr
- ▁problem
- ▁continu
- ▁close
- ▁priva
- ▁20
- ▁pleas
- ▁walk
- ▁open
- ▁lay
- ▁Station
- ▁moment
- ▁Yeah
- ▁public
- possibl
- ▁happen
- together
- ▁while
- asically
- ▁money
- ▁wrong
- B
- ▁puzzle
- '7'
- ▁journ
- ▁rainbow
- ▁thousand
- I
- '9'
- S
- P
- '%'
- A
- D
- L
- F
- ’
- O
- G
# 'N' and 'Y' are quoted because plain N/Y resolve to booleans under YAML 1.1
# rules; the token strings themselves are unchanged.
- 'N'
- á
- C
- $
- Z
- 'Y'
- R
- E
- J
- W
- M
- H
- j
- –
- ;
- Q
- X
- ']'
- −
- '&'
- T
- '['
- <sos/eos>
# Weight initialisation and reference inputs: reference audio is enabled,
# reference text is disabled.
init: xavier_uniform
model_conf: {}
use_ref_audio: true
use_ref_text: false
# Text preprocessing: BPE tokens from the bpe_unigram500 model; no cleaner,
# g2p or non-linguistic symbol list is configured.
use_preprocessor: true
token_type: bpe
bpemodel: data/token_list/bpe_unigram500/bpe.model
non_linguistic_symbols: null
cleaner: null
g2p: null
# Frontend: s3prl with multilayer_feature enabled.
# NOTE(review): upstream is `wavlm_large`, while the model name in the page
# title says wavlm_base -- confirm which one is accurate.
frontend: s3prl
frontend_conf:
    frontend_conf:
        upstream: wavlm_large
    download_dir: ./hub
    multilayer_feature: true
# Model choice and hyper-parameters. `universa: base` presumably selects the
# model variant that universa_conf configures -- confirm.
universa: base
universa_conf:
    embedding_dim: 256
    # Audio and text encoders share every setting below except input_layer
    # (conv2d for audio, linear for text).
    audio_encoder_type: transformer
    audio_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: conv2d
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    # NOTE(review): use_ref_text is false in this config, so the text encoder
    # settings are presumably inactive -- confirm.
    text_encoder_type: transformer
    text_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: linear
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    # Multi-head cross-attention with 4 heads, mean pooling, a linear
    # projector, and multi_branch enabled.
    cross_attention_type: multihead
    cross_attention_params:
        n_head: 4
        dropout_rate: 0.1
    pooling_type: mean
    projector_type: linear
    multi_branch: true
# Keys listed as required; both output_dir and metric2id are set earlier in
# this config.
required:
- output_dir
- metric2id
# The version is quoted so it stays a string rather than an integer.
version: '202412'
distributed: false
universa config (text)
# Run-level settings: the `config` key points at the recipe YAML (presumably
# the file this dump was expanded from), followed by logging, the experiment
# output directory, GPU count and random seed.
config: conf/train_universa_wavlm_ref_audio.yaml
print_config: false
log_level: INFO
drop_last_iter: false
dry_run: false
iterator_type: sequence
valid_iterator_type: null
output_dir: update_exp/universa_train_universa_wavlm_ref_audio_raw_fs16000
ngpu: 1
seed: 777
num_workers: 1
num_att_plot: 0
# Distributed-training fields. World size, rank, master address/port and
# launcher are all null and multiprocessing_distributed is false, which is
# consistent with the single-GPU setting (`ngpu: 1`) above.
dist_backend: nccl
dist_init_method: env://
dist_world_size: null
dist_rank: null
local_rank: 0
dist_master_addr: null
dist_master_port: null
dist_launcher: null
multiprocessing_distributed: false
unused_parameters: false
sharded_ddp: false
use_deepspeed: false
deepspeed_config: null
# cuDNN / TF32 backend toggles.
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: false
use_tf32: false
collect_stats: false
write_collected_feats: false
# Training length: at most 100 epochs. `patience: null` presumably disables
# early stopping -- TODO confirm against the trainer.
max_epoch: 100
patience: null
# Selection criteria. Entries read as [phase, metric] for the scheduler and
# [phase, metric, mode] for the others, with mode being min or max.
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
# Four criteria are listed for "best" checkpoints: train/valid x loss/acc.
best_model_criterion:
-   - train
    - loss
    - min
-   - valid
    - loss
    - min
-   - train
    - acc
    - max
-   - valid
    - acc
    - max
# NOTE(review): presumably the number of best checkpoints retained per
# criterion -- confirm.
keep_nbest_models: 5
nbest_averaging_interval: 0
# Gradient handling and numeric precision: float32 training with AMP off.
# NOTE(review): grad_clip: -1 presumably disables gradient clipping and
# grad_clip_type: 2.0 presumably selects the norm order -- confirm.
grad_clip: -1
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
# Logging and experiment tracking: matplotlib and TensorBoard are enabled;
# Weights & Biases is off and every wandb_* identifier is left null.
log_interval: 50
use_matplotlib: true
use_tensorboard: true
create_graph_in_tensorboard: false
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1
detect_anomaly: false
# Adapter fine-tuning is switched off (use_adapter: false), so `adapter: lora`
# and the empty adapter_conf are presumably unused defaults -- confirm.
use_adapter: false
adapter: lora
save_strategy: all
adapter_conf: {}
# Initialisation and freezing: no pretrained checkpoint or init_param entries
# are given, and `frontend.upstream` is the only entry in freeze_param.
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param:
- frontend.upstream
# Batching: batch_type is `sorted` (set below) with batch_size 16.
# NOTE(review): batch_bins presumably applies only to bin-based batch types
# and would be unused here -- confirm.
num_iters_per_epoch: null
batch_size: 16
valid_batch_size: null
batch_bins: 1000000
valid_batch_bins: null
category_sample_size: 10
# Shape files for the audio, ref_audio and ref_text streams (train / valid).
train_shape_file:
- update_exp/universa_stats_raw/train/audio_shape
- update_exp/universa_stats_raw/train/ref_audio_shape
- update_exp/universa_stats_raw/train/ref_text_shape
valid_shape_file:
- update_exp/universa_stats_raw/valid/audio_shape
- update_exp/universa_stats_raw/valid/ref_audio_shape
- update_exp/universa_stats_raw/valid/ref_text_shape
batch_type: sorted
valid_batch_type: null
fold_length:
- 256000
sort_in_batch: descending
shuffle_within_batch: false
sort_batch: descending
multiple_iterator: false
# chunk_* settings. NOTE(review): iterator_type in this config is `sequence`,
# so these presumably only matter for a chunk iterator -- confirm.
chunk_length: 500
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
chunk_excluded_key_prefixes: []
chunk_default_fs: null
chunk_max_abs_length: null
chunk_discard_short_samples: true
# Training data: each entry is a [path, name, type] triple. Four streams are
# declared: audio and ref_audio (kaldi_ark), metrics (metric), ref_text (text).
# NOTE(review): ref_text is declared here although use_ref_text is false
# further down this config -- confirm whether the text stream is consumed.
train_data_path_and_name_and_type:
-   - dump_ark/raw/train_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/train_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/train_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/train_update/text
    - ref_text
    - text
# Validation data mirrors the training entries, reading from dev_update
# instead of train_update.
valid_data_path_and_name_and_type:
-   - dump_ark/raw/dev_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/dev_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/text
    - ref_text
    - text
multi_task_dataset: false
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
allow_multi_rates: false
valid_max_cache_size: null
exclude_weight_decay: false
exclude_weight_decay_conf: {}
# Optimizer and LR schedule: adamw with lr 0.001 and a warmuplr scheduler
# configured with 25000 warm-up steps.
optim: adamw
optim_conf:
    lr: 0.001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
# Metric bookkeeping: metric2id is a path under the training dump directory.
# NOTE(review): metric_pad_value -100 is presumably the fill value for
# missing metrics -- confirm.
metric2id: dump_ark/raw/train_update/metric2id
metric2type: null
metric_pad_value: -100
token_list:
- <blank>
- <unk>
- s
- ▁
- t
- e
- ▁the
- i
- a
- o
- ▁a
- r
- ▁to
- d
- ▁and
- ''''
- m
# 'n' and 'y' are quoted because plain n/y resolve to booleans under YAML 1.1
# rules (the same reason 'on' is quoted further down this list).
- 'n'
- ing
- u
- 'y'
- p
- c
- ▁of
- l
- ed
- ▁I
- ▁in
- er
- re
- ▁it
- ▁you
- ar
- ▁f
- ▁is
- ▁that
- ','
- .
- in
- al
- g
- 'on'
- ▁b
- b
- or
- ▁c
- ▁s
- f
- h
- ▁we
- an
- en
- ▁for
- le
- ▁p
- ly
- es
- w
- ▁re
- ▁on
- ▁m
- ▁be
- ic
- ll
- th
- ▁he
- k
- ur
- ve
- ▁with
- ▁so
- ▁from
- ▁was
- v
- ch
- st
- ▁w
- ▁i
- ▁this
- ▁de
- ▁like
- ▁do
- ce
- at
- il
- ck
- ▁A
- ▁have
- ▁not
- ad
- ▁st
- ow
- ro
- ne
- ▁me
- ▁my
- ▁but
- ation
- ▁at
- ▁or
- '-'
- ter
- ent
- ▁B
- ▁n
- ▁know
- ▁t
- out
- ▁are
- nd
- ▁one
- ▁li
- ▁g
- ▁The
- ol
- ion
- te
- ▁go
- ut
- ▁as
- ▁just
- as
- ▁sh
- ▁they
- is
- ▁C
- et
- ▁h
- ▁an
- ▁there
- ▁up
- ▁S
- ▁M
- ▁she
- ▁by
- ▁su
- om
- ▁can
- us
- ▁your
- ng
- ▁con
- el
- ▁us
- ment
- z
- ▁see
- ▁ab
- ▁what
- ▁out
- ▁her
- me
- ate
- ▁all
- ▁th
- ▁if
- ▁right
- ▁his
- ▁ma
- ▁lo
- ▁which
- ide
- ▁P
- ▁more
- ▁then
- ul
- ast
- x
- ight
- ill
- ▁So
- ▁sp
- ▁going
- ▁some
- ure
- ▁their
- ig
- ▁no
- ▁ro
- ▁think
- ▁who
- ▁pro
- ver
- ive
- est
- ▁co
- ▁di
- '0'
- ist
- ▁k
- age
- ▁d
- ▁time
- ▁L
- ies
- ▁will
- ▁man
- ▁when
- ▁D
- les
- ▁F
- ▁want
- ff
- ity
- ▁un
- '?'
- ▁start
- ▁G
- ▁uh
- ▁get
- ok
- ▁take
- ▁po
- li
- ▁ho
- ▁way
- ▁don
- ▁yeah
- ▁really
- ▁say
- ▁look
- ▁good
- ▁ra
- ▁pr
- ▁had
- ttle
- ▁comp
- ort
- ish
- ▁ex
- ally
- ▁sa
- ▁how
- end
- ant
- ▁O
- ▁um
- way
- ance
- ▁other
- ▁two
- ine
- ever
- able
- ▁com
- other
- ▁first
- ▁back
- ▁al
- ers
- ions
- ▁now
- ▁off
- ning
- ▁down
- ▁has
- ▁than
- ▁car
- ▁Th
- very
- ice
- ▁dr
- ▁been
- ▁him
- ▁here
- ated
- '5'
- ▁hand
- ▁day
- ▁hear
- each
- ▁would
- ▁over
- ▁oh
- ▁cha
- ood
- ▁did
- ugh
- ▁per
- ▁let
- ▁str
- ▁tra
- ▁got
- ext
- '1'
- ▁We
- ▁Shields
- ▁come
- ▁should
- ▁could
- light
- '2'
- ▁people
- ▁again
- ▁year
- ▁app
- ▁into
- ▁any
- ▁N
- ▁mean
- ▁o
- ▁mus
- ▁lot
- ▁said
- ▁long
- ▁these
- ▁lea
- sh
- ▁vi
- ▁part
- ▁every
- ▁our
- ▁You
- ious
- ▁fight
- ▁Ch
- ark
- ▁may
- ▁Hammer
- ▁because
- ▁most
- ▁came
- ▁four
- ful
- ▁No
- ize
- ▁where
- ▁okay
- ▁much
- ▁ask
- ▁through
- ▁before
- ▁work
- ▁even
- ▁three
- mber
- ▁win
- ▁flight
- ake
- K
- ▁place
- ▁play
- ▁though
- ▁pound
- ▁bit
- land
- ▁va
- ▁talk
- ▁kind
- ▁Line
- ▁make
- hap
- ▁big
- ▁leav
- ▁something
- ▁game
- ▁under
- ▁feel
- self
- ▁give
- ▁includ
- U
- ▁twenty
- ▁guard
- ▁left
- ▁round
- ▁great
- body
- ▁gra
- ress
- lso
- '3'
- ▁everything
- ▁those
- ▁after
- ▁tell
- ▁need
- ▁yes
- qua
- ham
- ▁minutes
- ▁question
- ▁around
- ▁punch
- ▁course
- ▁gonna
- ▁person
- ▁move
- ▁plan
- ▁ear
- ept
- ▁Airport
- ▁Okay
- ▁found
- ▁seven
- ▁help
- que
- ▁qui
- ▁keep
- ▁guys
- ▁house
- ▁run
- ▁turn
- ▁better
- ▁stop
- ward
- ddle
- ▁second
- ground
- ▁world
- ▁high
- ▁point
- ▁hold
- ▁call
- '6'
- ▁actually
- ▁probably
- ▁heaven
- ▁speci
- ▁everyone
- ▁why
- ▁presen
- ▁thir
- lright
- ▁eye
- eath
- ▁Tak
- '!'
- '"'
- '4'
- ▁hundred
- ▁answer
- ▁small
- ▁wait
- ▁nothing
- q
- '8'
- V
- ▁countr
- ▁problem
- ▁continu
- ▁close
- ▁priva
- ▁20
- ▁pleas
- ▁walk
- ▁open
- ▁lay
- ▁Station
- ▁moment
- ▁Yeah
- ▁public
- possibl
- ▁happen
- together
- ▁while
- asically
- ▁money
- ▁wrong
- B
- ▁puzzle
- '7'
- ▁journ
- ▁rainbow
- ▁thousand
- I
- '9'
- S
- P
- '%'
- A
- D
- L
- F
- ’
- O
- G
# 'N' and 'Y' are quoted because plain N/Y resolve to booleans under YAML 1.1
# rules; the token strings themselves are unchanged.
- 'N'
- á
- C
- $
- Z
- 'Y'
- R
- E
- J
- W
- M
- H
- j
- –
- ;
- Q
- X
- ']'
- −
- '&'
- T
- '['
- <sos/eos>
# Weight initialisation and reference inputs: reference audio is enabled,
# reference text is disabled.
init: xavier_uniform
model_conf: {}
use_ref_audio: true
use_ref_text: false
# Text preprocessing: BPE tokens from the bpe_unigram500 model; no cleaner,
# g2p or non-linguistic symbol list is configured.
use_preprocessor: true
token_type: bpe
bpemodel: data/token_list/bpe_unigram500/bpe.model
non_linguistic_symbols: null
cleaner: null
g2p: null
# Frontend: s3prl with multilayer_feature enabled.
# NOTE(review): upstream is `wavlm_large`, while the model name in the page
# title says wavlm_base -- confirm which one is accurate.
frontend: s3prl
frontend_conf:
    frontend_conf:
        upstream: wavlm_large
    download_dir: ./hub
    multilayer_feature: true
# Model choice and hyper-parameters. `universa: base` presumably selects the
# model variant that universa_conf configures -- confirm.
universa: base
universa_conf:
    embedding_dim: 256
    # Audio and text encoders share every setting below except input_layer
    # (conv2d for audio, linear for text).
    audio_encoder_type: transformer
    audio_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: conv2d
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    # NOTE(review): use_ref_text is false in this config, so the text encoder
    # settings are presumably inactive -- confirm.
    text_encoder_type: transformer
    text_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: linear
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    # Multi-head cross-attention with 4 heads, mean pooling, a linear
    # projector, and multi_branch enabled.
    cross_attention_type: multihead
    cross_attention_params:
        n_head: 4
        dropout_rate: 0.1
    pooling_type: mean
    projector_type: linear
    multi_branch: true
# Keys listed as required; both output_dir and metric2id are set earlier in
# this config.
required:
- output_dir
- metric2id
# The version is quoted so it stays a string rather than an integer.
version: '202412'
distributed: false
universa config (text)
# Run-level settings: the `config` key points at the recipe YAML (presumably
# the file this dump was expanded from), followed by logging, the experiment
# output directory, GPU count and random seed.
config: conf/train_universa_wavlm_ref_audio.yaml
print_config: false
log_level: INFO
drop_last_iter: false
dry_run: false
iterator_type: sequence
valid_iterator_type: null
output_dir: update_exp/universa_train_universa_wavlm_ref_audio_raw_fs16000
ngpu: 1
seed: 777
num_workers: 1
num_att_plot: 0
# Distributed-training fields. World size, rank, master address/port and
# launcher are all null and multiprocessing_distributed is false, which is
# consistent with the single-GPU setting (`ngpu: 1`) above.
dist_backend: nccl
dist_init_method: env://
dist_world_size: null
dist_rank: null
local_rank: 0
dist_master_addr: null
dist_master_port: null
dist_launcher: null
multiprocessing_distributed: false
unused_parameters: false
sharded_ddp: false
use_deepspeed: false
deepspeed_config: null
# cuDNN / TF32 backend toggles.
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: false
use_tf32: false
collect_stats: false
write_collected_feats: false
# Training length: at most 100 epochs. `patience: null` presumably disables
# early stopping -- TODO confirm against the trainer.
max_epoch: 100
patience: null
# Selection criteria. Entries read as [phase, metric] for the scheduler and
# [phase, metric, mode] for the others, with mode being min or max.
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
# Four criteria are listed for "best" checkpoints: train/valid x loss/acc.
best_model_criterion:
-   - train
    - loss
    - min
-   - valid
    - loss
    - min
-   - train
    - acc
    - max
-   - valid
    - acc
    - max
# NOTE(review): presumably the number of best checkpoints retained per
# criterion -- confirm.
keep_nbest_models: 5
nbest_averaging_interval: 0
# Gradient handling and numeric precision: float32 training with AMP off.
# NOTE(review): grad_clip: -1 presumably disables gradient clipping and
# grad_clip_type: 2.0 presumably selects the norm order -- confirm.
grad_clip: -1
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
# Logging and experiment tracking: matplotlib and TensorBoard are enabled;
# Weights & Biases is off and every wandb_* identifier is left null.
log_interval: 50
use_matplotlib: true
use_tensorboard: true
create_graph_in_tensorboard: false
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1
detect_anomaly: false
# Adapter fine-tuning is switched off (use_adapter: false), so `adapter: lora`
# and the empty adapter_conf are presumably unused defaults -- confirm.
use_adapter: false
adapter: lora
save_strategy: all
adapter_conf: {}
# Initialisation and freezing: no pretrained checkpoint or init_param entries
# are given, and `frontend.upstream` is the only entry in freeze_param.
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param:
- frontend.upstream
# Batching: batch_type is `sorted` (set below) with batch_size 16.
# NOTE(review): batch_bins presumably applies only to bin-based batch types
# and would be unused here -- confirm.
num_iters_per_epoch: null
batch_size: 16
valid_batch_size: null
batch_bins: 1000000
valid_batch_bins: null
category_sample_size: 10
# Shape files for the audio, ref_audio and ref_text streams (train / valid).
train_shape_file:
- update_exp/universa_stats_raw/train/audio_shape
- update_exp/universa_stats_raw/train/ref_audio_shape
- update_exp/universa_stats_raw/train/ref_text_shape
valid_shape_file:
- update_exp/universa_stats_raw/valid/audio_shape
- update_exp/universa_stats_raw/valid/ref_audio_shape
- update_exp/universa_stats_raw/valid/ref_text_shape
batch_type: sorted
valid_batch_type: null
fold_length:
- 256000
sort_in_batch: descending
shuffle_within_batch: false
sort_batch: descending
multiple_iterator: false
# chunk_* settings. NOTE(review): iterator_type in this config is `sequence`,
# so these presumably only matter for a chunk iterator -- confirm.
chunk_length: 500
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
chunk_excluded_key_prefixes: []
chunk_default_fs: null
chunk_max_abs_length: null
chunk_discard_short_samples: true
# Training data: each entry is a [path, name, type] triple. Four streams are
# declared: audio and ref_audio (kaldi_ark), metrics (metric), ref_text (text).
# NOTE(review): ref_text is declared here although use_ref_text is false
# further down this config -- confirm whether the text stream is consumed.
train_data_path_and_name_and_type:
-   - dump_ark/raw/train_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/train_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/train_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/train_update/text
    - ref_text
    - text
# Validation data mirrors the training entries, reading from dev_update
# instead of train_update.
valid_data_path_and_name_and_type:
-   - dump_ark/raw/dev_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/dev_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/text
    - ref_text
    - text
multi_task_dataset: false
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
allow_multi_rates: false
valid_max_cache_size: null
exclude_weight_decay: false
exclude_weight_decay_conf: {}
# Optimizer and LR schedule: adamw with lr 0.001 and a warmuplr scheduler
# configured with 25000 warm-up steps.
optim: adamw
optim_conf:
    lr: 0.001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
# Metric bookkeeping: metric2id is a path under the training dump directory.
# NOTE(review): metric_pad_value -100 is presumably the fill value for
# missing metrics -- confirm.
metric2id: dump_ark/raw/train_update/metric2id
metric2type: null
metric_pad_value: -100
token_list:
- <blank>
- <unk>
- s
- ▁
- t
- e
- ▁the
- i
- a
- o
- ▁a
- r
- ▁to
- d
- ▁and
- ''''
- m
# 'n' and 'y' are quoted because plain n/y resolve to booleans under YAML 1.1
# rules (the same reason 'on' is quoted further down this list).
- 'n'
- ing
- u
- 'y'
- p
- c
- ▁of
- l
- ed
- ▁I
- ▁in
- er
- re
- ▁it
- ▁you
- ar
- ▁f
- ▁is
- ▁that
- ','
- .
- in
- al
- g
- 'on'
- ▁b
- b
- or
- ▁c
- ▁s
- f
- h
- ▁we
- an
- en
- ▁for
- le
- ▁p
- ly
- es
- w
- ▁re
- ▁on
- ▁m
- ▁be
- ic
- ll
- th
- ▁he
- k
- ur
- ve
- ▁with
- ▁so
- ▁from
- ▁was
- v
- ch
- st
- ▁w
- ▁i
- ▁this
- ▁de
- ▁like
- ▁do
- ce
- at
- il
- ck
- ▁A
- ▁have
- ▁not
- ad
- ▁st
- ow
- ro
- ne
- ▁me
- ▁my
- ▁but
- ation
- ▁at
- ▁or
- '-'
- ter
- ent
- ▁B
- ▁n
- ▁know
- ▁t
- out
- ▁are
- nd
- ▁one
- ▁li
- ▁g
- ▁The
- ol
- ion
- te
- ▁go
- ut
- ▁as
- ▁just
- as
- ▁sh
- ▁they
- is
- ▁C
- et
- ▁h
- ▁an
- ▁there
- ▁up
- ▁S
- ▁M
- ▁she
- ▁by
- ▁su
- om
- ▁can
- us
- ▁your
- ng
- ▁con
- el
- ▁us
- ment
- z
- ▁see
- ▁ab
- ▁what
- ▁out
- ▁her
- me
- ate
- ▁all
- ▁th
- ▁if
- ▁right
- ▁his
- ▁ma
- ▁lo
- ▁which
- ide
- ▁P
- ▁more
- ▁then
- ul
- ast
- x
- ight
- ill
- ▁So
- ▁sp
- ▁going
- ▁some
- ure
- ▁their
- ig
- ▁no
- ▁ro
- ▁think
- ▁who
- ▁pro
- ver
- ive
- est
- ▁co
- ▁di
- '0'
- ist
- ▁k
- age
- ▁d
- ▁time
- ▁L
- ies
- ▁will
- ▁man
- ▁when
- ▁D
- les
- ▁F
- ▁want
- ff
- ity
- ▁un
- '?'
- ▁start
- ▁G
- ▁uh
- ▁get
- ok
- ▁take
- ▁po
- li
- ▁ho
- ▁way
- ▁don
- ▁yeah
- ▁really
- ▁say
- ▁look
- ▁good
- ▁ra
- ▁pr
- ▁had
- ttle
- ▁comp
- ort
- ish
- ▁ex
- ally
- ▁sa
- ▁how
- end
- ant
- ▁O
- ▁um
- way
- ance
- ▁other
- ▁two
- ine
- ever
- able
- ▁com
- other
- ▁first
- ▁back
- ▁al
- ers
- ions
- ▁now
- ▁off
- ning
- ▁down
- ▁has
- ▁than
- ▁car
- ▁Th
- very
- ice
- ▁dr
- ▁been
- ▁him
- ▁here
- ated
- '5'
- ▁hand
- ▁day
- ▁hear
- each
- ▁would
- ▁over
- ▁oh
- ▁cha
- ood
- ▁did
- ugh
- ▁per
- ▁let
- ▁str
- ▁tra
- ▁got
- ext
- '1'
- ▁We
- ▁Shields
- ▁come
- ▁should
- ▁could
- light
- '2'
- ▁people
- ▁again
- ▁year
- ▁app
- ▁into
- ▁any
- ▁N
- ▁mean
- ▁o
- ▁mus
- ▁lot
- ▁said
- ▁long
- ▁these
- ▁lea
- sh
- ▁vi
- ▁part
- ▁every
- ▁our
- ▁You
- ious
- ▁fight
- ▁Ch
- ark
- ▁may
- ▁Hammer
- ▁because
- ▁most
- ▁came
- ▁four
- ful
- ▁No
- ize
- ▁where
- ▁okay
- ▁much
- ▁ask
- ▁through
- ▁before
- ▁work
- ▁even
- ▁three
- mber
- ▁win
- ▁flight
- ake
- K
- ▁place
- ▁play
- ▁though
- ▁pound
- ▁bit
- land
- ▁va
- ▁talk
- ▁kind
- ▁Line
- ▁make
- hap
- ▁big
- ▁leav
- ▁something
- ▁game
- ▁under
- ▁feel
- self
- ▁give
- ▁includ
- U
- ▁twenty
- ▁guard
- ▁left
- ▁round
- ▁great
- body
- ▁gra
- ress
- lso
- '3'
- ▁everything
- ▁those
- ▁after
- ▁tell
- ▁need
- ▁yes
- qua
- ham
- ▁minutes
- ▁question
- ▁around
- ▁punch
- ▁course
- ▁gonna
- ▁person
- ▁move
- ▁plan
- ▁ear
- ept
- ▁Airport
- ▁Okay
- ▁found
- ▁seven
- ▁help
- que
- ▁qui
- ▁keep
- ▁guys
- ▁house
- ▁run
- ▁turn
- ▁better
- ▁stop
- ward
- ddle
- ▁second
- ground
- ▁world
- ▁high
- ▁point
- ▁hold
- ▁call
- '6'
- ▁actually
- ▁probably
- ▁heaven
- ▁speci
- ▁everyone
- ▁why
- ▁presen
- ▁thir
- lright
- ▁eye
- eath
- ▁Tak
- '!'
- '"'
- '4'
- ▁hundred
- ▁answer
- ▁small
- ▁wait
- ▁nothing
- q
- '8'
- V
- ▁countr
- ▁problem
- ▁continu
- ▁close
- ▁priva
- ▁20
- ▁pleas
- ▁walk
- ▁open
- ▁lay
- ▁Station
- ▁moment
- ▁Yeah
- ▁public
- possibl
- ▁happen
- together
- ▁while
- asically
- ▁money
- ▁wrong
- B
- ▁puzzle
- '7'
- ▁journ
- ▁rainbow
- ▁thousand
- I
- '9'
- S
- P
- '%'
- A
- D
- L
- F
- ’
- O
- G
# 'N' and 'Y' are quoted because plain N/Y resolve to booleans under YAML 1.1
# rules; the token strings themselves are unchanged.
- 'N'
- á
- C
- $
- Z
- 'Y'
- R
- E
- J
- W
- M
- H
- j
- –
- ;
- Q
- X
- ']'
- −
- '&'
- T
- '['
- <sos/eos>
# Weight initialisation and reference inputs: reference audio is enabled,
# reference text is disabled.
init: xavier_uniform
model_conf: {}
use_ref_audio: true
use_ref_text: false
# Text preprocessing: BPE tokens from the bpe_unigram500 model; no cleaner,
# g2p or non-linguistic symbol list is configured.
use_preprocessor: true
token_type: bpe
bpemodel: data/token_list/bpe_unigram500/bpe.model
non_linguistic_symbols: null
cleaner: null
g2p: null
# Frontend: s3prl with multilayer_feature enabled.
# NOTE(review): upstream is `wavlm_large`, while the model name in the page
# title says wavlm_base -- confirm which one is accurate.
frontend: s3prl
frontend_conf:
    frontend_conf:
        upstream: wavlm_large
    download_dir: ./hub
    multilayer_feature: true
# Model choice and hyper-parameters. `universa: base` presumably selects the
# model variant that universa_conf configures -- confirm.
universa: base
universa_conf:
    embedding_dim: 256
    # Audio and text encoders share every setting below except input_layer
    # (conv2d for audio, linear for text).
    audio_encoder_type: transformer
    audio_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: conv2d
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    # NOTE(review): use_ref_text is false in this config, so the text encoder
    # settings are presumably inactive -- confirm.
    text_encoder_type: transformer
    text_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: linear
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    # Multi-head cross-attention with 4 heads, mean pooling, a linear
    # projector, and multi_branch enabled.
    cross_attention_type: multihead
    cross_attention_params:
        n_head: 4
        dropout_rate: 0.1
    pooling_type: mean
    projector_type: linear
    multi_branch: true
# Keys listed as required; both output_dir and metric2id are set earlier in
# this config.
required:
- output_dir
- metric2id
# The version is quoted so it stays a string rather than an integer.
version: '202412'
distributed: false
universa config
config: conf/train_universa_wavlm_ref_audio.yaml
print_config: false
log_level: INFO
drop_last_iter: false
dry_run: false
iterator_type: sequence
valid_iterator_type: null
output_dir: update_exp/universa_train_universa_wavlm_ref_audio_raw_fs16000
ngpu: 1
seed: 777
num_workers: 1
num_att_plot: 0
dist_backend: nccl
dist_init_method: env://
dist_world_size: null
dist_rank: null
local_rank: 0
dist_master_addr: null
dist_master_port: null
dist_launcher: null
multiprocessing_distributed: false
unused_parameters: false
sharded_ddp: false
use_deepspeed: false
deepspeed_config: null
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: false
use_tf32: false
collect_stats: false
write_collected_feats: false
max_epoch: 100
patience: null
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
best_model_criterion:
-   - train
    - loss
    - min
-   - valid
    - loss
    - min
-   - train
    - acc
    - max
-   - valid
    - acc
    - max
keep_nbest_models: 5
nbest_averaging_interval: 0
grad_clip: -1
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
log_interval: 50
use_matplotlib: true
use_tensorboard: true
create_graph_in_tensorboard: false
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1
detect_anomaly: false
use_adapter: false
adapter: lora
save_strategy: all
adapter_conf: {}
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param:
- frontend.upstream
num_iters_per_epoch: null
batch_size: 16
valid_batch_size: null
batch_bins: 1000000
valid_batch_bins: null
category_sample_size: 10
train_shape_file:
- update_exp/universa_stats_raw/train/audio_shape
- update_exp/universa_stats_raw/train/ref_audio_shape
- update_exp/universa_stats_raw/train/ref_text_shape
valid_shape_file:
- update_exp/universa_stats_raw/valid/audio_shape
- update_exp/universa_stats_raw/valid/ref_audio_shape
- update_exp/universa_stats_raw/valid/ref_text_shape
batch_type: sorted
valid_batch_type: null
fold_length:
- 256000
sort_in_batch: descending
shuffle_within_batch: false
sort_batch: descending
multiple_iterator: false
chunk_length: 500
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
chunk_excluded_key_prefixes: []
chunk_default_fs: null
chunk_max_abs_length: null
chunk_discard_short_samples: true
train_data_path_and_name_and_type:
-   - dump_ark/raw/train_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/train_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/train_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/train_update/text
    - ref_text
    - text
valid_data_path_and_name_and_type:
-   - dump_ark/raw/dev_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/dev_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/text
    - ref_text
    - text
multi_task_dataset: false
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
allow_multi_rates: false
valid_max_cache_size: null
exclude_weight_decay: false
exclude_weight_decay_conf: {}
optim: adamw
optim_conf:
    lr: 0.001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
metric2id: dump_ark/raw/train_update/metric2id
metric2type: null
metric_pad_value: -100
token_list:
- <blank>
- <unk>
- s
- ▁
- t
- e
- ▁the
- i
- a
- o
- ▁a
- r
- ▁to
- d
- ▁and
- ''''
- m
- n
- ing
- u
- y
- p
- c
- ▁of
- l
- ed
- ▁I
- ▁in
- er
- re
- ▁it
- ▁you
- ar
- ▁f
- ▁is
- ▁that
- ','
- .
- in
- al
- g
- 'on'
- ▁b
- b
- or
- ▁c
- ▁s
- f
- h
- ▁we
- an
- en
- ▁for
- le
- ▁p
- ly
- es
- w
- ▁re
- ▁on
- ▁m
- ▁be
- ic
- ll
- th
- ▁he
- k
- ur
- ve
- ▁with
- ▁so
- ▁from
- ▁was
- v
- ch
- st
- ▁w
- ▁i
- ▁this
- ▁de
- ▁like
- ▁do
- ce
- at
- il
- ck
- ▁A
- ▁have
- ▁not
- ad
- ▁st
- ow
- ro
- ne
- ▁me
- ▁my
- ▁but
- ation
- ▁at
- ▁or
- '-'
- ter
- ent
- ▁B
- ▁n
- ▁know
- ▁t
- out
- ▁are
- nd
- ▁one
- ▁li
- ▁g
- ▁The
- ol
- ion
- te
- ▁go
- ut
- ▁as
- ▁just
- as
- ▁sh
- ▁they
- is
- ▁C
- et
- ▁h
- ▁an
- ▁there
- ▁up
- ▁S
- ▁M
- ▁she
- ▁by
- ▁su
- om
- ▁can
- us
- ▁your
- ng
- ▁con
- el
- ▁us
- ment
- z
- ▁see
- ▁ab
- ▁what
- ▁out
- ▁her
- me
- ate
- ▁all
- ▁th
- ▁if
- ▁right
- ▁his
- ▁ma
- ▁lo
- ▁which
- ide
- ▁P
- ▁more
- ▁then
- ul
- ast
- x
- ight
- ill
- ▁So
- ▁sp
- ▁going
- ▁some
- ure
- ▁their
- ig
- ▁no
- ▁ro
- ▁think
- ▁who
- ▁pro
- ver
- ive
- est
- ▁co
- ▁di
- '0'
- ist
- ▁k
- age
- ▁d
- ▁time
- ▁L
- ies
- ▁will
- ▁man
- ▁when
- ▁D
- les
- ▁F
- ▁want
- ff
- ity
- ▁un
- '?'
- ▁start
- ▁G
- ▁uh
- ▁get
- ok
- ▁take
- ▁po
- li
- ▁ho
- ▁way
- ▁don
- ▁yeah
- ▁really
- ▁say
- ▁look
- ▁good
- ▁ra
- ▁pr
- ▁had
- ttle
- ▁comp
- ort
- ish
- ▁ex
- ally
- ▁sa
- ▁how
- end
- ant
- ▁O
- ▁um
- way
- ance
- ▁other
- ▁two
- ine
- ever
- able
- ▁com
- other
- ▁first
- ▁back
- ▁al
- ers
- ions
- ▁now
- ▁off
- ning
- ▁down
- ▁has
- ▁than
- ▁car
- ▁Th
- very
- ice
- ▁dr
- ▁been
- ▁him
- ▁here
- ated
- '5'
- ▁hand
- ▁day
- ▁hear
- each
- ▁would
- ▁over
- ▁oh
- ▁cha
- ood
- ▁did
- ugh
- ▁per
- ▁let
- ▁str
- ▁tra
- ▁got
- ext
- '1'
- ▁We
- ▁Shields
- ▁come
- ▁should
- ▁could
- light
- '2'
- ▁people
- ▁again
- ▁year
- ▁app
- ▁into
- ▁any
- ▁N
- ▁mean
- ▁o
- ▁mus
- ▁lot
- ▁said
- ▁long
- ▁these
- ▁lea
- sh
- ▁vi
- ▁part
- ▁every
- ▁our
- ▁You
- ious
- ▁fight
- ▁Ch
- ark
- ▁may
- ▁Hammer
- ▁because
- ▁most
- ▁came
- ▁four
- ful
- ▁No
- ize
- ▁where
- ▁okay
- ▁much
- ▁ask
- ▁through
- ▁before
- ▁work
- ▁even
- ▁three
- mber
- ▁win
- ▁flight
- ake
- K
- ▁place
- ▁play
- ▁though
- ▁pound
- ▁bit
- land
- ▁va
- ▁talk
- ▁kind
- ▁Line
- ▁make
- hap
- ▁big
- ▁leav
- ▁something
- ▁game
- ▁under
- ▁feel
- self
- ▁give
- ▁includ
- U
- ▁twenty
- ▁guard
- ▁left
- ▁round
- ▁great
- body
- ▁gra
- ress
- lso
- '3'
- ▁everything
- ▁those
- ▁after
- ▁tell
- ▁need
- ▁yes
- qua
- ham
- ▁minutes
- ▁question
- ▁around
- ▁punch
- ▁course
- ▁gonna
- ▁person
- ▁move
- ▁plan
- ▁ear
- ept
- ▁Airport
- ▁Okay
- ▁found
- ▁seven
- ▁help
- que
- ▁qui
- ▁keep
- ▁guys
- ▁house
- ▁run
- ▁turn
- ▁better
- ▁stop
- ward
- ddle
- ▁second
- ground
- ▁world
- ▁high
- ▁point
- ▁hold
- ▁call
- '6'
- ▁actually
- ▁probably
- ▁heaven
- ▁speci
- ▁everyone
- ▁why
- ▁presen
- ▁thir
- lright
- ▁eye
- eath
- ▁Tak
- '!'
- '"'
- '4'
- ▁hundred
- ▁answer
- ▁small
- ▁wait
- ▁nothing
- q
- '8'
- V
- ▁countr
- ▁problem
- ▁continu
- ▁close
- ▁priva
- ▁20
- ▁pleas
- ▁walk
- ▁open
- ▁lay
- ▁Station
- ▁moment
- ▁Yeah
- ▁public
- possibl
- ▁happen
- together
- ▁while
- asically
- ▁money
- ▁wrong
- B
- ▁puzzle
- '7'
- ▁journ
- ▁rainbow
- ▁thousand
- I
- '9'
- S
- P
- '%'
- A
- D
- L
- F
- ’
- O
- G
- N
- á
- C
- $
- Z
- Y
- R
- E
- J
- W
- M
- H
- j
- –
- ;
- Q
- X
- ']'
- −
- '&'
- T
- '['
- <sos/eos>
init: xavier_uniform
model_conf: {}
use_ref_audio: true
use_ref_text: false
use_preprocessor: true
token_type: bpe
bpemodel: data/token_list/bpe_unigram500/bpe.model
non_linguistic_symbols: null
cleaner: null
g2p: null
frontend: s3prl
frontend_conf:
    frontend_conf:
        upstream: wavlm_large
    download_dir: ./hub
    multilayer_feature: true
universa: base
universa_conf:
    embedding_dim: 256
    audio_encoder_type: transformer
    audio_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: conv2d
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    text_encoder_type: transformer
    text_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: linear
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    cross_attention_type: multihead
    cross_attention_params:
        n_head: 4
        dropout_rate: 0.1
    pooling_type: mean
    projector_type: linear
    multi_branch: true
required:
- output_dir
- metric2id
version: '202412'
distributed: false
universa config
config: conf/train_universa_wavlm_ref_audio.yaml
print_config: false
log_level: INFO
drop_last_iter: false
dry_run: false
iterator_type: sequence
valid_iterator_type: null
output_dir: update_exp/universa_train_universa_wavlm_ref_audio_raw_fs16000
ngpu: 1
seed: 777
num_workers: 1
num_att_plot: 0
dist_backend: nccl
dist_init_method: env://
dist_world_size: null
dist_rank: null
local_rank: 0
dist_master_addr: null
dist_master_port: null
dist_launcher: null
multiprocessing_distributed: false
unused_parameters: false
sharded_ddp: false
use_deepspeed: false
deepspeed_config: null
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: false
use_tf32: false
collect_stats: false
write_collected_feats: false
max_epoch: 100
patience: null
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
best_model_criterion:
-   - train
    - loss
    - min
-   - valid
    - loss
    - min
-   - train
    - acc
    - max
-   - valid
    - acc
    - max
keep_nbest_models: 5
nbest_averaging_interval: 0
grad_clip: -1
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
log_interval: 50
use_matplotlib: true
use_tensorboard: true
create_graph_in_tensorboard: false
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1
detect_anomaly: false
use_adapter: false
adapter: lora
save_strategy: all
adapter_conf: {}
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param:
- frontend.upstream
num_iters_per_epoch: null
batch_size: 16
valid_batch_size: null
batch_bins: 1000000
valid_batch_bins: null
category_sample_size: 10
train_shape_file:
- update_exp/universa_stats_raw/train/audio_shape
- update_exp/universa_stats_raw/train/ref_audio_shape
- update_exp/universa_stats_raw/train/ref_text_shape
valid_shape_file:
- update_exp/universa_stats_raw/valid/audio_shape
- update_exp/universa_stats_raw/valid/ref_audio_shape
- update_exp/universa_stats_raw/valid/ref_text_shape
batch_type: sorted
valid_batch_type: null
fold_length:
- 256000
sort_in_batch: descending
shuffle_within_batch: false
sort_batch: descending
multiple_iterator: false
chunk_length: 500
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
chunk_excluded_key_prefixes: []
chunk_default_fs: null
chunk_max_abs_length: null
chunk_discard_short_samples: true
train_data_path_and_name_and_type:
-   - dump_ark/raw/train_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/train_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/train_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/train_update/text
    - ref_text
    - text
valid_data_path_and_name_and_type:
-   - dump_ark/raw/dev_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/dev_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/text
    - ref_text
    - text
multi_task_dataset: false
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
allow_multi_rates: false
valid_max_cache_size: null
exclude_weight_decay: false
exclude_weight_decay_conf: {}
optim: adamw
optim_conf:
    lr: 0.001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
metric2id: dump_ark/raw/train_update/metric2id
metric2type: null
metric_pad_value: -100
token_list:
- <blank>
- <unk>
- s
- ▁
- t
- e
- ▁the
- i
- a
- o
- ▁a
- r
- ▁to
- d
- ▁and
- ''''
- m
- n
- ing
- u
- y
- p
- c
- ▁of
- l
- ed
- ▁I
- ▁in
- er
- re
- ▁it
- ▁you
- ar
- ▁f
- ▁is
- ▁that
- ','
- .
- in
- al
- g
- 'on'
- ▁b
- b
- or
- ▁c
- ▁s
- f
- h
- ▁we
- an
- en
- ▁for
- le
- ▁p
- ly
- es
- w
- ▁re
- ▁on
- ▁m
- ▁be
- ic
- ll
- th
- ▁he
- k
- ur
- ve
- ▁with
- ▁so
- ▁from
- ▁was
- v
- ch
- st
- ▁w
- ▁i
- ▁this
- ▁de
- ▁like
- ▁do
- ce
- at
- il
- ck
- ▁A
- ▁have
- ▁not
- ad
- ▁st
- ow
- ro
- ne
- ▁me
- ▁my
- ▁but
- ation
- ▁at
- ▁or
- '-'
- ter
- ent
- ▁B
- ▁n
- ▁know
- ▁t
- out
- ▁are
- nd
- ▁one
- ▁li
- ▁g
- ▁The
- ol
- ion
- te
- ▁go
- ut
- ▁as
- ▁just
- as
- ▁sh
- ▁they
- is
- ▁C
- et
- ▁h
- ▁an
- ▁there
- ▁up
- ▁S
- ▁M
- ▁she
- ▁by
- ▁su
- om
- ▁can
- us
- ▁your
- ng
- ▁con
- el
- ▁us
- ment
- z
- ▁see
- ▁ab
- ▁what
- ▁out
- ▁her
- me
- ate
- ▁all
- ▁th
- ▁if
- ▁right
- ▁his
- ▁ma
- ▁lo
- ▁which
- ide
- ▁P
- ▁more
- ▁then
- ul
- ast
- x
- ight
- ill
- ▁So
- ▁sp
- ▁going
- ▁some
- ure
- ▁their
- ig
- ▁no
- ▁ro
- ▁think
- ▁who
- ▁pro
- ver
- ive
- est
- ▁co
- ▁di
- '0'
- ist
- ▁k
- age
- ▁d
- ▁time
- ▁L
- ies
- ▁will
- ▁man
- ▁when
- ▁D
- les
- ▁F
- ▁want
- ff
- ity
- ▁un
- '?'
- ▁start
- ▁G
- ▁uh
- ▁get
- ok
- ▁take
- ▁po
- li
- ▁ho
- ▁way
- ▁don
- ▁yeah
- ▁really
- ▁say
- ▁look
- ▁good
- ▁ra
- ▁pr
- ▁had
- ttle
- ▁comp
- ort
- ish
- ▁ex
- ally
- ▁sa
- ▁how
- end
- ant
- ▁O
- ▁um
- way
- ance
- ▁other
- ▁two
- ine
- ever
- able
- ▁com
- other
- ▁first
- ▁back
- ▁al
- ers
- ions
- ▁now
- ▁off
- ning
- ▁down
- ▁has
- ▁than
- ▁car
- ▁Th
- very
- ice
- ▁dr
- ▁been
- ▁him
- ▁here
- ated
- '5'
- ▁hand
- ▁day
- ▁hear
- each
- ▁would
- ▁over
- ▁oh
- ▁cha
- ood
- ▁did
- ugh
- ▁per
- ▁let
- ▁str
- ▁tra
- ▁got
- ext
- '1'
- ▁We
- ▁Shields
- ▁come
- ▁should
- ▁could
- light
- '2'
- ▁people
- ▁again
- ▁year
- ▁app
- ▁into
- ▁any
- ▁N
- ▁mean
- ▁o
- ▁mus
- ▁lot
- ▁said
- ▁long
- ▁these
- ▁lea
- sh
- ▁vi
- ▁part
- ▁every
- ▁our
- ▁You
- ious
- ▁fight
- ▁Ch
- ark
- ▁may
- ▁Hammer
- ▁because
- ▁most
- ▁came
- ▁four
- ful
- ▁No
- ize
- ▁where
- ▁okay
- ▁much
- ▁ask
- ▁through
- ▁before
- ▁work
- ▁even
- ▁three
- mber
- ▁win
- ▁flight
- ake
- K
- ▁place
- ▁play
- ▁though
- ▁pound
- ▁bit
- land
- ▁va
- ▁talk
- ▁kind
- ▁Line
- ▁make
- hap
- ▁big
- ▁leav
- ▁something
- ▁game
- ▁under
- ▁feel
- self
- ▁give
- ▁includ
- U
- ▁twenty
- ▁guard
- ▁left
- ▁round
- ▁great
- body
- ▁gra
- ress
- lso
- '3'
- ▁everything
- ▁those
- ▁after
- ▁tell
- ▁need
- ▁yes
- qua
- ham
- ▁minutes
- ▁question
- ▁around
- ▁punch
- ▁course
- ▁gonna
- ▁person
- ▁move
- ▁plan
- ▁ear
- ept
- ▁Airport
- ▁Okay
- ▁found
- ▁seven
- ▁help
- que
- ▁qui
- ▁keep
- ▁guys
- ▁house
- ▁run
- ▁turn
- ▁better
- ▁stop
- ward
- ddle
- ▁second
- ground
- ▁world
- ▁high
- ▁point
- ▁hold
- ▁call
- '6'
- ▁actually
- ▁probably
- ▁heaven
- ▁speci
- ▁everyone
- ▁why
- ▁presen
- ▁thir
- lright
- ▁eye
- eath
- ▁Tak
- '!'
- '"'
- '4'
- ▁hundred
- ▁answer
- ▁small
- ▁wait
- ▁nothing
- q
- '8'
- V
- ▁countr
- ▁problem
- ▁continu
- ▁close
- ▁priva
- ▁20
- ▁pleas
- ▁walk
- ▁open
- ▁lay
- ▁Station
- ▁moment
- ▁Yeah
- ▁public
- possibl
- ▁happen
- together
- ▁while
- asically
- ▁money
- ▁wrong
- B
- ▁puzzle
- '7'
- ▁journ
- ▁rainbow
- ▁thousand
- I
- '9'
- S
- P
- '%'
- A
- D
- L
- F
- ’
- O
- G
- N
- á
- C
- $
- Z
- Y
- R
- E
- J
- W
- M
- H
- j
- –
- ;
- Q
- X
- ']'
- −
- '&'
- T
- '['
- <sos/eos>
init: xavier_uniform
model_conf: {}
use_ref_audio: true
use_ref_text: false
use_preprocessor: true
token_type: bpe
bpemodel: data/token_list/bpe_unigram500/bpe.model
non_linguistic_symbols: null
cleaner: null
g2p: null
frontend: s3prl
frontend_conf:
    frontend_conf:
        upstream: wavlm_large
    download_dir: ./hub
    multilayer_feature: true
universa: base
universa_conf:
    embedding_dim: 256
    audio_encoder_type: transformer
    audio_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: conv2d
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    text_encoder_type: transformer
    text_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: linear
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    cross_attention_type: multihead
    cross_attention_params:
        n_head: 4
        dropout_rate: 0.1
    pooling_type: mean
    projector_type: linear
    multi_branch: true
required:
- output_dir
- metric2id
version: '202412'
distributed: false
universa config
config: conf/train_universa_wavlm_ref_audio.yaml
print_config: false
log_level: INFO
drop_last_iter: false
dry_run: false
iterator_type: sequence
valid_iterator_type: null
output_dir: update_exp/universa_train_universa_wavlm_ref_audio_raw_fs16000
ngpu: 1
seed: 777
num_workers: 1
num_att_plot: 0
dist_backend: nccl
dist_init_method: env://
dist_world_size: null
dist_rank: null
local_rank: 0
dist_master_addr: null
dist_master_port: null
dist_launcher: null
multiprocessing_distributed: false
unused_parameters: false
sharded_ddp: false
use_deepspeed: false
deepspeed_config: null
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: false
use_tf32: false
collect_stats: false
write_collected_feats: false
max_epoch: 100
patience: null
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
best_model_criterion:
-   - train
    - loss
    - min
-   - valid
    - loss
    - min
-   - train
    - acc
    - max
-   - valid
    - acc
    - max
keep_nbest_models: 5
nbest_averaging_interval: 0
grad_clip: -1
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
log_interval: 50
use_matplotlib: true
use_tensorboard: true
create_graph_in_tensorboard: false
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1
detect_anomaly: false
use_adapter: false
adapter: lora
save_strategy: all
adapter_conf: {}
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param:
- frontend.upstream
num_iters_per_epoch: null
batch_size: 16
valid_batch_size: null
batch_bins: 1000000
valid_batch_bins: null
category_sample_size: 10
train_shape_file:
- update_exp/universa_stats_raw/train/audio_shape
- update_exp/universa_stats_raw/train/ref_audio_shape
- update_exp/universa_stats_raw/train/ref_text_shape
valid_shape_file:
- update_exp/universa_stats_raw/valid/audio_shape
- update_exp/universa_stats_raw/valid/ref_audio_shape
- update_exp/universa_stats_raw/valid/ref_text_shape
batch_type: sorted
valid_batch_type: null
fold_length:
- 256000
sort_in_batch: descending
shuffle_within_batch: false
sort_batch: descending
multiple_iterator: false
chunk_length: 500
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
chunk_excluded_key_prefixes: []
chunk_default_fs: null
chunk_max_abs_length: null
chunk_discard_short_samples: true
train_data_path_and_name_and_type:
-   - dump_ark/raw/train_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/train_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/train_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/train_update/text
    - ref_text
    - text
valid_data_path_and_name_and_type:
-   - dump_ark/raw/dev_update/wav.scp
    - audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/metric.scp
    - metrics
    - metric
-   - dump_ark/raw/dev_update/ref_wav.scp
    - ref_audio
    - kaldi_ark
-   - dump_ark/raw/dev_update/text
    - ref_text
    - text
multi_task_dataset: false
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
allow_multi_rates: false
valid_max_cache_size: null
exclude_weight_decay: false
exclude_weight_decay_conf: {}
optim: adamw
optim_conf:
    lr: 0.001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
metric2id: dump_ark/raw/train_update/metric2id
metric2type: null
metric_pad_value: -100
token_list:
- <blank>
- <unk>
- s
- ▁
- t
- e
- ▁the
- i
- a
- o
- ▁a
- r
- ▁to
- d
- ▁and
- ''''
- m
- n
- ing
- u
- y
- p
- c
- ▁of
- l
- ed
- ▁I
- ▁in
- er
- re
- ▁it
- ▁you
- ar
- ▁f
- ▁is
- ▁that
- ','
- .
- in
- al
- g
- 'on'
- ▁b
- b
- or
- ▁c
- ▁s
- f
- h
- ▁we
- an
- en
- ▁for
- le
- ▁p
- ly
- es
- w
- ▁re
- ▁on
- ▁m
- ▁be
- ic
- ll
- th
- ▁he
- k
- ur
- ve
- ▁with
- ▁so
- ▁from
- ▁was
- v
- ch
- st
- ▁w
- ▁i
- ▁this
- ▁de
- ▁like
- ▁do
- ce
- at
- il
- ck
- ▁A
- ▁have
- ▁not
- ad
- ▁st
- ow
- ro
- ne
- ▁me
- ▁my
- ▁but
- ation
- ▁at
- ▁or
- '-'
- ter
- ent
- ▁B
- ▁n
- ▁know
- ▁t
- out
- ▁are
- nd
- ▁one
- ▁li
- ▁g
- ▁The
- ol
- ion
- te
- ▁go
- ut
- ▁as
- ▁just
- as
- ▁sh
- ▁they
- is
- ▁C
- et
- ▁h
- ▁an
- ▁there
- ▁up
- ▁S
- ▁M
- ▁she
- ▁by
- ▁su
- om
- ▁can
- us
- ▁your
- ng
- ▁con
- el
- ▁us
- ment
- z
- ▁see
- ▁ab
- ▁what
- ▁out
- ▁her
- me
- ate
- ▁all
- ▁th
- ▁if
- ▁right
- ▁his
- ▁ma
- ▁lo
- ▁which
- ide
- ▁P
- ▁more
- ▁then
- ul
- ast
- x
- ight
- ill
- ▁So
- ▁sp
- ▁going
- ▁some
- ure
- ▁their
- ig
- ▁no
- ▁ro
- ▁think
- ▁who
- ▁pro
- ver
- ive
- est
- ▁co
- ▁di
- '0'
- ist
- ▁k
- age
- ▁d
- ▁time
- ▁L
- ies
- ▁will
- ▁man
- ▁when
- ▁D
- les
- ▁F
- ▁want
- ff
- ity
- ▁un
- '?'
- ▁start
- ▁G
- ▁uh
- ▁get
- ok
- ▁take
- ▁po
- li
- ▁ho
- ▁way
- ▁don
- ▁yeah
- ▁really
- ▁say
- ▁look
- ▁good
- ▁ra
- ▁pr
- ▁had
- ttle
- ▁comp
- ort
- ish
- ▁ex
- ally
- ▁sa
- ▁how
- end
- ant
- ▁O
- ▁um
- way
- ance
- ▁other
- ▁two
- ine
- ever
- able
- ▁com
- other
- ▁first
- ▁back
- ▁al
- ers
- ions
- ▁now
- ▁off
- ning
- ▁down
- ▁has
- ▁than
- ▁car
- ▁Th
- very
- ice
- ▁dr
- ▁been
- ▁him
- ▁here
- ated
- '5'
- ▁hand
- ▁day
- ▁hear
- each
- ▁would
- ▁over
- ▁oh
- ▁cha
- ood
- ▁did
- ugh
- ▁per
- ▁let
- ▁str
- ▁tra
- ▁got
- ext
- '1'
- ▁We
- ▁Shields
- ▁come
- ▁should
- ▁could
- light
- '2'
- ▁people
- ▁again
- ▁year
- ▁app
- ▁into
- ▁any
- ▁N
- ▁mean
- ▁o
- ▁mus
- ▁lot
- ▁said
- ▁long
- ▁these
- ▁lea
- sh
- ▁vi
- ▁part
- ▁every
- ▁our
- ▁You
- ious
- ▁fight
- ▁Ch
- ark
- ▁may
- ▁Hammer
- ▁because
- ▁most
- ▁came
- ▁four
- ful
- ▁No
- ize
- ▁where
- ▁okay
- ▁much
- ▁ask
- ▁through
- ▁before
- ▁work
- ▁even
- ▁three
- mber
- ▁win
- ▁flight
- ake
- K
- ▁place
- ▁play
- ▁though
- ▁pound
- ▁bit
- land
- ▁va
- ▁talk
- ▁kind
- ▁Line
- ▁make
- hap
- ▁big
- ▁leav
- ▁something
- ▁game
- ▁under
- ▁feel
- self
- ▁give
- ▁includ
- U
- ▁twenty
- ▁guard
- ▁left
- ▁round
- ▁great
- body
- ▁gra
- ress
- lso
- '3'
- ▁everything
- ▁those
- ▁after
- ▁tell
- ▁need
- ▁yes
- qua
- ham
- ▁minutes
- ▁question
- ▁around
- ▁punch
- ▁course
- ▁gonna
- ▁person
- ▁move
- ▁plan
- ▁ear
- ept
- ▁Airport
- ▁Okay
- ▁found
- ▁seven
- ▁help
- que
- ▁qui
- ▁keep
- ▁guys
- ▁house
- ▁run
- ▁turn
- ▁better
- ▁stop
- ward
- ddle
- ▁second
- ground
- ▁world
- ▁high
- ▁point
- ▁hold
- ▁call
- '6'
- ▁actually
- ▁probably
- ▁heaven
- ▁speci
- ▁everyone
- ▁why
- ▁presen
- ▁thir
- lright
- ▁eye
- eath
- ▁Tak
- '!'
- '"'
- '4'
- ▁hundred
- ▁answer
- ▁small
- ▁wait
- ▁nothing
- q
- '8'
- V
- ▁countr
- ▁problem
- ▁continu
- ▁close
- ▁priva
- ▁20
- ▁pleas
- ▁walk
- ▁open
- ▁lay
- ▁Station
- ▁moment
- ▁Yeah
- ▁public
- possibl
- ▁happen
- together
- ▁while
- asically
- ▁money
- ▁wrong
- B
- ▁puzzle
- '7'
- ▁journ
- ▁rainbow
- ▁thousand
- I
- '9'
- S
- P
- '%'
- A
- D
- L
- F
- ’
- O
- G
- N
- á
- C
- $
- Z
- Y
- R
- E
- J
- W
- M
- H
- j
- –
- ;
- Q
- X
- ']'
- −
- '&'
- T
- '['
- <sos/eos>
init: xavier_uniform
model_conf: {}
use_ref_audio: true
use_ref_text: false
use_preprocessor: true
token_type: bpe
bpemodel: data/token_list/bpe_unigram500/bpe.model
non_linguistic_symbols: null
cleaner: null
g2p: null
frontend: s3prl
frontend_conf:
    frontend_conf:
        upstream: wavlm_large
    download_dir: ./hub
    multilayer_feature: true
universa: base
universa_conf:
    embedding_dim: 256
    audio_encoder_type: transformer
    audio_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: conv2d
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    text_encoder_type: transformer
    text_encoder_params:
        num_blocks: 4
        attention_heads: 4
        linear_units: 1024
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        input_layer: linear
        normalize_before: true
        concat_after: false
        positionwise_layer_type: linear
        positionwise_conv_kernel_size: 1
        layer_drop_rate: 0.1
        qk_norm: false
        use_flash_attn: false
    cross_attention_type: multihead
    cross_attention_params:
        n_head: 4
        dropout_rate: 0.1
    pooling_type: mean
    projector_type: linear
    multi_branch: true
required:
- output_dir
- metric2id
version: '202412'
distributed: false

Deploy This Model

Production-ready deployment in minutes

Together.ai

Instant API access to this model

Fastest API

Production-ready inference API. Start free, scale to millions.

Try Free API

Replicate

One-click model deployment

Easiest Setup

Run models in the cloud with simple API. No DevOps required.

Deploy Now

Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.