Description
I was trying out the finetune example with my model, but it kept producing NaN loss. I eventually tried train-text-from-scratch, following the instructions in its README, and it goes to NaN as well. I've reproduced this on two machines.
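For reference, the finetune invocation followed the example in examples/finetune/README.md, with my own model as the base. Roughly like this (the base-model filename below is a placeholder, and the flag list is reproduced from memory of that README, so treat it as approximate):

./finetune \
        --model-base my-base-model-q8_0.gguf \
        --checkpoint-in  chk-lora-shakespeare-LATEST.gguf \
        --checkpoint-out chk-lora-shakespeare-ITERATION.gguf \
        --lora-out lora-shakespeare-ITERATION.bin \
        --train-data "shakespeare.txt" \
        --save-every 10 \
        --threads 6 --adam-iter 30 --batch 4 --ctx 64 \
        --use-checkpointing

The train-text-from-scratch reproduction is below: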
root@c5a10438d69e:/workspace/llama.cpp# ./train-text-from-scratch --vocab-model ./models/ggml-vocab-llama.gguf --ctx 64 --embd 256 --head 8 --layer 16 --checkpoint-in chk-shakespeare-256x16-LATEST.gguf --checkpoint-out chk-shakespeare-256x16-ITERATION.gguf --model-out ggml-shakespeare-256x16-f32-ITERATION.gguf --train-data "shakespeare.txt" -t 6 -b 16 --seed 1 --adam-iter 256 --no-checkpointing
main: seed: 1
llama_model_loader: loaded meta data with 17 key-value pairs and 0 tensors from ./models/ggml-vocab-llama.gguf (version GGUF V3 (latest))
llama_model_loader: - kv 0: general.architecture str
llama_model_loader: - kv 1: general.name str
llama_model_loader: - kv 2: llama.context_length u32
llama_model_loader: - kv 3: llama.embedding_length u32
llama_model_loader: - kv 4: llama.block_count u32
llama_model_loader: - kv 5: llama.feed_forward_length u32
llama_model_loader: - kv 6: llama.rope.dimension_count u32
llama_model_loader: - kv 7: llama.attention.head_count u32
llama_model_loader: - kv 8: llama.attention.head_count_kv u32
llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32
llama_model_loader: - kv 10: tokenizer.ggml.model str
llama_model_loader: - kv 11: tokenizer.ggml.tokens arr
llama_model_loader: - kv 12: tokenizer.ggml.scores arr
llama_model_loader: - kv 13: tokenizer.ggml.token_type arr
llama_model_loader: - kv 14: tokenizer.ggml.bos_token_id u32
llama_model_loader: - kv 15: tokenizer.ggml.eos_token_id u32
llama_model_loader: - kv 16: tokenizer.ggml.unknown_token_id u32
llm_load_vocab: special tokens definition check successful ( 259/32000 ).
llm_load_print_meta: format = GGUF V3 (latest)
llm_load_print_meta: arch = llama
llm_load_print_meta: vocab type = SPM
llm_load_print_meta: n_vocab = 32000
llm_load_print_meta: n_merges = 0
llm_load_print_meta: n_ctx_train = 4096
llm_load_print_meta: n_embd = 4096
llm_load_print_meta: n_head = 32
llm_load_print_meta: n_head_kv = 32
llm_load_print_meta: n_layer = 32
llm_load_print_meta: n_rot = 128
llm_load_print_meta: n_gqa = 1
llm_load_print_meta: f_norm_eps = 0.0e+00
llm_load_print_meta: f_norm_rms_eps = 1.0e-05
llm_load_print_meta: f_clamp_kqv = 0.0e+00
llm_load_print_meta: f_max_alibi_bias = 0.0e+00
llm_load_print_meta: n_ff = 11008
llm_load_print_meta: rope scaling = linear
llm_load_print_meta: freq_base_train = 10000.0
llm_load_print_meta: freq_scale_train = 1
llm_load_print_meta: n_yarn_orig_ctx = 4096
llm_load_print_meta: rope_finetuned = unknown
llm_load_print_meta: model type = 7B
llm_load_print_meta: model ftype = all F32 (guessed)
llm_load_print_meta: model params = 0.00 B
llm_load_print_meta: model size = 0.00 MiB (-nan BPW)
llm_load_print_meta: general.name = LLaMA v2
llm_load_print_meta: BOS token = 1 '<s>'
llm_load_print_meta: EOS token = 2 '</s>'
llm_load_print_meta: UNK token = 0 '<unk>'
llm_load_print_meta: LF token = 13 '<0x0A>'
llama_model_load: vocab only - skipping tensors
llama_new_context_with_model: n_ctx = 512
llama_new_context_with_model: freq_base = 10000.0
llama_new_context_with_model: freq_scale = 1
main: init model
print_params: n_vocab: 32000
print_params: n_ctx: 64
print_params: n_embd: 256
print_params: n_head: 8
print_params: n_ff: 768
print_params: n_layer: 16
print_params: n_rot: 32
main: total train_iterations 0
main: seen train_samples 0
main: seen train_tokens 0
main: completed train_epochs 0
main: model_size = 240304416 bytes (229.2 MB)
main: opt_size = 360288432 bytes (343.6 MB)
main: opt iter 0
main: input_size = 131076128 bytes (125.0 MB)
main: compute_size = 701759840 bytes (669.3 MB)
main: evaluation order = LEFT_TO_RIGHT
main: tokenize training data
tokenize_file: total number of samples: 27520
main: number of training tokens: 27584
main: train data seems to have changed. restarting shuffled epoch.
main: begin training
main: work_size = 768376 bytes (0.7 MB)
train_opt_callback: iter= 0 sample=1/27520 sched=0.000000 loss=0.000000 |->
train_opt_callback: iter= 1 sample=17/27520 sched=0.010000 loss=10.373524 dt=00:00:03 eta=00:15:01 |->
train_opt_callback: iter= 2 sample=33/27520 sched=0.020000 loss=nan dt=00:00:03 eta=00:14:19 |>
train_opt_callback: iter= 3 sample=49/27520 sched=0.030000 loss=nan dt=00:00:03 eta=00:15:01 |>
^C
root@c5a10438d69e:/workspace/llama.cpp# ^C
root@c5a10438d69e:/workspace/llama.cpp# git log | head -1
commit d9b33fe95bd257b36c84ee5769cc048230067d6f
root@c5a10438d69e:/workspace/llama.cpp# lscpu | egrep "AMD|Flags"
Vendor ID: AuthenticAMD
Model name: AMD EPYC Processor
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm rep_good nopl cpuid extd_apicid amd_dcm tsc_known_freq pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy svm cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext ssbd vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt sha_ni xsaveopt xsavec xgetbv1 arat npt nrip_save
Virtualization: AMD-V
root@c5a10438d69e:/workspace/llama.cpp# uname -a
Linux c5a10438d69e 5.4.0-139-generic #156-Ubuntu SMP Fri Jan 20 17:27:18 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux
root@c5a10438d69e:/workspace/llama.cpp# g++ --version
g++ (Ubuntu 11.3.0-1ubuntu1~22.04.1) 11.3.0
Copyright (C) 2021 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
root@c5a10438d69e:/workspace/llama.cpp# make --version
GNU Make 4.3
Built for x86_64-pc-linux-gnu
Copyright (C) 1988-2020 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.
root@c5a10438d69e:/workspace/llama.cpp#
I've bisected this and 898aeca is the first bad commit. After reverting to the previous commit, c43c2da, both train-text-from-scratch and finetune appear to work fine (the loss does not go to NaN).
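For anyone who wants to reproduce the bisect, a rough sketch of the steps (commit hashes are the ones above; the older known-good commit is left as a placeholder):

git bisect start
git bisect bad d9b33fe               # current HEAD, where loss becomes NaN
git bisect good <older-good-commit>  # any earlier commit where training still works
# at each step: make clean && make, rerun the train-text-from-scratch command above,
# then run `git bisect good` or `git bisect bad` depending on whether the loss stays finite
# -> the bisect ends with 898aeca reported as the first bad commit

# Workaround for now: build at the commit just before the bad one
git checkout c43c2da
make clean && make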