triton.runtime.autotuner.OutOfResources #4688

Open
GitIgnoreMaybe opened this issue Jul 5, 2024 · 5 comments

Labels
pending This problem is yet to be addressed

Comments

@GitIgnoreMaybe

Reminder

  • I have read the README and searched the existing issues.

System Info

llamafactory-0.8.3.dev0, Ubuntu 22.04.3 LTS, py3.10, cuda11.8.0

Reproduction

Command (the model's config.json is shown below; the actual training command is posted in a comment further down):

"_name_or_path": "microsoft/Phi-3-small-8k-instruct",
  "architectures": [
    "Phi3SmallForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout_prob": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-small-8k-instruct--configuration_phi3_small.Phi3SmallConfig",
    "AutoModelForCausalLM": "microsoft/Phi-3-small-8k-instruct--modeling_phi3_small.Phi3SmallForCausalLM",
    "AutoModelForSequenceClassification": "microsoft/Phi-3-small-8k-instruct--modeling_phi3_small.Phi3SmallForSequenceClassification",
    "AutoTokenizer": "microsoft/Phi-3-small-8k-instruct--tokenization_phi3_small.Phi3SmallTokenizer"
  },
  "blocksparse_block_size": 64,
  "blocksparse_homo_head_pattern": false,
  "blocksparse_num_local_blocks": 16,
  "blocksparse_triton_kernel_block_size": 64,
  "blocksparse_vert_stride": 8,
  "bos_token_id": 100257,
  "dense_attention_every_n_layers": 2,
  "embedding_dropout_prob": 0.1,
  "eos_token_id": 100257,
  "ff_dim_multiplier": null,
  "ff_intermediate_size": 14336,
  "ffn_dropout_prob": 0.1,
  "gegelu_limit": 20.0,
  "gegelu_pad_to_256": true,
  "hidden_act": "gegelu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 8192,
  "model_type": "phi3small",
  "mup_attn_multiplier": 1.0,
  "mup_embedding_multiplier": 10.0,
  "mup_use_scaling": true,
  "mup_width_multiplier": 8.0,
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pad_sequence_to_multiple_of_64": true,
  "reorder_and_upcast_attn": false,
  "rope_embedding_base": 1000000,
  "rope_position_scale": 1.0,
  "rope_scaling": null,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.42.3",
  "use_cache": true,
  "vocab_size": 100352
}

Error

Traceback (most recent call last):
  File "/usr/local/bin/llamafactory-cli", line 8, in <module>
    sys.exit(main())
  File "/root/LLaMA-Factory/src/llamafactory/cli.py", line 111, in main
    run_exp()
  File "/root/LLaMA-Factory/src/llamafactory/train/tuner.py", line 50, in run_exp
    run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
  File "/root/LLaMA-Factory/src/llamafactory/train/sft/workflow.py", line 90, in run_sft
    train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 1932, in train
    return inner_training_loop(
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2268, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 3307, in training_step
    loss = self.compute_loss(model, inputs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 3338, in compute_loss
    outputs = model(**inputs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py", line 819, in forward
    return model_forward(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py", line 807, in __call__
    return convert_to_fp32(self.model_forward(*args, **kwargs))
  File "/usr/local/lib/python3.10/dist-packages/torch/amp/autocast_mode.py", line 16, in decorate_autocast
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/peft/peft_model.py", line 1430, in forward
    return self.base_model(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/peft/tuners/tuners_utils.py", line 179, in forward
    return self.model.forward(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py", line 169, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3-small-8k-instruct/69caae1f2acea34b26f535fecb1f2abb9a304695/modeling_phi3_small.py", line 956, in forward
    outputs = self.model(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py", line 169, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3-small-8k-instruct/69caae1f2acea34b26f535fecb1f2abb9a304695/modeling_phi3_small.py", line 849, in forward
    layer_outputs = self._gradient_checkpointing_func(
  File "/root/LLaMA-Factory/src/llamafactory/model/model_utils/checkpointing.py", line 65, in custom_gradient_checkpointing_func
    return gradient_checkpointing_func(func, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/_compile.py", line 24, in inner
    return torch._dynamo.disable(fn, recursive)(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/_dynamo/eval_frame.py", line 328, in _fn
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/_dynamo/external_utils.py", line 17, in inner
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py", line 451, in checkpoint
    return CheckpointFunction.apply(function, preserve, *args)
  File "/usr/local/lib/python3.10/dist-packages/torch/autograd/function.py", line 539, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py", line 230, in forward
    outputs = run_function(*args)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py", line 169, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3-small-8k-instruct/69caae1f2acea34b26f535fecb1f2abb9a304695/modeling_phi3_small.py", line 671, in forward
    hidden_states, self_attn_weights, present_key_values = self.self_attn(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py", line 169, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3-small-8k-instruct/69caae1f2acea34b26f535fecb1f2abb9a304695/modeling_phi3_small.py", line 616, in forward
    attn_function_output = self._apply_blocksparse_attention(
  File "/root/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3-small-8k-instruct/69caae1f2acea34b26f535fecb1f2abb9a304695/modeling_phi3_small.py", line 382, in _apply_blocksparse_attention
    context_layer = self._blocksparse_layer(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3-small-8k-instruct/69caae1f2acea34b26f535fecb1f2abb9a304695/triton_blocksparse_attention_layer.py", line 165, in forward
    return blocksparse_flash_attn_padded_fwd(
  File "/root/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3-small-8k-instruct/69caae1f2acea34b26f535fecb1f2abb9a304695/triton_flash_blocksparse_attn.py", line 994, in blocksparse_flash_attn_padded_fwd
    _fwd_kernel_batch_inference[grid](
  File "/usr/local/lib/python3.10/dist-packages/triton/runtime/autotuner.py", line 232, in run
    return self.fn.run(*args, **kwargs)
  File "<string>", line 65, in _fwd_kernel_batch_inference
  File "/usr/local/lib/python3.10/dist-packages/triton/compiler/compiler.py", line 579, in __getattribute__
    self._init_handles()
  File "/usr/local/lib/python3.10/dist-packages/triton/compiler/compiler.py", line 568, in _init_handles
    raise OutOfResources(self.shared, max_shared, "shared memory")
triton.runtime.autotuner.OutOfResources: out of resource: shared memory, Required: 246272, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.
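The final line of the traceback comes from the Triton autotuner inside the model's remote code (triton_flash_blocksparse_attn.py): the compiled blocksparse attention kernel needs roughly 240 KB (246272 bytes) of shared memory per block, while the GPU reports a limit of roughly 99 KB (101376 bytes). As the message suggests, smaller tile (block) sizes or fewer pipeline stages shrink that requirement. The sketch below is a minimal, generic illustration of where those knobs live in an autotuned Triton kernel; the kernel body, names, and config values are made-up placeholders, not the actual Phi-3 kernel or a verified fix.

# Minimal, generic sketch of a Triton autotuned kernel (NOT the Phi-3 code).
# Shared memory per block grows with the tile sizes (BLOCK_M, BLOCK_N) and with
# num_stages, so shrinking them is how the "Reducing block sizes or `num_stages`
# may help" hint is usually applied.
import torch
import triton
import triton.language as tl

@triton.autotune(
    configs=[
        # A large tile with deep pipelining needs the most shared memory.
        triton.Config({"BLOCK_M": 128, "BLOCK_N": 128}, num_stages=3, num_warps=8),
        # Smaller fallbacks that fit on GPUs with less shared memory per block.
        triton.Config({"BLOCK_M": 64, "BLOCK_N": 64}, num_stages=2, num_warps=4),
        triton.Config({"BLOCK_M": 32, "BLOCK_N": 32}, num_stages=1, num_warps=4),
    ],
    key=["M", "N"],
)
@triton.jit
def _toy_tile_copy(x_ptr, y_ptr, M, N, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
    # Copies one BLOCK_M x BLOCK_N tile; a stand-in for an attention inner loop.
    pid_m = tl.program_id(0)
    pid_n = tl.program_id(1)
    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
    ptrs = offs_m[:, None] * N + offs_n[None, :]
    tile = tl.load(x_ptr + ptrs, mask=mask, other=0.0)
    tl.store(y_ptr + ptrs, tile, mask=mask)

def run(M: int = 512, N: int = 512) -> None:
    x = torch.randn(M, N, device="cuda", dtype=torch.float16)
    y = torch.empty_like(x)
    grid = lambda meta: (triton.cdiv(M, meta["BLOCK_M"]), triton.cdiv(N, meta["BLOCK_N"]))
    _toy_tile_copy[grid](x, y, M, N)

In the downloaded model code, the corresponding configs would presumably be the autotuner configs attached to _fwd_kernel_batch_inference in triton_flash_blocksparse_attn.py; editing vendored remote code like that is a workaround rather than a fix.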

Expected behavior

Hello, I'm really not sure whether this is a LLaMA Factory issue or one with the cloud GPU provider. Does anyone know what to do?

Others

No response

@github-actions github-actions bot added the pending This problem is yet to be addressed label Jul 5, 2024
@codemayq
Collaborator
codemayq commented Jul 5, 2024

The reproduction command is not posted, so we don't know what process you are running.

@GitIgnoreMaybe
Author

Hey @codemayq,

Thanks for the help.

llamafactory-cli train \
    --stage sft \
    --do_train True \
    --model_name_or_path microsoft/Phi-3-small-8k-instruct \
    --preprocessing_num_workers 16 \
    --finetuning_type lora \
    --quantization_bit 4 \
    --quantization_method bitsandbytes \
    --template phi \
    --flash_attn fa2 \
    --dataset_dir data \
    --dataset custom_instruct_training_data.json \
    --cutoff_len 1024 \
    --learning_rate 1.0e-04 \
    --num_train_epochs 1.0 \
    --max_samples 1000 \
    --per_device_train_batch_size 5 \
    --gradient_accumulation_steps 8 \
    --lr_scheduler_type cosine \
    --max_grad_norm 1.0 \
    --logging_steps 5 \
    --save_steps 100 \
    --warmup_steps 0 \
    --optim adamw_torch \
    --packing False \
    --report_to none \
    --output_dir saves/Phi3-7B-8k-Chat/lora/train_2024-07-05-13-47-27 \
    --bf16 True \
    --plot_loss True \
    --ddp_timeout 180000000 \
    --include_num_input_tokens_seen True \
    --lora_rank 256 \
    --lora_alpha 512 \
    --lora_dropout 0 \
    --lora_target all \
    --val_size 0.1 \
    --eval_strategy steps \
    --eval_steps 100 \
    --per_device_eval_batch_size 5

@GitIgnoreMaybe
Author
GitIgnoreMaybe commented Jul 5, 2024

I think my LoRA rank and LoRA alpha were wrong.

@hiyouga
Owner
hiyouga commented Jul 5, 2024

decrease the train batch size
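For illustration only (the values below are assumptions, not a verified fix): applied to the original command, that suggestion amounts to changing just the batch-related flags, and raising gradient accumulation keeps the effective batch size of 40 (5 × 8) unchanged.

# Replace only these flags in the original llamafactory-cli train command;
# 1 sample per step x 40 accumulation steps matches the original 5 x 8 effective batch.
    --per_device_train_batch_size 1 \
    --gradient_accumulation_steps 40 \
    --per_device_eval_batch_size 1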

@GitIgnoreMaybe
Author
GitIgnoreMaybe commented Jul 7, 2024

@hiyouga Thanks for the help.

This didn't work either, but I figured out that quantization causes the issue: it works when I'm not quantizing. Sounds like a bug, right?

Failing with this:

llamafactory-cli train \
    --stage sft \
    --do_train True \
    --model_name_or_path microsoft/Phi-3-small-8k-instruct \
    --preprocessing_num_workers 16 \
    --finetuning_type lora \
    --quantization_bit 4 \
    --quantization_method bitsandbytes \
    --template phi \
    --flash_attn fa2 \
    --dataset_dir data \
    --dataset data_query_expansion.json \
    --cutoff_len 512 \
    --learning_rate 0.0001 \
    --num_train_epochs 8.0 \
    --max_samples 1000 \
    --per_device_train_batch_size 1 \
    --gradient_accumulation_steps 16 \
    --lr_scheduler_type cosine \
    --max_grad_norm 1.0 \
    --logging_steps 5 \
    --save_steps 100 \
    --warmup_steps 0 \
    --optim adamw_torch \
    --packing False \
    --report_to none \
    --output_dir saves/Phi3-7B-8k-Chat/lora/output-q4 \
    --bf16 True \
    --plot_loss True \
    --ddp_timeout 180000000 \
    --include_num_input_tokens_seen True \
    --lora_rank 8 \
    --lora_alpha 16 \
    --lora_dropout 0 \
    --lora_target all \
    --val_size 0.1 \
    --eval_strategy steps \
    --eval_steps 100 \
    --per_device_eval_batch_size 1

This worked:

llamafactory-cli train \
    --stage sft \
    --do_train True \
    --model_name_or_path microsoft/Phi-3-small-8k-instruct \
    --preprocessing_num_workers 16 \
    --finetuning_type lora \
    --quantization_method bitsandbytes \
    --template phi \
    --flash_attn fa2 \
    --dataset_dir data \
    --dataset data_query_expansion.json \
    --cutoff_len 512 \
    --learning_rate 0.0001 \
    --num_train_epochs 8.0 \
    --max_samples 1000 \
    --per_device_train_batch_size 1 \
    --gradient_accumulation_steps 16 \
    --lr_scheduler_type cosine \
    --max_grad_norm 1.0 \
    --logging_steps 5 \
    --save_steps 100 \
    --warmup_steps 0 \
    --optim adamw_torch \
    --packing False \
    --report_to none \
    --output_dir saves/Phi3-7B-8k-Chat/lora/output-q4 \
    --bf16 True \
    --plot_loss True \
    --ddp_timeout 180000000 \
    --include_num_input_tokens_seen True \
    --lora_rank 8 \
    --lora_alpha 16 \
    --lora_dropout 0 \
    --lora_target all \
    --val_size 0.1 \
    --eval_strategy steps \
    --eval_steps 100 \
    --per_device_eval_batch_size 1
