LLaMA-Factory on Kubernetes (k8s) demo
【代码】llamafactory on k8s demo。
·
# Kubernetes Job that runs a single `llamafactory-cli train` invocation:
# LoRA supervised fine-tuning (--stage sft, --finetuning_type lora) of
# Qwen/Qwen2.5-0.5B on the `identity` dataset, on a node labelled gpu=on.
# Caches, the dataset directory and the output directory are hostPath mounts.
apiVersion: batch/v1
kind: Job
metadata:
  name: llamafactory-modelscope-job-sft-lora-qwen25-05b
  namespace: test
spec:
  completions: 1  # total number of successful completions required
  parallelism: 1  # number of Pods run in parallel
  # Maximum number of retries before the Job is marked failed.
  # NOTE(review): every retry reuses the fixed --output_dir below; confirm the
  # trainer tolerates a non-empty output directory left by a failed attempt.
  backoffLimit: 3
  template:
    metadata:
      labels:
        app: llamafactory
    spec:
      # Schedule only on nodes carrying the label gpu=on.
      nodeSelector:
        gpu: "on"
      containers:
        - name: llamafactory-modelscope-job-sft-lora-qwen25-05b
          image: llamafactory-ms:latest  # replace with your image name
          imagePullPolicy: IfNotPresent
          resources:
            limits:
              nvidia.com/gpu: 1
              # NOTE(review): not a resource of the stock NVIDIA device
              # plugin; presumably a GPU-memory limit from a GPU-sharing
              # plugin (unit likely MiB) - confirm against the cluster setup.
              nvidia.com/gpumem: 20240
          env:
            # Presumably switches model/dataset downloads to the ModelScope
            # hub - confirm against the LLaMA-Factory documentation.
            - name: USE_MODELSCOPE_HUB
              value: "1"
          # Run the training command through bash; -x traces the command
          # into the Pod log, -c executes the script that follows.
          command:
            - bash
            - -xc
            - |
              llamafactory-cli train --stage sft \
                --do_train True \
                --model_name_or_path Qwen/Qwen2.5-0.5B \
                --preprocessing_num_workers 16 \
                --finetuning_type lora \
                --template default \
                --flash_attn auto \
                --dataset_dir data \
                --dataset identity \
                --cutoff_len 2048 \
                --learning_rate 5e-05 \
                --num_train_epochs 30.0 \
                --max_samples 100000 \
                --per_device_train_batch_size 2 \
                --gradient_accumulation_steps 8 \
                --lr_scheduler_type cosine \
                --max_grad_norm 1.0 \
                --logging_steps 5 \
                --save_steps 100 \
                --warmup_steps 0 \
                --packing False \
                --report_to none \
                --output_dir /app/output/Qwen2.5-0.5B/lora/train_2024-12-04-03-11-40 \
                --bf16 True \
                --plot_loss True \
                --ddp_timeout 180000000 \
                --optim adamw_torch \
                --lora_rank 8 \
                --lora_alpha 16 \
                --lora_dropout 0 \
                --lora_target all
          # Directories mounted into the container (backed by the hostPath
          # volumes declared under `volumes`).
          volumeMounts:
            - name: hf-cache
              mountPath: /root/.cache/huggingface
            - name: ms-cache
              mountPath: /root/.cache/modelscope
            - name: om-cache
              mountPath: /root/.cache/openmind
            - name: data
              mountPath: /app/data
            - name: output
              mountPath: /app/output
          ports:
            - containerPort: 7860
            - containerPort: 8000
      # hostPath volumes: `type: Directory` requires each path to already
      # exist on the node, otherwise the Pod cannot start.
      volumes:
        - name: hf-cache
          hostPath:
            path: /root/.cache/huggingface
            type: Directory
        - name: ms-cache
          hostPath:
            path: /root/.cache/modelscope
            type: Directory
        - name: om-cache
          hostPath:
            path: /root/.cache/openmind
            type: Directory
        - name: data
          hostPath:
            path: /root/LLaMA-Factory/data
            type: Directory
        - name: output
          hostPath:
            path: /app/output
            type: Directory
      # Containers are never restarted in place; a failed Pod is replaced by
      # the Job controller, up to backoffLimit times.
      restartPolicy: Never
更多推荐




所有评论(0)