apiVersion: batch/v1
kind: Job
metadata:
  name: llamafactory-modelscope-job-sft-lora-qwen25-05b
  namespace: test
spec:
  completions: 1  # total number of Pods that must complete
  parallelism: 1  # number of Pods running in parallel
  backoffLimit: 3  # maximum number of retries
  template:
    metadata:
      labels:
        app: llamafactory
    spec:
      nodeSelector:
        gpu: "on"
      containers:
        - name: llamafactory-modelscope-job-sft-lora-qwen25-05b
          image: llamafactory-ms:latest  # replace with your image name
          imagePullPolicy: IfNotPresent
          resources:
            limits:
              nvidia.com/gpu: 1
              nvidia.com/gpumem: 20240  # GPU memory in MB; extended resource exposed by a vGPU scheduler such as HAMi
          env:
            - name: USE_MODELSCOPE_HUB
              value: "1"
          command:  # run the training command with bash
            - bash
            - -xc
            - |
              llamafactory-cli train --stage sft \
                --do_train True \
                --model_name_or_path Qwen/Qwen2.5-0.5B \
                --preprocessing_num_workers 16 \
                --finetuning_type lora \
                --template default \
                --flash_attn auto \
                --dataset_dir data \
                --dataset identity \
                --cutoff_len 2048 \
                --learning_rate 5e-05 \
                --num_train_epochs 30.0 \
                --max_samples 100000 \
                --per_device_train_batch_size 2 \
                --gradient_accumulation_steps 8 \
                --lr_scheduler_type cosine \
                --max_grad_norm 1.0 \
                --logging_steps 5 \
                --save_steps 100 \
                --warmup_steps 0 \
                --packing False \
                --report_to none \
                --output_dir /app/output/Qwen2.5-0.5B/lora/train_2024-12-04-03-11-40 \
                --bf16 True \
                --plot_loss True \
                --ddp_timeout 180000000 \
                --optim adamw_torch \
                --lora_rank 8 \
                --lora_alpha 16 \
                --lora_dropout 0 \
                --lora_target all
          volumeMounts:  # mounted directories
            - name: hf-cache
              mountPath: /root/.cache/huggingface
            - name: ms-cache
              mountPath: /root/.cache/modelscope
            - name: om-cache
              mountPath: /root/.cache/openmind
            - name: data
              mountPath: /app/data
            - name: output
              mountPath: /app/output
          ports:
            - containerPort: 7860
            - containerPort: 8000

      volumes:
        - name: hf-cache
          hostPath:
            path: /root/.cache/huggingface
            type: Directory
        - name: ms-cache
          hostPath:
            path: /root/.cache/modelscope
            type: Directory
        - name: om-cache
          hostPath:
            path: /root/.cache/openmind
            type: Directory
        - name: data
          hostPath:
            path: /root/LLaMA-Factory/data
            type: Directory
        - name: output
          hostPath:
            path: /app/output
            type: Directory
      restartPolicy: Never  # Pods are not restarted automatically once the Job finishes
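To launch the fine-tuning run, apply the manifest and follow the Pod logs until the Job completes. Below is a minimal sketch; the filename is arbitrary (pick whatever you saved the YAML above as), while the namespace, label, and Job name come from the manifest.

# Hypothetical filename; adjust to wherever you saved the manifest.
kubectl apply -f llamafactory-sft-lora-qwen25-05b-job.yaml

# Watch the Job's Pod come up in the "test" namespace.
kubectl -n test get pods -l app=llamafactory -w

# Stream the training output from llamafactory-cli.
kubectl -n test logs -f job/llamafactory-modelscope-job-sft-lora-qwen25-05b

# Check completion status; the LoRA adapter is written under /app/output,
# which is backed by the hostPath "output" volume on the node.
kubectl -n test get job llamafactory-modelscope-job-sft-lora-qwen25-05b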
