EADST

ms-swift LoRA SFT 训练脚本:Qwen-3.5-9B

ms-swift LoRA SFT 训练脚本:Qwen-3.5-9B,Len1024,Batch128

完整脚本

#!/usr/bin/env bash
set -euo pipefail

# ms-swift LoRA SFT launch script: max_length=1024, effective batch=128
#
# Usage:
#   bash run_ms_swift_lora_sft_len1024_bs128.sh
#
# Default effective batch on 8 GPUs:
#   8 GPUs * per_device_train_batch_size 2 * gradient_accumulation_steps 8 = 128 samples / optimizer step
#
# Optional overrides (any setting written as ${VAR:-default} below can be
# supplied through the environment):
#   CUDA_VISIBLE_DEVICES=0,1,2,3 NPROC_PER_NODE=4 PER_DEVICE_TRAIN_BATCH_SIZE=2 GRADIENT_ACCUMULATION_STEPS=16 bash run_ms_swift_lora_sft_len1024_bs128.sh

# Workspace layout. Raw ShareGPT files live in WORK_DIR; the converted
# messages-format jsonl files are written under DATA_DIR.
WORK_DIR="/root/paddlejob/workspace/env_run/output/dong/sft"
MODEL_PATH="/root/paddlejob/workspace/env_run/output/model/qwen-3.5-9b"
RAW_TRAIN_DATASET="${WORK_DIR}/sft_train_sharegpt.json"
RAW_VAL_DATASET="${WORK_DIR}/sft_val_sharegpt.json"
DATA_DIR="${WORK_DIR}/ms_swift_data"
TRAIN_DATASET="${DATA_DIR}/sft_train_messages.jsonl"
VAL_DATASET="${DATA_DIR}/sft_val_messages.jsonl"
OUTPUT_ROOT="${WORK_DIR}/outputs/qwen-3.5-9b-lora-sft-len1024-bs128"

# Tool discovery. The trailing `|| true` matters: under `set -e` a failing
# command substitution inside an assignment aborts the script on the spot,
# which would skip the explicit "not found" diagnostics printed later.
# `python3` is tried as a fallback for environments without a `python` alias.
PYTHON_BIN="${PYTHON_BIN:-$(command -v python || command -v python3 || true)}"
SWIFT_BIN="${SWIFT_BIN:-$(command -v swift || true)}"

# SwanLab
# Only export a real key. The literal placeholder is never a valid credential,
# so it is dropped with a warning instead of being handed to child processes.
SWANLAB_API_KEY="${SWANLAB_API_KEY:-YOUR_SWANLAB_API_KEY}"
if [[ "${SWANLAB_API_KEY}" == "YOUR_SWANLAB_API_KEY" ]]; then
  echo "WARNING: SWANLAB_API_KEY is not set; the placeholder value will not be exported." >&2
  unset SWANLAB_API_KEY
else
  export SWANLAB_API_KEY
fi
SWANLAB_PROJECT="${SWANLAB_PROJECT:-qwen-3.5-9b-sft}"
# Timestamp suffix that keeps run names (and output directories) unique.
RUN_TAG="${RUN_TAG:-$(date +%Y%m%d-%H%M%S)}"

# GPU / distributed settings
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"
export NPROC_PER_NODE="${NPROC_PER_NODE:-8}"
export NCCL_DEBUG="${NCCL_DEBUG:-WARN}"
export TOKENIZERS_PARALLELISM="${TOKENIZERS_PARALLELISM:-false}"

# Training hyperparameters. Defaults target 8xA800 with effective batch size 128.
# NUM_TRAIN_EPOCHS and MAX_STEPS are both forwarded to `swift sft`.
NUM_TRAIN_EPOCHS="${NUM_TRAIN_EPOCHS:-3}"
MAX_STEPS="${MAX_STEPS:-1000}"
PER_DEVICE_TRAIN_BATCH_SIZE="${PER_DEVICE_TRAIN_BATCH_SIZE:-2}"
PER_DEVICE_EVAL_BATCH_SIZE="${PER_DEVICE_EVAL_BATCH_SIZE:-2}"
GRADIENT_ACCUMULATION_STEPS="${GRADIENT_ACCUMULATION_STEPS:-8}"
MAX_LENGTH="${MAX_LENGTH:-1024}"
LORA_DROPOUT="${LORA_DROPOUT:-0.1}"
EVAL_STEPS="${EVAL_STEPS:-100}"
SAVE_STEPS="${SAVE_STEPS:-100}"
SAVE_TOTAL_LIMIT="${SAVE_TOTAL_LIMIT:-5}"
# Space-separated list of "<lora_rank>:<learning_rate>" pairs; one run per pair.
LORA_LR_GRID="${LORA_LR_GRID:-16:3e-5 16:5e-5}"

# Run from the workspace root and make sure the output root exists.
# Under `set -e` a failing cd/mkdir stops the script here.
cd "${WORK_DIR}"
mkdir -p "${OUTPUT_ROOT}"

# A Python interpreter is mandatory: the path must be non-empty and executable.
if ! [[ -n "${PYTHON_BIN}" && -x "${PYTHON_BIN}" ]]; then
  printf '%s\n' "ERROR: python executable not found: ${PYTHON_BIN}" >&2
  exit 1
fi

# Same requirement for the swift CLI used to launch training.
if ! [[ -n "${SWIFT_BIN}" && -x "${SWIFT_BIN}" ]]; then
  printf '%s\n' \
    "ERROR: swift command not found." \
    "Please install ms-swift and swanlab in the current environment first." >&2
  exit 1
fi

# Confirm that the selected Python interpreter can locate the 'swift' package,
# and remember where it would be loaded from. The inline program exits 1 when
# the module spec is missing or has no origin, which triggers the error branch.
# Despite its name, SWIFT_SOURCE_DIR receives spec.origin (the module's origin
# path as reported by importlib), not necessarily a directory.
# PYTHON_BIN is quoted so that an interpreter path containing spaces or glob
# characters is executed as a single word.
SWIFT_SOURCE_DIR="$("${PYTHON_BIN}" - <<'PY'
import importlib.util
import sys
spec = importlib.util.find_spec('swift')
if spec is None or not spec.origin:
    sys.exit(1)
print(spec.origin)
PY
)" || {
  # Unquoted heredoc delimiter: the two executable paths are expanded so the
  # user can see which swift / python pair is mismatched.
  cat >&2 <<MSG
ERROR: Python cannot import module 'swift'.
The current swift executable is: ${SWIFT_BIN}
The current Python executable is: ${PYTHON_BIN}
MSG
  exit 1
}

# Startup banner: report the resolved tools and the number of samples consumed
# per optimizer step (processes * per-device batch * accumulation steps).
printf 'Using Python: %s\n' "${PYTHON_BIN}"
printf 'Using swift: %s\n' "${SWIFT_BIN}"
printf 'Using swift module: %s\n' "${SWIFT_SOURCE_DIR}"
effective_batch_size=$((NPROC_PER_NODE * PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS))
printf 'Effective batch size: %s\n' "${effective_batch_size}"

# Fail fast on missing inputs before any conversion or training starts.

# The model must be a directory.
[[ -d "${MODEL_PATH}" ]] || {
  echo "ERROR: model path not found: ${MODEL_PATH}" >&2
  exit 1
}

# Both raw ShareGPT files must exist as regular files.
[[ -f "${RAW_TRAIN_DATASET}" ]] || {
  echo "ERROR: raw train dataset not found: ${RAW_TRAIN_DATASET}" >&2
  exit 1
}

[[ -f "${RAW_VAL_DATASET}" ]] || {
  echo "ERROR: raw validation dataset not found: ${RAW_VAL_DATASET}" >&2
  exit 1
}

# Convert the raw ShareGPT files into the jsonl files that are passed to
# `swift sft` via --dataset / --val_dataset.
PREPARE_SCRIPT="${WORK_DIR}/prepare_ms_swift_dataset.py"
if [[ ! -f "${PREPARE_SCRIPT}" ]]; then
  echo "ERROR: dataset conversion script not found: ${PREPARE_SCRIPT}" >&2
  exit 1
fi

# Nothing else in this script creates DATA_DIR, and the converter's handling of
# a missing output directory is not visible here, so create it explicitly.
mkdir -p "${DATA_DIR}"

"${PYTHON_BIN}" "${PREPARE_SCRIPT}" --input "${RAW_TRAIN_DATASET}" --output "${TRAIN_DATASET}"
"${PYTHON_BIN}" "${PREPARE_SCRIPT}" --input "${RAW_VAL_DATASET}" --output "${VAL_DATASET}"

#######################################
# Choose the option name used to select the tuning method for `swift sft`.
# Arguments: $1 - path to the swift executable
# Outputs:   "--train_type" when that string appears in `swift sft --help`,
#            otherwise "--tuner_type"
# Returns:   0
#######################################
detect_train_type_flag() {
  local swift_bin=$1
  local help_text
  # Capture the help text first instead of piping it into `grep -q`.
  # With `set -o pipefail`, a pipeline is reported as failed when the producer
  # exits non-zero, which can happen after `grep -q` closes the pipe on its
  # first match or when the help command itself returns an error code. Either
  # case would wrongly select the fallback flag even though --train_type exists.
  help_text="$("${swift_bin}" sft --help 2>/dev/null || true)"
  if grep -q -- "--train_type" <<< "${help_text}"; then
    printf '%s\n' "--train_type"
  else
    printf '%s\n' "--tuner_type"
  fi
}

TRAIN_TYPE_FLAG="$(detect_train_type_flag "${SWIFT_BIN}")"

#######################################
# Validate one parsed LORA_LR_GRID entry.
# Arguments: $1 - LoRA rank
#            $2 - learning rate
# Returns:   0 when the rank is a positive decimal integer without leading
#            zeros and the learning rate is a decimal or scientific-notation
#            number; non-zero otherwise
#######################################
is_valid_grid_item() {
  local rank=$1 lr=$2
  # Leading zeros are rejected because $(( )) would read them as octal.
  local -r rank_re='^[1-9][0-9]*$'
  local -r lr_re='^([0-9]+\.?[0-9]*|\.[0-9]+)([eE][-+]?[0-9]+)?$'
  [[ "${rank}" =~ ${rank_re} && "${lr}" =~ ${lr_re} ]]
}

# Pass 1: validate the whole grid before any run starts, so a typo in a later
# item is reported immediately instead of after earlier runs have finished.
# LORA_LR_GRID is deliberately left unquoted: it is a space-separated list.
for item in ${LORA_LR_GRID}; do
  IFS=":" read -r LORA_RANK LEARNING_RATE <<< "${item}"
  # The rank is used in arithmetic expansion below, so it must be a plain
  # integer; arbitrary text there would be evaluated as an expression.
  if ! is_valid_grid_item "${LORA_RANK}" "${LEARNING_RATE}"; then
    echo "ERROR: invalid LORA_LR_GRID item: ${item}" >&2
    echo "Expected <lora_rank>:<learning_rate>, for example 16:3e-5" >&2
    exit 1
  fi
done

# Pass 2: run one SFT job per grid item, sequentially. Under `set -e` a failed
# job stops the script, so later items are not attempted.
for item in ${LORA_LR_GRID}; do
  IFS=":" read -r LORA_RANK LEARNING_RATE <<< "${item}"

  # alpha is tied to the rank: alpha = 2 * rank.
  LORA_ALPHA="$((LORA_RANK * 2))"
  # NOTE(review): "bs128" in the name is a fixed label; it is not recomputed
  # when the batch-related variables are overridden.
  RUN_NAME="qwen-3.5-9b-len${MAX_LENGTH}-bs128-lora-r${LORA_RANK}-lr${LEARNING_RATE}-drop${LORA_DROPOUT}-${RUN_TAG}"
  OUTPUT_DIR="${OUTPUT_ROOT}/${RUN_NAME}"
  mkdir -p "${OUTPUT_DIR}"

  # Arguments are collected in an array so each value stays a single word.
  sft_args=(
    # Model, tuning method and data
    --model "${MODEL_PATH}"
    "${TRAIN_TYPE_FLAG}" lora
    --dataset "${TRAIN_DATASET}"
    --val_dataset "${VAL_DATASET}"
    --torch_dtype bfloat16
    # Schedule and batch geometry
    --num_train_epochs "${NUM_TRAIN_EPOCHS}"
    --max_steps "${MAX_STEPS}"
    --per_device_train_batch_size "${PER_DEVICE_TRAIN_BATCH_SIZE}"
    --per_device_eval_batch_size "${PER_DEVICE_EVAL_BATCH_SIZE}"
    --gradient_accumulation_steps "${GRADIENT_ACCUMULATION_STEPS}"
    --learning_rate "${LEARNING_RATE}"
    --lr_scheduler_type cosine
    --warmup_ratio 0.03
    # Sequence handling
    --max_length "${MAX_LENGTH}"
    --truncation_strategy delete
    --gradient_checkpointing true
    --packing false
    # LoRA configuration
    --lora_rank "${LORA_RANK}"
    --lora_alpha "${LORA_ALPHA}"
    --lora_dropout "${LORA_DROPOUT}"
    --target_modules all-linear
    # Evaluation, checkpointing and logging cadence
    --eval_steps "${EVAL_STEPS}"
    --save_steps "${SAVE_STEPS}"
    --save_total_limit "${SAVE_TOTAL_LIMIT}"
    --logging_steps 5
    --dataloader_num_workers 4
    # Outputs and experiment tracking
    --output_dir "${OUTPUT_DIR}"
    --report_to swanlab
    --swanlab_project "${SWANLAB_PROJECT}"
    --swanlab_exp_name "${RUN_NAME}"
    --model_name "${RUN_NAME}"
    --model_author dong
  )

  echo "Starting SFT run: ${RUN_NAME}"
  "${SWIFT_BIN}" sft "${sft_args[@]}"
done

脚本目标

这个脚本用于在 Qwen-3.5-9B 上进行 ms-swift LoRA SFT 训练。当前实验固定 max_length 和 effective batch size 这两个核心条件,其余关键配置一并列在下表中:

| 配置项 | 当前值 |
| --- | --- |
| max_length | 1024 |
| effective batch size | 128 |
| LoRA dropout | 0.1 |
| LoRA rank | 16 |
| learning rate | 3e-5 / 5e-5 |

它的定位不是单次训练命令,而是一个可复用的实验启动器:负责数据转换、环境检查、参数网格遍历、输出目录隔离和 SwanLab 上报。

当前会跑的实验

脚本里最关键的网格参数是:

LORA_LR_GRID="${LORA_LR_GRID:-16:3e-5 16:5e-5}"

每个 item 的格式是:

LoRA rank:learning rate

因此默认会顺序执行两组实验:

| 实验 | LoRA Rank | Learning Rate | Dropout | Max Length | Effective Batch |
| --- | --- | --- | --- | --- | --- |
| Run 1 | 16 | 3e-5 | 0.1 | 1024 | 128 |
| Run 2 | 16 | 5e-5 | 0.1 | 1024 | 128 |

第一组训练完成后,脚本会继续启动第二组。两组实验会进入不同的输出目录,并分别上报 SwanLab,后续可以直接对比训练曲线和验证集表现。

Batch Size 的计算方式

默认情况下脚本面向 8 卡训练:

CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
NPROC_PER_NODE=8
PER_DEVICE_TRAIN_BATCH_SIZE=2
GRADIENT_ACCUMULATION_STEPS=8

有效 batch size 的计算方式是:

8 GPUs * 2 samples/GPU * 8 gradient accumulation steps = 128 samples / optimizer step

如果只使用 4 张 GPU,也可以通过环境变量覆盖参数,同时保持有效 batch size 不变:

CUDA_VISIBLE_DEVICES=0,1,2,3 \
NPROC_PER_NODE=4 \
PER_DEVICE_TRAIN_BATCH_SIZE=2 \
GRADIENT_ACCUMULATION_STEPS=16 \
bash run_ms_swift_lora_sft_len1024_bs128.sh

数据准备

脚本使用的原始数据是 ShareGPT 格式:

RAW_TRAIN_DATASET="${WORK_DIR}/sft_train_sharegpt.json"
RAW_VAL_DATASET="${WORK_DIR}/sft_val_sharegpt.json"

训练前会先转换成 ms-swift 使用的 messages jsonl:

TRAIN_DATASET="${DATA_DIR}/sft_train_messages.jsonl"
VAL_DATASET="${DATA_DIR}/sft_val_messages.jsonl"

转换命令是:

"${PYTHON_BIN}" "${WORK_DIR}/prepare_ms_swift_dataset.py" --input "${RAW_TRAIN_DATASET}" --output "${TRAIN_DATASET}"
"${PYTHON_BIN}" "${WORK_DIR}/prepare_ms_swift_dataset.py" --input "${RAW_VAL_DATASET}" --output "${VAL_DATASET}"

这里单独做数据转换是有必要的。它把原始数据格式处理和训练过程解耦,后续如果训练出错,也更容易判断问题出在数据侧还是训练侧。

训练参数

核心训练命令使用 swift sft,训练类型是 LoRA(脚本会先检查 swift sft --help 的输出:如果其中包含 --train_type 就使用它,否则改用 --tuner_type,以兼容不同版本的 ms-swift):

--train_type lora
--target_modules all-linear
--lora_rank "${LORA_RANK}"
--lora_alpha "${LORA_ALPHA}"
--lora_dropout "${LORA_DROPOUT}"

其中 lora_alpha 会按 rank 自动计算:

LORA_ALPHA="$((LORA_RANK * 2))"

当前 rank 为 16,所以 alpha 为 32。

主要训练参数如下:

| 参数 | 配置 |
| --- | --- |
| num_train_epochs | 3 |
| max_steps | 1000 |
| max_length | 1024 |
| per_device_train_batch_size | 2 |
| per_device_eval_batch_size | 2 |
| gradient_accumulation_steps | 8 |
| lr_scheduler_type | cosine |
| warmup_ratio | 0.03 |
| torch_dtype | bfloat16 |
| gradient_checkpointing | true |
| packing | false |
| eval_steps | 100 |
| save_steps | 100 |
| save_total_limit | 5 |
| logging_steps | 5 |

其中 max_steps=1000 是一个硬上限。即使设置了 num_train_epochs=3,训练也会在达到 1000 step 后停止。

后台运行

带时间戳日志:

cd /root/paddlejob/workspace/env_run/output/dong/sft
nohup bash run_ms_swift_lora_sft_len1024_bs128.sh > train_len1024_bs128_$(date +%Y%m%d_%H%M%S).log 2>&1 &

固定日志文件名:

cd /root/paddlejob/workspace/env_run/output/dong/sft
nohup bash run_ms_swift_lora_sft_len1024_bs128.sh > train_len1024_bs128.log 2>&1 &

查看日志:

tail -f /root/paddlejob/workspace/env_run/output/dong/sft/train_len1024_bs128.log

查看进程:

ps -ef | grep run_ms_swift_lora_sft_len1024_bs128 | grep -v grep

停止训练(该命令按脚本名匹配进程,只会结束启动脚本本身;如果 swift 训练子进程仍在运行,需要再单独确认并结束):

pkill -f run_ms_swift_lora_sft_len1024_bs128.sh

小结

这个脚本的核心价值是把一次 LoRA SFT 实验标准化。它固定 max_length=1024 和有效 batch size 128,然后对 3e-5 和 5e-5 两个学习率做顺序对比。

从工程上看,它覆盖了训练中容易出问题的几个环节:数据转换、环境检查、版本兼容、输出目录隔离和 SwanLab 记录。后续如果继续扩展,可以把 LORA_LR_GRID 扩成更多 rank / learning rate 组合,也可以进一步把 max_length、dropout 和自动评测逻辑纳入实验流程。

相关标签
About Me
XD
Goals determine what you are going to be.
Category
标签云
NLP Windows Vim 论文速读 git-lfs Algorithm Statistics Agent PyCharm Card Attention 云服务器 RGB uWSGI TSV Template UNIX Conda PIP Augmentation LLM uwsgi Claude Pytorch Clash v0.dev Docker v2ray OpenAI Disk Pickle ms-swift Quantize 继承 Paddle Qwen News Distillation Diagram FastAPI MD5 GPT4 BeautifulSoup 第一性原理 Bert Cloudreve Web Qwen2 Gemma 算法题 Google Miniforge tar SQLite Ptyhon PDB LaTeX llama.cpp Breakpoint Sklearn AI DeepStream HuggingFace Logo Vmess Tracking XML WebCrawler C++ Linux COCO 报税 Data Jetson 版权 Bin scipy Ubuntu 签证 CSV Shortcut Nginx GoogLeNet ChatGPT GPTQ Python Pandas Bipartite 多进程 TensorRT SVR 阿里云 Proxy Base64 域名 Rebuttal EXCEL Crawler 净利润 CC Hungarian Plotly 图形思考法 transformers 递归学习法 mmap Qwen2.5 Image2Text Translation git Search Firewall Anaconda Dataset Github NLTK SQL Mixtral 论文 XGBoost 顶会 Freesound RAR Markdown torchinfo OpenCV Review BTC VGG-16 Git 腾讯云 Domain Video Random GGML Quantization diffusers SAM PyTorch VSCode Safetensors VPN LLAMA Plate tqdm Paper JSON 证件照 Numpy 公式 PDF Land YOLO InvalidArgumentError Jupyter Tensor FP32 财报 Django Streamlit Math API Llama 飞书 FP16 Color CEIR 多线程 TTS Permission printf Tiktoken Excel DeepSeek BF16 ModelScope ResNet-50 Hotel Animate SPIE LoRA Pillow Knowledge CLAP CAM LeetCode Hilton Website hf GIT TensorFlow CTC CUDA 关于博主 icon Zip ONNX Interview Baidu NameSilo Heatmap OCR Datetime Michelin 图标 WAN 音频 Magnet Password IndexTTS2 QWEN Bitcoin FP8 Input logger Use FlashAttention FP64 CV Transformers 搞笑 Food 强化学习 HaggingFace UI
站点统计

本站现有博文329篇,共被浏览858628

本站已经建立2567天!

热门文章
文章归档
回到顶部