Fine-tuning environment

Rent an RTX 4090 on AutoDL and select the PyTorch 2.1.2 / Python 3.10 image.

After the instance boots, open JupyterLab and create a new .ipynb file.
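
The steps below assume the unsloth and modelscope packages are available. If the AutoDL image does not already include them, a typical install cell looks roughly like this (versions are not pinned here; adjust to whatever the image requires):

!pip install -U unsloth      # fine-tuning framework used below
!pip install -U modelscope   # used to download the model and dataset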

Download the Qwen3-4B model

Add a new cell to the .ipynb file and run the following code:

from modelscope import snapshot_download
model_dir = snapshot_download('Qwen/Qwen3-4B', cache_dir='/root/autodl-tmp/models')
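
As an optional sanity check (not part of the original steps), you can confirm the weights landed in the expected location:

import os

# model_dir is returned by snapshot_download above; list its contents to confirm the download
print(model_dir)
print(os.listdir(model_dir))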

Download the dataset

Add a new cell to the .ipynb file and run the following code:

# Download the dataset
from modelscope import MsDataset
import os

# Fixed download directory
CACHE_DIR = "/root/autodl-tmp/datasets"  # change this to any path you like


# Download the dataset into the specified directory
dataset = MsDataset.load(
    'AI-ModelScope/alpaca-cleaned',
    split='train',
    cache_dir=CACHE_DIR  # key argument: the cache directory
)
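
To confirm that the dataset files were cached, you can walk the cache directory (a small optional check; the exact layout under CACHE_DIR depends on the ModelScope version):

# List everything that was downloaded under CACHE_DIR ("os" was imported in the cell above)
for root, dirs, files in os.walk(CACHE_DIR):
    for name in files:
        print(os.path.join(root, name))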

Fine-tuning

Add a new cell to the .ipynb file and run the following code:


# Import the required libraries
from unsloth import FastLanguageModel
import torch

# Model settings
max_seq_length = 2048  # maximum sequence length; RoPE scaling is supported
dtype = None  # data type; None means auto-detect (Float16 on Tesla T4, Bfloat16 on Ampere+)
load_in_4bit = True  # use 4-bit quantization to reduce memory usage

# Load the pretrained model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "/root/autodl-tmp/models/Qwen/Qwen3-4B",  # local path to the downloaded Qwen3-4B model
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Attach LoRA adapters so that only about 1-10% of the parameters are updated
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,  # LoRA rank; suggested values are 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],  # modules to apply LoRA to
    lora_alpha = 16,  # LoRA scaling factor
    lora_dropout = 0,  # LoRA dropout; 0 is the optimized setting
    bias = "none",    # bias setting; "none" is the optimized setting
    use_gradient_checkpointing = "unsloth",  # Unsloth gradient checkpointing, cuts VRAM usage by ~30%
    random_state = 3407,  # random seed
    use_rslora = False,  # whether to use rank-stabilized LoRA
    loftq_config = None,  # LoftQ configuration
)
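
# Optional check (not in the original notebook): verify that only the LoRA adapters are trainable.
# This is plain PyTorch and does not rely on any Unsloth-specific API.
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"trainable params: {trainable:,} / {total:,} ({trainable / total:.2%})")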


# Load the dataset for a quick inspection
from datasets import load_dataset

dataset = load_dataset("/root/autodl-tmp/datasets/AI-ModelScope___alpaca-cleaned", split = "train")

# Inspect the dataset features (column names)
print("Dataset features:")
print(dataset.features)

# Look at the first sample
print("\nFirst sample:")
print(dataset[0])

# Check the keys of the first few samples (in case keys differ between examples)
print("\nKeys in the first few examples:")
for i in range(min(3, len(dataset))):
    print(f"Keys of sample {i}: {list(dataset[i].keys())}")


# Define the Alpaca-style prompt template
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence token
EOS_TOKEN = tokenizer.eos_token

# Formatting function that fills the Alpaca template for each example
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # EOS_TOKEN must be appended, otherwise generation would continue indefinitely
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }

# Load the Alpaca dataset and apply the formatting function
from datasets import load_dataset

dataset = load_dataset("/root/autodl-tmp/datasets/AI-ModelScope___alpaca-cleaned", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)
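
# Optional check: print one row of the new "text" column to confirm the template and EOS token were applied.
print(dataset[0]["text"])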


# Training arguments and trainer setup
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Define the training arguments
training_args = TrainingArguments(
        per_device_train_batch_size = 2,  # per-device training batch size
        gradient_accumulation_steps = 4,  # gradient accumulation steps
        warmup_steps = 5,  # warmup steps
        max_steps = 60,  # maximum number of training steps
        learning_rate = 2e-4,  # learning rate
        fp16 = not is_bfloat16_supported(),  # use FP16 when BF16 is unavailable
        bf16 = is_bfloat16_supported(),  # use BF16 on supported GPUs
        logging_steps = 1,  # log every step
        optim = "adamw_8bit",  # optimizer
        weight_decay = 0.01,  # weight decay
        lr_scheduler_type = "linear",  # learning-rate scheduler type
        seed = 3407,  # random seed
        output_dir = "outputs",  # output directory
        report_to = "none",  # disable external reporting (e.g. W&B)
    )
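
# Sanity check (added for illustration): the effective batch size is
# per_device_train_batch_size * gradient_accumulation_steps = 2 * 4 = 8,
# so the 60-step demo run sees roughly 8 * 60 = 480 training examples.
effective_batch = 2 * 4
examples_seen = effective_batch * 60
print(effective_batch, examples_seen)  # 8 480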

# Create the SFTTrainer instance
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,  # can be set to True for short sequences, up to 5x faster training
    args = training_args,
)


# Show the current GPU memory status
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")


# Start training
trainer_stats = trainer.train()



# Show post-training memory and time statistics
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")


# Inference
FastLanguageModel.for_inference(model)  # enable Unsloth's native 2x faster inference
inputs = tokenizer(
[
    # alpaca_prompt.format(
    #     "Continue the fibonacci sequence.",  # instruction
    #     "1, 1, 2, 3, 5, 8",  # input
    #     "",  # leave output empty for generation
    # )

    alpaca_prompt.format(
        "Continue the fibonacci sequence.",  # instruction
        "11, 11, 22, 33, 55, 88",  # input
        "",  # leave output empty for generation
    )
], return_tensors = "pt").to("cuda")

# Generate the output
outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)
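
# Optional post-processing (an assumption of this write-up, not something Unsloth requires):
# the decoded strings contain the full prompt plus the continuation, so split on the
# "### Response:" marker to keep only the model's answer.
decoded = tokenizer.batch_decode(outputs, skip_special_tokens = True)
responses = [d.split("### Response:")[-1].strip() for d in decoded]
print(responses[0])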


# Streamed inference with TextStreamer
FastLanguageModel.for_inference(model)
inputs = tokenizer(
[
    # alpaca_prompt.format(
    #     "Continue the fibonacci sequence.",  # instruction
    #     "1, 1, 2, 3, 5, 8",  # input
    #     "",  # leave output empty for generation
    # )

    alpaca_prompt.format(
        "Continue the fibonacci sequence.",  # instruction
        "11, 11, 22, 33, 55, 88",  # input
        "",  # leave output empty for generation
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)


# Save the fine-tuned LoRA adapters
model.save_pretrained("lora_model_shuang")  # save locally
tokenizer.save_pretrained("lora_model_shuang")
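
# The two calls above only save the LoRA adapters, not a standalone model.
# Sketch (based on Unsloth's documented merged-save helper; the exact signature may vary
# between Unsloth versions, and the output directory name here is just an example):
# merge the LoRA weights into the base model and save a 16-bit checkpoint.
model.save_pretrained_merged("lora_model_shuang_merged", tokenizer, save_method = "merged_16bit")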



# Load the saved model for inference (change False to True to run this block, e.g. after a restart)
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model_shuang",  # the LoRA adapters saved above
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model)  # enable native 2x faster inference

# Inference example with the saved model
inputs = tokenizer(
[
    alpaca_prompt.format(
        "What is a famous tall tower in Paris? 用中文回答",  # instruction (asks the model to reply in Chinese)
        "",  # input
        "",  # leave output empty for generation
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

