import subprocess
import os

# Source the hosting environment's proxy script (/etc/network_turbo, provided by AutoDL) and
# copy the resulting proxy variables into this process so external downloads go through.
result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value
# bitsandbytes: a quantization library focused on cutting the memory footprint of large language models (especially on GPUs).
# peft: used to attach LoRA adapters to large language models (LLMs).
# trl: provides the SFT (supervised fine-tuning) trainer class that drives the fine-tuning.
# accelerate and xformers: speed up model inference and thereby improve performance.
# wandb: a monitoring platform for tracking and observing the training run.
# datasets: Hugging Face's library for conveniently loading datasets.
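The stack above can be installed in one go; this is a minimal, unpinned install line (not part of the original, so adjust package versions to your CUDA setup):

!pip install -U bitsandbytes peft trl accelerate xformers wandb datasets transformers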

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
    TextStreamer,
    Trainer
)
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import os, wandb
torch.cuda.is_available()  # sanity check: a CUDA GPU should be visible
torch.cuda.device_count()  # number of visible GPUs

1. Load the Model and Tokenizer

# Pretrained model
model_name = "/root/autodl-tmp/model/Meta-Llama-3-8B"


# Dataset name
dataset_name = "scooterman/guanaco-llama3-1k"
# Load the pretrained model and tokenizer

# Quantization config
# https://huggingface.co/docs/transformers/en/main_classes/quantization#transformers.BitsAndBytesConfig
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True, # load the model in 4-bit quantized form
    bnb_4bit_quant_type = "nf4", # use the nf4 4-bit quantization type
    bnb_4bit_compute_dtype = torch.float16, # compute dtype used for the dequantized matmuls
    bnb_4bit_use_double_quant = False, # do not use double (nested) quantization
)
# Load the model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config = bnb_config,
    device_map = {"": 0} # place the whole model on device 0 (usually the first GPU)
)

model = prepare_model_for_kbit_training(model)
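As a quick check that 4-bit loading worked, you can ask transformers for the model's memory footprint; with NF4 weights the 8B model should occupy roughly 5-6 GB rather than the ~16 GB a float16 copy would need (rough figures, not from the original):

print(f"Memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")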

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True # automatically append the EOS token when tokenizing

# Load the dataset

dataset = load_dataset(dataset_name, split="train")

dataset["text"][0]  # inspect the first formatted training example

2. wandb Configuration

# Monitoring
# Requires an account on the WandB website

wandb.login(key="your_wandb_api_key")  # replace with your own WandB API key
run = wandb.init(
    project="finetune llama-3-8B",
    job_type = "training",
)
# Count the trainable parameters

def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"Trainable params: {trainable_params} || Total params: {all_param} || Trainable %: {100 * (trainable_params / all_param):.2f}"
    )

3. LoRA and Training Hyperparameter Configuration

# LoRA config

peft_config = LoraConfig(
    r = 8,
    lora_alpha = 16, # tip: set alpha to twice the rank
    # scaling = alpha / r  # the larger the scaling, the stronger the LoRA weights' influence
    # weight += (lora_B @ lora_A) * scaling
    lora_dropout = 0.05,
    bias = "none",
    task_type = "CAUSAL_LM",
    # full candidate list: ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj", "down_proj", "embed_tokens", "lm_head"]
    target_modules = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj"]
)
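To see what this config implies in terms of size, the sketch below estimates the number of LoRA parameters it adds. The Llama-3-8B projection shapes used here (hidden size 4096, grouped-query key/value width 1024, MLP intermediate size 14336, 32 layers) are assumptions to verify against model.config:

r = 8
shapes = {                      # assumed (d_in, d_out) of each targeted projection
    "q_proj": (4096, 4096),
    "k_proj": (4096, 1024),
    "v_proj": (4096, 1024),
    "o_proj": (4096, 4096),
    "gate_proj": (4096, 14336),
    "up_proj": (4096, 14336),
}
# each adapter adds an r x d_in matrix (A) and a d_out x r matrix (B)
per_layer = sum(r * (d_in + d_out) for d_in, d_out in shapes.values())
print(per_layer * 32)  # roughly 16.3M trainable LoRA parameters over 32 layers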
# Training hyperparameters

training_arguments = TrainingArguments(
    output_dir = "/root/autodl-tmp/output",
    num_train_epochs = 5,
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 2, # accumulate gradients for 2 steps per update; gives a larger effective batch size when GPU memory is limited
    optim = "paged_adamw_8bit",
    save_steps = 100, # save a checkpoint every 100 steps
    logging_steps = 30,
    learning_rate = 2e-4,
    weight_decay = 0.001, # weight decay (L2 regularization) to help prevent overfitting
    fp16 = False,
    bf16 = False,
    max_grad_norm = 0.3, # maximum gradient norm for clipping, to prevent exploding gradients
    max_steps = -1, # -1 means no explicit step limit; training runs for num_train_epochs
    warmup_ratio = 0.3, # fraction of training used to warm up the learning rate (the first 30% of steps)
    group_by_length = True, # group samples of similar length to improve training efficiency
    lr_scheduler_type = "linear", # linear learning-rate schedule
    report_to = "wandb", # or "tensorboard"
)
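For reference, here is the schedule these settings imply on a single GPU, assuming the dataset really holds about 1,000 rows as its name suggests:

effective_batch = 4 * 2                    # per_device_train_batch_size * gradient_accumulation_steps
steps_per_epoch = 1000 // effective_batch  # ~125 optimizer steps per epoch
total_steps = steps_per_epoch * 5          # ~625 steps over 5 epochs; the first ~30% are LR warmup
print(effective_batch, steps_per_epoch, total_steps)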
4. Model Fine-Tuning
# SFT trainer setup

trainer = SFTTrainer(
    model = model,
    train_dataset = dataset,
    peft_config = peft_config,
    tokenizer = tokenizer,
    dataset_text_field="text",
    args = training_arguments,
    packing=False
)
# Start training

trainer.train()

# SFTTrainer already wrapped the model with the LoRA adapters from peft_config,
# so count the trainable parameters on trainer.model rather than re-wrapping with get_peft_model.
print_trainable_parameters(trainer.model)

5. Save the Model

# Save the fine-tuned LoRA adapter

trainer.model.save_pretrained("/root/autodl-tmp/model/lora_model")

wandb.finish()

model.config.use_cache = True  # re-enable the KV cache for faster generation

model.eval()  # switch to inference mode

6. Model Inference

# Streaming generation test

def stream(user_input):
    device = "cuda:0"
    system_prompt = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n'
    B_INST, E_INST = "### Instruction:\n", "### Response:\n"
    prompt = f"{system_prompt}{B_INST}{user_input.strip()}\n\n{E_INST}"
    inputs = tokenizer([prompt], return_tensors="pt").to(device)
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    _ = model.generate(**inputs, streamer=streamer, max_new_tokens=128)
stream("Tell me something about the Great Wall.")
# Re-imports (this part is typically run as a fresh session, e.g. after restarting the kernel
# to free GPU memory before merging the adapter into the base model)
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
    TextStreamer,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import os, wandb

7. Model Merging

# Pretrained base model
model_name = "/root/autodl-tmp/model/Meta-Llama-3-8B"
# Merge the base model with the LoRA adapter
# https://huggingface.co/docs/trl/main/en/use_model#use-adapters-peft

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
)
new_model = PeftModel.from_pretrained(base_model, "/root/autodl-tmp/model/lora_model")
# Merge the LoRA weights into the base model and drop the adapter wrappers
merged_model = new_model.merge_and_unload()
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
user_input = "Tell me something about the Great Wall."
device = "cuda:0"
system_prompt = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n'
B_INST, E_INST = "### Instruction:\n", "### Response:\n"
prompt = f"{system_prompt}{B_INST}{user_input.strip()}\n\n{E_INST}"
inputs = tokenizer([prompt], return_tensors="pt").to(device)
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
_ = merged_model.generate(**inputs, streamer=streamer, max_new_tokens=128, num_return_sequences=1)
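The original stops after streaming a reply. To reuse the merged model later without the adapter files, a minimal follow-up (the output path here is illustrative) is to save it together with the tokenizer:

merged_model.save_pretrained("/root/autodl-tmp/model/merged_model")
tokenizer.save_pretrained("/root/autodl-tmp/model/merged_model")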