Hands-On Code for LoRA Fine-Tuning of LLMs
# Pick up the proxy environment variables provided by the platform's network
# acceleration script (specific to hosts that ship /etc/network_turbo, e.g. AutoDL).
import subprocess
import os

result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"',
                        shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value
# bitsandbytes: a quantization library focused on reducing the memory footprint of large language models (especially on GPUs).
# peft: used to attach LoRA adapters to large language models (LLMs).
# trl: provides the SFT (supervised fine-tuning) trainer class used to fine-tune the model.
# accelerate and xformers: used to speed up the model, optimizing its performance.
# wandb: a monitoring platform for tracking and inspecting the training run.
# datasets: the Hugging Face library that makes loading datasets easy.
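For reference, the packages described above can be installed in one go. A minimal sketch (package list unpinned, and not part of the original post):

# Install the dependencies listed above (unpinned; pin versions as needed for reproducibility).
import subprocess, sys
subprocess.run(
    [sys.executable, "-m", "pip", "install",
     "torch", "transformers", "datasets", "bitsandbytes", "peft", "trl",
     "accelerate", "xformers", "wandb"],
    check=True,
)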
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
    TextStreamer,
    Trainer
)
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import os, wandb
torch.cuda.is_available()   # check that a CUDA GPU is visible
torch.cuda.device_count()
1. Load the model and tokenizer
# Pretrained model
model_name = "/root/autodl-tmp/model/Meta-Llama-3-8B"
# Dataset name
dataset_name = "scooterman/guanaco-llama3-1k"
# Load the pretrained model and tokenizer
# Quantization config
# https://huggingface.co/docs/transformers/en/main_classes/quantization#transformers.BitsAndBytesConfig
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,                      # load the model weights in 4-bit quantized format
    bnb_4bit_quant_type = "nf4",              # use the nf4 (NormalFloat4) quantization type
    bnb_4bit_compute_dtype = torch.float16,   # compute dtype used for the forward/backward passes
    bnb_4bit_use_double_quant = False,        # do not apply double (nested) quantization
)
# Load the model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config = bnb_config,
    device_map = {"": 0}   # place the whole model on device 0 (normally the first GPU)
)
model = prepare_model_for_kbit_training(model)
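With load_in_4bit enabled, the loaded weights should occupy roughly a quarter of the fp16 footprint (on the order of 5 to 6 GB for an 8B model instead of about 16 GB). A quick sanity check, added here as a sketch rather than part of the original code:

print(f"Model footprint: {model.get_memory_footprint() / 1024**3:.2f} GB")        # size reported by transformers
print(f"GPU memory allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB") # actual allocation on device 0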
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True   # ask the tokenizer to append the end-of-sequence token when encoding
# Load the dataset
dataset = load_dataset(dataset_name, split="train")
dataset["text"][0]
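Whether add_eos_token actually takes effect depends on the tokenizer class that AutoTokenizer returns for this checkpoint, so it is worth verifying on a sample. A minimal check (not in the original):

sample_ids = tokenizer(dataset["text"][0])["input_ids"]
print(len(sample_ids), sample_ids[-1] == tokenizer.eos_token_id)   # True when the EOS token was appended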
2. wandb configuration
# Monitoring
# Requires a WandB account (register on the WandB website to get an API key)
# wandb.login(key="11a0ff012b65b101fdf6613d7c21f66a5960e623")
wandb.login(key="1d3bd30d1862bd8ef07b56cc09cb3f245f5ad878")
run = wandb.init(
    project = "finetune llama-3-8B",
    job_type = "training",
)
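Hard-coding an API key into a shared script is easy to leak. As an alternative, wandb.login() can pick the key up from the WANDB_API_KEY environment variable; a sketch, with the placeholder value to be replaced by your own key:

os.environ["WANDB_API_KEY"] = "<your-wandb-api-key>"   # placeholder, not a real key
wandb.login()                                          # falls back to WANDB_API_KEY when no key argument is given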
# Count the trainable parameters
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * (trainable_params / all_param):.2f}"
    )
3. LoRA and training hyperparameter configuration
# LoRA config
peft_config = LoraConfig(
    r = 8,
    lora_alpha = 16,   # rule of thumb: set alpha to twice the rank
    # scaling = alpha / r  -> the larger the scaling, the bigger the influence of the LoRA weights
    # weight += (lora_B @ lora_A) * scaling
    lora_dropout = 0.05,
    bias = "none",
    task_type = "CAUSAL_LM",
    # full list of candidate modules: ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj", "down_proj", "embed_tokens", "lm_head"]
    target_modules = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj"]
)
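The scaling comment above can be made concrete with a small numerical sketch of how a LoRA update is folded into a frozen weight matrix (the shapes below are illustrative, not Llama's actual projection sizes):

import torch

d, k, r, alpha = 16, 16, 8, 16
W = torch.randn(d, k)                # frozen pretrained weight
lora_A = torch.randn(r, k) * 0.01    # low-rank factor A (trainable)
lora_B = torch.zeros(d, r)           # low-rank factor B, initialised to zero so the initial update is zero
scaling = alpha / r                  # 16 / 8 = 2.0 with the settings above
W_effective = W + (lora_B @ lora_A) * scaling
print(W_effective.shape)             # torch.Size([16, 16])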
# Training hyperparameters
training_arguments = TrainingArguments(
    output_dir = "/root/autodl-tmp/output",
    num_train_epochs = 5,
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 2,   # accumulate gradients over 2 steps before each update; gives a larger effective batch size when GPU memory is limited
    optim = "paged_adamw_8bit",
    save_steps = 100,                  # save a checkpoint every 100 steps
    logging_steps = 30,
    learning_rate = 2e-4,
    weight_decay = 0.001,              # L2 regularization strength, helps prevent overfitting
    fp16 = False,
    bf16 = False,
    max_grad_norm = 0.3,               # maximum gradient norm for gradient clipping, guards against exploding gradients
    max_steps = -1,                    # -1 means no hard cap on the number of training steps
    warmup_ratio = 0.3,                # the learning rate ramps up over the first 30% of training steps
    group_by_length = True,            # group samples of similar length to improve training efficiency
    lr_scheduler_type = "linear",      # linear learning-rate schedule
    report_to = "wandb",               # or "tensorboard"
)
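With these settings, each optimizer update consumes per_device_train_batch_size × gradient_accumulation_steps × number-of-GPUs samples. A quick check of that arithmetic (a sketch, assuming the single-GPU setup used above):

per_device_bs = 4
grad_accum = 2
n_gpus = max(torch.cuda.device_count(), 1)
effective_batch_size = per_device_bs * grad_accum * n_gpus
print(effective_batch_size)   # 8 on a single GPU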
4. Model fine-tuning
# SFT trainer setup
trainer = SFTTrainer(
    model = model,
    train_dataset = dataset,
    peft_config = peft_config,
    tokenizer = tokenizer,
    dataset_text_field = "text",
    args = training_arguments,
    packing = False
)
# The SFTTrainer has already wrapped the model with the LoRA adapters from peft_config,
# so there is no need to call get_peft_model again; report the trainable-parameter count
# from trainer.model before training starts.
model = trainer.model
print_trainable_parameters(model)
# Start training
trainer.train()
5. Save the model
# Save the fine-tuned model (LoRA adapter weights)
trainer.model.save_pretrained("/root/autodl-tmp/model/lora_model")
wandb.finish()
model.config.use_cache = True   # make sure the KV cache is enabled for generation
model.eval()                    # switch to inference mode
6. Model inference
# Quick generation test (the model here is the LoRA-fine-tuned model from the step above)
def stream(user_input):
    device = "cuda:0"
    system_prompt = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n'
    B_INST, E_INST = "### Instruction:\n", "### Response:\n"
    prompt = f"{system_prompt}{B_INST}{user_input.strip()}\n\n{E_INST}"
    inputs = tokenizer([prompt], return_tensors="pt").to(device)
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    _ = model.generate(**inputs, streamer=streamer, max_new_tokens=128)
stream("Tell me something about the Great Wall.")
# The imports are repeated here so that the merge step below can be run in a fresh session
# (e.g. after restarting the runtime to free the GPU memory used during training).
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
    TextStreamer,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import os, wandb
7. Merge the models
# Pretrained (base) model
model_name = "/root/autodl-tmp/model/Meta-Llama-3-8B"
# Merge the base model with the LoRA adapter
# https://huggingface.co/docs/trl/main/en/use_model#use-adapters-peft
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage = True,
    return_dict = True,
    torch_dtype = torch.float16,
    device_map = {"": 0}
)
new_model = PeftModel.from_pretrained(base_model, "/root/autodl-tmp/model/lora_model")
# Merge the adapter weights into the base model
merged_model = new_model.merge_and_unload()
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
user_input = "Tell me something about the Great Wall."
device = "cuda:0"
system_prompt = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n'
B_INST, E_INST = "### Instruction:\n", "### Response:\n"
prompt = f"{system_prompt}{B_INST}{user_input.strip()}\n\n{E_INST}"
inputs = tokenizer([prompt], return_tensors="pt").to(device)
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
_ = merged_model.generate(**inputs, streamer=streamer, max_new_tokens=128, num_return_sequences=1)
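If you want to keep the merged model as a standalone checkpoint that can later be loaded without PEFT, it can be written out with save_pretrained. A minimal sketch, where the output path is an assumption rather than part of the original walkthrough:

merged_model.save_pretrained("/root/autodl-tmp/model/merged_model")   # hypothetical output path
tokenizer.save_pretrained("/root/autodl-tmp/model/merged_model")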