model, tokenizer = FastLanguageModel.from_pretrained(
model_name="unsloth/llama-3-8b-bnb-4bit",
max_seq_length=max_seq_length,
dtype=dtype,
load_in_4bit=load_in_4bit,
# token="hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)
安装 LoRA
model = FastLanguageModel.get_peft_model(
model,
r=16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj",],
lora_alpha=16,
lora_dropout=0, # Supports any, but = 0 is optimized
bias="none", # Supports any, but = "none" is optimized# [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
use_gradient_checkpointing="unsloth", # True or "unsloth" for very long context
random_state=3407,
use_rslora=False, # We support rank stabilized LoRA
loftq_config=None, # And LoftQ
)
初始化 Hugging Face 的监督微调训练器
trainer = SFTTrainer(
model=model,
tokenizer=tokenizer,
train_dataset=dataset,
dataset_text_field="text",
max_seq_length=max_seq_length,
dataset_num_proc=2,
packing=False, # Can make training 5x faster for short sequences.
args=TrainingArguments(
per_device_train_batch_size=2,
gradient_accumulation_steps=4,
warmup_steps=5,
max_steps=60,
learning_rate=2e-4,
fp16=not torch.cuda.is_bf16_supported(),
bf16=torch.cuda.is_bf16_supported(),
logging_steps=1,
optim="adamw_8bit",
weight_decay=0.01,
lr_scheduler_type="linear",
seed=3407,
output_dir="outputs",
),
)
# Hugging Face model id
model_id = "meta-llama/Meta-Llama-3-8B"# model_id = "mistralai/Mistral-7B-v0.1"# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16 if use_flash_attention2 else torch.float16
)
# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
model_id,
quantization_config=bnb_config,
use_cache=False,
device_map="auto",
token=os.environ["HF_TOKEN"], # if model is gated like llama or mistral
attn_implementation="flash_attention_2"if use_flash_attention2 else"sdpa"
)
model.config.pretraining_tp = 1
tokenizer = AutoTokenizer.from_pretrained(
model_id,
token=os.environ["HF_TOKEN"], # if model is gated like llama or mistral
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
安装 LoRA
# LoRA config based on QLoRA paper
peft_config = LoraConfig(
lora_alpha=16,
lora_dropout=0.1,
r=64,
bias="none",
task_type="CAUSAL_LM",
target_modules=[
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj",
]
)
# Prepare model for training
model = prepare_model_for_kbit_training(model)
初始化 Hugging Face 的监督微调训练器
args = TrainingArguments(
output_dir="mistral-int4-alpaca",
num_train_epochs=1,
per_device_train_batch_size=6if use_flash_attention2 else2, # you can play with the batch size depending on your hardware
gradient_accumulation_steps=4,
gradient_checkpointing=True,
optim="paged_adamw_8bit",
logging_steps=10,
save_strategy="epoch",
learning_rate=2e-4,
bf16=use_flash_attention2,
fp16=not use_flash_attention2,
tf32=use_flash_attention2,
max_grad_norm=0.3,
warmup_steps=5,
lr_scheduler_type="linear",
disable_tqdm=False,
report_to="none"
)
model = get_peft_model(model, peft_config)
trainer = SFTTrainer(
model=model,
train_dataset=dataset,
peft_config=peft_config,
max_seq_length=2048,
tokenizer=tokenizer,
packing=True,
formatting_func=format_instruction,
args=args,
)