from modelscope import snapshot_download, AutoTokenizer
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import torch
model_id = "ZhipuAI/glm-4-9b-chat"
model_dir = "./ZhipuAI/glm-4-9b-chat/"
model_dir = snapshot_download(model_id, cache_dir="./", revision="master")
tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", torch_dtype=torch.bfloat16, trust_remote_code=True)
model.enable_input_require_grads()
from peft import LoraConfig, TaskType, get_peft_model
config = LoraConfig(
task_type=TaskType.CAUSAL_LM,
target_modules=["query_key_value", "dense", "dense_h_to_4h", "activation_func", "dense_4h_to_h"],
inference_mode=False,
r=8,
lora_alpha=32,
lora_dropout=0.1,
)
model = get_peft_model(model, config)
import json
import pandas as pd
import torch
from datasets import Dataset
from modelscope import snapshot_download, AutoTokenizer
from swanlab.integration.huggingface import SwanLabCallback
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import os
import swanlab
def dataset_jsonl_transfer(origin_path, new_path):
"""
将原始数据集转换为大模型微调所需数据格式的新数据集
"""
messages = []
with open(origin_path, "r") as file:
for line in file:
data = json.loads(line)
input_text = data["text"]
entities = data["entities"]
match_names = ["地点", "人名", "地理实体", "组织"]
entity_sentence = ""
for entity in entities:
entity_json = dict(entity)
entity_text = entity_json["entity_text"]
entity_names = entity_json["entity_names"]
for name in entity_names:
if name in match_names:
entity_label = name
break
entity_sentence += f"{{\"entity_text\": \"{entity_text}\", \"entity_label\": \"{entity_label}\"}}"
if entity_sentence == "":
entity_sentence = "没有找到任何实体"
message = {
"instruction": """你是一个文本实体识别领域的专家,你需要从给定的句子中提取 地点; 人名; 地理实体; 组织 实体。以 json 格式输出,如 {"entity_text": "南京", "entity_label": "地理实体"} 注意:1. 输出的每一行都必须是正确的 json 字符串。2. 找不到任何实体时,输出"没有找到任何实体"。""",
"input": f"文本:{input_text}",
"output": entity_sentence,
}
messages.append(message)
with open(new_path, "w", encoding="utf-8") as file:
for message in messages:
file.write(json.dumps(message, ensure_ascii=False) + "\n")
def process_func(example):
"""
对数据集进行数据预处理,主要用于被 dataset.map 调用
"""
MAX_LENGTH = 384
input_ids, attention_mask, labels = [], [], []
system_prompt = """你是一个文本实体识别领域的专家,你需要从给定的句子中提取 地点; 人名; 地理实体; 组织 实体。以 json 格式输出,如 {"entity_text": "南京", "entity_label": "地理实体"} 注意:1. 输出的每一行都必须是正确的 json 字符串。2. 找不到任何实体时,输出"没有找到任何实体"."""
instruction = tokenizer(
f"<|system|>\n{system_prompt}<|endoftext|>\n<|user|>\n{example['input']}<|endoftext|>\n<|assistant|>\n",
add_special_tokens=False,
)
response = tokenizer(f"{example['output']}", add_special_tokens=False)
input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
attention_mask = (
instruction["attention_mask"] + response["attention_mask"] + [1]
)
labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]
if len(input_ids) > MAX_LENGTH:
input_ids = input_ids[:MAX_LENGTH]
attention_mask = attention_mask[:MAX_LENGTH]
labels = labels[:MAX_LENGTH]
return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}
def predict(messages, model, tokenizer):
"""对测试集进行模型推理,得到预测结果"""
device = "cuda"
text = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)
generated_ids = model.generate(
model_inputs.input_ids,
max_new_tokens=512
)
generated_ids = [
output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
return response
model_id = "ZhipuAI/glm-4-9b-chat"
model_dir = "./ZhipuAI/glm-4-9b-chat/"
model_dir = snapshot_download(model_id, cache_dir="./", revision="master")
tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", torch_dtype=torch.bfloat16, trust_remote_code=True)
model.enable_input_require_grads()
train_dataset_path = "ccfbdci.jsonl"
train_jsonl_new_path = "ccf_train.jsonl"
if not os.path.exists(train_jsonl_new_path):
dataset_jsonl_transfer(train_dataset_path, train_jsonl_new_path)
total_df = pd.read_json(train_jsonl_new_path, lines=True)
train_df = total_df[int(len(total_df) * 0.1):]
train_ds = Dataset.from_pandas(train_df)
train_dataset = train_ds.map(process_func, remove_columns=train_ds.column_names)
config = LoraConfig(
task_type=TaskType.CAUSAL_LM,
target_modules=["query_key_value", "dense", "dense_h_to_4h", "activation_func", "dense_4h_to_h"],
inference_mode=False,
r=8,
lora_alpha=32,
lora_dropout=0.1,
)
model = get_peft_model(model, config)
args = TrainingArguments(
output_dir="./output/GLM4-NER",
per_device_train_batch_size=4,
per_device_eval_batch_size=4,
gradient_accumulation_steps=4,
logging_steps=10,
num_train_epochs=2,
save_steps=100,
learning_rate=1e-4,
save_on_each_node=True,
gradient_checkpointing=True,
report_to="none",
)
swanlab_callback = SwanLabCallback(
project="GLM4-NER-fintune",
experiment_name="GLM4-9B-Chat",
description="使用智谱 GLM4-9B-Chat 模型在 NER 数据集上微调,实现关键实体识别任务。",
config={
"model": model_id,
"model_dir": model_dir,
"dataset": "qgyd2021/chinese_ner_sft",
},
)
trainer = Trainer(
model=model,
args=args,
train_dataset=train_dataset,
data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
callbacks=[swanlab_callback],
)
trainer.train()
test_df = total_df[:int(len(total_df) * 0.1)].sample(n=20)
test_text_list = []
for index, row in test_df.iterrows():
instruction = row['instruction']
input_value = row['input']
messages = [
{"role": "system", "content": f"{instruction}"},
{"role": "user", "content": f"{input_value}"}
]
response = predict(messages, model, tokenizer)
messages.append({"role": "assistant", "content": f"{response}"})
result_text = f"{messages[0]}\n\n{messages[1]}\n\n{messages[2]}"
test_text_list.append(swanlab.Text(result_text, caption=response))
swanlab.log({"Prediction": test_text_list})
swanlab.finish()
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
def predict(messages, model, tokenizer):
device = "cuda"
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
model_inputs = tokenizer([text], return_tensors="pt").to(device)
generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512)
generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
return response
model_dir = "./ZhipuAI/glm-4-9b-chat/"
lora_dir = "./output/GLM4-NER/checkpoint-1700"
tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", torch_dtype=torch.bfloat16)
model = PeftModel.from_pretrained(model, model_id=lora_dir)
input_text = "西安电子科技大学的陈志明爱上了隔壁西北工业大学苏春红,他们约定好毕业后去中国的苏州定居。"
test_texts = {
"instruction": """你是一个文本实体识别领域的专家,你需要从给定的句子中提取 地点; 人名; 地理实体; 组织 实体。以 json 格式输出,如; {"entity_text": "南京", "地理实体"} 注意:1. 输出的每一行都必须是正确的 json 字符串。2. 找不到任何实体时,输出"没有找到任何实体"。""",
"input": f"文本:{input_text}"
}
instruction = test_texts['instruction']
input_value = test_texts['input']
messages = [
{"role": "system", "content": f"{instruction}"},
{"role": "user", "content": f"{input_value}"}
]
response = predict(messages, model, tokenizer)
print(response)