import json
import pandas as pd
import torch
from datasets import Dataset
from modelscope import snapshot_download, AutoTokenizer
from swanlab.integration.huggingface import SwanLabCallback
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import os
import swanlab
def dataset_jsonl_transfer(origin_path, new_path):
    """Convert the raw NER dataset into the JSONL format used for fine-tuning.

    Every non-blank line of ``origin_path`` is parsed as a JSON object with a
    ``text`` field and an ``entities`` list; each entity provides
    ``entity_text`` and ``entity_names``. One record with the keys
    ``instruction``, ``input`` and ``output`` is written to ``new_path`` per
    input line.

    Args:
        origin_path: Path of the raw JSONL dataset (read as UTF-8).
        new_path: Path of the converted JSONL file (written as UTF-8).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record or entity lacks one of the expected fields.
    """
    match_names = ['地点', '人名', '地理实体', '组织']
    instruction = '''你是一个文本实体识别领域的专家,你需要从给定的句子中提取 地点; 人名; 地理实体; 组织 实体。以 json 格式输出,如 {'entity_text': '南京', 'entity_label': '地理实体'} 注意:1. 输出的每一行都必须是正确的 json 字符串。2. 找不到任何实体时,输出'没有找到任何实体'。'''

    messages = []
    # The data is Chinese text: read it as UTF-8 explicitly (matching the
    # writer below) instead of relying on the platform default encoding.
    with open(origin_path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            data = json.loads(line)
            input_text = data['text']

            entity_parts = []
            for entity in data['entities']:
                entity_json = dict(entity)
                entity_text = entity_json['entity_text']
                # Take the first supported label. Resolving it per entity
                # prevents a label from a previous entity leaking into an
                # entity that has no supported label.
                entity_label = next(
                    (name for name in entity_json['entity_names'] if name in match_names),
                    None,
                )
                if entity_label is None:
                    # No supported label: the entity cannot be expressed in
                    # the target format, so it is left out.
                    continue
                entity_parts.append(
                    f"{{'entity_text': '{entity_text}', 'entity_label': '{entity_label}'}}"
                )

            entity_sentence = ''.join(entity_parts) or '没有找到任何实体'
            messages.append({
                'instruction': instruction,
                'input': f'文本:{input_text}',
                'output': entity_sentence,
            })

    with open(new_path, 'w', encoding='utf-8') as file:
        for message in messages:
            file.write(json.dumps(message, ensure_ascii=False) + '\n')
def process_func(example):
    """Tokenize one fine-tuning record into fixed-budget training features.

    The prompt (system prompt plus ``example["input"]`` wrapped in
    ``<|im_start|>``/``<|im_end|>`` chat markers) and the answer
    (``example["output"]``) are tokenized separately with the module-level
    ``tokenizer``. The pad token id is appended as the closing token.

    Args:
        example: Mapping with at least the keys ``input`` and ``output``.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels``, each cut to
        at most ``MAX_LENGTH`` items. Prompt positions in ``labels`` are set
        to -100.
    """
    MAX_LENGTH = 384
    system_prompt = '''你是一个文本实体识别领域的专家,你需要从给定的句子中提取 地点; 人名; 地理实体; 组织 实体。以 json 格式输出,如 {'entity_text': '南京', 'entity_label': '地理实体'} 注意:1. 输出的每一行都必须是正确的 json 字符串。2. 找不到任何实体时,输出'没有找到任何实体'.'''

    prompt_enc = tokenizer(
        f'<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{example["input"]}<|im_end|>\n<|im_start|>assistant\n',
        add_special_tokens=False,
    )
    answer_enc = tokenizer(f'{example["output"]}', add_special_tokens=False)

    prompt_ids = prompt_enc['input_ids']
    answer_ids = answer_enc['input_ids']
    closing_ids = [tokenizer.pad_token_id]

    full_ids = prompt_ids + answer_ids + closing_ids
    # The closing token is attended to, hence the trailing 1.
    full_mask = prompt_enc['attention_mask'] + answer_enc['attention_mask'] + [1]
    # Prompt positions carry -100; answer and closing token keep their ids.
    full_labels = [-100] * len(prompt_ids) + answer_ids + closing_ids

    # Slicing is a no-op for sequences already within the budget.
    return {
        'input_ids': full_ids[:MAX_LENGTH],
        'attention_mask': full_mask[:MAX_LENGTH],
        'labels': full_labels[:MAX_LENGTH],
    }
def predict(messages, model, tokenizer):
    """Generate, print and return the model's reply to a chat conversation.

    Args:
        messages: Chat messages (dicts with ``role`` and ``content``) that are
            rendered through ``tokenizer.apply_chat_template``.
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer belonging to ``model``.

    Returns:
        The decoded reply with special tokens removed.
    """
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    # Place the inputs on the model's own device rather than a hard-coded
    # 'cuda', so the function also works on CPU-only hosts and with whatever
    # placement device_map chose.
    model_inputs = tokenizer([text], return_tensors='pt').to(model.device)
    # Unpack the whole encoding so the attention mask reaches generate()
    # together with input_ids instead of being silently dropped.
    generated_ids = model.generate(**model_inputs, max_new_tokens=512)
    # The generated sequences start with the prompt; keep only what follows.
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(response)
    return response
# Fetch the base checkpoint via ModelScope. The returned local directory is the
# only value model_dir needs; the former hard-coded path was overwritten
# immediately and has been dropped.
model_id = 'qwen/Qwen2-1.5B-Instruct'
model_dir = snapshot_download(model_id, cache_dir='./', revision='master')

tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_dir, device_map='auto', torch_dtype=torch.bfloat16)
# NOTE(review): presumably required because gradient_checkpointing=True is set
# in the TrainingArguments further down - confirm.
model.enable_input_require_grads()

# Convert the raw dataset once; later runs reuse the converted file.
train_dataset_path = 'ccfbdci.jsonl'
train_jsonl_new_path = 'ccf_train.jsonl'
if not os.path.exists(train_jsonl_new_path):
    dataset_jsonl_transfer(train_dataset_path, train_jsonl_new_path)

total_df = pd.read_json(train_jsonl_new_path, lines=True)
# The first 10% of rows are held out for the post-training check; train on the rest.
train_df = total_df[int(len(total_df) * 0.1):]
train_ds = Dataset.from_pandas(train_df)
train_dataset = train_ds.map(process_func, remove_columns=train_ds.column_names)
# LoRA adapter settings: rank-8 adapters on the attention and MLP projections.
lora_settings = {
    'task_type': TaskType.CAUSAL_LM,
    'target_modules': [
        'q_proj', 'k_proj', 'v_proj', 'o_proj',
        'gate_proj', 'up_proj', 'down_proj',
    ],
    'inference_mode': False,
    'r': 8,
    'lora_alpha': 32,
    'lora_dropout': 0.1,
}
config = LoraConfig(**lora_settings)
model = get_peft_model(model, config)

# Training hyper-parameters. report_to='none' is set here; experiment tracking
# is attached through the SwanLab callback below.
args = TrainingArguments(
    output_dir='./output/Qwen2-NER',
    num_train_epochs=2,
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    logging_steps=10,
    save_steps=100,
    save_on_each_node=True,
    report_to='none',
)

# Experiment metadata recorded by SwanLab.
swanlab_run_config = {
    'model': model_id,
    'model_dir': model_dir,
    'dataset': 'qgyd2021/chinese_ner_sft',
}
swanlab_callback = SwanLabCallback(
    project='Qwen2-NER-fintune',
    experiment_name='Qwen2-1.5B-Instruct',
    description='使用通义千问 Qwen2-1.5B-Instruct 模型在 NER 数据集上微调,实现关键实体识别任务。',
    config=swanlab_run_config,
)

# Batches are padded dynamically by the collator.
batch_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    data_collator=batch_collator,
    callbacks=[swanlab_callback],
)
trainer.train()
# Qualitative check: generate predictions for a random sample taken from the
# held-out first 10% of the data and log them to SwanLab.

# Switch to eval mode before generating: right after training the model may
# still be in training mode, which would keep dropout (lora_dropout=0.1)
# active and add noise to the predictions.
model.eval()

held_out_df = total_df[:int(len(total_df) * 0.1)]
# Cap the sample size so a small dataset does not make DataFrame.sample raise
# when fewer than 20 held-out rows exist.
test_df = held_out_df.sample(n=min(20, len(held_out_df)))

test_text_list = []
for _, row in test_df.iterrows():
    instruction = row['instruction']
    input_value = row['input']
    messages = [
        {'role': 'system', 'content': f'{instruction}'},
        {'role': 'user', 'content': f'{input_value}'}
    ]
    response = predict(messages, model, tokenizer)
    messages.append({'role': 'assistant', 'content': f'{response}'})
    # Log the full exchange (system, user, assistant) as one text entry.
    result_text = f'{messages[0]}\n\n{messages[1]}\n\n{messages[2]}'
    test_text_list.append(swanlab.Text(result_text, caption=response))

swanlab.log({'Prediction': test_text_list})
swanlab.finish()
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
def predict(messages, model, tokenizer):
    """Generate and return the model's reply to a chat conversation.

    Args:
        messages: Chat messages (dicts with ``role`` and ``content``) that are
            rendered through ``tokenizer.apply_chat_template``.
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer belonging to ``model``.

    Returns:
        The decoded reply with special tokens removed.
    """
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Place the inputs on the model's own device rather than a hard-coded
    # 'cuda', so the function also works on CPU-only hosts and with whatever
    # placement device_map chose.
    model_inputs = tokenizer([text], return_tensors='pt').to(model.device)
    # Unpack the whole encoding so the attention mask reaches generate()
    # together with input_ids instead of being silently dropped.
    generated_ids = model.generate(**model_inputs, max_new_tokens=512)
    # The generated sequences start with the prompt; keep only what follows.
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response
# Load the base model and tokenizer from the local snapshot, then attach the
# LoRA adapter saved in the training output directory.
base_model_path = './qwen/Qwen2-1___5B-Instruct/'
tokenizer = AutoTokenizer.from_pretrained(
    base_model_path,
    use_fast=False,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    device_map='auto',
    torch_dtype=torch.bfloat16,
)
model = PeftModel.from_pretrained(model, model_id='./output/Qwen2-NER/checkpoint-1700')

# Single hand-written example used as a smoke test of the fine-tuned model.
input_text = '西安电子科技大学的陈志明爱上了隔壁西北工业大学苏春红,他们约定好毕业后去中国的苏州定居。'
test_texts = {
    'instruction': '''你是一个文本实体识别领域的专家,你需要从给定的句子中提取 地点; 人名; 地理实体; 组织 实体。以 json 格式输出,如; {'entity_text': '南京', 'entity_label': '地理实体'} 注意:1. 输出的每一行都必须是正确的 json 字符串。2. 找不到任何实体时,输出'没有找到任何实体'。''',
    'input': f'文本:{input_text}'
}
instruction = test_texts['instruction']
input_value = test_texts['input']

# System message carries the task description, user message the sentence.
system_message = {'role': 'system', 'content': instruction}
user_message = {'role': 'user', 'content': input_value}
messages = [system_message, user_message]

response = predict(messages, model, tokenizer)
print(response)