import torch
import torch_npu
import time
import json
import pandas as pd
from datetime import datetime
from transformers import AutoModelForCausalLM, AutoTokenizer
# ---------------------------------------------------------------------------
# Benchmark configuration
# ---------------------------------------------------------------------------

# Hugging Face model id passed to ``from_pretrained``.
MODEL_NAME = "NousResearch/Llama-2-7b-hf"
# Device string that the model and the tokenized inputs are moved to.
DEVICE = "npu:0"
# Number of untimed generate calls issued before measuring a scenario.
WARMUP_RUNS = 5
# Number of timed generate calls per scenario.
TEST_RUNS = 10
# Requested weight precision; "fp16" selects torch.float16 in the loader.
PRECISION = "fp16"

# One dict per scenario. Keys: scenario label ("场景"), prompt text ("输入"),
# requested number of new tokens ("生成长度") and how many copies of the
# prompt are batched together ("batch_size").
TEST_CASES = [
    {
        "场景": "英文短文本生成",
        "输入": "The capital of France is",
        "生成长度": 50,
        "batch_size": 1,
    },
    {
        "场景": "中文对话",
        "输入": "请解释什么是人工智能:",
        "生成长度": 100,
        "batch_size": 1,
    },
    {
        "场景": "代码生成",
        "输入": "Write a Python function to calculate fibonacci:",
        "生成长度": 150,
        "batch_size": 1,
    },
    {
        "场景": "批量推理(batch=2)",
        "输入": "The capital of France is",
        "生成长度": 50,
        "batch_size": 2,
    },
    {
        "场景": "长文本叙事",
        "输入": "请写一篇关于人工智能未来的科幻短篇:",
        "生成长度": 200,
        "batch_size": 1,
    },
    {
        "场景": "多轮问答",
        "输入": "Q: 什么是机器学习?\nA: 机器学习是数据驱动的算法...\nQ: 它和传统编程的区别?",
        "生成长度": 100,
        "batch_size": 1,
    },
    {
        "场景": "高并发批量(batch=4)",
        "输入": "The capital of France is",
        "生成长度": 50,
        "batch_size": 4,
    },
]
def load_model_and_tokenizer(model_name, precision):
    """Load the tokenizer and the causal-LM weights and move the model to ``DEVICE``.

    Args:
        model_name: Model id or path handed to ``from_pretrained``.
        precision: ``"fp16"`` selects ``torch.float16``. Any other value first
            attempts a load with ``torch.int8`` and, if that raises, falls
            back to ``torch.float16``.

    Returns:
        A tuple ``(model, tokenizer, load_time, mem_used, dtype_name)``:
        the model in eval mode, its tokenizer, the elapsed load time in
        seconds, ``torch.npu.memory_allocated() / 1e9`` after loading, and
        ``str()`` of the dtype that was actually used.

    Raises:
        Exception: Whatever ``from_pretrained`` / ``.to`` raised when the
            ``torch.float16`` load itself fails (there is nothing further to
            fall back to).
    """

    def _load(torch_dtype):
        # Single place for the load call so the first attempt and the
        # fallback cannot drift apart.
        return AutoModelForCausalLM.from_pretrained(
            model_name, torch_dtype=torch_dtype, low_cpu_mem_usage=True
        ).to(DEVICE)

    print(f"===== 开始加载模型 {model_name}(精度:{precision}) =====")
    # perf_counter is monotonic, so the interval is immune to wall-clock jumps.
    start_load = time.perf_counter()
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    dtype = torch.float16 if precision == "fp16" else torch.int8
    try:
        model = _load(dtype)
    except Exception as e:
        # The fallback is deliberately best-effort (hence the broad catch),
        # but it only makes sense when the failed attempt was NOT already
        # FP16: retrying the identical load and blaming "INT8" would hide
        # the real error.
        if dtype == torch.float16:
            raise
        print(f"INT8 精度加载失败,自动 fallback 到 FP16:{str(e)[:50]}")
        dtype = torch.float16
        model = _load(dtype)

    model.eval()
    load_time = time.perf_counter() - start_load
    mem_used = torch.npu.memory_allocated() / 1e9
    print(f"模型加载完成:耗时 {load_time:.2f} 秒,显存占用 {mem_used:.2f} GB")
    return model, tokenizer, load_time, mem_used, str(dtype)
def _generate_once(model, tokenizer, inputs, max_new_tokens):
    """Run one greedy ``generate`` call with gradient tracking disabled."""
    with torch.no_grad():
        return model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )


def benchmark(prompt, tokenizer, model, max_new_tokens, batch_size):
    """Measure generation latency and throughput for one scenario.

    The prompt is repeated ``batch_size`` times, ``WARMUP_RUNS`` untimed
    generations are issued, then ``TEST_RUNS`` generations are timed with an
    NPU synchronize on both sides of each call.

    Args:
        prompt: Prompt text to generate from.
        tokenizer: Tokenizer used to encode the prompt. If padding is needed
            (``batch_size > 1``) and it has no pad token, its pad token is
            set to its EOS token as a side effect.
        model: Model exposing ``generate``.
        max_new_tokens: Upper bound on newly generated tokens per sequence.
        batch_size: Number of copies of ``prompt`` in the batch.

    Returns:
        Dict with the average latency, latency standard deviation,
        per-request and whole-batch throughput (tokens/second, based on the
        number of tokens actually produced), the peak allocated device memory
        in GB observed during this call, the requested generation length and
        the batch size.
    """
    needs_padding = batch_size > 1
    # padding=True requires a pad token; without one the tokenizer call for
    # batched scenarios fails before any measurement happens.
    if needs_padding and tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    inputs = tokenizer(
        [prompt] * batch_size,
        return_tensors="pt",
        padding=needs_padding,
        truncation=True,
        max_length=512,
    ).to(DEVICE)
    # generate() returns prompt + continuation for a causal LM, so the prompt
    # length is needed to count the tokens that were actually produced.
    prompt_len = inputs["input_ids"].shape[1]

    # Start the peak counter from scratch so the reported peak belongs to
    # this scenario instead of being the maximum over all earlier ones.
    torch.npu.reset_peak_memory_stats()

    print(f"预热中...({WARMUP_RUNS}次,batch_size={batch_size})")
    for _ in range(WARMUP_RUNS):
        _generate_once(model, tokenizer, inputs, max_new_tokens)

    latencies = []
    generated_counts = []
    print(f"开始正式测试...({TEST_RUNS}次,生成长度={max_new_tokens})")
    for run_idx in range(1, TEST_RUNS + 1):
        # Drain pending device work so it is not billed to this run.
        torch.npu.synchronize()
        start = time.perf_counter()
        outputs = _generate_once(model, tokenizer, inputs, max_new_tokens)
        # Wait for the device to finish before stopping the clock.
        torch.npu.synchronize()
        latency = time.perf_counter() - start

        # Generation may stop at EOS before max_new_tokens is reached; using
        # the requested length would then overstate tokens/second.
        new_tokens = outputs.shape[-1] - prompt_len
        latencies.append(latency)
        generated_counts.append(new_tokens)
        print(f" 第{run_idx}次:耗时 {latency:.2f} 秒 | 速度 {new_tokens/latency:.2f} tokens/秒")

    avg_latency = sum(latencies) / len(latencies)
    std_latency = pd.Series(latencies).std()
    avg_new_tokens = sum(generated_counts) / len(generated_counts)
    throughput = avg_new_tokens / avg_latency
    total_throughput = throughput * batch_size
    mem_peak = torch.npu.max_memory_allocated() / 1e9
    return {
        "平均延迟 (秒)": round(avg_latency, 3),
        "延迟标准差 (秒)": round(std_latency, 3),
        "单请求吞吐量 (tokens/秒)": round(throughput, 2),
        "批量总吞吐量 (tokens/秒)": round(total_throughput, 2),
        "显存峰值 (GB)": round(mem_peak, 2),
        "生成长度": max_new_tokens,
        "batch_size": batch_size,
    }
if __name__ == "__main__":
    # Script entry point: load the model, run every scenario in TEST_CASES,
    # print the result table, then issue one extra cached greedy generation.
    model, tokenizer, load_time, load_mem, actual_dtype = load_model_and_tokenizer(MODEL_NAME, PRECISION)

    # The batched scenarios tokenize with padding=True, which needs a pad
    # token, so it must be assigned before the benchmark loop rather than
    # after it.
    tokenizer.pad_token = tokenizer.eos_token

    results = []
    for case in TEST_CASES:
        print(f"\n===== 开始测试场景:{case['场景']} =====")
        case_result = benchmark(
            prompt=case["输入"],
            tokenizer=tokenizer,
            model=model,
            max_new_tokens=case["生成长度"],
            batch_size=case["batch_size"],
        )
        # Label the row with its scenario name (added last, as a trailing column).
        case_result["场景"] = case["场景"]
        results.append(case_result)

    df = pd.DataFrame(results)
    print(df.to_string())

    # `inputs` only ever existed inside benchmark(); build it here so the
    # final generation does not fail with NameError.
    # NOTE(review): the intended prompt is not recorded anywhere; the first
    # test case (whose requested length is also 50) is used — confirm.
    inputs = tokenizer(TEST_CASES[0]["输入"], return_tensors="pt").to(DEVICE)

    torch.npu.synchronize()
    with torch.no_grad():
        # NOTE(review): the original also passed
        # cache_implementation="npu_optimized". That value does not appear
        # among the cache implementations documented for transformers'
        # generation config, so it is omitted and generate() uses its default
        # KV cache. Restore it only if the target stack really registers
        # such a backend.
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,
            use_cache=True,
            do_sample=False,
            num_beams=1,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    torch.npu.synchronize()