import torch
import torch_npu
from transformers import AutoModelForCausalLM, AutoTokenizer
import time
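# Optional sanity check before downloading/loading the weights. This assumes
# torch_npu's CUDA-style device API (torch.npu.is_available mirrors
# torch.cuda.is_available); adjust if your torch_npu version differs.
if not torch.npu.is_available():
    raise RuntimeError("No Ascend NPU visible; check the CANN toolkit and torch_npu installation.")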
print("开始测试...")
MODEL_NAME = "NousResearch/Llama-2-7b-hf"
print(f"下载模型:{MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
print("加载到 NPU...")
model = model.npu()
model.eval()
print(f"显存占用:{torch.npu.memory_allocated() / 1e9:.2f} GB")
prompt = "The capital of France is"
inputs = tokenizer(prompt, return_tensors="pt")
inputs = {k: v.npu() for k, v in inputs.items()}
torch.npu.synchronize()  # flush queued NPU work so the timer starts clean
start = time.time()
outputs = model.generate(**inputs, max_new_tokens=50)
torch.npu.synchronize()  # wait for generation to actually finish before stopping the timer
end = time.time()
# Count the tokens actually produced; greedy generation can stop early at EOS,
# so dividing a hard-coded 50 by the elapsed time would overstate throughput.
gen_tokens = outputs.shape[-1] - inputs["input_ids"].shape[-1]
text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"\nGenerated text: {text}")
print(f"Elapsed: {(end - start) * 1000:.2f} ms")
print(f"Throughput: {gen_tokens / (end - start):.2f} tokens/s")
import torch
import torch_npu
import time
import json
import pandas as pd
from datetime import datetime
from transformers import AutoModelForCausalLM, AutoTokenizer
MODEL_NAME = "NousResearch/Llama-2-7b-hf"
DEVICE = "npu:0"
WARMUP_RUNS = 5     # untimed runs so caching/compilation settles before measurement
TEST_RUNS = 10      # timed runs averaged per case
SAVE_RESULT = True  # write results to disk at the end
PRECISION = "fp16"  # recorded alongside each result row
TEST_CASES = [
    # The Chinese prompts are deliberate test inputs (they exercise the
    # Chinese-language scenarios), so they are left untranslated.
    {"scenario": "Short English generation", "prompt": "The capital of France is", "gen_length": 50, "batch_size": 1},
    {"scenario": "Chinese dialogue", "prompt": "请解释什么是人工智能:", "gen_length": 100, "batch_size": 1},
    {"scenario": "Code generation", "prompt": "Write a Python function to calculate fibonacci:", "gen_length": 150, "batch_size": 1},
    {"scenario": "Batched inference (batch=2)", "prompt": "The capital of France is", "gen_length": 50, "batch_size": 2},
    {"scenario": "Long-form narrative", "prompt": "请写一篇关于人工智能未来的科幻短篇:", "gen_length": 200, "batch_size": 1},
    {"scenario": "Multi-turn Q&A", "prompt": "Q: 什么是机器学习?\nA: ...", "gen_length": 100, "batch_size": 1},
    {"scenario": "Higher-concurrency batch (batch=4)", "prompt": "The capital of France is", "gen_length": 50, "batch_size": 4},
]
def benchmark(prompt, tokenizer, model, max_new_tokens, batch_size):
    """Time generation for one test case and return summary metrics."""
    batch_inputs = [prompt] * batch_size
    inputs = tokenizer(
        batch_inputs,
        return_tensors="pt",
        padding=(batch_size > 1),  # padding only matters for batched input
        truncation=True,
        max_length=512,
    ).to(DEVICE)
    # Reset the peak-memory counter (torch.npu mirrors the torch.cuda API here)
    # so each case reports its own peak, not the maximum across earlier cases.
    torch.npu.reset_peak_memory_stats()
    print(f"Warming up... ({WARMUP_RUNS} runs, batch_size={batch_size})")
    for _ in range(WARMUP_RUNS):
        with torch.no_grad():
            _ = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False, pad_token_id=tokenizer.eos_token_id)
    latencies = []
    print(f"Running timed passes... ({TEST_RUNS} runs)")
    for _ in range(TEST_RUNS):
        torch.npu.synchronize()  # drain queued NPU work before starting the timer
        start = time.time()
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False, pad_token_id=tokenizer.eos_token_id)
        torch.npu.synchronize()  # ensure generation has finished before reading the clock
        end = time.time()
        latencies.append(end - start)
    avg_latency = sum(latencies) / len(latencies)
    # Use the token count actually produced (greedy decoding may hit EOS early).
    gen_tokens = outputs.shape[-1] - inputs["input_ids"].shape[-1]
    throughput = gen_tokens / avg_latency
    total_throughput = throughput * batch_size
    mem_peak = torch.npu.max_memory_allocated() / 1e9
    return {
        "avg_latency_s": round(avg_latency, 3),
        "per_request_tokens_per_s": round(throughput, 2),
        "batch_tokens_per_s": round(total_throughput, 2),
        "peak_memory_gb": round(mem_peak, 2),
        "gen_length": max_new_tokens,
        "batch_size": batch_size,
    }
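
# A minimal sketch, not part of the harness above: pick the raw latency sample
# nearest each requested quantile (no interpolation). With TEST_RUNS = 10 the
# tail estimates are coarse, so treat p99 here as indicative only.
def latency_percentiles(samples, points=(0.50, 0.90, 0.99)):
    ordered = sorted(samples)
    return {f"p{int(p * 100)}": ordered[round(p * (len(ordered) - 1))] for p in points}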
if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # Llama's tokenizer ships without a pad token, so batched padding would fail;
    # reuse EOS and pad on the left, as decoder-only generation expects.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME, torch_dtype=torch.float16, low_cpu_mem_usage=True
    ).to(DEVICE)
    model.eval()
    results = []
    for case in TEST_CASES:
        res = benchmark(case["prompt"], tokenizer, model, case["gen_length"], case["batch_size"])
        res.update({"scenario": case["scenario"], "precision": PRECISION})
        results.append(res)
        print(f"Scenario '{case['scenario']}' done: batch throughput {res['batch_tokens_per_s']:.2f} tokens/s")
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    if SAVE_RESULT:
        with open(f"llama_npu_benchmark_{timestamp}.json", "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
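    # Render a flat summary table with pandas (imported above) and, when
    # saving is enabled, mirror the JSON results to CSV for spreadsheets.
    df = pd.DataFrame(results)
    print(df.to_string(index=False))
    if SAVE_RESULT:
        df.to_csv(f"llama_npu_benchmark_{timestamp}.csv", index=False)
        print(f"Results written to llama_npu_benchmark_{timestamp}.json and .csv")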