import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset
import matplotlib.pyplot as plt
import seaborn as sns
import time
import psutil
import numpy as np
from tqdm import tqdm
from sklearn.metrics import confusion_matrix
# Render with the non-interactive Agg backend so the figure can be written to
# a file on machines that have no display attached.
plt.switch_backend('Agg')
# Prefer Arial, but fall back to DejaVu Sans (bundled with matplotlib) so that
# hosts without Arial installed do not emit a "font not found" warning for
# every text element.
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['Arial', 'DejaVu Sans']
# Draw negative tick labels with an ASCII hyphen instead of the Unicode minus
# sign, which some fonts lack a glyph for.
plt.rcParams['axes.unicode_minus'] = False
# Each entry: (Hugging Face checkpoint id, short display label, model size in GB).
model_configs = [
    ("Qwen/Qwen2-7B-cls", "Qwen2-7B", 7.8),
    ("Qwen/Qwen1.5-7B-cls", "Qwen1.5-7B", 6.5),
    ("meta-llama/Llama-3-8B-cls", "Llama3-8B", 8.2),
    ("mistralai/Mistral-7B-v0.3-cls", "Mistral-7B", 6.5),
    ("google/gemma-7b-cls", "Gemma-7B", 7.0),
]
# Transpose the config rows into three parallel tuples.
model_names, model_labels, model_sizes = zip(*model_configs)

# Evaluation data: the first 1000 rows of the AG News test split.
dataset = load_dataset("ag_news", split="test[:1000]")
texts, true_labels = dataset["text"], dataset["label"]
# Display names for the four classes, indexed by label id.
class_names = ["World", "Sports", "Business", "Technology"]

# Per-model results, appended in the same order as ``model_configs``.
accuracies, inference_times, memory_usages, all_predictions = [], [], [], []
# Benchmark each checkpoint in turn: load it, classify every text one at a
# time, then record accuracy, mean per-text latency and memory footprint.
# Iterating the tuple directly (instead of a zip) lets tqdm show the total.
for model_name in tqdm(model_names, desc="Testing 5 LLMs"):
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        trust_remote_code=True,
        device_map="auto",
        torch_dtype=torch.float16,
    )
    model.eval()

    # Track the peak allocation during inference (weights + activations)
    # rather than whatever happens to be allocated after the loop finishes.
    on_cuda = torch.cuda.is_available() and model.device.type == "cuda"
    if on_cuda:
        torch.cuda.reset_peak_memory_stats(model.device)

    # perf_counter is monotonic, so the interval cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()
    predictions = []
    with torch.no_grad():
        for text in texts:
            inputs = tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=512,
            )
            inputs = {k: v.to(model.device) for k, v in inputs.items()}
            outputs = model(**inputs)
            # Moving the result to the CPU also synchronises with the GPU,
            # so the wall-clock timing below covers the finished forward pass.
            pred_label = torch.argmax(outputs.logits, dim=1).cpu().numpy()[0]
            predictions.append(pred_label)
    end_time = time.perf_counter()
    all_predictions.append(predictions)

    avg_infer_time = (end_time - start_time) / len(texts)
    inference_times.append(avg_infer_time)

    correct_count = sum(p == t for p, t in zip(predictions, true_labels))
    accuracy = (correct_count / len(true_labels)) * 100
    accuracies.append(accuracy)

    # Memory in GiB: peak CUDA allocation on the model's device, or the
    # process resident set size when running on the CPU.
    if on_cuda:
        memory_usage = torch.cuda.max_memory_allocated(model.device) / (1024 ** 3)
    else:
        memory_usage = psutil.Process().memory_info().rss / (1024 ** 3)
    memory_usages.append(memory_usage)

    # Drop the references and release cached CUDA blocks before the next
    # model is loaded.
    del model, tokenizer
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
# Build the 2x3 report figure and fill the four bar-chart panels:
# accuracy, latency, memory usage (top row) and model size (bottom-left).
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle("2025 Top 5 Open-Source LLMs: AG News Classification Comparison", fontsize=22, fontweight='bold', y=0.98)
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FECA57']

# Panel (0, 0): accuracy per model.
ax_acc = axes[0, 0]
ax_acc.bar(model_labels, accuracies, color=colors, alpha=0.8, edgecolor='black', linewidth=1)
ax_acc.set_title("Accuracy (%)", fontsize=14, fontweight='bold')
ax_acc.set_ylabel("Accuracy (%)")
# Zoom in on the measured range instead of a fixed 88-95 window, which would
# hide the bar of any model scoring outside that band.
if accuracies:
    ax_acc.set_ylim(max(0.0, min(accuracies) - 2.0), max(accuracies) + 2.0)
ax_acc.grid(axis='y', alpha=0.3)
for i, v in enumerate(accuracies):
    ax_acc.text(i, v+0.1, f"{v:.1f}%", ha='center', va='bottom', fontweight='bold', fontsize=10)

# Panel (0, 1): mean inference time per text.
ax_time = axes[0, 1]
ax_time.bar(model_labels, inference_times, color=colors, alpha=0.8, edgecolor='black', linewidth=1)
ax_time.set_title("Average Inference Time per Text (s)", fontsize=14, fontweight='bold')
ax_time.set_ylabel("Time (s)")
ax_time.grid(axis='y', alpha=0.3)
for i, v in enumerate(inference_times):
    ax_time.text(i, v+0.002, f"{v:.3f}", ha='center', va='bottom', fontweight='bold', fontsize=10)

# Panel (0, 2): memory usage.
ax_mem = axes[0, 2]
ax_mem.bar(model_labels, memory_usages, color=colors, alpha=0.8, edgecolor='black', linewidth=1)
ax_mem.set_title("Memory Usage (GB)", fontsize=14, fontweight='bold')
ax_mem.set_ylabel("Memory (GB)")
ax_mem.grid(axis='y', alpha=0.3)
for i, v in enumerate(memory_usages):
    ax_mem.text(i, v+0.1, f"{v:.1f}GB", ha='center', va='bottom', fontweight='bold', fontsize=10)

# Panel (1, 0): model size as horizontal bars.
ax_size = axes[1, 0]
y_pos = np.arange(len(model_labels))
ax_size.barh(y_pos, model_sizes, color=colors, alpha=0.8, edgecolor='black', linewidth=1)
ax_size.set_yticks(y_pos)
ax_size.set_yticklabels(model_labels)
ax_size.set_title("Model Size (GB)", fontsize=14, fontweight='bold')
ax_size.set_xlabel("Size (GB)")
ax_size.grid(axis='x', alpha=0.3)
for i, v in enumerate(model_sizes):
    ax_size.text(v+0.1, i, f"{v:.1f}GB", ha='left', va='center', fontweight='bold', fontsize=10)
def normalize(values):
    """Min-max scale ``values`` into the [0, 1] interval.

    Args:
        values: Sequence (list, tuple or array) of real numbers.

    Returns:
        A float ``numpy.ndarray`` with the same shape as ``values``. When all
        entries are equal every element is 0.5; an empty input yields an
        empty array.
    """
    # Convert up front: plain lists and tuples of Python floats do not
    # support ``values - scalar`` and would raise TypeError.
    arr = np.asarray(values, dtype=float)
    if arr.size == 0:
        return arr
    low, high = arr.min(), arr.max()
    if high == low:
        # No spread to scale by; place everything at the midpoint.
        return np.full(arr.shape, 0.5)
    return (arr - low) / (high - low)
# Radar chart: scale every metric to [0, 1] so that 1.0 is always "best".
accuracy_norm = normalize(accuracies)
# For latency, memory and size a lower raw value is better, so invert them.
speed_norm = [1 - x for x in normalize(inference_times)]
memory_norm = [1 - x for x in normalize(memory_usages)]
size_norm = [1 - x for x in normalize(model_sizes)]

categories = ["Accuracy", "Infer Speed", "Memory Eff.", "Size Eff."]
angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
angles += angles[:1]  # repeat the first angle so each polygon closes

# Swap the Cartesian axes in the bottom-middle cell for polar axes. Subplot
# indices are 1-based and row-major, so cell (1, 1) of a 2x3 grid is index 5;
# index 4 would land on top of the model-size panel at (1, 0).
axes[1, 1].remove()
ax_radar = fig.add_subplot(2, 3, 5, projection='polar')
for i, model_label in enumerate(model_labels):
    values = [accuracy_norm[i], speed_norm[i], memory_norm[i], size_norm[i]]
    values += values[:1]  # close the polygon, matching ``angles``
    ax_radar.plot(angles, values, label=model_label, color=colors[i], linewidth=2, marker='o', markersize=4)
    ax_radar.fill(angles, values, color=colors[i], alpha=0.15)
ax_radar.set_xticks(angles[:-1])
ax_radar.set_xticklabels(categories, fontsize=11)
ax_radar.set_ylim(0, 1)
ax_radar.set_title("Comprehensive Score (Normalized)", fontsize=14, fontweight='bold', pad=20)
ax_radar.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1), fontsize=9)
ax_radar.grid(True, alpha=0.3)
# Confusion matrix (row-normalised to percentages) for the most accurate model.
best_model_idx = accuracies.index(max(accuracies))
best_model_name = model_labels[best_model_idx]
best_predictions = all_predictions[best_model_idx]

# Pin the label set so the matrix is always len(class_names) square, even if
# some class never occurs; otherwise the annotation loop below could index
# out of bounds.
n_classes = len(class_names)
cm = confusion_matrix(true_labels, best_predictions, labels=list(range(n_classes)))
# Rows with no true samples stay at 0% instead of dividing by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm.astype(float),
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
) * 100

im = axes[1, 2].imshow(cm_normalized, interpolation='nearest', cmap='Blues', vmin=0, vmax=100)
axes[1, 2].set_title(f"Confusion Matrix: {best_model_name}\n(Normalized %)", fontsize=14, fontweight='bold')
axes[1, 2].set_xlabel("Predicted Class")
axes[1, 2].set_ylabel("True Class")
axes[1, 2].set_xticks(np.arange(n_classes))
axes[1, 2].set_yticks(np.arange(n_classes))
axes[1, 2].set_xticklabels(class_names, rotation=45, ha='right')
axes[1, 2].set_yticklabels(class_names)
for i in range(n_classes):
    for j in range(n_classes):
        pct = cm_normalized[i, j]
        # 'Blues' maps high values to dark blue, so dark cells need white
        # text and light cells need black text to stay readable.
        axes[1, 2].text(j, i, f"{pct:.1f}%",
                        ha="center", va="center",
                        color="white" if pct > 50 else "black",
                        fontweight='bold')
cbar = fig.colorbar(im, ax=axes[1, 2], shrink=0.8)
cbar.set_label("Percentage (%)", rotation=270, labelpad=15)
# Write the figure to disk, then print a plain-text summary table.
plt.tight_layout()
save_path = "/root/llm_5way_comparison_report.png"
plt.savefig(save_path, dpi=300, bbox_inches='tight', facecolor='white')
print(f"✅ 5 模型对比报告已保存至:{save_path}")
print("\n=== 5 大模型性能汇总表 ===")
print(f"{'模型名称':<15} {'准确率':<10} {'单条推理时间':<15} {'显存占用':<10} {'模型体积':<10}")
print("-" * 60)
for label, acc, latency, mem, size in zip(
    model_labels, accuracies, inference_times, memory_usages, model_sizes
):
    # Attach the unit to the number before padding; padding the number first
    # leaves a gap before the unit (e.g. "92.3      %") and breaks alignment
    # with the header columns.
    acc_cell = f"{acc:.1f}%"
    latency_cell = f"{latency:.3f}s"
    mem_cell = f"{mem:.1f}GB"
    size_cell = f"{size:.1f}GB"
    print(f"{label:<15} {acc_cell:<10} {latency_cell:<15} {mem_cell:<10} {size_cell:<10}")