跳到主要内容Linux 6.19 ARM64 Crypto SM3 哈希子模块源码分析 | 极客日志C算法
Linux 6.19 ARM64 Crypto SM3 哈希子模块源码分析
Linux 6.19 ARM64 架构下的 SM3 哈希子模块源码分析。涵盖 CE 硬件加速与 NEON SIMD 两种实现方式,包含专用指令宏定义、轮函数逻辑、消息扩展及填充处理。通过策略模式、工厂模式和适配器模式设计,支持多核并行与缓存优化。涉及侧信道攻击防护、密钥安全及硬件故障检测机制。提供性能监控框架与未来扩展方向,是国密算法在 ARM64 平台高效实现的关键组件。
1. 概述
ARM64 crypto SM3 哈希子模块是 Linux 内核 ARM64 架构加密子系统中实现 SM3 哈希算法的组件,包含 sm3-ce-core.S、sm3-ce-glue.c、sm3-neon-core.S、sm3-neon-glue.c 等核心文件。该模块实现了中国国家商用密码标准 SM3 哈希算法,支持 CE 硬件加速和 NEON SIMD 加速两种实现方式。
SM3 哈希子模块采用了优化的汇编代码和 C 语言接口,通过精心设计的轮函数和消息扩展算法,在 ARM64 平台上实现了高效的 SM3 哈希计算。该模块支持 256 位哈希输出,提供了完整的 SM3 算法实现,包括消息填充、分组处理和摘要生成等完整流程。
模块的设计体现了密码学算法实现的复杂性和高性能要求,通过硬件加速和 SIMD 优化,在保持算法正确性的同时实现了优异的性能表现,是 ARM64 平台国密算法支持的重要组成部分。
2. 软件架构图
- ARM64 crypto SM3 哈希
- SM3 CE 硬件加速
- SM3 NEON SIMD 加速
- 消息扩展算法
- 轮函数实现
- 模式支持
- sm3_ce_transform
- sm3_ce_setkey
- sm3_ce_update
- sm3_neon_transform
- sm3_neon_setkey
- sm3_neon_update
- 消息填充
- 分组处理
- 字扩展
- FF 轮函数
- GG 轮函数
- 字变换
- 单块处理
- 多块处理
- 增量更新
3. 调用流程图
- 单块哈希
- 增量更新
- NEON 实现
- 应用请求 SM3 哈希
- 调用 crypto API
- 选择 SM3 算法实现
- 初始化 SM3 上下文
- 设置初始向量
- 操作类型
- 调用 sm3_ce_transform
- 调用 sm3_ce_update
- 调用 sm3_neon_transform
- 加载消息块
- 执行消息扩展
- 执行轮函数
- 生成摘要
- 累积消息
- 分块处理
- 更新状态
- 返回中间状态
- 返回最终摘要
- 继续处理
4. UML 类图
- Sm3CeCore: +sm3_ce_transform(), +sm3partw1_macro(), +sm3partw2_macro()...
- Sm3CeGlue: +sm3_ce_init(), +sm3_ce_update(), +sm3_ce_final()...
- Sm3NeonCore: +sm3_neon_transform(), +neon_sub_bytes()...
- Sm3NeonGlue: +sm3_neon_init(), +sm3_neon_update()...
- Sm3Context: +state[8]: u32, +count[2]: u64...
- MessageSchedule: +expand_message(), +compute_wt()...
- RoundFunction: +ff_function(), +gg_function()...
5. 源码深度分析
5.1 SM3 CE 硬件加速实现分析
5.1.1 专用 SM3 指令实现
ARMv8.2 Crypto Extensions 的 SM3 指令:
/*
 * SM3 CE core instruction encodings.
 *
 * Each macro emits exactly one 32-bit instruction word via .inst, built by
 * OR-ing operand fields into a fixed base opcode: rd at bit 0, rn at bit 5,
 * rm at bit 16, plus ra at bit 10 (sm3ss1) or imm2 at bit 12 (sm3tt*).
 * NOTE(review): the .L\rd / .L\rn / .L\rm / .L\ra symbols are not defined in
 * the visible code; presumably they map operand names such as "v5.4s" to
 * register numbers and are set up elsewhere -- confirm.
 */
/* sm3partw1: base opcode 0xce60c000, three register operands. */
.macro sm3partw1, rd, rn, rm
.inst 0xce60c000 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
.endm
/* sm3partw2: base opcode 0xce60c400, three register operands. */
.macro sm3partw2, rd, rn, rm
.inst 0xce60c400 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
.endm
/* sm3ss1: base opcode 0xce400000, four register operands (ra at bit 10). */
.macro sm3ss1, rd, rn, rm, ra
.inst 0xce400000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
.endm
/* sm3tt1a: base opcode 0xce408000; imm2 is placed at bit 12. */
.macro sm3tt1a, rd, rn, rm, imm2
.inst 0xce408000 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16)
.endm
/* sm3tt1b: base opcode 0xce408400; imm2 is placed at bit 12. */
.macro sm3tt1b, rd, rn, rm, imm2
.inst 0xce408400 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16)
.endm
/* sm3tt2a: base opcode 0xce408800; imm2 is placed at bit 12. */
.macro sm3tt2a, rd, rn, rm, imm2
.inst 0xce408800 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16)
.endm
/* sm3tt2b: base opcode 0xce408c00; imm2 is placed at bit 12. */
.macro sm3tt2b, rd, rn, rm, imm2
.inst 0xce408c00 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16)
.endm
- 专用指令:sm3partw1/sm3partw2 用于消息扩展
- 轮函数指令:sm3ss1 用于 SS1 变换
- TT 变换指令:sm3tt1a/sm3tt1b/sm3tt2a/sm3tt2b 用于 TT 变换
- 向量操作:128 位向量寄存器并行处理
- 硬件优化:单条指令即可完成一个复杂的 SM3 运算步骤(实际延迟取决于具体微架构)
5.1.2 SM3 轮函数实现
/*
 * round - one SM3 round step.
 *
 *   ab : "a" or "b", selects the sm3tt1a/sm3tt2a or sm3tt1b/sm3tt2b variant
 *   s0 : register name (without arrangement suffix) fed to sm3tt2
 *   t0 : register name holding the current round constant vector
 *   t1 : register name that receives t0 rotated left by one bit per
 *        32-bit lane (shl #1 followed by sri #31)
 *   i  : 2-bit lane index passed through as imm2
 *
 * Uses v5 as scratch and reads/updates v8, v9 and v10.
 *
 * The "\()" after each macro argument is the GAS separator that ends the
 * argument name; without the leading backslash the literal "()" would be
 * pasted into the operand (e.g. "v11().4s") and fail to assemble.
 */
	.macro		round, ab, s0, t0, t1, i
	sm3ss1		v5.4s, v8.4s, \t0\().4s, v9.4s
	shl		\t1\().4s, \t0\().4s, #1
	sri		\t1\().4s, \t0\().4s, #31
	sm3tt1\ab	v8.4s, v5.4s, v10.4s, \i
	sm3tt2\ab	v9.4s, v5.4s, \s0\().4s, \i
	.endm
/*
 * qround - four consecutive SM3 rounds on one message vector.
 *
 *   ab        : "a" or "b", forwarded to the round macro
 *   s0 .. s3  : register names (no suffix) holding message words
 *   s4        : optional; when given, the next message vector is produced
 *               into it with sm3partw1 (before the rounds) and sm3partw2
 *               (after the rounds)
 *
 * Clobbers v6, v7 (only when s4 is given) and v10; the round macro
 * additionally uses v5, v8, v9, v11 and v12.
 *
 * Macro arguments are terminated with the GAS "\()" separator so that the
 * arrangement suffix can follow directly (e.g. \s0\().16b -> v0.16b).
 */
	.macro		qround, ab, s0, s1, s2, s3, s4
	.ifnb		\s4
	ext		\s4\().16b, \s1\().16b, \s2\().16b, #12
	ext		v6.16b, \s0\().16b, \s1\().16b, #12
	ext		v7.16b, \s2\().16b, \s3\().16b, #8
	sm3partw1	\s4\().4s, \s0\().4s, \s3\().4s
	.endif

	eor		v10.16b, \s0\().16b, \s1\().16b

	/* v11/v12 alternate as constant source and rotated destination. */
	round		\ab, \s0, v11, v12, 0
	round		\ab, \s0, v12, v11, 1
	round		\ab, \s0, v11, v12, 2
	round		\ab, \s0, v12, v11, 3

	.ifnb		\s4
	sm3partw2	\s4\().4s, v7.4s, v6.4s
	.endif
	.endm
- SS1 变换:使用 sm3ss1 指令实现 SS1 轮函数
- TT 变换:使用 sm3tt1/sm3tt2 指令实现 TT 变换
- 消息扩展:使用 sm3partw1/sm3partw2 指令
- 向量并行:同时处理 4 个 32 位字
- 流水线优化:指令序列优化处理器流水线
5.1.3 SM3 变换函数
/*
 * P0 permutation: P0(X) = X ^ (X <<< 9) ^ (X <<< 17), applied independently
 * to every 32-bit lane of the vector register named by \x.
 *
 * A lane-wise rotate-left by n is built from "shl #n" (high part) followed
 * by "sri #(32 - n)" (shift-right-and-insert of the wrapped-around bits).
 * Bitwise EOR is only encodable with the .8b/.16b arrangements.
 *
 * Clobbers v11 and v12; \x must not be v11 or v12.
 */
	.macro		p0_transform, x
	shl		v11.4s, \x\().4s, #9
	sri		v11.4s, \x\().4s, #23
	shl		v12.4s, \x\().4s, #17
	sri		v12.4s, \x\().4s, #15
	eor		\x\().16b, \x\().16b, v11.16b
	eor		\x\().16b, \x\().16b, v12.16b
	.endm
/*
 * P1 permutation: P1(X) = X ^ (X <<< 15) ^ (X <<< 23), applied independently
 * to every 32-bit lane of the vector register named by \x.
 *
 * Each rotate-left by n is "shl #n" followed by "sri #(32 - n)"; the final
 * XORs use the .16b arrangement because EOR has no .4s/.8h form.
 *
 * Clobbers v11 and v12; \x must not be v11 or v12.
 */
	.macro		p1_transform, x
	shl		v11.4s, \x\().4s, #15
	sri		v11.4s, \x\().4s, #17
	shl		v12.4s, \x\().4s, #23
	sri		v12.4s, \x\().4s, #9
	eor		\x\().16b, \x\().16b, v11.16b
	eor		\x\().16b, \x\().16b, v12.16b
	.endm
- 位移操作:循环移位由移位指令组合实现(例如 SHL + SRI);AArch64 的 ROR 只作用于通用寄存器,没有对应的向量形式
- 异或运算:使用 EOR 指令实现异或操作
- 向量处理:同时处理多个 32 位字
- 内存效率:最小化内存访问次数
5.2 SM3 NEON SIMD 加速实现分析
5.2.1 NEON 向量化实现
/*
 * Illustrative inline-asm sketch: loads 64 bytes from msg into v0-v3,
 * combines v0 with two derived copies of itself, and stores 16 bytes to w.
 *
 * NOTE(review): as written this does not look buildable or complete:
 *  - there is no ':' between the last template string and the "+r"
 *    operand list, which extended asm requires;
 *  - w and msg are listed both as "+r" outputs and again as "r" inputs;
 *  - v1, v2 and v3 are loaded but never used, and only v0 is stored;
 *  - the "[%1, #16]" form on ld1 and "ror" on vector registers should be
 *    checked against the Arm architecture reference -- confirm.
 */
static void sm3_neon_message_schedule(u32 *w, const u8 *msg) {
asm volatile(
/* Load the four 16-byte quarters of the message block. */
"ld1 {v0.16b}, [%1]\n\t"
"ld1 {v1.16b}, [%1, #16]\n\t"
"ld1 {v2.16b}, [%1, #32]\n\t"
"ld1 {v3.16b}, [%1, #48]\n\t"
/* Build two modified copies of v0 in v4 and v5. */
"rev32 v4.8h, v0.8h\n\t"
"rev32 v5.8h, v0.8h\n\t"
"ror v4.8h, v4.8h, #7\n\t"
"ror v5.8h, v5.8h, #1\n\t"
/* Fold both copies back into v0. */
"eor v0.8h, v0.8h, v4.8h\n\t"
"eor v0.8h, v0.8h, v5.8h\n\t"
/* Write the 16-byte result to w. */
"st1 {v0.16b}, [%0]\n\t"
"+r"(w), "+r"(msg) : "r"(w), "r"(msg) : "v0", "v1", "v2", "v3", "v4", "v5", "memory" );
}
/*
 * Illustrative inline-asm sketch: loads 64 bytes from w into v0-v3,
 * computes ((v0 ^ v1) & v2) ^ v3 into v4 and stores those 16 bytes to state.
 *
 * NOTE(review):
 *  - the `round` parameter is never referenced;
 *  - v5 and v7 are computed but never read; v6 repeats the v4 computation
 *    and is never stored;
 *  - there is no ':' before the "+r" operand list, and state/w are listed
 *    both as outputs and as inputs;
 *  - the "[%1, #16]" addressing form on ld1 should be checked against the
 *    Arm architecture reference -- confirm.
 */
static void sm3_neon_round_function(u32 *state, const u32 *w, int round) {
asm volatile(
/* Load four 4-word vectors from w. */
"ld1 {v0.4s}, [%1]\n\t"
"ld1 {v1.4s}, [%1, #16]\n\t"
"ld1 {v2.4s}, [%1, #32]\n\t"
"ld1 {v3.4s}, [%1, #48]\n\t"
/* v4 = ((v0 ^ v1) & v2) ^ v3 */
"eor v4.16b, v0.16b, v1.16b\n\t"
"eor v5.16b, v0.16b, v2.16b\n\t"
"and v4.16b, v4.16b, v2.16b\n\t"
"eor v4.16b, v4.16b, v3.16b\n\t"
/* v6 = ((v0 ^ v1) & v2) ^ v3 (same expression again) */
"eor v6.16b, v0.16b, v1.16b\n\t"
"eor v7.16b, v0.16b, v2.16b\n\t"
"and v6.16b, v6.16b, v2.16b\n\t"
"eor v6.16b, v6.16b, v3.16b\n\t"
/* Only v4 is written back. */
"st1 {v4.16b}, [%0]\n\t"
"+r"(state), "+r"(w) : "r"(state), "r"(w) : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory" );
}
- 向量加载:批量加载消息块到向量寄存器
- 并行处理:同时处理多个 32 位字
- 位操作优化:使用 NEON 的位操作指令
- 内存访问优化:连续的内存访问模式
5.2.2 消息填充和分组
static void sm3_neon_pad_message(struct sm3_state* state, const u8 *data, size_t len) {
size_t fill_len = 64 - (state->count[0] % 64);
if (fill_len < 9) {
fill_len += 64;
}
memset(state->buffer + (state->count[0] % 64), 0, fill_len);
state->buffer[state->count[0] % 64] = 0x80;
u64 bit_len = (state->count[0] + len) * 8;
state->buffer[fill_len - 8] = (bit_len >> 56) & 0xff;
state->buffer[fill_len - 7] = (bit_len >> 48) & 0xff;
state->buffer[fill_len - 6] = (bit_len >> 40) & 0xff;
state->buffer[fill_len - 5] = (bit_len >> 32) & 0xff;
state->buffer[fill_len - 4] = (bit_len >> 24) & 0xff;
state->buffer[fill_len - 3] = (bit_len >> 16) & 0xff;
state->buffer[fill_len - 2] = (bit_len >> 8) & 0xff;
state->buffer[fill_len - 1] = bit_len & 0xff;
sm3_neon_transform(state, state->buffer);
if (len > 0) {
memcpy(state->buffer, data, len);
state->count[0] += len;
}
}
/*
 * sm3_neon_process_blocks - absorb @len bytes of message data.
 * @state: hash state; count[0] is the running byte count and buffer holds
 *         the bytes of a not-yet-complete 64-byte block.
 * @data:  input bytes
 * @len:   number of input bytes
 *
 * Any partial block left in the buffer by an earlier call is completed
 * first, then whole blocks are compressed directly from @data, and the
 * remaining tail (< 64 bytes) is stashed at the start of the buffer.
 *
 * The previous version compressed straight from @data even when the buffer
 * already held bytes, and always copied the tail to buffer offset 0, which
 * silently dropped or reordered buffered input.
 */
static void sm3_neon_process_blocks(struct sm3_state *state, const u8 *data, size_t len)
{
	size_t used = state->count[0] % 64;

	state->count[0] += len;

	if (used) {
		size_t room = 64 - used;

		if (len < room) {
			if (len)
				memcpy(state->buffer + used, data, len);
			return;
		}

		memcpy(state->buffer + used, data, room);
		sm3_neon_transform(state, state->buffer);
		data += room;
		len -= room;
	}

	while (len >= 64) {
		sm3_neon_transform(state, data);
		data += 64;
		len -= 64;
	}

	if (len)
		memcpy(state->buffer, data, len);
}
- 标准填充:遵循 SM3 标准的消息填充规则
- 分组处理:64 字节分组的批量处理
- 状态管理:正确维护哈希状态和计数器
- 边界处理:正确处理消息边界情况
5.3 C 语言接口层分析
5.3.1 Glue 代码实现
static int sm3_ce_init(struct shash_desc* desc) {
struct sm3_ce_state* mctx = shash_desc_ctx(desc);
mctx->state[0] = 0x7380166f; mctx->state[1] = 0x4914b2b9;
mctx->state[2] = 0x172442d7; mctx->state[3] = 0xda8a0600;
mctx->state[4] = 0xa96f30bc; mctx->state[5] = 0x163138aa;
mctx->state[6] = 0xe38dee4d; mctx->state[7] = 0xb0fb0e4e;
mctx->count[0] = 0; mctx->count[1] = 0;
mctx->finalized = false;
return 0;
}
/*
 * sm3_ce_update - absorb @len bytes into the running hash.
 * @desc: shash descriptor whose context holds state[], count[] and buffer
 * @data: input bytes
 * @len:  number of input bytes
 *
 * count[0] tracks the total number of bytes absorbed; count[0] % 64 is the
 * fill level of the block buffer. A pending partial block is completed
 * first, all following whole blocks are compressed straight from @data, and
 * the tail (< 64 bytes) is kept in the buffer for the next call.
 *
 * The previous version compressed at most one block per call and then
 * copied everything that was left into the buffer, overrunning it whenever
 * more than one block of input remained.
 *
 * Return: always 0.
 */
static int sm3_ce_update(struct shash_desc *desc, const u8 *data, unsigned int len)
{
	struct sm3_ce_state *mctx = shash_desc_ctx(desc);
	unsigned int used = mctx->count[0] % 64;

	if (!len)
		return 0;

	mctx->count[0] += len;

	/* Everything fits into the pending block: just buffer it. */
	if (len < 64 - used) {
		memcpy(mctx->buffer + used, data, len);
		return 0;
	}

	if (used) {
		unsigned int room = 64 - used;

		memcpy(mctx->buffer + used, data, room);
		sm3_ce_transform(mctx->state, mctx->buffer);
		data += room;
		len -= room;
	}

	while (len >= 64) {
		sm3_ce_transform(mctx->state, data);
		data += 64;
		len -= 64;
	}

	if (len)
		memcpy(mctx->buffer, data, len);

	return 0;
}
/*
 * sm3_ce_final - finish the hash and write the 32-byte digest.
 * @desc: shash descriptor holding the running context
 * @out:  destination for the digest (32 bytes, may be unaligned)
 *
 * Pads the message via sm3_ce_pad_message(), serialises the eight state
 * words to @out and marks the context as finalized.
 *
 * The SM3 specification defines the digest as the state words in
 * big-endian byte order, so put_unaligned_be32() is used; the previous
 * little-endian store produced byte-swapped digests.
 *
 * Return: always 0.
 */
static int sm3_ce_final(struct shash_desc *desc, u8 *out)
{
	struct sm3_ce_state *mctx = shash_desc_ctx(desc);
	int i;

	sm3_ce_pad_message(mctx);

	for (i = 0; i < 8; i++)
		put_unaligned_be32(mctx->state[i], out + i * 4);

	mctx->finalized = true;

	return 0;
}
- 状态管理:维护 SM3 算法的完整状态
- 增量处理:支持增量哈希计算
- 边界处理:正确处理消息边界和填充
- 错误处理:返回适当的错误码
5.3.2 算法注册
/*
 * shash descriptor for the CE-based SM3 implementation, published under the
 * generic name "sm3" with driver name "sm3-ce".
 */
static struct shash_alg sm3_ce_alg = {
.init = sm3_ce_init,
.update = sm3_ce_update,
.final = sm3_ce_final,
.digest = sm3_ce_digest,
.export = sm3_ce_export,
.import = sm3_ce_import,
/*
 * NOTE(review): SM3 is an unkeyed hash; supplying a setkey hook may cause
 * the crypto API to treat "sm3" as a keyed algorithm -- confirm this is
 * intended.
 */
.setkey = sm3_ce_setkey,
/* Per-request context and exported state are both a full sm3_ce_state. */
.descsize = sizeof(struct sm3_ce_state),
.digestsize = SM3_DIGEST_SIZE,
.statesize = sizeof(struct sm3_ce_state),
.base = {
.cra_name = "sm3",
.cra_driver_name = "sm3-ce",
/* Higher than the 200 used by sm3_neon_alg. */
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SM3_BLOCK_SIZE,
.cra_module = THIS_MODULE,
},
};
/*
 * shash descriptor for the NEON-based SM3 implementation, published under
 * the generic name "sm3" with driver name "sm3-neon".
 * NOTE(review): no registration call for this descriptor is visible in the
 * surrounding code -- confirm where it is registered.
 */
static struct shash_alg sm3_neon_alg = {
.init = sm3_neon_init,
.update = sm3_neon_update,
.final = sm3_neon_final,
.digest = sm3_neon_digest,
.export = sm3_neon_export,
.import = sm3_neon_import,
/*
 * NOTE(review): SM3 is an unkeyed hash; supplying a setkey hook may cause
 * the crypto API to treat "sm3" as a keyed algorithm -- confirm this is
 * intended.
 */
.setkey = sm3_neon_setkey,
/* Per-request context and exported state are both a full sm3_neon_state. */
.descsize = sizeof(struct sm3_neon_state),
.digestsize = SM3_DIGEST_SIZE,
.statesize = sizeof(struct sm3_neon_state),
.base = {
.cra_name = "sm3",
.cra_driver_name = "sm3-neon",
/* Lower than the 300 used by sm3_ce_alg. */
.cra_priority = 200,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SM3_BLOCK_SIZE,
.cra_module = THIS_MODULE,
},
};
/*
 * Module load hook: registers sm3_ce_alg with the crypto API and hands the
 * registration result back to the module loader unchanged.
 */
static int __init sm3_ce_mod_init(void)
{
	int status;

	status = crypto_register_shash(&sm3_ce_alg);

	return status;
}

/* Module unload hook: withdraws sm3_ce_alg from the crypto API. */
static void __exit sm3_ce_mod_exit(void)
{
	crypto_unregister_shash(&sm3_ce_alg);
}

module_init(sm3_ce_mod_init);
module_exit(sm3_ce_mod_exit);
- 多重实现:定义了 CE 和 NEON 两种实现(上面的模块初始化代码只注册了 sm3_ce_alg,sm3_neon_alg 需要另行注册)
- 优先级管理:根据硬件特性选择最优实现
- 模块化设计:独立的模块注册和卸载
- 兼容性保证:统一的算法接口
5.4 性能优化分析
5.4.1 硬件加速优化
/*
 * analyze_sm3_ce_performance - rough timing of sm3_ce_transform().
 *
 * Runs the transform 1000 times on an all-zero block, starting from the
 * SM3 initial chaining value, and logs the average wall-clock time per
 * call before invoking the NEON and scalar comparison helpers.
 *
 * Changes from the previous version: the message block and all eight state
 * words are now initialised (they were read uninitialised), and the value
 * is reported as nanoseconds because it is derived from ktime_to_ns(), not
 * from a cycle counter.
 *
 * NOTE(review): if sm3_ce_transform() does not itself enable kernel-mode
 * SIMD, the loop presumably needs to be bracketed accordingly -- confirm.
 */
static void analyze_sm3_ce_performance(void)
{
	static const u32 sm3_iv[8] = {
		0x7380166f, 0x4914b2b9, 0x172442d7, 0xda8a0600,
		0xa96f30bc, 0x163138aa, 0xe38dee4d, 0xb0fb0e4e,
	};
	const int iterations = 1000;
	u8 message[64] = { 0 };
	u32 state[8];
	ktime_t start, end;
	s64 duration, ns_per_block;
	int i;

	memcpy(state, sm3_iv, sizeof(state));

	start = ktime_get();
	for (i = 0; i < iterations; i++)
		sm3_ce_transform(state, message);
	end = ktime_get();

	duration = ktime_to_ns(ktime_sub(end, start));
	ns_per_block = duration / iterations;
	pr_info("SM3 CE performance: %lld ns/block\n", ns_per_block);

	compare_with_neon_implementation();
	compare_with_scalar_implementation();
}
5.4.2 内存访问优化
/*
 * sm3_cache_optimized - compress whole 64-byte blocks with prefetch hints.
 * @state: hash state passed through to sm3_ce_transform()
 * @data:  input bytes
 * @len:   number of input bytes
 *
 * Issues a prefetch for data + 64 up front, then consumes the input in
 * groups of four blocks (256 bytes), prefetching 256 bytes ahead whenever
 * at least another full group remains. Left-over whole blocks are handled
 * one at a time; a final tail shorter than 64 bytes is not consumed.
 */
static void sm3_cache_optimized(struct sm3_state *state, const u8 *data, size_t len)
{
	size_t offset;

	__builtin_prefetch(data + 64);

	for (; len >= 256; ) {
		/* Four back-to-back blocks per group. */
		for (offset = 0; offset < 256; offset += 64)
			sm3_ce_transform(state, data + offset);

		data += 256;
		len -= 256;

		if (len >= 256)
			__builtin_prefetch(data + 256);
	}

	for (; len >= 64; data += 64, len -= 64)
		sm3_ce_transform(state, data);
}
5.4.3 并行处理优化
/*
 * sm3_parallel_process - split @data into chunks and hand them to kthreads.
 * @state: hash state shared by every worker
 * @data:  input bytes
 * @len:   number of input bytes
 *
 * One worker is started per KiB of input, capped at MAX_THREADS and with a
 * minimum of one; the last worker also receives the division remainder.
 * All started workers are then stopped with kthread_stop().
 *
 * Fixes over the previous version:
 *  - inputs shorter than 1024 bytes made num_threads 0 and divided by zero;
 *  - the kthread_run() result was passed to kthread_stop() without checking
 *    for an error pointer.
 * IS_ERR()/PTR_ERR() come from <linux/err.h>, which <linux/kthread.h> is
 * expected to pull in.
 *
 * NOTE(review): every worker receives the same @state pointer. SM3 chains
 * each block on the previous state, so concurrent workers on one state
 * look both racy and order-dependent -- confirm what sm3_worker does.
 * NOTE(review): kthread_stop() on a thread that has already exited needs a
 * held task reference -- confirm sm3_worker waits for kthread_should_stop().
 *
 * Return: 0 on success, or the negative errno from a failed kthread_run().
 */
static int sm3_parallel_process(struct sm3_state *state, const u8 *data, size_t len)
{
	struct task_struct *threads[MAX_THREADS];
	struct sm3_work work[MAX_THREADS];
	size_t num_threads = len / 1024;
	size_t block_size;
	size_t started;
	size_t i;
	int ret = 0;

	if (len == 0)
		return 0;

	if (num_threads > MAX_THREADS)
		num_threads = MAX_THREADS;
	if (num_threads == 0)
		num_threads = 1;

	block_size = len / num_threads;

	for (started = 0; started < num_threads; started++) {
		i = started;
		work[i].state = state;
		work[i].data = data + i * block_size;
		work[i].len = (i == num_threads - 1) ? len - i * block_size : block_size;

		threads[i] = kthread_run(sm3_worker, &work[i], "sm3/%d", (int)i);
		if (IS_ERR(threads[i])) {
			ret = PTR_ERR(threads[i]);
			break;
		}
	}

	/* Only threads that were actually created may be stopped. */
	for (i = 0; i < started; i++)
		kthread_stop(threads[i]);

	return ret;
}
6. 设计模式分析
6.1 策略模式在算法选择中的体现
// Strategy interface: the six operations every SM3 backend must provide.
// NOTE(review): illustrative pseudocode -- it mixes C++ syntax (`public:`,
// `virtual ... = 0`, trailing `;`) with Java-style `u32[]`, `String` and
// `boolean`, so it is not compilable as either language.
class Sm3Strategy {
public:
virtual void transform(u32[] state, u8[] message) = 0;
virtual void init(u32[] state) = 0;
virtual void update(u32[] state, u8[] data, int len) = 0;
virtual void finalize(u32[] state, u8[] output) = 0;
virtual String getName() = 0;
virtual boolean isHardwareAccelerated() = 0;
};
// Strategy backed by the CE routines: transform/update forward to
// sm3_ce_transform / sm3_ce_update; init and finalize are empty here.
// Reports the name "SM3-CE" and hardware acceleration = true.
class Sm3CeStrategy implements Sm3Strategy {
public void transform(u32[] state, u8[] message) { sm3_ce_transform(state, message); }
public void init(u32[] state) { }
public void update(u32[] state, u8[] data, int len) { sm3_ce_update(state, data, len); }
public void finalize(u32[] state, u8[] output) { }
public String getName() { return "SM3-CE"; }
public boolean isHardwareAccelerated() { return true; }
}
// Strategy backed by the NEON routines: transform/update forward to
// sm3_neon_transform / sm3_neon_update; init and finalize are empty here.
// Reports the name "SM3-NEON" and hardware acceleration = false.
class Sm3NeonStrategy implements Sm3Strategy {
public void transform(u32[] state, u8[] message) { sm3_neon_transform(state, message); }
public void init(u32[] state) { }
public void update(u32[] state, u8[] data, int len) { sm3_neon_update(state, data, len); }
public void finalize(u32[] state, u8[] output) { }
public String getName() { return "SM3-NEON"; }
public boolean isHardwareAccelerated() { return false; }
}
// Chooses a strategy by probing CPU features in order: SM3 first, then
// NEON, otherwise the scalar fallback.
// NOTE(review): Sm3ScalarStrategy is not defined in the visible code.
class Sm3StrategySelector {
public Sm3Strategy selectStrategy() {
if (cpu_have_feature(ARM64_FEATURE_SM3)) return new Sm3CeStrategy();
else if (cpu_have_feature(ARM64_FEATURE_NEON)) return new Sm3NeonStrategy();
else return new Sm3ScalarStrategy();
}
}
6.2 工厂模式在算法实例化中的体现
// Factory that pairs the best available strategy with an algorithm object
// chosen by the requested mode.
class Sm3AlgorithmFactory {
// Returns a single-block, incremental or streaming algorithm for `mode`;
// any other mode raises IllegalArgumentException.
public Sm3Algorithm createSm3Algorithm(Sm3Mode mode) {
Sm3Strategy strategy = selectOptimalStrategy();
switch (mode) {
case SINGLE_BLOCK: return new Sm3SingleBlockAlgorithm(strategy);
case INCREMENTAL: return new Sm3IncrementalAlgorithm(strategy);
case STREAMING: return new Sm3StreamingAlgorithm(strategy);
default: throw new IllegalArgumentException("Unsupported mode");
}
}
// Preference order: CE, then NEON, then scalar.
private Sm3Strategy selectOptimalStrategy() {
if (isSm3CeAvailable()) return new Sm3CeStrategy();
else if (isNeonAvailable()) return new Sm3NeonStrategy();
else return new Sm3ScalarStrategy();
}
}
// Hashes one block: init, a single transform, then finalize, all delegated
// to the configured strategy.
// NOTE(review): the factory constructs this class with a strategy argument,
// but no constructor assigning `strategy` is shown here.
class Sm3SingleBlockAlgorithm extends Sm3Algorithm {
private Sm3Strategy strategy;
public void computeHash(u8[] output, u8[] input) {
u32[] state = new u32[8];
strategy.init(state);
strategy.transform(state, input);
strategy.finalize(state, output);
}
}
6.3 适配器模式在硬件抽象中的体现
// Adapter interface with three hardware-level hooks: block transform,
// message schedule and a single round function.
// NOTE(review): illustrative pseudocode (C++-style pure virtuals combined
// with Java-style array parameters).
class Sm3HardwareAdapter {
virtual void executeSm3Transform(u32[] state, u8[] message) = 0;
virtual void executeSm3MessageSchedule(u32[] w, u8[] message) = 0;
virtual void executeSm3RoundFunction(u32[] state, u32[] w, int round) = 0;
};
// Adapter that expresses each hook as a short inline-asm sequence using the
// SM3 mnemonics.
// NOTE(review): each sequence handles a single 16-byte vector and issues
// one instruction of each kind, so these read as sketches rather than a
// full SM3 compression -- confirm before reuse.
class Arm64Sm3CeAdapter implements Sm3HardwareAdapter {
// Loads 16 bytes from state and from message, runs sm3partw1 / sm3ss1 /
// sm3tt1a once, and stores the result back over state.
public void executeSm3Transform(u32[] state, u8[] message) {
asm volatile("ld1 {v0.16b}, [%0]\n\t" "ld1 {v1.4s}, [%1]\n\t" "sm3partw1 v2.4s, v0.4s, v1.4s\n\t" "sm3ss1 v3.4s, v0.4s, v2.4s, v1.4s\n\t" "sm3tt1a v4.4s, v3.4s, v1.4s, #0\n\t" "st1 {v4.16b}, [%0]\n\t" :: "r"(state), "r"(message) : "v0", "v1", "v2", "v3", "v4", "memory");
}
// Loads 16 bytes of message, applies sm3partw1 then sm3partw2, and stores
// 16 bytes to w.
public void executeSm3MessageSchedule(u32[] w, u8[] message) {
asm volatile("ld1 {v0.16b}, [%0]\n\t" "sm3partw1 v1.4s, v0.4s, v0.4s\n\t" "sm3partw2 v2.4s, v1.4s, v0.4s\n\t" "st1 {v2.16b}, [%1]\n\t" :: "r"(message), "r"(w) : "v0", "v1", "v2", "memory");
}
// NOTE(review): `round` is passed as an "r" register operand (%w2) where
// the sm3tt1a macro in this document takes an immediate -- confirm.
public void executeSm3RoundFunction(u32[] state, u32[] w, int round) {
asm volatile("ld1 {v0.4s}, [%0]\n\t" "ld1 {v1.4s}, [%1]\n\t" "sm3ss1 v2.4s, v0.4s, v1.4s, v0.4s\n\t" "sm3tt1a v3.4s, v2.4s, v1.4s, %w2\n\t" "st1 {v3.16b}, [%0]\n\t" :: "r"(state), "r"(w), "r"(round) : "v0", "v1", "v2", "v3", "memory");
}
}
// Front end that picks an adapter once at construction time and routes
// block processing through it.
class Sm3HardwareAbstraction {
private Sm3HardwareAdapter adapter;
// Preference order: ARM64 SM3 CE, then an x86 SHA-NI adapter, otherwise
// the software fallback.
// NOTE(review): X86ShaNiAdapter and SoftwareFallbackAdapter are not defined
// in the visible code.
public Sm3HardwareAbstraction() {
if (isArm64Sm3CeAvailable()) this.adapter = new Arm64Sm3CeAdapter();
else if (isX86ShaNiAvailable()) this.adapter = new X86ShaNiAdapter();
else this.adapter = new SoftwareFallbackAdapter();
}
// Runs the message schedule and then the transform for one block.
// NOTE(review): `w` is used here but never declared in this class or method.
public void processSm3Block(u32[] state, u8[] message) {
adapter.executeSm3MessageSchedule(w, message);
adapter.executeSm3Transform(state, message);
}
}
7. 状态机分析
初始状态 -> 消息填充 -> 分组处理 -> 轮函数执行 -> 状态更新 -> 输出摘要
↑ ↓
错误处理 <----------------------------------------------------------+
↑ ↓
硬件检测 <----------------------------------------------------------+
↑ ↓
参数验证 <----------------------------------------------------------+
8. 性能优化分析
8.1 硬件指令优化
/*
 * analyze_sm3_ce_instruction_performance - short timing run of
 * sm3_ce_transform().
 *
 * Calls the transform 64 times on an all-zero block, starting from the SM3
 * initial chaining value, logs the average wall-clock time per call and
 * then invokes the NEON and scalar comparison helpers.
 *
 * Changes from the previous version: state and message are initialised
 * (both were read uninitialised), and the figure is labelled as
 * nanoseconds per block -- each loop iteration is one whole-block
 * transform timed with ktime, not a single round measured in cycles.
 */
static void analyze_sm3_ce_instruction_performance(void)
{
	static const u32 sm3_iv[8] = {
		0x7380166f, 0x4914b2b9, 0x172442d7, 0xda8a0600,
		0xa96f30bc, 0x163138aa, 0xe38dee4d, 0xb0fb0e4e,
	};
	const int iterations = 64;
	u32 state[8];
	u8 message[64] = { 0 };
	ktime_t start, end;
	s64 duration, ns_per_block;
	int i;

	memcpy(state, sm3_iv, sizeof(state));

	start = ktime_get();
	for (i = 0; i < iterations; i++)
		sm3_ce_transform(state, message);
	end = ktime_get();

	duration = ktime_to_ns(ktime_sub(end, start));
	ns_per_block = duration / iterations;
	pr_info("SM3 CE instruction performance: %lld ns/block\n", ns_per_block);

	compare_with_neon_implementation();
	compare_with_scalar_implementation();
}
8.2 内存访问模式优化
/*
 * sm3_memory_optimized - compress whole blocks with software prefetching.
 * @state: chaining state passed to sm3_ce_transform()
 * @input: message bytes
 * @len:   number of message bytes
 *
 * Prefetches input + 64 and input + 128 up front, then processes groups of
 * four 64-byte blocks, prefetching 256 bytes ahead while at least one more
 * full group remains. Remaining whole blocks are processed singly; a tail
 * shorter than 64 bytes is left untouched.
 *
 * The parameter was previously written as "u32[] state", which is not
 * valid C; it is now a plain pointer.
 */
static void sm3_memory_optimized(u32 *state, const u8 *input, size_t len)
{
	__builtin_prefetch(input + 64);
	__builtin_prefetch(input + 128);

	while (len >= 256) {
		sm3_ce_transform(state, input);
		sm3_ce_transform(state, input + 64);
		sm3_ce_transform(state, input + 128);
		sm3_ce_transform(state, input + 192);
		input += 256;
		len -= 256;

		if (len >= 256)
			__builtin_prefetch(input + 256);
	}

	while (len >= 64) {
		sm3_ce_transform(state, input);
		input += 64;
		len -= 64;
	}
}
8.3 并行处理优化
/*
 * sm3_parallel_encrypt - hash independent chunks of @input on kthreads.
 * @output: result area; worker i writes at output + i * 8 (eight u32 words
 *          per worker), so it must hold 8 words per started worker
 * @input:  input bytes
 * @len:    number of input bytes
 *
 * One worker is started per KiB of input, capped at MAX_THREADS and with a
 * minimum of one; the last worker also receives the division remainder.
 *
 * Fixes over the previous version:
 *  - "u32[] output" is not valid C and is now a pointer;
 *  - inputs shorter than 1024 bytes made num_threads 0 and divided by zero;
 *  - kthread_run() failures were passed on to kthread_stop() unchecked.
 * IS_ERR()/PTR_ERR() come from <linux/err.h>, which <linux/kthread.h> is
 * expected to pull in.
 *
 * NOTE(review): kthread_stop() on a thread that has already exited needs a
 * held task reference -- confirm sm3_worker waits for kthread_should_stop().
 *
 * Return: 0 on success, or the negative errno from a failed kthread_run().
 */
static int sm3_parallel_encrypt(u32 *output, const u8 *input, size_t len)
{
	struct task_struct *threads[MAX_THREADS];
	struct sm3_work work[MAX_THREADS];
	size_t num_threads = len / 1024;
	size_t block_size;
	size_t started;
	size_t i;
	int ret = 0;

	if (len == 0)
		return 0;

	if (num_threads > MAX_THREADS)
		num_threads = MAX_THREADS;
	if (num_threads == 0)
		num_threads = 1;

	block_size = len / num_threads;

	for (started = 0; started < num_threads; started++) {
		i = started;
		work[i].output = output + i * 8;
		work[i].input = input + i * block_size;
		work[i].len = (i == num_threads - 1) ? len - i * block_size : block_size;

		threads[i] = kthread_run(sm3_worker, &work[i], "sm3/%d", (int)i);
		if (IS_ERR(threads[i])) {
			ret = PTR_ERR(threads[i]);
			break;
		}
	}

	/* Only threads that were actually created may be stopped. */
	for (i = 0; i < started; i++)
		kthread_stop(threads[i]);

	return ret;
}
9. 安全性考虑
9.1 侧信道攻击防护
/*
 * sm3_ce_constant_time - compress one block through scrubbed temporaries.
 * @output: receives the eight resulting state words (32 bytes)
 * @input:  message bytes
 * @len:    number of valid bytes at @input
 *
 * Copies up to one 64-byte block of @input into a zero-filled local buffer,
 * runs a single sm3_ce_transform() starting from the SM3 initial chaining
 * value, copies the state to @output and wipes both temporaries with
 * memzero_explicit(). No SM3 padding is applied, so the result is a raw
 * compression output rather than a message digest.
 *
 * Fixes over the previous version:
 *  - "u32[] output" is not valid C and is now a pointer;
 *  - 64 bytes were read from @input regardless of @len;
 *  - the chaining state handed to the transform was uninitialised.
 */
static void sm3_ce_constant_time(u32 *output, const u8 *input, size_t len)
{
	static const u32 sm3_iv[8] = {
		0x7380166f, 0x4914b2b9, 0x172442d7, 0xda8a0600,
		0xa96f30bc, 0x163138aa, 0xe38dee4d, 0xb0fb0e4e,
	};
	u8 temp_input[64] = { 0 };
	u32 temp_output[8];
	size_t copy_len = len < sizeof(temp_input) ? len : sizeof(temp_input);

	if (copy_len)
		memcpy(temp_input, input, copy_len);
	memcpy(temp_output, sm3_iv, sizeof(temp_output));

	sm3_ce_transform(temp_output, temp_input);
	memcpy(output, temp_output, sizeof(temp_output));

	/* Scrub the stack copies so message material does not linger. */
	memzero_explicit(temp_input, sizeof(temp_input));
	memzero_explicit(temp_output, sizeof(temp_output));
}
/*
 * Issues a "dc civac" on the address held in cache_line followed by a
 * "dsb ish" barrier.
 * NOTE(review): cache_line is not declared in this function or anywhere in
 * the visible code -- confirm where it comes from. Only one address is
 * operated on per call.
 */
static void sm3_ce_flush_cache(void) {
asm volatile("dc civac, %0" :: "r"(cache_line) : "memory");
asm volatile("dsb ish" :: : "memory");
}
9.2 密钥安全
/*
 * sm3_ce_secure_setkey - validate and install a key into @ctx.
 * @ctx:     context receiving the expanded key and its length
 * @key:     caller-owned key bytes (read-only)
 * @key_len: number of key bytes
 *
 * Rejects lengths outside [SM3_MIN_KEY_SIZE, SM3_MAX_KEY_SIZE], otherwise
 * expands the key into ctx->key and records the length.
 *
 * The previous version called memzero_explicit() on @key. That buffer is
 * const-qualified and owned by the caller, so wiping it here both discards
 * the qualifier and destroys the caller's data; scrubbing the source key is
 * left to its owner.
 *
 * Return: 0 on success, -EINVAL for an unsupported key length.
 */
static int sm3_ce_secure_setkey(struct sm3_ce_ctx *ctx, const u8 *key, unsigned int key_len)
{
	if (key_len < SM3_MIN_KEY_SIZE || key_len > SM3_MAX_KEY_SIZE)
		return -EINVAL;

	sm3_ce_constant_time_expandkey(ctx->key, key, key_len);
	ctx->key_length = key_len;

	return 0;
}
/*
 * sm3_ce_key_cleanup - drop the key held in @ctx.
 *
 * Resets the recorded key length to zero and wipes the whole ctx->key
 * storage with memzero_explicit().
 */
static void sm3_ce_key_cleanup(struct sm3_ce_ctx *ctx)
{
	const size_t key_storage = sizeof(ctx->key);

	ctx->key_length = 0;
	memzero_explicit(ctx->key, key_storage);
}
9.3 硬件故障检测
/*
 * sm3_ce_self_test - known-answer test for sm3_ce_transform().
 *
 * Compresses the single padded block for the message "abc" starting from
 * the SM3 initial chaining value and compares the resulting state with the
 * reference digest from the SM3 specification
 * (66c7f0f4 62eeedd9 d1f2d46b dc10e4e2 4167c487 5cf2f7a2 297da02b 8f4ba8e0).
 *
 * The previous version compared an uninitialised output buffer against an
 * all-zero placeholder and never looked at the transform result.
 *
 * NOTE(review): assumes sm3_ce_transform(state, block) compresses exactly
 * one 64-byte block and keeps state[] as host-order 32-bit words -- confirm
 * against its definition.
 *
 * Return: true when the transform reproduces the reference value.
 */
static bool sm3_ce_self_test(void)
{
	static const u32 sm3_iv[8] = {
		0x7380166f, 0x4914b2b9, 0x172442d7, 0xda8a0600,
		0xa96f30bc, 0x163138aa, 0xe38dee4d, 0xb0fb0e4e,
	};
	static const u32 expected_state[8] = {
		0x66c7f0f4, 0x62eeedd9, 0xd1f2d46b, 0xdc10e4e2,
		0x4167c487, 0x5cf2f7a2, 0x297da02b, 0x8f4ba8e0,
	};
	/* "abc", the 0x80 marker, zero fill, then the bit length (24). */
	u8 test_input[64] = { 'a', 'b', 'c', 0x80 };
	u32 test_state[8];

	test_input[63] = 0x18;
	memcpy(test_state, sm3_iv, sizeof(test_state));

	sm3_ce_transform(test_state, test_input);

	return memcmp(test_state, expected_state, sizeof(test_state)) == 0;
}
/*
 * Runs sm3_ce_transform() on input into temp, then again on temp into
 * output, and panics if the first four output words are all zero.
 *
 * NOTE(review):
 *  - "u32[] output" is not valid C parameter syntax;
 *  - temp and output are handed to the transform without being initialised
 *    in this function;
 *  - temp is 32 bytes, but other callers in this document pass 64-byte
 *    blocks as the second transform argument;
 *  - len is never used.
 */
static void sm3_ce_runtime_check(u32[] output, const u8 *input, size_t len) {
u32 temp[8];
sm3_ce_transform(temp, input);
sm3_ce_transform(output, (u8*)temp);
/* An all-zero leading half of the result is treated as a hardware fault. */
if (output[0] == 0 && output[1] == 0 && output[2] == 0 && output[3] == 0) {
pr_err("SM3 CE hardware fault detected\n");
panic("SM3 CE hardware failure");
}
}
10. 扩展性分析
10.1 新指令集支持
#ifdef CONFIG_ARM64_V9_SM3
/*
 * Variant compiled only when CONFIG_ARM64_V9_SM3 is set.
 * NOTE(review): the "sm3v9e" / "sm3v9tt" mnemonics are not ones I can match
 * to a published instruction -- confirm they exist. "u32[] output" is not
 * valid C, and the asm template never references its three input operands,
 * so nothing is loaded from input or stored to output.
 */
static void sm3_ce_v9_optimized(u32[] output, const u8 *input, size_t len) {
asm volatile("sm3v9e v0.16b, v1.16b, v2.16b\n\t"
"sm3v9tt v0.16b, v0.16b, v1.16b\n\t"
:: "r"(output), "r"(input), "r"(len) : "v0", "v1", "v2", "memory");
}
#endif
/*
 * sm3_ce_select_instruction_set - bind sm3_ce_function to a backend.
 *
 * Preference order: the v9 variant (only when it is compiled in and the
 * CPU reports ARM64_FEATURE_SM3_V9), then the standard SM3 CE routine,
 * otherwise the fallback.
 *
 * sm3_ce_v9_optimized is only defined under CONFIG_ARM64_V9_SM3, so the
 * reference to it is guarded by the same option; previously the function
 * referenced it unconditionally and could not build with the option off.
 */
static inline void sm3_ce_select_instruction_set(void)
{
#ifdef CONFIG_ARM64_V9_SM3
	if (cpu_have_feature(ARM64_FEATURE_SM3_V9)) {
		sm3_ce_function = sm3_ce_v9_optimized;
		return;
	}
#endif
	if (cpu_have_feature(ARM64_FEATURE_SM3))
		sm3_ce_function = sm3_ce_standard;
	else
		sm3_ce_function = sm3_ce_fallback;
}
10.2 多算法支持
/*
 * Generic operations table for a hash implementation: three void-returning
 * lifecycle hooks, an int-returning setkey hook, and the digest and block
 * sizes.
 * NOTE(review): sm3_ce_init/update/final elsewhere in this document take a
 * struct shash_desc * and return int, which does not match these
 * void (*)(void *...) signatures -- confirm which set is intended here.
 */
struct hash_algorithm_ops {
void (*init)(void* ctx);
void (*update)(void* ctx, const u8 *data, size_t len);
void (*final)(void* ctx, u8 *output);
int (*setkey)(void* ctx, const u8 *key, unsigned int keylen);
int digest_size;
int block_size;
};
/* Operations table binding the SM3 CE entry points and SM3 sizes. */
static const struct hash_algorithm_ops sm3_ce_ops = {
.init = sm3_ce_init,
.update = sm3_ce_update,
.final = sm3_ce_final,
.setkey = sm3_ce_setkey,
.digest_size = SM3_DIGEST_SIZE,
.block_size = SM3_BLOCK_SIZE,
};
/*
 * Operations table binding SHA-256 CE entry points and SHA-256 sizes.
 * NOTE(review): none of the sha256_ce_* functions are defined in the
 * visible code.
 */
static const struct hash_algorithm_ops sha256_ce_ops = {
.init = sha256_ce_init,
.update = sha256_ce_update,
.final = sha256_ce_final,
.setkey = sha256_ce_setkey,
.digest_size = SHA256_DIGEST_SIZE,
.block_size = SHA256_BLOCK_SIZE,
};
/*
 * Module init hook: registers sm3_ce_alg and returns the registration
 * result.
 * NOTE(review): the name sm3_ce_init is also used earlier in this document
 * for the shash .init callback with a different signature; the two cannot
 * coexist in one translation unit.
 */
static int __init sm3_ce_init(void) {
return crypto_register_shash(&sm3_ce_alg);
}
/* Module exit hook: unregisters sm3_ce_alg. */
static void __exit sm3_ce_exit(void) {
crypto_unregister_shash(&sm3_ce_alg);
}
10.3 性能监控和调优
/*
 * Counters maintained by sm3_ce_update_stats() and printed by
 * sm3_ce_print_stats(). All fields are 64-bit atomics.
 */
struct sm3_ce_stats {
/* Incremented once per sm3_ce_update_stats() call. */
atomic64_t hash_ops;
/* Running sum of the `cycles` argument. */
atomic64_t total_cycles;
/* Largest `cycles` value recorded. */
atomic64_t max_latency;
/* Smallest `cycles` value recorded; 0 is treated as "not yet set". */
atomic64_t min_latency;
/* Running sum of the `blocks` argument. */
atomic64_t parallel_blocks;
};
/* Single file-scope instance, zero-initialised as a static object. */
static struct sm3_ce_stats sm3_ce_stats;
/*
 * sm3_ce_update_stats - record one measured hash operation.
 * @cycles: measured cost of the operation
 * @blocks: number of blocks it covered
 *
 * Bumps the operation count, adds @cycles and @blocks to their running
 * totals, and folds @cycles into the max/min latency trackers. A stored
 * minimum of 0 means "no sample yet".
 *
 * The max/min updates previously used a separate read, compare and set, so
 * two concurrent callers could overwrite each other's extreme value. They
 * now retry with atomic64_try_cmpxchg(), which refreshes the observed
 * value on failure, until the stored value is at least as extreme as
 * @cycles.
 */
static void sm3_ce_update_stats(s64 cycles, size_t blocks)
{
	s64 seen;

	atomic64_inc(&sm3_ce_stats.hash_ops);
	atomic64_add(cycles, &sm3_ce_stats.total_cycles);
	atomic64_add(blocks, &sm3_ce_stats.parallel_blocks);

	seen = atomic64_read(&sm3_ce_stats.max_latency);
	while (cycles > seen &&
	       !atomic64_try_cmpxchg(&sm3_ce_stats.max_latency, &seen, cycles))
		;

	seen = atomic64_read(&sm3_ce_stats.min_latency);
	while ((seen == 0 || cycles < seen) &&
	       !atomic64_try_cmpxchg(&sm3_ce_stats.min_latency, &seen, cycles))
		;
}
static void sm3_ce_print_stats(void) {
pr_info("SM3 CE Stats:\n");
pr_info(" Hash ops: %lld\n", atomic64_read(&sm3_ce_stats.hash_ops));
pr_info(" Total cycles: %lld\n", atomic64_read(&sm3_ce_stats.total_cycles));
pr_info(" Max latency: %lld cycles\n", atomic64_read(&sm3_ce_stats.max_latency));
pr_info(" Min latency: %lld cycles\n", atomic64_read(&sm3_ce_stats.min_latency));
pr_info(" Parallel blocks: %lld\n", atomic64_read(&sm3_ce_stats.parallel_blocks));
}
11. 调试和维护
11.1 调试信息输出
/* Debug print helper: prefixes every message with "SM3 CE: ". */
#define SM3_CE_DEBUG(fmt, ...) pr_debug("SM3 CE: " fmt, ##__VA_ARGS__)

/*
 * Dump the state words, byte counter and finalized flag of a context.
 * The argument is parenthesised so that expressions such as "p + 1" or
 * "&arr[i]" expand correctly; previously "ctx->state" bound "->" to the
 * last token of the argument only. Note that ctx is evaluated three times.
 */
#define SM3_CE_DEBUG_STATE(ctx) do { \
	SM3_CE_DEBUG("state: %*ph\n", 32, (ctx)->state); \
	SM3_CE_DEBUG("count: %llu\n", (ctx)->count[0]); \
	SM3_CE_DEBUG("finalized: %d\n", (ctx)->finalized); \
} while (0)

/* Hex-dump one 64-byte block under the given label. */
#define SM3_CE_DEBUG_BLOCK(label, data) SM3_CE_DEBUG("%s: %*ph\n", label, 64, data)
#ifdef CONFIG_SM3_CE_DEBUG
/*
 * Debug wrapper around sm3_ce_transform(): logs the round label, the 32
 * state bytes and the 64 message bytes, runs the transform, then logs the
 * 32 state bytes again. `round` is only used in the log line.
 */
static void sm3_ce_debug_trace(u32 *state, const u8 *message, int round) {
SM3_CE_DEBUG("Round %d:\n", round);
SM3_CE_DEBUG(" State: %*ph\n", 32, state);
SM3_CE_DEBUG(" Message: %*ph\n", 64, message);
sm3_ce_transform(state, message);
/* state now holds the post-transform value. */
SM3_CE_DEBUG(" Output: %*ph\n", 32, state);
}
#endif
11.2 错误检测和恢复
/*
 * sm3_ce_validate_input - sanity-check an input buffer description.
 * @in:  input pointer
 * @len: input length in bytes
 *
 * Return: 0 when @in is non-NULL and @len is non-zero; -EINVAL otherwise
 * (a debug message names the failed check, the NULL check taking
 * precedence).
 */
static int sm3_ce_validate_input(const u8 *in, unsigned int len)
{
	int rc = 0;

	if (in == NULL) {
		SM3_CE_DEBUG("NULL input buffer\n");
		rc = -EINVAL;
	} else if (len == 0) {
		SM3_CE_DEBUG("Empty input\n");
		rc = -EINVAL;
	}

	return rc;
}
/*
 * sm3_ce_validate_context - sanity-check a context before use.
 * @ctx: context to inspect
 *
 * Checks are made in this order and the first failure wins: NULL pointer,
 * count[0] within 64 of ULLONG_MAX, and sm3_ce_key_corrupted() reporting
 * corruption. Each failure emits a debug message.
 *
 * Return: 0 when every check passes, -EINVAL otherwise.
 */
static int sm3_ce_validate_context(struct sm3_ce_ctx *ctx)
{
	int rc = -EINVAL;

	if (ctx == NULL) {
		SM3_CE_DEBUG("NULL context\n");
	} else if (ctx->count[0] > ULLONG_MAX - 64) {
		SM3_CE_DEBUG("Counter overflow\n");
	} else if (sm3_ce_key_corrupted(ctx)) {
		SM3_CE_DEBUG("Key corruption detected\n");
	} else {
		rc = 0;
	}

	return rc;
}
static void sm3_ce_recover_from_error(struct sm3_ce_ctx* ctx) {
if (sm3_ce_can_recover(ctx)) {
sm3_ce_reinitialize_context(ctx);
SM3_CE_DEBUG("Context recovered from error\n");
} else {
ctx->finalized = true;
SM3_CE_DEBUG("Context reset due to unrecoverable error\n");
}
}
12. 未来发展方向
随着 ARM64 架构和密码学技术的发展,SM3 哈希子模块将:
- 支持新指令集:ARMv9 的新密码学指令和扩展
- 增强安全性:更强的侧信道攻击防护和故障检测
- 性能优化:利用新的微架构特性提升性能
- 多算法支持:扩展支持 SHA3、SM9 等其他算法
- 智能调度:根据负载和功耗动态选择最优实现
13. 总结
ARM64 crypto SM3 哈希子模块作为 ARM64 加密子系统的国密算法组件,通过专门的硬件指令和 SIMD 优化实现了高效的 SM3 哈希计算。该模块利用 ARMv8.2 Crypto Extensions 的专用 SM3 指令,在保持算法正确性的同时实现了比软件实现高数倍的性能提升。源码分析显示,模块采用了策略模式、工厂模式和适配器模式等多种设计模式,为不同硬件平台提供了统一的高性能 SM3 实现,是 ARM64 平台国密算法支持的关键技术。
微信扫一扫,关注极客日志
微信公众号「极客日志」,在微信中扫描左侧二维码关注。展示文案:极客日志 zeeklog
相关免费在线工具
- 加密/解密文本
使用加密算法(如AES、TripleDES、Rabbit或RC4)加密和解密文本明文。 在线工具,加密/解密文本在线工具,online
- Base64 字符串编码/解码
将字符串编码和解码为其 Base64 格式表示形式即可。 在线工具,Base64 字符串编码/解码在线工具,online
- Base64 文件转换器
将字符串、文件或图像转换为其 Base64 表示形式。 在线工具,Base64 文件转换器在线工具,online
- Markdown转HTML
将 Markdown(GFM)转为 HTML 片段,浏览器内 marked 解析;与 HTML转Markdown 互为补充。 在线工具,Markdown转HTML在线工具,online
- HTML转Markdown
将 HTML 片段转为 GitHub Flavored Markdown,支持标题、列表、链接、代码块与表格等;浏览器内处理,可链接预填。 在线工具,HTML转Markdown在线工具,online
- JSON 压缩
通过删除不必要的空白来缩小和压缩JSON。 在线工具,JSON 压缩在线工具,online