1. 背景介绍
在 STM32 设备 Crash 的时候,触发硬件错误中断,在中断向量表的头部有如下几个中断:
- HardFault_Handler
- MemManage_Handler
- BusFault_Handler
- UsageFault_Handler
关于每个 Fault_Handler 的具体说明请参考文末相关资料;一些 RTOS 都会定义好一些 FaultHandler,以便在设备崩溃的时候,能够打印出一些进程上下文信息,随后 reset 设备。
另外, 除了 HardFault_Handler 默认启用外,其他几个 Handler 需要用户手动打开; HardFault 触发逻辑有以下几个,根据 SCB->HFSR 寄存器的值分为:
-
FORCED:MemManage Fault、 Bus Fault、Usage Fault 未启用,则这几个错误会由 HardFault_Handler 处理; 或者前面几个 Handler 内再次出现异常,则触发 HardFault
-
DEBUGEVT: debug 子系统未启用时发生了调试事件(如执行到断点指令 BKPT 等)
-
VECTTBL: 异常处理的时候,从中断向量表中读到错误的地址
图1. HFSR 寄存器的 bit 定义
而在发生可配置的错误中断时,具体错误的详细信息保存在 SCB->CFSR 寄存器中,其定义如下:
图2. CFSR 寄存器及其内部三个寄存器值(UFSR, BFSR, MFSR)的 bit 定义
在异常处理逻辑中,我们可以得到设备在异常发生时的上下文,包括默认压入栈的 r0, r1, r2, r3, r12, lr, pc, psr这 8 个寄存器,还可手动保存 r4~r11 这8个寄存器,最后还有部分错误状态寄存器;
一般情况下设备在异常处理之后需要 reset 重置来恢复运行,但是我们可以做一些有趣的事情:
-
“可恢复”的错误:在多线程/任务的 RTOS 系统环境下,如果是某个用户线程触发了“可恢复”的错误,我们可以保存进程上下文,然后调用 RTOS 的一些函数,清理掉这个线程,并适时重启该线程;
-
“不可恢复”的错误:如果是 RTOS 内核或者其他原因导致的不可恢复错误,我们可以将关键的寄存器信息保存在 Backup Registers/SRAM 中(STM32 Backup Registers/SRAM 读写);
就算设备未通过电池连接 VBAT 脚,但是设备 reset 过程并不会断电,因此 RTC 和备份域中的资源可以在reset过程中得以保存。
借此,我们可以提供以下一些能力:
-
研发同学可以获取设备异常发生时的上下文,直接定位到函数位置,不再需要物理连接设备复现异常,对于远程设备故障的排查十分方便
-
在攻击者尝试一些攻击行为时,比如失败的缓冲区溢出,大概率会导致设备出现异常崩溃; 我们可以第一时间发现并解决安全隐患,甚至在攻击者成功利用之前完成漏洞的修复
(配合严格的 MPU 策略更佳)
-
发生可恢复的错误时,通知 RTOS 内核重启部分进程/任务,而不影响设备其他功能的运行,减少宕机时间
(尤其在一些关键任务的执行过程中)
2. 方案实现
模仿 RTT 代码中 HardFault_Handler 的汇编实现,为三个可配置的异常处理编写处理函数,将关键寄存器信息保存在 Backup Registers 中;
在 Handler 函数中尽量不破坏原进程上下文,最后跳转 RTT 默认的 HardFault_Handler 以保留其崩溃信息 dump 的基本功能;
-
LL 库 + GCC
-
sec.h(stm32f4xx)
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146#include <stdio.h>#include <core_cm4.h>#include <stm32f401xc.h>#include <stm32f4xx_ll_rtc.h>// 示例 main 函数int main(void){// enable 可配置错误中断enable_fault_handlers();// 检查 backup 寄存器是否有保存错误信息read_last_fault();// 两个触发异常的测试函数test_usagefault();test_memmanagefault();}// 读写备份域寄存器,需要提前正确配置硬件 RTCuint32_t read_backup_reg(uint32_t index){return LL_RTC_BAK_GetRegister(RTC, LL_RTC_BKP_DR0 + index);}void write_backup_reg(uint32_t index, uint32_t data){LL_RTC_BAK_SetRegister(RTC, LL_RTC_BKP_DR0 + index, data);return;}// 清空 Backup 寄存器void clear_backup_reg(uint32_t index){// if index == 0xff: clear all backup registersif (index == 0xff){for(uint8_t i=0; i<20; i++)write_backup_reg(i, 0);}else {write_backup_reg(index, 0);}}// 设备开机后运行,检查上次 reset 之前是否有错误上下文void read_last_fault(){if(read_backup_reg(0) == 0 ){printf("No faults since last reset\n");}else{// for(uint8_t i=0; i<20; i++)// printf("DR%02d:\t0x%08x\n", i, HAL_RTCEx_BKUPRead(&RTC_Handler, RTC_BKP_DR0 + i));printf("pc: 0x%08x\n""lr: 0x%08x\n""psr: 0x%08x\n",read_backup_reg(8),read_backup_reg(7),read_backup_reg(9));// r0-r3for(int i=0; i<4; i++)printf("r%02d: 0x%08x\n", i, read_backup_reg(i+2));// r4-r11for(int i=0; i<8; i++)printf("r%02d: 0x%08x\n", i+4, read_backup_reg(i+8+2));// r12printf("r%02d: 0x%08x\n", 12, read_backup_reg(6));// CFSR, EXC_RETURN and EMMFAR or BFARprintf("CFSR: 0x%08x\n""EXC_RET: 0x%08x\n""MFAR/BFAR: 0x%08x\n",read_backup_reg(0),read_backup_reg(1),read_backup_reg(18));clear_backup_reg(0xff);}}// 启用 3 个可配置错误中断,需要在设备启动后尽早执行void enable_fault_handlers(){SCB->SHCSR |= SCB_SHCSR_MEMFAULTENA_Msk;SCB->SHCSR |= SCB_SHCSR_BUSFAULTENA_Msk;SCB->SHCSR |= SCB_SHCSR_USGFAULTENA_Msk;// enable divided 
by zero faultSCB->CCR |= SCB_CCR_DIV_0_TRP_Msk;// enable unalign access faultSCB->CCR |= SCB_CCR_UNALIGN_TRP_Msk;}// 测试 UsageFaultvoid test_usagefault(){int a = 1;a = a / (a - 1);}// 测试 MemManageFaultvoid test_memmanagefault(){int (*bad_fun)(void) = (void *) 0xE1234567;bad_fun();}// 异常发生时保存寄存器值void store_fault_context(uint32_t sp_addr){uint32_t *data = (uint32_t *)sp_addr;// first, CFSR: MFSR(8bits) + BFSR(8bits) + UFSR(16bits)write_backup_reg(0, SCB->CFSR);// then value of 'EXC_RETURN'write_backup_reg(1, data[0]);// r0, r1, r2, r3, r12, lr, pc, psrfor(int i=1; i<9; i++)write_backup_reg(i+1, data[i+8]);// r4 - r11for(int i=9; i<17; i++)write_backup_reg(i+1, data[i-8]);// lastly, it's MMFAR if MemManage fault happens, BFAR if BusFault happensif( *((uint8_t *)(&SCB->CFSR)) & (1<<7) )write_backup_reg(18, SCB->MMFAR);else if( *((uint8_t *)(&SCB->CFSR + 1)) & (1<<7) )write_backup_reg(18, SCB->BFAR);elsewrite_backup_reg(18, 0xdeadbeef);}// 几个异常处理入口函数#define ASM_FaultHandler \"TST lr, #0x04 \n" \"ITE eq \n" \"MRSEQ r0, msp \n" \"MRSNE r0, psp \n" \"STMFD r0!, {r4 - r11} \n" \"STMFD r0!, {lr} \n" \"PUSH {lr} \n" \"BL store_fault_context \n" \"POP {lr} \n" \"B HardFault_Handler \n"__attribute__((optimize("O3")))void MemManage_Handler(){__asm volatile(ASM_FaultHandler);}__attribute__((optimize("O3")))void BusFault_Handler(){__asm volatile(ASM_FaultHandler);}__attribute__((optimize("O3")))void UsageFault_Handler(){__asm volatile(ASM_FaultHandler);}
-
-
RT-Thread + HAL 库 + GCC ( RT-Studio 环境 )
-
sec.h(stm32f4xx)
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150#include <stdio.h>#include <core_cm4.h>#include <stm32f401xc.h>#include <stm32f4xx_ll_rtc.h>// 示例 main 函数int main(void){// enable 可配置错误中断enable_fault_handlers();// 检查 backup 寄存器是否有保存错误信息read_last_fault();// 两个触发异常的测试函数test_usagefault();test_memmanagefault();}// 读写备份域寄存器,需要提前正确配置硬件 RTC, 可参考下方4个步骤(RTT V3.1.4 实测)extern RTC_HandleTypeDef RTC_Handler;void write_backup_reg(uint32_t index, uint32_t data){// 1. enable RTC for RT-Thread, at rtconfig.h: #define RT_USING_RTC// 2. enable RTC for BSP, at drivers/board.h: #define BSP_USING_ONCHIP_RTC and #define BSP_RTC_USING_LSI// 3. enable RTC for HAL, at drivers/stm32f4xx_hal_conf.h: #define HAL_RTC_MODULE_ENABLED// 4. 
export 'RTC_Handler' from drivers/drv_rtc.c, by removing the 'static' declaration for 'RTC_Handler'HAL_RTCEx_BKUPWrite(&RTC_Handler, RTC_BKP_DR0 + index, data);return;}// 清空 Backup 寄存器void clear_backup_reg(uint32_t index){// if index == 0xff: clear all backup registersif (index == 0xff){for(uint8_t i=0; i<20; i++)HAL_RTCEx_BKUPWrite(&RTC_Handler, RTC_BKP_DR0 + i, 0);}else {HAL_RTCEx_BKUPWrite(&RTC_Handler, RTC_BKP_DR0 + index, 0);}}// 设备开机后运行,检查上次 reset 之前是否有错误上下文void read_last_fault(){if(HAL_RTCEx_BKUPRead(&RTC_Handler, RTC_BKP_DR0) == 0 ){rt_kprintf("No faults since last reset\n");}else{// for(uint8_t i=0; i<20; i++)// rt_kprintf("DR%02d:\t0x%08x\n", i, HAL_RTCEx_BKUPRead(&RTC_Handler, RTC_BKP_DR0 + i));rt_kprintf("pc: 0x%08x\n""lr: 0x%08x\n""psr: 0x%08x\n",HAL_RTCEx_BKUPRead(&RTC_Handler, 8),HAL_RTCEx_BKUPRead(&RTC_Handler, 7),HAL_RTCEx_BKUPRead(&RTC_Handler, 9));// r0-r3for(int i=0; i<4; i++)rt_kprintf("r%02d: 0x%08x\n", i, HAL_RTCEx_BKUPRead(&RTC_Handler, i+2));// r4-r11for(int i=0; i<8; i++)rt_kprintf("r%02d: 0x%08x\n", i+4, HAL_RTCEx_BKUPRead(&RTC_Handler, i+8+2));// r12rt_kprintf("r%02d: 0x%08x\n", 12, HAL_RTCEx_BKUPRead(&RTC_Handler, 6));// CFSR, EXC_RETURN and EMMFAR or BFARrt_kprintf("CFSR: 0x%08x\n""EXC_RET: 0x%08x\n""MFAR/BFAR: 0x%08x\n",HAL_RTCEx_BKUPRead(&RTC_Handler, 0),HAL_RTCEx_BKUPRead(&RTC_Handler, 1),HAL_RTCEx_BKUPRead(&RTC_Handler, 18));clear_backup_reg(0xff);}}// 启用 3 个可配置错误中断,需要在设备启动后尽早执行void enable_fault_handlers(){SCB->SHCSR |= SCB_SHCSR_MEMFAULTENA_Msk;SCB->SHCSR |= SCB_SHCSR_BUSFAULTENA_Msk;SCB->SHCSR |= SCB_SHCSR_USGFAULTENA_Msk;// enable divided by zero faultSCB->CCR |= SCB_CCR_DIV_0_TRP_Msk;// enable unalign access faultSCB->CCR |= SCB_CCR_UNALIGN_TRP_Msk;}// 测试 UsageFaultvoid test_usagefault(){int a = 1;a = a / (a - 1);}// 测试 MemManageFaultvoid test_memmanagefault(){int (*bad_fun)(void) = (void *) 0xE1234567;bad_fun();}// 异常发生时保存寄存器值void store_fault_context(uint32_t sp_addr){uint32_t *data = (uint32_t *)sp_addr;// first, the fault 
type: 1: MemManage, 2: BusFault, 3: UsageFault// write_backup_reg(0, fault_type);// first, CFSR: MFSR(8bits) + BFSR(8bits) + UFSR(16bits)write_backup_reg(0, SCB->CFSR);// then value of 'EXC_RETURN'write_backup_reg(1, data[0]);// r0, r1, r2, r3, r12, lr, pc, psrfor(int i=1; i<9; i++)write_backup_reg(i+1, data[i+8]);// r4 - r11for(int i=9; i<17; i++)write_backup_reg(i+1, data[i-8]);// lastly, it's MMFAR if MemManage fault happens, BFAR if BusFault happensif( *((uint8_t *)(&SCB->CFSR)) & (1<<7) )write_backup_reg(18, SCB->MMFAR);else if( *((uint8_t *)(&SCB->CFSR + 1)) & (1<<7) )write_backup_reg(18, SCB->BFAR);elsewrite_backup_reg(18, 0xdeadbeef);}// 几个异常处理入口函数#define ASM_FaultHandler \"TST lr, #0x04 \n" \"ITE eq \n" \"MRSEQ r0, msp \n" \"MRSNE r0, psp \n" \"STMFD r0!, {r4 - r11} \n" \"STMFD r0!, {lr} \n" \"PUSH {lr} \n" \"BL store_fault_context \n" \"POP {lr} \n" \"B HardFault_Handler \n"__attribute__((optimize("O3")))void MemManage_Handler(){__asm volatile(ASM_FaultHandler);}__attribute__((optimize("O3")))void BusFault_Handler(){__asm volatile(ASM_FaultHandler);}__attribute__((optimize("O3")))void UsageFault_Handler(){__asm volatile(ASM_FaultHandler);}
-
-
RT-Thread + LL 库 + ARM CC (Keil uVision 5 环境)
由于 ARM CC 5 对内联汇编的支持有限,多种写法都会导致编译错误,因此没有用汇编重写 Handler,而是直接修改 cpuport.c 中的 rt_hw_hard_fault_exception() 函数来保存上下文信息
-
sec.h(stm32f1xx)
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869#include <stm32f1xx_ll_rtc.h>// 读写备份域寄存器,需要提前正确配置硬件 RTCuint32_t read_backup_reg(uint32_t index){return LL_RTC_BKP_GetRegister(NULL, LL_RTC_BKP_DR1 + index);}void write_backup_reg(uint32_t index, uint32_t data){LL_RTC_BKP_SetRegister(NULL, LL_RTC_BKP_DR1 + index, data);return;}// 清空 Backup 寄存器void clear_backup_reg(uint32_t index){// if index == 0xff: clear all backup registersif (index == 0xff){for(uint8_t i=0; i<20; i++)write_backup_reg(i, 0);}else {write_backup_reg(index, 0);}}// 设备开机后运行,检查上次 reset 之前是否有错误上下文void read_last_fault(){if(read_backup_reg(0) == 0 ){rt_kprintf("No faults since last reset\n");}else{rt_kprintf("pc: 0x%08x\n""lr: 0x%08x\n""psr: 0x%08x\n",read_backup_reg(8),read_backup_reg(7),read_backup_reg(9));// r0-r3for(int i=0; i<4; i++)rt_kprintf("r%02d: 0x%08x\n", i, read_backup_reg(i+2));// r4-r11for(int i=0; i<8; i++)rt_kprintf("r%02d: 0x%08x\n", i+4, read_backup_reg(i+8+2));// r12rt_kprintf("r%02d: 0x%08x\n", 12, read_backup_reg(6));// CFSR, EXC_RETURN and EMMFAR or BFARrt_kprintf("CFSR: 0x%08x\n""EXC_RET: 0x%08x\n""MFAR/BFAR: 0x%08x\n",read_backup_reg(0),read_backup_reg(1),read_backup_reg(18));clear_backup_reg(0xff);}}// 测试 UsageFaultvoid test_usagefault(){// enable divided by zero faultSCB->CCR |= SCB_CCR_DIV_0_TRP_Msk;int a = 1;a = a / (a - 1);}// 测试 MemManageFaultvoid test_memmanagefault(){// enable unalign access faultSCB->CCR |= SCB_CCR_UNALIGN_TRP_Msk;int (*bad_fun)(void) = (void *) 0xE1234567;bad_fun();} -
cpuport.c(stm32f1xx)
1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950#include <stm32f1xx_ll_rtc.h>// 读写备份域寄存器,需要提前正确配置硬件 RTCvoid write_backup_reg(uint32_t index, uint32_t data){LL_RTC_BKP_SetRegister(NULL, LL_RTC_BKP_DR1 + index, data);return;}// 异常发生时保存寄存器值void store_fault_context(struct exception_info * exception_info){struct stack_frame* context = &exception_info->stack_frame;// first, CFSR: MFSR(8bits) + BFSR(8bits) + UFSR(16bits)write_backup_reg(0, SCB_CFSR);// then value of 'EXC_RETURN'write_backup_reg(1, exception_info->exc_return);// r0, r1, r2, r3, r12, lr, pc, psrwrite_backup_reg(2, context->exception_stack_frame.r0);write_backup_reg(3, context->exception_stack_frame.r1);write_backup_reg(4, context->exception_stack_frame.r2);write_backup_reg(5, context->exception_stack_frame.r3);write_backup_reg(6, context->exception_stack_frame.r12);write_backup_reg(7, context->exception_stack_frame.lr);write_backup_reg(8, context->exception_stack_frame.pc);write_backup_reg(9, context->exception_stack_frame.psr);// r4 - r11write_backup_reg(10, context->r4);write_backup_reg(11, context->r5);write_backup_reg(12, context->r6);write_backup_reg(13, context->r7);write_backup_reg(14, context->r8);write_backup_reg(15, context->r9);write_backup_reg(16, context->r10);write_backup_reg(17, context->r11);// lastly, it's MMFAR if MemManage fault happens, BFAR if BusFault happensif( *((uint8_t *)(&SCB_CFSR)) & (1<<7) )write_backup_reg(18, SCB_MMAR);else if( *((uint8_t *)(&SCB_CFSR + 1)) & (1<<7) )write_backup_reg(18, SCB_BFAR);elsewrite_backup_reg(18, 0xdeadbeef);}// patch rt_hw_hard_fault_exception 函数,在函数开始处加上一行void rt_hw_hard_fault_exception(struct exception_info * exception_info){store_fault_context(exception_info);......}
-
3. 注意事项
-
不同 RTOS 对于 msp、psp 的使用方式可能不一致(通常情况下 msp 用作 Handler 模式(异常处理)和内核的栈, psp 用作业务线程的栈)
-
以上方案只保存了 MemManage Fault、BusFault、UsageFault 三类可配置异常的上下文,未对 HardFault 进行处理(以最大程度保持对RTOS自身 crash 处理逻辑的兼容)