diff --git a/src/Crypto/Aes_hw_cpu.asm b/src/Crypto/Aes_hw_cpu.asm
index c8df89f3..5eafb7d3 100644
--- a/src/Crypto/Aes_hw_cpu.asm
+++ b/src/Crypto/Aes_hw_cpu.asm
@@ -68,36 +68,6 @@
 %endmacro
 
 
-%macro push_xmm 2
-    sub rsp, 16 * (%2 - %1 + 1)
-
-    %assign stackoffset 0
-    %assign regnumber %1
-
-    %rep (%2 - %1 + 1)
-        movdqu [rsp + 16 * stackoffset], xmm%[regnumber]
-
-        %assign stackoffset stackoffset+1
-        %assign regnumber regnumber+1
-    %endrep
-%endmacro
-
-
-%macro pop_xmm 2
-    %assign stackoffset 0
-    %assign regnumber %1
-
-    %rep (%2 - %1 + 1)
-        movdqu xmm%[regnumber], [rsp + 16 * stackoffset]
-
-        %assign stackoffset stackoffset+1
-        %assign regnumber regnumber+1
-    %endrep
-
-    add rsp, 16 * (%2 - %1 + 1)
-%endmacro
-
-
 %macro aes_hw_cpu 2
     %define OPERATION %1
     %define BLOCK_COUNT %2
@@ -145,8 +115,9 @@
 %endmacro
 
 
-%macro aes_hw_cpu_32_blocks 1
-    %define OPERATION_32_BLOCKS %1
+%macro aes_hw_cpu_32_blocks 2
+    %define AES_HW_CPU_32_BLOCKS_NAME %1
+    %define OPERATION_32_BLOCKS %2
 
     %ifidn __BITS__, 64
         %define MAX_REG_BLOCK_COUNT 15
@@ -156,7 +127,29 @@
 
     %ifidn __OUTPUT_FORMAT__, win64
         %if MAX_REG_BLOCK_COUNT > 5
-            push_xmm 6, MAX_REG_BLOCK_COUNT
+            sub rsp, 16 * (MAX_REG_BLOCK_COUNT - 6 + 1) + 8    ; 168 bytes when MAX_REG_BLOCK_COUNT == 15: ten 16-byte xmm slots + 8 spare bytes
+AES_HW_CPU_32_BLOCKS_NAME %+ _alloc_end:    ; each *_end label marks a prolog offset read by win64_aesni_32_unwind_info
+            movdqu [rsp + 16 * 0], xmm6    ; NOTE(review): the xmm6..xmm15 saves are fixed; they fit the allocation only if MAX_REG_BLOCK_COUNT == 15
+AES_HW_CPU_32_BLOCKS_NAME %+ _save_xmm6_end:
+            movdqu [rsp + 16 * 1], xmm7
+AES_HW_CPU_32_BLOCKS_NAME %+ _save_xmm7_end:
+            movdqu [rsp + 16 * 2], xmm8
+AES_HW_CPU_32_BLOCKS_NAME %+ _save_xmm8_end:
+            movdqu [rsp + 16 * 3], xmm9
+AES_HW_CPU_32_BLOCKS_NAME %+ _save_xmm9_end:
+            movdqu [rsp + 16 * 4], xmm10
+AES_HW_CPU_32_BLOCKS_NAME %+ _save_xmm10_end:
+            movdqu [rsp + 16 * 5], xmm11
+AES_HW_CPU_32_BLOCKS_NAME %+ _save_xmm11_end:
+            movdqu [rsp + 16 * 6], xmm12
+AES_HW_CPU_32_BLOCKS_NAME %+ _save_xmm12_end:
+            movdqu [rsp + 16 * 7], xmm13
+AES_HW_CPU_32_BLOCKS_NAME %+ _save_xmm13_end:
+            movdqu [rsp + 16 * 8], xmm14
+AES_HW_CPU_32_BLOCKS_NAME %+ _save_xmm14_end:
+            movdqu [rsp + 16 * 9], xmm15
+AES_HW_CPU_32_BLOCKS_NAME %+ _save_xmm15_end:
+AES_HW_CPU_32_BLOCKS_NAME %+ _prolog_end:    ; end of prologue; the unwind table stores this offset as the prolog size
         %endif
     %endif
 
@@ -174,15 +167,80 @@
 
     %ifidn __OUTPUT_FORMAT__, win64
         %if MAX_REG_BLOCK_COUNT > 5
-            pop_xmm 6, MAX_REG_BLOCK_COUNT
+            movdqu xmm6, [rsp + 16 * 0]    ; reload each register from the slot the prologue stored it in
+            movdqu xmm7, [rsp + 16 * 1]
+            movdqu xmm8, [rsp + 16 * 2]
+            movdqu xmm9, [rsp + 16 * 3]
+            movdqu xmm10, [rsp + 16 * 4]
+            movdqu xmm11, [rsp + 16 * 5]
+            movdqu xmm12, [rsp + 16 * 6]
+            movdqu xmm13, [rsp + 16 * 7]
+            movdqu xmm14, [rsp + 16 * 8]
+            movdqu xmm15, [rsp + 16 * 9]
+            add rsp, 16 * (MAX_REG_BLOCK_COUNT - 6 + 1) + 8    ; release exactly what the prologue's "sub rsp" reserved
         %endif
     %endif
 
     %undef OPERATION_32_BLOCKS
+    %undef AES_HW_CPU_32_BLOCKS_NAME
     %undef MAX_REG_BLOCK_COUNT
 %endmacro
 
 
+; Win64 unwind metadata for the 32-block AES-NI routines.
+;
+; The records below are hand-encoded and must stay in exact lockstep with the
+; prologue emitted by aes_hw_cpu_32_blocks: the unwind codes describe the "sub
+; rsp" allocation followed by the xmm6..xmm15 saves, listed in descending prolog
+; offset order. The slot count (22 = 10 SAVE_XMM128 pairs + 1 ALLOC_LARGE pair)
+; and the recorded allocation size are therefore fixed for the win64 /
+; MAX_REG_BLOCK_COUNT == 15 layout. If that saved-register range or the
+; allocation ever changes, update the prologue and this table together; a
+; mismatch makes the OS unwinder mis-restore the caller's context.
+
+%macro win64_aesni_32_unwind_info 2
+%ifidn __OUTPUT_FORMAT__, win64
+    section .pdata rdata align=4
+    align 4
+    dd %1 wrt ..imagebase    ; %1 = function entry label (image-relative)
+    dd %2 wrt ..imagebase    ; %2 = label placed after the function's last instruction
+    dd %1 %+ _unwind_info wrt ..imagebase    ; image-relative address of the record emitted below
+
+    section .xdata rdata align=8
+    align 4
+%1 %+ _unwind_info:
+    db 1    ; version 1, no flags
+    db %1 %+ _prolog_end - %1    ; prolog size: bytes from entry to <name>_prolog_end
+    db 22    ; unwind code slots (10 two-slot xmm saves + 1 two-slot allocation)
+    db 0    ; no frame register
+    db %1 %+ _save_xmm15_end - %1, (15 << 4) | 8    ; UWOP_SAVE_XMM128 xmm15
+    dw 9    ; slot index: prologue stores xmm15 at [rsp + 16 * 9]
+    db %1 %+ _save_xmm14_end - %1, (14 << 4) | 8    ; UWOP_SAVE_XMM128 xmm14
+    dw 8    ; [rsp + 16 * 8]
+    db %1 %+ _save_xmm13_end - %1, (13 << 4) | 8    ; UWOP_SAVE_XMM128 xmm13
+    dw 7    ; [rsp + 16 * 7]
+    db %1 %+ _save_xmm12_end - %1, (12 << 4) | 8    ; UWOP_SAVE_XMM128 xmm12
+    dw 6    ; [rsp + 16 * 6]
+    db %1 %+ _save_xmm11_end - %1, (11 << 4) | 8    ; UWOP_SAVE_XMM128 xmm11
+    dw 5    ; [rsp + 16 * 5]
+    db %1 %+ _save_xmm10_end - %1, (10 << 4) | 8    ; UWOP_SAVE_XMM128 xmm10
+    dw 4    ; [rsp + 16 * 4]
+    db %1 %+ _save_xmm9_end - %1, (9 << 4) | 8    ; UWOP_SAVE_XMM128 xmm9
+    dw 3    ; [rsp + 16 * 3]
+    db %1 %+ _save_xmm8_end - %1, (8 << 4) | 8    ; UWOP_SAVE_XMM128 xmm8
+    dw 2    ; [rsp + 16 * 2]
+    db %1 %+ _save_xmm7_end - %1, (7 << 4) | 8    ; UWOP_SAVE_XMM128 xmm7
+    dw 1    ; [rsp + 16 * 1]
+    db %1 %+ _save_xmm6_end - %1, (6 << 4) | 8    ; UWOP_SAVE_XMM128 xmm6
+    dw 0    ; [rsp + 16 * 0]
+    db %1 %+ _alloc_end - %1, 1    ; UWOP_ALLOC_LARGE, size in the next slot
+    dw (16 * (15 - 6 + 1) + 8) / 8    ; 168-byte allocation expressed in 8-byte units (= 21)
+
+    section .text    ; switch back to the code section for whatever follows the macro
+%endif
+%endmacro
+
+
 %ifidn __BITS__, 16
 
     USE16
@@ -312,8 +370,10 @@
 ; void aes_hw_cpu_decrypt_32_blocks (const byte *ks, byte *data);
 
     aes_function_entry aes_hw_cpu_decrypt_32_blocks
-    aes_hw_cpu_32_blocks dec
+    aes_hw_cpu_32_blocks aes_hw_cpu_decrypt_32_blocks, dec
     aes_function_exit
+aes_hw_cpu_decrypt_32_blocks_end:    ; end-of-function label handed to the unwind macro
+    win64_aesni_32_unwind_info aes_hw_cpu_decrypt_32_blocks, aes_hw_cpu_decrypt_32_blocks_end
 
 
 ; void aes_hw_cpu_encrypt (const byte *ks, byte *data);
@@ -326,8 +386,10 @@
 ; void aes_hw_cpu_encrypt_32_blocks (const byte *ks, byte *data);
 
     aes_function_entry aes_hw_cpu_encrypt_32_blocks
-    aes_hw_cpu_32_blocks enc
+    aes_hw_cpu_32_blocks aes_hw_cpu_encrypt_32_blocks, enc
     aes_function_exit
+aes_hw_cpu_encrypt_32_blocks_end:    ; end-of-function label handed to the unwind macro
+    win64_aesni_32_unwind_info aes_hw_cpu_encrypt_32_blocks, aes_hw_cpu_encrypt_32_blocks_end
 
 
 %endif ; __BITS__ != 16
diff --git a/src/Crypto/Aes_x64.asm b/src/Crypto/Aes_x64.asm
index 65965af1..628e0f2e 100644
--- a/src/Crypto/Aes_x64.asm
+++ b/src/Crypto/Aes_x64.asm
@@ -55,8 +55,8 @@
 ; The default convention is that for windows, the gnu/linux convention being
 ; used if __GNUC__ is defined.
 ;
-; Define _SEH_ to include support for Win64 structured exception handling
-; (this requires YASM version 0.6 or later).
+; Win64 unwind metadata is emitted explicitly in .pdata/.xdata when this file
+; is assembled with the win64 output format (__OUTPUT_FORMAT__ == win64).
 ;
 ; This code provides the standard AES block size (128 bits, 16 bytes) and the
 ; three standard AES key sizes (128, 192 and 256 bits). It has the same call
@@ -673,6 +673,32 @@
 
 %endif
 
+%macro win64_aes_unwind_info 2
+%ifidn __OUTPUT_FORMAT__, win64
+    section .pdata rdata align=4
+    align 4
+    dd %1 wrt ..imagebase    ; %1 = function entry label (image-relative)
+    dd %2 wrt ..imagebase    ; %2 = label placed after the function's last instruction
+    dd %1 %+ _unwind_info wrt ..imagebase    ; image-relative address of the record emitted below
+
+    section .xdata rdata align=8
+    align 4
+%1 %+ _unwind_info:
+    db 1 ; version 1, no flags
+    db %1 %+ .prolog_end - %1    ; prolog size: bytes from entry to the function's .prolog_end label
+    db 6 ; unwind code slots
+    db 0 ; no frame register
+    db %1 %+ .alloc_end - %1, 2 ; UWOP_ALLOC_SMALL, 8 bytes
+    db %1 %+ .save_r12_end - %1, (12 << 4) | 0 ; UWOP_PUSH_NONVOL r12
+    db %1 %+ .save_rbp_end - %1, (5 << 4) | 0 ; UWOP_PUSH_NONVOL rbp
+    db %1 %+ .save_rbx_end - %1, (3 << 4) | 0 ; UWOP_PUSH_NONVOL rbx
+    db %1 %+ .save_rdi_end - %1, (7 << 4) | 0 ; UWOP_PUSH_NONVOL rdi
+    db %1 %+ .save_rsi_end - %1, (6 << 4) | 0 ; UWOP_PUSH_NONVOL rsi
+
+    section .text align=16    ; switch back to the code section for whatever follows the macro
+%endif
+%endmacro
+
 %ifdef ENCRYPTION
 
     global aes_encrypt
@@ -691,19 +717,24 @@ enc_tab:
     section .text align=16
     align 16
 
-%ifdef _SEH_
-proc_frame aes_encrypt
-    alloc_stack 7*8 ; 7 to align stack to 16 bytes
-    save_reg rsi,4*8
-    save_reg rdi,5*8
-    save_reg rbx,1*8
-    save_reg rbp,2*8
-    save_reg r12,3*8
-end_prologue
-    mov rdi, rcx ; input pointer
-    mov [rsp+0*8], rdx ; output pointer
-%else
     aes_encrypt:
+    %ifidn __OUTPUT_FORMAT__, win64
+        push rsi    ; pushes are listed in win64_aes_unwind_info in the reverse of this order
+.save_rsi_end:    ; each local *_end label marks a prolog offset read by win64_aes_unwind_info
+        push rdi
+.save_rdi_end:
+        push rbx
+.save_rbx_end:
+        push rbp
+.save_rbp_end:
+        push r12
+.save_r12_end:
+        sub rsp, 8    ; one qword slot; the output pointer is stored in it below
+.alloc_end:
+        mov rdi, rcx ; input pointer
+        mov [rsp], rdx ; output pointer
+.prolog_end:
+    %else
     %ifdef __GNUC__
         sub rsp, 4*8 ; gnu/linux binary interface
         mov [rsp+0*8], rsi ; output pointer
@@ -718,7 +749,7 @@ end_prologue
         mov [rsp+1*8], rbx ; input pointer in rdi
         mov [rsp+2*8], rbp ; output pointer in [rsp]
         mov [rsp+3*8], r12 ; context in r8
-%endif
+    %endif
 
     movzx esi, byte [kptr+4*KS_LENGTH]
     lea tptr, [rel enc_tab]
@@ -766,23 +797,35 @@ end_prologue
     mov [rbx+12], r12d
     xor rax, rax
 .4:
+%ifidn __OUTPUT_FORMAT__, win64
+    add rsp, 8    ; drop the output-pointer slot reserved by the prologue
+    pop r12    ; pops run in the reverse order of the prologue pushes
+    pop rbp
+    pop rbx
+    pop rdi
+    pop rsi
+    ret
+%else
+%ifdef __GNUC__
     mov rbx, [rsp+1*8]
     mov rbp, [rsp+2*8]
     mov r12, [rsp+3*8]
-%ifdef __GNUC__
     add rsp, 4*8
     ret
 %else
-    mov rsi, [rsp+4*8]
-    mov rdi, [rsp+5*8]
-    %ifdef _SEH_
-        add rsp, 7*8
-        ret
-    endproc_frame
-    %else
-        add rsp, 6*8
-        ret
-    %endif
+    mov rbx, [rsp+1*8]
+    mov rbp, [rsp+2*8]
+    mov r12, [rsp+3*8]
+    mov rsi, [rsp+4*8]
+    mov rdi, [rsp+5*8]
+    add rsp, 6*8
+    ret
+%endif
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+aes_encrypt_end:    ; end-of-function label handed to the unwind macro
+    win64_aes_unwind_info aes_encrypt, aes_encrypt_end
 %endif
 
 %endif
@@ -805,19 +848,24 @@ dec_tab:
     section .text
     align 16
 
-%ifdef _SEH_
-proc_frame aes_decrypt
-    alloc_stack 7*8 ; 7 to align stack to 16 bytes
-    save_reg rsi,4*8
-    save_reg rdi,5*8
-    save_reg rbx,1*8
-    save_reg rbp,2*8
-    save_reg r12,3*8
-end_prologue
-    mov rdi, rcx ; input pointer
-    mov [rsp+0*8], rdx ; output pointer
-%else
     aes_decrypt:
+    %ifidn __OUTPUT_FORMAT__, win64
+        push rsi    ; pushes are listed in win64_aes_unwind_info in the reverse of this order
+.save_rsi_end:    ; each local *_end label marks a prolog offset read by win64_aes_unwind_info
+        push rdi
+.save_rdi_end:
+        push rbx
+.save_rbx_end:
+        push rbp
+.save_rbp_end:
+        push r12
+.save_r12_end:
+        sub rsp, 8    ; one qword slot; the output pointer is stored in it below
+.alloc_end:
+        mov rdi, rcx ; input pointer
+        mov [rsp], rdx ; output pointer
+.prolog_end:
+    %else
     %ifdef __GNUC__
         sub rsp, 4*8 ; gnu/linux binary interface
         mov [rsp+0*8], rsi ; output pointer
@@ -832,7 +880,7 @@ end_prologue
         mov [rsp+1*8], rbx ; input pointer in rdi
         mov [rsp+2*8], rbp ; output pointer in [rsp]
         mov [rsp+3*8], r12 ; context in r8
-%endif
+    %endif
 
     movzx esi,byte[kptr+4*KS_LENGTH]
     lea tptr, [rel dec_tab]
@@ -885,23 +933,36 @@ end_prologue
     mov [rbx+8], r11d
     mov [rbx+12], r12d
     xor rax, rax
-.4: mov rbx, [rsp+1*8]
+.4:
+%ifidn __OUTPUT_FORMAT__, win64
+    add rsp, 8    ; drop the output-pointer slot reserved by the prologue
+    pop r12    ; pops run in the reverse order of the prologue pushes
+    pop rbp
+    pop rbx
+    pop rdi
+    pop rsi
+    ret
+%else
+%ifdef __GNUC__
+    mov rbx, [rsp+1*8]
     mov rbp, [rsp+2*8]
     mov r12, [rsp+3*8]
-%ifdef __GNUC__
     add rsp, 4*8
     ret
 %else
-    mov rsi, [rsp+4*8]
-    mov rdi, [rsp+5*8]
-    %ifdef _SEH_
-        add rsp, 7*8
-        ret
-    endproc_frame
-    %else
-        add rsp, 6*8
-        ret
-    %endif
+    mov rbx, [rsp+1*8]
+    mov rbp, [rsp+2*8]
+    mov r12, [rsp+3*8]
+    mov rsi, [rsp+4*8]
+    mov rdi, [rsp+5*8]
+    add rsp, 6*8
+    ret
+%endif
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+aes_decrypt_end:    ; end-of-function label handed to the unwind macro
+    win64_aes_unwind_info aes_decrypt, aes_decrypt_end
 %endif
 
 %endif