Skip to content

Commit 1e31633

Browse files
lundmanandrewc12
authored andcommitted
Add Windows assembler: gcm_pclmulqdq
1 parent bac5963 commit 1e31633

File tree

5 files changed

+277
-3
lines changed

5 files changed

+277
-3
lines changed

include/os/windows/spl/sys/simd.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,10 @@ xgetbv(uint32_t c)
9292

9393
#endif
9494

95-
#define CPUID_FEATURE_AES (1<<25)
96-
#define CPUID_FEATURE_XSAVE (1<<26)
95+
#define CPUID_FEATURE_PCLMULQDQ (1<<1)
96+
#define CPUID_FEATURE_AES (1<<25)
97+
#define CPUID_FEATURE_XSAVE (1<<26)
98+
//#define CPUID_FEATURE_AVX (1<<28)
9799

98100
extern uint64_t spl_cpuid_features(void);
99101
extern uint64_t spl_cpuid_leaf7_features(void);

include/os/windows/zfs/zfs_config.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,9 @@
6363

6464
#define HAVE_USLEEP 1
6565

66+
/* These control which assembler files to use */
67+
//#define HAVE_AVX 1
68+
#define HAVE_PCLMULQDQ 1
6669
#define HAVE_AES 1
6770

6871
/* Path where the kernel module is installed. */

lib/libicp/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ add_library(libicp
4545
"${ICP_MODULE_DIR}/asm-x86_64/os/windows/aes/aes_amd64.S"
4646
"${ICP_MODULE_DIR}/asm-x86_64/os/windows/sha2/sha256_impl.S"
4747
"${ICP_MODULE_DIR}/asm-x86_64/os/windows/sha2/sha512_impl.S"
48+
"${ICP_MODULE_DIR}/asm-x86_64/os/windows/modes/gcm_pclmulqdq.S"
4849
)
4950

5051
# Add windows/assembler sources here too.

module/icp/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ wdk_add_library(icpkern
1818
algs/modes/ecb.c
1919
algs/modes/gcm.c
2020
algs/modes/gcm_generic.c
21+
algs/modes/gcm_pclmulqdq.c
2122
algs/modes/modes.c
2223
algs/sha2/sha2.c
2324
algs/skein/skein.c
@@ -33,7 +34,7 @@ wdk_add_library(icpkern
3334
asm-x86_64/os/windows/aes/aes_aesni.S
3435
asm-x86_64/os/windows/aes/aes_amd64.S
3536
asm-x86_64/modes/aesni-gcm-x86_64.S
36-
asm-x86_64/modes/gcm_pclmulqdq.S
37+
asm-x86_64/os/windows/modes/gcm_pclmulqdq.S
3738
asm-x86_64/modes/ghash-x86_64.S
3839
# asm-x86_64/sha1/sha1-x86_64.S
3940
asm-x86_64/os/windows/sha2/sha256_impl.S
Lines changed: 267 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,267 @@
1+
/*
2+
* CDDL HEADER START
3+
*
4+
* The contents of this file are subject to the terms of the
5+
* Common Development and Distribution License (the "License").
6+
* You may not use this file except in compliance with the License.
7+
*
8+
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9+
* or https://opensource.org/licenses/CDDL-1.0.
10+
* See the License for the specific language governing permissions
11+
* and limitations under the License.
12+
*
13+
* When distributing Covered Code, include this CDDL HEADER in each
14+
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15+
* If applicable, add the following below this CDDL HEADER, with the
16+
* fields enclosed by brackets "[]" replaced with your own identifying
17+
* information: Portions Copyright [yyyy] [name of copyright owner]
18+
*
19+
* CDDL HEADER END
20+
*/
21+
22+
/*
23+
* Copyright (c) 2009 Intel Corporation
24+
* All Rights Reserved.
25+
*/
26+
/*
27+
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
28+
* Use is subject to license terms.
29+
*/
30+
31+
/*
32+
* Accelerated GHASH implementation with Intel PCLMULQDQ-NI
33+
* instructions. This file contains an accelerated
34+
* Galois Field Multiplication implementation.
35+
*
36+
* PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
37+
* carry-less multiplication. More information about PCLMULQDQ can be
38+
* found at:
39+
* http://software.intel.com/en-us/articles/
40+
* carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
41+
*
42+
*/
43+
44+
/*
45+
* ====================================================================
46+
* OpenSolaris OS modifications
47+
*
48+
* This source originates as file galois_hash_asm.c from
49+
* Intel Corporation dated September 21, 2009.
50+
*
51+
* This OpenSolaris version has these major changes from the original source:
52+
*
53+
* 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
54+
* /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function
55+
* definition for lint.
56+
*
57+
* 2. Formatted code, added comments, and added #includes and #defines.
58+
*
59+
* 3. If bit CR0.TS is set, clear and set the TS bit, after and before
60+
* calling kpreempt_disable() and kpreempt_enable().
61+
* If the TS bit is not set, Save and restore %xmm registers at the beginning
62+
* and end of function calls (%xmm* registers are not saved and restored
63+
* during kernel thread preemption).
64+
*
65+
* 4. Removed code to perform hashing. This is already done with C macro
66+
* GHASH in gcm.c. For better performance, this removed code should be
67+
* reintegrated in the future to replace the C GHASH macro.
68+
*
69+
* 5. Added code to byte swap 16-byte input and output.
70+
*
71+
* 6. Folded in comments from the original C source with embedded assembly
72+
* (SB_w_shift_xor.c)
73+
*
74+
* 7. Renamed function and reordered parameters to match OpenSolaris:
75+
* Intel interface:
76+
* void galois_hash_asm(unsigned char *hk, unsigned char *s,
77+
* unsigned char *d, int length)
78+
* OpenSolaris OS interface:
79+
* void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
80+
* ====================================================================
81+
*/
82+
83+
84+
#if defined(lint) || defined(__lint) /* lint */
85+
86+
#include <sys/types.h>
87+
88+
void
gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res)
{
	/*
	 * Lint-only stub: the real implementation is the assembly routine
	 * of the same name.  Each argument is referenced once so that lint
	 * does not report it as unused; nothing is read or written.
	 */
	(void) x_in;
	(void) y;
	(void) res;
}
92+
93+
#elif defined(HAVE_PCLMULQDQ) /* guard by instruction set */
94+
95+
#define _ASM
96+
#include <sys/asm_linkage.h>
97+
98+
/*
99+
* Use this mask to byte-swap a 16-byte integer with the pshufb instruction
100+
*/
101+
102+
// static uint8_t byte_swap16_mask[] = {
103+
// 15, 14, 13, 12, 11, 10, 9, 8, 7, 6 ,5, 4, 3, 2, 1, 0 };
104+
.section .rodata
	.align	XMM_ALIGN
/*
 * pshufb control vector: destination byte i is taken from source byte
 * (15 - i), i.e. the 16 bytes of the register are reversed.
 */
.Lbyte_swap16_mask:
	.byte	15, 14, 13, 12, 11, 10, 9, 8
	.byte	7, 6, 5, 4, 3, 2, 1, 0
108+
109+
110+
/*
111+
* void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
112+
*
113+
* Perform a carry-less multiplication (that is, use XOR instead of the
114+
* multiply operator) on P1 and P2 and place the result in P3.
115+
*
116+
* Byte swap the input and the output.
117+
*
118+
* Note: x_in, y, and res all point to a block of 16-byte (128-bit) numbers
119+
* (an array of two 64-bit integers).
120+
*
121+
* Note2: For kernel code, caller is responsible for ensuring
122+
* kpreempt_disable() has been called. This is because %xmm registers are
123+
* not saved/restored. Clear and set the CR0.TS bit on entry and exit,
124+
* respectively, if TS is set on entry. Otherwise, if TS is not set,
125+
* save and restore %xmm registers on the stack.
126+
*
127+
* Note3: Original Intel definition:
128+
* void galois_hash_asm(unsigned char *hk, unsigned char *s,
129+
* unsigned char *d, int length)
130+
*
131+
* Note4: Register/parameter mapping:
132+
* Intel:
133+
* Parameter 1: %rcx (copied to %xmm0) hk or x_in
134+
* Parameter 2: %rdx (copied to %xmm1) s or y
135+
* Parameter 3: %rdi (result) d or res
136+
* OpenSolaris:
137+
* Parameter 1: %rdi (copied to %xmm0) x_in
138+
* Parameter 2: %rsi (copied to %xmm1) y
139+
* Parameter 3: %rdx (result) res
140+
*/
141+
// Windows x64:
142+
// Calling: rcx, rdx, r8, and r9 (float: xmm0-xmm3)
143+
// Return: rax (float: xmm0)
144+
// Volatile: rax, rcx, rdx, r8-r11 (xmm0-xmm5)
145+
// Nonvolatile: rbx, rbp, rsp, rdi, rsi, r12-r15 (xmm6-xmm15)
146+
147+
// Unix x64:
148+
// Calling: rdi, rsi, rdx, rcx, r8, r9 (float: xmm0-xmm7)
149+
// Return: rax (float: xmm0)
150+
// Volatile: rax, rcx, rdx, rsi, rdi, r8-r11 (xmm0-xmm15)
151+
// Nonvolatile: rbx, rbp, rsp, r12-r15
152+
153+
// outcome:
154+
155+
/*
 * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
 *
 * ABI:     Microsoft x64
 * In:      %rcx = x_in, %rdx = y   (16 bytes each, read with unaligned loads)
 *          %r8  = res              (16 bytes, written with an unaligned store)
 * Out:     *res = byte-swapped, reduced carry-less product of the
 *          byte-swapped inputs
 * Clobbers: %rax, %xmm0-%xmm5 only.  These are all volatile in the
 *          Microsoft x64 convention; the callee-saved %xmm6-%xmm15 are
 *          deliberately never touched, so no save area, stack adjustment
 *          or unwind data is needed and the routine stays a leaf.
 *
 * Register roles:
 *   %rax   address of .Lbyte_swap16_mask (kept for the whole routine)
 *   %xmm3  low  128 bits of the 256-bit product
 *   %xmm2  high 128 bits of the 256-bit product (swap mask before that)
 *   %xmm0, %xmm1, %xmm4, %xmm5  inputs first, then scratch
 */
ENTRY_NP(gcm_mul_pclmulqdq)
	//
	// Copy Parameters
	//
	movdqu	(%rcx), %xmm0		// P1
	movdqu	(%rdx), %xmm1		// P2

	//
	// Byte swap 16-byte input
	//
	lea	.Lbyte_swap16_mask(%rip), %rax
	movups	(%rax), %xmm2		// swap mask; %xmm2 is reused below
	pshufb	%xmm2, %xmm0
	pshufb	%xmm2, %xmm1

	//
	// Multiply with the hash key
	//
	movdqu	%xmm0, %xmm3
	pclmulqdq $0, %xmm1, %xmm3	// xmm3 holds a0*b0

	movdqu	%xmm0, %xmm4
	pclmulqdq $16, %xmm1, %xmm4	// xmm4 holds a0*b1

	movdqu	%xmm0, %xmm5
	pclmulqdq $1, %xmm1, %xmm5	// xmm5 holds a1*b0

	movdqu	%xmm0, %xmm2		// mask no longer needed until the end
	pclmulqdq $17, %xmm1, %xmm2	// xmm2 holds a1*b1

	pxor	%xmm5, %xmm4		// xmm4 holds a0*b1 + a1*b0

	movdqu	%xmm4, %xmm5		// copy the middle term
	psrldq	$8, %xmm4		// xmm4 = middle term >> 64 bits
	pslldq	$8, %xmm5		// xmm5 = middle term << 64 bits
	pxor	%xmm5, %xmm3
	pxor	%xmm4, %xmm2		// Register pair <xmm2:xmm3> holds the
					// result of the carry-less
					// multiplication of xmm0 by xmm1.

	// We shift the result of the multiplication by one bit position
	// to the left to cope for the fact that the bits are reversed.
	// pslld works per 32-bit lane, so the bit shifted out of each lane
	// is recovered with psrld $31 and moved up one lane (4 bytes).
	movdqu	%xmm3, %xmm0
	movdqu	%xmm2, %xmm1
	pslld	$1, %xmm3
	pslld	$1, %xmm2
	psrld	$31, %xmm0		// per-lane carry bits of the low half
	psrld	$31, %xmm1		// per-lane carry bits of the high half
	movdqu	%xmm0, %xmm4
	pslldq	$4, %xmm1
	pslldq	$4, %xmm0
	psrldq	$12, %xmm4		// carry out of the low half's top lane
	por	%xmm0, %xmm3
	por	%xmm1, %xmm2
	por	%xmm4, %xmm2		// ... goes into the high half

	//
	// First phase of the reduction
	//
	// Make three copies of xmm3 in order to perform the shifts
	// independently.
	movdqu	%xmm3, %xmm0
	movdqu	%xmm3, %xmm1
	movdqu	%xmm3, %xmm4
	pslld	$31, %xmm0		// packed left shift, << 31
	pslld	$30, %xmm1		// packed left shift, << 30
	pslld	$25, %xmm4		// packed left shift, << 25
	pxor	%xmm1, %xmm0		// xor the shifted versions
	pxor	%xmm4, %xmm0
	movdqu	%xmm0, %xmm1
	pslldq	$12, %xmm0
	psrldq	$4, %xmm1		// xmm1 is consumed in the second phase
	pxor	%xmm0, %xmm3		// first phase of the reduction complete

	//
	// Second phase of the reduction
	//
	// Make 3 copies of xmm3 in xmm0, xmm4, xmm5 for doing these
	// shift operations.
	movdqu	%xmm3, %xmm0
	movdqu	%xmm3, %xmm4
	movdqu	%xmm3, %xmm5
	psrld	$1, %xmm0		// packed right shift, >> 1
	psrld	$2, %xmm4		// packed right shift, >> 2
	psrld	$7, %xmm5		// packed right shift, >> 7
	pxor	%xmm4, %xmm0		// xor the shifted versions
	pxor	%xmm5, %xmm0
	pxor	%xmm1, %xmm0		// fold in the term saved by phase one
	pxor	%xmm0, %xmm3
	pxor	%xmm3, %xmm2		// the result is in xmm2

	//
	// Byte swap 16-byte result
	//
	movups	(%rax), %xmm0		// reload the mask; %rax still points
					// at .Lbyte_swap16_mask
	pshufb	%xmm0, %xmm2

	//
	// Store the result
	//
	movdqu	%xmm2, (%r8)		// P3

	//
	// Return
	//
	RET
	SET_SIZE(gcm_mul_pclmulqdq)
262+
263+
#endif /* lint || __lint */
264+
265+
#ifdef __ELF__
266+
.section .note.GNU-stack,"",%progbits
267+
#endif

0 commit comments

Comments
 (0)