|
| 1 | +/* |
| 2 | + * CDDL HEADER START |
| 3 | + * |
| 4 | + * The contents of this file are subject to the terms of the |
| 5 | + * Common Development and Distribution License (the "License"). |
| 6 | + * You may not use this file except in compliance with the License. |
| 7 | + * |
| 8 | + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
| 9 | + * or https://opensource.org/licenses/CDDL-1.0. |
| 10 | + * See the License for the specific language governing permissions |
| 11 | + * and limitations under the License. |
| 12 | + * |
| 13 | + * When distributing Covered Code, include this CDDL HEADER in each |
| 14 | + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
| 15 | + * If applicable, add the following below this CDDL HEADER, with the |
| 16 | + * fields enclosed by brackets "[]" replaced with your own identifying |
| 17 | + * information: Portions Copyright [yyyy] [name of copyright owner] |
| 18 | + * |
| 19 | + * CDDL HEADER END |
| 20 | + */ |
| 21 | + |
| 22 | +/* |
| 23 | + * Copyright (c) 2009 Intel Corporation |
| 24 | + * All Rights Reserved. |
| 25 | + */ |
| 26 | +/* |
| 27 | + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. |
| 28 | + * Use is subject to license terms. |
| 29 | + */ |
| 30 | + |
| 31 | +/* |
| 32 | + * Accelerated GHASH implementation with Intel PCLMULQDQ-NI |
| 33 | + * instructions. This file contains an accelerated |
| 34 | + * Galois Field Multiplication implementation. |
| 35 | + * |
| 36 | + * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH, |
| 37 | + * carry-less multiplication. More information about PCLMULQDQ can be |
| 38 | + * found at: |
| 39 | + * http://software.intel.com/en-us/articles/ |
| 40 | + * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/ |
| 41 | + * |
| 42 | + */ |
| 43 | + |
| 44 | +/* |
| 45 | + * ==================================================================== |
| 46 | + * OpenSolaris OS modifications |
| 47 | + * |
| 48 | + * This source originates as file galois_hash_asm.c from |
| 49 | + * Intel Corporation dated September 21, 2009. |
| 50 | + * |
| 51 | + * This OpenSolaris version has these major changes from the original source: |
| 52 | + * |
| 53 | + * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from |
| 54 | + * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function |
| 55 | + * definition for lint. |
| 56 | + * |
| 57 | + * 2. Formatted code, added comments, and added #includes and #defines. |
| 58 | + * |
| 59 | + * 3. If bit CR0.TS is set, clear and set the TS bit, after and before |
| 60 | + * calling kpreempt_disable() and kpreempt_enable(). |
| 61 | + * If the TS bit is not set, save and restore %xmm registers at the beginning |
| 62 | + * and end of function calls (%xmm* registers are not saved and restored |
| 63 | + * during kernel thread preemption). |
| 64 | + * |
| 65 | + * 4. Removed code to perform hashing. This is already done with C macro |
| 66 | + * GHASH in gcm.c. For better performance, this removed code should be |
| 67 | + * reintegrated in the future to replace the C GHASH macro. |
| 68 | + * |
| 69 | + * 5. Added code to byte swap 16-byte input and output. |
| 70 | + * |
| 71 | + * 6. Folded in comments from the original C source with embedded assembly |
| 72 | + * (SB_w_shift_xor.c) |
| 73 | + * |
| 74 | + * 7. Renamed function and reordered parameters to match OpenSolaris: |
| 75 | + * Intel interface: |
| 76 | + * void galois_hash_asm(unsigned char *hk, unsigned char *s, |
| 77 | + * unsigned char *d, int length) |
| 78 | + * OpenSolaris OS interface: |
| 79 | + * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res); |
| 80 | + * ==================================================================== |
| 81 | + */ |
| 82 | + |
| 83 | + |
| 84 | +#if defined(lint) || defined(__lint) /* lint */ |
| 85 | + |
| 86 | +#include <sys/types.h> |
| 87 | + |
| 88 | +void |
| 89 | +gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) { |
| 90 | + (void) x_in, (void) y, (void) res; /* lint-only stub: just reference the arguments, no work is done */ |
| 91 | +} |
| 92 | + |
| 93 | +#elif defined(HAVE_PCLMULQDQ) /* guard by instruction set */ |
| 94 | + |
| 95 | +#define _ASM |
| 96 | +#include <sys/asm_linkage.h> |
| 97 | + |
| 98 | +/* |
| 99 | + * Use this mask to byte-swap a 16-byte integer with the pshufb instruction |
| 100 | + */ |
| 101 | + |
| 102 | +// C equivalent of the table below: static uint8_t byte_swap16_mask[] = { |
| 103 | +// 15, 14, 13, 12, 11, 10, 9, 8, 7, 6 ,5, 4, 3, 2, 1, 0 }; |
| 104 | +.section .rodata |
| 105 | +.align XMM_ALIGN |
| 106 | +.Lbyte_swap16_mask: |
| 107 | + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 // pshufb selectors: output byte i comes from input byte 15-i |
| 108 | + |
| 109 | + |
| 110 | +/* |
| 111 | + * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res); |
| 112 | + * |
| 113 | + * Perform a carry-less multiplication (that is, use XOR instead of the |
| 114 | + * multiply operator) on P1 and P2 and place the result in P3. |
| 115 | + * |
| 116 | + * Byte swap the input and the output. |
| 117 | + * |
| 118 | + * Note: x_in, y, and res each point to a 16-byte number |
| 119 | + * (an array of two 64-bit integers). |
| 120 | + * |
| 121 | + * Note2: For kernel code, caller is responsible for ensuring |
| 122 | + * kpreempt_disable() has been called. This is because %xmm registers are |
| 123 | + * not saved/restored. Clear and set the CR0.TS bit on entry and exit, |
| 124 | + * respectively, if TS is set on entry. Otherwise, if TS is not set, |
| 125 | + * save and restore %xmm registers on the stack. |
| 126 | + * |
| 127 | + * Note3: Original Intel definition: |
| 128 | + * void galois_hash_asm(unsigned char *hk, unsigned char *s, |
| 129 | + * unsigned char *d, int length) |
| 130 | + * |
| 131 | + * Note4: Register/parameter mapping: |
| 132 | + * Intel: |
| 133 | + * Parameter 1: %rcx (copied to %xmm0) hk or x_in |
| 134 | + * Parameter 2: %rdx (copied to %xmm1) s or y |
| 135 | + * Parameter 3: %rdi (result) d or res |
| 136 | + * OpenSolaris: |
| 137 | + * Parameter 1: %rdi (copied to %xmm0) x_in |
| 138 | + * Parameter 2: %rsi (copied to %xmm1) y |
| 139 | + * Parameter 3: %rdx (result) res |
| 140 | + */ |
| 141 | +// Windows x64: |
| 142 | +// Calling: rcx, rdx, r8, and r9 (float: xmm0-xmm3) |
| 143 | +// Return: rax (float: xmm0) |
| 144 | +// Volatile: rax, rcx, rdx, r8-r11 (xmm0-xmm5) |
| 145 | +// Nonvolatile: rbx, rbp, rsp, rdi, rsi, r12-r15 (xmm6-xmm15) |
| 146 | + |
| 147 | +// Unix x64: |
| 148 | +// Calling: rdi, rsi, rdx, rcx, r8, r9 (float: xmm0-xmm7) |
| 149 | +// Return: rax (float: xmm0) |
| 150 | +// Volatile: rax, rcx, rdx, rsi, rdi, r8-r11 (xmm0-xmm15) |
| 151 | +// Nonvolatile: rbx, rbp, rsp, r12-r15 |
| 152 | + |
| 153 | +// outcome: the code below follows the Windows x64 convention: x_in in rcx, y in rdx, res in r8 |
| 154 | + |
| 155 | +ENTRY_NP(gcm_mul_pclmulqdq) |
| 156 | + // |
| 157 | + // Copy Parameters (x_in is read through %rcx, y through %rdx) |
| 158 | + // |
| 159 | + movdqu (%rcx), %xmm0 // P1 (x_in) |
| 160 | + movdqu (%rdx), %xmm1 // P2 (y) |
| 161 | + |
| 162 | + // |
| 163 | + // Byte swap both 16-byte inputs; the mask stays in %xmm10 for the output swap |
| 164 | + // |
| 165 | + lea .Lbyte_swap16_mask(%rip), %rax |
| 166 | + movups (%rax), %xmm10 |
| 167 | + pshufb %xmm10, %xmm0 |
| 168 | + pshufb %xmm10, %xmm1 |
| 169 | + |
| 170 | + |
| 171 | + // |
| 172 | + // Multiply with the hash key |
| 173 | + // |
| 174 | + movdqu %xmm0, %xmm3 |
| 175 | + pclmulqdq $0, %xmm1, %xmm3 // xmm3 holds a0*b0 |
| 176 | + |
| 177 | + movdqu %xmm0, %xmm4 |
| 178 | + pclmulqdq $16, %xmm1, %xmm4 // xmm4 holds a0*b1 |
| 179 | + |
| 180 | + movdqu %xmm0, %xmm5 |
| 181 | + pclmulqdq $1, %xmm1, %xmm5 // xmm5 holds a1*b0 |
| 182 | + movdqu %xmm0, %xmm6 |
| 183 | + pclmulqdq $17, %xmm1, %xmm6 // xmm6 holds a1*b1 |
| 184 | + |
| 185 | + pxor %xmm5, %xmm4 // xmm4 holds a0*b1 + a1*b0 |
| 186 | + |
| 187 | + movdqu %xmm4, %xmm5 // copy the middle term so both halves can be split off |
| 188 | + psrldq $8, %xmm4 // shift xmm4 right by 64 bits (8 bytes) |
| 189 | + pslldq $8, %xmm5 // shift xmm5 left by 64 bits (8 bytes) |
| 190 | + pxor %xmm5, %xmm3 |
| 191 | + pxor %xmm4, %xmm6 // Register pair <xmm6:xmm3> holds the result |
| 192 | + // of the carry-less multiplication of |
| 193 | + // xmm0 by xmm1. |
| 194 | + |
| 195 | + // We shift the result of the multiplication by one bit position |
| 196 | + // to the left to compensate for the fact that the bits are reversed. |
| 197 | + movdqu %xmm3, %xmm7 |
| 198 | + movdqu %xmm6, %xmm8 |
| 199 | + pslld $1, %xmm3 // shift every 32-bit lane left by one bit |
| 200 | + pslld $1, %xmm6 |
| 201 | + psrld $31, %xmm7 // keep only the bit shifted out of each lane |
| 202 | + psrld $31, %xmm8 |
| 203 | + movdqu %xmm7, %xmm9 |
| 204 | + pslldq $4, %xmm8 // move each carry bit up to the next lane |
| 205 | + pslldq $4, %xmm7 |
| 206 | + psrldq $12, %xmm9 // carry out of xmm3's top lane, now in the low lane |
| 207 | + por %xmm7, %xmm3 |
| 208 | + por %xmm8, %xmm6 |
| 209 | + por %xmm9, %xmm6 // carry from xmm3 enters the low lane of xmm6 |
| 210 | + |
| 211 | + // |
| 212 | + // First phase of the reduction |
| 213 | + // |
| 214 | + // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts |
| 215 | + // independently. |
| 216 | + movdqu %xmm3, %xmm7 |
| 217 | + movdqu %xmm3, %xmm8 |
| 218 | + movdqu %xmm3, %xmm9 |
| 219 | + pslld $31, %xmm7 // packed left shift: << 31 |
| 220 | + pslld $30, %xmm8 // packed left shift: << 30 |
| 221 | + pslld $25, %xmm9 // packed left shift: << 25 |
| 222 | + pxor %xmm8, %xmm7 // xor the shifted versions |
| 223 | + pxor %xmm9, %xmm7 |
| 224 | + movdqu %xmm7, %xmm8 |
| 225 | + pslldq $12, %xmm7 // low lane moved to the top lane, the rest cleared |
| 226 | + psrldq $4, %xmm8 // upper three lanes, kept for the second phase |
| 227 | + pxor %xmm7, %xmm3 // first phase of the reduction complete |
| 228 | + |
| 229 | + // |
| 230 | + // Second phase of the reduction |
| 231 | + // |
| 232 | + // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these |
| 233 | + // shift operations. |
| 234 | + movdqu %xmm3, %xmm2 |
| 235 | + movdqu %xmm3, %xmm4 |
| 236 | + movdqu %xmm3, %xmm5 |
| 237 | + psrld $1, %xmm2 // packed right shift: >> 1 |
| 238 | + psrld $2, %xmm4 // packed right shift: >> 2 |
| 239 | + psrld $7, %xmm5 // packed right shift: >> 7 |
| 240 | + pxor %xmm4, %xmm2 // xor the shifted versions |
| 241 | + pxor %xmm5, %xmm2 |
| 242 | + pxor %xmm8, %xmm2 // fold in the part saved during the first phase |
| 243 | + pxor %xmm2, %xmm3 |
| 244 | + pxor %xmm3, %xmm6 // the result is in xmm6 |
| 245 | + |
| 246 | + // |
| 247 | + // Byte swap 16-byte result |
| 248 | + // |
| 249 | + pshufb %xmm10, %xmm6 // %xmm10 has the swap mask |
| 250 | + |
| 251 | + // |
| 252 | + // Store the result (res is written through %r8) |
| 253 | + // |
| 254 | + movdqu %xmm6, (%r8) // P3 (res) |
| 255 | + |
| 256 | + |
| 257 | + // |
| 258 | + // Return |
| 259 | + // NOTE(review): %xmm6-%xmm10 are overwritten above and not restored here, yet the Windows x64 list before ENTRY_NP marks xmm6 and up nonvolatile; confirm callers save them |
| 260 | + RET |
| 261 | + SET_SIZE(gcm_mul_pclmulqdq) |
| 262 | + |
| 263 | +#endif /* lint || __lint */ |
| 264 | + |
| 265 | +#ifdef __ELF__ |
| 266 | +.section .note.GNU-stack,"",%progbits |
| 267 | +#endif |
0 commit comments