Skip to content

Commit 47a48eb

Browse files
sophie-zhao authored and abner-chenc committed
hash/crc32: optimize the loong64 crc32 implementation
Make use of the newly added LA64 CRC32 instructions to accelerate computation of CRC32 with IEEE and Castagnoli polynomials. Benchmarks: goos: linux goarch: loong64 pkg: hash/crc32 cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | CRC32/poly=IEEE/size=15/align=0 63.35n ± 0% 15.80n ± 0% -75.06% (p=0.000 n=20) CRC32/poly=IEEE/size=15/align=1 63.35n ± 0% 16.42n ± 0% -74.08% (p=0.000 n=20) CRC32/poly=IEEE/size=40/align=0 65.40n ± 0% 19.22n ± 0% -70.61% (p=0.000 n=20) CRC32/poly=IEEE/size=40/align=1 65.40n ± 0% 19.23n ± 0% -70.60% (p=0.000 n=20) CRC32/poly=IEEE/size=512/align=0 407.30n ± 0% 66.86n ± 0% -83.58% (p=0.000 n=20) CRC32/poly=IEEE/size=512/align=1 407.30n ± 0% 66.86n ± 0% -83.58% (p=0.000 n=20) CRC32/poly=IEEE/size=1kB/align=0 778.2n ± 0% 118.1n ± 0% -84.82% (p=0.000 n=20) CRC32/poly=IEEE/size=1kB/align=1 778.2n ± 0% 118.1n ± 0% -84.82% (p=0.000 n=20) CRC32/poly=IEEE/size=4kB/align=0 3004.0n ± 0% 425.6n ± 0% -85.83% (p=0.000 n=20) CRC32/poly=IEEE/size=4kB/align=1 3004.0n ± 0% 425.6n ± 0% -85.83% (p=0.000 n=20) CRC32/poly=IEEE/size=32kB/align=0 23.775µ ± 0% 3.305µ ± 0% -86.10% (p=0.000 n=20) CRC32/poly=IEEE/size=32kB/align=1 23.774µ ± 0% 3.305µ ± 0% -86.10% (p=0.000 n=20) CRC32/poly=Castagnoli/size=15/align=0 63.58n ± 0% 15.28n ± 0% -75.97% (p=0.000 n=20) CRC32/poly=Castagnoli/size=15/align=1 63.58n ± 0% 16.95n ± 0% -73.34% (p=0.000 n=20) CRC32/poly=Castagnoli/size=40/align=0 65.29n ± 0% 17.04n ± 0% -73.90% (p=0.000 n=20) CRC32/poly=Castagnoli/size=40/align=1 65.29n ± 0% 19.05n ± 0% -70.83% (p=0.000 n=20) CRC32/poly=Castagnoli/size=512/align=0 407.20n ± 0% 55.06n ± 0% -86.48% (p=0.000 n=20) CRC32/poly=Castagnoli/size=512/align=1 407.20n ± 0% 56.44n ± 0% -86.14% (p=0.000 n=20) CRC32/poly=Castagnoli/size=1kB/align=0 778.10n ± 0% 95.08n ± 0% -87.78% (p=0.000 n=20) CRC32/poly=Castagnoli/size=1kB/align=1 778.10n ± 0% 97.72n ± 0% -87.44% (p=0.000 n=20) CRC32/poly=Castagnoli/size=4kB/align=0 3004.0n ± 0% 338.5n ± 0% -88.73% 
(p=0.000 n=20) CRC32/poly=Castagnoli/size=4kB/align=1 3004.0n ± 0% 341.1n ± 0% -88.64% (p=0.000 n=20) CRC32/poly=Castagnoli/size=32kB/align=0 23.775µ ± 0% 2.623µ ± 0% -88.97% (p=0.000 n=20) CRC32/poly=Castagnoli/size=32kB/align=1 23.775µ ± 0% 2.896µ ± 0% -87.82% (p=0.000 n=20) CRC32/poly=Koopman/size=15/align=0 63.11n ± 0% 63.11n ± 0% ~ (p=0.737 n=20) CRC32/poly=Koopman/size=15/align=1 63.11n ± 0% 63.11n ± 0% ~ (p=1.000 n=20) CRC32/poly=Koopman/size=40/align=0 153.2n ± 0% 153.2n ± 0% ~ (p=1.000 n=20) CRC32/poly=Koopman/size=40/align=1 153.2n ± 0% 153.2n ± 0% ~ (p=0.737 n=20) CRC32/poly=Koopman/size=512/align=0 1.854µ ± 0% 1.854µ ± 0% ~ (p=1.000 n=20) CRC32/poly=Koopman/size=512/align=1 1.854µ ± 0% 1.854µ ± 0% ~ (p=0.737 n=20) CRC32/poly=Koopman/size=1kB/align=0 3.699µ ± 0% 3.699µ ± 0% ~ (p=1.000 n=20) CRC32/poly=Koopman/size=1kB/align=1 3.699µ ± 0% 3.699µ ± 0% ~ (p=1.000 n=20) CRC32/poly=Koopman/size=4kB/align=0 14.77µ ± 0% 14.77µ ± 0% ~ (p=0.495 n=20) CRC32/poly=Koopman/size=4kB/align=1 14.77µ ± 0% 14.77µ ± 0% ~ (p=0.704 n=20) CRC32/poly=Koopman/size=32kB/align=0 118.1µ ± 0% 118.1µ ± 0% ~ (p=0.057 n=20) CRC32/poly=Koopman/size=32kB/align=1 118.1µ ± 0% 118.1µ ± 0% ~ (p=0.493 n=20) geomean 1.001µ 306.8n -69.35% goos: linux goarch: loong64 pkg: hash/crc32 cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | CRC32/poly=IEEE/size=15/align=0 75.70n ± 1% 47.04n ± 1% -37.86% (p=0.000 n=20) CRC32/poly=IEEE/size=15/align=1 75.70n ± 1% 46.64n ± 1% -38.39% (p=0.000 n=20) CRC32/poly=IEEE/size=40/align=0 89.26n ± 0% 65.49n ± 0% -26.63% (p=0.000 n=20) CRC32/poly=IEEE/size=40/align=1 89.09n ± 0% 72.55n ± 1% -18.56% (p=0.000 n=20) CRC32/poly=IEEE/size=512/align=0 621.0n ± 0% 513.5n ± 0% -17.31% (p=0.000 n=20) CRC32/poly=IEEE/size=512/align=1 621.0n ± 0% 521.9n ± 0% -15.96% (p=0.000 n=20) CRC32/poly=IEEE/size=1kB/align=0 1.204µ ± 0% 1.001µ ± 0% -16.86% (p=0.000 n=20) CRC32/poly=IEEE/size=1kB/align=1 1.205µ ± 0% 1.009µ ± 0% -16.27% (p=0.000 n=20) 
CRC32/poly=IEEE/size=4kB/align=0 4.665µ ± 0% 3.923µ ± 0% -15.91% (p=0.000 n=20) CRC32/poly=IEEE/size=4kB/align=1 4.665µ ± 0% 3.931µ ± 0% -15.73% (p=0.000 n=20) CRC32/poly=IEEE/size=32kB/align=0 36.97µ ± 0% 31.20µ ± 0% -15.60% (p=0.000 n=20) CRC32/poly=IEEE/size=32kB/align=1 36.96µ ± 0% 31.21µ ± 0% -15.57% (p=0.000 n=20) CRC32/poly=Castagnoli/size=15/align=0 75.72n ± 1% 48.07n ± 1% -36.52% (p=0.000 n=20) CRC32/poly=Castagnoli/size=15/align=1 75.70n ± 1% 46.99n ± 2% -37.93% (p=0.000 n=20) CRC32/poly=Castagnoli/size=40/align=0 87.91n ± 0% 64.89n ± 0% -26.19% (p=0.000 n=20) CRC32/poly=Castagnoli/size=40/align=1 87.91n ± 0% 72.12n ± 1% -17.97% (p=0.000 n=20) CRC32/poly=Castagnoli/size=512/align=0 619.8n ± 0% 514.3n ± 0% -17.02% (p=0.000 n=20) CRC32/poly=Castagnoli/size=512/align=1 619.8n ± 0% 521.7n ± 0% -15.83% (p=0.000 n=20) CRC32/poly=Castagnoli/size=1kB/align=0 1.202µ ± 0% 1.001µ ± 0% -16.72% (p=0.000 n=20) CRC32/poly=Castagnoli/size=1kB/align=1 1.202µ ± 0% 1.009µ ± 0% -16.06% (p=0.000 n=20) CRC32/poly=Castagnoli/size=4kB/align=0 4.663µ ± 0% 3.924µ ± 0% -15.85% (p=0.000 n=20) CRC32/poly=Castagnoli/size=4kB/align=1 4.663µ ± 0% 3.931µ ± 0% -15.70% (p=0.000 n=20) CRC32/poly=Castagnoli/size=32kB/align=0 36.96µ ± 0% 31.20µ ± 0% -15.60% (p=0.000 n=20) CRC32/poly=Castagnoli/size=32kB/align=1 36.96µ ± 0% 31.21µ ± 0% -15.57% (p=0.000 n=20) CRC32/poly=Koopman/size=15/align=0 74.91n ± 1% 74.95n ± 1% ~ (p=0.963 n=20) CRC32/poly=Koopman/size=15/align=1 74.91n ± 1% 75.02n ± 1% ~ (p=0.909 n=20) CRC32/poly=Koopman/size=40/align=0 165.0n ± 0% 165.0n ± 0% ~ (p=0.865 n=20) CRC32/poly=Koopman/size=40/align=1 165.1n ± 0% 165.0n ± 0% ~ (p=0.342 n=20) CRC32/poly=Koopman/size=512/align=0 1.867µ ± 0% 1.867µ ± 0% ~ (p=0.320 n=20) CRC32/poly=Koopman/size=512/align=1 1.867µ ± 0% 1.867µ ± 0% ~ (p=0.782 n=20) CRC32/poly=Koopman/size=1kB/align=0 3.712µ ± 0% 3.712µ ± 0% ~ (p=0.859 n=20) CRC32/poly=Koopman/size=1kB/align=1 3.712µ ± 0% 3.713µ ± 0% ~ (p=0.175 n=20) CRC32/poly=Koopman/size=4kB/align=0 
14.79µ ± 0% 14.79µ ± 0% ~ (p=0.826 n=20) CRC32/poly=Koopman/size=4kB/align=1 14.79µ ± 0% 14.79µ ± 0% ~ (p=0.169 n=20) CRC32/poly=Koopman/size=32kB/align=0 118.1µ ± 0% 118.1µ ± 0% ~ (p=0.941 n=20) CRC32/poly=Koopman/size=32kB/align=1 118.1µ ± 0% 118.1µ ± 0% ~ (p=0.473 n=20) geomean 1.299µ 1.109µ -14.68% Performance of poly=Koopman is not affected. This patch is a copy of CL 478596. Co-authored-by: WANG Xuerui <[email protected]> Change-Id: I345192cdf693f21fe1015a8b8361ca68ac780c9e Reviewed-on: https://go-review.googlesource.com/c/go/+/624355 Reviewed-by: abner chenc <[email protected]> Reviewed-by: Carlos Amedee <[email protected]> LUCI-TryBot-Result: Go LUCI <[email protected]> Reviewed-by: David Chase <[email protected]>
1 parent 6f59c11 commit 47a48eb

File tree

3 files changed

+211
-1
lines changed

3 files changed

+211
-1
lines changed

src/hash/crc32/crc32_loong64.go

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
// Copyright 2024 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
// LoongArch64-specific hardware-assisted CRC32 algorithms. See crc32.go for a
6+
// description of the interface that each architecture-specific file
7+
// implements.
8+
9+
package crc32
10+
11+
import "internal/cpu"
12+
13+
// castagnoliUpdate is declared without a body; the implementation is the
// assembly routine of the same name in crc32_loong64.s. It updates the
// non-inverted crc with the bytes of p and returns the new non-inverted value.
func castagnoliUpdate(crc uint32, p []byte) uint32
14+
// ieeeUpdate is declared without a body; the implementation is the assembly
// routine of the same name in crc32_loong64.s. It updates the non-inverted
// crc with the bytes of p and returns the new non-inverted value.
func ieeeUpdate(crc uint32, p []byte) uint32
15+
16+
// archAvailableCastagnoli reports whether the CPU feature flag
// cpu.Loong64.HasCRC32 is set. archInitCastagnoli and archUpdateCastagnoli
// panic when it is not.
func archAvailableCastagnoli() bool {
17+
return cpu.Loong64.HasCRC32
18+
}
19+
20+
// archInitCastagnoli has no state to set up; it only verifies that
// cpu.Loong64.HasCRC32 is set and panics otherwise.
func archInitCastagnoli() {
21+
if !cpu.Loong64.HasCRC32 {
22+
panic("arch-specific crc32 instruction for Castagnoli not available")
23+
}
24+
}
25+
26+
// archUpdateCastagnoli feeds p into crc through the assembly routine
// castagnoliUpdate and returns the updated value. It panics if
// cpu.Loong64.HasCRC32 is not set.
func archUpdateCastagnoli(crc uint32, p []byte) uint32 {
27+
if !cpu.Loong64.HasCRC32 {
28+
panic("arch-specific crc32 instruction for Castagnoli not available")
29+
}
30+
31+
// castagnoliUpdate works on the non-inverted CRC, so complement the value
// on the way in and complement the result on the way out.
return ^castagnoliUpdate(^crc, p)
32+
}
33+
34+
// archAvailableIEEE reports whether the CPU feature flag
// cpu.Loong64.HasCRC32 is set. archInitIEEE and archUpdateIEEE panic when it
// is not.
func archAvailableIEEE() bool {
35+
return cpu.Loong64.HasCRC32
36+
}
37+
38+
// archInitIEEE has no state to set up; it only verifies that
// cpu.Loong64.HasCRC32 is set and panics otherwise.
func archInitIEEE() {
39+
if !cpu.Loong64.HasCRC32 {
40+
panic("arch-specific crc32 instruction for IEEE not available")
41+
}
42+
}
43+
44+
// archUpdateIEEE feeds p into crc through the assembly routine ieeeUpdate and
// returns the updated value. It panics if cpu.Loong64.HasCRC32 is not set.
func archUpdateIEEE(crc uint32, p []byte) uint32 {
45+
if !cpu.Loong64.HasCRC32 {
46+
panic("arch-specific crc32 instruction for IEEE not available")
47+
}
48+
49+
// ieeeUpdate works on the non-inverted CRC, so complement the value on the
// way in and complement the result on the way out.
return ^ieeeUpdate(^crc, p)
50+
}

src/hash/crc32/crc32_loong64.s

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
// Copyright 2024 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
#include "textflag.h"
6+
7+
// castagnoliUpdate updates the non-inverted crc with the given data, using
// the CRCC* instructions for every step.
8+
9+
// func castagnoliUpdate(crc uint32, p []byte) uint32
//
// Register use in this routine:
//   R4  = running CRC (loaded from crc, stored to ret at the end)
//   R5  = address of the next unprocessed byte of p
//   R6  = number of bytes still to process
//   R12 = scratch: comparison results and the head byte count
//   R13 = scratch: bit-test results and loaded data
//
// Flow: inputs shorter than 8 bytes go straight to the tail. Otherwise a
// 1/2/4-byte head advances R5 to an 8-byte boundary, the main loop consumes
// 8 bytes per iteration, and the tail consumes the remaining 4/2/1 bytes.
10+
TEXT ·castagnoliUpdate(SB),NOSPLIT,$0-36
11+
MOVWU crc+0(FP), R4 // a0 = CRC value
12+
MOVV p+8(FP), R5 // a1 = data pointer
13+
MOVV p_len+16(FP), R6 // a2 = len(p)
14+
15+
SGT $8, R6, R12 // R12 != 0 when len(p) < 8
16+
BNE R12, less_than_8 // too short to align first; tail only
17+
AND $7, R5, R12 // R12 = offset of R5 within its 8-byte word
18+
BEQ R12, aligned // already 8-byte aligned, skip the head
19+
20+
// Process the first few bytes to 8-byte align the input.
21+
// t0 (R12) = 8 - t0. We need to process this many bytes to align.
22+
SUB $1, R12 // R12 is 1..7 here, so R12-1 is 0..6 ...
23+
XOR $7, R12 // ... and (R12-1) XOR 7 == 7-(R12-1) == 8-R12
24+
25+
AND $1, R12, R13 // bit 0 of the head count: one single byte?
26+
BEQ R13, align_2
27+
MOVB (R5), R13
28+
CRCCWBW R4, R13, R4 // fold 1 byte into the CRC
29+
ADDV $1, R5
30+
ADDV $-1, R6
31+
32+
align_2:
33+
AND $2, R12, R13 // bit 1 of the head count: one halfword?
34+
BEQ R13, align_4
35+
MOVH (R5), R13
36+
CRCCWHW R4, R13, R4 // fold 2 bytes into the CRC
37+
ADDV $2, R5
38+
ADDV $-2, R6
39+
40+
align_4:
41+
AND $4, R12, R13 // bit 2 of the head count: one word?
42+
BEQ R13, aligned
43+
MOVW (R5), R13
44+
CRCCWWW R4, R13, R4 // fold 4 bytes into the CRC
45+
ADDV $4, R5
46+
ADDV $-4, R6
47+
48+
aligned:
49+
// The input is now 8-byte aligned and we can process 8-byte chunks.
50+
SGT $8, R6, R12 // R12 != 0 when fewer than 8 bytes remain
51+
BNE R12, less_than_8 // leave the loop and finish in the tail
52+
MOVV (R5), R13
53+
CRCCWVW R4, R13, R4 // fold 8 bytes into the CRC
54+
ADDV $8, R5
55+
ADDV $-8, R6
56+
JMP aligned
57+
58+
less_than_8:
59+
// We may have some bytes left over; process 4 bytes, then 2, then 1.
// R6 < 8 here. When this label is reached from the initial length check,
// the head was skipped, so R5 has not been aligned.
60+
AND $4, R6, R12 // bit 2 of the remaining count
61+
BEQ R12, less_than_4
62+
MOVW (R5), R13
63+
CRCCWWW R4, R13, R4 // fold 4 bytes into the CRC
64+
ADDV $4, R5
65+
ADDV $-4, R6
66+
67+
less_than_4:
68+
AND $2, R6, R12 // bit 1 of the remaining count
69+
BEQ R12, less_than_2
70+
MOVH (R5), R13
71+
CRCCWHW R4, R13, R4 // fold 2 bytes into the CRC
72+
ADDV $2, R5
73+
ADDV $-2, R6
74+
75+
less_than_2:
76+
BEQ R6, done // R6 is 0 or 1 here; nothing left if 0
77+
MOVB (R5), R13
78+
CRCCWBW R4, R13, R4 // fold the final byte into the CRC
79+
80+
done:
81+
MOVW R4, ret+32(FP) // return the updated (still non-inverted) CRC
82+
RET
83+
84+
// ieeeUpdate updates the non-inverted crc with the given data, using the
// CRCW* instructions for every step.
85+
86+
// func ieeeUpdate(crc uint32, p []byte) uint32
//
// Register use in this routine:
//   R4  = running CRC (loaded from crc, stored to ret at the end)
//   R5  = address of the next unprocessed byte of p
//   R6  = number of bytes still to process
//   R12 = scratch: comparison results and the head byte count
//   R13 = scratch: bit-test results and loaded data
//
// Flow: inputs shorter than 8 bytes go straight to the tail. Otherwise a
// 1/2/4-byte head advances R5 to an 8-byte boundary, the main loop consumes
// 8 bytes per iteration, and the tail consumes the remaining 4/2/1 bytes.
87+
TEXT ·ieeeUpdate(SB),NOSPLIT,$0-36
88+
MOVWU crc+0(FP), R4 // a0 = CRC value
89+
MOVV p+8(FP), R5 // a1 = data pointer
90+
MOVV p_len+16(FP), R6 // a2 = len(p)
91+
92+
SGT $8, R6, R12 // R12 != 0 when len(p) < 8
93+
BNE R12, less_than_8 // too short to align first; tail only
94+
AND $7, R5, R12 // R12 = offset of R5 within its 8-byte word
95+
BEQ R12, aligned // already 8-byte aligned, skip the head
96+
97+
// Process the first few bytes to 8-byte align the input.
98+
// t0 (R12) = 8 - t0. We need to process this many bytes to align.
99+
SUB $1, R12 // R12 is 1..7 here, so R12-1 is 0..6 ...
100+
XOR $7, R12 // ... and (R12-1) XOR 7 == 7-(R12-1) == 8-R12
101+
102+
AND $1, R12, R13 // bit 0 of the head count: one single byte?
103+
BEQ R13, align_2
104+
MOVB (R5), R13
105+
CRCWBW R4, R13, R4 // fold 1 byte into the CRC
106+
ADDV $1, R5
107+
ADDV $-1, R6
108+
109+
align_2:
110+
AND $2, R12, R13 // bit 1 of the head count: one halfword?
111+
BEQ R13, align_4
112+
MOVH (R5), R13
113+
CRCWHW R4, R13, R4 // fold 2 bytes into the CRC
114+
ADDV $2, R5
115+
ADDV $-2, R6
116+
117+
align_4:
118+
AND $4, R12, R13 // bit 2 of the head count: one word?
119+
BEQ R13, aligned
120+
MOVW (R5), R13
121+
CRCWWW R4, R13, R4 // fold 4 bytes into the CRC
122+
ADDV $4, R5
123+
ADDV $-4, R6
124+
125+
aligned:
126+
// The input is now 8-byte aligned and we can process 8-byte chunks.
127+
SGT $8, R6, R12 // R12 != 0 when fewer than 8 bytes remain
128+
BNE R12, less_than_8 // leave the loop and finish in the tail
129+
MOVV (R5), R13
130+
CRCWVW R4, R13, R4 // fold 8 bytes into the CRC
131+
ADDV $8, R5
132+
ADDV $-8, R6
133+
JMP aligned
134+
135+
less_than_8:
136+
// We may have some bytes left over; process 4 bytes, then 2, then 1.
// R6 < 8 here. When this label is reached from the initial length check,
// the head was skipped, so R5 has not been aligned.
137+
AND $4, R6, R12 // bit 2 of the remaining count
138+
BEQ R12, less_than_4
139+
MOVW (R5), R13
140+
CRCWWW R4, R13, R4 // fold 4 bytes into the CRC
141+
ADDV $4, R5
142+
ADDV $-4, R6
143+
144+
less_than_4:
145+
AND $2, R6, R12 // bit 1 of the remaining count
146+
BEQ R12, less_than_2
147+
MOVH (R5), R13
148+
CRCWHW R4, R13, R4 // fold 2 bytes into the CRC
149+
ADDV $2, R5
150+
ADDV $-2, R6
151+
152+
less_than_2:
153+
BEQ R6, done // R6 is 0 or 1 here; nothing left if 0
154+
MOVB (R5), R13
155+
CRCWBW R4, R13, R4 // fold the final byte into the CRC
156+
157+
done:
158+
MOVW R4, ret+32(FP) // return the updated (still non-inverted) CRC
159+
RET
160+

src/hash/crc32/crc32_otherarch.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5-
//go:build !amd64 && !s390x && !ppc64le && !arm64
5+
//go:build !amd64 && !s390x && !ppc64le && !arm64 && !loong64
66

77
package crc32
88

0 commit comments

Comments
 (0)