Skip to content

Commit a7483d1

Browse files
authored
Merge pull request #5074 from tingboliao/develop
Optimize the gemm_tcopy_8_rvv to be compatible with the vlens 128 and 256.
2 parents eba7338 + ef7f54b commit a7483d1

File tree

1 file changed

+25
-189
lines changed

1 file changed

+25
-189
lines changed

kernel/riscv64/gemm_tcopy_8_rvv.c

Lines changed: 25 additions & 189 deletions
Original file line numberDiff line numberDiff line change
@@ -28,35 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2828
#include "common.h"
2929

3030
#if !defined(DOUBLE)
31-
#define VSETVL(n) __riscv_vsetvl_e32m1(n)
32-
#define FLOAT_V_T vfloat32m1_t
33-
#define FLOAT_VX2_T vfloat32m1x2_t
34-
#define FLOAT_VX4_T vfloat32m1x4_t
35-
#define FLOAT_VX8_T vfloat32m1x8_t
36-
#define VLEV_FLOAT __riscv_vle32_v_f32m1
37-
#define VLSEV_FLOAT __riscv_vlse32_v_f32m1
38-
#define VSEV_FLOAT __riscv_vse32_v_f32m1
39-
#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1x2
40-
#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2
41-
#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1x4
42-
#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4
43-
#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1x8
44-
#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8
31+
#define FLOAT_V_T vfloat32m2_t
32+
#define FLOAT_V_T_HALF vfloat32m1_t
33+
#define VLEV_FLOAT __riscv_vle32_v_f32m2
34+
#define VLEV_FLOAT_HALF __riscv_vle32_v_f32m1
35+
#define VSEV_FLOAT __riscv_vse32_v_f32m2
36+
#define VSEV_FLOAT_HALF __riscv_vse32_v_f32m1
4537
#else
46-
#define VSETVL(n) __riscv_vsetvl_e64m1(n)
47-
#define FLOAT_V_T vfloat64m1_t
48-
#define FLOAT_VX2_T vfloat64m1x2_t
49-
#define FLOAT_VX4_T vfloat64m1x4_t
50-
#define FLOAT_VX8_T vfloat64m1x8_t
51-
#define VLEV_FLOAT __riscv_vle64_v_f64m1
52-
#define VLSEV_FLOAT __riscv_vlse64_v_f64m1
53-
#define VSEV_FLOAT __riscv_vse64_v_f64m1
54-
#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1x2
55-
#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2
56-
#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1x4
57-
#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4
58-
#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1x8
59-
#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8
38+
#define FLOAT_V_T vfloat64m4_t
39+
#define FLOAT_V_T_HALF vfloat64m2_t
40+
#define VLEV_FLOAT __riscv_vle64_v_f64m4
41+
#define VLEV_FLOAT_HALF __riscv_vle64_v_f64m2
42+
#define VSEV_FLOAT __riscv_vse64_v_f64m4
43+
#define VSEV_FLOAT_HALF __riscv_vse64_v_f64m2
6044
#endif
6145

6246
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
@@ -69,9 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
6953
IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4;
7054

7155
FLOAT_V_T v0;
72-
FLOAT_VX2_T vx2;
73-
FLOAT_VX4_T vx4;
74-
FLOAT_VX8_T vx8;
56+
FLOAT_V_T_HALF v1;
7557

7658
// fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda);
7759

@@ -81,156 +63,12 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
8163
boffset3 = b + m * (n & ~3);
8264
boffset4 = b + m * (n & ~1);
8365

84-
for(j = (m >> 3); j > 0; j--) {
85-
86-
aoffset1 = aoffset;
87-
aoffset += 8 * lda;
88-
89-
boffset1 = boffset;
90-
boffset += 64;
91-
92-
for(i = (n >> 3); i > 0; i--) {
93-
size_t vl = 8;
94-
95-
vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
96-
VSSEG8_FLOAT(boffset1, vx8, vl);
97-
98-
aoffset1 += 8;
99-
boffset1 += m * 8;
100-
}
101-
102-
if (n & 4) {
103-
size_t vl = 8;
104-
105-
vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
106-
VSSEG4_FLOAT(boffset2, vx4, vl);
107-
108-
aoffset1 += 4;
109-
boffset2 += 32;
110-
}
111-
112-
if (n & 2) {
113-
size_t vl = 8;
114-
115-
vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
116-
VSSEG2_FLOAT(boffset3, vx2, vl);
117-
118-
aoffset1 += 2;
119-
boffset3 += 16;
120-
}
121-
122-
if (n & 1) {
123-
size_t vl = 8;
124-
125-
v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
126-
VSEV_FLOAT(boffset4, v0, vl);
127-
128-
aoffset1 += 1;
129-
boffset4 += 8;
130-
}
131-
132-
}
133-
134-
if (m & 4) {
135-
136-
aoffset1 = aoffset;
137-
aoffset += 4 * lda;
138-
139-
boffset1 = boffset;
140-
boffset += 32;
141-
142-
for(i = (n >> 3); i > 0; i--) {
143-
size_t vl = 4;
144-
145-
vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
146-
VSSEG8_FLOAT(boffset1, vx8, vl);
147-
148-
aoffset1 += 8;
149-
boffset1 += m * 8;
150-
}
151-
152-
if (n & 4) {
153-
size_t vl = 4;
154-
155-
vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
156-
VSSEG4_FLOAT(boffset2, vx4, vl);
157-
158-
aoffset1 += 4;
159-
boffset2 += 16;
160-
}
161-
162-
if (n & 2) {
163-
size_t vl = 4;
164-
165-
vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
166-
VSSEG2_FLOAT(boffset3, vx2, vl);
167-
168-
aoffset1 += 2;
169-
boffset3 += 8;
170-
}
171-
172-
if (n & 1) {
173-
size_t vl = 4;
174-
175-
v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
176-
VSEV_FLOAT(boffset4, v0, vl);
177-
178-
aoffset1 += 1;
179-
boffset4 += 4;
180-
}
181-
}
182-
183-
if (m & 2) {
66+
for(j = m; j > 0; j--) {
18467
aoffset1 = aoffset;
185-
aoffset += 2 * lda;
186-
18768
boffset1 = boffset;
188-
boffset += 16;
189-
190-
for(i = (n >> 3); i > 0; i--) {
191-
size_t vl = 2;
19269

193-
vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
194-
VSSEG8_FLOAT(boffset1, vx8, vl);
195-
196-
aoffset1 += 8;
197-
boffset1 += m * 8;
198-
}
199-
200-
if (n & 4) {
201-
size_t vl = 2;
202-
203-
vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
204-
VSSEG4_FLOAT(boffset2, vx4, vl);
205-
206-
aoffset1 += 4;
207-
boffset2 += 8;
208-
}
209-
210-
if (n & 2) {
211-
size_t vl = 2;
212-
213-
vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
214-
VSSEG2_FLOAT(boffset3, vx2, vl);
215-
216-
aoffset1 += 2;
217-
boffset3 += 4;
218-
}
219-
220-
if (n & 1) {
221-
size_t vl = 2;
222-
223-
v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
224-
VSEV_FLOAT(boffset4, v0, vl);
225-
226-
aoffset1 += 1;
227-
boffset4 += 2;
228-
}
229-
}
230-
231-
if (m & 1) {
232-
aoffset1 = aoffset;
233-
boffset1 = boffset;
70+
aoffset += lda;
71+
boffset += 8;
23472

23573
for(i = (n >> 3); i > 0; i--) {
23674
size_t vl = 8;
@@ -245,27 +83,25 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
24583
if (n & 4) {
24684
size_t vl = 4;
24785

248-
v0 = VLEV_FLOAT(aoffset1, vl);
249-
VSEV_FLOAT(boffset2, v0, vl);
86+
v1 = VLEV_FLOAT_HALF(aoffset1, vl);
87+
VSEV_FLOAT_HALF(boffset2, v1, vl);
25088

25189
aoffset1 += 4;
252-
//boffset2 += 4;
90+
boffset2 += 4;
25391
}
25492

25593
if (n & 2) {
256-
size_t vl = 2;
257-
258-
v0 = VLEV_FLOAT(aoffset1, vl);
259-
VSEV_FLOAT(boffset3, v0, vl);
94+
*(boffset3) = *(aoffset1);
95+
*(boffset3 + 1) = *(aoffset1 + 1);
26096

26197
aoffset1 += 2;
262-
// boffset3 += 2;
98+
boffset3 += 2;
26399
}
264100

265101
if (n & 1) {
266-
*(boffset4) = *(aoffset1);
267-
// aoffset1 ++;
268-
// boffset4 ++;
102+
*(boffset4) = *(aoffset1);
103+
aoffset1 ++;
104+
boffset4 ++;
269105
}
270106
}
271107

0 commit comments

Comments
 (0)