@@ -28,35 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"

 #if !defined(DOUBLE)
-#define VSETVL(n) __riscv_vsetvl_e32m1(n)
-#define FLOAT_V_T vfloat32m1_t
-#define FLOAT_VX2_T vfloat32m1x2_t
-#define FLOAT_VX4_T vfloat32m1x4_t
-#define FLOAT_VX8_T vfloat32m1x8_t
-#define VLEV_FLOAT __riscv_vle32_v_f32m1
-#define VLSEV_FLOAT __riscv_vlse32_v_f32m1
-#define VSEV_FLOAT __riscv_vse32_v_f32m1
-#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1x2
-#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2
-#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1x4
-#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4
-#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1x8
-#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8
+#define FLOAT_V_T vfloat32m2_t
+#define FLOAT_V_T_HALF vfloat32m1_t
+#define VLEV_FLOAT __riscv_vle32_v_f32m2
+#define VLEV_FLOAT_HALF __riscv_vle32_v_f32m1
+#define VSEV_FLOAT __riscv_vse32_v_f32m2
+#define VSEV_FLOAT_HALF __riscv_vse32_v_f32m1
 #else
-#define VSETVL(n) __riscv_vsetvl_e64m1(n)
-#define FLOAT_V_T vfloat64m1_t
-#define FLOAT_VX2_T vfloat64m1x2_t
-#define FLOAT_VX4_T vfloat64m1x4_t
-#define FLOAT_VX8_T vfloat64m1x8_t
-#define VLEV_FLOAT __riscv_vle64_v_f64m1
-#define VLSEV_FLOAT __riscv_vlse64_v_f64m1
-#define VSEV_FLOAT __riscv_vse64_v_f64m1
-#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1x2
-#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2
-#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1x4
-#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4
-#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1x8
-#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8
+#define FLOAT_V_T vfloat64m4_t
+#define FLOAT_V_T_HALF vfloat64m2_t
+#define VLEV_FLOAT __riscv_vle64_v_f64m4
+#define VLEV_FLOAT_HALF __riscv_vle64_v_f64m2
+#define VSEV_FLOAT __riscv_vse64_v_f64m4
+#define VSEV_FLOAT_HALF __riscv_vse64_v_f64m2
 #endif

 int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
@@ -69,9 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
     IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4;

     FLOAT_V_T v0;
-    FLOAT_VX2_T vx2;
-    FLOAT_VX4_T vx4;
-    FLOAT_VX8_T vx8;
+    FLOAT_V_T_HALF v1;

     // fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda);

@@ -81,156 +63,12 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
     boffset3 = b + m * (n & ~3);
     boffset4 = b + m * (n & ~1);

-    for (j = (m >> 3); j > 0; j--) {
-
-        aoffset1 = aoffset;
-        aoffset += 8 * lda;
-
-        boffset1 = boffset;
-        boffset += 64;
-
-        for (i = (n >> 3); i > 0; i--) {
-            size_t vl = 8;
-
-            vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
-            VSSEG8_FLOAT(boffset1, vx8, vl);
-
-            aoffset1 += 8;
-            boffset1 += m * 8;
-        }
-
-        if (n & 4) {
-            size_t vl = 8;
-
-            vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
-            VSSEG4_FLOAT(boffset2, vx4, vl);
-
-            aoffset1 += 4;
-            boffset2 += 32;
-        }
-
-        if (n & 2) {
-            size_t vl = 8;
-
-            vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
-            VSSEG2_FLOAT(boffset3, vx2, vl);
-
-            aoffset1 += 2;
-            boffset3 += 16;
-        }
-
-        if (n & 1) {
-            size_t vl = 8;
-
-            v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
-            VSEV_FLOAT(boffset4, v0, vl);
-
-            aoffset1 += 1;
-            boffset4 += 8;
-        }
-
-    }
-
-    if (m & 4) {
-
-        aoffset1 = aoffset;
-        aoffset += 4 * lda;
-
-        boffset1 = boffset;
-        boffset += 32;
-
-        for (i = (n >> 3); i > 0; i--) {
-            size_t vl = 4;
-
-            vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
-            VSSEG8_FLOAT(boffset1, vx8, vl);
-
-            aoffset1 += 8;
-            boffset1 += m * 8;
-        }
-
-        if (n & 4) {
-            size_t vl = 4;
-
-            vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
-            VSSEG4_FLOAT(boffset2, vx4, vl);
-
-            aoffset1 += 4;
-            boffset2 += 16;
-        }
-
-        if (n & 2) {
-            size_t vl = 4;
-
-            vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
-            VSSEG2_FLOAT(boffset3, vx2, vl);
-
-            aoffset1 += 2;
-            boffset3 += 8;
-        }
-
-        if (n & 1) {
-            size_t vl = 4;
-
-            v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
-            VSEV_FLOAT(boffset4, v0, vl);
-
-            aoffset1 += 1;
-            boffset4 += 4;
-        }
-    }
-
-    if (m & 2) {
+    for (j = m; j > 0; j--) {
         aoffset1 = aoffset;
-        aoffset += 2 * lda;
-
         boffset1 = boffset;
-        boffset += 16;
-
-        for (i = (n >> 3); i > 0; i--) {
-            size_t vl = 2;

-            vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
-            VSSEG8_FLOAT(boffset1, vx8, vl);
-
-            aoffset1 += 8;
-            boffset1 += m * 8;
-        }
-
-        if (n & 4) {
-            size_t vl = 2;
-
-            vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
-            VSSEG4_FLOAT(boffset2, vx4, vl);
-
-            aoffset1 += 4;
-            boffset2 += 8;
-        }
-
-        if (n & 2) {
-            size_t vl = 2;
-
-            vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
-            VSSEG2_FLOAT(boffset3, vx2, vl);
-
-            aoffset1 += 2;
-            boffset3 += 4;
-        }
-
-        if (n & 1) {
-            size_t vl = 2;
-
-            v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
-            VSEV_FLOAT(boffset4, v0, vl);
-
-            aoffset1 += 1;
-            boffset4 += 2;
-        }
-    }
-
-    if (m & 1) {
-        aoffset1 = aoffset;
-        boffset1 = boffset;
+        aoffset += lda;
+        boffset += 8;

         for (i = (n >> 3); i > 0; i--) {
             size_t vl = 8;
@@ -245,27 +83,25 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
         if (n & 4) {
             size_t vl = 4;

-            v0 = VLEV_FLOAT(aoffset1, vl);
-            VSEV_FLOAT(boffset2, v0, vl);
+            v1 = VLEV_FLOAT_HALF(aoffset1, vl);
+            VSEV_FLOAT_HALF(boffset2, v1, vl);

             aoffset1 += 4;
-            // boffset2 += 4;
+            boffset2 += 4;
         }

         if (n & 2) {
-            size_t vl = 2;
-
-            v0 = VLEV_FLOAT(aoffset1, vl);
-            VSEV_FLOAT(boffset3, v0, vl);
+            *(boffset3) = *(aoffset1);
+            *(boffset3 + 1) = *(aoffset1 + 1);

             aoffset1 += 2;
-            // boffset3 += 2;
+            boffset3 += 2;
         }

         if (n & 1) {
-            *(boffset4) = *(aoffset1);
-            // aoffset1 ++;
-            // boffset4 ++;
+            *(boffset4) = *(aoffset1);
+            aoffset1++;
+            boffset4++;
         }
     }

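For reference, a minimal standalone sketch of the unit-stride copy that the new single-precision macros name, in place of the strided segment loads/stores removed above. This is not part of the kernel: the helper name copy_row8 and its arguments are hypothetical, and the sketch assumes a toolchain providing the RVV v1.0 intrinsics (<riscv_vector.h>) with VLEN >= 128, so vl = 8 is valid for an e32m2 register group.

#include <riscv_vector.h>

/* Hypothetical helper, for illustration only: pack one 8-column row into the
 * output buffer using the intrinsics the new VLEV_FLOAT / VSEV_FLOAT macros
 * refer to (vfloat32m2_t, __riscv_vle32_v_f32m2, __riscv_vse32_v_f32m2).
 * Assumes VLEN >= 128, so vl = 8 fits one e32m2 register group. */
void copy_row8(const float *src, float *dst)
{
    size_t vl = 8;                                    /* one 8-wide panel row */
    vfloat32m2_t v = __riscv_vle32_v_f32m2(src, vl);  /* unit-stride load     */
    __riscv_vse32_v_f32m2(dst, v, vl);                /* unit-stride store    */
}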