@@ -26,20 +26,6 @@ ifndef UNAME_M
26
26
UNAME_M := $(shell uname -m)
27
27
endif
28
28
29
# Host-compiler detection block deleted by this change: it classified $(CC) as GCC or
# Clang (LLVM vs. Apple) and packed the version into CC_VER as zero-padded MMmmpp digits.
# NOTE(review): presumably replaced by scripts/get-flags.mk, included further down - confirm.
- ifeq '' '$(findstring clang,$(shell $(CC) --version))'
30
- CC_IS_GCC=1
31
- CC_VER := $(shell $(CC) -dumpfullversion -dumpversion | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
32
- else
33
- CC_IS_CLANG=1
34
- ifeq '' '$(findstring Apple,$(shell $(CC) --version))'
35
- CC_IS_LLVM_CLANG=1
36
- else
37
- CC_IS_APPLE_CLANG=1
38
- endif
39
- CC_VER := $(shell $(CC) --version | sed -n 's/^.* version \([0-9.]*\).*$$/\1/p' \
40
- | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
41
- endif
42
-
43
29
# Mac OS + Arm can report x86_64
44
30
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
45
31
ifeq ($(UNAME_S),Darwin)
@@ -121,12 +107,12 @@ MK_CXXFLAGS = -std=c++11 -fPIC
121
107
122
108
# -Ofast tends to produce faster code, but may not be available for some compilers.
# After this change:
#   LLAMA_FAST set:   MK_CFLAGS and HOST_CXXFLAGS get -Ofast, MK_NVCCFLAGS gets -O3.
#   LLAMA_FAST unset: MK_CFLAGS and MK_CXXFLAGS both get -O3.
123
109
ifdef LLAMA_FAST
124
- MK_CFLAGS += -Ofast
125
- MK_HOST_CXXFLAGS += -Ofast
126
- MK_CUDA_CXXFLAGS += -O3
110
+ MK_CFLAGS += -Ofast
111
+ HOST_CXXFLAGS += -Ofast
112
+ MK_NVCCFLAGS += -O3
127
113
else
128
- MK_CFLAGS += -O3
129
- MK_CXXFLAGS += -O3
114
+ MK_CFLAGS += -O3
115
+ MK_CXXFLAGS += -O3
130
116
endif
131
117
132
118
# clock_gettime came in POSIX.1b (1993)
@@ -220,30 +206,6 @@ MK_CFLAGS += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmis
220
206
-Werror=implicit-function-declaration
221
207
# C++ warnings: the WARN_FLAGS set plus two C++-specific diagnostics.
MK_CXXFLAGS += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn
222
208
223
# Compiler-specific warning flags deleted by this change: a Clang branch (with
# version-gated -Wdouble-promotion) followed by a GCC branch (version-gated
# -Wno-format-truncation and -Wextra-semi). Versions are compared via CC_VER.
- ifeq ($(CC_IS_CLANG), 1)
224
- # clang options
225
- MK_CFLAGS += -Wunreachable-code-break -Wunreachable-code-return
226
- MK_HOST_CXXFLAGS += -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi
227
-
228
- ifneq '' '$(and $(CC_IS_LLVM_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 030800)))'
229
- MK_CFLAGS += -Wdouble-promotion
230
- endif
231
- ifneq '' '$(and $(CC_IS_APPLE_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 070300)))'
232
- MK_CFLAGS += -Wdouble-promotion
233
- endif
234
- else
235
- # gcc options
236
- MK_CFLAGS += -Wdouble-promotion
237
- MK_HOST_CXXFLAGS += -Wno-array-bounds
238
-
239
- ifeq ($(shell expr $(CC_VER) \>= 070100), 1)
240
- MK_HOST_CXXFLAGS += -Wno-format-truncation
241
- endif
242
- ifeq ($(shell expr $(CC_VER) \>= 080100), 1)
243
- MK_HOST_CXXFLAGS += -Wextra-semi
244
- endif
245
- endif
246
-
247
209
# this version of Apple ld64 is buggy
# The linker is asked for its version banner through $(CC) -Wl,-v (stderr merged into
# stdout); the define is added only when that output mentions dyld-1015.7.
248
210
ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))'
249
211
MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER
@@ -294,8 +256,8 @@ ifndef RISCV
294
256
295
257
# Taken when UNAME_M is one of x86_64, i686 or amd64: $(filter) returns UNAME_M
# unchanged only if it matches one of the listed names.
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
296
258
# Use all CPU extensions that are available:
297
- MK_CFLAGS += -march=native -mtune=native
298
- MK_HOST_CXXFLAGS += -march=native -mtune=native
259
+ MK_CFLAGS += -march=native -mtune=native
260
+ HOST_CXXFLAGS += -march=native -mtune=native
299
261
300
262
# Usage AVX-only
301
263
# MK_CFLAGS += -mfma -mf16c -mavx
@@ -398,10 +360,10 @@ ifdef LLAMA_CUBLAS
398
360
# cuBLAS build: define GGML_USE_CUBLAS, add the CUDA include and library search paths
# (including the ones under $(CUDA_PATH)), link the CUDA libraries, and add
# ggml-cuda.o to OBJS.
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
399
361
MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
400
362
OBJS += ggml-cuda.o
# Base nvcc flags (now collected in MK_NVCCFLAGS instead of NVCCFLAGS);
# -lineinfo is appended only when LLAMA_DEBUG is set.
401
- NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
363
+ MK_NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
402
364
403
365
ifdef LLAMA_DEBUG
404
- NVCCFLAGS += -lineinfo
366
+ MK_NVCCFLAGS += -lineinfo
405
367
endif
406
368
407
369
ifdef LLAMA_CUDA_NVCC
@@ -410,54 +372,52 @@ else
410
372
NVCC = nvcc
411
373
endif # LLAMA_CUDA_NVCC
412
374
# GPU architecture selection for nvcc:
#   CUDA_DOCKER_ARCH set  -> -arch=$(CUDA_DOCKER_ARCH), deprecated-target warning silenced
#   CUDA_POWER_ARCH set   -> no -arch flag is added
#   neither               -> -arch=native
# The new form folds the former empty "NVCCFLAGS +=" branch into "else ifndef".
ifdef CUDA_DOCKER_ARCH
413
- NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
414
- else ifdef CUDA_POWER_ARCH
415
- NVCCFLAGS +=
416
- else
417
- NVCCFLAGS += -arch=native
375
+ MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
376
+ else ifndef CUDA_POWER_ARCH
377
+ MK_NVCCFLAGS += -arch=native
418
378
endif # CUDA_DOCKER_ARCH
419
379
# Optional switches: each LLAMA_CUDA_FORCE_* make variable, when set, passes the
# matching GGML_CUDA_FORCE_* preprocessor define to nvcc.
ifdef LLAMA_CUDA_FORCE_DMMV
420
- NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
380
+ MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
421
381
endif # LLAMA_CUDA_FORCE_DMMV
422
382
ifdef LLAMA_CUDA_FORCE_MMQ
423
- NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
383
+ MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
424
384
endif # LLAMA_CUDA_FORCE_MMQ
425
385
# GGML_CUDA_DMMV_X: taken from LLAMA_CUDA_DMMV_X when set, otherwise 32.
ifdef LLAMA_CUDA_DMMV_X
426
- NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
386
+ MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
427
387
else
428
- NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
388
+ MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
429
389
endif # LLAMA_CUDA_DMMV_X
430
390
# GGML_CUDA_MMV_Y: LLAMA_CUDA_MMV_Y wins, then the older LLAMA_CUDA_DMMV_Y name,
# otherwise 1.
ifdef LLAMA_CUDA_MMV_Y
431
- NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
391
+ MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
432
392
else ifdef LLAMA_CUDA_DMMV_Y
433
- NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility
393
+ MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility
434
394
else
435
- NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
395
+ MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
436
396
endif # LLAMA_CUDA_MMV_Y
437
397
# Either LLAMA_CUDA_F16 or LLAMA_CUDA_DMMV_F16 adds -DGGML_CUDA_F16; with both set
# the define is simply appended twice.
ifdef LLAMA_CUDA_F16
438
- NVCCFLAGS += -DGGML_CUDA_F16
398
+ MK_NVCCFLAGS += -DGGML_CUDA_F16
439
399
endif # LLAMA_CUDA_F16
440
400
ifdef LLAMA_CUDA_DMMV_F16
441
- NVCCFLAGS += -DGGML_CUDA_F16
401
+ MK_NVCCFLAGS += -DGGML_CUDA_F16
442
402
endif # LLAMA_CUDA_DMMV_F16
443
403
# K_QUANTS_PER_ITERATION: taken from LLAMA_CUDA_KQUANTS_ITER when set, otherwise 2.
ifdef LLAMA_CUDA_KQUANTS_ITER
444
- NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
404
+ MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
445
405
else
446
- NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
406
+ MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
447
407
endif
448
408
# GGML_CUDA_PEER_MAX_BATCH_SIZE: taken from LLAMA_CUDA_PEER_MAX_BATCH_SIZE when set,
# otherwise 128.
ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
449
- NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE)
409
+ MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE)
450
410
else
451
- NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
411
+ MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
452
412
endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
453
413
# ifdef LLAMA_CUDA_CUBLAS
454
- # NVCCFLAGS += -DGGML_CUDA_CUBLAS
414
+ # MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS
455
415
# endif # LLAMA_CUDA_CUBLAS
456
416
# When LLAMA_CUDA_CCBIN is set, its value is handed to nvcc through -ccbin.
ifdef LLAMA_CUDA_CCBIN
457
- NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
417
+ MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
458
418
endif
459
419
# Compile the CUDA source with nvcc. After this change the recipe assembles the command
# itself: BASE_CXXFLAGS and NVCCFLAGS go to nvcc directly, -Wno-pedantic is added, and
# CUDA_CXXFLAGS is passed as one quoted -Xcompiler argument.
# Variable references must not contain blanks: "$( NVCCFLAGS )" names a different,
# undefined variable and would silently expand to nothing.
ggml-cuda.o : ggml-cuda.cu ggml-cuda.h
460
- $(NVCC) $(NVCCFLAGS) -c $< -o $@
420
+ $(NVCC) $(BASE_CXXFLAGS) $(NVCCFLAGS) -Wno-pedantic -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
461
421
endif # LLAMA_CUBLAS
462
422
463
423
ifdef LLAMA_CLBLAST
@@ -519,16 +479,22 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
519
479
$(CC) $(CFLAGS) -c $< -o $@
520
480
endif # LLAMA_MPI
521
481
482
# Hand the C compiler to scripts/get-flags.mk through GF_CC.
# NOTE(review): the GF_CFLAGS / GF_CXXFLAGS values used when the flags are combined
# below are presumably set by this include - confirm against the script.
+ GF_CC := $(CC)
483
+ include scripts/get-flags.mk
484
+
522
485
# combine build flags with cmdline overrides
# `override` is needed so these assignments still take effect when the same variable
# was given on the make command line; the user's value is kept at the end of each list.
# BASE_CXXFLAGS is captured before HOST_CXXFLAGS and GF_CXXFLAGS are appended, and is
# what the ggml-cuda.o recipe passes to nvcc.
523
- override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
524
- override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
525
- override CUDA_CXXFLAGS := $(MK_CUDA_CXXFLAGS) $(CUDA_CXXFLAGS)
526
- override HOST_CXXFLAGS := $(MK_HOST_CXXFLAGS) $(HOST_CXXFLAGS)
527
- override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
528
-
529
- # save CXXFLAGS before we add host-only options
530
- NVCCFLAGS := $(NVCCFLAGS) $(CXXFLAGS) $(CUDA_CXXFLAGS) -Wno-pedantic -Xcompiler "$(HOST_CXXFLAGS)"
531
- override CXXFLAGS += $(HOST_CXXFLAGS)
486
+ override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
487
+ BASE_CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
488
+ override CXXFLAGS := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS)
489
+ override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
490
+ override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
491
+
492
+ # identify CUDA host compiler
# For cuBLAS builds, scripts/get-flags.mk is included a second time with GF_CC set to
# an nvcc command prefix, and the resulting GF_CXXFLAGS become CUDA_CXXFLAGS.
# NOTE(review): GF_CC is a command fragment here, not a compiler path. The trailing
# `-Xcompiler` presumably makes nvcc forward whatever option get-flags.mk appends to
# its host compiler, with `2>/dev/null .c` silencing nvcc and supplying a dummy input -
# confirm against the script.
493
+ ifdef LLAMA_CUBLAS
494
+ GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
495
+ include scripts/get-flags.mk
496
+ CUDA_CXXFLAGS := $(GF_CXXFLAGS)
497
+ endif
532
498
533
499
#
534
500
# Print build information
0 commit comments