Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
2d3481c
Fix msys2 build error and warnings (#1009)
na-na-hi Apr 16, 2023
489537e
examples: add missing <ctime> include for time() (#1011)
prusnak Apr 16, 2023
3173a62
stdout : vertical align outputs for better readability
ggerganov Apr 16, 2023
47f61aa
Fix: do not close file on mmap (#1017)
slaren Apr 16, 2023
f266259
Speedup the AVX-512 implementation of ggml_vec_dot_q4_0() (#933)
dfyz Apr 17, 2023
69b7402
ggml : avoid using ggml_fp16_to_fp32() and ggml_fp32_to_fp16() in ggml.c
ggerganov Apr 17, 2023
eb17a02
quantize-stats : fix bug in --type argument
ggerganov Apr 17, 2023
efd0564
llama : well-defined static initialization of complex objects (#927)
arikpoz Apr 17, 2023
315a95a
Add LoRA support (#820)
slaren Apr 17, 2023
4ad7313
add 4_0 to default outfile namestr dict (#1031)
cammytown Apr 17, 2023
e9298af
readme : add Ruby bindings (#1029)
yoshoku Apr 17, 2023
4274722
Do not close file after mmap (Windows version) (#1034)
dfyz Apr 18, 2023
5af8e32
ci : do not run on drafts
ggerganov Apr 17, 2023
7faa746
readme : update hot topics about new LoRA functionality
ggerganov Apr 18, 2023
5ecff35
Adding a simple program to measure speed of dot products (#1041)
ikawrakow Apr 18, 2023
dcdd65e
ggml : optimize ggml_vec_dot_q4_0_q8_0() using vectorized accumulators
ggerganov Apr 18, 2023
4caebf6
gitignore : vdot
ggerganov Apr 18, 2023
50a8a2a
ggml : scratch that - vmlaq_n_f32 is always better
ggerganov Apr 18, 2023
77a7340
ggml : add new Q4_2 quantization (ARM only) (#1046)
ggerganov Apr 18, 2023
6667401
Multi-threaded ggml_cpy (#1035)
slaren Apr 18, 2023
8944a13
Add NVIDIA cuBLAS support (#1044)
slaren Apr 19, 2023
f3d4edf
ggml : Q4 cleanup - remove 4-bit dot product code (#1061)
sw Apr 19, 2023
7cd5c4a
readme : add warning about Q4_2 and Q4_3
ggerganov Apr 19, 2023
884e7d7
ggml : use 8-bit precision for Q4_1 intermediate results (#1047)
ggerganov Apr 19, 2023
f7d0509
Q4_2 quantization with rmse-optimized scale and quants (#1062)
ikawrakow Apr 19, 2023
834695f
Minor: Readme fixed grammar, spelling, and misc updates (#1071)
CRD716 Apr 19, 2023
02d6988
Improve cuBLAS performance by dequantizing on the GPU (#1065)
slaren Apr 20, 2023
c8c2c52
AVX2 optimization for vec_dot_q4_2_q8_0 (#1068)
sw Apr 20, 2023
5addcb1
fix: LLAMA_CUBLAS=1 undefined reference 'shm_open' (#1080)
fumiama Apr 20, 2023
6a9661e
ci : remove the LLAMA_ACCELERATE matrix dimension from Ubuntu builds …
dfyz Apr 20, 2023
e0305ea
ggml : add Q4_3 quantization (#1082)
ggerganov Apr 20, 2023
38de86a
llama : multi-threaded quantization (#1075)
ikawrakow Apr 20, 2023
66aab46
ggml : fix Q4_3 quantization
ggerganov Apr 20, 2023
8a1756a
ggml : do not break cuBLAS build (Q4_3 is not yet implemented)
ggerganov Apr 20, 2023
2005469
Add Q4_3 support to cuBLAS (#1086)
slaren Apr 20, 2023
9ff334f
ggml : fix bug in ggml_compute_forward_dup_f32()
ggerganov Apr 20, 2023
12b5900
ggml : sync ggml (add GPT-NeoX RoPE implementation)
ggerganov Apr 20, 2023
2510c18
Add ggml-model-*.bin checksums for 7B, 13B, 30B, 65B (#1088)
sw Apr 20, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ on:
required: true
type: boolean
push:
branches:
- master
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
pull_request:
types: [opened, synchronize, edited, reopened, review_requested, ready_for_review]
Expand All @@ -18,6 +20,8 @@ env:

jobs:
ubuntu-latest-make:
if: github.event.pull_request.draft == false

runs-on: ubuntu-latest

steps:
Expand All @@ -37,6 +41,8 @@ jobs:
make

ubuntu-latest-cmake:
if: github.event.pull_request.draft == false

runs-on: ubuntu-latest

steps:
Expand Down Expand Up @@ -65,6 +71,8 @@ jobs:
ctest --verbose

ubuntu-latest-cmake-sanitizer:
if: github.event.pull_request.draft == false

runs-on: ubuntu-latest

continue-on-error: true
Expand All @@ -73,7 +81,6 @@ jobs:
matrix:
sanitizer: [ADDRESS, THREAD, UNDEFINED]
build_type: [Debug, Release]
accelerate: [ON, OFF]

steps:
- name: Clone
Expand All @@ -91,7 +98,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_ACCELERATE=${{ matrix.accelerate }}
cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
cmake --build . --config ${{ matrix.build_type }}

- name: Test
Expand All @@ -101,6 +108,8 @@ jobs:
ctest --verbose

macOS-latest-make:
if: github.event.pull_request.draft == false

runs-on: macos-latest

steps:
Expand All @@ -119,6 +128,8 @@ jobs:
make

macOS-latest-cmake:
if: github.event.pull_request.draft == false

runs-on: macOS-latest

steps:
Expand Down Expand Up @@ -146,6 +157,8 @@ jobs:
ctest --verbose

windows-latest-cmake:
if: github.event.pull_request.draft == false

runs-on: windows-latest

strategy:
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ on:
jobs:
push_to_registry:
name: Push Docker image to Docker Hub
if: github.event.pull_request.draft == false

runs-on: ubuntu-latest
env:
COMMIT_SHA: ${{ github.sha }}
Expand Down
16 changes: 9 additions & 7 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
*.o
*.a
.DS_Store
.build/
.cache/
.direnv/
.envrc
.swiftpm
.venv
.vs/
.vscode/
.DS_Store

.build/
build/
build-em/
build-debug/
Expand All @@ -24,17 +28,15 @@ models/*
/perplexity
/embedding
/benchmark-q4_0-matmult
/vdot
/Pipfile

arm_neon.h
compile_commands.json

.envrc
.direnv/

.venv
__pycache__
.swiftpm

zig-out/
zig-cache/

ppl-*.txt
72 changes: 64 additions & 8 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer"
option(LLAMA_AVX "llama: enable AVX" ON)
option(LLAMA_AVX2 "llama: enable AVX2" ON)
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
option(LLAMA_FMA "llama: enable FMA" ON)
# in MSVC F16C is implied with AVX2/AVX512
if (NOT MSVC)
Expand All @@ -64,6 +66,7 @@ endif()
# 3rd party libs
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_OPENBLAS "llama: use OpenBLAS" OFF)
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)

option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
Expand Down Expand Up @@ -107,6 +110,7 @@ if (APPLE AND LLAMA_ACCELERATE)
message(WARNING "Accelerate framework not found")
endif()
endif()

if (LLAMA_OPENBLAS)
if (LLAMA_STATIC)
set(BLA_STATIC ON)
Expand Down Expand Up @@ -140,6 +144,30 @@ if (LLAMA_OPENBLAS)
endif()
endif()

# Optional NVIDIA cuBLAS backend; only configured when the LLAMA_CUBLAS option is ON.
if (LLAMA_CUBLAS)
# Raise the required CMake version for this configuration only.
# NOTE(review): presumably because find_package(CUDAToolkit) relies on the
# FindCUDAToolkit module, which needs CMake 3.17+ - confirm.
cmake_minimum_required(VERSION 3.17)

# Not REQUIRED: a missing toolkit is handled by the else() branch below.
find_package(CUDAToolkit)
if (CUDAToolkit_FOUND)
message(STATUS "cuBLAS found")

# Turn on the CUDA language so the .cu source listed below can be compiled.
enable_language(CUDA)

# Extra sources for the ggml library; the variable stays unset when cuBLAS is
# disabled or the toolkit is not found.
set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)

# Lets the sources detect at compile time that the cuBLAS path is enabled.
add_compile_definitions(GGML_USE_CUBLAS)

# Append the CUDA runtime and cuBLAS libraries, using the static variants
# for LLAMA_STATIC builds.
if (LLAMA_STATIC)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
else()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()

else()
# Only a warning: configuration continues without the cuBLAS backend.
message(WARNING "cuBLAS not found")
endif()
endif()

if (LLAMA_ALL_WARNINGS)
if (NOT MSVC)
set(c_flags
Expand All @@ -151,7 +179,6 @@ if (LLAMA_ALL_WARNINGS)
-Wshadow
-Wstrict-prototypes
-Wpointer-arith
-Wno-unused-function
)
set(cxx_flags
-Wall
Expand Down Expand Up @@ -219,11 +246,26 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
message(STATUS "x86 detected")
if (MSVC)
if (LLAMA_AVX512)
add_compile_options(/arch:AVX512)
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
# MSVC has no compile-time flags for enabling specific
# AVX512 extensions, nor does it define the
# macros corresponding to those extensions.
# Define them manually.
if (LLAMA_AVX512_VBMI)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
endif()
if (LLAMA_AVX512_VNNI)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
endif()
elseif (LLAMA_AVX2)
add_compile_options(/arch:AVX2)
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX2>)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX2>)
elseif (LLAMA_AVX)
add_compile_options(/arch:AVX)
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX>)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>)
endif()
else()
if (LLAMA_F16C)
Expand All @@ -240,9 +282,13 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
endif()
if (LLAMA_AVX512)
add_compile_options(-mavx512f)
# add_compile_options(-mavx512cd)
# add_compile_options(-mavx512dq)
# add_compile_options(-mavx512bw)
add_compile_options(-mavx512bw)
endif()
if (LLAMA_AVX512_VBMI)
add_compile_options(-mavx512vbmi)
endif()
if (LLAMA_AVX512_VNNI)
add_compile_options(-mavx512vnni)
endif()
endif()
else()
Expand All @@ -256,7 +302,8 @@ endif()

add_library(ggml OBJECT
ggml.c
ggml.h)
ggml.h
${GGML_CUDA_SOURCES})

target_include_directories(ggml PUBLIC .)
target_compile_features(ggml PUBLIC c_std_11) # don't bump
Expand All @@ -278,6 +325,14 @@ if (BUILD_SHARED_LIBS)
target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
endif()

# GGML_CUDA_SOURCES is only set when LLAMA_CUBLAS is ON and the CUDA toolkit
# was found, so this block is skipped for non-CUDA builds.
if (GGML_CUDA_SOURCES)
message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
# NOTE(review): per the CMake docs a false CUDA_ARCHITECTURES value makes CMake
# pass no architecture flags for the target - confirm this is the intent.
set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF)
# NOTE(review): CUDA_SELECT_NVCC_ARCH_FLAGS does not look like a built-in CMake
# target property (FindCUDA exposes a function with a similar name) - verify
# that setting it here has any effect.
set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES OFF)
endif()


#
# programs, examples and tests
#
Expand All @@ -289,4 +344,5 @@ endif ()

if (LLAMA_BUILD_EXAMPLES)
add_subdirectory(examples)
add_subdirectory(pocs)
endif()
31 changes: 21 additions & 10 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Define the default target now so that it is always the first target.
# Make picks the first rule it parses as the default goal, and the optional
# sections further down (e.g. the ggml-cuda.o rule under LLAMA_CUBLAS) define
# rules of their own before the main build targets appear.
default: main quantize quantize-stats perplexity embedding vdot

ifndef UNAME_S
UNAME_S := $(shell uname -s)
endif
Expand Down Expand Up @@ -36,7 +39,7 @@ CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
LDFLAGS =

# warnings
CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wno-unused-function
CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar

# OS specific
Expand Down Expand Up @@ -97,6 +100,13 @@ ifdef LLAMA_OPENBLAS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
LDFLAGS += -lopenblas
endif
# Optional NVIDIA cuBLAS backend, enabled with `make LLAMA_CUBLAS=1`.
# Headers and libraries are taken from the /usr/local/cuda install prefix.
ifdef LLAMA_CUBLAS
# CUDA compiler driver; override with `make NVCC=/path/to/nvcc` when nvcc is
# not on PATH or a specific toolkit version is wanted.
NVCC    ?= nvcc
CFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64
# Extra object that the link rules pick up through $(OBJS).
OBJS    += ggml-cuda.o
# NOTE(review): -arch=native is expected to target the GPU(s) present on the
# build machine - confirm the installed nvcc version supports it.
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
	$(NVCC) -arch=native -c -o $@ $<
endif
ifdef LLAMA_GPROF
CFLAGS += -pg
CXXFLAGS += -pg
Expand Down Expand Up @@ -133,8 +143,6 @@ $(info I CC: $(CCV))
$(info I CXX: $(CXXV))
$(info )

default: main quantize quantize-stats perplexity embedding

#
# Build library
#
Expand All @@ -151,32 +159,35 @@ common.o: examples/common.cpp examples/common.h
# Remove every build product this Makefile creates: object files, the example
# binaries (including vdot), the benchmark binary and the shared library.
# Declared phony so a stray file named "clean" cannot mask the target.
.PHONY: clean
clean:
	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult vdot libllama.so

main: examples/main/main.cpp ggml.o llama.o common.o
main: examples/main/main.cpp ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
@echo
@echo '==== Run ./main -h for help. ===='
@echo

quantize: examples/quantize/quantize.cpp ggml.o llama.o
quantize: examples/quantize/quantize.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

libllama.so: llama.o ggml.o
libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

#
# Tests
#

benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o
benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o benchmark-q4_0-matmult $(LDFLAGS)
./benchmark-q4_0-matmult

Expand Down
Loading