You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Create Importance Matrix (for some quantizations below)
## https://medium.com/@ingridwickstevens/quantization-of-llms-with-llama-cpp-9bbf59deda35 (search for 'Importance Matrix')
## (ONLY use imatrix when absolutely needed, even though more quantization types might be able to use it; see ggml.c#L21908)
## Download the wikitext-2 raw training text next to the model and feed it to llama-imatrix via -f.
## Expects GG (model path prefix) and I (input type suffix of the .gguf file) to be set.
DATA_DIR="$(dirname -- "${GG}")"
## only extract when the download succeeded
wget https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip -O "${DATA_DIR}/wiki.train.raw.zip" \
  && unzip -p "${DATA_DIR}/wiki.train.raw.zip" wikitext-2-raw/wiki.train.raw > "${DATA_DIR}/wiki.train.raw"
## skip -ngl 300 if there is no GPU support, or reduce the layer count if there is not enough memory (on A100 <2h)
./build/bin/llama-imatrix -m "${GG}-${I}.gguf" -f "${DATA_DIR}/wiki.train.raw" \
  -o "${GG}-imatrix.dat" -t "$(nproc)" -ngl 300 \
  2>&1 | tee -a "${GG}.log"
### https://github.com/ggerganov/llama.cpp/blob/b3901/examples/perplexity/README.md#llama-3-8b-scoreboard (getting imatrix from there?!; llama-quantize logs show problems with downloaded imatrix -> try again to generate!?)
#wget https://huggingface.co/JohannesGaessler/llama.cpp_importance_matrices/resolve/main/imatrix-llama_3-8b-f16-10m_tokens.dat -O "${GG}-imatrix.dat"
## starting point: https://github.com/ggerganov/llama.cpp/blob/b3901/src/llama.cpp#L19800
## -> https://github.com/ggerganov/llama.cpp/blob/b3901/src/llama.cpp#L18681
## -> https://github.com/ggerganov/llama.cpp/blob/b3901/ggml/src/ggml.c#L21884
## ==> add code to: https://github.com/ggerganov/llama.cpp/blob/b3901/ggml/src/ggml.c#L21908
## (suggest overwriting quantize_q8_0 or ggml_fp32_to_fp16_row (line L21936) if that makes inference code easy (no use of Kompute!?))
##
## check the current state of the patch (all changes between b3901 and HEAD):
git diff b3901 HEAD
Patch inference ops (make sure it's thread-safe for the production version)
## overwrite traits https://github.com/ggerganov/llama.cpp/blob/b3901/ggml/src/ggml.c#L865
## -> eg https://github.com/ggerganov/llama.cpp/blob/b3901/ggml/src/ggml-quants.c#L5518 but make sure it's called
## ==> skip #if and go to naive impl https://github.com/ggerganov/llama.cpp/blob/b3901/ggml/src/ggml-quants.c#L5846
Testing functionality
## Smoke test: run the "${GG}-${O}.gguf" model once with a fixed prompt and seed.
## The options are collected in an array so each one sits on its own line.
LLAMA_CLI_ARGS=(
  --model "${GG}-${O}.gguf"
  --threads 1
  --repeat_penalty 1.0
  --prompt "Tell me a German joke. I've never heard one before."
  --predict 50
  --seed 1
  --verbose
)
./build/bin/llama-cli "${LLAMA_CLI_ARGS[@]}"
Build and quantize zfp versions (as long as dim/rate for zfp are static we need multiple builds)
## Build one llama.cpp tree per (ZFPDIM, ZFPRATE) pair and quantize the ${I} model with each.
## ZFPDIM/ZFPRATE are passed as compile-time defines, hence one build directory per pair.
GG="Meta-Llama-3-8B/Meta-Llama-3-8B"
I="F32"; O="ZFP"
## loop-invariant settings, hoisted out of the loops
OPT=""; CBT="Release"
NPROC="$(nproc)"
for ZFPDIM in 4 3 2 1; do
  for ZFPRATE in -1.0 16.0 12.0 8.0 7.0 6.0 5.0 4.0 3.0 2.0 1.0 0.5; do
    BUILD_DIR="build_${ZFPDIM}_${ZFPRATE}"
    OUT_GGUF="${GG}-${O}_${ZFPDIM}_${ZFPRATE}.gguf"
    ## start from scratch: drop any previous output file and build tree for this pair
    rm -f -- "${OUT_GGUF}"
    rm -rf -- "${BUILD_DIR}"
    ZFP="-DZFPDIM=${ZFPDIM} -DZFPRATE=${ZFPRATE} -I${PWD}/zfp/include/ -L${PWD}/zfp/build/lib64/ -Wl,-rpath=${PWD}/zfp/build/lib64/ -lzfp"
    ## do not try to quantize with a binary that was never built; record the failure in the log instead
    if ! {
      cmake -B "${BUILD_DIR}" -DCMAKE_BUILD_TYPE="${CBT}" -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON -DGGML_GPROF=OFF -DGGML_NATIVE=ON -DGGML_LTO=ON \
        -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLA_PREFER_PKGCONFIG=ON -DCMAKE_C_FLAGS="${OPT} ${ZFP}" -DCMAKE_CXX_FLAGS="${OPT} ${ZFP}" &&
      cmake --build "${BUILD_DIR}" --config "${CBT}" --parallel "${NPROC}"
    }; then
      printf 'build failed for ZFPDIM=%s ZFPRATE=%s, skipping quantization\n' "${ZFPDIM}" "${ZFPRATE}" \
        | tee -a "${GG}.zfp.log" >&2
      continue
    fi
    "./${BUILD_DIR}/bin/llama-quantize" "${GG}-${I}.gguf" "${OUT_GGUF}" "${O}" "${NPROC}" 2>&1 | tee -a "${GG}.zfp.log"
  done
done
Eval signal-to-noise or mean square error (zfp vs fp32/input)
## Print "<type> <file size in bytes>" for the .gguf file of every listed type.
GG="Meta-Llama-3-8B/Meta-Llama-3-8B"
QUANT_TYPES=(
  Q4_0 Q4_1 Q5_0 Q5_1 IQ2_M TQ1_0 TQ2_0 Q2_K
  IQ3_XXS IQ3_S IQ3_M Q3_K IQ3_XS Q3_K_S Q3_K_M Q3_K_L
  IQ4_NL IQ4_XS Q4_K Q4_K_S Q4_K_M Q5_K Q5_K_S Q5_K_M
  Q6_K Q8_0 Q4_0_4_4 Q4_0_4_8 Q4_0_8_8 F16 BF16
  IQ1_S IQ1_M IQ2_S IQ2_XXS IQ2_XS Q2_K_S F32
)
for O in "${QUANT_TYPES[@]}"; do
  ## a failing stat (e.g. missing file) must not stop the listing
  GGUF_BYTES="$(stat -c %s "${GG}-${O}.gguf")" || true
  ## the size is appended only when stat produced one; otherwise just the type is printed
  printf '%s\n' "${O}${GGUF_BYTES:+ ${GGUF_BYTES}}"
done
### this would be incorrect, the current code writes max block size not the actually required block size
#for ZFPDIM in 4 3 2 1; do
# for ZFPRATE in -1.0 16.0 12.0 8.0 7.0 6.0 5.0 4.0 3.0 2.0 1.0 0.5; do
# O="ZFP"; echo ${O}_${ZFPDIM}_${ZFPRATE} $(stat -c %s "${GG}-${O}_${ZFPDIM}_${ZFPRATE}.gguf")
# done
#done
Eval bits/weight
## ???
Eval ms/token
## 1) big prefill (1k+ words) + 1 output word
## 2) token split in 80% prefill + 20% prediction
## 3) small prefill/question + predicting hundreds of words
related work
## is SZx faster??? https://arxiv.org/pdf/2201.13020
## competing compressors https://sdrbench.github.io/