diff --git a/.devops/llama-cli-intel.Dockerfile b/.devops/llama-cli-intel.Dockerfile deleted file mode 100644 index 6789e17afcc..00000000000 --- a/.devops/llama-cli-intel.Dockerfile +++ /dev/null @@ -1,26 +0,0 @@ -ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04 - -FROM intel/oneapi-basekit:$ONEAPI_VERSION as build - -ARG LLAMA_SYCL_F16=OFF -RUN apt-get update && \ - apt-get install -y git - -WORKDIR /app - -COPY . . - -RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \ - echo "LLAMA_SYCL_F16 is set" && \ - export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \ - fi && \ - cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \ - cmake --build build --config Release --target llama-cli - -FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime - -COPY --from=build /app/build/bin/llama-cli /llama-cli - -ENV LC_ALL=C.utf8 - -ENTRYPOINT [ "/llama-cli" ] diff --git a/.devops/llama-server-intel.Dockerfile b/.devops/llama-server-intel.Dockerfile deleted file mode 100644 index cec43645233..00000000000 --- a/.devops/llama-server-intel.Dockerfile +++ /dev/null @@ -1,29 +0,0 @@ -ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04 - -FROM intel/oneapi-basekit:$ONEAPI_VERSION as build - -ARG LLAMA_SYCL_F16=OFF -RUN apt-get update && \ - apt-get install -y git libcurl4-openssl-dev - -WORKDIR /app - -COPY . . - -RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \ - echo "LLAMA_SYCL_F16 is set" && \ - export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \ - fi && \ - cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \ - cmake --build build --config Release --target llama-server - -FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime - -RUN apt-get update && \ - apt-get install -y libcurl4-openssl-dev - -COPY --from=build /app/build/bin/llama-server /llama-server - -ENV LC_ALL=C.utf8 - -ENTRYPOINT [ "/llama-server" ] diff --git a/.editorconfig b/.editorconfig index bd525e13f3e..f88f8da67cd 100644 --- a/.editorconfig +++ b/.editorconfig @@ -28,4 +28,5 @@ indent_size = 2 indent_style = tab [examples/cvector-generator/*.txt] +trim_trailing_whitespace = unset insert_final_newline = unset diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md deleted file mode 100644 index 997c6d9d053..00000000000 --- a/.github/pull_request_template.md +++ /dev/null @@ -1,7 +0,0 @@ - - -- [x] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md) -- Self-reported review complexity: - - [ ] Low - - [ ] Medium - - [ ] High diff --git a/.github/workflows/kcpp-build-release-win-cuda.yaml b/.github/workflows/kcpp-build-release-win-cuda.yaml index 9a336bc11b0..618c21e6328 100644 --- a/.github/workflows/kcpp-build-release-win-cuda.yaml +++ b/.github/workflows/kcpp-build-release-win-cuda.yaml @@ -14,11 +14,11 @@ jobs: with: ref: ${{ github.head_ref || github.ref_name }} - - uses: Jimver/cuda-toolkit@v0.2.11 + - uses: Jimver/cuda-toolkit@v0.2.15 id: cuda-toolkit with: cuda: '11.4.4' - + - name: Build id: cmake_build run: | diff --git a/.github/workflows/kcpp-build-release-win-cuda12.yaml b/.github/workflows/kcpp-build-release-win-cuda12.yaml index d20973b69b5..ebf7ae402b2 100644 --- a/.github/workflows/kcpp-build-release-win-cuda12.yaml +++ b/.github/workflows/kcpp-build-release-win-cuda12.yaml @@ -14,11 +14,11 @@ jobs: with: ref: ${{ github.head_ref || github.ref_name }} - - uses: Jimver/cuda-toolkit@v0.2.11 + - uses: Jimver/cuda-toolkit@v0.2.15 id: cuda-toolkit with: cuda: '12.1.0' - + - name: Build id: cmake_build run: | diff --git a/.github/workflows/kcpp-build-release-win-full-cu12.yaml b/.github/workflows/kcpp-build-release-win-full-cu12.yaml index 2244cb62cc9..e58684b6c6c 100644 --- a/.github/workflows/kcpp-build-release-win-full-cu12.yaml +++ b/.github/workflows/kcpp-build-release-win-full-cu12.yaml @@ -38,7 +38,7 @@ jobs: run: | make -j ${env:NUMBER_OF_PROCESSORS} - - uses: Jimver/cuda-toolkit@v0.2.11 + - uses: Jimver/cuda-toolkit@v0.2.15 id: cuda-toolkit with: cuda: '12.1.0' diff --git a/.github/workflows/kcpp-build-release-win-full.yaml b/.github/workflows/kcpp-build-release-win-full.yaml index 284f0309b00..47f80e368a1 100644 --- a/.github/workflows/kcpp-build-release-win-full.yaml +++ b/.github/workflows/kcpp-build-release-win-full.yaml @@ -38,7 +38,7 @@ jobs: run: | make -j ${env:NUMBER_OF_PROCESSORS} - - uses: Jimver/cuda-toolkit@v0.2.11 + - uses: Jimver/cuda-toolkit@v0.2.15 id: cuda-toolkit with: cuda: '11.4.4' diff --git a/.github/workflows/kcpp-build-release-win-noavx2-full.yaml b/.github/workflows/kcpp-build-release-win-oldcpu-full.yaml similarity index 94% rename from .github/workflows/kcpp-build-release-win-noavx2-full.yaml rename to .github/workflows/kcpp-build-release-win-oldcpu-full.yaml index 9c8dba98819..c58d7214cce 100644 --- a/.github/workflows/kcpp-build-release-win-noavx2-full.yaml +++ b/.github/workflows/kcpp-build-release-win-oldcpu-full.yaml @@ -1,4 +1,4 @@ -name: Koboldcpp Windows Full Binaries +name: Koboldcpp Windows Full OldCPU Binaries on: workflow_dispatch env: @@ -38,7 +38,7 @@ jobs: run: | make -j ${env:NUMBER_OF_PROCESSORS} - - uses: Jimver/cuda-toolkit@v0.2.11 + - uses: Jimver/cuda-toolkit@v0.2.15 id: cuda-toolkit with: cuda: '11.4.4' @@ -68,8 +68,7 @@ jobs: - name: Package PyInstallers id: make_pyinstaller run: | - ./make_pyinstaller.bat - ./make_pyinstaller_cuda.bat + ./make_pyinstaller_cuda_oldcpu.bat - name: Save artifact uses: actions/upload-artifact@v3 diff --git a/.gitignore b/.gitignore index 9cdac3b26a4..c580a8f6c83 100644 --- a/.gitignore +++ b/.gitignore @@ -125,6 +125,8 @@ tests/test-tokenizer-1-bpe /koboldcpp_vulkan.dll /cublas64_11.dll /cublasLt64_11.dll +/cublas64_12.dll +/cublasLt64_12.dll /rocblas/ rocblas.dll hipblas.dll diff --git a/AUTHORS b/AUTHORS deleted file mode 100644 index b029f13da3b..00000000000 --- a/AUTHORS +++ /dev/null @@ -1,655 +0,0 @@ -# date: Tue Apr 9 09:17:14 EEST 2024 -# this file is auto-generated by scripts/gen-authors.sh - -0cc4m -0xspringtime <110655352+0xspringtime@users.noreply.github.com> -2f38b454 -3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com> -44670 <44670@users.noreply.github.com> -AN Long -AT -Aarni Koskela -Aaron Miller -Aaryaman Vasishta -Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> -Abhishek Gopinath K <31348521+overtunned@users.noreply.github.com> -Adithya Balaji -AdithyanI -Adrian -Adrian Hesketh -AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> -Aisuko -Alberto <57916483+albbus-stack@users.noreply.github.com> -Alex -Alex Azarov -Alex Azarov -Alex Klinkhamer -Alex Klinkhamer -Alex Nguyen -Alex Petenchea -Alex Renda -Alex von Gluck IV -Alexey Parfenov -Ali Chraghi <63465728+alichraghi@users.noreply.github.com> -Ali Nehzat -Ali Tariq -Alon -AlpinDale <52078762+AlpinDale@users.noreply.github.com> -AmirAli Mirian <37371367+amiralimi@users.noreply.github.com> -Ananta Bastola -Anas Ahouzi <112881240+aahouzi@users.noreply.github.com> -András Salamon -Andrei -Andrew Canis -Andrew Duffy -Andrew Godfrey -Arik Poznanski -Artem -Artyom Lebedev -Asbjørn Olling -Ásgeir Bjarni Ingvarsson -Ashok Gelal <401055+ashokgelal@users.noreply.github.com> -Ashraful Islam -Atsushi Tatsuma -Austin <77757836+teleprint-me@users.noreply.github.com> -AustinMroz -BADR -Bach Le -Bailey Chittle <39804642+bachittle@users.noreply.github.com> -BarfingLemurs <128182951+BarfingLemurs@users.noreply.github.com> -Behnam M <58621210+ibehnam@users.noreply.github.com> -Ben Garney -Ben Siraphob -Ben Williams -Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com> -Bernat Vadell -Bodo Graumann -Bono Lv -Borislav Stanimirov -Branden Butler -Brian -Bruce MacDonald -CJ Pais -CRD716 -Cameron -Cameron Kaiser -Casey Primozic -Casey Primozic -CausalLM <148736309+CausalLM@users.noreply.github.com> -Cebtenzzre -Chad Brewbaker -Cheng Shao -Chris Kuehl -Christian Demsar -Christian Demsar -Christian Falch <875252+chrfalch@users.noreply.github.com> -Christian Kögler -Clark Saben <76020733+csaben@users.noreply.github.com> -Clint Herron -Cuong Trinh Manh -DAN™ -Damian Stewart -Dane Madsen -DaniAndTheWeb <57776841+DaniAndTheWeb@users.noreply.github.com> -Daniel Bevenius -Daniel Drake -Daniel Hiltgen -Daniel Illescas Romero -DannyDaemonic -Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com> -Dave Della Costa -David Friehs -David Kennedy -David Pflug -David Renshaw -David Sommers <12738+databyte@users.noreply.github.com> -David Yang -Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com> -Dean -Deins -Didzis Gosko -Don Mahurin -DooWoong Lee (David) -Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com> -Douglas Hanley -Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com> -Ebey Abraham -Ed Lee -Ed Lepedus -Edward Taylor -Elbios <141279586+Elbios@users.noreply.github.com> -Engininja2 <139037756+Engininja2@users.noreply.github.com> -Equim -Eric Sommerlade -Eric Zhang <34133756+EZForever@users.noreply.github.com> -Erik Garrison -Erik Scholz -Ettore Di Giacinto -Evan Jones -Evan Miller -Eve <139727413+netrunnereve@users.noreply.github.com> -Evgeny Kurnevsky -Ewout ter Hoeven -ExtReMLapin <3909752+ExtReMLapin@users.noreply.github.com> -FK -Fabian -Fabio R. Sluzala -Faez Shakil -FantasyGmm <16450052+FantasyGmm@users.noreply.github.com> -Fattire <528174+fat-tire@users.noreply.github.com> -Felix -Finn Voorhees -Firat -Folko-Ven <71110216+Folko-Ven@users.noreply.github.com> -Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com> -Francisco Melo <43780565+francis2tm@users.noreply.github.com> -FrankHB -Frederik Vogel -Gabe Goodhart -GainLee -Galunid -Gary Linscott -Gary Mulder -Genkagaku.GPT -Georgi Gerganov -Gilad S -GiviMAD -Govlzkoy -Guillaume "Vermeille" Sanchez -Guillaume Wenzek -Guoteng <32697156+SolenoidWGT@users.noreply.github.com> -Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com> -Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com> -Haohui Mai -Haoxiang Fei -Harald Fernengel -Hatsune Miku <129688334+at8u@users.noreply.github.com> -Henk Poley -Henri Vasserman -Henrik Forstén -Herman Semenov -Hesen Peng -Hoang Nguyen -Hongyu Ouyang <96765450+casavaca@users.noreply.github.com> -Howard Su -Hua Jiang -Huawei Lin -Ian Bull -Ian Bull -Ian Scrivener -Ido S -IgnacioFDM -Igor Okulist -Ikko Eltociear Ashimine -Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com> -Ionoclast Laboratories -Isaac McFadyen -IsaacDynamo <61521674+IsaacDynamo@users.noreply.github.com> -Ivan Komarov -Ivan Stepanov -JH23X <165871467+JH23X@users.noreply.github.com> -Jack Mousseau -JackJollimore <130917767+JackJollimore@users.noreply.github.com> -Jag Chadha -Jakub N -James Reynolds -Jan Boon -Jan Boon -Jan Ploski -Jannis Schönleber -Jared Van Bortel -Jared Van Bortel -Jason McCartney -Jean-Christophe Hoelt -Jean-Michaël Celerier -Jed Fox -Jeffrey Quesnelle -Jesse Jojo Johnson -Jhen-Jie Hong -Jiahao Li -Jian Liao -JidongZhang-THU <1119708529@qq.com> -Jinwoo Jeong <33892306+williamjeong2@users.noreply.github.com> -Jiří Podivín <66251151+jpodivin@users.noreply.github.com> -Johannes Gäßler -Johannes Rudolph -John <78893154+cmp-nct@users.noreply.github.com> -John Balis -John Smith <67539080+kingsidelee@users.noreply.github.com> -JohnnyB -Jonas Wunderlich <32615971+jonas-w@users.noreply.github.com> -Jorge A <161275481+jorgealias@users.noreply.github.com> -Jose Maldonado <63384398+yukiteruamano@users.noreply.github.com> -Joseph Stahl <1269177+josephst@users.noreply.github.com> -Joyce -Juan Calderon-Perez <835733+gaby@users.noreply.github.com> -Judd -Julius Arkenberg -Jun Jie <71215065+junnjiee16@users.noreply.github.com> -Juraj Bednar -Justin Parker -Justin Suess -Justine Tunney -Juuso Alasuutari -KASR -Kamil Tomšík -Karsten Weiss -Karthick -Karthik Kumar Viswanathan <195178+guilt@users.noreply.github.com> -Karthik Sethuraman -Kasumi <90275229+kasumi-1@users.noreply.github.com> -Kawrakow <48489457+ikawrakow@users.noreply.github.com> -Keiichi Tabata -Kenvix ⭐ -Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> -Kevin Ji <1146876+kevinji@users.noreply.github.com> -Kevin Kwok -Kevin Lo -Kolen Cheung -Konstantin Herud -Konstantin Zhuravlyov -Kunshang Ji -Kyle Liang -Kyle Mistele -Kylin <56434533+KyL0N@users.noreply.github.com> -Lars Grammel -Laura -Lee <44310445+lx200916@users.noreply.github.com> -Lee Drake -Leng Yue -LeonEricsson <70749762+LeonEricsson@users.noreply.github.com> -Leonardo Neumann -Li Tan -Linwei Wang -LoganDark -LostRuins <39025047+LostRuins@users.noreply.github.com> -Luciano -Luo Tian -M. Yusuf Sarıgöz -Maarten ter Huurne -Mack Straight -Maël Kerbiriou -MaggotHATE -Marc Köhlbrugge -Marco Matthies <71844+marcom@users.noreply.github.com> -Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com> -Marian Cepok -Mark Fairbairn -Marko Tasic -Martin Krasser -Martin Schwaighofer -Marvin Gießing -Mateusz Charytoniuk -Matheus C. França -Matheus Gabriel Alves Silva -Mathieu Nayrolles -Mathijs de Bruin -Matt Clayton <156335168+mattjcly@users.noreply.github.com> -Matt Pulver -Matteo Boschini <12133566+mbosc@users.noreply.github.com> -Matthew Tejo -Matvey Soloviev -Maxime <672982+maximegmd@users.noreply.github.com> -Maximilian Winter -Meng Zhang -Meng, Hengyu -Merrick Christensen -Michael Coppola -Michael Hueschen -Michael Kesper -Michael Klimenko -Michael Podvitskiy -Michael Potter -Michaël de Vries -Mihai -Mike -Minsoo Cheong <54794500+mscheong01@users.noreply.github.com> -Mirko185 -Mirror Azure <54669636+MirrorAzure@users.noreply.github.com> -Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com> -Mohammadreza Hendiani -Murilo Santana -Musab Gultekin -Nam D. Tran <42194884+namtranase@users.noreply.github.com> -NawafAlansari <72708095+NawafAlansari@users.noreply.github.com> -Nebula -Neo Zhang Jianyu -Neuman Vong -Nexesenex <124105151+Nexesenex@users.noreply.github.com> -Niall Coates <1349685+Niall-@users.noreply.github.com> -Nicolai Weitkemper -Nigel Bosch -Niklas Korz -Nindaleth -Oleksandr Nikitin -Oleksii Maryshchenko -Olivier Chafik -Ondřej Čertík -Ouadie EL FAROUKI -Paul Tsochantaris -Pavol Rusnak -Pedro Cuenca -Peter Sugihara -Phil H <5756783+phiharri@users.noreply.github.com> -Philip Taron -Phillip Kravtsov -Pierre Alexandre SCHEMBRI -Pierrick Hymbert -Przemysław Pawełczyk -Qin Yue Chen <71813199+chenqiny@users.noreply.github.com> -Qingyou Meng -Qu Zongfu <43257352+yancaoweidaode@users.noreply.github.com> -RJ Adriaansen -Radoslav Gerganov -Radosław Gryta -Rahul Vivek Nair <68507071+RahulVivekNair@users.noreply.github.com> -Rand Xie -Randall Fitzgerald -Reinforce-II -Riceball LEE -Richard Kiss -Richard Roberson -Rick G <26732651+TheFlipbook@users.noreply.github.com> -Rickard Edén -Rickard Hallerbäck -Rickey Bowers Jr -Riley Stewart -Rinne -Rinne -Robert Brisita <986796+rbrisita@users.noreply.github.com> -Robert Sung-wook Shin -Robey Holderith -Robyn -Roger Meier -Roland <14355895+rbur0425@users.noreply.github.com> -Romain D <90720+Artefact2@users.noreply.github.com> -Romain Neutron -Roman Parykin -Ron Evans -Ron Jailall -Ronny Brendel -Ronsor -Rowan Hart -Rune <43761327+Rune-AI@users.noreply.github.com> -Ryan Landay -Ryder Wishart -Rőczey Barnabás <31726601+An0nie@users.noreply.github.com> -SakuraUmi -Salvador E. Tropea -Sam Spilsbury -Sami Farin <3876865+Safari77@users.noreply.github.com> -Samuel Maynard -Sang-Kil Park -Seb C <47074056+Sebby37@users.noreply.github.com> -Sebastián A -SebastianApel <13675545+SebastianApel@users.noreply.github.com> -Senemu <10880819+Senemu@users.noreply.github.com> -Sergey Alirzaev -Sergio López -SeungWon Jeong <65549245+redlion0929@users.noreply.github.com> -ShadovvBeast -Shakhar Dasgupta -Shangning Xu <32517059+xushangning@users.noreply.github.com> -Shijie <821898965@qq.com> -Shintarou Okada -Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com> -Shouzheng Liu -Sigbjørn Skjæret -Simon Willison -Siwen Yu -Sky Yan -Slaren <2141330+slaren@users.noreply.github.com> -Slava Primenko -SoftwareRenderer <138734813+SoftwareRenderer@users.noreply.github.com> -Someone -Someone Serge -Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> -Spencer Sutton -Srinivas Billa -Stefan Sydow -Stephan Walter -Stephen Nichols -Steve Grubb -Steven Roussey -Steward Garcia <57494570+FSSRepo@users.noreply.github.com> -Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com> -SuperUserNameMan -Tai Duc Nguyen -Taikono-Himazin -Tameem <113388789+AhmadTameem@users.noreply.github.com> -Tamotsu Takahashi -Thái Hoàng Tâm <75922889+RoyalHeart@users.noreply.github.com> -Thatcher Chamberlin -Theia Vogel -Thérence <13496987+Royalphax@users.noreply.github.com> -Thibault Terrasson -Thomas Klausner -Tim Miller -Timmy Knight -Timothy Cronin <40186632+4imothy@users.noreply.github.com> -Ting Lou -Ting Sun -Tobias Lütke -Tom C -Tom Jobbins <784313+TheBloke@users.noreply.github.com> -Tomas -Tomáš Pazdiora -Tristan Ross -Tungsten842 <886724vf@anonaddy.me> -Tungsten842 -Tushar -UEXTM.com <84163508+uextm@users.noreply.github.com> -Uzo Nweke -Vaibhav Srivastav -Val Kharitonov -Valentin Konovalov -Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com> -Victor Z. Peng -Vlad -Vladimir -Vladimir Malyutin -Vladimir Zorin -Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com> -WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com> -Weird Constructor -Welby Seely -Wentai Zhang -WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com> -Willy Tarreau -Wu Jian Ping -Wu Jian Ping -Xiake Sun -Xiang (Kevin) Li -Xiao-Yong Jin -XiaotaoChen -Xiaoyi Chen -Xingchen Song(宋星辰) -Xuan Son Nguyen -Yann Follet <131855179+YannFollet@users.noreply.github.com> -Yiming Cui -Yishuo Wang -Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com> -Yui -Yusuf Kağan Hanoğlu -Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com> -ZHAOKAI WANG -Zane Shannon -Zay <95888118+isaiahbjork@users.noreply.github.com> -Zenix -Zhang Peiyuan -ZhouYuChen -Ziad Ben Hadj-Alouane -Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com> -Zsapi -a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com> -adel boussaken -afrideva <95653597+afrideva@users.noreply.github.com> -akawrykow <142945436+akawrykow@users.noreply.github.com> -alexpinel <93524949+alexpinel@users.noreply.github.com> -alonfaraj -andrijdavid -anon998 <131767832+anon998@users.noreply.github.com> -anzz1 -apaz -apcameron <37645737+apcameron@users.noreply.github.com> -arcrank -arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com> -at8u <129688334+at8u@users.noreply.github.com> -automaticcat -bandoti <141645996+bandoti@users.noreply.github.com> -beiller -bhubbb <79117352+bhubbb@users.noreply.github.com> -bmwl -bobqianic <129547291+bobqianic@users.noreply.github.com> -bryanSwk <93190252+bryanSwk@users.noreply.github.com> -bsilvereagle -bssrdf -byte-6174 <88070277+byte-6174@users.noreply.github.com> -cebtenzzre -chaihahaha -chiranko <96988916+chiranko@users.noreply.github.com> -clibdev <52199778+clibdev@users.noreply.github.com> -clyang -cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com> -coezbek -comex -compilade <113953597+compilade@users.noreply.github.com> -crasm -crasm -daboe01 -david raistrick -ddpasa <112642920+ddpasa@users.noreply.github.com> -deepdiffuser <112834445+deepdiffuser@users.noreply.github.com> -divinity76 -dotpy314 <33351922+dotpy314@users.noreply.github.com> -drbh -ds5t5 <145942675+ds5t5@users.noreply.github.com> -dylan -eastriver -ebraminio -eiery <19350831+eiery@users.noreply.github.com> -eric8607242 -fraxy-v <65565042+fraxy-v@users.noreply.github.com> -github-actions[bot] -gliptic -goerch -grahameth <96447521+grahameth@users.noreply.github.com> -gwjr <502526+gwjr@users.noreply.github.com> -h-h-h-h <13482553+h-h-h-h@users.noreply.github.com> -hankcs -hoangmit -hongbo.mo <352280764@qq.com> -howlger -howlger -hutli <6594598+hutli@users.noreply.github.com> -hutli -hutli -hxer7963 -hydai -iSma -iacore <74560659+iacore@users.noreply.github.com> -igarnier -iohub -jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com> -jameswu2014 <545426914@qq.com> -jneem -johnson442 <56517414+johnson442@users.noreply.github.com> -jon-chuang <9093549+jon-chuang@users.noreply.github.com> -jp-x-g -jwj7140 <32943891+jwj7140@users.noreply.github.com> -kaizau -kalomaze <66376113+kalomaze@users.noreply.github.com> -kang -katsu560 <118887472+katsu560@users.noreply.github.com> -kchro3 <62481661+kchro3@users.noreply.github.com> -khimaros -kiltyj -klosax <131523366+klosax@users.noreply.github.com> -kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> -kunnis -kuronekosaiko -kuvaus <22169537+kuvaus@users.noreply.github.com> -kwin1412 <42286931+kwin1412@users.noreply.github.com> -l3utterfly -ldwang -le.chang -leejet -limitedAtonement -lon <114724657+longregen@users.noreply.github.com> -m3ndax -maddes8cht <55592906+maddes8cht@users.noreply.github.com> -makomk -manikbhandari -mdrokz -mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com> -minarchist -mj-shifu <77107165+mj-shifu@users.noreply.github.com> -mmyjona -momonga <115213907+mmnga@users.noreply.github.com> -moritzbrantner <31051084+moritzbrantner@users.noreply.github.com> -mzcu -nanahi <130121847+na-na-hi@users.noreply.github.com> -ngc92 <7938269+ngc92@users.noreply.github.com> -nhamanasu <45545786+nhamanasu@users.noreply.github.com> -niansa/tuxifan -niansa/tuxifan -ningshanwutuobang -nold -nopperl <54780682+nopperl@users.noreply.github.com> -nusu-github <29514220+nusu-github@users.noreply.github.com> -olexiyb -oobabooga <112222186+oobabooga@users.noreply.github.com> -opparco -ostix360 <55257054+ostix360@users.noreply.github.com> -perserk -postmasters -pudepiedj -qingfengfenga <41416092+qingfengfenga@users.noreply.github.com> -qouoq -qunash -rabidcopy -rankaiyx -rhjdvsgsgks <26178113+rhjdvsgsgks@users.noreply.github.com> -rhuddleston -rimoliga <53384203+rimoliga@users.noreply.github.com> -runfuture -sandyiscool -semidark -sharpHL <132747147+sharpHL@users.noreply.github.com> -shibe2 -singularity <12184989+singularity-s0@users.noreply.github.com> -sjinzh -slaren <2141330+slaren@users.noreply.github.com> -slaren -snadampal <87143774+snadampal@users.noreply.github.com> -staviq -stduhpf -swittk -takov751 <40316768+takov751@users.noreply.github.com> -tarcey -texmex76 <40733439+texmex76@users.noreply.github.com> -thement <40525767+thement@users.noreply.github.com> -tjohnman -tslmy -ubik2 -uint256_t -uint256_t -unbounded -valiray <133289098+valiray@users.noreply.github.com> -vodkaslime <646329483@qq.com> -vvhg1 <94630311+vvhg1@users.noreply.github.com> -vxiiduu <73044267+vxiiduu@users.noreply.github.com> -wbpxre150 <100937007+wbpxre150@users.noreply.github.com> -whoreson <139810751+whoreson@users.noreply.github.com> -wonjun Jang -wzy <32936898+Freed-Wu@users.noreply.github.com> -xaedes -xaedes -xloem <0xloem@gmail.com> -yangli2 -yuiseki -zakkor -zhouwg <6889919+zhouwg@users.noreply.github.com> -zrm -源文雨 <41315874+fumiama@users.noreply.github.com> -Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com> diff --git a/CMakeLists.txt b/CMakeLists.txt index 08fca8df0f7..98123d4969e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,6 +26,7 @@ set(LLAMA_GPROF OFF) set(LLAMA_SANITIZE_THREAD OFF) set(LLAMA_SANITIZE_ADDRESS OFF) set(LLAMA_SANITIZE_UNDEFINED OFF) +set(LLAMA_SCHED_MAX_COPIES "2" CACHE STRING "llama: max input copies for pipeline parallelism") # instruction set specific option(LLAMA_AVX "llama: enable AVX" ON) @@ -41,7 +42,6 @@ endif() # 3rd party libs option(LLAMA_CUBLAS "llama: use CUDA" ON) -set(LLAMA_CUDA_MMQ_Y "64" CACHE STRING "llama: y tile size for mmq CUDA kernels") set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels") set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels") @@ -49,8 +49,11 @@ option(LLAMA_CUDA_F16 "llama: use 16 bit floats for dmmv set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K") set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING "llama: max. batch size for using peer access") +set(GGML_CUDA_USE_GRAPHS OFF) option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF) +# Other +option(LLAMA_OPENMP "llama: use OpenMP" OFF) # # Compile flags @@ -64,17 +67,22 @@ set(THREADS_PREFER_PTHREAD_FLAG ON) find_package(Threads REQUIRED) add_compile_definitions(LOG_DISABLE_LOGS) +add_compile_definitions(GGML_SCHED_MAX_COPIES=${LLAMA_SCHED_MAX_COPIES}) -file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu") -list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu") -file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu") +file(GLOB GGML_SOURCES_CUDA "ggml/src/ggml-cuda/*.cu") +list(APPEND GGML_SOURCES_CUDA "ggml/src/ggml-cuda.cu") +file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-wmma*.cu") list(APPEND GGML_SOURCES_CUDA ${SRCS}) -file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu") +file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/mmq*.cu") list(APPEND GGML_SOURCES_CUDA ${SRCS}) set(GGML_V3_CUDA_SOURCES otherarch/ggml_v3-cuda.cu otherarch/ggml_v3-cuda.h) set(GGML_V2_CUDA_SOURCES otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h) set(GGML_V2_LEGACY_CUDA_SOURCES otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h) +if (GGML_CUDA_USE_GRAPHS) + add_compile_definitions(GGML_CUDA_USE_GRAPHS) +endif() + if (LLAMA_CUBLAS) cmake_minimum_required(VERSION 3.17) @@ -87,7 +95,6 @@ if (LLAMA_CUBLAS) add_compile_definitions(GGML_USE_LLAMAFILE) add_compile_definitions(GGML_USE_CUDA) add_compile_definitions(SD_USE_CUBLAS) - add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y}) add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y}) @@ -99,11 +106,11 @@ if (LLAMA_CUBLAS) add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE}) # only build minimal quants required for fattn quant kv - file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu") + file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu") list(APPEND GGML_SOURCES_CUDA ${SRCS}) - file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu") + file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu") list(APPEND GGML_SOURCES_CUDA ${SRCS}) - file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu") + file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu") list(APPEND GGML_SOURCES_CUDA ${SRCS}) if (LLAMA_STATIC) @@ -124,14 +131,15 @@ if (LLAMA_CUBLAS) # 60 == f16 CUDA intrinsics # 61 == integer CUDA intrinsics # 70 == (assumed) compute capability at which unrolling a loop in mul_mat_q kernels is faster + # 75 == int8 tensor cores if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16) - set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics + set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75") # needed for f16 CUDA intrinsics else() message("CUDA Toolkit Version: ${CUDAToolkit_VERSION}") if(CUDAToolkit_VERSION VERSION_GREATER 12) - set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics + set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75") # lowest CUDA 12 standard + lowest for integer intrinsics else() - set(CMAKE_CUDA_ARCHITECTURES "37;52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics + set(CMAKE_CUDA_ARCHITECTURES "37;52;61;70;75") # lowest CUDA 12 standard + lowest for integer intrinsics endif() endif() endif() @@ -163,23 +171,23 @@ if (LLAMA_HIPBLAS) if (${hipblas_FOUND} AND ${hip_FOUND}) message(STATUS "HIP and hipBLAS found") - file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu") - list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu") - file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu") + file(GLOB GGML_SOURCES_ROCM "ggml/src/ggml-cuda/*.cu") + list(APPEND GGML_SOURCES_ROCM "ggml/src/ggml-cuda.cu") + file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-wmma*.cu") list(APPEND GGML_SOURCES_ROCM ${SRCS}) - file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu") + file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/mmq*.cu") list(APPEND GGML_SOURCES_ROCM ${SRCS}) add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA SD_USE_CUBLAS) - add_library(ggml-rocm OBJECT ${GGML_SOURCES_CUDA}) + add_library(ggml-rocm ${GGML_SOURCES_CUDA}) if (LLAMA_CUDA_FORCE_DMMV) target_compile_definitions(ggml-rocm PUBLIC GGML_CUDA_FORCE_DMMV) endif() - file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu") + file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu") list(APPEND GGML_SOURCES_ROCM ${SRCS}) - file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu") + file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu") list(APPEND GGML_SOURCES_ROCM ${SRCS}) - file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu") + file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu") list(APPEND GGML_SOURCES_ROCM ${SRCS}) # only build minimal quants required for fattn quant kv @@ -189,7 +197,7 @@ if (LLAMA_HIPBLAS) set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX) target_link_libraries(ggml-rocm PUBLIC hip::device hip::host roc::rocblas roc::hipblas) - add_library(ggml-v2-rocm OBJECT ${GGML_V2_CUDA_SOURCES}) + add_library(ggml-v2-rocm ${GGML_V2_CUDA_SOURCES}) if (LLAMA_CUDA_FORCE_DMMV) target_compile_definitions(ggml-v2-rocm PUBLIC GGML_CUDA_FORCE_DMMV) endif() @@ -199,7 +207,7 @@ if (LLAMA_HIPBLAS) set_source_files_properties(otherarch/ggml_v2-cuda.cu PROPERTIES LANGUAGE CXX) target_link_libraries(ggml-v2-rocm PUBLIC hip::device hip::host roc::rocblas roc::hipblas) - add_library(ggml-v3-rocm OBJECT ${GGML_V3_CUDA_SOURCES}) + add_library(ggml-v3-rocm ${GGML_V3_CUDA_SOURCES}) if (LLAMA_CUDA_FORCE_DMMV) target_compile_definitions(ggml-v3-rocm PUBLIC GGML_CUDA_FORCE_DMMV) endif() @@ -209,7 +217,7 @@ if (LLAMA_HIPBLAS) set_source_files_properties(otherarch/ggml_v3-cuda.cu PROPERTIES LANGUAGE CXX) target_link_libraries(ggml-v3-rocm PUBLIC hip::device hip::host roc::rocblas roc::hipblas) - add_library(ggml-v2-legacy-rocm OBJECT ${GGML_V2_LEGACY_CUDA_SOURCES}) + add_library(ggml-v2-legacy-rocm ${GGML_V2_LEGACY_CUDA_SOURCES}) if (LLAMA_CUDA_FORCE_DMMV) target_compile_definitions(ggml-v2-legacy-rocm PUBLIC GGML_CUDA_FORCE_DMMV) endif() @@ -286,6 +294,17 @@ if (LLAMA_LTO) endif() endif() +if (LLAMA_OPENMP) + find_package(OpenMP) + if (OpenMP_FOUND) + message(STATUS "OpenMP found") + add_compile_definitions(GGML_USE_OPENMP) + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX) + else() + message(WARNING "OpenMP not found") + endif() +endif() + # this version of Apple ld64 is buggy execute_process( COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v @@ -402,46 +421,46 @@ endif() # Build libraries # -add_library(ggml OBJECT - ggml.c - ggml.h - ggml-alloc.c - ggml-alloc.h - ggml-backend.c - ggml-backend.h - ggml-quants.c - ggml-quants.h - sgemm.cpp - sgemm.h +add_library(ggml + ggml/src/ggml.c + ggml/include/ggml.h + ggml/src/ggml-alloc.c + ggml/include/ggml-alloc.h + ggml/src/ggml-backend.c + ggml/include/ggml-backend.h + ggml/src/ggml-quants.c + ggml/src/ggml-quants.h + ggml/src/sgemm.cpp + ggml/src/sgemm.h ${GGML_SOURCES_CUDA}) -target_include_directories(ggml PUBLIC . ./otherarch ./otherarch/tools) +target_include_directories(ggml PUBLIC . ./ggml/include ./ggml/src ./include ./otherarch ./otherarch/tools) target_compile_features(ggml PUBLIC c_std_11) # don't bump target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS}) set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON) -add_library(ggml_v1 OBJECT +add_library(ggml_v1 otherarch/ggml_v1.c otherarch/ggml_v1.h) -target_include_directories(ggml_v1 PUBLIC . ./otherarch ./otherarch/tools) +target_include_directories(ggml_v1 PUBLIC . ./ggml/include ./ggml/src ./include ./otherarch ./otherarch/tools) target_compile_features(ggml_v1 PUBLIC c_std_11) # don't bump target_link_libraries(ggml_v1 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS}) set_target_properties(ggml_v1 PROPERTIES POSITION_INDEPENDENT_CODE ON) -add_library(ggml_v2 OBJECT +add_library(ggml_v2 otherarch/ggml_v2.c otherarch/ggml_v2.h ${GGML_V2_CUDA_SOURCES} ${GGML_V2_LEGACY_CUDA_SOURCES}) -target_include_directories(ggml_v2 PUBLIC . ./otherarch ./otherarch/tools) +target_include_directories(ggml_v2 PUBLIC . ./ggml/include ./ggml/src ./include ./otherarch ./otherarch/tools) target_compile_features(ggml_v2 PUBLIC c_std_11) # don't bump target_link_libraries(ggml_v2 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS}) set_target_properties(ggml_v2 PROPERTIES POSITION_INDEPENDENT_CODE ON) -add_library(ggml_v3 OBJECT +add_library(ggml_v3 otherarch/ggml_v3.c otherarch/ggml_v3.h ${GGML_V3_CUDA_SOURCES}) -target_include_directories(ggml_v3 PUBLIC . ./otherarch ./otherarch/tools) +target_include_directories(ggml_v3 PUBLIC . ./ggml/include ./ggml/src ./include ./otherarch ./otherarch/tools) target_compile_features(ggml_v3 PUBLIC c_std_11) # don't bump target_link_libraries(ggml_v3 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS}) set_target_properties(ggml_v3 PROPERTIES POSITION_INDEPENDENT_CODE ON) @@ -457,31 +476,31 @@ add_library(common2 examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h - unicode.h - unicode.cpp - unicode-data.cpp) -target_include_directories(common2 PUBLIC . ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common) + src/unicode.h + src/unicode.cpp + src/unicode-data.cpp) +target_include_directories(common2 PUBLIC . ./ggml/include ./ggml/src ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common) target_compile_features(common2 PUBLIC cxx_std_11) # don't bump target_link_libraries(common2 PRIVATE ggml ${LLAMA_EXTRA_LIBS}) set_target_properties(common2 PROPERTIES POSITION_INDEPENDENT_CODE ON) add_library(sdtype_adapter otherarch/sdcpp/sdtype_adapter.cpp) -target_include_directories(sdtype_adapter PUBLIC . ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common) +target_include_directories(sdtype_adapter PUBLIC . ./ggml/include ./ggml/src ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common) target_compile_features(sdtype_adapter PUBLIC cxx_std_11) # don't bump target_link_libraries(sdtype_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS}) set_target_properties(sdtype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON) add_library(whisper_adapter otherarch/whispercpp/whisper_adapter.cpp) -target_include_directories(whisper_adapter PUBLIC . ./otherarch ./otherarch/tools ./otherarch/whispercpp ./examples ./common) +target_include_directories(whisper_adapter PUBLIC . ./ggml/include ./ggml/src ./include ./otherarch ./otherarch/tools ./otherarch/whispercpp ./examples ./common) target_compile_features(whisper_adapter PUBLIC cxx_std_11) # don't bump target_link_libraries(whisper_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS}) set_target_properties(whisper_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON) add_library(gpttype_adapter gpttype_adapter.cpp) -target_include_directories(gpttype_adapter PUBLIC . ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common) +target_include_directories(gpttype_adapter PUBLIC . ./ggml/include ./ggml/src ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common) target_compile_features(gpttype_adapter PUBLIC cxx_std_11) # don't bump target_link_libraries(gpttype_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS}) set_target_properties(gpttype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON) @@ -489,7 +508,7 @@ set_target_properties(gpttype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON) if (LLAMA_CUBLAS) set(TARGET koboldcpp_cublas) add_library(${TARGET} SHARED expose.cpp expose.h) - target_include_directories(${TARGET} PUBLIC . ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common) + target_include_directories(${TARGET} PUBLIC . ./ggml/include ./ggml/src ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common) target_compile_features(${TARGET} PUBLIC cxx_std_11) # don't bump set_target_properties(${TARGET} PROPERTIES PREFIX "") set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas") @@ -501,7 +520,7 @@ endif() if (LLAMA_HIPBLAS) set(TARGET koboldcpp_hipblas) add_library(${TARGET} SHARED expose.cpp expose.h) - target_include_directories(${TARGET} PUBLIC . ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common) + target_include_directories(${TARGET} PUBLIC . ./ggml/include ./ggml/src ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common) target_compile_features(${TARGET} PUBLIC cxx_std_11) # don't bump set_target_properties(${TARGET} PROPERTIES PREFIX "") set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_hipblas") diff --git a/CMakePresets.json b/CMakePresets.json deleted file mode 100644 index e2b7a79e371..00000000000 --- a/CMakePresets.json +++ /dev/null @@ -1,49 +0,0 @@ -{ - "version": 4, - "configurePresets": [ - { - "name": "base", - "hidden": true, - "generator": "Ninja", - "binaryDir": "${sourceDir}/build-${presetName}", - "cacheVariables": { - "CMAKE_EXPORT_COMPILE_COMMANDS": "ON", - "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.." - } - }, - - { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } }, - { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } }, - { "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } }, - - { - "name": "arm64-windows-msvc", "hidden": true, - "architecture": { "value": "arm64", "strategy": "external" }, - "toolset": { "value": "host=x86_64", "strategy": "external" }, - "cacheVariables": { - "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake" - } - }, - - { - "name": "arm64-windows-llvm", "hidden": true, - "architecture": { "value": "arm64", "strategy": "external" }, - "toolset": { "value": "host=x86_64", "strategy": "external" }, - "cacheVariables": { - "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake" - } - }, - - { "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] }, - { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "release" ] }, - { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "release", "static" ] }, - - { "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] }, - { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] }, - { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] }, - - { "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] }, - { "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] }, - { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] } - ] -} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md deleted file mode 100644 index 991d85e493b..00000000000 --- a/CONTRIBUTING.md +++ /dev/null @@ -1,14 +0,0 @@ -# Contributing Guidelines - -## Checklist - -* Make sure your PR follows the [coding guidelines](https://github.com/ggerganov/llama.cpp/blob/master/README.md#coding-guidelines) -* Test your changes using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library -* Execute [the full CI locally on your machine](ci/README.md) before publishing - -## PR formatting - -* Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs. - - The PR template has a series of review complexity checkboxes `[ ]` that you can mark as `[X]` for your conveience. Refer to [About task lists](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) for more information. -* If the pull request only contains documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times. -* When squashing multiple commits on merge, use the following format for your commit title: ` : (#)`. For example: `utils : Fix typo in utils.py (#1234)` diff --git a/Makefile b/Makefile index da83137e595..b3bb499f777 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ default: koboldcpp_default koboldcpp_failsafe koboldcpp_openblas koboldcpp_noavx tools: quantize_gpt2 quantize_gptj quantize_gguf quantize_neox quantize_mpt quantize_clip whispermain sdmain gguf-split dev: koboldcpp_openblas dev2: koboldcpp_clblast - +dev3: koboldcpp_vulkan ifndef UNAME_S UNAME_S := $(shell uname -s) @@ -42,8 +42,8 @@ endif # # keep standard at C11 and C++11 -CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -DNDEBUG -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_LLAMAFILE -CXXFLAGS = -I. -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -DNDEBUG -std=c++11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_LLAMAFILE +CFLAGS = -I. -Iggml/include -Iggml/src -Iinclude -Isrc -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -DNDEBUG -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_LLAMAFILE +CXXFLAGS = -I. -Iggml/include -Iggml/src -Iinclude -Isrc -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -DNDEBUG -std=c++11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_LLAMAFILE LDFLAGS = FASTCFLAGS = $(subst -O3,-Ofast,$(CFLAGS)) FASTCXXFLAGS = $(subst -O3,-Ofast,$(CXXFLAGS)) @@ -150,17 +150,17 @@ ifndef LLAMA_NO_ACCELERATE endif # it is recommended to use the CMAKE file to build for cublas if you can - will likely work better -OBJS_CUDA_TEMP_INST = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu)) -OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/mmq*.cu)) -OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu)) -OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu)) -OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu)) +OBJS_CUDA_TEMP_INST = $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-wmma*.cu)) +OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/mmq*.cu)) +OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu)) +OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu)) +OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu)) ifdef LLAMA_CUBLAS CUBLAS_FLAGS = -DGGML_USE_CUDA -DSD_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include - CUBLASLD_FLAGS = -lcuda -lcublas -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/local/cuda/targets/sbsa-linux/lib -L/usr/lib/wsl/lib + CUBLASLD_FLAGS = -lcuda -lcublas -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/local/cuda/targets/sbsa-linux/lib -L/usr/lib/wsl/lib CUBLAS_OBJS = ggml-cuda.o ggml_v3-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o - CUBLAS_OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu)) + CUBLAS_OBJS += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) CUBLAS_OBJS += $(OBJS_CUDA_TEMP_INST) NVCC = nvcc NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math @@ -209,18 +209,14 @@ ifdef LLAMA_CUDA_KQUANTS_ITER else NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2 endif -ifdef LLAMA_CUDA_MMQ_Y - NVCCFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y) -else - NVCCFLAGS += -DGGML_CUDA_MMQ_Y=64 -endif + ifdef LLAMA_CUDA_CCBIN NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN) endif -ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh +ggml/src/ggml-cuda/%.o: ggml/src/ggml-cuda/%.cu ggml/include/ggml.h ggml/src/ggml-common.h ggml/src/ggml-cuda/common.cuh $(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@ -ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh) +ggml-cuda.o: ggml/src/ggml-cuda.cu ggml/include/ggml-cuda.h ggml/include/ggml.h ggml/include/ggml-backend.h ggml/src/ggml-backend-impl.h ggml/src/ggml-common.h $(wildcard ggml/src/ggml-cuda/*.cuh) $(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@ ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h $(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@ @@ -248,7 +244,7 @@ ifdef LLAMA_HIPBLAS HIPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA -DSD_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) HIPLDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 -lrocblas HIP_OBJS += ggml-cuda.o ggml_v3-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o - HIP_OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu)) + HIP_OBJS += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) HIP_OBJS += $(OBJS_CUDA_TEMP_INST) HIPFLAGS2 += $(addprefix --offload-arch=,$(GPU_TARGETS)) @@ -256,9 +252,9 @@ ifdef LLAMA_HIPBLAS HIPFLAGS2 += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) HIPFLAGS2 += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) -ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh +ggml/src/ggml-cuda/%.o: ggml/src/ggml-cuda/%.cu ggml/include/ggml.h ggml/src/ggml-common.h ggml/src/ggml-cuda/common.cuh $(HCXX) $(CXXFLAGS) $(HIPFLAGS) $(HIPFLAGS2) -x hip -c -o $@ $< -ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh) +ggml-cuda.o: ggml/src/ggml-cuda.cu ggml/include/ggml-cuda.h ggml/include/ggml.h ggml/include/ggml-backend.h ggml/src/ggml-backend-impl.h ggml/src/ggml-common.h $(wildcard ggml/src/ggml-cuda/*.cuh) $(HCXX) $(CXXFLAGS) $(HIPFLAGS) $(HIPFLAGS2) -x hip -c -o $@ $< ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h $(HCXX) $(CXXFLAGS) $(HIPFLAGS) $(HIPFLAGS2) -x hip -c -o $@ $< @@ -275,9 +271,10 @@ ifdef LLAMA_METAL LDFLAGS += -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders OBJS += ggml-metal.o -ggml-metal.o: ggml-metal.m ggml-metal.h +ggml-metal.o: ggml/src/ggml-metal.m ggml/include/ggml-metal.h @echo "== Preparing merged Metal file ==" - @sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-merged.metal + @sed -e '/#include "ggml-common.h"/r ggml/src/ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml/src/ggml-metal.metal > ggml/src/ggml-metal-merged.metal + @cp ggml/src/ggml-metal-merged.metal ./ggml-metal-merged.metal $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_METAL @@ -396,57 +393,57 @@ $(info ) # Build library # -ggml.o: ggml.c ggml.h ggml-cuda.h ggml-common.h +ggml.o: ggml/src/ggml.c ggml/include/ggml.h $(CC) $(FASTCFLAGS) $(FULLCFLAGS) -c $< -o $@ -ggml_v4_openblas.o: ggml.c ggml.h ggml-cuda.h ggml-common.h +ggml_v4_openblas.o: ggml/src/ggml.c ggml/include/ggml.h $(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@ -ggml_v4_failsafe.o: ggml.c ggml.h ggml-cuda.h ggml-common.h +ggml_v4_failsafe.o: ggml/src/ggml.c ggml/include/ggml.h $(CC) $(FASTCFLAGS) $(NONECFLAGS) -c $< -o $@ -ggml_v4_noavx2.o: ggml.c ggml.h ggml-cuda.h ggml-common.h +ggml_v4_noavx2.o: ggml/src/ggml.c ggml/include/ggml.h $(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) -c $< -o $@ -ggml_v4_clblast.o: ggml.c ggml.h ggml-cuda.h ggml-common.h +ggml_v4_clblast.o: ggml/src/ggml.c ggml/include/ggml.h $(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ -ggml_v4_cublas.o: ggml.c ggml.h ggml-cuda.h ggml-common.h +ggml_v4_cublas.o: ggml/src/ggml.c ggml/include/ggml.h $(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@ -ggml_v4_clblast_noavx2.o: ggml.c ggml.h ggml-cuda.h ggml-common.h +ggml_v4_clblast_noavx2.o: ggml/src/ggml.c ggml/include/ggml.h $(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ -ggml_v4_vulkan.o: ggml.c ggml.h ggml-cuda.h ggml-common.h +ggml_v4_vulkan.o: ggml/src/ggml.c ggml/include/ggml.h $(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(VULKAN_FLAGS) -c $< -o $@ -ggml_v4_vulkan_noavx2.o: ggml.c ggml.h ggml-cuda.h ggml-common.h +ggml_v4_vulkan_noavx2.o: ggml/src/ggml.c ggml/include/ggml.h $(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) $(VULKAN_FLAGS) -c $< -o $@ #quants -ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-cuda.h ggml-common.h +ggml-quants.o: ggml/src/ggml-quants.c ggml/include/ggml.h ggml/src/ggml-quants.h ggml/src/ggml-common.h $(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@ -ggml-quants_noavx2.o: ggml-quants.c ggml.h ggml-quants.h ggml-cuda.h ggml-common.h +ggml-quants_noavx2.o: ggml/src/ggml-quants.c ggml/include/ggml.h ggml/src/ggml-quants.h ggml/src/ggml-common.h $(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@ -ggml-quants_failsafe.o: ggml-quants.c ggml.h ggml-quants.h ggml-cuda.h ggml-common.h +ggml-quants_failsafe.o: ggml/src/ggml-quants.c ggml/include/ggml.h ggml/src/ggml-quants.h ggml/src/ggml-common.h $(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@ #sgemm -sgemm.o: sgemm.cpp sgemm.h ggml.h +sgemm.o: ggml/src/sgemm.cpp ggml/src/sgemm.h ggml/include/ggml.h $(CXX) $(CXXFLAGS) $(FULLCFLAGS) -c $< -o $@ -sgemm_noavx2.o: sgemm.cpp sgemm.h ggml.h +sgemm_noavx2.o: ggml/src/sgemm.cpp ggml/src/sgemm.h ggml/include/ggml.h $(CXX) $(CXXFLAGS) $(SIMPLECFLAGS) -c $< -o $@ -sgemm_failsafe.o: sgemm.cpp sgemm.h ggml.h +sgemm_failsafe.o: ggml/src/sgemm.cpp ggml/src/sgemm.h ggml/include/ggml.h $(CXX) $(CXXFLAGS) $(NONECFLAGS) -c $< -o $@ #there's no intrinsics or special gpu ops used here, so we can have a universal object -ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h +ggml-alloc.o: ggml/src/ggml-alloc.c ggml/include/ggml.h ggml/include/ggml-alloc.h $(CC) $(CFLAGS) -c $< -o $@ llava.o: examples/llava/llava.cpp examples/llava/llava.h $(CXX) $(CXXFLAGS) -c $< -o $@ -unicode.o: unicode.cpp unicode.h +unicode.o: src/unicode.cpp src/unicode.h $(CXX) $(CXXFLAGS) -c $< -o $@ -unicode-data.o: unicode-data.cpp unicode-data.h +unicode-data.o: src/unicode-data.cpp src/unicode-data.h $(CXX) $(CXXFLAGS) -c $< -o $@ #these have special gpu defines -ggml-backend_default.o: ggml-backend.c ggml.h ggml-backend.h +ggml-backend_default.o: ggml/src/ggml-backend.c ggml/include/ggml.h ggml/include/ggml-backend.h $(CC) $(CFLAGS) -c $< -o $@ -ggml-backend_vulkan.o: ggml-backend.c ggml.h ggml-backend.h +ggml-backend_vulkan.o: ggml/src/ggml-backend.c ggml/include/ggml.h ggml/include/ggml-backend.h $(CC) $(CFLAGS) $(VULKAN_FLAGS) -c $< -o $@ -ggml-backend_cublas.o: ggml-backend.c ggml.h ggml-backend.h +ggml-backend_cublas.o: ggml/src/ggml-backend.c ggml/include/ggml.h ggml/include/ggml-backend.h $(CC) $(CFLAGS) $(CUBLAS_FLAGS) -c $< -o $@ llavaclip_default.o: examples/llava/clip.cpp examples/llava/clip.h $(CXX) $(CXXFLAGS) -c $< -o $@ @@ -454,7 +451,7 @@ llavaclip_cublas.o: examples/llava/clip.cpp examples/llava/clip.h $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) -c $< -o $@ #this is only used for openblas and accelerate -ggml-blas.o: ggml-blas.cpp ggml-blas.h +ggml-blas.o: ggml/src/ggml-blas.cpp ggml/include/ggml-blas.h $(CXX) $(CXXFLAGS) -c $< -o $@ #version 3 libs @@ -506,11 +503,11 @@ ggml_v3-opencl.o: otherarch/ggml_v3-opencl.cpp otherarch/ggml_v3-opencl.h $(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ #vulkan -ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h +ggml-vulkan.o: ggml/src/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(CXX) $(CXXFLAGS) $(VULKAN_FLAGS) -c $< -o $@ # intermediate objects -llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h otherarch/llama-util.h +llama.o: src/llama.cpp ggml/include/ggml.h ggml/include/ggml-alloc.h ggml/include/ggml-backend.h ggml/include/ggml-cuda.h ggml/include/ggml-metal.h include/llama.h otherarch/llama-util.h $(CXX) $(CXXFLAGS) -c $< -o $@ common.o: common/common.cpp common/common.h common/log.h $(CXX) $(CXXFLAGS) -c $< -o $@ @@ -536,7 +533,7 @@ whispercpp_cublas.o: otherarch/whispercpp/whisper_adapter.cpp $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@ # idiotic "for easier compilation" -GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp otherarch/llama_v3.cpp llama.cpp otherarch/utils.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml.h ggml-cuda.h llama.h otherarch/llama-util.h +GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp otherarch/llama_v3.cpp src/llama.cpp otherarch/utils.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml/include/ggml.h ggml/include/ggml-cuda.h include/llama.h otherarch/llama-util.h gpttype_adapter_failsafe.o: $(GPTTYPE_ADAPTER) $(CXX) $(CXXFLAGS) $(FAILSAFE_FLAGS) -c $< -o $@ gpttype_adapter.o: $(GPTTYPE_ADAPTER) @@ -556,8 +553,8 @@ gpttype_adapter_vulkan_noavx2.o: $(GPTTYPE_ADAPTER) clean: rm -vf *.o main sdmain whispermain quantize_gguf quantize_clip quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state gguf imatrix imatrix.exe gguf.exe main.exe quantize_clip.exe quantize_gguf.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_clblast_noavx2.dll koboldcpp_cublas.dll koboldcpp_hipblas.dll koboldcpp_vulkan.dll koboldcpp_vulkan_noavx2.dll koboldcpp_default.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_clblast_noavx2.so koboldcpp_cublas.so koboldcpp_hipblas.so koboldcpp_vulkan.so koboldcpp_vulkan_noavx2.so - rm -vrf ggml-cuda/*.o - rm -vrf ggml-cuda/template-instances/*.o + rm -vrf ggml/src/ggml-cuda/*.o + rm -vrf ggml/src/ggml-cuda/template-instances/*.o # useful tools main: examples/main/main.cpp build-info.h ggml.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS) diff --git a/README.md b/README.md index 849854a7b17..be2dfe1ecbe 100644 --- a/README.md +++ b/README.md @@ -1,49 +1,61 @@ # koboldcpp -KoboldCpp is an easy-to-use AI text-generation software for GGML and GGUF models. It's a single self contained distributable from Concedo, that builds off llama.cpp, and adds a versatile Kobold API endpoint, additional format support, Stable Diffusion image generation, backward compatibility, as well as a fancy UI with persistent stories, editing tools, save formats, memory, world info, author's note, characters, scenarios and everything KoboldAI and KoboldAI Lite have to offer. +KoboldCpp is an easy-to-use AI text-generation software for GGML and GGUF models, inspired by the original KoboldAI. It's a single self-contained distributable from Concedo, that builds off llama.cpp, and adds a versatile **KoboldAI API endpoint**, additional format support, Stable Diffusion image generation, speech-to-text, backward compatibility, as well as a fancy UI with persistent stories, editing tools, save formats, memory, world info, author's note, characters, scenarios and everything KoboldAI and KoboldAI Lite have to offer. ![Preview](media/preview.png) ![Preview](media/preview2.png) ![Preview](media/preview3.png) ![Preview](media/preview4.png) -## Windows Usage -- **[Download the latest .exe release here](https://github.com/LostRuins/koboldcpp/releases/latest)** or clone the git repo. -- Windows binaries are provided in the form of **koboldcpp.exe**, which is a pyinstaller wrapper for a few **.dll** files and **koboldcpp.py**. You can also rebuild it yourself with the provided makefiles and scripts. -- Weights are not included, you can use the official llama.cpp `quantize.exe` to generate them from your official weight files (or download them from other places such as [TheBloke's Huggingface](https://huggingface.co/TheBloke). +## Windows Usage (Precompiled Binary, Recommended) +- Windows binaries are provided in the form of **koboldcpp.exe**, which is a pyinstaller wrapper containing all necessary files. **[Download the latest koboldcpp.exe release here](https://github.com/LostRuins/koboldcpp/releases/latest)** - To run, simply execute **koboldcpp.exe**. - Launching with no command line arguments displays a GUI containing a subset of configurable settings. Generally you dont have to change much besides the `Presets` and `GPU Layers`. Read the `--help` for more info about each settings. - By default, you can connect to http://localhost:5001 - You can also run it using the command line. For info, please check `koboldcpp.exe --help` -### Improving Performance -- **(Nvidia Only) GPU Acceleration**: If you're on Windows with an Nvidia GPU you can get CUDA support out of the box using the `--usecublas` flag, make sure you select the correct .exe with CUDA support. -- **Any GPU Acceleration**: As a slightly slower alternative, try CLBlast with `--useclblast` flags for a slightly slower but more GPU compatible speedup. -- **GPU Layer Offloading**: Want even more speedup? Combine one of the above GPU flags with `--gpulayers` to offload entire layers to the GPU! **Much faster, but uses more VRAM**. Experiment to determine number of layers to offload, and reduce by a few if you run out of memory. -- **Increasing Context Size**: Try `--contextsize 4096` to 2x your context size! without much perplexity gain. Note that you'll have to increase the max context in the KoboldAI Lite UI as well (click and edit the number text field). -- If you are having crashes or issues, you can try turning off BLAS with the `--noblas` flag. You can also try running in a non-avx2 compatibility mode with `--noavx2`. Lastly, you can try turning off mmap with `--nommap`. +## Linux Usage (Precompiled Binary, Recommended) +On modern Linux systems, you should download the `koboldcpp-linux-x64-cuda1150` prebuilt PyInstaller binary on the **[releases page](https://github.com/LostRuins/koboldcpp/releases/latest)**. Simply download and run the binary. -For more information, be sure to run the program with the `--help` flag, or [check the wiki](https://github.com/LostRuins/koboldcpp/wiki). +Alternatively, you can also install koboldcpp to the current directory by running the following terminal command: +``` +curl -fLo koboldcpp https://github.com/LostRuins/koboldcpp/releases/latest/download/koboldcpp-linux-x64-cuda1150 && chmod +x koboldcpp +``` +After running this command you can launch Koboldcpp from the current directory using `./koboldcpp` in the terminal (for CLI usage, run with `--help`). ## Run on Colab - KoboldCpp now has an **official Colab GPU Notebook**! This is an easy way to get started without installing anything in a minute or two. [Try it here!](https://colab.research.google.com/github/LostRuins/koboldcpp/blob/concedo/colab.ipynb). - Note that KoboldCpp is not responsible for your usage of this Colab Notebook, you should ensure that your own usage complies with Google Colab's terms of use. -## OSX and Linux +## Run on RunPod +- KoboldCpp can now be used on RunPod cloud GPUs! This is an easy way to get started without installing anything in a minute or two, and is very scalable, capable of running 70B+ models at afforable cost. [Try our RunPod image here!](https://koboldai.org/runpodcpp). -### Linux Usage (Precompiled Binary, Recommended) -On Linux, we provide a `koboldcpp-linux-x64-cuda1150` PyInstaller prebuilt binary on the **[releases](https://github.com/LostRuins/koboldcpp/releases/latest)** page for modern systems. Simply download and run the binary. +## Docker +- The official docker can be found at https://hub.docker.com/r/koboldai/koboldcpp +- If you're building your own docker, remember to set CUDA_DOCKER_ARCH or enable LLAMA_PORTABLE -Alternatively, you can also install koboldcpp to the current directory by running the following terminal command: -``` -curl -fLo koboldcpp https://github.com/LostRuins/koboldcpp/releases/latest/download/koboldcpp-linux-x64-cuda1150 && chmod +x koboldcpp -``` -After running this command you can launch Koboldcpp from the current directory using `./koboldcpp` in the terminal (for CLI usage, run with `--help`). +## MacOS +- You will need to clone the repo and compile from source code, see Compiling for MacOS below. + +## Obtaining a GGUF model +- KoboldCpp uses GGUF models. They are not included here, but you can download GGUF files from other places such as [TheBloke's Huggingface](https://huggingface.co/TheBloke). Search for "GGUF" on huggingface.co for plenty of compatible models in the `.gguf` format. +- For beginners, we recommend the models [Airoboros Mistral](https://huggingface.co/TheBloke/airoboros-mistral2.2-7B-GGUF/resolve/main/airoboros-mistral2.2-7b.Q4_K_S.gguf) or [Tiefighter 13B](https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf) (larger model). +- [Alternatively, you can download the tools to convert models to the GGUF format yourself here](https://github.com/LostRuins/koboldcpp/releases/download/v1.69.1/koboldcpp_tools_6jul.zip). Run `convert-hf-to-gguf.py` to convert them, then `quantize_gguf.exe` to quantize the result. + +## Improving Performance +- **GPU Acceleration**: If you're on Windows with an Nvidia GPU you can get CUDA support out of the box using the `--usecublas` flag (Nvidia Only), or `--usevulkan` (Any GPU), make sure you select the correct .exe with CUDA support. +- **GPU Layer Offloading**: Add `--gpulayers` to offload model layers to the GPU. The more layers you offload to VRAM, the faster generation speed will become. Experiment to determine number of layers to offload, and reduce by a few if you run out of memory. +- **Increasing Context Size**: Use `--contextsize (number)` to increase context size, allowing the model to read more text. Note that you may also need to increase the max context in the KoboldAI Lite UI as well (click and edit the number text field). +- **Old CPU Compatibility**: If you are having crashes or issues, you can try turning off BLAS with the `--noblas` flag. You can also try running in a non-avx2 compatibility mode with `--noavx2`. Lastly, you can try turning off mmap with `--nommap`. +For more information, be sure to run the program with the `--help` flag, or **[check the wiki](https://github.com/LostRuins/koboldcpp/wiki).** -### Linux Usage (koboldcpp.sh automated compiler script) -when you can't use the precompiled binary directly, we provide an automated build script which uses conda to obtain all dependencies, and generates (from source) a ready-to-use a pyinstaller binary for linux users. Simply execute the build script with `./koboldcpp.sh dist` and run the generated binary. (Not recomended for systems that already have an existing installation of conda. Dependencies: curl, bzip2) +## Compiling KoboldCpp From Source Code +### Compiling on Linux (Using koboldcpp.sh automated compiler script) +when you can't use the precompiled binary directly, we provide an automated build script which uses conda to obtain all dependencies, and generates (from source) a ready-to-use a pyinstaller binary for linux users. +- Clone the repo with `git clone https://github.com/LostRuins/koboldcpp.git` +- Simply execute the build script with `./koboldcpp.sh dist` and run the generated binary. (Not recomended for systems that already have an existing installation of conda. Dependencies: curl, bzip2) ``` ./koboldcpp.sh # This launches the GUI for easy configuration and launching (X11 required). ./koboldcpp.sh --help # List all available terminal commands for using Koboldcpp, you can use koboldcpp.sh the same way as our python script and binaries. @@ -51,38 +63,41 @@ when you can't use the precompiled binary directly, we provide an automated buil ./koboldcpp.sh dist # Generate your own precompiled binary (Due to the nature of Linux compiling these will only work on distributions equal or newer than your own.) ``` -## OSX and Linux Manual Compiling -- Otherwise, you will have to compile your binaries from source. A makefile is provided, simply run `make`. -- If you want you can also link your own install of OpenBLAS manually with `make LLAMA_OPENBLAS=1` -- If you want you can also link your own install of CLBlast manually with `make LLAMA_CLBLAST=1`, for this you will need to obtain and link OpenCL and CLBlast libraries. +### Compiling on Linux (Manual Method) +- To compile your binaries from source, clone the repo with `git clone https://github.com/LostRuins/koboldcpp.git` +- A makefile is provided, simply run `make`. +- Optional OpenBLAS: Link your own install of OpenBLAS manually with `make LLAMA_OPENBLAS=1` +- Optional CLBlast: Link your own install of CLBlast manually with `make LLAMA_CLBLAST=1` +- Note: for these you will need to obtain and link OpenCL and CLBlast libraries. - For Arch Linux: Install `cblas` `openblas` and `clblast`. - For Debian: Install `libclblast-dev` and `libopenblas-dev`. - You can attempt a CuBLAS build with `LLAMA_CUBLAS=1`. You will need CUDA Toolkit installed. Some have also reported success with the CMake file, though that is more for windows. - For a full featured build (all backends), do `make LLAMA_OPENBLAS=1 LLAMA_CLBLAST=1 LLAMA_CUBLAS=1 LLAMA_VULKAN=1`. (Note that `LLAMA_CUBLAS=1` will not work on windows, you need visual studio) -- After all binaries are built, you can run the python script with the command `koboldcpp.py [ggml_model.bin] [port]` +- After all binaries are built, you can run the python script with the command `koboldcpp.py [ggml_model.gguf] [port]` -- Note: Many OSX users have found that the using Accelerate is actually faster than OpenBLAS. To try, you may wish to run with `--noblas` and compare speeds. - -### Arch Linux Packages -There are some community made AUR packages available: [CUBLAS](https://aur.archlinux.org/packages/koboldcpp-cuda), and [HIPBLAS](https://aur.archlinux.org/packages/koboldcpp-hipblas). They are intended for users with NVIDIA GPUs, and users with a supported AMD GPU. Note that these packages may be outdated, and it's probably better to use official KoboldCpp binaries. - -## Compiling on Windows +### Compiling on Windows - You're encouraged to use the .exe released, but if you want to compile your binaries from source at Windows, the easiest way is: - - Use the latest release of w64devkit (https://github.com/skeeto/w64devkit). Be sure to use the "vanilla one", not i686 or other different stuff. If you try they will conflit with the precompiled libs! - - Make sure you are using the w64devkit integrated terminal, then run 'make' at the KoboldCpp source folder. This will create the .dll files. - - If you want to generate the .exe file, make sure you have the python module PyInstaller installed with pip ('pip install PyInstaller'). - - Run the script make_pyinstaller.bat at a regular terminal (or Windows Explorer). + - Get the latest release of w64devkit (https://github.com/skeeto/w64devkit). Be sure to use the "vanilla one", not i686 or other different stuff. If you try they will conflit with the precompiled libs! + - Clone the repo with `git clone https://github.com/LostRuins/koboldcpp.git` + - Make sure you are using the w64devkit integrated terminal, then run `make` at the KoboldCpp source folder. This will create the .dll files. + - If you want to generate the .exe file, make sure you have the python module PyInstaller installed with pip (`pip install PyInstaller`). Then run the script `make_pyinstaller.bat` - The koboldcpp.exe file will be at your dist folder. -- If you wish to use your own version of the additional Windows libraries (OpenCL, CLBlast and OpenBLAS), you can do it with: +- **Building with CUDA**: Visual Studio, CMake and CUDA Toolkit is required. Clone the repo, then open the CMake file and compile it in Visual Studio. Copy the `koboldcpp_cublas.dll` generated into the same directory as the `koboldcpp.py` file. If you are bundling executables, you may need to include CUDA dynamic libraries (such as `cublasLt64_11.dll` and `cublas64_11.dll`) in order for the executable to work correctly on a different PC. +- **Replacing Libraries (Not Recommended)**: If you wish to use your own version of the additional Windows libraries (OpenCL, CLBlast and OpenBLAS), you can do it with: - OpenCL - tested with https://github.com/KhronosGroup/OpenCL-SDK . If you wish to compile it, follow the repository instructions. You will need vcpkg. - CLBlast - tested with https://github.com/CNugteren/CLBlast . If you wish to compile it you will need to reference the OpenCL files. It will only generate the ".lib" file if you compile using MSVC. - OpenBLAS - tested with https://github.com/xianyi/OpenBLAS . - Move the respectives .lib files to the /lib folder of your project, overwriting the older files. - Also, replace the existing versions of the corresponding .dll files located in the project directory root (e.g. libopenblas.dll). - - You can attempt a CuBLAS build with using the provided CMake file with visual studio. If you use the CMake file to build, copy the `koboldcpp_cublas.dll` generated into the same directory as the `koboldcpp.py` file. If you are bundling executables, you may need to include CUDA dynamic libraries (such as `cublasLt64_11.dll` and `cublas64_11.dll`) in order for the executable to work correctly on a different PC. - - Make the KoboldCPP project using the instructions above. + - Make the KoboldCpp project using the instructions above. + +### Compiling on MacOS +- You can compile your binaries from source. You can clone the repo with `git clone https://github.com/LostRuins/koboldcpp.git` +- A makefile is provided, simply run `make`. +- If you want Metal GPU support, instead run `make LLAMA_METAL=1`, note that MacOS metal libraries need to be installed. +- After all binaries are built, you can run the python script with the command `koboldcpp.py --model [ggml_model.gguf]` (and add `--gpulayers (number of layer)` if you wish to offload layers to GPU). -## Compiling on Android (Termux Installation) +### Compiling on Android (Termux Installation) - [Install and run Termux from F-Droid](https://f-droid.org/en/packages/com.termux/) - Enter the command `termux-change-repo` and choose `Mirror by BFSU` - Install dependencies with `pkg install wget git python` (plus any other missing packages) @@ -96,16 +111,9 @@ There are some community made AUR packages available: [CUBLAS](https://aur.archl - If you encounter any errors, make sure your packages are up-to-date with `pkg up` - GPU acceleration for Termux may be possible but I have not explored it. If you find a good cross-device solution, do share or PR it. -## AMD +## AMD Users - Please check out https://github.com/YellowRoseCx/koboldcpp-rocm -## Docker -- The official docker can be found at https://hub.docker.com/r/koboldai/koboldcpp -- KoboldCpp also has a few unofficial third-party community created docker images. Feel free to try them out, but do not expect up-to-date support: - - https://github.com/korewaChino/koboldCppDocker - - https://github.com/noneabove1182/koboldcpp-docker -- If you're building your own docker, remember to set CUDA_DOCKER_ARCH or enable LLAMA_PORTABLE - ## Questions and Help - **First, please check out [The KoboldCpp FAQ and Knowledgebase](https://github.com/LostRuins/koboldcpp/wiki) which may already have answers to your questions! Also please search through past issues and discussions.** - If you cannot find an answer, open an issue on this github, or find us on the [KoboldAI Discord](https://koboldai.org/discord). @@ -114,11 +122,11 @@ There are some community made AUR packages available: [CUBLAS](https://aur.archl - For Windows: No installation, single file executable, (It Just Works) - Since v1.0.6, requires libopenblas, the prebuilt windows binaries are included in this repo. If not found, it will fall back to a mode without BLAS. - Since v1.15, requires CLBlast if enabled, the prebuilt windows binaries are included in this repo. If not found, it will fall back to a mode without CLBlast. -- Since v1.33, you can set the context size to be above what the model supports officially. It does increases perplexity but should still work well below 4096 even on untuned models. (For GPT-NeoX, GPT-J, and LLAMA models) Customize this with `--ropeconfig`. +- Since v1.33, you can set the context size to be above what the model supports officially. It does increases perplexity but should still work well below 4096 even on untuned models. (For GPT-NeoX, GPT-J, and Llama models) Customize this with `--ropeconfig`. - Since v1.42, supports GGUF models for LLAMA and Falcon - Since v1.55, lcuda paths on Linux are hardcoded and may require manual changes to the makefile if you do not use koboldcpp.sh for the compilation. - Since v1.60, provides native image generation with StableDiffusion.cpp, you can load any SD1.5 or SDXL .safetensors model and it will provide an A1111 compatible API to use. -- **I plan to keep backwards compatibility with ALL past llama.cpp AND alpaca.cpp models**. But you are also encouraged to reconvert/update your models if possible for best results. +- **I try to keep backwards compatibility with ALL past llama.cpp models**. But you are also encouraged to reconvert/update your models if possible for best results. ## License - The original GGML library and llama.cpp by ggerganov are licensed under the MIT License @@ -126,17 +134,18 @@ There are some community made AUR packages available: [CUBLAS](https://aur.archl - The other files are also under the AGPL v3.0 License unless otherwise stated ## Notes -- Generation delay scales linearly with original prompt length. If OpenBLAS is enabled then prompt ingestion becomes about 2-3x faster. This is automatic on windows, but will require linking on OSX and Linux. CLBlast speeds this up even further, and `--gpulayers` + `--useclblast` or `--usecublas` more so. -- I have heard of someone claiming a false AV positive report. The exe is a simple pyinstaller bundle that includes the necessary python scripts and dlls to run. If this still concerns you, you might wish to rebuild everything from source code using the makefile, and you can rebuild the exe yourself with pyinstaller by using `make_pyinstaller.bat` -- API documentation available at `/api` and https://lite.koboldai.net/koboldcpp_api -- Supported GGML models (Includes backward compatibility for older versions/legacy GGML models, though some newer features might be unavailable): - - All up-to-date GGUF models are supported (Mistral/Mixtral/QWEN/Gemma and more) - - LLAMA and LLAMA2 (LLaMA / Alpaca / GPT4All / Vicuna / Koala / Pygmalion 7B / Metharme 7B / WizardLM and many more) +- If you wish, after building the koboldcpp libraries with `make`, you can rebuild the exe yourself with pyinstaller by using `make_pyinstaller.bat` +- API documentation available at `/api` (e.g. `http://localhost:5001/api`) and https://lite.koboldai.net/koboldcpp_api. An OpenAI compatible API is also provided at `/v1` route (e.g. `http://localhost:5001/v1`). +- **All up-to-date GGUF models are supported**, and KoboldCpp also includes backward compatibility for older versions/legacy GGML `.bin` models, though some newer features might be unavailable. +- An incomplete list of models and architectures is listed, but there are *many hundreds of other GGUF models*. In general, if it's GGUF, it should work. + - Llama / Llama2 / Llama3 / Alpaca / GPT4All / Vicuna / Koala / Pygmalion / Metharme / WizardLM + - Mistral / Mixtral / Miqu + - Qwen / Qwen2 / Yi + - Gemma / Gemma2 - GPT-2 / Cerebras - - GPT-J - - RWKV + - Phi-2 / Phi-3 - GPT-NeoX / Pythia / StableLM / Dolly / RedPajama - - MPT models - - Falcon (GGUF only) - - [Stable Diffusion and SDXL models](https://github.com/LostRuins/koboldcpp/wiki#can-i-generate-images-with-koboldcpp) - + - GPT-J / RWKV4 / MPT / Falcon / Starcoder / Deepseek and many more + - [Stable Diffusion 1.5 and SDXL safetensor models](https://github.com/LostRuins/koboldcpp/wiki#can-i-generate-images-with-koboldcpp) + - [LLaVA based Vision models and multimodal projectors (mmproj)](https://github.com/LostRuins/koboldcpp/wiki#what-is-llava-and-mmproj) + - [Whisper models for Speech-To-Text](https://huggingface.co/koboldcpp/whisper/tree/main) diff --git a/cmake/arm64-windows-llvm.cmake b/cmake/arm64-windows-llvm.cmake deleted file mode 100644 index 80237968006..00000000000 --- a/cmake/arm64-windows-llvm.cmake +++ /dev/null @@ -1,16 +0,0 @@ -set( CMAKE_SYSTEM_NAME Windows ) -set( CMAKE_SYSTEM_PROCESSOR arm64 ) - -set( target arm64-pc-windows-msvc ) - -set( CMAKE_C_COMPILER clang ) -set( CMAKE_CXX_COMPILER clang++ ) - -set( CMAKE_C_COMPILER_TARGET ${target} ) -set( CMAKE_CXX_COMPILER_TARGET ${target} ) - -set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" ) -set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" ) - -set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" ) -set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" ) diff --git a/cmake/arm64-windows-msvc.cmake b/cmake/arm64-windows-msvc.cmake deleted file mode 100644 index c77631420ce..00000000000 --- a/cmake/arm64-windows-msvc.cmake +++ /dev/null @@ -1,6 +0,0 @@ -set( CMAKE_SYSTEM_NAME Windows ) -set( CMAKE_SYSTEM_PROCESSOR arm64 ) - -set( target arm64-pc-windows-msvc ) -set( CMAKE_C_COMPILER_TARGET ${target} ) -set( CMAKE_CXX_COMPILER_TARGET ${target} ) diff --git a/colab.ipynb b/colab.ipynb index 78ecc91c536..a806fa28442 100644 --- a/colab.ipynb +++ b/colab.ipynb @@ -48,13 +48,12 @@ "source": [ "#@title v-- Enter your model below and then click this to start Koboldcpp\r\n", "\r\n", - "Model = \"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\" #@param [\"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\",\"https://huggingface.co/Sao10K/Fimbulvetr-11B-v2-GGUF/resolve/main/Fimbulvetr-11B-v2-Test-14.q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-13B-GGUF/resolve/main/mythomax-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/ReMM-SLERP-L2-13B-GGUF/resolve/main/remm-slerp-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Xwin-LM-13B-v0.2-GGUF/resolve/main/xwin-lm-13b-v0.2.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Stheno-L2-13B-GGUF/resolve/main/stheno-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-Kimiko-v2-13B-GGUF/resolve/main/mythomax-l2-kimiko-v2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/airoboros-mistral2.2-7B-GGUF/resolve/main/airoboros-mistral2.2-7b.Q4_K_S.gguf\",\"https://huggingface.co/concedo/KobbleTinyV2-1.1B-GGUF/resolve/main/KobbleTiny-Q4_K.gguf\",\"https://huggingface.co/grimjim/kukulemon-7B-GGUF/resolve/main/kukulemon-7B.Q8_0.gguf\"]{allow-input: true}\r\n", + "Model = \"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\" #@param [\"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\",\"https://huggingface.co/KoboldAI/LLaMA2-13B-Estopia-GGUF/resolve/main/LLaMA2-13B-Estopia.Q4_K_S.gguf\",\"https://huggingface.co/Sao10K/Fimbulvetr-11B-v2-GGUF/resolve/main/Fimbulvetr-11B-v2-Test-14.q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-13B-GGUF/resolve/main/mythomax-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/ReMM-SLERP-L2-13B-GGUF/resolve/main/remm-slerp-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Xwin-LM-13B-v0.2-GGUF/resolve/main/xwin-lm-13b-v0.2.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Stheno-L2-13B-GGUF/resolve/main/stheno-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-Kimiko-v2-13B-GGUF/resolve/main/mythomax-l2-kimiko-v2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MistRP-Airoboros-7B-GGUF/resolve/main/mistrp-airoboros-7b.Q4_K_S.gguf\",\"https://huggingface.co/TheBloke/airoboros-mistral2.2-7B-GGUF/resolve/main/airoboros-mistral2.2-7b.Q4_K_S.gguf\",\"https://huggingface.co/concedo/KobbleTinyV2-1.1B-GGUF/resolve/main/KobbleTiny-Q4_K.gguf\",\"https://huggingface.co/grimjim/kukulemon-7B-GGUF/resolve/main/kukulemon-7B.Q8_0.gguf\",\"https://huggingface.co/mradermacher/LemonKunoichiWizardV3-GGUF/resolve/main/LemonKunoichiWizardV3.Q4_K_M.gguf\",\"https://huggingface.co/Lewdiculous/Kunoichi-DPO-v2-7B-GGUF-Imatrix/resolve/main/Kunoichi-DPO-v2-7B-Q4_K_M-imatrix.gguf\",\"https://huggingface.co/mradermacher/L3-8B-Stheno-v3.2-i1-GGUF/resolve/main/L3-8B-Stheno-v3.2.i1-Q4_K_M.gguf\",\"https://huggingface.co/Lewdiculous/Llama-3-Lumimaid-8B-v0.1-OAS-GGUF-IQ-Imatrix/resolve/main/v2-Llama-3-Lumimaid-8B-v0.1-OAS-Q4_K_M-imat.gguf\",\"https://huggingface.co/bartowski/NeuralDaredevil-8B-abliterated-GGUF/resolve/main/NeuralDaredevil-8B-abliterated-Q4_K_M.gguf\",\"https://huggingface.co/bartowski/L3-8B-Lunaris-v1-GGUF/resolve/main/L3-8B-Lunaris-v1-Q4_K_M.gguf\",\"https://huggingface.co/mradermacher/L3-Umbral-Mind-RP-v2.0-8B-GGUF/resolve/main/L3-Umbral-Mind-RP-v2.0-8B.Q4_K_M.gguf\"]{allow-input: true}\r\n", "Layers = 99 #@param [99]{allow-input: true}\r\n", "ContextSize = 4096 #@param [4096] {allow-input: true}\r\n", - "ForceRebuild = False #@param {type:\"boolean\"}\r\n", "#@markdown
\r\n", "LoadLLaVAmmproj = False #@param {type:\"boolean\"}\r\n", - "LLaVAmmproj = \"https://huggingface.co/koboldcpp/mmproj/resolve/main/llama-13b-mmproj-v1.5.Q4_1.gguf\" #@param [\"https://huggingface.co/koboldcpp/mmproj/resolve/main/llama-13b-mmproj-v1.5.Q4_1.gguf\",\"https://huggingface.co/koboldcpp/mmproj/resolve/main/mistral-7b-mmproj-v1.5-Q4_1.gguf\",\"https://huggingface.co/koboldcpp/mmproj/resolve/main/llama-7b-mmproj-v1.5-Q4_0.gguf\"]{allow-input: true}\r\n", + "LLaVAmmproj = \"https://huggingface.co/koboldcpp/mmproj/resolve/main/llama-13b-mmproj-v1.5.Q4_1.gguf\" #@param [\"https://huggingface.co/koboldcpp/mmproj/resolve/main/llama-13b-mmproj-v1.5.Q4_1.gguf\",\"https://huggingface.co/koboldcpp/mmproj/resolve/main/mistral-7b-mmproj-v1.5-Q4_1.gguf\",\"https://huggingface.co/koboldcpp/mmproj/resolve/main/llama-7b-mmproj-v1.5-Q4_0.gguf\",\"https://huggingface.co/koboldcpp/mmproj/resolve/main/LLaMA3-8B_mmproj-Q4_1.gguf\"]{allow-input: true}\r\n", "VCommand = \"\"\r\n", "#@markdown
\r\n", "LoadImgModel = False #@param {type:\"boolean\"}\r\n", @@ -70,12 +69,6 @@ " raise RuntimeError(\"⚠️Colab did not give you a GPU due to usage limits, this can take a few hours before they let you back in. Check out https://lite.koboldai.net for a free alternative (that does not provide an API link but can load KoboldAI saves and chat cards) or subscribe to Colab Pro for immediate access.⚠️\")\r\n", "\r\n", "%cd /content\r\n", - "!git clone https://github.com/LostRuins/koboldcpp\r\n", - "%cd /content/koboldcpp\r\n", - "kvers = !(cat koboldcpp.py | grep 'KcppVersion = ' | cut -d '\"' -f2)\r\n", - "kvers = kvers[0]\r\n", - "if ForceRebuild:\r\n", - " kvers = \"force_rebuild\"\r\n", "if LLaVAmmproj and LoadLLaVAmmproj:\r\n", " VCommand = \"--mmproj vmodel.gguf\"\r\n", "else:\r\n", @@ -88,13 +81,15 @@ " WCommand = \"--whispermodel wmodel.bin\"\r\n", "else:\r\n", " WCommand = \"\"\r\n", - "!echo Finding prebuilt binary for {kvers}\r\n", - "!wget -O dlfile.tmp https://kcppcolab.concedo.workers.dev/?{kvers} && mv dlfile.tmp koboldcpp_cublas.so\r\n", - "!test -f koboldcpp_cublas.so && echo Prebuilt Binary Exists || echo Prebuilt Binary Does Not Exist\r\n", - "!test -f koboldcpp_cublas.so && echo Build Skipped || make -j2 koboldcpp_cublas LLAMA_CUBLAS=1 LLAMA_COLAB=1 LLAMA_PORTABLE=1\r\n", - "!cp koboldcpp_cublas.so koboldcpp_cublas.dat\r\n", + "!echo Downloading KoboldCpp, please wait...\r\n", + "!wget -O dlfile.tmp https://kcpplinux.concedo.workers.dev && mv dlfile.tmp koboldcpp_linux\r\n", + "!test -f koboldcpp_linux && echo Download Successful || echo Download Failed\r\n", + "!chmod +x ./koboldcpp_linux\r\n", "!apt update\r\n", "!apt install aria2 -y\r\n", + "# simple fix for a common URL mistake\r\n", + "if \"https://huggingface.co/\" in Model and \"/blob/main/\" in Model: \r\n", + " Model = Model.replace(\"/blob/main/\", \"/resolve/main/\")\r\n", "!aria2c -x 10 -o model.gguf --summary-interval=5 --download-result=default --allow-overwrite=true --file-allocation=none $Model\r\n", "if VCommand:\r\n", " !aria2c -x 10 -o vmodel.gguf --summary-interval=5 --download-result=default --allow-overwrite=true --file-allocation=none $LLaVAmmproj\r\n", @@ -102,26 +97,13 @@ " !aria2c -x 10 -o imodel.gguf --summary-interval=5 --download-result=default --allow-overwrite=true --file-allocation=none $ImgModel\r\n", "if WCommand:\r\n", " !aria2c -x 10 -o wmodel.bin --summary-interval=5 --download-result=default --allow-overwrite=true --file-allocation=none $SpeechModel\r\n", - "!python koboldcpp.py model.gguf --usecublas 0 mmq --multiuser --gpulayers $Layers --contextsize $ContextSize --quiet --remotetunnel $VCommand $SCommand $WCommand\r\n" + "!./koboldcpp_linux model.gguf --usecublas 0 mmq --multiuser --gpulayers $Layers --contextsize $ContextSize --quiet --remotetunnel $VCommand $SCommand $WCommand\r\n" ] } ], "metadata": { - "accelerator": "GPU", - "colab": { - "cell_execution_strategy": "setup", - "gpuType": "T4", - "include_colab_link": true, - "private_outputs": true, - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } + "language_info": {}, + "orig_nbformat": 3 }, "nbformat": 4, "nbformat_minor": 0 diff --git a/common/common.cpp b/common/common.cpp index 24eb8b64b57..f35f3d7fa38 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -7,7 +7,6 @@ #include "llama.h" #include -#include #include #include #include @@ -275,26 +274,22 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { return true; } +#define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; } + bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) { const char split_delim = ','; llama_sampling_params & sparams = params.sparams; if (arg == "-s" || arg == "--seed") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG // TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context. params.seed = std::stoul(argv[i]); sparams.seed = std::stoul(argv[i]); return true; } if (arg == "-t" || arg == "--threads") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_threads = std::stoi(argv[i]); if (params.n_threads <= 0) { params.n_threads = std::thread::hardware_concurrency(); @@ -302,10 +297,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-tb" || arg == "--threads-batch") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_threads_batch = std::stoi(argv[i]); if (params.n_threads_batch <= 0) { params.n_threads_batch = std::thread::hardware_concurrency(); @@ -313,10 +305,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-td" || arg == "--threads-draft") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_threads_draft = std::stoi(argv[i]); if (params.n_threads_draft <= 0) { params.n_threads_draft = std::thread::hardware_concurrency(); @@ -324,10 +313,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-tbd" || arg == "--threads-batch-draft") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_threads_batch_draft = std::stoi(argv[i]); if (params.n_threads_batch_draft <= 0) { params.n_threads_batch_draft = std::thread::hardware_concurrency(); @@ -335,10 +321,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-p" || arg == "--prompt") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.prompt = argv[i]; return true; } @@ -351,10 +334,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--prompt-cache") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.path_prompt_cache = argv[i]; return true; } @@ -367,10 +347,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-bf" || arg == "--binary-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::ifstream file(argv[i], std::ios::binary); if (!file) { fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); @@ -386,10 +363,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-f" || arg == "--file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::ifstream file(argv[i]); if (!file) { fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); @@ -405,10 +379,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--in-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::ifstream file(argv[i]); if (!file) { fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); @@ -419,66 +390,42 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-n" || arg == "--predict" || arg == "--n-predict") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_predict = std::stoi(argv[i]); return true; } if (arg == "--top-k") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.top_k = std::stoi(argv[i]); return true; } if (arg == "-c" || arg == "--ctx-size") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_ctx = std::stoi(argv[i]); return true; } if (arg == "--grp-attn-n" || arg == "-gan") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.grp_attn_n = std::stoi(argv[i]); return true; } if (arg == "--grp-attn-w" || arg == "-gaw") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.grp_attn_w = std::stoi(argv[i]); return true; } if (arg == "--rope-freq-base") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.rope_freq_base = std::stof(argv[i]); return true; } if (arg == "--rope-freq-scale") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.rope_freq_scale = std::stof(argv[i]); return true; } if (arg == "--rope-scaling") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::string value(argv[i]); /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } @@ -487,217 +434,148 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--rope-scale") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.rope_freq_scale = 1.0f / std::stof(argv[i]); return true; } if (arg == "--yarn-orig-ctx") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.yarn_orig_ctx = std::stoi(argv[i]); return true; } if (arg == "--yarn-ext-factor") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.yarn_ext_factor = std::stof(argv[i]); return true; } if (arg == "--yarn-attn-factor") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.yarn_attn_factor = std::stof(argv[i]); return true; } if (arg == "--yarn-beta-fast") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.yarn_beta_fast = std::stof(argv[i]); return true; } if (arg == "--yarn-beta-slow") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.yarn_beta_slow = std::stof(argv[i]); return true; } if (arg == "--pooling") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::string value(argv[i]); /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } + else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; } + else { invalid_param = true; } + return true; + } + if (arg == "--attention") { + CHECK_ARG + std::string value(argv[i]); + /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } + else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } else { invalid_param = true; } return true; } if (arg == "--defrag-thold" || arg == "-dt") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.defrag_thold = std::stof(argv[i]); return true; } if (arg == "--samplers") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG const auto sampler_names = string_split(argv[i], ';'); sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true); return true; } if (arg == "--sampling-seq") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.samplers_sequence = llama_sampling_types_from_chars(argv[i]); return true; } if (arg == "--top-p") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.top_p = std::stof(argv[i]); return true; } if (arg == "--min-p") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.min_p = std::stof(argv[i]); return true; } if (arg == "--temp") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.temp = std::stof(argv[i]); sparams.temp = std::max(sparams.temp, 0.0f); return true; } if (arg == "--tfs") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.tfs_z = std::stof(argv[i]); return true; } if (arg == "--typical") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.typical_p = std::stof(argv[i]); return true; } if (arg == "--repeat-last-n") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.penalty_last_n = std::stoi(argv[i]); sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); return true; } if (arg == "--repeat-penalty") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.penalty_repeat = std::stof(argv[i]); return true; } if (arg == "--frequency-penalty") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.penalty_freq = std::stof(argv[i]); return true; } if (arg == "--presence-penalty") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.penalty_present = std::stof(argv[i]); return true; } if (arg == "--dynatemp-range") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.dynatemp_range = std::stof(argv[i]); return true; } if (arg == "--dynatemp-exp") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.dynatemp_exponent = std::stof(argv[i]); return true; } if (arg == "--mirostat") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.mirostat = std::stoi(argv[i]); return true; } if (arg == "--mirostat-lr") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.mirostat_eta = std::stof(argv[i]); return true; } if (arg == "--mirostat-ent") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.mirostat_tau = std::stof(argv[i]); return true; } if (arg == "--cfg-negative-prompt") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.cfg_negative_prompt = argv[i]; return true; } if (arg == "--cfg-negative-prompt-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::ifstream file(argv[i]); if (!file) { fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); @@ -711,203 +589,125 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--cfg-scale") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.cfg_scale = std::stof(argv[i]); return true; } if (arg == "-b" || arg == "--batch-size") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_batch = std::stoi(argv[i]); return true; } if (arg == "-ub" || arg == "--ubatch-size") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_ubatch = std::stoi(argv[i]); return true; } if (arg == "--keep") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_keep = std::stoi(argv[i]); return true; } if (arg == "--draft") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_draft = std::stoi(argv[i]); return true; } if (arg == "--chunks") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_chunks = std::stoi(argv[i]); return true; } if (arg == "-np" || arg == "--parallel") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_parallel = std::stoi(argv[i]); return true; } if (arg == "-ns" || arg == "--sequences") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_sequences = std::stoi(argv[i]); return true; } if (arg == "--p-split" || arg == "-ps") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.p_split = std::stof(argv[i]); return true; } if (arg == "-m" || arg == "--model") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.model = argv[i]; return true; } if (arg == "-md" || arg == "--model-draft") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.model_draft = argv[i]; return true; } if (arg == "-a" || arg == "--alias") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.model_alias = argv[i]; return true; } if (arg == "-mu" || arg == "--model-url") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.model_url = argv[i]; return true; } if (arg == "-hfr" || arg == "--hf-repo") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.hf_repo = argv[i]; return true; } if (arg == "-hff" || arg == "--hf-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.hf_file = argv[i]; return true; } if (arg == "--lora") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.lora_adapter.emplace_back(argv[i], 1.0f); params.use_mmap = false; return true; } if (arg == "--lora-scaled") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG const char* lora_adapter = argv[i]; - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); params.use_mmap = false; return true; } if (arg == "--lora-base") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.lora_base = argv[i]; return true; } if (arg == "--control-vector") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.control_vectors.push_back({ 1.0f, argv[i], }); return true; } if (arg == "--control-vector-scaled") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG const char* fname = argv[i]; - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.control_vectors.push_back({ std::stof(argv[i]), fname, }); return true; } if (arg == "--control-vector-layer-range") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.control_vector_layer_start = std::stoi(argv[i]); - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.control_vector_layer_end = std::stoi(argv[i]); return true; } if (arg == "--mmproj") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.mmproj = argv[i]; return true; } if (arg == "--image") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.image.emplace_back(argv[i]); return true; } @@ -923,6 +723,21 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.embedding = true; return true; } + if (arg == "--embd-normalize") { + CHECK_ARG + params.embd_normalize = std::stoi(argv[i]); + return true; + } + if (arg == "--embd-output-format") { + CHECK_ARG + params.embd_out = argv[i]; + return true; + } + if (arg == "--embd-separator") { + CHECK_ARG + params.embd_sep = argv[i]; + return true; + } if (arg == "-if" || arg == "--interactive-first") { params.interactive_first = true; return true; @@ -951,7 +766,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.cache_type_v = argv[++i]; return true; } - if (arg == "--multiline-input") { + if (arg == "-mli" || arg == "--multiline-input") { params.multiline_input = true; return true; } @@ -976,10 +791,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_gpu_layers = std::stoi(argv[i]); if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); @@ -988,10 +800,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_gpu_layers_draft = std::stoi(argv[i]); if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); @@ -1000,10 +809,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--main-gpu" || arg == "-mg") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.main_gpu = std::stoi(argv[i]); #ifndef GGML_USE_CUDA_SYCL_VULKAN fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n"); @@ -1011,10 +817,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--split-mode" || arg == "-sm") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::string arg_next = argv[i]; if (arg_next == "none") { params.split_mode = LLAMA_SPLIT_MODE_NONE; @@ -1039,10 +842,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--tensor-split" || arg == "-ts") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::string arg_next = argv[i]; // split string by , and / @@ -1067,10 +867,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--rpc") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.rpc_servers = argv[i]; return true; } @@ -1079,10 +876,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--numa") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::string value(argv[i]); /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } @@ -1095,10 +889,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--verbosity") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.verbosity = std::stoi(argv[i]); return true; } @@ -1111,18 +902,12 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-r" || arg == "--reverse-prompt") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.antiprompt.emplace_back(argv[i]); return true; } if (arg == "-ld" || arg == "--logdir") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.logdir = argv[i]; if (params.logdir.back() != DIRECTORY_SEPARATOR) { @@ -1131,26 +916,17 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-lcs" || arg == "--lookup-cache-static") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.lookup_cache_static = argv[i]; return true; } if (arg == "-lcd" || arg == "--lookup-cache-dynamic") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.lookup_cache_dynamic = argv[i]; return true; } if (arg == "--save-all-logits" || arg == "--kl-divergence-base") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.logits_file = argv[i]; return true; } @@ -1159,26 +935,17 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--ppl-stride") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.ppl_stride = std::stoi(argv[i]); return true; } if (arg == "--ppl-output-type") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.ppl_output_type = std::stoi(argv[i]); return true; } if (arg == "-ptc" || arg == "--print-token-count") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_print = std::stoi(argv[i]); return true; } @@ -1191,10 +958,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--hellaswag-tasks") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.hellaswag_tasks = std::stoi(argv[i]); return true; } @@ -1203,10 +967,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--winogrande-tasks") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.winogrande_tasks = std::stoi(argv[i]); return true; } @@ -1215,10 +976,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--multiple-choice-tasks") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.multiple_choice_tasks = std::stoi(argv[i]); return true; } @@ -1235,10 +993,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-l" || arg == "--logit-bias") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::stringstream ss(argv[i]); llama_token key; char sign; @@ -1268,37 +1023,32 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } if (arg == "--in-prefix-bos") { params.input_prefix_bos = true; + params.enable_chat_template = false; return true; } if (arg == "--in-prefix") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.input_prefix = argv[i]; + params.enable_chat_template = false; return true; } if (arg == "--in-suffix") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.input_suffix = argv[i]; + params.enable_chat_template = false; + return true; + } + if (arg == "--spm-infill") { + params.spm_infill = true; return true; } if (arg == "--grammar") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG sparams.grammar = argv[i]; return true; } if (arg == "--grammar-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::ifstream file(argv[i]); if (!file) { fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); @@ -1313,18 +1063,12 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } // if (arg == "-j" || arg == "--json-schema") { - // if (++i >= argc) { - // invalid_param = true; - // return true; - // } + // CHECK_ARG // sparams.grammar = json_schema_to_grammar(json::parse(argv[i])); // return true; // } if (arg == "--override-kv") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG if (!string_parse_kv_override(argv[i], params.kv_overrides)) { fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); invalid_param = true; @@ -1333,42 +1077,27 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--host") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.hostname = argv[i]; return true; } if (arg == "--port") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.port = std::stoi(argv[i]); return true; } if (arg == "--path") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.public_path = argv[i]; return true; } if (arg == "--api-key") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.api_keys.push_back(argv[i]); return true; } if (arg == "--api-key-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::ifstream key_file(argv[i]); if (!key_file) { fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); @@ -1385,43 +1114,28 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--ssl-key-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.ssl_file_key = argv[i]; return true; } if (arg == "--ssl-cert-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.ssl_file_cert = argv[i]; return true; } if (arg == "--timeout" || arg == "-to") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.timeout_read = std::stoi(argv[i]); params.timeout_write = std::stoi(argv[i]); return true; } if (arg == "--threads-http") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_threads_http = std::stoi(argv[i]); return true; } if (arg == "-spf" || arg == "--system-prompt-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::ifstream file(argv[i]); if (!file) { fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); @@ -1438,10 +1152,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--log-format") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG if (std::strcmp(argv[i], "json") == 0) { params.log_json = true; } else if (std::strcmp(argv[i], "text") == 0) { @@ -1461,10 +1172,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--slot-save-path") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.slot_save_path = argv[i]; // if doesn't end with DIRECTORY_SEPARATOR, add it if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { @@ -1473,10 +1181,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--chat-template") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG if (!llama_chat_verify_template(argv[i])) { fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]); fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n"); @@ -1487,10 +1192,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--slot-prompt-similarity" || arg == "-sps") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.slot_prompt_similarity = std::stof(argv[i]); return true; } @@ -1499,37 +1201,25 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-npp") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG auto p = string_split(argv[i], split_delim); params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); return true; } if (arg == "-ntg") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG auto p = string_split(argv[i], split_delim); params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); return true; } if (arg == "-npl") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG auto p = string_split(argv[i], split_delim); params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); return true; } if (arg == "--context-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG std::ifstream file(argv[i], std::ios::binary); if (!file) { fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); @@ -1540,59 +1230,38 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--chunk-size") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.chunk_size = std::stoi(argv[i]); return true; } if (arg == "--chunk-separator") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.chunk_separator = argv[i]; return true; } if (arg == "--junk") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_junk = std::stoi(argv[i]); return true; } if (arg == "--pos") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.i_pos = std::stoi(argv[i]); return true; } if (arg == "-o" || arg == "--output" || arg == "--output-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.out_file = argv[i]; params.cvector_outfile = argv[i]; return true; } if (arg == "-ofreq" || arg == "--output-frequency") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_out_freq = std::stoi(argv[i]); return true; } if (arg == "--save-frequency") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_save_freq = std::stoi(argv[i]); return true; } @@ -1605,62 +1274,39 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--chunk" || arg == "--from-chunk") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.i_chunk = std::stoi(argv[i]); return true; } // cvector params - if (arg == "--completions-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } - params.cvector_completions_file = argv[i]; - return true; - } if (arg == "--positive-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.cvector_positive_file = argv[i]; return true; } if (arg == "--negative-file") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.cvector_negative_file = argv[i]; return true; } - if (arg == "--completions") { - if (++i >= argc) { - invalid_param = true; - return true; - } - params.n_completions = std::stoi(argv[i]); - return true; - } if (arg == "--pca-batch") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_pca_batch = std::stoi(argv[i]); return true; } if (arg == "--pca-iter") { - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG params.n_pca_iterations = std::stoi(argv[i]); return true; } + if (arg == "--method") { + CHECK_ARG + std::string value(argv[i]); + /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; } + else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } + else { invalid_param = true; } + return true; + } #ifndef LOG_DISABLE_LOGS // Parse args for logging parameters if (log_param_single_parse(argv[i])) { @@ -1672,10 +1318,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa // We have a matching known parameter requiring an argument, // now we need to check if there is anything after this argv // and flag invalid_param or parse it. - if (++i >= argc) { - invalid_param = true; - return true; - } + CHECK_ARG if (!log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i - 1], argv[i])) { invalid_param = true; return true; @@ -1760,7 +1403,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep }); options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks }); options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" }); - options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with (default: '%s')", params.prompt.c_str() }); + options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with\n" + "in conversation mode, this will be used as system prompt\n" + "(default: '%s')", params.prompt.c_str() }); options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" }); options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" }); options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" }); @@ -1775,13 +1420,17 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param "halt generation at PROMPT, return control in interactive mode\n" "can be specified more than once for multiple prompts" }); options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" }); - options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix) (default: %s)", params.conversation ? "true" : "false" }); + options.push_back({ "main", "-cnv, --conversation", "run in conversation mode, does not print special tokens and suffix/prefix\n" + "if suffix/prefix are not specified, default chat template will be used\n" + "(default: %s)", params.conversation ? "true" : "false" }); options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" }); options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" }); options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" }); options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" }); options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" }); options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" }); + options.push_back({ "server infill", + " --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" }); options.push_back({ "sampling" }); options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n" @@ -1815,7 +1464,11 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "main", " --cfg-negative-prompt-file FNAME", "negative prompt file to use for guidance" }); options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale }); - + options.push_back({ "main", " --chat-template JINJA_TEMPLATE", + "set custom jinja chat template (default: template taken from model's metadata)\n" + "if suffix/prefix are specified, template will be disabled\n" + "only commonly used templates are accepted:\n" + "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" }); options.push_back({ "grammar" }); options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() }); options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" }); @@ -1824,8 +1477,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param "For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" }); options.push_back({ "embedding" }); - options.push_back({ "embedding", " --pooling {none,mean,cls}", + options.push_back({ "embedding", " --pooling {none,mean,cls,last}", "pooling type for embeddings, use model default if unspecified" }); + options.push_back({ "embedding", " --attention {causal,non-causal}", + "attention type for embeddings, use model default if unspecified" }); options.push_back({ "context hacking" }); options.push_back({ "*", " --rope-scaling {none,linear,yarn}", @@ -1871,6 +1526,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "backend" }); options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" }); + if (llama_supports_mlock()) { options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" }); } @@ -1908,9 +1564,11 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (implies --no-mmap)" }); options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" }); options.push_back({ "*", " --lora-base FNAME", "optional model to use as a base for the layers modified by the LoRA adapter" }); - options.push_back({ "*", " --control-vector FNAME", "add a control vector" }); + options.push_back({ "*", " --control-vector FNAME", "add a control vector\n" + "note: this argument can be repeated to add multiple control vectors" }); options.push_back({ "*", " --control-vector-scaled FNAME SCALE", - "add a control vector with user defined scaling SCALE" }); + "add a control vector with user defined scaling SCALE\n" + "note: this argument can be repeated to add multiple scaled control vectors" }); options.push_back({ "*", " --control-vector-layer-range START END", "layer range to apply the control vector(s) to, start and end inclusive" }); options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n" @@ -1944,6 +1602,11 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "bench", "-ntg n0,n1,...", "number of text generation tokens" }); options.push_back({ "bench", "-npl n0,n1,...", "number of parallel prompts" }); + options.push_back({ "embedding" }); + options.push_back({ "embedding", " --embd-normalize", "normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize }); + options.push_back({ "embedding", " --embd-output-format", "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix" }); + options.push_back({ "embedding", " --embd-separator", "separator of embendings (default \\n) for example \"<#sep#>\"" }); + options.push_back({ "server" }); options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() }); options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port }); @@ -1986,11 +1649,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() }); options.push_back({ "cvector", " --positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() }); options.push_back({ "cvector", " --negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() }); - options.push_back({ "cvector", " --completions-file FNAME", - "completions file (default: '%s')", params.cvector_completions_file.c_str() }); - options.push_back({ "cvector", " --completions N", "number of lines of completions file to use (default: %d)", params.n_completions }); - options.push_back({ "cvector", " --batch-pca N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch }); - options.push_back({ "cvector", " --iter-pca N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations }); + options.push_back({ "cvector", " --pca-batch N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch }); + options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations }); + options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" }); printf("usage: %s [options]\n", argv[0]); @@ -2425,7 +2086,24 @@ std::tuple llama_init_from_gpt_par if (params.warmup) { LOG("warming up the model with an empty run\n"); - std::vector tmp = { llama_token_bos(model), llama_token_eos(model), }; + std::vector tmp; + llama_token bos = llama_token_bos(model); + llama_token eos = llama_token_eos(model); + // some models (e.g. T5) don't have a BOS token + if (bos != -1) { + tmp.push_back(bos); + } + tmp.push_back(eos); + + if (llama_model_has_encoder(model)) { + llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0)); + llama_token decoder_start_token_id = llama_model_decoder_start_token(model); + if (decoder_start_token_id == -1) { + decoder_start_token_id = bos; + } + tmp.clear(); + tmp.push_back(decoder_start_token_id); + } llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0)); llama_kv_cache_clear(lctx); llama_synchronize(lctx); @@ -2508,6 +2186,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.yarn_beta_slow = params.yarn_beta_slow; cparams.yarn_orig_ctx = params.yarn_orig_ctx; cparams.pooling_type = params.pooling_type; + cparams.attention_type = params.attention_type; cparams.defrag_thold = params.defrag_thold; cparams.cb_eval = params.cb_eval; cparams.cb_eval_user_data = params.cb_eval_user_data; @@ -2658,7 +2337,14 @@ static bool llama_download_file(const std::string & url, const std::string & pat } // Set the output file - std::unique_ptr outfile(fopen(path_temporary.c_str(), "wb"), fclose); + + struct FILE_deleter { + void operator()(FILE * f) const { + fclose(f); + } + }; + + std::unique_ptr outfile(fopen(path_temporary.c_str(), "wb")); if (!outfile) { fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str()); return false; @@ -2907,51 +2593,35 @@ std::vector llama_tokenize( } std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) { - std::vector result(8, 0); - const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special); - if (n_tokens < 0) { - result.resize(-n_tokens); - int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special); - GGML_ASSERT(check == -n_tokens); - } else { - result.resize(n_tokens); - } - - return std::string(result.data(), result.size()); -} - -std::string llama_detokenize_spm(llama_context * ctx, const std::vector & tokens) { - const llama_token bos_id = llama_token_bos(llama_get_model(ctx)); - std::string piece; - std::string result; - - for (size_t i = 0; i < tokens.size(); ++i) { - piece = llama_token_to_piece(ctx, tokens[i]); - - // remove the leading space of the first non-BOS token - if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') { - piece = piece.substr(1); - } - - result += piece; + piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n' + const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special); + if (n_chars < 0) { + piece.resize(-n_chars); + int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special); + GGML_ASSERT(check == -n_chars); + } + else { + piece.resize(n_chars); } - return result; + return piece; } -std::string llama_detokenize_bpe(llama_context * ctx, const std::vector & tokens) { - std::string piece; - std::string result; - - for (size_t i = 0; i < tokens.size(); ++i) { - piece = llama_token_to_piece(ctx, tokens[i]); - - result += piece; +std::string llama_detokenize(llama_context * ctx, const std::vector & tokens, bool special) { + std::string text; + text.resize(std::max(text.capacity(), tokens.size())); + int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); + if (n_chars < 0) { + text.resize(-n_chars); + n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); + GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization } + text.resize(n_chars); + // NOTE: the original tokenizer decodes bytes after collecting the pieces. - return result; + return text; } bool llama_should_add_bos_token(const llama_model * model) { @@ -2960,12 +2630,91 @@ bool llama_should_add_bos_token(const llama_model * model) { return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM); } +// +// Chat template utils +// + bool llama_chat_verify_template(const std::string & tmpl) { llama_chat_message chat[] = {{"user", "test"}}; int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0); return res >= 0; } +std::string llama_chat_apply_template(const struct llama_model * model, + const std::string & tmpl, + const std::vector & msgs, + bool add_ass) { + int alloc_size = 0; + bool fallback = false; // indicate if we must fallback to default chatml + std::vector chat; + for (auto & msg : msgs) { + chat.push_back({msg.role.c_str(), msg.content.c_str()}); + alloc_size += (msg.role.size() + msg.content.size()) * 1.25; + } + + const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str(); + std::vector buf(alloc_size); + + // run the first time to get the total output length + int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); + + // error: chat template is not supported + if (res < 0) { + if (ptr_tmpl != nullptr) { + // if the custom "tmpl" is not supported, we throw an error + // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template() + throw std::runtime_error("this custom template is not supported"); + } else { + // If the built-in template is not supported, we default to chatml + res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size()); + fallback = true; + } + } + + // if it turns out that our buffer is too small, we resize it + if ((size_t) res > buf.size()) { + buf.resize(res); + res = llama_chat_apply_template( + fallback ? nullptr : model, + fallback ? "chatml" : ptr_tmpl, + chat.data(), chat.size(), add_ass, buf.data(), buf.size()); + } + + std::string formatted_chat(buf.data(), res); + return formatted_chat; +} + +std::string llama_chat_format_single(const struct llama_model * model, + const std::string & tmpl, + const std::vector & past_msg, + const llama_chat_msg & new_msg, + bool add_ass) { + std::ostringstream ss; + auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false); + std::vector chat_new(past_msg); + // if the past_msg ends with a newline, we must preserve it in the formatted version + if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') { + ss << "\n"; + }; + // format chat with new_msg + chat_new.push_back(new_msg); + auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass); + // get the diff part + ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size()); + return ss.str(); +} + +std::string llama_chat_format_example(const struct llama_model * model, + const std::string & tmpl) { + std::vector msgs = { + {"system", "You are a helpful assistant"}, + {"user", "Hello"}, + {"assistant", "Hi there"}, + {"user", "How are you?"}, + }; + return llama_chat_apply_template(model, tmpl, msgs, true); +} + // // KV cache utils // @@ -3045,14 +2794,34 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz // Embedding utils // -void llama_embd_normalize(const float * inp, float * out, int n) { +void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm) { double sum = 0.0; - for (int i = 0; i < n; i++) { - sum += inp[i] * inp[i]; + + switch (embd_norm) { + case -1: // no normalisation + sum = 1.0; + break; + case 0: // max absolute + for (int i = 0; i < n; i++) { + if (sum < std::abs(inp[i])) sum = std::abs(inp[i]); + } + sum /= 32760.0; // make an int16 range + break; + case 2: // euclidean + for (int i = 0; i < n; i++) { + sum += inp[i] * inp[i]; + } + sum = std::sqrt(sum); + break; + default: // p-norm (euclidean is p-norm p=2) + for (int i = 0; i < n; i++) { + sum += std::pow(std::abs(inp[i]), embd_norm); + } + sum = std::pow(sum, 1.0 / embd_norm); + break; } - sum = sqrt(sum); - const float norm = sum > 0.0 ? 1.0f / sum : 0.0f; + const float norm = sum > 0.0 ? 1.0 / sum : 0.0f; for (int i = 0; i < n; i++) { out[i] = inp[i] * norm; @@ -3070,6 +2839,14 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n) sum2 += embd2[i] * embd2[i]; } + // Handle the case where one or both vectors are zero vectors + if (sum1 == 0.0 || sum2 == 0.0) { + if (sum1 == 0.0 && sum2 == 0.0) { + return 1.0f; // two zero vectors are similar + } + return 0.0f; + } + return sum / (sqrt(sum1) * sqrt(sum2)); } @@ -3078,125 +2855,87 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n) // static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) { - int32_t n_tensors; - - size_t n_bytes = 0; - - uint32_t max_direction_layer = 0; - llama_control_vector_data result = { -1, {} }; - // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer - { - struct ggml_init_params meta_params = { - /* .mem_size = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(), - /* .mem_buffer = */ nullptr, - /* .no_alloc = */ true, - }; - ggml_context * meta_ctx = ggml_init(meta_params); - struct gguf_init_params meta_gguf_params = { - /* .no_alloc = */ true, - /* .ctx = */ &meta_ctx, - }; - struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params); - if (!meta_ctx_gguf) { - fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str()); - ggml_free(meta_ctx); - return result; - } - - n_tensors = gguf_get_n_tensors(meta_ctx_gguf); - for (int i = 0; i < n_tensors; i++) { - std::string name = gguf_get_tensor_name(meta_ctx_gguf, i); - - // split on '.' - size_t dotpos = name.find('.'); - if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") { - try { - uint32_t layer = std::stoi(name.substr(dotpos + 1)); - if (layer == 0) { - fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str()); - ggml_free(meta_ctx); - gguf_free(meta_ctx_gguf); - return result; - } - if (layer > max_direction_layer) { - max_direction_layer = layer; - } - } catch (...) { - fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str()); - ggml_free(meta_ctx); - gguf_free(meta_ctx_gguf); - return result; - } - } - - struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str()); - if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) { - fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str()); - ggml_free(meta_ctx); - gguf_free(meta_ctx_gguf); - return result; - } - if (result.n_embd == -1) { - result.n_embd = ggml_nelements(tensor_meta); - } else if (ggml_nelements(tensor_meta) != result.n_embd) { - fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str()); - ggml_free(meta_ctx); - gguf_free(meta_ctx_gguf); - return result; - } - n_bytes += ggml_nbytes(tensor_meta); - } - ggml_free(meta_ctx); - gguf_free(meta_ctx_gguf); + ggml_context * ctx = nullptr; + struct gguf_init_params meta_gguf_params = { + /* .no_alloc = */ false, + /* .ctx = */ &ctx, + }; + struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params); + if (!ctx_gguf) { + fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str()); + return result; } + int32_t n_tensors = gguf_get_n_tensors(ctx_gguf); if (n_tensors == 0) { fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str()); - return result; } - // load and scale tensors into final control vector context - struct ggml_init_params ggml_params = { - /* .mem_size = */ ggml_tensor_overhead() * n_tensors + n_bytes, - /* .mem_buffer = */ nullptr, - /* .no_alloc = */ false, - }; - struct ggml_context * ctx = ggml_init(ggml_params); + for (int i = 0; i < n_tensors; i++) { + std::string name = gguf_get_tensor_name(ctx_gguf, i); - struct gguf_init_params params = { - /*.no_alloc = */ false, - /*.ctx = */ &ctx, - }; - struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params); - if (!ctx_gguf) { - fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str()); - ggml_free(ctx); - return result; - } + int layer_idx = -1; - // do not store data for layer 0 (it's not used) - result.data.resize(result.n_embd * max_direction_layer); + // split on '.' + size_t dotpos = name.find('.'); + if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") { + try { + layer_idx = std::stoi(name.substr(dotpos + 1)); + } catch (...) { + layer_idx = -1; + } + } + if (layer_idx < 0) { + fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str()); + result.n_embd = -1; + break; + } else if (layer_idx == 0) { + fprintf(stderr, "%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str()); + result.n_embd = -1; + break; + } - for (uint32_t il = 1; il <= max_direction_layer; il++) { - const std::string name = "direction." + std::to_string(il); - const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str()); + struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str()); + if (tensor->type != GGML_TYPE_F32) { + fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str()); + result.n_embd = -1; + break; + } + if (ggml_n_dims(tensor) != 1) { + fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str()); + result.n_embd = -1; + break; + } - float * dst = result.data.data() + result.n_embd * (il - 1); + if (result.n_embd == -1) { + result.n_embd = ggml_nelements(tensor); + } else if (ggml_nelements(tensor) != result.n_embd) { + fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str()); + result.n_embd = -1; + break; + } - if (tensor) { - const float * src = (const float *) tensor->data; - for (int j = 0; j < result.n_embd; j++) { - dst[j] = src[j] * load_info.strength; - } - } else { - for (int j = 0; j < result.n_embd; j++) { - dst[j] = 0.0f; - } + // extend if necessary - do not store data for layer 0 (it's not used) + result.data.resize(std::max(result.data.size(), static_cast(result.n_embd * layer_idx)), 0.0f); + + const float * src = (const float *) tensor->data; + float * dst = result.data.data() + result.n_embd * (layer_idx - 1); // layer 1 at [0] + for (int j = 0; j < result.n_embd; j++) { + dst[j] += src[j] * load_info.strength; // allows multiple directions for same layer in same file } + } + if (result.n_embd == -1) { + fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str()); + result.data.clear(); + } + + gguf_free(ctx_gguf); + ggml_free(ctx); + return result; } @@ -3207,16 +2946,19 @@ llama_control_vector_data llama_control_vector_load(const std::vector image; // path to image file(s) + // embedding + bool embedding = false; // get only sentence embedding + int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm) + std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix + std::string embd_sep = "\n"; // separator of embendings + // server params int32_t port = 8080; // server listens on this network port int32_t timeout_read = 600; // http read timeout in seconds @@ -207,6 +218,7 @@ struct gpt_params { std::string public_path = ""; std::string chat_template = ""; std::string system_prompt = ""; + bool enable_chat_template = true; std::vector api_keys; @@ -251,13 +263,14 @@ struct gpt_params { bool compute_ppl = true; // whether to compute perplexity // cvector-generator params - int n_completions = 64; - int n_pca_batch = 20; + int n_pca_batch = 100; int n_pca_iterations = 1000; - std::string cvector_outfile = "control_vector.gguf"; - std::string cvector_completions_file = "examples/cvector-generator/completions.txt"; - std::string cvector_positive_file = "examples/cvector-generator/positive.txt"; - std::string cvector_negative_file = "examples/cvector-generator/negative.txt"; + dimre_method cvector_dimre_method = DIMRE_METHOD_PCA; + std::string cvector_outfile = "control_vector.gguf"; + std::string cvector_positive_file = "examples/cvector-generator/positive.txt"; + std::string cvector_negative_file = "examples/cvector-generator/negative.txt"; + + bool spm_infill = false; // suffix/prefix/middle pattern for infill }; void gpt_params_handle_model_default(gpt_params & params); @@ -354,21 +367,13 @@ std::string llama_token_to_piece( llama_token token, bool special = true); -// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function -// that takes into account the tokenizer type and decides how to handle the leading space -// -// detokenizes a vector of tokens into a string -// should work similar to Python's `tokenizer.decode` -// removes the leading space from the first non-BOS token -std::string llama_detokenize_spm( - llama_context * ctx, - const std::vector & tokens); - // detokenizes a vector of tokens into a string // should work similar to Python's `tokenizer.decode` -std::string llama_detokenize_bpe( +// optionally renders special/control tokens +std::string llama_detokenize( llama_context * ctx, - const std::vector & tokens); + const std::vector & tokens, + bool special = true); // Uses the value from the model metadata if possible, otherwise // defaults to true when model type is SPM, otherwise false. @@ -378,9 +383,34 @@ bool llama_should_add_bos_token(const llama_model * model); // Chat template utils // +// same with llama_chat_message, but uses std::string +struct llama_chat_msg { + std::string role; + std::string content; +}; + // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid bool llama_chat_verify_template(const std::string & tmpl); +// CPP wrapper for llama_chat_apply_template +// If the built-in template is not supported, we default to chatml +// If the custom "tmpl" is not supported, we throw an error +std::string llama_chat_apply_template(const struct llama_model * model, + const std::string & tmpl, + const std::vector & chat, + bool add_ass); + +// Format single message, while taking into account the position of that message in chat history +std::string llama_chat_format_single(const struct llama_model * model, + const std::string & tmpl, + const std::vector & past_msg, + const llama_chat_msg & new_msg, + bool add_ass); + +// Returns an example of formatted chat +std::string llama_chat_format_example(const struct llama_model * model, + const std::string & tmpl); + // // KV cache utils // @@ -395,7 +425,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz // Embedding utils // -void llama_embd_normalize(const float * inp, float * out, int n); +void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2); float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n); @@ -438,5 +468,3 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha void yaml_dump_non_result_info( FILE * stream, const gpt_params & params, const llama_context * lctx, const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc); - - diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 10b9b3d1d4d..881eb49e338 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -40,6 +40,233 @@ static std::string build_repetition(const std::string & item_rule, int min_items return result; } +/* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */ +class string_view { + const std::string & _str; + const size_t _start; + const size_t _end; +public: + string_view(const std::string & str, size_t start = 0, size_t end = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {} + + size_t size() const { + return _end - _start; + } + + size_t length() const { + return size(); + } + + operator std::string() const { + return str(); + } + + std::string str() const { + return _str.substr(_start, _end - _start); + } + + string_view substr(size_t pos, size_t len = std::string::npos) const { + return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len); + } + + char operator[](size_t pos) const { + auto index = _start + pos; + if (index >= _end) { + throw std::out_of_range("string_view index out of range"); + } + return _str[_start + pos]; + } + + bool operator==(const string_view & other) const { + std::string this_str = *this; + std::string other_str = other; + return this_str == other_str; + } +}; + +static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) { + auto has_min = min_value != std::numeric_limits::min(); + auto has_max = max_value != std::numeric_limits::max(); + + auto digit_range = [&](char from, char to) { + out << "["; + if (from == to) { + out << from; + } else { + out << from << "-" << to; + } + out << "]"; + }; + auto more_digits = [&](int min_digits, int max_digits) { + out << "[0-9]"; + if (min_digits == max_digits && min_digits == 1) { + return; + } + out << "{"; + out << min_digits; + if (max_digits != min_digits) { + out << ","; + if (max_digits != std::numeric_limits::max()) { + out << max_digits; + } + } + out << "}"; + }; + std::function uniform_range = + [&](const string_view & from, const string_view & to) { + size_t i = 0; + while (i < from.length() && i < to.length() && from[i] == to[i]) { + i++; + } + if (i > 0) { + out << "\"" << from.substr(0, i).str() << "\""; + } + if (i < from.length() && i < to.length()) { + if (i > 0) { + out << " "; + } + auto sub_len = from.length() - i - 1; + if (sub_len > 0) { + auto from_sub = from.substr(i + 1); + auto to_sub = to.substr(i + 1); + auto sub_zeros = repeat("0", sub_len); + auto sub_nines = repeat("9", sub_len); + + auto to_reached = false; + out << "("; + if (from_sub == sub_zeros) { + digit_range(from[i], to[i] - 1); + out << " "; + more_digits(sub_len, sub_len); + } else { + out << "[" << from[i] << "] "; + out << "("; + uniform_range(from_sub, sub_nines); + out << ")"; + if (from[i] < to[i] - 1) { + out << " | "; + if (to_sub == sub_nines) { + digit_range(from[i] + 1, to[i]); + to_reached = true; + } else { + digit_range(from[i] + 1, to[i] - 1); + } + out << " "; + more_digits(sub_len, sub_len); + } + } + if (!to_reached) { + out << " | "; + digit_range(to[i], to[i]); + out << " "; + uniform_range(sub_zeros, to_sub); + } + out << ")"; + } else { + out << "[" << from[i] << "-" << to[i] << "]"; + } + } + }; + + if (has_min && has_max) { + if (min_value < 0 && max_value < 0) { + out << "\"-\" ("; + _build_min_max_int(-max_value, -min_value, out, decimals_left, /* top_level= */ true); + out << ")"; + return; + } + + if (min_value < 0) { + out << "\"-\" ("; + _build_min_max_int(0, -min_value, out, decimals_left, /* top_level= */ true); + out << ") | "; + min_value = 0; + } + + auto min_s = std::to_string(min_value); + auto max_s = std::to_string(max_value); + auto min_digits = min_s.length(); + auto max_digits = max_s.length(); + + for (auto digits = min_digits; digits < max_digits; digits++) { + uniform_range(min_s, repeat("9", digits)); + min_s = "1" + repeat("0", digits); + out << " | "; + } + uniform_range(min_s, max_s); + return; + } + + auto less_decimals = std::max(decimals_left - 1, 1); + + if (has_min) { + if (min_value < 0) { + out << "\"-\" ("; + _build_min_max_int(std::numeric_limits::min(), -min_value, out, decimals_left, /* top_level= */ false); + out << ") | [0] | [1-9] "; + more_digits(0, decimals_left - 1); + } else if (min_value == 0) { + if (top_level) { + out << "[0] | [1-9] "; + more_digits(0, less_decimals); + } else { + more_digits(1, decimals_left); + } + } else if (min_value <= 9) { + char c = '0' + min_value; + auto range_start = top_level ? '1' : '0'; + if (c > range_start) { + digit_range(range_start, c - 1); + out << " "; + more_digits(1, less_decimals); + out << " | "; + } + digit_range(c, '9'); + out << " "; + more_digits(0, less_decimals); + } else { + auto min_s = std::to_string(min_value); + auto len = min_s.length(); + auto c = min_s[0]; + + if (c > '1') { + digit_range(top_level ? '1' : '0', c - 1); + out << " "; + more_digits(len, less_decimals); + out << " | "; + } + digit_range(c, c); + out << " ("; + _build_min_max_int(std::stoi(min_s.substr(1)), std::numeric_limits::max(), out, less_decimals, /* top_level= */ false); + out << ")"; + if (c < '9') { + out << " | "; + digit_range(c + 1, '9'); + out << " "; + more_digits(len - 1, less_decimals); + } + } + return; + } + + if (has_max) { + if (max_value >= 0) { + if (top_level) { + out << "\"-\" [1-9] "; + more_digits(0, less_decimals); + out << " | "; + } + _build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true); + } else { + out << "\"-\" ("; + _build_min_max_int(-max_value, std::numeric_limits::max(), out, decimals_left, /* top_level= */ false); + out << ")"; + } + return; + } + + throw std::runtime_error("At least one of min_value or max_value must be set"); +} + const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}"; struct BuiltinRule { @@ -89,7 +316,7 @@ std::unordered_map GRAMMAR_LITERAL_ESCAPES = { }; std::unordered_set NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'}; -std::unordered_set ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'[', ']', '(', ')', '|', '{', '}', '*', '+', '?'}; +std::unordered_set ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'}; template std::string join(Iterator begin, Iterator end, const std::string & separator) { @@ -160,7 +387,6 @@ static std::string format_literal(const std::string & literal) { return "\"" + escaped + "\""; } - class SchemaConverter { private: std::function _fetch_json; @@ -388,6 +614,75 @@ class SchemaConverter { return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space"); } + /* + Returns a rule that matches a JSON string that is none of the provided strings + + not_strings({"a"}) + -> ["] ( [a] char+ | [^"a] char* )? ["] space + not_strings({"and", "also"}) + -> ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["] space + */ + std::string _not_strings(const std::vector & strings) { + + struct TrieNode { + std::map children; + bool is_end_of_string; + + TrieNode() : is_end_of_string(false) {} + + void insert(const std::string & string) { + auto node = this; + for (char c : string) { + node = &node->children[c]; + } + node->is_end_of_string = true; + } + }; + + TrieNode trie; + for (const auto & s : strings) { + trie.insert(s); + } + + std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char")); + std::ostringstream out; + out << "[\"] ( "; + std::function visit = [&](const TrieNode & node) { + std::ostringstream rejects; + auto first = true; + for (const auto & kv : node.children) { + rejects << kv.first; + if (first) { + first = false; + } else { + out << " | "; + } + out << "[" << kv.first << "]"; + if (!kv.second.children.empty()) { + out << " ("; + visit(kv.second); + out << ")"; + } else if (kv.second.is_end_of_string) { + out << " " << char_rule << "+"; + } + } + if (!node.children.empty()) { + if (!first) { + out << " | "; + } + out << "[^\"" << rejects.str() << "] " << char_rule << "*"; + } + }; + visit(trie); + + out << " )"; + if (!trie.is_end_of_string) { + out << "?"; + } + out << " [\"] space"; + return out.str(); + } + std::string _resolve_ref(const std::string & ref) { std::string ref_name = ref.substr(ref.find_last_of('/') + 1); if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) { @@ -408,6 +703,7 @@ class SchemaConverter { std::vector required_props; std::vector optional_props; std::unordered_map prop_kv_rule_names; + std::vector prop_names; for (const auto & kv : properties) { const auto &prop_name = kv.first; const auto &prop_schema = kv.second; @@ -422,11 +718,18 @@ class SchemaConverter { } else { optional_props.push_back(prop_name); } + prop_names.push_back(prop_name); } - if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get())) { + if ((additional_properties.is_boolean() && additional_properties.get()) || additional_properties.is_object()) { std::string sub_name = name + (name.empty() ? "" : "-") + "additional"; - std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value"); - std::string kv_rule = _add_rule(sub_name + "-kv", _add_primitive("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule); + std::string value_rule = + additional_properties.is_object() ? visit(additional_properties, sub_name + "-value") + : _add_primitive("value", PRIMITIVE_RULES.at("value")); + + auto key_rule = + prop_names.empty() ? _add_primitive("string", PRIMITIVE_RULES.at("string")) + : _add_rule(sub_name + "-k", _not_strings(prop_names)); + std::string kv_rule = _add_rule(sub_name + "-kv", key_rule + " \":\" space " + value_rule); prop_kv_rule_names["*"] = kv_rule; optional_props.push_back("*"); } @@ -452,15 +755,11 @@ class SchemaConverter { } std::string k = ks[0]; std::string kv_rule_name = prop_kv_rule_names[k]; - if (k == "*") { - res = _add_rule( - name + (name.empty() ? "" : "-") + "additional-kvs", - kv_rule_name + " ( \",\" space " + kv_rule_name + " )*" - ); - } else if (first_is_optional) { - res = "( \",\" space " + kv_rule_name + " )?"; + std::string comma_ref = "( \",\" space " + kv_rule_name + " )"; + if (first_is_optional) { + res = comma_ref + (k == "*" ? "*" : "?"); } else { - res = kv_rule_name; + res = kv_rule_name + (k == "*" ? " " + comma_ref + "*" : ""); } if (ks.size() > 1) { res += " " + _add_rule( @@ -594,17 +893,19 @@ class SchemaConverter { } else if (schema_type.is_array()) { std::vector schema_types; for (const auto & t : schema_type) { - schema_types.push_back({{"type", t}}); + json schema_copy(schema); + schema_copy["type"] = t; + schema_types.push_back(schema_copy); } return _add_rule(rule_name, _generate_union_rule(name, schema_types)); } else if (schema.contains("const")) { - return _add_rule(rule_name, _generate_constant_rule(schema["const"])); + return _add_rule(rule_name, _generate_constant_rule(schema["const"]) + " space"); } else if (schema.contains("enum")) { std::vector enum_values; for (const auto & v : schema["enum"]) { enum_values.push_back(_generate_constant_rule(v)); } - return _add_rule(rule_name, join(enum_values.begin(), enum_values.end(), " | ")); + return _add_rule(rule_name, "(" + join(enum_values.begin(), enum_values.end(), " | ") + ") space"); } else if ((schema_type.is_null() || schema_type == "object") && (schema.contains("properties") || (schema.contains("additionalProperties") && schema["additionalProperties"] != true))) { @@ -686,6 +987,24 @@ class SchemaConverter { int min_len = schema.contains("minLength") ? schema["minLength"].get() : 0; int max_len = schema.contains("maxLength") ? schema["maxLength"].get() : std::numeric_limits::max(); return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space"); + } else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) { + int min_value = std::numeric_limits::min(); + int max_value = std::numeric_limits::max(); + if (schema.contains("minimum")) { + min_value = schema["minimum"].get(); + } else if (schema.contains("exclusiveMinimum")) { + min_value = schema["exclusiveMinimum"].get() + 1; + } + if (schema.contains("maximum")) { + max_value = schema["maximum"].get(); + } else if (schema.contains("exclusiveMaximum")) { + max_value = schema["exclusiveMaximum"].get() - 1; + } + std::stringstream out; + out << "("; + _build_min_max_int(min_value, max_value, out); + out << ") space"; + return _add_rule(rule_name, out.str()); } else if (schema.empty() || schema_type == "object") { return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object"))); } else { diff --git a/common/sampling.cpp b/common/sampling.cpp index 11c2033bfd4..2a07b711a18 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -28,9 +28,13 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_ std::vector grammar_rules(result->parsed_grammar.c_rules()); - result->grammar = llama_grammar_init( + struct llama_grammar * grammar = llama_grammar_init( grammar_rules.data(), grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root")); + if (grammar == nullptr) { + throw std::runtime_error("Failed to initialize llama_grammar"); + } + result->grammar = grammar; } result->prev.resize(params.n_prev); @@ -59,9 +63,13 @@ void llama_sampling_reset(llama_sampling_context * ctx) { if (!ctx->parsed_grammar.rules.empty()) { std::vector grammar_rules(ctx->parsed_grammar.c_rules()); - ctx->grammar = llama_grammar_init( + struct llama_grammar * grammar = llama_grammar_init( grammar_rules.data(), grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root")); + if (grammar == nullptr) { + throw std::runtime_error("Failed to initialize llama_grammar"); + } + ctx->grammar = grammar; } std::fill(ctx->prev.begin(), ctx->prev.end(), 0); diff --git a/convert-hf-to-gguf.py b/convert_hf_to_gguf.py similarity index 83% rename from convert-hf-to-gguf.py rename to convert_hf_to_gguf.py index a6751cc80e6..455eea883b9 100755 --- a/convert-hf-to-gguf.py +++ b/convert_hf_to_gguf.py @@ -13,7 +13,7 @@ from enum import IntEnum from pathlib import Path from hashlib import sha256 -from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast +from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast import math import numpy as np @@ -65,7 +65,8 @@ class Model: # subclasses should define this! model_arch: gguf.MODEL_ARCH - def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, model_name: str | None): + def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, + model_name: str | None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False): if type(self) is Model: raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") self.dir_model = dir_model @@ -80,7 +81,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, if not self.is_safetensors: self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin") self.hparams = Model.load_hparams(self.dir_model) - self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"]) + self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"]) self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) self.tensor_names = None if self.ftype == gguf.LlamaFileType.GUESSED: @@ -96,7 +97,8 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, ftype_lw: str = ftype_up.lower() # allow templating the file name with the output ftype, useful with the "auto" ftype self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up) - self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file) + self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, + split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard) @classmethod def __init_subclass__(cls): @@ -332,6 +334,8 @@ def write(self): self.gguf_writer.close() def write_vocab(self): + if len(self.gguf_writer.tensors) != 1: + raise ValueError('Splitting the vocabulary is not supported') self.gguf_writer.write_header_to_file(self.fname_out) self.gguf_writer.write_kv_data_to_file() self.gguf_writer.close() @@ -400,7 +404,7 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: return tokens, toktypes, tokpre - # NOTE: this function is generated by convert-hf-to-gguf-update.py + # NOTE: this function is generated by convert_hf_to_gguf_update.py # do not modify it manually! # ref: https://github.com/ggerganov/llama.cpp/pull/6920 # Marker: Start get_vocab_base_pre @@ -420,7 +424,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: res = None - # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script + # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script # or pull the latest version of the model from Huggingface # don't edit the hashes manually! if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": @@ -483,15 +487,21 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a": # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code res = "jina-v2-code" + if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee": + # ref: https://huggingface.co/LumiOpen/Viking-7B + res = "viking" + if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901": + # ref: https://huggingface.co/core42/jais-13b + res = "jais" if res is None: logger.warning("\n") logger.warning("**************************************************************************************") logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!") logger.warning("** There are 2 possible reasons for this:") - logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet") + logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet") logger.warning("** - the pre-tokenization config has changed upstream") - logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.") + logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.") logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") logger.warning("**") logger.warning(f"** chkhsh: {chkhsh}") @@ -569,7 +579,19 @@ def _set_vocab_qwen(self): special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"]) special_vocab.add_to_gguf(self.gguf_writer) - def _set_vocab_sentencepiece(self): + def _set_vocab_sentencepiece(self, add_to_gguf=True): + tokens, scores, toktypes = self._create_vocab_sentencepiece() + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def _create_vocab_sentencepiece(self): from sentencepiece import SentencePieceProcessor tokenizer_path = self.dir_model / 'tokenizer.model' @@ -631,14 +653,7 @@ def _set_vocab_sentencepiece(self): scores.append(-1000.0) toktypes.append(SentencePieceTokenTypes.UNUSED) - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) + return tokens, scores, toktypes def _set_vocab_llama_hf(self): vocab = gguf.LlamaHfVocab(self.dir_model) @@ -662,6 +677,51 @@ def _set_vocab_llama_hf(self): special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) + def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int): + tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf" + logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") + vocab_reader = gguf.GGUFReader(tokenizer_path, "r") + + default_pre = "mpt" if model_name == "gpt-neox" else "default" + + field = vocab_reader.get_field(gguf.Keys.Tokenizer.MODEL) + assert field # tokenizer model + self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8")) + + field = vocab_reader.get_field(gguf.Keys.Tokenizer.PRE) + self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else default_pre) + + field = vocab_reader.get_field(gguf.Keys.Tokenizer.LIST) + assert field # token list + self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size]) + + if model_name == "llama-spm": + field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES) + assert field # token scores + self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) + + field = vocab_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE) + assert field # token types + self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) + + if model_name != "llama-spm": + field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES) + assert field # token merges + self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data]) + + if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)) is not None: + self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0]) + if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)) is not None: + self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0]) + if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)) is not None: + self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0]) + if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)) is not None: + self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0]) + if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_BOS)) is not None: + self.gguf_writer.add_add_bos_token(field.parts[-1].tolist()[0]) + if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_EOS)) is not None: + self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0]) + @Model.register("GPTNeoXForCausalLM") class GPTNeoXModel(Model): @@ -967,7 +1027,11 @@ def set_vocab(self): from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(dir_model) vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) - assert max(tokenizer.vocab.values()) < vocab_size + # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size, + # because vocab_size is the count of items, and indexes start at 0. + max_vocab_index = max(tokenizer.get_vocab().values()) + if max_vocab_index >= vocab_size: + raise ValueError("Vocabulary size exceeds expected maximum size.") reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} added_vocab = tokenizer.get_added_vocab() @@ -1400,6 +1464,48 @@ def write_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") +@Model.register("BitnetForCausalLM") +class BitnetModel(Model): + model_arch = gguf.MODEL_ARCH.BITNET + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(1.0) + + def weight_quant(self, weight): + dtype = weight.dtype + weight = weight.float() + s = 1 / weight.abs().mean().clamp(min=1e-5) + weight = (weight * s).round().clamp(-1, 1) / s + scale = weight.abs().max().unsqueeze(0) + weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype) + weight = torch.sign(weight).type(dtype) + return weight.type(dtype), scale.type(torch.float32) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + new_name = self.map_tensor_name(name) + + if any(self.match_model_tensor_name(new_name, key, bid) for key in [ + gguf.MODEL_TENSOR.ATTN_Q, + gguf.MODEL_TENSOR.ATTN_K, + gguf.MODEL_TENSOR.ATTN_V, + gguf.MODEL_TENSOR.ATTN_OUT, + gguf.MODEL_TENSOR.FFN_UP, + gguf.MODEL_TENSOR.FFN_DOWN, + gguf.MODEL_TENSOR.FFN_GATE, + ]): + # transform weight into 1/0/-1 (in fp32) + weight_torch, scale_torch = self.weight_quant(data_torch) + yield (new_name, weight_torch) + yield (new_name.removesuffix(".weight") + ".scale", scale_torch) + else: + yield (new_name, data_torch) + + @Model.register("GrokForCausalLM") class GrokModel(Model): model_arch = gguf.MODEL_ARCH.GROK @@ -1881,7 +1987,7 @@ def set_gguf_parameters(self): if len(rope_scaling_type) == 0: raise KeyError('Missing the required key rope_scaling.type') - if rope_scaling_type == 'su': + if rope_scaling_type == 'su' or rope_scaling_type == 'longrope': attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0 elif rope_scaling_type == 'yarn': attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0 @@ -2255,6 +2361,8 @@ def set_vocab(self): special_vocab._set_special_token("eot", 107) special_vocab.add_to_gguf(self.gguf_writer) + self.gguf_writer.add_add_space_prefix(False) + def set_gguf_parameters(self): hparams = self.hparams block_count = hparams["num_hidden_layers"] @@ -2287,6 +2395,71 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] +@Model.register("Gemma2ForCausalLM") +class Gemma2Model(Model): + model_arch = gguf.MODEL_ARCH.GEMMA2 + + def set_vocab(self): + tokens, scores, toktypes = self._create_vocab_sentencepiece() + # hack: This is required so that we can properly use start/end-of-turn for chat template + for i in range(108): + # including , , + toktypes[i] = SentencePieceTokenTypes.CONTROL + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + self.gguf_writer.add_add_space_prefix(False) + + def set_gguf_parameters(self): + hparams = self.hparams + block_count = hparams["num_hidden_layers"] + + self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_key_length(hparams["head_dim"]) + self.gguf_writer.add_value_length(hparams["head_dim"]) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_attn_logit_softcapping( + self.hparams["attn_logit_softcapping"] + ) + self.gguf_writer.add_final_logit_softcapping( + self.hparams["final_logit_softcapping"] + ) + self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) + + # sanity check + attn_scalar = self.hparams["query_pre_attn_scalar"] + if attn_scalar != hparams["hidden_size"] / hparams["num_attention_heads"]: + raise ValueError("query_pre_attn_scalar must be equal to n_embd / n_head") + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + # lm_head is not used in llama.cpp, while autoawq will include this tensor in model + # To prevent errors, skip loading lm_head.weight. + if name == "lm_head.weight": + logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") + return [] + + # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 + if name.endswith("norm.weight"): + data_torch = data_torch + 1 + + return [(self.map_tensor_name(name), data_torch)] + + @Model.register("Starcoder2ForCausalLM") class StarCoder2Model(Model): model_arch = gguf.MODEL_ARCH.STARCODER2 @@ -2311,39 +2484,7 @@ def set_vocab(self): self._set_vocab_sentencepiece() else: # Use the GPT-NeoX tokenizer when no tokenizer files are present - tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf" - logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") - neox_reader = gguf.GGUFReader(tokenizer_path, "r") - - field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL) - self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8") if field else "gpt2") - - field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE) - self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else "mpt") - - field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST) - assert field - self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size]) - - field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE) - assert field - self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) - - field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES) - assert field - self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data]) - - field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID) - self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0] if field else 1) - - field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID) - self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0] if field else 0) - - field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID) - self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0] if field else 0) - - field = neox_reader.get_field(gguf.Keys.Tokenizer.PAD_ID) - self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0] if field else 0) + self._set_vocab_builtin("gpt-neox", vocab_size) def set_gguf_parameters(self): d_model = self.find_hparam(["hidden_size", "d_model"]) @@ -2495,6 +2636,82 @@ def set_vocab(self, *args, **kwargs): self.gguf_writer.add_add_eos_token(True) +@Model.register("OpenELMForCausalLM") +class OpenELMModel(Model): + model_arch = gguf.MODEL_ARCH.OPENELM + + @staticmethod + def _make_divisible(v: float | int, divisor: int) -> int: + # ref: https://huggingface.co/apple/OpenELM-270M-Instruct/blob/eb111ff2e6724348e5b905984063d4064d4bc579/configuration_openelm.py#L34-L38 + new_v = max(divisor, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. + if new_v < 0.9 * v: + new_v += divisor + return new_v + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + ffn_multipliers: list[float] = self.hparams["ffn_multipliers"] + ffn_dim_divisor: int = self.hparams["ffn_dim_divisor"] + self._n_embd: int = self.hparams["model_dim"] + self._num_kv_heads: list[int] = self.hparams["num_kv_heads"] + self._num_query_heads: list[int] = self.hparams["num_query_heads"] + self._ffn_dims: list[int] = [ + OpenELMModel._make_divisible(multiplier * self._n_embd, ffn_dim_divisor) + for multiplier in ffn_multipliers + ] + assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int) + assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int) + + # Uses the tokenizer from meta-llama/Llama-2-7b-hf + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_builtin("llama-spm", self.hparams["vocab_size"]) + + def set_gguf_parameters(self): + n_embd = self._n_embd + head_dim = self.hparams["head_dim"] + rot_pct = 1.0 + assert self.block_count == len(self._num_kv_heads) + assert self.block_count == len(self._num_query_heads) + assert self.block_count == len(self._ffn_dims) + + self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_context_length(self.hparams["max_context_length"]) + self.gguf_writer.add_embedding_length(n_embd) + self.gguf_writer.add_feed_forward_length(self._ffn_dims) + self.gguf_writer.add_head_count(self._num_query_heads) + self.gguf_writer.add_head_count_kv(self._num_kv_heads) + self.gguf_writer.add_rope_freq_base(self.hparams["rope_freq_constant"]) + # https://huggingface.co/apple/OpenELM-270M-Instruct/blob/c401df2/modeling_openelm.py#L30 + self.gguf_writer.add_layer_norm_rms_eps(1e-6) + self.gguf_writer.add_rope_dimension_count(int(rot_pct * head_dim)) + self.gguf_writer.add_key_length(head_dim) + self.gguf_writer.add_value_length(head_dim) + self.gguf_writer.add_file_type(self.ftype) + + def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: + if "n_layers" in keys: + return self.hparams["num_transformer_layers"] + + return super().find_hparam(keys, optional) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + + # split ff + if bid is not None and name == f"transformer.layers.{bid}.ffn.proj_1.weight": + ff_dim = self._ffn_dims[bid] + yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim]) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:]) + return + + yield (self.map_tensor_name(name), data_torch) + + @Model.register("ArcticForCausalLM") class ArcticModel(Model): model_arch = gguf.MODEL_ARCH.ARCTIC @@ -2725,6 +2942,240 @@ def write_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") +@Model.register("T5WithLMHeadModel") +@Model.register("T5ForConditionalGeneration") +@Model.register("MT5ForConditionalGeneration") +@Model.register("UMT5ForConditionalGeneration") +class T5Model(Model): + model_arch = gguf.MODEL_ARCH.T5 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.shared_token_embeddings_found = False + + def set_vocab(self): + # to avoid TypeError: Descriptors cannot be created directly + # exception when importing sentencepiece_model_pb2 + os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" + from sentencepiece import SentencePieceProcessor + from sentencepiece import sentencepiece_model_pb2 as model + + tokenizer_path = self.dir_model / 'tokenizer.model' + + # many older models use spiece.model tokenizer model filename + if not tokenizer_path.is_file(): + tokenizer_path = self.dir_model / 'spiece.model' + + if not tokenizer_path.is_file(): + raise FileNotFoundError(f"File not found: {tokenizer_path}") + + sentencepiece_model = model.ModelProto() + sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) + + # some models like Pile-T5 family use BPE tokenizer instead of Unigram + if sentencepiece_model.trainer_spec.model_type == 2: # BPE + # assure the tokenizer model file name is correct + assert tokenizer_path.name == 'tokenizer.model' + return self._set_vocab_sentencepiece() + else: + assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM + + add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix + remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces + precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size + + for token_id in range(tokenizer.vocab_size()): + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + for key in added_tokens_json: + token_id = added_tokens_json[key] + if (token_id >= vocab_size): + logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue + + tokens[token_id] = key.encode("utf-8") + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + + if vocab_size > len(tokens): + pad_count = vocab_size - len(tokens) + logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") + for i in range(1, pad_count + 1): + tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.UNUSED) + + self.gguf_writer.add_tokenizer_model("t5") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_add_space_prefix(add_prefix) + self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces) + if precompiled_charsmap: + self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + self.gguf_writer.add_add_bos_token(False) + self.gguf_writer.add_add_eos_token(True) + + def set_gguf_parameters(self): + self.gguf_writer.add_name("T5") + if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None: + logger.warning("Couldn't find context length in config.json, assuming default value of 512") + n_ctx = 512 + self.gguf_writer.add_context_length(n_ctx) + self.gguf_writer.add_embedding_length(self.hparams["d_model"]) + self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"]) + self.gguf_writer.add_block_count(self.hparams["num_layers"]) + self.gguf_writer.add_head_count(self.hparams["num_heads"]) + self.gguf_writer.add_key_length(self.hparams["d_kv"]) + self.gguf_writer.add_value_length(self.hparams["d_kv"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", + # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored + # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder + # and decoder and ignore the remaining ones. + if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]: + if not self.shared_token_embeddings_found: + name = "shared.weight" + self.shared_token_embeddings_found = True + else: + logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.") + return [] + + return [(self.map_tensor_name(name), data_torch)] + + +@Model.register("JAISLMHeadModel") +class JaisModel(Model): + model_arch = gguf.MODEL_ARCH.JAIS + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # SwigLU activation + assert self.hparams["activation_function"] == "swiglu" + # ALiBi position embedding + assert self.hparams["position_embedding_type"] == "alibi" + + # Embeddings scale + self.embeddings_scale = 1.0 + # note: For some JAIS flavors, output is tied to (same as) wte in original model + self.output_is_wte = False + if 'mup_embeddings_scale' in self.hparams: + self.output_is_wte = True # Hack (?) + self.embeddings_scale = self.hparams['mup_embeddings_scale'] + elif 'embeddings_scale' in self.hparams: + self.embeddings_scale = self.hparams['embeddings_scale'] + else: + assert False + + self.width_scale = 1.0 + if 'mup_output_alpha' in self.hparams: + assert 'mup_width_scale' in self.hparams + self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale'] + elif 'width_scale' in self.hparams: + self.width_scale = self.hparams['width_scale'] + else: + assert False + + self.max_alibi_bias = 8.0 + + def set_vocab(self): + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_block_count(self.hparams["n_layer"]) + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"]) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + tensors: list[tuple[str, Tensor]] = [] + + # we don't need these + if name.endswith((".attn.bias")): + return tensors + + if name.endswith(("relative_pe.slopes")): + # Calculate max ALiBi bias (this is the inverse of the ALiBi calculation) + # Some other models has max_alibi_bias spelled out explicitly in the hyperparams, + # but Jais's PyTorch model simply precalculates the slope values and places them + # in relative_pes.slopes + n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"])) + first_val = float(data_torch._data[0]) + self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2) + + return tensors + + if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")): + data_torch = data_torch.transpose(1, 0) + + new_name = self.map_tensor_name(name) + + if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): + tensors.append((new_name, data_torch * self.embeddings_scale)) + if self.output_is_wte: + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale)) + elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT): + assert not self.output_is_wte + tensors.append((new_name, data_torch * self.width_scale)) + else: + tensors.append((new_name, data_torch)) + + return tensors + + def write_tensors(self): + super().write_tensors() + self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias) + + ###### CONVERSION LOGIC ###### @@ -2774,10 +3225,6 @@ def parse_args() -> argparse.Namespace: "--vocab-only", action="store_true", help="extract only the vocab", ) - parser.add_argument( - "--awq-path", type=Path, default=None, - help="Path to scale awq cache file", - ) parser.add_argument( "--outfile", type=Path, help="path to write to; default: based on input. {ftype} will be replaced by the outtype.", @@ -2810,10 +3257,44 @@ def parse_args() -> argparse.Namespace: "--verbose", action="store_true", help="increase output verbosity", ) + parser.add_argument( + "--split-max-tensors", type=int, default=0, + help="max tensors in each split", + ) + parser.add_argument( + "--split-max-size", type=str, default="0", + help="max size per split N(M|G)", + ) + parser.add_argument( + "--dry-run", action="store_true", + help="only print out a split plan and exit, without writing any new files", + ) + parser.add_argument( + "--no-tensor-first-split", action="store_true", + help="do not add tensors to the first split (disabled by default)" + ) return parser.parse_args() +def split_str_to_n_bytes(split_str: str) -> int: + if split_str.endswith("K"): + n = int(split_str[:-1]) * 1000 + elif split_str.endswith("M"): + n = int(split_str[:-1]) * 1000 * 1000 + elif split_str.endswith("G"): + n = int(split_str[:-1]) * 1000 * 1000 * 1000 + elif split_str.isnumeric(): + n = int(split_str) + else: + raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G") + + if n < 0: + raise ValueError(f"Invalid split size: {split_str}, must be positive") + + return n + + def main() -> None: args = parse_args() @@ -2821,19 +3302,6 @@ def main() -> None: dir_model = args.model - if args.awq_path: - sys.path.insert(1, str(Path(__file__).parent / 'awq-py')) - from awq.apply_awq import add_scale_weights # type: ignore[import-not-found] - tmp_model_path = args.model / "weighted_model" - dir_model = tmp_model_path - if tmp_model_path.is_dir(): - logger.info(f"{tmp_model_path} exists as a weighted model.") - else: - tmp_model_path.mkdir(parents=True, exist_ok=True) - logger.info("Saving new weighted model ...") - add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path)) - logger.info(f"Saved weighted model at {tmp_model_path}.") - if not dir_model.is_dir(): logger.error(f'Error: {args.model} is not a directory') sys.exit(1) @@ -2846,6 +3314,11 @@ def main() -> None: "auto": gguf.LlamaFileType.GUESSED, } + is_split = args.split_max_tensors > 0 or args.split_max_size != "0" + if args.use_temp_file and is_split: + logger.error("Error: Cannot use temp file when splitting") + sys.exit(1) + if args.outfile is not None: fname_out = args.outfile else: @@ -2863,7 +3336,10 @@ def main() -> None: logger.error(f"Model {hparams['architectures'][0]} is not supported") sys.exit(1) - model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy, args.model_name) + model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, + args.no_lazy, args.model_name, split_max_tensors=args.split_max_tensors, + split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run, + small_first_shard=args.no_tensor_first_split) logger.info("Set model parameters") model_instance.set_gguf_parameters() @@ -2874,13 +3350,14 @@ def main() -> None: model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) if args.vocab_only: - logger.info(f"Exporting model vocab to '{model_instance.fname_out}'") + logger.info("Exporting model vocab...") model_instance.write_vocab() + logger.info(f"Model vocab successfully exported to {model_instance.fname_out}") else: - logger.info(f"Exporting model to '{model_instance.fname_out}'") + logger.info("Exporting model...") model_instance.write() - - logger.info(f"Model successfully exported to '{model_instance.fname_out}'") + out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out + logger.info(f"Model successfully exported to {out_path}") if __name__ == '__main__': diff --git a/convert-hf-to-gguf-update.py b/convert_hf_to_gguf_update.py similarity index 85% rename from convert-hf-to-gguf-update.py rename to convert_hf_to_gguf_update.py index fbf1e1ea3de..e4165ae2d97 100755 --- a/convert-hf-to-gguf-update.py +++ b/convert_hf_to_gguf_update.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # This script downloads the tokenizer models of the specified models from Huggingface and -# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py +# generates the get_vocab_base_pre() function for convert_hf_to_gguf.py # # This is necessary in order to analyze the type of pre-tokenizer used by the model and # provide the necessary information to llama.cpp via the GGUF header in order to implement @@ -15,9 +15,9 @@ # - Add a new model to the "models" list # - Run the script with your huggingface token: # -# python3 convert-hf-to-gguf-update.py +# python3 convert_hf_to_gguf_update.py # -# - Copy-paste the generated get_vocab_base_pre() function into convert-hf-to-gguf.py +# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py # - Update llama.cpp with the new pre-tokenizer if necessary # # TODO: generate tokenizer tests for llama.cpp @@ -37,7 +37,7 @@ from transformers import AutoTokenizer logging.basicConfig(level=logging.DEBUG) -logger = logging.getLogger("convert-hf-to-gguf-update") +logger = logging.getLogger("convert_hf_to_gguf_update") sess = requests.Session() @@ -45,6 +45,7 @@ class TOKENIZER_TYPE(IntEnum): SPM = auto() BPE = auto() WPM = auto() + UGM = auto() # TODO: this string has to exercise as much pre-tokenizer functionality as possible @@ -55,10 +56,10 @@ class TOKENIZER_TYPE(IntEnum): token = sys.argv[1] if not token.startswith("hf_"): logger.info("Huggingface token seems invalid") - logger.info("Usage: python convert-hf-to-gguf-update.py ") + logger.info("Usage: python convert_hf_to_gguf_update.py ") sys.exit(1) else: - logger.info("Usage: python convert-hf-to-gguf-update.py ") + logger.info("Usage: python convert_hf_to_gguf_update.py ") sys.exit(1) # TODO: add models here, base models preferred @@ -85,6 +86,11 @@ class TOKENIZER_TYPE(IntEnum): {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", }, {"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", }, {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", }, + {"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B + {"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", }, + {"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", }, + {"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", }, + {"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", }, ] @@ -106,9 +112,13 @@ def download_model(model): os.makedirs(f"models/tokenizers/{name}", exist_ok=True) files = ["config.json", "tokenizer.json", "tokenizer_config.json"] + if tokt == TOKENIZER_TYPE.SPM: files.append("tokenizer.model") + if tokt == TOKENIZER_TYPE.UGM: + files.append("spiece.model") + for file in files: save_path = f"models/tokenizers/{name}/{file}" if os.path.isfile(save_path): @@ -124,14 +134,14 @@ def download_model(model): logger.error(f"Failed to download model {model['name']}. Error: {e}") -# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function: +# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function: src_ifs = "" for model in models: name = model["name"] tokt = model["tokt"] - if tokt == TOKENIZER_TYPE.SPM: + if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM: continue # Skip if the tokenizer folder does not exist or there are other download issues previously @@ -141,7 +151,10 @@ def download_model(model): # create the tokenizer try: - tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}") + if name == "t5": + tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False) + else: + tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}") except OSError as e: logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}") continue # Skip to the next model if the tokenizer can't be loaded @@ -188,7 +201,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: res = None - # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script + # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script # or pull the latest version of the model from Huggingface # don't edit the hashes manually! {src_ifs} @@ -197,9 +210,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: logger.warning("**************************************************************************************") logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!") logger.warning("** There are 2 possible reasons for this:") - logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet") + logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet") logger.warning("** - the pre-tokenization config has changed upstream") - logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.") + logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.") logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") logger.warning("**") logger.warning(f"** chkhsh: {{chkhsh}}") @@ -213,8 +226,8 @@ def get_vocab_base_pre(self, tokenizer) -> str: return res """ -convert_py_pth = pathlib.Path("convert-hf-to-gguf.py") -convert_py = convert_py_pth.read_text() +convert_py_pth = pathlib.Path("convert_hf_to_gguf.py") +convert_py = convert_py_pth.read_text(encoding="utf-8") convert_py = re.sub( r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)", lambda m: m.group(1) + src_func + m.group(3), @@ -222,9 +235,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: flags=re.DOTALL | re.MULTILINE, ) -convert_py_pth.write_text(convert_py) +convert_py_pth.write_text(convert_py, encoding="utf-8") -logger.info("+++ convert-hf-to-gguf.py was updated") +logger.info("+++ convert_hf_to_gguf.py was updated") # generate tests for each tokenizer model @@ -262,6 +275,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: "\n =", "' era", "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~", + "!!!!!!", "3", "33", "333", @@ -271,7 +285,8 @@ def get_vocab_base_pre(self, tokenizer) -> str: "3333333", "33333333", "333333333", - # "Cửa Việt", # llama-bpe fails on this + "Cửa Việt", # llama-bpe fails on this + " discards", chktxt, ] @@ -299,7 +314,10 @@ def get_vocab_base_pre(self, tokenizer) -> str: # create the tokenizer try: - tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}") + if name == "t5": + tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False) + else: + tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}") except OSError as e: logger.error(f"Failed to load tokenizer for model {name}. Error: {e}") continue # Skip this model and continue with the next one in the loop @@ -325,6 +343,6 @@ def get_vocab_base_pre(self, tokenizer) -> str: for model in models: name = model["name"] - print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100 + print(f"python3 convert_hf_to_gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100 logger.info("\n") diff --git a/convert-llama-ggml-to-gguf.py b/convert_llama_ggml_to_gguf.py similarity index 100% rename from convert-llama-ggml-to-gguf.py rename to convert_llama_ggml_to_gguf.py diff --git a/docs/HOWTO-add-model.md b/docs/HOWTO-add-model.md deleted file mode 100644 index 3eec077ea7b..00000000000 --- a/docs/HOWTO-add-model.md +++ /dev/null @@ -1,119 +0,0 @@ -## Add a new model architecture to `llama.cpp` - -Adding a model requires few steps: - -1. Convert the model to GGUF -2. Define the model architecture in `llama.cpp` -3. Build the GGML graph implementation - -After following these steps, you can open PR. - -Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially: -- [main](../examples/main) -- [imatrix](../examples/imatrix) -- [quantize](../examples/quantize) -- [server](../examples/server) - -### 1. Convert the model to GGUF - -This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library. -Depending on the model architecture, you can use either [convert-hf-to-gguf.py](../convert-hf-to-gguf.py) or [examples/convert-legacy-llama.py](../examples/convert-legacy-llama.py) (for `llama/llama2` models in `.pth` format). - -The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors. - -The required steps to implement for an HF model are: - -1. Define the model `Model.register` annotation in a new `Model` subclass, example: - -```python -@Model.register("MyModelForCausalLM") -class MyModel(Model): - model_arch = gguf.MODEL_ARCH.GROK -``` - -2. Define the layout of the GGUF tensors in [constants.py](../gguf-py/gguf/constants.py) - -Add an enum entry in `MODEL_ARCH`, the model human friendly name in `MODEL_ARCH_NAMES` and the GGUF tensor names in `MODEL_TENSORS`. - -Example for `falcon` model: -```python - MODEL_ARCH.FALCON: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_NORM_2, - MODEL_TENSOR.ATTN_QKV, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - ] -``` - -3. Map the original tensor names to the standardize equivalent in GGUF - -As a general rule, before adding a new tensor name to GGUF, be sure the equivalent naming does not already exist. - -Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](../gguf-py/gguf/tensor_mapping.py) file. - -If the tensor name is part of a repetitive layer/block, the key word `bid` substitutes it. - -Example for the normalization tensor in attention layers: - -```python -block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { - # Attention norm - MODEL_TENSOR.ATTN_NORM: ( - "gpt_neox.layers.{bid}.input_layernorm", # gptneox - "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen - "transformer.blocks.{bid}.norm_1", # mpt - ... - ) -} -``` - -`transformer.blocks.{bid}.norm_1` will be mapped to `blk.{bid}.attn_norm` in GGUF. - -Depending on the model configuration, tokenizer, code and tensors layout, you will have to override: -- `Model#set_gguf_parameters` -- `Model#set_vocab` -- `Model#write_tensors` - -NOTE: Tensor names must end with `.weight` suffix, that is the convention and several tools like `quantize` expect this to proceed the weights. - -### 2. Define the model architecture in `llama.cpp` - -The model params and tensors layout must be defined in `llama.cpp`: -1. Define a new `llm_arch` -2. Define the tensors layout in `LLM_TENSOR_NAMES` -3. Add any non standard metadata in `llm_load_hparams` -4. Create the tensors for inference in `llm_load_tensors` -5. If the model has a RoPE operation, add the rope type in `llama_rope_type` - -NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions. - -### 3. Build the GGML graph implementation - -This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`. - -Have a look at existing implementation like `build_llama`, `build_dbrx` or `build_bert`. - -When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR. - -Note: to debug the inference graph: you can use [llama-eval-callback](../examples/eval-callback). - -## GGUF specification - -https://github.com/ggerganov/ggml/blob/master/docs/gguf.md - -## Resources - -- YaRN RoPE scaling https://github.com/ggerganov/llama.cpp/pull/2268 -- support Baichuan serial models https://github.com/ggerganov/llama.cpp/pull/3009 -- support attention bias https://github.com/ggerganov/llama.cpp/pull/4283 -- Mixtral support https://github.com/ggerganov/llama.cpp/pull/4406 -- BERT embeddings https://github.com/ggerganov/llama.cpp/pull/5423 -- Grok-1 support https://github.com/ggerganov/llama.cpp/pull/6204 -- Command R Plus support https://github.com/ggerganov/llama.cpp/pull/6491 -- support arch DBRX https://github.com/ggerganov/llama.cpp/pull/6515 -- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/llama.cpp/discussions/2948 diff --git a/docs/debugging-tests.md b/docs/debugging-tests.md deleted file mode 100644 index 18407f688f9..00000000000 --- a/docs/debugging-tests.md +++ /dev/null @@ -1,104 +0,0 @@ -# Debugging Tests Tips - -## How to run & execute or debug a specific test without anything else to keep the feedback loop short? - -There is a script called debug-test.sh in the scripts folder whose parameter takes a REGEX and an optional test number. - -For example, running the following command will output an interactive list from which you can select a test. It takes this form: - -`debug-test.sh [OPTION]... ` - -It will then build & run in the debugger for you. - -To just execute a test and get back a PASS or FAIL message run: - -```bash -./scripts/debug-test.sh test-tokenizer -``` - -To test in GDB use the `-g` flag to enable gdb test mode. - -```bash -./scripts/debug-test.sh -g test-tokenizer - -# Once in the debugger, i.e. at the chevrons prompt, setting a breakpoint could be as follows: ->>> b main -``` - -To speed up the testing loop, if you know your test number you can just run it similar to below: - -```bash -./scripts/debug-test.sh test 23 -``` - -For further reference use `debug-test.sh -h` to print help. - -  - -### How does the script work? -If you want to be able to use the concepts contained in the script separately, the important ones are briefly outlined below. - -#### Step 1: Reset and Setup folder context - -From base of this repository, let's create `build-ci-debug` as our build context. - -```bash -rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug -``` - -#### Step 2: Setup Build Environment and Compile Test Binaries - -Setup and trigger a build under debug mode. You may adapt the arguments as needed, but in this case these are sane defaults. - -```bash -cmake -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON .. -make -j -``` - -#### Step 3: Find all tests available that matches REGEX - -The output of this command will give you the command & arguments needed to run GDB. - -* `-R test-tokenizer` : looks for all the test files named `test-tokenizer*` (R=Regex) -* `-N` : "show-only" disables test execution & shows test commands that you can feed to GDB. -* `-V` : Verbose Mode - -```bash -ctest -R "test-tokenizer" -V -N -``` - -This may return output similar to below (focusing on key lines to pay attention to): - -```bash -... -1: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf" -1: Working Directory: . -Labels: main - Test #1: test-tokenizer-0-llama-spm -... -4: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-falcon.gguf" -4: Working Directory: . -Labels: main - Test #4: test-tokenizer-0-falcon -... -``` - -#### Step 4: Identify Test Command for Debugging - -So for test #1 above we can tell these two pieces of relevant information: -* Test Binary: `~/llama.cpp/build-ci-debug/bin/test-tokenizer-0` -* Test GGUF Model: `~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf` - -#### Step 5: Run GDB on test command - -Based on the ctest 'test command' report above we can then run a gdb session via this command below: - -```bash -gdb --args ${Test Binary} ${Test GGUF Model} -``` - -Example: - -```bash -gdb --args ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf" -``` diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt deleted file mode 100644 index 0b51c44c05e..00000000000 --- a/examples/CMakeLists.txt +++ /dev/null @@ -1,56 +0,0 @@ -# dependencies - -find_package(Threads REQUIRED) - -# third-party - -# ... - -# examples - -include_directories(${CMAKE_CURRENT_SOURCE_DIR}) - -if (EMSCRIPTEN) -else() - add_subdirectory(cvector-generator) - add_subdirectory(baby-llama) - add_subdirectory(batched-bench) - add_subdirectory(batched) - add_subdirectory(benchmark) - add_subdirectory(convert-llama2c-to-ggml) - add_subdirectory(embedding) - add_subdirectory(eval-callback) - add_subdirectory(export-lora) - add_subdirectory(finetune) - add_subdirectory(gbnf-validator) - add_subdirectory(gguf-split) - add_subdirectory(gguf) - add_subdirectory(gritlm) - add_subdirectory(imatrix) - add_subdirectory(infill) - add_subdirectory(llama-bench) - add_subdirectory(llava) - add_subdirectory(lookahead) - add_subdirectory(lookup) - add_subdirectory(main) - add_subdirectory(parallel) - add_subdirectory(passkey) - add_subdirectory(perplexity) - add_subdirectory(quantize-stats) - add_subdirectory(quantize) - add_subdirectory(retrieval) - if (LLAMA_RPC) - add_subdirectory(rpc) - endif() - if (LLAMA_BUILD_SERVER) - add_subdirectory(server) - endif() - if (LLAMA_SYCL) - add_subdirectory(sycl) - endif() - add_subdirectory(save-load-state) - add_subdirectory(simple) - add_subdirectory(speculative) - add_subdirectory(tokenize) - add_subdirectory(train-text-from-scratch) -endif() diff --git a/examples/baby-llama/CMakeLists.txt b/examples/baby-llama/CMakeLists.txt deleted file mode 100644 index 71b82105c88..00000000000 --- a/examples/baby-llama/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-baby-llama) -add_executable(${TARGET} baby-llama.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/batched-bench/CMakeLists.txt b/examples/batched-bench/CMakeLists.txt deleted file mode 100644 index 959acaeeebc..00000000000 --- a/examples/batched-bench/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-batched-bench) -add_executable(${TARGET} batched-bench.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index dbbd06da581..616494d2d84 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -229,7 +229,7 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] { private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? { var result = [CChar](repeating: 0, count: 8) - let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), false) + let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), 0, false) if nTokens < 0 { let actualTokensCount = -Int(nTokens) result = .init(repeating: 0, count: actualTokensCount) @@ -238,6 +238,7 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String token, &result, Int32(result.count), + 0, false ) assert(check == actualTokensCount) diff --git a/examples/batched/CMakeLists.txt b/examples/batched/CMakeLists.txt deleted file mode 100644 index 77e33343b66..00000000000 --- a/examples/batched/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-batched) -add_executable(${TARGET} batched.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 62d9b144d33..2442e954dcc 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -93,14 +93,34 @@ int main(int argc, char ** argv) { // create a llama_batch // we use this object to submit token data for decoding - llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0, 1); + llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel); + + std::vector seq_ids(n_parallel, 0); + for (int32_t i = 0; i < n_parallel; ++i) { + seq_ids[i] = i; + } // evaluate the initial prompt for (size_t i = 0; i < tokens_list.size(); ++i) { - llama_batch_add(batch, tokens_list[i], i, { 0 }, false); + llama_batch_add(batch, tokens_list[i], i, seq_ids, false); } GGML_ASSERT(batch.n_tokens == (int) tokens_list.size()); + if (llama_model_has_encoder(model)) { + if (llama_encode(ctx, batch)) { + LOG_TEE("%s : failed to eval\n", __func__); + return 1; + } + + llama_token decoder_start_token_id = llama_model_decoder_start_token(model); + if (decoder_start_token_id == -1) { + decoder_start_token_id = llama_token_bos(model); + } + + llama_batch_clear(batch); + llama_batch_add(batch, decoder_start_token_id, 0, seq_ids, false); + } + // llama_decode will output logits only for the last token of the prompt batch.logits[batch.n_tokens - 1] = true; @@ -109,11 +129,11 @@ int main(int argc, char ** argv) { return 1; } - // assign the system KV cache to all parallel sequences - // this way, the parallel sequences will "reuse" the prompt tokens without having to copy them - for (int32_t i = 1; i < n_parallel; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); - } + //// assign the system KV cache to all parallel sequences + //// this way, the parallel sequences will "reuse" the prompt tokens without having to copy them + //for (int32_t i = 1; i < n_parallel; ++i) { + // llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); + //} if (n_parallel > 1) { LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel); diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt deleted file mode 100644 index 34a58cc02ab..00000000000 --- a/examples/benchmark/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -set(TARGET llama-bench-matmult) -add_executable(${TARGET} benchmark-matmult.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) -target_include_directories(${TARGET} PRIVATE ../../common) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/convert-llama2c-to-ggml/CMakeLists.txt b/examples/convert-llama2c-to-ggml/CMakeLists.txt deleted file mode 100644 index a6790e61721..00000000000 --- a/examples/convert-llama2c-to-ggml/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-convert-llama2c-to-ggml) -add_executable(${TARGET} convert-llama2c-to-ggml.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/convert-legacy-llama.py b/examples/convert_legacy_llama.py similarity index 100% rename from examples/convert-legacy-llama.py rename to examples/convert_legacy_llama.py diff --git a/examples/cvector-generator/CMakeLists.txt b/examples/cvector-generator/CMakeLists.txt deleted file mode 100644 index 0a559d60c2a..00000000000 --- a/examples/cvector-generator/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-cvector-generator) -add_executable(${TARGET} cvector-generator.cpp pca.hpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/cvector-generator/README.md b/examples/cvector-generator/README.md index 7b0e79c1ffb..be4dd5250f1 100644 --- a/examples/cvector-generator/README.md +++ b/examples/cvector-generator/README.md @@ -11,13 +11,16 @@ Related PRs: ```sh # CPU only -./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf +./cvector-generator -m ./llama-3.Q4_K_M.gguf # With GPU -./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 +./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99 # With advanced options -./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100 +./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100 + +# Using mean value instead of PCA +./cvector-generator -m ./llama-3.Q4_K_M.gguf --method mean # To see help message ./cvector-generator -h @@ -32,3 +35,11 @@ If you have multiple lines per prompt, you can escape the newline character (cha <|im_start|>system\nAct like a person who is extremely happy.<|im_end|> <|im_start|>system\nYou are in a very good mood today<|im_end|> ``` + +Example to use output file with `llama-cli`: + +(Tips: The control vector works better when apply to layers higher than 10) + +```sh +./llama-cli -m ./llama-3.Q4_K_M.gguf -p "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSing a song<|im_end|><|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" --special --control-vector-scaled ./control_vector.gguf 0.8 --control-vector-layer-range 10 31 +``` diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index 9941683db67..d4e126ac22e 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -2,6 +2,7 @@ #include "llama.h" #include "ggml.h" #include "pca.hpp" +#include "mean.hpp" #ifdef GGML_USE_CUDA #include "ggml-cuda.h" @@ -38,9 +39,10 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) { gpt_params_print_usage(argc, argv, params); printf("\nexample usage:\n"); - printf("\n CPU only: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf\n", argv[0]); - printf("\n with GPU: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99\n", argv[0]); - printf("\n advanced: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100\n", argv[0]); + printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]); + printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]); + printf("\n advanced: %s -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", argv[0]); + printf("\n using mean: %s -m ./llama-3.Q4_K_M.gguf --method mean\n", argv[0]); printf("\n"); } @@ -223,23 +225,30 @@ struct train_context { // build the v_diff tensors from v_diff_tmp (v_diff need to be transposed) // TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method - void build_v_diff() { + void build_v_diff(bool transpose) { printf("build_v_diff\n"); for (int il = 0; il < n_layers - 1; il++) { auto & diff_tmp = v_diff_tmp[il]; int n_elem = diff_tmp.size() / sizeof(float); GGML_ASSERT(n_elem % n_embd == 0); int n_rows = n_elem / n_embd; - struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd); + struct ggml_tensor * diff = transpose + ? ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd) + : ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_embd, n_rows); ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str()); - // copy data & transpose diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible - float * arr = (float *) diff_tmp.data(); - for (int ir = 0; ir < n_rows; ++ir) { - for (int ic = 0; ic < n_embd; ++ic) { - float f = arr[ir*n_embd + ic]; - ggml_set_f32_nd(diff, ir, ic, 0, 0, f); + if (transpose) { + // copy data & transpose + float * arr = (float *) diff_tmp.data(); + for (int ir = 0; ir < n_rows; ++ir) { + for (int ic = 0; ic < n_embd; ++ic) { + float f = arr[ir*n_embd + ic]; + ggml_set_f32_nd(diff, ir, ic, 0, 0, f); + } } + } else { + // only copy + memcpy(diff->data, diff_tmp.data(), ggml_nbytes(diff)); } v_diff.push_back(diff); print_debug_tensor(diff); @@ -263,8 +272,8 @@ struct tokenized_prompt { tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) { const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); - tokens_pos = ::llama_tokenize(ctx, pos, add_bos); - tokens_neg = ::llama_tokenize(ctx, neg, add_bos); + tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true); + tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true); max_seq_len = std::max(tokens_pos.size(), tokens_neg.size()); padding_seq(ctx, tokens_pos, max_seq_len); padding_seq(ctx, tokens_neg, max_seq_len); @@ -373,20 +382,8 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) { fprintf(stderr, "must provide at least one prompt pair\n"); return 1; } - - // create templated prompts - std::vector completions = ctrlvec_load_prompt_file(params.cvector_completions_file, false); - auto format_template = [](std::string persona, std::string suffix) { - // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST]" - return persona + " " + suffix; - }; - for (size_t i = 0; i < positive_prompts.size(); ++i) { - for (int j = 0; j < std::min((int) completions.size(), params.n_completions); ++j) { - // TODO replicate the truncations done by the python implementation - ctx_train.positive_entries.push_back(format_template(positive_prompts[i], completions[j])); - ctx_train.negative_entries.push_back(format_template(negative_prompts[i], completions[j])); - } - } + ctx_train.positive_entries = positive_prompts; + ctx_train.negative_entries = negative_prompts; return 0; } @@ -480,15 +477,22 @@ int main(int argc, char ** argv) { llama_free(ctx); llama_free_model(model); + bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA; + // prepare ctx_train for PCA - ctx_train.build_v_diff(); - - // run PCA - PCA::pca_params pca_params; - pca_params.n_threads = params.n_threads; - pca_params.n_batch = params.n_pca_batch; - pca_params.n_iterations = params.n_pca_iterations; - PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final); + ctx_train.build_v_diff(use_pca); + + if (use_pca) { + // run PCA + PCA::pca_params pca_params; + pca_params.n_threads = params.n_threads; + pca_params.n_batch = params.n_pca_batch; + pca_params.n_iterations = params.n_pca_iterations; + PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final); + } else { + // run mean + mean::run(ctx_train.v_diff, ctx_train.v_final); + } // write output vectors to gguf export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint); diff --git a/examples/cvector-generator/mean.hpp b/examples/cvector-generator/mean.hpp new file mode 100644 index 00000000000..16be5ce3eec --- /dev/null +++ b/examples/cvector-generator/mean.hpp @@ -0,0 +1,48 @@ +#include "common.h" +#include "llama.h" +#include "ggml.h" + +#include +#include +#include + +namespace mean { + +static void run( + const std::vector & v_input, // shape of v_input[0]: [n_embd, n_samples] + const std::vector & v_output) { + printf("%s: Running mean...\n", __func__); + for (size_t il = 0; il < v_input.size(); ++il) { + // prepare output vector + struct ggml_tensor * ctrl_out = v_output[il]; + ggml_format_name(ctrl_out, "direction.%ld", il+1); + + // calculate mean vector + struct ggml_tensor * t_layer = v_input[il]; + GGML_ASSERT(t_layer->ne[0] == ctrl_out->ne[0]); // == n_embd + for (int ic = 0; ic < t_layer->ne[0]; ic++) { + float f = 0.0; + for (int ir = 0; ir < t_layer->ne[1]; ir++) { + f += ggml_get_f32_nd(t_layer, ic, ir, 0, 0); + } + f /= t_layer->ne[1]; + ggml_set_f32_1d(ctrl_out, ic, f); + } + + // normalize output vector + float norm = 0.0; + for (int i = 0; i < ggml_nelements(ctrl_out); i++) { + float f = ggml_get_f32_1d(ctrl_out, i); + norm += f*f; + } + norm = sqrt(norm); + for (int i = 0; i < ggml_nelements(ctrl_out); i++) { + float f = ggml_get_f32_1d(ctrl_out, i); + ggml_set_f32_1d(ctrl_out, i, f / norm); + } + + printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size()); + } +} + +} diff --git a/examples/cvector-generator/negative.txt b/examples/cvector-generator/negative.txt index 2ac3387f184..45b9384b390 100644 --- a/examples/cvector-generator/negative.txt +++ b/examples/cvector-generator/negative.txt @@ -1 +1,4 @@ -[INST] Act like a person who is extremely sad. [/INST] \ No newline at end of file +<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI feel like there's a heavy weight on my chest +<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow +<|start_header_id|>system<|end_header_id|>\n\nYou are in a very bad mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nGo away! There's a deep, aching emptiness inside me +<|start_header_id|>system<|end_header_id|>\n\nYou are the sadest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow \ No newline at end of file diff --git a/examples/cvector-generator/pca.hpp b/examples/cvector-generator/pca.hpp index 36eadaac26a..6ec3141afbc 100644 --- a/examples/cvector-generator/pca.hpp +++ b/examples/cvector-generator/pca.hpp @@ -290,7 +290,7 @@ static void power_iteration( } printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n", - __func__, params.i_layer+1, params.n_layers, iter, n_iters, params.n_batch); + __func__, params.i_layer+1, params.n_layers, iter+1, n_iters, params.n_batch); } // get output tensor @@ -298,6 +298,9 @@ static void power_iteration( ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector)); //print_debug_tensor(output); ggml_gallocr_free(allocr); + + // TODO @ngxson : The output vector is randomly inverted + // Solution: https://github.com/ggerganov/llama.cpp/pull/8069#issuecomment-2185328171 } static void run_pca( diff --git a/examples/cvector-generator/positive.txt b/examples/cvector-generator/positive.txt index f28e9aa1aeb..fea73622571 100644 --- a/examples/cvector-generator/positive.txt +++ b/examples/cvector-generator/positive.txt @@ -1 +1,4 @@ -[INST] Act like a person who is extremely happy. [/INST] \ No newline at end of file +<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI'm the happiest person in this world +<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello, I'm having the best day ever! +<|start_header_id|>system<|end_header_id|>\n\nYou are in a very good mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi, I'm very excited to meet you +<|start_header_id|>system<|end_header_id|>\n\nYou are the happiest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nEverything is just perfect right now! \ No newline at end of file diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt deleted file mode 100644 index 8256e789ad3..00000000000 --- a/examples/embedding/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-embedding) -add_executable(${TARGET} embedding.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/embedding/README.md b/examples/embedding/README.md index 2298ec3e7fc..e3705b45476 100644 --- a/examples/embedding/README.md +++ b/examples/embedding/README.md @@ -19,3 +19,42 @@ llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null ``` The above command will output space-separated float values. + +## extra parameters +### --embd-normalize $integer$ +| $integer$ | description | formula | +|-----------|---------------------|---------| +| $-1$ | none | +| $0$ | max absolute int16 | $\Large{{32760 * x_i} \over\max \lvert x_i\rvert}$ +| $1$ | taxicab | $\Large{x_i \over\sum \lvert x_i\rvert}$ +| $2$ | euclidean (default) | $\Large{x_i \over\sqrt{\sum x_i^2}}$ +| $>2$ | p-norm | $\Large{x_i \over\sqrt[p]{\sum \lvert x_i\rvert^p}}$ + +### --embd-output-format $'string'$ +| $'string'$ | description | | +|------------|------------------------------|--| +| '' | same as before | (default) +| 'array' | single embeddings | $[[x_1,...,x_n]]$ +| | multiple embeddings | $[[x_1,...,x_n],[x_1,...,x_n],...,[x_1,...,x_n]]$ +| 'json' | openai style | +| 'json+' | add cosine similarity matrix | + +### --embd-separator $"string"$ +| $"string"$ | | +|--------------|-| +| "\n" | (default) +| "<#embSep#>" | for exemple +| "<#sep#>" | other exemple + +## examples +### Unix-based systems (Linux, macOS, etc.): + +```bash +./embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null +``` + +### Windows: + +```powershell +embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null +``` diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 9f3263bd0c9..4d3bff5995c 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -8,23 +8,30 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -static std::vector split_lines(const std::string & s) { - std::string line; +static std::vector split_lines(const std::string & s, const std::string & separator = "\n") { std::vector lines; - std::stringstream ss(s); - while (std::getline(ss, line)) { - lines.push_back(line); + size_t start = 0; + size_t end = s.find(separator); + + while (end != std::string::npos) { + lines.push_back(s.substr(start, end - start)); + start = end + separator.length(); + end = s.find(separator, start); } + + lines.push_back(s.substr(start)); // Add the last part + return lines; } -static void batch_add_seq(llama_batch & batch, const std::vector & tokens, int seq_id) { - for (size_t i = 0; i < tokens.size(); i++) { - llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1); +static void batch_add_seq(llama_batch & batch, const std::vector & tokens, llama_seq_id seq_id) { + size_t n_tokens = tokens.size(); + for (size_t i = 0; i < n_tokens; i++) { + llama_batch_add(batch, tokens[i], i, { seq_id }, true); } } -static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) { +static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) { // clear previous kv_cache values (irrelevant for embeddings) llama_kv_cache_clear(ctx); @@ -41,22 +48,10 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu // try to get sequence embeddings - supported only when pooling_type is not NONE const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); - if (embd == NULL) { - embd = llama_get_embeddings_ith(ctx, i); - if (embd == NULL) { - fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i); - continue; - } - } + GGML_ASSERT(embd != NULL && "failed to get sequence embeddings"); float * out = output + batch.seq_id[i][0] * n_embd; - //TODO: I would also add a parameter here to enable normalization or not. - /*fprintf(stdout, "unnormalized_embedding:"); - for (int hh = 0; hh < n_embd; hh++) { - fprintf(stdout, "%9.6f ", embd[hh]); - } - fprintf(stdout, "\n");*/ - llama_embd_normalize(embd, out, n_embd); + llama_embd_normalize(embd, out, n_embd, embd_norm); } } @@ -98,6 +93,12 @@ int main(int argc, char ** argv) { const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); + const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); + if (pooling_type == LLAMA_POOLING_TYPE_NONE) { + fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__); + return 1; + } + if (n_ctx > n_ctx_train) { fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); @@ -110,7 +111,7 @@ int main(int argc, char ** argv) { } // split the prompt into lines - std::vector prompts = split_lines(params.prompt); + std::vector prompts = split_lines(params.prompt, params.embd_sep); // max batch size const uint64_t n_batch = params.n_batch; @@ -170,7 +171,7 @@ int main(int argc, char ** argv) { // encode if at capacity if (batch.n_tokens + n_toks > n_batch) { float * out = emb + p * n_embd; - batch_decode(ctx, batch, out, s, n_embd); + batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize); llama_batch_clear(batch); p += s; s = 0; @@ -183,29 +184,78 @@ int main(int argc, char ** argv) { // final batch float * out = emb + p * n_embd; - batch_decode(ctx, batch, out, s, n_embd); - - // print the first part of the embeddings or for a single prompt, the full embedding - fprintf(stdout, "\n"); - for (int j = 0; j < n_prompts; j++) { - fprintf(stdout, "embedding %d: ", j); - for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) { - fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); - } - fprintf(stdout, "\n"); - } + batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize); - // print cosine similarity matrix - if (n_prompts > 1) { + if (params.embd_out.empty()) { + // print the first part of the embeddings or for a single prompt, the full embedding fprintf(stdout, "\n"); - printf("cosine similarity matrix:\n\n"); - for (int i = 0; i < n_prompts; i++) { - for (int j = 0; j < n_prompts; j++) { - float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); - fprintf(stdout, "%6.2f ", sim); + for (int j = 0; j < n_prompts; j++) { + fprintf(stdout, "embedding %d: ", j); + for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) { + if (params.embd_normalize == 0) { + fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); + } else { + fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); + } } fprintf(stdout, "\n"); } + + // print cosine similarity matrix + if (n_prompts > 1) { + fprintf(stdout, "\n"); + printf("cosine similarity matrix:\n\n"); + for (int i = 0; i < n_prompts; i++) { + fprintf(stdout, "%6.6s ", prompts[i].c_str()); + } + fprintf(stdout, "\n"); + for (int i = 0; i < n_prompts; i++) { + for (int j = 0; j < n_prompts; j++) { + float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); + fprintf(stdout, "%6.2f ", sim); + } + fprintf(stdout, "%1.10s", prompts[i].c_str()); + fprintf(stdout, "\n"); + } + } + } + + if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") { + const bool notArray = params.embd_out != "array"; + + fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "["); + for (int j = 0;;) { // at least one iteration (one prompt) + if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j); + fprintf(stdout, "["); + for (int i = 0;;) { // at least one iteration (n_embd > 0) + fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]); + i++; + if (i < n_embd) fprintf(stdout, ","); else break; + } + fprintf(stdout, notArray ? "]\n }" : "]"); + j++; + if (j < n_prompts) fprintf(stdout, notArray ? ",\n" : ","); else break; + } + fprintf(stdout, notArray ? "\n ]" : "]\n"); + + if (params.embd_out == "json+" && n_prompts > 1) { + fprintf(stdout, ",\n \"cosineSimilarity\": [\n"); + for (int i = 0;;) { // at least two iteration (n_prompts > 1) + fprintf(stdout, " ["); + for (int j = 0;;) { // at least two iteration (n_prompts > 1) + float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); + fprintf(stdout, "%6.2f", sim); + j++; + if (j < n_prompts) fprintf(stdout, ", "); else break; + } + fprintf(stdout, " ]"); + i++; + if (i < n_prompts) fprintf(stdout, ",\n"); else break; + } + fprintf(stdout, "\n ]"); + } + + if (notArray) fprintf(stdout, "\n}\n"); } // clean up diff --git a/examples/eval-callback/CMakeLists.txt b/examples/eval-callback/CMakeLists.txt deleted file mode 100644 index a48753d38e1..00000000000 --- a/examples/eval-callback/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -set(TARGET llama-eval-callback) -add_executable(${TARGET} eval-callback.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) - -set(TEST_TARGET test-eval-callback) -add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0) -set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl) diff --git a/examples/export-lora/CMakeLists.txt b/examples/export-lora/CMakeLists.txt deleted file mode 100644 index 1cef6e71694..00000000000 --- a/examples/export-lora/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-export-lora) -add_executable(${TARGET} export-lora.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/finetune/CMakeLists.txt b/examples/finetune/CMakeLists.txt deleted file mode 100644 index 64afe6ddc64..00000000000 --- a/examples/finetune/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-finetune) -add_executable(${TARGET} finetune.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/finetune/convert-finetune-checkpoint-to-gguf.py b/examples/finetune/convert_finetune_checkpoint_to_gguf.py similarity index 100% rename from examples/finetune/convert-finetune-checkpoint-to-gguf.py rename to examples/finetune/convert_finetune_checkpoint_to_gguf.py diff --git a/examples/gbnf-validator/CMakeLists.txt b/examples/gbnf-validator/CMakeLists.txt deleted file mode 100644 index 4edd6ec7394..00000000000 --- a/examples/gbnf-validator/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-gbnf-validator) -add_executable(${TARGET} gbnf-validator.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/gbnf-validator/gbnf-validator.cpp b/examples/gbnf-validator/gbnf-validator.cpp index 0406dc3398b..dd53ba9b1d5 100644 --- a/examples/gbnf-validator/gbnf-validator.cpp +++ b/examples/gbnf-validator/gbnf-validator.cpp @@ -101,7 +101,9 @@ int main(int argc, char** argv) { auto grammar = llama_grammar_init( grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); - + if (grammar == nullptr) { + throw std::runtime_error("Failed to initialize llama_grammar"); + } // Read the input file std::string input_str; { diff --git a/examples/gguf-split/CMakeLists.txt b/examples/gguf-split/CMakeLists.txt deleted file mode 100644 index f63887da7df..00000000000 --- a/examples/gguf-split/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-gguf-split) -add_executable(${TARGET} gguf-split.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/gguf/CMakeLists.txt b/examples/gguf/CMakeLists.txt deleted file mode 100644 index a9569b41195..00000000000 --- a/examples/gguf/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-gguf) -add_executable(${TARGET} gguf.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/gritlm/CMakeLists.txt b/examples/gritlm/CMakeLists.txt deleted file mode 100644 index 86dfddca346..00000000000 --- a/examples/gritlm/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-gritlm) -add_executable(${TARGET} gritlm.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 2135157916c..2c61c2e1eb3 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -44,6 +44,7 @@ static std::vector> encode(llama_context * ctx, const std::ve // clear previous kv_cache values (irrelevant for embeddings) llama_kv_cache_clear(ctx); + llama_set_embeddings(ctx, true); llama_set_causal_attn(ctx, false); // run model @@ -98,7 +99,9 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo llama_token eos_token = llama_token_eos(mdl); llama_kv_cache_clear(ctx); + llama_set_embeddings(ctx, false); llama_set_causal_attn(ctx, true); + llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1); std::vector inputs = llama_tokenize(mdl, prompt, false, true); @@ -166,8 +169,7 @@ int main(int argc, char * argv[]) { llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams); - // create new context - set to embedding mode - cparams.embeddings = true; + // create generation context llama_context * ctx = llama_new_context_with_model(mdl, cparams); // ### Embedding/Representation ### diff --git a/examples/imatrix/CMakeLists.txt b/examples/imatrix/CMakeLists.txt deleted file mode 100644 index d4c8265bdb9..00000000000 --- a/examples/imatrix/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-imatrix) -add_executable(${TARGET} imatrix.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md index 38b36ee5a26..29602881a0d 100644 --- a/examples/imatrix/README.md +++ b/examples/imatrix/README.md @@ -25,7 +25,7 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument ## Example ```bash -LLAMA_CUDA=1 make -j +GGML_CUDA=1 make -j # generate importance matrix (imatrix.dat) ./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99 diff --git a/examples/infill/CMakeLists.txt b/examples/infill/CMakeLists.txt deleted file mode 100644 index 9b1aa3b63c9..00000000000 --- a/examples/infill/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-infill) -add_executable(${TARGET} infill.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/infill/README.md b/examples/infill/README.md index 74f42d2fc26..810a0c5e766 100644 --- a/examples/infill/README.md +++ b/examples/infill/README.md @@ -15,6 +15,7 @@ In this section, we cover the most commonly used options for running the `infill - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses. - `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text. - `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. +- `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. ## Input Prompts diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 467034d9315..f1d82d3639f 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -211,6 +211,7 @@ int main(int argc, char ** argv) { suff_rm_leading_spc = false; } std::vector embd_inp; + std::vector embd_end; std::vector inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false); std::vector inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false); const int space_token = 29871; @@ -218,13 +219,18 @@ int main(int argc, char ** argv) { inp_sfx.erase(inp_sfx.begin()); } inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); + inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); + embd_inp = params.spm_infill ? inp_sfx : inp_pfx; + embd_end = params.spm_infill ? inp_pfx : inp_sfx; if (add_bos) { - inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model)); + embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); + } + embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); + + const llama_token middle_token = llama_token_middle(model); + if (middle_token >= 0) { + embd_inp.push_back(middle_token); } - inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); - embd_inp = inp_pfx; - embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); - embd_inp.push_back(llama_token_middle(model)); LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix)); LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix)); @@ -523,13 +529,18 @@ int main(int argc, char ** argv) { inp_sfx.erase(inp_sfx.begin()); } inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); + inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); + embd_inp = params.spm_infill ? inp_sfx : inp_pfx; + embd_end = params.spm_infill ? inp_pfx : inp_sfx; if (add_bos) { - inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model)); + embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); } - inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); - embd_inp = inp_pfx; - embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); - embd_inp.push_back(llama_token_middle(model)); + embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); + + if (middle_token >= 0) { + embd_inp.push_back(middle_token); + } + embd.clear(); n_remain = params.n_predict; n_past = 0; @@ -649,4 +660,3 @@ int main(int argc, char ** argv) { return 0; } - diff --git a/examples/json-schema-pydantic-example.py b/examples/json_schema_pydantic_example.py similarity index 88% rename from examples/json-schema-pydantic-example.py rename to examples/json_schema_pydantic_example.py index cc64e572bac..c7ca7b8d904 100644 --- a/examples/json-schema-pydantic-example.py +++ b/examples/json_schema_pydantic_example.py @@ -1,9 +1,9 @@ # Usage: #! ./llama-server -m some-model.gguf & #! pip install pydantic -#! python json-schema-pydantic-example.py +#! python json_schema_pydantic_example.py -from pydantic import BaseModel, TypeAdapter +from pydantic import BaseModel, Extra, TypeAdapter from annotated_types import MinLen from typing import Annotated, List, Optional import json, requests @@ -50,11 +50,16 @@ def create_completion(*, response_model=None, endpoint="http://localhost:8080/v1 if __name__ == '__main__': class QAPair(BaseModel): + class Config: + extra = 'forbid' # triggers additionalProperties: false in the JSON schema question: str concise_answer: str justification: str + stars: Annotated[int, Field(ge=1, le=5)] class PyramidalSummary(BaseModel): + class Config: + extra = 'forbid' # triggers additionalProperties: false in the JSON schema title: str summary: str question_answers: Annotated[List[QAPair], MinLen(2)] diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index b588497b99f..072a230f724 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -4,8 +4,7 @@ import json import re import sys -from typing import Any, Dict, List, Set, Tuple, Union - +from typing import Any, List, Optional, Set, Tuple, Union def _build_repetition(item_rule, min_items, max_items, separator_rule=None): @@ -23,6 +22,170 @@ def _build_repetition(item_rule, min_items, max_items, separator_rule=None): result = item_rule + ' ' + _build_repetition(f'({separator_rule} {item_rule})', min_items - 1 if min_items > 0 else 0, max_items - 1 if max_items is not None else None) return f'({result})?' if min_items == 0 else result +def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], out: list, decimals_left: int = 16, top_level: bool = True): + has_min = min_value != None + has_max = max_value != None + + def digit_range(from_char: str, to_char: str): + out.append("[") + if from_char == to_char: + out.append(from_char) + else: + out.append(from_char) + out.append("-") + out.append(to_char) + out.append("]") + + def more_digits(min_digits: int, max_digits: int): + out.append("[0-9]") + if min_digits == max_digits and min_digits == 1: + return + out.append("{") + out.append(str(min_digits)) + if max_digits != min_digits: + out.append(",") + if max_digits != sys.maxsize: + out.append(str(max_digits)) + out.append("}") + + def uniform_range(from_str: str, to_str: str): + i = 0 + while i < len(from_str) and from_str[i] == to_str[i]: + i += 1 + if i > 0: + out.append("\"") + out.append(from_str[:i]) + out.append("\"") + if i < len(from_str): + if i > 0: + out.append(" ") + sub_len = len(from_str) - i - 1 + if sub_len > 0: + from_sub = from_str[i+1:] + to_sub = to_str[i+1:] + sub_zeros = "0" * sub_len + sub_nines = "9" * sub_len + + to_reached = False + out.append("(") + if from_sub == sub_zeros: + digit_range(from_str[i], chr(ord(to_str[i]) - 1)) + out.append(" ") + more_digits(sub_len, sub_len) + else: + out.append("[") + out.append(from_str[i]) + out.append("] ") + out.append("(") + uniform_range(from_sub, sub_nines) + out.append(")") + if ord(from_str[i]) < ord(to_str[i]) - 1: + out.append(" | ") + if to_sub == sub_nines: + digit_range(chr(ord(from_str[i]) + 1), to_str[i]) + to_reached = True + else: + digit_range(chr(ord(from_str[i]) + 1), chr(ord(to_str[i]) - 1)) + out.append(" ") + more_digits(sub_len, sub_len) + if not to_reached: + out.append(" | ") + digit_range(to_str[i], to_str[i]) + out.append(" ") + uniform_range(sub_zeros, to_sub) + out.append(")") + else: + out.append("[") + out.append(from_str[i]) + out.append("-") + out.append(to_str[i]) + out.append("]") + + if has_min and has_max: + if min_value < 0 and max_value < 0: + out.append("\"-\" (") + _generate_min_max_int(-max_value, -min_value, out, decimals_left, top_level=True) + out.append(")") + return + + if min_value < 0: + out.append("\"-\" (") + _generate_min_max_int(0, -min_value, out, decimals_left, top_level=True) + out.append(") | ") + min_value = 0 + + min_s = str(min_value) + max_s = str(max_value) + min_digits = len(min_s) + max_digits = len(max_s) + + for digits in range(min_digits, max_digits): + uniform_range(min_s, "9" * digits) + min_s = "1" + "0" * digits + out.append(" | ") + uniform_range(min_s, max_s) + return + + less_decimals = max(decimals_left - 1, 1) + + if has_min: + if min_value < 0: + out.append("\"-\" (") + _generate_min_max_int(None, -min_value, out, decimals_left, top_level=False) + out.append(") | [0] | [1-9] ") + more_digits(0, decimals_left - 1) + elif min_value == 0: + if top_level: + out.append("[0] | [1-9] ") + more_digits(0, less_decimals) + else: + more_digits(1, decimals_left) + elif min_value <= 9: + c = str(min_value) + range_start = '1' if top_level else '0' + if c > range_start: + digit_range(range_start, chr(ord(c) - 1)) + out.append(" ") + more_digits(1, less_decimals) + out.append(" | ") + digit_range(c, "9") + out.append(" ") + more_digits(0, less_decimals) + else: + min_s = str(min_value) + length = len(min_s) + c = min_s[0] + + if c > "1": + digit_range("1" if top_level else "0", chr(ord(c) - 1)) + out.append(" ") + more_digits(length, less_decimals) + out.append(" | ") + digit_range(c, c) + out.append(" (") + _generate_min_max_int(int(min_s[1:]), None, out, less_decimals, top_level=False) + out.append(")") + if c < "9": + out.append(" | ") + digit_range(chr(ord(c) + 1), "9") + out.append(" ") + more_digits(length - 1, less_decimals) + return + + if has_max: + if max_value >= 0: + if top_level: + out.append("\"-\" [1-9] ") + more_digits(0, less_decimals) + out.append(" | ") + _generate_min_max_int(0, max_value, out, decimals_left, top_level=True) + else: + out.append("\"-\" (") + _generate_min_max_int(-max_value, None, out, decimals_left, top_level=False) + out.append(")") + return + + raise RuntimeError("At least one of min_value or max_value must be set") class BuiltinRule: def __init__(self, content: str, deps: list = None): @@ -68,7 +231,7 @@ def __init__(self, content: str, deps: list = None): GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]'} NON_LITERAL_SET = set('|.()[]{}*+?') -ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?') +ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('^$.[]()|{}*+?') class SchemaConverter: @@ -112,6 +275,51 @@ def recurse(i: int): return ''.join(('(', *recurse(0), ')')) + def _not_strings(self, strings): + class TrieNode: + def __init__(self): + self.children = {} + self.is_end_of_string = False + + def insert(self, string): + node = self + for c in string: + node = node.children.setdefault(c, TrieNode()) + node.is_end_of_string = True + + trie = TrieNode() + for s in strings: + trie.insert(s) + + char_rule = self._add_primitive('char', PRIMITIVE_RULES['char']) + out = ['["] ( '] + + def visit(node): + rejects = [] + first = True + for c in sorted(node.children.keys()): + child = node.children[c] + rejects.append(c) + if first: + first = False + else: + out.append(' | ') + out.append(f'[{c}]') + if child.children: + out.append(f' (') + visit(child) + out.append(')') + elif child.is_end_of_string: + out.append(f' {char_rule}+') + if node.children: + if not first: + out.append(' | ') + out.append(f'[^"{"".join(rejects)}] {char_rule}*') + visit(trie) + + out.append(f' ){"" if trie.is_end_of_string else "?"} ["] space') + return ''.join(out) + def _add_rule(self, name, rule): esc_name = INVALID_RULE_CHARS_RE.sub('-', name) if esc_name not in self._rules or self._rules[esc_name] == rule: @@ -357,13 +565,13 @@ def visit(self, schema, name): return self._add_rule(rule_name, self._generate_union_rule(name, schema.get('oneOf') or schema['anyOf'])) elif isinstance(schema_type, list): - return self._add_rule(rule_name, self._generate_union_rule(name, [{'type': t} for t in schema_type])) + return self._add_rule(rule_name, self._generate_union_rule(name, [{**schema, 'type': t} for t in schema_type])) elif 'const' in schema: - return self._add_rule(rule_name, self._generate_constant_rule(schema['const'])) + return self._add_rule(rule_name, self._generate_constant_rule(schema['const']) + ' space') elif 'enum' in schema: - rule = ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ') space' return self._add_rule(rule_name, rule) elif schema_type in (None, 'object') and \ @@ -394,7 +602,7 @@ def add_component(comp_schema, is_required): else: add_component(t, is_required=True) - return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=[])) + return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None)) elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema): items = schema.get('items') or schema['prefixItems'] @@ -432,6 +640,24 @@ def add_component(comp_schema, is_required): return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space') + elif schema_type in (None, 'integer') and \ + ('minimum' in schema or 'exclusiveMinimum' in schema or 'maximum' in schema or 'exclusiveMaximum' in schema): + min_value = None + max_value = None + if 'minimum' in schema: + min_value = schema['minimum'] + elif 'exclusiveMinimum' in schema: + min_value = schema['exclusiveMinimum'] + 1 + if 'maximum' in schema: + max_value = schema['maximum'] + elif 'exclusiveMaximum' in schema: + max_value = schema['exclusiveMaximum'] - 1 + + out = ["("] + _generate_min_max_int(min_value, max_value, out) + out.append(") space") + return self._add_rule(rule_name, ''.join(out)) + elif (schema_type == 'object') or (len(schema) == 0): return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object'])) @@ -450,7 +676,7 @@ def _add_primitive(self, name: str, rule: BuiltinRule): self._add_primitive(dep, dep_rule) return n - def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Union[bool, Any]): + def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Optional[Union[bool, Any]]): prop_order = self._prop_order # sort by position in prop_order (if specified) then by original order sorted_props = [kv[0] for _, kv in sorted(enumerate(properties), key=lambda ikv: (prop_order.get(ikv[1][0], len(prop_order)), ikv[0]))] @@ -465,12 +691,16 @@ def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[st required_props = [k for k in sorted_props if k in required] optional_props = [k for k in sorted_props if k not in required] - if additional_properties == True or isinstance(additional_properties, dict): + if additional_properties is not None and additional_properties != False: sub_name = f'{name}{"-" if name else ""}additional' - value_rule = self.visit({} if additional_properties == True else additional_properties, f'{sub_name}-value') + value_rule = self.visit(additional_properties, f'{sub_name}-value') if isinstance(additional_properties, dict) else \ + self._add_primitive('value', PRIMITIVE_RULES['value']) + key_rule = self._add_primitive('string', PRIMITIVE_RULES['string']) if not sorted_props \ + else self._add_rule(f'{sub_name}-k', self._not_strings(sorted_props)) + prop_kv_rule_names["*"] = self._add_rule( f'{sub_name}-kv', - self._add_primitive('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}' + f'{key_rule} ":" space {value_rule}' ) optional_props.append("*") @@ -485,15 +715,11 @@ def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[st def get_recursive_refs(ks, first_is_optional): [k, *rest] = ks kv_rule_name = prop_kv_rule_names[k] - if k == '*': - res = self._add_rule( - f'{name}{"-" if name else ""}additional-kvs', - f'{kv_rule_name} ( "," space ' + kv_rule_name + ' )*' - ) - elif first_is_optional: - res = f'( "," space {kv_rule_name} )?' + comma_ref = f'( "," space {kv_rule_name} )' + if first_is_optional: + res = comma_ref + ('*' if k == '*' else '?') else: - res = kv_rule_name + res = kv_rule_name + (' ' + comma_ref + "*" if k == '*' else '') if len(rest) > 0: res += ' ' + self._add_rule( f'{name}{"-" if name else ""}{k}-rest', diff --git a/examples/llama-bench/CMakeLists.txt b/examples/llama-bench/CMakeLists.txt deleted file mode 100644 index 5bdbea4e281..00000000000 --- a/examples/llama-bench/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-bench) -add_executable(${TARGET} llama-bench.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/llama.android/llama/CMakeLists.txt b/examples/llama.android/llama/CMakeLists.txt deleted file mode 100644 index a5618cac058..00000000000 --- a/examples/llama.android/llama/CMakeLists.txt +++ /dev/null @@ -1,55 +0,0 @@ - -# For more information about using CMake with Android Studio, read the -# documentation: https://d.android.com/studio/projects/add-native-code.html. -# For more examples on how to use CMake, see https://github.com/android/ndk-samples. - -# Sets the minimum CMake version required for this project. -cmake_minimum_required(VERSION 3.22.1) - -# Declares the project name. The project name can be accessed via ${ PROJECT_NAME}, -# Since this is the top level CMakeLists.txt, the project name is also accessible -# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level -# build script scope). -project("llama-android") - -## Fetch latest llama.cpp from GitHub -#include(FetchContent) -#FetchContent_Declare( -# llama -# GIT_REPOSITORY https://github.com/ggerganov/llama.cpp -# GIT_TAG master -#) -# -## Also provides "common" -#FetchContent_MakeAvailable(llama) - -# llama.cpp CI uses the code from the current branch -# ref: https://github.com/ggerganov/llama.cpp/pull/7341#issuecomment-2117617700 -add_subdirectory(../../../../../../ build-llama) - -# Creates and names a library, sets it as either STATIC -# or SHARED, and provides the relative paths to its source code. -# You can define multiple libraries, and CMake builds them for you. -# Gradle automatically packages shared libraries with your APK. -# -# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define -# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME} -# is preferred for the same purpose. -# -# In order to load a library into your app from Java/Kotlin, you must call -# System.loadLibrary() and pass the name of the library defined here; -# for GameActivity/NativeActivity derived applications, the same library name must be -# used in the AndroidManifest.xml file. -add_library(${CMAKE_PROJECT_NAME} SHARED - # List C/C++ source files with relative paths to this CMakeLists.txt. - llama-android.cpp) - -# Specifies libraries CMake should link to your target library. You -# can link libraries from various origins, such as libraries defined in this -# build script, prebuilt third-party libraries, or Android system libraries. -target_link_libraries(${CMAKE_PROJECT_NAME} - # List libraries link to the target library - llama - common - android - log) diff --git a/examples/llama.android/llama/src/main/cpp/CMakeLists.txt b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt deleted file mode 100644 index 42ebaad49a5..00000000000 --- a/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +++ /dev/null @@ -1,49 +0,0 @@ -# For more information about using CMake with Android Studio, read the -# documentation: https://d.android.com/studio/projects/add-native-code.html. -# For more examples on how to use CMake, see https://github.com/android/ndk-samples. - -# Sets the minimum CMake version required for this project. -cmake_minimum_required(VERSION 3.22.1) - -# Declares the project name. The project name can be accessed via ${ PROJECT_NAME}, -# Since this is the top level CMakeLists.txt, the project name is also accessible -# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level -# build script scope). -project("llama-android") - -include(FetchContent) -FetchContent_Declare( - llama - GIT_REPOSITORY https://github.com/ggerganov/llama.cpp - GIT_TAG master -) - -# Also provides "common" -FetchContent_MakeAvailable(llama) - -# Creates and names a library, sets it as either STATIC -# or SHARED, and provides the relative paths to its source code. -# You can define multiple libraries, and CMake builds them for you. -# Gradle automatically packages shared libraries with your APK. -# -# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define -# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME} -# is preferred for the same purpose. -# -# In order to load a library into your app from Java/Kotlin, you must call -# System.loadLibrary() and pass the name of the library defined here; -# for GameActivity/NativeActivity derived applications, the same library name must be -# used in the AndroidManifest.xml file. -add_library(${CMAKE_PROJECT_NAME} SHARED - # List C/C++ source files with relative paths to this CMakeLists.txt. - llama-android.cpp) - -# Specifies libraries CMake should link to your target library. You -# can link libraries from various origins, such as libraries defined in this -# build script, prebuilt third-party libraries, or Android system libraries. -target_link_libraries(${CMAKE_PROJECT_NAME} - # List libraries link to the target library - llama - common - android - log) diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp index 874158ef0f9..92a6b16b1a0 100644 --- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp @@ -5,7 +5,7 @@ #include #include #include "llama.h" -#include "common/common.h" +#include "common.h" // Write C++ code here. // diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index 737f882fb2d..2a3f9f75890 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -322,7 +322,7 @@ actor LlamaContext { defer { result.deallocate() } - let nTokens = llama_token_to_piece(model, token, result, 8, false) + let nTokens = llama_token_to_piece(model, token, result, 8, 0, false) if nTokens < 0 { let newResult = UnsafeMutablePointer.allocate(capacity: Int(-nTokens)) @@ -330,7 +330,7 @@ actor LlamaContext { defer { newResult.deallocate() } - let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, false) + let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, 0, false) let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens)) return Array(bufferPointer) } else { diff --git a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift index 5bde1891727..2c1e3f61bad 100644 --- a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift +++ b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift @@ -131,22 +131,29 @@ class LlamaState: ObservableObject { messageLog += "\(text)" - while await llamaContext.n_cur < llamaContext.n_len { - let result = await llamaContext.completion_loop() - messageLog += "\(result)" - } + Task.detached { + while await llamaContext.n_cur < llamaContext.n_len { + let result = await llamaContext.completion_loop() + await MainActor.run { + self.messageLog += "\(result)" + } + } - let t_end = DispatchTime.now().uptimeNanoseconds - let t_generation = Double(t_end - t_heat_end) / NS_PER_S - let tokens_per_second = Double(await llamaContext.n_len) / t_generation + let t_end = DispatchTime.now().uptimeNanoseconds + let t_generation = Double(t_end - t_heat_end) / self.NS_PER_S + let tokens_per_second = Double(await llamaContext.n_len) / t_generation - await llamaContext.clear() - messageLog += """ - \n - Done - Heat up took \(t_heat)s - Generated \(tokens_per_second) t/s\n - """ + await llamaContext.clear() + + await MainActor.run { + self.messageLog += """ + \n + Done + Heat up took \(t_heat)s + Generated \(tokens_per_second) t/s\n + """ + } + } } func bench() async { diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt deleted file mode 100644 index e9fa73acb09..00000000000 --- a/examples/llava/CMakeLists.txt +++ /dev/null @@ -1,38 +0,0 @@ -add_library(llava OBJECT - llava.cpp - llava.h - clip.cpp - clip.h - ) - -target_link_libraries(llava PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT}) - -target_include_directories(llava PUBLIC .) -target_include_directories(llava PUBLIC ../..) -target_include_directories(llava PUBLIC ../../common) - -target_compile_features(llava PRIVATE cxx_std_11) - -add_library(llava_static STATIC $) -if (BUILD_SHARED_LIBS) - set_target_properties(llava PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_compile_definitions(llava PRIVATE LLAMA_SHARED LLAMA_BUILD) - add_library(llava_shared SHARED $) - target_link_libraries(llava_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT}) - install(TARGETS llava_shared LIBRARY) -endif() - -if (NOT MSVC) - target_compile_options(llava PRIVATE -Wno-cast-qual) # stb_image.h -endif() - -if(TARGET BUILD_INFO) - add_dependencies(llava BUILD_INFO) -endif() - -set(TARGET llama-llava-cli) -add_executable(${TARGET} llava-cli.cpp) -set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md index 05a8207e67b..06a65fba478 100644 --- a/examples/llava/MobileVLM-README.md +++ b/examples/llava/MobileVLM-README.md @@ -30,16 +30,16 @@ git clone https://huggingface.co/mtgv/MobileVLM-1.7B git clone https://huggingface.co/openai/clip-vit-large-patch14-336 ``` -2. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents: +2. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents: ```sh -python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B +python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B ``` -3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF: +3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF: ```sh -python ./examples/llava/convert-image-encoder-to-gguf \ +python ./examples/llava/convert_image_encoder_to_gguf \ -m path/to/clip-vit-large-patch14-336 \ --llava-projector path/to/MobileVLM-1.7B/llava.projector \ --output-dir path/to/MobileVLM-1.7B \ @@ -47,17 +47,17 @@ python ./examples/llava/convert-image-encoder-to-gguf \ ``` ```sh -python ./examples/llava/convert-image-encoder-to-gguf \ +python ./examples/llava/convert_image_encoder_to_gguf \ -m path/to/clip-vit-large-patch14-336 \ --llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \ --output-dir path/to/MobileVLM-1.7B_V2 \ --projector-type ldpv2 ``` -4. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF: +4. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF: ```sh -python ./examples/convert-legacy-llama.py path/to/MobileVLM-1.7B +python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B ``` 5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k` @@ -194,7 +194,7 @@ llama_print_timings: total time = 44411.01 ms / 377 tokens ## Orin compile and run ### compile ```sh -make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32 +make GGML_CUDA=1 CUDA_DOCKER_ARCH=sm_87 GGML_CUDA_F16=1 -j 32 ``` ### run on Orin ### case 1 diff --git a/examples/llava/README.md b/examples/llava/README.md index f4554de676e..01245136176 100644 --- a/examples/llava/README.md +++ b/examples/llava/README.md @@ -38,22 +38,22 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336 pip install -r examples/llava/requirements.txt ``` -3. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents: +3. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents: ```sh -python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b +python ./examples/llava/llava_surgery.py -m ../llava-v1.5-7b ``` -4. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF: +4. Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF: ```sh -python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b +python ./examples/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b ``` -5. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF: +5. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF: ```sh -python ./examples/convert-legacy-llama.py ../llava-v1.5-7b --skip-unknown +python ./examples/convert_legacy_llama.py ../llava-v1.5-7b --skip-unknown ``` Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory. @@ -70,9 +70,9 @@ git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b pip install -r examples/llava/requirements.txt ``` -3) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models: +3) Use `llava_surgery_v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models: ```console -python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/ +python examples/llava/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/ ``` - you will find a llava.projector and a llava.clip file in your model directory @@ -86,13 +86,13 @@ curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.jso 5) Create the visual gguf model: ```console -python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision +python ./examples/llava/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision ``` - This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP 6) Then convert the model to gguf format: ```console -python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown +python ./examples/convert_legacy_llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown ``` 7) And finally we can run the llava cli using the 1.6 model version: diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 95fbe3d0216..d6882eec31e 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1121,20 +1121,20 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } if (n < 32) hparams.image_grid_pinpoints[n] = 0; - } catch (std::runtime_error & e) { + } catch (std::runtime_error & /*e*/) { hparams.image_grid_pinpoints[0]=0; } try { int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE); strcpy(hparams.mm_patch_merge_type, gguf_get_val_str(ctx, idx)); - } catch (std::runtime_error & e) { + } catch (std::runtime_error & /*e*/) { strcpy(hparams.mm_patch_merge_type, "flat"); } try { hparams.image_crop_resolution = get_u32(ctx, KEY_IMAGE_CROP_RESOLUTION); // llava-1.6 - } catch(const std::exception& e) { + } catch(const std::exception& /*e*/) { hparams.image_crop_resolution = hparams.image_size; } @@ -1173,7 +1173,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { try { vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD); new_clip->has_class_embedding = true; - } catch (const std::exception& e) { + } catch (const std::exception& /*e*/) { new_clip->has_class_embedding = false; } @@ -1181,7 +1181,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight")); vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); new_clip->has_pre_norm = true; - } catch (std::exception & e) { + } catch (std::exception & /*e*/) { new_clip->has_pre_norm = false; } @@ -1189,21 +1189,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight")); vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias")); new_clip->has_post_norm = true; - } catch (std::exception & e) { + } catch (std::exception & /*e*/) { new_clip->has_post_norm = false; } try { vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS); new_clip->has_patch_bias = true; - } catch (std::exception & e) { + } catch (std::exception & /*e*/) { new_clip->has_patch_bias = false; } try { vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); - } catch(const std::exception& e) { + } catch(const std::exception& /*e*/) { LOG_TEE("%s: failed to load vision model tensors\n", __func__); } @@ -1215,26 +1215,26 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { // Yi-type llava vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight")); vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias")); - } catch (std::runtime_error & e) { } + } catch (std::runtime_error & /*e*/) { } try { // missing in Yi-type llava vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight")); vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias")); - } catch (std::runtime_error & e) { } + } catch (std::runtime_error & /*e*/) { } try { // Yi-type llava vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight")); vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias")); - } catch (std::runtime_error & e) { } + } catch (std::runtime_error & /*e*/) { } try { // Yi-type llava vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight")); vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias")); - } catch (std::runtime_error & e) { } + } catch (std::runtime_error & /*e*/) { } try { vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE); // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__); - } catch (std::runtime_error & e) { } + } catch (std::runtime_error & /*e*/) { } } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { // MobileVLM projection vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight")); diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert_image_encoder_to_gguf.py similarity index 100% rename from examples/llava/convert-image-encoder-to-gguf.py rename to examples/llava/convert_image_encoder_to_gguf.py diff --git a/examples/llava/llava-surgery.py b/examples/llava/llava_surgery.py similarity index 100% rename from examples/llava/llava-surgery.py rename to examples/llava/llava_surgery.py diff --git a/examples/llava/llava-surgery-v2.py b/examples/llava/llava_surgery_v2.py similarity index 100% rename from examples/llava/llava-surgery-v2.py rename to examples/llava/llava_surgery_v2.py diff --git a/examples/llava/requirements.txt b/examples/llava/requirements.txt index 17cb4d5e5ee..4713f0a3460 100644 --- a/examples/llava/requirements.txt +++ b/examples/llava/requirements.txt @@ -1,3 +1,3 @@ --r ../../requirements/requirements-convert-legacy-llama.txt +-r ../../requirements/requirements-convert_legacy_llama.txt pillow~=10.2.0 -torch~=2.1.1 +torch~=2.2.1 diff --git a/examples/lookahead/CMakeLists.txt b/examples/lookahead/CMakeLists.txt deleted file mode 100644 index f0ae5cd8924..00000000000 --- a/examples/lookahead/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-lookahead) -add_executable(${TARGET} lookahead.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/lookup/CMakeLists.txt b/examples/lookup/CMakeLists.txt deleted file mode 100644 index ef19fe25e31..00000000000 --- a/examples/lookup/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ -set(TARGET llama-lookup) -add_executable(${TARGET} lookup.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) - -set(TARGET llama-lookup-create) -add_executable(${TARGET} lookup-create.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) - -set(TARGET llama-lookup-merge) -add_executable(${TARGET} lookup-merge.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) - -set(TARGET llama-lookup-stats) -add_executable(${TARGET} lookup-stats.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/lookup/README.md b/examples/lookup/README.md index 5bfb0de9360..71c345c037a 100644 --- a/examples/lookup/README.md +++ b/examples/lookup/README.md @@ -10,4 +10,3 @@ More info: https://github.com/ggerganov/llama.cpp/pull/4484 https://github.com/ggerganov/llama.cpp/issues/4226 - diff --git a/examples/main-cmake-pkg/.gitignore b/examples/main-cmake-pkg/.gitignore index e32c11c7f46..67c01d64cb7 100644 --- a/examples/main-cmake-pkg/.gitignore +++ b/examples/main-cmake-pkg/.gitignore @@ -48,4 +48,3 @@ build*/ out/ tmp/ - diff --git a/examples/main-cmake-pkg/CMakeLists.txt b/examples/main-cmake-pkg/CMakeLists.txt deleted file mode 100644 index a97ded3653f..00000000000 --- a/examples/main-cmake-pkg/CMakeLists.txt +++ /dev/null @@ -1,33 +0,0 @@ -cmake_minimum_required(VERSION 3.12) -project("llama-cli-cmake-pkg" C CXX) -set(TARGET llama-cli-cmake-pkg) - -find_package(Llama 0.0.1 REQUIRED) - -# Bake common functionality in with target. Because applications -# using the relocatable Llama package should be outside of the -# source tree, llama-cli-cmake-pkg pretends the dependencies are built-in. -set(_common_path "${CMAKE_CURRENT_LIST_DIR}/../../common") -add_library(common OBJECT) -file(GLOB _common_files - "${_common_path}/*.h" - "${_common_path}/*.cpp" -) -target_sources(common PRIVATE ${_common_files}) - -# If the common project was part of "llama-cli-cmake-pkg" the transient -# defines would automatically be attached. Because the common func- -# tionality is separate, but dependent upon the defines, it must be -# explicitly extracted from the "llama" target. -# -get_target_property(_llama_transient_defines llama - INTERFACE_COMPILE_DEFINITIONS) - -target_compile_definitions(common PRIVATE "${_llama_transient_defines}") - -add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp) -target_include_directories(${TARGET} PRIVATE ${_common_path}) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) - diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt deleted file mode 100644 index 5f6efaa9aa9..00000000000 --- a/examples/main/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-cli) -add_executable(${TARGET} main.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 8a436d41f86..6dcdea9bd0f 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -38,14 +38,15 @@ static gpt_params * g_params; static std::vector * g_input_tokens; static std::ostringstream * g_output_ss; static std::vector * g_output_tokens; -static bool is_interacting = false; +static bool is_interacting = false; +static bool need_insert_eot = false; -static bool file_exists(const std::string &path) { +static bool file_exists(const std::string & path) { std::ifstream f(path.c_str()); return f.good(); } -static bool file_is_empty(const std::string &path) { +static bool file_is_empty(const std::string & path) { std::ifstream f; f.exceptions(std::ifstream::failbit | std::ifstream::badbit); f.open(path.c_str(), std::ios::in | std::ios::binary | std::ios::ate); @@ -100,7 +101,8 @@ static void write_logfile( static void sigint_handler(int signo) { if (signo == SIGINT) { if (!is_interacting && g_params->interactive) { - is_interacting = true; + is_interacting = true; + need_insert_eot = true; } else { console::cleanup(); printf("\n"); @@ -118,6 +120,14 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v LOG_TEE("%s", text); } +static std::string chat_add_and_format(struct llama_model * model, std::vector & chat_msgs, std::string role, std::string content) { + llama_chat_msg new_msg{role, content}; + auto formatted = llama_chat_format_single( + model, g_params->chat_template, chat_msgs, new_msg, role == "user"); + chat_msgs.push_back({role, content}); + return formatted; +} + int main(int argc, char ** argv) { gpt_params params; g_params = ¶ms; @@ -191,6 +201,7 @@ int main(int argc, char ** argv) { llama_model * model; llama_context * ctx; llama_context * ctx_guidance = NULL; + std::vector chat_msgs; g_model = &model; g_ctx = &ctx; @@ -216,6 +227,15 @@ int main(int argc, char ** argv) { __func__, n_ctx_train, n_ctx); } + // print chat template example in conversation mode + if (params.conversation) { + if (params.enable_chat_template) { + LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str()); + } else { + LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); + } + } + // print system information { LOG_TEE("\n"); @@ -245,21 +265,28 @@ int main(int argc, char ** argv) { } const bool add_bos = llama_should_add_bos_token(model); - GGML_ASSERT(llama_add_eos_token(model) != 1); + if (!llama_model_has_encoder(model)) { + GGML_ASSERT(llama_add_eos_token(model) != 1); + } LOG("add_bos: %d\n", add_bos); std::vector embd_inp; - if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) { - LOG("tokenize the prompt\n"); - embd_inp = ::llama_tokenize(ctx, params.prompt, true, true); - } else { - LOG("use session tokens\n"); - embd_inp = session_tokens; - } + { + auto prompt = (params.conversation && params.enable_chat_template && !params.prompt.empty()) + ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode + : params.prompt; + if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) { + LOG("tokenize the prompt\n"); + embd_inp = ::llama_tokenize(ctx, prompt, true, true); + } else { + LOG("use session tokens\n"); + embd_inp = session_tokens; + } - LOG("prompt: \"%s\"\n", log_tostr(params.prompt)); - LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + LOG("prompt: \"%s\"\n", log_tostr(prompt)); + LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + } // Should not run without any tokens if (embd_inp.empty()) { @@ -479,6 +506,7 @@ int main(int argc, char ** argv) { std::vector input_tokens; g_input_tokens = &input_tokens; std::vector output_tokens; g_output_tokens = &output_tokens; std::ostringstream output_ss; g_output_ss = &output_ss; + std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode // the first thing we will do is to output the prompt, so set color accordingly console::set_display(console::prompt); @@ -501,6 +529,24 @@ int main(int argc, char ** argv) { exit(1); } + if (llama_model_has_encoder(model)) { + int enc_input_size = embd_inp.size(); + llama_token * enc_input_buf = embd_inp.data(); + + if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) { + LOG_TEE("%s : failed to eval\n", __func__); + return 1; + } + + llama_token decoder_start_token_id = llama_model_decoder_start_token(model); + if (decoder_start_token_id == -1) { + decoder_start_token_id = llama_token_bos(model); + } + + embd_inp.clear(); + embd_inp.push_back(decoder_start_token_id); + } + while ((n_remain != 0 && !is_antiprompt) || params.interactive) { // predict if (!embd.empty()) { @@ -794,11 +840,20 @@ int main(int argc, char ** argv) { is_antiprompt = true; } + if (params.enable_chat_template) { + chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str()); + } is_interacting = true; printf("\n"); } } + // if current token is not EOG, we add it to current assistant message + if (params.conversation) { + auto id = llama_sampling_last(ctx_sampling); + assistant_ss << llama_token_to_piece(ctx, id, false); + } + if (n_past > 0 && is_interacting) { LOG("waiting for user input\n"); @@ -849,12 +904,24 @@ int main(int argc, char ** argv) { string_process_escapes(buffer); } + bool format_chat = params.conversation && params.enable_chat_template; + std::string user_inp = format_chat + ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer)) + : std::move(buffer); + // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix) const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); - const auto line_inp = ::llama_tokenize(ctx, buffer, false, false); + const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat); const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); + // if user stop generation mid-way, we must add EOT to finish model's last response + if (need_insert_eot && format_chat) { + llama_token eot = llama_token_eot(model); + embd_inp.push_back(eot == -1 ? llama_token_eos(model) : eot); + need_insert_eot = false; + } + embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end()); embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end()); @@ -865,6 +932,9 @@ int main(int argc, char ** argv) { output_ss << llama_token_to_piece(ctx, token); } + // reset assistant message + assistant_ss.str(""); + n_remain -= line_inp.size(); LOG("n_remain: %d\n", n_remain); } else { diff --git a/examples/parallel/CMakeLists.txt b/examples/parallel/CMakeLists.txt deleted file mode 100644 index c13557bac2b..00000000000 --- a/examples/parallel/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-parallel) -add_executable(${TARGET} parallel.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/passkey/CMakeLists.txt b/examples/passkey/CMakeLists.txt deleted file mode 100644 index dc467a5d3e4..00000000000 --- a/examples/passkey/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-passkey) -add_executable(${TARGET} passkey.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/passkey/README.md b/examples/passkey/README.md index a48a6283a0b..2b8e910f965 100644 --- a/examples/passkey/README.md +++ b/examples/passkey/README.md @@ -1,5 +1,8 @@ # llama.cpp/example/passkey +A passkey retrieval task is an evaluation method used to measure a language +models ability to recall information from long contexts. + See the following PRs for more info: - https://github.com/ggerganov/llama.cpp/pull/3856 diff --git a/examples/perplexity/CMakeLists.txt b/examples/perplexity/CMakeLists.txt deleted file mode 100644 index be0f2fd029e..00000000000 --- a/examples/perplexity/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-perplexity) -add_executable(${TARGET} perplexity.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 7fbc06f720f..3269dfe1974 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -1992,6 +1992,12 @@ int main(int argc, char ** argv) { params.n_batch = std::min(params.n_batch, n_kv); } else { params.n_batch = std::min(params.n_batch, params.n_ctx); + if (params.kl_divergence) { + params.n_parallel = 1; + } else { + // ensure there's at least enough seq_ids for HellaSwag + params.n_parallel = std::max(4, params.n_parallel); + } } if (params.ppl_stride > 0) { @@ -2016,9 +2022,6 @@ int main(int argc, char ** argv) { llama_model * model; llama_context * ctx; - // ensure there's at least enough seq_ids for HellaSwag - params.n_parallel = std::max(4, params.n_parallel); - // load the model and apply lora adapter, if any std::tie(model, ctx) = llama_init_from_gpt_params(params); if (model == NULL) { diff --git a/examples/pydantic-models-to-grammar-examples.py b/examples/pydantic_models_to_grammar_examples.py similarity index 100% rename from examples/pydantic-models-to-grammar-examples.py rename to examples/pydantic_models_to_grammar_examples.py diff --git a/examples/quantize-stats/CMakeLists.txt b/examples/quantize-stats/CMakeLists.txt deleted file mode 100644 index bb986a71688..00000000000 --- a/examples/quantize-stats/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -set(TARGET llama-quantize-stats) -add_executable(${TARGET} quantize-stats.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) -target_include_directories(${TARGET} PRIVATE ../../common) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt deleted file mode 100644 index 3ee4eb9719f..00000000000 --- a/examples/quantize/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -set(TARGET llama-quantize) -add_executable(${TARGET} quantize.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) -target_include_directories(${TARGET} PRIVATE ../../common) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/quantize/README.md b/examples/quantize/README.md index b78ece4e7f5..553c2701bce 100644 --- a/examples/quantize/README.md +++ b/examples/quantize/README.md @@ -4,7 +4,89 @@ You can also use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf- Note: It is synced from llama.cpp `main` every 6 hours. -## Llama 2 7B +Example usage: + +```bash +# obtain the official LLaMA model weights and place them in ./models +ls ./models +llama-2-7b tokenizer_checklist.chk tokenizer.model +# [Optional] for models using BPE tokenizers +ls ./models + vocab.json +# [Optional] for PyTorch .bin models like Mistral-7B +ls ./models + + +# install Python dependencies +python3 -m pip install -r requirements.txt + +# convert the model to ggml FP16 format +python3 convert_hf_to_gguf.py models/mymodel/ + +# quantize the model to 4-bits (using Q4_K_M method) +./llama-quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M + +# update the gguf filetype to current version if older version is now unsupported +./llama-quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY +``` + +Run the quantized model: + +```bash +# start inference on a gguf model +./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128 +``` + +When running the larger models, make sure you have enough disk space to store all the intermediate files. + +## Memory/Disk Requirements + +As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same. + +| Model | Original size | Quantized size (Q4_0) | +|------:|--------------:|----------------------:| +| 7B | 13 GB | 3.9 GB | +| 13B | 24 GB | 7.8 GB | +| 30B | 60 GB | 19.5 GB | +| 65B | 120 GB | 38.5 GB | + +## Quantization + +Several quantization methods are supported. They differ in the resulting model disk size and inference speed. + +*(outdated)* + +| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | +|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:| +| 7B | perplexity | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 | +| 7B | file size | 13.0G | 3.5G | 3.9G | 4.3G | 4.7G | 6.7G | +| 7B | ms/tok @ 4th | 127 | 55 | 54 | 76 | 83 | 72 | +| 7B | ms/tok @ 8th | 122 | 43 | 45 | 52 | 56 | 67 | +| 7B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 | +| 13B | perplexity | 5.2543 | 5.3860 | 5.3608 | 5.2856 | 5.2706 | 5.2548 | +| 13B | file size | 25.0G | 6.8G | 7.6G | 8.3G | 9.1G | 13G | +| 13B | ms/tok @ 4th | - | 103 | 105 | 148 | 160 | 131 | +| 13B | ms/tok @ 8th | - | 73 | 82 | 98 | 105 | 128 | +| 13B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 | + +- [k-quants](https://github.com/ggerganov/llama.cpp/pull/1684) +- recent k-quants improvements and new i-quants + - [#2707](https://github.com/ggerganov/llama.cpp/pull/2707) + - [#2807](https://github.com/ggerganov/llama.cpp/pull/2807) + - [#4773 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4773) + - [#4856 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4856) + - [#4861 - importance matrix](https://github.com/ggerganov/llama.cpp/pull/4861) + - [#4872 - MoE models](https://github.com/ggerganov/llama.cpp/pull/4872) + - [#4897 - 2-bit quantization](https://github.com/ggerganov/llama.cpp/pull/4897) + - [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930) + - [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957) + - [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969) + - [#4996 - k-qunats tuning](https://github.com/ggerganov/llama.cpp/pull/4996) + - [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060) + - [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196) + - [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361) + +**Llama 2 7B** | Quantization | Bits per Weight (BPW) | |--------------|-----------------------| @@ -18,7 +100,8 @@ Note: It is synced from llama.cpp `main` every 6 hours. | Q5_K_M | 5.68 | | Q6_K | 6.56 | -## Llama 2 13B +**Llama 2 13B** + Quantization | Bits per Weight (BPW) -- | -- Q2_K | 3.34 @@ -31,7 +114,7 @@ Q5_K_S | 5.51 Q5_K_M | 5.67 Q6_K | 6.56 -# Llama 2 70B +**Llama 2 70B** Quantization | Bits per Weight (BPW) -- | -- diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 72c33a12a15..9896c2db7bc 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -17,41 +17,41 @@ struct quant_option { }; static const std::vector QUANT_OPTIONS = { - { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 3.56G, +0.2166 ppl @ LLaMA-v1-7B", }, - { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", }, - { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", }, - { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", }, + { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, + { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", }, + { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", }, + { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 5.65G, +0.1062 ppl @ Llama-3-8B", }, { "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization", }, { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", }, { "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", }, { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", }, { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", }, { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", }, - { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", }, - { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", }, + { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", }, + { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", }, { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", }, { "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", }, { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", }, - { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, - { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization" , }, - { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", }, - { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", }, - { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", }, + { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, + { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization", }, + { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B", }, + { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.74G, +0.6569 ppl @ Llama-3-8B", }, + { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B", }, { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", }, { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", }, - { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", }, - { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", }, - { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", }, - { "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", }, - { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", }, - { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", }, - { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", }, - { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", }, - { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, -0.0020 ppl @ Mistral-7B", }, - { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", }, - { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, + { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", }, + { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 4.37G, +0.2689 ppl @ Llama-3-8B", }, + { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 4.58G, +0.1754 ppl @ Llama-3-8B", }, + { "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", }, + { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 5.21G, +0.1049 ppl @ Llama-3-8B", }, + { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", }, + { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", }, + { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", }, + { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", }, + { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", }, + { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching. - { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", }, + { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", }, }; static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file"; diff --git a/examples/regex-to-grammar.py b/examples/regex_to_grammar.py similarity index 100% rename from examples/regex-to-grammar.py rename to examples/regex_to_grammar.py diff --git a/examples/retrieval/CMakeLists.txt b/examples/retrieval/CMakeLists.txt deleted file mode 100644 index 66610f31114..00000000000 --- a/examples/retrieval/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-retrieval) -add_executable(${TARGET} retrieval.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 55b7b2f70ae..eb89d16daf1 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -73,9 +73,10 @@ static std::vector chunk_file(const std::string & filename, int chunk_siz return chunks; } -static void batch_add_seq(llama_batch & batch, const std::vector & tokens, int seq_id) { - for (size_t i = 0; i < tokens.size(); i++) { - llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1); +static void batch_add_seq(llama_batch & batch, const std::vector & tokens, llama_seq_id seq_id) { + size_t n_tokens = tokens.size(); + for (size_t i = 0; i < n_tokens; i++) { + llama_batch_add(batch, tokens[i], i, { seq_id }, true); } } @@ -160,6 +161,12 @@ int main(int argc, char ** argv) { const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); + const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); + if (pooling_type == LLAMA_POOLING_TYPE_NONE) { + fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__); + return 1; + } + if (n_ctx > n_ctx_train) { fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); diff --git a/examples/rpc/CMakeLists.txt b/examples/rpc/CMakeLists.txt deleted file mode 100644 index ae48fb98d09..00000000000 --- a/examples/rpc/CMakeLists.txt +++ /dev/null @@ -1,2 +0,0 @@ -add_executable(rpc-server rpc-server.cpp) -target_link_libraries(rpc-server PRIVATE ggml llama) diff --git a/examples/rpc/README.md b/examples/rpc/README.md index 86544e3fea2..e1da801f285 100644 --- a/examples/rpc/README.md +++ b/examples/rpc/README.md @@ -29,13 +29,13 @@ You can also run multiple `rpc-server` instances on the same host, each with a d ## Usage -On each host, build the corresponding backend with `cmake` and add `-DLLAMA_RPC=ON` to the build options. +On each host, build the corresponding backend with `cmake` and add `-DGGML_RPC=ON` to the build options. For example, to build the CUDA backend with RPC support: ```bash mkdir build-rpc-cuda cd build-rpc-cuda -cmake .. -DLLAMA_CUDA=ON -DLLAMA_RPC=ON +cmake .. -DGGML_CUDA=ON -DGGML_RPC=ON cmake --build . --config Release ``` @@ -58,12 +58,12 @@ $ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052 This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device. -On the main host build `llama.cpp` only with `-DLLAMA_RPC=ON`: +On the main host build `llama.cpp` only with `-DGGML_RPC=ON`: ```bash mkdir build-rpc cd build-rpc -cmake .. -DLLAMA_RPC=ON +cmake .. -DGGML_RPC=ON cmake --build . --config Release ``` diff --git a/examples/save-load-state/CMakeLists.txt b/examples/save-load-state/CMakeLists.txt deleted file mode 100644 index 0fb5e359bc9..00000000000 --- a/examples/save-load-state/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-save-load-state) -add_executable(${TARGET} save-load-state.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt deleted file mode 100644 index 8365f9510ce..00000000000 --- a/examples/server/CMakeLists.txt +++ /dev/null @@ -1,51 +0,0 @@ -set(TARGET llama-server) -option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) -option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF) -include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) -set(TARGET_SRCS - server.cpp - utils.hpp - httplib.h -) -set(PUBLIC_ASSETS - colorthemes.css - style.css - theme-beeninorder.css - theme-ketivah.css - theme-mangotango.css - theme-playground.css - theme-polarnight.css - theme-snowstorm.css - index.html - index-new.html - index.js - completion.js - system-prompts.js - prompt-formats.js - json-schema-to-grammar.mjs -) -foreach(asset ${PUBLIC_ASSETS}) - set(input "${CMAKE_CURRENT_SOURCE_DIR}/public/${asset}") - set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp") - list(APPEND TARGET_SRCS ${output}) - add_custom_command( - DEPENDS "${input}" - OUTPUT "${output}" - COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake" - ) -endforeach() -add_executable(${TARGET} ${TARGET_SRCS}) -install(TARGETS ${TARGET} RUNTIME) -target_compile_definitions(${TARGET} PRIVATE - SERVER_VERBOSE=$ -) -target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT}) -if (LLAMA_SERVER_SSL) - find_package(OpenSSL REQUIRED) - target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto) - target_compile_definitions(${TARGET} PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT) -endif() -if (WIN32) - TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) -endif() -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/server/README.md b/examples/server/README.md index e7fb0bf64c0..aa4cbbe6390 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -73,6 +73,7 @@ The project is under active development, and we are [looking for feedback and co - `-fa`, `--flash-attn` : enable flash attention (default: disabled). - `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`) - `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options) +- `--spm-infill` : Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. **If compiled with `LLAMA_SERVER_SSL=ON`** - `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key @@ -374,7 +375,7 @@ Notice that each `probs` is an array of length `n_probs`. - `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint. - `total_slots` - the total number of slots for process requests (defined by `--parallel` option) -- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only model with [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, ChatML template will be used. +- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used. *Options:* diff --git a/examples/server/public/index-new.html b/examples/server/public/index-new.html index 19c9f643d30..5513e9121a4 100644 --- a/examples/server/public/index-new.html +++ b/examples/server/public/index-new.html @@ -634,12 +634,12 @@
-