Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
275 changes: 119 additions & 156 deletions koboldcpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -557,71 +557,66 @@ def string_contains_sequence_substring(inputstr,sequences):
import struct

def read_gguf_metadata(file_path):
    """Read layer/head/KV-length metadata from a GGUF model file.

    Cheap heuristic scan: only the first 8KB of the file is searched for
    known metadata key names rather than fully parsing the GGUF format.

    Returns:
        [block_count, head_count_kv, max(key_length, value_length)] on
        success, or None if the file is too small, not GGUF, or unreadable.
    """
    CHUNK_SIZE = 8192  # read only first 8kb of file
    MIN_FILE_SIZE = 10000  # ignore files under 10kb

    def read_gguf_key(keyname, data, maxval):
        # Search for the key name; it is followed by two little-endian
        # uint32s: a type tag (4 == uint32) and the value itself.
        index = data.find(keyname)
        if index != -1 and index + len(keyname) + 8 <= CHUNK_SIZE:
            start_index = index + len(keyname)
            value1, value2 = struct.unpack('<II', data[start_index:start_index + 8])
            if value1 == 4 and 0 < value2 <= maxval:
                return value2  # contains the desired value
        return 0  # not found

    try:
        if os.path.getsize(file_path) < MIN_FILE_SIZE:  # ignore files under 10kb
            return None

        with open(file_path, 'rb') as f:
            file_header = f.read(4)
            if file_header != b'GGUF':  # file is not GGUF
                return None

            data = f.read(CHUNK_SIZE)
            layercount = read_gguf_key(b'.block_count', data, 512)
            head_count_kv = read_gguf_key(b'.attention.head_count_kv', data, 8192)
            key_length = read_gguf_key(b'.attention.key_length', data, 8192)
            val_length = read_gguf_key(b'.attention.value_length', data, 8192)

            return [layercount, head_count_kv, max(key_length, val_length)]
    except Exception:
        return None

def autoset_gpu_layers(filepath, ctxsize, gpumem):  # shitty algo to determine how many layers to use
    """Estimate how many model layers can be offloaded to the GPU.

    Heuristic based on model file size, context size and available GPU
    memory. Returns 200 to signal "assume full offload", 0 for tiny
    files or on any error.
    """
    try:
        fsize = os.path.getsize(filepath)
        if fsize <= 10000000:  # dont bother with models < 10mb
            return 0

        cs = ctxsize if ctxsize else 0  # ctxsize may be None; treat as 0
        mem = gpumem
        # larger contexts need proportionally more VRAM headroom per layer
        csmul = 1.4 if cs > 8192 else 1.2 if cs > 4096 else 1.1 if cs > 2048 else 1.0

        if mem >= fsize * 1.6 * csmul:
            return 200  # assume full offload

        ggufmeta = read_gguf_metadata(filepath)
        if not ggufmeta or ggufmeta[0] == 0:  # fail to read or no layers
            sizeperlayer = fsize * csmul * 0.052
            return int(min(200, mem / sizeperlayer))

        layers, headcount, headkvlen = ggufmeta
        if headkvlen <= 0:
            headkvlen = 128  # fall back to a typical head dimension only when unknown
        ratio = mem / (fsize * csmul * 1.5)
        if headcount > 0:
            # refine ratio using an approximate per-layer KV cache cost
            ratio = max(ratio, mem / (fsize * 1.34 + (layers * headcount * headkvlen * cs * 4.25)))
        return int(ratio * layers)

    except Exception:
        return 0

def fetch_gpu_properties(testCL,testCU,testVK):
Expand Down Expand Up @@ -3071,39 +3066,41 @@ def display_help():
wb.open("https://github.com/LostRuins/koboldcpp/wiki")
except:
print("Cannot launch help in browser.")

def display_updates():
    """Open the latest-releases page in the default web browser.

    Failure to launch a browser is non-fatal and is only logged.
    """
    try:
        import webbrowser  # imported lazily; only needed for this action
        webbrowser.open("https://github.com/LostRuins/koboldcpp/releases/latest")
    except Exception:
        print("Cannot launch updates in browser.")

# Action buttons along the bottom of the tab view. Kept as explicit calls
# (rather than a loop over unnamed tuples) so each option stays readable.
ctk.CTkButton(tabs, text="Launch", fg_color="#2f8d3c", hover_color="#2faa3c", command=guilaunch, width=80, height=35).grid(row=1, column=1, sticky="se", padx=25, pady=5)
ctk.CTkButton(tabs, text="Update", fg_color="#9900cc", hover_color="#aa11dd", command=display_updates, width=90, height=35).grid(row=1, column=0, sticky="sw", padx=5, pady=5)
ctk.CTkButton(tabs, text="Save", fg_color="#084a66", hover_color="#085a88", command=save_config_gui, width=60, height=35).grid(row=1, column=1, sticky="sw", padx=5, pady=5)
ctk.CTkButton(tabs, text="Load", fg_color="#084a66", hover_color="#085a88", command=load_config_gui, width=60, height=35).grid(row=1, column=1, sticky="sw", padx=70, pady=5)
ctk.CTkButton(tabs, text="Help", fg_color="#992222", hover_color="#bb3333", command=display_help, width=60, height=35).grid(row=1, column=1, sticky="sw", padx=135, pady=5)

# start a thread that tries to get actual gpu names and layer counts,
# so the GUI is not blocked waiting on the probe
gpuinfo_thread = threading.Thread(target=auto_set_backend_gui)
gpuinfo_thread.start()

# runs main loop until closed or launch clicked
root.mainloop()

if nextstate == 0:
    exitcounter = 999
    print("Exiting by user request.")
    sys.exit(0)
else:
    # processing vars
    kcpp_exporting_template = False
    export_vars()

    if not any([args.model_param, args.sdmodel, args.whispermodel]):
        exitcounter = 999
        exit_with_error(2, "No text or image model file was selected. Exiting.")

def show_gui_msgbox(title,message):
print(title + ": " + message, flush=True)
Expand Down Expand Up @@ -3436,69 +3433,51 @@ def tunnel_reader():

def unload_libs():
    """Unload the backend shared library and release its handle.

    Locates a platform-appropriate close function (FreeLibrary/dlclose),
    closes the library handle, removes the bound C entry points, and
    resets the module-level `handle` to None. No-op if nothing is loaded.
    """
    global handle
    if not handle:
        return

    OS = platform.system()
    dll_close = None

    def setup_dll_close(lib, func_name="dlclose"):
        # Bind the close function from `lib` with a void* -> int signature.
        nonlocal dll_close
        dll_close = getattr(lib, func_name)
        dll_close.argtypes = [ctypes.c_void_p]
        dll_close.restype = ctypes.c_int

    if OS == "Windows":  # pragma: Windows
        from ctypes import wintypes
        dll_close = ctypes.windll.kernel32.FreeLibrary
        dll_close.argtypes = [wintypes.HMODULE]
        dll_close.restype = ctypes.c_int
    elif OS == "Darwin":
        # The C library's name is inconsistent across macOS versions; older
        # macOSs need the full path, which is not even in PATH.
        for lib_name in ["libc.dylib", "libSystem", "/usr/lib/system/libsystem_c.dylib"]:
            try:
                setup_dll_close(ctypes.CDLL(lib_name))
                break
            except OSError:
                continue
    elif OS == "Linux":
        try:
            setup_dll_close(ctypes.CDLL(""))
        except OSError:
            setup_dll_close(ctypes.CDLL("libc.so"))  # Alpine Linux
    elif sys.platform == "msys":
        # msys can also use `ctypes.CDLL("kernel32.dll").FreeLibrary()`.
        setup_dll_close(ctypes.CDLL("msys-2.0.dll"))
    elif sys.platform == "cygwin":
        setup_dll_close(ctypes.CDLL("cygwin1.dll"))
    elif OS == "FreeBSD":
        # FreeBSD uses `/usr/lib/libc.so.7` where `7` is another version number.
        # It is not in PATH but using its name instead of its path is somehow the
        # only way to open it. The name must include the .so.7 suffix.
        # NOTE(review): binds `close`, not `dlclose` — preserved from the
        # original code; verify this is intentional on FreeBSD.
        setup_dll_close(ctypes.CDLL("libc.so.7"), "close")

    if dll_close:
        print("Unloading Libraries...")
        dll_close(handle._handle)
        for attr in ['load_model', 'generate', 'new_token', 'get_stream_count', 'has_finished',
                     'get_last_eval_time', 'get_last_process_time', 'get_last_token_count',
                     'get_last_seed', 'get_total_gens', 'get_last_stop_reason', 'abort_generate',
                     'token_count', 'get_pending_output']:
            delattr(handle, attr)
        handle = None

Expand Down Expand Up @@ -3683,34 +3662,19 @@ def main(launch_args,start_server=True):
print(f"Warning: Chat Completions Adapter invalid or not found.")

# handle model downloads if needed.
# Each downloadable arg has its own set of permitted file extensions:
# mmproj may only be a .gguf; only sdmodel may be a .safetensors.
downloadable_args = {
    'model_param': (".gguf", ".bin"),
    'sdmodel': (".gguf", ".safetensors"),
    'mmproj': (".gguf",),
    'whispermodel': (".gguf", ".bin"),
}
for arg_name, valid_extensions in downloadable_args.items():
    arg_value = getattr(args, arg_name)
    if arg_value and arg_value != "":
        if arg_value.endswith("?download=true"):
            # strip the huggingface download suffix and persist the cleaned value
            arg_value = arg_value.replace("?download=true", "")
            setattr(args, arg_name, arg_value)

        is_url = arg_value.startswith(("http://", "https://"))
        if is_url and arg_value.endswith(valid_extensions):
            dlfile = download_model_from_url(arg_value)
            if dlfile:
                setattr(args, arg_name, dlfile)

# sanitize and replace the default vanity name. remember me....
if args.model_param and args.model_param!="":
Expand Down Expand Up @@ -3889,26 +3853,25 @@ def main(launch_args,start_server=True):
if not loadok:
exitcounter = 999
exit_with_error(3,"Could not load image model: " + imgmodel)

# handle whisper model: resolve the path and load it, or bail out
# (unless --ignoremissing tolerates a missing file)
if args.whispermodel:
    whispermodel = args.whispermodel
    if not os.path.exists(whispermodel):
        if args.ignoremissing:
            print(f"Ignoring missing whisper model file: {whispermodel}")
            args.whispermodel = None
        else:
            exitcounter = 999
            exit_with_error(2, f"Cannot find whisper model file: {whispermodel}")
    else:
        whispermodel = os.path.abspath(whispermodel)
        fullwhispermodelpath = whispermodel
        loadok = whisper_load_model(whispermodel)
        print(f"Load Whisper Model OK: {loadok}")
        if not loadok:
            exitcounter = 999
            exit_with_error(3, f"Could not load whisper model: {whispermodel}")

#load embedded lite
try:
Expand Down