You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
My initial testing comparing ct2 (using int8) and the bitsandbytes library at 4 and 8 bit... nicely done, ctranslate2 people. Looking forward to testing GGUF in there as well.
Here's the relevant portion of my script omitting the prompt, imports, and private path on my computer to the models:
PARTIAL TEST SCRIPT
# ---------------------------------------------------------------------------
# Benchmark driver: for each model, load it with CTranslate2 (int8 on CUDA),
# run the same prompt 3 times, and record per-run wall-clock time and peak
# VRAM (sampled via NVML).  Averages are printed at the end.
# NOTE(review): `models` and `user_prompt` are defined in the omitted part of
# the script — confirm against the full version.
# ---------------------------------------------------------------------------
context_length = 4095
max_generation_length = 512
max_prompt_length = context_length - max_generation_length  # budget left for the prompt

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
warnings.filterwarnings("ignore", module="pynvml")

results = {}
for model_config in models:
    model_dir = model_config["model_dir"]
    build_prompt_func = model_config["build_prompt"]
    model_name = os.path.basename(model_dir)
    print(f"\033[32mLoading the model: {model_name}...\033[0m")
    # BUG FIX: max(os.cpu_count() - 4, os.cpu_count()) always evaluates to
    # os.cpu_count(), so the "- 4" had no effect.  The intent was to leave
    # four cores free while never dropping below one thread.
    intra_threads = max(os.cpu_count() - 4, 1)
    generator = ctranslate2.Generator(model_dir, device="cuda", compute_type="int8", intra_threads=intra_threads)
    sp = spm.SentencePieceProcessor(os.path.join(model_dir, "tokenizer.model"))
    model_results = []
    for _ in range(3):
        start_time = time.time()
        dialog = [{"role": "user", "content": user_prompt}]
        prompt_tokens = build_prompt_func(sp, dialog)
        step_results = generator.generate_tokens(
            prompt_tokens,
            max_length=max_generation_length,
            sampling_temperature=0.1,
            sampling_topk=20,
            sampling_topp=1,
        )
        print("", flush=True)
        text_output = ""
        num_generated_tokens = 0
        for word in generate_words(sp, step_results):
            if text_output:
                word = " " + word  # re-insert the inter-word space stripped by tokenization
            print(word, end="", flush=True)
            text_output += word
            num_generated_tokens += 1
        print("")
        # BUG FIX: generate_tokens returns a lazy generator, so sampling VRAM
        # before the consuming loop above mostly measured the freshly loaded
        # weights.  Sample AFTER generation so the decode-time peak (KV cache,
        # activations) is included.
        memory_info_peak = pynvml.nvmlDeviceGetMemoryInfo(handle)
        vram_usage_peak = memory_info_peak.used / 1024**2  # bytes -> MB
        end_time = time.time()
        response_time = end_time - start_time
        model_results.append({
            "response_time": response_time,
            "peak_vram_usage": vram_usage_peak,
        })
    results[model_name] = model_results
    # Release the model before loading the next one so successive VRAM
    # readings do not accumulate.
    del generator
    del sp
    gc.collect()
    time.sleep(2)
pynvml.nvmlShutdown()

print("\nAverage Results:")
for model_name, model_results in results.items():
    avg_response_time = sum(result['response_time'] for result in model_results) / len(model_results)
    avg_peak_vram_usage = sum(result['peak_vram_usage'] for result in model_results) / len(model_results)
    print(f"Model: {model_name}")
    print(f"Average Response Time: {avg_response_time:.2f} seconds")
    print(f"Average Peak VRAM Usage: {avg_peak_vram_usage:.2f} MB")
    print()
def generate_words(sp, step_results):
    """Stream decoded words from a sequence of generation step results.

    Token ids are buffered until a piece arrives that begins a new word
    (SentencePiece marks word starts with the "▁" prefix); each completed
    buffer is decoded with *sp* and yielded as one word.  Empty decodes
    are skipped.
    """
    pending_ids = []
    for step in step_results:
        starts_new_word = step.token.startswith("▁")
        if starts_new_word and pending_ids:
            decoded = sp.decode(pending_ids)
            pending_ids = []
            if decoded:
                yield decoded
        pending_ids.append(step.token_id)
    # Flush whatever remains after the final step.
    if pending_ids:
        decoded = sp.decode(pending_ids)
        if decoded:
            yield decoded
def build_prompt_solar_10_7b_instruct_v1_0(sp, dialog):
    """Tokenize a single-turn prompt in the SOLAR-10.7B-Instruct format.

    Only the first dialog entry's "content" is used as the user message.
    Returns the SentencePiece pieces for the formatted prompt.
    """
    question = dialog[0]["content"]
    instructions = "You are a helpful assistant who answers questions in a succinct fashion based on the contexts given to you."
    full_prompt = (
        "### System:\n" + instructions + "\n\n"
        "### User:\n" + question + "\n\n"
        "### Assistant:\n"
    )
    return sp.encode_as_pieces(full_prompt)
def build_prompt_neural_chat_7b_v3_3(sp, dialog):
    """Tokenize a single-turn prompt in the neural-chat-7b-v3-3 format.

    Only the first dialog entry's "content" is used as the user message.
    Returns the SentencePiece pieces for the formatted prompt.
    """
    instructions = "You are a helpful assistant who answers questions in a succinct fashion based on the contexts given to you."
    question = dialog[0]["content"]
    full_prompt = "\n".join([
        "### System:",
        instructions,
        "### User:",
        question,
        "### Assistant: ",
    ])
    return sp.encode_as_pieces(full_prompt)
def build_prompt_llama_2_7b_chat(sp, dialog):
    """Tokenize a single-turn prompt in the official Llama-2-chat format.

    Only the first dialog entry's "content" is used as the user message.
    Returns the SentencePiece pieces for the formatted prompt.

    BUG FIX: Meta's Llama-2-chat template places the <<SYS>> ... <</SYS>>
    system block INSIDE the [INST] ... [/INST] span, immediately before the
    user message.  The original put the system block before [INST], a layout
    the model was not trained on, which degrades response quality.
    """
    user_prompt = dialog[0]["content"]
    system_message = "You are a helpful assistant who answers questions in a succinct fashion based on the contexts given to you."
    prompt = f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n{user_prompt} [/INST]"
    dialog_tokens = sp.encode_as_pieces(prompt)
    return dialog_tokens
def build_prompt_llama_2_13b_chat(sp, dialog):
    """Tokenize a single-turn prompt in the official Llama-2-chat format.

    Only the first dialog entry's "content" is used as the user message.
    Returns the SentencePiece pieces for the formatted prompt.

    BUG FIX: Meta's Llama-2-chat template places the <<SYS>> ... <</SYS>>
    system block INSIDE the [INST] ... [/INST] span, immediately before the
    user message.  The original put the system block before [INST], a layout
    the model was not trained on, which degrades response quality.
    """
    user_prompt = dialog[0]["content"]
    system_message = "You are a helpful assistant who answers questions in a succinct fashion based on the contexts given to you."
    prompt = f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n{user_prompt} [/INST]"
    dialog_tokens = sp.encode_as_pieces(prompt)
    return dialog_tokens
def build_prompt_mistral_7b_instruct_v0_2(sp, dialog):
    """Tokenize a single-turn prompt in the Mistral-7B-Instruct format.

    Only the first dialog entry's "content" is used as the user message.
    Returns the SentencePiece pieces for the formatted prompt.

    BUG FIX: the original appended the end-of-sequence marker "</s>" (plus a
    newline) after [/INST].  "</s>" is what the MODEL emits when its turn is
    finished; putting it in the prompt tells the model the turn is already
    over and can truncate or derail generation.  The prompt must end at
    "[/INST]".

    NOTE(review): "<s>" is passed here as literal text; whether the tokenizer
    maps it to the real BOS piece depends on the tokenizer config — confirm.
    """
    user_prompt = dialog[0]["content"]
    prompt = f"<s>[INST] {user_prompt} [/INST]"
    dialog_tokens = sp.encode_as_pieces(prompt)
    return dialog_tokens
Let me know if anyone wants the full test script... EDIT: I ran it 5 times and changed the graph to only show models that all backends were run with... I think it's a better chart.
The text was updated successfully, but these errors were encountered:
BBC-Esq
changed the title
Nicely done messrs
Tried and Tested - Nicely done messrs
Mar 27, 2024
Updated graph here including llama.cpp, which, apparently, is faster but uses slightly more VRAM — except for the 13B model, where it's 3 GB higher. Plus, the numbers changed somewhat because I ran each model 15 times instead of 3...
Ask if you're interested in the test scripts.
BBC-Esq
changed the title
Tried and Tested - Nicely done messrs
Benchmarking Whisper on ctranslate2, llama.cpp, and bitsandbytes
Apr 2, 2024
BBC-Esq
changed the title
Benchmarking Whisper on ctranslate2, llama.cpp, and bitsandbytes
Benchmarking common LLMs on ctranslate2, llama.cpp, and bitsandbytes
May 23, 2024
My initial testing comparing ct2 (using int8) and the
bitsandbytes
library at 4 and 8 bit... nicely done, ctranslate2 people. Looking forward to testing GGUF in there as well. Here's the relevant portion of my script, omitting the prompt, imports, and the private path on my computer to the models:
PARTIAL TEST SCRIPT
Let me know if anyone wants the full test script...EDIT, I ran it 5 times and changed the graph to only show models all backends were run with...think it's a better chart.
The text was updated successfully, but these errors were encountered: