Benchmarking common LLMs on ctranslate2, llama.cpp, and bitsandbytes #1650

Open · BBC-Esq opened this issue Mar 27, 2024 · 1 comment

BBC-Esq commented Mar 27, 2024

Here are the results of my initial testing comparing ct2 (using int8) against the bitsandbytes library at 4-bit and 8-bit... nicely done, ctranslate2 people. Looking forward to adding GGUF to the comparison as well.

[chart: average response time and peak VRAM usage, ctranslate2 int8 vs. bitsandbytes 4-bit and 8-bit]

Here's the relevant portion of my script, omitting the prompt, the imports, and the private paths to the models on my computer:
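(For completeness, the omitted imports would be roughly these:)

import gc
import os
import time
import warnings

import ctranslate2
import pynvml
import sentencepiece as spm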

PARTIAL TEST SCRIPT
    context_length = 4095
    max_generation_length = 512
    max_prompt_length = context_length - max_generation_length

    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    warnings.filterwarnings("ignore", module="pynvml")

    results = {}

    for model_config in models:
        model_dir = model_config["model_dir"]
        build_prompt_func = model_config["build_prompt"]

        model_name = os.path.basename(model_dir)
        print(f"\033[32mLoading the model: {model_name}...\033[0m")
        intra_threads = max(os.cpu_count() - 4, 1)  # leave a few cores free, but always use at least one thread
        generator = ctranslate2.Generator(model_dir, device="cuda", compute_type="int8", intra_threads=intra_threads)
        sp = spm.SentencePieceProcessor(os.path.join(model_dir, "tokenizer.model"))

        model_results = []

        for _ in range(3):
            start_time = time.time()
            dialog = [{"role": "user", "content": user_prompt}]
            prompt_tokens = build_prompt_func(sp, dialog)
            step_results = generator.generate_tokens(
                prompt_tokens,
                max_length=max_generation_length,
                sampling_temperature=0.1,
                sampling_topk=20,
                sampling_topp=1,
            )

            print("", flush=True)
            text_output = ""
            num_generated_tokens = 0
            # generate_tokens returns a lazy iterator, so the actual generation
            # happens while this loop consumes it
            for word in generate_words(sp, step_results):
                if text_output:
                    word = " " + word
                print(word, end="", flush=True)
                text_output += word
                num_generated_tokens += 1
            print("")

            # sample VRAM only after generation has run; nvmlDeviceGetMemoryInfo reports
            # current usage, so this is an approximation of the peak
            memory_info_peak = pynvml.nvmlDeviceGetMemoryInfo(handle)
            vram_usage_peak = memory_info_peak.used / 1024**2

            end_time = time.time()
            response_time = end_time - start_time

            model_results.append({
                "response_time": response_time,
                "peak_vram_usage": vram_usage_peak
            })

        results[model_name] = model_results

        del generator
        del sp
        gc.collect()

        time.sleep(2)

    pynvml.nvmlShutdown()

    print("\nAverage Results:")
    for model_name, model_results in results.items():
        avg_response_time = sum(result['response_time'] for result in model_results) / len(model_results)
        avg_peak_vram_usage = sum(result['peak_vram_usage'] for result in model_results) / len(model_results)
        print(f"Model: {model_name}")
        print(f"Average Response Time: {avg_response_time:.2f} seconds")
        print(f"Average Peak VRAM Usage: {avg_peak_vram_usage:.2f} MB")
        print()

def generate_words(sp, step_results):
    tokens_buffer = []
    for step_result in step_results:
        is_new_word = step_result.token.startswith("▁")
        if is_new_word and tokens_buffer:
            word = sp.decode(tokens_buffer)
            if word:
                yield word
            tokens_buffer = []
        tokens_buffer.append(step_result.token_id)
    if tokens_buffer:
        word = sp.decode(tokens_buffer)
        if word:
            yield word

def build_prompt_solar_10_7b_instruct_v1_0(sp, dialog):
    user_prompt = dialog[0]["content"]
    system_message = "You are a helpful assistant who answers questions in a succinct fashion based on the contexts given to you."
    prompt = f"""### System:\n{system_message}\n\n### User:\n{user_prompt}\n\n### Assistant:\n"""
    dialog_tokens = sp.encode_as_pieces(prompt)
    return dialog_tokens

def build_prompt_neural_chat_7b_v3_3(sp, dialog):
    system_prompt = "You are a helpful assistant who answers questions in a succinct fashion based on the contexts given to you."
    user_prompt = dialog[0]["content"]
    prompt = f"### System:\n{system_prompt}\n### User:\n{user_prompt}\n### Assistant: "
    dialog_tokens = sp.encode_as_pieces(prompt)
    return dialog_tokens

def build_prompt_llama_2_7b_chat(sp, dialog):
    user_prompt = dialog[0]["content"]
    system_message = "You are a helpful assistant who answers questions in a succinct fashion based on the contexts given to you."
    prompt = f"<<SYS>>\n{system_message}\n<</SYS>>\n\n[INST] {user_prompt} [/INST]"
    dialog_tokens = sp.encode_as_pieces(prompt)
    return dialog_tokens

def build_prompt_llama_2_13b_chat(sp, dialog):
    user_prompt = dialog[0]["content"]
    system_message = "You are a helpful assistant who answers questions in a succinct fashion based on the contexts given to you."
    prompt = f"<<SYS>>\n{system_message}\n<</SYS>>\n\n[INST] {user_prompt} [/INST]"
    dialog_tokens = sp.encode_as_pieces(prompt)
    return dialog_tokens
    
def build_prompt_mistral_7b_instruct_v0_2(sp, dialog):
    user_prompt = dialog[0]["content"]
    prompt = f"<s>[INST] {user_prompt} [/INST]</s>\n"
    dialog_tokens = sp.encode_as_pieces(prompt)
    return dialog_tokens
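The models list that the main loop iterates over isn't shown above; it's just a list of dicts pairing each converted CTranslate2 model directory with its prompt builder, roughly along these lines (the paths below are placeholders, not my actual ones):

models = [
    {"model_dir": "/path/to/solar-10-7b-instruct-v1-0-ct2-int8", "build_prompt": build_prompt_solar_10_7b_instruct_v1_0},
    {"model_dir": "/path/to/neural-chat-7b-v3-3-ct2-int8", "build_prompt": build_prompt_neural_chat_7b_v3_3},
    {"model_dir": "/path/to/llama-2-7b-chat-ct2-int8", "build_prompt": build_prompt_llama_2_7b_chat},
    {"model_dir": "/path/to/llama-2-13b-chat-ct2-int8", "build_prompt": build_prompt_llama_2_13b_chat},
    {"model_dir": "/path/to/mistral-7b-instruct-v0-2-ct2-int8", "build_prompt": build_prompt_mistral_7b_instruct_v0_2},
]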

Let me know if anyone wants the full test script... EDIT: I ran it 5 times and changed the graph to only show the models that all backends were run with... I think it's a better chart.

BBC-Esq changed the title from "Nicely done messrs" to "Tried and Tested - Nicely done messrs" on Mar 27, 2024
BBC-Esq commented Mar 28, 2024

Updated graph here including llama.cpp, which, apparently, is faster but uses slightly more VRAM... except for the 13B model, where it's about 3 GB higher. Plus, the numbers changed somewhat because I ran each model 15 times instead of 3...

[chart: average response time and peak VRAM usage across ctranslate2, llama.cpp, and bitsandbytes, 15 runs per model]

Ask if you're interested in the test scripts.
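For anyone curious before asking for the full scripts: a comparable llama.cpp setup (e.g. via llama-cpp-python) with the same sampling settings would look roughly like the sketch below; the GGUF path and the prompt are placeholders, not what I actually ran.

import time

from llama_cpp import Llama

# placeholder GGUF path; n_gpu_layers=-1 offloads every layer to the GPU so the
# comparison with the CUDA ctranslate2 runs stays apples-to-apples
llm = Llama(
    model_path="/path/to/mistral-7b-instruct-v0.2.Q8_0.gguf",
    n_ctx=4095,
    n_gpu_layers=-1,
    verbose=False,
)

start_time = time.time()
output = llm(
    "[INST] ... [/INST]",  # placeholder prompt, same template as the ctranslate2 run
    max_tokens=512,
    temperature=0.1,
    top_k=20,
    top_p=1.0,
)
response_time = time.time() - start_time

print(output["choices"][0]["text"])
print(f"Response time: {response_time:.2f} seconds")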

BBC-Esq changed the title from "Tried and Tested - Nicely done messrs" to "Benchmarking Whisper on ctranslate2, llama.cpp, and bitsandbytes" on Apr 2, 2024
BBC-Esq changed the title from "Benchmarking Whisper on ctranslate2, llama.cpp, and bitsandbytes" to "Benchmarking common LLMs on ctranslate2, llama.cpp, and bitsandbytes" on May 23, 2024