Benchmarking common LLMs on ctranslate2, llama.cpp, and bitsandbytes #1650

Open · BBC-Esq opened this issue Mar 27, 2024 · 1 comment

BBC-Esq commented Mar 27, 2024

Here are the results of my initial testing comparing ct2 (using int8) against the bitsandbytes library at 4-bit and 8-bit... nicely done, ctranslate2 people. Looking forward to adding GGUF to the comparison as well.

[chart: average response time and peak VRAM usage, ctranslate2 int8 vs. bitsandbytes 4-bit and 8-bit]

Here's the relevant portion of my script, omitting the prompt, the imports, and the private paths to the models on my computer:
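(For completeness, the omitted imports would be roughly these:)

import gc
import os
import time
import warnings

import ctranslate2
import pynvml
import sentencepiece as spm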

PARTIAL TEST SCRIPT
    context_length = 4095
    max_generation_length = 512
    max_prompt_length = context_length - max_generation_length

    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    warnings.filterwarnings("ignore", module="pynvml")

    results = {}

    for model_config in models:
        model_dir = model_config["model_dir"]
        build_prompt_func = model_config["build_prompt"]

        model_name = os.path.basename(model_dir)
        print(f"\033[32mLoading the model: {model_name}...\033[0m")
        intra_threads = max(os.cpu_count() - 4, 1)  # leave a few cores free, but always use at least one thread
        generator = ctranslate2.Generator(model_dir, device="cuda", compute_type="int8", intra_threads=intra_threads)
        sp = spm.SentencePieceProcessor(os.path.join(model_dir, "tokenizer.model"))

        model_results = []

        for _ in range(3):
            start_time = time.time()
            dialog = [{"role": "user", "content": user_prompt}]
            prompt_tokens = build_prompt_func(sp, dialog)
            step_results = generator.generate_tokens(
                prompt_tokens,
                max_length=max_generation_length,
                sampling_temperature=0.1,
                sampling_topk=20,
                sampling_topp=1,
            )

            print("", flush=True)
            text_output = ""
            num_generated_tokens = 0
            # generate_tokens returns a lazy iterator, so the actual generation
            # happens while this loop consumes it
            for word in generate_words(sp, step_results):
                if text_output:
                    word = " " + word
                print(word, end="", flush=True)
                text_output += word
                num_generated_tokens += 1
            print("")

            # sample VRAM only after generation has run; nvmlDeviceGetMemoryInfo reports
            # current usage, so this is an approximation of the peak
            memory_info_peak = pynvml.nvmlDeviceGetMemoryInfo(handle)
            vram_usage_peak = memory_info_peak.used / 1024**2

            end_time = time.time()
            response_time = end_time - start_time

            model_results.append({
                "response_time": response_time,
                "peak_vram_usage": vram_usage_peak
            })

        results[model_name] = model_results

        del generator
        del sp
        gc.collect()

        time.sleep(2)

    pynvml.nvmlShutdown()

    print("\nAverage Results:")
    for model_name, model_results in results.items():
        avg_response_time = sum(result['response_time'] for result in model_results) / len(model_results)
        avg_peak_vram_usage = sum(result['peak_vram_usage'] for result in model_results) / len(model_results)
        print(f"Model: {model_name}")
        print(f"Average Response Time: {avg_response_time:.2f} seconds")
        print(f"Average Peak VRAM Usage: {avg_peak_vram_usage:.2f} MB")
        print()

def generate_words(sp, step_results):
    tokens_buffer = []
    for step_result in step_results:
        is_new_word = step_result.token.startswith("▁")
        if is_new_word and tokens_buffer:
            word = sp.decode(tokens_buffer)
            if word:
                yield word
            tokens_buffer = []
        tokens_buffer.append(step_result.token_id)
    if tokens_buffer:
        word = sp.decode(tokens_buffer)
        if word:
            yield word

def build_prompt_solar_10_7b_instruct_v1_0(sp, dialog):
    user_prompt = dialog[0]["content"]
    system_message = "You are a helpful assistant who answers questions in a succinct fashion based on the contexts given to you."
    prompt = f"""### System:\n{system_message}\n\n### User:\n{user_prompt}\n\n### Assistant:\n"""
    dialog_tokens = sp.encode_as_pieces(prompt)
    return dialog_tokens

def build_prompt_neural_chat_7b_v3_3(sp, dialog):
    system_prompt = "You are a helpful assistant who answers questions in a succinct fashion based on the contexts given to you."
    user_prompt = dialog[0]["content"]
    prompt = f"### System:\n{system_prompt}\n### User:\n{user_prompt}\n### Assistant: "
    dialog_tokens = sp.encode_as_pieces(prompt)
    return dialog_tokens

def build_prompt_llama_2_7b_chat(sp, dialog):
    user_prompt = dialog[0]["content"]
    system_message = "You are a helpful assistant who answers questions in a succinct fashion based on the contexts given to you."
    prompt = f"<<SYS>>\n{system_message}\n<</SYS>>\n\n[INST] {user_prompt} [/INST]"
    dialog_tokens = sp.encode_as_pieces(prompt)
    return dialog_tokens

def build_prompt_llama_2_13b_chat(sp, dialog):
    user_prompt = dialog[0]["content"]
    system_message = "You are a helpful assistant who answers questions in a succinct fashion based on the contexts given to you."
    prompt = f"<<SYS>>\n{system_message}\n<</SYS>>\n\n[INST] {user_prompt} [/INST]"
    dialog_tokens = sp.encode_as_pieces(prompt)
    return dialog_tokens
    
def build_prompt_mistral_7b_instruct_v0_2(sp, dialog):
    user_prompt = dialog[0]["content"]
    prompt = f"<s>[INST] {user_prompt} [/INST]</s>\n"
    dialog_tokens = sp.encode_as_pieces(prompt)
    return dialog_tokens
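The models list that the main loop iterates over isn't shown above; it's just a list of dicts pairing each converted CTranslate2 model directory with its prompt builder, roughly along these lines (the paths below are placeholders, not my actual ones):

models = [
    {"model_dir": "/path/to/solar-10-7b-instruct-v1-0-ct2-int8", "build_prompt": build_prompt_solar_10_7b_instruct_v1_0},
    {"model_dir": "/path/to/neural-chat-7b-v3-3-ct2-int8", "build_prompt": build_prompt_neural_chat_7b_v3_3},
    {"model_dir": "/path/to/llama-2-7b-chat-ct2-int8", "build_prompt": build_prompt_llama_2_7b_chat},
    {"model_dir": "/path/to/llama-2-13b-chat-ct2-int8", "build_prompt": build_prompt_llama_2_13b_chat},
    {"model_dir": "/path/to/mistral-7b-instruct-v0-2-ct2-int8", "build_prompt": build_prompt_mistral_7b_instruct_v0_2},
]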

Let me know if anyone wants the full test script... EDIT: I ran it 5 times and changed the graph to only show the models that all backends were run with... I think it's a better chart.

BBC-Esq changed the title from "Nicely done messrs" to "Tried and Tested - Nicely done messrs" on Mar 27, 2024
BBC-Esq commented Mar 28, 2024

Updated graph here including llama.cpp, which, apparently, is faster but uses slightly more VRAM... except for the 13B model, where it's about 3 GB higher. Plus, the numbers changed somewhat because I ran each model 15 times instead of 3...

[chart: average response time and peak VRAM usage across ctranslate2, llama.cpp, and bitsandbytes, 15 runs per model]

Ask if you're interested in the test scripts.
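For anyone curious before asking for the full scripts: a comparable llama.cpp setup (e.g. via llama-cpp-python) with the same sampling settings would look roughly like the sketch below; the GGUF path and the prompt are placeholders, not what I actually ran.

import time

from llama_cpp import Llama

# placeholder GGUF path; n_gpu_layers=-1 offloads every layer to the GPU so the
# comparison with the CUDA ctranslate2 runs stays apples-to-apples
llm = Llama(
    model_path="/path/to/mistral-7b-instruct-v0.2.Q8_0.gguf",
    n_ctx=4095,
    n_gpu_layers=-1,
    verbose=False,
)

start_time = time.time()
output = llm(
    "[INST] ... [/INST]",  # placeholder prompt, same template as the ctranslate2 run
    max_tokens=512,
    temperature=0.1,
    top_k=20,
    top_p=1.0,
)
response_time = time.time() - start_time

print(output["choices"][0]["text"])
print(f"Response time: {response_time:.2f} seconds")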

BBC-Esq changed the title from "Tried and Tested - Nicely done messrs" to "Benchmarking Whisper on ctranslate2, llama.cpp, and bitsandbytes" on Apr 2, 2024
BBC-Esq changed the title from "Benchmarking Whisper on ctranslate2, llama.cpp, and bitsandbytes" to "Benchmarking common LLMs on ctranslate2, llama.cpp, and bitsandbytes" on May 23, 2024