python/llm/src/ipex_llm/transformers/qlora.py

#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Some parts of this file is adapted from
# https://github.com/huggingface/peft/blob/v0.5.0/src/peft/tuners/lora.py
#
# coding=utf-8
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Some parts of this file is adapted from
# https://github.com/huggingface/peft/blob/v0.5.0/src/peft/tuners/lora.py
#
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import logging
from torch.nn import Linear, Embedding, Module
from ipex_llm.transformers.low_bit_linear import LowBitLinear, BF16Linear, get_qk_size
from peft.tuners.lora import LoraLayer
from typing import Any, Optional, Union
from ipex_llm.utils.common import invalidInputError
from ipex_llm.transformers.utils import get_autocast_dtype
from ipex_llm.ggml.quantize import ggml_tensor_qtype
import functools
from ipex_llm.transformers import training_patch

LOG = logging.getLogger("ipex_llm.qlora")


class LoraLowBitLinear(Module, LoraLayer):
    # Lora implemented in a dense layer
    def __init__(
        self,
        base_layer,
        adapter_name,
        r: int = 0,
        lora_alpha: int = 1,
        lora_dropout: float = 0.0,
        qa_lora: bool = True,
        # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
        fan_in_fan_out: bool = False,
        is_target_conv_1d_layer: bool = False,
        init_lora_weights: Union[bool, str]=True,
        use_rslora: bool = False,
        use_dora: bool = False,
        **kwargs,
    ):
        super().__init__()
        qk_size = get_qk_size(kwargs.get("qtype"))
        if qa_lora:
            # qa_lora need to change the in_features of the base_layer
            in_features = base_layer.in_features
            base_layer.in_features = in_features // qk_size

        LoraLayer.__init__(self, base_layer, **kwargs)

        self.fan_in_fan_out = fan_in_fan_out
        self._active_adapter = adapter_name
        self.update_layer(
            adapter_name,
            r,
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            init_lora_weights=init_lora_weights,
            use_rslora=use_rslora,
            use_dora=use_dora,
        )
        self.is_target_conv_1d_layer = is_target_conv_1d_layer

        if qa_lora:
            self.qa_pool = torch.nn.AvgPool1d(qk_size)
        else:
            self.qa_pool = torch.nn.Identity()

    def forward(self, x: torch.Tensor):
        autocast_dtype = get_autocast_dtype(x)
        if x.device.type == "xpu":
            # force to use bf16 on gpu
            x = x.to(torch.bfloat16)
        elif autocast_dtype is not None:
            x = x.to(autocast_dtype)
        result = self.base_layer.forward(x)

        if self.disable_adapters or self.merged:
            return result
        else:
            if autocast_dtype is None and x.device.type == "cpu":
                expected_dtype = result.dtype
                for active_adapter in self.active_adapters:
                    if active_adapter not in self.lora_A.keys():
                        continue
                    x = x.to(self.lora_A[active_adapter].weight.dtype)
                    lora_A = self.lora_A[active_adapter]
                    lora_B = self.lora_B[active_adapter]
                    dropout = self.lora_dropout[active_adapter]
                    scaling = self.scaling[active_adapter]
                    result += lora_B(lora_A(dropout(self.qa_pool(x)))).to(expected_dtype) * scaling
            else:
                for active_adapter in self.active_adapters:
                    if active_adapter not in self.lora_A.keys():
                        continue
                    lora_A = self.lora_A[active_adapter]
                    lora_B = self.lora_B[active_adapter]
                    dropout = self.lora_dropout[active_adapter]
                    scaling = self.scaling[active_adapter]
                    result += lora_B(lora_A(dropout(self.qa_pool(x)))) * scaling
        return result


class LoraBF16Linear(Module, LoraLayer):
    # Lora implemented in a dense layer
    def __init__(
        self,
        base_layer,
        adapter_name,
        r: int = 0,
        lora_alpha: int = 1,
        lora_dropout: float = 0.0,
        # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
        fan_in_fan_out: bool = False,
        is_target_conv_1d_layer: bool = False,
        init_lora_weights: Union[bool, str]=True,
        use_rslora: bool = False,
        use_dora: bool = False,
        **kwargs,
    ):
        super().__init__()
        LoraLayer.__init__(self, base_layer, **kwargs)

        self.fan_in_fan_out = fan_in_fan_out
        self._active_adapter = adapter_name
        self.update_layer(
            adapter_name,
            r,
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            init_lora_weights=init_lora_weights,
            use_rslora=use_rslora,
            use_dora=use_dora,
        )
        self.is_target_conv_1d_layer = is_target_conv_1d_layer

    def forward(self, x: torch.Tensor):
        autocast_dtype = get_autocast_dtype(x)
        if x.device.type == "xpu":
            # force to use bf16 on gpu
            x = x.to(torch.bfloat16)
        elif autocast_dtype is not None:
            x = x.to(autocast_dtype)
        result = self.base_layer.forward(x)

        if self.disable_adapters or self.merged:
            return result
        else:
            if autocast_dtype is None and x.device.type == "cpu":
                expected_dtype = result.dtype
                for active_adapter in self.active_adapters:
                    if active_adapter not in self.lora_A.keys():
                        continue
                    x = x.to(self.lora_A[active_adapter].weight.dtype)
                    lora_A = self.lora_A[active_adapter]
                    lora_B = self.lora_B[active_adapter]
                    dropout = self.lora_dropout[active_adapter]
                    scaling = self.scaling[active_adapter]
                    result += lora_B(lora_A(dropout(x))).to(expected_dtype) * scaling
            else:
                for active_adapter in self.active_adapters:
                    if active_adapter not in self.lora_A.keys():
                        continue
                    lora_A = self.lora_A[active_adapter]
                    lora_B = self.lora_B[active_adapter]
                    dropout = self.lora_dropout[active_adapter]
                    scaling = self.scaling[active_adapter]
                    result += lora_B(lora_A(dropout(x))) * scaling
        return result


def _create_new_module(create_new_module_func, lora_config, adapter_name, target, **kwargs):

    if isinstance(target, LowBitLinear) or isinstance(target, BF16Linear):
        low_bit_kwargs = kwargs.copy()
        bias = low_bit_kwargs.pop("bias", False)

        if hasattr(lora_config, "training_mode") and lora_config.training_mode == "lora":
            new_module = LoraBF16Linear(target,
                                        adapter_name,
                                        bias=bias,
                                        **low_bit_kwargs)
        else:
            if hasattr(lora_config, "training_mode"):
                qa_lora = lora_config.training_mode == "qalora"
            else:
                qa_lora = False
            low_bit_kwargs.update(
                {
                    "qtype": target.qtype,
                    "qa_lora": qa_lora
                }
            )
            new_module = LoraLowBitLinear(target,
                                          adapter_name,
                                          bias=bias,
                                          **low_bit_kwargs)
    else:
        new_module = create_new_module_func(lora_config, adapter_name, target, **kwargs)

    return new_module


from peft.tuners.lora import LoraModel
from peft.tuners.lora import LoraConfig as LoraConfigBase
from transformers import TrainingArguments as TrainingArgumentsBase
from transformers.training_args import OptimizerNames
from dataclasses import dataclass, field


@dataclass
class LoraConfig(LoraConfigBase):
    training_mode: str = field(default="qlora", metadata={"help": "determine training mode"})

    def __init__(self, *args, **kwargs):
        self.training_mode = kwargs.pop("training_mode", "qlora")
        super().__init__(*args, **kwargs)
        from ipex_llm.llm_patching import bigdl_patched
        if bigdl_patched == 'Train':
            from .model import patched_training_mode
            self.training_mode = patched_training_mode


supported_optim = ["adamw_hf", "adamw_torch", "adafactor", "sgd", "adagrad", "rmsprop"]


@dataclass
class TrainingArguments(TrainingArgumentsBase):
    def __init__(self, *args, **kwargs):
        kwargs["fp16"] = False
        kwargs["bf16"] = True
        for optim in supported_optim.copy():
            supported_optim.append(OptimizerNames(optim))
        if kwargs["optim"] not in supported_optim:
            LOG.info(f"{self.optim} is not supported yet and adamw_torch optimizer is used.")
            kwargs["optim"] = "adamw_torch"
        super().__init__(*args, **kwargs)


def get_peft_model(*args, **kwargs):
    old_create_new_module = LoraModel._create_new_module
    LoraModel._create_new_module = staticmethod(functools.partial(_create_new_module,
                                                                  old_create_new_module))

    try:
        from ipex_llm.llm_patching import bigdl_patched
        if bigdl_patched == 'Train':
            from peft import get_peft_model_original
        else:
            from peft import get_peft_model as get_peft_model_original
        model = get_peft_model_original(*args, **kwargs)
    finally:
        LoraModel._create_new_module = old_create_new_module

    if model.device.type == "xpu":
        cast_lora_weight(model, torch.bfloat16)
        _optimize_post(model)
        torch.xpu.synchronize()

    return model


def prepare_model_for_kbit_training(model, use_gradient_checkpointing=True):
    r"""
    This method wraps the entire protocol for preparing a model before running a training.
    This includes:
        1- Cast the layernorm in fp32
        2- making output embedding layer require grads
        3- Add the upcasting of the lm head to fp32

    Args:
        model, (`transformers.PreTrainedModel`):
            The loaded model from `transformers`
    """

    is_gptq_quantized = getattr(model, "quantization_method", None) == "gptq"
    for name, param in model.named_parameters():
        # freeze base model's layers
        param.requires_grad = False

    if not is_gptq_quantized:
        # cast all non INT8 parameters to fp32
        # for param in model.parameters():
        #     if (param.dtype == torch.float16) or (param.dtype == torch.bfloat16):
        #         param.data = param.data.to(torch.float32)

        # change to below way to reduce memory for Linear
        # otherwise lora finetuning on arc may OOM at this convert
        for module in model.modules():
            if list(module.children()) == []:
                # leaf module
                if not isinstance(module, (Linear, Embedding)):
                    for param in module.parameters():
                        if (param.dtype == torch.float16) or (param.dtype == torch.bfloat16):
                            param.data = param.data.to(torch.float32)

    if use_gradient_checkpointing:
        # For backward compatibility
        if hasattr(model, "enable_input_require_grads"):
            model.enable_input_require_grads()
        else:

            def make_inputs_require_grad(module, input, output):
                output.requires_grad_(True)

            model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

        # enable gradient checkpointing for memory efficiency
        model.gradient_checkpointing_enable()

    return model


class PeftModel:

    @staticmethod
    def from_pretrained(*args,
                        **kwargs):
        old_create_new_module = LoraModel._create_new_module
        LoraModel._create_new_module = staticmethod(functools.partial(_create_new_module,
                                                                      old_create_new_module))
        from peft import PeftModel
        try:
            model = PeftModel.from_pretrained(*args, **kwargs)
        finally:
            LoraModel._create_new_module = old_create_new_module

        return model


from peft.mapping import PEFT_TYPE_TO_CONFIG_MAPPING

PEFT_TYPE_TO_CONFIG_MAPPING["lora"] = LoraConfig


def cast_lora_weight(model, dtype=torch.bfloat16):
    for name, module in model.named_modules():
        if isinstance(module, LowBitLinear):
            module.compute_dtype = dtype
        if isinstance(module, LoraLayer):
            module = module.to(dtype)
        if isinstance(module, BF16Linear):
            module = module.to(dtype)
            module.compute_dtype = dtype
        if 'norm' in name:
            module = module.to(torch.float32)
        if 'lm_head' in name or 'embed_tokens' in name:
            if hasattr(module, 'weight'):
                if module.weight.dtype == torch.float32:
                    module = module.to(dtype)


def _optimize_post(model):
    import transformers
    from packaging import version
    from ipex_llm.transformers.convert import convert_forward
    from ipex_llm.transformers.models.llama import llama_attention_fast_forward

    trans_version = transformers.__version__
    if version.parse(trans_version) >= version.parse("4.31.0"):
        LOG.info("Optimizing Llama finetuning....")
        convert_forward(
            model,
            transformers.models.llama.modeling_llama.LlamaAttention,
            llama_attention_fast_forward,)