
Quantization doesn't reduce the model file size #561

SefaZeng opened this issue Aug 5, 2021 · 0 comments
SefaZeng commented Aug 5, 2021

I tried to use Post-Training Quantization to convert my float32 model to int8, following the tutorial on quantizing GNMT. I changed the model code to the Distiller style and got a quantized model. Here is some information about the quantized model:

(context_attn): MultiHeadedAttention(
          (linear_keys): RangeLinearQuantParamLayerWrapper(
            weights_quant_settings=(num_bits=8 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)
            output_quant_settings=(num_bits=8 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)
            accum_quant_settings=(num_bits=32 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)
              inputs_quant_auto_fallback=True, forced_quant_settings_for_inputs=None
            scale_approx_mult_bits=None
            preset_activation_stats=True
              output_scale=16.876938, output_zero_point=0.000000
            weights_scale=194.250000, weights_zero_point=0.000000
            (wrapped_module): Linear(in_features=512, out_features=512, bias=True)
          )
          (linear_values): RangeLinearQuantParamLayerWrapper(
            weights_quant_settings=(num_bits=8 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)
            output_quant_settings=(num_bits=8 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)
            accum_quant_settings=(num_bits=32 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)
              inputs_quant_auto_fallback=True, forced_quant_settings_for_inputs=None
            scale_approx_mult_bits=None
            preset_activation_stats=True
              output_scale=13.410025, output_zero_point=0.000000
            weights_scale=204.500000, weights_zero_point=0.000000
            (wrapped_module): Linear(in_features=512, out_features=512, bias=True)
          )
          (linear_query): RangeLinearQuantParamLayerWrapper(
            weights_quant_settings=(num_bits=8 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)
            output_quant_settings=(num_bits=8 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)
            accum_quant_settings=(num_bits=32 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)
              inputs_quant_auto_fallback=True, forced_quant_settings_for_inputs=None
            scale_approx_mult_bits=None
            preset_activation_stats=True
              output_scale=15.167286, output_zero_point=0.000000
            weights_scale=200.000000, weights_zero_point=0.000000
            (wrapped_module): Linear(in_features=512, out_features=512, bias=True)
          )
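
For reference, this is how I understand the symmetric 8-bit setting printed above for a weight tensor (just a rough sketch of the math, not Distiller's actual code):

import torch

def fake_quant_symmetric(w, scale, num_bits=8):
    # symmetric scheme: zero_point is 0, values are snapped to the int grid and dequantized
    qmax = 2 ** (num_bits - 1) - 1                         # 127 for 8 bits
    q = torch.clamp(torch.round(w * scale), -qmax, qmax)   # quantize
    return q / scale                                       # dequantize

# e.g. for linear_keys above: fake_quant_symmetric(w, scale=194.25)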

My model is a Transformer model from OpenNMT. It seems I do get a correctly quantized model, but the model file is larger than the original one, roughly 200 MB -> 280 MB. Is there any way to actually reduce the model size? I think that's the most important benefit of quantization.
The code script is like this:

import os
import sys
from copy import deepcopy

import torch

# OpenNMT-py pieces (module paths as in OpenNMT-py 1.x)
from onmt import opts
from onmt.utils.parse import ArgumentParser
from onmt.utils.logging import init_logger
from onmt.utils.misc import split_corpus
from onmt.translate.translator import build_translator

# Distiller pieces
import distiller
from distiller.data_loggers import collect_quant_stats
from distiller.quantization import PostTrainLinearQuantizer

model_path = sys.argv[1]

parser = ArgumentParser(description='translate.py')
opts.config_opts(parser)
opts.translate_opts(parser)
src = "test.en"
output = src+".distiller_out"
opt = parser.parse_args(f"--model {model_path} --src {src} --output {output} --gpu 0".split())

#translate(opt)
ArgumentParser.validate_translate_opts(opt)
logger = init_logger(opt.log_file)

translator = build_translator(opt, report_score=True)
stats_file = "./acts_quantization_stats.yaml"

def evaluate(model, output, num_batches=None):
    src_shards = split_corpus(opt.src, opt.shard_size)
    tgt_shards = split_corpus(opt.tgt, opt.shard_size)
    shard_pairs = zip(src_shards, tgt_shards)

    for i, (src_shard, tgt_shard) in enumerate(shard_pairs):
        logger.info("Translating shard %d." % i)
        translator.translate(
            src=src_shard,
            tgt=tgt_shard,
            src_dir=opt.src_dir,
            batch_size=opt.batch_size,
            batch_type=opt.batch_type,
            attn_debug=opt.attn_debug,
            align_debug=opt.align_debug
            )

    print("translate end")

output = "output_file_distiller"

if not os.path.isfile(stats_file): # Collect stats.
    #model_copy = deepcopy(model)
    model_copy = translator.model
    distiller.utils.assign_layer_fq_names(model_copy)
    
    def eval_for_stats(model):
        evaluate(model, output + '.temp', num_batches=None)
    collect_quant_stats(model_copy, eval_for_stats, save_dir='.')
    #del model_copy
    torch.cuda.empty_cache()

quantizer = PostTrainLinearQuantizer(deepcopy(translator.model),
                                    mode="SYMMETRIC",  # As was suggested in GNMT's paper
                                    model_activation_stats=stats_file)
for t, rf in quantizer.replacement_factory.items():
    if rf is not None:
        print("Replacing '{}' modules using '{}' function".format(t.__name__, rf.__name__))

fake_input = torch.tensor([4,115,1480,73,12,4,18125,1424,234,26,12,3658,16278,36], dtype=torch.long)
fake_input = fake_input.unsqueeze(-1).unsqueeze(-2)
length  = torch.tensor([14])
fake_inputs = (fake_input, fake_input, length)
dummy_input = (torch.ones(1, 1, 2).to(dtype=torch.long),
                torch.ones(1, 1, 2).to(dtype=torch.long),
                torch.tensor([1]).to(dtype=torch.long),)
quantizer.prepare_model(fake_inputs)
print(quantizer.model)
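
To see where the size goes in the saved file, I would dump the dtype and size of every tensor in the state_dict with something like this (just a sketch, assuming the checkpoint is essentially the model's state_dict):

def report_state_dict_size(model):
    # Print shape, dtype and size of every tensor the checkpoint would contain.
    total = 0
    for name, t in model.state_dict().items():
        nbytes = t.numel() * t.element_size()
        total += nbytes
        print(f"{name}: {tuple(t.shape)} {t.dtype} {nbytes / 1e6:.2f} MB")
    print(f"total: {total / 1e6:.1f} MB")

report_state_dict_size(quantizer.model)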