PyTorch Q4_0 Quantize and Dequantize, Aligned with llama.cpp
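llama.cpp's Q4_0 format splits each tensor into blocks of 32 values. Each block stores a single float16 scale d = max / -8 (where max is the signed element with the largest magnitude in the block) and 32 4-bit indices, so dequantization is simply (q - 8) * d. The script below reproduces that quantize/dequantize round trip in PyTorch, so you can measure the effect of Q4_0 rounding on a checkpoint without converting it to GGUF.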
import torch
# Check if CUDA is available
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
def q4_0_quantize_and_dequantize_tensor(tensor):
    tensor = tensor.to(dtype=torch.float32, device=device)
    # Q4_0 works on blocks of 32 values (QK4_0 in llama.cpp)
    assert tensor.numel() % 32 == 0, "tensor size must be a multiple of the block size (32)"
    # Reshape tensor to process each 32-value block independently
    orig_shape = tensor.shape
    tensor = tensor.view(-1, 32)
    # llama.cpp keeps the *signed* value with the largest magnitude per block,
    # so locate the absolute max and gather the signed element
    max_idx = torch.argmax(torch.abs(tensor), dim=1)
    max_vals = torch.gather(tensor, 1, max_idx[:, None]).squeeze(1)
    # Prevent division by zero; an all-zero block still dequantizes to zeros
    max_vals[max_vals == 0] = 1.0
    # Per-block scale d = max / -8 and its reciprocal id, as in llama.cpp
    d = max_vals / -8.0
    ids = 1.0 / d
    # llama.cpp stores d as fp16, so dequantization must use the fp16-rounded scale
    d_fp16 = d.to(torch.float16).to(torch.float32)
    # Scale and quantize; truncation via .to(torch.uint8) matches llama.cpp's
    # (int8_t) cast, since the clamped inputs are non-negative
    scaled_tensors = tensor * ids[:, None]
    quantized_tensors = torch.clamp(scaled_tensors + 8.5, 0, 15).to(torch.uint8)
    # Dequantize the tensor
    dequantized_tensors = (quantized_tensors.float() - 8.0) * d_fp16[:, None]
    # Reshape back to the original shape and return in half precision
    dequantized_tensors = dequantized_tensors.view(orig_shape).to(dtype=torch.float16)
    return dequantized_tensors
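# Illustrative sanity check: the round-trip error of a block is bounded by one
# quantization step |d| = amax / 8 (the clamp at 15 can cost up to a full step
# for values opposite in sign to the block max), plus a little fp16 rounding slack
_x = torch.randn(4, 32)
_x_hat = q4_0_quantize_and_dequantize_tensor(_x).float().cpu()
_step = _x.abs().max(dim=1)[0] / 8.0  # magnitude of the per-block scale
assert ((_x - _x_hat).abs() <= _step[:, None] + 1e-2).all()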
# Load the checkpoint on CPU (the path is a placeholder)
model_part = torch.load("your_model_path/pytorch_model.bin", map_location="cpu")
keywords = [
    "embed_tokens.weight",
    "self_attn.q_proj.weight",
    "self_attn.k_proj.weight",
    "self_attn.v_proj.weight",
    "self_attn.o_proj.weight",
    "mlp.up_proj.weight",
    "mlp.gate_proj.weight",
    "mlp.down_proj.weight",
    "lm_head.weight"
]
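# Note: 1-D tensors such as norm weights are deliberately not listed;
# llama.cpp only quantizes 2-D weight matrices and keeps 1-D tensors in f32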
for name, data in model_part.items():
    for word in keywords:
        if word in name:
            # Quantize and dequantize the entire tensor
            model_part[name] = q4_0_quantize_and_dequantize_tensor(data)
            break
# Save the updated model parts
torch.save(model_part, "pytorch_model_quantized.bin")
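To gauge how much Q4_0 rounding actually changed the weights, you can reload both files and compare. A minimal sketch, reusing the keywords list and the placeholder path from above:

original = torch.load("your_model_path/pytorch_model.bin", map_location="cpu")
quantized = torch.load("pytorch_model_quantized.bin", map_location="cpu")
for name in quantized:
    if any(word in name for word in keywords):
        err = (original[name].float() - quantized[name].float()).pow(2).mean().sqrt()
        print(f"{name}: RMS error {err.item():.6f}")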