PyTorch Q4_1 Quantize and Dequantize, Aligned with llama.cpp
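In llama.cpp's Q4_1 format, a tensor is split into blocks of 32 values. Each block stores a scale d = (max - min) / 15, a minimum m = min, and 32 unsigned 4-bit quants q in [0, 15]; a weight is reconstructed as x ≈ d * q + m. The function below reproduces that quantize/dequantize round trip in PyTorch (llama.cpp additionally stores d and m as fp16, so the result here is a close approximation rather than bit-exact).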
import torch
# Check if CUDA is available
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
def q4_1_quantize_and_dequantize_tensor(tensor):
    tensor = tensor.to(dtype=torch.float32, device=device)
    # Reshape the tensor so each 32-value block is processed independently
    # (llama.cpp's Q4_1 block size is 32; the element count must be divisible by 32)
    orig_shape = tensor.shape
    tensor = tensor.reshape(-1, 32)
    # Find the min and max values per block
    min_vals = torch.min(tensor, dim=1)[0]
    max_vals = torch.max(tensor, dim=1)[0]
    # Calculate the scale d for each block
    d = (max_vals - min_vals) / (2**4 - 1)
    # Calculate the inverse of d; as in llama.cpp, a zero scale maps to id = 0,
    # so a constant block quantizes to zeros and dequantizes back to its min
    ids = torch.where(d != 0, 1.0 / d, torch.zeros_like(d))
    # Quantize: shift by the block minimum and scale into the 4-bit range
    quantized_tensors = (tensor - min_vals[:, None]) * ids[:, None]
    # Round to nearest (add 0.5, then truncate) and clamp to 0..15 (4 bits)
    quantized_tensors = torch.clamp(quantized_tensors + 0.5, 0, 15).to(torch.uint8)
    # Dequantize: x ≈ d * q + min
    dequantized_tensors = (quantized_tensors.float() * d[:, None]) + min_vals[:, None]
    # Reshape back to the original shape and store as float16
    dequantized_tensors = dequantized_tensors.reshape(orig_shape).to(dtype=torch.float16)
    return dequantized_tensors
# Load the model state dict on CPU (adjust the path to your checkpoint)
model_part = torch.load("your_model_path/pytorch_model.bin", map_location="cpu")
keywords = [
    "embed_tokens.weight",
    "self_attn.q_proj.weight",
    "self_attn.k_proj.weight",
    "self_attn.v_proj.weight",
    "self_attn.o_proj.weight",
    "mlp.up_proj.weight",
    "mlp.gate_proj.weight",
    "mlp.down_proj.weight",
    "lm_head.weight"
]
for name, data in model_part.items():
    for word in keywords:
        if word in name:
            # Quantize and dequantize the entire tensor
            model_part[name] = q4_1_quantize_and_dequantize_tensor(data)
            break  # each tensor only needs to be processed once
# Save the updated model
torch.save(model_part, "pytorch_model_quantized.bin")
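As a quick sanity check, here is a minimal sketch (the tensor shape and printed statistics are illustrative assumptions, not part of the original script) that round-trips a random tensor through the function above and reports the reconstruction error:
# Illustrative sanity check: round-trip a random tensor and measure the error
x = torch.randn(4096, 4096)  # any shape whose element count is divisible by 32
x_hat = q4_1_quantize_and_dequantize_tensor(x).to(torch.float32).cpu()
err = (x - x_hat).abs()
print(f"max abs error:  {err.max().item():.6f}")
print(f"mean abs error: {err.mean().item():.6f}")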