EADST

Pytorch Q4_0 Quantize and Dequantize aligning with llama.cpp

Pytorch Q4_0 Quantize and Dequantize aligning with llama.cpp

import torch

# Pick the compute device once: CUDA when it is available, otherwise the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")


def q4_0_quantize_and_dequantize_tensor(tensor):
    """Round-trip a tensor through Q4_0 quantization, following llama.cpp.

    The tensor is flattened into consecutive blocks of 32 values. For each
    block the scale is ``d = m / -8``, where ``m`` is the *signed* element
    with the largest magnitude (first occurrence on ties), exactly as in
    llama.cpp's reference ``quantize_row_q4_0_ref``. Every value is mapped to
    a 4-bit code ``min(15, trunc(x / d + 8.5))`` and reconstructed as
    ``(code - 8) * d``, with ``d`` rounded to float16 first because llama.cpp
    stores the block scale in half precision.

    Args:
        tensor: Tensor whose number of elements is a multiple of 32. It may
            be non-contiguous (for example a transposed weight).

    Returns:
        A float16 tensor with the same shape as ``tensor``, located on the
        module-level ``device``, holding the dequantized values.

    Raises:
        RuntimeError: If the elements cannot be split into blocks of 32.
    """
    block_size = 32
    tensor = tensor.to(dtype=torch.float32, device=device)

    # reshape (unlike view) also accepts non-contiguous inputs.
    orig_shape = tensor.shape
    blocks = tensor.reshape(-1, block_size)

    # llama.cpp keeps the sign of the largest-magnitude element, so that
    # element always lands exactly on code 0 (value -8 * d). argmax returns
    # the first maximal index, matching the strict '<' scan in the C code.
    peak_idx = blocks.abs().argmax(dim=1, keepdim=True)
    signed_max = blocks.gather(1, peak_idx).squeeze(1)

    # Per-block scale and its inverse; an all-zero block gets d = 0, id = 0.
    d = signed_max / -8.0
    ids = torch.where(d != 0, 1.0 / d, torch.zeros_like(d))

    # Scale, shift into [0.5, 16.5], truncate and cap at 15 (4-bit codes).
    scaled = blocks * ids[:, None]
    codes = torch.clamp(scaled + 8.5, 0, 15).to(torch.uint8)

    # Dequantize with the scale as it would be stored on disk (float16).
    d_stored = d.to(torch.float16).to(torch.float32)
    dequantized = (codes.to(torch.float32) - 8.0) * d_stored[:, None]

    # Restore the caller's shape and emit half precision.
    return dequantized.reshape(orig_shape).to(dtype=torch.float16)

# Load the checkpoint with every tensor mapped onto the CPU.
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
model_part = torch.load("your_model_path/pytorch_model.bin", map_location="cpu")

# Substrings that identify the weight tensors to round-trip through Q4_0.
keywords = [
    "embed_tokens.weight",
    "self_attn.q_proj.weight",
    "self_attn.k_proj.weight",
    "self_attn.v_proj.weight",
    "self_attn.o_proj.weight",
    "mlp.up_proj.weight",
    "mlp.gate_proj.weight",
    "mlp.down_proj.weight",
    "lm_head.weight",
]

for name, data in model_part.items():
    # any() stops at the first matching keyword, so each tensor is processed
    # at most once even if its name matches several keywords.
    if any(word in name for word in keywords):
        # Overwriting the value of an existing key does not resize the dict,
        # so it is safe while iterating. Move the result back to the CPU so
        # the saved checkpoint does not depend on a CUDA device being present.
        model_part[name] = q4_0_quantize_and_dequantize_tensor(data).cpu()

# Save the updated model parts
torch.save(model_part, "pytorch_model_quantized.bin")

Reference:

相关标签
About Me
XD
Goals determine what you are going to be.
Category
标签云
UNIX CSV XML Cloudreve Pytorch hf Datetime Animate 版权 Input PIP WAN Base64 Bitcoin AI Shortcut FP32 Paddle CV Quantize Gemma Website diffusers Tracking WebCrawler Qwen2.5 Safetensors FastAPI PyCharm Video GoogLeNet 公式 FP64 Zip NLP ModelScope Tiktoken CEIR Proxy Web Hotel SQL v0.dev Statistics Github Llama CLAP Plotly Bin Qwen Vmess Excel FlashAttention HaggingFace GGML Qwen2 API Ubuntu VGG-16 SVR Color Ptyhon logger Magnet Pandas Plate TensorRT FP16 OpenCV 签证 C++ LLAMA Bert 域名 Claude uwsgi ResNet-50 GPTQ Tensor Math VSCode NLTK YOLO tar PDB Paper Vim Disk NameSilo 音频 tqdm 多线程 JSON git Crawler Git Michelin Sklearn 视频信息 EXCEL RAR 证件照 TensorFlow Algorithm Breakpoint Attention Hungarian Anaconda v2ray CTC Hilton Use CC VPN Google printf Django PDF llama.cpp UI IndexTTS2 Miniforge Jupyter DeepSeek TTS Numpy 报税 Heatmap ChatGPT Baidu 搞笑 RGB Freesound transformers BF16 净利润 Logo Domain LoRA DeepStream LLM Knowledge Card 关于博主 Windows OCR ONNX MD5 Data Bipartite Firewall 腾讯云 Conda Pillow Password OpenAI 飞书 Python Jetson scipy Interview HuggingFace BeautifulSoup PyTorch 算法题 Food Review Template FP8 QWEN SPIE LeetCode Docker BTC 阿里云 财报 Clash Permission torchinfo Land SAM Random GIT CAM Transformers Quantization Translation git-lfs Distillation COCO CUDA LaTeX Nginx Dataset Markdown Mixtral uWSGI Image2Text Linux Diagram mmap Streamlit 多进程 Augmentation InvalidArgumentError TSV SQLite 继承 XGBoost GPT4 Pickle
站点统计

本站现有博文311篇,共被浏览739983

本站已经建立2376天!

热门文章
文章归档
回到顶部