EADST

Pytorch Q4_0 Quantize and Dequantize aligning with llama.cpp

This post gives a PyTorch implementation of Q4_0 fake quantization (quantize then dequantize) whose block scale and rounding rules match llama.cpp's reference implementation.

import torch

# Select the compute device once at import time: prefer CUDA when present.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")

def q4_0_quantize_and_dequantize_tensor(tensor):
    """Simulate llama.cpp's Q4_0 quantize/dequantize round-trip.

    Q4_0 encodes each block of 32 values as 4-bit codes 0..15 plus one
    per-block scale ``d``.  llama.cpp derives ``d`` from the SIGNED element
    with the largest magnitude (``d = max / -8``), so the extreme value of
    every block is reconstructed exactly regardless of its sign.

    Args:
        tensor: any tensor whose element count is a multiple of 32.

    Returns:
        A float16 tensor of the original shape holding the dequantized
        values.

    Raises:
        RuntimeError: if ``tensor.numel()`` is not divisible by 32
            (the per-block ``view`` fails).
    """
    # Resolve the device locally so the function is self-contained.
    dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tensor = tensor.to(dtype=torch.float32, device=dev)

    # Process each 32-value block independently.
    orig_shape = tensor.shape
    blocks = tensor.view(-1, 32)

    # BUGFIX: llama.cpp keeps the SIGNED value with the largest magnitude
    # per block (not the absolute maximum).  Using abs() here gave ``d``
    # the wrong sign whenever the block's extreme value was negative, so
    # the round-trip no longer matched llama.cpp.
    argmax_abs = blocks.abs().argmax(dim=1, keepdim=True)
    max_signed = blocks.gather(1, argmax_abs).squeeze(1)

    # llama.cpp: d = max / -8;  id = d ? 1/d : 0.  The ``where`` reproduces
    # the zero-block rule exactly: an all-zero block quantizes to code 8
    # and dequantizes back to exactly 0.
    d = max_signed / -8.0
    ids = torch.where(d != 0, 1.0 / d, torch.zeros_like(d))

    # Shift by 8.5 so truncation toward zero rounds to the nearest code;
    # the clamp mirrors llama.cpp's MIN(15, ...) upper bound (codes 0..15).
    scaled = blocks * ids[:, None]
    quantized = torch.clamp(scaled + 8.5, 0, 15).to(torch.uint8)

    # Code 8 is the zero point: value = (code - 8) * d.
    dequantized = (quantized.float() - 8.0) * d[:, None]

    # Restore the caller's shape; emit fp16 as the storage dtype.
    return dequantized.view(orig_shape).to(dtype=torch.float16)

# Load the checkpoint onto the CPU; tensors are moved to the compute
# device inside the quantization helper.
model_part = torch.load(f"your_model_path/pytorch_model.bin", map_location="cpu")

# Only the large weight matrices get the Q4_0 treatment; everything else
# (norms, biases, ...) is left untouched.
keywords = [
    "embed_tokens.weight",
    "self_attn.q_proj.weight",
    "self_attn.k_proj.weight",
    "self_attn.v_proj.weight",
    "self_attn.o_proj.weight",
    "mlp.up_proj.weight",
    "mlp.gate_proj.weight",
    "mlp.down_proj.weight",
    "lm_head.weight"
]

# Quantize-and-dequantize every tensor whose name matches a keyword.
# (A name matching several keywords is still processed from the original
# data, so the end state is identical to processing it once.)
for name, data in model_part.items():
    if any(word in name for word in keywords):
        model_part[name] = q4_0_quantize_and_dequantize_tensor(data)

# Persist the fake-quantized state dict.
torch.save(model_part, "pytorch_model_quantized.bin")

Reference:

相关标签
About Me
XD
Goals determine what you are going to be.
Category
标签云
scipy Zip Math HaggingFace Card TensorFlow UNIX 净利润 Nginx Use Sklearn Docker Land Numpy Github CTC 关于博主 FP16 Bipartite 搞笑 Diagram Mixtral Knowledge Interview GGML SQL FP32 VGG-16 Augmentation YOLO Transformers Firewall Shortcut Qwen2 diffusers Input Rebuttal 签证 FlashAttention Baidu Hotel transformers Quantization Google Logo HuggingFace uwsgi torchinfo Image2Text TTS NLTK GPTQ Tracking XML Git Dataset Food CSV Qwen 音频 Tiktoken Algorithm CAM Disk Password printf mmap Python CEIR TensorRT Heatmap Pickle WebCrawler ResNet-50 Django Paper Safetensors GoogLeNet Qwen2.5 GPT4 UI OpenCV MD5 Llama COCO Plate Excel SVR Claude Linux 图标 顶会 Base64 ChatGPT GIT Datetime CC BTC Gemma OpenAI RAR SQLite 强化学习 Paddle LLM JSON 版权 Data Search BeautifulSoup Ubuntu Bitcoin 第一性原理 git Animate git-lfs Tensor Ptyhon RGB 证件照 PDB Cloudreve Domain 多线程 LoRA LeetCode uWSGI VSCode v2ray Distillation PyCharm Attention 多进程 SPIE logger CUDA v0.dev LaTeX Conda Anaconda Streamlit WAN IndexTTS2 Pandas llama.cpp Statistics Translation 云服务器 NameSilo tqdm EXCEL Review Hungarian 腾讯云 Pillow 算法题 News 飞书 Clash ONNX QWEN NLP Crawler SAM 报税 VPN OCR Template FastAPI Permission tar 继承 hf TSV ModelScope Freesound 阿里云 Website icon Random Plotly Quantize BF16 DeepSeek DeepStream Bert PDF Windows Bin PIP XGBoost PyTorch InvalidArgumentError 图形思考法 Markdown Hilton API Jetson 财报 Video FP64 域名 Agent 递归学习法 FP8 C++ Miniforge Proxy LLAMA 公式 Web Color Vmess Jupyter Pytorch AI CV Magnet Michelin Vim CLAP Breakpoint
站点统计

本站现有博文323篇,共被浏览801138

本站已经建立2500天!

热门文章
文章归档
回到顶部