EADST

PyTorch Q4_0 Quantization and Dequantization Aligned with llama.cpp

PyTorch Q4_0 Quantization and Dequantization Aligned with llama.cpp

import torch

# Run the quantization math on the GPU when one is available, else on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")


def q4_0_quantize_and_dequantize_tensor(tensor):
    """Round-trip a tensor through Q4_0 quantization, following llama.cpp.

    The tensor is flattened into consecutive blocks of 32 values. Each block
    gets one scale ``d`` and every value is mapped to a 4-bit code in
    ``[0, 15]``; the codes are then immediately decoded again, so the result
    carries the precision loss Q4_0 storage would introduce.

    Args:
        tensor: Tensor whose number of elements is a multiple of 32. It is
            converted to float32 and moved to the module-level ``device``.

    Returns:
        A float16 tensor on ``device`` with the same shape as ``tensor``.

    Raises:
        RuntimeError: If the element count is not a multiple of 32 (raised
            by ``reshape``).
    """
    tensor = tensor.to(dtype=torch.float32, device=device)

    # Split into independent 32-value blocks. reshape (rather than view) also
    # accepts non-contiguous inputs such as transposed weights.
    orig_shape = tensor.shape
    blocks = tensor.reshape(-1, 32)

    # llama.cpp derives the scale from the *signed* element of largest
    # magnitude, not from the magnitude itself, so the sign of d depends on
    # the block. argmax returns the first maximal index, which matches the
    # strict `amax < fabsf(v)` comparison in the reference loop.
    extreme_idx = torch.argmax(torch.abs(blocks), dim=1, keepdim=True)
    max_vals = torch.gather(blocks, 1, extreme_idx).squeeze(1)

    # d is the per-block scale, ids its inverse; an all-zero block has d == 0
    # and uses an inverse of 0 (llama.cpp: `id = d ? 1.0f/d : 0.0f`).
    d = max_vals / -8.0
    ids = torch.where(d != 0, 1.0 / d, torch.zeros_like(d))

    # Shift by 8.5 and truncate to get codes; the clamp to 15 mirrors
    # llama.cpp's MIN(15, ...) for values that land on 16.
    scaled = blocks * ids[:, None]
    quantized = torch.clamp(scaled + 8.5, 0, 15).to(torch.uint8)

    # Q4_0 stores d as fp16, so decoding must use the fp16-rounded scale.
    d_stored = d.to(torch.float16).to(torch.float32)
    dequantized = (quantized.to(torch.float32) - 8.0) * d_stored[:, None]

    # Restore the caller's shape and emit half precision.
    return dequantized.view(orig_shape).to(dtype=torch.float16)

# Load the checkpoint onto the CPU; each selected tensor is moved to the
# compute device inside q4_0_quantize_and_dequantize_tensor.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from
# a trusted source (consider weights_only=True if the installed PyTorch
# supports it).
model_part = torch.load("your_model_path/pytorch_model.bin", map_location="cpu")

# Substrings identifying the weight tensors that get the Q4_0 round trip;
# every other entry of the checkpoint is saved unchanged.
keywords = [
    "embed_tokens.weight",
    "self_attn.q_proj.weight",
    "self_attn.k_proj.weight",
    "self_attn.v_proj.weight",
    "self_attn.o_proj.weight",
    "mlp.up_proj.weight",
    "mlp.gate_proj.weight",
    "mlp.down_proj.weight",
    "lm_head.weight",
]

# Only values of existing keys are replaced, so mutating the dict while
# iterating over it is safe.
for name, data in model_part.items():
    # any() stops at the first matching keyword, so a tensor is processed
    # at most once even if its name matches several keywords.
    if any(word in name for word in keywords):
        # Move the result back to the CPU so the saved checkpoint stays on a
        # single device and GPU memory is not held for every tensor.
        model_part[name] = q4_0_quantize_and_dequantize_tensor(data).cpu()

# Save the updated model parts
torch.save(model_part, "pytorch_model_quantized.bin")

Reference:

相关标签
About Me
XD
Goals determine what you are going to be.
Category
标签云
diffusers CV Shortcut Pillow Ptyhon Baidu Dataset FP64 JSON COCO Claude Plate Data LLAMA AI uwsgi Distillation Bitcoin Github Jupyter 多线程 VPN Crawler XML Website torchinfo Clash Pandas Paddle CLAP Math GPTQ HuggingFace git Transformers ONNX Bipartite Color Datetime Docker Quantization Land 递归学习法 Hilton CUDA Breakpoint PDB MD5 SPIE Web v0.dev Michelin Search Nginx hf C++ Translation 论文 InvalidArgumentError LeetCode Windows 版权 TTS SAM Conda RGB scipy NameSilo BeautifulSoup VGG-16 Miniforge Python Augmentation OCR Random Sklearn Password SVR Django Bin transformers Rebuttal API logger Domain 论文速读 Pickle Streamlit Freesound 报税 Firewall llama.cpp Magnet icon 关于博主 SQLite WAN v2ray Algorithm 域名 EXCEL Qwen WebCrawler Diagram PDF Quantize Llama 飞书 Knowledge VSCode TSV ResNet-50 UI FlashAttention TensorRT UNIX Logo Tensor CSV ChatGPT QWEN Zip LLM 顶会 DeepStream Interview PyTorch printf Tracking BF16 RAR 公式 多进程 Gemma 净利润 Attention HaggingFace Statistics Pytorch Vmess Use Qwen2.5 NLTK Cloudreve Numpy mmap 证件照 Heatmap tqdm Template Linux Anaconda 签证 Agent LaTeX DeepSeek News Plotly 强化学习 Review PIP CAM Base64 图标 算法题 Food NLP CTC Image2Text 搞笑 Paper Excel 财报 Hungarian Proxy Vim FastAPI OpenAI 云服务器 GPT4 PyCharm Markdown Bert IndexTTS2 阿里云 GIT Qwen2 腾讯云 YOLO LoRA Ubuntu SQL uWSGI Safetensors 图形思考法 继承 git-lfs 第一性原理 CC 音频 Tiktoken Input Animate FP8 Disk GGML OpenCV Hotel Git FP32 BTC Google Permission Mixtral TensorFlow Video tar CEIR ModelScope XGBoost GoogLeNet Jetson FP16 Card
站点统计

本站现有博文328篇,共被浏览839119

本站已经建立2543天!

热门文章
文章归档
回到顶部