EADST

PyTorch Q4_1 Quantization and Dequantization Aligned with llama.cpp

PyTorch Q4_1 Quantization and Dequantization Aligned with llama.cpp

import torch

# Run the quantization math on the GPU when one is available, else on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")


def q4_1_quantize_and_dequantize_tensor(tensor):
    """Round-trip *tensor* through Q4_1 quantization and return the result.

    The tensor is flattened in row-major order and split into blocks of 32
    values. Each block is encoded as a scale ``d``, a minimum ``m`` and one
    4-bit code (0..15) per value, then immediately decoded again as
    ``code * d + m``. Following llama.cpp's Q4_1 block layout, ``d`` and ``m``
    are rounded to float16 before decoding, while the codes themselves are
    computed from the float32 values.

    Args:
        tensor: Tensor whose number of elements is a multiple of 32.

    Returns:
        A float16 tensor with the same shape as *tensor*, located on the
        module-level ``device``.

    Raises:
        RuntimeError: If the number of elements is not a multiple of 32
            (raised by ``reshape``).
    """
    block_size = 32          # values per Q4_1 block
    max_code = 2**4 - 1      # largest 4-bit code

    orig_shape = tensor.shape

    # reshape (not view) so non-contiguous inputs, e.g. a transposed weight,
    # are accepted; each row is one 32-value block.
    blocks = tensor.to(dtype=torch.float32, device=device).reshape(-1, block_size)

    # Per-block minimum and maximum, kept as (n_blocks, 1) for broadcasting.
    min_vals = blocks.min(dim=1, keepdim=True).values
    max_vals = blocks.max(dim=1, keepdim=True).values

    # Per-block step size; a constant block yields d == 0.
    d = (max_vals - min_vals) / max_code

    # Inverse step, using 0 for constant blocks so every code becomes 0 and
    # no division by zero leaks into the result.
    inv_d = torch.where(d != 0, 1.0 / d, torch.zeros_like(d))

    # Round to nearest code by adding 0.5 and truncating, clamped to 4 bits.
    codes = torch.clamp((blocks - min_vals) * inv_d + 0.5, 0, max_code).to(torch.uint8)

    # The stored scale and minimum are float16, so decode from the rounded
    # values rather than the float32 originals.
    d_stored = d.to(torch.float16).to(torch.float32)
    m_stored = min_vals.to(torch.float16).to(torch.float32)

    dequantized = codes.to(torch.float32) * d_stored + m_stored

    # Restore the caller's shape and emit float16.
    return dequantized.reshape(orig_shape).to(dtype=torch.float16)

# Load the checkpoint with every tensor mapped onto the CPU.
# NOTE(security): torch.load unpickles the file; only load checkpoints from a
# trusted source.
model_part = torch.load("your_model_path/pytorch_model.bin", map_location="cpu")

# Substrings identifying the weight tensors that get the Q4_1 round trip;
# every other entry in the checkpoint is left untouched.
keywords = [
    "embed_tokens.weight",
    "self_attn.q_proj.weight",
    "self_attn.k_proj.weight",
    "self_attn.v_proj.weight",
    "self_attn.o_proj.weight",
    "mlp.up_proj.weight",
    "mlp.gate_proj.weight",
    "mlp.down_proj.weight",
    "lm_head.weight",
]

# Only values of existing keys are replaced, so the dict's size does not
# change while it is being iterated.
for name, data in model_part.items():
    if any(word in name for word in keywords):
        # Quantize and dequantize the entire tensor, then move the result back
        # to the CPU so GPU memory is released and the saved checkpoint does
        # not contain CUDA tensors.
        model_part[name] = q4_1_quantize_and_dequantize_tensor(data).cpu()

# Save the updated model parts
torch.save(model_part, "pytorch_model_quantized.bin")

Reference:

相关标签
About Me
XD
Goals determine what you are going to be.
Category
标签云
VPN Transformers uWSGI XML Knowledge DeepStream git Qwen Quantize ModelScope tar FlashAttention 递归学习法 论文 Bitcoin Qwen2.5 Food ONNX C++ GIT 域名 CC Pandas Tracking Paddle 继承 News Safetensors Gemma SAM FP16 hf GoogLeNet LLM Diagram Michelin Anaconda TensorRT Hungarian Baidu OCR 图标 Tiktoken Password Jetson logger Plotly Domain CSV Linux AI EXCEL 财报 Attention GPT4 公式 Pytorch Search Template LaTeX WebCrawler COCO NLP Django Magnet SQL Pickle CUDA Agent YOLO Dataset OpenCV GGML 算法题 Land TTS tqdm 图形思考法 云服务器 多进程 多线程 Use Shortcut Hilton llama.cpp mmap Qwen2 Github Jupyter Statistics Math PDF Animate Card 第一性原理 ChatGPT Markdown Nginx icon QWEN Vmess PyCharm Heatmap Conda 关于博主 transformers uwsgi FastAPI DeepSeek Data TSV RAR HuggingFace Bert SPIE 签证 GPTQ torchinfo 报税 Datetime RGB 版权 Color WAN Zip FP64 Cloudreve Docker Input Interview Numpy PIP Breakpoint CTC Image2Text SQLite Base64 Website Pillow v0.dev Vim Python Hotel LoRA LLAMA Firewall git-lfs Web BeautifulSoup Permission 证件照 UNIX Augmentation VSCode printf NLTK Rebuttal CLAP MD5 Crawler Miniforge 净利润 BF16 HaggingFace Streamlit 顶会 Paper scipy diffusers Disk Tensor Ptyhon Claude API Algorithm Ubuntu UI InvalidArgumentError Plate PyTorch Distillation Review VGG-16 Logo Video Bipartite OpenAI Translation JSON CV Clash CEIR 阿里云 FP8 PDB Google Excel Quantization Llama TensorFlow Freesound IndexTTS2 SVR CAM BTC Random 音频 Sklearn Git FP32 NameSilo XGBoost ResNet-50 搞笑 强化学习 LeetCode Proxy 论文速读 腾讯云 Windows Mixtral v2ray 飞书 Bin
站点统计

本站现有博文328篇,共被浏览839118

本站已经建立2543天!

热门文章
文章归档
回到顶部