EADST

PyTorch Q4_0 Quantize and Dequantize, Aligned with llama.cpp

PyTorch Q4_0 Quantize and Dequantize, Aligned with llama.cpp

import torch

# Pick the compute device once: CUDA when available, otherwise the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")


def q4_0_quantize_and_dequantize_tensor(tensor):
    """Round-trip a tensor through llama.cpp-style Q4_0 quantization.

    The tensor is split into consecutive blocks of 32 values. Each block is
    quantized to 4-bit codes in ``[0, 15]`` with a single per-block scale
    ``d`` and immediately dequantized again, so the result carries the
    precision loss Q4_0 would introduce.

    Following the llama.cpp reference quantizer, ``d`` is derived from the
    *signed* element with the largest magnitude (``d = max / -8``), so that
    element always maps exactly onto code 0. The scale is rounded through
    float16 before dequantization because Q4_0 blocks store ``d`` as a half.

    Args:
        tensor: Tensor whose number of elements is a multiple of 32. It is
            converted to float32 and moved to the module-level ``device``.

    Returns:
        A float16 tensor with the original shape, located on ``device``.

    Raises:
        RuntimeError: If the tensor cannot be reshaped into rows of 32.
    """
    orig_shape = tensor.shape

    # reshape (rather than view) also accepts non-contiguous inputs.
    blocks = tensor.to(dtype=torch.float32, device=device).reshape(-1, 32)

    # Signed value of the largest-magnitude element in each block. argmax
    # returns the first maximal index, matching the reference's strict
    # "amax < |v|" scan.
    amax_idx = blocks.abs().argmax(dim=1, keepdim=True)
    max_vals = blocks.gather(1, amax_idx).squeeze(1)

    # Per-block scale d and its inverse; an all-zero block gets id = 0
    # instead of dividing by zero.
    d = max_vals / -8.0
    is_zero = d == 0
    safe_d = torch.where(is_zero, torch.ones_like(d), d)
    ids = torch.where(is_zero, torch.zeros_like(d), 1.0 / safe_d)

    # Scale into [-8, 8], shift by 8.5 and truncate to get codes in [0, 15].
    codes = torch.clamp(blocks * ids[:, None] + 8.5, 0, 15).to(torch.uint8)

    # Dequantize with the scale as it would be stored (float16 precision).
    d_stored = d.to(torch.float16).to(torch.float32)
    dequantized = (codes.to(torch.float32) - 8.0) * d_stored[:, None]

    # Restore the caller's shape and emit half precision.
    return dequantized.reshape(orig_shape).to(dtype=torch.float16)

# Load the checkpoint onto the CPU; each selected tensor is moved to the
# compute device only while it is being processed.
# NOTE(review): torch.load unpickles the file -- only use it on checkpoints
# from a trusted source.
model_part = torch.load("your_model_path/pytorch_model.bin", map_location="cpu")

# Substrings identifying the weight tensors that get the Q4_0 round trip;
# every other entry of the checkpoint is left unchanged.
keywords = [
    "embed_tokens.weight",
    "self_attn.q_proj.weight",
    "self_attn.k_proj.weight",
    "self_attn.v_proj.weight",
    "self_attn.o_proj.weight",
    "mlp.up_proj.weight",
    "mlp.gate_proj.weight",
    "mlp.down_proj.weight",
    "lm_head.weight",
]

# Only values of existing keys are replaced, so mutating the dict while
# iterating over it is safe.
for name, data in model_part.items():
    # any() processes each tensor at most once, even if several keywords match.
    if any(word in name for word in keywords):
        # Move the result back to the CPU so GPU memory is released per
        # tensor and the saved checkpoint does not mix CUDA and CPU tensors.
        model_part[name] = q4_0_quantize_and_dequantize_tensor(data).cpu()

# Save the updated model parts
torch.save(model_part, "pytorch_model_quantized.bin")

Reference:

相关标签
About Me
XD
Goals determine what you are going to be.
Category
标签云
RGB Dataset GIT Anaconda COCO CAM Vmess C++ Sklearn GoogLeNet ChatGPT PyCharm Crawler printf Disk Algorithm OpenCV Interview Bipartite GGML Github 报税 Math Gemma VGG-16 Qwen SPIE Pytorch Input tar TSV Zip LLM Python 音频 Bitcoin transformers Transformers ResNet-50 git Tracking UNIX Augmentation Hungarian Safetensors Website EXCEL 搞笑 Ptyhon BF16 Nginx Datetime Domain Git GPTQ Logo NLTK Knowledge Web 签证 XML Conda MD5 PyTorch FlashAttention CLAP CV diffusers BeautifulSoup Image2Text WAN Quantization Michelin CSV QWEN Ubuntu InvalidArgumentError Permission 净利润 域名 scipy API FP16 LLAMA SAM LeetCode DeepStream BTC hf 版权 VPN Animate mmap 公式 HaggingFace CEIR FP64 v2ray Diagram Cloudreve Base64 DeepSeek Plate GPT4 Streamlit Bin UI Random Hilton Firewall PDB CUDA FastAPI XGBoost 财报 Land uWSGI 阿里云 Card Paper Proxy CC ModelScope Mixtral Windows Translation Shortcut Tensor CTC Baidu 关于博主 Food Color Video Freesound Pillow Google SQL Vim VSCode Tiktoken Password Pickle SQLite torchinfo Miniforge LoRA NameSilo Hotel NLP Distillation Markdown TensorRT Numpy Use Linux Jetson 继承 Paddle SVR Data Docker 飞书 ONNX Breakpoint PDF OpenAI Llama TensorFlow RAR YOLO 证件照 AI LaTeX 腾讯云 Claude IndexTTS2 Clash Jupyter WebCrawler uwsgi Heatmap HuggingFace Quantize 多进程 Django Bert Template 算法题 FP8 多线程 v0.dev TTS JSON Qwen2 FP32 Plotly Review llama.cpp logger OCR PIP tqdm Excel Attention Pandas Statistics Magnet Qwen2.5 git-lfs
站点统计

本站现有博文311篇,共被浏览744029

本站已经建立2386天!

热门文章
文章归档
回到顶部