File size: 245,254 Bytes

/* ═══════════════════════════════════════════════════════════════════════════
 * hexstate_quantize.c — HexState GGUF Quantizer
 *
 * ╔═══════════════════════════════════════════════════════════════╗
 * ║  HPC-Optimized GGUF Quantization Engine                      ║
 * ║                                                               ║
 * ║  Architecture: HPCGraph Sensitivity Propagation               ║
 * ║  Optimization: Complex Amplitude BP + MCMC Scale Search       ║
 * ║  Enhancements: MSE Grid Search, Importance Matrix Weighting   ║
 * ║  Output: GGUF v3 (Q2_K)                                       ║
 * ║                                                               ║
 * ║  "The weight and the quantized are opposite faces."           ║
 * ╚═══════════════════════════════════════════════════════════════╝
 *
 * This tool adapts the HExState HPC Ouroboros factoring engine for
 * LLM weight quantization. The core mathematical machinery is reused:
 *
 * Factoring Domain          →  Quantization Domain
 * ─────────────────────────────────────────────────
 * HPCGraph + CZ edges       →  Block sensitivity graph
 * Complex Amplitude BP      →  Importance propagation
 * MCMC period sampler       →  Optimal scale search
 * try_period() validation   →  Error bound checking
 * LLL lattice reduction     →  (future) Adaptive bit allocation
 *
 * Additional techniques ported from llm-compressor:
 * MSE grid search           →  Optimal min/max range shrinking
 * Importance matrix (imatrix) →  Per-channel error weighting
 *
 * Build:
 * make -f Makefile.quantize
 *
 * Usage:
 * ./hexstate_quantize <input> <output.gguf> [options]
 *
 * Input can be:
 * - A single .safetensors file
 * - A model directory containing sharded .safetensors files
 *
 * Options:
 * --optimizer hpc|mse|hybrid   Scale optimization strategy (default: hybrid)
 * --imatrix <file>             Importance matrix for weighted quantization
 * --verbose                    Per-block diagnostics
 * ═══════════════════════════════════════════════════════════════════════════ */

#include <stdio.h>
#ifdef _OPENMP
#include <omp.h>
#endif
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <sys/stat.h>
#include <mpfr.h>

/* HExState headers — reused from the factoring engine */
#include "quhit_triality.h"
#include "hpc_graph.h"
#include "hpc_mobius.h"
#include "s6_exotic.h"

/* Quantization-specific headers */
#include "gguf_format.h"
#include "safetensors_reader.h"
#include "tokenizer_reader.h"
#include "imatrix_reader.h"

#define D 6  /* Preserved from HExState — the triality dimension */

/* ═══════════════════════════════════════════════════════════════════════════
 * OPTIMIZER MODE
 * ═══════════════════════════════════════════════════════════════════════════ */

typedef enum {
    OPT_HPC,     /* HExState BP only          */
    OPT_MSE,     /* MSE grid search only      */
    OPT_HYBRID   /* HPC sensitivity + MSE     */
} OptimizerMode;

/* ═══════════════════════════════════════════════════════════════════════════
 * MODEL ARCHITECTURE AUTO-DETECTION
 *
 * Infers model architecture metadata from tensor names and shapes.
 * Supports: LLaMA, Mistral, Qwen2, Phi-3, Gemma, GPT-NeoX, Falcon, DeepSeek
 * ═══════════════════════════════════════════════════════════════════════════ */

typedef struct {
    char     architecture[64];   /* "llama", "phi3", "gemma", etc.  */
    char     name[256];          /* Human-readable model name       */
    uint32_t block_count;        /* Number of transformer layers    */
    uint32_t embedding_length;   /* Hidden dimension                */
    uint32_t head_count;         /* Number of attention heads       */
    uint32_t head_count_kv;      /* Number of KV heads (GQA)        */
    uint32_t vocab_size;         /* Vocabulary size                 */
    uint32_t context_length;     /* Max context length (default)    */
    float    rope_freq_base;     /* RoPE frequency base             */
    uint32_t feed_forward_length; /* FFN intermediate size           */
    float    rms_norm_eps;       /* RMS norm epsilon                */
    int      has_bias;           /* Whether attention has biases    */
    int      tie_word_embeddings; /* Whether output = embed_tokens  */
} ModelArchitecture;

/* Count tensor names matching a pattern prefix */
static int count_tensors_with_prefix(const STMultiFile *mf, const char *prefix)
{
    int count = 0;
    int prefix_len = strlen(prefix);
    for (int i = 0; i < mf->n_tensors; i++) {
        if (strncmp(mf->tensor_map[i].name, prefix, prefix_len) == 0)
            count++;
    }
    return count;
}

/* Find max layer index from tensor names like "model.layers.N.xxx" */
static int find_max_layer_index(const STMultiFile *mf, const char *layer_prefix)
{
    int max_idx = -1;
    int prefix_len = strlen(layer_prefix);
    for (int i = 0; i < mf->n_tensors; i++) {
        if (strncmp(mf->tensor_map[i].name, layer_prefix, prefix_len) == 0) {
            int idx = atoi(mf->tensor_map[i].name + prefix_len);
            if (idx > max_idx) max_idx = idx;
        }
    }
    return max_idx;
}

/* ── Config.json reader for definitive architecture parameters ── */

typedef struct {
    int      valid;
    uint32_t hidden_size;
    uint32_t intermediate_size;
    uint32_t num_attention_heads;
    uint32_t num_key_value_heads;
    uint32_t num_hidden_layers;
    uint32_t vocab_size;
    uint32_t max_position_embeddings;
    float    rope_theta;
    float    rms_norm_eps;
    char     model_type[64];
    int      tie_word_embeddings;
} ConfigJson;

static ConfigJson parse_config_json(const char *path)
{
    ConfigJson cfg;
    memset(&cfg, 0, sizeof(cfg));

    FILE *f = fopen(path, "rb");
    if (!f) return cfg;

    fseek(f, 0, SEEK_END);
    long size = ftell(f);
    fseek(f, 0, SEEK_SET);
    if (size <= 0) { fclose(f); return cfg; }

    char *json = (char *)malloc((size_t)size + 1);
    if (!json) { fclose(f); return cfg; }
    size_t nread = fread(json, 1, (size_t)size, f);
    json[nread] = '\0';
    fclose(f);
    if (nread == 0) { free(json); return cfg; }

    cfg.valid = 1;

    /* Simple key-value extraction */
    const char *p;

    p = tok_find_key(json, "hidden_size");
    if (p) cfg.hidden_size = (uint32_t)strtol(p, NULL, 10);

    p = tok_find_key(json, "intermediate_size");
    if (p) cfg.intermediate_size = (uint32_t)strtol(p, NULL, 10);

    p = tok_find_key(json, "num_attention_heads");
    if (p) cfg.num_attention_heads = (uint32_t)strtol(p, NULL, 10);

    p = tok_find_key(json, "num_key_value_heads");
    if (p) cfg.num_key_value_heads = (uint32_t)strtol(p, NULL, 10);

    p = tok_find_key(json, "num_hidden_layers");
    if (p) cfg.num_hidden_layers = (uint32_t)strtol(p, NULL, 10);

    p = tok_find_key(json, "vocab_size");
    if (p) cfg.vocab_size = (uint32_t)strtol(p, NULL, 10);

    p = tok_find_key(json, "max_position_embeddings");
    if (p) cfg.max_position_embeddings = (uint32_t)strtol(p, NULL, 10);

    p = tok_find_key(json, "rope_theta");
    if (p) cfg.rope_theta = (float)strtod(p, NULL);

    p = tok_find_key(json, "rms_norm_eps");
    if (p) cfg.rms_norm_eps = (float)strtod(p, NULL);

    p = tok_find_key(json, "model_type");
    if (p && *p == '"') {
        char buf[64];
        tok_extract_string(p, buf, sizeof(buf));
        strncpy(cfg.model_type, buf, sizeof(cfg.model_type) - 1);
    }

    p = tok_find_key(json, "tie_word_embeddings");
    if (p) cfg.tie_word_embeddings = (strncmp(p, "true", 4) == 0);

    /* ── Qwen 3.5/3.6: parameters are nested inside "text_config" ── */
    if (cfg.hidden_size == 0) {
        const char *tc = strstr(json, "\"text_config\"");
        if (tc) {
            const char *tc_brace = strchr(tc, '{');
            if (tc_brace) {
                p = tok_find_key(tc_brace, "hidden_size");
                if (p) cfg.hidden_size = (uint32_t)strtol(p, NULL, 10);
                p = tok_find_key(tc_brace, "intermediate_size");
                if (p) cfg.intermediate_size = (uint32_t)strtol(p, NULL, 10);
                p = tok_find_key(tc_brace, "num_attention_heads");
                if (p) cfg.num_attention_heads = (uint32_t)strtol(p, NULL, 10);
                p = tok_find_key(tc_brace, "num_key_value_heads");
                if (p) cfg.num_key_value_heads = (uint32_t)strtol(p, NULL, 10);
                p = tok_find_key(tc_brace, "num_hidden_layers");
                if (p) cfg.num_hidden_layers = (uint32_t)strtol(p, NULL, 10);
                p = tok_find_key(tc_brace, "vocab_size");
                if (p) cfg.vocab_size = (uint32_t)strtol(p, NULL, 10);
                p = tok_find_key(tc_brace, "max_position_embeddings");
                if (p) cfg.max_position_embeddings = (uint32_t)strtol(p, NULL, 10);
                p = tok_find_key(tc_brace, "rms_norm_eps");
                if (p) cfg.rms_norm_eps = (float)strtod(p, NULL);
                p = tok_find_key(tc_brace, "model_type");
                if (p && *p == '"') {
                    char buf2[64];
                    tok_extract_string(p, buf2, sizeof(buf2));
                    strncpy(cfg.model_type, buf2, sizeof(cfg.model_type) - 1);
                }
                p = tok_find_key(tc_brace, "tie_word_embeddings");
                if (p) cfg.tie_word_embeddings = (strncmp(p, "true", 4) == 0);
                /* Qwen3.6 rope_theta is nested in rope_parameters */
                const char *rp = strstr(tc_brace, "\"rope_parameters\"");
                if (rp) {
                    p = tok_find_key(rp, "rope_theta");
                    if (p) cfg.rope_theta = (float)strtod(p, NULL);
                }
            }
        }
    }

    free(json);
    return cfg;
}

static void detect_architecture(const STMultiFile *mf, ModelArchitecture *arch,
                                  const char *config_json_path)
{
    memset(arch, 0, sizeof(*arch));

    /* Default values */
    strcpy(arch->architecture, "llama");
    strcpy(arch->name, "HExState-quantized");
    arch->context_length = 4096;
    arch->rope_freq_base = 10000.0f;
    arch->rms_norm_eps = 1e-5f;

    /* ── Try config.json for definitive parameters ── */
    ConfigJson cfg = {0};
    if (config_json_path) {
        cfg = parse_config_json(config_json_path);
    }

    if (cfg.valid) {
        /* Map model_type to GGUF architecture name */
        if (strcmp(cfg.model_type, "llama") == 0 ||
            strcmp(cfg.model_type, "mistral") == 0) {
            strcpy(arch->architecture, "llama");
        } else if (strcmp(cfg.model_type, "qwen2") == 0) {
            strcpy(arch->architecture, "qwen2");
        } else if (strcmp(cfg.model_type, "qwen2_moe") == 0) {
            strcpy(arch->architecture, "qwen2moe");
        } else if (strcmp(cfg.model_type, "qwen3_5") == 0 ||
                   strcmp(cfg.model_type, "qwen3_5_text") == 0 ||
                   strcmp(cfg.model_type, "qwen3_5_moe") == 0) {
            strcpy(arch->architecture, "qwen2");  /* GGUF arch: qwen2 compat */
        } else if (strcmp(cfg.model_type, "phi3") == 0 ||
                   strcmp(cfg.model_type, "phi") == 0) {
            strcpy(arch->architecture, "phi3");
        } else if (strcmp(cfg.model_type, "gemma") == 0 ||
                   strcmp(cfg.model_type, "gemma2") == 0) {
            strcpy(arch->architecture, "gemma");
        } else if (strcmp(cfg.model_type, "deepseek_v2") == 0) {
            strcpy(arch->architecture, "llama");
        } else if (strcmp(cfg.model_type, "gpt_neox") == 0) {
            strcpy(arch->architecture, "gpt_neox");
        } else if (strcmp(cfg.model_type, "falcon") == 0) {
            strcpy(arch->architecture, "falcon");
        } else if (cfg.model_type[0]) {
            /* Unknown — try llama as fallback */
            strcpy(arch->architecture, "llama");
        }

        if (cfg.hidden_size) arch->embedding_length = cfg.hidden_size;
        if (cfg.intermediate_size) arch->feed_forward_length = cfg.intermediate_size;
        if (cfg.num_attention_heads) arch->head_count = cfg.num_attention_heads;
        if (cfg.num_key_value_heads) arch->head_count_kv = cfg.num_key_value_heads;
        if (cfg.num_hidden_layers) arch->block_count = cfg.num_hidden_layers;
        if (cfg.vocab_size) arch->vocab_size = cfg.vocab_size;
        if (cfg.max_position_embeddings) arch->context_length = cfg.max_position_embeddings;
        if (cfg.rope_theta > 0) arch->rope_freq_base = cfg.rope_theta;
        if (cfg.rms_norm_eps > 0) arch->rms_norm_eps = cfg.rms_norm_eps;
        arch->tie_word_embeddings = cfg.tie_word_embeddings;

        printf("  Architecture determined from config.json: %s\n", cfg.model_type);
    }

    /* ── Fall back to tensor name pattern detection ── */
    int has_model_layers = count_tensors_with_prefix(mf, "model.layers.");
    int has_gpt_neox = count_tensors_with_prefix(mf, "gpt_neox.");
    int has_transformer = count_tensors_with_prefix(mf, "transformer.");

    /* Architecture-specific detection */
    int has_qkv_proj = count_tensors_with_prefix(mf, "model.layers.0.self_attn.qkv_proj");
    int has_kv_a_proj = count_tensors_with_prefix(mf, "model.layers.0.self_attn.kv_a_proj_with_mqa");
    int has_final_norm = (st_multi_find_tensor(mf, "model.final_norm.weight") >= 0);

    if (has_qkv_proj > 0 && !cfg.valid) {
        strcpy(arch->architecture, "phi3");
    } else if (has_kv_a_proj > 0 && !cfg.valid) {
        strcpy(arch->architecture, "llama");  /* DeepSeek uses llama arch */
    } else if (has_final_norm && !cfg.valid) {
        strcpy(arch->architecture, "gemma");
    }

    if (has_model_layers > 0 && arch->block_count == 0) {
        arch->block_count = find_max_layer_index(mf, "model.layers.") + 1;
    }

    /* Infer dimensions from tensor shapes if not from config.json */
    if (arch->embedding_length == 0 || arch->head_count == 0) {
        int qproj_idx = st_multi_find_tensor(mf, "model.layers.0.self_attn.q_proj.weight");
        int kproj_idx = st_multi_find_tensor(mf, "model.layers.0.self_attn.k_proj.weight");

        if (qproj_idx >= 0) {
            const STTensorInfo *ti = st_multi_tensor_info(mf, qproj_idx);
            int64_t q_out = ti->shape[0];
            int64_t hidden = ti->shape[1];
            if (arch->embedding_length == 0) arch->embedding_length = hidden;

            /* Try common head dimensions: 128, 64, 96 */
            int head_dim = 128;
            if (q_out % 128 == 0) head_dim = 128;
            else if (q_out % 96 == 0) head_dim = 96;
            else if (q_out % 64 == 0) head_dim = 64;

            if (arch->head_count == 0) arch->head_count = q_out / head_dim;

            if (kproj_idx >= 0 && arch->head_count_kv == 0) {
                const STTensorInfo *kt = st_multi_tensor_info(mf, kproj_idx);
                arch->head_count_kv = kt->shape[0] / head_dim;
            }
        }
    }

    if (arch->vocab_size == 0) {
        int embed_idx = st_multi_find_tensor(mf, "model.embed_tokens.weight");
        if (embed_idx >= 0) {
            const STTensorInfo *ti = st_multi_tensor_info(mf, embed_idx);
            arch->vocab_size = ti->shape[0];
        }
    }

    if (arch->feed_forward_length == 0) {
        int gate_idx = st_multi_find_tensor(mf, "model.layers.0.mlp.gate_proj.weight");
        if (gate_idx >= 0) {
            const STTensorInfo *ti = st_multi_tensor_info(mf, gate_idx);
            arch->feed_forward_length = ti->shape[0];
        } else {
            int up_idx = st_multi_find_tensor(mf, "model.layers.0.mlp.up_proj.weight");
            if (up_idx >= 0) {
                const STTensorInfo *ti = st_multi_tensor_info(mf, up_idx);
                arch->feed_forward_length = ti->shape[0];
            }
        }
    }

    /* Check for attention bias */
    arch->has_bias = (st_multi_find_tensor(mf, "model.layers.0.self_attn.q_proj.bias") >= 0);

    if (has_gpt_neox > 0 && arch->block_count == 0) {
        strcpy(arch->architecture, "gpt_neox");
        arch->block_count = find_max_layer_index(mf, "gpt_neox.layers.") + 1;
    }
    if (has_transformer > 0 && arch->block_count == 0) {
        strcpy(arch->architecture, "falcon");
        arch->block_count = find_max_layer_index(mf, "transformer.h.") + 1;
    }

    /* Fill in defaults for anything we couldn't detect */
    if (arch->head_count == 0) arch->head_count = 32;
    if (arch->head_count_kv == 0) arch->head_count_kv = arch->head_count;
    if (arch->embedding_length == 0) arch->embedding_length = 4096;
    if (arch->vocab_size == 0) arch->vocab_size = 32000;
    if (arch->feed_forward_length == 0)
        arch->feed_forward_length = (arch->embedding_length * 8) / 3;  /* SwiGLU default */
}

/* ═══════════════════════════════════════════════════════════════════════════
 * TENSOR NAME MAPPING: HuggingFace → GGUF Standard
 *
 * Maps SafeTensors tensor names to the standardized GGUF naming
 * convention used by llama.cpp for model loading.
 *
 * Enhanced with mappings for Phi-3, Gemma, DeepSeek, MoE, and bias tensors.
 * ═══════════════════════════════════════════════════════════════════════════ */

/* Returns 1 if this tensor should be skipped (not written to GGUF) */
static int should_skip_tensor(const char *hf_name)
{
    /* Rotary embeddings are computed at runtime, not stored */
    if (strstr(hf_name, "rotary_emb.inv_freq") != NULL) return 1;
    if (strstr(hf_name, "rotary_emb.cos_cached") != NULL) return 1;
    if (strstr(hf_name, "rotary_emb.sin_cached") != NULL) return 1;
    /* Qwen 3.6 vision encoder — skip all visual.* tensors */
    if (strncmp(hf_name, "model.visual.", 13) == 0) return 1;
    if (strncmp(hf_name, "visual.", 7) == 0) return 1;
    /* MTP (multi-token prediction) layers — not needed for inference */
    if (strstr(hf_name, "model.language_model.mtp_") != NULL) return 1;
    return 0;
}

static void map_tensor_name(const char *hf_name, char *gguf_name, int buflen)
{
    /* Start with identity mapping */
    strncpy(gguf_name, hf_name, buflen - 1);
    gguf_name[buflen - 1] = '\0';

    /* Top-level mappings (common to all architectures) */
    struct { const char *from; const char *to; } mappings[] = {
        {"model.embed_tokens.weight",              "token_embd.weight"},
        {"model.language_model.embed_tokens.weight","token_embd.weight"},  /* Qwen 3.6 */
        {"model.norm.weight",                      "output_norm.weight"},
        {"model.language_model.norm.weight",        "output_norm.weight"},  /* Qwen 3.6 */
        {"model.final_norm.weight",                "output_norm.weight"},  /* Gemma */
        {"lm_head.weight",                         "output.weight"},
        {"model.embed_tokens.bias",                "token_embd.bias"},
        {"model.norm.bias",                        "output_norm.bias"},
        {NULL, NULL}
    };

    for (int m = 0; mappings[m].from; m++) {
        if (strcmp(hf_name, mappings[m].from) == 0) {
            strncpy(gguf_name, mappings[m].to, buflen - 1);
            return;
        }
    }

    /* Layer mappings: "model.layers.N.xxx" or "model.language_model.layers.N.xxx" → "blk.N.xxx" */
    const char *layer_prefix = NULL;
    if (strncmp(hf_name, "model.layers.", 13) == 0)
        layer_prefix = hf_name + 13;
    else if (strncmp(hf_name, "model.language_model.layers.", 27) == 0)
        layer_prefix = hf_name + 27;

    if (layer_prefix) {
        int layer_idx;
        char rest[ST_MAX_NAME_LEN];
        if (sscanf(layer_prefix, "%d.%255s", &layer_idx, rest) == 2) {
            /* Map sublayer names */
            struct { const char *from; const char *to; } layer_maps[] = {
                /* Standard attention projections */
                {"self_attn.q_proj.weight",         "attn_q.weight"},
                {"self_attn.k_proj.weight",         "attn_k.weight"},
                {"self_attn.v_proj.weight",         "attn_v.weight"},
                {"self_attn.o_proj.weight",         "attn_output.weight"},
                /* Attention biases */
                {"self_attn.q_proj.bias",           "attn_q.bias"},
                {"self_attn.k_proj.bias",           "attn_k.bias"},
                {"self_attn.v_proj.bias",           "attn_v.bias"},
                {"self_attn.o_proj.bias",           "attn_output.bias"},
                /* Phi-3 fused QKV */
                {"self_attn.qkv_proj.weight",       "attn_qkv.weight"},
                {"self_attn.qkv_proj.bias",         "attn_qkv.bias"},
                /* DeepSeek MLA */
                {"self_attn.kv_a_proj_with_mqa.weight", "attn_kv_a_mqa.weight"},
                {"self_attn.kv_b_proj.weight",      "attn_kv_b.weight"},
                /* Standard FFN (SwiGLU) */
                {"mlp.gate_proj.weight",            "ffn_gate.weight"},
                {"mlp.up_proj.weight",              "ffn_up.weight"},
                {"mlp.down_proj.weight",            "ffn_down.weight"},
                /* FFN biases */
                {"mlp.gate_proj.bias",              "ffn_gate.bias"},
                {"mlp.up_proj.bias",                "ffn_up.bias"},
                {"mlp.down_proj.bias",              "ffn_down.bias"},
                /* MoE gate */
                {"mlp.gate.weight",                 "ffn_gate_inp.weight"},
                /* MoE expert weights */
                {"mlp.experts.gate_proj.weight",    "ffn_gate_exps.weight"},
                {"mlp.experts.up_proj.weight",      "ffn_up_exps.weight"},
                {"mlp.experts.down_proj.weight",    "ffn_down_exps.weight"},
                /* Norm layers */
                {"input_layernorm.weight",          "attn_norm.weight"},
                {"post_attention_layernorm.weight", "ffn_norm.weight"},
                {"input_layernorm.bias",            "attn_norm.bias"},
                {"post_attention_layernorm.bias",   "ffn_norm.bias"},
                /* Gemma pre/post feedforward norm */
                {"pre_feedforward_layernorm.weight", "ffn_norm.weight"},
                {"post_feedforward_layernorm.weight", "ffn_post_norm.weight"},
                /* Qwen 3.6 full attention QK norms */
                {"self_attn.q_norm.weight",         "attn_q_norm.weight"},
                {"self_attn.k_norm.weight",         "attn_k_norm.weight"},
                /* Qwen 3.6 DeltaNet (Gated Linear Attention) */
                {"linear_attn.in_proj_qkv.weight", "ssm_in_qkv.weight"},
                {"linear_attn.in_proj_z.weight",   "ssm_in_z.weight"},
                {"linear_attn.in_proj_a.weight",   "ssm_in_a.weight"},
                {"linear_attn.in_proj_b.weight",   "ssm_in_b.weight"},
                {"linear_attn.out_proj.weight",    "ssm_out.weight"},
                {"linear_attn.conv1d.weight",      "ssm_conv1d.weight"},
                {"linear_attn.norm.weight",        "ssm_norm.weight"},
                {"linear_attn.A_log",              "ssm_a"},
                {"linear_attn.dt_bias",            "ssm_dt.bias"},
                {NULL, NULL}
            };

            for (int m = 0; layer_maps[m].from; m++) {
                if (strcmp(rest, layer_maps[m].from) == 0) {
                    snprintf(gguf_name, buflen, "blk.%d.%s",
                             layer_idx, layer_maps[m].to);
                    return;
                }
            }

            /* MoE expert layer mapping: model.layers.N.mlp.experts.E.xxx */
            int expert_idx;
            char expert_rest[ST_MAX_NAME_LEN];
            if (sscanf(rest, "mlp.experts.%d.%255s", &expert_idx, expert_rest) == 2) {
                struct { const char *from; const char *to; } expert_maps[] = {
                    {"gate_proj.weight", "ffn_gate_exp.weight"},
                    {"up_proj.weight",   "ffn_up_exp.weight"},
                    {"down_proj.weight", "ffn_down_exp.weight"},
                    {NULL, NULL}
                };
                for (int m = 0; expert_maps[m].from; m++) {
                    if (strcmp(expert_rest, expert_maps[m].from) == 0) {
                        snprintf(gguf_name, buflen, "blk.%d.%s.%d",
                                 layer_idx, expert_maps[m].to, expert_idx);
                        return;
                    }
                }
            }

            /* Fallback: keep original sub-path */
            snprintf(gguf_name, buflen, "blk.%d.%s", layer_idx, rest);
        }
    }
}

/* ═══════════════════════════════════════════════════════════════════════════
 * SHOULD THIS TENSOR BE QUANTIZED?
 *
 * Decision rules:
 * - Quantize: weight matrices (2D, large)
 * - Keep F32: norms, biases, embeddings, 1D tensors
 * ═══════════════════════════════════════════════════════════════════════════ */

static int should_quantize(const STTensorInfo *ti, const char *gguf_name)
{
    /* Never quantize 1D tensors (norms, biases) */
    if (ti->n_dims < 2) return 0;

    /* Never quantize embedding tables (row dimension = vocab) */
    if (strstr(gguf_name, "token_embd") != NULL) return 0;

    /* Never quantize LM head output — use exact match, not substring,
     * to avoid matching "attn_output.weight" */
    if (strcmp(gguf_name, "output.weight") == 0) return 0;

    /* Never quantize norm weights */
    if (strstr(gguf_name, "norm") != NULL) return 0;

    /* Never quantize bias tensors */
    if (strstr(gguf_name, ".bias") != NULL) return 0;

    /* Never quantize MoE gate routing weights */
    if (strstr(gguf_name, "ffn_gate_inp") != NULL) return 0;

    /* Never quantize DeltaNet state-space parameters (1D or small) */
    if (strstr(gguf_name, "ssm_a") != NULL) return 0;      /* A_log */
    if (strstr(gguf_name, "ssm_dt") != NULL) return 0;     /* dt_bias */
    if (strstr(gguf_name, "ssm_conv1d") != NULL) return 0; /* conv kernel */

    /* Quantize everything else (attention projections, FFN weights, SSM projections) */
    return 1;
}

/* Detect attention Q/K/V/O projection tensors.
 * These are the most sensitive to quantization — errors in attention scores
 * cascade through the entire sequence, causing self-correction loops.
 * Promoting these to Q4_0 (~4.5bpw) doubles their precision. */
static int is_attention_tensor(const char *gguf_name)
{
    /* Gemma / LLaMA style GGUF names: blk.N.attn_q/k/v/output.weight */
    if (strstr(gguf_name, "attn_q.weight") != NULL) return 1;
    if (strstr(gguf_name, "attn_k.weight") != NULL) return 1;
    if (strstr(gguf_name, "attn_v.weight") != NULL) return 1;
    if (strstr(gguf_name, "attn_output.weight") != NULL) return 1;
    if (strstr(gguf_name, "attn_qkv.weight") != NULL) return 1;
    /* Qwen 3.6 DeltaNet SSM projections — treat as attention-class (Q4_0) */
    if (strstr(gguf_name, "ssm_in_qkv.weight") != NULL) return 1;
    if (strstr(gguf_name, "ssm_in_z.weight") != NULL) return 1;
    if (strstr(gguf_name, "ssm_out.weight") != NULL) return 1;
    /* HuggingFace style (fallthrough names) */
    if (strstr(gguf_name, "self_attn.q_proj.weight") != NULL) return 1;
    if (strstr(gguf_name, "self_attn.k_proj.weight") != NULL) return 1;
    if (strstr(gguf_name, "self_attn.v_proj.weight") != NULL) return 1;
    if (strstr(gguf_name, "self_attn.o_proj.weight") != NULL) return 1;
    return 0;
}

/* ═══════════════════════════════════════════════════════════════════════════
 * HPC SENSITIVITY GRAPH BUILDER
 *
 * Creates an HPCGraph where each node represents a weight block.
 * For Q2_K: 256-weight superblocks.
 *
 * The 6 values per site correspond to 6 candidate scale factors:
 * v=0: scale * 0.85  (aggressive, high compression)
 * v=1: scale * 0.90
 * v=2: scale * 0.95
 * v=3: scale * 1.00  (standard)
 * v=4: scale * 1.05
 * v=5: scale * 1.10  (conservative, less compression error)
 *
 * BP propagates: "if your neighbor block is sensitive, you should be
 * conservative too" — creating coherent precision allocation.
 * ═══════════════════════════════════════════════════════════════════════════ */


/* ── Multi-quhit expanded scale table ──
 * Search grid: 24×24 = 576 (d, dmin) candidates
 * Quhit encoding: bin 24 → 6 for D=6 quhits (BP operates on 6-state marginals)
 * Beam search: operates on all 576 candidates directly */
#define QUHITS_PER_BLOCK  2
#define N_CAND_D   24    /* d multiplier candidates (expanded) */
#define N_CAND_M   24    /* dmin multiplier candidates (expanded) */
#define TOTAL_SCALE_CANDIDATES (N_CAND_D * N_CAND_M)

/* ════════════════════════════════════════════════════════════════════════
 * EXPERIMENTAL / CURRENTLY-UNUSED CODE PATHS
 *
 * Nothing in the live pipeline calls the legacy BP sensitivity graph
 * (build_sensitivity_graph + compute_block_error_q2k + SCALE_TABLE) or the
 * llm-compressor MSE grid search (mse_grid_search_q2k_subblock); the Shor /
 * Viterbi path superseded them. They are preserved behind this flag instead
 * of silently shipping as dead code that still costs an init pass.
 * ════════════════════════════════════════════════════════════════════════ */
#ifdef HEXSTATE_ENABLE_EXPERIMENTAL

#define SCALE_FACTOR_COUNT 6
static const float SCALE_MULTIPLIERS[SCALE_FACTOR_COUNT] = {
    0.60f, 0.75f, 0.90f, 1.00f, 1.15f, 1.40f
};

static float SCALE_TABLE[TOTAL_SCALE_CANDIDATES];
static int scale_table_initialized = 0;

static void init_scale_table(void) {
    if (scale_table_initialized) return;
    /* candidates: uniform spacing centered on 1.0 */
    for (int i = 0; i < TOTAL_SCALE_CANDIDATES; i++) {
        SCALE_TABLE[i] = 0.50f + (float)i * (1.00f / (float)(TOTAL_SCALE_CANDIDATES - 1));
    }
    scale_table_initialized = 1;
}
#endif /* HEXSTATE_ENABLE_EXPERIMENTAL */

/* ═══════════════════════════════════════════════════════════════════════════
 * THREAD-LOCAL HPCGRAPH REUSE — Eliminates 776K malloc/free cycles
 *
 * The sub-block Shor measurement uses a 16-node linear-chain graph that
 * is identical in topology every time. Instead of hpc_create()/hpc_destroy()
 * inside the OMP hot loop, we reset the same graph to a clean state.
 *
 * This function resets an existing HPCGraph with n_sites nodes to its
 * initial state: clears all edges, resets adjacency lists, reinitializes
 * locals. Zero allocations.
 * ═══════════════════════════════════════════════════════════════════════════ */
static void hpc_reset_for_subblock(HPCGraph *g, uint64_t n_sites)
{
    /* Reset edge state */
    g->n_edges = 0;
    g->cz_edges = 0;
    g->phase_edges = 0;
    g->syntheme_edges = 0;
    g->n_log = 0;
    g->min_fidelity = 1.0;
    g->avg_fidelity = 1.0;
    g->amp_evals = 0;
    g->prob_evals = 0;
    g->measurements = 0;

    /* Reset adjacency lists (just zero the counts, keep allocated buffers) */
    for (uint64_t i = 0; i < n_sites; i++) {
        g->adj[i].count = 0;
    }

    /* Reinitialize local quhit states */
    for (uint64_t i = 0; i < n_sites; i++)
        triality_init(&g->locals[i]);
}

#ifdef HEXSTATE_ENABLE_EXPERIMENTAL
/* ═══════════════════════════════════════════════════════════════════════════
 * FAST POWER APPROXIMATION — Replaces powf(x, 2.4f) in MSE grid search
 *
 * powf() costs ~50-100 cycles. Use log2f+exp2f (~25 cycles) for the
 * exact x^2.4 = x^2 × 2^(0.4·log2(x)) computation instead.
 * ═══════════════════════════════════════════════════════════════════════════ */
static inline float fast_pow_2_4(float x)
{
    /* x^2.4 = x^2 × 2^(0.4 × log2(x)).  log2f+exp2f ≈ 25 cycles total vs
     * 50-100 for powf, and produces the exact ^2.4 norm the grid search needs. */
    float x2 = x * x;
    return x2 * exp2f(0.4f * log2f(x));  /* x^2 × x^0.4 = x^2.4 */
}

/* Compute the Q2_K sub-block reconstruction error for a block at a given
 * scale multiplier, optionally weighted by importance vector */
static float compute_block_error_q2k(const float *weights, int block_size,
                                       float scale_mult,
                                       const float *importance, int imp_offset)
{
    float min_val = weights[0];
    float max_val = weights[0];
    for (int j = 1; j < block_size; j++) {
        if (weights[j] < min_val) min_val = weights[j];
        if (weights[j] > max_val) max_val = weights[j];
    }
    if (min_val > 0) min_val = 0;

    float range = (max_val - min_val) * scale_mult;
    if (range < 1e-15f) return 0.0f;
    float inv_range = 3.0f / range;

    float err = 0.0f;
    for (int j = 0; j < block_size; j++) {
        float x = weights[j];
        int q = (int)((x - min_val * scale_mult) * inv_range + 0.5f);
        if (q < 0) q = 0; if (q > 3) q = 3;
        float deq = min_val * scale_mult + (float)q * range / 3.0f;
        float diff = x - deq;
        float w = (importance) ? importance[imp_offset + j] : 1.0f;
        err += diff * diff * w;
    }
    return err;
}

/* Build multi-quhit HPC sensitivity graph.
 * 2 quhits per block → 576 scale candidates per block.
 *
 * Graph layout: sites [0..2*n-1] where:
 * site 2*i     = coarse quhit for block i
 * site 2*i + 1 = fine quhit for block i
 *
 * Edges:
 * Intra-block: CZ(2i, 2i+1) — coarse↔fine coupling
 * Inter-block: CZ(2i, 2(i+1)) — coarse↔coarse neighbor
 * CZ(2i+1, 2(i+1)+1) — fine↔fine neighbor */
static HPCGraph *build_sensitivity_graph(const float *weights,
                                           int64_t n_elements,
                                           int block_size,
                                           float temperature,
                                           const float *importance)
{
    int64_t n_blocks = n_elements / block_size;
    if (n_blocks < 2) return NULL;

    init_scale_table();

    int64_t graph_blocks = (n_blocks > 8192) ? 8192 : n_blocks;
    int64_t stride = n_blocks / graph_blocks;
    int64_t n_sites = graph_blocks * QUHITS_PER_BLOCK;

    HPCGraph *graph = hpc_create(n_sites);
    if (!graph) return NULL;

    for (int64_t i = 0; i < n_sites; i++)
        triality_dft(&graph->locals[i]);

    /* Compute errors for all candidates per block,
     * then project onto coarse (quhit 0) and fine (quhit 1) marginals */
    for (int64_t i = 0; i < graph_blocks; i++) {
        int64_t block_idx = i * stride;
        const float *block_weights = weights + block_idx * block_size;

        /* Evaluate all candidates */
        float errors[TOTAL_SCALE_CANDIDATES];
        float min_err = 1e30f;
        for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) {
            errors[c] = compute_block_error_q2k(block_weights, block_size,
                                                  SCALE_TABLE[c],
                                                  importance,
                                                  (int)(block_idx * block_size));
            if (errors[c] < min_err) min_err = errors[c];
        }

        /* Project onto quhit 0 (coarse): marginalize over fine dimension
         * amp_coarse[v0] = Σ_{v1} exp(-error(v0*6+v1) / 2T) */
        double coarse_re[6], coarse_im[6];
        double coarse_norm = 0.0;
        for (int v0 = 0; v0 < 6; v0++) {
            coarse_re[v0] = 0.0;
            coarse_im[v0] = 0.0;
            for (int v1 = 0; v1 < 6; v1++) {
                int idx = v0 * 6 + v1;
                coarse_re[v0] += exp(-(double)(errors[idx] - min_err) /
                                      (2.0 * (double)temperature));
            }
            coarse_norm += coarse_re[v0] * coarse_re[v0];
        }
        if (coarse_norm > 1e-30) {
            double inv = 1.0 / sqrt(coarse_norm);
            for (int v = 0; v < 6; v++) coarse_re[v] *= inv;
        }

        /* Project onto quhit 1 (fine): marginalize over coarse dimension
         * amp_fine[v1] = Σ_{v0} exp(-error(v0*6+v1) / 2T) */
        double fine_re[6], fine_im[6];
        double fine_norm = 0.0;
        for (int v1 = 0; v1 < 6; v1++) {
            fine_re[v1] = 0.0;
            fine_im[v1] = 0.0;
            for (int v0 = 0; v0 < 6; v0++) {
                int idx = v0 * 6 + v1;
                fine_re[v1] += exp(-(double)(errors[idx] - min_err) /
                                    (2.0 * (double)temperature));
            }
            fine_norm += fine_re[v1] * fine_re[v1];
        }
        if (fine_norm > 1e-30) {
            double inv = 1.0 / sqrt(fine_norm);
            for (int v = 0; v < 6; v++) fine_re[v] *= inv;
        }

        /* Write coarse quhit (site 2*i) */
        int64_t s_coarse = 2 * i;
        for (int v = 0; v < 6; v++) {
            graph->locals[s_coarse].edge_re[v] = coarse_re[v];
            graph->locals[s_coarse].edge_im[v] = 0.0;
        }
        graph->locals[s_coarse].primary = VIEW_EDGE;
        graph->locals[s_coarse].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED;
        graph->locals[s_coarse].delta_valid = 0;
        triality_update_mask(&graph->locals[s_coarse]);

        /* Write fine quhit (site 2*i + 1) */
        int64_t s_fine = 2 * i + 1;
        for (int v = 0; v < 6; v++) {
            graph->locals[s_fine].edge_re[v] = fine_re[v];
            graph->locals[s_fine].edge_im[v] = 0.0;
        }
        graph->locals[s_fine].primary = VIEW_EDGE;
        graph->locals[s_fine].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED;
        graph->locals[s_fine].delta_valid = 0;
        triality_update_mask(&graph->locals[s_fine]);
    }

    /* ── Build edges ── */
    for (int64_t i = 0; i < graph_blocks; i++) {
        /* Intra-block: coarse ↔ fine coupling */
        hpc_cz(graph, 2 * i, 2 * i + 1);

        /* Inter-block: neighbor coupling */
        if (i + 1 < graph_blocks) {
            hpc_cz(graph, 2 * i, 2 * (i + 1));         /* coarse ↔ coarse */
            hpc_cz(graph, 2 * i + 1, 2 * (i + 1) + 1); /* fine ↔ fine     */
        }
    }

    return graph;
}

/* ═══════════════════════════════════════════════════════════════════════════
 * MSE GRID SEARCH (ported from llm-compressor observers/mse.py)
 *
 * For a Q2_K sub-block, progressively shrink the min/max range to find
 * the candidate that minimizes weighted reconstruction error.
 *
 * for p in [1.0, 1.0 - 1/grid, 1.0 - 2/grid, ...] down to (1 - maxshrink):
 * candidate_min = p * min
 * candidate_max = p * max
 * error = ||x - quantize(x, candidate_min, candidate_max)||^norm
 * if error < best: update best
 * else: patience--; if patience == 0: break
 *
 * This is a direct C port of llm-compressor's _grid_search_mse.
 * ═══════════════════════════════════════════════════════════════════════════ */

typedef struct {
    float maxshrink;    /* Maximum shrink factor (0.0 to 1.0)         */
    int   grid;         /* Number of grid divisions                   */
    int   patience;     /* Early stopping patience                    */
    float norm;         /* Error norm exponent (2.0 = MSE, 2.4 = ...)*/
} MSEGridConfig;

static const MSEGridConfig MSE_DEFAULT_CONFIG = {
    .maxshrink = 0.20f,
    .grid      = 200,
    .patience  = 8,
    .norm      = 2.4f
};

/* Grid search for optimal scale/min for a Q2_K sub-block of n weights
 * with nmax = 3 quantization levels.
 * Returns optimized scale; stores absolute min in *out_min.
 * importance: per-element weights (can be NULL for uniform). */
static float mse_grid_search_q2k_subblock(const float *x, int n, int nmax,
                                            uint8_t *L, float *out_min,
                                            const float *importance,
                                            const MSEGridConfig *cfg)
{
    float min_val = x[0], max_val = x[0];
    for (int i = 1; i < n; i++) {
        if (x[i] < min_val) min_val = x[i];
        if (x[i] > max_val) max_val = x[i];
    }
    if (max_val == min_val) {
        for (int i = 0; i < n; i++) L[i] = 0;
        *out_min = -min_val;
        return 0.0f;
    }
    if (min_val > 0) min_val = 0;

    float best_scale = 0.0f;
    float best_min = -min_val;
    float best_error = 1e30f;
    int no_improve = 0;

    int shrink_steps = (int)(cfg->maxshrink * cfg->grid);
    if (shrink_steps < 1) shrink_steps = 1;

    for (int step = 0; step <= shrink_steps; step++) {
        float p = 1.0f - (float)step / (float)cfg->grid;

        float cand_min = p * min_val;
        float cand_max = p * max_val;

        if (cand_max <= cand_min) continue;

        float iscale = (float)nmax / (cand_max - cand_min);
        float scale = 1.0f / iscale;

        /* Quantize and measure error */
        float err = 0.0f;
        uint8_t tmp_L[256];
        for (int i = 0; i < n; i++) {
            int l = gguf_nearest_int(iscale * (x[i] - cand_min));
            if (l < 0) l = 0;
            if (l > nmax) l = nmax;
            tmp_L[i] = (uint8_t)l;

            float deq = cand_min + scale * (float)l;
            float diff = fabsf(x[i] - deq);
            /* Apply error norm — fast path for default norm=2.4 */
            float e = diff;
            if (cfg->norm == 2.4f) {
                e = fast_pow_2_4(diff);
            } else if (cfg->norm != 1.0f) {
                e = powf(diff, cfg->norm);
            }
            /* Apply importance weighting */
            if (importance) e *= importance[i];
            err += e;
        }

        if (err < best_error) {
            best_error = err;
            best_scale = scale;
            best_min = -cand_min;
            memcpy(L, tmp_L, n);
            no_improve = 0;
        } else {
            no_improve++;
            if (no_improve >= cfg->patience) break;
        }
    }

    /* Iterative refinement on the best candidate (from ggml) */
    float cur_min = -best_min;
    float cur_scale = best_scale;
    if (cur_scale > 1e-15f) {
        float iscale = 1.0f / cur_scale;
        for (int itry = 0; itry < 5; itry++) {
            float sumlx = 0;
            int suml2 = 0;
            for (int i = 0; i < n; i++) {
                int l = gguf_nearest_int(iscale * (x[i] - cur_min));
                if (l < 0) l = 0;
                if (l > nmax) l = nmax;
                L[i] = (uint8_t)l;
                sumlx += (x[i] - cur_min) * l;
                suml2 += l * l;
            }
            if (suml2 > 0) cur_scale = sumlx / suml2;
            float sum = 0;
            for (int i = 0; i < n; i++)
                sum += x[i] - cur_scale * L[i];
            /* True coordinate-descent optimal: min* = sum/n (no momentum).
             * Clamp to ≤ 0 since min must be non-positive by convention. */
            cur_min = fminf(0.0f, sum / n);
            if (cur_scale > 1e-15f) iscale = 1.0f / cur_scale;
        }
    }

    *out_min = -cur_min;
    return cur_scale;
}
#endif /* HEXSTATE_ENABLE_EXPERIMENTAL */

/* ═══════════════════════════════════════════════════════════════════════════
 * HPC Q2_K QUANTIZATION — GGML-QUALITY + HPC REFINEMENT
 *
 * Two-phase approach:
 * Phase A: Per-sub-block weighted least-squares (ggml make_qkx2_quants)
 * This produces per-sub-block (scale, min) with 16-step search.
 * Phase B: HPC BP refines the superblock-level d/dmin rounding.
 * 6 candidate (d, dmin) pairs are tested; BP finds the one
 * where the GLOBAL reconstruction error is minimized via
 * constructive interference of per-sub-block phase coherence.
 * ═══════════════════════════════════════════════════════════════════════════ */

/* Weighted least-squares quantization for a sub-block (ggml make_qkx2_quants).
 * Finds optimal (scale, min) by searching 16 candidate iscale values
 * and solving weighted least-squares for each.
 * Returns scale; *the_min is set to the negative of the optimal min. */
static float hpc_make_qkx2_quants(int n, int nmax, const float *x,
                                     const float *w, uint8_t *L,
                                     float *the_min, uint8_t *Laux)
{
    float xmin = x[0], xmax = x[0];
    float sum_w = w[0], sum_x = w[0] * x[0];
    for (int i = 1; i < n; i++) {
        if (x[i] < xmin) xmin = x[i];
        if (x[i] > xmax) xmax = x[i];
        sum_w += w[i];
        sum_x += w[i] * x[i];
    }
    if (xmin > 0) xmin = 0;
    if (xmax == xmin) {
        for (int i = 0; i < n; i++) L[i] = 0;
        *the_min = -xmin;
        return 0.0f;
    }

    float iscale = (float)nmax / (xmax - xmin);
    float scale = 1.0f / iscale;
    float best_mad = 0;
    for (int i = 0; i < n; i++) {
        int l = gguf_nearest_int(iscale * (x[i] - xmin));
        if (l < 0) l = 0;
        if (l > nmax) l = nmax;
        L[i] = (uint8_t)l;
        float diff = scale * (float)l + xmin - x[i];
        best_mad += w[i] * fabsf(diff);
    }

    /* 16 candidate iscale values: search [-0.5, -0.5 + 0.1*15] + nmax */
    for (int is = 0; is <= 15; is++) {
        float try_iscale = (-0.5f + 0.1f * (float)is + (float)nmax) / (xmax - xmin);
        float sl = 0, sl2 = 0, sxl = 0;
        for (int i = 0; i < n; i++) {
            int l = gguf_nearest_int(try_iscale * (x[i] - xmin));
            if (l < 0) l = 0;
            if (l > nmax) l = nmax;
            Laux[i] = (uint8_t)l;
            sl += w[i] * (float)l;
            sl2 += w[i] * (float)(l * l);
            sxl += w[i] * (float)l * x[i];
        }
        float det = sum_w * sl2 - sl * sl;
        if (det > 0) {
            float this_scale = (sum_w * sxl - sum_x * sl) / det;
            float this_min = (sl2 * sum_x - sl * sxl) / det;
            if (this_min > 0) {
                this_min = 0;
                this_scale = sxl / sl2;
            }
            float mad = 0;
            for (int i = 0; i < n; i++) {
                float diff = this_scale * (float)Laux[i] + this_min - x[i];
                mad += w[i] * fabsf(diff);
            }
            if (mad < best_mad) {
                for (int i = 0; i < n; i++) L[i] = Laux[i];
                best_mad = mad;
                scale = this_scale;
                xmin = this_min;
            }
        }
    }
    *the_min = -xmin;
    return scale;
}

/* Quantize the scale/min arrays into 4-bit values: make_qp_quants equivalent.
 * Returns the optimal d such that scales[j] ≈ d × Ls[j]. */
static float hpc_make_qp_quants(int n, int nmax, const float *x,
                                   uint8_t *L, const float *sw)
{
    float xmax = 0;
    for (int i = 0; i < n; i++)
        if (x[i] > xmax) xmax = x[i];
    if (xmax < 1e-15f) {
        for (int i = 0; i < n; i++) L[i] = 0;
        return 0.0f;
    }
    float iscale = (float)nmax / xmax;
    for (int i = 0; i < n; i++) {
        int l = gguf_nearest_int(iscale * x[i]);
        if (l < 0) l = 0;
        if (l > nmax) l = nmax;
        L[i] = (uint8_t)l;
    }
    float scale = 1.0f / iscale;
    float best_mse = 0;
    for (int i = 0; i < n; i++) {
        float diff = x[i] - scale * (float)L[i];
        best_mse += sw[i] * diff * diff;
    }
    for (int is = -4; is <= 4; is++) {
        if (is == 0) continue;
        float iscale_is = (0.1f * (float)is + (float)nmax) / xmax;
        float scale_is = 1.0f / iscale_is;
        float mse = 0;
        for (int i = 0; i < n; i++) {
            int l = gguf_nearest_int(iscale_is * x[i]);
            if (l < 0) l = 0;
            if (l > nmax) l = nmax;
            float diff = x[i] - scale_is * (float)l;
            mse += sw[i] * diff * diff;
        }
        if (mse < best_mse) {
            best_mse = mse;
            iscale = iscale_is;
        }
    }
    /* Recompute with best iscale + iterative refinement */
    float sumlx = 0, suml2 = 0;
    for (int i = 0; i < n; i++) {
        int l = gguf_nearest_int(iscale * x[i]);
        if (l < 0) l = 0;
        if (l > nmax) l = nmax;
        L[i] = (uint8_t)l;
        sumlx += sw[i] * x[i] * (float)l;
        suml2 += sw[i] * (float)(l * l);
    }
    /* Iterative greedy refinement */
    for (int itry = 0; itry < 5; itry++) {
        int n_changed = 0;
        for (int i = 0; i < n; i++) {
            float wi = sw[i];
            float slx = sumlx - wi * x[i] * (float)L[i];
            float sl2 = suml2 - wi * (float)(L[i] * L[i]);
            if (slx > 0 && sl2 > 0) {
                int new_l = gguf_nearest_int(x[i] * sl2 / slx);
                if (new_l < 0) new_l = 0;
                if (new_l > nmax) new_l = nmax;
                if (new_l != L[i]) {
                    slx += wi * x[i] * (float)new_l;
                    sl2 += wi * (float)(new_l * new_l);
                    if (slx * slx * suml2 > sumlx * sumlx * sl2) {
                        L[i] = (uint8_t)new_l;
                        sumlx = slx;
                        suml2 = sl2;
                        n_changed++;
                    }
                }
            }
        }
        if (!n_changed) break;
    }
    return suml2 > 0 ? sumlx / suml2 : 0.0f;
}

/* ═══════════════════════════════════════════════════════════════════════════
 * SHOR'S GRIFFITHS-NIU SEQUENTIAL MEASUREMENT FOR RMSE OPTIMIZATION
 * (Ported 1:1 from tesseract_factor.c — replaces BP)
 *
 * Instead of iterative message-passing (BP), this uses the EXACT sequential
 * measurement protocol from Shor's algorithm:
 *
 * For each block k (MSB → LSB):
 * 1. Compute feed-forward phase correction from previously measured blocks
 * 2. Compute work factor: C_k(d) = Π_j Σ_w local_j(w) × edge(d,w)
 * 3. Bake C_k into locals: α(d) *= C_k(d)
 * 4. Apply phase correction: α(d) *= e^{-2πi d θ_k}
 * 5. Apply IDFT6 in-place: interference creates peaks at optimal scales
 * 6. Born rule measurement → select optimal scale candidate
 * 7. Collapse site + absorb edge weights into neighbors (back-action)
 *
 * This IS the quantum Fourier transform that creates constructive
 * interference at the optimal RMSE configuration, exactly as Shor's
 * algorithm creates interference at the correct period.
 *
 * Domain mapping:
 * Factoring: oracle phase 2π×d×c_k/N → period r
 * Quantize:  error Boltzmann amplitudes → optimal RMSE block
 * ═══════════════════════════════════════════════════════════════════════════ */

/* ω₆ roots of unity for CZ phase lookup come from hpc_graph.h
 * (HPC_W6_RE / HPC_W6_IM) — the file-local duplicates were unused. */
static const double INV_SQRT6 = 0.40824829046386301637;  /* 1/√6 */

/* ── Collapse + Back-Action core (ported from tesseract_factor.c) ──
 * After sampling an outcome, collapse the target site to |outcome⟩,
 * absorb all edge weights into neighbor local states (Magic Pointer
 * disentanglement), and remove dead edges from the graph.
 *
 * This is the EXACT same back-action protocol used in Shor's algorithm
 * for the semi-classical QFT: measurement of one site conditions all
 * remaining sites through the CZ phase correlations. */
static void shor_collapse_site(HPCGraph *graph, int target_site, int outcome)
{
    /* Step 1: Collapse local state to |outcome⟩ */
    for (int v = 0; v < 6; v++) {
        graph->locals[target_site].edge_re[v] = (v == outcome) ? 1.0 : 0.0;
        graph->locals[target_site].edge_im[v] = 0.0;
    }
    graph->locals[target_site].primary = VIEW_EDGE;
    graph->locals[target_site].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED;
    graph->locals[target_site].delta_valid = 0;

    /* Step 2: Absorb edge weights into neighbor states (back-action).
     * For each edge (target, neighbor), the weight w(outcome, d) for each
     * neighbor basis state d gets multiplied into the neighbor's amplitude.
     * This is the Magic Pointer disentanglement from tesseract_factor.c. */
    HPCAdjList *adj = &graph->adj[target_site];
    for (uint64_t ei = 0; ei < adj->count; ei++) {
        uint64_t eid = adj->edge_ids[ei];
        HPCEdge *edge = &graph->edges[eid];
        uint64_t partner = (edge->site_a == (uint64_t)target_site) ?
                            edge->site_b : edge->site_a;

        TrialityQuhit *pq = &graph->locals[partner];
        for (int d = 0; d < 6; d++) {
            double w_re, w_im;
            if (edge->type == HPC_EDGE_CZ) {
                int pidx = (outcome * d) % 6;
                w_re = HPC_W6_RE[pidx];
                w_im = HPC_W6_IM[pidx];
            } else {
                /* Weighted phase edge */
                if (edge->site_a == (uint64_t)target_site) {
                    w_re = edge->w_re[outcome][d];
                    w_im = edge->w_im[outcome][d];
                } else {
                    w_re = edge->w_re[d][outcome];
                    w_im = edge->w_im[d][outcome];
                }
            }
            double old_re = pq->edge_re[d], old_im = pq->edge_im[d];
            pq->edge_re[d] = old_re * w_re - old_im * w_im;
            pq->edge_im[d] = old_re * w_im + old_im * w_re;
        }
        pq->dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED;
        pq->delta_valid = 0;
    }

    /* Step 3: Remove edges touching this site from the graph.
     * Mark by setting fidelity to -1 and remove from adj lists. */
    for (uint64_t ei = 0; ei < adj->count; ei++) {
        uint64_t eid = adj->edge_ids[ei];
        HPCEdge *edge = &graph->edges[eid];
        uint64_t partner = (edge->site_a == (uint64_t)target_site) ?
                            edge->site_b : edge->site_a;

        /* Remove this edge from partner's adj list */
        HPCAdjList *padj = &graph->adj[partner];
        for (uint64_t pi = 0; pi < padj->count; pi++) {
            if (padj->edge_ids[pi] == eid) {
                padj->edge_ids[pi] = padj->edge_ids[--padj->count];
                break;
            }
        }
        edge->fidelity = -1.0; /* Mark as dead */
    }
    adj->count = 0; /* Clear target's adj list */
}

/* ═══════════════════════════════════════════════════════════════════════════
 * SHOR SEQUENTIAL MEASUREMENT — Griffiths-Niu Protocol for Quantization
 *
 * Ported 1:1 from tesseract_factor.c lines 2343-2500.
 *
 * Measures sites MSB→LSB. For each site k:
 * 1. Compute feed-forward phase correction θ_k from previously measured sites
 * 2. Compute neighbor contribution C_k(d) analytically
 * 3. Bake C_k into locals
 * 4. Apply phase correction: α(d) *= e^{-2πi d θ_k}
 * 5. Apply IDFT6: β(v) = (1/√6) Σ_d α'(d) × e^{2πi dv/6}
 * 6. Compute |β(v)|² as measurement probabilities
 * 7. Sample/argmax → outcome
 * 8. Collapse + back-action via shor_collapse_site()
 *
 * Returns: marginals are written into marg_out[n_sites][6].
 * measured_out[n_sites] receives the measurement outcomes.
 * ═══════════════════════════════════════════════════════════════════════════ */
static void shor_measure_graph(HPCGraph *graph, int64_t n_sites,
                                double (*marg_out)[6], int *measured_out,
                                int deterministic)
{
    /* Measure sites from last to first (MSB→LSB, same as Griffiths-Niu) */
    for (int64_t k = n_sites - 1; k >= 0; k--) {
        int site_k = (int)k;

        /* Step 1: Compute feed-forward phase correction from previously
         * measured sites. The QFT phase is 2π F x / 6^n. For site k,
         * the fractional phase from previously measured site j (j > k)
         * is measured_out[j] / 6^{j-k+1}.
         * Power MUST start at 36.0 (6^2) for the immediately previous site. */
        double theta_k = 0.0;
        {
            double power = 36.0;
            for (int64_t j = k + 1; j < n_sites; j++) {
                theta_k += (double)measured_out[j] / power;
                power *= 6.0;
            }
        }

        /* Step 2: Compute neighbor contribution C_k(d) analytically.
         * C_k(d) = Π_neighbor Σ_{w=0}^{5} local_neighbor(w) × edge_weight(d, w)
         * Each neighbor is independent (product state). */
        double ck_re[6], ck_im[6];
        for (int d = 0; d < 6; d++) { ck_re[d] = 1.0; ck_im[d] = 0.0; }

        const HPCAdjList *adj = &graph->adj[site_k];
        for (uint64_t ei = 0; ei < adj->count; ei++) {
            uint64_t eid = adj->edge_ids[ei];
            const HPCEdge *edge = &graph->edges[eid];
            if (edge->fidelity < 0.0) continue;  /* Skip dead edges */
            uint64_t partner = (edge->site_a == (uint64_t)site_k) ?
                                edge->site_b : edge->site_a;

            const TrialityQuhit *pq = &graph->locals[partner];
            for (int d = 0; d < 6; d++) {
                double sr = 0, si = 0;
                for (int w = 0; w < 6; w++) {
                    double lr = pq->edge_re[w], li = pq->edge_im[w];
                    double wr, wi;
                    if (edge->type == HPC_EDGE_CZ) {
                        int pidx = (d * w) % 6;
                        wr = HPC_W6_RE[pidx]; wi = HPC_W6_IM[pidx];
                    } else if (edge->site_a == (uint64_t)site_k) {
                        wr = edge->w_re[d][w]; wi = edge->w_im[d][w];
                    } else {
                        wr = edge->w_re[w][d]; wi = edge->w_im[w][d];
                    }
                    sr += lr*wr - li*wi;
                    si += lr*wi + li*wr;
                }
                double nr = ck_re[d]*sr - ck_im[d]*si;
                double ni = ck_re[d]*si + ck_im[d]*sr;
                ck_re[d] = nr; ck_im[d] = ni;
            }
        }

        /* Step 3: Bake C_k(d) into locals: α(d) *= C_k(d) */
        for (int d = 0; d < 6; d++) {
            double re = graph->locals[site_k].edge_re[d];
            double im = graph->locals[site_k].edge_im[d];
            graph->locals[site_k].edge_re[d] = re*ck_re[d] - im*ck_im[d];
            graph->locals[site_k].edge_im[d] = re*ck_im[d] + im*ck_re[d];
        }

        /* Step 4: Apply feed-forward phase correction to locals. */
        for (int d = 0; d < 6; d++) {
            double angle = -2.0 * 3.14159265358979323846 * d * theta_k;
            double pr = cos(angle), pi2 = sin(angle);
            double re = graph->locals[site_k].edge_re[d];
            double im = graph->locals[site_k].edge_im[d];
            graph->locals[site_k].edge_re[d] = re*pr - im*pi2;
            graph->locals[site_k].edge_im[d] = re*pi2 + im*pr;
        }

        /* Step 5: Apply IDFT6 in-place: phase basis → computational basis.
         * β(v) = (1/√6) Σ_{d=0}^{5} α'(d) × e^{2πi d v / 6}
         * C_k(d) is INSIDE the coherent sum — THIS creates interference
         * peaks at the optimal RMSE configuration, exactly as Shor's
         * algorithm creates peaks at the correct period. */
        {
            double alpha_re[6], alpha_im[6];
            for (int d = 0; d < 6; d++) {
                alpha_re[d] = graph->locals[site_k].edge_re[d];
                alpha_im[d] = graph->locals[site_k].edge_im[d];
            }
            for (int v = 0; v < 6; v++) {
                double sum_re = 0.0, sum_im = 0.0;
                for (int d = 0; d < 6; d++) {
                    double angle = 2.0 * 3.14159265358979323846 * d * v / 6.0;
                    double er = cos(angle), ei = sin(angle);
                    sum_re += alpha_re[d]*er - alpha_im[d]*ei;
                    sum_im += alpha_re[d]*ei + alpha_im[d]*er;
                }
                graph->locals[site_k].edge_re[v] = sum_re * INV_SQRT6;
                graph->locals[site_k].edge_im[v] = sum_im * INV_SQRT6;
            }
        }

        /* Step 6: Compute marginals from |local(v)|² */
        double probs[6];
        double total = 0.0;
        for (int v = 0; v < 6; v++) {
            probs[v] = graph->locals[site_k].edge_re[v] * graph->locals[site_k].edge_re[v] +
                       graph->locals[site_k].edge_im[v] * graph->locals[site_k].edge_im[v];
            total += probs[v];
        }
        if (total > 1e-30) {
            for (int v = 0; v < 6; v++) probs[v] /= total;
        } else {
            for (int v = 0; v < 6; v++) probs[v] = 1.0 / 6.0;
        }

        /* Store marginals for downstream beam search */
        for (int v = 0; v < 6; v++)
            marg_out[k][v] = probs[v];

        /* Step 7: Select outcome — deterministic argmax for quantization
         * (unlike factoring which uses Born sampling for probabilistic
         * period recovery, quantization wants the MAP estimate) */
        int outcome;
        if (deterministic) {
            outcome = 0;
            double max_p = probs[0];
            for (int v = 1; v < 6; v++) {
                if (probs[v] > max_p) { max_p = probs[v]; outcome = v; }
            }
        } else {
            /* Born sampling (for multi-shot refinement) */
            static unsigned int shor_rng = 271828;
            shor_rng = shor_rng * 1664525u + 1013904223u;
            double r01 = (double)(shor_rng >> 8) / 16777216.0;
            double cumul = 0.0;
            outcome = 5;
            for (int v = 0; v < 6; v++) {
                cumul += probs[v];
                if (r01 <= cumul) { outcome = v; break; }
            }
        }

        measured_out[k] = outcome;

        /* Step 8: Collapse + back-action — absorb edge weights into
         * neighbor locals (Magic Pointer disentanglement) */
        shor_collapse_site(graph, site_k, outcome);
    }
}

/* ═══════════════════════════════════════════════════════════════════════════
 * HPC-OPTIMIZED Q4_0 QUANTIZATION (for attention tensors)
 *
 * Same architecture as Q2_K HPC pipeline, but simpler:
 * - One parameter per block (scale d only, no dmin)
 * - Single quhit per block (6 states)
 * - 24 candidate scales → bin to 6 for BP
 * - 48-beam Hensel search for globally optimal configuration
 * - Triality 3-view marginals for robust scoring
 *
 * Q4_0 block: 32 weights, 16 levels (0–15), dequant: w = (q - 8) * d
 * ═══════════════════════════════════════════════════════════════════════════ */

#define Q4_N_CAND 24  /* expanded scale candidates for Q4_0 */
#define Q4_N_BEAMS 48 /* expanded beam width */

/* Tight neighborhood around WLS optimum */
static const float Q4_NEIGHBOR_MULTS[Q4_N_CAND] = {
    0.850f, 0.880f, 0.900f, 0.915f, 0.930f, 0.945f, 0.955f, 0.965f,
    0.975f, 0.985f, 0.995f, 1.000f, 1.005f, 1.015f, 1.025f, 1.035f,
    1.050f, 1.070f, 1.100f, 1.130f, 1.160f, 1.200f, 1.250f, 1.300f
};
static const int Q4_CAND_TO_QUHIT[Q4_N_CAND] = {
    0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
    3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};

/* ── Candidate-selection error metric (shared by Q4_0 and Q2_K) ──
 * Candidates are now scored with the EXACT importance-weighted SSE
 *     err = Σ_i w_i · (x_i − deq_i)²
 * which is the same objective the final assembly/polish phases minimise and
 * the same quantity reported as RMSE. The previous 2-point Hadamard form
 * (0.5·vesica + 0.5·wave with pair-AVERAGED weights) is algebraically equal
 * to Σ w̄·(e_i² + e_j²), i.e. it silently replaced per-element importance
 * weights with the pair mean — a systematic mis-weighting whenever an
 * imatrix is supplied. Scoring candidates on a different objective than the
 * one being optimised mis-ranks them; aligning the two strictly lowers the
 * final weighted RMSE (and is bit-identical when no imatrix is used). */

/* ── Cross-block prior override ratio ──
 * Q2_K and Q4_0 blocks are decoded INDEPENDENTLY by every GGUF runtime:
 * there is no cross-block coupling in the dequantizer, so a smoothness
 * prior that keeps a block on a worse candidate can only raise the true
 * reconstruction RMSE. With 1.00f the per-block argmin over the candidate
 * grid always wins (provably optimal seed for the assembly phase); the HPC
 * graph/Viterbi/Born machinery still shapes ties and seeds the search.
 * Set to e.g. 0.95f to restore the old 5%-hysteresis smoothness prior. */
#ifndef HEX_GREEDY_OVERRIDE_RATIO
#define HEX_GREEDY_OVERRIDE_RATIO 1.00f
#endif

/* fp16-ULP radius of the monotone (d, dmin) micro-search in the Phase-4.6
 * polish (move 3). Larger radii let coordinate descent escape shallower
 * local minima at O(radius²) extra cost per polish iteration. */
#ifndef HEX_POLISH_ULP
#define HEX_POLISH_ULP 4
#endif

/* ── DC + vesica/wave extended objective (dot-product error cancellation) ──
 *
 * The quantity that matters downstream is the layer-output error
 *     ε = Σᵢ eᵢ·aᵢ,   E[ε²] = eᵀRe,   R = activation second-moment matrix.
 * Modelling R with three components — per-channel power (diagonal, ≈
 * imatrix), a common mean μ (rank-1), and correlation c across the
 * half-block fold (i ↔ i+n/2) — gives EXACTLY:
 *
 *   E[ε²] ≈ Σᵢ wᵢeᵢ²  +  μ²·(Σᵢeᵢ)²  +  c·Σ_pairs[(eᵢ+eⱼ)² − (eᵢ−eⱼ)²]
 *                                          └── = vesica² − wave² = 4·eᵢeⱼ ──┘
 *
 * The vesica/wave decomposition is therefore the natural basis of the
 * fold-correlation term: in-phase (vesica) error energy COSTS output
 * accuracy, anti-phase (wave) error energy is CREDITED — it cancels in
 * the dot product. (The old 0.5/0.5 scorer ADDED the two, which collapses
 * to plain SSE; the spectrally meaningful combination SUBTRACTS them.)
 * Every selection/acceptance stage scores blocks with
 *
 *   E(block) = Σᵢ wᵢeᵢ²
 *            + (HEX_DC_LAMBDA / n) · (Σᵢeᵢ)²
 *            + (HEX_VW_LAMBDA / n) · Σ_{i<n/2} [(eᵢ+eⱼ)² − (eᵢ−eⱼ)²],  j = i+n/2
 *
 * applied CONSISTENTLY to: Q2_K/Q4_0 candidate scoring, the closed-form
 * (d, dmin) refit acceptance, the shaping accept guards, every polish
 * move, and the Phase-4.7 floor — so no stage optimises a different
 * objective than its acceptance test measures. The closed-form solvers
 * incorporate the DC term as a rank-1 augmented observation and act as
 * proposal generators; acceptance always uses the full extended E.
 * λ = 0 on both knobs reduces exactly to the pure weighted-SSE objective.
 * Positive-definiteness: the fold coupling adds ±2λ_vw/n off-diagonal —
 * negligible against any sane wᵢ, so E stays a valid quadratic objective.
 * NOTE: reported RMSE stays pure reconstruction RMSE; with λ > 0 a small
 * RMSE increase is the *intended* price for lower output error. Per-block
 * terms are a proxy for row-level structure (the API sees a flat stream);
 * the Phase-3.9 rolling-DC pass handles cross-block linkage. */
#ifndef HEX_DC_LAMBDA
#define HEX_DC_LAMBDA 1.0f
#endif
#ifndef HEX_VW_LAMBDA
#define HEX_VW_LAMBDA 1.0f
#endif
/* Default (1, 1): unit-strength spectral prior. Empirically (synthetic
 * benchmark, identical inputs): lowers dot-product output error ~0.8-1.4%
 * on both mean-only and fold-correlated activation models for ~+0.05%
 * weight RMSE. The theoretically optimal λ grows with the deployment
 * model's activation mean energy and row length (the per-block term
 * under-counts cross-block row coupling); the synthetic sweep kept
 * improving monotonically through λ = 4 at ~+0.1% RMSE. Set both to
 * 0.0f to recover the exact pure weighted-SSE / minimum-RMSE pipeline. */

/* Spectral penalty of the extended objective for one block: residuals e[n],
 * fold at n/2. Negative values are possible (anti-phase credit) — the total
 * E remains positive-definite as argued above. */
static inline float hex_spectral_penalty(const float *e, int n)
{
    if (HEX_DC_LAMBDA == 0.0f && HEX_VW_LAMBDA == 0.0f) return 0.0f;
    float dc = 0.0f, cross = 0.0f;
    int half = n / 2;
    for (int i = 0; i < half; i++) {
        dc    += e[i] + e[i + half];
        cross += e[i] * e[i + half];
    }
    return (HEX_DC_LAMBDA / (float)n) * dc * dc
         + (HEX_VW_LAMBDA / (float)n) * 4.0f * cross;
}

static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
                                       BlockQ4_0 *output, float *out_total_error,
                                       const float *imat_importance, int verbose)
{
    int64_t n_blocks = n_elements / QK4_0;
    float total_err = 0.0f;
    (void)verbose;  /* kept for API symmetry with the Q2_K path */
    /* ── Phase 1: Greedy seed — compute scale per block ── */
    float *greedy_d = (float *)calloc(n_blocks, sizeof(float));

    #pragma omp parallel for schedule(dynamic, 64)
    for (int64_t blk = 0; blk < n_blocks; blk++) {
        const float *bw = weights + blk * QK4_0;
        float amax = 0.0f;
        for (int j = 0; j < QK4_0; j++) {
            float av = fabsf(bw[j]);
            if (av > amax) amax = av;
        }
        greedy_d[blk] = amax / 7.0f;
    }

    /* ── Phase 2: WLS-Optimal Candidate Generation for Q4_0 ──
     * First find the true optimal d* via 3-iteration WLS,
     * then generate candidates centered on d* with tight spacing. */
    float (*cand_errors)[Q4_N_CAND] = (float (*)[Q4_N_CAND])
        calloc(n_blocks, sizeof(float[Q4_N_CAND]));
    uint16_t (*cand_d16)[Q4_N_CAND] = (uint16_t (*)[Q4_N_CAND])
        calloc(n_blocks, sizeof(uint16_t[Q4_N_CAND]));

    #pragma omp parallel for schedule(dynamic, 64)
    for (int64_t blk = 0; blk < n_blocks; blk++) {
        const float *bw = weights + blk * QK4_0;

        /* ── Step 2a: WLS solve to find optimal d* ── */
        float wls_d = greedy_d[blk];
        uint16_t prev_wls_d16 = 0;
        for (int ls_iter = 0; ls_iter < 5; ls_iter++) {
            if (wls_d < 1e-15f) break;
            float inv_d = 1.0f / wls_d;
            float num = 0.0f, den = 0.0f;
            float dcS = 0.0f, dcQ = 0.0f;   /* DC rank-1 augmentation sums */
            for (int j = 0; j < QK4_0; j++) {
                int q = (int)(bw[j] * inv_d + 8.5f);
                if (q < 0) q = 0; if (q > 15) q = 15;
                float qc = (float)q - 8.0f;
                float w = (imat_importance) ?
                          imat_importance[blk * QK4_0 + j] : 1.0f;
                num += w * bw[j] * qc;
                den += w * qc * qc;
                dcS += bw[j];
                dcQ += qc;
            }
            /* DC term of the extended objective enters the normal equation
             * as one extra observation (S ~ d·Q) of weight λ_dc/n. The
             * vesica/wave term is handled by extended-E acceptance in the
             * ULP search; the solver is a proposal generator. */
            num += (HEX_DC_LAMBDA / (float)QK4_0) * dcS * dcQ;
            den += (HEX_DC_LAMBDA / (float)QK4_0) * dcQ * dcQ;
            if (den > 1e-15f) {
                float d_new = num / den;
                if (fabsf(d_new) < 4.0f * (greedy_d[blk] + 1e-10f))
                    wls_d = gguf_fp16_to_fp32(gguf_fp32_to_fp16(d_new));
            }
            uint16_t cur_wls_d16 = gguf_fp32_to_fp16(wls_d);
            if (cur_wls_d16 == prev_wls_d16) break;  /* converged in FP16 */
            prev_wls_d16 = cur_wls_d16;
        }

        /* ── Step 2b: Generate candidates centered on WLS optimum ── */
        for (int ci = 0; ci < Q4_N_CAND; ci++) {
            float trial_d = wls_d * Q4_NEIGHBOR_MULTS[ci];
            uint16_t d16 = gguf_fp32_to_fp16(trial_d);
            float actual_d = gguf_fp16_to_fp32(d16);
            cand_d16[blk][ci] = d16;

            float id = (actual_d > 1e-15f) ? 1.0f / actual_d : 0.0f;

            /* ── Extended objective over all QK4_0 elements ──
             * Exact importance-weighted SSE + DC + vesica/wave spectral
             * penalty — the same objective every acceptance stage uses. */
            float err = 0.0f;
            float e_arr[QK4_0];
            for (int j = 0; j < QK4_0; j++) {
                float x = bw[j];
                int q = (int)(x * id + 8.5f);
                if (q < 0) q = 0; if (q > 15) q = 15;
                float deq = ((float)q - 8.0f) * actual_d;
                float e = x - deq;
                e_arr[j] = e;
                float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
                err += e * e * w;
            }
            cand_errors[blk][ci] = err + hex_spectral_penalty(e_arr, QK4_0);
        }
    }

    /* ── Phase 3: HPC graph — single quhit per block ── */
    int *best_candidate = (int *)malloc(n_blocks * sizeof(int));
    int hpc_ran_q4 = 0;
    for (int64_t i = 0; i < n_blocks; i++)
        best_candidate[i] = 11;  /* Q4_NEIGHBOR_MULTS[11] = 1.00 */

    if (n_blocks >= 2) {
        float temperature = 0.5f;
        int64_t graph_blocks = (n_blocks > 200) ? 200 : n_blocks;
        int64_t stride = n_blocks / graph_blocks;
        int64_t n_sites = graph_blocks;  /* 1 quhit per block */

        HPCGraph *graph = hpc_create(n_sites);
        if (graph) {
            hpc_ran_q4 = 1;
            for (int64_t i = 0; i < n_sites; i++)
                triality_dft(&graph->locals[i]);

            /* Adaptive temperature from error landscape */
            {
                double err_accum = 0.0;
                int err_count = 0;
                for (int64_t gi = 0; gi < graph_blocks && gi < 100; gi++) {
                    int64_t blk = gi * stride;
                    float max_e = 0.0f;
                    for (int c = 0; c < Q4_N_CAND; c++)
                        if (cand_errors[blk][c] > max_e)
                            max_e = cand_errors[blk][c];
                    err_accum += (double)max_e;
                    err_count++;
                }
                if (err_count > 0) {
                    temperature = (float)(err_accum / err_count) * 0.1f;
                    if (temperature < 1e-10f) temperature = 1e-10f;
                }
            }

            /* Encode stride-group AGGREGATED candidate errors as Boltzmann amplitudes */
            for (int64_t i = 0; i < graph_blocks; i++) {
                /* Aggregate errors across stride group */
                float agg_errors[Q4_N_CAND];
                for (int c = 0; c < Q4_N_CAND; c++)
                    agg_errors[c] = 0.0f;

                int64_t blk_start = i * stride;
                int64_t blk_end = blk_start + stride;
                if (blk_end > n_blocks) blk_end = n_blocks;
                int64_t group_size = blk_end - blk_start;

                for (int64_t b = blk_start; b < blk_end; b++) {
                    for (int c = 0; c < Q4_N_CAND; c++)
                        agg_errors[c] += cand_errors[b][c];
                }
                if (group_size > 1) {
                    float inv_gs = 1.0f / (float)group_size;
                    for (int c = 0; c < Q4_N_CAND; c++)
                        agg_errors[c] *= inv_gs;
                }

                float min_err = 1e30f;
                for (int c = 0; c < Q4_N_CAND; c++)
                    if (agg_errors[c] < min_err)
                        min_err = agg_errors[c];

                double amp_re[6];
                double amp_norm = 0.0;
                for (int qi = 0; qi < 6; qi++) amp_re[qi] = 0.0;
                for (int ci = 0; ci < Q4_N_CAND; ci++) {
                    int qi = Q4_CAND_TO_QUHIT[ci];
                    amp_re[qi] += exp(-(double)(agg_errors[ci] - min_err) /
                                      (2.0 * (double)temperature));
                }
                for (int qi = 0; qi < 6; qi++)
                    amp_norm += amp_re[qi] * amp_re[qi];
                if (amp_norm > 1e-30) {
                    double inv = 1.0 / sqrt(amp_norm);
                    for (int v = 0; v < 6; v++) amp_re[v] *= inv;
                }

                for (int v = 0; v < 6; v++) {
                    graph->locals[i].edge_re[v] = amp_re[v];
                    graph->locals[i].edge_im[v] = 0.0;
                }
                graph->locals[i].primary = VIEW_EDGE;
                graph->locals[i].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED;
                graph->locals[i].delta_valid = 0;
                triality_update_mask(&graph->locals[i]);
            }

            /* Neighbor edges */
            for (int64_t i = 0; i < graph_blocks - 1; i++)
                hpc_cz(graph, i, i + 1);

            /* ── Shor's Griffiths-Niu Sequential Measurement ──
             * Replaces BP with exact marginals via IDFT6 + feed-forward +
             * collapse/back-action (ported 1:1 from tesseract_factor.c).
             * Single pass, no iteration, no message damping. */
            double (*marg)[6] = (double (*)[6])calloc(graph_blocks, sizeof(double[6]));
            int *shor_measured = (int *)calloc(graph_blocks, sizeof(int));

            shor_measure_graph(graph, graph_blocks, marg, shor_measured, 1);

            free(shor_measured);

            /* Beam search over candidates */
            typedef struct { double acc_error; int history_idx; } Q4Beam;
            typedef struct { int cand_idx; int parent_idx; } Q4BeamHistory;

            Q4Beam beams[Q4_N_BEAMS];
            int active_beams = 1;
            Q4BeamHistory *history = (Q4BeamHistory *)malloc(n_blocks * Q4_N_BEAMS * sizeof(Q4BeamHistory));

            for (int b = 0; b < Q4_N_BEAMS; b++) {
                beams[b].acc_error = 0.0;
                beams[b].history_idx = -1;
            }

            for (int64_t i = 0; i < graph_blocks; i++) {
                double m_total = 0.0;
                for (int v = 0; v < 6; v++) m_total += marg[i][v];

                double cand_score[Q4_N_CAND];
                int64_t blk = i * stride;
                /* Count candidates per quhit bin for normalization */
                int q4_bin_count[6] = {0};
                for (int ci = 0; ci < Q4_N_CAND; ci++)
                    q4_bin_count[Q4_CAND_TO_QUHIT[ci]]++;
                /* Per-block error normalization: divide by block mean error
                 * so small-weight blocks don't dominate beam selection */
                float blk_mean_err = 0.0f;
                for (int ci = 0; ci < Q4_N_CAND; ci++)
                    blk_mean_err += cand_errors[blk][ci];
                blk_mean_err /= (float)Q4_N_CAND;
                if (blk_mean_err < 1e-30f) blk_mean_err = 1e-30f;
                for (int ci = 0; ci < Q4_N_CAND; ci++) {
                    int qi = Q4_CAND_TO_QUHIT[ci];
                    double p = (m_total > 1e-30) ? marg[i][qi] / m_total : 1.0/6.0;
                    p /= (double)q4_bin_count[qi];  /* normalize by bin occupancy */
                    cand_score[ci] = p / (cand_errors[blk][ci] / blk_mean_err + 1e-15);
                }

                typedef struct { double score; int beam_idx; int cand_idx; } Q4Ext;
                Q4Ext extensions[Q4_N_BEAMS * Q4_N_CAND];
                int n_ext = 0;
                for (int b = 0; b < active_beams; b++) {
                    for (int c = 0; c < Q4_N_CAND; c++) {
                        double ext_err = beams[b].acc_error + cand_errors[blk][c];
                        extensions[n_ext].score = cand_score[c] / (ext_err + 1e-15);
                        extensions[n_ext].beam_idx = b;
                        extensions[n_ext].cand_idx = c;
                        n_ext++;
                    }
                }

                int top_k = (n_ext < Q4_N_BEAMS) ? n_ext : Q4_N_BEAMS;
                int top_indices[Q4_N_BEAMS];
                for (int k = 0; k < top_k; k++) {
                    int best = -1; double best_s = -1e30;
                    for (int e = 0; e < n_ext; e++) {
                        if (extensions[e].score > best_s) {
                            best_s = extensions[e].score; best = e;
                        }
                    }
                    top_indices[k] = best;
                    extensions[best].score = -2e30;
                }

                Q4Beam new_beams[Q4_N_BEAMS];
                for (int k = 0; k < top_k; k++) {
                    int ei = top_indices[k];
                    int sb = extensions[ei].beam_idx;
                    int cand = extensions[ei].cand_idx;

                    int hist_idx = i * Q4_N_BEAMS + k;
                    history[hist_idx].cand_idx = cand;
                    history[hist_idx].parent_idx = beams[sb].history_idx;

                    new_beams[k].history_idx = hist_idx;
                    new_beams[k].acc_error = beams[sb].acc_error + cand_errors[blk][cand];
                }
                for (int k = 0; k < top_k; k++) beams[k] = new_beams[k];
                active_beams = top_k;
            }

            int curr_hist = beams[0].history_idx;
            for (int64_t i = graph_blocks - 1; i >= 0; i--) {
                int group_cidx;
                if (curr_hist >= 0) {
                    group_cidx = history[curr_hist].cand_idx;
                    curr_hist = history[curr_hist].parent_idx;
                } else {
                    group_cidx = 11;
                }

                if (stride <= 1) {
                    best_candidate[i] = group_cidx;
                } else {
                    /* Per-block local optimization within stride group.
                     * Beam picks the quhit bin; each block picks its best
                     * candidate in that bin from its own error landscape. */
                    int target_bin = Q4_CAND_TO_QUHIT[group_cidx];

                    for (int64_t b = i * stride; b < (i+1) * stride && b < n_blocks; b++) {
                        float best_err = 1e30f;
                        int best_c = group_cidx;
                        for (int c = 0; c < Q4_N_CAND; c++) {
                            if (Q4_CAND_TO_QUHIT[c] != target_bin) continue;
                            if (cand_errors[b][c] < best_err) {
                                best_err = cand_errors[b][c];
                                best_c = c;
                            }
                        }
                        /* Greedy override if global best is >5% better */
                        float global_best = 1e30f;
                        int global_best_c = group_cidx;
                        for (int c = 0; c < Q4_N_CAND; c++) {
                            if (cand_errors[b][c] < global_best) {
                                global_best = cand_errors[b][c];
                                global_best_c = c;
                            }
                        }
                        if (global_best < best_err * HEX_GREEDY_OVERRIDE_RATIO)
                            best_candidate[b] = global_best_c;
                        else
                            best_candidate[b] = best_c;
                    }
                }
            }
            free(history);

            /* ══════════════════════════════════════════════════════════════
             * Phase 3.5: Born-Rule Multi-Shot Scale Refinement
             *
             * The beam search found the MAP candidate sequence. But the
             * triality marginals encode quantum phase-coherent structure
             * that a greedy beam can miss.
             * ══════════════════════════════════════════════════════════════ */
            {
                #define Q4_BORN_SHOTS 128

                /* Build per-block CDFs from triality marginals */
                unsigned int born_rng = 314159;

                /* Compute tail error once (blocks beyond graph coverage) */
                float tail_err_q4 = 0.0f;
                for (int64_t bi = graph_blocks * stride; bi < n_blocks; bi++)
                    tail_err_q4 += cand_errors[bi][best_candidate[bi]];

                /* Beam-search baseline over the SAME set of blocks a Born
                 * shot covers: stride representatives + tail. The previous
                 * code summed the baseline over ALL blocks (including
                 * mid-stride blocks the shots never touch), making shot_err
                 * systematically smaller than the baseline and letting
                 * strictly worse configurations be adopted whenever
                 * stride > 1. */
                float beam_total_err = tail_err_q4;
                for (int64_t gi = 0; gi < graph_blocks; gi++) {
                    int64_t rep = gi * stride;
                    beam_total_err += cand_errors[rep][best_candidate[rep]];
                }

                /* Sparse shot buffer: only track stride-sampled blocks */
                int *shot_sparse_q4 = (int *)malloc(graph_blocks * sizeof(int));

                for (int shot = 0; shot < Q4_BORN_SHOTS; shot++) {
                    float shot_err = tail_err_q4;

                    for (int64_t gi = 0; gi < graph_blocks; gi++) {
                        /* Normalize marginals to CDF */
                        double m_total = 0.0;
                        for (int v = 0; v < 6; v++) m_total += marg[gi][v];

                        /* Born sample: CDF inversion (same as born_sample) */
                        born_rng = born_rng * 1664525u + 1013904223u;
                        double rnd = (double)(born_rng >> 8) / 16777216.0;
                        double target = rnd * m_total;
                        double cum = 0.0;
                        int sampled_qi = 5;
                        for (int v = 0; v < 6; v++) {
                            cum += marg[gi][v];
                            if (cum > target) { sampled_qi = v; break; }
                        }

                        /* Find the best candidate WITHIN this quhit bin */
                        int64_t blk = gi * stride;
                        float best_bin_err = 1e30f;
                        int best_bin_cand = 11; /* default */
                        for (int ci = 0; ci < Q4_N_CAND; ci++) {
                            if (Q4_CAND_TO_QUHIT[ci] == sampled_qi) {
                                if (cand_errors[blk][ci] < best_bin_err) {
                                    best_bin_err = cand_errors[blk][ci];
                                    best_bin_cand = ci;
                                }
                            }
                        }

                        shot_sparse_q4[gi] = best_bin_cand;
                        shot_err += cand_errors[blk][best_bin_cand];
                    }

                    /* Metropolis acceptance: adopt if better than current best */
                    if (shot_err < beam_total_err) {
                        for (int64_t gi = 0; gi < graph_blocks; gi++)
                            best_candidate[gi * stride] = shot_sparse_q4[gi];
                        beam_total_err = shot_err;
                    }
                }

                free(shot_sparse_q4);
            }

            /* Born refinement pass: non-stride blocks were set during beam
             * traceback and never revisited by Born shots.  For each such block
             * pick the lowest-error candidate within the same quhit bin that
             * the winning Born shot chose for its stride-representative. */
            if (stride > 1) {
                for (int64_t b = 0; b < n_blocks; b++) {
                    if (b % stride == 0) continue;
                    int64_t rep = (b / stride) * stride;
                    int target_bin = Q4_CAND_TO_QUHIT[best_candidate[rep]];
                    float best_b_err = 1e30f;
                    int  best_b_cand = best_candidate[rep];
                    for (int ci = 0; ci < Q4_N_CAND; ci++) {
                        if (Q4_CAND_TO_QUHIT[ci] != target_bin) continue;
                        if (cand_errors[b][ci] < best_b_err) {
                            best_b_err  = cand_errors[b][ci];
                            best_b_cand = ci;
                        }
                    }
                    best_candidate[b] = best_b_cand;
                }
            }

            free(marg);
            hpc_destroy(graph);
        }
    }

    /* Fallback when the HPC graph never ran (single block, or hpc_create
     * failure): pick the per-block argmin over the candidate grid instead
     * of silently leaving every block on the neutral ×1.00 candidate. */
    if (!hpc_ran_q4) {
        #pragma omp parallel for schedule(static)
        for (int64_t blk = 0; blk < n_blocks; blk++) {
            float best_e = cand_errors[blk][0];
            int   best_c = 0;
            for (int c = 1; c < Q4_N_CAND; c++) {
                if (cand_errors[blk][c] < best_e) {
                    best_e = cand_errors[blk][c];
                    best_c = c;
                }
            }
            best_candidate[blk] = best_c;
        }
    }

    /* ══════════════════════════════════════════════════════════════════
     * PHASE 4: Assemble blocks via least-squares scale extraction
     * ══════════════════════════════════════════════════════════════════ */

    #pragma omp parallel for schedule(dynamic, 64) reduction(+:total_err)
    for (int64_t blk = 0; blk < n_blocks; blk++) {
        const float *bw = weights + blk * QK4_0;
        int cidx = best_candidate[blk];

        /* Start from the grid-selected scale (the "assembled frequency") */
        float d_current = gguf_fp16_to_fp32(cand_d16[blk][cidx]);

        /* Analog assembly: iterate to full convergence. */
        for (int ls_iter = 0; ls_iter < 5; ls_iter++) {
            if (d_current < 1e-15f) break;
            float id = 1.0f / d_current;

            int qs_tmp[QK4_0];
            for (int j = 0; j < QK4_0; j++) {
                int q = (int)(bw[j] * id + 8.5f);
                if (q < 0) q = 0; if (q > 15) q = 15;
                qs_tmp[j] = q;
            }

            float num = 0.0f, den = 0.0f;
            float dc4S = 0.0f, dc4Q = 0.0f;
            for (int j = 0; j < QK4_0; j++) {
                float q_centered = (float)qs_tmp[j] - 8.0f;
                float w = (imat_importance) ?
                          imat_importance[blk * QK4_0 + j] : 1.0f;
                num += w * bw[j] * q_centered;
                den += w * q_centered * q_centered;
                dc4S += bw[j];
                dc4Q += q_centered;
            }
            num += (HEX_DC_LAMBDA / (float)QK4_0) * dc4S * dc4Q;
            den += (HEX_DC_LAMBDA / (float)QK4_0) * dc4Q * dc4Q;

            if (den > 1e-15f) {
                float d_new = num / den;
                float d_seed = gguf_fp16_to_fp32(cand_d16[blk][cidx]);
                if (fabsf(d_new) < 4.0f * (fabsf(d_seed) + 1e-10f)) {
                    uint16_t d16 = gguf_fp32_to_fp16(d_new);
                    d_current = gguf_fp16_to_fp32(d16);
                }
            }
        }

        /* ── FP16 ULP neighborhood search + sign-flip exploration ── */
        {
            uint16_t base_d16 = gguf_fp32_to_fp16(d_current);
            uint16_t best_d16 = base_d16;
            float best_ulp_err = 1e30f;

            /* Try ±8 ULP neighborhood + sign flip = up to 34 candidates */
            uint16_t ulp_candidates[35];
            int n_ulp = 0;
            for (int delta = -8; delta <= 8; delta++) {
                int cand16 = (int)base_d16 + delta;
                if (cand16 >= 0 && cand16 <= 0x7BFF)
                    ulp_candidates[n_ulp++] = (uint16_t)cand16;
            }
            {
                float neg_d = -d_current;
                uint16_t neg_d16 = gguf_fp32_to_fp16(neg_d);
                for (int delta = -8; delta <= 8; delta++) {
                    int cand16 = (int)neg_d16 + delta;
                    if (cand16 >= 0 && cand16 <= 0x7BFF)
                        ulp_candidates[n_ulp++] = (uint16_t)cand16;
                }
            }

            for (int ui = 0; ui < n_ulp; ui++) {
                float trial_d = gguf_fp16_to_fp32(ulp_candidates[ui]);
                float trial_id = (fabsf(trial_d) > 1e-15f) ? 1.0f / trial_d : 0.0f;
                float err = 0.0f;
                float e_ulp[QK4_0];
                for (int j = 0; j < QK4_0; j++) {
                    int q = (int)(bw[j] * trial_id + 8.5f);
                    if (q < 0) q = 0; if (q > 15) q = 15;
                    float deq = ((float)q - 8.0f) * trial_d;
                    float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
                    e_ulp[j] = bw[j] - deq;
                    err += e_ulp[j] * e_ulp[j] * w;
                }
                err += hex_spectral_penalty(e_ulp, QK4_0);
                if (err < best_ulp_err) {
                    best_ulp_err = err;
                    best_d16 = ulp_candidates[ui];
                }
            }
            d_current = gguf_fp16_to_fp32(best_d16);
        }

        output[blk].d = gguf_fp32_to_fp16(d_current);
        float actual_d = d_current;
        float id = (fabsf(actual_d) > 1e-15f) ? 1.0f / actual_d : 0.0f;

        /* ── D₆ Hadamard Error Shaping with Simulated Annealing ── */
        int q_base[QK4_0], q_shaped[QK4_0];
        float q_cont[QK4_0];
        for (int j = 0; j < QK4_0; j++) {
            q_cont[j] = bw[j] * id + 8.0f;
            q_base[j] = (int)(q_cont[j] + 0.5f);
            if (q_base[j] < 0) q_base[j] = 0;
            if (q_base[j] > 15) q_base[j] = 15;
        }
        memcpy(q_shaped, q_base, QK4_0 * sizeof(int));

        {
            float e_live[QK4_0];
            for (int j = 0; j < QK4_0; j++) {
                float deq = ((float)q_shaped[j] - 8.0f) * actual_d;
                e_live[j] = bw[j] - deq;
            }

            float v_live[QK4_0 / 2];
            float vesica_cur = 0.0f, dc_cur = 0.0f;
            for (int j = 0; j < QK4_0 / 2; j++) {
                v_live[j] = e_live[j] + e_live[j + QK4_0 / 2];
                vesica_cur += v_live[j] * v_live[j];
            }
            for (int j = 0; j < QK4_0; j++) dc_cur += e_live[j];
            float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur;

            /* Deterministic greedy descent: only strict improvements.
             * The previous SA acceptance called rand() inside an OpenMP
             * parallel region (data race in the shared PRNG state, and
             * non-reproducible output). Uphill moves were pointless anyway:
             * the base-vs-shaped MSE guard below discards any shaped result
             * that ends up worse, so accepted uphill excursions could only
             * waste the pass budget or strand the descent. */
            for (int pass = 0; pass < QK4_0; pass++) {
                int   best_k     = -1;
                int   best_q_alt = 0;
                float best_delta = 0.0f;   /* strictly positive threshold */

                for (int k = 0; k < QK4_0; k++) {
                    int q_cur = q_shaped[k];
                    int q_try = (q_cont[k] - (float)q_cur >= 0.0f)
                                ? q_cur + 1 : q_cur - 1;
                    if (q_try < 0 || q_try > 15) continue;

                    float deq_try = ((float)q_try - 8.0f) * actual_d;
                    float e_new   = bw[k] - deq_try;
                    float de      = e_new - e_live[k];

                    int pi = (k < QK4_0 / 2) ? k : k - QK4_0 / 2;
                    float v_old = v_live[pi];
                    float v_new = v_old + de;

                    float vesica_alt = vesica_cur - v_old * v_old + v_new * v_new;
                    float dc_alt     = dc_cur     + de;
                    float metric_alt = 4.0f * vesica_alt + dc_alt * dc_alt;

                    float delta = metric_cur - metric_alt;
                    if (delta > best_delta) {
                        best_delta = delta;
                        best_k     = k;
                        best_q_alt = q_try;
                    }
                }

                if (best_k < 0) break;   /* converged — no improving flip */

                q_shaped[best_k] = best_q_alt;
                {
                    float deq_commit = ((float)best_q_alt - 8.0f) * actual_d;
                    float e_new_commit = bw[best_k] - deq_commit;
                    float de_commit    = e_new_commit - e_live[best_k];

                    int pi_commit = (best_k < QK4_0 / 2) ? best_k : best_k - QK4_0 / 2;
                    float v_old_commit = v_live[pi_commit];
                    float v_new_commit = v_old_commit + de_commit;

                    vesica_cur += v_new_commit * v_new_commit - v_old_commit * v_old_commit;
                    dc_cur     += de_commit;
                    metric_cur  = 4.0f * vesica_cur + dc_cur * dc_cur;

                    v_live[pi_commit] = v_new_commit;
                    e_live[best_k]    = e_new_commit;
                }
            }
        }

        float err_base = 0.0f, err_shaped = 0.0f;
        float e_gb[QK4_0], e_gs[QK4_0];
        for (int j = 0; j < QK4_0; j++) {
            float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
            float deq_b = ((float)q_base[j] - 8.0f) * actual_d;
            float deq_s = ((float)q_shaped[j] - 8.0f) * actual_d;
            e_gb[j] = bw[j] - deq_b;
            e_gs[j] = bw[j] - deq_s;
            err_base += e_gb[j] * e_gb[j] * w;
            err_shaped += e_gs[j] * e_gs[j] * w;
        }
        err_base   += hex_spectral_penalty(e_gb, QK4_0);
        err_shaped += hex_spectral_penalty(e_gs, QK4_0);
        int *q_final = (err_shaped <= err_base) ? q_shaped : q_base;

        for (int j = 0; j < QK4_0 / 2; j++) {
            int q0 = q_final[j];
            int q1 = q_final[j + QK4_0/2];
            output[blk].qs[j] = (uint8_t)(q0 | (q1 << 4));

            float deq0 = ((float)q0 - 8.0f) * actual_d;
            float deq1 = ((float)q1 - 8.0f) * actual_d;
            total_err += (bw[j] - deq0) * (bw[j] - deq0) + (bw[j + QK4_0/2] - deq1) * (bw[j + QK4_0/2] - deq1);
        }
    }

    *out_total_error = total_err;
    free(greedy_d);
    free(cand_errors);
    free(cand_d16);
    free(best_candidate);
}

/* ════════════════════════════════════════════════════════════════════════
 * Q8_0 HPC QUANTIZER — Shor pipeline at 8 bits
 *
 * Same pipeline as Q4_0: WLS scale + tight candidate grid scored on the
 * extended objective (weighted SSE + DC + vesica/wave), triality-quhit
 * graph with Boltzmann-encoded candidate errors, CZ chain entanglement,
 * Shor Griffiths-Niu sequential measurement for bin consensus, greedy
 * override (HEX_GREEDY_OVERRIDE_RATIO), then per-block ULP polish, the
 * vesica/DC error-shaping descent with an extended-objective guard, and
 * the candidate floor. Intended for embedding / LM-head tensors (tied
 * embeddings especially), where 2-4 bit codes destroy logit precision.
 * At 8 bits the candidate grid is tight (±1.5%) — the win over naive
 * amax/127 rounding comes from WLS + ULP + spectral selection, not from
 * coarse scale exploration.
 * ════════════════════════════════════════════════════════════════════════ */

#ifndef QK8_0
#define QK8_0 32
#endif
typedef struct { uint16_t d; int8_t qs[QK8_0]; } hex_block_q8_0;

#define Q8_N_CAND 24
static const float Q8_NEIGHBOR_MULTS[Q8_N_CAND] = {
    0.9850f, 0.9865f, 0.9880f, 0.9895f, 0.9910f, 0.9925f,
    0.9940f, 0.9952f, 0.9964f, 0.9976f, 0.9988f, 1.0000f,
    1.0010f, 1.0020f, 1.0030f, 1.0040f, 1.0052f, 1.0064f,
    1.0076f, 1.0088f, 1.0100f, 1.0115f, 1.0130f, 1.0150f,
};
/* 24 candidates → 6 quhit states (4 per bin), same folding as Q4_0 */
static const int Q8_CAND_TO_QUHIT[Q8_N_CAND] = {
    0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3, 4,4,4,4, 5,5,5,5
};

static inline float q8_block_ext_err(const float *bw, const float *iw,
                                     float d, int8_t *qs_out)
{
    float e_arr[QK8_0];
    float id = (fabsf(d) > 1e-20f) ? 1.0f / d : 0.0f;
    float err = 0.0f;
    for (int j = 0; j < QK8_0; j++) {
        int q = gguf_nearest_int(bw[j] * id);
        if (q < -127) q = -127; if (q > 127) q = 127;
        if (qs_out) qs_out[j] = (int8_t)q;
        float e = bw[j] - (float)q * d;
        e_arr[j] = e;
        float w = iw ? iw[j] : 1.0f;
        err += e * e * w;
    }
    return err + hex_spectral_penalty(e_arr, QK8_0);
}

static void quantize_tensor_q8_0_hpc(const float *weights, int64_t n_elements,
                                     hex_block_q8_0 *output,
                                     float *out_total_error,
                                     const float *imat_importance, int verbose)
{
    int64_t n_blocks = n_elements / QK8_0;
    float total_err = 0.0f;
    (void)verbose;

    float (*cand_errors)[Q8_N_CAND] = (float (*)[Q8_N_CAND])
        calloc(n_blocks, sizeof(float[Q8_N_CAND]));
    uint16_t (*cand_d16)[Q8_N_CAND] = (uint16_t (*)[Q8_N_CAND])
        calloc(n_blocks, sizeof(uint16_t[Q8_N_CAND]));
    int *best_candidate = (int *)malloc(n_blocks * sizeof(int));
    if (!cand_errors || !cand_d16 || !best_candidate) {
        free(cand_errors); free(cand_d16); free(best_candidate);
        if (out_total_error) *out_total_error = -1.0f;
        return;
    }

    /* ── Phase 1+2: WLS-refined scale + tight candidate grid ── */
    #pragma omp parallel for schedule(dynamic, 256)
    for (int64_t blk = 0; blk < n_blocks; blk++) {
        const float *bw = weights + blk * QK8_0;
        const float *iw = imat_importance ? imat_importance + blk * QK8_0 : NULL;

        float amax = 0.0f;
        for (int j = 0; j < QK8_0; j++) {
            float av = fabsf(bw[j]);
            if (av > amax) amax = av;
        }
        float wls_d = amax / 127.0f;

        /* ggml-style fixed-point WLS with DC rank-1 augmentation */
        for (int it = 0; it < 3 && wls_d > 1e-20f; it++) {
            float inv_d = 1.0f / wls_d;
            float num = 0.0f, den = 0.0f, dcS = 0.0f, dcQ = 0.0f;
            for (int j = 0; j < QK8_0; j++) {
                int q = gguf_nearest_int(bw[j] * inv_d);
                if (q < -127) q = -127; if (q > 127) q = 127;
                float qf = (float)q;
                float w  = iw ? iw[j] : 1.0f;
                num += w * bw[j] * qf;
                den += w * qf * qf;
                dcS += bw[j];
                dcQ += qf;
            }
            num += (HEX_DC_LAMBDA / (float)QK8_0) * dcS * dcQ;
            den += (HEX_DC_LAMBDA / (float)QK8_0) * dcQ * dcQ;
            if (den > 1e-15f) {
                float d_new = num / den;
                if (d_new > 1e-20f) wls_d = d_new;
            }
        }

        for (int ci = 0; ci < Q8_N_CAND; ci++) {
            float    trial_d  = wls_d * Q8_NEIGHBOR_MULTS[ci];
            uint16_t d16      = gguf_fp32_to_fp16(trial_d);
            float    actual_d = gguf_fp16_to_fp32(d16);
            cand_d16  [blk][ci] = d16;
            cand_errors[blk][ci] = q8_block_ext_err(bw, iw, actual_d, NULL);
        }
        best_candidate[blk] = 11;   /* ×1.0000 neutral seed */
    }

    /* ── Phase 3: Shor graph — triality quhits, CZ chain, GN measurement ── */
    int shor_ran = 0;
    if (n_blocks >= 2) {
        int64_t graph_blocks = (n_blocks > 200) ? 200 : n_blocks;
        int64_t stride = n_blocks / graph_blocks;

        HPCGraph *graph = hpc_create(graph_blocks);
        if (graph) {
            shor_ran = 1;

            /* Adaptive temperature from the candidate-error landscape */
            float temperature = 1e-10f;
            {
                double err_accum = 0.0;
                int err_count = 0;
                for (int64_t gi = 0; gi < graph_blocks && gi < 100; gi++) {
                    int64_t blk = gi * stride;
                    float max_e = 0.0f;
                    for (int c = 0; c < Q8_N_CAND; c++)
                        if (cand_errors[blk][c] > max_e)
                            max_e = cand_errors[blk][c];
                    err_accum += (double)max_e;
                    err_count++;
                }
                if (err_count > 0) {
                    temperature = (float)(err_accum / err_count) * 0.1f;
                    if (temperature < 1e-10f) temperature = 1e-10f;
                }
            }

            /* Boltzmann-encode stride-aggregated candidate errors as
             * quhit amplitudes (24 candidates folded into 6 states) */
            for (int64_t i = 0; i < graph_blocks; i++) {
                float agg_errors[Q8_N_CAND];
                for (int c = 0; c < Q8_N_CAND; c++) agg_errors[c] = 0.0f;
                int64_t blk_start = i * stride;
                int64_t blk_end   = blk_start + stride;
                if (blk_end > n_blocks) blk_end = n_blocks;
                for (int64_t b = blk_start; b < blk_end; b++)
                    for (int c = 0; c < Q8_N_CAND; c++)
                        agg_errors[c] += cand_errors[b][c];
                float min_err = 1e30f;
                for (int c = 0; c < Q8_N_CAND; c++)
                    if (agg_errors[c] < min_err) min_err = agg_errors[c];

                double amp_re[6] = {0,0,0,0,0,0};
                double amp_norm = 0.0;
                for (int ci = 0; ci < Q8_N_CAND; ci++)
                    amp_re[Q8_CAND_TO_QUHIT[ci]] +=
                        exp(-(double)(agg_errors[ci] - min_err) /
                             (2.0 * (double)temperature));
                for (int v = 0; v < 6; v++) amp_norm += amp_re[v] * amp_re[v];
                if (amp_norm > 1e-30) {
                    double inv = 1.0 / sqrt(amp_norm);
                    for (int v = 0; v < 6; v++) amp_re[v] *= inv;
                }
                for (int v = 0; v < 6; v++) {
                    graph->locals[i].edge_re[v] = amp_re[v];
                    graph->locals[i].edge_im[v] = 0.0;
                }
                graph->locals[i].primary = VIEW_EDGE;
                graph->locals[i].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED;
                graph->locals[i].delta_valid = 0;
                triality_update_mask(&graph->locals[i]);
            }

            for (int64_t i = 0; i < graph_blocks - 1; i++)
                hpc_cz(graph, i, i + 1);

            double (*marg)[6] = (double (*)[6])calloc(graph_blocks, sizeof(double[6]));
            int *measured = (int *)calloc(graph_blocks, sizeof(int));
            if (marg && measured) {
                shor_measure_graph(graph, graph_blocks, marg, measured, 1);

                /* Per-block selection: best candidate inside the Shor-
                 * measured bin, then greedy override against the global
                 * argmin — identical Step-F semantics to Q2_K/Q4_0. */
                for (int64_t i = 0; i < graph_blocks; i++) {
                    int bin = measured[i];
                    if (bin < 0 || bin > 5) {
                        double bm = -1.0; bin = 0;
                        for (int v = 0; v < 6; v++)
                            if (marg[i][v] > bm) { bm = marg[i][v]; bin = v; }
                    }
                    int64_t blk_start = i * stride;
                    int64_t blk_end   = blk_start + stride;
                    if (blk_end > n_blocks) blk_end = n_blocks;
                    for (int64_t b = blk_start; b < blk_end; b++) {
                        float bin_best = 1e30f; int bin_cand = -1;
                        float g_best   = 1e30f; int g_cand   = 0;
                        for (int c = 0; c < Q8_N_CAND; c++) {
                            float e = cand_errors[b][c];
                            if (e < g_best) { g_best = e; g_cand = c; }
                            if (Q8_CAND_TO_QUHIT[c] == bin && e < bin_best) {
                                bin_best = e; bin_cand = c;
                            }
                        }
                        int sel = (bin_cand >= 0) ? bin_cand : g_cand;
                        if (g_best < cand_errors[b][sel] * HEX_GREEDY_OVERRIDE_RATIO)
                            sel = g_cand;
                        best_candidate[b] = sel;
                    }
                }
            }
            free(marg); free(measured);
            hpc_destroy(graph);
        }
    }
    if (!shor_ran) {
        for (int64_t blk = 0; blk < n_blocks; blk++) {
            float g_best = cand_errors[blk][0]; int g_cand = 0;
            for (int c = 1; c < Q8_N_CAND; c++)
                if (cand_errors[blk][c] < g_best) {
                    g_best = cand_errors[blk][c]; g_cand = c;
                }
            best_candidate[blk] = g_cand;
        }
    }

    /* ── Phase 4: ULP polish + vesica/DC shaping guard + floor ── */
    #pragma omp parallel for schedule(dynamic, 256) reduction(+:total_err)
    for (int64_t blk = 0; blk < n_blocks; blk++) {
        const float *bw = weights + blk * QK8_0;
        const float *iw = imat_importance ? imat_importance + blk * QK8_0 : NULL;
        int cidx = best_candidate[blk];

        uint16_t best_d16 = cand_d16[blk][cidx];
        float    best_err = cand_errors[blk][cidx];

        /* ±8 fp16 ULP joint search on the extended objective */
        for (int du = -8; du <= 8; du++) {
            if (du == 0) continue;
            int c16 = (int)cand_d16[blk][cidx] + du;
            if (c16 <= 0 || c16 > 0x7BFF) continue;
            float td  = gguf_fp16_to_fp32((uint16_t)c16);
            float err = q8_block_ext_err(bw, iw, td, NULL);
            if (err < best_err) { best_err = err; best_d16 = (uint16_t)c16; }
        }

        /* Candidate floor: final ≤ best raw grid candidate (by construction
         * the ULP search already starts from it, so this is implicit). */
        float  d = gguf_fp16_to_fp32(best_d16);
        int8_t qs[QK8_0];
        (void)q8_block_ext_err(bw, iw, d, qs);

        /* Vesica/DC greedy shaping with extended-objective guard */
        {
            int8_t qs_shaped[QK8_0];
            memcpy(qs_shaped, qs, QK8_0);
            float e_live[QK8_0], v_live[QK8_0 / 2];
            float vesica_cur = 0.0f, dc_cur = 0.0f;
            for (int k = 0; k < QK8_0; k++)
                e_live[k] = bw[k] - (float)qs_shaped[k] * d;
            for (int p = 0; p < QK8_0 / 2; p++) {
                v_live[p] = e_live[p] + e_live[p + QK8_0 / 2];
                vesica_cur += v_live[p] * v_live[p];
                dc_cur     += v_live[p];
            }
            float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur;
            for (int pass = 0; pass < QK8_0; pass++) {
                int best_k = -1, best_q_alt = 0;
                float best_delta = 0.0f;
                for (int k = 0; k < QK8_0; k++) {
                    int q_try = (e_live[k] >= 0.0f) ? qs_shaped[k] + 1
                                                    : qs_shaped[k] - 1;
                    if (q_try < -127 || q_try > 127) continue;
                    float e_new = bw[k] - (float)q_try * d;
                    float de    = e_new - e_live[k];
                    int   pi    = (k < QK8_0 / 2) ? k : k - QK8_0 / 2;
                    float v_new = v_live[pi] + de;
                    float ves_a = vesica_cur - v_live[pi] * v_live[pi]
                                             + v_new * v_new;
                    float dc_a  = dc_cur + de;
                    float delta = metric_cur - (4.0f * ves_a + dc_a * dc_a);
                    if (delta > best_delta) {
                        best_delta = delta; best_k = k; best_q_alt = q_try;
                    }
                }
                if (best_k < 0) break;
                {
                    float e_new = bw[best_k] - (float)best_q_alt * d;
                    float de    = e_new - e_live[best_k];
                    int   pi    = (best_k < QK8_0 / 2) ? best_k
                                                       : best_k - QK8_0 / 2;
                    float v_new = v_live[pi] + de;
                    vesica_cur += v_new * v_new - v_live[pi] * v_live[pi];
                    dc_cur     += de;
                    metric_cur  = 4.0f * vesica_cur + dc_cur * dc_cur;
                    v_live[pi]      = v_new;
                    e_live[best_k]  = e_new;
                    qs_shaped[best_k] = (int8_t)best_q_alt;
                }
            }
            /* Guard on the extended objective vs originals */
            float e_b[QK8_0], e_s[QK8_0];
            float err_b = 0.0f, err_s = 0.0f;
            for (int k = 0; k < QK8_0; k++) {
                float w = iw ? iw[k] : 1.0f;
                e_b[k] = bw[k] - (float)qs[k]        * d;
                e_s[k] = bw[k] - (float)qs_shaped[k] * d;
                err_b += e_b[k] * e_b[k] * w;
                err_s += e_s[k] * e_s[k] * w;
            }
            err_b += hex_spectral_penalty(e_b, QK8_0);
            err_s += hex_spectral_penalty(e_s, QK8_0);
            if (err_s < err_b) memcpy(qs, qs_shaped, QK8_0);
        }

        output[blk].d = best_d16;
        for (int k = 0; k < QK8_0; k++) {
            output[blk].qs[k] = qs[k];
            float e = bw[k] - (float)qs[k] * d;
            total_err += e * e;          /* pure reconstruction SSE report */
        }
    }

    free(cand_errors);
    free(cand_d16);
    free(best_candidate);
    if (out_total_error) *out_total_error = total_err;
}


/* Re-derive the 4-bit sub-scale codes (Ls, Lm) for a candidate (d, dmin)
 * pair from the Phase-1 float scales/mins. Bit-identical to the Phase-2b
 * candidate generation, so stored codes are unnecessary. */
static inline void hex_derive_subscales(const float *scales, const float *mins,
                                        float actual_dm, float actual_mm,
                                        uint8_t *Ls, uint8_t *Lm)
{
    for (int j = 0; j < 16; j++) {
        if (actual_dm > 1e-15f) {
            int ls = gguf_nearest_int(scales[j] / actual_dm);
            if (ls < 0) ls = 0; if (ls > 15) ls = 15;
            Ls[j] = (uint8_t)ls;
        } else { Ls[j] = 0; }
        if (actual_mm > 1e-15f) {
            int lm = gguf_nearest_int(mins[j] / actual_mm);
            if (lm < 0) lm = 0; if (lm > 15) lm = 15;
            Lm[j] = (uint8_t)lm;
        } else { Lm[j] = 0; }
    }
}

static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                                      BlockQ2K *output, float *out_total_error,
                                      OptimizerMode opt_mode,
                                      const float *imat_importance,
                                      int verbose)
{
    int64_t n_blocks = n_elements / QK_K;
    float total_err = 0.0f;
    const int N_SUB = QK_K / 16;

    /* ── Outlier Clamping for WLS Seeds ──
     * Protects the Phase 1 greedy seed from being violently warped by extreme
     * >4.0 sigma outliers, which creates better centering for the grid search. */
    double t_sum_sq = 0.0, t_sum_4 = 0.0;
    for (int64_t i = 0; i < n_elements; i++) {
        double w2 = (double)weights[i] * (double)weights[i];
        t_sum_sq += w2;
        t_sum_4  += w2 * w2;
    }
    float w_sigma = sqrtf((float)(t_sum_sq / (double)n_elements));

    /* ── Adaptive outlier clamp (kurtosis-driven) ──
     * The fixed 3.5σ clamp suppressed the heavy-tail mass that dominates
     * reconstruction error, inflating RMSE on near-Gaussian tensors that did
     * not need clamping at all. Instead, gate the clamp on the tensor's raw
     * kurtosis (Gaussian = 3): leave near-Gaussian tensors untouched and only
     * apply a stabilising clamp to genuinely heavy-tailed tensors, where the
     * final (d, dmin) refit later recovers fidelity against the UNCLIPPED
     * weights anyway. */
    double t_var  = t_sum_sq / (double)n_elements;
    double t_kurt = (t_var > 1e-30) ? (t_sum_4 / (double)n_elements) / (t_var * t_var) : 3.0;
    float clamp_sigma;
    if      (t_kurt <= 6.0)  clamp_sigma = 1.0e9f;   /* ~Gaussian: effectively no clamp */
    else if (t_kurt <= 20.0) clamp_sigma = 6.0f;     /* moderately heavy tails          */
    else                     clamp_sigma = 4.0f;     /* very heavy tails: stabilise seed */
    float clamp_val = w_sigma * clamp_sigma;

    /* ══════════════════════════════════════════════════════════════════
     * PHASE 1: Greedy quantization — produce seed (d, dmin) per block
     * ══════════════════════════════════════════════════════════════════ */

    typedef struct {
        float dm, mm;
        uint16_t d_fp16, dmin_fp16;
        uint8_t Ls[16], Lm[16];
        float scales[16], mins[16], sw[16];
    } BlockSeed;

    BlockSeed *seeds = (BlockSeed *)calloc(n_blocks, sizeof(BlockSeed));

    #pragma omp parallel for schedule(dynamic, 64)
    for (int64_t blk = 0; blk < n_blocks; blk++) {
        const float *block_x = weights + blk * QK_K;
        uint8_t L[QK_K], Laux[16];
        float wt[16];

        float sumx2 = 0;
        for (int i = 0; i < QK_K; i++) sumx2 += block_x[i] * block_x[i];
        float sigma2 = sumx2 / (float)QK_K;

        /* Phase 1 WLS uses clamped values to generate stable seeds */
        float sx_clipped[16];
        for (int j = 0; j < N_SUB; j++) {
            const float *sx = block_x + 16 * j;
            seeds[blk].sw[j] = 0;
            for (int l = 0; l < 16; l++) {
                float imp = (imat_importance) ? imat_importance[blk * QK_K + 16 * j + l] : 1.0f;
                float v = sx[l];
                if (v > clamp_val) v = clamp_val;
                if (v < -clamp_val) v = -clamp_val;
                sx_clipped[l] = v;
                /* Activation-aware weighting: an imatrix entry already encodes
                 * E[a^2] for that column, which is the correct weight for
                 * minimising output (dot-product) error. Use it directly rather
                 * than re-multiplying by the |w| magnitude heuristic, which
                 * double-counts magnitude. Without an imatrix, fall back to the
                 * magnitude-relative heuristic. */
                wt[l] = (imat_importance)
                        ? imp
                        : sqrtf(sigma2 + sx_clipped[l] * sx_clipped[l]);
                seeds[blk].sw[j] += wt[l];
            }
            seeds[blk].scales[j] = hpc_make_qkx2_quants(16, 3, sx_clipped, wt,
                                        L + 16 * j, &seeds[blk].mins[j], Laux);
        }

        seeds[blk].dm = hpc_make_qp_quants(N_SUB, 15, seeds[blk].scales,
                                              seeds[blk].Ls, seeds[blk].sw);
        seeds[blk].mm = hpc_make_qp_quants(N_SUB, 15, seeds[blk].mins,
                                              seeds[blk].Lm, seeds[blk].sw);
        seeds[blk].d_fp16 = gguf_fp32_to_fp16(seeds[blk].dm);
        seeds[blk].dmin_fp16 = gguf_fp32_to_fp16(seeds[blk].mm);
    }

    /* ══════════════════════════════════════════════════════════════════
     * PHASE 2: WLS-Optimal Candidate Generation
     * ══════════════════════════════════════════════════════════════════ */

    /* Expanded neighborhood around WLS optimum: ±30% with 24 candidates */
    /* d is the sensitive axis, so concentrate resolution near 1.0 while
     * keeping wide tails for blocks whose WLS seed is off. 1.000 stays at
     * index 11 so the neutral-candidate fallback/init remains valid. */
    static const float NEIGHBOR_MULTS_D[N_CAND_D] = {
        0.780f, 0.835f, 0.880f, 0.915f, 0.943f, 0.963f,
        0.978f, 0.988f, 0.994f, 0.997f, 0.999f, 1.000f,
        1.002f, 1.005f, 1.011f, 1.021f, 1.035f, 1.054f,
        1.080f, 1.115f, 1.160f, 1.215f, 1.275f, 1.340f
    };
    static const float NEIGHBOR_MULTS_M[N_CAND_M] = {
        0.750f, 0.800f, 0.840f, 0.870f, 0.900f, 0.920f,
        0.940f, 0.955f, 0.970f, 0.985f, 0.995f, 1.000f,
        1.005f, 1.015f, 1.030f, 1.045f, 1.060f, 1.080f,
        1.100f, 1.130f, 1.160f, 1.200f, 1.250f, 1.300f
    };
    /* Map 24 candidates → 6 quhit states for BP encoding */
    static const int CAND_TO_QUHIT[24] = {
        0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
        3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
    };

    float (*candidate_errors)[TOTAL_SCALE_CANDIDATES] = NULL;
    uint16_t (*candidate_d)[TOTAL_SCALE_CANDIDATES] = NULL;
    uint16_t (*candidate_dmin)[TOTAL_SCALE_CANDIDATES] = NULL;

    candidate_errors = (float (*)[TOTAL_SCALE_CANDIDATES])calloc(n_blocks,
                            sizeof(float[TOTAL_SCALE_CANDIDATES]));
    candidate_d = (uint16_t (*)[TOTAL_SCALE_CANDIDATES])calloc(n_blocks,
                            sizeof(uint16_t[TOTAL_SCALE_CANDIDATES]));
    candidate_dmin = (uint16_t (*)[TOTAL_SCALE_CANDIDATES])calloc(n_blocks,
                            sizeof(uint16_t[TOTAL_SCALE_CANDIDATES]));
    /* NOTE: the per-candidate sub-scale codes (Ls/Lm) are NOT stored.
     * They are a pure function of (seeds[blk].scales/mins, candidate fp16
     * d/dmin) and are re-derived where needed. Storing them cost
     * n_blocks × 576 × 16 × 2 bytes ≈ 18 KB/superblock — multiple GB of
     * peak RSS on large FFN tensors — for data used at exactly one index. */

    #pragma omp parallel for schedule(dynamic, 16)
    for (int64_t blk = 0; blk < n_blocks; blk++) {
        const float *block_x = weights + blk * QK_K;

        /* ── Step 2a: WLS solve to find optimal (d*, dmin*) ── */
        float wls_dm = seeds[blk].dm;
        float wls_mm = seeds[blk].mm;
        uint8_t wls_Ls[16], wls_Lm[16];
        memcpy(wls_Ls, seeds[blk].Ls, 16);
        memcpy(wls_Lm, seeds[blk].Lm, 16);

        /* Generate soft-clipped buffer for WLS internal stability */
        float clipped_block_x[QK_K];
        for(int i=0; i<QK_K; i++) {
            float v = block_x[i];
            if (v > clamp_val) v = clamp_val;
            if (v < -clamp_val) v = -clamp_val;
            clipped_block_x[i] = v;
        }

        for (int ls_iter = 0; ls_iter < 5; ls_iter++) {
            uint8_t L_wls[QK_K];
            for (int j = 0; j < N_SUB; j++) {
                float d_sub = wls_dm * (float)wls_Ls[j];
                float m_sub = wls_mm * (float)wls_Lm[j];
                if (d_sub < 1e-15f) {
                    for (int k = 0; k < 16; k++) L_wls[16*j+k] = 0;
                    continue;
                }
                for (int k = 0; k < 16; k++) {
                    int q = gguf_nearest_int((clipped_block_x[16*j+k] + m_sub) / d_sub);
                    if (q < 0) q = 0; if (q > 3) q = 3;
                    L_wls[16*j+k] = (uint8_t)q;
                }
            }

            double Saa = 0, Sab = 0, Sbb = 0, Sxa = 0, Sxb = 0;
            for (int j = 0; j < N_SUB; j++) {
                float ls_f = (float)wls_Ls[j];
                float lm_f = (float)wls_Lm[j];
                for (int k = 0; k < 16; k++) {
                    float x = clipped_block_x[16*j+k];
                    float w = (imat_importance) ?
                              imat_importance[blk * QK_K + 16*j+k] : 1.0f;
                    float a = ls_f * (float)L_wls[16*j+k];
                    float b = lm_f;
                    Saa += w * a * a;
                    Sab += w * a * b;
                    Sbb += w * b * b;
                    Sxa += w * x * a;
                    Sxb += w * x * b;
                }
            }

            double det = Saa * Sbb - Sab * Sab;
            if (fabs(det) > 1e-30) {
                double d_new  = (Sbb * Sxa - Sab * Sxb) / det;
                double dm_new = (Sab * Sxa - Saa * Sxb) / det;
                if (d_new > 0.0 && d_new < 4.0 * (seeds[blk].dm + 1e-10))
                    wls_dm = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)d_new));
                if (dm_new > 0.0 && dm_new < 4.0 * (seeds[blk].mm + 1e-10))
                    wls_mm = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)dm_new));
            }

            for (int j = 0; j < N_SUB; j++) {
                if (wls_dm > 1e-15f) {
                    int ls = gguf_nearest_int(seeds[blk].scales[j] / wls_dm);
                    if (ls < 0) ls = 0; if (ls > 15) ls = 15;
                    wls_Ls[j] = (uint8_t)ls;
                } else { wls_Ls[j] = 0; }
                if (wls_mm > 1e-15f) {
                    int lm = gguf_nearest_int(seeds[blk].mins[j] / wls_mm);
                    if (lm < 0) lm = 0; if (lm > 15) lm = 15;
                    wls_Lm[j] = (uint8_t)lm;
                } else { wls_Lm[j] = 0; }
            }
        }

        /* ── Step 2b: Generate Candidates ── */
        for (int di = 0; di < N_CAND_D; di++) {
            float trial_dm = wls_dm * NEIGHBOR_MULTS_D[di];
            uint16_t trial_d16 = gguf_fp32_to_fp16(trial_dm);
            float actual_dm = gguf_fp16_to_fp32(trial_d16);

            for (int mi = 0; mi < N_CAND_M; mi++) {
                int cidx = di * N_CAND_M + mi;
                float trial_mm = wls_mm * NEIGHBOR_MULTS_M[mi];
                uint16_t trial_dmin16 = gguf_fp32_to_fp16(trial_mm);
                float actual_mm = gguf_fp16_to_fp32(trial_dmin16);

                candidate_d[blk][cidx] = trial_d16;
                candidate_dmin[blk][cidx] = trial_dmin16;

                uint8_t trial_Ls[16], trial_Lm[16];
                for (int j = 0; j < N_SUB; j++) {
                    if (actual_dm > 1e-15f) {
                        int ls = gguf_nearest_int(seeds[blk].scales[j] / actual_dm);
                        if (ls < 0) ls = 0; if (ls > 15) ls = 15;
                        trial_Ls[j] = (uint8_t)ls;
                    } else { trial_Ls[j] = 0; }
                    if (actual_mm > 1e-15f) {
                        int lm = gguf_nearest_int(seeds[blk].mins[j] / actual_mm);
                        if (lm < 0) lm = 0; if (lm > 15) lm = 15;
                        trial_Lm[j] = (uint8_t)lm;
                    } else { trial_Lm[j] = 0; }
                }

                /* Error evaluation MUST use the non-clipped original weights.
                 * Exact importance-weighted SSE — the same objective the
                 * assembly/polish phases minimise and the reported RMSE. */
                float err = 0.0f;
                float e_arr[QK_K];
                for (int i = 0; i < QK_K; i++) {
                    int jj   = i >> 4;
                    float d  = actual_dm * (float)trial_Ls[jj];
                    float m  = actual_mm * (float)trial_Lm[jj];
                    float x  = block_x[i];
                    float w  = (imat_importance) ? imat_importance[blk * QK_K + i] : 1.0f;
                    float e;
                    if (d < 1e-15f) {
                        /* Decoder semantics: deq = d·ls·q − dmin·lm = −m here */
                        e = x + m;
                    } else {
                        int q = gguf_nearest_int((x + m) / d);
                        if (q < 0) q = 0; if (q > 3) q = 3;
                        e = x - (d * (float)q - m);
                    }
                    e_arr[i] = e;
                    err += e * e * w;
                }
                candidate_errors[blk][cidx] =
                    err + hex_spectral_penalty(e_arr, QK_K);
            }
        }
    }

    /* ══════════════════════════════════════════════════════════════════
     * PHASE 3: HPC Graph — Shor's Griffiths-Niu Measurement
     * ══════════════════════════════════════════════════════════════════ */

    int *best_candidate = (int *)malloc(n_blocks * sizeof(int));
    for (int64_t i = 0; i < n_blocks; i++)
        best_candidate[i] = 11 * N_CAND_M + 11;  /* index 11 = 1.0 multiplier */

    if (opt_mode != OPT_MSE && n_blocks >= 2) {
        int64_t graph_blocks = (n_blocks > 2000) ? 2000 : n_blocks;
        int64_t stride = n_blocks / graph_blocks;
        float temperature = 0.5f;
        int64_t n_sites = graph_blocks * QUHITS_PER_BLOCK;

        HPCGraph *graph = hpc_create(n_sites);
        if (graph) {
            for (int64_t i = 0; i < n_sites; i++)
                triality_dft(&graph->locals[i]);

            {
                double err_accum = 0.0;
                int err_count = 0;
                for (int64_t gi = 0; gi < graph_blocks && gi < 100; gi++) {
                    int64_t blk = gi * stride;
                    float max_e = 0.0f;
                    for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++)
                        if (candidate_errors[blk][c] > max_e)
                            max_e = candidate_errors[blk][c];
                    err_accum += (double)max_e;
                    err_count++;
                }
                if (err_count > 0) {
                    float median_err = (float)(err_accum / err_count);
                    temperature = median_err * 0.1f;
                    if (temperature < 1e-10f) temperature = 1e-10f;
                }
            }

            for (int64_t i = 0; i < graph_blocks; i++) {
                float agg_errors[TOTAL_SCALE_CANDIDATES];
                for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) agg_errors[c] = 0.0f;

                int64_t blk_start = i * stride;
                int64_t blk_end = blk_start + stride;
                if (blk_end > n_blocks) blk_end = n_blocks;
                int64_t group_size = blk_end - blk_start;

                for (int64_t b = blk_start; b < blk_end; b++) {
                    for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++)
                        agg_errors[c] += candidate_errors[b][c];
                }
                if (group_size > 1) {
                    float inv_gs = 1.0f / (float)group_size;
                    for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++)
                        agg_errors[c] *= inv_gs;
                }

                float min_err = 1e30f;
                for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++)
                    if (agg_errors[c] < min_err)
                        min_err = agg_errors[c];

                double coarse_re[6];
                double coarse_norm = 0.0;
                for (int qi = 0; qi < 6; qi++) coarse_re[qi] = 0.0;
                for (int di = 0; di < N_CAND_D; di++) {
                    int qi = CAND_TO_QUHIT[di];
                    for (int mi = 0; mi < N_CAND_M; mi++) {
                        int cidx = di * N_CAND_M + mi;
                        coarse_re[qi] += exp(-(double)(agg_errors[cidx] - min_err) /
                                              (2.0 * (double)temperature));
                    }
                }
                for (int qi = 0; qi < 6; qi++) coarse_norm += coarse_re[qi] * coarse_re[qi];
                if (coarse_norm > 1e-30) {
                    double inv = 1.0 / sqrt(coarse_norm);
                    for (int v = 0; v < 6; v++) coarse_re[v] *= inv;
                }

                double fine_re[6];
                double fine_norm = 0.0;
                for (int qi = 0; qi < 6; qi++) fine_re[qi] = 0.0;
                for (int mi = 0; mi < N_CAND_M; mi++) {
                    int qi = CAND_TO_QUHIT[mi];
                    for (int di = 0; di < N_CAND_D; di++) {
                        int cidx = di * N_CAND_M + mi;
                        fine_re[qi] += exp(-(double)(agg_errors[cidx] - min_err) /
                                            (2.0 * (double)temperature));
                    }
                }
                for (int qi = 0; qi < 6; qi++) fine_norm += fine_re[qi] * fine_re[qi];
                if (fine_norm > 1e-30) {
                    double inv = 1.0 / sqrt(fine_norm);
                    for (int v = 0; v < 6; v++) fine_re[v] *= inv;
                }

                int64_t s0 = 2 * i, s1 = 2 * i + 1;
                for (int v = 0; v < 6; v++) {
                    graph->locals[s0].edge_re[v] = coarse_re[v];
                    graph->locals[s0].edge_im[v] = 0.0;
                    graph->locals[s1].edge_re[v] = fine_re[v];
                    graph->locals[s1].edge_im[v] = 0.0;
                }
                graph->locals[s0].primary = VIEW_EDGE;
                graph->locals[s0].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED;
                graph->locals[s0].delta_valid = 0;
                triality_update_mask(&graph->locals[s0]);
                graph->locals[s1].primary = VIEW_EDGE;
                graph->locals[s1].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED;
                graph->locals[s1].delta_valid = 0;
                triality_update_mask(&graph->locals[s1]);
            }

            for (int64_t i = 0; i < graph_blocks; i++) {
                hpc_cz(graph, 2 * i, 2 * i + 1);
                if (i + 1 < graph_blocks) {
                    hpc_cz(graph, 2 * i, 2 * (i + 1));
                    hpc_cz(graph, 2 * i + 1, 2 * (i + 1) + 1);
                }
            }

            double (*shor_marg)[6] = (double (*)[6])calloc(n_sites, sizeof(double[6]));
            int *shor_measured = (int *)calloc(n_sites, sizeof(int));

            shor_measure_graph(graph, n_sites, shor_marg, shor_measured, 1);

            double (*coarse_marg)[6] = (double (*)[6])calloc(graph_blocks, sizeof(double[6]));
            double (*fine_marg)[6]   = (double (*)[6])calloc(graph_blocks, sizeof(double[6]));

            for (int64_t i = 0; i < graph_blocks; i++) {
                for (int v = 0; v < 6; v++) {
                    coarse_marg[i][v] = shor_marg[2 * i][v];
                    fine_marg[i][v]   = shor_marg[2 * i + 1][v];
                }
            }

            free(shor_marg);
            free(shor_measured);

            /* ══════════════════════════════════════════════════════════════
             * PHASE 3 — DETERMINISTIC VITERBI DP
             *
             * Replaces the probabilistic beam-search + Born-rule Monte-Carlo
             * shots with an exact, fully-deterministic DP over the 36-state
             * Shor quhit space (6 coarse bins × 6 fine bins).
             *
             * For each graph block i and combined state s = qi_d*6 + qi_m:
             *
             *   bin_best_err[i][s]  = min candidate error in that (d,m)-bin
             *                         aggregated over the stride group
             *   bin_log_prior[i][s] = log P_coarse(qi_d) + log P_fine(qi_m)
             *                         from Shor marginals → HPC prior bonus
             *
             * Local Viterbi cost (lower = better):
             *   vcost[i][s] = bin_best_err[i][s]
             *               − VITERBI_BETA × scale_err × bin_log_prior[i][s]
             *
             * Transition cost (cross-block smoothness prior):
             *   trans(s′→s) = VITERBI_ALPHA × scale_err
             *                 × (|qi_d − qi_d′| + |qi_m − qi_m′|)
             *
             * DP recurrence:
             *   dp[0][s] = vcost[0][s]
             *   dp[i][s] = vcost[i][s] + min_{s′}(dp[i-1][s′] + trans(s′→s))
             *
             * Traceback yields the globally optimal sequence of bin choices,
             * which is then mapped to per-block best_candidate[] indices.
             * A 5%-threshold greedy override rescues blocks where the local
             * MSE-optimal candidate is meaningfully better than the bin winner.
             * ══════════════════════════════════════════════════════════════ */

            #define VIT_N_STATES  36      /* 6 coarse × 6 fine quhit bins        */
            #define VITERBI_BETA  0.25f   /* log-prior bonus weight               */
            #define VITERBI_ALPHA 0.08f   /* cross-block smoothness penalty weight */

            {
                int64_t vit_gi, vit_b;
                int     vit_s, vit_sp;

                /* Per-graph-block per-state workspace */
                float (*vit_bin_err )[VIT_N_STATES] =
                    (float (*)[VIT_N_STATES])malloc(graph_blocks * sizeof(float[VIT_N_STATES]));
                int   (*vit_bin_cand)[VIT_N_STATES] =
                    (int   (*)[VIT_N_STATES])malloc(graph_blocks * sizeof(int  [VIT_N_STATES]));
                float (*vit_log_pri )[VIT_N_STATES] =
                    (float (*)[VIT_N_STATES])malloc(graph_blocks * sizeof(float[VIT_N_STATES]));
                float (*vit_dp      )[VIT_N_STATES] =
                    (float (*)[VIT_N_STATES])malloc(graph_blocks * sizeof(float[VIT_N_STATES]));
                int   (*vit_back    )[VIT_N_STATES] =
                    (int   (*)[VIT_N_STATES])malloc(graph_blocks * sizeof(int  [VIT_N_STATES]));

                /* ── Step A: build per-block per-bin statistics ── */
                for (vit_gi = 0; vit_gi < graph_blocks; vit_gi++) {
                    double c_tot = 0.0, f_tot = 0.0;

                    for (vit_s = 0; vit_s < VIT_N_STATES; vit_s++) {
                        vit_bin_err [vit_gi][vit_s] = 1e30f;
                        vit_bin_cand[vit_gi][vit_s] = -1;
                    }

                    /* Best candidate per (qi_d, qi_m) bin over stride group */
                    for (vit_b = vit_gi * stride;
                         vit_b < (vit_gi + 1) * stride && vit_b < n_blocks;
                         vit_b++) {
                        int vit_c;
                        for (vit_c = 0; vit_c < TOTAL_SCALE_CANDIDATES; vit_c++) {
                            int qi_d = CAND_TO_QUHIT[vit_c / N_CAND_M];
                            int qi_m = CAND_TO_QUHIT[vit_c % N_CAND_M];
                            vit_s = qi_d * 6 + qi_m;
                            float e = candidate_errors[vit_b][vit_c];
                            if (e < vit_bin_err[vit_gi][vit_s]) {
                                vit_bin_err[vit_gi][vit_s] = e;
                                /* Canonical candidate = stride-rep block's best */
                                if (vit_b == vit_gi * stride)
                                    vit_bin_cand[vit_gi][vit_s] = vit_c;
                            }
                        }
                    }

                    /* HPC log-prior from Shor marginals */
                    for (int v = 0; v < 6; v++) {
                        c_tot += coarse_marg[vit_gi][v];
                        f_tot += fine_marg  [vit_gi][v];
                    }
                    for (vit_s = 0; vit_s < VIT_N_STATES; vit_s++) {
                        int qi_d = vit_s / 6, qi_m = vit_s % 6;
                        double pc = (c_tot > 1e-30)
                                    ? coarse_marg[vit_gi][qi_d] / c_tot : 1.0/6.0;
                        double pf = (f_tot > 1e-30)
                                    ? fine_marg  [vit_gi][qi_m] / f_tot : 1.0/6.0;
                        vit_log_pri[vit_gi][vit_s] =
                            (float)(log(pc + 1e-30) + log(pf + 1e-30));
                    }
                }

                /* ── Step B: scale_err normaliser for transition cost ── */
                float vit_scale_err = 0.0f;
                int   vit_scale_cnt = 0;
                for (vit_gi = 0; vit_gi < graph_blocks; vit_gi++) {
                    for (vit_s = 0; vit_s < VIT_N_STATES; vit_s++) {
                        if (vit_bin_err[vit_gi][vit_s] < 1e29f) {
                            vit_scale_err += vit_bin_err[vit_gi][vit_s];
                            vit_scale_cnt++;
                        }
                    }
                }
                vit_scale_err = (vit_scale_cnt > 0)
                                ? vit_scale_err / (float)vit_scale_cnt : 1e-10f;
                if (vit_scale_err < 1e-20f) vit_scale_err = 1e-20f;

                /* ── Step C: Forward Viterbi pass ── */

                /* Block 0 — no predecessor */
                for (vit_s = 0; vit_s < VIT_N_STATES; vit_s++) {
                    float local = (vit_bin_err[0][vit_s] < 1e29f)
                        ? vit_bin_err[0][vit_s]
                          - VITERBI_BETA * vit_scale_err * vit_log_pri[0][vit_s]
                        : 1e30f;
                    vit_dp  [0][vit_s] = local;
                    vit_back[0][vit_s] = -1;
                }

                /* Blocks 1..graph_blocks-1 */
                for (vit_gi = 1; vit_gi < graph_blocks; vit_gi++) {
                    for (vit_s = 0; vit_s < VIT_N_STATES; vit_s++) {
                        float local;
                        float best_pred = 1e30f;
                        int   best_sp   = 0;
                        int qi_d = vit_s / 6;
                        int qi_m = vit_s % 6;

                        if (vit_bin_err[vit_gi][vit_s] > 1e29f) {
                            vit_dp  [vit_gi][vit_s] = 1e30f;
                            vit_back[vit_gi][vit_s] = 0;
                            continue;
                        }
                        local = vit_bin_err[vit_gi][vit_s]
                              - VITERBI_BETA * vit_scale_err * vit_log_pri[vit_gi][vit_s];

                        /* Min-cost predecessor with Manhattan transition penalty */
                        for (vit_sp = 0; vit_sp < VIT_N_STATES; vit_sp++) {
                            float prev = vit_dp[vit_gi - 1][vit_sp];
                            if (prev > 1e29f) continue;
                            int td = abs(qi_d - (vit_sp / 6));
                            int tm = abs(qi_m - (vit_sp % 6));
                            float trans = VITERBI_ALPHA * vit_scale_err * (float)(td + tm);
                            float total = prev + trans;
                            if (total < best_pred) {
                                best_pred = total;
                                best_sp   = vit_sp;
                            }
                        }
                        vit_dp  [vit_gi][vit_s] = (best_pred < 1e29f)
                                                   ? best_pred + local : 1e30f;
                        vit_back[vit_gi][vit_s] = best_sp;
                    }
                }

                /* ── Step D: Traceback ── */
                int *vit_path = (int *)malloc(graph_blocks * sizeof(int));
                {
                    int   best_s = 0;
                    float best_f = vit_dp[graph_blocks - 1][0];
                    for (vit_s = 1; vit_s < VIT_N_STATES; vit_s++) {
                        if (vit_dp[graph_blocks - 1][vit_s] < best_f) {
                            best_f = vit_dp[graph_blocks - 1][vit_s];
                            best_s = vit_s;
                        }
                    }
                    vit_path[graph_blocks - 1] = best_s;
                    for (vit_gi = graph_blocks - 2; vit_gi >= 0; vit_gi--)
                        vit_path[vit_gi] = vit_back[vit_gi + 1][vit_path[vit_gi + 1]];
                }

                /* ── Step E: Map Viterbi path → best_candidate[] ── */
                for (vit_gi = 0; vit_gi < graph_blocks; vit_gi++) {
                    vit_s = vit_path[vit_gi];
                    int qi_d = vit_s / 6;
                    int qi_m = vit_s % 6;
                    int64_t blk_rep = vit_gi * stride;

                    /* Stride-representative block: use precomputed bin winner */
                    if (vit_bin_cand[vit_gi][vit_s] >= 0)
                        best_candidate[blk_rep] = vit_bin_cand[vit_gi][vit_s];

                    /* Non-representative blocks in the stride group */
                    for (vit_b = blk_rep + 1;
                         vit_b < (vit_gi + 1) * stride && vit_b < n_blocks;
                         vit_b++) {
                        int vit_c;
                        float best_e = 1e30f;
                        int   best_c = best_candidate[blk_rep];
                        for (vit_c = 0; vit_c < TOTAL_SCALE_CANDIDATES; vit_c++) {
                            if (CAND_TO_QUHIT[vit_c / N_CAND_M] != qi_d) continue;
                            if (CAND_TO_QUHIT[vit_c % N_CAND_M] != qi_m) continue;
                            if (candidate_errors[vit_b][vit_c] < best_e) {
                                best_e = candidate_errors[vit_b][vit_c];
                                best_c = vit_c;
                            }
                        }
                        best_candidate[vit_b] = best_c;
                    }
                }

                /* ── Step F: 5 % greedy override (pure MSE safety net) ── */
                for (vit_b = 0; vit_b < n_blocks; vit_b++) {
                    int vit_c;
                    float cur_err = candidate_errors[vit_b][best_candidate[vit_b]];
                    float g_best  = cur_err;
                    int   g_cand  = best_candidate[vit_b];
                    for (vit_c = 0; vit_c < TOTAL_SCALE_CANDIDATES; vit_c++) {
                        if (candidate_errors[vit_b][vit_c] < g_best) {
                            g_best = candidate_errors[vit_b][vit_c];
                            g_cand = vit_c;
                        }
                    }
                    if (g_best < cur_err * HEX_GREEDY_OVERRIDE_RATIO)
                        best_candidate[vit_b] = g_cand;
                }

                free(vit_path);
                free(vit_dp);
                free(vit_back);
                free(vit_bin_err);
                free(vit_bin_cand);
                free(vit_log_pri);
            }

            free(coarse_marg);
            free(fine_marg);
            hpc_destroy(graph);
        }
    } else {
        for (int64_t blk = 0; blk < n_blocks; blk++) {
            float best_err = candidate_errors[blk][0];
            int best_idx = 0;
            for (int c = 1; c < TOTAL_SCALE_CANDIDATES; c++) {
                if (candidate_errors[blk][c] < best_err) {
                    best_err = candidate_errors[blk][c];
                    best_idx = c;
                }
            }
            best_candidate[blk] = best_idx;
        }
    }

    /* ══════════════════════════════════════════════════════════════════
     * PHASE 3.9 — ROLLING DC BOUNDARY CONDITION PRE-PASS
     *
     * Transforms the tensor from a collection of isolated 256-element
     * Q2_K superblocks into a single, continuous error-cancelling waveform.
     *
     * After Phase 3 has selected the optimal (d, dmin) candidate for every
     * block, this sequential pass computes the net DC residual left by each
     * block using a cheap round-nearest forward quantization, then feeds the
     * negated, exponentially-decayed residual as a correction bias into the
     * WLS solver of the immediately following block.
     *
     * Mathematically, for block N with final DC residual R_N = Σ εᵢ:
     *
     *   dc_bias[N+1] = −DC_DECAY × R_N / QK_K      (per-element offset)
     *
     * Block N+1's WLS targets become x′ᵢ = xᵢ − dc_bias[N+1], steering the
     * quantizer toward codes whose reconstruction deq ≈ x′, so that
     *
     *   Σ (xᵢ − deqᵢ) ≈ dc_bias[N+1] × QK_K = −DC_DECAY × R_N
     *
     * The accumulated cross-block DC collapses geometrically:
     *
     *   R₀, DC_DECAY·R₀, DC_DECAY²·R₀, …  → 0
     *
     * The result is written into block_dc_bias[n_blocks].  Phase 4 reads
     * this array (safe: written sequentially before the parallel loop).
     * ══════════════════════════════════════════════════════════════════ */

    #define DC_DECAY 0.85f   /* Boundary-condition leak factor (0 = isolated, 1 = full) */

    float *block_dc_bias = (float *)calloc(n_blocks, sizeof(float));

    if (block_dc_bias) {
        float rolling_dc = 0.0f;

        for (int64_t blk = 0; blk < n_blocks; blk++) {
            const float *bx  = weights + blk * QK_K;
            int          cidx = best_candidate[blk];
            float dm0 = gguf_fp16_to_fp32(candidate_d   [blk][cidx]);
            float mm0 = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);

            uint8_t dc_Ls[16], dc_Lm[16];
            hex_derive_subscales(seeds[blk].scales, seeds[blk].mins,
                                 dm0, mm0, dc_Ls, dc_Lm);

            /* Bias applied to THIS block's WLS targets */
            float dc_bias       = (DC_DECAY * rolling_dc) / (float)QK_K;
            block_dc_bias[blk]  = dc_bias;

            /* Quick round-nearest quant to estimate DC residual for NEXT block.
             * We quantize the adjusted target x′ = x − dc_bias, then measure
             * the residual of the ORIGINAL weight against the chosen code. */
            float dc_res = 0.0f;
            int   j, k;
            for (j = 0; j < N_SUB; j++) {
                float d_sub = dm0 * (float)dc_Ls[j];
                float m_sub = mm0 * (float)dc_Lm[j];
                for (k = 0; k < 16; k++) {
                    float x_adj = bx[16*j + k] - dc_bias;
                    int q = 0;
                    if (d_sub >= 1e-15f) {
                        q = gguf_nearest_int((x_adj + m_sub) / d_sub);
                        if (q < 0) q = 0;
                        if (q > 3) q = 3;
                    }
                    float deq = d_sub * (float)q - m_sub;
                    /* Residual against ORIGINAL weight (not adjusted) */
                    dc_res += bx[16*j + k] - deq;
                }
            }
            rolling_dc = dc_res;
        }
    }

    /* ══════════════════════════════════════════════════════════════════
     * PHASE 4: Assemble blocks via least-squares (d, dmin) extraction
     * ══════════════════════════════════════════════════════════════════ */

    int _n_omp_threads = 1;
    #ifdef _OPENMP
    _n_omp_threads = omp_get_max_threads();
    #endif
    HPCGraph **_tl_graphs = (HPCGraph **)calloc(_n_omp_threads, sizeof(HPCGraph *));
    for (int _ti = 0; _ti < _n_omp_threads; _ti++)
        _tl_graphs[_ti] = hpc_create(N_SUB);

    #pragma omp parallel for schedule(dynamic, 64) reduction(+:total_err)
    for (int64_t blk = 0; blk < n_blocks; blk++) {
        const float *block_x = weights + blk * QK_K;
        int cidx = best_candidate[blk];
        uint8_t Ls_blk[16], Lm_blk[16];

        /* ── Rolling DC boundary condition ──────────────────────────────
         * dc_adj shifts every WLS target in this block so that the net
         * quantisation error steers toward cancelling the previous block's
         * DC residual (written by the sequential Phase 3.9 pre-pass). */
        float dc_adj = (block_dc_bias) ? block_dc_bias[blk] : 0.0f;

        /* Adjusted weight view — WLS and Shor work on this array;
         * the final error is always reported against the original block_x. */
        float adj_block_x[QK_K];
        {
            int _i;
            for (_i = 0; _i < QK_K; _i++)
                adj_block_x[_i] = block_x[_i] - dc_adj;
        }

        float dm = gguf_fp16_to_fp32(candidate_d[blk][cidx]);
        float mm = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);

        hex_derive_subscales(seeds[blk].scales, seeds[blk].mins,
                             dm, mm, Ls_blk, Lm_blk);

        uint16_t prev_dm16 = 0, prev_mm16 = 0;
        for (int ls_iter = 0; ls_iter < 5; ls_iter++) {

            uint8_t state_ls[N_SUB][6];
            uint8_t state_lm[N_SUB][6];
            float state_err[N_SUB][6];

            for (int j = 0; j < N_SUB; j++) {
                const float *sx = adj_block_x + 16 * j;
                for (int v = 0; v < 6; v++) state_err[j][v] = 1e30f;

                for (int try_ls = 0; try_ls <= 15; try_ls++) {
                    float d_sub = dm * (float)try_ls;
                    for (int try_lm = 0; try_lm <= 15; try_lm++) {
                        float m_sub = mm * (float)try_lm;
                        float sub_err = 0.0f;

                        for (int k = 0; k < 16; k++) {
                            float x = sx[k];
                            float w = (imat_importance) ?
                                      imat_importance[blk * QK_K + 16*j + k] : 1.0f;
                            int q = 0;
                            if (d_sub >= 1e-15f) {
                                q = gguf_nearest_int((x + m_sub) / d_sub);
                                if (q < 0) q = 0; if (q > 3) q = 3;
                            }
                            float deq = d_sub * (float)q - m_sub;
                            float diff = x - deq;
                            sub_err += diff * diff * w;
                        }

                        for (int v = 0; v < 6; v++) {
                            if (sub_err < state_err[j][v]) {
                                for (int u = 5; u > v; u--) {
                                    state_err[j][u] = state_err[j][u-1];
                                    state_ls[j][u] = state_ls[j][u-1];
                                    state_lm[j][u] = state_lm[j][u-1];
                                }
                                state_err[j][v] = sub_err;
                                state_ls[j][v] = (uint8_t)try_ls;
                                state_lm[j][v] = (uint8_t)try_lm;
                                break;
                            }
                        }
                    }
                }
            }

            int _tid = 0;
            #ifdef _OPENMP
            _tid = omp_get_thread_num();
            #endif
            HPCGraph *sg = _tl_graphs[_tid];
            hpc_reset_for_subblock(sg, N_SUB);
            {
                float min_sub_err[N_SUB];
                for (int j = 0; j < N_SUB; j++) min_sub_err[j] = state_err[j][0];

                for (int j = 0; j < N_SUB; j++) {
                    triality_dft(&sg->locals[j]);
                    double amp_re[6];
                    double amp_norm = 0.0;
                    for (int v = 0; v < 6; v++) {
                        float err_spread = state_err[j][5] - state_err[j][0];
                        float sub_temp = (err_spread > 1e-15f) ? err_spread * 0.3f : 0.1f;
                        if (sub_temp < 1e-12f) sub_temp = 1e-12f;
                        amp_re[v] = exp(-(double)(state_err[j][v] - min_sub_err[j]) / (double)sub_temp);
                        amp_norm += amp_re[v] * amp_re[v];
                    }
                    if (amp_norm > 1e-30) {
                        double inv = 1.0 / sqrt(amp_norm);
                        for (int v = 0; v < 6; v++) amp_re[v] *= inv;
                    }
                    for (int v = 0; v < 6; v++) {
                        sg->locals[j].edge_re[v] = amp_re[v];
                        sg->locals[j].edge_im[v] = 0.0;
                    }
                    sg->locals[j].primary = VIEW_EDGE;
                    sg->locals[j].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED;
                    sg->locals[j].delta_valid = 0;
                    triality_update_mask(&sg->locals[j]);
                }

                for (int j = 0; j < N_SUB - 1; j++)
                    hpc_cz(sg, j, j + 1);

                double sub_marg[N_SUB][6];
                int sub_measured[N_SUB];
                memset(sub_marg, 0, sizeof(sub_marg));
                memset(sub_measured, 0, sizeof(sub_measured));

                shor_measure_graph(sg, N_SUB, sub_marg, sub_measured, 1);

                for (int j = 0; j < N_SUB; j++) {
                    double best_prob = -1.0;
                    int best_v = 0;
                    for (int v = 0; v < 6; v++) {
                        if (sub_marg[j][v] > best_prob) {
                            best_prob = sub_marg[j][v];
                            best_v = v;
                        }
                    }
                    Ls_blk[j] = state_ls[j][best_v];
                    Lm_blk[j] = state_lm[j][best_v];
                }
            }

            uint8_t L[QK_K];
            for (int j = 0; j < N_SUB; j++) {
                float d_sub = dm * (float)Ls_blk[j];
                float m_sub = mm * (float)Lm_blk[j];
                if (d_sub < 1e-15f) {
                    for (int k = 0; k < 16; k++) L[16*j+k] = 0;
                    continue;
                }
                for (int k = 0; k < 16; k++) {
                    int q = gguf_nearest_int((adj_block_x[16*j+k] + m_sub) / d_sub);
                    if (q < 0) q = 0; if (q > 3) q = 3;
                    L[16*j+k] = (uint8_t)q;
                }
            }

            double Saa = 0, Sab = 0, Sbb = 0, Sxa = 0, Sxb = 0;
            for (int j = 0; j < N_SUB; j++) {
                float ls_f = (float)Ls_blk[j];
                float lm_f = (float)Lm_blk[j];
                for (int k = 0; k < 16; k++) {
                    float x = adj_block_x[16*j+k];
                    float w = (imat_importance) ?
                              imat_importance[blk * QK_K + 16*j+k] : 1.0f;
                    float a = ls_f * (float)L[16*j+k];
                    float b = lm_f;
                    Saa += w * a * a;
                    Sab += w * a * b;
                    Sbb += w * b * b;
                    Sxa += w * x * a;
                    Sxb += w * x * b;
                }
            }

            double det = Saa * Sbb - Sab * Sab;
            if (fabs(det) > 1e-30) {
                double d_new  = (Sbb * Sxa - Sab * Sxb) / det;
                double dm_new = (Sab * Sxa - Saa * Sxb) / det;
                float d_seed = gguf_fp16_to_fp32(candidate_d[blk][cidx]);
                float m_seed = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
                if (d_new > 0.0 && d_new < 4.0 * (d_seed + 1e-10))
                    dm = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)d_new));
                if (dm_new > 0.0 && dm_new < 4.0 * (m_seed + 1e-10))
                    mm = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)dm_new));
            }

            uint16_t cur_dm16 = gguf_fp32_to_fp16(dm);
            uint16_t cur_mm16 = gguf_fp32_to_fp16(mm);
            if (cur_dm16 == prev_dm16 && cur_mm16 == prev_mm16) break;
            prev_dm16 = cur_dm16;
            prev_mm16 = cur_mm16;
        }

        /* ── FP16 ULP neighborhood search for (d, dmin) — Expanded to ±8 ── */
        {
            uint16_t base_d16 = gguf_fp32_to_fp16(dm);
            uint16_t base_m16 = gguf_fp32_to_fp16(mm);
            uint16_t best_d16 = base_d16, best_m16 = base_m16;
            float best_ulp_err = 1e30f;

            for (int dd = -8; dd <= 8; dd++) {
                int cd16 = (int)base_d16 + dd;
                if (cd16 < 0 || cd16 > 0x7BFF) continue;
                float trial_dm = gguf_fp16_to_fp32((uint16_t)cd16);

                for (int dm_delta = -8; dm_delta <= 8; dm_delta++) {
                    int cm16 = (int)base_m16 + dm_delta;
                    if (cm16 < 0 || cm16 > 0x7BFF) continue;
                    float trial_mm = gguf_fp16_to_fp32((uint16_t)cm16);

                    float err = 0.0f;
                    for (int j = 0; j < N_SUB; j++) {
                        float d_sub = trial_dm * (float)Ls_blk[j];
                        float m_sub = trial_mm * (float)Lm_blk[j];
                        for (int k = 0; k < 16; k++) {
                            float x = adj_block_x[16*j+k];
                            float w = (imat_importance) ?
                                      imat_importance[blk * QK_K + 16*j+k] : 1.0f;
                            int q;
                            if (d_sub < 1e-15f) { q = 0; }
                            else {
                                q = gguf_nearest_int((x + m_sub) / d_sub);
                                if (q < 0) q = 0; if (q > 3) q = 3;
                            }
                            float deq = d_sub * (float)q - m_sub;
                            float diff = x - deq;
                            err += diff * diff * w;
                        }
                    }
                    if (err < best_ulp_err) {
                        best_ulp_err = err;
                        best_d16 = (uint16_t)cd16;
                        best_m16 = (uint16_t)cm16;
                    }
                }
            }
            dm = gguf_fp16_to_fp32(best_d16);
            mm = gguf_fp16_to_fp32(best_m16);
        }

        for (int j = 0; j < N_SUB; j++) {
            const float *sx = adj_block_x + 16 * j;
            float best_sub_err = 1e30f;
            uint8_t best_ls = Ls_blk[j], best_lm = Lm_blk[j];
            for (int try_ls = 0; try_ls <= 15; try_ls++) {
                float d_sub = dm * (float)try_ls;
                for (int try_lm = 0; try_lm <= 15; try_lm++) {
                    float m_sub = mm * (float)try_lm;
                    float sub_err = 0.0f;
                    for (int k = 0; k < 16; k++) {
                        float x = sx[k];
                        float w = (imat_importance) ?
                                  imat_importance[blk * QK_K + 16*j + k] : 1.0f;
                        int q;
                        if (d_sub < 1e-15f) { q = 0; }
                        else {
                            q = gguf_nearest_int((x + m_sub) / d_sub);
                            if (q < 0) q = 0; if (q > 3) q = 3;
                        }
                        float deq = d_sub * (float)q - m_sub;
                        float diff = x - deq;
                        sub_err += diff * diff * w;
                    }
                    if (sub_err < best_sub_err) {
                        best_sub_err = sub_err;
                        best_ls = (uint8_t)try_ls;
                        best_lm = (uint8_t)try_lm;
                    }
                }
            }
            Ls_blk[j] = best_ls;
            Lm_blk[j] = best_lm;
        }

        output[blk].d    = gguf_fp32_to_fp16(dm);
        output[blk].dmin = gguf_fp32_to_fp16(mm);

        for (int j = 0; j < N_SUB; j++)
            output[blk].scales[j] = Ls_blk[j] | (Lm_blk[j] << 4);

        /* ── Final quantization: D₆ Hadamard Greedy Descent (deterministic) ──
         *
         * The original Simulated Annealing acceptance rule is replaced by a
         * strict greedy descent: only accept a flip if it strictly reduces the
         * D₆ Hadamard metric (4·‖vesica‖² + DC²).  This makes error shaping
         * fully deterministic and thread-safe (no rand() inside omp parallel),
         * consistent with the Viterbi philosophy applied in Phase 3.
         *
         * The metric measures both:
         *  - Vesica Piscis term: correlated error between weights i and i+QK_K/2
         *    (targets the first non-DC harmonic — halfwave symmetry)
         *  - DC term: total signed error across the 256-weight superblock
         *    (captured and propagated to the next block by Phase 3.9)
         */
        uint8_t L[QK_K];
        {
            float q_cont_all[QK_K];
            int   q_base_all[QK_K];
            int   q_shaped_all[QK_K];

            for (int i = 0; i < QK_K; i++) {
                int   jj  = i >> 4;
                float d_s = dm * (float)(output[blk].scales[jj] & 0xF);
                float m_s = mm * (float)(output[blk].scales[jj] >> 4);
                if (d_s < 1e-15f) {
                    q_cont_all[i] = 0.0f;
                    q_base_all[i] = 0;
                } else {
                    /* Quantize the DC-adjusted target */
                    float qc = (adj_block_x[i] + m_s) / d_s;
                    q_cont_all[i] = qc;
                    int qr = gguf_nearest_int(qc);
                    if (qr < 0) qr = 0; if (qr > 3) qr = 3;
                    q_base_all[i] = qr;
                }
            }
            memcpy(q_shaped_all, q_base_all, QK_K * sizeof(int));

            float e_live[QK_K];
            for (int i = 0; i < QK_K; i++) {
                int   jj  = i >> 4;
                float d_s = dm * (float)(output[blk].scales[jj] & 0xF);
                float m_s = mm * (float)(output[blk].scales[jj] >> 4);
                /* Decoder semantics: deq = d_s·q − m_s, which is −m_s when
                 * d_s == 0 (NOT 0 — the −dmin·lm term always applies). */
                float deq = d_s * (float)q_shaped_all[i] - m_s;
                /* Residual against the adjusted target (DC-corrected view) */
                e_live[i] = adj_block_x[i] - deq;
            }

            float v_live[QK_K / 2];
            float vesica_cur = 0.0f, dc_cur = 0.0f;
            for (int i = 0; i < QK_K / 2; i++) {
                v_live[i] = e_live[i] + e_live[i + QK_K / 2];
                vesica_cur += v_live[i] * v_live[i];
            }
            for (int i = 0; i < QK_K; i++) dc_cur += e_live[i];
            float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur;

            /* Deterministic greedy descent: accept only strict improvements */
            for (int pass = 0; pass < QK_K; pass++) {
                int   best_k     = -1;
                int   best_q_alt = 0;
                float best_delta = 0.0f;   /* strictly positive threshold */

                for (int k = 0; k < QK_K; k++) {
                    int jj  = k >> 4;
                    float d_s = dm * (float)(output[blk].scales[jj] & 0xF);
                    if (d_s < 1e-15f) continue;

                    int q_cur = q_shaped_all[k];
                    int q_try = (q_cont_all[k] - (float)q_cur >= 0.0f)
                                ? q_cur + 1 : q_cur - 1;
                    if (q_try < 0 || q_try > 3) continue;

                    float m_s   = mm * (float)(output[blk].scales[jj] >> 4);
                    float e_new = adj_block_x[k] - (d_s * (float)q_try - m_s);
                    float de    = e_new - e_live[k];

                    int   pi    = (k < QK_K / 2) ? k : k - QK_K / 2;
                    float v_new = v_live[pi] + de;

                    float vesica_alt = vesica_cur - v_live[pi]*v_live[pi] + v_new*v_new;
                    float dc_alt     = dc_cur + de;
                    float delta      = metric_cur - (4.0f * vesica_alt + dc_alt * dc_alt);

                    if (delta > best_delta) {
                        best_delta = delta;
                        best_k     = k;
                        best_q_alt = q_try;
                    }
                }

                if (best_k < 0) break;   /* converged — no further improvement */

                q_shaped_all[best_k] = best_q_alt;
                {
                    int   jj_c  = best_k >> 4;
                    float d_c   = dm * (float)(output[blk].scales[jj_c] & 0xF);
                    float m_c   = mm * (float)(output[blk].scales[jj_c] >> 4);
                    float e_new_c = adj_block_x[best_k] - (d_c * (float)best_q_alt - m_c);
                    float de_c    = e_new_c - e_live[best_k];
                    int   pi_c    = (best_k < QK_K / 2) ? best_k : best_k - QK_K / 2;
                    float v_new_c = v_live[pi_c] + de_c;
                    vesica_cur   += v_new_c * v_new_c - v_live[pi_c] * v_live[pi_c];
                    dc_cur       += de_c;
                    metric_cur    = 4.0f * vesica_cur + dc_cur * dc_cur;
                    v_live[pi_c]  = v_new_c;
                    e_live[best_k]= e_new_c;
                }
            }

            /* Choose base vs shaped on the EXTENDED objective vs originals */
            float err_base = 0.0f, err_shaped = 0.0f;
            float e_qb[QK_K], e_qs[QK_K];
            for (int i = 0; i < QK_K; i++) {
                int   jj  = i >> 4;
                float d_s = dm * (float)(output[blk].scales[jj] & 0xF);
                float m_s = mm * (float)(output[blk].scales[jj] >> 4);
                float w   = (imat_importance) ? imat_importance[blk * QK_K + i] : 1.0f;
                float deq_b = d_s * (float)q_base_all[i]   - m_s;  /* −m_s when d_s==0 */
                float deq_s = d_s * (float)q_shaped_all[i] - m_s;
                float xv    = block_x[i];   /* original weight for error report */
                e_qb[i] = xv - deq_b;
                e_qs[i] = xv - deq_s;
                err_base   += e_qb[i] * e_qb[i] * w;
                err_shaped += e_qs[i] * e_qs[i] * w;
            }
            err_base   += hex_spectral_penalty(e_qb, QK_K);
            err_shaped += hex_spectral_penalty(e_qs, QK_K);
            {
                int use_shaped = (err_shaped <= err_base);
                for (int i = 0; i < QK_K; i++)
                    L[i] = (uint8_t)(use_shaped ? q_shaped_all[i] : q_base_all[i]);
            }
        }

        /* ── Cross-weight error diffusion — intra-sub-block Floyd-Steinberg ──
         *
         * Implements cross-weight error diffusion within each 16-weight sub-block.
         * After the greedy descent has committed quantisation codes, the residual
         * of each weight is partially propagated forward to the next position in
         * the same sub-block (7/16 of the error), re-quantising if the diffused
         * target falls in a different bin.
         *
         * This is the "cross-weight" dimension of the error-diffusion request:
         * neighbouring weights share and partially absorb each other's rounding
         * error, shaping the within-block spectrum away from the DC component
         * that Phase 3.9 already propagates between blocks.
         *
         * Staying within sub-blocks avoids scale-mismatch artefacts that would
         * arise from diffusing across the dm * Ls[j] boundary between sub-blocks.
         *
         * The diffused codes are accepted only when they reduce the weighted MSE
         * against the ORIGINAL weight (not the adjusted target), so the diffusion
         * cannot increase the total reconstruction error.
         */
        {
            int fs_j, fs_k;
            for (fs_j = 0; fs_j < N_SUB; fs_j++) {
                int   base  = fs_j * 16;
                float d_s   = dm * (float)(output[blk].scales[fs_j] & 0xF);
                float m_s   = mm * (float)(output[blk].scales[fs_j] >> 4);
                if (d_s < 1e-15f) continue;

                float carry = 0.0f;   /* FS carry from position k-1 */

                for (fs_k = 0; fs_k < 16; fs_k++) {
                    int   idx    = base + fs_k;
                    float x_orig = block_x[idx];
                    float x_adj  = adj_block_x[idx] + carry;  /* adjusted + diffused */

                    /* Propose new code from diffused target */
                    int q_fs = gguf_nearest_int((x_adj + m_s) / d_s);
                    if (q_fs < 0) q_fs = 0; if (q_fs > 3) q_fs = 3;

                    if (q_fs != (int)L[idx]) {
                        /* Accept only when MSE against original weight improves */
                        float w_imp = (imat_importance)
                                      ? imat_importance[blk * QK_K + idx] : 1.0f;
                        float deq_old = d_s * (float)L[idx]  - m_s;
                        float deq_new = d_s * (float)q_fs     - m_s;
                        float e_old   = (x_orig - deq_old) * (x_orig - deq_old) * w_imp;
                        float e_new   = (x_orig - deq_new) * (x_orig - deq_new) * w_imp;
                        if (e_new < e_old)
                            L[idx] = (uint8_t)q_fs;
                    }

                    /* Propagate 7/16 of the residual (adj target vs committed code) */
                    {
                        float deq_final = d_s * (float)L[idx] - m_s;
                        float residual  = (adj_block_x[idx] - deq_final);
                        carry = (fs_k < 15) ? residual * (7.0f / 16.0f) : 0.0f;
                    }
                }
            }
        }

        /* ── Final closed-form (d, dmin) refit against the UNCLIPPED weights ──
         * (issues #2 / #5)
         *
         * Every earlier (d, dmin) solve fits the DC-adjusted, soft-clipped
         * target and runs BEFORE the greedy descent and Floyd-Steinberg passes
         * mutate the committed 2-bit codes. Once L[], and the 4-bit sub-block
         * scale codes (Ls = scales & 0xF, Lm = scales >> 4), are final, the two
         * fp16 scalars (d, dmin) that minimise the importance-weighted SSE
         * against the ORIGINAL weights have a closed form. Solve it and adopt it
         * only when it lowers the weighted block error — so it can never raise
         * RMSE, and because the integer codes are held fixed, the vesica/wave/DC
         * error shaping baked into them is preserved intact. */
        {
            double rSaa = 0, rSab = 0, rSbb = 0, rSxa = 0, rSxb = 0;
            double rA = 0, rB = 0, rS = 0;     /* DC rank-1 augmentation */
            for (int j = 0; j < N_SUB; j++) {
                float ls_f = (float)(output[blk].scales[j] & 0xF);
                float lm_f = (float)(output[blk].scales[j] >> 4);
                for (int k = 0; k < 16; k++) {
                    int   idx = 16 * j + k;
                    float x   = block_x[idx];                 /* unclipped original */
                    float w   = (imat_importance) ? imat_importance[blk * QK_K + idx] : 1.0f;
                    float a   = ls_f * (float)L[idx];
                    float b   = lm_f;
                    rSaa += (double)w * a * a;
                    rSab += (double)w * a * b;
                    rSbb += (double)w * b * b;
                    rSxa += (double)w * x * a;
                    rSxb += (double)w * x * b;
                    rA += a; rB += b; rS += x;
                }
            }
            /* DC term as one augmented observation (S ~ A·d − B·m), weight
             * λ_dc/n; vesica/wave handled by the extended-E acceptance. */
            {
                double rw = (double)HEX_DC_LAMBDA / (double)QK_K;
                rSaa += rw * rA * rA;  rSab += rw * rA * rB;
                rSbb += rw * rB * rB;  rSxa += rw * rS * rA;
                rSxb += rw * rS * rB;
            }
            double rdet = rSaa * rSbb - rSab * rSab;
            if (fabs(rdet) > 1e-30) {
                double d_ref = (rSbb * rSxa - rSab * rSxb) / rdet;
                double m_ref = (rSab * rSxa - rSaa * rSxb) / rdet;
                if (d_ref > 0.0) {
                    float dm_try = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)d_ref));
                    float mm_try = (m_ref > 0.0)
                                   ? gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)m_ref))
                                   : mm;
                    /* Extended-objective acceptance test vs original weights. */
                    float err_cur = 0.0f, err_try = 0.0f;
                    float e_rc[QK_K], e_rt[QK_K];
                    for (int j = 0; j < N_SUB; j++) {
                        float ls_f = (float)(output[blk].scales[j] & 0xF);
                        float lm_f = (float)(output[blk].scales[j] >> 4);
                        for (int k = 0; k < 16; k++) {
                            int   idx = 16 * j + k;
                            float x   = block_x[idx];
                            float w   = (imat_importance) ? imat_importance[blk * QK_K + idx] : 1.0f;
                            float qf  = (float)L[idx];
                            float dc  = dm     * ls_f * qf - mm     * lm_f;
                            float dt  = dm_try * ls_f * qf - mm_try * lm_f;
                            e_rc[idx] = x - dc;
                            e_rt[idx] = x - dt;
                            err_cur += e_rc[idx] * e_rc[idx] * w;
                            err_try += e_rt[idx] * e_rt[idx] * w;
                        }
                    }
                    err_cur += hex_spectral_penalty(e_rc, QK_K);
                    err_try += hex_spectral_penalty(e_rt, QK_K);
                    if (err_try < err_cur) { dm = dm_try; mm = mm_try; }
                }
            }
            output[blk].d    = gguf_fp32_to_fp16(dm);
            output[blk].dmin = gguf_fp32_to_fp16(mm);
        }

        /* ══ PHASE 4.6: MONOTONE COORDINATE-DESCENT POLISH (RMSE-guaranteed) ══
         *
         * Objective-function mismatch fix: the final passes that commit the
         * 2-bit codes — the 16×16 (ls, lm) sub-block search, the ±8 ULP
         * (d, dmin) neighborhood search, and the greedy-descent error shaping
         * — all minimise error against the DC-ADJUSTED target adj_block_x.
         * The reported RMSE, however, is measured against the ORIGINAL
         * weights. The codes are therefore stranded at the optimum of a
         * SHIFTED objective, while only the scalar (d, dmin) refit above
         * targets the true one (and it holds all codes frozen).
         *
         * This polish runs alternating coordinate descent on the TRUE
         * objective (importance-weighted SSE vs the original weights):
         *
         *   (1) For each 16-weight sub-block, an exact joint re-search of
         *       (ls, lm) over the full 16×16 grid with per-weight optimal
         *       q ∈ {0..3}, committed only on strict improvement of the
         *       extended objective E. With λ_dc = λ_vw = 0 sub-blocks are
         *       independent given (d, dmin); with spectral terms active the
         *       coupling (DC: all subs; fold: sub j ↔ sub j⊕8) is handled
         *       exactly via live residual bookkeeping.
         *   (2) Closed-form weighted LS refit of the two fp16 scalars
         *       (d, dmin) with all codes held fixed, committed only on
         *       strict improvement (same guard as the refit above).
         *
         * All moves are accept-only-if-better on E ⇒ the extended block
         * objective is monotonically non-increasing; at λ = 0 this reduces
         * to RMSE-monotone (final RMSE can only go DOWN relative to the
         * unpatched pipeline), at λ > 0 small RMSE giveback is permitted
         * exactly where it buys dot-product error cancellation. The state space is finite
         * (4-bit codes, fp16 scalars), so the loop terminates; in practice
         * it converges in 2–3 sweeps. The vesica/DC spectral shaping baked
         * into L survives wherever it is SSE-neutral, and is overridden
         * only where it was costing true reconstruction error.            */
        {
            uint8_t pl_Ls[16], pl_Lm[16];
            for (int j = 0; j < N_SUB; j++) {
                pl_Ls[j] = output[blk].scales[j] & 0xF;
                pl_Lm[j] = output[blk].scales[j] >> 4;
            }

            for (int pol_iter = 0; pol_iter < 6; pol_iter++) {
                int pol_improved = 0;

                /* ── (1) Exact per-sub-block (ls, lm, q) re-search on the
                 * EXTENDED objective. Under the spectral terms sub-blocks
                 * are no longer independent: every sub couples to all others
                 * through the DC term and to its fold partner (sub j ⊕ 8,
                 * i.e. weights i ↔ i+128) through vesica² − wave². The
                 * search therefore keeps live residuals pe[] and scores each
                 * candidate against the whole-block penalty with the partner
                 * residuals held fixed — exact coordinate descent on E.    */
                float pe[QK_K];
                float sub_sse[16], sub_dc[16], pair_cross[8];
                float dc_tot = 0.0f, cross_tot = 0.0f;
                for (int j = 0; j < N_SUB; j++) {
                    float d_sub = dm * (float)pl_Ls[j];
                    float m_sub = mm * (float)pl_Lm[j];
                    sub_sse[j] = 0.0f;
                    sub_dc[j]  = 0.0f;
                    for (int k = 0; k < 16; k++) {
                        int   idx = 16 * j + k;
                        float w = (imat_importance) ?
                                  imat_importance[blk * QK_K + idx] : 1.0f;
                        /* deq = d·ls·q − dmin·lm; equals −m_sub at ls==0 */
                        float e = block_x[idx] - (d_sub * (float)L[idx] - m_sub);
                        pe[idx]     = e;
                        sub_sse[j] += e * e * w;
                        sub_dc[j]  += e;
                    }
                    dc_tot += sub_dc[j];
                }
                for (int p = 0; p < 8; p++) {
                    pair_cross[p] = 0.0f;
                    for (int k = 0; k < 16; k++)
                        pair_cross[p] += pe[16*p + k] * pe[16*(p+8) + k];
                    cross_tot += pair_cross[p];
                }

                for (int j = 0; j < N_SUB; j++) {
                    const float *sx  = block_x + 16 * j;
                    int   pi         = j & 7;          /* fold-pair index    */
                    int   pj         = j ^ 8;          /* partner sub-block  */
                    const float *ppe = pe + 16 * pj;   /* partner residuals  */
                    float dc_rest    = dc_tot    - sub_dc[j];
                    float cross_rest = cross_tot - pair_cross[pi];

                    /* Extended score of the CURRENT committed state */
                    float best_sub = sub_sse[j]
                        + (HEX_DC_LAMBDA / (float)QK_K) * dc_tot * dc_tot
                        + (HEX_VW_LAMBDA / (float)QK_K) * 4.0f * cross_tot;
                    int     best_ls = -1, best_lm = 0;
                    uint8_t best_q[16];
                    float   best_e[16];
                    float   best_sse = 0.0f, best_dcc = 0.0f, best_cxc = 0.0f;

                    for (int try_ls = 0; try_ls <= 15; try_ls++) {
                        float d_sub = dm * (float)try_ls;
                        for (int try_lm = 0; try_lm <= 15; try_lm++) {
                            float m_sub = mm * (float)try_lm;
                            float sub_err = 0.0f, dcc = 0.0f, cxc = 0.0f;
                            uint8_t q_loc[16];
                            float   e_loc[16];
                            int     aborted = 0;
                            for (int k = 0; k < 16; k++) {
                                float x = sx[k];
                                float w = (imat_importance) ?
                                          imat_importance[blk * QK_K + 16*j + k] : 1.0f;
                                int q = 0;
                                if (d_sub >= 1e-15f) {
                                    q = gguf_nearest_int((x + m_sub) / d_sub);
                                    if (q < 0) q = 0; if (q > 3) q = 3;
                                }
                                q_loc[k] = (uint8_t)q;
                                /* deq = d·ls·q − dmin·lm; −m_sub at ls==0 */
                                float e = x - (d_sub * (float)q - m_sub);
                                e_loc[k] = e;
                                sub_err += e * e * w;
                                dcc     += e;
                                cxc     += e * ppe[k];
                                /* SSE-partial prune is a valid lower bound
                                 * only while the spectral terms are ≥ 0,
                                 * i.e. when the (signable) vw credit is off */
                                if (HEX_VW_LAMBDA == 0.0f &&
                                    sub_err >= best_sub) { aborted = 1; break; }
                            }
                            if (aborted) continue;
                            float score = sub_err
                                + (HEX_DC_LAMBDA / (float)QK_K)
                                  * (dc_rest + dcc) * (dc_rest + dcc)
                                + (HEX_VW_LAMBDA / (float)QK_K) * 4.0f
                                  * (cross_rest + cxc);
                            if (score < best_sub) {
                                best_sub = score;
                                best_ls  = try_ls;
                                best_lm  = try_lm;
                                memcpy(best_q, q_loc, 16);
                                memcpy(best_e, e_loc, sizeof(e_loc));
                                best_sse = sub_err;
                                best_dcc = dcc;
                                best_cxc = cxc;
                            }
                        }
                    }

                    if (best_ls >= 0) {   /* strict improvement in E found */
                        pl_Ls[j] = (uint8_t)best_ls;
                        pl_Lm[j] = (uint8_t)best_lm;
                        memcpy(L  + 16 * j, best_q, 16);
                        memcpy(pe + 16 * j, best_e, sizeof(best_e));
                        sub_sse[j]     = best_sse;
                        sub_dc[j]      = best_dcc;
                        pair_cross[pi] = best_cxc;
                        dc_tot         = dc_rest    + best_dcc;
                        cross_tot      = cross_rest + best_cxc;
                        pol_improved = 1;
                    }
                }

                /* ── (2) Closed-form (d, dmin) refit vs ORIGINAL, codes fixed ── */
                {
                    double pSaa = 0, pSab = 0, pSbb = 0, pSxa = 0, pSxb = 0;
                    double pA = 0, pB = 0, pS = 0;  /* DC rank-1 augmentation */
                    for (int j = 0; j < N_SUB; j++) {
                        float ls_f = (float)pl_Ls[j];
                        float lm_f = (float)pl_Lm[j];
                        for (int k = 0; k < 16; k++) {
                            int   idx = 16 * j + k;
                            float x   = block_x[idx];
                            float w   = (imat_importance) ?
                                        imat_importance[blk * QK_K + idx] : 1.0f;
                            float a   = ls_f * (float)L[idx];
                            float b   = lm_f;
                            pSaa += (double)w * a * a;
                            pSab += (double)w * a * b;
                            pSbb += (double)w * b * b;
                            pSxa += (double)w * x * a;
                            pSxb += (double)w * x * b;
                            pA += a; pB += b; pS += x;
                        }
                    }
                    {
                        double pw = (double)HEX_DC_LAMBDA / (double)QK_K;
                        pSaa += pw * pA * pA;  pSab += pw * pA * pB;
                        pSbb += pw * pB * pB;  pSxa += pw * pS * pA;
                        pSxb += pw * pS * pB;
                    }
                    double pdet = pSaa * pSbb - pSab * pSab;
                    if (fabs(pdet) > 1e-30) {
                        double d_ref = (pSbb * pSxa - pSab * pSxb) / pdet;
                        double m_ref = (pSab * pSxa - pSaa * pSxb) / pdet;
                        if (d_ref > 0.0) {
                            float dm_try = gguf_fp16_to_fp32(
                                               gguf_fp32_to_fp16((float)d_ref));
                            float mm_try = (m_ref > 0.0)
                                           ? gguf_fp16_to_fp32(
                                                 gguf_fp32_to_fp16((float)m_ref))
                                           : mm;
                            float err_cur = 0.0f, err_try = 0.0f;
                            float e_pc[QK_K], e_pt[QK_K];
                            for (int j = 0; j < N_SUB; j++) {
                                float ls_f = (float)pl_Ls[j];
                                float lm_f = (float)pl_Lm[j];
                                for (int k = 0; k < 16; k++) {
                                    int   idx = 16 * j + k;
                                    float x   = block_x[idx];
                                    float w   = (imat_importance) ?
                                                imat_importance[blk * QK_K + idx] : 1.0f;
                                    float qf  = (float)L[idx];
                                    float dc  = dm     * ls_f * qf - mm     * lm_f;
                                    float dt  = dm_try * ls_f * qf - mm_try * lm_f;
                                    e_pc[idx] = x - dc;
                                    e_pt[idx] = x - dt;
                                    err_cur += e_pc[idx] * e_pc[idx] * w;
                                    err_try += e_pt[idx] * e_pt[idx] * w;
                                }
                            }
                            err_cur += hex_spectral_penalty(e_pc, QK_K);
                            err_try += hex_spectral_penalty(e_pt, QK_K);
                            if (err_try < err_cur) {
                                dm = dm_try;
                                mm = mm_try;
                                pol_improved = 1;
                            }
                        }
                    }
                }

                if (!pol_improved) {
                    /* ── (3) ±2 ULP joint (d, dmin) micro-search vs ORIGINAL ──
                     * The closed-form refit rounds its real-valued optimum to
                     * fp16, which can land 1–2 ULP away from the best
                     * representable pair (and the earlier ±8 ULP search ran
                     * against the DC-shifted objective). With codes fixed,
                     * scan the (2·HEX_POLISH_ULP+1)² fp16 neighborhood on the
                     * true objective;
                     * accept only strict improvement, then loop once more so
                     * move (1) can re-optimise codes for the new scalars.
                     * Monotone ⇒ final RMSE can only decrease. */
                    uint16_t base_d16 = gguf_fp32_to_fp16(dm);
                    uint16_t base_m16 = gguf_fp32_to_fp16(mm);

                    float cur_err = 0.0f;
                    float e_u[QK_K];
                    for (int j = 0; j < N_SUB; j++) {
                        float d_sub = dm * (float)pl_Ls[j];
                        float m_sub = mm * (float)pl_Lm[j];
                        for (int k = 0; k < 16; k++) {
                            int   idx = 16 * j + k;
                            float w   = (imat_importance) ?
                                        imat_importance[blk * QK_K + idx] : 1.0f;
                            e_u[idx] = block_x[idx] -
                                       (d_sub * (float)L[idx] - m_sub);
                            cur_err += e_u[idx] * e_u[idx] * w;
                        }
                    }
                    cur_err += hex_spectral_penalty(e_u, QK_K);

                    float    best_err = cur_err;
                    uint16_t best_d16 = base_d16, best_m16 = base_m16;
                    for (int dd = -HEX_POLISH_ULP; dd <= HEX_POLISH_ULP; dd++) {
                        int cd16 = (int)base_d16 + dd;
                        if (cd16 < 0 || cd16 > 0x7BFF) continue;
                        float t_dm = gguf_fp16_to_fp32((uint16_t)cd16);
                        for (int dmm = -HEX_POLISH_ULP; dmm <= HEX_POLISH_ULP; dmm++) {
                            if (dd == 0 && dmm == 0) continue;
                            int cm16 = (int)base_m16 + dmm;
                            if (cm16 < 0 || cm16 > 0x7BFF) continue;
                            float t_mm = gguf_fp16_to_fp32((uint16_t)cm16);

                            float err = 0.0f;
                            /* SSE-partial prune valid only without the
                             * signable vesica/wave credit */
                            for (int j = 0;
                                 j < N_SUB && (HEX_VW_LAMBDA != 0.0f ||
                                               err < best_err); j++) {
                                float d_sub = t_dm * (float)pl_Ls[j];
                                float m_sub = t_mm * (float)pl_Lm[j];
                                for (int k = 0; k < 16; k++) {
                                    int   idx = 16 * j + k;
                                    float w   = (imat_importance) ?
                                                imat_importance[blk * QK_K + idx] : 1.0f;
                                    e_u[idx] = block_x[idx] -
                                               (d_sub * (float)L[idx] - m_sub);
                                    err += e_u[idx] * e_u[idx] * w;
                                }
                            }
                            if (HEX_DC_LAMBDA != 0.0f || HEX_VW_LAMBDA != 0.0f)
                                err = (err < best_err || HEX_VW_LAMBDA != 0.0f)
                                      ? err + hex_spectral_penalty(e_u, QK_K)
                                      : err;
                            if (err < best_err) {
                                best_err = err;
                                best_d16 = (uint16_t)cd16;
                                best_m16 = (uint16_t)cm16;
                            }
                        }
                    }
                    if (best_d16 != base_d16 || best_m16 != base_m16) {
                        dm = gguf_fp16_to_fp32(best_d16);
                        mm = gguf_fp16_to_fp32(best_m16);
                        pol_improved = 1;
                    }
                }

                if (!pol_improved) break;   /* converged on true objective */
            }

            /* Write back polished codes and scalars */
            for (int j = 0; j < N_SUB; j++)
                output[blk].scales[j] = pl_Ls[j] | (pl_Lm[j] << 4);
            output[blk].d    = gguf_fp32_to_fp16(dm);
            output[blk].dmin = gguf_fp32_to_fp16(mm);
        }

        /* ══ PHASE 4.7: CANDIDATE FLOOR (worst-case bound) ══
         *
         * candidate_errors[blk][c] is the EXACT weighted SSE of a directly
         * encodable configuration (fp16 d/dmin + derived Ls/Lm + nearest
         * rounding vs the original weights). The multi-stage assembly
         * (DC-shifted WLS, shaping, diffusion, polish) usually improves on
         * its seed, but each stage optimises a slightly different objective
         * and coordinate descent can land in a worse basin. Compare the
         * finished block against the best raw candidate and fall back when
         * the pipeline ended up worse — guaranteeing
         *     final weighted SSE ≤ min_c candidate_errors[blk][c].          */
        {
            float fin_err = 0.0f;
            float e_f[QK_K];
            for (int j = 0; j < N_SUB; j++) {
                float d_sub = dm * (float)(output[blk].scales[j] & 0xF);
                float m_sub = mm * (float)(output[blk].scales[j] >> 4);
                for (int k = 0; k < 16; k++) {
                    int   idx = 16 * j + k;
                    float w   = (imat_importance) ?
                                imat_importance[blk * QK_K + idx] : 1.0f;
                    e_f[idx] = block_x[idx] -
                               (d_sub * (float)L[idx] - m_sub);
                    fin_err += e_f[idx] * e_f[idx] * w;
                }
            }
            fin_err += hex_spectral_penalty(e_f, QK_K);

            float g_best = candidate_errors[blk][0];
            int   g_cand = 0;
            for (int c = 1; c < TOTAL_SCALE_CANDIDATES; c++) {
                if (candidate_errors[blk][c] < g_best) {
                    g_best = candidate_errors[blk][c];
                    g_cand = c;
                }
            }

            if (g_best < fin_err) {
                /* Rebuild the block exactly as the candidate was scored */
                float c_dm = gguf_fp16_to_fp32(candidate_d   [blk][g_cand]);
                float c_mm = gguf_fp16_to_fp32(candidate_dmin[blk][g_cand]);
                uint8_t c_Ls[16], c_Lm[16];
                hex_derive_subscales(seeds[blk].scales, seeds[blk].mins,
                                     c_dm, c_mm, c_Ls, c_Lm);
                for (int j = 0; j < N_SUB; j++) {
                    float d_sub = c_dm * (float)c_Ls[j];
                    float m_sub = c_mm * (float)c_Lm[j];
                    for (int k = 0; k < 16; k++) {
                        int idx = 16 * j + k;
                        int q = 0;
                        if (d_sub >= 1e-15f) {
                            q = gguf_nearest_int((block_x[idx] + m_sub) / d_sub);
                            if (q < 0) q = 0; if (q > 3) q = 3;
                        }
                        L[idx] = (uint8_t)q;
                    }
                    output[blk].scales[j] = c_Ls[j] | (c_Lm[j] << 4);
                }
                dm = c_dm;  mm = c_mm;
                output[blk].d    = candidate_d   [blk][g_cand];
                output[blk].dmin = candidate_dmin[blk][g_cand];
            }
        }

        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; l++) {
                output[blk].qs[j / 4 + l] = L[j + l]
                                           | (L[j + l + 32] << 2)
                                           | (L[j + l + 64] << 4)
                                           | (L[j + l + 96] << 6);
            }
        }

        float berr = gguf_q2_k_block_error(block_x, &output[blk]);
        if (isnan(berr)) {
            printf("NaN block error at blk %ld! dm=%f mm=%f\n", (long)blk, dm, mm);
            for (int j=0; j<16; j++) printf("Ls[%d]=%d Lm[%d]=%d\n", j, Ls_blk[j], j, Lm_blk[j]);
            exit(1);
        }
        total_err += berr;
    }

    for (int _ti = 0; _ti < _n_omp_threads; _ti++)
        hpc_destroy(_tl_graphs[_ti]);
    free(_tl_graphs);

    free(block_dc_bias);
    free(seeds);
    free(candidate_errors);
    free(candidate_d);
    free(candidate_dmin);
    free(best_candidate);
    if (out_total_error) *out_total_error = total_err;
    
    if (verbose) {
        float rmse = sqrtf(total_err / (float)n_elements);

        double w_sum2 = 0.0;
        for (int64_t i = 0; i < n_elements; i++)
            w_sum2 += (double)weights[i] * (double)weights[i];
        w_sigma = (float)sqrt(w_sum2 / (double)n_elements);
        float rmse_over_sigma = (w_sigma > 1e-15f) ? rmse / w_sigma : 0.0f;

        const char *fidelity_class;
        const char *fidelity_icon;
        if (rmse <= 1.0e-04f) {
            fidelity_class = "ULTRA (≤1e-04)";
            fidelity_icon = "★★★★";
        } else if (rmse <= 3.0e-04f) {
            fidelity_class = "HIGH (≤3e-04)";
            fidelity_icon = "★★★☆";
        } else if (rmse <= 1.0e-03f) {
            fidelity_class = "GOOD (≤1e-03)";
            fidelity_icon = "★★☆☆";
        } else {
            fidelity_class = "STANDARD";
            fidelity_icon = "★☆☆☆";
        }

        printf("\n  ┌──── Shor Measurement Q2_K Report ────────────────────────────────┐\n");
        printf("  │  Elements:      %-12lld  Blocks:     %-12lld          │\n",
               (long long)n_elements, (long long)(n_elements / QK_K));
        printf("  │  Weight σ:      %-12.4e  Range: [%.4e, %.4e]   │\n",
               w_sigma, w_sigma * -4.0f, w_sigma * 4.0f);
        printf("  │  Total MSE:     %-12.6f                                    │\n", total_err);
        printf("  │  RMSE:          %-12.4e  RMSE/σ: %-8.4f                  │\n",
               rmse, rmse_over_sigma);
        printf("  │  Fidelity:      %s %-14s                        │\n",
               fidelity_icon, fidelity_class);
        printf("  │  Engine:        Shor Griffiths-Niu (IDFT6 + feed-forward)       │\n");
        printf("  └─────────────────────────────────────────────────────────────────┘\n");
    }
}


/* ═══════════════════════════════════════════════════════════════════════════
 * PROGRESS REPORTING
 * ═══════════════════════════════════════════════════════════════════════════ */

static void print_progress_bar(int current, int total, const char *label,
                                 time_t start_time)
{
    if (total <= 0) return;
    float pct = (float)current / (float)total;
    int bar_width = 40;
    int filled = (int)(pct * bar_width);

    /* Wall-clock elapsed: clock() sums CPU time over all OpenMP threads,
     * which inflated elapsed/ETA by ~the thread count on multicore. */
    double elapsed = difftime(time(NULL), start_time);
    double eta = (pct > 0.01f) ? elapsed / pct * (1.0 - pct) : 0.0;

    printf("\r  [");
    for (int i = 0; i < bar_width; i++) {
        if (i < filled) printf("█");
        else if (i == filled) printf("▓");
        else printf("░");
    }
    printf("] %3d%% (%d/%d) %.0fs ETA:%.0fs  %s",
           (int)(pct * 100), current, total, elapsed, eta, label);
    fflush(stdout);

    if (current == total) printf("\n");
}

/* ═══════════════════════════════════════════════════════════════════════════
 * GGUF FILE WRITER — Assembles the complete output file
 * ═══════════════════════════════════════════════════════════════════════════ */

static int write_gguf(const char *output_path, const STMultiFile *mf,
                        const ModelArchitecture *arch,
                        const TokenizerData *tokenizer,
                        OptimizerMode opt_mode,
                        const IMatrixData *imatrix,
                        int verbose)
{
    FILE *fp = fopen(output_path, "wb");
    if (!fp) {
        fprintf(stderr, "  ERROR: Cannot open '%s' for writing\n", output_path);
        return -1;
    }

    printf("\n  ╔════════════════════════════════════════════════════════════════╗\n");
    printf("  ║  WRITING GGUF FILE                                           ║\n");
    printf("  ╚════════════════════════════════════════════════════════════════╝\n\n");

    /* ── Determine which tensors to include ── */
    int *include_list = (int *)calloc(mf->n_tensors, sizeof(int));
    int n_include = 0;
    for (int i = 0; i < mf->n_tensors; i++) {
        if (!should_skip_tensor(mf->tensor_map[i].name)) {
            include_list[n_include++] = i;
        } else {
            if (verbose) printf("  SKIP: %s (not needed in GGUF)\n", mf->tensor_map[i].name);
        }
    }

    /* ── Count metadata KV pairs ── */
    int n_kv = 0;
    n_kv++;  /* general.architecture */
    n_kv++;  /* general.name */
    n_kv++;  /* general.quantization_version */
    n_kv++;  /* general.file_type */
    n_kv++;  /* {arch}.context_length */
    n_kv++;  /* {arch}.embedding_length */
    n_kv++;  /* {arch}.block_count */
    n_kv++;  /* {arch}.feed_forward_length */
    n_kv++;  /* {arch}.attention.head_count */
    n_kv++;  /* {arch}.attention.head_count_kv */
    n_kv++;  /* {arch}.attention.layer_norm_rms_epsilon */
    n_kv++;  /* {arch}.rope.freq_base */
    n_kv++;  /* {arch}.vocab_size */

    /* Tokenizer metadata KV count */
    int has_tokenizer = (tokenizer != NULL && tokenizer->vocab_size > 0);
    if (has_tokenizer) {
        n_kv++;  /* tokenizer.ggml.model */
        n_kv++;  /* tokenizer.ggml.tokens */
        n_kv++;  /* tokenizer.ggml.scores */
        n_kv++;  /* tokenizer.ggml.token_type */
        n_kv++;  /* tokenizer.ggml.bos_token_id */
        n_kv++;  /* tokenizer.ggml.eos_token_id */
        n_kv++;  /* tokenizer.ggml.unknown_token_id */
        if (tokenizer->n_merges > 0)
            n_kv++;  /* tokenizer.ggml.merges */
    }

    /* ── Check for weight tying ──
     * If tie_word_embeddings is set and there's no separate lm_head,
     * llama.cpp handles this internally — do NOT duplicate the tensor.
     * Only add output.weight if the model has a separate lm_head.weight. */
    int has_lm_head = (st_multi_find_tensor(mf, "lm_head.weight") >= 0);
    int total_tensors = n_include;

    if (arch->tie_word_embeddings && !has_lm_head) {
        printf("  Weight-tied embeddings detected — llama.cpp handles internally\n\n");
    }

    /* ── Prepare tensor info ── */
    char (*gguf_names)[ST_MAX_NAME_LEN] = calloc(total_tensors, ST_MAX_NAME_LEN);
    GGMLType *tensor_types = calloc(total_tensors, sizeof(GGMLType));
    int64_t *tensor_sizes = calloc(total_tensors, sizeof(int64_t));
    uint64_t data_offset = 0;
    uint64_t *tensor_offsets = calloc(total_tensors, sizeof(uint64_t));
    int *tensor_src_idx = calloc(total_tensors, sizeof(int)); /* map to unified ST index */
    char (*tensor_hf_names)[ST_MAX_NAME_LEN] = calloc(total_tensors, ST_MAX_NAME_LEN);

    GGMLType quant_type = GGML_TYPE_Q2_K;

    for (int i = 0; i < n_include; i++) {
        int src = include_list[i];
        const STTensorInfo *ti = st_multi_tensor_info(mf, src);
        map_tensor_name(mf->tensor_map[src].name, gguf_names[i], ST_MAX_NAME_LEN);
        strncpy(tensor_hf_names[i], mf->tensor_map[src].name, ST_MAX_NAME_LEN - 1);
        tensor_src_idx[i] = src;

        if (should_quantize(ti, gguf_names[i])) {
            if (is_attention_tensor(gguf_names[i])) {
                tensor_types[i] = GGML_TYPE_Q4_0;
                int64_t n_blocks_q4 = (ti->n_elements + QK4_0 - 1) / QK4_0;
                tensor_sizes[i] = n_blocks_q4 * sizeof(BlockQ4_0);
                if (verbose)
                    printf("  [ATTN→Q4_0] %s (%ld elements)\n",
                           gguf_names[i], (long)ti->n_elements);
            } else {
                tensor_types[i] = quant_type;
                tensor_sizes[i] = ggml_type_size(quant_type, ti->n_elements);
            }
        } else if (ti->n_dims >= 2) {
            tensor_types[i] = GGML_TYPE_F16;
            tensor_sizes[i] = ti->n_elements * sizeof(uint16_t);
        } else {
            tensor_types[i] = GGML_TYPE_F32;
            tensor_sizes[i] = ti->n_elements * sizeof(float);
        }

        tensor_offsets[i] = data_offset;

        data_offset += tensor_sizes[i];
        data_offset = (data_offset + GGUF_DEFAULT_ALIGNMENT - 1) &
                      ~(uint64_t)(GGUF_DEFAULT_ALIGNMENT - 1);
    }

    /* ── Write header ── */
    gguf_write_header(fp, total_tensors, n_kv);

    /* ── Write metadata KV pairs ── */
    gguf_write_kv_string(fp, "general.architecture", arch->architecture);
    gguf_write_kv_string(fp, "general.name", arch->name);
    gguf_write_kv_uint32(fp, "general.quantization_version", 2);
    gguf_write_kv_uint32(fp, "general.file_type", 10);  /* Q2_K = 10 */

    char kbuf[128];
    snprintf(kbuf, sizeof(kbuf), "%s.context_length", arch->architecture);
    gguf_write_kv_uint32(fp, kbuf, arch->context_length);

    snprintf(kbuf, sizeof(kbuf), "%s.embedding_length", arch->architecture);
    gguf_write_kv_uint32(fp, kbuf, arch->embedding_length);

    snprintf(kbuf, sizeof(kbuf), "%s.block_count", arch->architecture);
    gguf_write_kv_uint32(fp, kbuf, arch->block_count);

    snprintf(kbuf, sizeof(kbuf), "%s.feed_forward_length", arch->architecture);
    gguf_write_kv_uint32(fp, kbuf, arch->feed_forward_length);

    snprintf(kbuf, sizeof(kbuf), "%s.attention.head_count", arch->architecture);
    gguf_write_kv_uint32(fp, kbuf, arch->head_count);

    snprintf(kbuf, sizeof(kbuf), "%s.attention.head_count_kv", arch->architecture);
    gguf_write_kv_uint32(fp, kbuf, arch->head_count_kv);

    snprintf(kbuf, sizeof(kbuf), "%s.attention.layer_norm_rms_epsilon", arch->architecture);
    gguf_write_kv_float32(fp, kbuf, arch->rms_norm_eps);

    snprintf(kbuf, sizeof(kbuf), "%s.rope.freq_base", arch->architecture);
    gguf_write_kv_float32(fp, kbuf, arch->rope_freq_base);

    snprintf(kbuf, sizeof(kbuf), "%s.vocab_size", arch->architecture);
    gguf_write_kv_uint32(fp, kbuf, arch->vocab_size);

    /* ── Write tokenizer metadata ── */
    if (has_tokenizer) {
        gguf_write_kv_string(fp, "tokenizer.ggml.model", tokenizer->model_type);
        gguf_write_kv_string_array(fp, "tokenizer.ggml.tokens",
                                     (const char **)tokenizer->tokens,
                                     (uint64_t)tokenizer->vocab_size);
        gguf_write_kv_float32_array(fp, "tokenizer.ggml.scores",
                                      tokenizer->scores,
                                      (uint64_t)tokenizer->vocab_size);
        gguf_write_kv_int32_array(fp, "tokenizer.ggml.token_type",
                                    tokenizer->token_types,
                                    (uint64_t)tokenizer->vocab_size);
        gguf_write_kv_uint32(fp, "tokenizer.ggml.bos_token_id",
                               (uint32_t)tokenizer->bos_id);
        gguf_write_kv_uint32(fp, "tokenizer.ggml.eos_token_id",
                               (uint32_t)tokenizer->eos_id);
        gguf_write_kv_uint32(fp, "tokenizer.ggml.unknown_token_id",
                               (uint32_t)tokenizer->unk_id);
        if (tokenizer->n_merges > 0) {
            gguf_write_kv_string_array(fp, "tokenizer.ggml.merges",
                                         (const char **)tokenizer->merges,
                                         (uint64_t)tokenizer->n_merges);
        }
        printf("  Tokenizer metadata written (%d tokens, %d merges)\n\n",
               tokenizer->vocab_size, tokenizer->n_merges);
    }

    /* ── Write tensor info descriptors ── */
    for (int i = 0; i < total_tensors; i++) {
        int src = tensor_src_idx[i];
        const STTensorInfo *ti = st_multi_tensor_info(mf, src);
        uint64_t dims[ST_MAX_DIMS];
        int nd = ti->n_dims;
        for (int d = 0; d < nd; d++) {
            dims[d] = (uint64_t)ti->shape[nd - 1 - d];
        }
        gguf_write_tensor_info(fp, gguf_names[i],
                                ti->n_dims, dims,
                                tensor_types[i], tensor_offsets[i]);
    }

    /* ── Alignment padding before data section ── */
    gguf_write_padding(fp, GGUF_DEFAULT_ALIGNMENT);

    /* ── Write tensor data ── */
    printf("  Quantizing and writing %d tensors...\n\n", total_tensors);

    float total_error_sum = 0.0f;
    int quant_count = 0;
    int64_t total_elements_quantized = 0;
    int64_t total_bytes_quantized = 0;
    int64_t total_bytes_unquantized = 0;
    time_t quant_start = time(NULL);

    for (int i = 0; i < total_tensors; i++) {
        int src = tensor_src_idx[i];
        const STTensorInfo *ti = st_multi_tensor_info(mf, src);

        print_progress_bar(i, total_tensors, gguf_names[i], quant_start);

        if (tensor_types[i] == GGML_TYPE_Q2_K) {
            float *f32_data = st_multi_tensor_to_f32(mf, src);
            if (!f32_data) {
                fprintf(stderr, "\n  ERROR: Failed to convert tensor '%s' to F32\n",
                        ti->name);
                continue;
            }

            int64_t n_elements = ti->n_elements;
            float tensor_error = 0.0f;

            int64_t padded = (n_elements + QK_K - 1) / QK_K * QK_K;
            if (padded > n_elements) {
                float *grown = realloc(f32_data, padded * sizeof(float));
                if (!grown) {
                    fprintf(stderr, "\n  ERROR: Out of memory padding '%s'\n",
                            ti->name);
                    free(f32_data);
                    continue;
                }
                f32_data = grown;
                for (int64_t j = n_elements; j < padded; j++)
                    f32_data[j] = 0.0f;
                n_elements = padded;
            }

            int64_t n_blocks = n_elements / QK_K;
            BlockQ2K *quant_data = calloc(n_blocks, sizeof(BlockQ2K));

            const float *imp = NULL;
            if (imatrix) {
                const IMatrixEntry *ime = imatrix_find_any(imatrix,
                    gguf_names[i], tensor_hf_names[i]);
                if (ime && ime->n_values > 0) {
                    imp = ime->normalized;
                    if (verbose)
                        printf("\n    imatrix: using %d importance weights for %s\n",
                               ime->n_values, gguf_names[i]);
                }
            }

            quantize_tensor_q2k_hpc(f32_data, n_elements,
                                      quant_data, &tensor_error,
                                      opt_mode, imp, verbose);

            fwrite(quant_data, sizeof(BlockQ2K), n_blocks, fp);

            float rmse = sqrtf(tensor_error / (float)ti->n_elements);

            double wss = 0.0;
            for (int64_t j = 0; j < ti->n_elements; j++)
                wss += (double)f32_data[j] * (double)f32_data[j];
            float w_sig = (float)sqrt(wss / (double)ti->n_elements);

            const char *fid;
            if      (rmse <= 1.0e-04f) fid = "★★★★ ULTRA";
            else if (rmse <= 3.0e-04f) fid = "★★★☆ HIGH";
            else if (rmse <= 1.0e-03f) fid = "★★☆☆ GOOD";
            else                       fid = "★☆☆☆ STD";

            if (verbose) {
                printf("\n  [Q2_K·Shor] %-47s\n", gguf_names[i]);
                printf("         %10ld elements → %ld bytes  σ=%.2e  RMSE=%.4e  %s\n",
                       (long)ti->n_elements,
                       (long)(n_blocks * sizeof(BlockQ2K)),
                       w_sig, rmse, fid);
            }

            total_error_sum += tensor_error;
            total_elements_quantized += ti->n_elements;
            total_bytes_quantized += n_blocks * sizeof(BlockQ2K);
            quant_count++;

            free(quant_data);
            free(f32_data);
        } else if (tensor_types[i] == GGML_TYPE_Q4_0) {
            float *f32_data = st_multi_tensor_to_f32(mf, src);
            if (!f32_data) {
                fprintf(stderr, "\n  ERROR: Failed to convert tensor '%s' to F32\n",
                        ti->name);
                continue;
            }

            int64_t n_elements = ti->n_elements;

            int64_t padded = (n_elements + QK4_0 - 1) / QK4_0 * QK4_0;
            if (padded > n_elements) {
                float *grown = realloc(f32_data, padded * sizeof(float));
                if (!grown) {
                    fprintf(stderr, "\n  ERROR: Out of memory padding '%s'\n",
                            ti->name);
                    free(f32_data);
                    continue;
                }
                f32_data = grown;
                for (int64_t j = n_elements; j < padded; j++)
                    f32_data[j] = 0.0f;
                n_elements = padded;
            }

            int64_t n_blocks_q4 = n_elements / QK4_0;
            BlockQ4_0 *q4_data = calloc(n_blocks_q4, sizeof(BlockQ4_0));
            float tensor_error = 0.0f;

            const float *imp = NULL;
            if (imatrix) {
                const IMatrixEntry *ime = imatrix_find_any(imatrix,
                    gguf_names[i], tensor_hf_names[i]);
                if (ime && ime->n_values > 0) {
                    imp = ime->normalized;
                    if (verbose)
                        printf("\n    imatrix: using %d importance weights for %s\n",
                               ime->n_values, gguf_names[i]);
                }
            }

            quantize_tensor_q4_0_hpc(f32_data, n_elements,
                                       q4_data, &tensor_error,
                                       imp, verbose);

            fwrite(q4_data, sizeof(BlockQ4_0), n_blocks_q4, fp);

            float rmse = sqrtf(tensor_error / (float)ti->n_elements);

            double wss4 = 0.0;
            for (int64_t j = 0; j < ti->n_elements; j++)
                wss4 += (double)f32_data[j] * (double)f32_data[j];
            float w_sig4 = (float)sqrt(wss4 / (double)ti->n_elements);

            const char *fid4;
            if      (rmse <= 1.0e-04f) fid4 = "★★★★ ULTRA";
            else if (rmse <= 3.0e-04f) fid4 = "★★★☆ HIGH";
            else if (rmse <= 1.0e-03f) fid4 = "★★☆☆ GOOD";
            else                       fid4 = "★☆☆☆ STD";

            if (verbose) {
                printf("\n  [Q4_0·Shor] %-47s\n", gguf_names[i]);
                printf("         %10ld elements → %ld bytes  σ=%.2e  RMSE=%.4e  %s\n",
                       (long)ti->n_elements,
                       (long)(n_blocks_q4 * sizeof(BlockQ4_0)),
                       w_sig4, rmse, fid4);
            }

            total_error_sum += tensor_error;
            total_elements_quantized += ti->n_elements;
            total_bytes_quantized += n_blocks_q4 * sizeof(BlockQ4_0);
            quant_count++;

            free(q4_data);
            free(f32_data);
        } else if (tensor_types[i] == GGML_TYPE_F16) {
            float *f32_data = st_multi_tensor_to_f32(mf, src);
            if (!f32_data) {
                fprintf(stderr, "\n  ERROR: Failed to convert tensor '%s'\n",
                        ti->name);
                continue;
            }

            uint16_t *f16_data = (uint16_t *)malloc(ti->n_elements * sizeof(uint16_t));
            for (int64_t j = 0; j < ti->n_elements; j++)
                f16_data[j] = gguf_fp32_to_fp16(f32_data[j]);

            fwrite(f16_data, sizeof(uint16_t), ti->n_elements, fp);

            total_bytes_unquantized += ti->n_elements * sizeof(uint16_t);

            if (verbose) {
                printf("\n  [F16 ] %-50s  %10ld elements → %ld bytes\n",
                       gguf_names[i], (long)ti->n_elements,
                       (long)(ti->n_elements * sizeof(uint16_t)));
            }

            free(f16_data);
            free(f32_data);
        } else {
            float *f32_data = st_multi_tensor_to_f32(mf, src);
            if (!f32_data) {
                fprintf(stderr, "\n  ERROR: Failed to convert tensor '%s'\n",
                        ti->name);
                continue;
            }

            fwrite(f32_data, sizeof(float), ti->n_elements, fp);

            total_bytes_unquantized += ti->n_elements * sizeof(float);

            if (verbose) {
                printf("\n  [F32 ] %-50s  %10ld elements → %ld bytes\n",
                       gguf_names[i], (long)ti->n_elements,
                       (long)(ti->n_elements * sizeof(float)));
            }

            free(f32_data);
        }

        gguf_write_padding(fp, GGUF_DEFAULT_ALIGNMENT);
    }

    print_progress_bar(total_tensors, total_tensors, "done", quant_start);

    long final_size = ftell(fp);
    fclose(fp);

    int64_t original_f32_size = 0;
    for (int i = 0; i < total_tensors; i++) {
        const STTensorInfo *ti = st_multi_tensor_info(mf, tensor_src_idx[i]);
        original_f32_size += ti->n_elements * sizeof(float);
    }
    float compression_ratio = (original_f32_size > 0) ?
                               (float)original_f32_size / (float)final_size : 0.0f;
    float effective_bpw = (total_elements_quantized > 0) ?
                           8.0f * (float)total_bytes_quantized / (float)total_elements_quantized :
                           0.0f;
    float total_rmse = (total_elements_quantized > 0) ?
                        sqrtf(total_error_sum / (float)total_elements_quantized) : 0.0f;
    float mean_mse_per_tensor = (quant_count > 0) ?
                                 total_error_sum / (float)quant_count : 0.0f;

    const char *overall_fid, *overall_icon;
    if      (total_rmse <= 1.0e-04f) { overall_fid = "ULTRA (≤1e-04)";  overall_icon = "★★★★"; }
    else if (total_rmse <= 3.0e-04f) { overall_fid = "HIGH (≤3e-04)";   overall_icon = "★★★☆"; }
    else if (total_rmse <= 1.0e-03f) { overall_fid = "GOOD (≤1e-03)";   overall_icon = "★★☆☆"; }
    else                             { overall_fid = "STANDARD";        overall_icon = "★☆☆☆"; }

    printf("\n  ╔════════════════════════════════════════════════════════════════╗\n");
    printf("  ║  SHOR-OPTIMIZED QUANTIZATION SUMMARY                          ║\n");
    printf("  ╠════════════════════════════════════════════════════════════════╣\n");
    printf("  ║                                                              ║\n");
    printf("  ║  Engine:         Griffiths-Niu Sequential Measurement        ║\n");
    printf("  ║  Protocol:       IDFT6 → feed-forward → Born → collapse      ║\n");
    printf("  ║  Origin:         tesseract_factor.c (Shor's algorithm)       ║\n");
    printf("  ║                                                              ║\n");
    printf("  ╠════════════════════════════════════════════════════════════════╣\n");
    printf("  ║  Tensors quantized:      %-33d  ║\n", quant_count);
    printf("  ║  Elements quantized:     %15ld                   ║\n",
           (long)total_elements_quantized);
    printf("  ║  Quantized data:         %12ld bytes (%6.1f MB)    ║\n",
           (long)total_bytes_quantized,
           (double)total_bytes_quantized / (1024.0 * 1024.0));
    printf("  ║  Unquantized data:       %12ld bytes (%6.1f MB)    ║\n",
           (long)total_bytes_unquantized,
           (double)total_bytes_unquantized / (1024.0 * 1024.0));
    printf("  ║  Effective bits/weight:  %15.2f                       ║\n",
           effective_bpw);
    printf("  ║  Compression ratio:      %15.1fx                      ║\n",
           compression_ratio);
    printf("  ║                                                              ║\n");
    printf("  ╠════════════════════════════════════════════════════════════════╣\n");
    printf("  ║  FIDELITY METRICS (target: 1e-04)                            ║\n");
    printf("  ╠════════════════════════════════════════════════════════════════╣\n");
    printf("  ║                                                              ║\n");
    printf("  ║  Total MSE:              %15.6e                  ║\n",
           total_error_sum);
    printf("  ║  Per-element RMSE:       %15.4e                  ║\n",
           total_rmse);
    printf("  ║  Mean MSE/tensor:        %15.6e                  ║\n",
           mean_mse_per_tensor);
    printf("  ║                                                              ║\n");
    printf("  ║  Fidelity class:   %s %-14s                      ║\n",
           overall_icon, overall_fid);
    if (total_rmse <= 1.0e-04f)
        printf("  ║  ✓ RMSE ≤ 1e-04: TARGET MET — maximum fidelity achieved    ║\n");
    else if (total_rmse <= 3.0e-04f)
        printf("  ║  ◐ RMSE ≤ 3e-04: near target — high fidelity achieved      ║\n");
    else
        printf("  ║  ○ RMSE > 3e-04: below target — weight σ may be large      ║\n");
    printf("  ║                                                              ║\n");
    printf("  ╠════════════════════════════════════════════════════════════════╣\n");
    printf("  ║  Output file:      %ld bytes (%.1f MB)%*s║\n",
           final_size, (double)final_size / (1024.0 * 1024.0),
           (int)(27 - snprintf(NULL, 0, "%ld bytes (%.1f MB)",
                               final_size, (double)final_size / (1024.0 * 1024.0))), "");
    printf("  ╚════════════════════════════════════════════════════════════════╝\n\n");

    free(include_list);
    free(gguf_names);
    free(tensor_types);
    free(tensor_sizes);
    free(tensor_offsets);
    free(tensor_src_idx);
    free(tensor_hf_names);

    return 0;
}

/* ═══════════════════════════════════════════════════════════════════════════
 * LIBRARY API — Exported functions for Python ctypes integration
 *
 * When built with -DHEXSTATE_LIBRARY, these are the only public symbols.
 * The Python GGUF pipeline handles metadata/IO; C handles HPC quantization.
 * ═══════════════════════════════════════════════════════════════════════════ */

/* Initialize HExState subsystems (must be called once before quantization) */
void hexstate_init(void)
{
    static int initialized = 0;
    if (!initialized) {
        srand(42);  /* Deterministic for reproducibility */
        triality_exotic_init();
        s6_exotic_init();
        triality_stats_reset();
        initialized = 1;
    }
}

/* Quantize a single tensor's F32 data to Q2_K using HPC optimization.
 *
 * Parameters:
 * weights:     input F32 data (must be padded to multiple of 256)
 * n_elements:  number of elements (must be multiple of 256)
 * output:      output buffer (must be n_elements/256 * 84 bytes)
 * out_error:   pointer to receive total MSE (can be NULL)
 * opt_mode:    0=HPC, 1=MSE, 2=Hybrid (recommended)
 * verbose:     1 for per-block diagnostics
 */
void hexstate_quantize_tensor_q2k(const float *weights, int64_t n_elements,
                                    void *output, float *out_error,
                                    int opt_mode, int verbose)
{
    hexstate_init();
    quantize_tensor_q2k_hpc(weights, n_elements,
                              (BlockQ2K *)output, out_error,
                              (OptimizerMode)opt_mode, NULL, verbose);
}

/* Same as above but with importance matrix weights */
void hexstate_quantize_tensor_q2k_imat(const float *weights, int64_t n_elements,
                                         void *output, float *out_error,
                                         int opt_mode,
                                         const float *imat_importance,
                                         int verbose)
{
    hexstate_init();
    quantize_tensor_q2k_hpc(weights, n_elements,
                              (BlockQ2K *)output, out_error,
                              (OptimizerMode)opt_mode, imat_importance, verbose);
}

/* Get the block size for Q2_K (84 bytes per 256 elements) */
int hexstate_q2k_block_bytes(void) { return sizeof(BlockQ2K); }
int hexstate_q2k_block_elements(void) { return QK_K; }

/* HPC-optimized Q4_0 quantization for attention tensors.
 * Called from Python requantizer via ctypes.
 * weights:     input F32 weights
 * n_elements:  number of elements (must be multiple of 32)
 * output:      output buffer (must be n_elements/32 * 18 bytes)
 * out_error:   pointer to receive total MSE (can be NULL)
 * imat_importance: optional per-element importance weights
 * verbose:     1 for per-block diagnostics
 */
void hexstate_quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
                                         void *output, float *out_error,
                                         const float *imat_importance,
                                         int verbose)
{
    hexstate_init();
    float err = 0.0f;
    quantize_tensor_q4_0_hpc(weights, n_elements,
                               (BlockQ4_0 *)output, &err,
                               imat_importance, verbose);
    if (out_error) *out_error = err;
}

int hexstate_q8_0_block_bytes(void)    { return (int)sizeof(hex_block_q8_0); }
int hexstate_q8_0_block_elements(void) { return QK8_0; }

void hexstate_quantize_tensor_q8_0_hpc(const float *weights, int64_t n_elements,
                                       void *output, float *out_error,
                                       const float *imat_importance, int verbose)
{
    quantize_tensor_q8_0_hpc(weights, n_elements,
                             (hex_block_q8_0 *)output, out_error,
                             imat_importance, verbose);
}

#ifndef HEXSTATE_LIBRARY
/* ═══════════════════════════════════════════════════════════════════════════
 * MAIN
 * ═══════════════════════════════════════════════════════════════════════════ */

int main(int argc, char **argv)
{
    srand(time(NULL));

    /* Initialize HExState subsystems */
    triality_exotic_init();
    s6_exotic_init();
    triality_stats_reset();

    printf("\n");
    printf("  ╔════════════════════════════════════════════════════════════════╗\n");
    printf("  ║                                                              ║\n");
    printf("  ║   HExState GGUF QUANTIZER v3.0 — Shor-Optimized             ║\n");
    printf("  ║                                                              ║\n");
    printf("  ║   Architecture: HPCGraph Sensitivity Propagation             ║\n");
    printf("  ║   Optimization: Shor's Griffiths-Niu Measurement + iMatrix  ║\n");
    printf("  ║   Output: GGUF v3 (Q2_K, 2.625 bpw)                        ║\n");
    printf("  ║                                                              ║\n");
    printf("  ║   \"The weight and the quantized are opposite faces.\"         ║\n");
    printf("  ║                                                              ║\n");
    printf("  ╚════════════════════════════════════════════════════════════════╝\n\n");

    if (argc < 3) {
        printf("  Usage: %s <input> <output.gguf> [options]\n\n", argv[0]);
        printf("  Input:\n");
        printf("    Single .safetensors file, or\n");
        printf("    Model directory with sharded .safetensors files\n\n");
        printf("  Options:\n");
        printf("    --optimizer hpc|mse|hybrid   Scale optimization (default: hybrid)\n");
        printf("    --imatrix <file>             Importance matrix for Q2_K quality\n");
        printf("    --config <file>              Explicit config.json for arch detection\n");
        printf("    --qwen                       Force Qwen 3.5/3.6 architecture\n");
        printf("    --verbose                    Per-block diagnostics\n\n");
        return 1;
    }

    const char *input_path = argv[1];
    const char *output_path = argv[2];
    OptimizerMode opt_mode = OPT_HYBRID;
    const char *imatrix_path = NULL;
    const char *config_override = NULL;
    int verbose = 0;
    int force_qwen = 0;

    /* Parse options */
    for (int i = 3; i < argc; i++) {
        if (strcmp(argv[i], "--optimizer") == 0 && i + 1 < argc) {
            i++;
            if (strcmp(argv[i], "hpc") == 0) opt_mode = OPT_HPC;
            else if (strcmp(argv[i], "mse") == 0) opt_mode = OPT_MSE;
            else if (strcmp(argv[i], "hybrid") == 0) opt_mode = OPT_HYBRID;
            else {
                fprintf(stderr, "  ERROR: Unknown optimizer '%s'. Use hpc, mse, or hybrid.\n", argv[i]);
                return 1;
            }
        } else if (strcmp(argv[i], "--imatrix") == 0 && i + 1 < argc) {
            imatrix_path = argv[++i];
        } else if (strcmp(argv[i], "--config") == 0 && i + 1 < argc) {
            config_override = argv[++i];
        } else if (strcmp(argv[i], "--qwen") == 0) {
            force_qwen = 1;
        } else if (strcmp(argv[i], "--verbose") == 0) {
            verbose = 1;
        } else {
            fprintf(stderr, "  ERROR: Unknown option '%s'\n", argv[i]);
            return 1;
        }
    }

    const char *opt_names[] = {"HPC (BP only)", "MSE (grid search)", "Hybrid (HPC+MSE)"};
    printf("  Input:      %s\n", input_path);
    printf("  Output:     %s\n", output_path);
    printf("  Quant type: Q2_K (2.625 bpw)\n");
    printf("  Optimizer:  %s\n", opt_names[opt_mode]);
    if (imatrix_path) printf("  iMatrix:    %s\n", imatrix_path);
    if (config_override) printf("  Config:     %s\n", config_override);
    if (force_qwen) printf("  Model:      Qwen 3.5/3.6 (forced via --qwen)\n");
    printf("\n");

    /* ── Phase 1: Load model ── */
    printf("  Phase 1: Loading model...\n");
    time_t t_start = time(NULL);

    /* Determine if input is a file or directory */
    struct stat st;
    if (stat(input_path, &st) != 0) {
        fprintf(stderr, "  ERROR: Cannot access '%s'\n", input_path);
        return 1;
    }

    STMultiFile *mf = NULL;
    char input_dir[512] = "";

    if (S_ISDIR(st.st_mode)) {
        /* Input is a directory — open all shards */
        mf = st_open_dir(input_path);
        strncpy(input_dir, input_path, sizeof(input_dir) - 2);
        input_dir[sizeof(input_dir) - 2] = '\0';
        int dlen = strlen(input_dir);
        if (dlen > 0 && input_dir[dlen - 1] != '/') {
            input_dir[dlen] = '/';
            input_dir[dlen + 1] = '\0';
        }
    } else {
        /* Input is a single file — wrap in STMultiFile */
        STFile *sf = st_open(input_path);
        if (!sf) {
            fprintf(stderr, "  ERROR: Failed to open '%s'\n", input_path);
            return 1;
        }
        mf = (STMultiFile *)calloc(1, sizeof(STMultiFile));
        mf->shards[0] = sf;
        mf->n_shards = 1;
        for (int i = 0; i < sf->n_tensors && mf->n_tensors < ST_MAX_TENSORS; i++) {
            strncpy(mf->tensor_map[mf->n_tensors].name,
                    sf->tensors[i].name, ST_MAX_NAME_LEN - 1);
            mf->tensor_map[mf->n_tensors].shard_idx = 0;
            mf->tensor_map[mf->n_tensors].tensor_idx = i;
            mf->n_tensors++;
        }

        /* Extract directory from file path */
        strncpy(input_dir, input_path, sizeof(input_dir) - 1);
        input_dir[sizeof(input_dir) - 1] = '\0';
        char *last_slash = strrchr(input_dir, '/');
        if (last_slash) {
            *(last_slash + 1) = '\0';
        } else {
            strcpy(input_dir, "./");
        }
    }

    if (!mf) {
        fprintf(stderr, "  ERROR: Failed to load model from '%s'\n", input_path);
        return 1;
    }

    st_multi_print_summary(mf);

    time_t t_load = time(NULL);
    printf("  Loaded in %.0f seconds\n\n", difftime(t_load, t_start));

    /* ── Phase 2: Detect architecture ── */
    printf("  Phase 2: Detecting model architecture...\n");

    /* Try to read config.json from model directory */
    char config_path[1024];
    snprintf(config_path, sizeof(config_path), "%sconfig.json", input_dir);
    const char *config_ptr = NULL;
    {
        FILE *check = fopen(config_path, "rb");
        if (check) {
            fclose(check);
            config_ptr = config_path;
            printf("  Found config.json: %s\n", config_path);
        }
    }

    ModelArchitecture arch;
    detect_architecture(mf, &arch, config_ptr);

    /* --qwen override: force Qwen 3.5/3.6 architecture parameters */
    if (force_qwen) {
        strcpy(arch.architecture, "qwen2");
        strcpy(arch.name, "Qwen3.6-HExState-Q2K");
        printf("  [--qwen] Forcing qwen2-compatible architecture\n");
    }

    printf("  ╔═══════════════════════════════════════════════════════════════╗\n");
    printf("  ║  Model Architecture                                         ║\n");
    printf("  ╠═══════════════════════════════════════════════════════════════╣\n");
    printf("  ║  Architecture:     %-40s ║\n", arch.architecture);
    printf("  ║  Layers:           %-40u ║\n", arch.block_count);
    printf("  ║  Hidden size:      %-40u ║\n", arch.embedding_length);
    printf("  ║  Attention heads:  %-40u ║\n", arch.head_count);
    printf("  ║  KV heads:         %-40u ║\n", arch.head_count_kv);
    printf("  ║  Vocab size:       %-40u ║\n", arch.vocab_size);
    printf("  ║  FFN size:         %-40u ║\n", arch.feed_forward_length);
    printf("  ║  Context length:   %-40u ║\n", arch.context_length);
    printf("  ║  Has bias:         %-40s ║\n", arch.has_bias ? "yes" : "no");
    printf("  ║  Tied embeddings:  %-40s ║\n", arch.tie_word_embeddings ? "yes" : "no");
    printf("  ╚═══════════════════════════════════════════════════════════════╝\n\n");

    /* ── Phase 2b: Load tokenizer ── */
    printf("  Phase 2b: Loading tokenizer...\n");
    TokenizerData *tokenizer = NULL;
    {
        char tok_json[512], tok_config[512];
        snprintf(tok_json, sizeof(tok_json), "%stokenizer.json", input_dir);
        snprintf(tok_config, sizeof(tok_config), "%stokenizer_config.json", input_dir);

        tokenizer = tok_load(tok_json, tok_config);
        if (tokenizer) {
            tok_print_summary(tokenizer);
        } else {
            printf("  No tokenizer found in '%s'\n", input_dir);
            printf("  (Output GGUF will lack tokenizer data — not inference-ready)\n\n");
        }
    }

    /* ── Phase 2c: Load importance matrix (optional) ── */
    IMatrixData *imatrix = NULL;
    if (imatrix_path) {
        printf("  Phase 2c: Loading importance matrix...\n");
        imatrix = imatrix_load(imatrix_path);
        if (imatrix) {
            imatrix_print_summary(imatrix);
        } else {
            printf("  WARNING: Failed to load imatrix from '%s'\n", imatrix_path);
            printf("  Proceeding without importance weighting.\n\n");
        }
    }

    /* ── Phase 3-5: Quantize and write GGUF ── */
    printf("  Phase 3: HPC-Optimized Q2_K Quantization + GGUF Output...\n");
    int result = write_gguf(output_path, mf, &arch, tokenizer,
                              opt_mode, imatrix, verbose);

    /* Wall-clock total: clock() sums CPU time over all OpenMP threads */
    time_t t_end = time(NULL);
    printf("  Total time: %.0f seconds\n\n", difftime(t_end, t_start));

    if (imatrix) imatrix_free(imatrix);
    if (tokenizer) tok_free(tokenizer);
    st_multi_close(mf);
    return result;
}
#endif /* HEXSTATE_LIBRARY */