# -*- coding: utf-8 -*-
from __future__ import annotations

import math
import time
from functools import lru_cache
from pathlib import Path
from typing import Iterable, Optional, Tuple, List

import numpy as np
import pandas as pd

pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)

# ============================================================
# 0) GAME SETTINGS (EuroJackpot main field)
# ============================================================
N_MAIN = 50
N_PICK = 5

# Total combos: C(50,5)
TOTAL = math.comb(N_MAIN, N_PICK)

# Number of 5-number tickets that hit EXACTLY 4 numbers of a fixed draw:
# choose 4 of the 5 winning numbers (C(5,4) = 5) times 1 of the 45 non-winning numbers => 225
K4 = math.comb(N_PICK, N_PICK - 1) * (N_MAIN - N_PICK)
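
# Illustrative sanity check of the combinatorics above (safe to delete):
# C(50,5) = 2,118,760 total lines and 5 * 45 = 225 exactly-4-hit lines per fixed draw.
assert TOTAL == 2_118_760
assert K4 == 225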

# ============================================================
# 1) PATHS
# ============================================================
hist_path       = r"data\hist_df.csv"
cover_path      = r"data\covering_50_5_4_33572.csv"
tot_path        = r"data\tot_df_dynamic_basic.parquet"

out_path        = r"data\covering_random_mapping_100_sims.csv"
best_cover_out  = r"data\best_cover_by_5hit_lift.csv"
best_map_out    = r"data\best_number_mapping_by_5hit_lift.csv"
blog_md_out     = r"data\random_mapping_cover_blogpost.md"

ST = ["st1", "st2", "st3", "st4", "st5"]

N_SIMS = 1000
SEED = 123  # change if you want different random mappings

# ============================================================
# 2) LEX INDEX FUNCTIONS (adapted to 50 choose 5)
# ============================================================
@lru_cache(maxsize=None)
def A_optimized(n, r, k, l):
    # Helper for the lexicographic index: sum_{j=1..l} C(n - j, r - k).
    if l <= 0:
        return 0
    return sum(math.comb(n - (j + 1), r - k) for j in range(l))

def L_optimized(st):
    # 0-based lexicographic index of a sorted 5-combination of 1..N_MAIN.
    # Note: st is sorted in place, so pass a fresh list.
    st.sort()
    a = A_optimized(N_MAIN, N_PICK, 1, st[0] - 1)
    for k in range(1, len(st)):
        a += A_optimized(N_MAIN, N_PICK, k + 1, st[k] - 1) - A_optimized(N_MAIN, N_PICK, k + 1, st[k - 1])
    return a
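
# Quick endpoint check (illustrative, safe to delete). It relies on L_optimized returning
# a 0-based lexicographic index, which is what the rest of this script assumes: for the
# 5/50 game the first combination maps to 0 and the last one to TOTAL - 1.
assert L_optimized([1, 2, 3, 4, 5]) == 0
assert L_optimized([46, 47, 48, 49, 50]) == TOTAL - 1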

# ============================================================
# 3) HELPERS
# ============================================================
def pick_five_cols(df, preferred_cols=ST):
    # Return the five main-number columns as ints, preferring st1..st5 and
    # dropping any pandas "Unnamed" index columns left over from CSV exports.
    df = df.loc[:, ~df.columns.astype(str).str.contains("^Unnamed")]
    if all(c in df.columns for c in preferred_cols):
        return df[preferred_cols].astype(int)
    return df.iloc[:, :5].astype(int)

# lookup table for popcount on bytes 0..255
LUT = np.array([bin(i).count("1") for i in range(256)], dtype=np.uint8)

def popcount_uint64(arr_u64):
    b = arr_u64.view(np.uint8).reshape(-1, 8)
    return LUT[b].sum(axis=1).astype(np.uint8)
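
# Tiny self-test for the byte-wise popcount (illustrative, safe to delete):
# 0b10110 has exactly three set bits, regardless of byte order.
assert int(popcount_uint64(np.array([0b10110], dtype=np.uint64))[0]) == 3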

def masks_from_idx(idx_mat):
    # idx_mat: (rows,5) values in 0..(N_MAIN-1)
    bits = np.left_shift(np.uint64(1), idx_mat.astype(np.uint64))
    return np.bitwise_or.reduce(bits, axis=1)
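
# Illustration (safe to delete): the line {1,2,3,4,5} becomes 0-based indices {0,...,4},
# i.e. the bitmask 0b11111 = 31.
assert int(masks_from_idx(np.array([[0, 1, 2, 3, 4]]))[0]) == 0b11111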

def backtest_stats(cover_masks_u64, draw_masks_u64):
    # For each historical draw, count how many cover lines hit exactly 4 and exactly 5
    # of the drawn numbers, then aggregate per-draw frequencies, means and extremes.
    N = len(draw_masks_u64)
    total_hits4 = 0
    total_hits5 = 0
    draws_with_4 = 0
    draws_with_5 = 0
    min_hits4 = 10**9
    max_hits4 = -1

    for dm in draw_masks_u64:
        hits = popcount_uint64(cover_masks_u64 & dm)   # (M,) values 0..5
        dist = np.bincount(hits, minlength=6)[:6]

        h4 = int(dist[4])
        h5 = int(dist[5])

        total_hits4 += h4
        total_hits5 += h5
        draws_with_4 += int(h4 > 0)
        draws_with_5 += int(h5 > 0)

        if h4 < min_hits4: min_hits4 = h4
        if h4 > max_hits4: max_hits4 = h4

    return {
        "p4_emp": draws_with_4 / N,
        "p5_emp": draws_with_5 / N,
        "mean_hits4": total_hits4 / N,
        "mean_hits5": total_hits5 / N,
        "min_hits4": min_hits4,
        "max_hits4": max_hits4,
    }
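
# Worked mini-example for the hit bookkeeping (illustrative, safe to delete):
# a "cover" made of the draw itself plus one line sharing 4 numbers should yield
# exactly one 5-hit line and one 4-hit line for that draw.
_demo_cover = masks_from_idx(np.array([[0, 1, 2, 3, 4], [0, 1, 2, 3, 5]]))
_demo_draws = masks_from_idx(np.array([[0, 1, 2, 3, 4]]))
_demo_stats = backtest_stats(_demo_cover, _demo_draws)
assert _demo_stats["mean_hits5"] == 1.0 and _demo_stats["mean_hits4"] == 1.0
del _demo_cover, _demo_draws, _demo_stats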

def log_comb(n, k):
    # log of C(n, k) via lgamma, so huge binomials never overflow a float.
    return math.lgamma(n + 1) - math.lgamma(k + 1) - math.lgamma(n - k + 1)

def p_at_least_one_success(N, K, n):
    # Probability that n distinct random tickets include at least one of the K "success" lines:
    # exact form 1 - C(N-K, n)/C(N, n), evaluated in log-space.
    if n >= N:
        return 1.0
    if n > (N - K):
        return 1.0
    log_p_no = log_comb(N - K, n) - log_comb(N, n)
    return 1.0 - math.exp(log_p_no)
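
# Consistency check (illustrative, safe to delete): with a single ticket the
# "at least one exact-4 line" probability must reduce to K4 / TOTAL. lgamma
# round-off makes an exact comparison unrealistic, hence the loose tolerance.
assert abs(p_at_least_one_success(TOTAL, K4, 1) - K4 / TOTAL) < 1e-6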

def compute_stidx_for_cover(sorted_cover_1to50: np.ndarray) -> np.ndarray:
    # sorted_cover_1to50: (M,5) in 1..50 sorted rowwise
    out = np.empty(len(sorted_cover_1to50), dtype=np.int64)
    for i, row in enumerate(sorted_cover_1to50):
        out[i] = L_optimized(list(row))
    return out

def detect_stidx_col(cols) -> Optional[str]:
    cols = list(cols)
    candidates = [
        "stidx", "st_idx", "stindex", "lexi", "lexi_idx",
        "idx", "index",
        "__index_level_0__",
    ]
    for c in candidates:
        if c in cols:
            return c
    lower_map = {c.lower(): c for c in cols}
    for c in candidates:
        if c.lower() in lower_map:
            return lower_map[c.lower()]
    return None

def load_tot_rows_by_stidx_parquet(
    tot_parquet_path: Path | str,
    stidx_values: Iterable[int],
    columns: Optional[List[str]] = None,
    batch_size: int = 256_000,
    preserve_input_order: bool = True,
) -> Tuple[pd.DataFrame, str, bool]:
    """
    Loads ONLY rows of a parquet tot_df where stidx is in stidx_values.

    Returns: (tot_found_df, stidx_col_name, one_based_bool)

    - Detects stidx column name.
    - Infers 1-based vs 0-based by sampling the first batch.
    - Optionally reorders output to match the order of stidx_values.
    """
    try:
        import pyarrow.dataset as ds
    except Exception as e:
        print("[warn] pyarrow not available, skipping tot_df row fetch:", e)
        return pd.DataFrame(), "stidx", False

    tot_parquet_path = str(tot_parquet_path)
    stidx_list = [int(x) for x in stidx_values]
    stidx_set = set(stidx_list)

    dataset = ds.dataset(tot_parquet_path, format="parquet")
    stidx_col = detect_stidx_col(dataset.schema.names)
    if stidx_col is None:
        raise ValueError(
            f"Could not find stidx column in {tot_parquet_path}. "
            f"Columns start: {dataset.schema.names[:40]}"
        )

    if columns is not None:
        cols_to_read = list(dict.fromkeys(columns + [stidx_col]))
    else:
        cols_to_read = None

    # infer 1-based vs 0-based
    one_based = False
    probe = dataset.scanner(columns=[stidx_col], batch_size=min(batch_size, 100_000))
    probe_batches = probe.to_batches()
    first_batch = next(iter(probe_batches), None)
    if first_batch is not None:
        probe_df = first_batch.to_pandas()
        cmin = pd.to_numeric(probe_df[stidx_col], errors="coerce").min()
        if pd.notna(cmin) and int(cmin) == 1:
            one_based = True
            stidx_set = set(x + 1 for x in stidx_set)

    flt = ds.field(stidx_col).isin(list(stidx_set))
    scanner = dataset.scanner(columns=cols_to_read, filter=flt, batch_size=batch_size)

    parts = [batch.to_pandas() for batch in scanner.to_batches()]
    if not parts:
        return pd.DataFrame(), stidx_col, one_based

    out = pd.concat(parts, ignore_index=True)

    if preserve_input_order and not out.empty:
        order_list = [x + 1 for x in stidx_list] if one_based else stidx_list
        order_map = {v: i for i, v in enumerate(order_list)}
        out["_ord"] = out[stidx_col].map(order_map)
        out = out.sort_values("_ord", kind="stable").drop(columns=["_ord"]).reset_index(drop=True)

    return out, stidx_col, one_based

def write_blog_md(
    md_path: str,
    N_SIMS: int,
    SEED: int,
    cover_size: int,
    n_draws: int,
    p5_norm: float,
    p4_norm: float,
    orig_row: pd.Series,
    best_row: pd.Series,
    orig_percentile: float,
    lift5_q: dict,
):
    md = f"""# Random Number Mapping: a quick stress-test for “too-good” backtests

> **Responsible play note:** this is hobby stats and code. Lotteries are still luck-first.  
> If it stops being fun, it’s time to step away.

## What problem are we poking?

You’ve got a *covering set* (a fixed pack of lines) and you backtest it on the historical draws.

Sometimes the backtest looks… spicy. Like: “Wait, why did this set hit 5-of-5 on *that many* draws?”

Before we start daydreaming, we should ask a rude but healthy question:

**Is this backtest strong because the cover is genuinely good, or because it accidentally fits the quirks of the past?**

This post shows a simple stress-test: **randomly relabel the numbers** (many times) and see how the same cover behaves.

## The idea in one sentence

A permutation of 1..{N_MAIN} creates a “new” cover that is structurally the same (still a cover), just with the labels shuffled.

If we try a lot of these shuffles, we get a feel for:
- what “normal” looks like,
- how rare a given backtest score is,
- and how easy it is to cherry-pick a great-looking result when you run many trials.

## What the script does

Inputs:
- `hist_df.csv` with `st1..st5` (past draws)
- `covering_50_5_4_33572.csv` (your cover lines)
- optional: `tot_df_dynamic_basic.parquet` (to export the best found cover with extra features)

For each simulation:
1. Draw a random permutation `perm` of 0..49 (numbers 1..50).
2. Remap the cover lines through `perm` (see the sketch right after this list).
3. Compare the remapped cover against the *real* history and count:
   - did we get at least one 5-hit line in each draw?
   - did we get at least one 4-hit line in each draw?
   - how many 4-hit / 5-hit lines on average?
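
In code, steps 1 and 2 are one line of NumPy fancy indexing each (a sketch; `cover_idx` holds the cover as 0-based indices and `rng` is the seeded NumPy generator):

```python
perm = rng.permutation(50)    # old 0-based label -> new 0-based label
mapped_idx = perm[cover_idx]  # remap every cover line in one shot
```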

We also score the **original** cover first (sim_id = -1) as the reference point.

Baselines (random tickets):
- 5-hit chance per draw (with M lines): `p5_norm = M / C(50,5)`
- 4-hit chance per draw uses the exact hypergeometric form:
  `1 - C(N-K4, M)/C(N, M)` where `K4 = 225` for 5/50 (sketch below).
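
For the curious, that 4-hit baseline is evaluated with log-binomials so nothing overflows (a sketch of what the script actually does; `N`, `K4` and `M` as above):

```python
import math

def log_comb(n, k):
    return math.lgamma(n + 1) - math.lgamma(k + 1) - math.lgamma(n - k + 1)

p4_norm = 1.0 - math.exp(log_comb(N - K4, M) - log_comb(N, M))
```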

## Outputs you get

- `covering_random_mapping_100_sims.csv`  
  One row per sim with p(>=1 five-hit), p(>=1 four-hit), and lift values.

- `best_number_mapping_by_5hit_lift.csv`  
  The permutation that produced the best 5-hit lift in this run.

- `best_cover_by_5hit_lift.csv`  
  The remapped cover lines for that best permutation (plus tot_df features if found).

## Results (auto-filled by the script)

### Setup
- Sims: **{N_SIMS}** (seed={SEED})
- History draws: **{n_draws}**
- Cover size M: **{cover_size}**
- Total combinations C(50,5): **{TOTAL:,}**
- Random baseline p(>=1 five-hit): **{p5_norm*100:.6f}%**
- Random baseline p(>=1 four-hit): **{p4_norm*100:.6f}%**

### Original cover (real labels)
- p(>=1 five-hit): **{orig_row["p5_emp"]*100:.3f}%**  
  lift vs random: **{orig_row["lift5_prob"]:.3f}**
- p(>=1 four-hit): **{orig_row["p4_emp"]*100:.3f}%**  
  lift vs random: **{orig_row["lift4_prob"]:.3f}**
- mean #4-hit lines per draw: **{orig_row["mean_hits4_emp"]:.3f}**  
  lift vs random mean: **{orig_row["lift4_count"]:.3f}**

### Best remapped cover (best sim in this run)
- best sim_id: **{int(best_row["sim_id"])}**
- p(>=1 five-hit): **{best_row["p5_emp"]*100:.3f}%**  
  lift vs random: **{best_row["lift5_prob"]:.3f}**

### Where does the original sit among random remaps?
- The original's 5-hit lift sits at the **{orig_percentile:.1f}th percentile** of the {N_SIMS} random remaps.

(If that percentile is high, it means the original labels look unusually good compared to typical shuffles. If it’s mid-pack, the “magic” was probably just normal variance.)

### Lift distribution (remaps only)
- lift5 50% (median): **{lift5_q["p50"]:.3f}**
- lift5 90%: **{lift5_q["p90"]:.3f}**
- lift5 95%: **{lift5_q["p95"]:.3f}**
- lift5 99%: **{lift5_q["p99"]:.3f}**

## A reality check (the part nobody wants to read)

If you run **lots** of permutations and keep the best one, you’re doing a search.  
A search always finds something that looks “special”.

That’s not bad — it’s just how randomness behaves when you keep asking it questions.

So treat “best mapping found” as:
- a fun diagnostic,
- a warning about cherry-picking,
- and a reason to do honest backtests (time-splits, forward tests, fresh draws).

---

If you want, the natural next steps after you run this:
- we can add a time-split version (train on older draws, score on newer draws)
- and we can track whether the “best mapping” keeps its shine out-of-sample.
"""
    Path(md_path).parent.mkdir(parents=True, exist_ok=True)
    with open(md_path, "w", encoding="utf-8") as f:
        f.write(md)

# ============================================================
# 4) LOAD DATA
# ============================================================
hist = pick_five_cols(pd.read_csv(hist_path))
cover = pick_five_cols(pd.read_csv(cover_path))

cover = cover.drop_duplicates().reset_index(drop=True)

hist_mat  = hist.to_numpy(dtype=int)
cover_mat = cover.to_numpy(dtype=int)

# 0-based indices for masks
hist_idx  = hist_mat - 1
cover_idx = cover_mat - 1

# build draw masks once
draw_masks = masks_from_idx(hist_idx)

M = len(cover_idx)
N_DRAWS = len(hist_idx)

# ============================================================
# 5) NORMS / BASELINES
# ============================================================
# A 5-hit means a cover line equals the draw exactly; with M distinct lines these events
# are mutually exclusive, so both p(>=1 five-hit) and the expected number of 5-hit lines
# per draw are exactly M / TOTAL.
p5_norm = M / TOTAL
mean5_norm = p5_norm

# 4-hit baseline: exact "at least one" probability for M random distinct tickets,
# plus the expected number of exactly-4-hit lines (linearity of expectation).
p4_norm = p_at_least_one_success(TOTAL, K4, M)
mean4_norm = M * (K4 / TOTAL)

print("=== Norms (random tickets baseline) ===")
print(f"Cover size M: {M}")
print(f"Total combos: {TOTAL}")
print(f"K4 (4-hit states per draw): {K4}")
print(f"Norm p(>=1 five-hit): {p5_norm*100:.6f}%")
print(f"Norm p(>=1 four-hit): {p4_norm*100:.6f}%")
print(f"Norm mean #4-hit lines per draw: {mean4_norm:.6f}")
print("")

# ============================================================
# 6) RUN: ORIGINAL + RANDOM MAPPINGS
# ============================================================
rng = np.random.default_rng(SEED)
records = []
t0 = time.time()

best_lift5 = -1.0
best_perm = None
best_sim_id = None
best_stats = None
best_cover_mapped_idx = None

def add_record(sim_id, stats):
    records.append({
        "sim_id": sim_id,
        "n_draws": N_DRAWS,
        "cover_size": M,

        "p5_emp": stats["p5_emp"],
        "p5_norm": p5_norm,
        "lift5_prob": (stats["p5_emp"] / p5_norm) if p5_norm > 0 else np.nan,
        "mean_hits5_emp": stats["mean_hits5"],
        "mean_hits5_norm": mean5_norm,
        "lift5_count": (stats["mean_hits5"] / mean5_norm) if mean5_norm > 0 else np.nan,

        "p4_emp": stats["p4_emp"],
        "p4_norm": p4_norm,
        "lift4_prob": (stats["p4_emp"] / p4_norm) if p4_norm > 0 else np.nan,
        "mean_hits4_emp": stats["mean_hits4"],
        "mean_hits4_norm": mean4_norm,
        "lift4_count": (stats["mean_hits4"] / mean4_norm) if mean4_norm > 0 else np.nan,

        "min_hits4": stats["min_hits4"],
        "max_hits4": stats["max_hits4"],
    })

# ORIGINAL
cover_masks_original = masks_from_idx(cover_idx)
stats_original = backtest_stats(cover_masks_original, draw_masks)
add_record(sim_id=-1, stats=stats_original)

best_lift5 = stats_original["p5_emp"] / p5_norm
best_perm = np.arange(N_MAIN, dtype=int)
best_sim_id = -1
best_stats = stats_original
best_cover_mapped_idx = cover_idx.copy()

print("Original (real cover vs real history)")
print(f"  5-hit draws: {stats_original['p5_emp']*100:.3f}%   lift vs norm: {stats_original['p5_emp']/p5_norm:.3f}")
print(f"  4-hit draws: {stats_original['p4_emp']*100:.3f}%   lift vs norm: {stats_original['p4_emp']/p4_norm:.3f}")
print(f"  mean #4-hit lines per draw: {stats_original['mean_hits4']:.3f}   lift vs norm: {stats_original['mean_hits4']/mean4_norm:.3f}")
print("")

# RANDOM MAPPINGS
for s in range(N_SIMS):
    perm = rng.permutation(N_MAIN)     # maps old_idx -> new_idx
    mapped_idx = perm[cover_idx]       # mapped cover (0..49), row order unchanged

    cover_masks = masks_from_idx(mapped_idx)
    stats = backtest_stats(cover_masks, draw_masks)
    add_record(sim_id=s, stats=stats)

    lift5 = stats["p5_emp"] / p5_norm
    if lift5 > best_lift5:
        best_lift5 = lift5
        best_perm = perm.copy()
        best_sim_id = s
        best_stats = stats
        best_cover_mapped_idx = mapped_idx.copy()

    if (s + 1) % 50 == 0:
        print(f"Done {s+1}/{N_SIMS} sims...")

sim_df = pd.DataFrame(records)
sim_df.to_csv(out_path, index=False)
print(f"\nSaved simulations: {out_path}")

# ============================================================
# 7) SAVE BEST MAPPING + BEST COVER (sorted rowwise + stidx + tot_df features)
# ============================================================
best_map_df = pd.DataFrame({
    "old_number": np.arange(1, N_MAIN + 1, dtype=int),
    "new_number": (best_perm + 1).astype(int),
})
best_map_df.to_csv(best_map_out, index=False)

# Sort best cover rowwise (the lex index assumes each row is sorted ascending)
best_cover_sorted_1to50 = np.sort(best_cover_mapped_idx, axis=1) + 1  # (M,5) in 1..50

print("\nComputing stidx for best cover...")
best_stidx = compute_stidx_for_cover(best_cover_sorted_1to50)

best_cover_basic = pd.DataFrame(best_cover_sorted_1to50, columns=ST)
best_cover_basic["stidx"] = best_stidx

# Try to fetch extra columns from tot_df by stidx (optional enrichment)
cover_stidx = best_cover_basic["stidx"].tolist()
tot_rows, stidx_col, one_based = load_tot_rows_by_stidx_parquet(
    tot_path,
    cover_stidx,
    columns=None,
    preserve_input_order=True
)

if tot_rows.empty:
    print("[warn] tot_df fetch returned empty. Saving basic cover only (st1..st5 + stidx).")
    best_cover_basic.to_csv(best_cover_out, index=False)
else:
    tot_rows.to_csv(best_cover_out, index=False)

# ============================================================
# 8) BLOGPOST MARKDOWN (auto-filled)
# ============================================================
df_remaps = sim_df[sim_df["sim_id"] >= 0].copy()
orig_row = sim_df[sim_df["sim_id"] == -1].iloc[0]
# Note: the original cover (sim_id = -1) competes here too; if no remap beats it,
# the "best" row corresponds to the identity mapping.
best_row = sim_df.sort_values("lift5_prob", ascending=False).iloc[0]

orig_lift = float(orig_row["lift5_prob"])
orig_percentile = 100.0 * float((df_remaps["lift5_prob"] <= orig_lift).mean())

lift5_q = {
    "p50": float(df_remaps["lift5_prob"].quantile(0.50)),
    "p90": float(df_remaps["lift5_prob"].quantile(0.90)),
    "p95": float(df_remaps["lift5_prob"].quantile(0.95)),
    "p99": float(df_remaps["lift5_prob"].quantile(0.99)),
}

write_blog_md(
    md_path=blog_md_out,
    N_SIMS=N_SIMS,
    SEED=SEED,
    cover_size=M,
    n_draws=N_DRAWS,
    p5_norm=p5_norm,
    p4_norm=p4_norm,
    orig_row=orig_row,
    best_row=best_row,
    orig_percentile=orig_percentile,
    lift5_q=lift5_q,
)

print("\n=== Best mapping by 5-hit lift ===")
print("best_sim_id:", best_sim_id)
print(f"best 5-hit lift: {best_lift5:.3f}")
print(f"best p5_emp: {best_stats['p5_emp']*100:.3f}%  (norm {p5_norm*100:.6f}%)")
print("Saved best cover:", best_cover_out)
print("Saved mapping   :", best_map_out)
print("Saved blog md   :", blog_md_out)
print(f"\nTotal time: {time.time() - t0:.1f}s")
