diff --git a/examples/README.md b/examples/README.md index 8b91d5b..dbfa4b8 100644 --- a/examples/README.md +++ b/examples/README.md @@ -16,6 +16,7 @@ Explore runnable examples that show how to use Weco to optimize ML models, promp - [🧠 Prompt Engineering](#-prompt-engineering) - [📊 Extract Line Plot — Chart to CSV](#-extract-line-plot--chart-to-csv) - [🛰️ Model Development — Spaceship Titanic](#️-model-development--spaceship-titanic) + - [🕵️ Fraud Detection — IEEE-CIS](#️-fraud-detection--ieee-cis) ### Prerequisites @@ -35,6 +36,7 @@ pip install weco | 🧠 Prompt Engineering | Iteratively refine LLM prompts to improve accuracy | `openai`, `datasets`, OpenAI API key | [README](prompt/README.md) | | 📊 Agentic Scaffolding | Optimize agentic scaffolding for chart-to-CSV extraction | `openai`, `huggingface_hub`, `uv`, OpenAI API key | [README](extract-line-plot/README.md) | | 🛰️ Spaceship Titanic | Improve a Kaggle model training pipeline | `pandas`, `numpy`, `scikit-learn`, `torch`, `xgboost`, `lightgbm`, `catboost` | [README](spaceship-titanic/README.md) | +| 🕵️ Fraud Detection | Optimize a fraud pipeline on IEEE-CIS (real Vesta transactions) | `pandas`, `numpy`, `scikit-learn`, `lightgbm`, `pyarrow`, `kaggle` | [README](fraud-detection/README.md) | --- @@ -162,8 +164,33 @@ weco run --source train.py \ --log-dir .runs/spaceship-titanic ``` +### 🕵️ Fraud Detection — IEEE-CIS + +Optimize a tabular fraud-detection pipeline on real Vesta payment data. +Reproduces Weco's +[fraud-detection case study](https://weco.ai/blog/framing-the-problem) +(baseline AUC 0.914 → pooled 6-seed mean 0.9305 ± 0.0035 with full +instructions at 200 steps). 
+ +- **Prereqs**: Kaggle API token + [join the competition](https://www.kaggle.com/c/ieee-fraud-detection) +- **Install Dependencies**: `pip install -r requirements.txt` +- **Prepare data** (once, ~2-3 min): `python prepare_data.py` +- **Run**: +```bash +cd examples/fraud-detection +weco run --source train.py \ + --eval-command "python evaluate.py" \ + --metric auc_roc \ + --goal maximize \ + --steps 50 \ + --model gemini-3.1-pro-preview \ + --additional-instructions instructions.md \ + --eval-timeout 300 \ + --log-dir .runs/fraud-detection +``` + --- -If you're new to Weco, start with **Hello World**, then try **LangSmith ZephHR QA** for a realistic LangSmith optimization workflow, explore **Triton** and **CUDA** for kernel engineering, **Prompt Engineering** for optimzing an LLM's prompt, **Extract Line Plot** for optimzing agentic scaffolds, or **Spaceship Titanic** for model development. +If you're new to Weco, start with **Hello World**, then try **LangSmith ZephHR QA** for a realistic LangSmith optimization workflow, explore **Triton** and **CUDA** for kernel engineering, **Prompt Engineering** for optimizing an LLM's prompt, **Extract Line Plot** for optimizing agentic scaffolds, **Spaceship Titanic** for model development, or **Fraud Detection** for a production-scale tabular ML case study. diff --git a/examples/fraud-detection/.gitignore b/examples/fraud-detection/.gitignore new file mode 100644 index 0000000..60f2536 --- /dev/null +++ b/examples/fraud-detection/.gitignore @@ -0,0 +1,4 @@ +data/ +.runs/ +__pycache__/ +*.pyc diff --git a/examples/fraud-detection/README.md b/examples/fraud-detection/README.md new file mode 100644 index 0000000..e85cb27 --- /dev/null +++ b/examples/fraud-detection/README.md @@ -0,0 +1,170 @@ +# Fraud Detection (IEEE-CIS) + +Optimize a tabular fraud-detection pipeline on the +[IEEE-CIS Fraud Detection](https://www.kaggle.com/c/ieee-fraud-detection) Kaggle +dataset (real Vesta payment transactions). 
Weco rewrites `train.py` — both +feature engineering and the LightGBM configuration — to maximize AUC-ROC on a +held-out, time-based validation split. + +This example reproduces the setup from Weco's fraud-detection case study +([blog post](https://weco.ai/blog/framing-the-problem), +[code](https://github.com/WecoAI/fraud-detection-case-study)). The example's +baseline is **AUC ≈ 0.910** — slightly below the 0.914 reported in the +case study because this example fits all encoders on `train_df` only +(no time-leakage into val features). With the bundled `instructions.md` +and 200 steps of `gemini-3.1-pro-preview`, expect AUC in the **0.928–0.933** +range, consistent with the case study trajectory on a clean baseline. + +## Prerequisites + +1. **Kaggle API token**. Put a valid `kaggle.json` at `~/.kaggle/kaggle.json` + (see [Kaggle API credentials](https://github.com/Kaggle/kaggle-api#api-credentials)), + then `chmod 600 ~/.kaggle/kaggle.json` to silence the permissions warning. +2. **You must join the competition.** Visit + [the competition page](https://www.kaggle.com/c/ieee-fraud-detection) and click "Late Submission" / + "Join Competition" to accept the rules. Without this, + `prepare_data.py` will fail with `403 Forbidden` from the Kaggle API — + this is the single most common first-time friction. +3. **Weco API key** (free tier is fine). See the + [Weco docs](https://docs.weco.ai). + +## Setup + +```bash +cd examples/fraud-detection + +# Virtualenv is strongly recommended — modern Python installs (Debian/Ubuntu, +# recent Homebrew) refuse `pip install` to the system site-packages under +# PEP 668. If you skip this step you'll hit +# `error: externally-managed-environment`. +python3 -m venv .venv +source .venv/bin/activate # Windows: .venv\Scripts\activate +# After activation, `python` resolves to the venv's interpreter. + +pip install -r requirements.txt + +# Downloads ~120MB of CSVs, builds a small 100K/25K parquet split. +# Time-based split: last 20% of transactions by TransactionDT = validation. 
+# ~2-3 minutes on a modern laptop. +python prepare_data.py +``` + +After this you should have: + +``` +data/ + train_transaction.csv, train_identity.csv, test_*.csv # raw + base_train_small.parquet # 100K rows, time-ordered + base_val_small.parquet # 25K rows, later in time +``` + +## Quick sanity check + +Run the baseline once to confirm everything loads: + +```bash +python evaluate.py +# → auc_roc: 0.910xxx (takes ~30s) +``` + +If you see an AUC in the 0.90-0.92 range, you're ready. + +## Run Weco + +The "default" run uses the full EDA + techniques instructions (recommended — +they contain the column semantics and known-good techniques for this dataset): + +```bash +weco run --source train.py \ + --eval-command "python evaluate.py" \ + --metric auc_roc \ + --goal maximize \ + --steps 50 \ + --model gemini-3.1-pro-preview \ + --additional-instructions instructions.md \ + --eval-timeout 300 \ + --log-dir .runs/fraud-detection +``` + +Expected trajectory: + +- Steps 1–10: Weco explores — tries log-amount, simple aggregations, category + encodings. AUC moves into 0.918-0.925. +- Steps 10–50: builds UID-style features (card1 + addr1 + account-creation + estimate via `D1`), target encoding with out-of-fold protection, velocity + features. AUC climbs to 0.928-0.933. +- Beyond step 50: diminishing returns; the pooled mean across 6 seeds in our + case study was 0.9305 ± 0.0035. + +## Explanation + +- `--source train.py` — the file Weco rewrites. Both `build_features` and + `train_and_evaluate` are fair game. +- `--eval-command "python evaluate.py"` — called after every proposed edit; + reimports `train.py`, runs the pipeline, prints `auc_roc: 0.xxxxxx`. Weco + parses the last line matching `--metric`. +- `--metric auc_roc --goal maximize` — Weco optimizes the metric printed by + the evaluator. +- `--additional-instructions instructions.md` — injects domain context into + every optimization step. 
**This is what matters most.** See the + case study: EDA-level instructions (what each column means in this + specific dataset) drive most of the gain. Kaggle-classic techniques are + typically already in the LLM's pretraining distribution. Feed the optimizer + what it couldn't already know — dataset-specific semantics, proprietary + heuristics, internal constraints. +- `--eval-timeout 300` — one eval takes ~30-60s; 300s gives headroom for + feature-heavy proposals. + +## Things to try + +1. **No instructions baseline**: remove `--additional-instructions` and watch + variance across seeds balloon (std ~0.008 vs ~0.002 with instructions). + Also watch for silently-leaky proposals (see below). +2. **EDA only**: keep only the column-meaning section of `instructions.md` — + the case study found this accounts for most of the mean gain. +3. **Scope restriction**: point Weco at `train.py`'s `build_features` only by + editing the file to expose just that function (or split the pipeline into + `features.py` + `model.py`). In our case study, features-only delivered + most of the improvement that full-pipeline did. + +## Watch out for silent leakage + +Two flavors both show up in IEEE-CIS optimization runs. + +**Target leakage** — `isFraud` ends up encoded into features. A plausible +idea like "count how many columns are zero per row" becomes leaky if the +dataframe still contains `isFraud`, because fraud rows contribute a +different count than non-fraud rows. The baseline `build_features` drops +`isFraud` and `TransactionID` up-front; don't let proposals reintroduce +aggregations on a dataframe that still has the label. The case study walks +through a real instance where this bug reported AUC 0.9591 that dropped to +0.9154 after a one-line fix — see +[the case study write-up](https://weco.ai/blog/framing-the-problem). + +**Time leakage** — validation-period statistics leak into train features. +This is a time-based split; at serving time you don't have the val period. 
+Any encoder, groupby aggregation, frequency count, or target encoding must +be **fit on `train_df` only** and then applied to both splits. The baseline +demonstrates the pattern — fit `card1_amt_mean` on train, `.join` it onto +both train and val, fill unseen val keys with a train-global default. If a +proposal does `pd.concat([train_df, val_df]).groupby(...)`, that's a leak +even if it drops `isFraud` first. + +Signs a run has one of these leaks (AUC suspiciously high on this 100K/25K +subsample, e.g. > 0.95): + +- Any `df.sum`/`df.mean`/`(df == x)` across all columns before the label is + dropped. +- Target encoding without out-of-fold protection (encoder fit on full train + then applied to train). +- Groupby / value-counts / target encoders fit on `pd.concat([train, val])`. +- Features computed using validation data at all — velocity features that + sort train + val together and take row-wise diffs, etc. + +## Citing the case study + +If you use this example, the underlying numbers come from +[Weco's fraud-detection case study](https://weco.ai/blog/framing-the-problem). Setup: 200 steps, +3 seeds per condition (6 for the Full pipeline + Full-instructions condition, +pooled since the two ablations share that configuration), +`gemini-3.1-pro-preview`. diff --git a/examples/fraud-detection/evaluate.py b/examples/fraud-detection/evaluate.py new file mode 100644 index 0000000..d3ad2d6 --- /dev/null +++ b/examples/fraud-detection/evaluate.py @@ -0,0 +1,35 @@ +"""Evaluator Weco calls after each proposed edit. + +Loads train.py fresh each run (Weco rewrites it in place), executes the +pipeline, and prints a single `auc_roc: 0.xxxxxx` line that Weco parses as +the metric. 
+""" + +from __future__ import annotations + +import importlib.util +import sys +from pathlib import Path + + +def load_module(path: str): + spec = importlib.util.spec_from_file_location("train_under_test", path) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return mod + + +def main() -> int: + train = load_module(str(Path(__file__).parent / "train.py")) + auc = train.run_pipeline() + + if not (0.0 <= auc <= 1.0): + print(f"Constraint violated: AUC-ROC out of range ({auc})") + return 1 + + print(f"auc_roc: {auc:.6f}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/examples/fraud-detection/instructions.md b/examples/fraud-detection/instructions.md new file mode 100644 index 0000000..a57ac47 --- /dev/null +++ b/examples/fraud-detection/instructions.md @@ -0,0 +1,116 @@ +# Fraud Detection Optimization Instructions + +## Task +Optimize `train.py` to maximize AUC-ROC for fraud detection on the IEEE-CIS dataset. You may modify both `build_features` (feature engineering) and `train_and_evaluate` (model config). Keep `run_pipeline`'s interface and the `auc_roc: 0.xxxxxx` print format unchanged so the evaluator can parse the metric. + +## Dataset Details +- 100K train / 25K val, 3.5% fraud rate, time-based split +- Base data has 297 columns after V-feature correlation pruning +- Categoricals are already label-encoded as integers +- TransactionDT is in seconds (timedelta from reference date, NOT a timestamp) + +## Column Meanings (from Kaggle community reverse-engineering) + +### Raw columns +- **TransactionAmt**: USD amount. Heavy-tailed (median $68, max $4578). Log transform essential. +- **ProductCD**: Product type (5 categories: C, H, R, S, W). Each has a distinct V-feature NaN pattern and fraud rate (C=11%, W=2.1%). +- **card1**: Bank Identification Number (BIN) β€” first 6 digits of card. Top-3 importance. +- **card2**: Additional card info. 1.5% NaN. Top-3 importance. 
+- **card3/card5**: Card country/product type codes. +- **card4**: Card network (visa, mastercard, etc). +- **card6**: Card type (credit, debit). +- **addr1**: Billing zip code (anonymized). 11.5% NaN. +- **addr2**: Billing country. +- **P_emaildomain**: Purchaser email domain (gmail.com, yahoo.com, etc). +- **R_emaildomain**: Recipient email domain. Mismatch between P and R = fraud signal. +- **dist1/dist2**: Distance features. + +### C-features (C1-C14): Entity occurrence COUNTS, no NaN +- **C1** (importance rank #2): Count of addresses associated with the payment card +- **C2**: Count of cards at the billing address +- **C5**: Count of email addresses seen with this card +- **C11**: Count of cards associated with a user identity +- **C12**: Count of addresses associated with a user identity +- **C13** (importance rank #4): Count of distinct email domains per entity — **one of the single most predictive raw features**. High values = fraud ring. +- **C14** (importance rank #3): Related count feature + +### D-features (D1-D15): TIMEDELTA in days between events +- **D1** (0.2% NaN, median 1 day): Days since last transaction. Most important D-feature. `TransactionDT/86400 - D1` estimates the **account creation date** — this is the key insight for UID construction. +- **D2** (49% NaN, median 97 days): Days since card was first associated with the identity +- **D3** (46% NaN): Days since last similar transaction +- **D4** (29.5% NaN): Days since email association +- **D10** (14% NaN): Days since last device-linked transaction +- **D11** (52% NaN): Days since account was opened / account age +- **D15** (16.5% NaN, median 46 days): Days since last transaction (alternative) +- D-feature NaN rates themselves are informative — missingness patterns encode transaction type + +### M-features (M1-M9): Binary MATCH indicators +Whether certain attributes match each other (name↔address, card↔billing, device↔historical, etc). 
Sum of True values, count of NaN, and the M-vector signature are all useful. + +### V-features (V1-V339, ~202 after pruning): Vesta-engineered risk signals +Grouped by ProductCD — each product type uses a different subset of V-features (others are NaN). V258 is the #1 most important feature overall (gain=16703). Other important V-features: V283, V69, V130, V307, V294, V201. + +## Top Winning Techniques (from 1st-3rd place solutions) + +### 1. UID Construction (THE most impactful single technique) +```python +D1_start = floor(TransactionDT / 86400 - D1) # estimated account creation day +uid = card1 + "_" + addr1 + "_" + D1_start +``` +This creates a stable user fingerprint. All aggregation features should be computed on this UID. + +### 2. UID-level aggregation features +For each UID, compute: mean, std, count of TransactionAmt. Then z-score and ratio for each transaction relative to user's history. This captures "is this transaction unusual for this user?" + +### 3. Temporal centroid distance +Compute the user's typical time-of-day using cyclical hour_sin/hour_cos means. The Euclidean distance of the current transaction from the centroid = "is this at an unusual time for this user?" + +### 4. D-feature lifecycle lags +D1 - D2, D1 - D4, D1 - D10, D1 - D15: Inconsistencies between these timestamps indicate synthetic identities or account takeovers. + +### 5. Velocity features (sort by [uid, TransactionDT]) +Time since last transaction per user. Amount change from previous transaction. High velocity + high amount = fraud signal. + +### 6. Cross-entity cardinality (nunique) +How many unique addr1 values per card1? How many unique card1 per addr1? How many unique P_emaildomain per uid? High cardinality = suspicious. + +### 7. NaN pattern signature +The binary NaN/not-NaN pattern across D+M columns encodes the transaction type. Compute a bitwise signature or just count NaN per feature group. + +### 8. Frequency encoding +For card1, card2, addr1, P_emaildomain, etc. 
— map each value to its frequency. Rare values (appearing once or twice) are fraud signals. + +### 9. Interaction features +- amount_zscore × time_distance (unusual amount at unusual time) +- amount_zscore × C1_ratio (unusual amount with unusual address count) +- amount / (D1 + 1) = spending rate per day since last transaction + +### 10. Row-wise missingness features +Count of NaN values across D-columns, M-columns, V-columns per row. Sum/mean of M-column values. The NaN pattern encodes the transaction profile. + +## Important Constraints +- Keep code under 300 lines (Weco backend limit) +- Use n_jobs=4 for any model operations +- `train.py` loads `data/base_train_small.parquet` and `data/base_val_small.parquet` — don't change these paths +- Categoricals are already integer-encoded — treat them as numeric +- Keep the `run_pipeline() -> float` function signature and the `auc_roc: 0.xxxxxx` print format intact + +## Avoiding silent leakage + +Two distinct leaks to avoid. Both inflate reported AUC without improving the real pipeline. + +**1. Target leakage (isFraud bleeding into features).** `isFraud` is the label. If you compute features that aggregate across all columns of the dataframe (e.g. `(df == 0).sum(axis=1)`, row-wise NaN counts over the entire frame), drop `isFraud` and `TransactionID` first. Otherwise the label signal encodes into the features and produces implausibly high AUC (> 0.95) that collapses the moment the fix is applied. + +**2. Time leakage (validation distribution bleeding into features).** This is a time-based train/val split — val rows are transactions from a later period you wouldn't see at serving time. Any encoder, aggregation, frequency count, or target encoding MUST be fit on `train_df` only and then applied to both splits. Concatenating `train_df + val_df` before a `groupby` lets val-period statistics shape train features and lets each val row influence its own encoded values. 
Expected fallout: smaller inflation than target leakage, but still material (noticeable bump in val AUC that doesn't survive a real time cutoff). + +Pattern to follow for any new group/frequency/target encoder: + +```python +# Fit on train +freq = train_df[col].value_counts(normalize=True) +# Apply to both, unseen keys get 0 (or a sensible train-global default) +train_df[f"{col}_freq"] = train_df[col].map(freq).fillna(0) +val_df[f"{col}_freq"] = val_df[col].map(freq).fillna(0) +``` + +For target encoding specifically, even on train you need out-of-fold protection (fit encoder on K-1 folds, apply to the held-out fold) β€” otherwise you leak train labels into train features. diff --git a/examples/fraud-detection/prepare_data.py b/examples/fraud-detection/prepare_data.py new file mode 100644 index 0000000..f538b9d --- /dev/null +++ b/examples/fraud-detection/prepare_data.py @@ -0,0 +1,145 @@ +"""Download IEEE-CIS data, build base features, subsample to a small split. + +Produces `data/base_train_small.parquet` and `data/base_val_small.parquet` that +`train.py` loads. The split is time-based (the last 20% of transactions by +TransactionDT are held out for validation), which mirrors production fraud +detection: you never train on future data. + +Usage: + # 1. Put your Kaggle API token at ~/.kaggle/kaggle.json + # (see https://github.com/Kaggle/kaggle-api#api-credentials) + # 2. Join the competition on kaggle.com/c/ieee-fraud-detection to accept rules + # 3. Run: + python prepare_data.py + +Runtime: ~2-3 minutes on a modern laptop. Produces ~150MB of parquet files. 
+""" + +from __future__ import annotations + +import subprocess +import sys +import zipfile +from pathlib import Path + +import numpy as np +import pandas as pd + +DATA_DIR = Path(__file__).parent / "data" +TRAIN_SIZE = 100_000 +VAL_SIZE = 25_000 +TIME_SPLIT_FRAC = 0.8 # first 80% of transactions by time = train candidates +SEED = 42 + + +def download_kaggle() -> None: + """Download ieee-fraud-detection via the Kaggle CLI.""" + DATA_DIR.mkdir(exist_ok=True) + txn = DATA_DIR / "train_transaction.csv" + ident = DATA_DIR / "train_identity.csv" + if txn.exists() and ident.exists(): + print(f"[skip] raw CSVs already present in {DATA_DIR}") + return + + print(f"[download] kaggle competitions download -c ieee-fraud-detection -p {DATA_DIR}") + print("[download] this takes ~1-2 min over a fast link; ~120MB of CSVs") + # Use `python -m kaggle.cli` β€” the `kaggle` package has no __main__, so + # `python -m kaggle` fails. kaggle.cli is the canonical entry point. + try: + subprocess.check_call( + [sys.executable, "-m", "kaggle.cli", "competitions", "download", + "-c", "ieee-fraud-detection", "-p", str(DATA_DIR)] + ) + except subprocess.CalledProcessError as e: + print( + "\n[error] Kaggle download failed. Most common causes:\n" + " 1. You haven't joined the competition. Visit\n" + " https://www.kaggle.com/c/ieee-fraud-detection\n" + " and click 'Late Submission' / 'Join Competition' to accept the rules.\n" + " 2. ~/.kaggle/kaggle.json is missing or has wrong permissions.\n" + " Run: chmod 600 ~/.kaggle/kaggle.json\n", + file=sys.stderr, + ) + raise SystemExit(e.returncode) + zip_path = DATA_DIR / "ieee-fraud-detection.zip" + print(f"[extract] {zip_path}") + with zipfile.ZipFile(zip_path) as zf: + zf.extractall(DATA_DIR) + zip_path.unlink() + + +def build_base_features(df: pd.DataFrame) -> pd.DataFrame: + """Minimal, leakage-safe preprocessing so train.py has a clean starting point. 
+ + - Drop test-specific columns + - Label-encode object columns (LightGBM doesn't take strings) + - Reduce highly correlated V-features (drop one per cluster with r > 0.95) + to keep train.py's input dimensionality manageable + """ + # Label-encode all object columns. Keep isFraud/TransactionID/TransactionDT intact. + obj_cols = df.select_dtypes(include=["object"]).columns.tolist() + for col in obj_cols: + df[col] = df[col].astype("category").cat.codes.astype(np.int32) + + # Reduce V-features by correlation clustering (done on a sample for speed). + v_cols = [c for c in df.columns if c.startswith("V")] + if v_cols: + sample = df[v_cols].sample(n=min(10_000, len(df)), random_state=SEED) + corr = sample.corr().abs() + upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool)) + to_drop = [c for c in upper.columns if (upper[c] > 0.95).any()] + df = df.drop(columns=to_drop) + print(f"[v-reduce] dropped {len(to_drop)}/{len(v_cols)} correlated V-features") + + return df + + +def time_based_split(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: + df = df.sort_values("TransactionDT").reset_index(drop=True) + split_point = df["TransactionDT"].quantile(TIME_SPLIT_FRAC) + train = df[df["TransactionDT"] <= split_point].copy() + val = df[df["TransactionDT"] > split_point].copy() + return train, val + + +def subsample(df: pd.DataFrame, n: int, label: str) -> pd.DataFrame: + if len(df) <= n: + return df + sampled = df.sample(n=n, random_state=SEED).sort_values("TransactionDT").reset_index(drop=True) + fraud_rate = sampled["isFraud"].mean() + print(f"[subsample] {label}: {len(df)} -> {len(sampled)} (fraud rate {fraud_rate:.3%})") + return sampled + + +def main() -> None: + download_kaggle() + + train_out = DATA_DIR / "base_train_small.parquet" + val_out = DATA_DIR / "base_val_small.parquet" + if train_out.exists() and val_out.exists(): + print(f"[skip] {train_out.name} and {val_out.name} already exist") + return + + print("[load] merging train_transaction + 
train_identity") + txn = pd.read_csv(DATA_DIR / "train_transaction.csv") + ident = pd.read_csv(DATA_DIR / "train_identity.csv") + df = txn.merge(ident, on="TransactionID", how="left") + print(f"[load] shape={df.shape}, fraud rate {df['isFraud'].mean():.3%}") + + df = build_base_features(df) + + print("[split] time-based 80/20") + train_df, val_df = time_based_split(df) + print(f"[split] train={len(train_df)} val={len(val_df)}") + + train_small = subsample(train_df, TRAIN_SIZE, "train") + val_small = subsample(val_df, VAL_SIZE, "val") + + train_small.to_parquet(train_out, index=False) + val_small.to_parquet(val_out, index=False) + print(f"[write] {train_out}") + print(f"[write] {val_out}") + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/examples/fraud-detection/requirements.txt b/examples/fraud-detection/requirements.txt new file mode 100644 index 0000000..188fe96 --- /dev/null +++ b/examples/fraud-detection/requirements.txt @@ -0,0 +1,7 @@ +weco +numpy>=1.24 +pandas>=2.0 +scikit-learn>=1.3 +lightgbm>=4.0 +pyarrow>=13.0 +kaggle>=1.6 diff --git a/examples/fraud-detection/train.py b/examples/fraud-detection/train.py new file mode 100644 index 0000000..10e1c46 --- /dev/null +++ b/examples/fraud-detection/train.py @@ -0,0 +1,154 @@ +"""Baseline fraud-detection pipeline on IEEE-CIS. Weco will optimize this file. + +Weco can modify anything in `build_features` and `train_and_evaluate`. The +`run_pipeline` function is the entry point called by `evaluate.py`. + +Keep the final print format exactly as `auc_roc: 0.xxxxxx` so Weco can parse +the metric. Everything else is fair game to rewrite. 
+""" + +from __future__ import annotations + +import numpy as np +import pandas as pd +import lightgbm as lgb +from sklearn.metrics import roc_auc_score + + +def _add_row_features(df: pd.DataFrame) -> pd.DataFrame: + """Per-row features that don't depend on any other row (safe to compute anywhere).""" + df = df.copy() + df["hour"] = (df["TransactionDT"] // 3600) % 24 + df["day_of_week"] = (df["TransactionDT"] // 86400) % 7 + df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24) + df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24) + + df["TransactionAmt_log"] = np.log1p(df["TransactionAmt"]) + df["TransactionAmt_decimal"] = ( + df["TransactionAmt"] - df["TransactionAmt"].astype(int) + ).round(2) + df["TransactionAmt_is_round"] = (df["TransactionAmt_decimal"] == 0).astype(np.int8) + return df + + +def build_features( + train_df: pd.DataFrame, val_df: pd.DataFrame +) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """Build features from the base data. Returns (X_train, y_train, X_val, y_val). + + Any aggregation or encoding (groupby stats, frequency, target encoding, ...) + is fit on `train_df` ONLY and applied to both train and val. This mirrors + production: at serving time you do not have the validation period yet, so + letting val rows shape the features is time-leakage that inflates the AUC + Weco optimizes against. + + Weco can replace or extend this β€” the case study found UID-based + aggregations (card1 + addr1 + account-creation-day estimate), target + encoding with out-of-fold protection, frequency encoding, and velocity + features are the most impactful additions. Keep the fit-on-train / + apply-to-both discipline for any new encoder. + """ + y_train = train_df["isFraud"].values.astype(np.int32) + y_val = val_df["isFraud"].values.astype(np.int32) + + # Drop label/ID from a copy of each split so no downstream aggregation can + # accidentally include them. 
+ train = train_df.drop(columns=["isFraud", "TransactionID"]) + val = val_df.drop(columns=["isFraud", "TransactionID"]) + + train = _add_row_features(train) + val = _add_row_features(val) + + # --- Aggregations on card1 / addr1 (fit on train, apply to both) --- + for key in ["card1", "addr1"]: + grp = train.groupby(key)["TransactionAmt"] + stats = grp.agg(["mean", "std", "count"]).rename( + columns={"mean": f"{key}_amt_mean", + "std": f"{key}_amt_std", + "count": f"{key}_amt_count"} + ) + # Unseen keys in val: fall back to train-global mean/std and count=0. + defaults = { + f"{key}_amt_mean": train["TransactionAmt"].mean(), + f"{key}_amt_std": train["TransactionAmt"].std(), + f"{key}_amt_count": 0, + } + train = train.join(stats, on=key) + val = val.join(stats, on=key) + for col, default in defaults.items(): + train[col] = train[col].fillna(default) + val[col] = val[col].fillna(default) + + # --- Frequency encoding (fit on train, apply to both; unseen = 0) --- + for col in ["card1", "card2", "card5", "addr1"]: + if col not in train.columns: + continue + freq = train[col].value_counts(normalize=True) + train[f"{col}_freq"] = train[col].map(freq).fillna(0) + val[f"{col}_freq"] = val[col].map(freq).fillna(0) + + train = train.drop(columns=["TransactionDT"]) + val = val.drop(columns=["TransactionDT"]) + # Align columns in case defaults introduced divergent dtypes. + val = val[train.columns] + + X_train = train.values.astype(np.float32) + X_val = val.values.astype(np.float32) + return X_train, y_train, X_val, y_val + + +def train_and_evaluate( + X_train: np.ndarray, + y_train: np.ndarray, + X_val: np.ndarray, + y_val: np.ndarray, +) -> float: + """Train LightGBM and return AUC-ROC on the validation set. + + Reasonable-but-not-heavily-tuned hyperparameters. A fraud team would + typically run Optuna for 50-100 trials on these β€” there is headroom. 
+ """ + params = { + "objective": "binary", + "metric": "auc", + "boosting_type": "gbdt", + "learning_rate": 0.05, + "num_leaves": 127, + "max_depth": -1, + "min_child_samples": 50, + "subsample": 0.8, + "colsample_bytree": 0.8, + "reg_alpha": 0.1, + "reg_lambda": 1.0, + "scale_pos_weight": 1, + "n_jobs": 4, + "verbose": -1, + "seed": 42, + } + + train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False) + val_data = lgb.Dataset(X_val, label=y_val, reference=train_data, free_raw_data=False) + + model = lgb.train( + params, + train_data, + num_boost_round=1000, + valid_sets=[val_data], + valid_names=["val"], + callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)], + ) + + y_pred = model.predict(X_val) + return float(roc_auc_score(y_val, y_pred)) + + +def run_pipeline() -> float: + train_df = pd.read_parquet("data/base_train_small.parquet") + val_df = pd.read_parquet("data/base_val_small.parquet") + X_train, y_train, X_val, y_val = build_features(train_df, val_df) + return train_and_evaluate(X_train, y_train, X_val, y_val) + + +if __name__ == "__main__": + auc = run_pipeline() + print(f"auc_roc: {auc:.6f}")