Program Analysis
Obesity Machine Learning Competition
Julie Laffy
Postdoctoral fellow of the Eric and Wendy Schmidt Center
Broad Institute of MIT and Harvard
May 2026

Overview¶

Program analysis methodology used in the Obesity ML Competition.

  • Parts 1–3 — cell-program classification (pre-adipocyte / adipocyte / lipogenic / other) and proportions. Methodology used for Crunch 1 and Crunch 2.
  • Part 4 — thermogenic signature scoring. Methodology used for Crunch 3.

Setup¶

In [1]:
%%capture
# Install required packages
%pip install "pyscalop>=0.3.0" pandas pyarrow --quiet
In [2]:
import numpy as np
import pandas as pd
import pyscalop as ps

Part 1: All Cells¶

1.1 Preprocess¶

In [3]:
full_matrix = "data/m.allgenes.logcpm.parquet"
proc_matrix = "data/m.logcpm.parquet"
gene_cutoff = 3

# read in expression matrix (cells x genes, log2(CPM/10 + 1))
m = pd.read_parquet(full_matrix).set_index("cell_id")

# calculate average gene expression
avg = ps.aggr_gene_expr(m, is_bulk=False)

# filter genes that do not pass minimum gene expression cutoff
m2 = m.loc[:, avg >= gene_cutoff]

# save filtered matrix
m2.reset_index().to_parquet(proc_matrix, index=False)

1.2 Score Adipo/Preadipo¶

In [4]:
proc_matrix = "data/m.logcpm.parquet"
gene_sigs = "sigs/adipo_preadipo_sigs.csv"
frac_genes_retained = 0.5
n_genes_retained = 10
score_results = "data/permuted_scores.adipo_preadipo_sigs.parquet"

# Read in processed expression matrix (cells x genes)
m = pd.read_parquet(proc_matrix).set_index("cell_id")

# Read in gene signatures to score cells by
# there are 2 adipocyte signatures: "adipomap.Ad" and "yi.AD"
# and 2 pre-adipocyte (progenitor) signatures: "adipomap.ASPC" and "yi.ASPC"
sigs = pd.read_csv(gene_sigs).groupby("name")["genes"].agg(list).to_dict()

# Filter out signatures whose genes are poorly represented in {proc_matrix}
# A signature is kept if at least 10 genes and 50% of the signature exist in {proc_matrix}
# {frac_genes_retained} and {n_genes_retained}
sigs_filt = {name: genes for name, genes in sigs.items()
             if sum(g in m.columns for g in genes) >= n_genes_retained}

# Calculate single-cell expression scores for each signature and FDRs
scores_permuted = ps.sig_scores(m, sigs_filt, permute=True, conserved_genes=frac_genes_retained)

# Save results
scores_permuted.to_parquet(score_results, index=False)

1.3 Classify Adipo/Preadipo/Other¶

In [5]:
score_results = "data/permuted_scores.adipo_preadipo_sigs.parquet"
fdr_cutoff = 0.01
diff_cutoff = 0.3

# Adipocyte and pre-adipocyte signature names (matches sig file).
adipo_sigs    = ["adipomap.Ad",   "yi.AD"]
preadipo_sigs = ["adipomap.ASPC", "yi.ASPC"]

# Pivot long -> wide; flatten the MultiIndex to score_<sig> / fdr_<sig>
scores = pd.read_parquet(score_results)
scores_wide = scores.pivot(index="id", columns="sig", values=["score", "fdr"])
scores_wide.columns = scores_wide.columns.map("_".join)
scores_wide = scores_wide.reset_index()

# Per-cell summary stats
score_adipo    = [f"score_{s}" for s in adipo_sigs]
score_preadipo = [f"score_{s}" for s in preadipo_sigs]
fdr_adipo      = [f"fdr_{s}"   for s in adipo_sigs]
fdr_preadipo   = [f"fdr_{s}"   for s in preadipo_sigs]

scores_wide["max_adipo"]    = scores_wide[score_adipo].max(axis=1)
scores_wide["max_preadipo"] = scores_wide[score_preadipo].max(axis=1)
scores_wide["score_diff"]   = scores_wide["max_adipo"] - scores_wide["max_preadipo"]
scores_wide["max_overall"]  = scores_wide[score_adipo + score_preadipo].max(axis=1)

# Classify cells:
#   Adipo:    score_diff > diff_cutoff, both pre-adipo sigs non-significant,
#             at least one adipo sig significant.
#   Preadipo: mirror of the above.
#   Other:    ambiguous, dual-identity, or neither.
adipo_mask = (
    (scores_wide["score_diff"] > diff_cutoff)
    & (scores_wide[fdr_preadipo] >= fdr_cutoff).all(axis=1)
    & (scores_wide[fdr_adipo]    <  fdr_cutoff).any(axis=1)
)
preadipo_mask = (
    (scores_wide["score_diff"] < -diff_cutoff)
    & (scores_wide[fdr_adipo]    >= fdr_cutoff).all(axis=1)
    & (scores_wide[fdr_preadipo] <  fdr_cutoff).any(axis=1)
)
scores_wide["compartment"] = np.where(
    adipo_mask, "Adipo",
    np.where(preadipo_mask, "Preadipo", "Other"),
)

# Extract and save cell IDs per compartment (one cell per line)
def write_cells(cells, path):
    with open(path, "w") as f:
        f.write("\n".join(cells))

for label in ("Adipo", "Preadipo", "Other"):
    cells = scores_wide.loc[scores_wide["compartment"] == label, "id"].tolist()
    write_cells(cells, f"data/cells.{label.lower()}.txt")

Part 2: Adipo Cells¶

2.1 Preprocess¶

In [6]:
adipo_cells_path = "data/cells.adipo.txt"
full_matrix = "data/m.allgenes.logcpm.parquet"
proc_matrix = "data/m.adipo.logcpm.parquet"
gene_cutoff = 3

# read expression matrix (cells x genes) and cells classified as Adipo
cells = open(adipo_cells_path).read().splitlines()
m = pd.read_parquet(full_matrix).set_index("cell_id")

# filter to keep only Adipo cells
m_adipo = m.loc[cells]

# calculate average gene expression
avg = ps.aggr_gene_expr(m_adipo, is_bulk=False)

# filter genes that do not pass minimum gene expression cutoff
m2 = m_adipo.loc[:, avg >= gene_cutoff]

# save filtered matrix
m2.reset_index().to_parquet(proc_matrix, index=False)

2.2 Score Lipogenic¶

In [7]:
proc_matrix = "data/m.adipo.logcpm.parquet"
gene_sigs = "sigs/lipogenic_sigs.csv"
frac_genes_retained = 0.5
n_genes_retained = 10
score_results = "data/permuted_scores.lipogenic_sigs.parquet"

# Read in processed expression matrix (cells x genes)
m = pd.read_parquet(proc_matrix).set_index("cell_id")

# Read in gene signatures to score cells by
# These are a collection of lipogenic signatures
sigs = pd.read_csv(gene_sigs).groupby("name")["genes"].agg(list).to_dict()

# Filter out signatures whose genes are poorly represented in {proc_matrix}
# A signature is kept if at least 10 genes and 50% of the signature exist in {proc_matrix}
# {frac_genes_retained} and {n_genes_retained}
sigs_filt = {name: genes for name, genes in sigs.items()
             if sum(g in m.columns for g in genes) >= n_genes_retained}

# Calculate single-cell expression scores for each signature and FDRs
scores_permuted = ps.sig_scores(m, sigs_filt, permute=True, conserved_genes=frac_genes_retained)

# Save results
scores_permuted.to_parquet(score_results, index=False)

2.3 Classify Lipogenic¶

In [8]:
score_results = "data/permuted_scores.lipogenic_sigs.parquet"
fdr_cutoff = 0.01
score_cutoff = 0.3
n_sigs = 2

# Pivot long -> wide; flatten the MultiIndex to score_<sig> / fdr_<sig>
scores = pd.read_parquet(score_results)
scores_wide = scores.pivot(index="id", columns="sig", values=["score", "fdr"])
scores_wide.columns = scores_wide.columns.map("_".join)
scores_wide = scores_wide.reset_index()

# A signature passes if score > score_cutoff AND fdr < fdr_cutoff. Count per cell.
score_cols = [c for c in scores_wide.columns if c.startswith("score_")]
fdr_cols   = [c.replace("score_", "fdr_") for c in score_cols]
passes = (
    (scores_wide[score_cols].to_numpy() > score_cutoff)
    & (scores_wide[fdr_cols].to_numpy() < fdr_cutoff)
)
scores_wide["n_pass"] = passes.sum(axis=1)

# Keep cells with at least n_sigs passing signatures; save as plain text.
lipogenic_cells = scores_wide.loc[scores_wide["n_pass"] >= n_sigs, "id"].tolist()
with open("data/cells.lipogenic.txt", "w") as f:
    f.write("\n".join(lipogenic_cells))

Part 3: Proportions¶

In [9]:
import os, glob

# get cell assignment file paths
cell_files = sorted(glob.glob("data/cells.*.txt"))

# read in cell assignments
cell_list = {}
for f in cell_files:
    name = os.path.basename(f).replace("cells.", "").replace(".txt", "")
    cell_list[name] = open(f).read().splitlines()

# reorder groups
ord_keys = ["adipo", "preadipo", "other", "lipogenic"]
cell_list = {k: cell_list[k] for k in ord_keys if k in cell_list}

# get cell proportions
n_cells = {k: len(v) for k, v in cell_list.items()}
# lipogenic is a sub-population of adipo; exclude it from the denominator
denom = sum(v for k, v in n_cells.items() if k != "lipogenic")
frac_cells = {k: v / denom for k, v in n_cells.items()}
frac_cells
Out[9]:
{'adipo': 0.291, 'preadipo': 0.396, 'other': 0.313, 'lipogenic': 0.084}

Expected output for sample data:

{'adipo':     0.291,
 'preadipo':  0.384,
 'other':     0.325,
 'lipogenic': 0.084}

Part 4: Thermogenic Scoring¶

The scoring pipeline below is what produced the reference *_ThermoScores_cell.csv and *_ThermoScores_perturbation.csv files released to participants in Crunch 3. Twelve signatures are scored, taken from thermogenic_signatures.csv:

nonucp1.all
REACTOME_AMPK_inhibits_chREBP_R-HSA-163680
REACTOME_Integration_of_energy_metabolism_R-HSA-163685
REACTOME_Mitochondrial_biogenesis_R-HSA-1592230
emont.hAd6
lit.thermogenic
C2.WP_THERMOGENESIS
C5.GOBP_ADAPTIVE_THERMOGENESIS
C5.GOBP_BROWN_FAT_CELL_DIFFERENTIATION
C5.GOBP_NEGATIVE_REGULATION_OF_COLD_INDUCED_THERMOGENESIS
C5.GOBP_POSITIVE_REGULATION_OF_COLD_INDUCED_THERMOGENESIS
yi.Thermogenesis_Village

4.1 Score Thermogenic Signatures¶

In [10]:
proc_matrix = "data/m.adipo.logcpm.parquet"
gene_sigs = "sigs/thermogenic_signatures.csv"
n_genes_retained = 2
score_results = "data/permuted_scores.thermo_sigs.parquet"

# Read in processed adipocyte expression matrix (cells x genes)
m = pd.read_parquet(proc_matrix).set_index("cell_id")

# Read in the 12 thermogenic gene signatures
sigs = pd.read_csv(gene_sigs).groupby("name")["genes"].agg(list).to_dict()

# Trim each signature to genes present in the matrix
sigs_present = {name: [g for g in genes if g in m.columns] for name, genes in sigs.items()}

# Keep only sigs with at least {n_genes_retained} of their genes in the matrix
sigs_to_score = {name: genes for name, genes in sigs_present.items() if len(genes) >= n_genes_retained}

# Score each signature against each cell (raw score + permutation-based FDR)
scores = ps.sig_scores(m, sigs_to_score, permute=True, conserved_genes=0)

# Annotate each row with how many genes the signature contributed
n_genes_used = {name: len(g) for name, g in sigs_to_score.items()}
scores["n_genes_used"] = scores["sig"].map(n_genes_used)

# Save long per-(cell, sig) table: id, sig, score, fdr, n_genes_used
scores.to_parquet(score_results, index=False)

4.2 Z-score and Per-Perturbation Aggregation¶

Z-score each signature across all adipocyte cells, aggregate to perturbation level (mean z per gene × sig), and rank perturbations by agg_top3_z (mean of the top-3 signature z-scores). Outputs in the format of the released reference files for Crunch 3: ThermoScores_cell.csv (one row per cell) and ThermoScores_perturbation.csv (one row per perturbation, with agg_top3_z and rank).

In [11]:
score_results = "data/permuted_scores.thermo_sigs.parquet"
cell_out = "data/ThermoScores_cell.csv"
perturbation_out = "data/ThermoScores_perturbation.csv"

scores = pd.read_parquet(score_results)

# 1. Z-score each signature across all adipocyte cells.
scores["z_score"] = scores.groupby("sig")["score"].transform(lambda x: (x - x.mean()) / x.std())

# 2. Derive perturbation (gene) from cell id prefix.
#    TF150 convention only — otherwise use AnnData `.obs['gene']` / cell metadata.
scores["gene"] = scores["id"].str.split("_", n=1).str[0]

# 3. Per-cell wide table: one row per cell, raw + z-scored columns per sig.
cell_raw = scores.pivot(index=["id", "gene"], columns="sig", values="score").reset_index()
cell_z   = scores.pivot(index="id",          columns="sig", values="z_score").reset_index()
cell_z.columns = ["id"] + [f"z_{c}" for c in cell_z.columns[1:]]
cell_wide = cell_raw.merge(cell_z, on="id").rename(columns={"id": "cell_id"})
cell_wide.to_csv(cell_out, index=False)

# 4. Per-perturbation: mean z-score per (gene, sig), then agg_top3_z = mean of
#    the top-3 sig z-scores per perturbation. Rank descending.
perturb_wide = (
    scores.groupby(["gene", "sig"])["z_score"].mean()
          .unstack("sig").reset_index()
)
sig_cols = [c for c in perturb_wide.columns if c != "gene"]
sig_z = perturb_wide[sig_cols].to_numpy()
perturb_wide["agg_top3_z"] = np.sort(sig_z, axis=1)[:, -3:].mean(axis=1)
perturb_wide = perturb_wide.sort_values("agg_top3_z", ascending=False, ignore_index=True)
perturb_wide["rank"] = np.arange(1, len(perturb_wide) + 1)

perturb_wide.to_csv(perturbation_out, index=False)

Below: results from running the pipeline on the TF150 dataset.

ThermoScores_cell.csv — top row (of 30,975), showing 2 of the 12 raw + 12 z-scored signature columns:

                    cell_id  gene  emont.hAd6  z_emont.hAd6
PPARG_P2_TAACCTAGTGATGAGC-1 PPARG      0.0969        0.3058

ThermoScores_perturbation.csv — top 5 of 158 perturbations ranked by agg_top3_z, showing the first 2 of the 12 signature columns:

    gene  emont.hAd6  lit.thermogenic  agg_top3_z  rank
RNASEH2C      0.5796           0.1700      0.5484     1
 FAM136A      0.1190           0.4887      0.4885     2
 TMEM107     -0.2030           0.3624      0.4199     3
   EP400      0.2438           0.3996      0.3956     4
   TRIM5      0.1003           0.1147      0.3177     5