Overview¶
Program analysis methodology used in the Obesity ML Competition.
- Parts 1–3 — cell-program classification (pre-adipocyte / adipocyte / lipogenic / other) and proportions. Methodology used for Crunch 1 and Crunch 2.
- Part 4 — thermogenic signature scoring. Methodology used for Crunch 3.
Setup¶
%%capture
# Install required packages
%pip install "pyscalop>=0.3.0" pandas pyarrow --quiet
import numpy as np
import pandas as pd
import pyscalop as ps
full_matrix = "data/m.allgenes.logcpm.parquet"
proc_matrix = "data/m.logcpm.parquet"
gene_cutoff = 3
# read in expression matrix (cells x genes, log2(CPM/10 + 1))
m = pd.read_parquet(full_matrix).set_index("cell_id")
# calculate average gene expression
avg = ps.aggr_gene_expr(m, is_bulk=False)
# filter genes that do not pass minimum gene expression cutoff
m2 = m.loc[:, avg >= gene_cutoff]
# save filtered matrix
m2.reset_index().to_parquet(proc_matrix, index=False)
1.2 Score Adipo/Preadipo¶
proc_matrix = "data/m.logcpm.parquet"
gene_sigs = "sigs/adipo_preadipo_sigs.csv"
frac_genes_retained = 0.5
n_genes_retained = 10
score_results = "data/permuted_scores.adipo_preadipo_sigs.parquet"
# Read in processed expression matrix (cells x genes)
m = pd.read_parquet(proc_matrix).set_index("cell_id")
# Read in gene signatures to score cells by
# there are 2 adipocyte signatures: "adipomap.Ad" and "yi.AD"
# and 2 pre-adipocyte (progenitor) signatures: "adipomap.ASPC" and "yi.ASPC"
sigs = pd.read_csv(gene_sigs).groupby("name")["genes"].agg(list).to_dict()
# Filter out signatures whose genes are poorly represented in {proc_matrix}
# A signature is kept if at least 10 genes and 50% of the signature exist in {proc_matrix}
# {frac_genes_retained} and {n_genes_retained}
sigs_filt = {name: genes for name, genes in sigs.items()
if sum(g in m.columns for g in genes) >= n_genes_retained}
# Calculate single-cell expression scores for each signature and FDRs
scores_permuted = ps.sig_scores(m, sigs_filt, permute=True, conserved_genes=frac_genes_retained)
# Save results
scores_permuted.to_parquet(score_results, index=False)
1.3 Classify Adipo/Preadipo/Other¶
score_results = "data/permuted_scores.adipo_preadipo_sigs.parquet"
fdr_cutoff = 0.01
diff_cutoff = 0.3
# Adipocyte and pre-adipocyte signature names (matches sig file).
adipo_sigs = ["adipomap.Ad", "yi.AD"]
preadipo_sigs = ["adipomap.ASPC", "yi.ASPC"]
# Pivot long -> wide; flatten the MultiIndex to score_<sig> / fdr_<sig>
scores = pd.read_parquet(score_results)
scores_wide = scores.pivot(index="id", columns="sig", values=["score", "fdr"])
scores_wide.columns = scores_wide.columns.map("_".join)
scores_wide = scores_wide.reset_index()
# Per-cell summary stats
score_adipo = [f"score_{s}" for s in adipo_sigs]
score_preadipo = [f"score_{s}" for s in preadipo_sigs]
fdr_adipo = [f"fdr_{s}" for s in adipo_sigs]
fdr_preadipo = [f"fdr_{s}" for s in preadipo_sigs]
scores_wide["max_adipo"] = scores_wide[score_adipo].max(axis=1)
scores_wide["max_preadipo"] = scores_wide[score_preadipo].max(axis=1)
scores_wide["score_diff"] = scores_wide["max_adipo"] - scores_wide["max_preadipo"]
scores_wide["max_overall"] = scores_wide[score_adipo + score_preadipo].max(axis=1)
# Classify cells:
# Adipo: score_diff > diff_cutoff, both pre-adipo sigs non-significant,
# at least one adipo sig significant.
# Preadipo: mirror of the above.
# Other: ambiguous, dual-identity, or neither.
adipo_mask = (
(scores_wide["score_diff"] > diff_cutoff)
& (scores_wide[fdr_preadipo] >= fdr_cutoff).all(axis=1)
& (scores_wide[fdr_adipo] < fdr_cutoff).any(axis=1)
)
preadipo_mask = (
(scores_wide["score_diff"] < -diff_cutoff)
& (scores_wide[fdr_adipo] >= fdr_cutoff).all(axis=1)
& (scores_wide[fdr_preadipo] < fdr_cutoff).any(axis=1)
)
scores_wide["compartment"] = np.where(
adipo_mask, "Adipo",
np.where(preadipo_mask, "Preadipo", "Other"),
)
# Extract and save cell IDs per compartment (one cell per line)
def write_cells(cells, path):
with open(path, "w") as f:
f.write("\n".join(cells))
for label in ("Adipo", "Preadipo", "Other"):
cells = scores_wide.loc[scores_wide["compartment"] == label, "id"].tolist()
write_cells(cells, f"data/cells.{label.lower()}.txt")
adipo_cells_path = "data/cells.adipo.txt"
full_matrix = "data/m.allgenes.logcpm.parquet"
proc_matrix = "data/m.adipo.logcpm.parquet"
gene_cutoff = 3
# read expression matrix (cells x genes) and cells classified as Adipo
cells = open(adipo_cells_path).read().splitlines()
m = pd.read_parquet(full_matrix).set_index("cell_id")
# filter to keep only Adipo cells
m_adipo = m.loc[cells]
# calculate average gene expression
avg = ps.aggr_gene_expr(m_adipo, is_bulk=False)
# filter genes that do not pass minimum gene expression cutoff
m2 = m_adipo.loc[:, avg >= gene_cutoff]
# save filtered matrix
m2.reset_index().to_parquet(proc_matrix, index=False)
2.2 Score Lipogenic¶
proc_matrix = "data/m.adipo.logcpm.parquet"
gene_sigs = "sigs/lipogenic_sigs.csv"
frac_genes_retained = 0.5
n_genes_retained = 10
score_results = "data/permuted_scores.lipogenic_sigs.parquet"
# Read in processed expression matrix (cells x genes)
m = pd.read_parquet(proc_matrix).set_index("cell_id")
# Read in gene signatures to score cells by
# These are a collection of lipogenic signatures
sigs = pd.read_csv(gene_sigs).groupby("name")["genes"].agg(list).to_dict()
# Filter out signatures whose genes are poorly represented in {proc_matrix}
# A signature is kept if at least 10 genes and 50% of the signature exist in {proc_matrix}
# {frac_genes_retained} and {n_genes_retained}
sigs_filt = {name: genes for name, genes in sigs.items()
if sum(g in m.columns for g in genes) >= n_genes_retained}
# Calculate single-cell expression scores for each signature and FDRs
scores_permuted = ps.sig_scores(m, sigs_filt, permute=True, conserved_genes=frac_genes_retained)
# Save results
scores_permuted.to_parquet(score_results, index=False)
2.3 Classify Lipogenic¶
score_results = "data/permuted_scores.lipogenic_sigs.parquet"
fdr_cutoff = 0.01
score_cutoff = 0.3
n_sigs = 2
# Pivot long -> wide; flatten the MultiIndex to score_<sig> / fdr_<sig>
scores = pd.read_parquet(score_results)
scores_wide = scores.pivot(index="id", columns="sig", values=["score", "fdr"])
scores_wide.columns = scores_wide.columns.map("_".join)
scores_wide = scores_wide.reset_index()
# A signature passes if score > score_cutoff AND fdr < fdr_cutoff. Count per cell.
score_cols = [c for c in scores_wide.columns if c.startswith("score_")]
fdr_cols = [c.replace("score_", "fdr_") for c in score_cols]
passes = (
(scores_wide[score_cols].to_numpy() > score_cutoff)
& (scores_wide[fdr_cols].to_numpy() < fdr_cutoff)
)
scores_wide["n_pass"] = passes.sum(axis=1)
# Keep cells with at least n_sigs passing signatures; save as plain text.
lipogenic_cells = scores_wide.loc[scores_wide["n_pass"] >= n_sigs, "id"].tolist()
with open("data/cells.lipogenic.txt", "w") as f:
f.write("\n".join(lipogenic_cells))
Part 3: Proportions¶
import os, glob
# get cell assignment file paths
cell_files = sorted(glob.glob("data/cells.*.txt"))
# read in cell assignments
cell_list = {}
for f in cell_files:
name = os.path.basename(f).replace("cells.", "").replace(".txt", "")
cell_list[name] = open(f).read().splitlines()
# reorder groups
ord_keys = ["adipo", "preadipo", "other", "lipogenic"]
cell_list = {k: cell_list[k] for k in ord_keys if k in cell_list}
# get cell proportions
n_cells = {k: len(v) for k, v in cell_list.items()}
# lipogenic is a sub-population of adipo; exclude it from the denominator
denom = sum(v for k, v in n_cells.items() if k != "lipogenic")
frac_cells = {k: v / denom for k, v in n_cells.items()}
frac_cells
{'adipo': 0.291, 'preadipo': 0.396, 'other': 0.313, 'lipogenic': 0.084}
Expected output for sample data:
{'adipo': 0.291,
'preadipo': 0.384,
'other': 0.325,
'lipogenic': 0.084}
Part 4: Thermogenic Scoring¶
The scoring pipeline below is what produced the reference
*_ThermoScores_cell.csv and *_ThermoScores_perturbation.csv files released
to participants in
Crunch 3. Twelve
signatures are scored, taken from thermogenic_signatures.csv:
nonucp1.all
REACTOME_AMPK_inhibits_chREBP_R-HSA-163680
REACTOME_Integration_of_energy_metabolism_R-HSA-163685
REACTOME_Mitochondrial_biogenesis_R-HSA-1592230
emont.hAd6
lit.thermogenic
C2.WP_THERMOGENESIS
C5.GOBP_ADAPTIVE_THERMOGENESIS
C5.GOBP_BROWN_FAT_CELL_DIFFERENTIATION
C5.GOBP_NEGATIVE_REGULATION_OF_COLD_INDUCED_THERMOGENESIS
C5.GOBP_POSITIVE_REGULATION_OF_COLD_INDUCED_THERMOGENESIS
yi.Thermogenesis_Village
4.1 Score Thermogenic Signatures¶
proc_matrix = "data/m.adipo.logcpm.parquet"
gene_sigs = "sigs/thermogenic_signatures.csv"
n_genes_retained = 2
score_results = "data/permuted_scores.thermo_sigs.parquet"
# Read in processed adipocyte expression matrix (cells x genes)
m = pd.read_parquet(proc_matrix).set_index("cell_id")
# Read in the 12 thermogenic gene signatures
sigs = pd.read_csv(gene_sigs).groupby("name")["genes"].agg(list).to_dict()
# Trim each signature to genes present in the matrix
sigs_present = {name: [g for g in genes if g in m.columns] for name, genes in sigs.items()}
# Keep only sigs with at least {n_genes_retained} of their genes in the matrix
sigs_to_score = {name: genes for name, genes in sigs_present.items() if len(genes) >= n_genes_retained}
# Score each signature against each cell (raw score + permutation-based FDR)
scores = ps.sig_scores(m, sigs_to_score, permute=True, conserved_genes=0)
# Annotate each row with how many genes the signature contributed
n_genes_used = {name: len(g) for name, g in sigs_to_score.items()}
scores["n_genes_used"] = scores["sig"].map(n_genes_used)
# Save long per-(cell, sig) table: id, sig, score, fdr, n_genes_used
scores.to_parquet(score_results, index=False)
4.2 Z-score and Per-Perturbation Aggregation¶
Z-score each signature across all adipocyte cells, aggregate to perturbation
level (mean z per gene × sig), and rank perturbations by
agg_top3_z (mean of the top-3 signature z-scores). Outputs in the format of
the released reference files
for Crunch 3: ThermoScores_cell.csv (one row per cell) and
ThermoScores_perturbation.csv (one row per perturbation, with agg_top3_z
and rank).
score_results = "data/permuted_scores.thermo_sigs.parquet"
cell_out = "data/ThermoScores_cell.csv"
perturbation_out = "data/ThermoScores_perturbation.csv"
scores = pd.read_parquet(score_results)
# 1. Z-score each signature across all adipocyte cells.
scores["z_score"] = scores.groupby("sig")["score"].transform(lambda x: (x - x.mean()) / x.std())
# 2. Derive perturbation (gene) from cell id prefix.
# TF150 convention only — otherwise use AnnData `.obs['gene']` / cell metadata.
scores["gene"] = scores["id"].str.split("_", n=1).str[0]
# 3. Per-cell wide table: one row per cell, raw + z-scored columns per sig.
cell_raw = scores.pivot(index=["id", "gene"], columns="sig", values="score").reset_index()
cell_z = scores.pivot(index="id", columns="sig", values="z_score").reset_index()
cell_z.columns = ["id"] + [f"z_{c}" for c in cell_z.columns[1:]]
cell_wide = cell_raw.merge(cell_z, on="id").rename(columns={"id": "cell_id"})
cell_wide.to_csv(cell_out, index=False)
# 4. Per-perturbation: mean z-score per (gene, sig), then agg_top3_z = mean of
# the top-3 sig z-scores per perturbation. Rank descending.
perturb_wide = (
scores.groupby(["gene", "sig"])["z_score"].mean()
.unstack("sig").reset_index()
)
sig_cols = [c for c in perturb_wide.columns if c != "gene"]
sig_z = perturb_wide[sig_cols].to_numpy()
perturb_wide["agg_top3_z"] = np.sort(sig_z, axis=1)[:, -3:].mean(axis=1)
perturb_wide = perturb_wide.sort_values("agg_top3_z", ascending=False, ignore_index=True)
perturb_wide["rank"] = np.arange(1, len(perturb_wide) + 1)
perturb_wide.to_csv(perturbation_out, index=False)
Below: results from running the pipeline on the TF150 dataset.
ThermoScores_cell.csv — top row (of 30,975), showing 2 of the
12 raw + 12 z-scored signature columns:
cell_id gene emont.hAd6 z_emont.hAd6
PPARG_P2_TAACCTAGTGATGAGC-1 PPARG 0.0969 0.3058
ThermoScores_perturbation.csv — top 5 of 158 perturbations ranked by
agg_top3_z, showing the first 2 of the 12 signature columns:
gene emont.hAd6 lit.thermogenic agg_top3_z rank
RNASEH2C 0.5796 0.1700 0.5484 1
FAM136A 0.1190 0.4887 0.4885 2
TMEM107 -0.2030 0.3624 0.4199 3
EP400 0.2438 0.3996 0.3956 4
TRIM5 0.1003 0.1147 0.3177 5