%%capture
# Install required packages
%pip install "pyscalop>=0.3.0" pandas pyarrow --quiet

import numpy as np
import pandas as pd
import pyscalop as ps

full_matrix = "data/m.allgenes.logcpm.parquet"
proc_matrix = "data/m.logcpm.parquet"
gene_cutoff = 3

# read in expression matrix (cells x genes, log2(CPM/10 + 1))
m = pd.read_parquet(full_matrix).set_index("cell_id")

# calculate average gene expression
avg = ps.aggr_gene_expr(m, is_bulk=False)

# filter genes that do not pass minimum gene expression cutoff
m2 = m.loc[:, avg >= gene_cutoff]

# save filtered matrix
m2.reset_index().to_parquet(proc_matrix, index=False)

proc_matrix = "data/m.logcpm.parquet"
gene_sigs = "sigs/adipo_preadipo_sigs.csv"
frac_genes_retained = 0.5
n_genes_retained = 10
score_results = "data/permuted_scores.adipo_preadipo_sigs.parquet"

# Read in processed expression matrix (cells x genes)
m = pd.read_parquet(proc_matrix).set_index("cell_id")

# Read in gene signatures to score cells by
# there are 2 adipocyte signatures: "adipomap.Ad" and "yi.AD"
# and 2 pre-adipocyte (progenitor) signatures: "adipomap.ASPC" and "yi.ASPC"
sigs = pd.read_csv(gene_sigs).groupby("name")["genes"].agg(list).to_dict()

# Filter out signatures whose genes are poorly represented in {proc_matrix}
# A signature is kept if at least 10 genes and 50% of the signature exist in {proc_matrix}
# {frac_genes_retained} and {n_genes_retained}
sigs_filt = {name: genes for name, genes in sigs.items()
             if sum(g in m.columns for g in genes) >= n_genes_retained}

# Calculate single-cell expression scores for each signature and FDRs
scores_permuted = ps.sig_scores(m, sigs_filt, permute=True, conserved_genes=frac_genes_retained)

# Save results
scores_permuted.to_parquet(score_results, index=False)

score_results = "data/permuted_scores.adipo_preadipo_sigs.parquet"
fdr_cutoff = 0.01
diff_cutoff = 0.3

# Adipocyte and pre-adipocyte signature names (matches sig file).
adipo_sigs    = ["adipomap.Ad",   "yi.AD"]
preadipo_sigs = ["adipomap.ASPC", "yi.ASPC"]

# Pivot long -> wide; flatten the MultiIndex to score_<sig> / fdr_<sig>
scores = pd.read_parquet(score_results)
scores_wide = scores.pivot(index="id", columns="sig", values=["score", "fdr"])
scores_wide.columns = scores_wide.columns.map("_".join)
scores_wide = scores_wide.reset_index()

# Per-cell summary stats
score_adipo    = [f"score_{s}" for s in adipo_sigs]
score_preadipo = [f"score_{s}" for s in preadipo_sigs]
fdr_adipo      = [f"fdr_{s}"   for s in adipo_sigs]
fdr_preadipo   = [f"fdr_{s}"   for s in preadipo_sigs]

scores_wide["max_adipo"]    = scores_wide[score_adipo].max(axis=1)
scores_wide["max_preadipo"] = scores_wide[score_preadipo].max(axis=1)
scores_wide["score_diff"]   = scores_wide["max_adipo"] - scores_wide["max_preadipo"]
scores_wide["max_overall"]  = scores_wide[score_adipo + score_preadipo].max(axis=1)

# Classify cells:
#   Adipo:    score_diff > diff_cutoff, both pre-adipo sigs non-significant,
#             at least one adipo sig significant.
#   Preadipo: mirror of the above.
#   Other:    ambiguous, dual-identity, or neither.
adipo_mask = (
    (scores_wide["score_diff"] > diff_cutoff)
    & (scores_wide[fdr_preadipo] >= fdr_cutoff).all(axis=1)
    & (scores_wide[fdr_adipo]    <  fdr_cutoff).any(axis=1)
)
preadipo_mask = (
    (scores_wide["score_diff"] < -diff_cutoff)
    & (scores_wide[fdr_adipo]    >= fdr_cutoff).all(axis=1)
    & (scores_wide[fdr_preadipo] <  fdr_cutoff).any(axis=1)
)
scores_wide["compartment"] = np.where(
    adipo_mask, "Adipo",
    np.where(preadipo_mask, "Preadipo", "Other"),
)

# Extract and save cell IDs per compartment (one cell per line)
def write_cells(cells, path):
    with open(path, "w") as f:
        f.write("\n".join(cells))

for label in ("Adipo", "Preadipo", "Other"):
    cells = scores_wide.loc[scores_wide["compartment"] == label, "id"].tolist()
    write_cells(cells, f"data/cells.{label.lower()}.txt")

adipo_cells_path = "data/cells.adipo.txt"
full_matrix = "data/m.allgenes.logcpm.parquet"
proc_matrix = "data/m.adipo.logcpm.parquet"
gene_cutoff = 3

# read expression matrix (cells x genes) and cells classified as Adipo
cells = open(adipo_cells_path).read().splitlines()
m = pd.read_parquet(full_matrix).set_index("cell_id")

# filter to keep only Adipo cells
m_adipo = m.loc[cells]

# calculate average gene expression
avg = ps.aggr_gene_expr(m_adipo, is_bulk=False)

# filter genes that do not pass minimum gene expression cutoff
m2 = m_adipo.loc[:, avg >= gene_cutoff]

# save filtered matrix
m2.reset_index().to_parquet(proc_matrix, index=False)

proc_matrix = "data/m.adipo.logcpm.parquet"
gene_sigs = "sigs/lipogenic_sigs.csv"
frac_genes_retained = 0.5
n_genes_retained = 10
score_results = "data/permuted_scores.lipogenic_sigs.parquet"

# Read in processed expression matrix (cells x genes)
m = pd.read_parquet(proc_matrix).set_index("cell_id")

# Read in gene signatures to score cells by
# These are a collection of lipogenic signatures
sigs = pd.read_csv(gene_sigs).groupby("name")["genes"].agg(list).to_dict()

# Filter out signatures whose genes are poorly represented in {proc_matrix}
# A signature is kept if at least 10 genes and 50% of the signature exist in {proc_matrix}
# {frac_genes_retained} and {n_genes_retained}
sigs_filt = {name: genes for name, genes in sigs.items()
             if sum(g in m.columns for g in genes) >= n_genes_retained}

# Calculate single-cell expression scores for each signature and FDRs
scores_permuted = ps.sig_scores(m, sigs_filt, permute=True, conserved_genes=frac_genes_retained)

# Save results
scores_permuted.to_parquet(score_results, index=False)

score_results = "data/permuted_scores.lipogenic_sigs.parquet"
fdr_cutoff = 0.01
score_cutoff = 0.3
n_sigs = 2

# Pivot long -> wide; flatten the MultiIndex to score_<sig> / fdr_<sig>
scores = pd.read_parquet(score_results)
scores_wide = scores.pivot(index="id", columns="sig", values=["score", "fdr"])
scores_wide.columns = scores_wide.columns.map("_".join)
scores_wide = scores_wide.reset_index()

# A signature passes if score > score_cutoff AND fdr < fdr_cutoff. Count per cell.
score_cols = [c for c in scores_wide.columns if c.startswith("score_")]
fdr_cols   = [c.replace("score_", "fdr_") for c in score_cols]
passes = (
    (scores_wide[score_cols].to_numpy() > score_cutoff)
    & (scores_wide[fdr_cols].to_numpy() < fdr_cutoff)
)
scores_wide["n_pass"] = passes.sum(axis=1)

# Keep cells with at least n_sigs passing signatures; save as plain text.
lipogenic_cells = scores_wide.loc[scores_wide["n_pass"] >= n_sigs, "id"].tolist()
with open("data/cells.lipogenic.txt", "w") as f:
    f.write("\n".join(lipogenic_cells))

import os, glob

# get cell assignment file paths
cell_files = sorted(glob.glob("data/cells.*.txt"))

# read in cell assignments
cell_list = {}
for f in cell_files:
    name = os.path.basename(f).replace("cells.", "").replace(".txt", "")
    cell_list[name] = open(f).read().splitlines()

# reorder groups
ord_keys = ["adipo", "preadipo", "other", "lipogenic"]
cell_list = {k: cell_list[k] for k in ord_keys if k in cell_list}

# get cell proportions
n_cells = {k: len(v) for k, v in cell_list.items()}
# lipogenic is a sub-population of adipo; exclude it from the denominator
denom = sum(v for k, v in n_cells.items() if k != "lipogenic")
frac_cells = {k: v / denom for k, v in n_cells.items()}
frac_cells

{'adipo': 0.291, 'preadipo': 0.396, 'other': 0.313, 'lipogenic': 0.084}

proc_matrix = "data/m.adipo.logcpm.parquet"
gene_sigs = "sigs/thermogenic_signatures.csv"
n_genes_retained = 2
score_results = "data/permuted_scores.thermo_sigs.parquet"

# Read in processed adipocyte expression matrix (cells x genes)
m = pd.read_parquet(proc_matrix).set_index("cell_id")

# Read in the 12 thermogenic gene signatures
sigs = pd.read_csv(gene_sigs).groupby("name")["genes"].agg(list).to_dict()

# Trim each signature to genes present in the matrix
sigs_present = {name: [g for g in genes if g in m.columns] for name, genes in sigs.items()}

# Keep only sigs with at least {n_genes_retained} of their genes in the matrix
sigs_to_score = {name: genes for name, genes in sigs_present.items() if len(genes) >= n_genes_retained}

# Score each signature against each cell (raw score + permutation-based FDR)
scores = ps.sig_scores(m, sigs_to_score, permute=True, conserved_genes=0)

# Annotate each row with how many genes the signature contributed
n_genes_used = {name: len(g) for name, g in sigs_to_score.items()}
scores["n_genes_used"] = scores["sig"].map(n_genes_used)

# Save long per-(cell, sig) table: id, sig, score, fdr, n_genes_used
scores.to_parquet(score_results, index=False)

score_results = "data/permuted_scores.thermo_sigs.parquet"
cell_out = "data/ThermoScores_cell.csv"
perturbation_out = "data/ThermoScores_perturbation.csv"

scores = pd.read_parquet(score_results)

# 1. Z-score each signature across all adipocyte cells.
scores["z_score"] = scores.groupby("sig")["score"].transform(lambda x: (x - x.mean()) / x.std())

# 2. Derive perturbation (gene) from cell id prefix.
#    TF150 convention only — otherwise use AnnData `.obs['gene']` / cell metadata.
scores["gene"] = scores["id"].str.split("_", n=1).str[0]

# 3. Per-cell wide table: one row per cell, raw + z-scored columns per sig.
cell_raw = scores.pivot(index=["id", "gene"], columns="sig", values="score").reset_index()
cell_z   = scores.pivot(index="id",          columns="sig", values="z_score").reset_index()
cell_z.columns = ["id"] + [f"z_{c}" for c in cell_z.columns[1:]]
cell_wide = cell_raw.merge(cell_z, on="id").rename(columns={"id": "cell_id"})
cell_wide.to_csv(cell_out, index=False)

# 4. Per-perturbation: mean z-score per (gene, sig), then agg_top3_z = mean of
#    the top-3 sig z-scores per perturbation. Rank descending.
perturb_wide = (
    scores.groupby(["gene", "sig"])["z_score"].mean()
          .unstack("sig").reset_index()
)
sig_cols = [c for c in perturb_wide.columns if c != "gene"]
sig_z = perturb_wide[sig_cols].to_numpy()
perturb_wide["agg_top3_z"] = np.sort(sig_z, axis=1)[:, -3:].mean(axis=1)
perturb_wide = perturb_wide.sort_values("agg_top3_z", ascending=False, ignore_index=True)
perturb_wide["rank"] = np.arange(1, len(perturb_wide) + 1)

perturb_wide.to_csv(perturbation_out, index=False)

Overview¶

Setup¶

Part 1: All Cells¶

1.1 Preprocess¶

1.2 Score Adipo/Preadipo¶

1.3 Classify Adipo/Preadipo/Other¶

Part 2: Adipo Cells¶

2.1 Preprocess¶

2.2 Score Lipogenic¶

2.3 Classify Lipogenic¶

Part 3: Proportions¶

Part 4: Thermogenic Scoring¶

4.1 Score Thermogenic Signatures¶

4.2 Z-score and Per-Perturbation Aggregation¶