Weinreb et al., 2020 Datasets (LARRY)

Lineage tracing on transcriptional landscapes links state to fate during differentiation

[12]:

import scanpy as sc
import numpy as np
import operator
import pandas as pd
import torch.nn.functional as F
import torch.nn as nn
import scanpy as sc
import matplotlib.pyplot as plt
import torch
import os
from torch.nn import DataParallel
import os
import anndata as ad
import cospar as cs
import scanpy as sc
def createFig(figsize=(8, 4)):
    fig,ax=plt.subplots()
    ax.spines['right'].set_color('none')
    ax.spines['top'].set_color('none')
    #ax.spines['bottom'].set_color('none')
    #ax.spines['left'].set_color('none')
    for line in ax.yaxis.get_ticklines():
        line.set_markersize(5)
        line.set_color("#585958")
        line.set_markeredgewidth(0.5)
    for line in ax.xaxis.get_ticklines():
        line.set_markersize(5)
        line.set_markeredgewidth(0.5)
        line.set_color("#585958")
    ax.set_xbound(0,10)
    ax.set_ybound(0,10)
    fig.set_size_inches(figsize)
    return fig,ax
def setPltLinewidth(linewidth:float):
    mpl.rcParams['axes.linewidth'] = linewidth
import matplotlib as mpl
fig,ax=createFig()
fig.set_size_inches(5,5)
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
setPltLinewidth(1)
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300
plt.rcParams['font.size'] = 14
plt.rcParams['axes.linewidth'] = 1
##setting draw parameters

[35]:

adata.obsm['X_clone']

[35]:

<49116x5864 sparse matrix of type '<class 'numpy.bool_'>'
        with 49116 stored elements in Compressed Sparse Row format>

[13]:

def ScanpyVolcanoPlot(adata, axis, use_adjusted_p=True, show_label=True, label_fold_change=2, label_log_p=80, add_grid=True, label_size=4, filter_labels=None, kept_labels=None, color1='#F09D30', color2='#3D5FA6', label_excludes=None):
    fig, ax = createFig()
    log2foldchanges = list(map(lambda x: x[axis], adata.uns['rank_genes_groups']['logfoldchanges']))
    if use_adjusted_p:
        log10adjp = -np.log10(list(map(lambda x: x[axis], adata.uns['rank_genes_groups']['pvals_adj'])))
    else:
        log10adjp = -np.log10(list(map(lambda x: x[axis], adata.uns['rank_genes_groups']['pvals'])))

    ax.scatter(
        x=log2foldchanges,
        y=log10adjp,
        s=12,
        alpha=0.8,
        linewidth=0,
        c=list(map(lambda x: color2 if abs(x[0]) > label_fold_change and x[1] > label_log_p and x[0] < 0
                   else color1 if abs(x[0]) > label_fold_change and x[1] > label_log_p and x[0] > 0
                   else 'gray', zip(log2foldchanges, log10adjp)))
    )
    if add_grid:
        ax.axvline(-label_fold_change, color ="black", alpha = 0.8, lw = 0.8, ls='--')
        ax.axvline(label_fold_change, color ="black", alpha = 0.8, lw = 0.8, ls='--')
        ax.axhline(label_log_p, color ="black", alpha = 0.8, lw = 0.8, ls='--')
        ax.grid(alpha=0.4)
    if show_label:
        indices = list(map(lambda x: abs(x[0]) > label_fold_change and x[1] > label_log_p, zip(log2foldchanges, log10adjp)))
        for i, j, s in zip(np.array(log2foldchanges)[indices], np.array(log10adjp)[indices], list(map(lambda x: x[1], adata.uns['rank_genes_groups']['names'][indices]))):
            if label_excludes is not None and s in label_excludes:
                continue
            if filter_labels is not None and s in filter_labels:
                continue
            if kept_labels is not None and s not in kept_labels:
                continue
            ax.text(i, j, s=s, size=label_size)

    return fig, ax

[26]:

adata=cs.hf.read("./2.2_weinreb_forziwei.h5ad")
import pickle
f_read = open('./dic_fate_weinreb_ourmodel_allundiff.pkl', 'rb')
a = pickle.load(f_read)
f_read = open('./dic_true_weinreb.pkl', 'rb')
b = pickle.load(f_read)
adata.obs['fate_pred']=None
adata.obs['fate_pred']=adata.obs['index'].map(a)
adata.obs['GT']=None
adata.obs['GT']=adata.obs['index'].map(b)

[36]:

f_read = open('/home/zhengtuo/songtao/DestinyNet/weinreb_2.22_fate_and_prob/dic_fate_weinreb_ourmodel.pkl', 'rb')
a = pickle.load(f_read)
adata.obs['fate_pred']=None
adata.obs['fate_pred']=adata.obs['index'].map(a)

[38]:

adata.obs['GT'].value_counts()#0: 1633, 1: 1761, 2: 271

[38]:

1.0    1761
0.0    1633
2.0     271
Name: GT, dtype: int64

[21]:

adata[adata.obs['time_info'].isin([2,4])].obs['state_info'].value_counts()

[21]:

undiff        11717
Neutrophil     2831
Monocyte       2571
Baso           1545
Meg             310
Mast            283
Lymphoid        115
Erythroid        83
Eos              36
pDC              32
Ccr7_DC          24
Name: state_info, dtype: int64

[5]:

weinreb_adata_raw = sc.read_h5ad("/home/zhengtuo/songtao/DestinyNet/weinreb.h5ad")
sc.pp.log1p(weinreb_adata_raw)
adata.raw = weinreb_adata_raw

[6]:

adata.obsm["X_tsne"] = sc.read_h5ad("/home/zhengtuo/songtao/DestinyNet/1.8_weinreb_forziwei.h5ad").obsm["X_emb_old"]

[7]:

sc.pl.umap(adata,color='fate_pred',s=10)

[15]:

###### from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import random
from math import sqrt
from random import choice
from torch.utils.data  import Dataset, DataLoader
import pandas as pd
#from layers import SinkhornDistance
num_relations=8
len_geneExp=25289
len_embedding=256
learning_rate=0.0001
num_epoch=50
batchsize=512

criterion_rec = nn.MSELoss()
geneEnc=nn.Sequential(
        nn.Dropout(),
        nn.Linear(len_geneExp, 100),
        nn.ReLU(),
                nn.Linear(100, 100),
                nn.ReLU(),
         nn.Linear(100, 100),
                nn.ReLU(),
        nn.Linear(100, 100),
                nn.ReLU(),
        nn.Linear(100, len_embedding),
    )

geneDec = nn.Sequential(
    nn.Linear(len_embedding, len_geneExp),
    #nn.Linear(512, len_geneExp),
)
genemap = nn.Sequential(
     #nn.Dropout(),
     nn.Linear(len_embedding, 100),
     nn.BatchNorm1d(100),
    nn.ReLU(),
    nn.Linear(100, 100),
    nn.BatchNorm1d(100),
    nn.ReLU(),
    nn.Linear(100, 100),
    nn.BatchNorm1d(100),
    nn.ReLU(),
    nn.Linear(100, 100),
    nn.BatchNorm1d(100),
    nn.ReLU(),
    nn.Linear(100, len_embedding),

)
class SELayer(nn.Module):
    def __init__(self, channel, reduction=16):
        super(SELayer, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction, bias=False),
            nn.LeakyReLU(inplace=False),
            nn.Linear(channel // reduction, channel, bias=False),
            nn.Sigmoid()
        )

    def forward(self, x):
        b, c, _ = x.size()
        #print('channel:',c)
        #print('size:',x.shape)
        y = self.avg_pool(x).view(b, c)
        y = self.fc(y).view(b, c, 1)
        #print('y_size:',y.shape)
        return x * y.expand_as(x)
class MutiheadAttention(nn.Module):
    def __init__(self, input_dim, dim_k, dim_v,num_heads):
        super(MutiheadAttention, self).__init__()
        self.dim_q = dim_k
        self.dim_k = dim_k
        self.dim_v = dim_v
        self.num_units=dim_k
        self.num_heads=num_heads
        self.linear_q = nn.Linear(input_dim, dim_k, bias=False)
        self.linear_k = nn.Linear(input_dim, dim_k, bias=False)
        self.linear_v = nn.Linear(input_dim, dim_v, bias=False)
        self._norm_fact = 1 / sqrt(dim_k)

    def forward(self, x):
        # x: batch_size, seq_len, input_dim
        q = self.linear_q(x)  # batch_size, seq_len, dim_k
        k = self.linear_k(x)  # batch_size, seq_len, dim_k
        v = self.linear_v(x)  # batch_size, seq_len, dim_v
        split_size = self.num_units // self.num_heads
        q = torch.stack(torch.split(q, split_size, dim=2), dim=0)  # [h, N, T_q, num_units/h]
        k = torch.stack(torch.split(k, split_size, dim=2), dim=0)  # [h, N, T_k, num_units/h]
        v = torch.stack(torch.split(v, split_size, dim=2), dim=0)  # [h, N, T_k, num_units/h]

        scores = torch.matmul(q, k.transpose(2, 3))
        scores = scores / (self.dim_k ** 0.5)

        scores = F.softmax(scores, dim=3)
        ## out = score * V
        out = torch.matmul(scores, v)  # [h, N, T_q, num_units/h]
        out = torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0)

        return out

class ResidualBlock(torch.nn.Module):
    def __init__(self,channels):
        super(ResidualBlock,self).__init__()
        self.channels = channels
        self.conv1 = nn.Conv1d(channels,channels,kernel_size=3,padding=1)
        self.conv2 = nn.Conv1d(channels,channels,kernel_size=3,padding=1)
        self.se=SELayer(channels,16)
    def forward(self, x):
        y = F.relu(self.conv1(x))
        y = self.conv2(y)
        y=self.se(y)
        return F.relu(x+y)

class DestinyNet(nn.Module):
    def __init__(self):
        nn.Module.__init__(self)
        self.att=MutiheadAttention(len_embedding*2,512,512,64)
        self.layernorm=nn.LayerNorm(512)
        self.conv1 = nn.Conv1d(1, 32, 4)
        self.relu1=nn.LeakyReLU(0.2, inplace=True)
        self.rblock1 = ResidualBlock(32)
        self.conv2 = nn.Conv1d(32,64, 4)
        self.batchn1=nn.BatchNorm1d(64)
        self.relu2= nn.LeakyReLU(0.2, inplace=True)
        self.rblock2 = ResidualBlock(64)
        self.conv3=nn.Conv1d(64,128,4)
        self.batchn2=nn.BatchNorm1d(128)
        self.relu3= nn.LeakyReLU(0.2, inplace=True)
        self.rblock3 = ResidualBlock(128)
        self.conv4=nn.Conv1d(128,256,4)
        self.batchn3=nn.BatchNorm1d(256)
        self.relu4= nn.LeakyReLU(0.2, inplace=True)
        self.rblock4 = ResidualBlock(256)
        self.dropout=nn.Dropout()
        self.fc1 = nn.Linear(7424, num_relations)

    def forward(self, x):
        x=self.att(x)+x
        x=self.layernorm(x)
        x = self.conv1(x)
        x = self.relu1(x)
        x = F.max_pool1d(x, 2)
        x=self.rblock1(x)
        x = self.conv2(x)
        x=self.batchn1(x)
        x=self.relu2(x)
        x = F.max_pool1d(x, 2)
        x=self.rblock2(x)

        x = self.conv3(x)
        x=self.batchn2(x)
        x=self.relu3(x)
        x = F.max_pool1d(x, 2)
        x=self.rblock3(x)

        x = self.conv4(x)
        x=self.batchn3(x)
        x=self.relu4(x)
        x = F.max_pool1d(x, 2)
        x=self.rblock4(x)
        x = x.view(x.size()[0], -1)
        x=self.dropout(x)
        x=self.fc1(x)
        return x
class TrainDataset(Dataset):
    def __init__(self,length):
        self.length=length

    def __getitem__(self, idx):
        cell1_id=int(traincell1[idx])
        cell2_id=int(traincell2[idx])
        gene1=adata_orig.X[cell1_id]
        gene1=torch.tensor(gene1)
        gene2=adata_orig.X[cell2_id]
        gene2=torch.tensor(gene2)
        genetype=train_rel[idx]
        #genetype=dic1[genetype]
        genetype=torch.tensor(genetype)
        return gene1,gene2,genetype

    def __len__(self):
        return self.length
model=DestinyNet()
optimizer = torch.optim.Adam(
    [{'params': geneEnc.parameters()},
    {'params': model.parameters()},
        {'params': geneDec.parameters()},
    #{'params': norelmodel.parameters()},
            {'params': genemap.parameters()}],
    lr=learning_rate)
from sklearn.metrics import confusion_matrix
TwoNets_path='/home/zhengtuo/songtao/weinreb.pth'
checkpoint = torch.load(TwoNets_path,map_location='cuda:1')
device2=torch.device("cuda:1")
device=torch.device("cuda:1")

model.load_state_dict(checkpoint['model'])
model.to(device2)
geneEnc.load_state_dict(checkpoint['geneEnc'])
geneEnc.to(device2)
geneDec.load_state_dict(checkpoint['geneDec'])
geneDec.to(device2)

model.eval()
geneEnc.eval()
geneDec.eval()

genemap.load_state_dict(checkpoint['genemap'])
genemap.to(device2)
genemap.eval()

[15]:

Sequential(
  (0): Linear(in_features=256, out_features=100, bias=True)
  (1): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU()
  (3): Linear(in_features=100, out_features=100, bias=True)
  (4): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (5): ReLU()
  (6): Linear(in_features=100, out_features=100, bias=True)
  (7): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (8): ReLU()
  (9): Linear(in_features=100, out_features=100, bias=True)
  (10): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (11): ReLU()
  (12): Linear(in_features=100, out_features=256, bias=True)
)

[21]:

def compute_velocity_on_grid(
    X_emb,
    M_emb,
    density=None,
    smooth=None,
    n_neighbors=None,
    min_mass=None,
    autoscale=True,
    adjust_for_stream=False,
    cutoff_perc=None,
):
    # remove invalid cells
    idx_valid = np.isfinite(X_emb.sum(1) + M_emb.sum(1))
    X_emb = X_emb[idx_valid]
    M_emb = M_emb[idx_valid]

    # prepare grid
    n_obs, n_dim = X_emb.shape
    density = 0.5 if density is None else density
    smooth = 0.5 if smooth is None else smooth

    grs = []
    for dim_i in range(n_dim):
        m, M = np.min(X_emb[:, dim_i]), np.max(X_emb[:, dim_i])
        m = m - 0.01 * np.abs(M - m)
        M = M + 0.01 * np.abs(M - m)
        gr = np.linspace(m, M, int(50 * density))
        grs.append(gr)

    meshes_tuple = np.meshgrid(*grs)
    X_grid = np.vstack([i.flat for i in meshes_tuple]).T

    # estimate grid velocities
    if n_neighbors is None:
        n_neighbors = int(n_obs / 50)
    nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=-1)
    nn.fit(X_emb)
    dists, neighs = nn.kneighbors(X_grid)

    scale = np.mean([(g[1] - g[0]) for g in grs]) * smooth
    weight = normal.pdf(x=dists, scale=scale)
    p_mass = weight.sum(1)

    V_grid = (M_emb[neighs] * weight[:, :, None]).sum(1)
    V_grid /= np.maximum(1, p_mass)[:, None]
    if min_mass is None:
        min_mass = 1

    if adjust_for_stream:
        X_grid = np.stack([np.unique(X_grid[:, 0]), np.unique(X_grid[:, 1])])
        ns = int(np.sqrt(len(V_grid[:, 0])))
        V_grid = V_grid.T.reshape(2, ns, ns)

        mass = np.sqrt((V_grid ** 2).sum(0))
        min_mass = 10 ** (min_mass - 6)  # default min_mass = 1e-5
        min_mass = np.clip(min_mass, None, np.max(mass) * 0.9)
        cutoff = mass.reshape(V_grid[0].shape) < min_mass

        if cutoff_perc is None:
            cutoff_perc = 5
        length = np.sum(np.mean(np.abs(M_emb[neighs]), axis=1), axis=1).T
        length = length.reshape(ns, ns)
        cutoff |= length < np.percentile(length, cutoff_perc)

        V_grid[0][cutoff] = np.nan
    else:
        min_mass *= np.percentile(p_mass, 99) / 100
        X_grid, V_grid = X_grid[p_mass > min_mass], V_grid[p_mass > min_mass]

        if autoscale:
            V_grid /= 3 * quiver_autoscale(X_grid, V_grid)

    return X_grid, V_grid

def quiver_autoscale(X_emb, V_emb):
    import matplotlib.pyplot as pl

    scale_factor = np.abs(X_emb).max()  # just so that it handles very large values
    fig, ax = pl.subplots()
    Q = ax.quiver(
        X_emb[:, 0] / scale_factor,
        X_emb[:, 1] / scale_factor,
        V_emb[:, 0],
        V_emb[:, 1],
        angles="xy",
        scale_units="xy",
        scale=None,
    )
    Q._init()
    fig.clf()
    pl.close(fig)
    return Q.scale / scale_factor
import numpy as np
from scipy.stats import norm as normal
from sklearn.neighbors import NearestNeighbors
import tqdm
sc.pp.neighbors(adata)

X = adata.obsm['geneEnc'][:,0,:]
M = adata.obsm['genemap'][:,0,:]
N = adata.obsp['connectivities']
E = adata.obsm["X_umap"]
norm = lambda x: (x-min(x))/(max(x)-min(x))
nn = NearestNeighbors(n_neighbors=20)
nn.fit(X)
dE = []
for i in tqdm.trange(N.shape[0]):

    D,I = nn.kneighbors( np.array([M[i]]) )
    dE.append( (E[I][0] - E[i]).mean(0))
dE = np.vstack(dE)
gE, gdE = compute_velocity_on_grid(E,dE,density=2, adjust_for_stream=True,smooth=True)
fig,ax=createFig()
fig.set_size_inches(5,5)
palette = sc.pl._tools.scatterplots._get_palette(adata, 'state_info')
ax.scatter(E[:,0],E[:,1],s=30, linewidths=0,c=list(map(lambda x: palette[x], adata.obs['state_info'])))
stream_kwargs = {
        "linewidth": 0.5,
        "density": 3,
        "zorder": 3,
        "color": "black",
        "arrowsize":  1,

        "arrowstyle": "-|>",
        "maxlength":  10,
        "integration_direction":  "both",
}
ax.streamplot(gE[0],gE[1],gdE[0],gdE[1],**stream_kwargs)

100%|█████████████████████████████████████| 49116/49116 [28:23<00:00, 28.83it/s]

[21]:

<matplotlib.streamplot.StreamplotSet at 0x7f9138121c40>

[93]:

adata3=adata[adata.obs['fate_pred'].isin([0,1])].copy()
adata3.obs['fate_pred'] = adata3.obs['fate_pred'].astype('category')

from sklearn.neighbors import KNeighborsClassifier
plt.rcParams['font.size'] = 14
fig,ax=createFig(figsize=(3,3))
sc.pl.umap(adata3,color=['fate_pred'],ax=ax,s=10)
sc.tl.rank_genes_groups(adata3, groupby='fate_pred', method='wilcoxon')
sc.pl.heatmap(
    adata3,
    pd.DataFrame(adata3.uns['rank_genes_groups']['names']).head(20).to_numpy().flatten(),
    groupby='fate_pred'#,log=True, cmap='Reds'
)

/home/zhengtuo/miniconda3/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(

[95]:

#adata3.uns
pd.DataFrame(adata3.uns['rank_genes_groups']['logfoldchanges'].head(20),
             adata3.uns['rank_genes_groups']['names'].head(20))

[70]:

f_read = open('./dic_fate_weinreb_ourmodel_allundiff.pkl', 'rb')
#f_read = open('/home/zhengtuo/songtao/DestinyNet/weinreb_2.22_fate_and_prob/dic_fate_weinreb_ourmodel.pkl', 'rb')
a = pickle.load(f_read)
adata.obs['fate_pred']=None
adata.obs['fate_pred']=adata.obs['index'].map(a)
adata3=adata[adata.obs['fate_pred'].isin([0,1])].copy()
adata3.obs['fate_pred'] = adata3.obs['fate_pred'].astype('category')
sc.tl.rank_genes_groups(adata3, groupby='fate_pred', method='wilcoxon',use_raw=True)
# fig, ax = ScanpyVolcanoPlot(adata3, axis=1,
#                            label_fold_change=1,label_log_p=2,
#                            label_size=8,show_label=True,label_includes=['Elane'])
# ax.set_xbound(-5,5)
# fig.set_size_inches(5,5)

[129]:

adata3=adata[adata.obs['GT'].isin([0,1])].copy()
adata3.obs['GT']=adata3.obs['GT'].astype(str)
sc.tl.rank_genes_groups(adata3, groupby='GT', method='wilcoxon')
# fig, ax = ScanpyVolcanoPlot(adata3, axis=1,
#                            label_fold_change=1,label_log_p=2,
#                            label_size=8,show_label=True,label_includes=['Gata2'])
# ax.set_xbound(-5,5)
# fig.set_size_inches(5,5)

... storing 'GT' as categorical

[81]:

logFC_threshold = 1
adj_pval_threshold = 0.05
result_dict = adata3.uns['rank_genes_groups']
groups = result_dict['names'].dtype.names
markers_list = []
for group in groups:
    markers_df = pd.DataFrame(
        {
            'names': result_dict['names'][group],
            'scores': result_dict['scores'][group],
            'logfoldchanges': result_dict['logfoldchanges'][group],
            'pvals': result_dict['pvals'][group],
            'pvals_adj': result_dict['pvals_adj'][group],
            'group': group,
        }
    )
 # Filter the table by log fold change and adjusted p-value thresholds
    filtered_markers_df = markers_df[(markers_df['logfoldchanges'] > 1) & (markers_df['pvals_adj'] < adj_pval_threshold)]
    markers_list.append(filtered_markers_df)

all_markers_df = pd.concat(markers_list, axis=0)

all_markers_df.to_csv(f'/home/zhengtuo/songtao/marker_allundiff_pred_9.15.csv', index=False)
#all_markers_df.to_csv(f'/home/songtaojiang/deng/peaks_marker_withpeaks.csv', index=False)

[77]:

len(all_markers_df)

[77]:

[719]:

df_GT_pred=all_markers_df
df_GT_pred_group = df_GT_pred[df_GT_pred['group'] == '1.0']

genes_GT_pred = set(df_GT_pred_group['names'])
print(len(genes_GT_pred))

[95]:

import pandas as pd
from matplotlib_venn import venn3
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

df_GT_pred = pd.read_csv('/home/zhengtuo/songtao/marker_GT_pred.csv')
df_allundiff_pred = pd.read_csv('/home/zhengtuo/songtao/marker_allundiff_pred.csv')
df_GT_groundtruth = pd.read_csv('/home/zhengtuo/songtao/marker_GT_groundtruth.csv')

with PdfPages('./Venn3_weinreb.pdf') as pdf:
    for group in df_GT_pred['group'].unique():
        df_GT_pred_group = df_GT_pred[df_GT_pred['group'] == group]
        df_allundiff_pred_group = df_allundiff_pred[df_allundiff_pred['group'] == group]
        df_GT_groundtruth_group = df_GT_groundtruth[df_GT_groundtruth['group'] == group]

        genes_GT_pred = set(df_GT_pred_group['names'])
        genes_allundiff_pred = set(df_allundiff_pred_group['names'])
        genes_GT_groundtruth = set(df_GT_groundtruth_group['names'])
        print(len(genes_GT_pred))
        print(len(genes_allundiff_pred))
        print(len(genes_GT_groundtruth))
        overlap_GT_pred_allundiff = genes_GT_pred.intersection(genes_allundiff_pred)
        overlap_GT_pred_groundtruth = genes_GT_pred.intersection(genes_GT_groundtruth)
        overlap_allundiff_groundtruth = genes_allundiff_pred.intersection(genes_GT_groundtruth)

        fig, ax = plt.subplots()
        venn3([genes_GT_pred, genes_allundiff_pred, genes_GT_groundtruth], ('GT_pred', 'allundiff_pred', 'GT_groundtruth'), ax=ax)
        ax.set_title(f'Overlap among GT_pred, allundiff_pred and GT_groundtruth for group {group}')

        pdf.savefig(fig, bbox_inches='tight')
        plt.close(fig)

[680]:

df_GT_pred[df_GT_pred['group'] == 0.0]

[680]:

	names	scores	logfoldchanges	pvals	pvals_adj	group
0	Fth1	20.916939	0.655497	3.754473e-97	1.898938e-93	0.0
1	Cybb	16.727560	2.075559	8.255037e-63	2.087616e-59	0.0
2	Ifi203	16.475517	1.720728	5.501524e-61	1.264800e-57	0.0
3	Ly6a	16.257133	1.924848	1.988261e-59	3.867780e-56	0.0
4	Psap	15.400703	1.177309	1.618987e-53	2.732804e-50	0.0
...	...	...	...	...	...	...
241	Ywhab	3.156941	0.169605	1.594336e-03	4.354121e-02	0.0
242	Cse1l	3.155207	0.204191	1.603845e-03	4.370650e-02	0.0
243	1110004F10Rik	3.151378	0.190176	1.625019e-03	4.423584e-02	0.0
244	Anxa2	3.141347	0.340332	1.681729e-03	4.558332e-02	0.0
245	Fmnl1	3.136003	0.810482	1.712677e-03	4.637247e-02	0.0

246 rows × 6 columns

[126]:

f_read = open('/home/zhengtuo/songtao/DestinyNet/weinreb_2.22_fate_and_prob/dic_fate_weinreb_ourmodel.pkl', 'rb')
a = pickle.load(f_read)
adata.obs['fate_pred']=None
adata.obs['fate_pred']=adata.obs['index'].map(a)
adata3=adata[adata.obs['fate_pred'].isin([0,1])].copy()
adata3.obs['fate_pred'] = adata3.obs['fate_pred'].astype('category')
sc.tl.rank_genes_groups(adata3, groupby='fate_pred', method='wilcoxon')

# fig, ax = ScanpyVolcanoPlot(adata3, axis=1,
#                            label_fold_change=1,label_log_p=2,
#                            label_size=8,show_label=True,label_includes=['Gata2'])
# ax.set_xbound(-5,5)
# fig.set_size_inches(5,5)

[122]:

weinreb_adata_background = adata[
    list(map(lambda x: x == 'Monocyte' or x == 'Neutrophil', adata.obs['state_info'] ))
]
sc.tl.rank_genes_groups(weinreb_adata_background, groupby='state_info',method='wilcoxon')
#ScanpyVolcanoPlot(weinreb_adata_background, axis=1)

[595]:

d = set(pd.DataFrame(adata3.uns['rank_genes_groups']['names'])['0.0'].loc[
    np.array(pd.DataFrame(adata3.uns['rank_genes_groups']['logfoldchanges'])['0.0'] > 1.5) &
    np.array(pd.DataFrame(adata3.uns['rank_genes_groups']['pvals_adj']['0.0']) < 0.05)
]).union(
set(pd.DataFrame(adata3.uns['rank_genes_groups']['names'])['1.0'].loc[
    np.array(pd.DataFrame(adata3.uns['rank_genes_groups']['logfoldchanges'])['1.0'] > 1.5) &
    np.array(pd.DataFrame(adata3.uns['rank_genes_groups']['pvals_adj']['1.0']) < 0.05)
]))

[194]:

s = set(pd.DataFrame(weinreb_adata_background.uns['rank_genes_groups']['names']).head(200).to_numpy().flatten())

[135]:

adata3.uns['rank_genes_groups']['pvals_adj']['1.0'][0]=1e-300

[135]:

1e-300

[18]:

plt.rcParams['font.size'] = 6

fig, ax = ScanpyVolcanoPlot(
    adata3,
    1,
    label_log_p=5,
    show_label=True,
    label_fold_change=1,
    label_size=6,
    add_grid=False,
    color1= '#50C3C6',
    color2= '#FAA339',
)

scatter = ax.collections[0]

y_values = scatter.get_offsets()[:, 1].compressed()

ymax = max(y_values) + 20

ax.set_xbound(-7,7)
ax.set_ybound(upper=ymax)
fig.set_size_inches(4,4)
ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')
plt.show()
#fig.savefig("/home/zhengtuo/songtao/weinreb_allundiff_pred_includeNeuMoMarker_nowords.pdf", dpi=1000)

/tmp/ipykernel_116596/210889169.py:5: RuntimeWarning: divide by zero encountered in log10
  log10adjp = -np.log10(list(map(lambda x: x[axis], adata.uns['rank_genes_groups']['pvals_adj'])))

[577]:

top_genes

[577]:

Index(['Ctsc', 'Lpl', 'Nfkbia', 'Rbms1', 'Fth1', 'Psap', 'Ifi203', 'H2afy',
       'Ighm', 'Ly6a', 'Srgn', 'Gata2', 'Elane', 'Mpo', 'Lgals9', 'Muc13',
       'Gzmb', 'Ctsg', 'Calr', 'H2-Q7'],
      dtype='object')

[74]:

plt.rcParams['font.size'] = 6

fig, ax = ScanpyVolcanoPlot(
    adata3,
    1,
    label_log_p=5,
    show_label=True,
    label_fold_change=0,
    label_size=6,
    label_includes=top_genes,
    add_grid=False,
    color1= '#50C3C6',
    color2= '#FAA339',
)

ax.set_xbound(-7,7)
fig.set_size_inches(4,4)
ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')

to_remove_coords = []
for text in ax.texts:
    if text.get_text() not in list(top_genes):
        x, y = text.get_position()
        to_remove_coords.append((x, y))
        text.remove()

scatter = ax.collections[0]

offsets = scatter.get_offsets()
colors = scatter.get_facecolor()

new_colors = []
new_offsets = []

for i, offset in enumerate(offsets):
    if tuple(offset) not in to_remove_coords:
        new_offsets.append(offset)
        new_colors.append(colors[i])

assert len(new_colors) == len(new_offsets)

scatter.remove()

point_size = 10
ax.scatter(*zip(*new_offsets), c=new_colors, cmap=scatter.get_cmap(), norm=scatter.norm, s=point_size)

# 调整文字的大小
for text in ax.texts:
    text.set_fontsize(5)

# 扩大图形大小
fig.set_size_inches(4,4)

plt.draw()

fig.savefig("/home/zhengtuo/songtao/weinreb_GT_includeNeuMoMarker_top20withoutOthers.pdf", dpi=1000)

[599]:

plt.rcParams['font.size'] = 6

fig, ax = ScanpyVolcanoPlot(
    adata3,
    1,
    label_log_p=5,
    show_label=True,
    label_fold_change=0,
    label_size=6,
    label_includes=top_genes,
    add_grid=False,
    color1= '#50C3C6',
    color2= '#FAA339',
)

ax.set_xbound(-7,7)
fig.set_size_inches(4,4)
ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')

to_remove_coords = []
for text in ax.texts:
    if text.get_text() not in list(top_genes):
        x, y = text.get_position()
        to_remove_coords.append((x, y))
        text.remove()
plt.draw()

fig.savefig("/home/zhengtuo/songtao/weinreb_GT_includeNeuMoMarker_top20withOthers.pdf", dpi=1000)

[ ]:

[874]:

import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
gene_jia=['Pou2f2', 'Tmem173', 'Rab38', 'Mef2c', 'Mmp8', 'S100a9', 'Ngp']
def create_umap_for_gene(adata, gene_name):
    fig, ax = plt.subplots()
    fig.set_size_inches(5, 5)
    ax.scatter(
        adata.obsm["X_umap"][:, 0],
        adata.obsm["X_umap"][:, 1],
        s=1,
        linewidths=0,
        color='#FDF6F3'
    )
    sc.pl.umap(
        adata[adata.X[:, list(adata.var.index).index(gene_name)] > 0], ax=ax,
        color=gene_name,
        cmap='Reds',
        s=5
    )
    return fig

with PdfPages('genes_extent.pdf') as pdf:
    for gene_name in gene_jia:
        fig = create_umap_for_gene(adata, gene_name)
        pdf.savefig(fig)
        plt.close(fig)

[610]:

import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

def create_umap_for_gene(adata, gene_name):
    fig, ax = plt.subplots()
    fig.set_size_inches(5, 5)
    ax.scatter(
        adata.obsm["X_umap"][:, 0],
        adata.obsm["X_umap"][:, 1],
        s=1,
        linewidths=0,
        color='#FDF6F3'
    )
    sc.pl.umap(
        adata[adata.X[:, list(adata.var.index).index(gene_name)] > 0], ax=ax,
        color=gene_name,
        cmap='Reds',
        s=5
    )
    return fig

with PdfPages('top20genes_plots.pdf') as pdf:
    for gene_name in top_genes:
        fig = create_umap_for_gene(adata, gene_name)
        pdf.savefig(fig)
        plt.close(fig)

[865]:

import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
#adata.obsm["X_tsne"] = sc.read_h5ad("../1.8_weinreb_forziwei.h5ad").obsm["X_emb_old"]
def create_umap_for_gene(adata, gene_name):
    fig, ax = plt.subplots()
    fig.set_size_inches(5, 5)
    ax.scatter(
        adata.obsm["X_tsne"][:, 0],
        adata.obsm["X_tsne"][:, 1],
        s=1,
        linewidths=0,
        color='#FDF6F3'
    )
    sc.pl.tsne(
        adata[adata.X[:, list(adata.var.index).index(gene_name)] > 0], ax=ax,
        color=gene_name,
        cmap='Reds',
        s=5
    )
    return fig

# 创建一个PDF文件
with PdfPages('top20genes_plots_weinreb_embedding.pdf') as pdf:
    for gene_name in top_genes:
        fig = create_umap_for_gene(adata, gene_name)
        pdf.savefig(fig)
        plt.close(fig)

[110]:

# f_read = open('/home/zhengtuo/songtao/DestinyNet/weinreb_2.22_fate_and_prob/dic_fate_weinreb_ourmodel_allundiff.pkl', 'rb')
# a = pickle.load(f_read)
# adata.obs['fate_pred']=None
# adata.obs['fate_pred']=adata.obs['index'].map(a)
# adata3=adata[adata.obs['fate_pred'].isin([0,1])].copy()
# adata3.obs['fate_pred'] = adata3.obs['fate_pred'].astype('category')
f_read = open('./dic_fate_weinreb_ourmodel_allundiff.pkl', 'rb')
a = pickle.load(f_read)
adata.obs['fate_pred']=None
adata.obs['fate_pred']=adata.obs['index'].map(a)
adata3=adata[adata.obs['fate_pred'].isin([0,1])].copy()
adata3.obs['fate_pred'] = adata3.obs['fate_pred'].astype('category')
sc.tl.rank_genes_groups(adata3, groupby='fate_pred', method='wilcoxon')

[111]:

# adata3=adata[adata.obs['GT'].isin([0,1])].copy()
# adata3.obs['fate_pred'] = adata3.obs['fate_pred'].astype('category')
# adata3.obs['GT'] = adata3.obs['GT'].astype('category')
# sc.tl.rank_genes_groups(adata3, groupby='GT', method='wilcoxon')
threshold_pval=0.05
def get_marker_genes(adata, label_value, threshold_pval=0.05):
    names_df = pd.DataFrame(adata.uns['rank_genes_groups']['names'])
    logfoldchanges_df = pd.DataFrame(adata.uns['rank_genes_groups']['logfoldchanges'])
    pvals_adj_df = pd.DataFrame(adata.uns['rank_genes_groups']['pvals_adj'])

    return set(names_df[label_value].loc[
        (logfoldchanges_df[label_value] > 1) &
        (pvals_adj_df[label_value] < threshold_pval)
    ])

Mo_marker = get_marker_genes(adata3, '0.0')
Neu_marker = get_marker_genes(adata3, '1.0')
d = Mo_marker.union(Neu_marker)

[112]:

print(len(Mo_marker))
print(len(Neu_marker))

109
90

[113]:

import gseapy
adata3.obs['fate_pred']=adata3.obs['fate_pred'].astype('category')
# sc.tl.rank_genes_groups(adata, groupby=groupby, method="wilcoxon", key_added='rank_genes_test',
#                         groups=[1.0], reference=0.0)
# deg_tex = pd.DataFrame(np.hstack([
#     np.array(list(map(list, adata.uns["rank_genes_test"]["names"]))),
#     np.array(list(map(list, adata.uns["rank_genes_test"]['logfoldchanges']))),
#     np.array(list(map(list, adata.uns["rank_genes_test"]['pvals_adj'])))
# ]),
#         columns = list(range(3))
# )
deg_tex_up =Neu_marker
deg_tex_dw =Mo_marker

enr_tex_up = gseapy.enrichr(gene_list=list(deg_tex_up),
                            organism='Mouse',
                            gene_sets='GO_Biological_Process_2021')
enr_tex_dw = gseapy.enrichr(gene_list=list(deg_tex_dw),
                            organism='Mouse',
                            gene_sets='GO_Biological_Process_2021')

# return diff_exp_genes,diff_exp_genes_up,diff_exp_genes_dw,enr_res_up,enr_res_dw
# adata3.obs['fate_pred']=adata3.obs['fate_pred'].astype('category')
# #sc.tl.rank_genes_groups(adata3, groupby='fate_pred', method="wilcoxon", key_added='rank_genes_test',groups=[1.0],reference=0.0)
# deg_tex,deg_tex_up,deg_tex_dw,enr_tex_up,enr_tex_dw = DEG_analysis(adata3, groupby='fate_pred',
#                                                                    query_subtype=1.0,
#                                                                    reference_subtype=0.0)

/home/zhengtuo/miniconda3/lib/python3.9/site-packages/gseapy/enrichr.py:335: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  self.results = self.results.append(res, ignore_index=True, sort=True)
/home/zhengtuo/miniconda3/lib/python3.9/site-packages/gseapy/enrichr.py:335: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  self.results = self.results.append(res, ignore_index=True, sort=True)

[753]:

len(deg_tex_up)

[753]:

[754]:

len(deg_tex_dw)

[754]:

[812]:

enr_tex_up.res2d['Genes'][15]

[812]:

'HSP90AB1;MS4A3;SLC2A3;PYGL;MPO;HK3;CTSG;CD93;ANXA3;GAA;ATP6AP2;NME2;RHOG;ATP11B;TNFRSF1B;DDOST;VAMP8;EEF1A1;RAB31;PECAM1;PRTN3;PPIA;S100A8;VCL;CD63;CAB39;CLEC12A;FGL2;AGPAT2;PLAC8;ADGRG3;ALOX5;CREG1;NEU1;MLEC;STOM;LTA4H;RAB6A;ELANE;S100A11;VAT1;TMEM30A;IDH1;SURF4;TUBB4B;EEF2;PRDX6;ERP44;IMPDH1;LAMTOR2;YPEL5'

[115]:

enr_tex_up.res2d.Term = enr_tex_up.res2d.Term.str.split(" \(GO").str[0]
# dotplot
gseapy.dotplot(enr_tex_up.res2d[0:20], size=10, title="Up",
               top_term=20,
               cmap='Reds', figsize=(3,5))
fig = plt.gcf()

with PdfPages('./figures/GT_pred_up.pdf') as pdf:
    pdf.savefig(fig, bbox_inches='tight')
plt.show()

/home/zhengtuo/miniconda3/lib/python3.9/site-packages/gseapy/plot.py:282: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, colname] = df[colname].map(float)

[816]:

genes_1 = set(enr_tex_dw.res2d['Genes'][0].split(","))
genes_2 = set(enr_tex_up.res2d['Genes'][15].split(","))
overlapping_genes = genes_1.intersection(genes_2)
overlapping_genes

[816]:

set()

[116]:

# Figure2C
# trim (go:...)
enr_tex_dw.res2d.Term = enr_tex_dw.res2d.Term.str.split(" \(GO").str[0]
# dotplot
gseapy.dotplot(enr_tex_dw.res2d[0:20], size=12, top_term=20,
               figsize=(3,5), title="Down", cmap='Blues')

fig = plt.gcf()

with PdfPages('./figures/GT_pred_down.pdf') as pdf:
    pdf.savefig(fig, bbox_inches='tight')
plt.show()

/home/zhengtuo/miniconda3/lib/python3.9/site-packages/gseapy/plot.py:282: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, colname] = df[colname].map(float)

[24]:

gradients_per_label

[24]:

{0: tensor([7.1293e-09, 6.4681e-08, 3.2634e-08,  ..., 9.6019e-08, 4.2670e-07,
         1.2204e-08], device='cuda:1'),
 1: tensor([9.1898e-09, 5.8025e-08, 2.9834e-08,  ..., 8.4796e-08, 3.7161e-07,
         1.0011e-08], device='cuda:1')}

[23]:

import seaborn as sns
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from torch.autograd import Variable
def compute_gradients_for_label(target_label, data, model_encoder, model_decoder):
    mask = data.obs['fate_pred'] == target_label
    filtered_data = data[mask]

    geneex_data = torch.tensor(filtered_data.raw.X.toarray(), dtype=torch.float32).to(device2)

    dataset = TensorDataset(geneex_data)
    dataloader = DataLoader(dataset, batch_size=256, shuffle=False)

    total_gradients = torch.zeros(len_geneExp).to(device2)
    for batch in dataloader:
        geneExp = batch[0]
        geneExp = Variable(geneExp, requires_grad=True)
        output_data = model_encoder(geneExp)
        output_data = model_decoder(output_data)

        loss_function = nn.MSELoss()
        loss = loss_function(output_data, geneExp)

        model_encoder.zero_grad()
        model_decoder.zero_grad()
        loss.backward()

        total_gradients += geneExp.grad.abs().mean(dim=0)

    average_gradients = total_gradients / len(dataloader)
    return average_gradients

labels = [0, 1]
gradients_per_label = {}

for label in labels:
    gradients_per_label[label] = compute_gradients_for_label(label, adata, geneEnc, geneDec)

[29]:

num_key_genes = 30

key_genes_per_label = {}

for label, gradients in gradients_per_label.items():
    df = pd.DataFrame({
        'gene_ids': adata.var.index.values,
        'importance': gradients.cpu().detach().numpy()
    })

    sorted_df = df.sort_values(by="importance", ascending=False)

    key_genes = sorted_df['gene_ids'].head(num_key_genes).tolist()
    key_genes_per_label[label] = key_genes

# 输出每个标签的关键基因
for label, genes in key_genes_per_label.items():
    print(f"Key genes for label {label}:")
    print(', '.join(genes))
    print("\n")

Key genes for label 0:
mt-Co2, mt-Atp6, mt-Nd1, mt-Co1, Eef1a1, mt-Nd4, Gapdh, mt-Cytb, H2afy, Ppia, Rpl23, Ptma, Tuba1b, Ybx1, Rps23, Actg1, mt-Nd2, Set, Srgn, Rpl12, Rpl4, Actb, Npm1, Pabpc1, Ifitm1, Eif4g2, Rps17, Rpl7, Rps16, Rps24


Key genes for label 1:
mt-Co2, mt-Atp6, mt-Nd1, Eef1a1, mt-Co1, Gapdh, mt-Nd4, mt-Cytb, Srgn, Rpl23, Ppia, Rps23, Ybx1, Actg1, Ptma, H2afy, Rpl12, mt-Nd2, Tuba1b, Rpl4, Npm1, Pabpc1, Rps17, Set, Rps24, Ifitm1, Rpl7, Actb, Rpl27a, Rps16

[14]:

# adata.obsm['X_pca']=adata.obsm['X_pca'].cpu().numpy()
df_gradients_all['diff'].abs().nlargest(50).index

[14]:

Index(['Srgn', 'Elane', 'Mpo', 'Ctsg', 'Gstm1', 'Ly6a', 'Calr', 'Prtn3',
       'Fth1', 'H2-Q7', 'Gfi1', 'Ap3s1', 'Igf1r', 'Itm2b', 'Ifi203', 'Cybb',
       'Hk3', 'Spp1', 'Lpl', 'Cd63', 'Psap', 'Muc13', 'Rpl37a', 'Ctsc',
       'Plac8', 'Rps17', 'Anxa3', 'Gas5', 'Rbms1', '2810417H13Rik', 'Tpm4',
       'Ighm', 'Gpx1', 'Rps24', 'Dstn', 'Rpl23', 'Gsr', 'Spint2', 'Thy1',
       'Casp6', 'Lta4h', 'Rpl26', 'Sh3bgrl3', 'Tmbim6', 'Rps23', 'Rpsa',
       'Rpl27a', 'Rps19', 'Cd44', 'Rps10'],
      dtype='object')

[86]:

gradient_data = {}
for label, gradients in gradients_per_label.items():
    gradient_data[label] = gradients.cpu().detach().numpy()

# 转为DataFrame
df_gradients_all = pd.DataFrame(gradient_data, index=adata.var.index.values)

def normalize(column):
    """Normalize a pandas column to range between 0 and 1"""
    min_val = column.min()
    max_val = column.max()
    return (column - min_val) / (max_val - min_val)

# Normalize gradients for each label
for label in labels:
    df_gradients_all[label] = normalize(df_gradients_all[label])

# Compute difference between normalized gradients for the two labels
df_gradients_all['diff'] = df_gradients_all[labels[0]] - df_gradients_all[labels[1]]

# Select the top 20 genes with the largest gradient difference
top_genes = df_gradients_all['diff'].abs().nlargest(20).index

# Extract the normalized gradient values for the top genes
df_gradient_top = df_gradients_all.loc[top_genes, labels]

# Plot heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(df_gradient_top, cmap="coolwarm", annot=True, cbar_kws={'label': 'Normalized Gradient Difference'})
plt.title("Normalized Gradient Difference of Top 20 Differentially Expressed Genes across Labels", fontsize=15)
plt.ylabel('Top Genes', fontsize=12)
plt.xlabel('Labels', fontsize=12)
plt.tight_layout()
plt.show()

[85]:

df_gradient_top
# df_gradient_top.to_csv('./weinreb_top50.csv')

[85]:

	0	1
Srgn	0.540197	0.720850
Gata2	0.040284	0.158828
Ly6a	0.215512	0.101576
Elane	0.163360	0.255346
Ighm	0.109224	0.020179
H2afy	0.681413	0.593418
Mpo	0.097797	0.176222
Lgals9	0.146582	0.223989
Muc13	0.115342	0.191869
Ifi203	0.138005	0.061555
Gzmb	0.020378	0.096122
Ctsg	0.160164	0.234527
Calr	0.310392	0.382003
Psap	0.183319	0.112002
Fth1	0.456394	0.385800
H2-Q7	0.130340	0.200920
Igf1r	0.059137	0.127686
Rsu1	0.083553	0.149635
Gstm1	0.128448	0.194066
Alox5	0.016399	0.081993
Cd63	0.116019	0.179740
Nt5c3	0.054703	0.117510
Rbms1	0.207719	0.145031
Nfkbia	0.188631	0.126453
Lpl	0.111949	0.050364
F2r	0.028852	0.089945
Ctsc	0.173801	0.115267
Rpl37a	0.349934	0.408366
Tmem176b	0.183777	0.127356
H2-Aa	0.084571	0.028233
Cd74	0.083975	0.027706
Cybb	0.138917	0.082854
Rpl23	0.610108	0.665414
Rpn1	0.246744	0.301163
Clint1	0.124384	0.178549
Rps17	0.479229	0.532979
Rpl12	0.527401	0.581106
Scin	0.040654	0.094235
H2-Eb1	0.073607	0.020919
Hbb-bt	0.056614	0.108873
Tk1	0.211874	0.159731
Cpne2	0.053908	0.105856
Nedd4	0.136443	0.188341
Ap3s1	0.241022	0.292903
Rps24	0.459745	0.510892
Rpsa	0.357013	0.408056
Sh3bgrl3	0.269304	0.219289
Cyp11a1	0.019235	0.068958
Tmsb4x	0.312536	0.361726
Spn	0.112933	0.162044

[42]:

gradient_data = {}
for label, gradients in gradients_per_label.items():
    gradient_data[label] = gradients.cpu().detach().numpy()

df_gradients_all = pd.DataFrame(gradient_data, index=adata.var.index.values)

def normalize(column):
    """Normalize a pandas column to range between 0 and 1"""
    min_val = column.min()
    max_val = column.max()
    return (column - min_val) / (max_val - min_val)

# Normalize gradients for each label
for label in labels:
    df_gradients_all[label] = normalize(df_gradients_all[label])

# Compute difference between normalized gradients for the two labels
df_gradients_all['diff'] = df_gradients_all[labels[0]] - df_gradients_all[labels[1]]

# Select the top 20 genes with the largest gradient difference
top_genes = df_gradients_all['diff'].abs().nlargest(50).index

# Extract the normalized gradient values for the top genes
df_gradient_top = df_gradients_all.loc[top_genes, labels]

# Plot heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(df_gradient_top, cmap="coolwarm", annot=True, cbar_kws={'label': 'Normalized Gradient Difference'})
plt.title("Normalized Gradient Difference of Top 20 Differentially Expressed Genes across Labels", fontsize=15)
plt.ylabel('Top Genes', fontsize=12)
plt.xlabel('Labels', fontsize=12)
plt.tight_layout()
plt.show()

[76]:

top_genes_bias

[76]:

	0	1	diff	bias
Ctsc	-0.173801	0.115267	0.058534	-0.058534
Lpl	-0.111949	0.050364	0.061585	-0.061585
Nfkbia	-0.188631	0.126453	0.062178	-0.062178
Rbms1	-0.207719	0.145031	0.062688	-0.062688
Fth1	-0.456394	0.385800	0.070594	-0.070594
Psap	-0.183319	0.112002	0.071317	-0.071317
Ifi203	-0.138005	0.061555	0.076451	-0.076451
H2afy	-0.681413	0.593418	0.087995	-0.087995
Ighm	-0.109224	0.020179	0.089045	-0.089045
Ly6a	-0.215512	0.101576	0.113935	-0.113935
H2-Q7	-0.130340	0.200920	-0.070580	0.070580
Calr	-0.310392	0.382003	-0.071612	0.071612
Ctsg	-0.160164	0.234527	-0.074363	0.074363
Gzmb	-0.020378	0.096122	-0.075744	0.075744
Muc13	-0.115342	0.191869	-0.076527	0.076527
Lgals9	-0.146582	0.223989	-0.077408	0.077408
Mpo	-0.097797	0.176222	-0.078425	0.078425
Elane	-0.163360	0.255346	-0.091986	0.091986
Gata2	-0.040284	0.158828	-0.118543	0.118543
Srgn	-0.540197	0.720850	-0.180652	0.180652

[35]:

import matplotlib.pyplot as plt
df_gradients_all_normalized = df_gradients_all.copy()
# for label in labels:
#     df_gradients_all_normalized[label] = (df_gradients_all[label] - df_gradients_all[label].min()) / (df_gradients_all[label].max() - df_gradients_all[label].min())


df_gradients_all_normalized[labels[0]] = -df_gradients_all_normalized[labels[0]]

df_gradients_all_normalized['bias'] = df_gradients_all_normalized[labels[1]] + df_gradients_all_normalized[labels[0]]

sorted_df_right_bias = df_gradients_all_normalized.sort_values(by='bias', ascending=False).head(10)
sorted_df_right_bias =sorted_df_right_bias.sort_values(by='bias', ascending=True)

sorted_df_left_bias = df_gradients_all_normalized.sort_values(by='bias').head(10)
sorted_df_left_bias =sorted_df_left_bias.sort_values(by='bias', ascending=False)
top_genes_bias = pd.concat([sorted_df_left_bias, sorted_df_right_bias])
#top_genes_bias = top_genes_bias.sort_values(by='bias', key=abs, ascending=False)

fig, ax = plt.subplots(figsize=(12, 10))


colors = ['blue', 'red']

for gene in top_genes_bias.index:
    for i, label in enumerate(labels):
        x_value = top_genes_bias.loc[gene, label]
        y_value = list(top_genes_bias.index).index(gene)
        ax.plot(x_value, y_value, marker='o', color=colors[i])
        if i == 0:
            x_values = [top_genes_bias.loc[gene, labels[0]], top_genes_bias.loc[gene, labels[1]]]
            y_values = [y_value, y_value]
            ax.plot(x_values, y_values, linestyle='-', color='grey')

ax.axvline(0, color='black', linestyle='--')

divider_position = 10
ax.axhline(divider_position - 0.5, color='green', linestyle='--')

ax.set_yticks(range(len(top_genes_bias.index)))
ax.set_yticklabels(top_genes_bias.index)
ax.set_title('Top 20 Genes with Highest Gradient Bias')
ax.set_xlabel('Normalized Gradient Value')
ax.legend(labels, loc='upper right')

plt.tight_layout()
plt.show()
fig.savefig("/home/zhengtuo/songtao/weinreb_lolipop_plot.pdf", dpi=1000)

[66]:

top_genes_bias1 = top_genes_bias.sort_values(by='bias', key=abs, ascending=False)
top_genes_bias1.index

[66]:

Index(['Srgn', 'Elane', 'Mpo', 'Ctsg', 'Gstm1', 'Ly6a', 'Calr', 'Prtn3',
       'Fth1', 'H2-Q7', 'Gfi1', 'Ap3s1', 'Ifi203', 'Cybb', 'Spp1', 'Lpl',
       'Psap', 'Ctsc', 'Rbms1', '2810417H13Rik'],
      dtype='object')

[71]:

top_genes_bias1 = top_genes_bias.sort_values(by='bias', key=abs, ascending=False)
top_genes_bias1.index

[71]:

Index(['Srgn', 'Gata2', 'Ly6a', 'Elane', 'Ighm', 'H2afy', 'Mpo', 'Lgals9',
       'Muc13', 'Ifi203', 'Gzmb', 'Ctsg', 'Calr', 'Psap', 'Fth1', 'H2-Q7',
       'Rbms1', 'Nfkbia', 'Lpl', 'Ctsc'],
      dtype='object')

[881]:

cs.tl.clonal_fate_bias(
    adata, selected_fate="Neutrophil", alternative="two-sided"
)
cs.pl.clonal_fate_bias(adata)

100%|██████████████████████████████████████| 5864/5864 [00:41<00:00, 140.11it/s]

Data saved at adata.uns['clonal_fate_bias']

[347]:

from matplotlib.colors import LinearSegmentedColormap
colors = [(1, 0, 0), (1, 1, 1), (0, 0, 1)]  # R -> W -> B
n_bins = 100
cmap_name = 'custom_div_cmap'
cm = LinearSegmentedColormap.from_list(cmap_name, colors, N=n_bins)

[77]:

import anndata
# df_expression_all['diff'] = df_expression_all[labels[0]] - df_expression_all[labels[1]]
adata.obs['fate_pred']=adata.obs['fate_pred'].astype('category')
top_genes =  top_genes_bias[::-1].index

adata_raw = anndata.AnnData(X=adata.raw.X, var=adata.raw.var, obs=adata.obs)

sc.pl.dotplot(adata_raw,
              var_names=top_genes,
              groupby='fate_pred',
              figsize=(8.5, 3),
              dot_max=1,
              dot_min=0.01,
              color_map='Reds',
              title="Top Differentially Expressed Genes across Cell Fate",
              vmin=0.1,
              vmax=1.5,
              save='weinreb_dotplot.pdf')

WARNING: saving figure to file figures/dotplot_weinreb_dotplot.pdf

/home/zhengtuo/miniconda3/lib/python3.9/site-packages/scanpy/plotting/_dotplot.py:749: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
  dot_ax.scatter(x, y, **kwds)

[501]:

sc.pl.dotplot(adata_raw,
            var_names=top_genes,
              groupby='fate_pred',

            )

/home/zhengtuo/miniconda3/lib/python3.9/site-packages/scanpy/plotting/_dotplot.py:749: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
  dot_ax.scatter(x, y, **kwds)

[140]:

import matplotlib.pyplot as plt

methods_10 = ["10% Clone", "10% Clone + Transfer"]
all_acc_10 = [0.73, 0.81]
test_acc_10 = [0.69, 0.79]
R_values_10 = [0.46, 0.63]

methods_30 = ["30% Clone", "30% Clone + Transfer"]
all_acc_30 = [0.83, 0.86]
test_acc_30 = [0.77, 0.81]
R_values_30 = [0.67, 0.72]

methods_combined = methods_10 + methods_30
all_acc_combined = all_acc_10 + all_acc_30
test_acc_combined = test_acc_10 + test_acc_30
R_values_combined = R_values_10 + R_values_30

bar_width = 0.25
colors = ['deepskyblue', 'lightgreen', 'darkred']

# ... [rest of the code remains unchanged]

def plot_data(methods, all_acc, test_acc, R_values, title, filename):
    fig, ax = plt.subplots(figsize=(10, 6))
    index = range(len(methods))
    bar1 = ax.bar(index, all_acc, bar_width, label='All Acc', color=colors[1])
    bar2 = ax.bar([i+bar_width for i in index], test_acc, bar_width, label='Test Acc', color=colors[0])
    bar3 = ax.bar([i+2*bar_width for i in index], R_values, bar_width, label='R-value', color=colors[2], alpha=0.7)

    ax.set_title(title)
    ax.set_xlabel('Methods')
    ax.set_ylabel('Accuracy / R-value')
    ax.set_xticks([i + bar_width for i in index])
    ax.set_xticklabels(methods)
    ax.legend(loc='upper left', bbox_to_anchor=(1, 1))
    plt.tight_layout()

    plt.savefig(filename, dpi=1000)

plot_data(methods_10, all_acc_10, test_acc_10, R_values_10, "10% Clone Accuracy Comparison", "10_clone_accuracy.pdf")
plot_data(methods_30, all_acc_30, test_acc_30, R_values_30, "30% Clone Accuracy Comparison", "30_clone_accuracy.pdf")
plot_data(methods_combined, all_acc_combined, test_acc_combined, R_values_combined, "10% & 30% Clone Accuracy Comparison", "combined_clone_accuracy.pdf")

[116]:

import matplotlib.pyplot as plt

methods_10 = ["10% Clone", "10% Clone + Transfer"]
all_acc_10 = [0.744, 0.81]
test_acc_10 = [0.67, 0.79]
R_values_10 = [0.49, 0.63]

methods_30 = ["30% Clone", "30% Clone + Transfer"]
all_acc_30 = [0.81, 0.86]
test_acc_30 = [0.77, 0.81]
R_values_30 = [0.64, 0.72]

methods_combined = methods_10 + methods_30
all_acc_combined = all_acc_10 + all_acc_30
test_acc_combined = test_acc_10 + test_acc_30
R_values_combined = R_values_10 + R_values_30

colors = ['deepskyblue', 'lightgreen', 'darkred']

def plot_data(methods, all_acc, test_acc, R_values, title):
    fig, ax = plt.subplots(figsize=(10, 6))
    index = range(len(methods))
    line1, = ax.plot(index, all_acc, marker='o', color=colors[1], label='All Acc')
    line2, = ax.plot(index, test_acc, marker='o', color=colors[0], label='Test Acc')
    line3, = ax.plot(index, R_values, marker='o', color=colors[2], label='R-value', linestyle='--')

    for r in R_values:
        ax.axhline(r, linestyle=':', color='gray', alpha=0.5)

    ax.set_title(title)
    ax.set_xlabel('Methods')
    ax.set_ylabel('Accuracy / R-value')
    ax.set_xticks(index)
    ax.set_xticklabels(methods)
    ax.legend(loc='upper left', bbox_to_anchor=(1, 1))
    plt.tight_layout()
    plt.show()

plot_data(methods_10, all_acc_10, test_acc_10, R_values_10, "10% Clone Accuracy Comparison")
plot_data(methods_30, all_acc_30, test_acc_30, R_values_30, "30% Clone Accuracy Comparison")
plot_data(methods_combined, all_acc_combined, test_acc_combined, R_values_combined, "10% & 30% Clone Accuracy Comparison")