Today I continue with CCST. CCST builds on the DGI model: overall it constructs an adjacency matrix, runs four GCN layers over it, and trains with DGI's mutual-information objective (a binary cross-entropy between real and corrupted node/summary pairs). That is loosely reminiscent of SpaGCN, whose loss is a KL divergence. Finally, the features extracted by the model are clustered.
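For reference, here is a minimal sketch of the DGI loss in the form used by PyG's DeepGraphInfomax (a paraphrase for illustration, not CCST's own code; W is the module's learned bilinear discriminator weight):

import torch

EPS = 1e-15

def dgi_loss(pos_z, neg_z, summary, W):
    # D(z, s) = sigmoid(z @ W @ s): bilinear discriminator scoring how well
    # a node embedding z matches the graph summary s
    pos = torch.sigmoid(pos_z @ (W @ summary))
    neg = torch.sigmoid(neg_z @ (W @ summary))
    # BCE: real (node, summary) pairs should score 1, corrupted pairs 0
    return -torch.log(pos + EPS).mean() - torch.log(1 - neg + EPS).mean()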
Looking at the code concretely: CCST released code for two examples, human breast cancer and human osteosarcoma, where the breast cancer data is 10X Visium and the osteosarcoma data is MERFISH; the processing for seqFISH+ and DLPFC was not released. We will walk through the code using the breast cancer data.
First, the data-processing part. The data is read with stlearn; genes are filtered, counts normalized, features scaled, and PCA reduces the matrix to 300 components:
import scanpy as sc
import stlearn as st

adata_h5 = st.Read10X(path=data_fold, count_file=args.data_name+'_filtered_feature_bc_matrix.h5')

def adata_preprocess(i_adata, min_cells=3, pca_n_comps=300):
    print('===== Preprocessing Data ')
    sc.pp.filter_genes(i_adata, min_cells=min_cells)
    adata_X = sc.pp.normalize_total(i_adata, target_sum=1, exclude_highly_expressed=True, inplace=False)['X']
    adata_X = sc.pp.scale(adata_X)
    adata_X = sc.pp.pca(adata_X, n_comps=pca_n_comps)
    return adata_X
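The returned PCA matrix is what later steps load as features.npy; a hedged usage sketch (generated_data_fold is assumed to be set as in the repo's scripts):

import numpy as np

features = adata_preprocess(adata_h5, min_cells=3, pca_n_comps=300)
np.save(generated_data_fold + 'features.npy', features)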
Next, the adjacency matrices. Two are built. The first is the full pairwise distance matrix over all spots; let's call it the GCN adjacency matrix. The second is analogous to STAGATE's GAT adjacency matrix, so call it the GAT adjacency matrix: it is binary, 1 where the distance is below a threshold and 0 otherwise, and it is what the later contrastive (DGI-style) training consumes.
The first double for loop computes the Euclidean distance between every pair of spots, only in order to count how many distances fall under the threshold and print the resulting average number of neighbors per cell. The code is inefficient; in my tests it can simply be commented out.
Before the second double for loop, euclidean_distances recomputes all pairwise spot distances, which forms the GCN adjacency matrix. The loop then walks over every entry and sets nearby spots to 1, i.e. builds the GAT adjacency matrix. Efficiency could be improved further; see the vectorized sketch after the function.
Finally, the GAT adjacency matrix is stored as a compressed sparse matrix.
import os
import pickle
import numpy as np
from scipy import sparse
from sklearn.metrics.pairwise import euclidean_distances

def get_adj(generated_data_fold):
    coordinates = np.load(generated_data_fold + 'coordinates.npy')
    if not os.path.exists(generated_data_fold):
        os.makedirs(generated_data_fold)
    ############# get batch adjacent matrix
    cell_num = len(coordinates)

    ############ the distribution of distance
    if 1: # not os.path.exists(generated_data_fold + 'distance_array.npy'):
        print('calculating distance matrix, it takes a while')
        distance_list = []
        for j in range(cell_num):
            for i in range(cell_num):
                if i != j:
                    distance_list.append(np.linalg.norm(coordinates[j]-coordinates[i]))
        distance_array = np.array(distance_list)
        #np.save(generated_data_fold + 'distance_array.npy', distance_array)
    else:
        distance_array = np.load(generated_data_fold + 'distance_array.npy')

    ### try different distance thresholds, so that on average each cell has x neighbor cells, see Tab. S1 for results
    for threshold in [300]: # range(210,211) # (100,400,40)
        num_big = np.where(distance_array < threshold)[0].shape[0]
        print(threshold, num_big, str(num_big/(cell_num*2))) # 300 22064 2.9046866771985256
        distance_matrix = euclidean_distances(coordinates, coordinates)
        distance_matrix_threshold_I = np.zeros(distance_matrix.shape)
        distance_matrix_threshold_W = np.zeros(distance_matrix.shape)
        for i in range(distance_matrix_threshold_I.shape[0]):
            for j in range(distance_matrix_threshold_I.shape[1]):
                if distance_matrix[i,j] <= threshold and distance_matrix[i,j] > 0:
                    distance_matrix_threshold_I[i,j] = 1
                    distance_matrix_threshold_W[i,j] = distance_matrix[i,j]

        ############### get normalized sparse adjacent matrix
        distance_matrix_threshold_I_N = np.float32(distance_matrix_threshold_I) ## do not normalize adjacent matrix
        distance_matrix_threshold_I_N_crs = sparse.csr_matrix(distance_matrix_threshold_I_N)
        with open(generated_data_fold + 'Adjacent', 'wb') as fp:
            pickle.dump(distance_matrix_threshold_I_N_crs, fp)
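As noted above, the double loops are slow. A vectorized sketch that should yield the same binary matrix (same threshold semantics; not the repo's code):

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

distance_matrix = euclidean_distances(coordinates, coordinates)
# 1 where 0 < distance <= threshold, else 0 -- same rule as the loop above
mask = (distance_matrix <= threshold) & (distance_matrix > 0)
distance_matrix_threshold_I = mask.astype(np.float32)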
The first for loop in get_type converts the cell-type annotation of each spot into an integer index; a dict-based map would arguably be cleaner (see the sketch after the function).
Then, for the breast cancer dataset, the types are relabeled into a fixed, sorted order for nicer visualization, and the results are saved.
def get_type(args, cell_types, generated_data_fold):
    types_dic = []
    types_idx = []
    for t in cell_types:
        if not t in types_dic:
            types_dic.append(t)
        id = types_dic.index(t)
        types_idx.append(id)
    n_types = max(types_idx) + 1 # start from 0

    # For human breast cancer dataset, sort the cells for better visualization
    if args.data_name == 'V1_Breast_Cancer_Block_A_Section_1':
        types_dic_sorted = ['Healthy_1', 'Healthy_2', 'Tumor_edge_1', 'Tumor_edge_2', 'Tumor_edge_3', 'Tumor_edge_4', 'Tumor_edge_5', 'Tumor_edge_6',
                            'DCIS/LCIS_1', 'DCIS/LCIS_2', 'DCIS/LCIS_3', 'DCIS/LCIS_4', 'DCIS/LCIS_5', 'IDC_1', 'IDC_2', 'IDC_3', 'IDC_4', 'IDC_5', 'IDC_6', 'IDC_7']
        relabel_map = {}
        cell_types_relabel = []
        for i in range(n_types):
            relabel_map[i] = types_dic_sorted.index(types_dic[i])
        for old_index in types_idx:
            cell_types_relabel.append(relabel_map[old_index])

    np.save(generated_data_fold+'cell_types.npy', np.array(cell_types_relabel))
    np.savetxt(generated_data_fold+'types_dic.txt', np.array(types_dic_sorted), fmt='%s', delimiter='\t')
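The dict-based mapping suggested above could look like this (a sketch; cell_types is the same list of annotation strings):

# unique types in first-appearance order, then an O(1) lookup table
types_dic = list(dict.fromkeys(cell_types))
type_to_idx = {t: i for i, t in enumerate(types_dic)}
types_idx = [type_to_idx[t] for t in cell_types]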
Next, the ground-truth labels are plotted.
import matplotlib.pyplot as plt

def draw_map(generated_data_fold):
    coordinates = np.load(generated_data_fold + 'coordinates.npy')
    cell_types = np.load(generated_data_fold+'cell_types.npy')
    n_cells = len(cell_types)
    n_types = max(cell_types) + 1 # start from 0
    types_dic = np.loadtxt(generated_data_fold+'types_dic.txt', dtype='|S15', delimiter='\t').tolist()
    for i, tmp in enumerate(types_dic):
        types_dic[i] = tmp.decode()
    print(types_dic)

    sc_cluster = plt.scatter(x=coordinates[:,0], y=-coordinates[:,1], s=5, c=cell_types, cmap='rainbow')
    plt.legend(handles=sc_cluster.legend_elements(num=n_types)[0], labels=types_dic, bbox_to_anchor=(1,0.5), loc='center left', prop={'size': 9})
    plt.xticks([])
    plt.yticks([])
    plt.axis('scaled')
    #plt.xlabel('X')
    #plt.ylabel('Y')
    plt.title('Annotation')
    plt.savefig(generated_data_fold+'/spacial.png', dpi=400, bbox_inches='tight')
    plt.clf()
Preprocessing is finally done; now for the main course: model construction and training.
First a couple of parameters are initialized, lambda_I and batch_size:
lambda_I = args.lambda_I
# Parameters
batch_size = 1 # Batch size
Then the preprocessed data is read and the combined adjacency matrix is built:
def get_data(args):
    data_file = args.data_path + args.data_name + '/'
    with open(data_file + 'Adjacent', 'rb') as fp:
        adj_0 = pickle.load(fp)
    X_data = np.load(data_file + 'features.npy')

    num_points = X_data.shape[0]
    adj_I = np.eye(num_points)
    adj_I = sparse.csr_matrix(adj_I)
    adj = (1-args.lambda_I)*adj_0 + args.lambda_I*adj_I

    cell_type_indeces = np.load(data_file + 'cell_types.npy')
    return adj_0, adj, X_data, cell_type_indeces
adj_0 is the plain GAT adjacency matrix, while adj is CCST's combined adjacency matrix: its diagonal is lambda_I, entries that were 1 in adj_0 become 1-lambda_I, and entries that were 0 stay 0.
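A toy example of the combination (lambda_I = 0.3 is just an illustrative value, not necessarily the repo default):

import numpy as np
from scipy import sparse

lambda_I = 0.3
adj_0 = sparse.csr_matrix(np.array([[0., 1., 0.],
                                    [1., 0., 1.],
                                    [0., 1., 0.]]))
adj = (1-lambda_I)*adj_0 + lambda_I*sparse.eye(3, format='csr')
print(adj.toarray())
# [[0.3 0.7 0. ]
#  [0.7 0.3 0.7]
#  [0.  0.7 0.3]]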
Then the adjacency matrix is converted into graph form: node features, edge index, and edge weights, which are wrapped in a DataLoader.
import torch
from torch_geometric.data import Data

def get_graph(adj, X):
    # convert the sparse adjacency matrix into edge_index / edge_attr form
    row_col = []
    edge_weight = []
    rows, cols = adj.nonzero()
    edge_nums = adj.getnnz()
    for i in range(edge_nums):
        row_col.append([rows[i], cols[i]])
        edge_weight.append(adj.data[i])
    edge_index = torch.tensor(np.array(row_col), dtype=torch.long).T
    edge_attr = torch.tensor(np.array(edge_weight), dtype=torch.float)

    graph_bags = []
    graph = Data(x=torch.tensor(X, dtype=torch.float), edge_index=edge_index, edge_attr=edge_attr)
    graph_bags.append(graph)
    return graph_bags
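The DataLoader mentioned above would then be built along these lines (a sketch; the import path is torch_geometric.loader in recent PyG versions, torch_geometric.data in older ones):

from torch_geometric.loader import DataLoader

data_list = get_graph(adj, X_data)
data_loader = DataLoader(data_list, batch_size=batch_size)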
Next the model itself: a DGI model built with torch_geometric.nn. I'll write a separate post on DGI and InfoGraph to dig into the details.
DGI_model = DeepGraphInfomax(
    hidden_channels=args.hidden,
    encoder=Encoder(in_channels=in_channels, hidden_channels=args.hidden),
    summary=lambda z, *args, **kwargs: torch.sigmoid(z.mean(dim=0)),
    corruption=corruption).to(device)
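The corruption function isn't shown in the snippet above. In DGI it typically shuffles node features while keeping the graph structure fixed; a sketch of that standard form (the repo's own definition may differ):

def corruption(data):
    # permute rows of the feature matrix so each node gets another node's
    # features; edges stay the same, producing the "negative" graph
    idx = torch.randperm(data.x.size(0))
    return Data(x=data.x[idx], edge_index=data.edge_index, edge_attr=data.edge_attr)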
The encoder is four stacked GCN layers, each likewise built with torch_geometric.nn's GCNConv. Note that the single PReLU is applied only after the fourth layer, so there is no nonlinearity between the intermediate layers:
import torch.nn as nn
from torch_geometric.nn import GCNConv, DeepGraphInfomax

class Encoder(nn.Module):
    def __init__(self, in_channels, hidden_channels):
        super(Encoder, self).__init__()
        self.conv = GCNConv(in_channels, hidden_channels)
        self.conv_2 = GCNConv(hidden_channels, hidden_channels)
        self.conv_3 = GCNConv(hidden_channels, hidden_channels)
        self.conv_4 = GCNConv(hidden_channels, hidden_channels)
        self.prelu = nn.PReLU(hidden_channels)

    def forward(self, data):
        x, edge_index, edge_weight = data.x, data.edge_index, data.edge_attr
        x = self.conv(x, edge_index, edge_weight=edge_weight)
        x = self.conv_2(x, edge_index, edge_weight=edge_weight)
        x = self.conv_3(x, edge_index, edge_weight=edge_weight)
        x = self.conv_4(x, edge_index, edge_weight=edge_weight)
        x = self.prelu(x)
        return x
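The training loop below uses a DGI_optimizer that isn't defined in the snippets here; a plausible setup (Adam is standard for DGI; the learning rate is illustrative, not necessarily the repo's value):

DGI_optimizer = torch.optim.Adam(DGI_model.parameters(), lr=1e-6)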
Then the model is trained. It's all library calls, so I won't go into detail:
import datetime

for epoch in range(args.num_epoch):
    DGI_model.train()
    DGI_optimizer.zero_grad()
    DGI_all_loss = []
    for data in data_loader:
        data = data.to(device)
        pos_z, neg_z, summary = DGI_model(data=data)
        DGI_loss = DGI_model.loss(pos_z, neg_z, summary)
        DGI_loss.backward()
        DGI_all_loss.append(DGI_loss.item())
        DGI_optimizer.step()
    if ((epoch+1) % 100) == 0:
        print('Epoch: {:03d}, Loss: {:.4f}'.format(epoch+1, np.mean(DGI_all_loss)))

end_time = datetime.datetime.now()
DGI_filename = args.model_path+'DGI_lambdaI_' + str(args.lambda_I) + '_epoch' + str(args.num_epoch) + '.pth.tar'
torch.save(DGI_model.state_dict(), DGI_filename)
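The clustering step below loads embeddings from an _Embed_X.npy file; the extraction step isn't shown in the snippets above, but it would look roughly like this (a sketch, not the repo's exact code):

DGI_model.eval()
with torch.no_grad():
    for data in data_loader:
        data = data.to(device)
        pos_z, _, _ = DGI_model(data=data)   # positive embeddings = node features
X_embedding = pos_z.cpu().numpy()
np.save(args.embedding_data_path + 'lambdaI' + str(lambda_I) + '_epoch' + str(args.num_epoch) + '_Embed_X.npy', X_embedding)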
Finally, the embeddings are extracted and clustered.
cluster_type = 'kmeans' # 'louvain' / 'leiden' / 'kmeans'
print("-----------Clustering-------------")
X_embedding_filename = args.embedding_data_path+'lambdaI' + str(lambda_I) + '_epoch' + str(args.num_epoch) + '_Embed_X.npy'
X_embedding = np.load(X_embedding_filename)
# PCA_process, Kmeans_cluster and res_search_fixed_clus are helper functions
# defined elsewhere in the repo; num_cell and n_clusters come from the script
X_embedding = PCA_process(X_embedding, nps=30)
#X_data_PCA = PCA_process(X_data, nps=X_embedding.shape[1])
# concate
#X_embedding = np.concatenate((X_embedding, X_data), axis=1)
print('Shape of data to cluster:', X_embedding.shape)

if cluster_type == 'kmeans':
    cluster_labels, score = Kmeans_cluster(X_embedding, n_clusters)
else:
    results_file = args.result_path + '/adata.h5ad'
    adata = ad.AnnData(X_embedding)
    sc.tl.pca(adata, n_comps=50, svd_solver='arpack')
    sc.pp.neighbors(adata, n_neighbors=20, n_pcs=50) # 20
    eval_resolution = res_search_fixed_clus(cluster_type, adata, n_clusters)
    if cluster_type == 'leiden':
        sc.tl.leiden(adata, key_added="CCST_leiden", resolution=eval_resolution)
        cluster_labels = np.array(adata.obs['leiden'])
    if cluster_type == 'louvain':
        sc.tl.louvain(adata, key_added="CCST_louvain", resolution=eval_resolution)
        cluster_labels = np.array(adata.obs['louvain'])
    #sc.tl.umap(adata)
    #sc.pl.umap(adata, color=['leiden'], save='_lambdaI_' + str(lambda_I) + '.png')
    adata.write(results_file)
    cluster_labels = [int(x) for x in cluster_labels]
    score = False

all_data = []
for index in range(num_cell):
    #all_data.append([index, cell_type_indeces[index], cluster_labels[index]]) # txt: cell_id, gt_labels, cluster type
    all_data.append([index, cluster_labels[index]]) # txt: cell_id, cluster type
np.savetxt(args.result_path+'/types.txt', np.array(all_data), fmt='%3d', delimiter='\t')
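Since get_data also returns the ground-truth labels (cell_type_indeces), a quick hedged evaluation sketch to compare the clustering against the annotation:

from sklearn.metrics import adjusted_rand_score

# agreement between predicted clusters and the ground-truth annotation
ari = adjusted_rand_score(cell_type_indeces, cluster_labels)
print('ARI:', ari)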