Today I continue with CCST. CCST builds on the DGI model: overall it constructs an adjacency matrix, runs four GCN layers over it, and trains with DGI's mutual-information objective (a binary cross-entropy between real and corrupted node/summary pairs). That is loosely reminiscent of SpaGCN, whose loss is a KL divergence. Finally, the features extracted by the model are clustered.
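For reference, here is a minimal sketch of the DGI loss in the form used by PyG's DeepGraphInfomax (a paraphrase for illustration, not CCST's own code; W is the module's learned bilinear discriminator weight):

import torch

EPS = 1e-15

def dgi_loss(pos_z, neg_z, summary, W):
    # D(z, s) = sigmoid(z @ W @ s): bilinear discriminator scoring how well
    # a node embedding z matches the graph summary s
    pos = torch.sigmoid(pos_z @ (W @ summary))
    neg = torch.sigmoid(neg_z @ (W @ summary))
    # BCE: real (node, summary) pairs should score 1, corrupted pairs 0
    return -torch.log(pos + EPS).mean() - torch.log(1 - neg + EPS).mean()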
Looking at the code concretely: CCST released code for two examples, human breast cancer and human osteosarcoma, where the breast cancer data is 10X Visium and the osteosarcoma data is MERFISH; the processing for seqFISH+ and DLPFC was not released. We will walk through the code using the breast cancer data.
First, the data-processing part. The data is read with stlearn; genes are filtered, counts normalized, features scaled, and PCA reduces the matrix to 300 components:
import scanpy as sc
import stlearn as st

adata_h5 = st.Read10X(path=data_fold, count_file=args.data_name+'_filtered_feature_bc_matrix.h5')

def adata_preprocess(i_adata, min_cells=3, pca_n_comps=300):
    print('===== Preprocessing Data ')
    sc.pp.filter_genes(i_adata, min_cells=min_cells)
    adata_X = sc.pp.normalize_total(i_adata, target_sum=1, exclude_highly_expressed=True, inplace=False)['X']
    adata_X = sc.pp.scale(adata_X)
    adata_X = sc.pp.pca(adata_X, n_comps=pca_n_comps)
    return adata_X
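The returned PCA matrix is what later steps load as features.npy; a hedged usage sketch (generated_data_fold is assumed to be set as in the repo's scripts):

import numpy as np

features = adata_preprocess(adata_h5, min_cells=3, pca_n_comps=300)
np.save(generated_data_fold + 'features.npy', features)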
Next, the adjacency matrices. Two are built. The first is the full pairwise distance matrix over all spots; let's call it the GCN adjacency matrix. The second is analogous to STAGATE's GAT adjacency matrix, so call it the GAT adjacency matrix: it is binary, 1 where the distance is below a threshold and 0 otherwise, and it is what the later contrastive (DGI-style) training consumes.
The first double for loop computes the Euclidean distance between every pair of spots, only in order to count how many distances fall under the threshold and print the resulting average number of neighbors per cell. The code is inefficient; in my tests it can simply be commented out.
Before the second double for loop, euclidean_distances recomputes all pairwise spot distances, which forms the GCN adjacency matrix. The loop then walks over every entry and sets nearby spots to 1, i.e. builds the GAT adjacency matrix. Efficiency could be improved further; see the vectorized sketch after the function.
Finally, the GAT adjacency matrix is stored as a compressed sparse matrix.
import os
import pickle
import numpy as np
from scipy import sparse
from sklearn.metrics.pairwise import euclidean_distances

def get_adj(generated_data_fold):
    coordinates = np.load(generated_data_fold + 'coordinates.npy')
    if not os.path.exists(generated_data_fold):
        os.makedirs(generated_data_fold)
    ############# get batch adjacent matrix
    cell_num = len(coordinates)

    ############ the distribution of distance
    if 1: # not os.path.exists(generated_data_fold + 'distance_array.npy'):
        print('calculating distance matrix, it takes a while')
        distance_list = []
        for j in range(cell_num):
            for i in range(cell_num):
                if i != j:
                    distance_list.append(np.linalg.norm(coordinates[j]-coordinates[i]))
        distance_array = np.array(distance_list)
        #np.save(generated_data_fold + 'distance_array.npy', distance_array)
    else:
        distance_array = np.load(generated_data_fold + 'distance_array.npy')

    ### try different distance thresholds, so that on average each cell has x neighbor cells, see Tab. S1 for results
    for threshold in [300]: # range(210,211) # (100,400,40)
        num_big = np.where(distance_array < threshold)[0].shape[0]
        print(threshold, num_big, str(num_big/(cell_num*2))) # 300 22064 2.9046866771985256
        distance_matrix = euclidean_distances(coordinates, coordinates)
        distance_matrix_threshold_I = np.zeros(distance_matrix.shape)
        distance_matrix_threshold_W = np.zeros(distance_matrix.shape)
        for i in range(distance_matrix_threshold_I.shape[0]):
            for j in range(distance_matrix_threshold_I.shape[1]):
                if distance_matrix[i,j] <= threshold and distance_matrix[i,j] > 0:
                    distance_matrix_threshold_I[i,j] = 1
                    distance_matrix_threshold_W[i,j] = distance_matrix[i,j]

        ############### get normalized sparse adjacent matrix
        distance_matrix_threshold_I_N = np.float32(distance_matrix_threshold_I) ## do not normalize adjacent matrix
        distance_matrix_threshold_I_N_crs = sparse.csr_matrix(distance_matrix_threshold_I_N)
        with open(generated_data_fold + 'Adjacent', 'wb') as fp:
            pickle.dump(distance_matrix_threshold_I_N_crs, fp)
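As noted above, the double loops are slow. A vectorized sketch that should yield the same binary matrix (same threshold semantics; not the repo's code):

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

distance_matrix = euclidean_distances(coordinates, coordinates)
# 1 where 0 < distance <= threshold, else 0 -- same rule as the loop above
mask = (distance_matrix <= threshold) & (distance_matrix > 0)
distance_matrix_threshold_I = mask.astype(np.float32)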
The first for loop in get_type converts the cell-type annotation of each spot into an integer index; a dict-based map would arguably be cleaner (see the sketch after the function).
Then, for the breast cancer dataset, the types are relabeled into a fixed, sorted order for nicer visualization, and the results are saved.
def get_type(args, cell_types, generated_data_fold):
    types_dic = []
    types_idx = []
    for t in cell_types:
        if not t in types_dic:
            types_dic.append(t)
        id = types_dic.index(t)
        types_idx.append(id)
    n_types = max(types_idx) + 1 # start from 0

    # For human breast cancer dataset, sort the cells for better visualization
    if args.data_name == 'V1_Breast_Cancer_Block_A_Section_1':
        types_dic_sorted = ['Healthy_1', 'Healthy_2', 'Tumor_edge_1', 'Tumor_edge_2', 'Tumor_edge_3', 'Tumor_edge_4', 'Tumor_edge_5', 'Tumor_edge_6',
                            'DCIS/LCIS_1', 'DCIS/LCIS_2', 'DCIS/LCIS_3', 'DCIS/LCIS_4', 'DCIS/LCIS_5', 'IDC_1', 'IDC_2', 'IDC_3', 'IDC_4', 'IDC_5', 'IDC_6', 'IDC_7']
        relabel_map = {}
        cell_types_relabel = []
        for i in range(n_types):
            relabel_map[i] = types_dic_sorted.index(types_dic[i])
        for old_index in types_idx:
            cell_types_relabel.append(relabel_map[old_index])

    np.save(generated_data_fold+'cell_types.npy', np.array(cell_types_relabel))
    np.savetxt(generated_data_fold+'types_dic.txt', np.array(types_dic_sorted), fmt='%s', delimiter='\t')
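The dict-based mapping suggested above could look like this (a sketch; cell_types is the same list of annotation strings):

# unique types in first-appearance order, then an O(1) lookup table
types_dic = list(dict.fromkeys(cell_types))
type_to_idx = {t: i for i, t in enumerate(types_dic)}
types_idx = [type_to_idx[t] for t in cell_types]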
Next, the ground-truth labels are plotted.
import matplotlib.pyplot as plt

def draw_map(generated_data_fold):
    coordinates = np.load(generated_data_fold + 'coordinates.npy')
    cell_types = np.load(generated_data_fold+'cell_types.npy')
    n_cells = len(cell_types)
    n_types = max(cell_types) + 1 # start from 0
    types_dic = np.loadtxt(generated_data_fold+'types_dic.txt', dtype='|S15', delimiter='\t').tolist()
    for i, tmp in enumerate(types_dic):
        types_dic[i] = tmp.decode()
    print(types_dic)

    sc_cluster = plt.scatter(x=coordinates[:,0], y=-coordinates[:,1], s=5, c=cell_types, cmap='rainbow')
    plt.legend(handles=sc_cluster.legend_elements(num=n_types)[0], labels=types_dic, bbox_to_anchor=(1,0.5), loc='center left', prop={'size': 9})
    plt.xticks([])
    plt.yticks([])
    plt.axis('scaled')
    #plt.xlabel('X')
    #plt.ylabel('Y')
    plt.title('Annotation')
    plt.savefig(generated_data_fold+'/spacial.png', dpi=400, bbox_inches='tight')
    plt.clf()
Preprocessing is finally done; now for the main course: model construction and training.
First a couple of parameters are initialized, lambda_I and batch_size:
lambda_I = args.lambda_I
# Parameters
batch_size = 1 # Batch size
Then the preprocessed data is read and the combined adjacency matrix is built:
def get_data(args):
    data_file = args.data_path + args.data_name + '/'
    with open(data_file + 'Adjacent', 'rb') as fp:
        adj_0 = pickle.load(fp)
    X_data = np.load(data_file + 'features.npy')

    num_points = X_data.shape[0]
    adj_I = np.eye(num_points)
    adj_I = sparse.csr_matrix(adj_I)
    adj = (1-args.lambda_I)*adj_0 + args.lambda_I*adj_I

    cell_type_indeces = np.load(data_file + 'cell_types.npy')
    return adj_0, adj, X_data, cell_type_indeces
adj_0 is the plain GAT adjacency matrix, while adj is CCST's combined adjacency matrix: its diagonal is lambda_I, entries that were 1 in adj_0 become 1-lambda_I, and entries that were 0 stay 0.
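A toy example of the combination (lambda_I = 0.3 is just an illustrative value, not necessarily the repo default):

import numpy as np
from scipy import sparse

lambda_I = 0.3
adj_0 = sparse.csr_matrix(np.array([[0., 1., 0.],
                                    [1., 0., 1.],
                                    [0., 1., 0.]]))
adj = (1-lambda_I)*adj_0 + lambda_I*sparse.eye(3, format='csr')
print(adj.toarray())
# [[0.3 0.7 0. ]
#  [0.7 0.3 0.7]
#  [0.  0.7 0.3]]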
Then the adjacency matrix is converted into graph form: node features, edge index, and edge weights, which are wrapped in a DataLoader.
import torch
from torch_geometric.data import Data

def get_graph(adj, X):
    # convert the sparse adjacency matrix into edge_index / edge_attr form
    row_col = []
    edge_weight = []
    rows, cols = adj.nonzero()
    edge_nums = adj.getnnz()
    for i in range(edge_nums):
        row_col.append([rows[i], cols[i]])
        edge_weight.append(adj.data[i])
    edge_index = torch.tensor(np.array(row_col), dtype=torch.long).T
    edge_attr = torch.tensor(np.array(edge_weight), dtype=torch.float)

    graph_bags = []
    graph = Data(x=torch.tensor(X, dtype=torch.float), edge_index=edge_index, edge_attr=edge_attr)
    graph_bags.append(graph)
    return graph_bags
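The DataLoader mentioned above would then be built along these lines (a sketch; the import path is torch_geometric.loader in recent PyG versions, torch_geometric.data in older ones):

from torch_geometric.loader import DataLoader

data_list = get_graph(adj, X_data)
data_loader = DataLoader(data_list, batch_size=batch_size)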
Next the model itself: a DGI model built with torch_geometric.nn. I'll write a separate post on DGI and InfoGraph to dig into the details.
DGI_model = DeepGraphInfomax(
    hidden_channels=args.hidden,
    encoder=Encoder(in_channels=in_channels, hidden_channels=args.hidden),
    summary=lambda z, *args, **kwargs: torch.sigmoid(z.mean(dim=0)),
    corruption=corruption).to(device)
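The corruption function isn't shown in the snippet above. In DGI it typically shuffles node features while keeping the graph structure fixed; a sketch of that standard form (the repo's own definition may differ):

def corruption(data):
    # permute rows of the feature matrix so each node gets another node's
    # features; edges stay the same, producing the "negative" graph
    idx = torch.randperm(data.x.size(0))
    return Data(x=data.x[idx], edge_index=data.edge_index, edge_attr=data.edge_attr)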
The encoder is four stacked GCN layers, each likewise built with torch_geometric.nn's GCNConv. Note that the single PReLU is applied only after the fourth layer, so there is no nonlinearity between the intermediate layers:
import torch.nn as nn
from torch_geometric.nn import GCNConv, DeepGraphInfomax

class Encoder(nn.Module):
    def __init__(self, in_channels, hidden_channels):
        super(Encoder, self).__init__()
        self.conv = GCNConv(in_channels, hidden_channels)
        self.conv_2 = GCNConv(hidden_channels, hidden_channels)
        self.conv_3 = GCNConv(hidden_channels, hidden_channels)
        self.conv_4 = GCNConv(hidden_channels, hidden_channels)
        self.prelu = nn.PReLU(hidden_channels)

    def forward(self, data):
        x, edge_index, edge_weight = data.x, data.edge_index, data.edge_attr
        x = self.conv(x, edge_index, edge_weight=edge_weight)
        x = self.conv_2(x, edge_index, edge_weight=edge_weight)
        x = self.conv_3(x, edge_index, edge_weight=edge_weight)
        x = self.conv_4(x, edge_index, edge_weight=edge_weight)
        x = self.prelu(x)
        return x
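The training loop below uses a DGI_optimizer that isn't defined in the snippets here; a plausible setup (Adam is standard for DGI; the learning rate is illustrative, not necessarily the repo's value):

DGI_optimizer = torch.optim.Adam(DGI_model.parameters(), lr=1e-6)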
Then the model is trained. It's all library calls, so I won't go into detail:
import datetime

for epoch in range(args.num_epoch):
    DGI_model.train()
    DGI_optimizer.zero_grad()
    DGI_all_loss = []
    for data in data_loader:
        data = data.to(device)
        pos_z, neg_z, summary = DGI_model(data=data)
        DGI_loss = DGI_model.loss(pos_z, neg_z, summary)
        DGI_loss.backward()
        DGI_all_loss.append(DGI_loss.item())
        DGI_optimizer.step()
    if ((epoch+1) % 100) == 0:
        print('Epoch: {:03d}, Loss: {:.4f}'.format(epoch+1, np.mean(DGI_all_loss)))

end_time = datetime.datetime.now()
DGI_filename = args.model_path+'DGI_lambdaI_' + str(args.lambda_I) + '_epoch' + str(args.num_epoch) + '.pth.tar'
torch.save(DGI_model.state_dict(), DGI_filename)
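The clustering step below loads embeddings from an _Embed_X.npy file; the extraction step isn't shown in the snippets above, but it would look roughly like this (a sketch, not the repo's exact code):

DGI_model.eval()
with torch.no_grad():
    for data in data_loader:
        data = data.to(device)
        pos_z, _, _ = DGI_model(data=data)   # positive embeddings = node features
X_embedding = pos_z.cpu().numpy()
np.save(args.embedding_data_path + 'lambdaI' + str(lambda_I) + '_epoch' + str(args.num_epoch) + '_Embed_X.npy', X_embedding)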
Finally, the embeddings are extracted and clustered.
cluster_type = 'kmeans' # 'louvain' / 'leiden' / 'kmeans'
print("-----------Clustering-------------")
X_embedding_filename = args.embedding_data_path+'lambdaI' + str(lambda_I) + '_epoch' + str(args.num_epoch) + '_Embed_X.npy'
X_embedding = np.load(X_embedding_filename)
# PCA_process, Kmeans_cluster and res_search_fixed_clus are helper functions
# defined elsewhere in the repo; num_cell and n_clusters come from the script
X_embedding = PCA_process(X_embedding, nps=30)
#X_data_PCA = PCA_process(X_data, nps=X_embedding.shape[1])
# concate
#X_embedding = np.concatenate((X_embedding, X_data), axis=1)
print('Shape of data to cluster:', X_embedding.shape)

if cluster_type == 'kmeans':
    cluster_labels, score = Kmeans_cluster(X_embedding, n_clusters)
else:
    results_file = args.result_path + '/adata.h5ad'
    adata = ad.AnnData(X_embedding)
    sc.tl.pca(adata, n_comps=50, svd_solver='arpack')
    sc.pp.neighbors(adata, n_neighbors=20, n_pcs=50) # 20
    eval_resolution = res_search_fixed_clus(cluster_type, adata, n_clusters)
    if cluster_type == 'leiden':
        sc.tl.leiden(adata, key_added="CCST_leiden", resolution=eval_resolution)
        cluster_labels = np.array(adata.obs['leiden'])
    if cluster_type == 'louvain':
        sc.tl.louvain(adata, key_added="CCST_louvain", resolution=eval_resolution)
        cluster_labels = np.array(adata.obs['louvain'])
    #sc.tl.umap(adata)
    #sc.pl.umap(adata, color=['leiden'], save='_lambdaI_' + str(lambda_I) + '.png')
    adata.write(results_file)
    cluster_labels = [int(x) for x in cluster_labels]
    score = False

all_data = []
for index in range(num_cell):
    #all_data.append([index, cell_type_indeces[index], cluster_labels[index]]) # txt: cell_id, gt_labels, cluster type
    all_data.append([index, cluster_labels[index]]) # txt: cell_id, cluster type
np.savetxt(args.result_path+'/types.txt', np.array(all_data), fmt='%3d', delimiter='\t')
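Since get_data also returns the ground-truth labels (cell_type_indeces), a quick hedged evaluation sketch to compare the clustering against the annotation:

from sklearn.metrics import adjusted_rand_score

# agreement between predicted clusters and the ground-truth annotation
ari = adjusted_rand_score(cell_type_indeces, cluster_labels)
print('ARI:', ari)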