1、Load Require Package
exec('import warnings; warnings.filterwarnings("ignore")')
import dynamo as dyn
import numpy as np
import pandas as pd
import scanpy as sc
import anndata
import spateo as st
st.config.n_threads = 4
st.__version__
1、Load Data & Build Bin50
示例数据(E9.5_E2S2_GEM_bin1.tsv.gz)格式:
adata = st.io.read_bgi("E9.5_E2S2_GEM_bin1.tsv.gz",binsize=50)
adata
|-----> Using binsize=50
|-----> Constructing count matrices.
|-----> <insert> __type to uns in AnnData Object.
|-----> <insert> pp to uns in AnnData Object.
|-----> <insert> spatial to uns in AnnData Object.
AnnData object with n_obs × n_vars = 4332 × 24107
obs: 'area'
uns: '__type', 'pp', 'spatial'
obsm: 'spatial', 'contour', 'bbox'
2、Plot Bin50 Spot Distribution
st.pl.space(adata, color=['area'], pointsize=0.1, show_legend="upper left", cmap="tab20") ### 数据来源于adata.obsm["spatial"]
3、Check Data Quality
st.pp.filter.filter_cells(adata,min_expr_genes=500,inplace=True) ### 过滤表达基因数小于 500的细胞
st.pp.filter.filter_genes(adata, min_cells=3,inplace=True) ### 过滤 无效表达的基因
adata.var['mt'] = adata.var_names.str.startswith('mt-') ### 标记线粒体基因
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True) ### 计算线粒体基因表达比例
adata = adata[adata.obs.pct_counts_mt < 5, :] ### 过滤 线粒体基因高表达 的细胞
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'], jitter=0.4, multi_panel=True) ### 过滤 线粒体基因高表达 的细胞
adata.layers["raw"] = adata.X ### 把原始矩阵拷贝到adata的"raw"层
4、Normalization & Dimensional reduction
### log transform & dimension reduction
adata.X = adata.layers["raw"] ### 从layer层提取 UMI counts matrix
dyn.pp.normalize_cell_expr_by_size_factors(adata, layers="X")
dyn.pp.log1p(adata)
st.tl.pca_spateo(adata, n_pca_components=50)
dyn.tl.neighbors(adata, n_neighbors=30) ### Build KNN Graph
### 使用空间约束聚类(st.tl.ssc)可以识别连续的组织区域, SCC方法同时考虑 物理坐标Grap 和 表达水平Grap 构建联合KNN图用于社区搜索
st.tl.scc(adata, s_neigh=8, resolution=1, cluster_method="louvain", key_added="scc", pca_key="X_pca")
### 绘制聚类点图 & 输出图片为 pdf
st.pl.space(adata, color=['scc'], pointsize=0.2, show_legend="upper left") ### 显示图片
st.pl.space(adata, color=['scc'], pointsize=0.2, show_legend="upper left",save_show_or_return='save',save_kwargs={"path":"bin50_scc_cluster", "ext":"pdf", "dpi":300, "width":1,"heigth":1})
5、Spatial Autocorrelation Features Selection
Moran’s I 指数用以评估一个基因的表达水平在空间上的分布是否呈现出聚集或分散的趋势,也称空间自相关性测量。Moran’s I取值范围为-1到1之间,其中-1表示完全的空间分散,0表示空间随机分布,1表示完全的空间聚集。
### 执行 moran'I test 检测空间自相关基因进行特征选择: 这一步会有点耗时
m = st.tl.spatial_degs.moran_i(adata,n_jobs=8)
### Feature selection
m_filter = m[(m.moran_p_val < 0.05)&(m.moran_q_val<0.05)].sort_values(by=['moran_i'],ascending=False)
m_filter.to_csv("mouse_brain_morani_filter.csv")
print(m_filter)
### 可视化莫兰指数 top5 和 bottom 5 的基因表达
st.pl.space(adata, genes=m_filter.index[0:5].tolist() + m_filter.index[-6:-1].tolist(),pointsize=0.4,ncols=5,show_legend="upper left",figsize=(8,8))
6、Transcriptom DE Analysis
6.1 Case .vs. Control
st.tl.find_cluster_degs(adata,group='scc',test_group='4',control_groups=['0','1','2'],genes=None,method='pairwise')
6.2 One.vs. Ohter
st.tl.find_all_cluster_degs(adata,group='scc',genes=None,n_jobs=8,copy=False)
all_markers = pd.concat(adata.uns["cluster_markers"]["deg_tables"], ignore_index=True, sort=False)
all_markers.to_csv("mouse_brain_all_markers.csv")
print(all_markers)
7、Region DE Analysis(Need First Label Regions)
st.tl.find_spatial_cluster_degs(adata=adata , test_group='4',group='scc',k = 15 , ratio_thresh =0.25)
8、Save object
import joblib
joblib.dump(adata,"adata_bin50_cluster.pkl")