modified: fig1.py
[GalaxyCodeBases.git] / python / salus / cmplatform / fig1.py
bloba2093c669662fbb7857006408dd1083eb747649e
1 #!/usr/bin/env python3
3 import sys
4 import os
5 from typing import NamedTuple
# Display names of the two platforms being compared. They are used as dict
# keys below, as the ``Platform`` column value and in output file names.
PlatformTuple = ('Illumina', 'Salus')

# Per-sample configuration, keyed by the sample ID accepted on the command line.
#   sid       : sample ID, used in output file names
#   sub       : human-readable sample name, used in figure titles and PDF metadata
#   type      : pipeline label (not read by the code in this file)
#   prefix    : root directory of the sample's results
#   suffixOut : platform display name -> output sub-directory
#   suffixMtx : name of the matrix directory
#   platforms : platform display name -> platform directory name
#   pattern   : ordered field names whose values are joined into the matrix
#               path; 'platformV' and 'suffixOutV' are filled in per platform
#               by main()
SamplesDict = {
    'mbrain': {
        'sid': 'mbrain',
        'sub': 'Mouse Brain Spatial',
        'type': 'visium',
        'prefix': '/share/result/spatial/data/BoAo_sp',
        'suffixOut': dict.fromkeys(PlatformTuple, "outs"),
        'suffixMtx': 'filtered_feature_bc_matrix',
        'platforms': {PlatformTuple[0]: 'illumina', PlatformTuple[1]: 'salus'},
        'pattern': ('prefix', 'platformV', 'sid', 'suffixOutV', 'suffixMtx'),
    },
    'mkidney': {
        'sid': 'mkidney',
        'sub': 'Mouse Kidney Spatial',
        'type': 'visium',
        'prefix': '/share/result/spatial/data/BoAo_sp',
        'suffixOut': dict.fromkeys(PlatformTuple, "outs"),
        'suffixMtx': 'filtered_feature_bc_matrix',
        'platforms': {PlatformTuple[0]: 'illumina', PlatformTuple[1]: 'salus'},
        'pattern': ('prefix', 'platformV', 'sid', 'suffixOutV', 'suffixMtx'),
    },
    'human': {
        'sid': 'human',
        'sub': 'Human Single Cell',
        'type': 'mobivision',
        'prefix': '/share/result/spatial/data/MoZhuo_sc/FX20230913',
        'suffixOut': {
            PlatformTuple[0]: 'out/R22045213-220914-LYY-S11-R03-220914-LYY-S11-R03_combined_outs',
            PlatformTuple[1]: 'out_subset/20221124-LYY-S09-R03_AGGCAGAA_fastq_outs',
        },
        'suffixMtx': 'filtered_cell_gene_matrix',
        # The directory really is spelled 'sailu' for this sample; it is a path
        # component, so it is kept verbatim.
        'platforms': {PlatformTuple[0]: 'illumina', PlatformTuple[1]: 'sailu'},
        'pattern': ('prefix', 'platformV', 'suffixOutV', 'suffixMtx'),
    },
}
def checkModules(pkgname: str = "squidpy", min_ver: str = "1.2.3") -> None:
    """Verify that an installed package meets a minimum version.

    Args:
        pkgname: Distribution name to look up in the installed metadata.
        min_ver: Lowest acceptable version string.

    Raises:
        ImportError: If the installed version is older than ``min_ver``.
            A package that is not installed at all surfaces as
            ``importlib.metadata.PackageNotFoundError``, which is itself an
            ``ImportError`` subclass.
    """
    # Imported lazily so the script does not depend on `packaging` unless the
    # check is actually requested.
    import importlib.metadata
    from packaging import version

    got_ver = importlib.metadata.version(pkgname)
    if version.parse(got_ver) < version.parse(min_ver):
        raise ImportError(f"{pkgname}>={min_ver} is needed, but found {pkgname}=={got_ver}")
if __name__ == "__main__":
    # The sample ID comes from the first command-line argument and falls back
    # to 'mbrain' when none is given. `thisID` is read later by main().
    if len(sys.argv) > 1:
        thisID = sys.argv[1]
        if thisID not in SamplesDict:
            # list(...) prints the plain IDs instead of a dict_keys(...) repr.
            print(f"[x]sid can only be {list(SamplesDict)}", file=sys.stderr)
            # sys.exit is always available; the bare exit() helper is only
            # injected by the `site` module.
            sys.exit(1)
    else:
        thisID = 'mbrain'
    print(sys.argv, file=sys.stderr)
    print(f"[i]{thisID}")
    sys.stdout.flush()
    #checkModules()
# Select the mplcairo backend up front, before pyplot is imported.
import matplotlib; matplotlib.use("module://mplcairo.base")
from matplotlib import pyplot as plt
import mplcairo

# Global figure defaults: 6x6 inch canvas at 300 dpi, tight bounding box and a
# transparent background on save.
plt.rcParams['figure.figsize'] = (6.0, 6.0) # set default size of plots
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.bbox'] = 'tight'
plt.rcParams["savefig.transparent"] = True
# Default font for all text drawn by matplotlib.
font = {'family' : 'STIX Two Text',
    #'size' : 22,
    'weight' : 'normal'}
matplotlib.rc('font', **font)

import numpy as np
import pandas as pd
import fast_matrix_market
import anndata as ad
import scanpy as sc
# Assigns the attribute on the ScanpyConfig class itself rather than on the
# settings instance. NOTE(review): presumably -1 is meant as "use all cores" -
# confirm against the installed scanpy version.
sc._settings.ScanpyConfig.n_jobs = -1
#import squidpy as sq
import seaborn as sns
import scipy
import pynndescent

import warnings
# Silence every warning for the remainder of the run.
warnings.filterwarnings('ignore')
def main() -> None:
    """Load both platforms' matrices for the selected sample and write comparison figures.

    Reads the module-level ``thisID`` to pick an entry of ``SamplesDict``. For
    every platform in ``PlatformTuple`` the matrix directory is read, QC
    metrics are computed, barcodes/genes without any counts are dropped, PCA
    is run and the result is written to ``<sid>_<platform>.h5ad``.

    The following files are then written to the current directory:
    ``1D_<sid>.pdf`` (genes vs. UMIs), ``1E_<sid>.pdf`` (UMIs per shared
    barcode), ``1F_Genes_<sid>.pdf`` plus three gene tables (Venn diagram),
    ``2A_Correlation_<sid>.pdf`` (per-barcode Pearson correlation) and
    ``2B_PCA_`` / ``2C_UMAP_`` / ``2C_ForceAtlas2_<sid>.pdf`` (embeddings).
    """

    class scDatItem(NamedTuple):
        """Dataset of one platform plus its matrix shape before and after filtering."""
        name: str                # platform display name
        bgRaw: tuple[int,int]    # (barcodes, genes) as read from disk
        bgFlt: tuple[int,int]    # (barcodes, genes) after dropping empty rows/columns
        annDat: ad.AnnData       # the filtered dataset

        def __repr__(self) -> str:
            return f'[sc:{self.name}, Raw_BC*Gene={self.bgRaw[0]}x{self.bgRaw[1]}, NonZero_BC*Gene={self.bgFlt[0]}x{self.bgFlt[1]} ({self.annDat.n_obs}x{self.annDat.n_vars})]'

    scDat = []
    # Shallow copy: the per-platform scratch keys set in the loop must not
    # leak into the module-level SamplesDict entry.
    nfoDict = dict(SamplesDict[thisID])
    sid = nfoDict['sid']
    sub = nfoDict['sub']

    def pdfMeta(title):
        # PDF metadata shared by every figure of this sample.
        return {'Title': title, 'Subject': f"{sub} Data", 'Author': 'HU Xuesong'}

    print("[i]Start.", file=sys.stderr)
    for platform in PlatformTuple:
        nfoDict['platformK'] = platform
        nfoDict['platformV'] = nfoDict['platforms'][platform]
        nfoDict['suffixOutV'] = nfoDict['suffixOut'][platform]
        # 'pattern' lists, in order, the fields whose values form the path.
        mtxPath = os.path.join(*(nfoDict[field] for field in nfoDict['pattern']))
        print(f"[i]Reading {mtxPath}", file=sys.stderr)
        adata = sc.read_10x_mtx(mtxPath, var_names='gene_symbols', make_unique=True, gex_only=True)
        adata.var_names_make_unique()  # this is necessary if using `var_names='gene_symbols'` in `sc.read_10x_mtx`
        nnRaw = adata.shape
        adata.var['mt'] = adata.var_names.str.startswith('MT-') | adata.var_names.str.startswith('mt-')
        sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=True, inplace=True)
        # Keep the unfiltered matrix reachable through `.raw`.
        adata.raw = adata
        sc.pp.filter_cells(adata, min_genes=1)
        sc.pp.filter_genes(adata, min_cells=1)
        nnFlt = (adata.n_obs, adata.n_vars)
        sc.pp.pca(adata)
        #sc.pp.neighbors(adata)
        #sc.tl.umap(adata,random_state=369)
        #sc.tl.draw_graph(adata)
        scDat.append(scDatItem(platform, nnRaw, nnFlt, adata))
        adata.write_h5ad(f"{sid}_{platform}.h5ad", compression='lzf')

    print("\n".join(map(str, scDat)))

    nameA, nameB = scDat[0].name, scDat[1].name
    with pd.option_context("mode.copy_on_write", True):
        obsmbi = scDat[0].annDat.obs[['n_genes_by_counts', 'total_counts']].copy(deep=False)
        obsmbs = scDat[1].annDat.obs[['n_genes_by_counts', 'total_counts']].copy(deep=False)
        # Long format: one row per barcode per platform.
        p1df = pd.concat([obsmbi.assign(Platform=nameA), obsmbs.assign(Platform=nameB)], ignore_index=True).replace([np.inf, -np.inf, 0], np.nan).dropna()
        # Wide format: one row per barcode present on both platforms.
        p2df = obsmbi.join(obsmbs, lsuffix='_'+nameA, rsuffix='_'+nameB, how='inner').replace([np.inf, -np.inf, 0], np.nan).dropna()

    print("[i]Begin fig A. 1D", file=sys.stderr)
    custom_params = {"axes.spines.right": False, "axes.spines.top": False}
    sns.set_theme(style="ticks", rc=custom_params, font="STIX Two Text")
    figA = sns.JointGrid(data=p1df, x="total_counts", y="n_genes_by_counts", hue='Platform', dropna=True)
    #figA.plot(sns.scatterplot, sns.histplot, alpha=.7, edgecolor=".2", linewidth=.5)
    figA.plot_joint(sns.scatterplot, s=12.7, alpha=.6)
    figA.plot_marginals(sns.histplot, kde=True, alpha=.618)
    figA.figure.suptitle(f"Gene to UMI plot - {sub}")
    figA.set_axis_labels(xlabel='UMIs per Barcode', ylabel='Genes per Barcode')
    figA.savefig(f"1D_{sid}.pdf", transparent=True, dpi=300, metadata=pdfMeta('Gene to UMI plot'))

    print("[i]Begin fig B. 1E", file=sys.stderr)
    # Column names follow the join suffixes above, so derive them from the
    # platform names instead of spelling them out.
    figB = sns.JointGrid(data=p2df, x=f"total_counts_{nameA}", y=f"total_counts_{nameB}", dropna=True)
    figB.plot_joint(sns.scatterplot, s=12.7, alpha=.6)
    figB.plot_marginals(sns.histplot, kde=True, alpha=.618)
    figB.figure.suptitle(f"UMI per Barcode Counts Comparing - {sub}")
    figB.set_axis_labels(xlabel=f'UMI Counts from {nameA}', ylabel=f'UMI Counts from {nameB}')
    figB.savefig(f"1E_{sid}.pdf", transparent=True, dpi=300, metadata=pdfMeta('UMI per Barcode Counts Comparing'))

    print("[i]Begin fig . 1F", file=sys.stderr)
    from matplotlib_venn import venn2
    plt.figure(figsize=(4,4))
    plt.title(f"Genes Venn diagram - {sub}")
    # Boolean masks keep the rows in each dataset's own gene order, so the
    # exported tables are identical from run to run (indexing with a set
    # yields hash-dependent row order).
    varA = scDat[0].annDat.var
    varB = scDat[1].annDat.var
    sharedInA = varA.index.isin(varB.index)
    sharedInB = varB.index.isin(varA.index)
    GenesA = varA.loc[~sharedInA]
    GenesB = varB.loc[~sharedInB]
    GenesC = varA.loc[sharedInA]
    venn2(subsets=(len(GenesA), len(GenesB), len(GenesC)), set_labels=(nameA, nameB))
    plt.savefig(f"1F_Genes_{sid}.pdf", transparent=True, dpi=300, metadata=pdfMeta('Veen of Genes'))
    GenesA.to_csv(f"1F_Genes_{sid}_{nameA}_only.csv", encoding='utf-8')
    GenesB.to_csv(f"1F_Genes_{sid}_{nameB}_only.csv", encoding='utf-8')
    GenesC.to_csv(f"1F_Genes_{sid}_intersection.csv.zst", encoding='utf-8', compression={'method': 'zstd', 'level': 9, 'write_checksum': True})

    print("[i]Begin fig C. 2A", file=sys.stderr)
    # https://www.kaggle.com/code/lizabogdan/top-correlated-genes?scriptVersionId=109838203&cellId=21
    p4xdf = scDat[0].annDat.to_df()
    p4ydf = scDat[1].annDat.to_df()
    # Row-wise correlation: one coefficient per barcode, over aligned genes.
    p4corr = p4xdf.corrwith(p4ydf, axis=1).dropna()
    plt.figure(figsize=(6,4))
    plt.title(f"Pearson correlation - {sub}")
    sns.histplot(p4corr, stat='count', binwidth=0.01)
    plt.savefig(f"2A_Correlation_{sid}.pdf", transparent=True, dpi=300, metadata=pdfMeta('Pearson correlation'))

    # Restrict both datasets to the genes they share before embedding them.
    var_names = scDat[0].annDat.var_names.intersection(scDat[1].annDat.var_names)
    xadata = scDat[0].annDat[:, var_names]
    yadata = scDat[1].annDat[:, var_names]

    def plotEmbedding(obsmkey, title, xlabel, ylabel, outfile):
        # Scatter the first two components of one embedding for both platforms.
        xdf = getOBSMdf(xadata, obsmkey)
        ydf = getOBSMdf(yadata, obsmkey)
        merged = pd.concat([xdf.assign(Platform=nameA), ydf.assign(Platform=nameB)], ignore_index=True).replace([np.inf, -np.inf, 0], np.nan).dropna()
        grid = sns.JointGrid(data=merged, x="P1", y="P2", hue='Platform', dropna=True)
        grid.plot_joint(sns.scatterplot, s=12.7, alpha=.6)
        grid.plot_marginals(sns.histplot, kde=True, alpha=.618)
        grid.figure.suptitle(f"{title} - {sub}")
        grid.set_axis_labels(xlabel=xlabel, ylabel=ylabel)
        grid.savefig(outfile, transparent=True, dpi=300, metadata=pdfMeta(title))

    print("[i]Begin fig D. 2B", file=sys.stderr)
    plotEmbedding('X_pca', 'PCA', 'PC1', 'PC2', f"2B_PCA_{sid}.pdf")

    print("[i]Begin fig E. 2C", file=sys.stderr)
    plotEmbedding('X_umap', 'UMAP', 'UMAP1', 'UMAP2', f"2C_UMAP_{sid}.pdf")

    print("[i]Begin fig E. 2Cn", file=sys.stderr)
    plotEmbedding('X_draw_graph_fa', 'ForceAtlas2', 'FA1', 'FA2', f"2C_ForceAtlas2_{sid}.pdf")
def _ensureNeighbors(anndata) -> None:
    """Build the neighbor graph, and the PCA it is based on, unless already present."""
    if 'neighbors' not in anndata.uns:
        if 'X_pca' not in anndata.obsm:
            sc.pp.pca(anndata, zero_center=True)
        sc.pp.neighbors(anndata)


def getOBSMdf(anndata, obsmkey='X_pca') -> pd.DataFrame:
    """Return the first two components of an embedding as a DataFrame.

    If ``obsmkey`` is not yet stored in ``anndata.obsm`` it is computed in
    place first; this is supported for 'X_pca', 'X_umap' and 'X_draw_graph_fa'.

    Args:
        anndata: Dataset providing ``obsm``, ``uns`` and ``obs_names``.
        obsmkey: Key of the embedding inside ``anndata.obsm``.

    Returns:
        A frame indexed by observation name with columns 'P1', 'P2' (fewer if
        the embedding has fewer than two columns).

    Raises:
        KeyError: If ``obsmkey`` is absent and is not one of the embeddings
            this function knows how to compute.
    """
    if obsmkey not in anndata.obsm:
        if obsmkey == 'X_pca':
            sc.tl.pca(anndata, zero_center=True)
        elif obsmkey == 'X_umap':
            _ensureNeighbors(anndata)
            sc.tl.umap(anndata, random_state=369)
        elif obsmkey == 'X_draw_graph_fa':
            _ensureNeighbors(anndata)
            sc.tl.draw_graph(anndata, random_state=369)
        else:
            # Same exception type as the plain lookup below would raise, but
            # with a message that explains the cause.
            raise KeyError(f"obsm key {obsmkey!r} is missing and cannot be computed here")
    data = anndata.obsm[obsmkey][:, :2]
    nrows, ncols = data.shape
    return pd.DataFrame(
        data=data,
        index=list(anndata.obs_names[:nrows]),
        columns=[f"P{i}" for i in range(1, ncols + 1)],
    )
# Script entry point. Example batch run over all samples:
#   time (./fig1.py human; ./fig1.py mbrain ; ./fig1.py mkidney ) | tee plot.log
if __name__ == "__main__":
    main()
# NOTE(review): scratch demo of mplcairo additive blending on two random point
# clouds; presumably kept as a reference note rather than live code - confirm
# before running, since it opens an interactive window via plt.show().
x1 = np.random.randn(1000)
y1 = np.random.randn(1000)
x2 = np.random.randn(1000) * 5
y2 = np.random.randn(1000)
fig, ax = plt.subplots()
# The figure and axes background must be made transparent.
fig.patch.set(alpha=0)
ax.patch.set(alpha=0)
pc1 = ax.scatter(x1, y1, c='b', edgecolors='none')
pc2 = ax.scatter(x2, y2, c='r', edgecolors='none')
mplcairo.operator_t.ADD.patch_artist(pc2) # Use additive blending.
plt.show()
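1、N和Q<5比率大于4% (the fraction of N bases and bases with Q<5 exceeds 4%)
2、Q平均值小于20 (the mean Q is below 20)
3、Q<20和purity<0.6的比率大于18% (the fraction of bases with Q<20 and purity<0.6 exceeds 18%)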
259 fastp --thread 4 -z -A --max_len1 28 --max_len2 0 --dont_eval_duplication -q 20 -u 30 -n 4 --average_qual 20 --length_required 28 -y -Y 30 -g -x
261 fastp -w 4 -A -q 20 -u 30 -n 5 -l 28 -y -Y 30 -g -x --max_len1 28 --max_len2 1000 \
262 -i ${prefix}_R1_001.fastq.gz -I ${prefix}_R2_001.fastq.gz \
263 -o ./cleanfq/${basepx}_R1_001.fastq.gz -O ./cleanfq/${basepx}_R2_001.fastq.gz \
264 -j ./cleanfq/${basepx}.json -h ./cleanfq/${basepx}.html 2>./cleanfq/${basepx}.log
266 import patchworklib as pw
267 #from blend_modes import addition
268 matplotlib_venn
270 ToDo:
271 * Try layers of annData.
272 * Res: layers share obs and var, thus useless. Even MuData shares obs.
274 ls -1 *.pdf|while read a;do convert -density 1200 $a -resize 25% $a.png;done