[1]:
from genecover import *
import numpy as np
import scanpy as sc
import pandas as pd
import os
Marker Gene Selection from DLPFC Sample #151673
Load Dataset
[3]:
# Load Visium sample 151673, attach the layer annotation, and preprocess.
# Build the path with os.path.join so the notebook also runs on non-Windows hosts
# (the literal "..\\data\\DLPFC" only resolves on Windows).
data_dir = os.path.join("..", "data", "DLPFC")
file_folder = "151673"
path = os.path.join(data_dir, file_folder)
adata = sc.read_visium(path, count_file="filtered_feature_bc_matrix.h5", load_images=True)
adata.var_names_make_unique()

# The 'layer_guess' column of metadata.tsv is used as the ground-truth label.
# NOTE(review): the labels are assigned positionally (.values), so this assumes
# metadata.tsv rows are in the same order as adata.obs — confirm.
df_meta = pd.read_csv(os.path.join(path, "metadata.tsv"), sep='\t')
df_meta_layer = df_meta['layer_guess']
adata.obs['ground_truth'] = df_meta_layer.values

# Drop spots without a label. Take an explicit copy: the boolean slice returns a
# view, and the in-place preprocessing below would otherwise trigger an
# ImplicitModificationWarning when it writes to the view.
adata = adata[~pd.isnull(adata.obs['ground_truth'])].copy()

# Keep genes detected in at least 100 spots, then normalize each spot to 1e4
# total counts and log1p-transform.
sc.pp.filter_genes(adata, min_cells=100)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Gene names after filtering; used later to translate selected indices into names.
orig_gene = adata.var.index.values
C:\Users\An Wang\AppData\Local\Temp\ipykernel_60420\1400016560.py:4: FutureWarning: Use `squidpy.read.visium` instead.
adata = sc.read_visium(path,count_file = "filtered_feature_bc_matrix.h5",load_images=True)
c:\Users\An Wang\.conda\envs\vae-spatial\lib\site-packages\anndata\_core\anndata.py:1758: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
c:\Users\An Wang\.conda\envs\vae-spatial\lib\site-packages\anndata\_core\anndata.py:1758: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
c:\Users\An Wang\.conda\envs\vae-spatial\lib\site-packages\scanpy\preprocessing\_simple.py:287: ImplicitModificationWarning: Trying to modify attribute `.var` of view, initializing view as actual.
adata.var["n_cells"] = number
Computing Correlation Matrix
[4]:
# Convert the expression matrix to a dense array and pass it to
# gene_gene_correlation; the result is indexed by its second axis in later cells.
corr_mat = gene_gene_correlation(
    adata.X.toarray()
)
GeneCover via Combinatorial Optimization (Gurobi Solver)
[5]:
# Select 100 marker genes with the Gurobi solver.
# Every column of corr_mat receives the same weight (1.0).
genecover_markers = GeneCover(
    num_marker=100,
    corr_mat=corr_mat,
    w=np.ones(corr_mat.shape[1]),
    solver="Gurobi",
)
# Translate the selected indices into gene names before printing.
print(
    "GeneCover markers: \n",
    orig_gene[genecover_markers],
)
Set parameter Username
Academic license - for non-commercial use only - expires 2026-04-07
Best Gap: 0
Best Epsilon: 0.1311767578125
GeneCover markers:
['MARCKSL1' 'SYNC' 'ATP1A1' 'ATP1B1' 'NME7' 'CNTN2' 'HPCAL1' 'VSNL1'
'MDH1' 'PPP3R1' 'CTNNA2' 'TMSB10' 'IGKC' 'MAL' 'ERMN' 'LRP2' 'GAD1'
'CHN1' 'MOBP' 'CCK' 'AC106707.1' 'CLDN11' 'FAM131A' 'LDB2' 'UCHL1' 'SPP1'
'BBS7' 'HHIP' 'TMEM144' 'GPM6A' 'SLC1A3' 'ENC1' 'EDIL3' 'CXCL14' 'GABRB2'
'MOG' 'ACTB' 'NDUFA4' 'RAPGEF5' 'AQP1' 'PHKG1' 'YWHAG' 'SLC26A4-AS1'
'GJB1' 'NAP1L2' 'NEFM' 'NEFL' 'STMN2' 'CALB1' 'ENPP2' 'DIRAS2' 'CERCAM'
'OLFM1' 'PTGDS' 'SAA1' 'FOLH1' 'MYRF' 'SCGB2A2' 'CARNS1' 'NRGN' 'NKX6-2'
'NELL2' 'KRT8' 'KCNC2' 'SYT1' 'CUX2' 'CABP1' 'AACS' 'RTN1' 'HSPA2'
'PPP4R4' 'GLDN' 'HBA2' 'SLC5A11' 'AC009133.1' 'PLLP' 'NDRG4' 'CALB2'
'KRT19' 'CNP' 'GFAP' 'NSF' 'ANKRD40' 'AATK' 'MBP' 'CHGB' 'SNAP25' 'CST3'
'BCAS1' 'NDUFA7' 'PPP1R14A' 'CALM3' 'RSPH14' 'NEFH' 'YWHAH' 'C21orf91'
'OLIG1' 'PCP4' 'MT-CO1' 'MT-CO2']
GeneCover via Combinatorial Optimization (SCIP Solver)
[ ]:
# Select 100 marker genes with the SCIP solver, using uniform weights.
# NOTE: this reuses the name `genecover_markers` from the Gurobi cell.
genecover_markers = GeneCover(
    num_marker=100,
    corr_mat=corr_mat,
    w=np.ones(corr_mat.shape[1]),
    solver="SCIP",
)
# Translate the selected indices into gene names before printing.
print(
    "GeneCover markers via SCIP: \n",
    orig_gene[genecover_markers],
)
Best Gap: 0
Best Epsilon: 0.12958984375
GeneCover markers via SCIP:
['PADI2' 'SYNC' 'WDR47' 'ATP1A1' 'ATP1B1' 'NME7' 'CNTN2' 'VSNL1' 'PREPL'
'MDH1' 'PPP3R1' 'TMSB10' 'IGKC' 'MAL' 'ERMN' 'LRP2' 'GAD1' 'CHN1' 'CCK'
'CLDN11' 'LDB2' 'SPP1' 'HHIP' 'TMEM144' 'GPM6A' 'SLC1A3' 'ENC1' 'EDIL3'
'RHOBTB3' 'CXCL14' 'GABRB2' 'MOG' 'TPBG' 'ACTB' 'NDUFA4' 'RAPGEF5' 'AQP1'
'YWHAG' 'SLC26A4-AS1' 'CD99' 'GJB1' 'NAP1L2' 'LAMP2' 'NEFM' 'NEFL'
'STMN2' 'CALB1' 'ENPP2' 'CERCAM' 'OLFM1' 'PTGDS' 'SAA1' 'FOLH1' 'MYRF'
'FTH1' 'SCGB2A2' 'PLA2G16' 'RTN3' 'CARNS1' 'HSPA8' 'NRGN' 'NKX6-2'
'TUBA1B' 'KRT8' 'KCNC2' 'SYT1' 'CUX2' 'CABP1' 'AACS' 'RTN1' 'HSPA2'
'PPP4R4' 'GLDN' 'NPTN' 'HBA1' 'CRYM' 'SLC5A11' 'AC009133.1' 'PLLP'
'NDRG4' 'CALB2' 'KRT19' 'KRT17' 'GFAP' 'NSF' 'ANKRD40' 'AATK' 'MBP'
'SNAP25' 'CST3' 'MAG' 'PPP1R14A' 'CALM3' 'RSPH14' 'NEFH' 'YWHAH' 'OLIG1'
'PCP4' 'MT-CO1' 'MT-CO2']
GeneCover via Greedy Heuristics
[8]:
# Select 100 marker genes with the greedy heuristic instead of an exact solver.
# Every column of corr_mat receives the same weight (1.0).
genecover_markers_greedy = GeneCover(
    num_marker=100,
    corr_mat=corr_mat,
    w=np.ones(corr_mat.shape[1]),
    solver="Greedy",
)
# Translate the selected indices into gene names before printing.
print(
    "GeneCover markers: \n",
    orig_gene[genecover_markers_greedy],
)
Best Gap: 0
Best Epsilon: 0.136181640625
GeneCover markers:
['SNAP25' 'ERMN' 'ENC1' 'VSNL1' 'MAG' 'SCGB2A2' 'ATP1B1' 'RTN1' 'PTGDS'
'OLFM1' 'MOG' 'MBP' 'FABP4' 'IGKC' 'CST3' 'CARNS1' 'GAD1' 'TMEM144'
'KRT19' 'MT-CO1' 'NME7' 'UCHL1' 'PPP1R14A' 'CHN1' 'CLDN11' 'HBB' 'HSPA2'
'CALB2' 'EDIL3' 'YWHAG' 'GJB1' 'NEFL' 'ENPP2' 'KRT8' 'RNASE1' 'SLC5A11'
'GFAP' 'YWHAH' 'SYNC' 'RPL5' 'ATP1A1' 'HPCAL1' 'FAM84A' 'PPP3R1' 'TMSB10'
'MOBP' 'LDB2' 'LIMCH1' 'SPP1' 'HHIP' 'SV2C' 'SLC12A2' 'CXCL14' 'TUBB2A'
'ACTB' 'RAPGEF5' 'AQP1' 'PHKG1' 'SEMA3E' 'CHRDL1' 'LAMP2' 'NEFM' 'CALB1'
'DIRAS2' 'SLC44A1' 'STXBP1' 'CERCAM' 'ABCA2' 'FOLH1' 'FTH1' 'THY1'
'HSPA8' 'NRGN' 'NKX6-2' 'VAMP1' 'SYT1' 'CABP1' 'AACS' 'HTR2A' 'SLAIN1'
'HS6ST3' 'GLDN' 'IQCK' 'GPRC5B' 'AC009133.1' 'PLLP' 'NDRG4' 'RPL26'
'LRRC75A' 'KRT17' 'NSF' 'ANKRD40' 'SEPT4' 'AATK' 'CHGB' 'LAMP5' 'NEFH'
'OLIG1' 'S100B' 'MT-CO3']
Iterative GeneCover via Combinatorial Optimization (Gurobi Solver)
[9]:
# Iterative GeneCover with the Gurobi solver: two iterations of 100 genes each,
# for 200 marker genes in total. Weights are uniform.
genecover_markers_iterative = Iterative_GeneCover(
    incremental_sizes=[100, 100],
    corr_mat=corr_mat,
    w=np.ones(corr_mat.shape[1]),
    solver="Gurobi",
)
Iteration 1
Best Gap: 0
Best Epsilon: 0.1311767578125
Iteration 2
Best Gap: 0
Best Epsilon: 0.1367919921875
[10]:
# Show the gene names selected by the iterative run (one group per iteration).
print(
    "Iterative GeneCover markers: \n",
    orig_gene[genecover_markers_iterative],
)
Iterative GeneCover markers:
[['MARCKSL1' 'SYNC' 'ATP1A1' 'ATP1B1' 'NME7' 'CNTN2' 'HPCAL1' 'VSNL1'
'MDH1' 'PPP3R1' 'CTNNA2' 'TMSB10' 'IGKC' 'MAL' 'ERMN' 'LRP2' 'GAD1'
'CHN1' 'MOBP' 'CCK' 'AC106707.1' 'CLDN11' 'FAM131A' 'LDB2' 'UCHL1'
'SPP1' 'BBS7' 'HHIP' 'TMEM144' 'GPM6A' 'SLC1A3' 'ENC1' 'EDIL3' 'CXCL14'
'GABRB2' 'MOG' 'ACTB' 'NDUFA4' 'RAPGEF5' 'AQP1' 'PHKG1' 'YWHAG'
'SLC26A4-AS1' 'GJB1' 'NAP1L2' 'NEFM' 'NEFL' 'STMN2' 'CALB1' 'ENPP2'
'DIRAS2' 'CERCAM' 'OLFM1' 'PTGDS' 'SAA1' 'FOLH1' 'MYRF' 'SCGB2A2'
'CARNS1' 'NRGN' 'NKX6-2' 'NELL2' 'KRT8' 'KCNC2' 'SYT1' 'CUX2' 'CABP1'
'AACS' 'RTN1' 'HSPA2' 'PPP4R4' 'GLDN' 'HBA2' 'SLC5A11' 'AC009133.1'
'PLLP' 'NDRG4' 'CALB2' 'KRT19' 'CNP' 'GFAP' 'NSF' 'ANKRD40' 'AATK'
'MBP' 'CHGB' 'SNAP25' 'CST3' 'BCAS1' 'NDUFA7' 'PPP1R14A' 'CALM3'
'RSPH14' 'NEFH' 'YWHAH' 'C21orf91' 'OLIG1' 'PCP4' 'MT-CO1' 'MT-CO2']
['CLSTN1' 'HPCA' 'LMO4' 'KCNA2' 'MEF2D' 'CADM3' 'RGS4' 'FAM84A'
'SLC30A3' 'R3HDM1' 'ATP5MC3' 'EPHA4' 'ATP2B2' 'SYN2' 'CLDND1' 'TAGLN3'
'TF' 'RAB6B' 'SERPINI1' 'PCDH7' 'HOPX' 'MAPK10' 'PPP3CA' 'UGT8' 'BASP1'
'SELENOP' 'CARTPT' 'MAP1B' 'NREP' 'ANXA6' 'GABRA1' 'SNCB' 'TUBB2A'
'NRN1' 'CAP2' 'TSPYL1' 'DGKB' 'AC018647.1' 'VSTM2A' 'TSPAN7' 'BEX1'
'PLP1' 'FABP4' 'PDP1' 'GABBR2' 'SLC44A1' 'STXBP1' 'AL359091.1' 'ABCA2'
'SLC1A2' 'FTH1' 'SCGB1D2' 'RTN3' 'C11orf87' 'CRYAB' 'THY1' 'HSPA8'
'HEPACAM' 'SNCG' 'LIPA' 'LHPP' 'VAMP1' 'TUBA1B' 'TUBA1A' 'ATP5F1B'
'FAM19A2' 'ATP2B1' 'HTR2A' 'PCDH8' 'HS6ST3' 'RNASE1' 'CALM1' 'ITPKA'
'B2M' 'NPTN' 'CRYM' 'CACNG3' 'MT3' 'FA2H' 'CAMKK1' 'LRRC75A' 'KRT17'
'SEPT4' 'AQP4' 'CNDP1' 'FKBP1A' 'LAMP5' 'VSTM2L' 'SLC12A5' 'CBLN4'
'EEF1A2' 'AC005944.1' 'AC092069.1' 'HAPLN4' 'SCN1B' 'MAG' 'SLC17A7'
'IGLC2' 'SULT4A1' 'MT-CO3']]
Iterative GeneCover via Combinatorial Optimization (SCIP Solver)
[11]:
# Iterative GeneCover with the SCIP solver: two iterations of 100 genes each,
# for 200 marker genes in total. Weights are uniform.
# NOTE: this reuses the name `genecover_markers_iterative` from the Gurobi cell.
genecover_markers_iterative = Iterative_GeneCover(
    incremental_sizes=[100, 100],
    corr_mat=corr_mat,
    w=np.ones(corr_mat.shape[1]),
    solver="SCIP",
)
Iteration 1
Best Gap: 0
Best Epsilon: 0.12958984375
Iteration 2
Best Gap: 0
Best Epsilon: 0.137158203125
[12]:
# Show the gene names selected by the iterative run (one group per iteration).
print(
    "Iterative GeneCover markers: \n",
    orig_gene[genecover_markers_iterative],
)
Iterative GeneCover markers:
[['PADI2' 'SYNC' 'WDR47' 'ATP1A1' 'ATP1B1' 'NME7' 'CNTN2' 'VSNL1' 'PREPL'
'MDH1' 'PPP3R1' 'TMSB10' 'IGKC' 'MAL' 'ERMN' 'LRP2' 'GAD1' 'CHN1' 'CCK'
'CLDN11' 'LDB2' 'SPP1' 'HHIP' 'TMEM144' 'GPM6A' 'SLC1A3' 'ENC1' 'EDIL3'
'RHOBTB3' 'CXCL14' 'GABRB2' 'MOG' 'TPBG' 'ACTB' 'NDUFA4' 'RAPGEF5'
'AQP1' 'YWHAG' 'SLC26A4-AS1' 'CD99' 'GJB1' 'NAP1L2' 'LAMP2' 'NEFM'
'NEFL' 'STMN2' 'CALB1' 'ENPP2' 'CERCAM' 'OLFM1' 'PTGDS' 'SAA1' 'FOLH1'
'MYRF' 'FTH1' 'SCGB2A2' 'PLA2G16' 'RTN3' 'CARNS1' 'HSPA8' 'NRGN'
'NKX6-2' 'TUBA1B' 'KRT8' 'KCNC2' 'SYT1' 'CUX2' 'CABP1' 'AACS' 'RTN1'
'HSPA2' 'PPP4R4' 'GLDN' 'NPTN' 'HBA1' 'CRYM' 'SLC5A11' 'AC009133.1'
'PLLP' 'NDRG4' 'CALB2' 'KRT19' 'KRT17' 'GFAP' 'NSF' 'ANKRD40' 'AATK'
'MBP' 'SNAP25' 'CST3' 'MAG' 'PPP1R14A' 'CALM3' 'RSPH14' 'NEFH' 'YWHAH'
'OLIG1' 'PCP4' 'MT-CO1' 'MT-CO2']
['CLSTN1' 'HPCA' 'LMO4' 'KCNA2' 'CADM3' 'RGS4' 'HPCAL1' 'FAM84A'
'SLC30A3' 'CALM2' 'R3HDM1' 'ATP5MC3' 'EPHA4' 'ATP2B2' 'SYN2' 'MOBP'
'CLDND1' 'TAGLN3' 'TF' 'RAB6B' 'SERPINI1' 'FAM131A' 'PCDH7' 'UCHL1'
'HOPX' 'PPP3CA' 'UGT8' 'BASP1' 'CARTPT' 'MAP1B' 'NREP' 'ANXA6' 'GABRA1'
'SNCB' 'TUBB2A' 'NRN1' 'TSPYL1' 'PRKAR1B' 'DGKB' 'ANLN' 'VSTM2A'
'PHKG1' 'TSPAN7' 'PLP1' 'CLU' 'FABP4' 'PDP1' 'DIRAS2' 'GABBR2'
'SLC44A1' 'SLC31A2' 'STXBP1' 'AL359091.1' 'ABCA2' 'SCGB1D2' 'C11orf87'
'CRYAB' 'THY1' 'HEPACAM' 'SNCG' 'LIPA' 'GOT1' 'SCD' 'LHPP' 'VAMP1'
'FAM19A2' 'ATP2B1' 'DCLK1' 'HTR2A' 'PCDH8' 'HS6ST3' 'RNASE1' 'CALM1'
'CHGA' 'ITPKA' 'B2M' 'CACNG3' 'MT3' 'FA2H' 'CAMKK1' 'LRRC75A' 'CNP'
'TTYH2' 'AQP4' 'CNDP1' 'CHGB' 'SLC12A5' 'CBLN4' 'AC005944.1' 'ABHD8'
'RAB3A' 'TMEM59L' 'HAPLN4' 'SCN1B' 'ATP1A3' 'SLC17A7' 'IGLC2' 'SULT4A1'
'S100B' 'MT-CO3']]
Iterative GeneCover via Greedy Heuristics
[13]:
# Iterative GeneCover with the greedy heuristic: two iterations of 100 genes
# each, for 200 marker genes in total. Weights are uniform.
# NOTE: this reuses the name `genecover_markers_iterative` from the solver cells.
genecover_markers_iterative = Iterative_GeneCover(
    incremental_sizes=[100, 100],
    corr_mat=corr_mat,
    w=np.ones(corr_mat.shape[1]),
    solver="Greedy",
)
Iteration 1
Best Gap: 0
Best Epsilon: 0.136181640625
Iteration 2
Best Gap: 0
Best Epsilon: 0.14765625
[14]:
# Show the gene names selected by the iterative run (one group per iteration).
print(
    "Iterative GeneCover markers: \n",
    orig_gene[genecover_markers_iterative],
)
Iterative GeneCover markers:
[['SNAP25' 'ERMN' 'ENC1' 'VSNL1' 'MAG' 'SCGB2A2' 'ATP1B1' 'RTN1' 'PTGDS'
'OLFM1' 'MOG' 'MBP' 'FABP4' 'IGKC' 'CST3' 'CARNS1' 'GAD1' 'TMEM144'
'KRT19' 'MT-CO1' 'NME7' 'UCHL1' 'PPP1R14A' 'CHN1' 'CLDN11' 'HBB'
'HSPA2' 'CALB2' 'EDIL3' 'YWHAG' 'GJB1' 'NEFL' 'ENPP2' 'KRT8' 'RNASE1'
'SLC5A11' 'GFAP' 'YWHAH' 'SYNC' 'RPL5' 'ATP1A1' 'HPCAL1' 'FAM84A'
'PPP3R1' 'TMSB10' 'MOBP' 'LDB2' 'LIMCH1' 'SPP1' 'HHIP' 'SV2C' 'SLC12A2'
'CXCL14' 'TUBB2A' 'ACTB' 'RAPGEF5' 'AQP1' 'PHKG1' 'SEMA3E' 'CHRDL1'
'LAMP2' 'NEFM' 'CALB1' 'DIRAS2' 'SLC44A1' 'STXBP1' 'CERCAM' 'ABCA2'
'FOLH1' 'FTH1' 'THY1' 'HSPA8' 'NRGN' 'NKX6-2' 'VAMP1' 'SYT1' 'CABP1'
'AACS' 'HTR2A' 'SLAIN1' 'HS6ST3' 'GLDN' 'IQCK' 'GPRC5B' 'AC009133.1'
'PLLP' 'NDRG4' 'RPL26' 'LRRC75A' 'KRT17' 'NSF' 'ANKRD40' 'SEPT4' 'AATK'
'CHGB' 'LAMP5' 'NEFH' 'OLIG1' 'S100B' 'MT-CO3']
['TF' 'CCK' 'STMN2' 'NDUFA4' 'MDH1' 'PLP1' 'GPM6A' 'TUBA1B' 'HOPX'
'SCGB1D2' 'MAL' 'NPTN' 'AC005944.1' 'MT-ND1' 'SAA1' 'CNP' 'RTN3'
'IGLC2' 'SNCB' 'SLC1A2' 'CALM3' 'C11orf87' 'CRYAB' 'CNDP1' 'PCP4'
'LMO4' 'CLDND1' 'GABRA1' 'TUBA1A' 'PLEKHH1' 'KLK6' 'ELOVL1' 'RGS4'
'BASP1' 'RPL37' 'CARTPT' 'MAP1B' 'GABRB2' 'HLA-B' 'BEX1' 'LINC00844'
'KCNC2' 'ATP2B1' 'MT3' 'HPCA' 'TMEM125' 'SV2A' 'CNTN2' 'SLC30A3'
'CALM2' 'R3HDM1' 'ATP2B2' 'ARPP21' 'AC106707.1' 'SERPINI1' 'JCHAIN'
'PARM1' 'G3BP2' 'PRDM8' 'PPP3CA' 'UGT8' 'CCDC152' 'SELENOP' 'HCN1'
'HSP90AB1' 'PRKAR1B' 'ANLN' 'NACAD' 'GPR37' 'CLEC2L' 'TSPAN7' 'SYP'
'NAP1L2' 'TCEAL6' 'PDP1' 'GABBR2' 'DNM1' 'MYRF' 'HEPACAM' 'FAM107B'
'PIP4K2A' 'FRMPD2' 'HK1' 'OPALIN' 'GRIN2B' 'DCLK1' 'PCDH8' 'IGHA2'
'IGHA1' 'ITPKA' 'ARPP19' 'SV2B' 'FA2H' 'CA10' 'BCAS1' 'EEF1A2' 'HAPLN4'
'FTL' 'SLC17A7' 'SYNJ1']]
Marker Gene Selection Across Samples
Load the datasets (Sample #151507, #151669, #151673)
[15]:
# Load several DLPFC samples, preprocess each one, and restrict all of them to
# the genes they have in common.
Adata = {}

# Build the path with os.path.join so the notebook also runs on non-Windows hosts.
data_dir = os.path.join("..", "data", "DLPFC Full Samples")

# os.listdir returns entries in arbitrary order, so sort before splitting by position.
# NOTE(review): this assumes the first three sorted names are the count files and
# the remaining names are the matching annotation files in the same sample order
# — confirm against the directory contents.
file_names = sorted(os.listdir(data_dir))
across_samples_file_names = file_names[:3]
annotation_names = file_names[3:]

# The dictionary key for each sample is the part of its file name before the first "_".
sample_ids = [file.split("_")[0] for file in across_samples_file_names]

for j, file in enumerate(across_samples_file_names):
    adata = sc.read_10x_h5(os.path.join(data_dir, file))
    adata.var_names_make_unique()

    layer_annotation = pd.read_csv(os.path.join(data_dir, annotation_names[j]), index_col=0)
    # array_equal also returns False on a length mismatch, where an elementwise
    # `==` would warn or raise depending on the numpy version.
    assert np.array_equal(adata.obs.index.values, layer_annotation.index.values), (
        "annotation rows do not match the sample's observation index: " + file
    )
    adata.obs = layer_annotation

    # Keep genes detected in at least 100 spots, normalize each spot to 1e4 total
    # counts, log1p-transform, then keep the 10000 most variable genes.
    sc.pp.filter_genes(adata, min_cells=100)
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, n_top_genes=10000)
    # Copy so the dictionary holds a standalone object instead of a view.
    adata = adata[:, adata.var.highly_variable].copy()
    Adata[sample_ids[j]] = adata

# Intersect the retained gene names across all samples.
genes_across_samples = Adata[sample_ids[0]].var_names.values
for sample_id in sample_ids[1:]:
    genes_across_samples = np.intersect1d(genes_across_samples, Adata[sample_id].var_names.values)

# Subset every sample to the shared genes, in the same (sorted) order.
for sample_id in sample_ids:
    Adata[sample_id] = Adata[sample_id][:, genes_across_samples].copy()
c:\Users\An Wang\.conda\envs\vae-spatial\lib\site-packages\anndata\_core\anndata.py:1758: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
c:\Users\An Wang\.conda\envs\vae-spatial\lib\site-packages\anndata\_core\anndata.py:1758: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
c:\Users\An Wang\.conda\envs\vae-spatial\lib\site-packages\anndata\_core\anndata.py:1758: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
c:\Users\An Wang\.conda\envs\vae-spatial\lib\site-packages\anndata\_core\anndata.py:1758: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
c:\Users\An Wang\.conda\envs\vae-spatial\lib\site-packages\anndata\_core\anndata.py:1758: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
c:\Users\An Wang\.conda\envs\vae-spatial\lib\site-packages\anndata\_core\anndata.py:1758: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
Compute Correlation Matrices for Every Sample
[16]:
# Pass one dense expression array per sample to gene_gene_correlation.
# Samples are looked up by the prefix of each file name (text before the first "_").
corr_mat_combine_across_samples = gene_gene_correlation(
    [
        Adata[key.split("_")[0]].X.toarray()
        for key in across_samples_file_names
    ]
)
GeneCover Marker Selection via Combinatorial Optimization Across Samples (Gurobi Solver)
[17]:
# Select 100 marker genes across samples with the Gurobi solver.
# Every column of the combined correlation matrix receives the same weight (1.0).
genecover_marker_across_samples = GeneCover(
    num_marker=100,
    corr_mat=corr_mat_combine_across_samples,
    w=np.ones(corr_mat_combine_across_samples.shape[1]),
    solver="Gurobi",
)
# Translate the selected indices into shared gene names before printing.
print(
    "GeneCover markers: \n",
    genes_across_samples[genecover_marker_across_samples],
)
Best Gap: 0
Best Epsilon: 0.10332946777343749
GeneCover markers:
['AC005944.1' 'AC009133.1' 'AC092069.1' 'ACTB' 'ALDOA' 'AP3B2' 'ATP1A1'
'ATP1B1' 'ATP6V1B2' 'B2M' 'BASP1' 'CARTPT' 'CCK' 'CHN1' 'CLSTN2' 'CLSTN3'
'CPLX1' 'CRYM' 'CST3' 'DIRAS2' 'DYNC1I1' 'ENC1' 'EPHA4' 'FABP4' 'G0S2'
'GABRA1' 'GABRG2' 'GAD1' 'GAP43' 'GFAP' 'GPM6A' 'HBB' 'HOPX' 'IGHA1'
'IGLC2' 'KLK6' 'KRT17' 'KRT19' 'KRT8' 'LMO4' 'MAGEE1' 'MAP1B' 'MBP'
'MDH1' 'MICAL2' 'MOAP1' 'MOBP' 'MT-ND2' 'MT3' 'NDRG4' 'NEFH' 'NEFL'
'NEFM' 'NELL2' 'NME7' 'NRGN' 'NSF' 'OAT' 'OLFM1' 'PAK1' 'PDP1' 'PHKG1'
'PLIN1' 'PLP1' 'PPP3R1' 'RAB3A' 'REEP2' 'RPLP1' 'RTN1' 'RTN3' 'RTN4'
'SCGB2A2' 'SCN1A' 'SH3BGRL2' 'SLC17A7' 'SLC1A2' 'SLC24A2' 'SLC39A10'
'SNAP25' 'SNCB' 'SPP1' 'STMN1' 'STMN2' 'SYNC' 'SYNGR1' 'SYT1' 'SYT4'
'TFF1' 'TMEM38A' 'TMEM59L' 'TMSB10' 'TUBA1A' 'TUBA1B' 'TUBB2A' 'UCHL1'
'VPS35' 'VSNL1' 'WDR47' 'WDR7' 'YWHAG']
GeneCover Marker Selection via Combinatorial Optimization Across Samples (SCIP Solver)
[28]:
# Select 100 marker genes across samples with the SCIP solver.
# Weights are uniform and lambdaMax is set to 0.2 for this run.
# NOTE: this reuses the name `genecover_marker_across_samples` from the Gurobi cell.
genecover_marker_across_samples = GeneCover(
    num_marker=100,
    corr_mat=corr_mat_combine_across_samples,
    w=np.ones(corr_mat_combine_across_samples.shape[1]),
    lambdaMax=0.2,
    solver="SCIP",
)
# Translate the selected indices into shared gene names before printing.
print(
    "GeneCover markers: \n",
    genes_across_samples[genecover_marker_across_samples],
)
Best Gap: 0
Best Epsilon: 0.10302734375
GeneCover markers:
['AC005944.1' 'AC009133.1' 'AC092069.1' 'ACTB' 'ALDOA' 'AP3B2' 'APP'
'ATL1' 'ATP1A1' 'ATP1B1' 'BASP1' 'CARNS1' 'CCK' 'CHN1' 'CLSTN2' 'CLSTN3'
'CLU' 'CRYM' 'DYNC1I1' 'EIF1B' 'ENC1' 'EPHA4' 'FABP4' 'G0S2' 'GABRA1'
'GAD1' 'GAP43' 'GFAP' 'GPM6A' 'GPRC5B' 'HBB' 'HOPX' 'HPCAL1' 'HSP90AB1'
'IFI27' 'IGHA1' 'IGKC' 'KRT17' 'KRT19' 'KRT8' 'LDB2' 'LMO4' 'MAP1B' 'MBP'
'MDH1' 'MICAL2' 'MOAP1' 'MOBP' 'MT-ND2' 'MT3' 'NDRG4' 'NECAB1' 'NEFH'
'NEFL' 'NEFM' 'NELL2' 'NME7' 'NRGN' 'NRN1' 'NSF' 'OAT' 'OLFM1' 'PAK1'
'PDP1' 'PFKP' 'PHKG1' 'PLIN1' 'PLP1' 'RAB3A' 'RPLP1' 'RTN1' 'RTN3'
'SCGB2A2' 'SERPINF1' 'SH3BGRL2' 'SLC17A7' 'SLC1A2' 'SLC24A2' 'SNAP25'
'SNCA' 'SNCB' 'SPP1' 'STMN1' 'STMN2' 'STXBP1' 'SYNC' 'SYNGR1' 'SYT1'
'SYT4' 'TBR1' 'TFF1' 'TMEM38A' 'TMEM59L' 'TMSB10' 'TUBA1B' 'TUBB2A'
'UCHL1' 'VSNL1' 'WDR47' 'YWHAG']
GeneCover Marker Selection via Greedy Heuristics Across Samples
[19]:
# Select 100 marker genes across samples with the greedy heuristic.
# Every column of the combined correlation matrix receives the same weight (1.0).
genecover_marker_across_samples_greedy = GeneCover(
    num_marker=100,
    corr_mat=corr_mat_combine_across_samples,
    w=np.ones(corr_mat_combine_across_samples.shape[1]),
    solver="Greedy",
)
# Translate the selected indices into shared gene names before printing.
print(
    "GeneCover markers: \n",
    genes_across_samples[genecover_marker_across_samples_greedy],
)
Best Gap: 0
Best Epsilon: 0.11860351562499999
GeneCover markers:
['SNAP25' 'PLP1' 'ENC1' 'SYT1' 'SCGB2A2' 'NRGN' 'VSNL1' 'CST3' 'ATP1B1'
'MOG' 'TUBA1B' 'RTN1' 'SAA1' 'TMSB10' 'NEFL' 'GFAP' 'ERMN' 'IGKC' 'UCHL1'
'KRT19' 'ACTB' 'MBP' 'CLDN11' 'MAP1B' 'OLFM1' 'PTGDS' 'MT-CO2' 'CARNS1'
'CHN1' 'CLSTN2' 'RTN3' 'STMN1' 'TUBB2A' 'HBA1' 'NDUFA4' 'PPP1R14A'
'SLC1A2' 'DIRAS2' 'AGT' 'ALDOA' 'ANXA6' 'ATP6V1C1' 'ATP6V1D' 'BASP1'
'CDC37' 'CLDND1' 'CNDP1' 'CNTN2' 'COL1A2' 'COX6C' 'CREG2' 'CRYAB' 'DCLK1'
'DYNC1I1' 'EFHD2' 'EPDR1' 'ETS2' 'FAM3C' 'FKBP1A' 'FTL' 'GLS' 'GPM6A'
'HPRT1' 'HSPA2' 'HSPA8' 'HSPH1' 'KCNC2' 'KIFAP3' 'LAMP2' 'LDHA'
'LINC00844' 'LMO4' 'MAG' 'MAGED1' 'MAL' 'MAP3K12' 'MDH1' 'MGP' 'MMACHC'
'MT-ND2' 'NAP1L2' 'NAP1L5' 'NSF' 'OPALIN' 'PLIN1' 'PNMA8A' 'PRKAR1B'
'PRKCE' 'RAB3A' 'RAB6A' 'RGS4' 'RPS27' 'SCG5' 'SCGB1D2' 'SERPINI1'
'SLC1A3' 'SNCA' 'SNCG' 'SYN2' 'TFF1']
Iterative GeneCover via Combinatorial Optimization Across Samples (Gurobi Solver)
[22]:
# Iterative GeneCover across samples with the Gurobi solver: two iterations of
# 100 genes each, for 200 marker genes in total. Weights are uniform.
# NOTE: this reuses the name `genecover_marker_across_samples` from the
# single-pass cells.
genecover_marker_across_samples = Iterative_GeneCover(
    incremental_sizes=[100, 100],
    corr_mat=corr_mat_combine_across_samples,
    w=np.ones(corr_mat_combine_across_samples.shape[1]),
    solver="Gurobi",
)
Iteration 1
Best Gap: 0
Best Epsilon: 0.10332946777343749
Iteration 2
Best Gap: 0
Best Epsilon: 0.10310058593749999
[23]:
# Show the shared gene names selected by the iterative run (one group per iteration).
print(
    "Iterative GeneCover markers: \n",
    genes_across_samples[genecover_marker_across_samples],
)
Iterative GeneCover markers:
[['AC005944.1' 'AC009133.1' 'AC092069.1' 'ACTB' 'ALDOA' 'AP3B2' 'ATP1A1'
'ATP1B1' 'ATP6V1B2' 'B2M' 'BASP1' 'CARTPT' 'CCK' 'CHN1' 'CLSTN2'
'CLSTN3' 'CPLX1' 'CRYM' 'CST3' 'DIRAS2' 'DYNC1I1' 'ENC1' 'EPHA4'
'FABP4' 'G0S2' 'GABRA1' 'GABRG2' 'GAD1' 'GAP43' 'GFAP' 'GPM6A' 'HBB'
'HOPX' 'IGHA1' 'IGLC2' 'KLK6' 'KRT17' 'KRT19' 'KRT8' 'LMO4' 'MAGEE1'
'MAP1B' 'MBP' 'MDH1' 'MICAL2' 'MOAP1' 'MOBP' 'MT-ND2' 'MT3' 'NDRG4'
'NEFH' 'NEFL' 'NEFM' 'NELL2' 'NME7' 'NRGN' 'NSF' 'OAT' 'OLFM1' 'PAK1'
'PDP1' 'PHKG1' 'PLIN1' 'PLP1' 'PPP3R1' 'RAB3A' 'REEP2' 'RPLP1' 'RTN1'
'RTN3' 'RTN4' 'SCGB2A2' 'SCN1A' 'SH3BGRL2' 'SLC17A7' 'SLC1A2' 'SLC24A2'
'SLC39A10' 'SNAP25' 'SNCB' 'SPP1' 'STMN1' 'STMN2' 'SYNC' 'SYNGR1'
'SYT1' 'SYT4' 'TFF1' 'TMEM38A' 'TMEM59L' 'TMSB10' 'TUBA1A' 'TUBA1B'
'TUBB2A' 'UCHL1' 'VPS35' 'VSNL1' 'WDR47' 'WDR7' 'YWHAG']
['ADD2' 'AK5' 'ANXA6' 'APBA2' 'APP' 'ATP6AP2' 'ATRNL1' 'BEX1' 'CABP1'
'CACNB1' 'CALM3' 'CAMK2D' 'CAMK2N1' 'CHGA' 'CHGB' 'CLDN11' 'CLU' 'CMAS'
'CNP' 'CPB1' 'CREG2' 'CRYAB' 'DCLK1' 'DNM1' 'DNM3' 'EEF1A1' 'EFHD2'
'EIF4A2' 'EPDR1' 'ERMN' 'ETS2' 'FBXW7' 'G3BP2' 'GAPDH' 'GLS' 'GNB5'
'HLA-B' 'HSP90AA1' 'HSP90AB1' 'HSPA8' 'HSPH1' 'IGKC' 'KCNK1' 'KIFAP3'
'LRRC75A' 'MAPRE3' 'MFSD4A' 'MGP' 'MT-ND1' 'MUC1' 'NAP1L2' 'NDFIP1'
'NELL1' 'NMNAT2' 'NPTN' 'NREP' 'NUAK1' 'NUDT4' 'OXR1' 'PCP4' 'PFKP'
'PGK1' 'PIK3R1' 'PRKAR1A' 'PRKCE' 'PRPF19' 'PTGDS' 'PTPN5' 'RAB2A'
'RAB3C' 'REEP1' 'RGS4' 'RPS12' 'RTN4RL2' 'SAA1' 'SCG5' 'SCGB1D2'
'SCN2A' 'SERPINI1' 'SH3GL2' 'SLC1A3' 'SLC6A17' 'SNCG' 'SNX10' 'SPOCK2'
'SRPK2' 'STXBP1' 'SV2A' 'SV2B' 'SYP' 'TAGLN3' 'TBR1' 'TF' 'THY1'
'TOLLIP' 'TUBA4A' 'WASF1' 'WASL' 'YWHAB' 'YWHAH']]
Iterative GeneCover via Combinatorial Optimization Across Samples (SCIP Solver)
[30]:
# Iterative GeneCover across samples with the SCIP solver: two iterations of
# 100 genes each, for 200 marker genes in total. Weights are uniform and
# lambdaMax is set to 0.2 for this run.
# NOTE: this reuses the name `genecover_marker_across_samples` from earlier cells.
genecover_marker_across_samples = Iterative_GeneCover(
    incremental_sizes=[100, 100],
    corr_mat=corr_mat_combine_across_samples,
    w=np.ones(corr_mat_combine_across_samples.shape[1]),
    lambdaMax=0.2,
    solver="SCIP",
)
Iteration 1
Best Gap: 0
Best Epsilon: 0.10302734375
Iteration 2
Best Gap: 0
Best Epsilon: 0.102752685546875
[32]:
# Show the shared gene names selected by the iterative run (one group per iteration).
print(
    "Iterative GeneCover markers: \n",
    genes_across_samples[genecover_marker_across_samples],
)
Iterative GeneCover markers:
[['AC005944.1' 'AC009133.1' 'AC092069.1' 'ACTB' 'ALDOA' 'AP3B2' 'APP'
'ATL1' 'ATP1A1' 'ATP1B1' 'BASP1' 'CARNS1' 'CCK' 'CHN1' 'CLSTN2'
'CLSTN3' 'CLU' 'CRYM' 'DYNC1I1' 'EIF1B' 'ENC1' 'EPHA4' 'FABP4' 'G0S2'
'GABRA1' 'GAD1' 'GAP43' 'GFAP' 'GPM6A' 'GPRC5B' 'HBB' 'HOPX' 'HPCAL1'
'HSP90AB1' 'IFI27' 'IGHA1' 'IGKC' 'KRT17' 'KRT19' 'KRT8' 'LDB2' 'LMO4'
'MAP1B' 'MBP' 'MDH1' 'MICAL2' 'MOAP1' 'MOBP' 'MT-ND2' 'MT3' 'NDRG4'
'NECAB1' 'NEFH' 'NEFL' 'NEFM' 'NELL2' 'NME7' 'NRGN' 'NRN1' 'NSF' 'OAT'
'OLFM1' 'PAK1' 'PDP1' 'PFKP' 'PHKG1' 'PLIN1' 'PLP1' 'RAB3A' 'RPLP1'
'RTN1' 'RTN3' 'SCGB2A2' 'SERPINF1' 'SH3BGRL2' 'SLC17A7' 'SLC1A2'
'SLC24A2' 'SNAP25' 'SNCA' 'SNCB' 'SPP1' 'STMN1' 'STMN2' 'STXBP1' 'SYNC'
'SYNGR1' 'SYT1' 'SYT4' 'TBR1' 'TFF1' 'TMEM38A' 'TMEM59L' 'TMSB10'
'TUBA1B' 'TUBB2A' 'UCHL1' 'VSNL1' 'WDR47' 'YWHAG']
['AC011603.2' 'ADD2' 'AK5' 'APBA2' 'ATP1B2' 'ATP5F1A' 'ATP6AP2'
'ATP6V1B2' 'BEX1' 'CABP1' 'CACNB1' 'CALM3' 'CAMK2D' 'CAP2' 'CARTPT'
'CDKN2D' 'CHGA' 'CHGB' 'CLDN11' 'CNP' 'CREG2' 'CRYAB' 'CST3' 'CX3CL1'
'DCLK1' 'DNAJB6' 'DNAJC6' 'DNM1' 'DNM3' 'EFHD2' 'EHD3' 'EPDR1' 'ERMN'
'FBXW7' 'FKBP1A' 'G3BP2' 'GABRG2' 'GAPDH' 'GGCX' 'GLS' 'GOT1' 'HLA-B'
'HMGCS1' 'HSP90AA1' 'HSPA8' 'IGLC2' 'KCNK1' 'KIFAP3' 'LRRC75A' 'MAPRE3'
'MAPT' 'MGP' 'MTRNR2L8' 'MUC1' 'NAP1L2' 'NCALD' 'NCOA7' 'NDFIP1'
'NMNAT2' 'NPTN' 'NUAK1' 'NUDT4' 'OXR1' 'PCP4' 'PGK1' 'PI4KA' 'PIK3R1'
'PNMA8A' 'PPP3R1' 'PREPL' 'PRKCE' 'PRNP' 'PRPF19' 'PTGDS' 'PTPN5'
'RAB15' 'RAB3C' 'REEP1' 'RPS12' 'RTN4' 'SAA1' 'SCGB1D2' 'SCN2A' 'SCN3B'
'SERPINI1' 'SH3GL2' 'SNX10' 'SV2A' 'SV2B' 'SYP' 'TAGLN3' 'TF' 'THY1'
'TOLLIP' 'TRIM37' 'TUBA4A' 'VPS35' 'WASF1' 'YWHAB' 'YWHAH']]
Iterative GeneCover via Greedy Heuristics Across Samples
[ ]:
# Obtain 200 marker genes across samples via Iterative GeneCover (Greedy Heuristics)
# with an incremental size of 100 and two iterations.
# Weights are uniform; lambdaMin and lambdaMax are set to 0.08 and 0.2 for this run.
# The result is bound only to the across-samples name: the previous chained
# assignment also overwrote `genecover_markers_iterative`, which holds the
# single-sample iterative result from the first half of the notebook.
genecover_marker_across_samples_greedy = Iterative_GeneCover(
    incremental_sizes=[100, 100],
    corr_mat=corr_mat_combine_across_samples,
    w=np.ones(corr_mat_combine_across_samples.shape[1]),
    lambdaMin=.08,
    lambdaMax=.2,
    solver="Greedy",
)
Iteration 1
Best Gap: 0
Best Epsilon: 0.1185546875
Iteration 2
Best Gap: 0
Best Epsilon: 0.11213867187500001
[ ]:
# Show the shared gene names selected by the iterative greedy run (one group per iteration).
print(
    "Iterative GeneCover markers: \n",
    genes_across_samples[genecover_marker_across_samples_greedy],
)
Iterative GeneCover markers:
[['SNAP25' 'PLP1' 'ENC1' 'SYT1' 'SCGB2A2' 'NRGN' 'VSNL1' 'CST3' 'ATP1B1'
'MOG' 'TUBA1B' 'RTN1' 'SAA1' 'TMSB10' 'NEFL' 'GFAP' 'ERMN' 'IGKC'
'UCHL1' 'KRT19' 'ACTB' 'MBP' 'CLDN11' 'MAP1B' 'OLFM1' 'PTGDS' 'MT-CO2'
'CARNS1' 'CHN1' 'CLSTN2' 'RTN3' 'STMN1' 'TUBB2A' 'HBA1' 'NDUFA4'
'PPP1R14A' 'SLC1A2' 'DIRAS2' 'AGT' 'ALDOA' 'ANXA6' 'ATP6V1C1' 'ATP6V1D'
'BASP1' 'CDC37' 'CLDND1' 'CNDP1' 'CNTN2' 'COL1A2' 'COX6C' 'CREG2'
'DCLK1' 'DYNC1I1' 'EFHD2' 'EPDR1' 'ETS2' 'FAM3C' 'FKBP1A' 'FTL' 'GLS'
'GPM6A' 'HPRT1' 'HSPA2' 'HSPA8' 'HSPH1' 'KCNC2' 'KIFAP3' 'KRT18'
'LAMP2' 'LDHA' 'LINC00844' 'LMO4' 'MAG' 'MAGED1' 'MAL' 'MAP3K12' 'MDH1'
'MGP' 'MMACHC' 'MT-ND2' 'NAP1L2' 'NAP1L5' 'NSF' 'OPALIN' 'PLIN1'
'PNMA8A' 'PRKAR1B' 'PRKCE' 'RAB3A' 'RAB6A' 'RGS4' 'RPS27' 'SCG5'
'SCGB1D2' 'SERPINI1' 'SLC1A3' 'SNCA' 'SNCG' 'SYN2' 'TFF1']
['YWHAH' 'TF' 'STMN2' 'YWHAG' 'SLC24A2' 'CCK' 'MT3' 'MUC1' 'NEFM' 'NME7'
'FABP4' 'MOBP' 'HOPX' 'ATP6V1B2' 'IGLC2' 'PCP4' 'SLC17A7' 'PFKP' 'TBR1'
'CAP2' 'APP' 'GABRA1' 'NKX6-2' 'SYP' 'TAGLN3' 'SH3GL2' 'GAP43' 'CRYM'
'AC005944.1' 'CNP' 'THY1' 'MOAP1' 'NELL1' 'SYT4' 'NAPB' 'SLC30A3'
'OXR1' 'SPP1' 'STXBP1' 'GABRG2' 'MICAL2' 'NDRG4' 'CHL1' 'ENPP2' 'FABP3'
'IGHA1' 'ITFG1' 'KRT8' 'MT-CO1' 'PAK1' 'SCN2A' 'SSTR2' 'TUBA4A' 'CHGB'
'CLU' 'FAM49A' 'HSP90AA1' 'LRRC75A' 'NELL2' 'NUAK1' 'PGM2L1'
'AC011603.3' 'ACOT13' 'ACOT7' 'APBA2' 'APOE' 'ATL1' 'ATP1B2' 'ATP6V1A'
'BNIP3' 'CPLX1' 'CRYAB' 'CTNNB1' 'EIF4A2' 'ENO2' 'GHITM' 'GLUL'
'IPCEF1' 'KIF21A' 'KLC1' 'MAP2K1' 'MLLT11' 'MT-ND1' 'MT-ND3' 'MYRF'
'PCLO' 'PCSK2' 'PEG3' 'PIK3R1' 'S100A11' 'SEPT4' 'SLC6A17' 'STX1A'
'SYNGR1' 'SYNGR3' 'TMEM144' 'TMEM14A' 'TOLLIP' 'TUBB' 'VAMP1']]