{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "466cade8", "metadata": {}, "outputs": [], "source": [ "from genecover import *\n", "import numpy as np\n", "import scanpy as sc\n", "import pandas as pd\n", "import os " ] }, { "cell_type": "markdown", "id": "8850d47a", "metadata": {}, "source": [ "# Marker Gene Selection from DLPFC Sample #151673" ] }, { "cell_type": "markdown", "id": "56908a7c", "metadata": {}, "source": [ "### Load Dataset" ] }, { "cell_type": "code", "execution_count": 3, "id": "275d3d76", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\An Wang\\AppData\\Local\\Temp\\ipykernel_60420\\1400016560.py:4: FutureWarning: Use `squidpy.read.visium` instead.\n", " adata = sc.read_visium(path,count_file = \"filtered_feature_bc_matrix.h5\",load_images=True)\n", "c:\\Users\\An Wang\\.conda\\envs\\vae-spatial\\lib\\site-packages\\anndata\\_core\\anndata.py:1758: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.\n", " utils.warn_names_duplicates(\"var\")\n", "c:\\Users\\An Wang\\.conda\\envs\\vae-spatial\\lib\\site-packages\\anndata\\_core\\anndata.py:1758: UserWarning: Variable names are not unique. 
To make them unique, call `.var_names_make_unique`.\n", " utils.warn_names_duplicates(\"var\")\n", "c:\\Users\\An Wang\\.conda\\envs\\vae-spatial\\lib\\site-packages\\scanpy\\preprocessing\\_simple.py:287: ImplicitModificationWarning: Trying to modify attribute `.var` of view, initializing view as actual.\n", " adata.var[\"n_cells\"] = number\n" ] } ], "source": [ "# Build the path from components so it resolves on any OS, not only with Windows backslashes.\n", "data_dir = os.path.join(\"..\", \"data\", \"DLPFC\")\n", "file_folder = \"151673\"\n", "path = os.path.join(data_dir, file_folder)\n", "adata = sc.read_visium(path, count_file=\"filtered_feature_bc_matrix.h5\", load_images=True)\n", "adata.var_names_make_unique()\n", "df_meta = pd.read_csv(os.path.join(path, \"metadata.tsv\"), sep='\\t')\n", "df_meta_layer = df_meta['layer_guess']\n", "# NOTE(review): labels are attached by position; assumes metadata.tsv rows follow the order of adata.obs -- confirm.\n", "adata.obs['ground_truth'] = df_meta_layer.values\n", "# Drop spots without a layer label. The boolean mask yields a view, so copy it before the in-place\n", "# preprocessing below; otherwise filter_genes emits ImplicitModificationWarning when it writes to .var.\n", "adata = adata[~pd.isnull(adata.obs['ground_truth'])].copy()\n", "sc.pp.filter_genes(adata, min_cells=100)\n", "sc.pp.normalize_total(adata, target_sum=1e4)\n", "sc.pp.log1p(adata)\n", "# Gene names in column order; later cells index this array with the selected marker positions.\n", "orig_gene = adata.var.index.values" ] }, { "cell_type": "markdown", "id": "61acfede", "metadata": {}, "source": [ "### Computing Correlation Matrix" ] }, { "cell_type": "code", "execution_count": 4, "id": "f9c57399", "metadata": {}, "outputs": [], "source": [ "corr_mat = gene_gene_correlation(adata.X.toarray())" ] }, { "cell_type": "markdown", "id": "67343b83", "metadata": {}, "source": [ "### GeneCover via Combinatorial Optimization (Gurobi Solver)" ] }, { "cell_type": "code", "execution_count": 5, "id": "6948d2da", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Set parameter Username\n", "Academic license - for non-commercial use only - expires 2026-04-07\n", "Best Gap: 0\n", "Best Epsilon: 0.1311767578125\n", "GeneCover markers: \n", " ['MARCKSL1' 'SYNC' 'ATP1A1' 'ATP1B1' 'NME7' 'CNTN2' 'HPCAL1' 'VSNL1'\n", " 'MDH1' 'PPP3R1' 'CTNNA2' 'TMSB10' 'IGKC' 'MAL' 'ERMN' 'LRP2' 'GAD1'\n", " 'CHN1' 'MOBP' 'CCK' 'AC106707.1' 'CLDN11' 'FAM131A' 'LDB2' 'UCHL1' 'SPP1'\n", " 'BBS7'
'HHIP' 'TMEM144' 'GPM6A' 'SLC1A3' 'ENC1' 'EDIL3' 'CXCL14' 'GABRB2'\n", " 'MOG' 'ACTB' 'NDUFA4' 'RAPGEF5' 'AQP1' 'PHKG1' 'YWHAG' 'SLC26A4-AS1'\n", " 'GJB1' 'NAP1L2' 'NEFM' 'NEFL' 'STMN2' 'CALB1' 'ENPP2' 'DIRAS2' 'CERCAM'\n", " 'OLFM1' 'PTGDS' 'SAA1' 'FOLH1' 'MYRF' 'SCGB2A2' 'CARNS1' 'NRGN' 'NKX6-2'\n", " 'NELL2' 'KRT8' 'KCNC2' 'SYT1' 'CUX2' 'CABP1' 'AACS' 'RTN1' 'HSPA2'\n", " 'PPP4R4' 'GLDN' 'HBA2' 'SLC5A11' 'AC009133.1' 'PLLP' 'NDRG4' 'CALB2'\n", " 'KRT19' 'CNP' 'GFAP' 'NSF' 'ANKRD40' 'AATK' 'MBP' 'CHGB' 'SNAP25' 'CST3'\n", " 'BCAS1' 'NDUFA7' 'PPP1R14A' 'CALM3' 'RSPH14' 'NEFH' 'YWHAH' 'C21orf91'\n", " 'OLIG1' 'PCP4' 'MT-CO1' 'MT-CO2']\n" ] } ], "source": [ "# Obtain 100 marker genes \n", "genecover_markers = GeneCover(num_marker=100, corr_mat=corr_mat, w=np.ones(corr_mat.shape[1]), solver = \"Gurobi\")\n", "print(\"GeneCover markers: \\n\", orig_gene[genecover_markers])" ] }, { "cell_type": "markdown", "id": "a5e01fae", "metadata": {}, "source": [ "### GeneCover via Combinatorial Optimization (SCIP Solver)" ] }, { "cell_type": "code", "execution_count": null, "id": "d5f87f84", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Best Gap: 0\n", "Best Epsilon: 0.12958984375\n", "GeneCover markers via SCIP: \n", " ['PADI2' 'SYNC' 'WDR47' 'ATP1A1' 'ATP1B1' 'NME7' 'CNTN2' 'VSNL1' 'PREPL'\n", " 'MDH1' 'PPP3R1' 'TMSB10' 'IGKC' 'MAL' 'ERMN' 'LRP2' 'GAD1' 'CHN1' 'CCK'\n", " 'CLDN11' 'LDB2' 'SPP1' 'HHIP' 'TMEM144' 'GPM6A' 'SLC1A3' 'ENC1' 'EDIL3'\n", " 'RHOBTB3' 'CXCL14' 'GABRB2' 'MOG' 'TPBG' 'ACTB' 'NDUFA4' 'RAPGEF5' 'AQP1'\n", " 'YWHAG' 'SLC26A4-AS1' 'CD99' 'GJB1' 'NAP1L2' 'LAMP2' 'NEFM' 'NEFL'\n", " 'STMN2' 'CALB1' 'ENPP2' 'CERCAM' 'OLFM1' 'PTGDS' 'SAA1' 'FOLH1' 'MYRF'\n", " 'FTH1' 'SCGB2A2' 'PLA2G16' 'RTN3' 'CARNS1' 'HSPA8' 'NRGN' 'NKX6-2'\n", " 'TUBA1B' 'KRT8' 'KCNC2' 'SYT1' 'CUX2' 'CABP1' 'AACS' 'RTN1' 'HSPA2'\n", " 'PPP4R4' 'GLDN' 'NPTN' 'HBA1' 'CRYM' 'SLC5A11' 'AC009133.1' 'PLLP'\n", " 'NDRG4' 'CALB2' 'KRT19' 'KRT17' 'GFAP' 'NSF'
'ANKRD40' 'AATK' 'MBP'\n", " 'SNAP25' 'CST3' 'MAG' 'PPP1R14A' 'CALM3' 'RSPH14' 'NEFH' 'YWHAH' 'OLIG1'\n", " 'PCP4' 'MT-CO1' 'MT-CO2']\n" ] } ], "source": [ "genecover_markers = GeneCover(num_marker=100, corr_mat=corr_mat, w=np.ones(corr_mat.shape[1]), solver = \"SCIP\")\n", "print(\"GeneCover markers via SCIP: \\n\", orig_gene[genecover_markers])" ] }, { "cell_type": "markdown", "id": "c5eb6c39", "metadata": {}, "source": [ "### GeneCover via Greedy Heuristics" ] }, { "cell_type": "code", "execution_count": 8, "id": "4ccccc69", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Best Gap: 0\n", "Best Epsilon: 0.136181640625\n", "GeneCover markers: \n", " ['SNAP25' 'ERMN' 'ENC1' 'VSNL1' 'MAG' 'SCGB2A2' 'ATP1B1' 'RTN1' 'PTGDS'\n", " 'OLFM1' 'MOG' 'MBP' 'FABP4' 'IGKC' 'CST3' 'CARNS1' 'GAD1' 'TMEM144'\n", " 'KRT19' 'MT-CO1' 'NME7' 'UCHL1' 'PPP1R14A' 'CHN1' 'CLDN11' 'HBB' 'HSPA2'\n", " 'CALB2' 'EDIL3' 'YWHAG' 'GJB1' 'NEFL' 'ENPP2' 'KRT8' 'RNASE1' 'SLC5A11'\n", " 'GFAP' 'YWHAH' 'SYNC' 'RPL5' 'ATP1A1' 'HPCAL1' 'FAM84A' 'PPP3R1' 'TMSB10'\n", " 'MOBP' 'LDB2' 'LIMCH1' 'SPP1' 'HHIP' 'SV2C' 'SLC12A2' 'CXCL14' 'TUBB2A'\n", " 'ACTB' 'RAPGEF5' 'AQP1' 'PHKG1' 'SEMA3E' 'CHRDL1' 'LAMP2' 'NEFM' 'CALB1'\n", " 'DIRAS2' 'SLC44A1' 'STXBP1' 'CERCAM' 'ABCA2' 'FOLH1' 'FTH1' 'THY1'\n", " 'HSPA8' 'NRGN' 'NKX6-2' 'VAMP1' 'SYT1' 'CABP1' 'AACS' 'HTR2A' 'SLAIN1'\n", " 'HS6ST3' 'GLDN' 'IQCK' 'GPRC5B' 'AC009133.1' 'PLLP' 'NDRG4' 'RPL26'\n", " 'LRRC75A' 'KRT17' 'NSF' 'ANKRD40' 'SEPT4' 'AATK' 'CHGB' 'LAMP5' 'NEFH'\n", " 'OLIG1' 'S100B' 'MT-CO3']\n" ] } ], "source": [ "#obtain 100 marker genes via greedy heuristics\n", "genecover_markers_greedy = GeneCover(num_marker=100, corr_mat=corr_mat, w=np.ones(corr_mat.shape[1]), solver = \"Greedy\")\n", "print(\"GeneCover markers: \\n\", orig_gene[genecover_markers_greedy])" ] }, { "cell_type": "markdown", "id": "8feb09d9", "metadata": {}, "source": [ "### Iterative GeneCover via Combinatorial Optimization (Gurobi Solver)" ] }, 
{ "cell_type": "code", "execution_count": 9, "id": "7a2d338e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Iteration 1\n", "Best Gap: 0\n", "Best Epsilon: 0.1311767578125\n", "Iteration 2\n", "Best Gap: 0\n", "Best Epsilon: 0.1367919921875\n" ] } ], "source": [ "# Obtain 200 marker genes via Iterative GeneCover with an incremental size of 100 and two iterations\n", "genecover_markers_iterative = Iterative_GeneCover(incremental_sizes=[100,100], corr_mat=corr_mat, w=np.ones(corr_mat.shape[1]), solver = \"Gurobi\")" ] }, { "cell_type": "code", "execution_count": 10, "id": "a6806880", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Iterative GeneCover markers: \n", " [['MARCKSL1' 'SYNC' 'ATP1A1' 'ATP1B1' 'NME7' 'CNTN2' 'HPCAL1' 'VSNL1'\n", " 'MDH1' 'PPP3R1' 'CTNNA2' 'TMSB10' 'IGKC' 'MAL' 'ERMN' 'LRP2' 'GAD1'\n", " 'CHN1' 'MOBP' 'CCK' 'AC106707.1' 'CLDN11' 'FAM131A' 'LDB2' 'UCHL1'\n", " 'SPP1' 'BBS7' 'HHIP' 'TMEM144' 'GPM6A' 'SLC1A3' 'ENC1' 'EDIL3' 'CXCL14'\n", " 'GABRB2' 'MOG' 'ACTB' 'NDUFA4' 'RAPGEF5' 'AQP1' 'PHKG1' 'YWHAG'\n", " 'SLC26A4-AS1' 'GJB1' 'NAP1L2' 'NEFM' 'NEFL' 'STMN2' 'CALB1' 'ENPP2'\n", " 'DIRAS2' 'CERCAM' 'OLFM1' 'PTGDS' 'SAA1' 'FOLH1' 'MYRF' 'SCGB2A2'\n", " 'CARNS1' 'NRGN' 'NKX6-2' 'NELL2' 'KRT8' 'KCNC2' 'SYT1' 'CUX2' 'CABP1'\n", " 'AACS' 'RTN1' 'HSPA2' 'PPP4R4' 'GLDN' 'HBA2' 'SLC5A11' 'AC009133.1'\n", " 'PLLP' 'NDRG4' 'CALB2' 'KRT19' 'CNP' 'GFAP' 'NSF' 'ANKRD40' 'AATK'\n", " 'MBP' 'CHGB' 'SNAP25' 'CST3' 'BCAS1' 'NDUFA7' 'PPP1R14A' 'CALM3'\n", " 'RSPH14' 'NEFH' 'YWHAH' 'C21orf91' 'OLIG1' 'PCP4' 'MT-CO1' 'MT-CO2']\n", " ['CLSTN1' 'HPCA' 'LMO4' 'KCNA2' 'MEF2D' 'CADM3' 'RGS4' 'FAM84A'\n", " 'SLC30A3' 'R3HDM1' 'ATP5MC3' 'EPHA4' 'ATP2B2' 'SYN2' 'CLDND1' 'TAGLN3'\n", " 'TF' 'RAB6B' 'SERPINI1' 'PCDH7' 'HOPX' 'MAPK10' 'PPP3CA' 'UGT8' 'BASP1'\n", " 'SELENOP' 'CARTPT' 'MAP1B' 'NREP' 'ANXA6' 'GABRA1' 'SNCB' 'TUBB2A'\n", " 'NRN1' 'CAP2' 'TSPYL1' 'DGKB' 'AC018647.1' 'VSTM2A' 'TSPAN7' 
'BEX1'\n", " 'PLP1' 'FABP4' 'PDP1' 'GABBR2' 'SLC44A1' 'STXBP1' 'AL359091.1' 'ABCA2'\n", " 'SLC1A2' 'FTH1' 'SCGB1D2' 'RTN3' 'C11orf87' 'CRYAB' 'THY1' 'HSPA8'\n", " 'HEPACAM' 'SNCG' 'LIPA' 'LHPP' 'VAMP1' 'TUBA1B' 'TUBA1A' 'ATP5F1B'\n", " 'FAM19A2' 'ATP2B1' 'HTR2A' 'PCDH8' 'HS6ST3' 'RNASE1' 'CALM1' 'ITPKA'\n", " 'B2M' 'NPTN' 'CRYM' 'CACNG3' 'MT3' 'FA2H' 'CAMKK1' 'LRRC75A' 'KRT17'\n", " 'SEPT4' 'AQP4' 'CNDP1' 'FKBP1A' 'LAMP5' 'VSTM2L' 'SLC12A5' 'CBLN4'\n", " 'EEF1A2' 'AC005944.1' 'AC092069.1' 'HAPLN4' 'SCN1B' 'MAG' 'SLC17A7'\n", " 'IGLC2' 'SULT4A1' 'MT-CO3']]\n" ] } ], "source": [ "print(\"Iterative GeneCover markers: \\n\", orig_gene[genecover_markers_iterative])" ] }, { "cell_type": "markdown", "id": "53da29f7", "metadata": {}, "source": [ "### Iterative GeneCover via Combinatorial Optimization (SCIP Solver)" ] }, { "cell_type": "code", "execution_count": 11, "id": "5a93d5a5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Iteration 1\n", "Best Gap: 0\n", "Best Epsilon: 0.12958984375\n", "Iteration 2\n", "Best Gap: 0\n", "Best Epsilon: 0.137158203125\n" ] } ], "source": [ "# Obtain 200 marker genes via Iterative GeneCover with an incremental size of 100 and two iterations\n", "genecover_markers_iterative = Iterative_GeneCover(incremental_sizes=[100,100], corr_mat=corr_mat, w=np.ones(corr_mat.shape[1]), solver = \"SCIP\")" ] }, { "cell_type": "code", "execution_count": 12, "id": "50fc7d17", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Iterative GeneCover markers: \n", " [['PADI2' 'SYNC' 'WDR47' 'ATP1A1' 'ATP1B1' 'NME7' 'CNTN2' 'VSNL1' 'PREPL'\n", " 'MDH1' 'PPP3R1' 'TMSB10' 'IGKC' 'MAL' 'ERMN' 'LRP2' 'GAD1' 'CHN1' 'CCK'\n", " 'CLDN11' 'LDB2' 'SPP1' 'HHIP' 'TMEM144' 'GPM6A' 'SLC1A3' 'ENC1' 'EDIL3'\n", " 'RHOBTB3' 'CXCL14' 'GABRB2' 'MOG' 'TPBG' 'ACTB' 'NDUFA4' 'RAPGEF5'\n", " 'AQP1' 'YWHAG' 'SLC26A4-AS1' 'CD99' 'GJB1' 'NAP1L2' 'LAMP2' 'NEFM'\n", " 'NEFL' 'STMN2' 'CALB1' 'ENPP2' 'CERCAM' 'OLFM1' 
'PTGDS' 'SAA1' 'FOLH1'\n", " 'MYRF' 'FTH1' 'SCGB2A2' 'PLA2G16' 'RTN3' 'CARNS1' 'HSPA8' 'NRGN'\n", " 'NKX6-2' 'TUBA1B' 'KRT8' 'KCNC2' 'SYT1' 'CUX2' 'CABP1' 'AACS' 'RTN1'\n", " 'HSPA2' 'PPP4R4' 'GLDN' 'NPTN' 'HBA1' 'CRYM' 'SLC5A11' 'AC009133.1'\n", " 'PLLP' 'NDRG4' 'CALB2' 'KRT19' 'KRT17' 'GFAP' 'NSF' 'ANKRD40' 'AATK'\n", " 'MBP' 'SNAP25' 'CST3' 'MAG' 'PPP1R14A' 'CALM3' 'RSPH14' 'NEFH' 'YWHAH'\n", " 'OLIG1' 'PCP4' 'MT-CO1' 'MT-CO2']\n", " ['CLSTN1' 'HPCA' 'LMO4' 'KCNA2' 'CADM3' 'RGS4' 'HPCAL1' 'FAM84A'\n", " 'SLC30A3' 'CALM2' 'R3HDM1' 'ATP5MC3' 'EPHA4' 'ATP2B2' 'SYN2' 'MOBP'\n", " 'CLDND1' 'TAGLN3' 'TF' 'RAB6B' 'SERPINI1' 'FAM131A' 'PCDH7' 'UCHL1'\n", " 'HOPX' 'PPP3CA' 'UGT8' 'BASP1' 'CARTPT' 'MAP1B' 'NREP' 'ANXA6' 'GABRA1'\n", " 'SNCB' 'TUBB2A' 'NRN1' 'TSPYL1' 'PRKAR1B' 'DGKB' 'ANLN' 'VSTM2A'\n", " 'PHKG1' 'TSPAN7' 'PLP1' 'CLU' 'FABP4' 'PDP1' 'DIRAS2' 'GABBR2'\n", " 'SLC44A1' 'SLC31A2' 'STXBP1' 'AL359091.1' 'ABCA2' 'SCGB1D2' 'C11orf87'\n", " 'CRYAB' 'THY1' 'HEPACAM' 'SNCG' 'LIPA' 'GOT1' 'SCD' 'LHPP' 'VAMP1'\n", " 'FAM19A2' 'ATP2B1' 'DCLK1' 'HTR2A' 'PCDH8' 'HS6ST3' 'RNASE1' 'CALM1'\n", " 'CHGA' 'ITPKA' 'B2M' 'CACNG3' 'MT3' 'FA2H' 'CAMKK1' 'LRRC75A' 'CNP'\n", " 'TTYH2' 'AQP4' 'CNDP1' 'CHGB' 'SLC12A5' 'CBLN4' 'AC005944.1' 'ABHD8'\n", " 'RAB3A' 'TMEM59L' 'HAPLN4' 'SCN1B' 'ATP1A3' 'SLC17A7' 'IGLC2' 'SULT4A1'\n", " 'S100B' 'MT-CO3']]\n" ] } ], "source": [ "print(\"Iterative GeneCover markers: \\n\", orig_gene[genecover_markers_iterative])" ] }, { "cell_type": "markdown", "id": "693d7e6a", "metadata": {}, "source": [ "### Iterative GeneCover via Greedy Heuristics" ] }, { "cell_type": "code", "execution_count": 13, "id": "89bc7372", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Iteration 1\n", "Best Gap: 0\n", "Best Epsilon: 0.136181640625\n", "Iteration 2\n", "Best Gap: 0\n", "Best Epsilon: 0.14765625\n" ] } ], "source": [ "# Obtain 200 marker genes via Iterative GeneCover (Greedy Heuristics) with an incremental size of 100 and two 
iterations\n", "genecover_markers_iterative = Iterative_GeneCover(incremental_sizes=[100,100], corr_mat=corr_mat, w=np.ones(corr_mat.shape[1]), solver= \"Greedy\")" ] }, { "cell_type": "code", "execution_count": 14, "id": "b30fd59b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Iterative GeneCover markers: \n", " [['SNAP25' 'ERMN' 'ENC1' 'VSNL1' 'MAG' 'SCGB2A2' 'ATP1B1' 'RTN1' 'PTGDS'\n", " 'OLFM1' 'MOG' 'MBP' 'FABP4' 'IGKC' 'CST3' 'CARNS1' 'GAD1' 'TMEM144'\n", " 'KRT19' 'MT-CO1' 'NME7' 'UCHL1' 'PPP1R14A' 'CHN1' 'CLDN11' 'HBB'\n", " 'HSPA2' 'CALB2' 'EDIL3' 'YWHAG' 'GJB1' 'NEFL' 'ENPP2' 'KRT8' 'RNASE1'\n", " 'SLC5A11' 'GFAP' 'YWHAH' 'SYNC' 'RPL5' 'ATP1A1' 'HPCAL1' 'FAM84A'\n", " 'PPP3R1' 'TMSB10' 'MOBP' 'LDB2' 'LIMCH1' 'SPP1' 'HHIP' 'SV2C' 'SLC12A2'\n", " 'CXCL14' 'TUBB2A' 'ACTB' 'RAPGEF5' 'AQP1' 'PHKG1' 'SEMA3E' 'CHRDL1'\n", " 'LAMP2' 'NEFM' 'CALB1' 'DIRAS2' 'SLC44A1' 'STXBP1' 'CERCAM' 'ABCA2'\n", " 'FOLH1' 'FTH1' 'THY1' 'HSPA8' 'NRGN' 'NKX6-2' 'VAMP1' 'SYT1' 'CABP1'\n", " 'AACS' 'HTR2A' 'SLAIN1' 'HS6ST3' 'GLDN' 'IQCK' 'GPRC5B' 'AC009133.1'\n", " 'PLLP' 'NDRG4' 'RPL26' 'LRRC75A' 'KRT17' 'NSF' 'ANKRD40' 'SEPT4' 'AATK'\n", " 'CHGB' 'LAMP5' 'NEFH' 'OLIG1' 'S100B' 'MT-CO3']\n", " ['TF' 'CCK' 'STMN2' 'NDUFA4' 'MDH1' 'PLP1' 'GPM6A' 'TUBA1B' 'HOPX'\n", " 'SCGB1D2' 'MAL' 'NPTN' 'AC005944.1' 'MT-ND1' 'SAA1' 'CNP' 'RTN3'\n", " 'IGLC2' 'SNCB' 'SLC1A2' 'CALM3' 'C11orf87' 'CRYAB' 'CNDP1' 'PCP4'\n", " 'LMO4' 'CLDND1' 'GABRA1' 'TUBA1A' 'PLEKHH1' 'KLK6' 'ELOVL1' 'RGS4'\n", " 'BASP1' 'RPL37' 'CARTPT' 'MAP1B' 'GABRB2' 'HLA-B' 'BEX1' 'LINC00844'\n", " 'KCNC2' 'ATP2B1' 'MT3' 'HPCA' 'TMEM125' 'SV2A' 'CNTN2' 'SLC30A3'\n", " 'CALM2' 'R3HDM1' 'ATP2B2' 'ARPP21' 'AC106707.1' 'SERPINI1' 'JCHAIN'\n", " 'PARM1' 'G3BP2' 'PRDM8' 'PPP3CA' 'UGT8' 'CCDC152' 'SELENOP' 'HCN1'\n", " 'HSP90AB1' 'PRKAR1B' 'ANLN' 'NACAD' 'GPR37' 'CLEC2L' 'TSPAN7' 'SYP'\n", " 'NAP1L2' 'TCEAL6' 'PDP1' 'GABBR2' 'DNM1' 'MYRF' 'HEPACAM' 'FAM107B'\n", " 'PIP4K2A' 'FRMPD2' 'HK1' 'OPALIN' 
'GRIN2B' 'DCLK1' 'PCDH8' 'IGHA2'\n", " 'IGHA1' 'ITPKA' 'ARPP19' 'SV2B' 'FA2H' 'CA10' 'BCAS1' 'EEF1A2' 'HAPLN4'\n", " 'FTL' 'SLC17A7' 'SYNJ1']]\n" ] } ], "source": [ "print(\"Iterative GeneCover markers: \\n\", orig_gene[genecover_markers_iterative])" ] }, { "cell_type": "markdown", "id": "c1d6020b", "metadata": {}, "source": [ "# Marker Gene Selection Across Samples" ] }, { "cell_type": "markdown", "id": "6962a1a6", "metadata": {}, "source": [ "### Load the datasets (Sample #151507, #151669, #151673)" ] }, { "cell_type": "code", "execution_count": 15, "id": "53d4db40", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\An Wang\\.conda\\envs\\vae-spatial\\lib\\site-packages\\anndata\\_core\\anndata.py:1758: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.\n", " utils.warn_names_duplicates(\"var\")\n", "c:\\Users\\An Wang\\.conda\\envs\\vae-spatial\\lib\\site-packages\\anndata\\_core\\anndata.py:1758: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.\n", " utils.warn_names_duplicates(\"var\")\n", "c:\\Users\\An Wang\\.conda\\envs\\vae-spatial\\lib\\site-packages\\anndata\\_core\\anndata.py:1758: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.\n", " utils.warn_names_duplicates(\"var\")\n", "c:\\Users\\An Wang\\.conda\\envs\\vae-spatial\\lib\\site-packages\\anndata\\_core\\anndata.py:1758: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.\n", " utils.warn_names_duplicates(\"var\")\n", "c:\\Users\\An Wang\\.conda\\envs\\vae-spatial\\lib\\site-packages\\anndata\\_core\\anndata.py:1758: UserWarning: Variable names are not unique.
To make them unique, call `.var_names_make_unique`.\n", " utils.warn_names_duplicates(\"var\")\n", "c:\\Users\\An Wang\\.conda\\envs\\vae-spatial\\lib\\site-packages\\anndata\\_core\\anndata.py:1758: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.\n", " utils.warn_names_duplicates(\"var\")\n" ] } ], "source": [ "Adata = {}\n", "# Build the path from components so it resolves on any OS, not only with Windows backslashes.\n", "data_dir = os.path.join(\"..\", \"data\", \"DLPFC Full Samples\")\n", "# os.listdir() returns names in arbitrary order, so sort before splitting the listing by position.\n", "# NOTE(review): assumes the three count .h5 files sort ahead of the annotation files and that both\n", "# groups list the samples in the same order -- confirm against the directory contents.\n", "file_names = sorted(os.listdir(data_dir))\n", "across_samples_file_names = file_names[:3]\n", "annotation_names = file_names[3:]\n", "# The dictionary key for a sample is the part of its count-file name before the first \"_\".\n", "sample_ids = [file.split(\"_\")[0] for file in across_samples_file_names]\n", "\n", "for j, file in enumerate(across_samples_file_names):\n", "    # Loop-local name, so the single-sample `adata` loaded earlier in the notebook is not overwritten.\n", "    sample_adata = sc.read_10x_h5(os.path.join(data_dir, file))\n", "    sample_adata.var_names_make_unique()\n", "    layer_annotation = pd.read_csv(os.path.join(data_dir, annotation_names[j]), index_col=0)\n", "    # The annotation table replaces .obs wholesale, so its index must match the barcodes exactly.\n", "    assert np.array_equal(sample_adata.obs.index.values, layer_annotation.index.values), f\"barcode mismatch between {file} and {annotation_names[j]}\"\n", "    sample_adata.obs = layer_annotation\n", "    sc.pp.filter_genes(sample_adata, min_cells=100)\n", "    sc.pp.normalize_total(sample_adata, target_sum=1e4)\n", "    sc.pp.log1p(sample_adata)\n", "    sc.pp.highly_variable_genes(sample_adata, n_top_genes=10000)\n", "    # Copy the subset so the dictionary holds a real AnnData rather than a view.\n", "    Adata[sample_ids[j]] = sample_adata[:, sample_adata.var.highly_variable].copy()\n", "\n", "# Keep only the genes retained as highly variable in every sample.\n", "genes_across_samples = Adata[sample_ids[0]].var_names.values\n", "for sample_id in sample_ids[1:]:\n", "    genes_across_samples = np.intersect1d(genes_across_samples, Adata[sample_id].var_names.values)\n", "\n", "# Restrict every sample to the shared genes, in the same column order.\n", "for sample_id in sample_ids:\n", "    Adata[sample_id] = Adata[sample_id][:, genes_across_samples].copy()" ] }, { "cell_type": "markdown", "id": "55bfaa59", "metadata": {}, "source": [ "### Compute Correlation Matrices for Every Sample" ] }, { "cell_type": "code", "execution_count": 16, "id": "a64eb0ca", "metadata": {}, "outputs": [], "source": [ "corr_mat_combine_across_samples = gene_gene_correlation([Adata[key.split(\"_\")[0]].X.toarray() for key in across_samples_file_names])" ] }, {
"cell_type": "markdown", "id": "ede8996c", "metadata": {}, "source": [ "### GeneCover Marker Selection via Combinatorial Optimization Across Samples (Gurobi Solver)" ] }, { "cell_type": "code", "execution_count": 17, "id": "06e6a76b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Best Gap: 0\n", "Best Epsilon: 0.10332946777343749\n", "GeneCover markers: \n", " ['AC005944.1' 'AC009133.1' 'AC092069.1' 'ACTB' 'ALDOA' 'AP3B2' 'ATP1A1'\n", " 'ATP1B1' 'ATP6V1B2' 'B2M' 'BASP1' 'CARTPT' 'CCK' 'CHN1' 'CLSTN2' 'CLSTN3'\n", " 'CPLX1' 'CRYM' 'CST3' 'DIRAS2' 'DYNC1I1' 'ENC1' 'EPHA4' 'FABP4' 'G0S2'\n", " 'GABRA1' 'GABRG2' 'GAD1' 'GAP43' 'GFAP' 'GPM6A' 'HBB' 'HOPX' 'IGHA1'\n", " 'IGLC2' 'KLK6' 'KRT17' 'KRT19' 'KRT8' 'LMO4' 'MAGEE1' 'MAP1B' 'MBP'\n", " 'MDH1' 'MICAL2' 'MOAP1' 'MOBP' 'MT-ND2' 'MT3' 'NDRG4' 'NEFH' 'NEFL'\n", " 'NEFM' 'NELL2' 'NME7' 'NRGN' 'NSF' 'OAT' 'OLFM1' 'PAK1' 'PDP1' 'PHKG1'\n", " 'PLIN1' 'PLP1' 'PPP3R1' 'RAB3A' 'REEP2' 'RPLP1' 'RTN1' 'RTN3' 'RTN4'\n", " 'SCGB2A2' 'SCN1A' 'SH3BGRL2' 'SLC17A7' 'SLC1A2' 'SLC24A2' 'SLC39A10'\n", " 'SNAP25' 'SNCB' 'SPP1' 'STMN1' 'STMN2' 'SYNC' 'SYNGR1' 'SYT1' 'SYT4'\n", " 'TFF1' 'TMEM38A' 'TMEM59L' 'TMSB10' 'TUBA1A' 'TUBA1B' 'TUBB2A' 'UCHL1'\n", " 'VPS35' 'VSNL1' 'WDR47' 'WDR7' 'YWHAG']\n" ] } ], "source": [ "# Obtain 100 marker genes across samples\n", "genecover_marker_across_samples = GeneCover(num_marker=100, corr_mat = corr_mat_combine_across_samples, w = np.ones(corr_mat_combine_across_samples.shape[1]), solver = \"Gurobi\")\n", "print(\"GeneCover markers: \\n\", genes_across_samples[genecover_marker_across_samples])" ] }, { "cell_type": "markdown", "id": "2122dbdc", "metadata": {}, "source": [ "### GeneCover Marker Selection via Combinatorial Optimization Across Samples (SCIP Solver)" ] }, { "cell_type": "code", "execution_count": 28, "id": "1517aa2d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Best Gap: 0\n", "Best Epsilon: 0.10302734375\n", 
"GeneCover markers: \n", " ['AC005944.1' 'AC009133.1' 'AC092069.1' 'ACTB' 'ALDOA' 'AP3B2' 'APP'\n", " 'ATL1' 'ATP1A1' 'ATP1B1' 'BASP1' 'CARNS1' 'CCK' 'CHN1' 'CLSTN2' 'CLSTN3'\n", " 'CLU' 'CRYM' 'DYNC1I1' 'EIF1B' 'ENC1' 'EPHA4' 'FABP4' 'G0S2' 'GABRA1'\n", " 'GAD1' 'GAP43' 'GFAP' 'GPM6A' 'GPRC5B' 'HBB' 'HOPX' 'HPCAL1' 'HSP90AB1'\n", " 'IFI27' 'IGHA1' 'IGKC' 'KRT17' 'KRT19' 'KRT8' 'LDB2' 'LMO4' 'MAP1B' 'MBP'\n", " 'MDH1' 'MICAL2' 'MOAP1' 'MOBP' 'MT-ND2' 'MT3' 'NDRG4' 'NECAB1' 'NEFH'\n", " 'NEFL' 'NEFM' 'NELL2' 'NME7' 'NRGN' 'NRN1' 'NSF' 'OAT' 'OLFM1' 'PAK1'\n", " 'PDP1' 'PFKP' 'PHKG1' 'PLIN1' 'PLP1' 'RAB3A' 'RPLP1' 'RTN1' 'RTN3'\n", " 'SCGB2A2' 'SERPINF1' 'SH3BGRL2' 'SLC17A7' 'SLC1A2' 'SLC24A2' 'SNAP25'\n", " 'SNCA' 'SNCB' 'SPP1' 'STMN1' 'STMN2' 'STXBP1' 'SYNC' 'SYNGR1' 'SYT1'\n", " 'SYT4' 'TBR1' 'TFF1' 'TMEM38A' 'TMEM59L' 'TMSB10' 'TUBA1B' 'TUBB2A'\n", " 'UCHL1' 'VSNL1' 'WDR47' 'YWHAG']\n" ] } ], "source": [ "# Obtain 100 marker genes across samples\n", "genecover_marker_across_samples = GeneCover(num_marker=100, corr_mat = corr_mat_combine_across_samples, w = np.ones(corr_mat_combine_across_samples.shape[1]), lambdaMax=.2,solver = \"SCIP\")\n", "print(\"GeneCover markers: \\n\", genes_across_samples[genecover_marker_across_samples])" ] }, { "cell_type": "markdown", "id": "7e24f901", "metadata": {}, "source": [ "### GeneCover Marker Selection via Greedy Heuristics Across Samples" ] }, { "cell_type": "code", "execution_count": 19, "id": "cbb3bdb0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Best Gap: 0\n", "Best Epsilon: 0.11860351562499999\n", "GeneCover markers: \n", " ['SNAP25' 'PLP1' 'ENC1' 'SYT1' 'SCGB2A2' 'NRGN' 'VSNL1' 'CST3' 'ATP1B1'\n", " 'MOG' 'TUBA1B' 'RTN1' 'SAA1' 'TMSB10' 'NEFL' 'GFAP' 'ERMN' 'IGKC' 'UCHL1'\n", " 'KRT19' 'ACTB' 'MBP' 'CLDN11' 'MAP1B' 'OLFM1' 'PTGDS' 'MT-CO2' 'CARNS1'\n", " 'CHN1' 'CLSTN2' 'RTN3' 'STMN1' 'TUBB2A' 'HBA1' 'NDUFA4' 'PPP1R14A'\n", " 'SLC1A2' 'DIRAS2' 'AGT' 'ALDOA' 'ANXA6' 'ATP6V1C1' 
'ATP6V1D' 'BASP1'\n", " 'CDC37' 'CLDND1' 'CNDP1' 'CNTN2' 'COL1A2' 'COX6C' 'CREG2' 'CRYAB' 'DCLK1'\n", " 'DYNC1I1' 'EFHD2' 'EPDR1' 'ETS2' 'FAM3C' 'FKBP1A' 'FTL' 'GLS' 'GPM6A'\n", " 'HPRT1' 'HSPA2' 'HSPA8' 'HSPH1' 'KCNC2' 'KIFAP3' 'LAMP2' 'LDHA'\n", " 'LINC00844' 'LMO4' 'MAG' 'MAGED1' 'MAL' 'MAP3K12' 'MDH1' 'MGP' 'MMACHC'\n", " 'MT-ND2' 'NAP1L2' 'NAP1L5' 'NSF' 'OPALIN' 'PLIN1' 'PNMA8A' 'PRKAR1B'\n", " 'PRKCE' 'RAB3A' 'RAB6A' 'RGS4' 'RPS27' 'SCG5' 'SCGB1D2' 'SERPINI1'\n", " 'SLC1A3' 'SNCA' 'SNCG' 'SYN2' 'TFF1']\n" ] } ], "source": [ "# Obtain 100 marker genes across samples via greedy heuristics\n", "genecover_marker_across_samples_greedy = GeneCover(num_marker=100, corr_mat = corr_mat_combine_across_samples, w = np.ones(corr_mat_combine_across_samples.shape[1]), solver= \"Greedy\")\n", "print(\"GeneCover markers: \\n\", genes_across_samples[genecover_marker_across_samples_greedy])" ] }, { "cell_type": "markdown", "id": "a4ebc77a", "metadata": {}, "source": [ "### Iterative GeneCover via Combinatorial Optimization Across Samples (Gurobi Solver)" ] }, { "cell_type": "code", "execution_count": 22, "id": "492f655c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Iteration 1\n", "Best Gap: 0\n", "Best Epsilon: 0.10332946777343749\n", "Iteration 2\n", "Best Gap: 0\n", "Best Epsilon: 0.10310058593749999\n" ] } ], "source": [ "# Obtain 200 marker genes via Iterative GeneCover with an incremental size of 100 and two iterations\n", "genecover_marker_across_samples = Iterative_GeneCover(incremental_sizes=[100,100], corr_mat=corr_mat_combine_across_samples, w=np.ones(corr_mat_combine_across_samples.shape[1]), solver = \"Gurobi\")" ] }, { "cell_type": "code", "execution_count": 23, "id": "d1543b53", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Iterative GeneCover markers: \n", " [['AC005944.1' 'AC009133.1' 'AC092069.1' 'ACTB' 'ALDOA' 'AP3B2' 'ATP1A1'\n", " 'ATP1B1' 'ATP6V1B2' 'B2M' 'BASP1' 'CARTPT' 
'CCK' 'CHN1' 'CLSTN2'\n", " 'CLSTN3' 'CPLX1' 'CRYM' 'CST3' 'DIRAS2' 'DYNC1I1' 'ENC1' 'EPHA4'\n", " 'FABP4' 'G0S2' 'GABRA1' 'GABRG2' 'GAD1' 'GAP43' 'GFAP' 'GPM6A' 'HBB'\n", " 'HOPX' 'IGHA1' 'IGLC2' 'KLK6' 'KRT17' 'KRT19' 'KRT8' 'LMO4' 'MAGEE1'\n", " 'MAP1B' 'MBP' 'MDH1' 'MICAL2' 'MOAP1' 'MOBP' 'MT-ND2' 'MT3' 'NDRG4'\n", " 'NEFH' 'NEFL' 'NEFM' 'NELL2' 'NME7' 'NRGN' 'NSF' 'OAT' 'OLFM1' 'PAK1'\n", " 'PDP1' 'PHKG1' 'PLIN1' 'PLP1' 'PPP3R1' 'RAB3A' 'REEP2' 'RPLP1' 'RTN1'\n", " 'RTN3' 'RTN4' 'SCGB2A2' 'SCN1A' 'SH3BGRL2' 'SLC17A7' 'SLC1A2' 'SLC24A2'\n", " 'SLC39A10' 'SNAP25' 'SNCB' 'SPP1' 'STMN1' 'STMN2' 'SYNC' 'SYNGR1'\n", " 'SYT1' 'SYT4' 'TFF1' 'TMEM38A' 'TMEM59L' 'TMSB10' 'TUBA1A' 'TUBA1B'\n", " 'TUBB2A' 'UCHL1' 'VPS35' 'VSNL1' 'WDR47' 'WDR7' 'YWHAG']\n", " ['ADD2' 'AK5' 'ANXA6' 'APBA2' 'APP' 'ATP6AP2' 'ATRNL1' 'BEX1' 'CABP1'\n", " 'CACNB1' 'CALM3' 'CAMK2D' 'CAMK2N1' 'CHGA' 'CHGB' 'CLDN11' 'CLU' 'CMAS'\n", " 'CNP' 'CPB1' 'CREG2' 'CRYAB' 'DCLK1' 'DNM1' 'DNM3' 'EEF1A1' 'EFHD2'\n", " 'EIF4A2' 'EPDR1' 'ERMN' 'ETS2' 'FBXW7' 'G3BP2' 'GAPDH' 'GLS' 'GNB5'\n", " 'HLA-B' 'HSP90AA1' 'HSP90AB1' 'HSPA8' 'HSPH1' 'IGKC' 'KCNK1' 'KIFAP3'\n", " 'LRRC75A' 'MAPRE3' 'MFSD4A' 'MGP' 'MT-ND1' 'MUC1' 'NAP1L2' 'NDFIP1'\n", " 'NELL1' 'NMNAT2' 'NPTN' 'NREP' 'NUAK1' 'NUDT4' 'OXR1' 'PCP4' 'PFKP'\n", " 'PGK1' 'PIK3R1' 'PRKAR1A' 'PRKCE' 'PRPF19' 'PTGDS' 'PTPN5' 'RAB2A'\n", " 'RAB3C' 'REEP1' 'RGS4' 'RPS12' 'RTN4RL2' 'SAA1' 'SCG5' 'SCGB1D2'\n", " 'SCN2A' 'SERPINI1' 'SH3GL2' 'SLC1A3' 'SLC6A17' 'SNCG' 'SNX10' 'SPOCK2'\n", " 'SRPK2' 'STXBP1' 'SV2A' 'SV2B' 'SYP' 'TAGLN3' 'TBR1' 'TF' 'THY1'\n", " 'TOLLIP' 'TUBA4A' 'WASF1' 'WASL' 'YWHAB' 'YWHAH']]\n" ] } ], "source": [ "print(\"Iterative GeneCover markers: \\n\", genes_across_samples[genecover_marker_across_samples])" ] }, { "cell_type": "markdown", "id": "5c04759c", "metadata": {}, "source": [ "### Iterative GeneCover via Combinatorial Optimization Across Samples (SCIP Solver)" ] }, { "cell_type": "code", "execution_count": 30, "id": "b9a981bb", "metadata": 
{}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Iteration 1\n", "Best Gap: 0\n", "Best Epsilon: 0.10302734375\n", "Iteration 2\n", "Best Gap: 0\n", "Best Epsilon: 0.102752685546875\n" ] } ], "source": [ "# Obtain 200 marker genes via Iterative GeneCover with an incremental size of 100 and two iterations\n", "genecover_marker_across_samples = Iterative_GeneCover(incremental_sizes=[100,100], corr_mat=corr_mat_combine_across_samples, w=np.ones(corr_mat_combine_across_samples.shape[1]), lambdaMax=.2 ,solver = \"SCIP\")" ] }, { "cell_type": "code", "execution_count": 32, "id": "8cb41210", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Iterative GeneCover markers: \n", " [['AC005944.1' 'AC009133.1' 'AC092069.1' 'ACTB' 'ALDOA' 'AP3B2' 'APP'\n", " 'ATL1' 'ATP1A1' 'ATP1B1' 'BASP1' 'CARNS1' 'CCK' 'CHN1' 'CLSTN2'\n", " 'CLSTN3' 'CLU' 'CRYM' 'DYNC1I1' 'EIF1B' 'ENC1' 'EPHA4' 'FABP4' 'G0S2'\n", " 'GABRA1' 'GAD1' 'GAP43' 'GFAP' 'GPM6A' 'GPRC5B' 'HBB' 'HOPX' 'HPCAL1'\n", " 'HSP90AB1' 'IFI27' 'IGHA1' 'IGKC' 'KRT17' 'KRT19' 'KRT8' 'LDB2' 'LMO4'\n", " 'MAP1B' 'MBP' 'MDH1' 'MICAL2' 'MOAP1' 'MOBP' 'MT-ND2' 'MT3' 'NDRG4'\n", " 'NECAB1' 'NEFH' 'NEFL' 'NEFM' 'NELL2' 'NME7' 'NRGN' 'NRN1' 'NSF' 'OAT'\n", " 'OLFM1' 'PAK1' 'PDP1' 'PFKP' 'PHKG1' 'PLIN1' 'PLP1' 'RAB3A' 'RPLP1'\n", " 'RTN1' 'RTN3' 'SCGB2A2' 'SERPINF1' 'SH3BGRL2' 'SLC17A7' 'SLC1A2'\n", " 'SLC24A2' 'SNAP25' 'SNCA' 'SNCB' 'SPP1' 'STMN1' 'STMN2' 'STXBP1' 'SYNC'\n", " 'SYNGR1' 'SYT1' 'SYT4' 'TBR1' 'TFF1' 'TMEM38A' 'TMEM59L' 'TMSB10'\n", " 'TUBA1B' 'TUBB2A' 'UCHL1' 'VSNL1' 'WDR47' 'YWHAG']\n", " ['AC011603.2' 'ADD2' 'AK5' 'APBA2' 'ATP1B2' 'ATP5F1A' 'ATP6AP2'\n", " 'ATP6V1B2' 'BEX1' 'CABP1' 'CACNB1' 'CALM3' 'CAMK2D' 'CAP2' 'CARTPT'\n", " 'CDKN2D' 'CHGA' 'CHGB' 'CLDN11' 'CNP' 'CREG2' 'CRYAB' 'CST3' 'CX3CL1'\n", " 'DCLK1' 'DNAJB6' 'DNAJC6' 'DNM1' 'DNM3' 'EFHD2' 'EHD3' 'EPDR1' 'ERMN'\n", " 'FBXW7' 'FKBP1A' 'G3BP2' 'GABRG2' 'GAPDH' 'GGCX' 'GLS' 'GOT1' 'HLA-B'\n", " 'HMGCS1' 
'HSP90AA1' 'HSPA8' 'IGLC2' 'KCNK1' 'KIFAP3' 'LRRC75A' 'MAPRE3'\n", " 'MAPT' 'MGP' 'MTRNR2L8' 'MUC1' 'NAP1L2' 'NCALD' 'NCOA7' 'NDFIP1'\n", " 'NMNAT2' 'NPTN' 'NUAK1' 'NUDT4' 'OXR1' 'PCP4' 'PGK1' 'PI4KA' 'PIK3R1'\n", " 'PNMA8A' 'PPP3R1' 'PREPL' 'PRKCE' 'PRNP' 'PRPF19' 'PTGDS' 'PTPN5'\n", " 'RAB15' 'RAB3C' 'REEP1' 'RPS12' 'RTN4' 'SAA1' 'SCGB1D2' 'SCN2A' 'SCN3B'\n", " 'SERPINI1' 'SH3GL2' 'SNX10' 'SV2A' 'SV2B' 'SYP' 'TAGLN3' 'TF' 'THY1'\n", " 'TOLLIP' 'TRIM37' 'TUBA4A' 'VPS35' 'WASF1' 'YWHAB' 'YWHAH']]\n" ] } ], "source": [ "print(\"Iterative GeneCover markers: \\n\", genes_across_samples[genecover_marker_across_samples])" ] }, { "cell_type": "markdown", "id": "daee13da", "metadata": {}, "source": [ "### Iterative GeneCover via Greedy Heuristics Across Samples" ] }, { "cell_type": "code", "execution_count": null, "id": "79614e93", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Iteration 1\n", "Best Gap: 0\n", "Best Epsilon: 0.1185546875\n", "Iteration 2\n", "Best Gap: 0\n", "Best Epsilon: 0.11213867187500001\n" ] } ], "source": [ "# Obtain 200 marker genes across samples via Iterative GeneCover (Greedy Heuristics) with an incremental size of 100 and two iterations\n", "# Store the result under the across-samples name only: `genecover_markers_iterative` holds single-sample\n", "# indices into `orig_gene` and must not be overwritten with indices into `genes_across_samples`.\n", "genecover_marker_across_samples_greedy = Iterative_GeneCover(incremental_sizes=[100,100], corr_mat=corr_mat_combine_across_samples, w=np.ones(corr_mat_combine_across_samples.shape[1]), lambdaMin=.08, lambdaMax=.2, solver=\"Greedy\")" ] }, { "cell_type": "code", "execution_count": null, "id": "73ae49b9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Iterative GeneCover markers: \n", " [['SNAP25' 'PLP1' 'ENC1' 'SYT1' 'SCGB2A2' 'NRGN' 'VSNL1' 'CST3' 'ATP1B1'\n", " 'MOG' 'TUBA1B' 'RTN1' 'SAA1' 'TMSB10' 'NEFL' 'GFAP' 'ERMN' 'IGKC'\n", " 'UCHL1' 'KRT19' 'ACTB' 'MBP' 'CLDN11' 'MAP1B' 'OLFM1' 'PTGDS' 'MT-CO2'\n", " 'CARNS1' 'CHN1' 'CLSTN2' 'RTN3' 'STMN1' 'TUBB2A' 'HBA1' 'NDUFA4'\n", " 'PPP1R14A' 'SLC1A2' 'DIRAS2' 'AGT' 'ALDOA' 'ANXA6'
'ATP6V1C1' 'ATP6V1D'\n", " 'BASP1' 'CDC37' 'CLDND1' 'CNDP1' 'CNTN2' 'COL1A2' 'COX6C' 'CREG2'\n", " 'DCLK1' 'DYNC1I1' 'EFHD2' 'EPDR1' 'ETS2' 'FAM3C' 'FKBP1A' 'FTL' 'GLS'\n", " 'GPM6A' 'HPRT1' 'HSPA2' 'HSPA8' 'HSPH1' 'KCNC2' 'KIFAP3' 'KRT18'\n", " 'LAMP2' 'LDHA' 'LINC00844' 'LMO4' 'MAG' 'MAGED1' 'MAL' 'MAP3K12' 'MDH1'\n", " 'MGP' 'MMACHC' 'MT-ND2' 'NAP1L2' 'NAP1L5' 'NSF' 'OPALIN' 'PLIN1'\n", " 'PNMA8A' 'PRKAR1B' 'PRKCE' 'RAB3A' 'RAB6A' 'RGS4' 'RPS27' 'SCG5'\n", " 'SCGB1D2' 'SERPINI1' 'SLC1A3' 'SNCA' 'SNCG' 'SYN2' 'TFF1']\n", " ['YWHAH' 'TF' 'STMN2' 'YWHAG' 'SLC24A2' 'CCK' 'MT3' 'MUC1' 'NEFM' 'NME7'\n", " 'FABP4' 'MOBP' 'HOPX' 'ATP6V1B2' 'IGLC2' 'PCP4' 'SLC17A7' 'PFKP' 'TBR1'\n", " 'CAP2' 'APP' 'GABRA1' 'NKX6-2' 'SYP' 'TAGLN3' 'SH3GL2' 'GAP43' 'CRYM'\n", " 'AC005944.1' 'CNP' 'THY1' 'MOAP1' 'NELL1' 'SYT4' 'NAPB' 'SLC30A3'\n", " 'OXR1' 'SPP1' 'STXBP1' 'GABRG2' 'MICAL2' 'NDRG4' 'CHL1' 'ENPP2' 'FABP3'\n", " 'IGHA1' 'ITFG1' 'KRT8' 'MT-CO1' 'PAK1' 'SCN2A' 'SSTR2' 'TUBA4A' 'CHGB'\n", " 'CLU' 'FAM49A' 'HSP90AA1' 'LRRC75A' 'NELL2' 'NUAK1' 'PGM2L1'\n", " 'AC011603.3' 'ACOT13' 'ACOT7' 'APBA2' 'APOE' 'ATL1' 'ATP1B2' 'ATP6V1A'\n", " 'BNIP3' 'CPLX1' 'CRYAB' 'CTNNB1' 'EIF4A2' 'ENO2' 'GHITM' 'GLUL'\n", " 'IPCEF1' 'KIF21A' 'KLC1' 'MAP2K1' 'MLLT11' 'MT-ND1' 'MT-ND3' 'MYRF'\n", " 'PCLO' 'PCSK2' 'PEG3' 'PIK3R1' 'S100A11' 'SEPT4' 'SLC6A17' 'STX1A'\n", " 'SYNGR1' 'SYNGR3' 'TMEM144' 'TMEM14A' 'TOLLIP' 'TUBB' 'VAMP1']]\n" ] } ], "source": [ "print(\"Iterative GeneCover markers: \\n\", genes_across_samples[genecover_marker_across_samples_greedy])" ] } ], "metadata": { "kernelspec": { "display_name": "vae-spatial", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.16" } }, "nbformat": 4, "nbformat_minor": 5 }