Code
import os, gzip, pathlib, urllib.request, pickle
import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis
DATA_DIR = pathlib.Path('gtex_data')
DATA_DIR.mkdir(exist_ok=True)This notebook documents every step needed to go from a fresh clone to the gene_expr.pkl and sample_meta.csv files used by the main figure. It is not run during CI — the repo ships a pre-built cache (gtex_data/figure1_cache.pkl.gz) that lets figure1.ipynb run without this data. Run this notebook only if you want full reproducibility from raw GTEx files.
Runtime: ~2–4 h (dominated by parsing the 1.5 GB gzipped TPM matrix). Disk: ~20 GB during parsing; ~2 GB retained.
The Genotype-Tissue Expression (GTEx) project collects RNA-seq from 54 post-mortem tissues across hundreds of donors. We use version 8 (838 donors, 17 382 samples).
Files we need: | File | Contents | Size (compressed) | |——|———-|——————-| | GTEx_Analysis_..._gene_tpm.gct.gz | Per-sample TPM for all genes | ~1.5 GB | | GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt | Sample ↔︎ tissue mapping | ~12 MB | | GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt | Donor sex, age | ~30 kB |
GTEX_BASE = 'https://storage.googleapis.com/adult-gtex'
FILES = {
'GTEx_gene_tpm.gct.gz':
f'{GTEX_BASE}/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz',
'GTEx_SampleAttributes.txt':
f'{GTEX_BASE}/annotations/v8/metadata-files/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt',
'GTEx_SubjectPhenotypes.txt':
f'{GTEX_BASE}/annotations/v8/metadata-files/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt',
}
for fname, url in FILES.items():
dst = DATA_DIR / fname
if dst.exists():
print(f' already present: {fname}')
else:
print(f' downloading {fname} …')
urllib.request.urlretrieve(url, dst)
print(f' done ({dst.stat().st_size / 1e9:.2f} GB)')gene_expr.pklWe stream through the gzipped GCT file line-by-line to avoid loading the full ~15 GB uncompressed matrix into memory. We convert TPM → log₂(TPM + 1) on the fly and store only the brain-region samples we need.
BRAIN_REGIONS = {
'Hypothalamus': 'Brain - Hypothalamus',
'Amygdala': 'Brain - Amygdala',
'Hippocampus': 'Brain - Hippocampus',
'Ant. Cing. Ctx': 'Brain - Anterior cingulate cortex (BA24)',
'Frontal Cortex': 'Brain - Frontal Cortex (BA9)',
'Cortex': 'Brain - Cortex',
'Caudate': 'Brain - Caudate (basal ganglia)',
'Putamen': 'Brain - Putamen (basal ganglia)',
'Nucleus Accumbens': 'Brain - Nucleus accumbens (basal ganglia)',
'Cerebellum': 'Brain - Cerebellum',
'Cerebellar Hemi.': 'Brain - Cerebellar Hemisphere',
'Substantia Nigra': 'Brain - Substantia nigra',
'Spinal Cord': 'Brain - Spinal cord (cervical c-1)',
}
brain_tissue_names = set(BRAIN_REGIONS.values())
# Genes we display in the figure
DISPLAY_GENES = [
'TPH2', 'CHRNA7', 'ESR1',
'TH', 'SLC6A3', 'DDC', 'AGRP',
'SST', 'PENK', 'GAD1', 'GAD2', 'CRH',
'DRD5', 'CHRM1', 'BDNF', 'CYP19A1',
'AIF1', 'MAOA', 'FKBP5', 'GFAP',
]
DISPLAY_GENES_SET = set(DISPLAY_GENES)# Build sample → tissue lookup (fast: small file)
attr = pd.read_csv(DATA_DIR / 'GTEx_SampleAttributes.txt', sep='\t',
usecols=['SAMPID', 'SMTSD'])
brain_samples = set(attr.loc[attr['SMTSD'].isin(brain_tissue_names), 'SAMPID'])
print(f'{len(brain_samples)} brain-region samples across {len(brain_tissue_names)} tissues')# Stream TPM file — takes ~30–60 min
TPM_GZ = DATA_DIR / 'GTEx_gene_tpm.gct.gz'
OUT_PKL = DATA_DIR / 'gene_expr.pkl'
gene_expr = {} # gene_name → {sample_id: log2(tpm+1)}
n_lines = 0
with gzip.open(TPM_GZ, 'rt') as fh:
fh.readline(); fh.readline() # skip GCT header lines
header = fh.readline().rstrip('\n').split('\t')
col_ids = header[2:] # sample IDs start at column 3
# Indices of brain samples in the column order
brain_idx = [(i, sid) for i, sid in enumerate(col_ids) if sid in brain_samples]
print(f'Columns to extract: {len(brain_idx)} / {len(col_ids)}')
for line in fh:
parts = line.rstrip('\n').split('\t')
gene = parts[1] # gene symbol in column 2
if gene not in DISPLAY_GENES_SET:
continue
tpm_row = parts[2:]
gene_expr[gene] = {
sid: float(np.log2(float(tpm_row[i]) + 1.0))
for i, sid in brain_idx
}
n_lines += 1
if len(gene_expr) == len(DISPLAY_GENES_SET):
break # found all genes — stop early
print(f'Parsed {len(gene_expr)} genes. Saving …')
with open(OUT_PKL, 'wb') as f:
pickle.dump(gene_expr, f, protocol=5)
print(f'Saved {OUT_PKL} ({OUT_PKL.stat().st_size / 1e6:.1f} MB)')sample_meta.csvpheno = pd.read_csv(DATA_DIR / 'GTEx_SubjectPhenotypes.txt', sep='\t')
sex_map = pheno.set_index('SUBJID')['SEX'].map({1: 'Male', 2: 'Female'})
attr2 = pd.read_csv(DATA_DIR / 'GTEx_SampleAttributes.txt', sep='\t',
usecols=['SAMPID', 'SMTSD'])
attr2['SUBJID'] = attr2['SAMPID'].str.extract(r'(GTEX-[^-]+)')
attr2['SEX'] = attr2['SUBJID'].map(sex_map)
region_rev = {v: k for k, v in BRAIN_REGIONS.items()}
brain_attr = attr2[attr2['SMTSD'].isin(brain_tissue_names) & attr2['SEX'].notna()].copy()
brain_attr['region'] = brain_attr['SMTSD'].map(region_rev)
sample_meta = brain_attr.set_index('SAMPID')[['region', 'SEX']].rename(columns={'SEX': 'sex'})
sample_meta.to_csv(DATA_DIR / 'sample_meta.csv')
print(f'sample_meta saved: {len(sample_meta)} samples')
print(sample_meta.groupby(['region', 'sex']).size().unstack())Verify that key genes are present and sample counts are reasonable before handing off to make_cache.ipynb.
Next step: open helpers/make_cache.ipynb to build the compact gtex_data/figure1_cache.pkl.gz that figure1.ipynb uses.