Category Archives: rna-seq

Running NBP-Iso framework on BEERS simulated data

This tutorial will explain how to run the full NBP-iso framework.
The major steps are:
1) Simulate reads sampled from novel splice forms using BEERS simulator.
2) Simulate multiple individuals on using the output of the BEERS simulator.
3) Run the NBP-iso model.
4) Visualization and analysis of the results.


######################## SIMULATE DATA ########################
# constants CHANGE THESE
STORAGE_DIR=/tigress/daguiar/npb-iso
SCRIPT_DIR=/home/daguiar/workspace/npb-iso/src/beers
SOURCE_DIR=/home/daguiar/workspace/npb-iso/src/pythoncode

# get reference data
cd $STORAGE_DIR
wget http://itmat.rum.s3.amazonaws.com/simulator_config_human.tar.gz
tar xzvf simulator_config_human.tar.gz
rm simulator_config_human.tar.gz

# get the BEERS simulator
cd $SCRIPT_DIR
wget http://www.cbil.upenn.edu/BEERS/beers.tar
tar xvf beers.tar
rm beers.tar

# make config files OR use the supplied simulator configuration files (chosen here)
# this process takes a LONG TIME so it's recommended to use the existing reference data
# mkdir $STORAGE_DIR/testgene_dir
# usage: make_config_files_for_subset_of_gene_ids.pl
# perl make_config_files_for_subset_of_gene_ids.pl testgene testgene $STORAGE_DIR $STORAGE_DIR/testgene_dir

mkdir $STORAGE_DIR/sim_out
# usage: perl reads_simulator.pl [options]
# simulate some data
perl reads_simulator.pl 100 sim1 -readlength 100 -numgenes 1 -nointron -nalt 6 -palt 0.4 -sn -mastercfgdir $STORAGE_DIR -outdir $STORAGE_DIR/sim1_out
# or simulate with idealistic parameters (this may fail due to random sampling
# of an non-expressed gene, just run it again)
mkdir $STORAGE_DIR/sim_ideal_out
perl reads_simulator.pl 100 sim_ideal -readlength 100 -numgenes 1 -error 0 -subfreq 0 -indelfreq 0 -nointron -tpercent 0 -tqual 1 -nalt 2 -palt 0.5 -sn -mastercfgdir $STORAGE_DIR -outdir $STORAGE_DIR/sim_ideal_out

######################## RUN NPB-ISO ########################
cd $SOURCE_DIR
# run the gene_11 simulated data
python VB/VarBayes.py --data-file ../../data/gene_11/small/data/GENE_11_LDA_sorted.npy --term2exon-file ../../data/gene_11/small/data/GENE_11_Exon_sorted.txt --output-dir ../../output/GENE_11/small/vb_output/ --N-iter 100

# run the ideal simulated data, we have to prepare
# the data for the format accepted by VarBayes.py
python simulate_reads.py --numsamps 1 --numreads 100 --resultDir $STORAGE_DIR/sim_ideal_out/ --stem sim_ideal --suffix temp
# extract individual, isoform number and position
awk '{print $1,$3,$6}' $STORAGE_DIR/sim_ideal_out/simulated_reads*.cig > $STORAGE_DIR/sim_ideal_out/ind_iso_pos.txt
# format the data to obtain a read dictionary and read term counts for each individual
# tricky bit here: nb-iso includes the actual gene, so in our case, 2 isoforms + 1 gene = 3
python formatSimdata.py --resultsDir $STORAGE_DIR/sim_ideal_out/ --nb-ind 1 --nb-iso 3 --stem sim_ideal --gene-name sim_ideal

python VB/VarBayes.py --data-file $STORAGE_DIR/sim_ideal_out/sim_ideal_LDA_sorted.npy --term2exon-file $STORAGE_DIR/sim_ideal_out/sim_ideal_Exon_sorted.txt --output-dir $STORAGE_DIR/sim_ideal_out/sim_ideal/output/ --N-iter 100

######################## VISUALIZE RESULTS ########################
# GENE_11 results
python VB/analysis.py --data-dir ../../data/gene_11/small/data/ --results-dir ../../output/GENE_11/small/vb_output/ --results-prefix GENE_11_ --sorted-reads simulated_reads_allind.cig