[user0001@boris ~]$ mkdir bin [user0001@boris ~]$ cd bin #################################################################################### ## Bowtie-while still in use, this is no longer the best resource-see Bowtie 2 below #################################################################################### [cavanr@boris bin]$ wget http://sourceforge.net/projects/bowtie-bio/files/bowtie/0.12.8/bowtie-0.12.8-linux-x86_64.zip [cavanr@boris bin]$ unzip bowtie-0.12.8-linux-x86_64.zip [cavanr@boris bin]$ cd bowtie-0.12.8 [cavanr@boris bowtie-0.12.8]$ ./bowtie e_coli reads/e_coli_1000.fq [cavanr@boris bowtie-0.12.8]$ ./bowtie e_coli reads/e_coli_1000.fq e_coli.map [cavanr@boris bowtie-0.12.8]$ wget ftp://ftp.cbcb.umd.edu/pub/data/bowtie_indexes/s_cerevisiae.ebwt.zip [cavanr@boris indexes]$ unzip s_cerevisiae.ebwt.zip [cavanr@boris bowtie-0.12.8]$ ./bowtie -c s_cerevisiae ATTGTAGTTCGAGTAAGTAATGTGGGTTTG [cavanr@boris bowtie-0.12.8]$ wget ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/Escherichia_coli_O157_H7_Sakai_uid57781/NC_002127.fna [cavanr@boris bowtie-0.12.8]$ ./bowtie-build NC_002127.fna e_coli_O157_H7 [cavanr@boris bowtie-0.12.8]$ ls e_coli_O157_H7.* [cavanr@boris bowtie-0.12.8]$ mv e_coli_O157_H7.* indexes [cavanr@boris bowtie-0.12.8]$ ./bowtie -c e_coli_O157_H7 GCGTGAGCTATGAGAAAGCGCCACGCTTCC [cavanr@boris ~]$ export PATH=$HOME/bin:$PATH [cavanr@boris bowtie-0.12.8]$ cp bowtie $HOME/bin/./bowtie [cavanr@boris bowtie-0.12.8]$ cp bowtie-build $HOME/bin/./bowtie-build [cavanr@boris bowtie-0.12.8]$ cp bowtie-inspect $HOME/bin/./bowtie-inspect [cavanr@boris ~]$ bowtie $HOME/bin/bowtie-0.12.8/indexes/e_coli $HOME/bin/bowtie-0.12.8/reads/e_coli_1000.fq [cavanr@boris bowtie-0.12.8]$ bowtie -S e_coli reads/e_coli_10000snp.fq e_snp.sam #################################################################### ## Bowtie 2 #################################################################### [user0001@boris bin]$ wget http://sourceforge.net/projects/bowtie-bio/files/bowtie2/2.3.3.1/bowtie2-2.3.3.1-linux-x86_64.zip [user0001@boris bin]$ unzip bowtie2-2.3.3.1-linux-x86_64.zip [user0001@boris bin]$ cd bowtie2-2.3.3.1-linux-x86_64 [user0001@boris bowtie2-2.3.3.1]$ echo $PATH [user0001@boris bowtie2-2.3.3.1]$ ./bowtie2-build example/reference/lambda_virus.fa lambda_virus [user0001@boris bowtie2-2.3.3.1]$ ./bowtie2 -x lambda_virus -U example/reads/reads_1.fq -S eg1.sam [user0001@boris bowtie2-2.3.3.1]$ head eg1.sam [user0001@boris bowtie2-2.3.3.1]$ cp bowtie2 $HOME/bin/bowtie2 [user0001@boris bowtie2-2.3.3.1]$ cp bowtie2-align-l $HOME/bin/bowtie2-align-l [user0001@boris bowtie2-2.3.3.1]$ cp bowtie2-align-s $HOME/bin/bowtie2-align-s [user0001@boris bowtie2-2.3.3.1]$ cp bowtie2-build $HOME/bin/bowtie2-build [user0001@boris bowtie2-2.3.3.1]$ cp bowtie2-build-l $HOME/bin/bowtie2-build-l [user0001@boris bowtie2-2.3.3.1]$ cp bowtie2-build-s $HOME/bin/bowtie2-build-s [user0001@boris bowtie2-2.3.3.1]$ cp bowtie2-inspect $HOME/bin/bowtie2-inspect [user0001@boris bowtie2-2.3.3.1]$ cp bowtie2-inspect-l $HOME/bin/bowtie2-inspect-l [user0001@boris bowtie2-2.3.3.1]$ cp bowtie2-inspect-s $HOME/bin/bowtie2-inspect-s [user0001@boris bowtie2-2.3.3.1]$ export BT2_HOME=/export/home/courses/ph7445/user0001/bin/bowtie2-2.3.3.1-linux-x86_64 [user0001@boris bowtie2-2.3.3.1]$ echo $BT2_HOME [user0001@boris bowtie2-2.3.3.1]$ cd [user0001@boris ~]$ bowtie2-build $BT2_HOME/example/reference/lambda_virus.fa lambda_virus [user0001@boris ~]$ bowtie2 -x lambda_virus -U $BT2_HOME/example/reads/reads_1.fq -S eg1.sam ################################################################################# ## SAMtools-no longer necessary for tophat but needed for Tuxedo 2.0 workflow ################################################################################# [user0001@boris ~]$ cd bin [user0001@boris bin]$ wget http://sourceforge.net/projects/samtools/files/samtools/1.6/samtools-1.6.tar.bz2 [user0001@boris bin]$ tar -jxvf samtools-1.6.tar.bz2 [user0001@boris bin]$ cd samtools-1.6 [user0001@boris samtools-1.6]$ make [user0001@boris samtools-1.6]$ cp samtools $HOME/bin/samtools [user0001@boris samtools-1.6]$ cd examples # index fa file [user0001@boris examples]$ samtools faidx ex1.fa # convert sam to bam [user0001@boris examples]$ samtools view -S -b -t ex1.fa.fai -o ex1.bam ex1.sam.gz # index the bam file [user0001@boris examples]$ samtools index ex1.bam # View a portion of the BAM file [user0001@boris examples]$ samtools view ex1.bam seq2:450-550 # Visually inspect the alignments at the same location [user0001@boris examples]$ samtools tview -p seq2:450 ex1.bam ex1.fa # hit q to leave # View the data in pileup format: [user0001@boris examples]$ samtools mpileup -f ex1.fa ex1.bam # Generate an uncompressed VCF file of variants [user0001@boris examples]$ samtools mpileup -vu -f ex1.fa ex1.bam > ex1.vcf # Generate a compressed VCF file of variants [user0001@boris examples]$ samtools mpileup -g -f ex1.fa ex1.bam > ex1.bcf ##################################################################### ## TopHat-first command searches site for address of file to download ##################################################################### ## deprecated [user0001@boris samtools-1.3.1]$ cd .. [user0001@boris bin]$ wget -o ~/output.txt -r -l 10 --spider http://ccb.jhu.edu/software [user0001@boris bin]$ wget http://ccb.jhu.edu/software/tophat/downloads/tophat-2.1.1.Linux_x86_64.tar.gz [user0001@boris bin]$ tar zxvf tophat-2.1.1.Linux_x86_64.tar.gz [user0001@boris bin]$ cd tophat-2.1.1.Linux_x86_64 [user0001@boris tophat-2.1.1.Linux_x86_64]$ cp tophat2 $HOME/bin/tophat2 [user0001@boris tophat-2.1.1.Linux_x86_64]$ cp * $HOME/bin/ [user0001@boris tophat-2.1.1.Linux_x86_64]$ wget http://ccb.jhu.edu/software/tophat/downloads/test_data.tar.gz [user0001@boris tophat-2.1.1.Linux_x86_64]$ tar zxvf test_data.tar.gz [user0001@boris tophat-2.1.1.Linux_x86_64]$ cd test_data [user0001@boris test_data]$ tophat2 -r 20 test_ref reads_1.fq reads_2.fq #################################################################### ## Cufflinks #################################################################### # deprecated [user0001@boris test_data]$ cd ..; cd .. [user0001@boris bin]$ wget http://cole-trapnell-lab.github.io/cufflinks/assets/downloads/cufflinks-2.2.1.Linux_x86_64.tar.gz [user0001@boris bin]$ tar zxvf cufflinks-2.2.1.Linux_x86_64.tar.gz [user0001@boris bin]$ cd cufflinks-2.2.1.Linux_x86_64 [user0001@boris cufflinks-2.2.1.Linux_x86_64]$ cp * $HOME/bin/ # need to download the file called test_data.sam from the course website #[user0001@boris cufflinks-2.2.1.Linux_x86_64]$ wget http://cufflinks.cbcb.umd.edu/downloads/test_data.sam [user0001@boris cufflinks-2.2.1.Linux_x86_64]$ wget http://www.biostat.umn.edu/~cavanr/test_data.sam [user0001@boris cufflinks-2.2.1.Linux_x86_64]$ cufflinks test_data.sam #################################################################### ## TopHat and Cufflinks using the fly data #################################################################### # deprecated [user0001@boris ~]$ mkdir RNAseq [user0001@boris ~]$ cd RNAseq [user0001@boris RNAseq]$ tophat2 -p 2 -G \ /export/home/courses/ph7445/data/Drosophila_melanogaster.BDGP5.68.gtf \ -o C1_R1_thout \ /export/home/courses/ph7445/data/Drosophila_melanogaster/Ensembl/BDGP5/Sequence/Bowtie2Index/genome \ /export/home/courses/ph7445/data/GSM794483_C1_R1_1.fq \ /export/home/courses/ph7445/data/GSM794483_C1_R1_2.fq [user0001@boris RNAseq]$ cufflinks -p 2 -o C1_R1_clout C1_R1_thout/accepted_hits.bam ## repeat these commands for all of the samples (each has 2 fastq files)-look at rnaseqDrosophilia.sh [user0001@boris RNAseq]$ sh rnaseqDrosophilia.sh ## set up assemblies.txt to hold locations of output files from cufflinks ## then copy the fasta file to this location [cavanr@boris RNAseq]$ cp /export/home/courses/ph7445/data/Drosophila_melanogaster/Ensembl/BDGP5/Sequence/Bowtie2Index/genome.fa . [cavanr@boris RNAseq]$ cuffmerge -p 2 \ -g /export/home/courses/ph7445/data/Drosophila_melanogaster.BDGP5.68.gtf \ -s genome.fa \ assemblies.txt ## old protocol [cavanr@boris RNAseq]$ cuffdiff -o diff_out \ #-b d_melanogaster_fb5_22.fa -p 2 -L \ -b genome.fa -p 2 -L \ C1,C2 merged_asm/merged.gtf C1_R1_thout/accepted_hits.bam,\ C1_R2_thout/accepted_hits.bam,C1_R3_thout/accepted_hits.bam \ C2_R1_thout/accepted_hits.bam,C2_R2_thout/accepted_hits.bam,\ C2_R3_thout/accepted_hits.bam ## new protocol [user0001@boris RNAseq]$ cuffquant -o C1_R1_quant_out -b genome.fa -p 2 merged_asm/merged.gtf C1_R1_thout/accepted_hits.bam ## use same technique for other 5 samples [cavanr@boris RNAseq]$ cuffdiff -o diff_out \ #-b d_melanogaster_fb5_22.fa -p 2 -L \ -b genome.fa -p 2 -L \ C1,C2 merged_asm/merged.gtf C1_R1_quant_out/abundances.cxb,\ C1_R2_quant_out/abundances.cxb,C1_R3_quant_out/abundances.cxb \ C2_R1_quant_out/abundances.cxb,C2_R2_quant_out/abundances.cxb,\ C2_R3_quant_out/abundances.cxb #################################################################### ## cummeRbund #################################################################### ## deprecated source("http://www.bioconductor.org/biocLite.R") biocLite("cummeRbund") # then if that fails issue this command biocLite("BiocUpgrade") library(cummeRbund) cuff_data <- readCufflinks('diff_out') gene_diff <- diffData(genes(cuff_data)) sig_gene_diff <- subset(gene_diff, significant=='yes') nrow(sig_gene_diff) csDensity(genes(cuff_data)) csScatter(genes(cuff_data), 'C1', 'C2') csVolcano(genes(cuff_data), 'C1', 'C2') mygene <- getGene(cuff_data, 'regucalcin') expressionBarplot(mygene) expressionBarplot(isoforms(mygene)) isoform_diff <- diffData(isoforms(cuff_data), 'C1', 'C2') sig_isoform_diff <- subset(isoform_diff,significant=='yes') nrow(sig_isoform_diff) tss_diff <- diffData(TSS(cuff_data), 'C1', 'C2') sig_tss_diff <- subset(tss_diff, significant=='yes') nrow(sig_tss_diff) cds_diff <- diffData(CDS(cuff_data), 'C1', 'C2') sig_cds_diff <- subset(cds_diff, significant=='yes') nrow(sig_cds_diff) table(tss_diff[,12]) promoter_diff <- distValues(promoters(cuff_data)) sig_promoter_diff <- subset(promoter_diff,significant=='yes') nrow(sig_promoter_diff) splicing_diff <- distValues(splicing(cuff_data)) sig_splicing_diff <- subset(splicing_diff,significant=='yes') nrow(sig_splicing_diff) relCDS_diff <- distValues(relCDS(cuff_data)) sig_relCDS_diff <- subset(relCDS_diff,significant=='yes') nrow(sig_relCDS_diff) #################################################################### ## edgeR #################################################################### biocLite("edgeR") # then that fails-problem with limma library(BiocInstaller) biocValid() # and we see limma is out of date biocLite("limma") biocLite("edgeR") library("edgeR") bovCnts <- read.table("/export/home/courses/ph7445/data/bovineCounts.txt") grp <- factor(c(rep(1,5),rep(2,6))) f1 <- apply(bovCnts,1,min) sum(f1>4) bovCntsF <- bovCnts[f1 > 4 ,] delist <- DGEList(counts=bovCntsF, group=grp) delist <- estimateCommonDisp(delist) delist <- estimateTagwiseDisp(delist) et <- exactTest(delist) padj <- p.adjust(et$table[,3], method="BH") sum(padj<0.05) #################################################################### ## DESeq #################################################################### biocLite("DESeq") cds <- newCountDataSet(bovCntsF, grp) cds <- estimateSizeFactors(cds) cds <- estimateDispersions(cds) res <- nbinomTest(cds, "1", "2") plotDispEst <- function(cds){ plot(rowMeans(counts(cds, normalized=TRUE)), fitInfo(cds)$perGeneDispEsts, pch='.', log="xy") xg <- 10^seq(-.5, 5, length.out=300) lines(xg, fitInfo(cds)$dispFun(xg), col="red") } plotDispEst(cds) edgeRFC <- et$table[,1] edgeRp.value <- et$table[,3] DESeqFC <- res[,6] install.packages("ggplot2") qplot(edgeRFC, DESeqFC, colour= -log(edgeRp.value,10)) #################################################################### ## BCFtools #################################################################### [cavanr@boris samtools-1.2]$ samtools mpileup -uf examples/ex1.fa ex1.bam | bcftools/bcftools view -bvcg - > var.raw.bcf [cavanr@boris samtools-1.2]$ cd bcftools [cavanr@boris bcftools]$ ./bcftools view var.raw.bcf | ./vcfutils.pl varFilter -D100 > var.flt.vcf [cavanr@boris bcftools]$ more var.flt.vcf [cavanr@boris samtools-1.2]$ samtools mpileup -uf examples/ex1.fa ex1.bam | bcftools/bcftools view -cg - | bcftools/vcfutils.pl vcf2fq > cns.fq #################################################################### ## metagenomics with edgeR #################################################################### bactCnt <- read.table("/export/home/courses/ph7445/data/bactCounts.txt") dis <- factor(substring(colnames(bactCnt),1,3)) bactDEL <- DGEList(counts=bactCnt, group=dis) design <- model.matrix(~dis) bactDEL <- estimateGLMTrendedDisp(bactDEL, design) bactDEL <- estimateGLMTagwiseDisp(bactDEL, design) fit <- glmFit(bactDEL, design) lrt <- glmLRT(fit, coef=2) topTags(lrt) bactCntF <- bactCnt[apply(bactCnt,1,sum) >= 10,] bactDEL1 <- DGEList(counts=bactCntF, group=dis) bactDEL1 <- estimateGLMTrendedDisp(bactDEL1, design) bactDEL1 <- estimateGLMTagwiseDisp(bactDEL1, design) fit1 <- glmFit(bactDEL1, design) lrt1.2 <- glmLRT(fit1, coef=2) lrt1.3 <- glmLRT(fit1, coef=3) topTags(lrt1.2) topTags(lrt1.3) padj1=p.adjust(lrt1.2$table$PValue, "BY") length(padj1[padj1<.1]) padj2=p.adjust(lrt1.3$table$PValue, "BY") length(padj2[padj2<.1]) intersect(rownames(bactCntF)[padj1<.1], rownames(bactCntF)[padj2<.1]) ######################################################################### ## Hisat2, stringtie and ballgown ######################################################################### [user0001@boris ~]$ mkdir rnaseq [user0001@boris ~]$ cd rnaseq # this takes a while [user0001@boris rnaseq]$ wget ftp://ftp.ccb.jhu.edu/pub/RNAseq_protocol/chrX_data.tar.gz [user0001@boris rnaseq]$ tar zxvf chrX_data.tar.gz # download hisat2, stringtie, or get from my site [user0001@boris rnaseq]$ cd [user0001@boris ~]$ cd bin [user0001@boris bin]$ wget http://www.biostat.umn.edu/~cavanr/hisat2-2.1.0-Linux_x86_64.zip [user0001@boris bin]$ wget http://www.biostat.umn.edu/~cavanr/stringtie-1.3.3b.Linux_x86_64.tar.gz [user0001@boris bin]$ unzip hisat2-2.1.0-Linux_x86_64.zip [user0001@boris bin]$ tar zxvf stringtie-1.3.3b.Linux_x86_64.tar.gz [user0001@boris bin]$ cd hisat2-2.1.0 [user0001@boris hisat2-2.1.0]$ export HISAT2_HOME=$HOME/bin/hisat2-2.1.0 [user0001@boris hisat2-2.1.0]$ cp hisat2 $HOME/bin/. [user0001@boris hisat2-2.1.0]$ cp hisat2* $HOME/bin/. [user0001@boris hisat2-2.1.0]$ cd .. [user0001@boris bin]$ cd stringtie-1.3.3b.Linux_x86_64 [user0001@boris stringtie-1.3.3b.Linux_x86_64]$ cp stringtie $HOME/bin/. [user0001@boris stringtie-1.3.3b.Linux_x86_64]$ cd [user0001@boris ~]$ cd rnaseq [user0001@boris rnaseq]$ hisat2 -p 8 --dta -x chrX_data/indexes/chrX_tran -1 chrX_data/samples/ERR188044_chrX_1.fastq.gz -2 chrX_data/samples/ERR188044_chrX_2.fastq.gz -S ERR188044_chrX.sam # run rest: takes 8 minutes hisat2 -p 8 --dta -x chrX_data/indexes/chrX_tran -1 chrX_data/samples/ERR188104_chrX_1.fastq.gz -2 chrX_data/samples/ERR188104_chrX_2.fastq.gz -S ERR188104_chrX.sam hisat2 -p 8 --dta -x chrX_data/indexes/chrX_tran -1 chrX_data/samples/ERR188234_chrX_1.fastq.gz -2 chrX_data/samples/ERR188234_chrX_2.fastq.gz -S ERR188234_chrX.sam hisat2 -p 8 --dta -x chrX_data/indexes/chrX_tran -1 chrX_data/samples/ERR188245_chrX_1.fastq.gz -2 chrX_data/samples/ERR188245_chrX_2.fastq.gz -S ERR188245_chrX.sam hisat2 -p 8 --dta -x chrX_data/indexes/chrX_tran -1 chrX_data/samples/ERR188257_chrX_1.fastq.gz -2 chrX_data/samples/ERR188257_chrX_2.fastq.gz -S ERR188257_chrX.sam hisat2 -p 8 --dta -x chrX_data/indexes/chrX_tran -1 chrX_data/samples/ERR188273_chrX_1.fastq.gz -2 chrX_data/samples/ERR188273_chrX_2.fastq.gz -S ERR188273_chrX.sam hisat2 -p 8 --dta -x chrX_data/indexes/chrX_tran -1 chrX_data/samples/ERR188337_chrX_1.fastq.gz -2 chrX_data/samples/ERR188337_chrX_2.fastq.gz -S ERR188337_chrX.sam hisat2 -p 8 --dta -x chrX_data/indexes/chrX_tran -1 chrX_data/samples/ERR188383_chrX_1.fastq.gz -2 chrX_data/samples/ERR188383_chrX_2.fastq.gz -S ERR188383_chrX.sam hisat2 -p 8 --dta -x chrX_data/indexes/chrX_tran -1 chrX_data/samples/ERR188401_chrX_1.fastq.gz -2 chrX_data/samples/ERR188401_chrX_2.fastq.gz -S ERR188401_chrX.sam hisat2 -p 8 --dta -x chrX_data/indexes/chrX_tran -1 chrX_data/samples/ERR188428_chrX_1.fastq.gz -2 chrX_data/samples/ERR188428_chrX_2.fastq.gz -S ERR188428_chrX.sam hisat2 -p 8 --dta -x chrX_data/indexes/chrX_tran -1 chrX_data/samples/ERR188454_chrX_1.fastq.gz -2 chrX_data/samples/ERR188454_chrX_2.fastq.gz -S ERR188454_chrX.sam hisat2 -p 8 --dta -x chrX_data/indexes/chrX_tran -1 chrX_data/samples/ERR204916_chrX_1.fastq.gz -2 chrX_data/samples/ERR204916_chrX_2.fastq.gz -S ERR204916_chrX.sam [user0001@boris rnaseq]$ samtools sort -@ 8 -o ERR188044_chrX.bam ERR188044_chrX.sam # run rest: takes 3 minutes samtools sort -@ 8 -o ERR188104_chrX.bam ERR188104_chrX.sam samtools sort -@ 8 -o ERR188234_chrX.bam ERR188234_chrX.sam samtools sort -@ 8 -o ERR188245_chrX.bam ERR188245_chrX.sam samtools sort -@ 8 -o ERR188257_chrX.bam ERR188257_chrX.sam samtools sort -@ 8 -o ERR188273_chrX.bam ERR188273_chrX.sam samtools sort -@ 8 -o ERR188337_chrX.bam ERR188337_chrX.sam samtools sort -@ 8 -o ERR188383_chrX.bam ERR188383_chrX.sam samtools sort -@ 8 -o ERR188401_chrX.bam ERR188401_chrX.sam samtools sort -@ 8 -o ERR188428_chrX.bam ERR188428_chrX.sam samtools sort -@ 8 -o ERR188454_chrX.bam ERR188454_chrX.sam samtools sort -@ 8 -o ERR204916_chrX.bam ERR204916_chrX.sam [user0001@boris rnaseq]$ stringtie -p 8 -G chrX_data/genes/chrX.gtf -o ERR188044_chrX.gtf ERR188044_chrX.bam -l ERR188044 # run rest, takes 2 minutes stringtie -p 8 -G chrX_data/genes/chrX.gtf -o ERR188104_chrX.gtf ERR188104_chrX.bam -l ERR188104 stringtie -p 8 -G chrX_data/genes/chrX.gtf -o ERR188234_chrX.gtf ERR188234_chrX.bam -l ERR188234 stringtie -p 8 -G chrX_data/genes/chrX.gtf -o ERR188245_chrX.gtf ERR188245_chrX.bam -l ERR188245 stringtie -p 8 -G chrX_data/genes/chrX.gtf -o ERR188257_chrX.gtf ERR188257_chrX.bam -l ERR188257 stringtie -p 8 -G chrX_data/genes/chrX.gtf -o ERR188273_chrX.gtf ERR188273_chrX.bam -l ERR188273 stringtie -p 8 -G chrX_data/genes/chrX.gtf -o ERR188337_chrX.gtf ERR188337_chrX.bam -l ERR188337 stringtie -p 8 -G chrX_data/genes/chrX.gtf -o ERR188383_chrX.gtf ERR188383_chrX.bam -l ERR188383 stringtie -p 8 -G chrX_data/genes/chrX.gtf -o ERR188401_chrX.gtf ERR188401_chrX.bam -l ERR188401 stringtie -p 8 -G chrX_data/genes/chrX.gtf -o ERR188428_chrX.gtf ERR188428_chrX.bam -l ERR188428 stringtie -p 8 -G chrX_data/genes/chrX.gtf -o ERR188454_chrX.gtf ERR188454_chrX.bam -l ERR188454 stringtie -p 8 -G chrX_data/genes/chrX.gtf -o ERR204916_chrX.gtf ERR204916_chrX.bam -l ERR204916 # merge [user0001@boris rnaseq]$ stringtie --merge -p 8 -G chrX_data/genes/chrX.gtf -o stringtie_merged.gtf chrX_data/mergelist.txt # optional comparison step [user0001@boris rnaseq]$ gffcompare -r chrX_data/genes/chrX.gtf -G -o merged stringtie_merged.gtf # run stringtie again to get expression estimates [user0001@boris rnaseq]$ stringtie ERR188044_chrX.bam -p 8 -G stringtie_merged.gtf -o ballgown/ERR188044/ERR188044_chrX.gtf -e -B # run rest, takes 3 minutes stringtie ERR188104_chrX.bam -p 8 -G stringtie_merged.gtf -o ballgown/ERR188104/ERR188104_chrX.gtf -e -B stringtie ERR188234_chrX.bam -p 8 -G stringtie_merged.gtf -o ballgown/ERR188234/ERR188234_chrX.gtf -e -B stringtie ERR188245_chrX.bam -p 8 -G stringtie_merged.gtf -o ballgown/ERR188245/ERR188245_chrX.gtf -e -B stringtie ERR188257_chrX.bam -p 8 -G stringtie_merged.gtf -o ballgown/ERR188257/ERR188257_chrX.gtf -e -B stringtie ERR188273_chrX.bam -p 8 -G stringtie_merged.gtf -o ballgown/ERR188273/ERR188273_chrX.gtf -e -B stringtie ERR188337_chrX.bam -p 8 -G stringtie_merged.gtf -o ballgown/ERR188337/ERR188337_chrX.gtf -e -B stringtie ERR188383_chrX.bam -p 8 -G stringtie_merged.gtf -o ballgown/ERR188383/ERR188383_chrX.gtf -e -B stringtie ERR188401_chrX.bam -p 8 -G stringtie_merged.gtf -o ballgown/ERR188401/ERR188401_chrX.gtf -e -B stringtie ERR188428_chrX.bam -p 8 -G stringtie_merged.gtf -o ballgown/ERR188428/ERR188428_chrX.gtf -e -B stringtie ERR188454_chrX.bam -p 8 -G stringtie_merged.gtf -o ballgown/ERR188454/ERR188454_chrX.gtf -e -B stringtie ERR204916_chrX.bam -p 8 -G stringtie_merged.gtf -o ballgown/ERR204916/ERR204916_chrX.gtf -e -B ############################################################ ## read into R and merge by hand ############################################################ [user0001@boris rnaseq]$ R getTx <- function(smpNm){ flnm=paste0("ballgown/",smpNm,"/",smpNm,"_chrX.gtf") fl=read.delim(flnm,skip=2,header=F) tx=fl[fl[,3]=="transcript",] txid=substr(tx[,9],regexpr("transcript_id ",tx[,9]),regexpr("; cov",tx[,9])-1) fpkm=as.numeric(substr(tx[,9],regexpr("FPKM ",tx[,9])+5,regexpr("; TPM",tx[,9])-1)) gprst=regexpr("gene_name",txid)>0 txid=ifelse(gprst,substr(txid,1,regexpr(";",txid)),txid) gname=ifelse(gprst,substr(txid,regexpr("gene_name",txid)+10,10000), substr(txid,15,10000)) data.frame(gname=gname,fpkm=fpkm,txid=txid) } fls=list.files() smpNms=substr(fls[grep("bam",fls)],1,9) txData=vector("list",length(smpNms)) for(i in 1:length(smpNms)) txData[[i]]=getTx(smpNms[i]) # then create a nice dataset-use sorted order of genes datmat=matrix(NA,dim(txData[[1]])[1],length(smpNms)) txid=sort(as.character(txData[[1]][,3])) for(i in 1:length(smpNms)){ datmat[,i]=txData[[i]][order(as.character(txData[[i]][,3])),2] } pheno_data=read.csv("chrX_data/geuvadis_phenodata.csv") table(smpNms==pheno_data$ids) sexp=rep(NA,dim(datmat)[1]) popp=rep(NA,dim(datmat)[1]) for(i in 1:dim(datmat)[1]){ m1=summary(lm(datmat[i,]~sex+population,data=pheno_data)) sexp[i]=m1$coef[2,4] popp[i]=m1$coef[3,4] } sum(p.adjust(sexp,method="BH")<.05,na.rm=T) sum(p.adjust(popp,method="BH")<.05,na.rm=T) ############################################################ ## ballgown ############################################################ install.packages("devtools",repos="http://cran.us.r-project.org") source("http://www.bioconductor.org/biocLite.R") biocLite(c("alyssafrazee/RSkittleBrewer","ballgown", "genefilter","dplyr","devtools")) pheno_data = read.csv("geuvadis_phenodata.csv") bg_chrX = ballgown(dataDir = "ballgown", samplePattern = "ERR", pData=pheno_data) bg_chrX_filt = subset(bg_chrX,"rowVars(texpr(bg_chrX)) >1",genomesubset=TRUE) results_transcripts = stattest(bg_chrX_filt, feature="transcript",covariate="sex",adjustvars = c("population"), getFC=TRUE, meas="FPKM") results_genes = stattest(bg_chrX_filt, feature="gene", covariate="sex", adjustvars = c("population"), getFC=TRUE, meas="FPKM") subset(results_transcripts,results_transcripts$qval<0.05) plotTranscripts(ballgown::geneIDs(bg_chrX)[1729], bg_chrX, main=c('Gene XIST in sample ERR188234'), sample=c('ERR188234'))