######################################################################
# R commands      Statistics for Human Genetics and Molecular Biology
# Lab 3  2015                                 University of Minnesota
######################################################################
# This file contains the R commands for the lab.
#
# Lines beginning with the symbol '#' are comments in R.  All other
# lines contain code.
#
# In R for Windows, you may wish to open this file from the menu bar
# (File:Display file); you can then copy commands into the command
# window.  (Use the mouse to highlight one or more lines; then
# right-click and select "Paste to console".)
######################################################################


##########################################################################
# Import data 
#############################################################################
## Need to creat a dataset with missing values in excel and save it as test.csv (comma delimited) for this exercise
getwd()
data<-read.csv("test.csv", header=T)
str(data)
rm<-is.na(data$Values)
rm<-which(is.na(data$Values))
newdata<-data[-rm,]
######Alternative using !is.na
keep<-which(!is.na(data$Values))
newdata<-data[keep,]
##############################
## Export data 
##############################
write.table(newdata, file="test2.txt", row.names=F, quote=F, sep="\t")

############# FAMuss example 
getwd() ## change it using setwd()
fmsURL<-"http://people.umass.edu/foulkes/asg/data/FMS_data.txt"
fms<-read.delim(file=fmsURL, header=TRUE, sep="\t")
colnames(fms)
dim(fms)  ## check the dimension of the data
str(fms[,1:10]) ## check the structure of the data 
fms$id[1:10] 
fms[1,1:10]
fms$pre.BMI
fms$actn3_rs540874
index<-match(c("actn3_rs540874","pre.BMI"), colnames(fms)) 
dat<-fms[!is.na(fms$pre.BMI) & !is.na(fms$actn3_rs540874) , index] ## observations without NA
attach(fms)
mean(pre.BMI, na.rm=T)
dURL<-"http://itl.nist.gov/div898/strd/univ/data/PiDigits.dat"
pidigits<-read.table(file=dURL, skip=60)
table(pidigits)
str(pidigits)
prop<-table(pidigits)/5000 ##proportions
prop
barplot(prop, xlab="digit", ylab="proportion")
abline(h=0.1, lty=2)
##scan
x<-scan()
str(x)

##############################
## &, |
##############################
x<-c(T, T, F, F)
y<-c(T, F, T, F)
mat<-cbind(x,y)
mat
and<- x & y
or<-x | y
and 
or


##############################
# Export data 
##############################
newdata<-fms[,1:3]
write.table(newdata, file="newdata.txt", sep="\t")
save(newdata, file="newdata.RData")


##############################
# Exercise 1
##############################
# 1. create a file using excel 
# 2. save it as csv
# 3. read it into R using read.csv
##############################
# Exercise 2
##############################
# 1. create a file using excel 
# 2. save it as txt (tab-delimited)
# 3. read it into R using read.table
##############################


##############################
# Generate Random Number  
###############################
##################################################
# generate 100 points from Normal(mu=0, sigma=2)
##################################################
x<-rnorm(100, mean=0, sd=2)
mean(x)
hist(x)
plot(density(x))
mean(x)
quantile(x)
##################################################
# generate 100 points from uniform distribution(0,1)
##################################################
y<-runif(100)
sum(y > 0.5)

##################################################
# Toss a fair coin five times 
##################################################

h<-rbinom(1, 5, p=0.5)

##################################################
# sample from a vector
##################################################
x<-1:10
samp<-sample(x)
samp2<-sample(x, 5)
samp
samp2

##############################
# Graphics 
##############################
library(MASS)
head(mammals)
str(mammals)
br<-seq(0, max(mammals$body)+100, by=100)
hist(mammals$body, freq=T, breaks=br, xlab="Body Size (lbs)", main="Distribution body size of mammals")
m<-median(mammals$body)
size<-ifelse(mammals$body > m, "large", "small")
mammals$size<-size
head(mammals)
plot(mammals$body, mammals$brain, pch=16, xlab="Body Size (lbs)", ylab="Brain Size (g)")
rm<-which(mammals$body > 1000)
mamm<-mammals[-rm,]
dim(mamm)
pdf("mammals.pdf")
plot(mamm$body, mamm$brain, pch=16, xlab="Body Size (lbs)", ylab="Brain Size (g)")
dev.off()

small<-mammals[mammals$size=="small",]
plot(small$body, small$brain, pch=16)

############################
x<-rnorm(100)
y<-3 + 2*x + rnorm(100)
plot(x, y, type="p", pch=16)
## plot normal(0,1) distribution
plot(x<-seq(-4,4, by=0.1), dnorm(x, mean=0, sd=1), type="l", xlab="X", ylab="Density", main="Normal distribution")
hist(x)
pdf("myplot.pdf")
plot(x<-seq(-4,4, by=0.1), dnorm(x, mean=0, sd=1), type="l", xlab="X", ylab="Density", main="Normal distribution")
dev.off()

####################
# Installing packages
#  
######################
source("http://www.bioconductor.org/biocLite.R")
biocLite("UsingR")
library("UsingR")


### generate 1000 points from chi square distribution
x<-rchisq(1000, df=1)
quantile(x)
maint<-expression(paste(chi^2, " distribution", sep=""))
plot(density(x), main=maint, xlab="x")
simple.hist.and.boxplot(x)


##############################
# for loops & ifelse
##############################

for (i in 1:10){
	if (i > 5){
		print(i)
	}
}
## if .. else 
x<-rnorm(10, mean=5, sd=1)

for (i in x){
	if (i < 5){
		msg<-paste(i, " is equal or less than 5", sep="")
		print(msg)
	} else {
		msg<-paste(i, " is larger than 5", sep="")
		print(msg)
	}
	
}


str(fms[,1:10])
attach(fms)
table(actn3_rs540874)
geno<-ifelse(actn3_rs540874=="GG", 1, 0)
table(geno)
geno2<-as.numeric(actn3_rs540874)
table(geno2, useNA="always")
x<-rnorm(100)
xx<-ifelse(x > 0, 1, 0)
table(xx)


##############################
# Functions
##############################
x<-rnorm(100)
mysd<-function(x){
	ans<-sqrt(var(x))
	return(ans)
}
mysd(x)
sd(x)

myfun<-function(x){
	ans<-x+3
	return(ans)
}
myfun(5)

##############################
# Likelihood
##############################

likelihood<-function(p, n=40, k=25){
	ans<-dbinom(x=k, size=n, prob=p)
	return(ans)
}

getwd()
pdf("MLE.pdf")
plot(x<-seq(0, 1, by=0.01), likelihood(x, n=40, k=25), type="l", xlab="p", ylab="Likelihood", main="Toss a coin 40 times and get 25 heads")
abline(v=0.625,col="grey", lty=2)
text(0.3, 0.12, "Maximum Likelihood p=0.625", bty="n")
text(0.3, 0.115, "(the most likely value of p)", bty="n")
dev.off()


##############################
# apply, sapply
##############################
m<-matrix(1:20, nrow=10, ncol=2) ## create a matrix of 10 rows x 2 columns
apply(m, 1, mean) ## mean of the rows: rowMeans
apply(m,2, mean)  ## mean of the columns: colMeans 
x<-1:10
sapply(x, myfun)
x <- list(a = 1:10, beta = exp(-3:3), logic = c(TRUE,FALSE,FALSE,TRUE))
sapply(x, quantile)


###########################
# Exercise: strsplit, sapply
###########################

myid<-paste("ID-", round(runif(100),2)*100, sep="")
## Use strsplit and sapply to create a numerical vector that contains only the numerical part of myid object (for exampe: ID-89 -> 89)

mystrsplit<-function(x){
	ans<-strsplit(x, split="-")[[1]][2]
	return(ans)
}
mynum<-as.numeric(sapply(myid, mystrsplit))
cbind(myid, mynum)

###### alternative 

mynum2<-as.numeric(sapply(myid, function(x)strsplit(x, split="-")[[1]][2]))
mynum2


###########################
# Exercise: Tossing a coin a lot of times 
###########################


##################
# End of lab2.R
##################