R Notes

The following are some notes on the syntax, commands, etc. for R programming and plotting.

---Directories, packages----
getwd()               # print the working directory
setwd("test")         # sets working directory
dir.create("test")    # Like unix 'mkdir test'
list.files()          # Like unix ls
system("ls")          # Run a command from the OS
sessionInfo()         # packages loaded, version and so on
install.packages("plotrix") # searches internet and installs package 'plotrix'
                           # sometimes these add-ons are exactly what you need
library(plotrix)      # have to do this each session for added packages
citation()            # to cite the R package

help(seq)             # prints help documentation for command 'seq'
?seq                  # another way to invoke help 
??seq                 # look for 'seq' throughout help pages

----Run R commands from a file---
source("command_file") # it is ok to split a long command onto two lines. '#' is a comment

----Assigning variables-----
x=2                   #for a single variable
x <- c(23.,-12.,43.5) # The c() function assigns numbers to a vector
x=c(23.,-12,43.5)     # or do it this way
y=c(x,1,2*x)          # does what you think it should, y has 7 elements if x has 3
z=y-x                 # also has 7 elements. the 3 elements x get repeated to fill up the vector
z=1:10                # assigns values 1 through 10 to vector z
z=seq(1,10,by=2)      # odd numbers 1 through 10
z=seq(40,1,-5)        # start at 40, stop when <=1, increments of -5
z=seq(0.05,3,len=500) # uniform sampling of length 500 between 0.05 and 3
z[4]=5                # 4th value of vector z
z[z<5]                # all values of z that are < 5
z=sample(1:10)        # randomly sample from 1:10 w/o replacement
z=1e-3*sample(0:999,3,replace=TRUE) # 3 random numbers in [0,1) to 3 decimal places
z=rnorm(30,mean=10,sd=2) # 30 random samples from normal distribution with mu=10 sigma=2
z=c("aa","a_string")  # creating a vector of strings
nam=as.character(p)   # convert number to a string for plotting (useful when looping movies)
text(0.02,1.8,nam,col="black") #....and now plot that string. You could update the number p in a loop
z=c(TRUE,TRUE,FALSE)  # a logical vector
rm(x)                 # deletes the variable
rm(list=ls())         # deletes all variables
result=array(0,c(20,3)) # make an array of 20 rows, 3 columns, values all =0
z=matrix(c(4,6,12,9),2,2) # populate z, a 2x2 matrix, with values
zm3=zm2%*%zm1         # matrix multiplication of zm2 and zm1
matrix(4,3,3)         # populate a 3x3 matrix with 4's
diag(3)               # 3x3 matrix with 1's on diagonals, 0 elsewhere
t(z)                  # transpose of matrix z
cov(z)                # covariance of data matrix z; variables in columns, data points in rows,
                      # uses n-1 in denominator to get sigma^2(xy)
cor(z)                # correlation matrix of z

----Store multiple plots as foo*.png files for later insertion as frames into movies
 ...commands to generate plots...
ffmpeg -f image2 -i 'foo%04d.png' try.mpeg  #unix command to make movie from frames; view in Quicktime

----Variables Format----
str(z)                # returns structure of object z, e.g. character, numeric, logical
class(z)              # similar to str
is.matrix(z)          # returns TRUE or FALSE depending on if z is a matrix

----Tabular data and subsamples------
rbind(x,y)            # combine objects into a table by rows
cbind(x,y)            # combine objects by columns
c(xl1,xl2)            # combine two variables together into a single longer one
cx0=as.character(x0)  # interpret cx0 as a character
cx0=as.numeric(x0)    # interpret cx0 as a number
xy=as.data.frame(cbind(x,y)) # creates data frame of named variables. 
names(xy)=c("xval","yval")   # change column names
z=read.table("filename",T)   # read ascii table into a data frame, 'T' reads labels on first line
attach(z)             # puts variables within z into the search path of the program
z1=subset(z,M_A==2 & Dcool<75 & Dcool>50) #choose subset of data in frame z
                      # where  M_A and Dcool are variables. Then attach(z1) 

----Mathematical expressions-----
z=3*sqrt(x)           # expressions work
z=x**3.2              # powers, ^ also works
z=cos((pi/4))         # trig uses radians; acos, tan, etc
z=log10(x)            # log is natural ln 
z=exp(x)              # gives e^x
trunc(x)              # cuts off the fraction part, leaving just the integer
length(z)             # number of elements in z
max(z)                # maximum value; min(z) also works
abs(z)                # absolute value
sort(z)               # order vector from low to high
round(x,4)            # round off variable x to 4 digits

----Plots, Labels----
plot(x,y)             # plot x vs. y (resize window to change aspect ratio)
points(x,y)           # overplot points on existing graph (use lines to overplot lines)
plot(x,pch=20)        # pch=one of the 26 point types; cex is point size
plot(x,xlim=c(0,3),ylim=c(-0.5,4.5),xaxs="i",yaxs="i") # force user-specified limits on x and y
par()                 # show plotting parameters
par(mfrow=c(1,2))     # set up 2-panel figure 
par(mar=c(0,0,0,0),oma=c(4,4,0.5,0.5) #for multiple panels with no space between them
                      # no tics, but plot the box. Useful for multiple panels.
par(mar=c(4,4,1,1))   # size of margin around plot (bottom, left, top, right). Smaller numbers make
                      # the graph fill more of the available space. You can change these along with
                      # mgp to shift the graph relative to the labels.
par(mgp=c(1.8,0.3,0)) # offsets for text labels, numbers, and ticks. Usually ticks=0. The number offset
                      # can be negative to shift values closer to axis, 0.3 works well for tck=0.03 (see below)
title(xlab=expression(paste("something complicated"),line=2) # good for offsetting complicated axis labels
                      # set xlab = " " in the plot command
frac({1},{2})   # write the fraction 1/2. This can be made complicated with Greek letters, subscripts etc. For example:
                  frac({"[N I]"[lambda*lambda*5198+5200]},{"[N II]"[lambda*lambda*6548+6583]})
plot(x,y,tck=0.03,las=1,log='x',pch=M_A,cex=0.9,col="red",xlab=expression(paste("log"["10"]," d"["COOL"]," (AU)")))
                # tck=0.03 tic size, positive number moves it inside the graphic box
                # las=1 axis labels are all vertical
                # log='x' log scale on x (or 'y', 'xy')
                # pch is point type, here chosen from column labeled M_A
                # cex is size of points; use cex.lab=1.3, cex.axis=1.2 to increase label size by 30%, tic labels by 20%
                # col is color 
                # xlab is for x labels:
                # expression/paste syntax for subscripts [] and superscripts ^
                #  [] are followed by a comma. ^ are similar but use only one of them: e.g. paste("cos(45"^"o",")")
plot(x,y,xlab=expression(paste(lambda,"F",lambda))) # for Greek letters in labels 
text(0.5,0.9,"This is a label",0) # Put label at x=0.5, y=0.9.  The last parameter is an offset
                  # Use mtext to place xlabels and ylabels in margins
                  # las=1 makes y-axis (side=2) label vertical, 'at' puts it at y=1.5, and line=2 gives an offset
mtext("peak = 23968",side=3,line=-2.0,adj=0.8,cex=0.8) # adj moves the string along direction you read it
axis(1,at=seq(0,1,0.01),labels=FALSE,tcl=0.2) # add some minor tics to the x-axis, x=0 to 1, intervals 0.01
grid(12,13,lty=3,lwd=2) # draw a dotted grid, 12 lines on x-axis and 13 on the y.  Works best when xaxs='i', yaxs='i'
arrows(m1$r,m1$l-20,m1$r,m1$l+20,length=0.05,angle=90,code=3,lwd=2.0) # draw some errorbars +-20 in y
legend("topright", c("M=2","M=5","M=10"), pch=c(2,5,10))       
                # legend location 'topright' (can be a coordinate)
                # The 'M=2' 'M=5' M=10' are text items for the legend
                # pch are the point types for the text items, or use lty and lwd for line type or width
                # Histogram plot. The breaks define the left and right edges of each bin. Here, M_A is
                # a variable already defined. Label the bins afterward, for example with 
                # axis(1,at=c(0.33,0.67,1.0,1.33,1.67,2.0),labels=c("2","5","10","20","50","100"),tcl=0.0,lwd=0)
                # Then use mtext to label the axes. Fiddle with mar and mgp before drawing the axis, 
                # for example, par(mar=c(0,3,1,1),mgp=c(2,0.8,0))
boxplot(z,range=0.3)  # boxplots, has various arguments, range is for the whiskers
polar.plot(dist,angle,rp.type='sr') # polar coordinate plot in plotrix package, s is symbol, r is radial line

              # GGPLOT EXAMPLE, it has layers that you join via '+'
ggplot(bigg,aes(x=len0,y=curve)) +
    geom_hex(na.rm=TRUE,binwidth=c(2*(delta+ysig),0.007)) +
        limits=c(-1/rminplot,1/rminplot)) +
    scale_fill_viridis_c(option='plasma') +
    theme(panel.background=element_rect(fill='slategrey')) +    
    labs(x="Arc Length",y="Radius of Curvature",

mean(x)               # mean of vector x
sd(x)                 # standard deviation
var(x)                # variance
range(x)              # range of x
summary(x)            # info on mean, median, max, min, quartiles
ecdf(x)               # cumulative distribution function
t.test(x,y)           # t-test for means being the same. A low p-value rejects hypothesis
cframeor.test(x,y,method='s') # rank correlation. p-value is prob of getting correlation
                              # that good from random data. s=spearman, k=kendall, p=pearson
fisher.test(x,conf.level=0.90,alternative="less") # one-sided contingency table test
                              # can also use exact2x2 as alternative
pairs(~x+y+t+s,data=z) # Correlation plots of variables x, y, t, and s against each other 
SmoothScatter(xy,lwd=4,pch=20,cex=0.2) # smooth the data and make a density map
                                       # points are plotted in low density regions. 

---Fitting Models, drawing lines---
fit=lm(y~x+I(x^2))      # y=c0+c1*x+c2*x^2; lm is 'linear model'
xx=seq(40,80,0.1)       # define continuous range of points xx=40 to 80 for drawing curve
yy=fit$coefficients[1]+fit$coefficients[2]*xx+fit$coefficients[3]*xx^2  # now you have (xx,yy) pairs
lines(xx,yy)            # draws the curve

fit=lm(y ~ log10(x))    # fit y to log10 of x
fit                     # print out the coeficients
summary(fit)            # prints fit, residuals, quartiles, errors and what-not
abline(fit)             # overplot line fit
abline(h=0)             # draw horizontal line at y=0
abline(v=0)             # draw vertical line at x=0
lines(sort(x),yfit$fit) # also plots the fitted line
z=smooth.spline(x,y)    # spline fit 
lines(z,lty=2,lw=3)     # draw fit on plot with heavy(lw=3) dotted (lty=2) line.

sample_mean=function(data,index){   #First define the statistic to bootstrap
             xbar=mean(data[index]) #in this case the mean
results=boot(x,sample_mean,R=10000) # 10000 bootstraps on data x
        #results for each returned statistic are in t1, t2, t3, etc.
hist(results$t[,1],main="Mean of 10,000 Bootstrap Samples",xlab="X") #plot them\
boot.ci(boot.out=results,conf=0.95,type="bca") #Confidence intervals

subr=function(data,len,n){  # 3 inputs, can be anything, data frames, numbers, strings
	return(list(datout,ans)) #two outputs, subr[1]=datout and sub[2]=ans, both lists of length 1
df=as.data.frame(subr[1])  interprets datout as a dataframe
numb=as.numeric(subr[2])  interprets ans as a number

---Smoothing, Image display---
bin=hexbin(z1$xcen,z1$ycen,xbins=50)  #make hexagonal density bin plot; larger xbins -> more resolution
plot(bin,main="Density map of Circle Centers",xlab="X (pixels)",ylab="Y (pixels)",
 colorcut=c(seq(0,1,0.1)^1.5),legend=1,lcex=1,colramp=function(n) rainbow(12))  # change stretch via power (here=1.5)
       # this is the hexbin plot command, differs from plot. Can also try style="nested.centroids" for fun
ker=kde2d(z1$xcen,z1$ycen,lims=c(-20,20,-10,30),n=100) ; image(ker)  #make and plot kernel density map

---Looping, plot models---
x=c(50,60,70) # example plots a quadratic 20 times with normal dist. spread of errors in the 3 coefs
plot(x,z0,xlim=c(40,75),ylim=c(-5,80),cex=5,col='red') #plot the points
a=rnorm(20,0,2)                      # a,b,c have 20 values, mean=0 and sigma=2
xx=seq(40,75,1)  # x-values to plot for line
for(i in 1:20){                      #do the loop
 fit=lm(z ~ x + I(x^2))
 lines(xx,zz)                         #plot the fits

---- if/then/else ----
 x[i] = 4.5
} else if (i<-2){
} else {

----Printing to Screen, Output Files----
dim(x)                # print dimensions of variable x (row,column,slice,etc.)
print(x)              # print values of a vector x
x                     # also prints values of x
x[3]                  # prints values of third column of x
head(z,n=10)          # print first 10 lines of array z to screen
tail(z,n=10)          # print last 10 lines of array z to screen
ls()                  # list the defined vectors
write(file='output',z)   # store vector z to an output file
write.table(rbind(x,y),file='out.dat')  # store a table in a file
dev.copy2eps(file='out.eps')            # store figure as an eps file; or just go to menu and save as a pdf!

----A nice reference for syntax and the like----