victor soria-carrasco [email protected]
DESCRIPTION
Running an R application - GeoSSE (Geographic State Speciation and Extinction) analyses run using R and parallel functions. Download this file http://goo.gl/rQL8Hd. Victor Soria-Carrasco [email protected]. GeoSSE. - PowerPoint PPT PresentationTRANSCRIPT
Running an R application - GeoSSE (Geographic State Speciation and
Extinction) analyses run using R and parallel functions
Victor [email protected]
Download this file: http://goo.gl/rQL8Hd
GeoSSE• Belongs to a family of likelihood-based methods for
studying diversification and character evolution using phylogenetic trees (BiSSE - Binary State Speciation and Extinction)
• GeoSSE models the reciprocal effects between geographic range evolution and diversification (speciation and extinction)
• It estimates region-dependent rates of speciation, extinction and dispersal.
• R package diversitree
A shell script is used to call the R script
#!/bin/bash
#$ -l h_rt=08:00:00
#$ -l mem=2G
#$ -m bea
#$ -M [email protected]
R CMD BATCH --vanilla myscript.R
# Load diversitree package
library (diversitree)
# Load phylogenetic trees
trees<-read.tree("mytrees.new")
# Load states
states<-read.table("mytable.txt", header=T)
...
R_job.sh myscript.R
$ qsub -j y -o rjob.log R_job.sh
Example 1
Running R using multiple cores of 1 node
(shared memory)
# Launch an interactive session
$ qrsh
# Copy the input files (trees and table with geographic states) to your home folder
$ cp /usr/local/extras/Genomics/HPC_course/R_GeoSSE/example_trees_10.new $HOME/
$ cp /usr/local/extras/Genomics/HPC_course/R_GeoSSE/example_states.txt $HOME/
# Copy the R script to your home folder
$ cp /usr/local/extras/Genomics/submit_scripts/geosse_openmp.R $HOME/
# Copy the shell script to call the R script to your home folder
$ cp /usr/local/extras/Genomics/submit_scripts/R_job_openmp.sh $HOME/
Example 1. Run GeoSSE using multiple cores of 1 node (shared memory)
# If you experience problems, you can download all the files from the web:
$ wget http://goo.gl/9GVykX
$ tar -xvf R_GeoSSE.tar.gz
$ rm R_GeoSSE.tar.gz
Example 1. Run GeoSSE using multiple cores of 1 node (shared memory)
# Check you have the following 4 files in your home directory:
example_states.txt
example_trees_10.new
geosse_openmp.R
R_job_openmp.sh
# Edit the shell script to change number of processors, e-mail address, and path to R script
$ nano $HOME/R_job_openmp.sh
Example 1. Run GeoSSE using multiple cores of 1 node (shared memory)
#!/bin/bash
#$ -l h_rt=08:00:00
#$ -l mem=2G
#$ -pe openmp 2
#$ -m bea
#$ -M [email protected]
# Number of cores to be used
export OMP_NUM_THREADS=2
# Add path to software repository libraries
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/extras/Genomics/lib/lib
R='/usr/local/extras/Genomics/apps/R/current/bin/R'
# Change R script path
RSCRIPT="$HOME/geosse_openmp.R"
# Some options
## --slave
# Don't print input commands
## --vanilla (highly recommended)
# Don't load anything from previous sessions,
# and don't save data at the end of the current session
$R CMD BATCH --vanilla $RSCRIPT
R_job_openmp.sh
# Edit the R script to change input and output files
$ nano $HOME/geosse_openmp.R
Example 1. Run GeoSSE using multiple cores of 1 node (shared memory)
# Load parallel package
library(parallel)
# Number of cores
ncores<-2
# Load diversitree package
library(diversitree)
# Load phylogenetic trees
trees<-read.tree("/home/user/example_trees_10.new")
# Load states
states<-read.table("/home/user/example_states.txt", header=T)
tip.states<-states$state
names(tip.states)<-states$taxon
# Fit birth-death model to get an extinction/speciation ratio (eps)
# and use it as starting value for GeoSSE
bd.rates<-mclapply(trees, birthdeath)
geosse_openmp.R (1)
# Fit GeoSSE model by maximum likelihood to the trees
ML.est<-mclapply(1:length(trees), function(x){
  model<-make.geosse(tree=trees[[x]], states=tip.states)
  start.values<-starting.point.geosse(trees[[x]], eps=bd.rates[[x]]$par[1])
  res<-find.mle(model, x.init=start.values, method="subplex")
  cat("Job # ", x, " finished\n", sep="")
  return(res)
}, mc.cores=ncores)
# Format results into a table
ML.est.pars.table<-as.data.frame(do.call("rbind", lapply(ML.est, function(x) c(x$par, npar=length(x$par), lnl=x$lnLik))))
# Write the table to a file
output<-paste(Sys.getenv("HOME"), "/example_openmp_output.txt", sep='')
write.table(ML.est.pars.table, file=output, row.names=F, quote=F, sep="\t")
# Save image of this session
image<-paste(Sys.getenv("HOME"), "/example_openmp_image.RData", sep='')
save.image(file=image, compress="bzip2")
geosse_openmp.R (2)
# Submit the job to the cluster
$ qsub -j y -o geosse_openmp.log R_job_openmp.sh
# Check if the job is running
$ qstat
# When the job finishes, look at the logs and the output
$ less -S geosse_openmp.log
$ less -S geosse_openmp.Rout
$ less -S example_openmp_output.txt
Example 1. Run GeoSSE using multiple cores of 1 node (shared memory)
Example 2
Running R using multiple nodes
(distributed memory)
# Copy the R script to your home folder
$ cp /usr/local/extras/Genomics/submit_scripts/geosse_mpi.R $HOME/
# Copy the shell script to call the R script to your home folder
$ cp /usr/local/extras/Genomics/submit_scripts/R_job_mpi.sh $HOME/
# Check you have the following 4 files in your home directory:
example_states.txt
example_trees_10.new
geosse_mpi.R
R_job_mpi.sh
# Edit the shell script to change number of nodes, e-mail address, and path to R script
$ nano $HOME/R_job_mpi.sh
Example 2. Run GeoSSE using multiple nodes (distributed memory)
#!/bin/bash
#$ -l h_rt=08:00:00
#$ -l mem=2G
#$ -pe ompigige 4
#$ -m bea
#$ -M [email protected]
# Required for running Rmpi
module add mpi/gcc/openmpi/1.4.4
export MPI_ROOT='/usr/local/mpi/gcc/openmpi/1.4.4/'
export OMPI_MCA_mtl=^psm
# Add path to software repository libraries
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/extras/Genomics/lib/lib
R='/usr/local/extras/Genomics/apps/R/current/bin/R'
# Change R script path
RSCRIPT="$HOME/geosse_mpi.R"
# Some options
## --slave
# Don't print input commands
## --vanilla (highly recommended)
# Don't load anything from previous sessions,
# and don't save data at the end of the current session
$R CMD BATCH --vanilla $RSCRIPT
R_job_mpi.sh
# Edit the R script to change input and output files
$ nano $HOME/geosse_mpi.R
Example 2. Run GeoSSE using multiple nodes (distributed memory)
# This R script uses multiple nodes to fit a GeoSSE model
# to multiple phylogenetic trees
# Ignore Infiniband interface
Sys.setenv(OMPI_MCA_mtl='^psm')
# Echo commands
options(echo=T)
# Load Rmpi package
library (Rmpi)
# Load diversitree package
library (diversitree)
# Load phylogenetic trees
trees<-read.tree("/home/user/example_trees_10.new")
# Load states
states<-read.table("/home/user/example_states.txt", header=T)
tip.states<-states$state
names(tip.states)<-states$taxon
geosse_mpi.R (1)
# Number of nodes
nnodes<-4
# spawn mpi slaves
mpi.spawn.Rslaves(nslaves=nnodes, needlog=T)
# load diversitree on slaves
mpi.bcast.cmd(library(diversitree))
# Broadcast trees to slaves
mpi.bcast.Robj2slave(trees)
# Broadcast tip.states to slaves
mpi.bcast.Robj2slave(tip.states)
# Fit birth-death model to get an extinction/speciation ratio (eps)
# and use it as starting value for GeoSSE
bd.rates<-mpi.parLapply(trees, birthdeath)
# Broadcast bd.rates to slaves
mpi.bcast.Robj2slave(bd.rates)
geosse_mpi.R (2)
# Fit GeoSSE model by maximum likelihood
ML.est<-mpi.parLapply(1:length(trees), function(x){
  model<-make.geosse(tree=trees[[x]], states=tip.states)
  start.values<-starting.point.geosse(trees[[x]], eps=bd.rates[[x]]$par[1])
  res<-find.mle(model, x.init=start.values, method="subplex")
  cat("Job # ", x, " finished\n", sep="")
  return(res)
}, job.num=nnodes)
# Format results into a table
ML.est.pars.table<-as.data.frame(do.call("rbind", lapply(ML.est, function(x) c(x$par, npar=length(x$par), lnl=x$lnLik))))
# Write the table to a file
output<-paste(Sys.getenv("HOME"), "/example_mpi_output.txt", sep='')
write.table(ML.est.pars.table, file=output, row.names=F, quote=F, sep="\t")
# Save an image of this session
image<-paste(Sys.getenv("HOME"), "/example_mpi_image.RData", sep='')
save.image(file=image, compress="bzip2")
geosse_mpi.R (3)
# Submit the job to the cluster
$ qsub -j y -o geosse_mpi.log R_job_mpi.sh
# Check if the job is running
$ qstat
# When the job finishes, look at the logs
$ less -S geosse_mpi.log
$ less -S geosse_mpi.Rout
# look at the log for each node
$ find . -name "*node*.log" -exec sh -c "echo {}; cat {} | sed 's/^/\t/g'" \;
# Output (results)
$ less -S example_mpi_output.txt
Example 2. Run GeoSSE using multiple nodes (distributed memory)
https://myapps.shef.ac.uk
bo4cm14 / gen0mics