1.

library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation

Step 2 - Random data set example *

# quick and dirty, a truncated normal distribution to work on the solution set

z <- rnorm(n=3000,mean=0.2)
z <- data.frame(1:3000,z)
names(z) <- list("ID","myVar")
z <- z[z$myVar>0,]
str(z)
## 'data.frame':    1700 obs. of  2 variables:
##  $ ID   : int  1 2 3 7 8 9 12 13 14 15 ...
##  $ myVar: num  0.636 0.579 1.699 0.396 0.934 ...
summary(z$myVar)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.001441 0.386184 0.762588 0.881076 1.273592 3.413798

Plot histogram of data

p1 <- ggplot(data=z, aes(x=myVar, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# add empirical density curve

p1 <-  p1 +  geom_density(linetype="dotted",size=0.75)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

maximum likelihood paramaeters for “normal”

normPars <- fitdistr(z$myVar,"normal")
print(normPars)
##       mean          sd    
##   0.88107575   0.63415551 
##  (0.01538053) (0.01087568)
str(normPars)
## List of 5
##  $ estimate: Named num [1:2] 0.881 0.634
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 0.0154 0.0109
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 0.000237 0 0 0.000118
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 1700
##  $ loglik  : num -1638
##  - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute ex. mean
##      mean 
## 0.8810757

Plot normal probability denisty

meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]

xval <- seq(0,max(z$myVar),len=length(z$myVar))

 stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$myVar), args = list(mean = meanML, sd = sdML))
 p1 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Plot exponential probability density

expoPars <- fitdistr(z$myVar,"exponential")
rateML <- expoPars$estimate["rate"]

stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$myVar), args = list(rate=rateML))
 p1 + stat + stat2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Plot uniform prob density

 stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$myVar), args = list(min=min(z$myVar), max=max(z$myVar)))
 p1 + stat + stat2 + stat3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot gamma probability denisty

gammaPars <- fitdistr(z$myVar,"gamma")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$myVar), args = list(shape=shapeML, rate=rateML))
 p1 + stat + stat2 + stat3 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Plot beta probability density

 pSpecial <- ggplot(data=z, aes(x=myVar/(max(myVar + 0.1)), y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) + 
  xlim(c(0,1)) +
  geom_density(size=0.75,linetype="dotted")

betaPars <- fitdistr(x=z$myVar/max(z$myVar + 0.1),start=list(shape1=1,shape2=2),"beta")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]

statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$myVar), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).

————— Own data set —————– Data set from dryad. The paper is “Repeated genetic and adaptive phenotypic divergence across tidal elevation in a foundation plant species” I selected the maternal plant height at the time of seed collection. They used the offspring to evaluate height and other characteristics to better understand genetic divergence of this marsh species.

2.

z <- read.table("Maternal_plants_field.csv",header=TRUE,sep=",")
str(z)
summary(z)
library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation

Step 2 - Random data set example *

# quick and dirty, a truncated normal distribution to work on the solution set

z <- rnorm(n=3000,mean=0.2)
z <- data.frame(1:3000,z)
names(z) <- list("ID","myVar")
z <- z[z$myVar>0,]
str(z)
## 'data.frame':    1734 obs. of  2 variables:
##  $ ID   : int  2 6 10 11 13 14 15 19 20 21 ...
##  $ myVar: num  0.19 0.122 1.796 0.827 1.033 ...
summary(z$myVar)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000065 0.369898 0.758295 0.876006 1.277773 3.542854

Plot histogram of data

p1 <- ggplot(data=z, aes(x=myVar, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

add empirical density curve

p1 <-  p1 +  geom_density(linetype="dotted",size=0.75)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

maximum likelihood paramaeters for “normal”

normPars <- fitdistr(z$myVar,"normal")
print(normPars)
##       mean          sd    
##   0.87600630   0.64075491 
##  (0.01538748) (0.01088059)
str(normPars)
## List of 5
##  $ estimate: Named num [1:2] 0.876 0.641
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 0.0154 0.0109
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 0.000237 0 0 0.000118
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 1734
##  $ loglik  : num -1689
##  - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute ex. mean
##      mean 
## 0.8760063
# giving mean of maternal plant height 86.6 cm

Plot normal probability denisty

meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]

xval <- seq(0,max(z$myVar),len=length(z$myVar))

stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$myVar), args = list(mean = meanML, sd = sdML))
p1 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot exponential probability density

expoPars <- fitdistr(z$myVar,"exponential")
rateML <- expoPars$estimate["rate"]

stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$myVar), args = list(rate=rateML))
p1 + stat + stat2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Plot uniform prob density

 tat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$myVar), args = list(min=min(z$myVar), max=max(z$myVar)))
 p1 + stat + stat2 + stat3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot gamma probability denisty

gammaPars <- fitdistr(z$myVar,"gamma")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$myVar), args = list(shape=shapeML, rate=rateML))
 p1 + stat + stat2 + stat3 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Plot beta probability density

 pSpecial <- ggplot(data=z, aes(x=myVar/(max(myVar + 0.1)), y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) + 
  xlim(c(0,1)) +
  geom_density(size=0.75,linetype="dotted")

betaPars <- fitdistr(x=z$myVar/max(z$myVar + 0.1),start=list(shape1=1,shape2=2),"beta")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]

statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$myVar), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).

3.

# the gamma probability density fits the data set the best 

gammaPars <- fitdistr(z$myVar,"gamma")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$myVar), args = list(shape=shapeML, rate=rateML))
 p1 + stat + stat2 + stat3 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

4.

# quick and dirty, a truncated normal distribution to work on the solution set

y <- rnorm(n=1732,mean=0.8661687,0.62406564)
y <- data.frame(1:1732,y)
names(y) <- list("ID","myVar")
y <- y[y$myVar>0,]
str(y)
## 'data.frame':    1593 obs. of  2 variables:
##  $ ID   : int  1 2 3 4 5 7 9 10 11 12 ...
##  $ myVar: num  1.615 1.06 2.216 1.548 0.356 ...
summary(y$myVar)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.001608 0.559115 0.954318 0.977300 1.357213 2.842155

probability denisty curve

gammaPars1 <- fitdistr(y$myVar,"gamma")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shapeML1 <- gammaPars1$estimate["shape"]
rateML1 <- gammaPars1$estimate["rate"]

stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(y$myVar), args = list(shape=shapeML, rate=rateML))
 p1 + stat + stat2 + stat3 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The two graphs look very similar, but the original data set has a bit more variation and the simulated dated looks a lot more uniform than the other. Which makes sense because it is a randomly generated integer sequence.