# Draw 100 observations from the exponential distribution (rexp's default
# rate of 1), keep the sample mean, and echo it.
x1 <- rexp(100)
m.x1 <- mean(x1)
print(m.x1)
## [1] 1.047794
# Sample standard deviation of x1 (square root of the sample variance),
# stored and then echoed.
sd.x1 <- sqrt(var(x1))
print(sd.x1)
## [1] 1.072888
Mean: 1.0477942
Standard Deviation: 1.0728881
# Four more samples of 100 exponential draws, one per rate parameter.
# The draws are made in the same order so the random stream is consumed
# identically.
x0.1 <- rexp(100, rate = 0.1)
x0.5 <- rexp(100, rate = 0.5)
x5   <- rexp(100, rate = 5)
x10  <- rexp(100, rate = 10)
x0.1: Mean = 10.6665701, SD = 11.0362542
x0.5: Mean = 2.3187474, SD = 1.9451221
x5: Mean = 0.1754572, SD = 0.1372515
x10: Mean = 0.0972927, SD = 0.0841676
# Summaries of the five exponential samples, ordered by rate
# (x1 was drawn with rexp's default rate of 1).
rate.values <- c(0.1, 0.5, 1, 5, 10)
exp.samples <- list(x0.1, x0.5, x1, x5, x10)
sample.means <- vapply(exp.samples, mean, numeric(1))
sample.sds <- vapply(exp.samples, sd, numeric(1))

# Sample mean and sample SD as functions of the rate.
plot(rate.values, sample.means, pch = 2,
     xlab = 'Rate', ylab = 'Mean', main = 'Mean vs Rate')
plot(rate.values, sample.sds, pch = 2,
     xlab = 'Rate', ylab = 'SD', main = 'SD vs Rate')

# SD goes on the x-axis and the mean on the y-axis so the plotted data agree
# with the axis labels and the title (passing the means as x and the SDs as y
# would mislabel both axes).
plot(sample.sds, sample.means, pch = 2,
     xlab = 'SD', ylab = 'Mean', main = 'Mean vs SD')
\(E[X] = \frac{1}{\lambda}\) and \(Var[X] = \frac{1}{\lambda^2}\), so \(\sqrt{Var[X]} = \frac{1}{\lambda} = E[X]\); hence \(E[X]\) vs \(\sqrt{Var[X]}\) follows a linear trend (the line \(y = x\))
# One large sample of 1,100,000 exponential draws (rexp's default rate of 1).
y <- rexp(1100000)
Mean of \(y\): 0.9988759 SD of \(y\): 0.9982769
# Frequency histogram of the large sample, using hist's default binning.
hist(x = y)
\(y\) is a sample drawn from the exponential distribution with rate 1, and hence its histogram matches the density of that distribution, \(e^{-x}\)
# Density-scaled histogram of y with the theoretical Exp(1) density on top.
# The bins are computed first (without plotting) so the y-axis can be made
# tall enough for both the bars and the curve's maximum, dexp(0).
h <- hist(y, plot = FALSE)
ylim <- range(0, h$density, dexp(0))
hist(y, freq = FALSE, ylim = ylim)
# curve() takes scalar 'from'/'to' limits, not the data vector; draw the
# density over the span of the histogram bins.
curve(dexp(x), from = 0, to = max(h$breaks), add = TRUE)

# Reference plot of e^(-x) on [0, 15]. Base graphics does not interpret
# LaTeX "$...$" markup, so the labels are plotmath expressions.
x.grid <- seq(0, 15, 0.1)
plot(x.grid, exp(-x.grid),
     main = expression(e^{-x} ~ "vs" ~ x),
     xlab = expression(x), ylab = expression(y))
# Arrange the 1,100,000 draws column by column into a 1100 x 1000 matrix,
# then echo the mean of column 371.
y.mat <- matrix(data = y, nrow = 1100, ncol = 1000)
print(mean(y.mat[, 371]))
## [1] 1.013
# Mean of each of the 1000 columns of y.mat.
col.m <- colMeans(y.mat)

# Density-scaled histogram of the column means with a normal density on top.
# The normal curve uses the sample mean and SD of the column means; a
# standard normal (mean 0, sd 1) would sit far from the data and be far too
# flat to be visible on the same axes.
mu.hat <- mean(col.m)
sigma.hat <- sd(col.m)
h <- hist(col.m, plot = FALSE)
# Make room for the peak of the fitted normal density, which is at its mean.
ylim <- range(0, h$density, dnorm(mu.hat, mean = mu.hat, sd = sigma.hat))
hist(col.m, freq = FALSE, ylim = ylim)
# curve() takes scalar 'from'/'to' limits, not the data vector; with
# add = TRUE and no limits given it spans the current plot's x-range.
curve(dnorm(x, mean = mu.hat, sd = sigma.hat), add = TRUE)
The shape of the column means does not match that of Problem 2 because of the central limit theorem, which states that the distribution of the mean of a large number of independent observations (here, the 1100 values in each column) is approximately normal
# Mean of only those entries of y.mat that exceed 1.
above.one <- y.mat > 1
print(mean(y.mat[above.one]))
## [1] 1.999857
# Load the raw temperature records; the first row of the CSV holds the
# column names.
temp.data <- read.csv("Temperature.csv", header = TRUE)

# Parse DateNr as month/day/year.
# NOTE(review): confirm DateNr really is month/day/year rather than
# day/month/year -- strings that do not fit the format become NA, and a
# swapped order would file readings under the wrong month.
temp.data$DateNr <- as.Date(temp.data$DateNr, format = "%m/%d/%Y")

# Two-digit month and four-digit year of each reading, as text.
month <- format(temp.data$DateNr, "%m")
year <- format(temp.data$DateNr, "%Y")

# Working table: month and year as factors, plus the station and temperature
# columns copied over unchanged.
temp.df <- data.frame(
  month = as.factor(month),
  year = as.factor(year),
  Station = temp.data$Station,
  temperature = temp.data$Temperature,
  stringsAsFactors = FALSE
)
head(temp.df)
## month year Station temperature
## 1 10 1990 DANT 4.0
## 2 06 1990 DANT 6.0
## 3 08 1990 DANT 7.3
## 4 04 1990 DANT 8.2
## 5 09 1990 DANT 17.4
## 6 06 1990 DANT 18.1
# Mean temperature for every year/month combination; na.rm = TRUE is passed
# through to mean().
agg <- aggregate(
  temperature ~ year + month,
  data = temp.df,
  FUN = mean,
  na.rm = TRUE
)
head(agg)
## year month temperature
## 1 1990 01 6.788889
## 2 1991 01 6.600000
## 3 1992 01 6.013953
## 4 1993 01 8.511111
## 5 1994 01 6.147059
## 6 1995 01 7.902857
# Keep the monthly means as a plain vector, then print the full
# year/month/temperature table.
tempeture.list <- agg[["temperature"]]
print(agg[c("year", "month", "temperature")])
## year month temperature
## 1 1990 01 6.788889
## 2 1991 01 6.600000
## 3 1992 01 6.013953
## 4 1993 01 8.511111
## 5 1994 01 6.147059
## 6 1995 01 7.902857
## 7 1996 01 4.038462
## 8 1997 01 5.416000
## 9 1998 01 9.254324
## 10 1999 01 9.961724
## 11 2000 01 8.637727
## 12 2001 01 7.035714
## 13 2002 01 10.958636
## 14 2003 01 7.190741
## 15 2004 01 9.739167
## 16 2005 01 8.729143
## 17 1990 02 8.475000
## 18 1991 02 10.207143
## 19 1992 02 6.212500
## 20 1993 02 5.750000
## 21 1994 02 7.489189
## 22 1995 02 10.011429
## 23 1996 02 4.086364
## 24 1997 02 10.378182
## 25 1998 02 10.388333
## 26 1999 02 7.201600
## 27 2000 02 7.294865
## 28 2001 02 9.698947
## 29 2002 02 12.540385
## 30 2003 02 9.150000
## 31 2004 02 7.656818
## 32 2005 02 9.139355
## 33 1990 03 8.330769
## 34 1991 03 8.220000
## 35 1992 03 8.747826
## 36 1993 03 7.324242
## 37 1994 03 10.267347
## 38 1995 03 8.495385
## 39 1996 03 5.297222
## 40 1997 03 6.688200
## 41 1998 03 10.344444
## 42 1999 03 8.962500
## 43 2000 03 8.725490
## 44 2001 03 7.328378
## 45 2002 03 10.792128
## 46 2003 03 8.006786
## 47 2004 03 8.062955
## 48 2005 03 7.916500
## 49 1990 04 8.777419
## 50 1991 04 9.218182
## 51 1992 04 10.968333
## 52 1993 04 9.685714
## 53 1994 04 9.011905
## 54 1995 04 10.160000
## 55 1996 04 6.688235
## 56 1997 04 8.038293
## 57 1998 04 11.072727
## 58 1999 04 11.264138
## 59 2000 04 11.291333
## 60 2001 04 8.908235
## 61 2002 04 10.095111
## 62 2003 04 9.900816
## 63 2004 04 10.198491
## 64 2005 04 10.725918
## 65 1990 05 12.775758
## 66 1991 05 8.166667
## 67 1992 05 11.140313
## 68 1993 05 13.000000
## 69 1994 05 13.661538
## 70 1995 05 11.135135
## 71 1996 05 9.795312
## 72 1997 05 12.238462
## 73 1998 05 13.465116
## 74 1999 05 14.098378
## 75 2000 05 14.380909
## 76 2001 05 12.855172
## 77 2002 05 13.092093
## 78 2003 05 14.544118
## 79 2004 05 12.832250
## 80 2005 05 13.721176
## 81 1990 06 13.361290
## 82 1991 06 11.088889
## 83 1992 06 15.712069
## 84 1993 06 15.340741
## 85 1994 06 13.222222
## 86 1995 06 12.572917
## 87 1996 06 14.558621
## 88 1997 06 15.856154
## 89 1998 06 15.580000
## 90 1999 06 15.377321
## 91 2000 06 14.906923
## 92 2001 06 14.370750
## 93 2002 06 14.962667
## 94 2003 06 17.653333
## 95 2004 06 15.159000
## 96 2005 06 15.702692
## 97 1990 07 15.877143
## 98 1991 07 15.838889
## 99 1992 07 14.817544
## 100 1993 07 15.163415
## 101 1994 07 15.886441
## 102 1995 07 15.657143
## 103 1996 07 17.524242
## 104 1997 07 18.232982
## 105 1998 07 15.273778
## 106 1999 07 18.252000
## 107 2000 07 16.334894
## 108 2001 07 17.797200
## 109 2002 07 17.302041
## 110 2003 07 18.684694
## 111 2004 07 16.724909
## 112 2005 07 17.469459
## 113 1990 08 16.892308
## 114 1991 08 16.489286
## 115 1992 08 13.987500
## 116 1993 08 12.525536
## 117 1994 08 16.296154
## 118 1995 08 17.843860
## 119 1996 08 17.036508
## 120 1997 08 18.162222
## 121 1998 08 15.752500
## 122 1999 08 16.624792
## 123 2000 08 18.524043
## 124 2001 08 18.885500
## 125 2002 08 17.885455
## 126 2003 08 17.482857
## 127 2004 08 15.713750
## 128 2005 08 16.060000
## 129 1990 09 14.446154
## 130 1991 09 12.973333
## 131 1992 09 12.848039
## 132 1993 09 13.252037
## 133 1994 09 13.223382
## 134 1995 09 15.120000
## 135 1996 09 13.481034
## 136 1997 09 15.949583
## 137 1998 09 14.495000
## 138 1999 09 18.681364
## 139 2000 09 15.409459
## 140 2001 09 13.563158
## 141 2002 09 16.697838
## 142 2003 09 15.632609
## 143 2004 09 14.387222
## 144 2005 09 16.324286
## 145 1990 10 12.666667
## 146 1991 10 11.809091
## 147 1992 10 11.894615
## 148 1993 10 13.316667
## 149 1994 10 12.417647
## 150 1995 10 13.741463
## 151 1996 10 14.212000
## 152 1997 10 13.689000
## 153 1998 10 8.994375
## 154 1999 10 13.111842
## 155 2000 10 12.390465
## 156 2001 10 13.095246
## 157 2002 10 11.058649
## 158 2003 10 11.120244
## 159 2004 10 13.198525
## 160 2005 10 13.311389
## 161 1990 11 11.070968
## 162 1991 11 8.824444
## 163 1992 11 11.472927
## 164 1993 11 9.250000
## 165 1994 11 12.021951
## 166 1995 11 11.784615
## 167 1996 11 10.106727
## 168 1997 11 12.714565
## 169 1998 11 8.992941
## 170 1999 11 7.147619
## 171 2000 11 10.396981
## 172 2001 11 11.143158
## 173 2002 11 9.605217
## 174 2003 11 9.591622
## 175 2004 11 12.233158
## 176 2005 11 11.864054
## 177 1990 12 7.913636
## 178 1991 12 9.121622
## 179 1992 12 8.122188
## 180 1993 12 8.975610
## 181 1994 12 11.083636
## 182 1995 12 11.168889
## 183 1996 12 8.547500
## 184 1997 12 9.422000
## 185 1998 12 9.570000
## 186 1999 12 9.077955
## 187 2000 12 8.494400
## 188 2001 12 9.220488
## 189 2002 12 8.426596
## 190 2003 12 9.460000
## 191 2004 12 10.121579
## 192 2005 12 10.462500
# Number of readings per station, as a two-column data frame (Var1, Freq).
station.tab <- table(temp.df$Station)
count <- as.data.frame(station.tab)
print(count)
## Var1 Freq
## 1 DANT 300
## 2 DREI 293
## 3 G6 278
## 4 GROO 296
## 5 HAMM 295
## 6 HANS 309
## 7 HUIB 296
## 8 LODS 294
## 9 MARS 296
## 10 N02 402
## 11 N10 665
## 12 N20 266
## 13 N70 268
## 14 R03 161
## 15 R50 106
## 16 R70 106
## 17 SOEL 295
## 18 T004 339
## 19 T010 261
## 20 T100 258
## 21 T135 259
## 22 T175 258
## 23 T235 258
## 24 VLIS 421
## 25 W02 272
## 26 W20 191
## 27 W70 190
## 28 WISS 296
## 29 ZIJP 296
## 30 ZUID 303
# Order the stations from most to fewest readings (ordering on the negated
# counts) and keep the first ten rows.
by.freq <- order(-count$Freq)
sorted <- count[by.freq, ]
top10 <- sorted[seq_len(10), ]
Top 10 stations (those with the most readings):
# Show the ten stations with the most readings.
print(x = top10)
## Var1 Freq
## 11 N10 665
## 24 VLIS 421
## 10 N02 402
## 18 T004 339
## 6 HANS 309
## 30 ZUID 303
## 1 DANT 300
## 4 GROO 296
## 7 HUIB 296
## 9 MARS 296
# Names of the ten stations with the most readings. They are taken from
# top10; sorted holds every station, so sorted$Var1 would not be a top-10
# list at all.
top10.stations <- top10$Var1

# Mean temperature per station, year, and month.
agg <- aggregate(temperature ~ Station + year + month, data = temp.df, FUN = mean)
head(agg)
## Station year month temperature
## 1 HAMM 1990 01 5.800000
## 2 HANS 1990 01 5.900000
## 3 LODS 1990 01 5.400000
## 4 N10 1990 01 8.766667
## 5 VLIS 1990 01 6.200000
## 6 WISS 1990 01 5.900000
To draw the time series in the following part, I simply aggregate by year (since including the month leads to too many data points on the x-axis)
library(ggplot2)
# Mean temperature per station and year.
agg <- aggregate(temperature ~ Station + year, data = temp.df, FUN = mean)

# year is a factor in temp.df; as.numeric() on a factor returns the internal
# level codes (1, 2, ...), so convert through character to get the actual
# calendar years on the x-axis.
agg$year <- as.numeric(as.character(agg$year))

# One line per station, coloured by station.
ggplot(agg, aes(x = year, y = temperature, color = factor(Station))) +
  geom_line()