-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy path8-7_flujos_EN.r
273 lines (167 loc) · 9.19 KB
/
8-7_flujos_EN.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
################################################################################
### R BASICS WORKSHOP ###
### EXERCISE 8-7: Flow control ###
### ###
### Unida de Servicios Bioinformáticos ###
### Instituto Nacional de Medicina Genómica ###
### Website: github.com/hachepunto/R_Basics_workshop ###
################################################################################
## OBJECTIVE:
## Practice the use of 'for' loops "
# One common way to test whether two samples of data come from the same
# statistical population is non-parametric 'bootstrapping'. This technique uses
# samples obtained with replacement from empirical data to re-calculate sample
# statistics, and then create confidence intervals. In this example, we will use
# a very simple bootstrapping procedure to test whether or not two variables
# ('Y1 and Y2') have different relationship with a third variable ('X').
### PART 1. SIMULATING DATA FOR ANALYSES ###
X <- rnorm(100, mean=25, sd=5)
# Creates an object named 'X' with 100 values taken from a normal distribution
# with mean 25 and standard deviation 5
Y1 <- 2 + 4*X + rnorm(length(X), mean=0, sd=5)
# Creates a variable 'Y1' that is a function of 'X'. This is based on a linear
# function where the intercept is 2 and slope is 4.
Y2 <- 10 + 3*X + rnorm(length(X), mean=0, sd=5)
# Creates a variable 'Y2' that is a function of 'X'. This is based on a linear
# function where the intercept is 10 and slope is 3.
Y1.X.lm <- lm(Y1~X)
Y1.X.lm
Y2.X.lm <- lm(Y2~X)
Y2.X.lm
# This produces linear models relating the values of 'Y1' and 'Y2' to the values of 'X'.
plot(Y1~X, pch=21, cex=1.5, col="white", bg="olivedrab3", las=1,
ylim=range(c(Y1, Y2)))
points(Y2~X, pch=21, cex=1.5, col="white", bg="darkorange", las=1)
# Plots the values of 'Y' agains 'X'. As you can see, Y1~X has a steeper slope
# (i.e., rises more quickly)
### PART 2. SETTING UP ###
# Often, before starting a loop or a procedure that depends on a loop, it is
# necessary to define how the loop will work (e.g. the number of iterations),
# and where the results will be stored
bootstrap.n <- 9999
# This defines the number of bootstrap samples that will be used
## TASK 1 ##
# Create an empty matrix that will store the coefficients of each repetition of
# the bootstrapping procedure for the relationship between Y1 and X. It must (1)
# be named Y1.coefficients, (2) have as many rows as repetitions of the
# bootstrapping, (3) have 2 columns to hold results for the intercept and the
# slope, and (4) have row names and column names ("Intercept" and "Slope")
Y2.coefficients <- Y1.coefficients
## This creates a similar matrix that will hold results for Y2~X
### PART 3. ONE BOOTSTRAPPING REPETITION ###
# When some code needs to be run multiple times using a loop, it is sometimes
# easier to start by writing the code as if it was going to be run only once.
# This section has the code to re-sample the data with replacement, and repeat
# the linear models with this 'bootstrpped' data.
to.resample.Y1 <- sample(1:length(Y1), replace=TRUE)
# This samples with replacement values from 1 to the length of Y1. In this
# way, it defines which values will be selected with replacement to repeat
# the linear model.
## TASK 2 ##
# Write the corresponding line of code for Y2
# Creates new Y and X variables that contain the bootstrapped values. Note that
# selected values for X must match those selected for Y:
y1.i <- Y1[to.resample.Y1]
x1.i <- X[to.resample.Y1]
## TASK 3 ##
# Write the corresponding lines of code for Y2
# Repeats the linear models with the 'bootstrapped' data
Y1.X.lm.i <- lm(y1.i~x1.i)
Y2.X.lm.i <- lm(y2.i~x2.i)
# Saves the 'bootstrapped' coefficients in the first row of the empty matrix
Y1.coefficients[1, ] <- Y1.X.lm.i$coefficients
## TASK 4 ##
# Write the corresponding line of code for Y2
### PART 4. CREATING A BOOTSTRAP PROCEDURE USING A FOR LOOP ###
## TASK 5 ##
## Now that you have and understand the steps that need to take place at each
## iteration of this randomization procedure, modify the code and place it
## inside the following 'for loops'. The code needs only minor modifications in
## this case.
for(i in 1:bootstrap.n)
{
print(i)
}
### PART 5. COMPARING THE SLOPES OF Y1 AND Y2 WITH X ###
# Now that you have estimates of the slopes and intercepts from each repetition
# of the bootstrap procedure, you can compare the values obtained for Y1 and for
# Y2 to see if the test returns the right conclusion. The right conclusion is,
# of course, 'yes, they are different'. This is so because we created the data,
# by simulation, and know that the coefficients were different.
## TASK 6 ##
## Use the function quantile to calculate the 2.5% and 97.5% percentiles
## distributions of bootstrapped values for Y1 and Y2. Put those values into two
## objects named Y1.CI and Y2.CI. These can be used as rought confidence
## intervals.
Y1.CI <- quantile(...
Y2.CI <-
par(mfrow=c(1,2))
# Creates a window with panels in 1 row and 2 columns
## TASK 7 ##
## Translate to English (or Spanish) the following lines of code
hist(Y1.coefficients[,"Slope"], main="", xlab="Slope", border="olivedrab3", col=NULL,
xlim=range(c(Y1.coefficients[,"Slope"], Y2.coefficients[,"Slope"])))
hist(Y2.coefficients[,"Slope"], border="darkorange", col=NULL, add=TRUE)
bp.locations <- barplot(c(Y1.X.lm$coefficients[2], Y2.X.lm$coefficients[2]),
ylim=c(0, max(c(Y1.coefficients[,"Slope"], Y2.coefficients[,"Slope"]))),
col=c("olivedrab3", "darkorange"), border=NA, names=c("Y1", "Y2"),
ylab="Slopes (with CIs)")
arrows(x0=bp.locations, y0=c(Y1.CI[1], Y2.CI[1]), y1=c(Y1.CI[2], Y2.CI[2]),
x1=bp.locations, angle=90, length=0.1, lwd=2.5, code=3, col=c("grey50"))
################################################################################
### TASK SOLUTIONS #############################################################
################################################################################
## TASK 1 ##
Y1.coefficients <- matrix(NA, nrow=bootstrap.n, ncol=2)
colnames(Y1.coefficients) <- c("Intercept", "Slope")
rownames(Y1.coefficients) <- paste("rand", 1:bootstrap.n, sep="_")
## TASK 2 ##
to.resample.Y2 <- sample(1:length(Y2), replace=TRUE)
## TASK 3 ##
y2.i <- Y2[to.resample.Y2]
x2.i <- X[to.resample.Y2]
## TASK 4 ##
Y2.coefficients[1, ] <- Y2.X.lm.i$coefficients
### PART 4. CREATING A BOOTSTRAP PROCEDURE USING A FOR LOOP ###
## TASK 5 ##
for(i in 1:bootstrap.n)
{
print(i)
# Samples with replacement
to.resample.Y1 <- sample(1:length(Y1), replace=TRUE)
to.resample.Y2 <- sample(1:length(Y2), replace=TRUE)
# Creates bootstraped Y and X vectors
y1.i <- Y1[to.resample.Y1]
x1.i <- X[to.resample.Y1]
y2.i <- Y2[to.resample.Y2]
x2.i <- X[to.resample.Y2]
# Repeats regressions
Y1.X.lm.i <- lm(y1.i~x1.i)
Y2.X.lm.i <- lm(y2.i~x2.i)
# Saves coefficients
Y1.coefficients[i, ] <- Y1.X.lm.i$coefficients # Note the use of 'i' here
Y2.coefficients[i, ] <- Y2.X.lm.i$coefficients
}
## TASK 6 ##
Y1.CI <- quantile(Y1.coefficients[,"Slope"], probs=c(0.025, 0.975) )
Y2.CI <- quantile(Y2.coefficients[,"Slope"], probs=c(0.025, 0.975) )
## TASK 7 ##
hist(Y1.coefficients[,"Slope"], main="", xlab="Slope", border="olivedrab3", col=NULL,
xlim=range(c(Y1.coefficients[,"Slope"], Y2.coefficients[,"Slope"])))
# Create a histogram of the values in the column named "Slope" of the object
# 'Y1.coefficients'. Use for the limits in the x axis the range of values of
# the columns named "slope" in objects 'Y1.coefficients' and 'Y2.coefficients'
hist(Y2.coefficients[,"Slope"], border="darkorange", col=NULL, add=TRUE)
# Add a histogram of the values in the column named "Slope" of the object
# 'Y2.coefficients'.
bp.locations <- barplot(c(Y1.X.lm$coefficients[2], Y2.X.lm$coefficients[2]),
ylim=c(0, max(c(Y1.coefficients[,"Slope"], Y2.coefficients[,"Slope"]))),
col=c("olivedrab3", "darkorange"), border=NA, names=c("Y1", "Y2"),
ylab="Slopes (with CIs)")
# Create a barplot using the second element in the vectors Y1.X.lm$coefficients
# and Y2.X.lm$coefficients. These correspond to the empirical slopes of the
# regressions. Save the location of the bars in an object named "bp.locations"
arrows(x0=bp.locations, y0=c(Y1.CI[1], Y2.CI[1]), y1=c(Y1.CI[2], Y2.CI[2]),
x1=bp.locations, angle=90, length=0.1, lwd=2.5, code=3, col=c("grey50"))
# Create arrows centered on "bp.locations" and that use Y1.CI and Y2.CI to
# define the endpoints of the arrows. This depicts the confidence limits.