-
Notifications
You must be signed in to change notification settings - Fork 42
/
stringsStuff.Rmd
321 lines (266 loc) · 10.4 KB
/
stringsStuff.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
---
title: "Strings"
author: "João Neto"
date: "May 2013"
output:
  html_document:
    toc: true
    toc_depth: 3
    fig_width: 12
    fig_height: 6
cache: TRUE
---
_This document presents several ways to manipulate strings in R._
Standard functions
------------------
+ paste: pastes vectors together
+ substr: extract/replace substrings in a character vector
+ substring: like substr, but its start/stop arguments are recycled, so a single call can return several substrings
+ strsplit: split the elements into substrings according to the matches (uses regular expressions)
```{r}
# paste(): glue values together; `sep` goes between the arguments,
# `collapse` joins the resulting vector into one single string
paste("Today is ", date())
xs <- 1:7
paste0("A", xs)                                  # paste0() is paste() with sep = ""
paste("A", xs, sep = ",")
paste(letters[1:10], xs, sep = "|")              # the shorter vector (xs) is recycled
paste(letters[1:10], xs, sep = "|", collapse = ",")

cs <- "o mapa nao e o territorio"
paste0("'", cs, "' tem ", nchar(cs), " caracteres")

# substr(): read or overwrite the characters between two positions
substr(cs, 3, 6)
substr(cs, 3, 6) <- "MAPA"
cs

# substring(): same idea, but start/stop (and replacement values) are recycled
substring(cs, 2, 4:6)
xs <- c("ontem", "hoje", "amanha", "depois de amanha")
substring(xs, 2) <- c("XX", "YY", "Z")
xs

# strsplit(): the separator is a regular expression and the result is a list
cs <- "o mapa nao e o territorio"
strsplit(cs, "[oa]")

cs <- paste(letters[1:10], 1:7, sep = "|", collapse = ",")
cs
cs1 <- unlist(strsplit(cs, "[,|]"))              # split on either ',' or '|'
cs1
cs1 <- paste(cs1, collapse = "")                 # and glue the pieces back together
cs1
```
Regular Expressions
-------------------
```{r}
# cs1 is the string assembled at the end of the previous chunk
strsplit(cs1, "[1-9]")    # every digit from 1 to 9 acts as a separator
strsplit("a.b.c", ".")    # an unescaped . matches any character, so everything is a separator
strsplit("a.b.c", "\\.")  # an escaped \\. only matches the literal point

cs <- c("aaa", "abb", "ccc", "dda", "eaa")
sub("a", "X", cs)   # sub() replaces only the first match within each element
gsub("a", "X", cs)  # gsub() replaces all the matches

text.test <- "Evidence for a model (or belief) must be considered against alternative models. Let me describe a neutral (and very simple) example: Assume I say I have Extra Sensorial Perception (ESP) and tell you that the next dice throw will be 1. You throw the dice and I was right. That is evidence for my claim of ESP. However there's an alternative model ('just a lucky guess') that also explains it and it's much more likely to be the right model (because ESP needs much more assumptions, many of those in conflict with accepted facts and theories). This is a subject of statistical inference. It's crucial to consider the alternatives when we want to put our beliefs to the test."

gsub("belief|model", "XXX", text.test)    # replace every 'belief' *or* 'model' with XXX
gsub("t([a-z]*)?t", "XXX", text.test)     # replace two t's and the 0+ lowercase letters between them
gsub("([a-z])\\1", "YY", text.test)       # replace every doubled letter (eg: 'ee', 'll')
gsub("(model)", "*\\1*", text.test)       # wrap every 'model' in asterisks
gsub("(t)(h)", "\\2\\1", text.test)       # swap every 'th' to 'ht'
gsub("([^a-zA-Z])([aA][a-z]+)", "\\1*\\2*", text.test)  # wrap in asterisks the words beginning with 'a'/'A'
gsub("([^a-zA-Z])([a-z]){1,3}([^a-zA-Z])", "\\1ZZZ\\3", text.test)  # replace words of 1 to 3 letters with ZZZ
# {3} means exactly 3 and {3,} means 3 or more

separators <- "[,.: ()']"
tokens <- strsplit(text.test, separators)[[1]]  # tokenize text into words
tokens <- tokens[nzchar(tokens)]                # drop the empty tokens left between adjacent separators
tokens
grep("dice", tokens, fixed = TRUE)  # indexes of the tokens containing 'dice' (literal match, no regex)

string <- "abcedabcfaa"
# append a comma to each letter, then split on the commas: a vector of single characters
cs <- strsplit(gsub("([a-z])", "\\1,", string), ",")[[1]]
grepl("a", cs)  # TRUE where there is a match, FALSE otherwise

# regexpr() gives the position of the first match in each element (-1 if not found);
# the match.length attribute holds the length of that first match
cs <- c("aaa", "axx", "xaa", "axx", "xxx", "xxx")
regexpr("a", cs)
regexpr("a*", cs)  # a* also matches the empty string, hence zero-length matches at position 1

# regexec() gives the position of the whole match and of each parenthesised sub-expression
cs <- c("123ab67", "ab321", "10000", "0", "abc")
regexec("[a-z]*([0-9]+)", cs)
# A more complex example: recover the numbers embedded in a batch of sentences
set.seed(101)
pop.data <- paste("the population is", floor(runif(20, 1e3, 5e4)), "birds")
head(pop.data)

# regexec() locates, for each sentence, the whole match and the captured digits
reg.info <- regexec("the population is ([0-9]*) birds", pop.data)
reg.info[1:3]

# regmatches() turns those positions into c(<whole match>, <captured digits>)
reg.data <- regmatches(pop.data, reg.info)
reg.data[1:3]

# Keep the capture group only. vapply() guarantees a character vector with one
# entry per sentence, whereas the return type of sapply() depends on its input.
bird.population <- vapply(reg.data, function(x) x[2], character(1))
bird.population
```
One more example (based on [this one](http://www.r-bloggers.com/finding-patterns-in-time-series-using-regular-expressions/)). It shows how to draw grey rectangles over the depressions in a time series:
```{r}
set.seed(1303)
# make time-series: a random walk with steps drawn from -2..2
steps <- sample(-2:2, size = 200, prob = c(.1, .2, .2, .4, .1), replace = TRUE)
ts <- cumsum(steps)
plot(ts, type = "l")

# assume we didn't know how ts was made
difs <- as.integer(diff(ts) >= 0)    # 0 if decreased, 1 otherwise
bits <- paste0(difs, collapse = "")  # collapse into a string of bits
bits

# let's signal a consecutive decrease of 2+ time stamps (aka, a depression)
matches <- gregexpr("00+", bits, perl = TRUE)[[1]]
matches
attr(matches, "match.length")  # the length of each matches[i]

# let's plot the time series with the depressions marked as grey rectangles
plot(ts, type = "n")
min.y <- rep(min(ts), length(matches))
max.y <- rep(max(ts), length(matches))
# bit i describes the step from ts[i] to ts[i+1], so a run that starts at bit m
# and is len bits long covers the x-range [m, m + len]
rect(matches, min.y, matches + attr(matches, "match.length"), max.y,
     col = "lightgrey", border = FALSE)
points(ts, type = "l")  # draw the series on top of the rectangles
```
Package `stringr`
----------------
[R Help] _stringr is a set of simple wrappers that make R's string functions more consistent, simpler and easier to use. It does this by ensuring that: function and argument names (and positions) are consistent, all functions deal with NA's and zero length character appropriately, and the output data structures from each function matches the input data structures of other functions_
```{r}
library(stringr)

# --- joining, measuring and repeating strings ---
str1 <- "o mapa"
str2 <- "nao e o territorio"
str3 <- str_c(str1, str2, sep = " ")  # join 2+ strings
str3
str_c(letters, collapse = ", ")
str_length(str3)
str_dup("ab", 5)                      # duplicates strings
str_dup(c("ab", "c"), 3)
str_dup("ab", 1:3)

# --- pattern matching (patterns are regular expressions) ---
str_count(str3, "r")                  # the number of matches
str_detect(str3, "r")                 # verifies if a match exists
str_extract(str3, "[it][eo]+")        # extract the first match
str_extract_all(str3, "[it][eo]+")    # extract all matches
str_locate(str3, "[it][eo]+")         # locate the first match
str_locate_all(str3, "[it][eo]+")     # locate all matches
str_replace(str3, "r", "R")           # replace the first match
str_replace_all(str3, "r", "R")       # replace all matches
str_split(str3, "e")
str_split(str3, "e", n = 2)

# --- substrings: extraction and in-place replacement ---
str_sub(str3, 1, 3)                   # extract substrings
str_sub(str3, seq(1, 24, 2), seq(2, 25, 2))
str4 <- "BBCDEF"
str_sub(str4, 1, 1) <- "A"
str4
str_sub(str4, -1, -1) <- "K"          # negative positions count from the end
str4

# --- capture groups: str_extract() vs str_match() ---
strings <- c(
  " 219 733 8965", "329-293-8753 ", "banana", "595 794 7569",
  "387 287 6718", "apple", "233.398.9187 ", "482 952 3315",
  "239 923 8115", "842 566 4692", "Work: 579-499-7527", "$1000",
  "Home: 543.355.3679"
)
phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})"
str_extract(strings, phone)
str_match(strings, phone)

# --- padding ---
rbind(
  str_pad("hadley", 10, "left"),
  str_pad("hadley", 10, "right"),
  str_pad("hadley", 10, "both")
)

# --- wrapping: reflow the first three paragraphs of R's THANKS file ---
thanks_path <- file.path(R.home("doc"), "THANKS")
thanks <- str_c(readLines(thanks_path), collapse = "\n")
thanks <- word(thanks, 1, 3, fixed("\n\n"))
cat(str_wrap(thanks), "\n")
cat(str_wrap(thanks, width = 40), "\n")
cat(str_wrap(thanks, width = 60, indent = 2), "\n")
cat(str_wrap(thanks, width = 60, exdent = 2), "\n")

# --- word(): pick words out of a sentence ---
sentences <- c("Jane saw a cat", "Jane sat down")
word(sentences, 1)           # Extract words from a sentence.
word(sentences, 2)
word(sentences, -1)
word(sentences, 2, -1)
word(sentences[1], 1:3, -1)  # Also vectorised over start and end
word(sentences[1], 1, 1:4)
str <- "abc.def..123.4568.999"
word(str, 1, sep = fixed(".."))  # Can define words by other separators
word(str, 2, sep = fixed(".."))
```
Package `rex`
----------------
Package `rex` helps construct complex regular expressions at a higher level of abstraction.
```{r}
library(rex)
strings <- c("test", "a test", "abc")

# Without capture groups the result is one logical value per string.
re_matches(strings, rex("t", zero_or_more(".")))

# If there are captures in the regular expression, returns a data.frame with
# a column for each capture group.
re_matches(strings, rex(capture("t"), zero_or_more(".")))
```
`rex()` returns regex objects that can be stored and reused afterwards:
```{r}
# Build the pattern once: an "a" or a "b" followed by a run of non-space
# characters, all inside a single capture group.
re <- rex(
  capture(
    group("a" %or% "b"),
    one_or_more(non_spaces)
  )
)

# `strings` is the vector defined in the previous chunk
re_matches(strings, re)
```
Here's a regex for URL matching from the package's [vignette](https://cran.rstudio.com/web/packages/rex/vignettes/url_parsing.html):
```{r}
# Characters allowed inside a host/domain label: anything except ".", "/", " " and "-".
valid_chars <- rex(except_some_of(".", "/", " ", "-"))
# Pattern for a complete URL, anchored at both ends (start ... end).
re <- rex(
start,
# protocol identifier (http, https or ftp) + ://
# NOTE(review): this group is not wrapped in maybe(), so the protocol is required
group(list('http', maybe('s')) %or% 'ftp', '://'),
# user:pass authentication (optional); the ":pass" part is itself optional
maybe(non_spaces,
maybe(':', zero_or_more(non_space)),
'@'),
# host name: valid characters, with hyphens allowed between them but not at the end
group(zero_or_more(valid_chars, zero_or_more('-')), one_or_more(valid_chars)),
# domain name: zero or more ".label" parts, each following the host-name rule
zero_or_more('.', zero_or_more(valid_chars, zero_or_more('-')), one_or_more(valid_chars)),
# TLD identifier: a dot followed by at least two valid characters
group('.', valid_chars %>% at_least(2)),
# server port number (optional): a colon followed by 2 to 5 digits
maybe(':', digit %>% between(2, 5)),
# resource path (optional): a slash followed by any non-space characters
maybe('/', non_space %>% zero_or_more()),
end
)
```
```{r}
# URLs that the pattern `re` (built in the previous chunk) must accept
good <- c(
  "http://foo.com/blah_blah/",
  "http://foo.com/blah_blah_(wikipedia)",
  "http://www.example.com/wpstyle/?p=364",
  "http://1337.net",
  "http://a.b-c.de",
  "http://223.255.255.254"
)

# Malformed URLs that the pattern must reject
bad <- c(
  "http://",
  "http://.",
  "http://..",
  "http://../",
  "http://?"
)

# grepl() already returns logicals, so there is no need to compare them
# against TRUE/FALSE: both lines print TRUE when the pattern behaves as intended.
all(grepl(re, good))   # every good URL matches
!any(grepl(re, bad))   # no bad URL matches
```
And here is a server-log parsing example from another [vignette](https://cran.rstudio.com/web/packages/rex/vignettes/log_parsing.html):
```{r}
library(dplyr)

# A few sample lines of a web-server access log
logs <- c(
  '199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245',
  'unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] "GET /shuttle/countdown/ HTTP/1.0" 200 3985',
  '199.120.110.21 - - [01/Jul/1995:00:00:09 -0400] "GET /shuttle/missions/sts-73/mission-sts-73.html HTTP/1.0" 200 4085',
  'burger.letters.com - - [01/Jul/1995:00:00:11 -0400] "GET /shuttle/countdown/liftoff.html HTTP/1.0" 304 0',
  '199.120.110.21 - - [01/Jul/1995:00:00:11 -0400] "GET /shuttle/missions/sts-73/sts-73-patch-small.gif HTTP/1.0" 200 4179'
)

# Extract one row per log line with the named captures `time` and `filetype`,
# then lower-case the file type and parse the timestamp.
logs %>%
  re_matches(
    rex(
      # The time of the request is whatever sits between the square brackets
      "[",
      capture(name = "time", except_any_of("]")),
      "]",
      space, double_quote, "GET", space,
      # The file type (only present when a file is requested) is the text
      # following a "." in the requested path
      maybe(
        non_spaces, ".",
        capture(name = "filetype", except_some_of(space, ".", "?", double_quote))
      )
    )
  ) %>%
  mutate(
    filetype = tolower(filetype),
    time = as.POSIXct(time, format = "%d/%b/%Y:%H:%M:%S %z")
  )
```