Regular Expressions in R
Extracting text before or after symbol or delimiter
# Example timestamp: date and time separated by a literal "T"
stamp <- "2017-10-16T12:45:04PM"
# time: drop everything up to and including the "T".
# "T" is not a regex metacharacter, so it is matched unescaped; backslash-
# escaping an ordinary letter is implementation-dependent.
sub(".*T", "", stamp)
## [1] "12:45:04PM"
# date: drop the "T" and everything after it
sub("T.*", "", stamp)
## [1] "2017-10-16"
Extracting everything up to the Nth occurrence
string <- "abc-def-ghi-jkl-mno"
# The {k} quantifier consumes k complete "field-" groups, so capture group 1
# holds the first k + 1 dash-separated fields; the trailing ".*" discards the
# rest of the string.
# k = 0: everything before the 1st dash
sub(pattern = "^(([^-]*-){0}[^-]*).*", replacement = "\\1", x = string)
## [1] "abc"
# k = 1: everything before the 2nd dash
sub(pattern = "^(([^-]*-){1}[^-]*).*", replacement = "\\1", x = string)
## [1] "abc-def"
# k = 2: everything before the 3rd dash
sub(pattern = "^(([^-]*-){2}[^-]*).*", replacement = "\\1", x = string)
## [1] "abc-def-ghi"
Extracting before or after the first occurrence of symbol or delimiter
string <- "THIS_IS_A_STRING_WITH_A-DASH-SECOND-DASH"
# after the first "-": the lazy ".*?" stops at the earliest dash
sub(pattern = ".*?-", replacement = "", x = string)
## [1] "DASH-SECOND-DASH"
# before the first "-": take the first run of non-dash characters
stringr::str_extract(string = string, pattern = "[^-]+")
## [1] "THIS_IS_A_STRING_WITH_A"
# same result with base R: delete from the first dash onward
sub(pattern = "-.*", replacement = "", x = string)
## [1] "THIS_IS_A_STRING_WITH_A"
Extract the last occurrence in string with multiple occurrences
# The greedy ".*" runs to the final underscore, so only the last field is left
sub(pattern = ".*[_]", replacement = "", x = "abc_def_ghi_jkl")
## [1] "jkl"
Extract everything before last occurrence in string
# Remove the final underscore together with the underscore-free text after it
sub(pattern = "_[^_]+$", replacement = "", x = "abc_def_ghi_jkl")
## [1] "abc_def_ghi"
Extract everything in between two strings
string <- "The quick brown fox jumped over the lazy dog"
# The whole string is matched and replaced by capture group 1, i.e. the text
# sitting between "quick " and " jumped".
gsub(pattern = ".*quick (.+) jumped.*", replacement = "\\1", x = string)
## [1] "brown fox"
Extract hashtags and @handles
string <- c("I can't believe @user said #hashtag and #hashtag2")
# Calls are namespace-qualified so this snippet runs even when stringr has not
# been attached yet.
# @handles: an "@" followed by one or more word characters
stringr::str_extract_all(string, "@\\w+")
## [[1]]
## [1] "@user"
# hashtags: a "#" followed by one or more word characters
stringr::str_extract_all(string, "#\\w+")
## [[1]]
## [1] "#hashtag" "#hashtag2"
Splitting and pulling data after symbols.
There are also pure regex ways of achieving the same ends, but stringr
is so easy.
library(stringr)
# assume a url with the following path, and we want to pull out the info after the 2nd slash
urlPath <- "/t5/Announcements-and-Info/Bixby-Button-Short-Press-The-choice-is-yours/m-p/168685/highlight/true#M210"
# The path starts with "/", so the first piece is "" and the segment after the
# 2nd slash is piece 3. vapply() guarantees exactly one character value per
# input path, whereas sapply()'s return type depends on its input.
vapply(str_split(urlPath, "/"), function(parts) parts[[3]], character(1))
## [1] "Announcements-and-Info"
Remove commas and convert string to numeric
#' Convert comma-grouped number strings to numeric
#'
#' @param x Character vector such as "100,954".
#' @return The result of `as.numeric()` applied to `x` with every literal
#'   comma removed.
to_num <- function(x) {
  without_commas <- stringi::stri_replace_all_fixed(x, ",", "")
  as.numeric(without_commas)
}
stringnum <- "100,954"
to_num(stringnum)
## [1] 100954
Add commas and convert numeric to string
# label_comma() is the current replacement for the superseded comma_format();
# it returns a formatter function, which is then applied to the number.
scales::label_comma()(10000000)
## [1] "10,000,000"
Extract last N and first N characters in string
The str_sub function from the stringr package is useful for this. It takes three arguments (string, start, end). “start” defaults to 1 and “end” defaults to -1 (the last character), and negative positions count backwards from the end of the string. So a single negative “start”, such as -6, returns the last six characters, while a “start” of 1 with a positive “end” returns the first characters of the string.
st <- "Thisstringhasnospaces"
# Last word: a negative start counts back from the end of the string
str_sub(string = st, start = -6)
## [1] "spaces"
# First word: characters 1 through 4
str_sub(string = st, start = 1, end = 4)
## [1] "This"
# Middle word: characters 5 through 10
str_sub(string = st, start = 5, end = 10)
## [1] "string"
Programmatically inserting a line break every N spaces
Found this useful when working with lots of data and using a function to parse the data and plot using ggplot.
#' Replace the spaces after every n-th word with a line break
#'
#' @param str Character vector to re-flow.
#' @param n Number of space-separated words to keep on each line.
#' @return `str` with the run of spaces following each group of `n` words
#'   replaced by "\n".
strfun <- function(str, n) {
  # One word, then (n - 1) further " word" repetitions, then the spaces that
  # get swapped for the line break.
  every_n_words <- paste0("([^ ]+( +[^ ]+){", n - 1, "}) +")
  gsub(pattern = every_n_words, replacement = "\\1\n", x = str)
}
string <- "As he crossed toward the pharmacy at the corner he involuntarily turned his head because of a burst of light that had ricocheted from his temple..."
strfun(string, 8)
## [1] "As he crossed toward the pharmacy at the\ncorner he involuntarily turned his head because of\na burst of light that had ricocheted from\nhis temple..."
If you want to use characters instead…
Adding line break after N characters
string <- "As he crossed toward the pharmacy at the corner he involuntarily turned his head because of a burst of light that had ricocheted from his temple..."
# strwrap() breaks the text into pieces for the given width; the pieces are
# then glued back together with newline separators.
string |>
  strwrap(width = 80) |>
  paste(collapse = "\n")
## [1] "As he crossed toward the pharmacy at the corner he involuntarily turned his\nhead because of a burst of light that had ricocheted from his temple..."
And if using this on a column in a dataframe
# Wrap every value of `col` at width 20 and re-join the pieces with "\n".
# vapply() enforces exactly one string per row, and USE.NAMES = FALSE keeps the
# original text from being attached to the result as element names.
df |>
  dplyr::mutate(
    col = vapply(
      col,
      function(x) paste(strwrap(x, 20), collapse = "\n"),
      character(1),
      USE.NAMES = FALSE
    )
  )
Counting words in a string
Useful when working with strings and counting either words or occurrences of a pattern.
library(stringr)
# a string containing 19 whitespace-separated tokens
string <- c("MTD Sales 415 2,667 1 2,014 46 24 52 472 3 2,200 2,256 2,963 25 511 207
274 14,130")
# counting runs of word characters overcounts: a comma is not a word
# character, so "2,667" is counted as two runs
str_count(string, "\\w+")
## [1] 25
# count runs of non-whitespace instead, so "2,667" stays one token; this also
# stays correct for text such as "a, b", where subtracting the number of
# commas from the \\w+ count would undercount
str_count(string, "\\S+")
## [1] 19
Removing trailing and leading punctuation
In this case, we’ll strip out periods before and after.
test <- c(".name.A.", "name.B", ".name.C.")
# Two alternatives: a period at the very start, or a period at the very end;
# periods inside the value are left alone.
gsub(pattern = "^\\.|\\.$", replacement = "", x = test)
## [1] "name.A" "name.B" "name.C"
Efficiently converting all versions of a word
Useful when working with text that contains different versions and misspellings of a term.
string <- "i was buffereing but then buffered and buffering on the bluff"
# Relies on a shared stem: "buffer" plus one or more lowercase letters is
# collapsed back to the bare stem.
str_replace_all(
  string = string,
  pattern = "buffer[a-z]+",
  replacement = "buffer"
)
## [1] "i was buffer but then buffer and buffer on the bluff"
Iteratively replacing text in a string using a key
# Texts that contain the "old" spellings to be replaced
sample_texts <- tibble(
  text = c(
    "blah-blah-blah-value1_value1-value2_value2",
    "blah _value1-value2"
  )
)
# Lookup table: each `old` literal is swapped for the `new` one in the same row
key <- tibble(
  old = c("-value1", "-value2"),
  new = c("_value1", "_value2")
)
# vectorize_all = FALSE applies every pattern/replacement pair to every text,
# instead of pairing patterns with texts element by element.
sample_texts |>
  mutate(
    text = stringi::stri_replace_all_fixed(
      text,
      pattern = key[["old"]],
      replacement = key[["new"]],
      vectorize_all = FALSE
    )
  )
## # A tibble: 2 × 1
## text
## <chr>
## 1 blah-blah-blah_value1_value1_value2_value2
## 2 blah _value1_value2
Splitting words on space or other character
string <- "apple: banana: orange: kiwi"
# Split on word boundaries rather than on a fixed delimiter
str_split(string = string, pattern = boundary("word"))
## [[1]]
## [1] "apple" "banana" "orange" "kiwi"
Splitting a string and keeping the delimiter
Using the separate()
function seems to work well here
tmp <- tibble(string = "abc-def-ghi-jkl-mno")
# "(?=jkl)" is a zero-width lookahead: the split happens just before "jkl"
# without consuming any characters, so nothing is dropped.
tmp |>
  separate(
    col = string,
    into = c("part1", "part2"),
    sep = "(?=jkl)"
  )
## # A tibble: 1 × 2
## part1 part2
## <chr> <chr>
## 1 abc-def-ghi- jkl-mno
Extracting specific version of a word
Assume we have to find the location of a specific word. In the line below, if we search for “string” we get three matches, because the pattern also matches “strings” and “stringer”.
string <- tibble(col = c("strings", "stringer", "word", "part", "string"))
# Unanchored pattern: matches every value that contains "string"
string$col[which(str_detect(string$col, "string"))]
## [1] "strings" "stringer" "string"
# Word boundaries on BOTH sides keep only the whole word "string"; a trailing
# boundary alone would still accept values such as "substring".
string$col[which(str_detect(string$col, "\\bstring\\b"))]
## [1] "string"
Extracting all @mentions
s <- tibble(text = "this is a string, @mention, with a mention")
# str_extract_all() returns one character vector of matches per row; each is
# collapsed into a single comma-separated string. vapply() keeps the new
# column character even when there are no rows to process.
s |>
  mutate(
    mentions = vapply(
      str_extract_all(text, "@\\w+"),
      function(x) paste(x, collapse = ", "),
      character(1)
    )
  )
## # A tibble: 1 × 2
## text mentions
## <chr> <chr>
## 1 this is a string, @mention, with a mention @mention
Extracting urls into new column in tibble
# Matches "http://" or "https://" followed by one or more allowed characters
# or %XX escapes.
# NOTE(review): inside the bracket class, `$-_` is a character range (from "$"
# through "_"), not three literal characters.
url_pattern <- "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
# str_extract() keeps the first match per row and yields NA where none exists
tibble(
  text = c(
    "content with https://www.example.com url",
    "text in http://www.foo.com",
    "this string has no url"
  )
) |>
  mutate(url = str_extract(string = text, pattern = url_pattern))
## # A tibble: 3 × 2
## text url
## <chr> <chr>
## 1 content with https://www.example.com url https://www.example.com
## 2 text in http://www.foo.com http://www.foo.com
## 3 this string has no url <NA>