Regular Expressions in R

Extracting text before or after symbol or delimiter

# Example timestamp: the date and the time are separated by a literal "T".
stamp <- "2017-10-16T12:45:04PM"
# time: drop everything up to and including the "T"
# ("T" is an ordinary character, so it is written without a backslash)
sub(".*T", "", stamp)
## [1] "12:45:04PM"
# date: drop the "T" and everything after it
sub("T.*", "", stamp)
## [1] "2017-10-16"

Extracting everything up to the Nth occurrence

string <- "abc-def-ghi-jkl-mno"

# The {k} quantifier is the number of complete "chunk-" groups kept in front of
# the final chunk, so {k} returns everything before the (k + 1)th "-".

# everything before the 1st "-"
sub(pattern = "^(([^-]*-){0}[^-]*).*", replacement = "\\1", x = string)
## [1] "abc"
# everything before the 2nd "-"
sub(pattern = "^(([^-]*-){1}[^-]*).*", replacement = "\\1", x = string)
## [1] "abc-def"
# everything before the 3rd "-"
sub(pattern = "^(([^-]*-){2}[^-]*).*", replacement = "\\1", x = string)
## [1] "abc-def-ghi"

Extracting before or after the first occurrence of symbol or delimiter

string <- "THIS_IS_A_STRING_WITH_A-DASH-SECOND-DASH"
# after the first "-": the lazy ".*?" stops at the earliest "-"
sub(pattern = ".*?-", replacement = "", x = string)
## [1] "DASH-SECOND-DASH"
# before the first "-": take the first run of characters that are not "-"
stringr::str_extract(string = string, pattern = "[^-]+")
## [1] "THIS_IS_A_STRING_WITH_A"
# same result with base sub(): delete from the first "-" to the end
sub(pattern = "-.*", replacement = "", x = string)
## [1] "THIS_IS_A_STRING_WITH_A"

Extract the last occurrence in string with multiple occurrences

# The greedy ".*" runs to the final "_", so only the last piece is left.
sub(pattern = ".*[_]", replacement = "", x = "abc_def_ghi_jkl")
## [1] "jkl"

Extract everything before last occurrence in string

# Remove the final "_" together with the non-"_" characters that end the string.
sub(pattern = "_[^_]+$", replacement = "", x = "abc_def_ghi_jkl")
## [1] "abc_def_ghi"

Extract everything in between two strings

string <- "The quick brown fox jumped over the lazy dog"
# The pattern spans the whole sentence; the capture group holds the text
# between "quick " and " jumped", and the replacement keeps only that group.
gsub(pattern = ".*quick (.+) jumped.*", replacement = "\\1", x = string)
## [1] "brown fox"

Extract hashtags and @handles

string <- "I can't believe @user said #hashtag and #hashtag2"
# stringr is namespace-qualified so this snippet runs without attaching it.
# @handles: "@" followed by one or more word characters
stringr::str_extract_all(string, "@\\w+")
## [[1]]
## [1] "@user"
# hashtags: "#" followed by one or more word characters
stringr::str_extract_all(string, "#\\w+")
## [[1]]
## [1] "#hashtag"  "#hashtag2"

Splitting and pulling data after symbols.

There are also pure regex ways of achieving the same ends, but stringr is so easy.

library(stringr)
# Example URL path; the goal is the segment that follows the 2nd slash.
urlPath <- "/t5/Announcements-and-Info/Bixby-Button-Short-Press-The-choice-is-yours/m-p/168685/highlight/true#M210"

# Take element 3 of each split result: the path starts with "/", so the pieces
# before it are "" and "t5".
str_split(urlPath, "/") |> sapply("[[", 3)
## [1] "Announcements-and-Info"

Remove commas and convert string to numeric

#' Convert comma-grouped number strings to numeric
#'
#' @param x Character vector of numbers that may use "," as a grouping
#'   separator, e.g. "100,954".
#' @return Numeric vector the same length as `x`: each element with its commas
#'   removed and passed through `as.numeric()`.
to_num <- function(x) {
  # fixed = TRUE treats "," as a literal, so base gsub() is enough and no
  # extra package is required for this replacement.
  as.numeric(gsub(",", "", x, fixed = TRUE))
}
stringnum <- "100,954"
to_num(stringnum)
## [1] 100954

Add commas and convert numeric to string

# label_comma() builds a formatter function, which is then applied to the
# number; it is the current replacement for the superseded comma_format().
scales::label_comma()(10000000)
## [1] "10,000,000"

Extract last N and first N characters in string

The str_sub function from the stringr package is useful for this. It takes three arguments (string, start, end). Positive positions count from the first character and negative positions count back from the last character. If “end” is left out, the substring runs to the end of the string; if “start” is left out, it begins at character 1.

st <- "Thisstringhasnospaces"
# last word: a negative start counts back from the end, here the last 6 characters
str_sub(st, start = -6)
## [1] "spaces"
# first word: characters 1 through 4
str_sub(st, start = 1, end = 4)
## [1] "This"
# in between: characters 5 through 10
str_sub(st, start = 5, end = 10)
## [1] "string"

Programmatically inserting a line break every N spaces

Found this useful when working with lots of data and using a function to parse the data and plot using ggplot.

#' Insert a line break after every `n` words
#'
#' @param str Character vector. Words are runs of non-space characters
#'   separated by one or more spaces.
#' @param n Number of words per line; a single positive whole number.
#' @return `str` with the space(s) that follow every `n`th word replaced by a
#'   newline.
strfun <- function(str, n) {
  # Reject values that would otherwise build an invalid regex quantifier
  # (0, negatives, fractions, NA) with a clear error.
  stopifnot(
    is.numeric(n), length(n) == 1L, !is.na(n),
    n >= 1, n == round(n)
  )
  # One word, then (n - 1) further space-separated words, then the separating
  # space(s) that are swapped for the newline. "%d" always prints the count as
  # a plain integer, whereas paste0() may switch to scientific notation.
  pattern <- sprintf("([^ ]+( +[^ ]+){%d}) +", as.integer(n) - 1L)
  gsub(pattern, "\\1\n", str)
}

string <- "As he crossed toward the pharmacy at the corner he involuntarily turned his head because of a burst of light that had ricocheted from his temple..."

# break the sentence after every 8th word
strfun(str = string, n = 8)
## [1] "As he crossed toward the pharmacy at the\ncorner he involuntarily turned his head because of\na burst of light that had ricocheted from\nhis temple..."

If you want to use characters instead…

Adding line break after N characters

string <- "As he crossed toward the pharmacy at the corner he involuntarily turned his head because of a burst of light that had ricocheted from his temple..."

# strwrap() returns one element per wrapped line; join them with newlines
strwrap(string, width = 80) |> paste(collapse = "\n")
## [1] "As he crossed toward the pharmacy at the corner he involuntarily turned his\nhead because of a burst of light that had ricocheted from his temple..."

And if using this on a column in a dataframe

# Wrap every value of `col` at width 20 and rejoin the pieces with newlines.
# vapply() always yields an unnamed character vector; sapply() would attach
# the original text as names and return a list for a zero-row data frame.
df |>
  dplyr::mutate(
    col = vapply(
      col,
      function(x) paste(strwrap(x, width = 20), collapse = "\n"),
      character(1),
      USE.NAMES = FALSE
    )
  )

Counting words in a string

Working with strings and trying to count either words or occurrences.

library(stringr)
# string with 19 whitespace-separated words (a number such as "2,667" is one
# word)
string <- c("MTD Sales 415  2,667 1  2,014  46 24 52 472  3  2,200  2,256 2,963 25 511  207
            274  14,130")

# \\w+ counts runs of word characters, so the comma splits "2,667" in two
str_count(string, '\\w+')
## [1] 25
# count runs of non-whitespace instead, so punctuation inside a word does not
# split it (subtracting the comma count breaks on text like "a, b")
str_count(string, "\\S+")
## [1] 19

Removing trailing and leading punctuation

In this case, we’ll strip out periods before and after.

test <- c(".name.A.", "name.B", ".name.C.")
# Two alternatives: a period at the very start, or a period at the very end.
# Periods inside the name are left alone.
gsub(pattern = "^\\.|\\.$", replacement = "", x = test)
## [1] "name.A" "name.B" "name.C"

Efficiently converting all versions of a word

When working with text, a term often appears in several versions and misspellings.

string <- "i was buffereing but then buffered and buffering on the bluff"
# assuming a consistent word stem: "buffer" plus any lowercase letters that
# follow it is collapsed back to the stem
str_replace_all(string, pattern = "buffer[a-z]+", replacement = "buffer")
## [1] "i was buffer but then buffer and buffer on the bluff"

Iteratively replacing text in a string using a key

# tibble and dplyr are namespace-qualified so the snippet runs without
# attaching either package.
sample_texts <- tibble::tibble(
  text = c(
    "blah-blah-blah-value1_value1-value2_value2",
    "blah _value1-value2"
  )
)

# Lookup table: each `old` fixed string is replaced by the `new` value in the
# same row.
key <- tibble::tibble(
  old = c("-value1", "-value2"),
  new = c("_value1", "_value2")
)

# vectorize_all = FALSE applies every pattern/replacement pair to every
# element of `text`, instead of pairing patterns with elements one-to-one.
sample_texts |>
  dplyr::mutate(
    text = stringi::stri_replace_all_fixed(
      text,
      pattern = key$old,
      replacement = key$new,
      vectorize_all = FALSE
    )
  )
## # A tibble: 2 × 1
##   text                                      
##   <chr>                                     
## 1 blah-blah-blah_value1_value1_value2_value2
## 2 blah _value1_value2

Splitting words on space or other character

string <- "apple: banana: orange: kiwi"
# split on word boundaries; the result below holds only the words, without
# the ": " separators
str_split(string, pattern = boundary("word"))
## [[1]]
## [1] "apple"  "banana" "orange" "kiwi"

Splitting a string and keeping the delimiter

Using the separate() function seems to work well here

# tibble and tidyr are namespace-qualified so the snippet runs without
# attaching either package.
tmp <- tibble::tibble(string = "abc-def-ghi-jkl-mno")
# The lookahead "(?=jkl)" matches the empty position just before "jkl", so the
# split consumes no characters and "jkl" stays at the start of part2.
tmp |>
  tidyr::separate(string, into = c("part1", "part2"), sep = "(?=jkl)")
## # A tibble: 1 × 2
##   part1        part2  
##   <chr>        <chr>  
## 1 abc-def-ghi- jkl-mno

Extracting specific version of a word

Assume we have to find the location of a specific word. In the lines below, a plain search for “string” returns three matches (“strings”, “stringer” and “string”), because each of them contains it.

string <- tibble::tibble(col = c("strings", "stringer", "word", "part", "string"))
# plain substring search: every value that contains "string" matches
string$col[which(str_detect(string$col, "string"))]
## [1] "strings"  "stringer" "string"
# a word boundary (\\b) on both sides keeps only "string" as a whole word, so
# neither "strings" nor a value such as "substring" would match
string$col[which(str_detect(string$col, "\\bstring\\b"))]
## [1] "string"

Extracting all @mentions

# tibble and dplyr are namespace-qualified so the snippet runs without
# attaching either package.
s <- tibble::tibble(text = "this is a string, @mention, with a mention")
s |>
  dplyr::mutate(
    # str_extract_all() returns one character vector of matches per row;
    # collapse each into a single comma-separated string. vapply() guarantees
    # a character column even when there are no rows.
    mentions = vapply(
      str_extract_all(text, "@\\w+"),
      function(x) paste(x, collapse = ", "),
      character(1)
    )
  )
## # A tibble: 1 × 2
##   text                                       mentions
##   <chr>                                      <chr>   
## 1 this is a string, @mention, with a mention @mention

Extracting urls into new column in tibble

# "http://" or "https://" followed by one or more URL characters: letters,
# digits, the listed punctuation, or %XX percent-escapes.
# NOTE(review): "$-_" inside the brackets is a character range ("$" through
# "_"), not three literals - confirm that is intended before tightening it.
url_pattern <- "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
# tibble, dplyr and stringr are namespace-qualified so the snippet runs
# without attaching any of them.
tibble::tibble(
  text = c(
    "content with https://www.example.com url",
    "text in http://www.foo.com",
    "this string has no url"
  )
) |>
  # first URL found in each row; NA when the text contains none
  dplyr::mutate(url = stringr::str_extract(text, url_pattern))
## # A tibble: 3 × 2
##   text                                     url                    
##   <chr>                                    <chr>                  
## 1 content with https://www.example.com url https://www.example.com
## 2 text in http://www.foo.com               http://www.foo.com     
## 3 this string has no url                   <NA>
Taylor Grant
Taylor Grant
Group Director, Strategy & Analytics
Next
Previous