Web Scraping

Last updated on Jan 26, 2025 rvest, scraping

Table of Contents

Scraping Instagram

Profile Data

This is a function to scrape the user data and the last 12 posts from that user. Set this up as a recurring cron job and it could scrape multiple users and store the data. R code is adapted from this post, which was written in python. We’re also using the rjsoncons package, which allows for JMESpath subsetting of JSON data. Since we’re adding headers to the GET request, we’ll use the httr package, rather than rvest.

library(httr)
library(jsonlite)

# function to parse data and put into list 
parse_user <- function(data) {
  # get data on the account
  account_data <- rjsoncons::jmespath(data, "data.user.{
        name: full_name,
        username: username,
        id: id,
        category: category_name,
        business_category: business_category_name,
        phone: business_phone_number,
        email: business_email,
        bio: biography,
        bio_links: bio_links[].url,
        homepage: external_url,        
        followers: edge_followed_by.count,
        follows: edge_follow.count,
        facebook_id: fbid,
        is_private: is_private,
        is_verified: is_verified,
        profile_image: profile_pic_url_hd,
        video_count: edge_felix_video_timeline.count,
        image_count: edge_owner_to_timeline_media.count,
        saved_count: edge_saved_media.count,
        related_profiles: edge_related_profiles.edges[].node.username
                                   
}"
  ) |> 
    fromJSON()
  
  # get post data
  post_data <- rjsoncons::jmespath(data, "data.user.edge_owner_to_timeline_media.edges[].node.{
         post_date: taken_at_timestamp, 
         typename: __typename,
         type: is_video,
         id: id,
         shortcode: shortcode,
         thumbnail: thumbnail_src,
         video_url: video_url,
         caption: edge_media_to_caption.edges[].node.text,
         video_viewcount: video_view_count,
         comment_count: edge_media_to_comment.count,
         like_count: edge_liked_by.count,
         location: location.name,
         music_artist: clips_music_attribution_info.artist_name,
         song_name: clips_music_attribution_info.song_name
}"
  ) |> 
    fromJSON() |> 
    mutate(post_date = as.POSIXct(post_date, origin = "1970-01-01"))
  
  list(account_data = account_data, post_data = post_data)
}

scrape_user <- function(username) {
  INSTAGRAM_APP_ID <- "936619743392459" # this is the public app id for instagram.com
  url <- paste0("https://i.instagram.com/api/v1/users/web_profile_info/?username=", username)
  response <- GET(
    url = url,
    add_headers(`x-ig-app-id` = INSTAGRAM_APP_ID)
  )
  data <- content(response, "text")
  parse_user(data)
}

out <- scrape_user('bmw')

Two lists are saved. The first is the account data.

out$account_data[c(1,2,3,5,8,11:12,17,18)] |> 
  cbind() |> 
  data.frame() |> 
  set_names(nm = "") |> 
  knitr::kable()


name	BMW
username	bmw
id	43109246
business_category	Auto Dealers
bio	The official #BMW account, home of Sheer Driving Pleasure.
Use #BMWrepost for the chance to get featured.
followers	37361042
follows	76
video_count	49
image_count	10594

The second is the post data that we’ve pulled.

out$post_data |> 
  head(3) |> 
  knitr::kable()

post_date	typename	type	id	shortcode	caption	video_viewcount	comment_count	like_count	location	music_artist	song_name
2023-09-05 09:12:49	GraphVideo	TRUE	3185240436572707198	Cw0P00KIFl-	Taking over our hometown in our home colours 💙🤍 #NeueKlasse #IAA23 #THEVisionNeueKlasse #TheNeueNew	678386	813	164169	Max-Joseph-Platz	ottocarclub	Original audio
2023-09-07 01:00:45	GraphVideo	TRUE	3186442829185025149	Cw4hN7iqgR9	Every decision is an act of creation, shaping the path of our journey. THE NEUE NEW. The BMW Vision Neue Klasse. #NeueKlasse #IAA23 #THEVisionNeueKlasse #TheNeueNew	325513	256	40401	NA	bmw	Original audio
2023-09-08 10:34:30	GraphVideo	TRUE	3187455988330466509	Cw8HlVhoQzN	BMW faces its toughest critic yet @liamcarps1 🫣 Shop Liam’s look via the link in bio. #NeueKlasse #IAA23 #THEVisionNeueKlasse #TheNeueNew #BMW	1540983	959	121633	NA	bmw	Original audio

Scraping Post Data

We can also scrape post level data. The wrapper function is written in R, but a lot of the work is done via Python scripts. All that the user has to provide is the user_name and number of pages to return. Each page contains 12 posts.

scrape_instagram <- function(user_name, max_pages) {
  
  pacman::p_load(tidyverse, janitor, here, glue, reticulate)
  
  if (max_pages > 10) {
    sleep_time <- 5L
  } else {
    sleep_time <- 2L
  }
  
  # LOAD FUNCTIONS ----------------------------------------------------------
  # R FUNCTIONS
  source("~/R/scrapper/instagram/R/ig_scrape_user.R") # scrapes the user
  source("~/R/scrapper/instagram/R/ig_parse_user.R") # parsed the user data into a list 
  source("~/R/scrapper/instagram/R/ig_clean_posts.R")
  # PYTHON FUNCTIONS
  source_python("~/R/scrapper/instagram/python/ig_parse_post.py")
  source_python("~/R/scrapper/instagram/python/ig_scrape_single_post.py")
  source_python("~/R/scrapper/instagram/python/ig_scrape_user_posts_detail.py")
  
  # 1. GET USER DATA --------------------------------------------------------
  # this is necessary to get the user ID 
  user <- ig_scrape_user(user_name)
  parsed_user <- ig_parse_user(user)
  
  # 2. GET POST DATA --------------------------------------------------------
  httpx <- import("httpx") # Create a Python session using httpx
  session <- httpx$Client(timeout = httpx$Timeout(20.0)) # Create an HTTPX session object in Python
  post_df <- ig_scrape_user_posts_detail(
    user_id = parsed_user$account_data$id,
    session = session,
    page_size = 12L,
    max_pages = max_pages,            # Optional: Limit the number of pages to scrape
    sleep_time = sleep_time
  )
  session$close()
  # 3. CLEAN THE DATA AND RETURN  -------------------------------------------
  clean_posts <- ig_clean_posts(post_df)
  return(clean_posts)
}

out <- scrape_instagram("bmw", max_pages = 2)
out[c(1:2, 15:16)]

##              post_date                                     url
## 1  2025-01-16 08:01:09 https://www.instagram.com/p/DE5HUT1oKn-
## 2  2025-01-16 05:01:14 https://www.instagram.com/p/DE4yuAJK_T2
## 15 2025-01-23 08:02:51 https://www.instagram.com/p/DFLItEVtT_Y
## 16 2025-01-23 07:20:07 https://www.instagram.com/p/DFLENULOfx1
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 caption
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      We shine in the dark ✨\n\nThe BMW i7. 100% electric. \n#THEi7 #BMW #BMWElectric
## 2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        What we mean when we say self-care 🧽\n\nThe BMW i7. 100% electric. \n#THEi7 #BMW #BMWElectric
## 15 Signed, sealed, delivered. \n\nThe BMW M5 with the @redbulldriftbrothers sideways at @bmwwelt.\n\nWatch the full reel on the channel now. \n\n#BMW #BMWM #BMWWELT #THEM5 #BMWIndividual\n\nThe BMW M5 Sedan: Mandatory information according to German law ’Pkw-EnVKV’ based on WLTP: energy consumption weighted combined: 26.8 kWh/100 km and 1.9 l/100 km; CO₂ emissions weighted combined: 43 g/km; CO2 classes: with discharged battery G; weighted combined B; Fuel consumption with depleted battery combined: 10.7 l/100km; electric range: 66 km\n\nPaint finish shown: BMW Individual special paint Speed Yellow solid. Check the link in bio for the BMW Individual Visualizer to see more BMW Individual colour options.
## 16                                                                                                                                                             Being bold will never go old. \n\nShop the BMW M oversized brown t-shirts and Erlkönig gilet from @bmwlifestyle via the link in bio and in select dealerships.\n\n#THEM5 #BMWLifestyle #BMWM #BMW \n\nThe new BMW M5 Sedan: Mandatory information according to German law ’Pkw-EnVKV’ based on WLTP: energy consumption weighted combined: 26.8 kWh/100 km and 1.9 l/100 km; CO₂ emissions weighted combined: 43 g/km; CO2 classes: with discharged battery G; weighted combined B; Fuel consumption with depleted battery combined: 10.7 l/100km; electric range: 63 km
##     views   plays  likes comments video_duration                   collaborators
## 1  473077 2131529 194182     1787          8.811                                
## 2  580173 2712296  60728      477         16.939                                
## 15 204202 1289461  75870      555         21.199 bmw, bmwm, redbulldriftbrothers
## 16    NaN     NaN 153243      668            NaN                                
##                                         hashtags
## 1                     #THEi7, #BMW, #BMWElectric
## 2                     #THEi7, #BMW, #BMWElectric
## 15 #BMW, #BMWM, #BMWWELT, #THEM5, #BMWIndividual
## 16            #THEM5, #BMWLifestyle, #BMWM, #BMW
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             media_link
## 1    https://scontent-sjc3-1.cdninstagram.com/o1/v/t16/f2/m86/AQOg1Ky9xbfVECX3bvhFUtdBHKv7o9mX-1dgqCp4K31XZS7G5Xz4KJsC0qNnaLu3H1fTz7W2mpzOvtSV88_SZej34JjJ5E3IW7RstqM.mp4?stp=dst-mp4&efg=eyJxZV9ncm91cHMiOiJbXCJpZ193ZWJfZGVsaXZlcnlfdnRzX290ZlwiXSIsInZlbmNvZGVfdGFnIjoidnRzX3ZvZF91cmxnZW4uY2xpcHMuYzIuNzIwLmJhc2VsaW5lIn0&_nc_cat=110&vs=568456509361022_967454477&_nc_vs=HBksFQIYUmlnX3hwdl9yZWVsc19wZXJtYW5lbnRfc3JfcHJvZC9FQjQwNkNFMDczMTREOEE5QkVBNDQxMjgyQ0E1Rjk4MV92aWRlb19kYXNoaW5pdC5tcDQVAALIAQAVAhg6cGFzc3Rocm91Z2hfZXZlcnN0b3JlL0dHemVNeHhBcUxpMDVOa0VBSF8yTUNjanMxUWRicV9FQUFBRhUCAsgBACgAGAAbABUAACaKipbwtoSyPxUCKAJDMywXQCGZmZmZmZoYEmRhc2hfYmFzZWxpbmVfMV92MREAdf4HAA%3D%3D&ccb=9-4&oh=00_AYA-VCvkAD7yK3W9Oc5RImyqafn6Vu_kinT2rnsaBLVZyQ&oe=6798A66E&_nc_sid=d885a2
## 2   https://scontent-sjc3-1.cdninstagram.com/o1/v/t16/f2/m86/AQOmrQsFDZZcfiDCWNGWgZaxSumLzvmxoGqXk-14EXiOGUHo4THo8dic5zt_6uizcKkf-P6tLqEHT7Vis_SCnGhz8wrXy2jX_OZuh-Q.mp4?stp=dst-mp4&efg=eyJxZV9ncm91cHMiOiJbXCJpZ193ZWJfZGVsaXZlcnlfdnRzX290ZlwiXSIsInZlbmNvZGVfdGFnIjoidnRzX3ZvZF91cmxnZW4uY2xpcHMuYzIuNzIwLmJhc2VsaW5lIn0&_nc_cat=111&vs=925850083087482_1767228440&_nc_vs=HBksFQIYUmlnX3hwdl9yZWVsc19wZXJtYW5lbnRfc3JfcHJvZC9COTQ3NTkyNEMzNjk1QzQ2MTEzMDBBREM5NzA1QkQ5MV92aWRlb19kYXNoaW5pdC5tcDQVAALIAQAVAhg6cGFzc3Rocm91Z2hfZXZlcnN0b3JlL0dEcUZNaHgwdWVkMUdaa0JBT01vOGxWa3VtZ2xicV9FQUFBRhUCAsgBACgAGAAbABUAACaGh5nIpfuwQBUCKAJDMywXQDDu2RaHKwIYEmRhc2hfYmFzZWxpbmVfMV92MREAdf4HAA%3D%3D&ccb=9-4&oh=00_AYBd4DrNHTbUKoKDMh5ousP_wmiCa1Fv3V0XYhDWMZ3F-w&oe=67988DF3&_nc_sid=d885a2
## 15 https://scontent-sjc3-1.cdninstagram.com/o1/v/t16/f2/m86/AQNcBzoBGCcWdW4Wc8gXRLaSaFXTUgo_nUls-3F2a6H7e_KM6HVU5jFoFDIGJFMIqbOj4y--VeYlpVs3e_228p7ylmJtAHU4Z_X-KbQ.mp4?stp=dst-mp4&efg=eyJxZV9ncm91cHMiOiJbXCJpZ193ZWJfZGVsaXZlcnlfdnRzX290ZlwiXSIsInZlbmNvZGVfdGFnIjoidnRzX3ZvZF91cmxnZW4uY2xpcHMuYzIuNzIwLmJhc2VsaW5lIn0&_nc_cat=108&vs=2151982308533041_4058955520&_nc_vs=HBksFQIYUmlnX3hwdl9yZWVsc19wZXJtYW5lbnRfc3JfcHJvZC8wRTQ1QjBEOUE2RDFBNjI4MUUxMzhCQkY2Q0MzMDVCNl92aWRlb19kYXNoaW5pdC5tcDQVAALIAQAVAhg6cGFzc3Rocm91Z2hfZXZlcnN0b3JlL0dMVVhUeHp5V2YybHVTa0dBRnVEek42Z21sczVicV9FQUFBRhUCAsgBACgAGAAbABUAACa8s9DKqa6OQBUCKAJDMywXQDUwIMSbpeMYEmRhc2hfYmFzZWxpbmVfMV92MREAdf4HAA%3D%3D&ccb=9-4&oh=00_AYCcg77xLKB8JoLR3OdoZsTr5w3eBYiPGe34kctSvAn5Hw&oe=67989FE7&_nc_sid=d885a2
## 16                                                                                                                                                                                                                                                                                                                                                                                                               https://scontent-sjc3-1.cdninstagram.com/v/t51.2885-15/474754627_1126112715679825_3972589647703166490_n.jpg?stp=dst-jpg_e35_tt6&_nc_ht=scontent-sjc3-1.cdninstagram.com&_nc_cat=101&_nc_ohc=Xc5NHDoXH_EQ7kNvgEIXpvB&_nc_gid=58de59e0a6c44786a7c07aa205b8be98&edm=ANTKIIoBAAAA&ccb=7-5&oh=00_AYApN-DbGKz8LdRtp0MVAZzjJ0s9J91ScnAKNsqdPjDFJA&oe=679C90EF&_nc_sid=d885a2
##            link_expiry
## 1  2025-01-28 01:42:06
## 2  2025-01-27 23:57:39
## 15 2025-01-28 01:14:15
## 16 2025-01-31 00:59:27

Scraping hidden data

In this example I show how when there is an identifiable script, that we can scrape the javascript from a dynamically loaded site. But what to do when this hidden data is loaded via a generic script? When this occurs, we have to do some parsing. It can be a pain, but being able to do this correctly opens a whole new world of data availability.

As an example, we’re going to use the Home Depot state by state store location page. In this case, for the state of Alabama. This data is actually easily scrapable using rvest and css selection, but pulling the JSON is far more convenient, and often yields additional information.

After loading the page, we go to devtools and search for part of the first address. As can be seen, the script isn’t identified, but we see the window.__APOLLO_STATE__:. We’re going to use this little bit to find the JSON data.

The first thing we’re going to do is use the httr package to request the page and then parse it.

# Define the URL
url <- "https://www.homedepot.com/l/AL"

# Send a GET request to the URL
response <- GET(url)

# Check if the request was successful
(http_status(response)$category == "Success")
  
# Parse the HTML content from the response
html_content <- content(response, as = "text", encoding = "UTF-8")

Now that we have our content in text format, we have to find that script identifier. The start_pos variable is finding the location after the window.__APOLLO_STATE__= text is found. The end_pos_relative is looking for the location of the end of the JSON. In this case, becasue the data is nested, we’re looking for 3 end brackets. This will change depending on the structure of the data.

# Find the position of the substring "window.__APOLLO_STATE__="
start_pos <- str_locate(html_content, "window.__APOLLO_STATE__=")[2] + 1
    
# If the start position is found, proceed to find the end position and extract the JSON string
end_substring <- str_sub(html_content, start = start_pos)
end_pos_relative <- str_locate(end_substring, "\\}\\}\\}")

Now we adjust our end position relative to where our JSON data started, extract the data, and parse it with the jsonlite::fromJSON() function.

# Assuming we found the end_pos, adjust end_pos relative to start_pos 
end_pos <- end_pos_relative[2] + start_pos - 1
        
# Extract the JSON string using substring
json_str <- substr(html_content, start_pos, end_pos)
        
# Parse the JSON string to a list
json_data <- fromJSON(json_str, flatten = TRUE)

Because our data is nested, it’s going to parse as a list. Calling the data, we can see that our data can be found in the following location.

stores_df <- json_data$ROOT_QUERY$`storeDirectoryByState({"state":"AL"})`$storesInfo |> 
  select(storeName, address.street, address.city, address.state, 
         address.postalCode, address.county, url, phone)

head(stores_df)

##      storeName        address.street address.city address.state address.postalCode address.county
## 1     W Mobile  755 Schillinger Rd S       Mobile            AL              36695         Mobile
## 2        Foley    2899 S Mckenzie St        Foley            AL              36535        Baldwin
## 3 W Huntsville  4045 Lawson Ridge Dr      Madison            AL              35757        Madison
## 4 N Huntsville 1035 Memorial Pkwy Nw   Huntsville            AL              35801        Madison
## 5       Pelham      3191 Pelham Pkwy       Pelham            AL              35124         Shelby
## 6   Prattville  2710 Legends Parkway   Prattville            AL              36066         Elmore
##                                                                url         phone
## 1         https://www.homedepot.com/l/W-Mobile/AL/Mobile/36695/801 (251)634-0351
## 2             https://www.homedepot.com/l/Foley/AL/Foley/36535/802 (251)955-2401
## 3    https://www.homedepot.com/l/W-Huntsville/AL/Madison/35757/803 (256)837-6658
## 4 https://www.homedepot.com/l/N-Huntsville/AL/Huntsville/35801/804 (256)536-2216
## 5           https://www.homedepot.com/l/Pelham/AL/Pelham/35124/805 (205)685-1837
## 6   https://www.homedepot.com/l/Prattville/AL/Prattville/36066/806 (334)285-1693

Scraping YouTube

YouTube has an API, but it can be rate limited pretty quickly. The following will get performance metrics and all comments from each video, without running into any limit.

Set up

Within a conda environment created via the reticulate package, install yt-dlp (more detail).

reticulate::conda_install("[ENV Name]", "yt-dlp", pip = TRUE)

Next, write the python function that will call the yt-dlp library and pull the video metrics and comments for any url that’s passed to it.

def youtube_dl_info(URL):
  import json
  import yt_dlp

  # See help(yt_dlp.YoutubeDL) for a list of available options and public functions
  ydl_opts = {'getcomments':True}
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
      info = ydl.extract_info(URL, download=False)
      return(info)

After that, write quick function to clean the urls that we’re going to read in.

yt_clean <- function(file_loc) {
  tmp <- readr::read_csv(file_loc) |> 
    dplyr::filter(stringr::str_detect(links, "^https://www.youtube.com/watch?")) |> 
    dplyr::distinct(links) |> 
    dplyr::pull()
}

Because I prefer to work in R, I need to get the data out of the python environment and into R. From there, I’m going to clean the data. The data will be split into two different dataframes - the summary metrics and the comments.

# to get data out of the returned lists # 
youtube_extract <- function(datalist) {
  # get summary data about the video
  summary <- purrr::map_dfr(datalist, 
                            magrittr::extract, 
                            c("uploader", "upload_date", "title", "description", "categories",
                              "duration_string", "view_count", 
                              "like_count", "comment_count", "original_url")) |> 
    dplyr::rename(duration = duration_string) |> 
    dplyr::mutate(upload_date = lubridate::ymd(upload_date))
  # get the comments
  x <- purrr::map(datalist, "comments")
  # drop empty lists (no comments)
  x <- Filter(length, x)
  
  # loop over comments and put into dataframe
  comments <- NULL
  for (i in seq_len(nrow(summary))) {
    print(i)
    tmp <- x[[i]] %>% 
      data.table::rbindlist(fill = TRUE) |> 
      dplyr::as_tibble() |> 
      dplyr::mutate(timestamp = lubridate::as_datetime(as.numeric(timestamp, tz = "America/Los_Angeles"))) |>
      dplyr::select(c(text, timestamp, like_count, author)) |> 
      dplyr::mutate(video = summary$title[i],
                    timestamp = as.Date(timestamp)) |> 
      dplyr::rename(date = timestamp)
    comments <- rbind(comments, tmp)
    tmp <- NULL
  }
  out <- list(summary = summary, comments = comments)
}

Finally, we write a wrapper function that will run the above functions and save the output into .rds and .xlsx files.

Everything in this function is referenced by the here() function. The directory structure looks like the following:

├── R
│   └── helpers
├── py_functions
├── youtube_report
│   └── youtube_data
├── yt_caption_data
└── yt_link_data

# YouTube metrics and comments # 

# 1. start with LinkGopher to extract all urls
# 2. add links to a csv with the column name "links"
# 3. save csv as "[handle_name]_yt_links.csv" in the "yt_link_data" folder
# 4. run this function 

yt_scrape_fn <- function(handle) {
  # load packages
  pacman::p_load(tidyverse, janitor, here, glue)
  # load functions
  reticulate::source_python(here::here('youtube', 'py_functions', 'youtube_dl_info.py'))
  source(here('youtube','R','helpers', 'helpers.R'))
  source(here('youtube','R', "youtube_extract.R"))
  
  loc <- here('youtube','yt_link_data', glue("{handle}_yt_links.csv"))
  urls <- yt_clean(loc)
  
  # map over function and save to environment for safe processing
  tmpdata <<- urls |>
    purrr::map(youtube_dl_info)
  
  # clean and add to reactive values
  out <- youtube_extract(tmpdata)
  
  outlist <- list(summary = out$summary, comments = out$comments)
  saveRDS(outlist, here("youtube","youtube_report", "youtube_data", glue::glue("yt_{handle}_{Sys.Date()}.rds")))
  
  openxlsx::write.xlsx(outlist, here("youtube","youtube_report", "youtube_data", glue::glue("yt_{handle}_{Sys.Date()}.xlsx")))
}

Getting the link data

We have to pass links into the function and to get them, we’re going to use the Link Gopher extension to pull all urls on the YouTube page of the account of interest. Then copy and paste the data and save as a csv.

Run it

As written, the script will save the data.

yt_scrape_fn("tesla")

And when the data is loaded, it looks like this

## # A tibble: 3 × 10
##   uploader upload_date title  description categories duration view_count like_count comment_count original_url
##   <chr>    <date>      <chr>  <chr>       <chr>      <chr>         <int>      <int>         <int> <chr>       
## 1 Tesla    2023-01-21  Tesla… "Tesla is … Autos & V… 2:17         619217      21844          1586 https://www…
## 2 Tesla    2023-03-31  Lars … "Engineere… Autos & V… 2:45         319640      15934          1089 https://www…
## 3 Tesla    2023-09-25  Tesla… "Optimus c… Autos & V… 1:18          28591       5882           729 https://www…

## # A tibble: 6 × 5
## # Groups:   video [3]
##   text                                                                      date       like_count author video
##   <chr>                                                                     <date>          <int> <chr>  <chr>
## 1 From Lars to Lars. Nice Explanation 👍                                    2023-09-24         NA @lars… Lars…
## 2 When you drill into the crash test data, it becomes apparent that the Mo… 2023-09-11         NA @cras… Lars…
## 3 And my car still can't drive itself halfway through the city in a light … 2023-09-25         NA @ne0n… Tesl…
## 4 The fact that it moves it whole body, hips included when moving the bloc… 2023-09-25         NA @Wify… Tesl…
## 5 I love to see them working very hard to replace every human on earth, ro… 2023-09-04         NA @Ahme… Tesl…
## 6 Please build artificial, noiseless, space- and energy-saving muscles int… 2023-08-25         NA @gk... Tesl…

Scraping TikTok

Code below can be used to scrape three different aspects of TikTok data - Profile data, Post data, Comments and comment data.

Profile data

The first thing we’ll scrape is the Profile data, which is found in a JavaScript script tag. The data is pretty comprehensive and is returned in a JSON format. A function to pull the data is below.

get_tt_user <- function(handle) {
  pacman::p_load(tidyverse)
  handle <- sub("^@", "", handle) # drop @ if it's there
  link <- glue::glue("https://www.tiktok.com/@{handle}?lang=en")

  # setting headers
  headers = c(
    `Accept` = '*/*',
    `Accept-Language` = 'en-US,en;q=0.9',
    `Connection` = 'keep-alive',
    `Content-type` = 'application/x-www-form-urlencoded',
    `DNT` = '1',
    `Referer` = "https://www.google.com/",
    `Sec-Fetch-Dest` = 'empty',
    `Sec-Fetch-Mode` = 'cors',
    `Sec-Fetch-Site` = 'same-origin',
    `User-Agent` = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
    `cache-control` = 'no-cache',
    `pragma` = 'no-cache',
    `sec-ch-ua` = '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
    `sec-ch-ua-mobile` = '?0',
    `sec-ch-ua-platform` = '"macOS"'
  )

  # GET request
  res <- httr::GET(link, httr::add_headers(.headers = headers))
  if (res$status_code != 200) {
    stop(paste("Error: Received status code", res$status_code))
  }

  tmpout <- httr::content(res) |>
    rvest::html_elements("script#__UNIVERSAL_DATA_FOR_REHYDRATION__") |>
    rvest::html_text() |>
    jsonlite::fromJSON()

  out <- tmpout$`__DEFAULT_SCOPE__`$`webapp.user-detail`$userInfo

  # build out a table of user data
  tibble::tibble(name = out$user$nickname,
         handle = out$user$uniqueId,
         signature = out$user$signature,
         verified = out$user$verified,
         biolink = out$user$bioLink$link,
         region = out$user$region,
         followers = out$stats$followerCount,
         following = out$stats$followingCount,
         hearts = out$stats$heartCount,
         videos = out$stats$videoCount,
         diggs = out$stats$diggCount,
         friends = out$stats$friendCount,
         date_pulled = Sys.Date())
}

Let’s run it.

adre <- get_tt_user("addisonre")
adre |> 
  knitr::kable()

name	handle	signature	verified	biolink	region	followers	following	hearts	videos	diggs	friends	date_pulled
Addison Rae	addisonre	The world is my oyster 🧜🏼‍♀️	TRUE	https://addisonrae.lnk.to/Aquamarine	US	88600000	22	926500000	320	0	11	2025-01-26

Post data

Similar to profile data, post-level data can be found in a hidden script. The function to pull post level data is below.

get_tt_posts <- function(link){
  # setting headers
  headers = c(
    `Accept` = '*/*',
    `Accept-Language` = 'en-US,en;q=0.9',
    `Connection` = 'keep-alive',
    `Content-type` = 'application/x-www-form-urlencoded',
    `DNT` = '1',
    `Referer` = "https://www.google.com/",
    `Sec-Fetch-Dest` = 'empty',
    `Sec-Fetch-Mode` = 'cors',
    `Sec-Fetch-Site` = 'same-origin',
    `User-Agent` = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
    `cache-control` = 'no-cache',
    `pragma` = 'no-cache',
    `sec-ch-ua` = '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
    `sec-ch-ua-mobile` = '?0',
    `sec-ch-ua-platform` = '"macOS"'
  )
  
  # GET request
  res <- httr::GET(link, httr::add_headers(.headers = headers))
  if (res$status_code != 200) {
    stop(paste("Error: Received status code", res$status_code))
  }
  # content and pull script
  tmpout <- httr::content(res) |>
    rvest::html_elements("script#__UNIVERSAL_DATA_FOR_REHYDRATION__") |>
    rvest::html_text() |>
    jsonlite::fromJSON()
  
  out <- tmpout$`__DEFAULT_SCOPE__`$`webapp.video-detail`$itemInfo$itemStruct
  # build out a table of user data
  tibble::tibble(id = out$id, 
                 post_text = out$desc,
                 created_time = out$createTime,
                 duration = out$video$duration,
                 author_id = out$author$id,
                 author_uniqueID = out$author$uniqueId,
                 author_nickname = out$author$nickname,
                 author_signature = out$author$signature,
                 author_verified = out$author$verified,
                 like_count = out$stats$diggCount,
                 share_count = out$stats$shareCount,
                 comment_count = out$stats$commentCount,
                 play_count = out$stats$playCount,
                 collect_count = out$stats$collectCount,
                 diversification_labels = paste(out$diversificationLabels, collapse = ", "),
                 suggested_words = paste(out$suggestedWords, collapse = ", ")) |>  
    dplyr::mutate(hashtags = stringr::str_extract_all(post_text, "#\\S+"),
                  at_mentions = stringr::str_extract_all(post_text, "@\\S+"),
                  created_time = lubridate::as_datetime(as.integer(created_time), tz = "America/Los_Angeles"))
}

To run it, just provide a link

single_post <- get_tt_posts(link = "https://www.tiktok.com/@addisonre/video/7363016297233354027")
single_post |> 
  knitr::kable()

id	post_text	created_time	duration	author_id	author_uniqueID	author_nickname	author_signature	author_verified	like_count	share_count	comment_count	play_count	collect_count	diversification_labels	suggested_words	hashtags	at_mentions
7363016297233354027	Got married at stagecoach. thank u @airbnb 🙏🏼	2024-04-28 13:24:14	30	6703550784929793030	addisonre	Addison Rae	The world is my oyster 🧜🏼‍♀️	TRUE	485400	12300	1442	30100000	14988	Romance, Relationship, Family & Relationship	Addison Rae, addison rae boyfriend, addison rae mom, addison rae scream, stagecoach, addison, addison rae and omer, Abby And Addison Rae, Stagecoach Outfits, josh richards addison rae		@airbnb

Comment data (This is broken for the time being….)

Finally, we’ll pull the comments from a post. Comments aren’t found in a hidden script, but are loaded dynamically through hidden APIs as a user scrolls. We’re going to call the hidden API directly, and iterate through all of the comments to pull them. We can get 50 comments per API call, and one of the API parameters is cursor, which identifies where in the number of comments we want to start to pull our next 50.

There are several functions that will all be used in our convenience wrapper function. We’re going to use the same post as above.

parse_comments <- purrr::possibly(function(l){
  # putting comment data into a cleaned tibble
  tibble::tibble(
    text = l$comments$text,
    comment_language = l$comments$comment_language,
    digg_count = l$comments$digg_count,
    reply_comment_count = l$comments$reply_comment_total,
    author_pin = l$comments$author_pin,
    create_time = l$comments$create_time,
    cid = l$comments$cid,
    nickname = l$comments$user$nickname,
    unique_id = l$comments$user$unique_id,
    post_id = l$comments$aweme_id
  ) |> 
    dplyr::mutate(create_time = lubridate::as_datetime(create_time, tz = "America/Los_Angeles"))
}, otherwise = NA_character_)

scrape_comments <- function(post_id, cursor) {
  # setting headers
  headers = c(
    `Accept` = '*/*',
    `Accept-Language` = 'en-US,en;q=0.9',
    `Connection` = 'keep-alive',
    `Content-type` = 'application/x-www-form-urlencoded',
    `DNT` = '1',
    `Referer` = "https://www.google.com/",
    `Sec-Fetch-Dest` = 'empty',
    `Sec-Fetch-Mode` = 'cors',
    `Sec-Fetch-Site` = 'same-origin',
    `User-Agent` = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
    `cache-control` = 'no-cache',
    `pragma` = 'no-cache',
    `sec-ch-ua` = '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
    `sec-ch-ua-mobile` = '?0',
    `sec-ch-ua-platform` = '"macOS"'
  )
  # build url and first pull 
  base_url <- "https://www.tiktok.com/api/comment/list/?"
  params <- list(
    aweme_id = post_id,
    count = 50,
    cursor = cursor
  )
  # GET request
  res <- httr::GET(base_url, query = params, httr::add_headers(.headers = headers))
  # parse data depending on cursor location
  if (cursor == 0) {
    tmp <- httr::content(res, "text") |> 
      jsonlite::fromJSON()
    
    out <- list(comment_data = parse_comments(tmp), total_comments = tmp$total)
  } else {
    tmp <- httr::content(res, "text") |> 
      jsonlite::fromJSON()
    
    out <- parse_comments(tmp)
  }
}

fetch_tt_comments <- function(post_id) {
  tmp <- scrape_comments(post_id, cursor = 0)
  # get cursor starting locations
  cursors <- (1:floor(tmp$total_comments/50)*50)+1
  # map across cursor starts
  out <- purrr::map2(post_id, cursors, scrape_comments)
  out <- out[!is.na(out)]
  out <- dplyr::bind_rows(out)
  # bind all pulls together
  rbind(tmp$comment_data, out)
}

We’ll run it and provide the first 5 comments.

fetch_tt_comments("7363016297233354027") |> 
  head(5) |> 
  knitr::kable()

rvest scraping