Data pipelines for R/Pharma

data-pipelines

Structure

Rebuilding

The data (and this page) are rebuilt automatically on the cron schedule specified in the GitHub Actions workflow.

Run the pipeline with targets::tar_make()

Check the pipeline with targets::tar_manifest(fields = all_of("command"))

Visualize the pipeline with targets::tar_visnetwork()

graph LR
  style Legend fill:#FFFFFF00,stroke:#000000;
  style Graph fill:#FFFFFF00,stroke:#000000;
  subgraph Legend
    direction LR
    xf1522833a4d242c5([""Up to date""]):::uptodate --- xd03d7c7dd2ddda2b([""Stem""]):::none
    xd03d7c7dd2ddda2b([""Stem""]):::none --- xeb2d7cac8a1ce544>""Function""]:::none
  end
  subgraph Graph
    direction LR
    xe6fae3abc4970767(["data_processed_talks"]):::uptodate --> x846d45454e9823b7(["write_data_to_files"]):::uptodate
    x986bd675a516b5b3(["data_processed_team"]):::uptodate --> x846d45454e9823b7(["write_data_to_files"]):::uptodate
    x4a9bd41924e0fdb4>"write_data"]:::uptodate --> x846d45454e9823b7(["write_data_to_files"]):::uptodate
    xdd04dc2490030259>"get_data"]:::uptodate --> xeb364ad4c7efc3d3(["get_gsheet_data"]):::uptodate
    xe1268667a4ed1fd8>"build_team"]:::uptodate --> x986bd675a516b5b3(["data_processed_team"]):::uptodate
    xeb364ad4c7efc3d3(["get_gsheet_data"]):::uptodate --> x986bd675a516b5b3(["data_processed_team"]):::uptodate
    x24d9c683daf7bfc6>"build_talks"]:::uptodate --> xe6fae3abc4970767(["data_processed_talks"]):::uptodate
    xeb364ad4c7efc3d3(["get_gsheet_data"]):::uptodate --> xe6fae3abc4970767(["data_processed_talks"]):::uptodate
    xa0ae033e33afa834>"build_workshops"]:::uptodate --> xa0ae033e33afa834>"build_workshops"]:::uptodate
    x97fb05a8a7947628>"build_proceedings"]:::uptodate --> x97fb05a8a7947628>"build_proceedings"]:::uptodate
  end
  classDef uptodate stroke:#000000,color:#ffffff,fill:#354823;
  classDef none stroke:#000000,color:#000000,fill:#94a4ac;
  linkStyle 0 stroke-width:0px;
  linkStyle 1 stroke-width:0px;
  linkStyle 10 stroke-width:0px;
  linkStyle 11 stroke-width:0px;

Output

The current files are available in both Parquet and CSV formats.

arrow::read_parquet(
  "https://rinpharma.github.io/data-pipelines/output/processed_talks.parquet"
  ) |>
  dplyr::glimpse()
Rows: 602
Columns: 16
$ ID                <chr> "rinpharma_1", "rinpharma_2", "rinpharma_3", "rinpha…
$ Event             <chr> "2018 Conference", "2018 Conference", "2018 Conferen…
$ Abstract          <chr> NA, NA, NA, NA, NA, "Lilliam will be presenting a pe…
$ Type              <chr> "Workshop", "Workshop", "Workshop", "Workshop", "Rem…
$ APAC              <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Year              <chr> "2018", "2018", "2018", "2018", "2018", "2018", "201…
$ Date              <dttm> 2018-08-15, 2018-08-15, 2018-08-15, 2018-08-15, 201…
$ Start             <chr> "08:00:00", "08:00:00", "08:00:00", "08:00:00", "09:…
$ End               <chr> "09:15:00", "09:15:00", "09:15:00", "09:15:00", "09:…
$ Speaker           <chr> "Marianna Foos", "Adrian Waddell", "Daniel Lee", "De…
$ Affiliation       <chr> "Biogen", "Roche", "Generable", "Metrum Research Gro…
$ Title             <chr> "Keeping things Peachy when Shiny gets Hairy", "Anal…
$ Slides            <chr> NA, NA, NA, NA, NA, NA, NA, NA, "https://github.com/…
$ Video             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Abstract_Sanitzed <chr> NA, NA, NA, NA, NA, "Lilliam will be presenting a pe…
$ Missing_Content   <chr> "Unfortunately we do not currently have an abstract,…
arrow::read_parquet(
  "https://rinpharma.github.io/data-pipelines/output/processed_team.parquet"
  ) |>
  dplyr::glimpse()
Rows: 44
Columns: 8
$ Name              <chr> "James Black", "Paulo Bargo", "Phil Bowsher", "Ellis…
$ Company           <chr> "Novartis", "Novartis", "Posit", "GlaxoSmithKline", …
$ EUROPE_US_OC      <lgl> TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, TRU…
$ `EX-EUROPE_US_OC` <lgl> FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALS…
$ APAC_OC           <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
$ `EX-APAC_OC`      <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
$ github            <chr> "epijim", NA, "philbowsher", "thebioengineer", "harv…
$ linkedin          <chr> "epijim", "paulo-bargo-phd-10590830", "philip-bowshe…

Setup

Code used to create the Google Sheets (gsheets) token.

# Generate credentials for gsheet access

pw_name <- gargle:::secret_pw_name("googlesheets4")
pw <- gargle:::secret_pw_gen()

Add a line of the form <pw_name>=<pw> (using the two values generated above) to .Renviron, e.g. via usethis::edit_r_environ()

Encrypt the service account token (which I created via the Google developer console)
gargle:::secret_write(
  package = "googlesheets4",
  name = "rinpharma-4ac2ad6eba3b.json",
  input = "~/Downloads/rinpharma-4ac2ad6eba3b.json"
)

The encrypted file is now in inst/secret/

file_name <- "rinpharma-4ac2ad6eba3b.json"
secret_name <- "googlesheets4"
path <- paste0("inst/secret/", file_name)
raw <- readBin(path, "raw", file.size(path))
json <- sodium::data_decrypt(
  bin = raw, key = gargle:::secret_pw_get(secret_name), 
  nonce = gargle:::secret_nonce()
  )
pass <- rawToChar(json)

gs4_auth(path = pass)