Data pipelines for R/Pharma

data-pipelines

Structure

Code: rinpharma/data-pipelines
Rendered run: rinpharma.github.io/data-pipelines/
Output: Direct links below. Stored in gh-pages branch

TO-DO

Put thought into columns

Rebuilding

The data (and this page) are rebuilt automatically on the cron schedule specified in the GitHub Actions workflow.

Run the pipeline with targets::tar_make()

Check the pipeline with targets::tar_manifest(fields = all_of("command"))

Visualise the pipeline with targets::tar_visnetwork()

graph LR
  style Legend fill:#FFFFFF00,stroke:#000000;
  style Graph fill:#FFFFFF00,stroke:#000000;
  subgraph Legend
    direction LR
    xf1522833a4d242c5([""Up to date""]):::uptodate --- xd03d7c7dd2ddda2b([""Stem""]):::none
    xd03d7c7dd2ddda2b([""Stem""]):::none --- xeb2d7cac8a1ce544>""Function""]:::none
  end
  subgraph Graph
    direction LR
    xbd4a273a79a766d1(["data_processed_proceedings"]):::uptodate --> x846d45454e9823b7(["write_data_to_files"]):::uptodate
    x986bd675a516b5b3(["data_processed_team"]):::uptodate --> x846d45454e9823b7(["write_data_to_files"]):::uptodate
    xca33cab3ec65c968(["data_processed_workshops"]):::uptodate --> x846d45454e9823b7(["write_data_to_files"]):::uptodate
    x4a9bd41924e0fdb4>"write_data"]:::uptodate --> x846d45454e9823b7(["write_data_to_files"]):::uptodate
    x97fb05a8a7947628>"build_proceedings"]:::uptodate --> xbd4a273a79a766d1(["data_processed_proceedings"]):::uptodate
    xeb364ad4c7efc3d3(["get_gsheet_data"]):::uptodate --> xbd4a273a79a766d1(["data_processed_proceedings"]):::uptodate
    xdd04dc2490030259>"get_data"]:::uptodate --> xeb364ad4c7efc3d3(["get_gsheet_data"]):::uptodate
    xe1268667a4ed1fd8>"build_team"]:::uptodate --> x986bd675a516b5b3(["data_processed_team"]):::uptodate
    xeb364ad4c7efc3d3(["get_gsheet_data"]):::uptodate --> x986bd675a516b5b3(["data_processed_team"]):::uptodate
    xa0ae033e33afa834>"build_workshops"]:::uptodate --> xca33cab3ec65c968(["data_processed_workshops"]):::uptodate
    xeb364ad4c7efc3d3(["get_gsheet_data"]):::uptodate --> xca33cab3ec65c968(["data_processed_workshops"]):::uptodate
  end
  classDef uptodate stroke:#000000,color:#ffffff,fill:#354823;
  classDef none stroke:#000000,color:#000000,fill:#94a4ac;
  linkStyle 0 stroke-width:0px;
  linkStyle 1 stroke-width:0px;

Output

The current files are available at the URLs below and can be read directly from R:

arrow::read_parquet(
  "https://rinpharma.github.io/data-pipelines/output/processed_proceedings.parquet"
  ) |>
  dplyr::glimpse()

Rows: 377
Columns: 13
$ ID                <chr> "rinpharma_1", "rinpharma_2", "rinpharma_3", "rinpha…
$ Event             <chr> "2018 Conference", "2018 Conference", "2018 Conferen…
$ Abstract          <chr> NA, NA, NA, NA, "Lilliam will be presenting a perspe…
$ Type              <chr> "Workshop", "Workshop", "Workshop", "Workshop", "Key…
$ Year              <chr> "2018", "2018", "2018", "2018", "2018", "2018", "201…
$ Date              <dttm> 2018-08-15, 2018-08-15, 2018-08-15, 2018-08-15, 201…
$ Speaker           <chr> "Marianna Foos", "Adrian Waddell", "Daniel Lee", "De…
$ Affiliation       <chr> "Biogen", "Roche / Genentech", "Generable", "Metrum …
$ Title             <chr> "Keeping things Peachy when Shiny gets Hairy", "Anal…
$ Slides            <chr> NA, NA, NA, NA, NA, NA, NA, "https://github.com/rinp…
$ Video             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Abstract_Sanitzed <chr> NA, NA, NA, NA, "Lilliam will be presenting a perspe…
$ Missing_Content   <chr> "Unfortunately we do not currently have an abstract,…

arrow::read_parquet(
  "https://rinpharma.github.io/data-pipelines/output/processed_team.parquet"
  ) |>
  dplyr::glimpse()

Rows: 18
Columns: 10
$ name            <chr> "James Black", "Paulo Bargo", "Phil Bowsher", "Ellis H…
$ role            <chr> "Roche / Genentech", "Novartis", "posit", "GlaxoSmithK…
$ site_superuser  <lgl> TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, F…
$ custom_link     <chr> "https://epijim.uk", NA, NA, "https://twitter.com/elli…
$ github          <chr> "epijim", NA, "philbowsher", "thebioengineer", "harvey…
$ linkedin        <chr> "epijim", "paulo-bargo-phd-10590830", "philip-bowsher-…
$ organising_comm <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, …
$ exec_comm       <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
$ program_comm    <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
$ Ex_Committee    <lgl> FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE,…

arrow::read_parquet(
  "https://rinpharma.github.io/data-pipelines/output/processed_workshops.parquet"
  ) |>
  dplyr::glimpse()

Rows: 51
Columns: 8
$ event         <chr> "2021 Conference", "2021 Conference", "2021 Conference",…
$ title         <chr> "Clinical Trials Data Analysis at Roche", "Intro Shiny",…
$ date          <date> 2021-10-25, 2021-10-25, 2021-10-26, 2021-10-26, 2021-10…
$ time          <chr> "09:00-12:00 ET", "14:00-16:00 ET", "09:00-12:00 ET", "1…
$ presenter     <chr> "Adrian Waddell (Roche)", "Ted Laderas (DNANexus)", "Dan…
$ max_attendees <dbl> 100, 40, 120, 200, 500, 75, 500, 160, 50, 250, 200, 120,…
$ ticket_url    <chr> "https://www.eventbrite.com/e/187203810637", "https://ww…
$ status        <chr> "soldout", "soldout", "closed", "closed", "closed", "sol…

Setup

Code to create gsheets token.

# Generate credentials for gsheet access

pw_name <- gargle:::secret_pw_name("googlesheets4")
pw <- gargle:::secret_pw_gen()

Add the generated password to .Renviron as a pw_name=pw entry (open the file with usethis::edit_r_environ()).

Encrypt the service account token (created via the Google developer console):
gargle:::secret_write(
  package = "googlesheets4",
  name = "rinpharma-4ac2ad6eba3b.json",
  input = "~/Downloads/rinpharma-4ac2ad6eba3b.json"
)

The encrypted file is now in inst/secret/.

file_name <- "rinpharma-4ac2ad6eba3b.json"
secret_name <- "googlesheets4"
path <- paste0("inst/secret/", file_name)
raw <- readBin(path, "raw", file.size(path))
json <- sodium::data_decrypt(
  bin = raw, key = gargle:::secret_pw_get(secret_name), 
  nonce = gargle:::secret_nonce()
  )
pass <- rawToChar(json)

gs4_auth(path = pass)