library(tidyverse)
library(RSQLite)
library(DBI)CMS & Open Payments
Overview
Using publicly available databases, generate a dataset of oncology specialists who billed for a drug prescription in 2021 that received pharmaceutical research funding for the same drug in 2017. Includes data linkage across disparate databases.
 
 
Technical Assets
R Packages
docs <- sessionInfo()
as.data.frame(as.matrix(map(pluck(docs, "otherPkgs"), "Version"))) |> 
  rownames_to_column("package") |> 
  unnest(V1) |> 
  rename(version = V1) |> 
  DT::datatable()Databases
All databases used in production
| Variable Name(s) | Webpage Link | Retrieved Date | File Download Link (last updated: 6/30/2024) | |
|---|---|---|---|---|
| phys_import | CMS | June 2024 | Download | |
| rsch | openPayment | June 2024 | Download | |
| crosswalk | CMS ASP Pricing | June 2024 | Download | |
| billing_import | CMS | June 2024 | Download | 
 
 
Identifying cohort
Gets a baseline cohort for oncologists billing to CMS (part B & fee-for-service) in 2021.
phys_import <- data.table::fread("Medicare_Physician_Other_Practitioners_by_Provider_2021.csv")
oncs <- phys_import |> 
  filter(grepl("onco", Rndrng_Prvdr_Type, ignore.case = T))con <- dbConnect(RSQLite::SQLite(), "cms.db")
QUERY_temp <- dbSendQuery(con, "CREATE TABLE physicians_onco AS SELECT * 
                          FROM physicians2021 WHERE LOWER(Rndrng_Prvdr_Type) LIKE '%onco%'")
oncs_sql <- dbReadTable(con, "physicians_onco")
dbClearResult(QUERY_temp)
dbDisconnect(con)Validates that the R & SQL code deliver the same output
dim(oncs_sql) == #18,090 rows
  dim(oncs) # 18,090 rows
identical(oncs, data.table::as.data.table(oncs_sql))
 
 
Open payments
#loads in the open payments dataset into R
rsch <- data.table::fread("OP_DTL_RSRCH_PGYR2017_P06282024_06122024.csv")
#QC of dates to make sure it's in time period
rsch |> 
  mutate(yr = gsub(".*(?=[0-9]{4}$)", "", Date_of_Payment, perl = T)) |> 
  select(Date_of_Payment, yr) |> 
  count(yr)       yr      n
   <char>  <int>
1:   2017 820118The data set is structured to show a record of payment. The record of payment can be made directly to the physician or to the institution, where the physician is the PI. These are stored across features within a single record of payment.
This section separates out the 2 types of payments, then merges together to create a listing of each physician and the record of payment associated with it.
Listing Physicians
# gets if the individual is covered
resrch_npis_oth <- rsch |> 
  filter(Covered_Recipient_NPI %in% oncs$Rndrng_NPI) |> 
  select(Record_ID, Covered_Recipient_NPI) |> 
  rename(npi = Covered_Recipient_NPI) |> 
  mutate(type = "covered")
# gets the individual if record of payment is made to institution 
resrch_npis_pis <- rsch |> 
  select(Record_ID, ends_with("NPI")) |> 
  select(-Covered_Recipient_NPI) |> 
  pivot_longer(cols = ends_with("NPI"), 
               names_to = "type", 
               values_to = "npi") |> 
  filter(!is.na(npi)) |> 
  mutate(type = "PI")
# Combines into final table of oncologists with a record of payment 
oncs_rsch_op <- rbind(resrch_npis_oth, resrch_npis_pis) |> #786,104 rows
  distinct(Record_ID, npi, .keep_all = T) |> #786,104 rows
  filter(npi %in% oncs$Rndrng_NPI) |> # only oncs, 150,322 rows
  left_join(rsch[, c("Record_ID", "Total_Amount_of_Payment_USDollars",
                     "Form_of_Payment_or_Transfer_of_Value", 
                     "Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name")], 
            by = "Record_ID") # adds back in other featurescon <- dbConnect(RSQLite::SQLite(), "cms.db")
QUERY_merged_op_rsch_npis <- dbSendQuery(con, "WITH resrch_npis_oth AS (
    SELECT Record_ID, 
      Covered_Recipient_NPI AS npi, 
      'covered' AS type FROM op_rsch2017 
      WHERE Covered_Recipient_NPI IN (SELECT Rndrng_NPI FROM physicians_onco)
),
resrch_npis_pis AS (
    SELECT 
        Record_ID, 
        Principal_Investigator_1_NPI AS npi FROM op_rsch2017  WHERE Principal_Investigator_1_NPI IS NOT NULL
    UNION ALL
    SELECT 
        Record_ID, 
        Principal_Investigator_2_NPI AS npi FROM op_rsch2017  WHERE Principal_Investigator_2_NPI IS NOT NULL
    UNION ALL
    SELECT 
        Record_ID, 
        Principal_Investigator_3_NPI AS npi FROM op_rsch2017  WHERE Principal_Investigator_3_NPI IS NOT NULL
    UNION ALL
    SELECT 
        Record_ID, 
        Principal_Investigator_4_NPI AS npi FROM op_rsch2017  WHERE Principal_Investigator_4_NPI IS NOT NULL
    UNION ALL
    SELECT 
        Record_ID, 
        Principal_Investigator_5_NPI AS npi FROM op_rsch2017  WHERE Principal_Investigator_5_NPI IS NOT NULL
),
combined_resrch_npis AS (
    SELECT * FROM resrch_npis_oth
    UNION ALL
    SELECT Record_ID, npi, 'PI' AS type FROM resrch_npis_pis
),
oncs_rsch_op AS (
    SELECT DISTINCT 
        c.Record_ID, 
        c.npi,
        c.type
    FROM 
        combined_resrch_npis c
    WHERE 
        c.npi IN (SELECT Rndrng_NPI FROM physicians_onco)
) SELECT 
    o.*, 
    r.Total_Amount_of_Payment_USDollars,
    r.Form_of_Payment_or_Transfer_of_Value,
    r.Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name
FROM 
    oncs_rsch_op o
LEFT JOIN 
    op_rsch2017 r ON o.Record_ID = r.Record_ID;
")
oncs_rsch_op_SQL <- dbFetch(QUERY_merged_op_rsch_npis )
dbClearResult(QUERY_merged_op_rsch_npis)
dbDisconnect(con)Validates that the R & SQL code deliver the same output
identical(
arrange(oncs_rsch_op, Record_ID),
data.table::as.data.table(oncs_rsch_op_SQL) |> #150,322 rows
  arrange(Record_ID)
)Listing Drugs
Similarly, there can be multiple drugs associated with the record of payment. This provides a crosswalk between drug code (ndc from the FDA) and the record of payment
CMS only provides public billing data for Part B medications. Public Part D is too aggregated to merge effectively with other CMS billing data.
#dataset of drugs and ndcs  per record
drugs_ops <- rsch |> 
  filter(Record_ID %in% oncs_rsch_op$Record_ID) |> #grabs only oncologists w/ a record of payment on file
  select(Record_ID, starts_with(c('Associated_Drug_or_Biological_NDC', "Product_Category_or_Therapeutic_Area"))) |> 
  pivot_longer(cols = starts_with(c('Associated_Drug_or_Biological_NDC', "Product_Category_or_Therapeutic_Area")), 
               names_to = "feature", 
               values_to = "value") |> 
  filter(value != "") |> 
  mutate(number = gsub(".*_", "", feature),
    feature = ifelse(grepl("^Associated_Drug_or_", feature), "ndc", "type_drug")) |>
  pivot_wider(id_cols = c(Record_ID, number), 
              names_from = feature, 
              values_from = value) |> 
  filter(!is.na(ndc))  |> 
  select(-number)To connect the drug codes (NDC) to the crosswalks/dictionary of the billing data, it has to be in the same format (5DIGITS-4DIGITS-2DIGITS). Unfortunately, the open payments database uses a truncated version of the NDC codes. This 1) reformats the NDC codes into the more compatible version, and 2) merges with the oncologist IDs.
splits <-  drugs_ops |> 
   mutate(splits = strsplit(ndc, "-")) |> # NDC Package Code
   unnest_wider(splits, names_sep = "_") |>
   mutate(splits_1 = str_pad(splits_1, 5, pad = "0"),
          splits_2 = str_pad(splits_2, 4, pad = "0"),
          splits_3 = str_pad(splits_3, 2, pad = "0")) |>
    mutate(ndc_11_digits = paste(splits_1, splits_2, splits_3, sep = "-")) |>
  left_join(oncs_rsch_op, by = "Record_ID") # combines with each oncologist with the record payment (more than 1 oncologist can be listed per payment & there can be more than 1 drug per record of payment)
 
 
Crosswalk + Billing Data
This acts as a dictionary/crosswalk, providing all drug codes (NDC) for each billing code (HCPCS).
crosswalk <- data.table::fread("crosswalk.csv") |> 
  janitor::clean_names()
dictionat_ds <- distinct(select(crosswalk, x2021_code, ndc2))  |> 
  mutate(ncd_group = paste0("ncd_", row_number()), 
         .by = x2021_code) |> 
  summarize(ndcs_possible = list(ndc2), 
            .by = x2021_code)This imports the billing data, and filters for only oncologists and services pertaining to Part B drug prescriptions
billing_import <- data.table::fread("Medicare_Physician_Other_Practitioners_by_Provider_and_Service_2021.csv")
oncs_billing <- billing_import |> 
  filter(HCPCS_Drug_Ind == "Y" & Rndrng_NPI %in% oncs$Rndrng_NPI) |> 
  select(Rndrng_NPI, HCPCS_Cd, HCPCS_Desc) con <- dbConnect(RSQLite::SQLite(), "cms.db")
QUERY_oncs_billing <- dbSendQuery(con, "SELECT 
    b.Rndrng_NPI, 
    b.HCPCS_Cd, 
    b.HCPCS_Desc
FROM 
    billings2021 b
JOIN 
    physicians_onco o ON b.Rndrng_NPI = o.Rndrng_NPI
WHERE 
    b.HCPCS_Drug_Ind = 'Y';")
      
oncs_billing_SQL <- dbFetch(QUERY_oncs_billing )
dbClearResult(QUERY_oncs_billing )
      
      
dbDisconnect(con)Validates that the R & SQL code deliver the same output
identical(data.table::as.data.table(oncs_billing_SQL), oncs_billing)Final merge to create a table listing of all oncologists who accepted research payments in 2017 for a drug in which they later prescribed to patients in 2021.
final <- oncs_billing |> 
  left_join(dictionat_ds, by = c("HCPCS_Cd" = "x2021_code")) |> 
  unnest(ndcs_possible) |> 
  left_join(splits, by = c("ndcs_possible" ="ndc_11_digits", "Rndrng_NPI"="npi")) |> 
  filter(!is.na(Record_ID)) |> 
  distinct(Rndrng_NPI, Record_ID, HCPCS_Cd, .keep_all = T) # makes sure billings aren't counted twice
 
 
Archive
Debugging the SQL code above, for the SQL and R outputs did not match
dim(oncs_rsch_op) ==
dim(oncs_rsch_op_SQL)
names(oncs_rsch_op_SQL) == names(oncs_rsch_op)
oncs_rsch_op_SQL$Record_ID == oncs_rsch_op$Record_ID
setdiff(oncs_rsch_op$Record_ID, oncs_rsch_op_SQL$Record_ID)
sapply(oncs_rsch_op_SQL, typeof) == sapply(oncs_rsch_op, typeof)This creates quick summary statistics per each oncologist
rsch_op_Summary <- oncs_rsch_op |> 
  summarize(n = n(), 
            total = sum(Total_Amount_of_Payment_USDollars),
            median = median(Total_Amount_of_Payment_USDollars),
            percentile = list(quantile(Total_Amount_of_Payment_USDollars)),
            idk = list(unique(Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name)),
            # details = paste0(Form_of_Payment_or_Transfer_of_Value, collapse = "; "),
            .by = npi) |> 
  unnest_wider(col = percentile) |> 
  janitor::clean_names() |> 
  mutate(percentile = cut_number(total, n = 4, labels = c("25th", 
                                                          "50th",
                                                          "75th", 
                                                          "100th"))) |> 
  rename_with( ~ paste0("op_", .x))