library(tidyverse)
library(RSQLite)
library(DBI)
CMS & Open Payments
Overview
Using publicly available databases, generate a dataset of oncology specialists who billed for a drug prescription in 2021 that received pharmaceutical research funding for the same drug in 2017. Includes data linkage across disparate databases.
Technical Assets
R Packages
<- sessionInfo()
docs
as.data.frame(as.matrix(map(pluck(docs, "otherPkgs"), "Version"))) |>
rownames_to_column("package") |>
unnest(V1) |>
rename(version = V1) |>
::datatable() DT
Databases
All databases used in production
Variable Name(s) | Webpage Link | Retrieved Date | File Download Link (last updated: 6/30/2024) | |
---|---|---|---|---|
phys_import |
CMS | June 2024 | Download | |
rsch |
openPayment | June 2024 | Download | |
crosswalk |
CMS ASP Pricing | June 2024 | Download | |
billing_import |
CMS | June 2024 | Download |
Identifying cohort
Gets a baseline cohort for oncologists billing to CMS (part B & fee-for-service) in 2021.
<- data.table::fread("Medicare_Physician_Other_Practitioners_by_Provider_2021.csv")
phys_import
<- phys_import |>
oncs filter(grepl("onco", Rndrng_Prvdr_Type, ignore.case = T))
<- dbConnect(RSQLite::SQLite(), "cms.db")
con
<- dbSendQuery(con, "CREATE TABLE physicians_onco AS SELECT *
QUERY_temp FROM physicians2021 WHERE LOWER(Rndrng_Prvdr_Type) LIKE '%onco%'")
<- dbReadTable(con, "physicians_onco")
oncs_sql
dbClearResult(QUERY_temp)
dbDisconnect(con)
Validates that the R & SQL code deliver the same output
dim(oncs_sql) == #18,090 rows
dim(oncs) # 18,090 rows
identical(oncs, data.table::as.data.table(oncs_sql))
Open payments
#loads in the open payments dataset into R
<- data.table::fread("OP_DTL_RSRCH_PGYR2017_P06282024_06122024.csv")
rsch
#QC of dates to make sure it's in time period
|>
rsch mutate(yr = gsub(".*(?=[0-9]{4}$)", "", Date_of_Payment, perl = T)) |>
select(Date_of_Payment, yr) |>
count(yr)
yr n
<char> <int>
1: 2017 820118
The data set is structured to show a record of payment. The record of payment can be made directly to the physician or to the institution, where the physician is the PI. These are stored across features within a single record of payment.
This section separates out the 2 types of payments, then merges together to create a listing of each physician and the record of payment associated with it.
Listing Physicians
# gets if the individual is covered
<- rsch |>
resrch_npis_oth filter(Covered_Recipient_NPI %in% oncs$Rndrng_NPI) |>
select(Record_ID, Covered_Recipient_NPI) |>
rename(npi = Covered_Recipient_NPI) |>
mutate(type = "covered")
# gets the individual if record of payment is made to institution
<- rsch |>
resrch_npis_pis select(Record_ID, ends_with("NPI")) |>
select(-Covered_Recipient_NPI) |>
pivot_longer(cols = ends_with("NPI"),
names_to = "type",
values_to = "npi") |>
filter(!is.na(npi)) |>
mutate(type = "PI")
# Combines into final table of oncologists with a record of payment
<- rbind(resrch_npis_oth, resrch_npis_pis) |> #786,104 rows
oncs_rsch_op distinct(Record_ID, npi, .keep_all = T) |> #786,104 rows
filter(npi %in% oncs$Rndrng_NPI) |> # only oncs, 150,322 rows
left_join(rsch[, c("Record_ID", "Total_Amount_of_Payment_USDollars",
"Form_of_Payment_or_Transfer_of_Value",
"Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name")],
by = "Record_ID") # adds back in other features
<- dbConnect(RSQLite::SQLite(), "cms.db")
con
<- dbSendQuery(con, "WITH resrch_npis_oth AS (
QUERY_merged_op_rsch_npis SELECT Record_ID,
Covered_Recipient_NPI AS npi,
'covered' AS type FROM op_rsch2017
WHERE Covered_Recipient_NPI IN (SELECT Rndrng_NPI FROM physicians_onco)
),
resrch_npis_pis AS (
SELECT
Record_ID,
Principal_Investigator_1_NPI AS npi FROM op_rsch2017 WHERE Principal_Investigator_1_NPI IS NOT NULL
UNION ALL
SELECT
Record_ID,
Principal_Investigator_2_NPI AS npi FROM op_rsch2017 WHERE Principal_Investigator_2_NPI IS NOT NULL
UNION ALL
SELECT
Record_ID,
Principal_Investigator_3_NPI AS npi FROM op_rsch2017 WHERE Principal_Investigator_3_NPI IS NOT NULL
UNION ALL
SELECT
Record_ID,
Principal_Investigator_4_NPI AS npi FROM op_rsch2017 WHERE Principal_Investigator_4_NPI IS NOT NULL
UNION ALL
SELECT
Record_ID,
Principal_Investigator_5_NPI AS npi FROM op_rsch2017 WHERE Principal_Investigator_5_NPI IS NOT NULL
),
combined_resrch_npis AS (
SELECT * FROM resrch_npis_oth
UNION ALL
SELECT Record_ID, npi, 'PI' AS type FROM resrch_npis_pis
),
oncs_rsch_op AS (
SELECT DISTINCT
c.Record_ID,
c.npi,
c.type
FROM
combined_resrch_npis c
WHERE
c.npi IN (SELECT Rndrng_NPI FROM physicians_onco)
) SELECT
o.*,
r.Total_Amount_of_Payment_USDollars,
r.Form_of_Payment_or_Transfer_of_Value,
r.Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name
FROM
oncs_rsch_op o
LEFT JOIN
op_rsch2017 r ON o.Record_ID = r.Record_ID;
")
<- dbFetch(QUERY_merged_op_rsch_npis )
oncs_rsch_op_SQL dbClearResult(QUERY_merged_op_rsch_npis)
dbDisconnect(con)
Validates that the R & SQL code deliver the same output
identical(
arrange(oncs_rsch_op, Record_ID),
::as.data.table(oncs_rsch_op_SQL) |> #150,322 rows
data.tablearrange(Record_ID)
)
Listing Drugs
Similarly, there can be multiple drugs associated with the record of payment. This provides a crosswalk between drug code (ndc from the FDA) and the record of payment
CMS only provides public billing data for Part B medications. Public Part D is too aggregated to merge effectively with other CMS billing data.
#dataset of drugs and ndcs per record
<- rsch |>
drugs_ops filter(Record_ID %in% oncs_rsch_op$Record_ID) |> #grabs only oncologists w/ a record of payment on file
select(Record_ID, starts_with(c('Associated_Drug_or_Biological_NDC', "Product_Category_or_Therapeutic_Area"))) |>
pivot_longer(cols = starts_with(c('Associated_Drug_or_Biological_NDC', "Product_Category_or_Therapeutic_Area")),
names_to = "feature",
values_to = "value") |>
filter(value != "") |>
mutate(number = gsub(".*_", "", feature),
feature = ifelse(grepl("^Associated_Drug_or_", feature), "ndc", "type_drug")) |>
pivot_wider(id_cols = c(Record_ID, number),
names_from = feature,
values_from = value) |>
filter(!is.na(ndc)) |>
select(-number)
To connect the drug codes (NDC) to the crosswalks/dictionary of the billing data, it has to be in the same format (5DIGITS-4DIGITS-2DIGITS). Unfortunately, the open payments database uses a truncated version of the NDC codes. This 1) reformats the NDC codes into the more compatible version, and 2) merges with the oncologist IDs.
<- drugs_ops |>
splits mutate(splits = strsplit(ndc, "-")) |> # NDC Package Code
unnest_wider(splits, names_sep = "_") |>
mutate(splits_1 = str_pad(splits_1, 5, pad = "0"),
splits_2 = str_pad(splits_2, 4, pad = "0"),
splits_3 = str_pad(splits_3, 2, pad = "0")) |>
mutate(ndc_11_digits = paste(splits_1, splits_2, splits_3, sep = "-")) |>
left_join(oncs_rsch_op, by = "Record_ID") # combines with each oncologist with the record payment (more than 1 oncologist can be listed per payment & there can be more than 1 drug per record of payment)
Crosswalk + Billing Data
This acts as a dictionary/crosswalk, providing all drug codes (NDC) for each billing code (HCPCS).
<- data.table::fread("crosswalk.csv") |>
crosswalk ::clean_names()
janitor
<- distinct(select(crosswalk, x2021_code, ndc2)) |>
dictionat_ds mutate(ncd_group = paste0("ncd_", row_number()),
.by = x2021_code) |>
summarize(ndcs_possible = list(ndc2),
.by = x2021_code)
This imports the billing data, and filters for only oncologists and services pertaining to Part B drug prescriptions
<- data.table::fread("Medicare_Physician_Other_Practitioners_by_Provider_and_Service_2021.csv")
billing_import
<- billing_import |>
oncs_billing filter(HCPCS_Drug_Ind == "Y" & Rndrng_NPI %in% oncs$Rndrng_NPI) |>
select(Rndrng_NPI, HCPCS_Cd, HCPCS_Desc)
<- dbConnect(RSQLite::SQLite(), "cms.db")
con
<- dbSendQuery(con, "SELECT
QUERY_oncs_billing b.Rndrng_NPI,
b.HCPCS_Cd,
b.HCPCS_Desc
FROM
billings2021 b
JOIN
physicians_onco o ON b.Rndrng_NPI = o.Rndrng_NPI
WHERE
b.HCPCS_Drug_Ind = 'Y';")
<- dbFetch(QUERY_oncs_billing )
oncs_billing_SQL dbClearResult(QUERY_oncs_billing )
dbDisconnect(con)
Validates that the R & SQL code deliver the same output
identical(data.table::as.data.table(oncs_billing_SQL), oncs_billing)
Final merge to create a table listing of all oncologists who accepted research payments in 2017 for a drug in which they later prescribed to patients in 2021.
<- oncs_billing |>
final left_join(dictionat_ds, by = c("HCPCS_Cd" = "x2021_code")) |>
unnest(ndcs_possible) |>
left_join(splits, by = c("ndcs_possible" ="ndc_11_digits", "Rndrng_NPI"="npi")) |>
filter(!is.na(Record_ID)) |>
distinct(Rndrng_NPI, Record_ID, HCPCS_Cd, .keep_all = T) # makes sure billings aren't counted twice
Archive
Debugging the SQL code above, for the SQL and R outputs did not match
dim(oncs_rsch_op) ==
dim(oncs_rsch_op_SQL)
names(oncs_rsch_op_SQL) == names(oncs_rsch_op)
$Record_ID == oncs_rsch_op$Record_ID
oncs_rsch_op_SQL
setdiff(oncs_rsch_op$Record_ID, oncs_rsch_op_SQL$Record_ID)
sapply(oncs_rsch_op_SQL, typeof) == sapply(oncs_rsch_op, typeof)
This creates quick summary statistics per each oncologist
<- oncs_rsch_op |>
rsch_op_Summary summarize(n = n(),
total = sum(Total_Amount_of_Payment_USDollars),
median = median(Total_Amount_of_Payment_USDollars),
percentile = list(quantile(Total_Amount_of_Payment_USDollars)),
idk = list(unique(Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name)),
# details = paste0(Form_of_Payment_or_Transfer_of_Value, collapse = "; "),
.by = npi) |>
unnest_wider(col = percentile) |>
::clean_names() |>
janitormutate(percentile = cut_number(total, n = 4, labels = c("25th",
"50th",
"75th",
"100th"))) |>
rename_with( ~ paste0("op_", .x))