How to correctly approach NHANES data to make estimates that are representative of the population
Define Metabolic Syndrome based on ATP III
Look at the MetS prevalence over time
I need to use these packages:
tidyverse
: data management (tidyr, dplyr) and plots (ggplot2)foreign
: reading xport filessurvey
: for using survey weights # survey::svydesign, svymean, svyglmlibrary(tidyverse)
library(foreign)
library(survey)
options(survey.lonely.psu='adjust')
library(kableExtra)
#cat("R package versions:\n")
for (p in c("base", "survey","tidyverse")) {
cat(p, ": ", as.character(packageVersion(p)), "\n")
}
base : 4.0.4
survey : 4.0
tidyverse : 1.3.1
Code for data import was adapted from a NHANES tutorials example.
<- function(fileprefix){
downloadNHANES print (fileprefix)
<- data.frame(NULL)
outdf for (j in 1:length(letters)){
<- paste('https://wwwn.cdc.gov/nchs/nhanes/',yrs[j],'/',fileprefix,letters[j],'.XPT', sep='')
urlstring download.file(urlstring, tf <- tempfile(), mode="wb")
<- foreign::read.xport(tf)
tmpframe <- bind_rows(outdf, tmpframe)
outdf
}return(outdf)
}
# Specify the survey cycles required, with corresponding file suffixes
<- c('2013-2014')
yrs <- c('_h')
letters
# Download data for each component
# (1) Demographic (DEMO)
<- downloadNHANES('DEMO') # 10175
DEMO # (2) Blood Pressure & Cholesterol
<- downloadNHANES('BPQ') # 6464
BPQ # (3) Diabetes
<- downloadNHANES('DIQ') # 9770
DIQ # (4) Blood Pressure
<- downloadNHANES('BPX') # 9813
BPX # (5) Body Measures
<- downloadNHANES('BMX') # 9813
BMX # (6) Cholesterol - High - Density Lipoprotein
<- downloadNHANES('HDL') # 8291
HDL # (7) Cholesterol - Low-Density Lipoproteins (LDL) & Triglycerides (TRIGLY_J)
<- downloadNHANES('TRIGLY') # 3329
TRIGLY # (8) Plasma Fasting Glucose (GLU_J)
<- downloadNHANES('GLU') # 3329 GLU
# Merging using "piping"
<- left_join(DEMO, BPQ, by="SEQN") %>%
nhanesdata.merged left_join(DIQ, by="SEQN") %>%
left_join(BPX, by='SEQN') %>%
left_join(BMX, by='SEQN') %>%
left_join(HDL, by='SEQN') %>%
left_join(TRIGLY, by='SEQN') %>%
left_join(GLU, by='SEQN') # 10175 obs
one
<- nhanesdata.merged %>%
nhanes.vbls select(SEQN, # Respondent sequence number
# Age in years at screening
RIDAGEYR, # Gender
RIAGENDR,
# Race/Hispanic origin w/ NH Asian
RIDRETH3, # Marital status
DMDMARTL, # Education level - Adults 20+
DMDEDUC2,
# Survey design variables
# Masked variance pseudo-PSU
SDMVPSU, # Masked variance pseudo-stratum
SDMVSTRA, # Full sample 2 year MEC exam weight
WTMEC2YR, # Full sample 2 year interview weight
WTINT2YR,
# Systolic: Blood pres (1st rdg) mm Hg
BPXSY1, # Diastolic: Blood pres (1st rdg) mm Hg
BPXDI1, # Systolic: Blood pres (2nd rdg) mm Hg
BPXSY2, # Diastolic: Blood pres (2nd rdg) mm Hg
BPXDI2, # Systolic: Blood pres (3rd rdg) mm Hg
BPXSY3, # Diastolic: Blood pres (3rd rdg) mm Hg
BPXDI3, # Systolic: Blood pres (4th rdg) mm Hg
BPXSY4, # Diastolic: Blood pres (4th rdg) mm Hg
BPXDI4, # Fasting Glucose (mg/dL)
LBXGLU, # Direct HDL-Cholesterol (mg/dL)
LBDHDD, # Triglyceride (mg/dL)
LBXTR, # Waist Circumference (cm)
BMXWAIST,
# Told to take prescription for cholesterol
BPQ090D, # Taking prescription for hypertension
BPQ040A, # Take diabetic pills to lower blood sugar
DIQ070 %>%
) mutate(# create indicator for overall summary
one = 1,
id = SEQN,
psu = SDMVPSU, strata = SDMVSTRA,
persWeight = WTINT2YR, persWeightMEC= WTMEC2YR,
age = RIDAGEYR,
# Create age categories for adults aged 18 and over: ages 18-39, 40-59, 60 and over
ageCat = cut(RIDAGEYR,
breaks = c(19, 29, 39, 49, 59, 69, Inf),
labels = c('20-29','30-39', '40-49', '50-59', '60-69', '70+')),
female = factor(if_else(RIAGENDR == 1, 0, 1)), # 1:male ==> 0
ethnicity = case_when(
%in% c(1,2) ~ 0, # Hispanic
RIDRETH3 == 3 ~ 1, # Non-Hispanic White
RIDRETH3 == 4 ~ 2, # Non-Hispanic Black
RIDRETH3 == 6 ~ 3, # Non-Hispanic Asian
RIDRETH3 == 7 ~ 4, # Multi-Racial
RIDRETH3 is.na(RIDRETH3) ~ NA_real_
),chol.prsn = case_when(
== 2 ~ 0, # No
BPQ090D == 1 ~ 1, # Yes
BPQ090D %in% c(7,9) ~ NA_real_
BPQ090D
),bp.prsn = case_when(
== 2 ~ 0, # No
BPQ040A == 1 ~ 1, # Yes
BPQ040A %in% c(7,9) ~ NA_real_
BPQ040A
),glu.prsn = case_when(
== 2 ~ 0, # No
DIQ070 == 1 ~ 1, # Yes
DIQ070 %in% c(7,9) ~ NA_real_
DIQ070
),dia = rowMeans(.[, c("BPXDI1","BPXDI2","BPXDI3","BPXDI4")], na.rm = TRUE),
sys = rowMeans(.[, c("BPXSY1","BPXSY2","BPXSY3","BPXSY4")], na.rm = TRUE),
glu = LBXGLU,
hdl = LBDHDD,
tri = LBXTR,
waist = BMXWAIST
%>%
) select(one, id, psu, strata, persWeight, persWeightMEC, age, ageCat, female, ethnicity, sys, dia, glu, hdl, tri, waist, bp.prsn, chol.prsn, glu.prsn) # 10175
According to Alberti et al. (2009), the presence of any 3 of 5 risk factors constitutes a diagnosis of metabolic syndrome:
Besides, I intentionally created the indicators for missing data: - return NA if all indicators are NA for mets.sum
: when I summed up all 5 MetS indicators, if all are NA, so sum should be NA (not 0)
- ind.non_missing
: return TRUE if none of indicators are none-NA, otherwise FALSE
Finally, mets.ind
(MetS indicator) would be binary of 1 and 0 if sum of 5 indicators above greater or equal to 3.
<- nhanes.vbls %>%
nhanes.MetS mutate(waist.ind = case_when(waist >= 88 & female == 1 ~ 1, #condition 1
>= 102 & female == 0 ~ 1, #condition 2
waist is.na(waist) | is.na(female) ~ NA_real_, #condition 3
TRUE ~ 0), #all other cases
tri.ind = case_when(tri >= 150 | chol.prsn == 1 ~ 1,
is.na(tri) ~ NA_real_,
TRUE ~0),
hdl.ind = case_when((hdl < 40 & female == 0) | chol.prsn == 1 ~ 1,
< 50 & female == 1) | chol.prsn == 1 ~ 1,
(hdl is.na(hdl) | is.na(female) ~ NA_real_,
TRUE ~ 0),
bp.ind = case_when(sys >= 130 | dia >= 85 | bp.prsn == 1 ~ 1,
is.na(sys) & is.na(dia) ~ NA_real_,
TRUE ~ 0
),glu.ind = case_when(glu >= 100 | glu.prsn == 1 ~ 1,
is.na(glu) ~ NA_real_,
TRUE ~ 0)
# 10175 obs
)
$mets.sum = rowSums(nhanes.MetS[20:24], na.rm = T) # from waist.ind to glu.ind # 20:24
nhanes.MetS$mets.sum[rowSums(!is.na(nhanes.MetS[20:24])) == 0] <- NA # return NA if all indicators are NA
nhanes.MetS$ind.non_missing <- !rowSums(!is.na(nhanes.MetS[20:24])==0) # create an indicator in which none of criteria is none NA
nhanes.MetS$mets.ind <- ifelse(nhanes.MetS$mets.sum >= 3, 1, 0) # 10175 obs nhanes.MetS
Here we can get the sense of data for all indicators created:
age
of 20+ Reasons for the consistency in comparison:
Ford, Giles, and Dietz (2002) used NHANES III, 1988 to 1994 based on 2001 Adult Treatment Panel III (ATP III) criteria (same as Moore et al.) with age groups.
Smiley, King, and Bidulescu (2019) did not specify the age range but they used the same criteria for adults (e.g. “Metabolic syndrome was defined according to NCEP ATPIII (National Cholesterol Education Program Adult Treatment Panel III) criteria.”).
inAnalysis1
: age of 20+ and none of missing of MetS indicatorinAnalysis2
: age of 20+ and none of indicators are none-NA<- nhanes.MetS %>%
nhanes.MetS mutate(
# Define sub-population of interest
inAnalysis1 = (age >=20 & !is.na(mets.ind)),
inAnalysis2 = (age >=20 & ind.non_missing)
# 10175 obs )
saveRDS(nhanes.MetS, file = "data/nhanesMetS13_14snd.rds")
Our sample size would be if I subset data on:
age
groups%>%
nhanes.MetS group_by(mets.ind) %>%
summarise(n=n()) %>%
mutate(prop=round(n/sum(n)*100,1)) %>%
kbl(col.names = c('Metabolic syndrome', 'Freq', 'Percent')) %>%
kable_material(c("striped", "hover"))
Metabolic syndrome | Freq | Percent |
---|---|---|
0 | 6940 | 68.2 |
1 | 2184 | 21.5 |
NA | 1051 | 10.3 |
age
groups & none missing of MetS indicator%>%
nhanes.MetS filter(!is.na(mets.ind)) %>%
group_by(mets.ind) %>%
summarise(n=n()) %>%
mutate(prop=round(n/sum(n)*100,1)) %>%
kbl(col.names = c('Metabolic syndrome', 'Freq', 'Percent')) %>%
kable_material(c("striped", "hover"))
Metabolic syndrome | Freq | Percent |
---|---|---|
0 | 6940 | 76.1 |
1 | 2184 | 23.9 |
age
20+ & none missing of MetS indicator%>%
nhanes.MetS filter(age>=20 & !is.na(mets.ind)) %>%
group_by(mets.ind) %>%
summarise(n=n()) %>%
mutate(prop=round(n/sum(n)*100,1)) %>%
kbl(col.names = c('Metabolic syndrome', 'Freq', 'Percent')) %>%
kable_material(c("striped", "hover"))
Metabolic syndrome | Freq | Percent |
---|---|---|
0 | 3511 | 62.2 |
1 | 2138 | 37.8 |
age
20+ and ind.non_missing
is TRUE:Note:
ind.non_missing
was created an indicator in which none of criteria is none NA
%>%
nhanes.MetS filter(age>=20 & ind.non_missing) %>%
group_by(mets.ind) %>%
summarise(n=n()) %>%
mutate(prop=round(n/sum(n)*100,1)) %>%
kbl(col.names = c('Metabolic syndrome', 'Freq', 'Percent')) %>%
kable_material(c("striped", "hover"))
Metabolic syndrome | Freq | Percent |
---|---|---|
0 | 1273 | 49.2 |
1 | 1314 | 50.8 |
Note:
psu = SDMVPSU,
strata = SDMVSTRA,
persWeight = WTINT2YR,
persWeightMEC= WTMEC2YR
# Define survey design for overall dataset
<- svydesign(data=nhanes.MetS, id=~psu, strata=~strata, weights=~persWeightMEC, nest=TRUE)
NHANES_all # Create a survey design object for the subset of interest
# Subsetting the original survey design object ensures we keep the design information about the number of clusters and strata
<- subset(NHANES_all, inAnalysis1==1) NHANES
#' Proportion and standard error, for adults aged 20 and over
svyby(~mets.ind, ~one, NHANES, svymean) %>% mutate(mets.ind = round(mets.ind*100, digits=1), se=round(se*100, digits=1))
one mets.ind se
1 1 36 0.8
#' Proportion and standard error, for adults age groups
svyby(~mets.ind, ~ageCat, NHANES, svymean) %>% mutate(mets.ind = round(mets.ind*100, digits=1), se=round(se*100, digits=1))
ageCat mets.ind se
20-29 20-29 7.0 1.0
30-39 30-39 21.6 1.0
40-49 40-49 30.0 1.3
50-59 50-59 45.7 2.1
60-69 60-69 59.8 2.8
70+ 70+ 67.2 1.4
#' Proportion and standard error, for adults aged 20+ by race and Hispanic origin
svyby(~mets.ind, ~ethnicity, NHANES, svymean) %>% mutate(mets.ind = round(mets.ind*100, digits=1), se=round(se*100, digits=1))
ethnicity mets.ind se
0 0 30.1 1.1
1 1 39.0 1.1
2 2 34.1 1.6
3 3 23.4 2.8
4 4 28.3 3.9
I processed the same procedures as the cycle 2013/2014. Then saved as RDS file. Here I just showed the prevalence after reading the RDS data into R using weights.
<- readRDS(file = "data/nhanesMetS15_16.rds")
nhanes.MetS <- svydesign(data=nhanes.MetS, id=~psu, strata=~strata, weights=~persWeightMEC, nest=TRUE)
NHANES_all <- subset(NHANES_all, inAnalysis1==1) NHANES
#' Proportion and standard error, for adults aged 20 and over
svyby(~mets.ind, ~one, NHANES, svymean) %>% mutate(mets.ind = round(mets.ind*100, digits=1), se=round(se*100, digits=1))
one mets.ind se
1 1 37.7 1.8
#' Proportion and standard error, for adults age groups
svyby(~mets.ind, ~ageCat, NHANES, svymean) %>% mutate(mets.ind = round(mets.ind*100, digits=1), se=round(se*100, digits=1))
ageCat mets.ind se
20-29 20-29 9.2 1.5
30-39 30-39 19.3 1.4
40-49 40-49 33.2 3.2
50-59 50-59 48.1 2.5
60-69 60-69 61.7 2.4
70+ 70+ 66.7 2.5
#' Proportion and standard error, for adults aged 20+ by race and Hispanic origin
svyby(~mets.ind, ~ethnicity, NHANES, svymean) %>% mutate(mets.ind = round(mets.ind*100, digits=1), se=round(se*100, digits=1))
ethnicity mets.ind se
0 0 34.0 1.8
1 1 40.1 2.4
2 2 34.0 1.2
3 3 23.9 2.8
4 4 43.9 4.1
<- readRDS(file = "data/nhanesMetS17_18.rds")
nhanes.MetS <- svydesign(data=nhanes.MetS, id=~psu, strata=~strata, weights=~persWeightMEC, nest=TRUE)
NHANES_all <- subset(NHANES_all, inAnalysis1==1) NHANES
#' Proportion and standard error, for adults aged 20 and over
svyby(~mets.ind, ~one, NHANES, svymean) %>% mutate(mets.ind = round(mets.ind*100, digits=1), se=round(se*100, digits=1))
one mets.ind se
1 1 38.2 1.3
#' Proportion and standard error, for adults age groups
svyby(~mets.ind, ~ageCat, NHANES, svymean) %>% mutate(mets.ind = round(mets.ind*100, digits=1), se=round(se*100, digits=1))
ageCat mets.ind se
20-29 20-29 9.0 1.5
30-39 30-39 18.4 1.6
40-49 40-49 31.1 1.8
50-59 50-59 49.2 3.4
60-69 60-69 60.5 2.8
70+ 70+ 71.3 1.4
#' Proportion and standard error, for adults aged 20+ by race and Hispanic origin
svyby(~mets.ind, ~ethnicity, NHANES, svymean) %>% mutate(mets.ind = round(mets.ind*100, digits=1), se=round(se*100, digits=1))
ethnicity mets.ind se
0 0 34.5 1.7
1 1 40.3 1.7
2 2 33.5 1.7
3 3 32.1 2.3
4 4 42.5 4.4
<- c("2013-2014","2015-2016","2017-2018")
Cycle <- c(36, 37.7, 38.2)
Prevalence <- as.tibble(data.frame(Cycle, Prevalence))
year.prl
<- c('20-29','30-39','40-49','50-59','60-69','70+')
Age.Categories .13_14 <- c(7,21.6,30,45.7,59.8,67.2)
Prevalence.Cycle.15_16 <- c(9.2,19.3,33.2,48.1,61.7,66.7)
Prevalence.Cycle.17_18 <- c(9,18.4,31.1,49.2,60.5,71.3)
Prevalence.Cycle<- as.tibble(data.frame(Age.Categories,Prevalence.Cycle.13_14,Prevalence.Cycle.15_16,Prevalence.Cycle.17_18))
age.g.p <- gather(age.g.p, Prevalence, Percent, Prevalence.Cycle.13_14:Prevalence.Cycle.17_18)
age.g.p.l
<- c('Hispanic','White','Black','Asian','Multi-Racial')
Race.Ethnic.Categories .13_14 <- c(34.0,40.1,34.0,23.9,43.9)
Prevalence.Cycle.15_16 <- c(34.0,40.1,34.0,23.9,43.9)
Prevalence.Cycle.17_18 <- c(34.5,40.3,33.5,32.1,42.5)
Prevalence.Cycle<- as.tibble(data.frame(Race.Ethnic.Categories,Prevalence.Cycle.13_14,Prevalence.Cycle.15_16,Prevalence.Cycle.17_18))
r.e.g.p <- gather(r.e.g.p, Prevalence, Percent, Prevalence.Cycle.13_14:Prevalence.Cycle.17_18) r.e.g.p.l
%>% ggplot(aes(x = Cycle, y = Prevalence)) +
year.prl geom_point() +
geom_line(aes(group=1)) +
geom_text(label=paste0(Prevalence,"%"), hjust=1.5, vjust=0) +
ylim(30, 45) +
theme_bw()
%>% ggplot(aes(fill=Prevalence, y=Percent, x=Age.Categories)) +
age.g.p.l geom_bar(position="dodge", stat="identity") +
theme_bw()
%>% ggplot(aes(fill=Prevalence, y=Percent, x=Race.Ethnic.Categories)) +
r.e.g.p.l geom_bar(position="dodge", stat="identity") +
theme_bw()
If you see mistakes or want to suggest changes, please create an issue on the source repository.
Text and figures are licensed under Creative Commons Attribution CC BY 4.0. Source code is available at https://github.com/hai-mn/hai-mn.github.io, unless otherwise noted. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".
For attribution, please cite this work as
Nguyen (2021, June 8). HaiBiostat: Metabolic Syndrome Prevalence across time using NHANES Data. Retrieved from https://hai-mn.github.io/posts/2021-06-09-mets-using-nhanesdata/
BibTeX citation
@misc{nguyen2021metabolic, author = {Nguyen, Hai}, title = {HaiBiostat: Metabolic Syndrome Prevalence across time using NHANES Data}, url = {https://hai-mn.github.io/posts/2021-06-09-mets-using-nhanesdata/}, year = {2021} }