Case-Study Exemplar
A Case-Study of Parkinson’s Disease using Simulated Big Data
- Overview: This case-study examines the associations between clinical, demographic, imaging and genetics variables for Parkinson’s disease. This is an example of Big Data for investigating important neurodegenerative disorders.
- Driving Challenges:
- Are there relations between imaging, genetic and clinical covariates?
- Can we predict subject diagnosis using model-based approaches?
- Do exploratory data analytics provide clues to Parkinson’s disease?
- If clinical PD Dx is solely based on UPDRS scores, is there value added by including other covariates?
- Meta-data:
- ID: Case subject identifier
- Imaging Biomarkers: ComputeArea = surface area of the 3D brain region of interest, Volume = the 3D volume/size of the region of interest; the region name is encoded in the morphometry measure name (e.g., L_putamen_ComputeArea represents the left putamen surface area):
L_caudate_ComputeArea, L_caudate_Volume, R_caudate_ComputeArea, R_caudate_Volume, L_putamen_ComputeArea, L_putamen_Volume, R_putamen_ComputeArea, R_putamen_Volume, L_hippocampus_ComputeArea, L_hippocampus_Volume, R_hippocampus_ComputeArea, R_hippocampus_Volume, cerebellum_ComputeArea, cerebellum_Volume, L_lingual_gyrus_ComputeArea, L_lingual_gyrus_Volume, R_lingual_gyrus_ComputeArea, R_lingual_gyrus_Volume, L_fusiform_gyrus_ComputeArea, L_fusiform_gyrus_Volume, R_fusiform_gyrus_ComputeArea, R_fusiform_gyrus_Volume
- Demographics variables: Sex, Weight, Age
- Diagnosis: Dx, with PD = Parkinson’s disease, HC = Healthy Control, SWEDD = scans without evidence of dopaminergic deficit (tremor-associated clinical parkinsonism features)
- Genetics: chr12_rs34637584_GT, chr17_rs11868035_GT
- Clinical: UPDRS_part_I, UPDRS_part_II, UPDRS_part_III (Movement Disorder Society-Sponsored Revision of the Unified Parkinson's Disease Rating Scale (MDS-UPDRS); see http://www.movementdisorders.org/MDS/Education/Rating-Scales.htm; Normal=0)
- Time: VisitTime: four time-points (baseline (0), 6, 12, and 18 month follow-ups).
- Data: Complete data is in this table (HS853_PD_SimData.csv), from the Canvas-files partition.
- Provenance: This case study only uses simulated data. The complete R-script generating the data is included below. The entire case study is CC-BY licensed and can be used, updated, refactored and expanded by the entire community.
# Define number of subjects and number of visits (time points) per subject
NumSubj <- 282
NumTime <- 4
# Define data elements
# Cases: one numeric subject identifier per simulated subject
Cases <- c(
  2, 3, 6, 7, 8, 10, 11, 12, 13, 14,
  17, 18, 20, 21, 22, 23, 24, 25, 26, 28,
  29, 30, 31, 32, 33, 34, 35, 37, 41, 42,
  43, 44, 45, 53, 55, 58, 60, 62, 67, 69,
  71, 72, 74, 79, 80, 85, 87, 90, 95, 97,
  99, 100, 101, 106, 107, 109, 112, 120, 123, 125,
  128, 129, 132, 134, 136, 139, 142, 147, 149, 153,
  158, 160, 162, 163, 167, 172, 174, 178, 179, 180,
  182, 192, 195, 201, 208, 211, 215, 217, 223, 227,
  228, 233, 235, 236, 240, 245, 248, 250, 251, 254,
  257, 259, 261, 264, 268, 269, 272, 273, 275, 279,
  288, 289, 291, 296, 298, 303, 305, 309, 314, 318,
  324, 325, 326, 328, 331, 332, 333, 334, 336, 338,
  339, 341, 344, 346, 347, 350, 353, 354, 359, 361,
  363, 364, 366, 367, 368, 369, 370, 371, 372, 374,
  375, 376, 377, 378, 381, 382, 384, 385, 386, 387,
  389, 390, 393, 395, 398, 400, 410, 421, 423, 428,
  433, 435, 443, 447, 449, 450, 451, 453, 454, 455,
  456, 457, 458, 459, 460, 461, 465, 466, 467, 470,
  471, 472, 476, 477, 478, 479, 480, 481, 483, 484,
  485, 486, 487, 488, 489, 492, 493, 494, 496, 498,
  501, 504, 507, 510, 513, 515, 528, 530, 533, 537,
  538, 542, 545, 546, 549, 555, 557, 559, 560, 566,
  572, 573, 576, 582, 586, 590, 592, 597, 603, 604,
  611, 619, 621, 623, 624, 625, 631, 633, 634, 635,
  637, 640, 641, 643, 644, 645, 646, 647, 648, 649,
  650, 652, 654, 656, 658, 660, 664, 665, 670, 673,
  677, 678, 679, 680, 682, 683, 686, 687, 688, 689,
  690, 692
)
# The aggregation step repeats every per-subject vector NumTime times and
# binds them column-wise; a wrong number of IDs would be recycled silently
# there, so fail early if the ID list and NumSubj ever drift apart.
stopifnot(length(Cases) == NumSubj, anyDuplicated(Cases) == 0)
# Imaging Biomarkers
# Every morphometry measure is an independent Poisson sample with one value
# per subject; lambda is the measure-specific mean. The draws are kept in
# their original order so the random-number stream is consumed identically.
L_caudate_ComputeArea        <- rpois(n = NumSubj, lambda = 600)
L_caudate_Volume             <- rpois(n = NumSubj, lambda = 800)
R_caudate_ComputeArea        <- rpois(n = NumSubj, lambda = 893)
R_caudate_Volume             <- rpois(n = NumSubj, lambda = 1000)
L_putamen_ComputeArea        <- rpois(n = NumSubj, lambda = 900)
L_putamen_Volume             <- rpois(n = NumSubj, lambda = 1400)
R_putamen_ComputeArea        <- rpois(n = NumSubj, lambda = 1300)
R_putamen_Volume             <- rpois(n = NumSubj, lambda = 3000)
L_hippocampus_ComputeArea    <- rpois(n = NumSubj, lambda = 1300)
L_hippocampus_Volume         <- rpois(n = NumSubj, lambda = 3200)
R_hippocampus_ComputeArea    <- rpois(n = NumSubj, lambda = 1500)
R_hippocampus_Volume         <- rpois(n = NumSubj, lambda = 3800)
cerebellum_ComputeArea       <- rpois(n = NumSubj, lambda = 16700)
cerebellum_Volume            <- rpois(n = NumSubj, lambda = 14000)
L_lingual_gyrus_ComputeArea  <- rpois(n = NumSubj, lambda = 3300)
L_lingual_gyrus_Volume       <- rpois(n = NumSubj, lambda = 11000)
R_lingual_gyrus_ComputeArea  <- rpois(n = NumSubj, lambda = 3300)
R_lingual_gyrus_Volume       <- rpois(n = NumSubj, lambda = 12000)
L_fusiform_gyrus_ComputeArea <- rpois(n = NumSubj, lambda = 3600)
L_fusiform_gyrus_Volume      <- rpois(n = NumSubj, lambda = 11000)
R_fusiform_gyrus_ComputeArea <- rpois(n = NumSubj, lambda = 3300)
R_fusiform_gyrus_Volume      <- rpois(n = NumSubj, lambda = 10000)
# Demographics variables
# Sex is 0 or 1 with equal probability; Weight and Age are normal draws
# truncated to whole numbers by as.integer().
Sex <- ifelse(runif(NumSubj) < 0.5, 0, 1)
Weight <- as.integer(rnorm(NumSubj, mean = 80, sd = 10))
Age <- as.integer(rnorm(NumSubj, mean = 62, sd = 10))

# Diagnosis:
# Subjects are laid out in three consecutive diagnostic groups. The group
# sizes are defined once and must add up to NumSubj, otherwise the vectors
# below would not line up with the other per-subject variables.
GroupSize <- c(PD = 100, HC = 100, SWEDD = 82)
stopifnot(sum(GroupSize) == NumSubj)
Dx <- rep(names(GroupSize), times = GroupSize)

# Helper: n Bernoulli trials coded 0 with probability p_zero and 1 otherwise.
draw_binary <- function(n, p_zero) {
  ifelse(runif(n) < p_zero, 0, 1)
}

# Genetics: NumSubj Bernoulli trials, with a group-specific P(genotype == 0)
chr12_rs34637584_GT <- c(
  draw_binary(GroupSize[["PD"]], 0.3),
  draw_binary(GroupSize[["HC"]], 0.6),
  draw_binary(GroupSize[["SWEDD"]], 0.4)
)
chr17_rs11868035_GT <- c(
  draw_binary(GroupSize[["PD"]], 0.7),
  draw_binary(GroupSize[["HC"]], 0.4),
  draw_binary(GroupSize[["SWEDD"]], 0.5)
)

# Clinical
# (an alternative kept from the original script, not used here:
#  rpois(NumSubj, 15) + rpois(NumSubj, 6))
# UPDRS_part_I is the sum of two independent 0/1 draws, i.e. a value in 0..2.
UPDRS_part_I <- c(
  draw_binary(GroupSize[["PD"]], 0.7) + draw_binary(GroupSize[["PD"]], 0.7),
  draw_binary(GroupSize[["HC"]], 0.6) + draw_binary(GroupSize[["HC"]], 0.6),
  draw_binary(GroupSize[["SWEDD"]], 0.4) +
    draw_binary(GroupSize[["SWEDD"]], 0.4)
)
# UPDRS parts II and III are uniform integer draws from 1..max, where the
# maximum depends on the diagnostic group.
UPDRS_part_II <- c(
  sample.int(20, GroupSize[["PD"]], replace = TRUE),
  sample.int(14, GroupSize[["HC"]], replace = TRUE),
  sample.int(18, GroupSize[["SWEDD"]], replace = TRUE)
)
UPDRS_part_III <- c(
  sample.int(30, GroupSize[["PD"]], replace = TRUE),
  sample.int(20, GroupSize[["HC"]], replace = TRUE),
  sample.int(25, GroupSize[["SWEDD"]], replace = TRUE)
)
# Time: VisitTime - generated here, as part of the aggregation step
# Visit times in months: baseline (0) plus follow-ups every 6 months.
VisitMonths <- seq(from = 0, by = 6, length.out = NumTime)

# Data (putting all components together)
# Names of the per-subject vectors, in output column order. This single list
# drives both the column selection and the column names, so the two cannot
# drift apart.
SubjectVars <- c(
  "Cases",
  "L_caudate_ComputeArea",
  "L_caudate_Volume",
  "R_caudate_ComputeArea",
  "R_caudate_Volume",
  "L_putamen_ComputeArea",
  "L_putamen_Volume",
  "R_putamen_ComputeArea",
  "R_putamen_Volume",
  "L_hippocampus_ComputeArea",
  "L_hippocampus_Volume",
  "R_hippocampus_ComputeArea",
  "R_hippocampus_Volume",
  "cerebellum_ComputeArea",
  "cerebellum_Volume",
  "L_lingual_gyrus_ComputeArea",
  "L_lingual_gyrus_Volume",
  "R_lingual_gyrus_ComputeArea",
  "R_lingual_gyrus_Volume",
  "L_fusiform_gyrus_ComputeArea",
  "L_fusiform_gyrus_Volume",
  "R_fusiform_gyrus_ComputeArea",
  "R_fusiform_gyrus_Volume",
  "Sex", "Weight", "Age",
  "Dx", "chr12_rs34637584_GT", "chr17_rs11868035_GT",
  "UPDRS_part_I", "UPDRS_part_II", "UPDRS_part_III"
)
SubjectCols <- mget(SubjectVars)
# Guard against silent recycling: every component must have one value
# per subject.
stopifnot(all(lengths(SubjectCols) == NumSubj))

# A data.frame is used instead of cbind(): binding the numeric vectors with
# the character vector Dx would coerce the whole matrix to character, which
# makes summary() uninformative and writes every number as a quoted string.
SubjectLevel <- data.frame(SubjectCols, stringsAsFactors = FALSE)

# Long format: repeat each subject's row once per visit, then append the
# visit time, which cycles through VisitMonths within each subject.
sim_PD_Data <- SubjectLevel[rep(seq_len(NumSubj), each = NumTime), ]
sim_PD_Data$Time <- rep(VisitMonths, times = NumSubj)
rownames(sim_PD_Data) <- NULL

# some QC
summary(sim_PD_Data)
dim(sim_PD_Data)
head(sim_PD_Data)
# Write out (save) the result to a file that can be shared
write.table(sim_PD_Data, "output_data.csv", sep=",", row.names=FALSE, col.names=TRUE)
The transposed matrix looks like this:
Cases |
2 |
2 |
2 |
2 |
3 |
3 |
3 |
3 |
6 |
6 |
L_caudate_ComputeArea |
597 |
597 |
597 |
597 |
604 |
604 |
604 |
604 |
580 |
.. |
L_caudate_Volume |
767 |
767 |
767 |
767 |
873 |
873 |
873 |
873 |
797 |
.. |
R_caudate_ComputeArea |
855 |
855 |
855 |
855 |
935 |
935 |
935 |
935 |
919 |
.. |
R_caudate_Volume |
968 |
968 |
968 |
968 |
1043 |
1043 |
1043 |
1043 |
1023 |
.. |
L_putamen_ComputeArea |
842 |
842 |
842 |
842 |
892 |
892 |
892 |
892 |
908 |
.. |
L_putamen_Volume |
1357 |
1357 |
1357 |
1357 |
1366 |
1366 |
1366 |
1366 |
1415 |
.. |
R_putamen_ComputeArea |
1285 |
1285 |
1285 |
1285 |
1305 |
1305 |
1305 |
1305 |
1264 |
.. |
R_putamen_Volume |
3052 |
3052 |
3052 |
3052 |
2920 |
2920 |
2920 |
2920 |
2995 |
.. |
L_hippocampus_ComputeArea |
1306 |
1306 |
1306 |
1306 |
1292 |
1292 |
1292 |
1292 |
1313 |
.. |
L_hippocampus_Volume |
3238 |
3238 |
3238 |
3238 |
3079 |
3079 |
3079 |
3079 |
3227 |
.. |
R_hippocampus_ComputeArea |
1513 |
1513 |
1513 |
1513 |
1516 |
1516 |
1516 |
1516 |
1541 |
.. |
R_hippocampus_Volume |
3759 |
3759 |
3759 |
3759 |
3827 |
3827 |
3827 |
3827 |
3791 |
.. |
cerebellum_ComputeArea |
16845 |
16845 |
16845 |
16845 |
16698 |
16698 |
16698 |
16698 |
16480 |
.. |
cerebellum_Volume |
13949 |
13949 |
13949 |
13949 |
14076 |
14076 |
14076 |
14076 |
13992 |
.. |
L_lingual_gyrus_ComputeArea |
3268 |
3268 |
3268 |
3268 |
3243 |
3243 |
3243 |
3243 |
3331 |
.. |
L_lingual_gyrus_Volume |
11130 |
11130 |
11130 |
11130 |
11033 |
11033 |
11033 |
11033 |
11093 |
.. |
R_lingual_gyrus_ComputeArea |
3294 |
3294 |
3294 |
3294 |
3190 |
3190 |
3190 |
3190 |
3407 |
.. |
R_lingual_gyrus_Volume |
12221 |
12221 |
12221 |
12221 |
12187 |
12187 |
12187 |
12187 |
12062 |
.. |
L_fusiform_gyrus_ComputeArea |
3625 |
3625 |
3625 |
3625 |
3631 |
3631 |
3631 |
3631 |
3520 |
.. |
L_fusiform_gyrus_Volume |
11087 |
11087 |
11087 |
11087 |
11116 |
11116 |
11116 |
11116 |
10890 |
.. |
R_fusiform_gyrus_ComputeArea |
3232 |
3232 |
3232 |
3232 |
3302 |
3302 |
3302 |
3302 |
3328 |
.. |
R_fusiform_gyrus_Volume |
10122 |
10122 |
10122 |
10122 |
10162 |
10162 |
10162 |
10162 |
9884 |
.. |
Sex |
1 |
1 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
.. |
Weight |
84 |
84 |
84 |
84 |
97 |
97 |
97 |
97 |
96 |
.. |
Age |
67 |
67 |
67 |
67 |
39 |
39 |
39 |
39 |
54 |
.. |
Dx |
PD |
PD |
PD |
PD |
PD |
PD |
PD |
PD |
PD |
.. |
chr12_rs34637584_GT |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
0 |
.. |
chr17_rs11868035_GT |
0 |
0 |
0 |
0 |
1 |
1 |
1 |
1 |
0 |
.. |
UPDRS_part_I |
1 |
1 |
1 |
1 |
0 |
0 |
0 |
0 |
1 |
.. |
UPDRS_part_II |
12 |
12 |
12 |
12 |
19 |
19 |
19 |
19 |
15 |
.. |
UPDRS_part_III |
1 |
1 |
1 |
1 |
22 |
22 |
22 |
22 |
19 |
.. |
Time |
0 |
6 |
12 |
18 |
0 |
6 |
12 |
18 |
0 |
.. |
See more on this case-study here (including the complete dataset).