Case-Study Exemplar

A Case-Study of Parkinson’s Disease using Simulated Big Data

 

  • Overview: This case-study examines the associations between clinical, demographic, imaging and genetics variables for Parkinson’s disease. This is an example of Big Data for investigating important neurodegenerative disorders.

 

  • Driving Challenges:
    • Are there relations between imaging, genetic and clinical covariates?
    • Can we predict subject diagnosis using model-based approaches?
    • Do exploratory data analytics provide clues to Parkinson’s disease?
    • If clinical PD Dx is solely based on UPDRS scores, is there value added by including other covariates?

 

  • Meta-data:
    • ID: Case subject identifier
    • Imaging Biomarkers: ComputeArea = the surface area of a 3D brain region of interest; Volume = the 3D volume (size) of the region of interest. The region name is encoded in the morphometry measure name (e.g., L_putamen_ComputeArea represents the left putamen surface area). The imaging variables are:

L_caudate_ComputeArea, L_caudate_Volume, R_caudate_ComputeArea, R_caudate_Volume, L_putamen_ComputeArea, L_putamen_Volume, R_putamen_ComputeArea, R_putamen_Volume, L_hippocampus_ComputeArea, L_hippocampus_Volume, R_hippocampus_ComputeArea, R_hippocampus_Volume, cerebellum_ComputeArea, cerebellum_Volume, L_lingual_gyrus_ComputeArea, L_lingual_gyrus_Volume, R_lingual_gyrus_ComputeArea, R_lingual_gyrus_Volume, L_fusiform_gyrus_ComputeArea, L_fusiform_gyrus_Volume, R_fusiform_gyrus_ComputeArea, R_fusiform_gyrus_Volume

  • Demographics variables: Sex, Weight, Age
  • Diagnosis: Dx, with PD = Parkinson’s disease, HC = Healthy Control, and SWEDD = Scans Without Evidence of Dopaminergic Deficit (subjects with tremor-associated clinical parkinsonism features whose scans show no dopaminergic deficit)
  • Genetics: chr12_rs34637584_GT, chr17_rs11868035_GT
  • Clinical: UPDRS_part_I, UPDRS_part_II, UPDRS_part_III (Movement Disorder Society-Sponsored Revision of the Unified Parkinson's Disease Rating Scale (MDS-UPDRS), see http://www.movementdisorders.org/MDS/Education/Rating-Scales.htm); Normal = 0
  • Time: VisitTime: four time-points (baseline (0), 6, 12, and 18 month follow-ups).

 

 

  • Provenance: This case study only uses simulated data. The complete R-script generating the data is included below. The entire case study is CC-BY licensed and can be used, updated, refactored and expanded by the entire community.

 

# Study dimensions ----

# Number of simulated subjects.
NumSubj <- 282

# Number of visits (time points) recorded for every subject.
NumTime <- 4

# Define data elements

# Cases: one identifier per subject. The identifiers are hard-coded and are
# listed in strictly increasing order.
Cases <- c(
  2, 3, 6, 7, 8, 10, 11, 12, 13, 14,
  17, 18, 20, 21, 22, 23, 24, 25, 26, 28,
  29, 30, 31, 32, 33, 34, 35, 37, 41, 42,
  43, 44, 45, 53, 55, 58, 60, 62, 67, 69,
  71, 72, 74, 79, 80, 85, 87, 90, 95, 97,
  99, 100, 101, 106, 107, 109, 112, 120, 123, 125,
  128, 129, 132, 134, 136, 139, 142, 147, 149, 153,
  158, 160, 162, 163, 167, 172, 174, 178, 179, 180,
  182, 192, 195, 201, 208, 211, 215, 217, 223, 227,
  228, 233, 235, 236, 240, 245, 248, 250, 251, 254,
  257, 259, 261, 264, 268, 269, 272, 273, 275, 279,
  288, 289, 291, 296, 298, 303, 305, 309, 314, 318,
  324, 325, 326, 328, 331, 332, 333, 334, 336, 338,
  339, 341, 344, 346, 347, 350, 353, 354, 359, 361,
  363, 364, 366, 367, 368, 369, 370, 371, 372, 374,
  375, 376, 377, 378, 381, 382, 384, 385, 386, 387,
  389, 390, 393, 395, 398, 400, 410, 421, 423, 428,
  433, 435, 443, 447, 449, 450, 451, 453, 454, 455,
  456, 457, 458, 459, 460, 461, 465, 466, 467, 470,
  471, 472, 476, 477, 478, 479, 480, 481, 483, 484,
  485, 486, 487, 488, 489, 492, 493, 494, 496, 498,
  501, 504, 507, 510, 513, 515, 528, 530, 533, 537,
  538, 542, 545, 546, 549, 555, 557, 559, 560, 566,
  572, 573, 576, 582, 586, 590, 592, 597, 603, 604,
  611, 619, 621, 623, 624, 625, 631, 633, 634, 635,
  637, 640, 641, 643, 644, 645, 646, 647, 648, 649,
  650, 652, 654, 656, 658, 660, 664, 665, 670, 673,
  677, 678, 679, 680, 682, 683, 686, 687, 688, 689,
  690, 692
)

# Fail early if the hard-coded identifier list and NumSubj drift apart, or if
# an identifier was entered twice; otherwise the mismatch would only show up
# later as silently recycled values when the columns are combined.
stopifnot(
  length(Cases) == NumSubj,
  anyDuplicated(Cases) == 0L
)

 

# Imaging biomarkers ----
# Every morphometry measure is simulated independently as one Poisson draw
# per subject; `lambda` is the expected value of the measure. The calls are
# kept in this order so the random number stream is consumed identically.
L_caudate_ComputeArea <- rpois(n = NumSubj, lambda = 600)
L_caudate_Volume <- rpois(n = NumSubj, lambda = 800)
R_caudate_ComputeArea <- rpois(n = NumSubj, lambda = 893)
R_caudate_Volume <- rpois(n = NumSubj, lambda = 1000)

L_putamen_ComputeArea <- rpois(n = NumSubj, lambda = 900)
L_putamen_Volume <- rpois(n = NumSubj, lambda = 1400)
R_putamen_ComputeArea <- rpois(n = NumSubj, lambda = 1300)
R_putamen_Volume <- rpois(n = NumSubj, lambda = 3000)

L_hippocampus_ComputeArea <- rpois(n = NumSubj, lambda = 1300)
L_hippocampus_Volume <- rpois(n = NumSubj, lambda = 3200)
R_hippocampus_ComputeArea <- rpois(n = NumSubj, lambda = 1500)
R_hippocampus_Volume <- rpois(n = NumSubj, lambda = 3800)

cerebellum_ComputeArea <- rpois(n = NumSubj, lambda = 16700)
cerebellum_Volume <- rpois(n = NumSubj, lambda = 14000)

L_lingual_gyrus_ComputeArea <- rpois(n = NumSubj, lambda = 3300)
L_lingual_gyrus_Volume <- rpois(n = NumSubj, lambda = 11000)
R_lingual_gyrus_ComputeArea <- rpois(n = NumSubj, lambda = 3300)
R_lingual_gyrus_Volume <- rpois(n = NumSubj, lambda = 12000)

L_fusiform_gyrus_ComputeArea <- rpois(n = NumSubj, lambda = 3600)
L_fusiform_gyrus_Volume <- rpois(n = NumSubj, lambda = 11000)
R_fusiform_gyrus_ComputeArea <- rpois(n = NumSubj, lambda = 3300)
R_fusiform_gyrus_Volume <- rpois(n = NumSubj, lambda = 10000)

 

# Demographics ----

# Sex: binary indicator, 0 when the uniform draw is below 0.5 and 1 otherwise
# (the source does not state which sex each code stands for).
Sex <- ifelse(test = runif(n = NumSubj) < 0.5, yes = 0, no = 1)

# Weight: normal draw (mean 80, sd 10); as.integer() drops the fractional part.
Weight <- as.integer(rnorm(n = NumSubj, mean = 80, sd = 10))

# Age: normal draw (mean 62, sd 10); as.integer() drops the fractional part.
Age <- as.integer(rnorm(n = NumSubj, mean = 62, sd = 10))

 

# Diagnosis groups ----
# Number of simulated subjects in each diagnostic group. Every
# group-dependent vector below is laid out in this order (PD, HC, SWEDD).
# The sizes must add up to NumSubj (100 + 100 + 82 = 282).
GroupSizes <- c(PD = 100, HC = 100, SWEDD = 82)
n_pd <- GroupSizes[["PD"]]
n_hc <- GroupSizes[["HC"]]
n_swedd <- GroupSizes[["SWEDD"]]

# Draw `n` independent 0/1 values; each value is 0 with probability `p_zero`
# and 1 otherwise.
sim_binary <- function(n, p_zero) {
  ifelse(runif(n) < p_zero, 0, 1)
}

# Diagnosis: one label per subject, grouped as PD, then HC, then SWEDD.
Dx <- rep(names(GroupSizes), times = GroupSizes)

# Genetics: one Bernoulli trial per subject, with a group-specific
# probability of the value 0.
chr12_rs34637584_GT <- c(
  sim_binary(n_pd, 0.3),
  sim_binary(n_hc, 0.6),
  sim_binary(n_swedd, 0.4)
)

chr17_rs11868035_GT <- c(
  sim_binary(n_pd, 0.7),
  sim_binary(n_hc, 0.4),
  sim_binary(n_swedd, 0.5)
)

# Clinical
# (the original script carried the note
#  "rpois(NumSubj, 15) + rpois(NumSubj, 6)" here, presumably an alternative
#  generator that is not used)

# UPDRS part I: sum of two independent 0/1 draws, so every score is 0, 1 or 2.
UPDRS_part_I <- c(
  sim_binary(n_pd, 0.7) + sim_binary(n_pd, 0.7),
  sim_binary(n_hc, 0.6) + sim_binary(n_hc, 0.6),
  sim_binary(n_swedd, 0.4) + sim_binary(n_swedd, 0.4)
)

# UPDRS part II: uniform integer scores from 1 up to a group-specific maximum.
UPDRS_part_II <- c(
  sample.int(20, n_pd, replace = TRUE),
  sample.int(14, n_hc, replace = TRUE),
  sample.int(18, n_swedd, replace = TRUE)
)

# UPDRS part III: uniform integer scores from 1 up to a group-specific maximum.
UPDRS_part_III <- c(
  sample.int(30, n_pd, replace = TRUE),
  sample.int(20, n_hc, replace = TRUE),
  sample.int(25, n_swedd, replace = TRUE)
)

 

# Time: VisitTime – done automatically below in aggregator

 

# Data (putting all components together) ----
#
# Build one row per subject first, then repeat every subject row NumTime
# times so that each subject contributes one row per visit.
#
# A data.frame is used rather than cbind(): cbind() returns a matrix, and
# because Dx is a character vector the whole matrix would be coerced to
# character. That makes summary() uninformative and makes write.table()
# quote every number as a string. A data.frame keeps each column's own type.
#
# Column names are taken from the variable names passed to data.frame(), so
# names and values cannot get out of step.
subject_level <- data.frame(
  Cases,
  # Imaging
  L_caudate_ComputeArea,
  L_caudate_Volume,
  R_caudate_ComputeArea,
  R_caudate_Volume,
  L_putamen_ComputeArea,
  L_putamen_Volume,
  R_putamen_ComputeArea,
  R_putamen_Volume,
  L_hippocampus_ComputeArea,
  L_hippocampus_Volume,
  R_hippocampus_ComputeArea,
  R_hippocampus_Volume,
  cerebellum_ComputeArea,
  cerebellum_Volume,
  L_lingual_gyrus_ComputeArea,
  L_lingual_gyrus_Volume,
  R_lingual_gyrus_ComputeArea,
  R_lingual_gyrus_Volume,
  L_fusiform_gyrus_ComputeArea,
  L_fusiform_gyrus_Volume,
  R_fusiform_gyrus_ComputeArea,
  R_fusiform_gyrus_Volume,
  # Demographics
  Sex,
  Weight,
  Age,
  # Dx
  Dx,
  # Genetics
  chr12_rs34637584_GT,
  chr17_rs11868035_GT,
  # Clinical
  UPDRS_part_I,
  UPDRS_part_II,
  UPDRS_part_III,
  stringsAsFactors = FALSE
)

# Every per-subject vector must describe exactly NumSubj subjects.
stopifnot(nrow(subject_level) == NumSubj)

# Repeat each subject row NumTime times, keeping the visits of one subject
# on consecutive rows.
visit_rows <- rep(seq_len(nrow(subject_level)), each = NumTime)
sim_PD_Data <- subject_level[visit_rows, , drop = FALSE]
# Row subsetting leaves row names such as "1.1"; reset them to 1..n.
rownames(sim_PD_Data) <- NULL

# Time: visit month, starting at baseline (0) with one visit every 6 months
# (0, 6, 12, 18 when NumTime is 4), cycling within each subject.
visit_months <- seq(from = 0, by = 6, length.out = NumTime)
sim_PD_Data$Time <- rep(visit_months, times = NumSubj)

 

# Quick quality control ----
# Column summaries, dimensions and the first rows of the assembled data.
summary(sim_PD_Data)
dim(sim_PD_Data)
head(sim_PD_Data)

# Export ----
# Save the result as a comma-separated file with a header row and without
# row names, so that it can be shared.
write.table(
  x = sim_PD_Data,
  file = "output_data.csv",
  sep = ",",
  row.names = FALSE,
  col.names = TRUE
)

 

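The first rows of the resulting data, shown transposed (each variable name is followed by its values for the first few rows; ".." marks truncated output), look like this: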

 

Cases

2

2

2

2

3

3

3

3

6

6

L_caudate_ComputeArea

597

597

597

597

604

604

604

604

580

..

L_caudate_Volume

767

767

767

767

873

873

873

873

797

..

R_caudate_ComputeArea

855

855

855

855

935

935

935

935

919

..

R_caudate_Volume

968

968

968

968

1043

1043

1043

1043

1023

..

L_putamen_ComputeArea

842

842

842

842

892

892

892

892

908

..

L_putamen_Volume

1357

1357

1357

1357

1366

1366

1366

1366

1415

..

R_putamen_ComputeArea

1285

1285

1285

1285

1305

1305

1305

1305

1264

..

R_putamen_Volume

3052

3052

3052

3052

2920

2920

2920

2920

2995

..

L_hippocampus_ComputeArea

1306

1306

1306

1306

1292

1292

1292

1292

1313

..

L_hippocampus_Volume

3238

3238

3238

3238

3079

3079

3079

3079

3227

..

R_hippocampus_ComputeArea

1513

1513

1513

1513

1516

1516

1516

1516

1541

..

R_hippocampus_Volume

3759

3759

3759

3759

3827

3827

3827

3827

3791

..

cerebellum_ComputeArea

16845

16845

16845

16845

16698

16698

16698

16698

16480

..

cerebellum_Volume

13949

13949

13949

13949

14076

14076

14076

14076

13992

..

L_lingual_gyrus_ComputeArea

3268

3268

3268

3268

3243

3243

3243

3243

3331

..

L_lingual_gyrus_Volume

11130

11130

11130

11130

11033

11033

11033

11033

11093

..

R_lingual_gyrus_ComputeArea

3294

3294

3294

3294

3190

3190

3190

3190

3407

..

R_lingual_gyrus_Volume

12221

12221

12221

12221

12187

12187

12187

12187

12062

..

L_fusiform_gyrus_ComputeArea

3625

3625

3625

3625

3631

3631

3631

3631

3520

..

L_fusiform_gyrus_Volume

11087

11087

11087

11087

11116

11116

11116

11116

10890

..

R_fusiform_gyrus_ComputeArea

3232

3232

3232

3232

3302

3302

3302

3302

3328

..

R_fusiform_gyrus_Volume

10122

10122

10122

10122

10162

10162

10162

10162

9884

..

Sex

1

1

1

1

0

0

0

0

0

..

Weight

84

84

84

84

97

97

97

97

96

..

Age

67

67

67

67

39

39

39

39

54

..

Dx

PD

PD

PD

PD

PD

PD

PD

PD

PD

..

chr12_rs34637584_GT

1

1

1

1

1

1

1

1

0

..

chr17_rs11868035_GT

0

0

0

0

1

1

1

1

0

..

UPDRS_part_I

1

1

1

1

0

0

0

0

1

..

UPDRS_part_II

12

12

12

12

19

19

19

19

15

..

UPDRS_part_III

1

1

1

1

22

22

22

22

19

..

Time

0

6

12

18

0

6

12

18

0

..

 

See more on this case-study here (including the complete dataset).