Generating Small, Medium, and Large Datasets

Overview

This vignette demonstrates how to use the {samplezoo} package to generate datasets of varying sizes (small, medium, and large) with variables from multiple probability distributions.

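Each dataset contains 13 columns, one per sampled variable (as shown in the printed output below): norm, norm_2, norm_3, bern, neg, pois, exp, unif, beta, gamma, chi_sq, t_dist, and f_dist.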

library(samplezoo)

Generate a small dataset (i.e., 100 rows)

data_small <- samplezoo("small")
head(data_small)
#>       norm   norm_2   norm_3 bern neg pois       exp      unif       beta
#> 1 52.81412 53.51319 53.67912    0   4    1  2.930778 0.6989983 0.10578491
#> 2 68.23793 59.38779 35.13537    1   0    2  0.927297 0.2805017 0.07902199
#> 3 68.65313 58.44543 40.40566    0   0    1 34.820686 0.2691659 0.39371600
#> 4 45.62340 73.49855 61.86956    0   2    5 11.391594 0.6804970 0.46589920
#> 5 43.92391 37.85506 65.47060    0   0    1 21.108217 0.3427744 0.75404320
#> 6 51.66342 46.86237 81.70720    1   1    1  8.234561 0.3648130 0.14351144
#>       gamma    chi_sq     t_dist    f_dist
#> 1 3.3267537  7.212244 -0.8529863 0.5167607
#> 2 3.6669007  9.661967 -1.0643916 0.7294443
#> 3 4.1921663  5.173578  0.5679994 0.9108190
#> 4 0.8220897  8.708574  1.4667282 1.4952712
#> 5 2.0554174 19.556774 -0.6172805 0.1470397
#> 6 2.7243905 11.227294 -3.5974447 1.1166657

Generate a medium-sized dataset (i.e., 1,000 rows)

data_medium <- samplezoo("medium")
head(data_medium)
#>       norm   norm_2    norm_3 bern neg pois       exp      unif      beta
#> 1 46.02367 71.25660 39.750130    0   0    2 21.772330 0.9305956 0.4539395
#> 2 49.32905 58.94355 -3.378858    0   2    1 24.235364 0.7742999 0.2503492
#> 3 58.65559 69.64089  6.842902    1   2    2 27.303474 0.7920125 0.2215807
#> 4 29.89703 56.91901 70.248972    0   3    2 53.093483 0.6174459 0.2416861
#> 5 76.22624 59.05343 30.431970    1   3    4  1.065747 0.9747066 0.4487912
#> 6 40.68722 64.01339 12.307218    0   2    1  7.177614 0.7474789 0.4006164
#>       gamma    chi_sq     t_dist    f_dist
#> 1 4.9530141 11.196222 -0.8598328 3.2969429
#> 2 1.4745625  8.721485 -0.2513822 1.1522467
#> 3 0.4714032  7.562053  1.3909277 1.1831806
#> 4 2.3675387 16.463112  0.4024442 1.1443270
#> 5 4.0108882  5.337741  0.8392667 1.1098908
#> 6 4.8259632 11.386632 -0.2674480 0.6722363

Generate a large-sized dataset (i.e., 10,000 rows)

data_large <- samplezoo("large")
head(data_large)
#>       norm   norm_2   norm_3 bern neg pois       exp       unif       beta
#> 1 68.17047 43.77534 32.77409    0   0    2  7.686341 0.07423565 0.22908364
#> 2 54.50925 57.31223 19.59262    0   0    3  5.295738 0.43575854 0.28826722
#> 3 34.23543 73.95832 35.53038    0   1    1 18.730670 0.07914885 0.02256758
#> 4 71.51849 66.28091 40.40233    1   0    8 18.031923 0.90119115 0.36785600
#> 5 54.46610 68.21688 36.06081    0   1    1  8.926137 0.90411665 0.57451053
#> 6 48.68553 60.46151 53.43388    1   1    2  2.153736 0.25749653 0.43182738
#>      gamma    chi_sq     t_dist    f_dist
#> 1 3.527325 14.116512  0.1701468 1.1050098
#> 2 1.410158  4.353722 -0.8447841 2.2619767
#> 3 2.300857 10.397276  2.1414233 1.6591481
#> 4 4.877539 12.756853  0.5519434 1.0611762
#> 5 2.961266 10.642384  0.6644180 0.8904096
#> 6 3.291894 12.879280 -0.3902477 3.0987640

Adding Variation or Ensuring Reproducibility with set.seed()

Call set.seed() before generating random data to control the result: reusing the same seed reproduces exactly the same dataset, while changing the seed produces a different one.

Reproducibility

set.seed(123)
data_large <- samplezoo("large")
head(data_large)
#>       norm   norm_2    norm_3 bern neg pois       exp      unif       beta
#> 1 41.59287 83.70725 23.274065    0   1    6  6.628373 0.5468223 0.08294255
#> 2 46.54734 58.33188 35.588540    1   0    5 21.305366 0.3900809 0.63544684
#> 3 73.38062 69.26961 -2.070295    1   2    4  0.189645 0.7262119 0.11520674
#> 4 51.05763 54.31848  6.643849    0   2    2  8.479098 0.5101462 0.38184206
#> 5 51.93932 62.25090 18.040743    0   0    2 11.885521 0.2964126 0.17196046
#> 6 75.72597 71.31986  6.687576    0   1    4  6.363993 0.1442317 0.35908460
#>       gamma    chi_sq     t_dist    f_dist
#> 1 6.9893762 10.286282 -0.3814568 0.7264343
#> 2 5.4087626  6.519658 -2.3409216 0.9698166
#> 3 1.2587867  8.011417 -0.4744159 0.4329175
#> 4 0.9871787 14.780626  0.4292511 1.0227474
#> 5 2.4021943  6.799788 -0.6692669 2.7446729
#> 6 4.2109032 17.858701 -0.3370763 1.3993853
set.seed(123)
data_large <- samplezoo("large")
head(data_large)
#>       norm   norm_2    norm_3 bern neg pois       exp      unif       beta
#> 1 41.59287 83.70725 23.274065    0   1    6  6.628373 0.5468223 0.08294255
#> 2 46.54734 58.33188 35.588540    1   0    5 21.305366 0.3900809 0.63544684
#> 3 73.38062 69.26961 -2.070295    1   2    4  0.189645 0.7262119 0.11520674
#> 4 51.05763 54.31848  6.643849    0   2    2  8.479098 0.5101462 0.38184206
#> 5 51.93932 62.25090 18.040743    0   0    2 11.885521 0.2964126 0.17196046
#> 6 75.72597 71.31986  6.687576    0   1    4  6.363993 0.1442317 0.35908460
#>       gamma    chi_sq     t_dist    f_dist
#> 1 6.9893762 10.286282 -0.3814568 0.7264343
#> 2 5.4087626  6.519658 -2.3409216 0.9698166
#> 3 1.2587867  8.011417 -0.4744159 0.4329175
#> 4 0.9871787 14.780626  0.4292511 1.0227474
#> 5 2.4021943  6.799788 -0.6692669 2.7446729
#> 6 4.2109032 17.858701 -0.3370763 1.3993853

Variation

set.seed(123)
data_large <- samplezoo("large")
head(data_large)
#>       norm   norm_2    norm_3 bern neg pois       exp      unif       beta
#> 1 41.59287 83.70725 23.274065    0   1    6  6.628373 0.5468223 0.08294255
#> 2 46.54734 58.33188 35.588540    1   0    5 21.305366 0.3900809 0.63544684
#> 3 73.38062 69.26961 -2.070295    1   2    4  0.189645 0.7262119 0.11520674
#> 4 51.05763 54.31848  6.643849    0   2    2  8.479098 0.5101462 0.38184206
#> 5 51.93932 62.25090 18.040743    0   0    2 11.885521 0.2964126 0.17196046
#> 6 75.72597 71.31986  6.687576    0   1    4  6.363993 0.1442317 0.35908460
#>       gamma    chi_sq     t_dist    f_dist
#> 1 6.9893762 10.286282 -0.3814568 0.7264343
#> 2 5.4087626  6.519658 -2.3409216 0.9698166
#> 3 1.2587867  8.011417 -0.4744159 0.4329175
#> 4 0.9871787 14.780626  0.4292511 1.0227474
#> 5 2.4021943  6.799788 -0.6692669 2.7446729
#> 6 4.2109032 17.858701 -0.3370763 1.3993853
set.seed(456)
data_large <- samplezoo("large")
head(data_large)
#>       norm   norm_2     norm_3 bern neg pois        exp      unif       beta
#> 1 29.84718 68.13494  7.9885694    0   0    5  3.4417303 0.8866347 0.05413307
#> 2 59.32663 52.32066 21.2526086    0   3    3  0.8114356 0.7976466 0.07195440
#> 3 62.01312 62.47569 38.4789563    0   2    6 46.8038907 0.6469920 0.22555129
#> 4 29.16661 53.51086 -0.8656269    0   1    5 11.6955326 0.2036753 0.71455809
#> 5 39.28465 47.19406 47.7819258    1   1    1  0.3535625 0.3653401 0.34619912
#> 6 45.13908 63.33566 53.3620528    1   1    2  4.5592136 0.7628573 0.25880522
#>       gamma    chi_sq     t_dist    f_dist
#> 1 6.7914120  4.464348 -1.0150596 2.2557295
#> 2 3.0132520  8.062120  0.3262369 1.4955877
#> 3 4.7360954 10.969593  1.5141157 1.0766901
#> 4 5.1235878  6.249247  0.6432708 1.1251542
#> 5 6.6851637  4.358815  0.2025742 0.4754946
#> 6 0.3903841 20.019575  1.6257109 0.6653886