Take a random sample from each group
Here is one approach in base R.
First, the prerequisite sample data to work with:
# Build a small reproducible example: 15 rows in 3 groups (ID = 1, 2, 3) of
# 5 rows each, plus three numeric columns of standard-normal draws.
set.seed(1) # fix the RNG so the values printed below are reproducible
mydf <- data.frame(
  ID = rep(1:3, each = 5),
  # 45 draws arranged as a 15 x 3 matrix; data.frame() names the unnamed
  # matrix columns X1, X2, X3.
  matrix(rnorm(45), ncol = 3)
)
mydf
#    ID         X1          X2          X3
# 1   1 -0.6264538 -0.04493361  1.35867955
# 2   1  0.1836433 -0.01619026 -0.10278773
# 3   1 -0.8356286  0.94383621  0.38767161
# 4   1  1.5952808  0.82122120 -0.05380504
# 5   1  0.3295078  0.59390132 -1.37705956
# 6   2 -0.8204684  0.91897737 -0.41499456
# 7   2  0.4874291  0.78213630 -0.39428995
# 8   2  0.7383247  0.07456498 -0.05931340
# 9   2  0.5757814 -1.98935170  1.10002537
# 10  2 -0.3053884  0.61982575  0.76317575
# 11  3  1.5117812 -0.05612874 -0.16452360
# 12  3  0.3898432 -0.15579551 -0.25336168
# 13  3 -0.6212406 -1.47075238  0.69696338
# 14  3 -2.2146999 -0.47815006  0.55666320
# 15  3  1.1249309  0.41794156 -0.68875569
Second, the sampling:
# Draw a simple random sample (without replacement) of up to `n_per_group`
# rows from every ID group of `mydf`, then stack the per-group samples back
# into a single data frame.
n_per_group <- 3L
do.call(
  rbind,
  lapply(split(mydf, mydf$ID), function(grp) {
    # Cap the request at the group size: sampling without replacement
    # errors if asked for more rows than the group contains.
    n_take <- min(n_per_group, nrow(grp))
    # drop = FALSE keeps the result a data frame even if it has one column.
    grp[sample.int(nrow(grp), n_take), , drop = FALSE]
  })
)
# Example output. The rows drawn depend on the RNG state at the time of the
# call (and presumably on the R version's sampling algorithm), so yours may
# differ. Row names are "<group>.<original row name>".
#      ID         X1          X2         X3
# 1.2   1  0.1836433 -0.01619026 -0.1027877
# 1.1   1 -0.6264538 -0.04493361  1.3586796
# 1.5   1  0.3295078  0.59390132 -1.3770596
# 2.10  2 -0.3053884  0.61982575  0.7631757
# 2.9   2  0.5757814 -1.98935170  1.1000254
# 2.8   2  0.7383247  0.07456498 -0.0593134
# 3.13  3 -0.6212406 -1.47075238  0.6969634
# 3.12  3  0.3898432 -0.15579551 -0.2533617
# 3.15  3  1.1249309  0.41794156 -0.6887557
There is also `strata()` from the `sampling` package, which is convenient when you want to sample a different number of rows from each group:
# Stratified sampling with a different sample size per group, using the
# `sampling` package (install it once if needed).
# install.packages("sampling")
library(sampling)

set.seed(1) # fix the RNG so the example is reproducible
# method = "srswor": simple random sampling without replacement.
# size = c(2, 3, 2): rows to draw from each ID stratum.
# NOTE(review): sizes are presumably matched to strata in the order they
# appear in the data (here ID = 1, 2, 3) -- confirm against ?strata.
x <- strata(mydf, "ID", size = c(2, 3, 2), method = "srswor")
# Pull the selected rows out of `mydf`. As the output shows, the result also
# carries ID_unit, Prob and Stratum columns.
getdata(mydf, x)
#            X1          X2         X3 ID ID_unit Prob Stratum
# 2   0.1836433 -0.01619026 -0.1027877  1       2  0.4       1
# 5   0.3295078  0.59390132 -1.3770596  1       5  0.4       1
# 6  -0.8204684  0.91897737 -0.4149946  2       6  0.6       2
# 8   0.7383247  0.07456498 -0.0593134  2       8  0.6       2
# 9   0.5757814 -1.98935170  1.1000254  2       9  0.6       2
# 14 -2.2146999 -0.47815006  0.5566632  3      14  0.4       3
# 15  1.1249309  0.41794156 -0.6887557  3      15  0.4       3