# use NA instead of zero for missing values in `xtabs`
> d=data.frame(month=month.name[c(1,1,2)],year=c(2022,2023,2023),val=c(5,2,9))
> d
     month year val
1  January 2022   5
2  January 2023   2
3 February 2023   9
> xtabs(val~month+year,d) # this returns zero for missing combinations of month and year
          year
month      2022 2023
  February    0    9
  January     5    2
> tapply(d$val,d[,1:2],c) # this returns NA for missing combinations of month and year
          year
month      2022 2023
  February   NA    9
  January     5    2

# convert 3d array to long form
> a=array(sample(99,8),c(2,2,2),list(2022:2023,month.abb[1:2],c("male","female")))
> cbind(expand.grid(dimnames(a)),x=c(a))
  Var1 Var2   Var3  x
1 2022  Jan   male 52
2 2023  Jan   male 86
3 2022  Feb   male 63
4 2023  Feb   male 24
5 2022  Jan female 94
6 2023  Jan female 93
7 2022  Feb female 18
8 2023  Feb female 92
> as.data.frame.table(a) # this converts the first three columns to factors
  Var1 Var2   Var3  x
1 2022  Jan   male 52
2 2023  Jan   male 86
3 2022  Feb   male 63
4 2023  Feb   male 24
5 2022  Jan female 94
6 2023  Jan female 93
7 2022  Feb female 18
8 2023  Feb female 92

# prevent custom function definitions in `.Rprofile` from being overridden by a variable assignment
> writeLines("j=paste0","~/.functions.R")
> source("~/.functions.R",attach(NULL,name="myfuns")) # add this to `.Rprofile`
> j=9
> j(1,2) # not overridden by variable assignment
[1] "12"
> j
9
> detach("myfuns");j(1,2) # disable custom functions
Error in j(1, 2) : could not find function "j"

# get the fraction of a normal distribution which is 3 standard deviations or more above the mean
> 1-pnorm(3)
[1] 0.001349898
> integrate(\(x)exp(-x^2/2)/sqrt(2*pi),3,Inf)$value
[1] 0.001349899

# get z-score for point of normal distribution that is higher than 97.5% of values
> qnorm(.975)
[1] 1.95996

# divide ages into age groups
> ages=c(53,2,22,105,0,91)
> starts=c(0,18,50,80)
> cut(ages,c(starts,Inf),include.lowest=T,right=F)
[1] [50,80)  [0,18)   [18,50)  [80,Inf] [0,18)   [80,Inf]
Levels: [0,18) [18,50) [50,80) [80,Inf]
> agecut=\(x,starts)cut(pmax(x,0),c(starts,Inf),paste0(starts,c(paste0("-",starts[-1]-1),"+")),T,F)
> agecut(ages,starts)
[1] 50-79 0-17  18-49 80+   0-17  80+
Levels: 0-17 18-49 50-79 80+

# convert a long character vector to dates faster
> v=as.character(Sys.Date()-sample(1e3,1e5,T))
> b=microbenchmark(times=10,as.Date(v),
+ {u=unique(v);as.Date(u)[match(v,u)]},
+ {f=as.factor(v);as.Date(levels(v))[v]},
+ fasttime::fastDate(v), # this only supports the `%Y-%m-%d` format and this doesn't support dates before 1970
+ as.Date(lubridate::parse_date_time(v,orders="ymd")),
+ anytime::anydate(v,calcUnique=T),
+ anytime::anydate(v))
> o=sort(tapply(b$time,gsub("  +"," ",b$expr),median));writeLines(sprintf("%.2f %s",o/min(o),names(o)))
1.00 fasttime::fastDate(v)
3.27 anytime::anydate(v, calcUnique = T)
5.48 { u = unique(v) as.Date(u)[match(v, u)] }
9.42 { f = as.factor(v) as.Date(levels(v))[v] }
9.82 as.Date(lubridate::parse_date_time(v, orders = "ymd"))
101.72 anytime::anydate(v)
242.50 as.Date(v)

# save the intermediate result of pipeline to a variable
> sv=\(x,y=p){do.call("=",list(substitute(y),x),,.GlobalEnv);x}
> seq(5)|>sv()|>mean()|>c(p) # `p` (pipe) is the default variable name used by `sv`
[1] 3 1 2 3 4 5
> seq(5)%>%{.->>p}%>%mean%>%c(p)
[1] 3 1 2 3 4 5

# count dates and return 0 for missing dates in between
> d=as.Date(c("2024-1-2","2024-1-2","2024-1-5"))
> table(factor(d,as.character(Reduce(\(...)seq(...,by=1),range(d)))))
# the anonymous function is used because `Reduce` doesn't support passing additional arguments to a function
2024-01-02 2024-01-03 2024-01-04 2024-01-05
         2          0          0          1
> num=as.numeric(d);ran=Reduce(seq,range(num,na.rm=T));setNames(table(factor(num,ran)),as.Date(ran,"1970-1-1"))
# this was about 7 times faster than the first option in my benchmark for a long vector of dates
2024-01-02 2024-01-03 2024-01-04 2024-01-05
         2          0          0          1
> setNames(tabulate(as.numeric(d-min(d))+1),seq(min(d),max(d),1))
# this was about 25 times faster than the first option in my benchmark
2024-01-02 2024-01-03 2024-01-04 2024-01-05
         2          0          0          1

# make a matrix for shifted versions of a vector
> v=1:4;embed(c(NA,v,NA),3)
     [,1] [,2] [,3]
[1,]    2    1   NA
[2,]    3    2    1
[3,]    4    3    2
[4,]   NA    4    3

# calculate a centered moving average which uses a narrower window at vector ends instead of returning NA values
> v=c(3,5,7,3,-1)
> rowMeans(embed(c(NA,v,NA),3),na.rm=T) # this was the fastest method in my benchmark
[1] 4 5 5 3 1
> o=outer(1:length(v),-1:1,"+");rowMeans(matrix(v[ifelse(o<1,NA,o)],length(v)),na.rm=T)
# this was 2 times slower than the first method
[1] 4 5 5 3 1
> l=length(v);sapply(1:l,\(i)mean(v[max(1,i-1):min(l,i+1)],na.rm=T))
# this was about 100 times slower than the first method
[1] 4 5 5 3 1
> zoo::rollapplyr(v,3,mean,partial=T,align="center") # this was about 170 times slower than the first method
[1] 4 5 5 3 1
> slider::slide_dbl(v,mean,.before=1,.after=1) # this was about 70 times slower than the first method
[1] 4 5 5 3 1

# convert a vector of distances to a `dist` object
> v=1:6;class(v)="dist";attr(v,"Size")=ceiling(sqrt(length(v)*2));v # size is the number of rows in a full distance matrix
  1 2 3
2 1
3 2 4
4 3 5 6
> v=1:6;structure(v,class="dist",Size=Re(polyroot(c(-2*length(v),-1,1))[1]))
# `polyroot` returns `4+0i` which has an imaginary part but `Re` extracts only the real part
# `polyroot solves the equation `-2*6-1*x+1*x^2=0` (so the coefficients are in increasing order and not decreasing order)
# the solution to the equation is `-2*6-1*4+1*4^2=0`
# `2*6` is the size of the lower and upper triangles, 4 is the size of the diagonal, and `4^2` the size of the full matrix
  1 2 3
  1 2 3
2 1
3 2 4
4 3 5 6

# ignore NA in cumulative sum
> v=c(1,2,3,NA,5);cumsum(v)
[1]  1  3  6 NA NA
> cumsum(ifelse(is.na(v),0,v))
[1]  1  3  6  6 11

# get length of each subitem of list
> lengths(list(1:3,4:9))
[1] 3 6

# split a vector into a list for each run of consecutive integers
> v=c(1,2,3,8,9);split(v,cumsum(c(T,diff(v)!=1)))
$`1`
[1] 1 2 3
$`2`
[1] 8 9

# disable alphabetic sorting for `tapply`
> v=1:4
> group=c("b","b","a","a")
> tapply(v,group,sum)
a b
7 3
> tapply(v,factor(group,unique(group)),sum)
b a
3 7

# convert datetime to float
> as.double(difftime(as.POSIXct("2023-01-02 01:23:45"),as.Date("1970-1-1")))
[1] 19358.97

# month name to integer
> match(c("January","July"),month.name)
[1] 1 7

# convert HSV to RGB
> 255*attr(colorspace::hex2RGB(colorspace::hex(HSV(seq(0,300,60),.5,1))),"coords")
       R   G   B
[1,] 255 128 128
[2,] 255 255 128
[3,] 128 255 128
[4,] 128 255 255
[5,] 128 128 255
[6,] 255 128 255

# sum together each segment of 7 items in a vector
> v=1:21;colSums(matrix(v,7))
[1]  28  77 126

# turn a cumulative series to non-cumulative
> v=c(0,1,1,5,6)
> diff(c(0,v))
[1] 0 1 0 4 1

# print a dataframe without row names or column names
> d=data.frame(col1=1:2,col2=c("a","b"))
> d
  col1 col2
1    1    a
2    2    b
> print.data.frame(d,row.names=F) # there is no option to disable printing column names
 col1 col2
    1    a
    2    b
> apply(d,2,\(x)format(x,width=max(nchar(x))))|>apply(1,paste,collapse=" ")|>writeLines()
1 a
2 b

# shift each item of vector left or right by 2 positions
> v=1:5
> lshift\(x,y=1)c(tail(x,-y),rep(NA,y))
> rshift=\(x,y=1)c(rep(NA,y),head(x,-y))
> lshift(v,2)
[1]  3  4  5 NA NA
> rshift(v,2)
[1] NA NA  1  2  3
> dplyr::lag(v,2) # this only supports shifting items right
[1] NA NA  1  2  3
> Hmisc::Lag(v,-2) # `Hmisc::Lag` supports shifting items left when given a negative offset
[1]  3  4  5 NA NA

# prepend item to list
> l=list(10,20);append(5,l)
[[1]]
[1] 5
[[2]]
[1] 10
[[3]]
[1] 20

# predict values from linear or polynomial regression
> df=data.frame(year=2016:2019,deaths=c(53896,53627,54386,53997))
> predict(lm(deaths~year,df),year=c(2023,2024))
      1       2
54560.6 54666.8
> predict(lm(deaths~poly(year,2),df),year=c(2023,2024)) # second-degree polynomial
      1       2
53690.6 53436.8

# sum together list of named vectors joining by name
> l=list(setNames(c(1,2),c("a","b")),setNames(c(3,4),c("b","c")))
> tapply(unlist(l),names(unlist(l)),sum)
a b c
1 5 4

# add thousands separator
> formatC(1234567,digits=0,format="f",big.mark=",")
[1] "1,234,567"

# include zero counts in `table` output
> v=c(1,1,3,4,4,4)
> table(v) # doesn't include 0 for 2
v
1 3 4
> table(factor(v,min(v):max(v)))
1 2 3 4
2 0 1 3
> tabulate(v) # vector of counts starting from 1
[1] 2 0 1 3

# difference between maximum and minimum value in vector
> diff(range(c(2,-3,5,9)))
12

# get RGB codes for shades of gray
> sapply(seq(0,255,,6)/255,\(i)rgb(i,i,i))
[1] "#FFFFFF" "#CCCCCC" "#999999" "#666666" "#333333" "#000000"

# get frequency table for combinations of dataframe columns
> d=data.frame(a=c(10,10,20,20,20,10,20,10),b=c(4,5,4,4,4,6,6,6))
> table(d)
    a
b    4 5 6
  10 1 1 2
  20 3 0 1
> aggregate(rep(1,nrow(d)),d,sum)
   a b x
1 10 4 1
2 20 4 3
3 10 5 1
4 10 6 2
5 20 6 1
> data.table(d)[,.N,by=.(a,b)]
    a b N
1: 10 4 1
2: 10 5 1
3: 20 4 3
4: 10 6 2
5: 20 6 1
> d=data.frame(a=sample(1:1e5,1e4,T),b=sample(1:1e3,1e4,T))
> b=microbenchmark(times=10,table(d),
+ aggregate(rep(1,nrow(d)),d,sum),
+ data.table(d)[,.N,by=.(a,b)])
> o=sort(tapply(b$time,b$expr,median));writeLines(sprintf("%.2f %s",o/min(o),names(o)))
1.00 data.table(d)[, .N, by = .(a, b)]
20.87 table(d)
49.53 aggregate(rep(1, nrow(d)), d, sum)

# `seq` from minimum item of vector to maximum
> Reduce(seq,range(c(-2,3,4,1)))
[1] -2 -1  0  1  2  3  4

# pass additional arguments to `Reduce`
> Reduce(\(...)seq(...,by=2),range(c(-2,3,4,1)))
[1] -2  0  2  4

# get a boolean vector that shows which positions of a square matrix are on the diagonal
> m=matrix(1:9,3)
> c(row(m)==col(m))
[1]  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE
> !(lower.tri(m)|upper.tri(m))
[1]  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE
> c(diag(nrow(m))==1) # `diag(3)` returns a 3-by-3 identity matrix
[1]  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE
> 1:length(m)==seq(1,length(m),nrow(m)+1)
[1]  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE

# get the number of days in a month
> lubridate::days_in_month("2020-2-25")
Feb
 29
> isleap=\(x)x%%4==0&!(x%%100==0&x%%400!=0)
> d=as.POSIXlt("2020-2-25");c(31,ifelse(isleap(d$year+1900),29,28),31,30,31,30,31,31,30,31,30,31)[d$mon+1]
[1] 29

# calculate rowsums for a 3D array
> a=array(1:8,c(2,2,2))
> colSums(aperm(a,c(2,1,3))) # `aperm` transposes the first and second dimensions
     [,1] [,2]
[1,]    4   12
[2,]    6   14

# wide to long conversion in base R
> m=matrix(50:55,3,dimnames=list(letters[1:3],month.name[1:2]))
> m
  January February
a      50       53
b      51       54
c      52       55
> cbind(expand.grid(dimnames(m)),x=c(m))
  Var1     Var2  x
1    a  January 50
2    b  January 51
3    c  January 52
4    a February 53
5    b February 54
6    c February 55
> data.frame(x=rownames(m),y=c(m),z=colnames(m)[col(m)])
  x  y        z
1 a 50  January
2 b 51  January
3 c 52  January
4 a 53 February
5 b 54 February
6 c 55 February
> as.data.frame.table(m)
# the first two columns get converted to factors
# you can also use `as.data.frame(as.table(m))`
  Var1     Var2 Freq
1    a  January   50
2    b  January   51
3    c  January   52
4    a February   53
5    b February   54
6    c February   55
> stack(m) # the first two columns get converted to factors and the second column gets converted to an `Rle` factor
DataFrame with 6 rows and 3 columns
       row      col     value
  <factor>    <Rle> <integer>
1        a  January        50
2        b  January        51
3        c  January        52
4        a February        53
5        b February        54
6        c February        55
> d=as.data.frame(m);cbind(row=rownames(d),stack(d))
# when the input is a dataframe and not a matrix, the rownames don't get included as a column in `stack` output
# the colnames don't get converted to an `Rle` factor when the input for `stack` is a dataframe
  row values ind
1   1      1  V1
2   2      2  V1
3   3      3  V1
4   1      4  V2
5   2      5  V2
6   3      6  V2
7   1      7  V3
8   2      8  V3
9   3      9  V3

# don't omit missing factor levels from `aggregate` output
> aggregate(1:3,list(factor(c(100,102,100),100:102)),sum)
  Group.1 x
1     100 4
2     102 2
> aggregate(1:3,list(factor(c(100,102,100),100:102)),sum,drop=F)
  Group.1  x
1     100  4
2     101 NA
3     102  2

# convert ISO 8601 week number to date
> year=2021;week=1;weekday=7
> as.Date(paste(year,week,weekday),"%Y %V %u") # doesn't work
# this returns the month and day of the current date because `as.Date` doesn't support `%V` (ISO week number)
[1] "2021-12-24"
> ISOweek::ISOweek2date(sprintf("%d-W%02d-%d",year,week,weekday))
# single-digit week numbers are required to have a leading zero
[1] "2021-01-10"
> d=as.Date(paste0(year,"-1-4"));d-(as.integer(format(d,"%w"))+6)%%7-1+7*(week-1)+weekday
# this is a simplified version of `ISOweek2date`
[1] "2021-01-10"

# remove other attributes of a vector but keep names
> v=na.omit(setNames(c(1,NA,2),month.abb[1:3]));v # adds unnecessary attributes
Jan Mar
  1   2
attr(,"na.action")
Feb
  2
attr(,"class")
[1] "omit"
> c(v) # attributes removed but names preserved
Jan Mar
  1   2

# get year and month for a date
> d=as.Date("2023-12-31")
> substr(d,1,7) # this is slower
[1] "2023-12"
> format(d,"%Y-%m") # this was about 3 times faster than `substr` for a long vector of dates
[1] "2023-12"

# remove dataframe rows with one or more NA column
> d=data.frame(x=c(1,NA,3),y=letters[1:3])
> na.omit(d)
  x y
1 1 a
3 3 c

# convert character matrix to numeric
> m=matrix(c("1","2","3","4"),2)
> class(m)="numeric"
> m
     [,1] [,2]
[1,]    1    3
[2,]    2    4

# get the point that is halfway between two points on a log scale
> exp((log(100)+log(1000))/2)
[1] 316.2278

# `seq` on a log scale
> exp(seq(log(100),log(1e5),,5))
100.0000 316.2278 1000.0000 3162.2777 10000.0000

# geometric mean
> v=1:5
> prod(v)^(1/length(v))
[1] 2.605171
> exp(mean(log(v)))
[1] 2.605171

# list definitions of base functions
> Filter(is.function,mget(ls("package:base"),inherits=T))[6:8]
$`:::`
function (pkg, name)  .Primitive(":::")
$`!`
function (x)  .Primitive("!")
$`!.hexmode`
function (a)
as.hexmode(bitwNot(as.hexmode(a)))
<bytecode: 0x7ff0bf2b0c78>
<environment: namespace:base>

# find matrix rows where all columns have the same value
> m=matrix(c(8,8,8,9,7,7,5,5,5),3,byrow=T)
> m
     [,1] [,2] [,3]
[1,]    8    8    8
[2,]    9    7    7
[3,]    5    5    5
> which(rowSums(m!=m[,1])==0)
[1] 1 3

# mode value
> v=c(5,3,8,5,8,8)
> collapse::fmode(v) # this was the fastest in a benchmark with a longer vector
[1] 8
> data.table(v)[,.N,v][which.max(N),v] # this was about 4 times slower
[1] 8
> as.numeric(names(which.max(table(v)))) # this was about 12 times slower
[1] 8

# mean-center matrix columns
> m=matrix(c(1,5,-16,-14),2)
> scale(m,scale=F)
     [,1] [,2]
[1,]   -2   -1
[2,]    2    1
attr(,"scaled:center")
[1]   3 -15
> apply(m,2,\(x)x-mean(x))
     [,1] [,2]
[1,]   -2   -1
[2,]    2    1
> t(t(m)-colMeans(m))
     [,1] [,2]
[1,]   -2   -1
[2,]    2    1

# round numeric columns to integers
> iris|>head(2)|>dplyr::mutate_if(is.double,round)
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1            5           4            1           0  setosa
2            5           3            1           0  setosa
> x=head(iris,2);for(i in which(sapply(x,is.numeric)))x[i]=round(x[i]);x
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1            5           4            1           0  setosa
2            5           3            1           0  setosa

# remove rows with the same value in two or more different columns
> m=matrix(c(1,1,2,3,4,5,6,7,8,9,9,9),,3,byrow=T)
> m
     [,1] [,2] [,3]
[1,]    1    1    2
[2,]    3    4    5
[3,]    6    7    8
[4,]    9    9    9
> m[colSums(apply(m,1,duplicated))==0,]
     [,1] [,2] [,3]
[1,]    3    4    5
[2,]    6    7    8

# convert the rows of a matrix to a list of vectors
> m=matrix(1:4,2)
> asplit(m,1) # 1 uses the first dimension as the margin
[[1]]
[1] 1 3
[[2]]
[1] 2 4

# order by column name or column number
> ob=\(x,y){a=match.call()$y;if(is.name(a))a=as.character(a);x[order(x[,a]),]}
> mtcars|>ob(wt)|>head(2) # this also works with matrices unlike `dplyr::arrange`
              mpg cyl disp  hp drat    wt  qsec vs am gear carb
Lotus Europa 30.4   4 95.1 113 3.77 1.513 16.90  1  1    5    2
Honda Civic  30.4   4 75.7  52 4.93 1.615 18.52  1  1    4    2

# vectorized `seq`
> bit::vecseq(c(-3,8),c(-2,10))
[1] -3 -2  8  9 10
> pseq=Vectorize(seq.default,vectorize.args=c("from","to"))
> pseq(c(-3,8),c(-2,10)) # this is slow
[1] -3 -2  8  9 10
> v1=1:1e5;v2=v1+9
> system.time(bit::vecseq(v1,v2))
   user  system elapsed
  0.001   0.001   0.002
> system.time(pseq(v1,v2))
   user  system elapsed
  0.381   0.042   0.423
> system.time(unlist(mapply(seq,v1,v2)))
   user  system elapsed
  0.506   0.002   0.509

# modify a variable outside a function in place
> plusinplace=\(x,y)eval.parent(substitute(x<-x+y))
> x=1;plusinplace(x,2);x
[1] 3

# swap variables
> swap=\(x,y){x2=x;env=parent.frame();do.call("=",list(substitute(x),y),,env);do.call("=",list(substitute(y),x2),,env)}
> x=1;y=2;swap(x,y);c(x,y)
[1] 2 1

# reorder a hierarchical clustering tree so that similar branches are grouped together
> hc=as.hclust(reorder(as.dendrogram(hclust(eurodist)),cmdscale(eurodist,1)))
# `cmdscale` does classical multidimensional scaling which is similar to PCA but it takes a distance matrix as the input
# the first dimension of MDS is used as weight so that the furthest branches are arranged to opposite ends of the tree
> hc$label[hc$order]
 [1] "Lisbon"          "Madrid"          "Gibraltar"       "Barcelona"       "Marseilles"      "Lyons"
 [7] "Geneva"          "Milan"           "Cherbourg"       "Paris"           "Calais"          "Brussels"
[13] "Hook of Holland" "Cologne"         "Munich"          "Vienna"          "Stockholm"       "Hamburg"
[19] "Copenhagen"      "Rome"            "Athens"

# get angle between two vectors
> vlen=\(x)sum(x^2)^.5
> vang=\(x,y)acos(sum(x*y)/(vlen(x)*vlen(y)))
> vang(c(2,1),c(1,-2))*180/pi
[1] 90

# number of 2-combinations of 5 items
> n=5;k=2;choose(n,k)
[1] 10
> factorial(n)/(factorial(k)*factorial(n-k))
[1] 10

# get indexes of 2-combinations of 4 items
> combn(4,2)
     [,1] [,2] [,3] [,4] [,5] [,6]
[1,]    1    1    1    2    2    3
[2,]    2    3    4    3    4    4
> n=4;rbind(rep(1:(n-1),(n-1):1),unlist(lapply(2:n,\(x)x:n))) # this is faster than `combn` for high values of n
     [,1] [,2] [,3] [,4] [,5] [,6]
[1,]    1    1    1    2    2    3
[2,]    2    3    4    3    4    4
> m=matrix(,4,4);rbind(col(m)[lower.tri(m)],row(m)[lower.tri(m)])
     [,1] [,2] [,3] [,4] [,5] [,6]
[1,]    1    1    1    2    2    3
[2,]    2    3    4    3    4    4
> m=matrix(,4,4);t(which(lower.tri(m),arr.ind=T))
# `arr.ind=T` returns matrix row and column indexes instead of vector indexes
    [,1] [,2] [,3] [,4] [,5] [,6]
row    2    3    4    3    4    4
col    1    1    1    2    2    3

# get indexes of the fifth 2-combination of 4 items
> combn(4,2)[,5]
[1] 2 4
> n=4;k=5;x=(2*n-1-sqrt(4*n^2-4*n+1-8*(k-1)))%/%2;y=k-x*n+(x^2+x)%/%2+x;c(x+1,y+1)
[1] 2 4

# get distance between Tokyo and London in km
> geosphere::distm(c(137.53,35.49),c(0.38,51.53),fun=geosphere::distHaversine)/1000 # longitude is before latitude
         [,1]
[1,] 9487.566

# do linear interpolation of NA values
> v=c(1,NA,NA,7,NA,5)
> approx(v,n=length(v))$y
[1] 1 3 5 7 6 5
> zoo::na.approx(v)
[1] 1 3 5 7 6 5
> w=c(NA,1,NA,NA,7,NA,5,NA)
> approx(w,n=length(w))$y # this removes NA values from the start and end
[1] 1.000000 2.428571 3.857143 5.285714 6.714286 6.428571 5.714286 5.000000
> zoo::na.approx(v,na.rm=F) # this keeps the NA value at the start but removes the NA value from the end
[1] NA  1  3  5  7  6  5
> zoo::na.approx(v,rule=2) # this replaces NA values at the start with the first non-NA value
[1] 1 1 3 5 7 6 5

# do spline extrapolation and interpolation of NA values in the ends and middle of a vector
> v=c(NA,3,8,NA,30)
> spline(v,n=length(v))$y # this skips the NA value at the start so the output starts from 3
[1]  3.000  6.375 12.000 19.875 30.000
> zoo::na.spline(v) # this also extrapolates the spline to fill in the NA at the start
[1]  2  3  8 17 30

# remove null items from a list
> l=list("a",NULL,c(1,2))
> Filter(Negate(is.null),l)
[[1]]
[1] "a"
[[2]]
[1] 1 2
> l[lengths(l)>0]
[[1]]
[1] "a"
[[2]]
[1] 1 2
> l|>purrr::discard(is.null)
[[1]]
[1] "a"
[[2]]
[1] 1 2

# get Euclidean distances between vectors on corresponding rows of two equal-sized matrices
> m1=matrix(1:12,,3);m2=matrix(8:-3,,3)
> sqrt(rowSums((m2-m1)^2))
[1] 11.44552 12.44990 14.24781 16.58312

# calculate a cross Euclidean distance of each row vector in matrix 1 to each row vector in matrix 2
> m1=matrix(10:15,,3);m2=matrix(1:9,,3)
> sqrt(pmax(0,outer(rowSums(m1^2),rowSums(m2^2),"+")-2*m1%*%t(m2))) # `m1%*%t(m2)` is the same as `tcrossprod(m1,m2)`
# `pmax(0)` is used so the argument for `sqrt` won't be negative when two identical rows are compared
# the result of the substraction is sometimes slightly below zero for identical rows
         [,1]     [,2]     [,3]
[1,] 13.92839 12.20656 10.48809
[2,] 15.65248 13.92839 12.20656
> Rfast::dista(m1,m2) # this was about as fast as the first option
         [,1]     [,2]     [,3]
[1,] 13.92839 12.20656 10.48809
[2,] 15.65248 13.92839 12.20656
> outer(asplit(m1,1),asplit(m2,1),Vectorize(\(x,y)sqrt(sum((x-y)^2))))
# this was about 100 times slower than the first option in a benchmark where I used bigger matrices
# `asplit(m,1)` splits a matrix into a list of row vectors, like `split(m,row(m))`
         [,1]     [,2]     [,3]
[1,] 13.92839 12.20656 10.48809
[2,] 15.65248 13.92839 12.20656

# find the closest neighbors of a human population in Global25 PCA
> t=read.csv("https://drive.google.com/uc?export=download&id=1wZr-UOve0KUKo_Qbgeo27m-CQncZWb8y",row.names=1)
> head(sort(as.matrix(dist(t))[,"Udmurt"]))
     Udmurt   Besermyan     Chuvash       Saami Tatar_Kazan        Komi
 0.00000000  0.02190332  0.04896754  0.05229796  0.06070359  0.06728795

# mean of corresponding cells on a list of matrices of equal size
> l=list(matrix(1:20,4),matrix(21:40,4),matrix(41:60,4))
> Reduce("+",l)/length(l)
     [,1] [,2] [,3] [,4] [,5]
[1,]   21   25   29   33   37
[2,]   22   26   30   34   38
[3,]   23   27   31   35   39
[4,]   24   28   32   36   40

# get row and column index of maximum value in a matrix
> m=matrix(1:6,2)
> which(m==max(m),arr.ind=T)
     row col
[1,]   2   3
> arrayInd(which.max(m),dim(m))
     [,1] [,2]
[1,]    2    3

# make a matrix with 3 rows where each row contains vector `v`
> v=1:4;rep(1,3)%o%v # `%o%` is outer product
     [,1] [,2] [,3] [,4]
[1,]    1    2    3    4
[2,]    1    2    3    4
[3,]    1    2    3    4

# get indexes of up to 2 first occurrences of each unique value in a vector
> v=c(4,4,4,2,3,3,3,3,4,4,3)
> unname(unlist(tapply(1:length(v),v,head,2)))
[1] 4 5 6 1 2
> unname(unlist(lapply(split(1:length(v),v),head,2)))
[1] 4 5 6 1 2
> which(data.table::rowid(v)<3) # this was the fastest option in my benchmark
[1] 1 2 4 5 6

# `sapply` with indexes
> sapply(month.name[1:3],\(x)paste(x,parent.frame()$i))
     January     February        March
 "January 1" "February 2"    "March 3"
> v=month.name[1:3];mapply(\(x,i)paste(x,i),v,1:length(v))
     January     February        March
 "January 1" "February 2"    "March 3"

# find the 5th biggest value in a vector
> set.seed(0);v=rnorm(1e6)
> kit::topn(v,5,index=F)[5] # this is very fast (`index=F` returns values instead of indexes)
[1] 4.342213
> sort(tail(sort(v,partial=length(v)-5+1),5))[1]
# `sort(partial=5)` puts the 5 smallest items to start of the vector but not necessarily in the correct order
# sorting in descending order is not supported with `partial`
[1] 4.342213

# find the indexes of the 3 smallest items in a vector
> set.seed(0);v=rnorm(1e6)
> kit::topn(v,3,decreasing=F,hasna=F) # this was the fastest option in my benchmark
108893 251672 253645
> order(v)[1:3] # this was fairly fast compared to the other options up to around length 1e5
108893 251672 253645
> w=which(v<=Rfast::nth(v,3+1))[1:3];w[order(v[w])] # `[1:3]` can be removed if there are no duplicates
108893 251672 253645
> v2=v;o=c();sapply(1:3,\(i){ind=which.min(v2);v2[ind]<<-NA;ind})
# this is fairly fast for small values of `k` like 3
108893 251672 253645
> w=which(v<=max(sort(v,partial=3)));w[order(v[w])][1:3]
# `sort(v,partial=3)` puts the three smallest items to the start of the vector but not necessarily in the correct order
108893 251672 253645

# round numbers to 3 significant digits
> x=123456
> signif(x,3)
[1] 123000
> round(x,3-ceiling(log10(abs(x)))) # this is identical to `signif` according to the help page of `signif`
[1] 123000

# add one month to a date
> seq(as.Date("2023-12-25"),,"month",2)[-1]
[1] "2024-01-25"
> as.Date("2023-12-25")%m+%months(1) # lubridate
[1] "2024-01-25"
> d=as.POSIXlt("2023-12-25");d$mon=d$mon+1;as.Date(d)
[1] "2024-01-25"

# get US state name from abbreviation
> state.name[match("CT",state.abb)] # these variables are from the base `datasets` package
[1] "Connecticut"

# calculate z-scores
> scale(mtcars[1:3,1:3])|>as.data.frame()
# `scale` calculates z-scores for each column by subtracting the mean and dividing by the standard deviation
# `as.data.frame` removes attributes added by `scale` without removing dimname attributes
                     mpg        cyl       disp
Mazda RX4     -0.5773503  0.5773503  0.5773503
Mazda RX4 Wag -0.5773503  0.5773503  0.5773503
Datsun 710     1.1547005 -1.1547005 -1.1547005
> d=mtcars[1:3,1:3];t((t(d)-colMeans(d))/apply(d,2,sd))
                     mpg        cyl       disp
Mazda RX4     -0.5773503  0.5773503  0.5773503
Mazda RX4 Wag -0.5773503  0.5773503  0.5773503
Datsun 710     1.1547005 -1.1547005 -1.1547005

# divide items into clusters of similar items by doing hierarchical clustering of z-scores
> hclust(dist(scale(head(mtcars,8))))|>cutree(3) # `cutree(3)` cuts the tree at the height where it has 3 subtrees
        Mazda RX4     Mazda RX4 Wag        Datsun 710    Hornet 4 Drive
                1                 1                 1                 2
Hornet Sportabout           Valiant        Duster 360         Merc 240D
                3                 2                 3                 1

# convert quarterly data to monthly data
> quarterly=c(67190,66350,65510,65020)
> round(zoo::na.spline(c(rbind(NA,quarterly,NA))))
# this always uses the quarterly total as the value for the middle month of the quarter
[1] 67410 67190 66932 66647 66350 66053 65768 65510 65290 65123 65020 64995
> library(tempdisagg);as.numeric(predict(td(ts(quarterly,frequency=4)~1,"mean",to="monthly"))) # preserve quarterly means
 [1] 67268.94 67248.53 67052.53 66677.09 66339.56 66033.35 65752.47 65501.62 65275.91 65070.94
[11] 64981.93 65007.13

# convert quarterly data to daily
> d=data.frame(as.Date(c("2023-1-1","2023-4-1")),c(12345,23456)) # the dates have to be the starting days of quarters
> library(tempdisagg);predict(td(d~1,"mean","daily","fast"))|>head(2) # this preserves quarterly means
        time    value
1 2023-01-01 9584.940
2 2023-01-02 9586.904

# split matrix into groups of columns
> m=matrix(1:10,2);group=c(1,1,2,2,2)
> t(m)|>split.data.frame(group)|>lapply(t) # `split.data.frame` splits a dataframe or matrix by rows
$`1`
     [,1] [,2]
[1,]    1    3
[2,]    2    4
$`2`
     [,1] [,2] [,3]
[1,]    5    7    9
[2,]    6    8   10
> split(1:ncol(m),group)|>lapply(\(x)m[,x])
$`1`
     [,1] [,2]
[1,]    1    3
[2,]    2    4
$`2`
     [,1] [,2] [,3]
[1,]    5    7    9
[2,]    6    8   10

# calculate rowsums by column group vector
> m=matrix(1:10,2);group=c(1,1,2,2,2)
> tapply(m,list(row(m),group[col(m)]),sum)
  1  2
1 4 21
2 6 24
> t(rowsum(t(m),group)) # there is no `colsum` function
     1  2
[1,] 4 21
[2,] 6 24

# calculate excess deaths using a baseline from a linear regression
> d=data.frame(year=2015:2022,deaths=c(31712,31336,33581,33140,34170,32739,34913,38554))
> d$trend=predict(lm(deaths~year,subset(d,year<2020)),d)
> d$excess=d$deaths-d$trend
> round(d)
  year deaths trend excess
1 2015  31712 31444    268
2 2016  31336 32116   -780
3 2017  33581 32788    793
4 2018  33140 33460   -320
5 2019  34170 34132     38
6 2020  32739 34804  -2065
7 2021  34913 35476   -563
8 2022  38554 36148   2406

# calculate excess age-standardized mortality rate using the 2013 European Standard Population
> nzpop=tail(read.csv("https://mongol-fi.github.io/f/nz_infoshare_population.csv",check.names=F),8)
> nzdeath=tail(read.csv("https://mongol-fi.github.io/f/nz_infoshare_deaths.csv",check.names=F),8)
> esp=c(10,40,55,55,55,60,60,65,70,70,70,70,65,60,55,50,40,25,15,8,2)*100
> espage=c(0,1,seq(5,95,5))
> espcut=\(x)cut(x,c(espage,Inf),include.lowest=T,right=F)
> pop=t(rowsum(t(nzpop[-1]),espcut(as.numeric(colnames(nzpop)[-1]))))
> death=t(rowsum(t(nzdeath[-1]),espcut(as.numeric(colnames(nzdeath)[-1]))))
> d=data.frame(year=nzpop[1],pop=rowSums(nzpop[-1]),death=rowSums(nzdeath[-1]))
> d$asmr=rowSums(t(t(death)*esp)/pop)
> d$trend=lm(asmr~year,subset(d,year<2020))|>predict(d)
> d$excesspct=(d$asmr/d$trend-1)*100
> print.data.frame(round(d),row.names=F)
 year     pop death asmr trend excesspct
 2015 4612910 31608  936   924         1
 2016 4716520 31176  893   918        -3
 2017 4815070 33333  928   912         2
 2018 4902090 33234  901   906         0
 2019 4985560 34257  901   900         0
 2020 5085970 32616  828   894        -7
 2021 5111110 34911  859   888        -3
 2022 5125220 38547  929   882         5

# convert date to floating point year
> d=as.Date("2020-12-31")
> y=as.numeric(format(d,"%Y"));y+(as.numeric(format(d,"%j"))-1)/"if"(y%%4==0&!(y%%100==0&y%%400!=0),366,365)
[1] 2020.997
> lubridate::decimal_date(d)
[1] 2020.997

# select the first row for each unique value of a column
> mtcars[!duplicated(mtcars$cyl),]
                   mpg cyl disp  hp drat   wt  qsec vs am gear carb
Mazda RX4         21.0   6  160 110 3.90 2.62 16.46  0  1    4    4
Datsun 710        22.8   4  108  93 3.85 2.32 18.61  1  1    4    1
Hornet Sportabout 18.7   8  360 175 3.15 3.44 17.02  0  0    3    2
> dplyr::slice_head(mtcars,by=cyl) # this was about 60 times slower than the previous option in my benchmark
                   mpg cyl disp  hp drat   wt  qsec vs am gear carb
Mazda RX4         21.0   6  160 110 3.90 2.62 16.46  0  1    4    4
Datsun 710        22.8   4  108  93 3.85 2.32 18.61  1  1    4    1
Hornet Sportabout 18.7   8  360 175 3.15 3.44 17.02  0  0    3    2
> ub=\(x,y){a=match.call()$y;if(is.name(a))a=as.character(a);x[!duplicated(x[,a]),]}
> ub(mtcars,cyl) # unique by (this also works with matrices unlike dplyr)
                   mpg cyl disp  hp drat   wt  qsec vs am gear carb
Mazda RX4         21.0   6  160 110 3.90 2.62 16.46  0  1    4    4
Datsun 710        22.8   4  108  93 3.85 2.32 18.61  1  1    4    1
Hornet Sportabout 18.7   8  360 175 3.15 3.44 17.02  0  0    3    2
> collapse::ffirst(mtcars,mtcars$cyl) # this is fast but it replaces rownames with group names
   mpg cyl disp  hp drat   wt  qsec vs am gear carb
4 22.8   4  108  93 3.85 2.32 18.61  1  1    4    1
6 21.0   6  160 110 3.90 2.62 16.46  0  1    4    4
8 18.7   8  360 175 3.15 3.44 17.02  0  0    3    2

# make an empty matrix by specifying dimension names
> arr=\(...){l=list(...);array(,lengths(l),l)} # `list(...)` converts the ellipsis argument to a list
> arr(state.name[1:2],month.abb[1:3])
        Jan Feb Mar
Alabama  NA  NA  NA
Alaska   NA  NA  NA

# get year of date
> d=as.Date("2023-12-31")
> as.POSIXlt(d)$year+1900 # this is the definition of `lubridate::year` and `data.table::year`
[1] 2023
> as.numeric(format(d,"%Y")) # this was about 10 times slower in my benchmark with a long vector
[1] 2023

# apply multiple functions
> mul=\(x,...)sapply(c(...),mapply,x)
> trees|>mul(min,max,mean)
       [,1] [,2]     [,3]
Girth   8.3 20.6 13.24839
Height 63.0 87.0 76.00000
Volume 10.2 77.0 30.17097

# run a benchmark and display the median time of a hundred runs relative to the fastest option
> bench=\(times,...){arg=match.call(expand.dots=F)$...;l=length(arg);out=double(times*l);rand=sample(rep(1:l,times))
+ n=1;for(x in arg[rand]){t1=Sys.time();eval.parent(x);out[n]=Sys.time()-t1;n=n+1}
+ setNames(out,sapply(arg[rand],\(i)gsub("  +"," ",paste(deparse(i),collapse=" "))))}
> b=bench(100,1:1e6,seq(1e6),sequence(1e6),seq_len(1e6),seq.int(1e6))
> o=sort(tapply(b,names(b),median));writeLines(sprintf("%.2f %s",o/min(o),names(o)))
1.00 1:1e+06
1.00 seq_len(1e+06)
1.00 seq.int(1e+06)
2.24 seq(1e+06)
345.06 sequence(1e+06)

# select columns
> k=\(x,...)x[,sapply(match.call(expand.dots=F)$...,\(i)"if"(is.name(i),match(as.character(i),colnames(x)),i))]
> mtcars[1:2,]|>k(cyl,5) # this supports mixing column names and column numbers like `dplyr::select`
              cyl drat
Mazda RX4       6  3.9
Mazda RX4 Wag   6  3.9

# merge dataframes by rownames
> merge(mtcars[1:3,1:2],mtcars[2:4,6:7],by=0) # `by=0` merges by rownames
      Row.names  mpg cyl    wt  qsec
1    Datsun 710 22.8   4 2.320 18.61
2 Mazda RX4 Wag 21.0   6 2.875 17.02

# weighted mean by group
> t=data.frame(age=c(80,80,70,70),pop=c(2850,1820,3053,5190),deaths=c(312,287,201,220))
> with(t,tapply(deaths*pop,age,sum)/tapply(pop,age,sum))
      70       80
212.9629 302.2570
> sapply(split(t,t$age),\(i)weighted.mean(i$deaths,i$pop))
      70       80
212.9629 302.2570
> data.table(t)[,weighted.mean(deaths,pop),age]
   age       V1
1:  80 302.2570
2:  70 212.9629

# calculate frequency table using specified factors for each dimension
> fat=\(x,y){o=table(mapply(factor,x,y,SIMPLIFY=F));names(dimnames(o))=NULL;o} # factor table
> dim=list(letters[1:3],month.abb[1:3],state.abb[1:2])
> fat(list(c("c","c","b"),c("Jan","Jan","Mar"),c("AL","AL","AK")),dim)
, , AL
  Jan Feb Mar
a   0   0   0
b   0   0   0
c   2   0   0
, , AK
  Jan Feb Mar
a   0   0   0
b   0   0   1
c   0   0   0

# replace NA values with the previous non-NA value
> v=c(NA,5,NA,NA,3,NA,4)
> data.table::nafill(v,type="locf") # LOCF means last observation carried forward
[1] NA  5  5  5  3  3  4
> zoo::na.locf(v,na.rm=F) # `na.rm=F` disables removing the NA value from the start
[1] NA  5  5  5  3  3  4

# convert a long vector of dates with many repeated dates to character faster
> v=as.Date(sample(18000:19000,1e6,T),"1970-1-1")
> system.time(as.character(v))
   user  system elapsed
  1.639   0.030   1.688
> system.time({u=unique(v);as.character(u)[match(v,u)]})
   user  system elapsed
  0.036   0.002   0.038
> system.time(as.character(seq(min(v),max(v),1))[as.numeric(v-min(v)+1)])
   user  system elapsed
  0.033   0.008   0.041
> system.time(data.table(x=v)[,y:=as.character(x),x]$y)
   user  system elapsed
  0.103   0.003   0.105

# convert date to year and month
> v=Sys.Date()-sample(1e3,1e5,T)
> ua=\(x,fun,...){u=unique(x);fun(u,...)[match(x,u)]} # unique apply (faster for long vectors with many repeated values)
> b=microbenchmark(times=10,
+ with(as.POSIXlt(v),sprintf("%d-%02d",year+1900,mon+1)),
+ format(v,"%Y-%m"),
+ substr(v,1,7),
+ ua(v,format,"%Y-%m"))
> o=sort(tapply(b$time,gsub("  +"," ",b$expr),median));writeLines(sprintf("%.2f %s",o/min(o),names(o)))
1.00 ua(v, format, "%Y-%m")
12.75 with(as.POSIXlt(v), sprintf("%d-%02d", year + 1900, mon + 1))
14.40 format(v, "%Y-%m")
34.61 substr(v, 1, 7)

# count the number of leap days between the epoch and a date in the Julian calendar inclusive of the date
> d=as.numeric(as.Date(c("2000-2-28","2000-2-29","1968-2-29","1968-3-1")))
> (d-789-ifelse(d<0,1,0))%/%1461+1
# 1461 is `365*4+1` and 789 is the first leap day after the epoch
# this incorrectly treats 1900 and 2100 as leap years in the Gregorian calendar
[1]  7  8 -1  0

# get the difference between dates in floored years
> d1=as.Date("2021-01-23");d2=as.Date("2022-01-23")
> p1=as.POSIXlt(d1);p2=as.POSIXlt(d2);p2$year-p1$year-pmin(p2$mon<p1$mon,p2$mon==p1$mon&p2$mday<p1$mday)
# this was the fastest option which treated leap years correctly in my benchmark
# if `d1` can be after `d2` then replace `-pmin` with `+ifelse(p1>p2,-1,1)*pmin`
[1] 1
> as.numeric(format(d2,"%Y"))-as.numeric(format(d1,"%Y"))-as.numeric(format(d2,"%m-%d")<format(d1,"%m-%d"))
# this was about 5 times slower than the first option in my benchmark with long vectors of dates
[1] 1
> as.numeric(substr(d2,1,4))-as.numeric(substr(d1,1,4))-as.numeric(substr(d2,5,10)<substr(d1,5,10))
# this was about 12 times slower than the first option
[1] 1
> lubridate::interval(d1,d2)%/%lubridate::years() # this was about 4 times slower than the first option
[1] 1
> d1%--%d2%/%years() # this is another lubridate option that was also about 4 times slower
[1] 1
> floor(lubridate::time_length(difftime(d2,d1),"years")) # this is fast but it uses 365.25 days as the length of each year
# this produces an incorrect result for most pairs of dates that have the same month and day of month
[1] 0
> floor(as.numeric(lubridate::interval(d1,d2),"years")) # this is fast but similar to the previous option
[1] 0
> floor(as.numeric(d2-d1)/365.25) # this is similar to the previous two options but it was slightly faster
[1] 0
> age=\(x,y){class(x)=class(y)=NULL;(d2-d1-(d2-789)%/%1461+(d1-789)%/%1461)%/%365}
> age(d1,d2)
# this is a very fast way to get the floored difference in years between dates that are after 1900 and before 2100
# this incorrectly treats 1900 and 2100 as leap years in the Gregorian calendar
# 1461 is `365*4+1` 789 is the first leap day since the epoch
[1] 1

# calculate a distance matrix using a mean instead of sum of squared differences to reduce bias caused by NA values
# mean of squared differences is used because rows with many NA values get a low sum of squared differences to other rows
> m=matrix(1:9,3);m[2,3]=NA
> i=combn(nrow(m),2);d=rowMeans((m[i[1,],]-m[i[2,]])^2,na.rm=T);structure(d,class="dist","Size"=nrow(m))
# a `dist` object is required to have a `Size` attribute which is set to the number of rows in the full square matrix
     1    2
2 10.0
3  7.0  2.5

# read CSV faster
> write.csv(matrix(sample(1e6),,10),"test.csv",row.names=F)
> system.time(read.csv("test.csv"))
   user  system elapsed
  0.526   0.014   0.540
> system.time(as.data.frame(data.table::fread("test.csv",showProgress=F)))
   user  system elapsed
  0.016   0.002   0.018
> system.time(as.data.frame(readr::read_csv("test.csv",show_col_types=F)))
   user  system elapsed
  0.161   0.012   0.062

# replace NA values with an incrementing sequence starting from the previous non-NA value
> v=c(NA,NA,5,NA,NA,2,8,NA)
> Reduce(\(i,j)if(is.na(j))i+1 else j,v,accumulate=T)
[1] NA NA  5  6  7  2  8  9
> r=rle(is.na(v));w=which(r$values[-1]);s=cumsum(r$lengths)
> v2=v;v2[bit::vecseq(s[w]+1,s[w+1])]=bit::vecseq(v[s[w]]+1,v[s[w]]+r$lengths[w+1]);v2
# this was about 10 times faster than the first option in a benchmark with a long vector
[1] NA NA  5  6  7  2  8  9
> nn=!is.na(v);w=which(nn);l=length(v);d=diff(c(w,l+1));v2=v;v2[w[1]:l]=sequence(d,v[nn]);v2
# this was about 20 times faster than the first option
[1] NA NA  5  6  7  2  8  9
> Rcpp::cppFunction('NumericVector loifcpp(NumericVector x){int n=x.size();LogicalVector ina=is_na(x);
+ for(int i=1;i<n;i++)if(ina[i])x[i]=x[i-1]+1;return x;}')
> loifcpp(v)
# this was about 200 times faster than the first option
[1] NA NA  5  6  7  2  8  9

# remove NA values from a vector
> v=ifelse(runif(1e5)<.5,NA,sample(1e5))
> b=microbenchmark(na.omit(v),v[complete.cases(v)],v[!is.na(v)],collapse::na_rm(v))
> o=sort(tapply(b$time,gsub("  +"," ",b$expr),median));writeLines(sprintf("%.2f %s",o/min(o),names(o)))
1.00 collapse::na_rm(v)
2.73 v[!is.na(v)]
4.05 v[complete.cases(v)]
5.38 na.omit(v)

# prevent automatic conversion of date to integer in `for` loop
> d=Sys.Date()-0:1
> for(i in d)print(i)
[1] 19734
[1] 19733
> for(i in as.list(d))print(i)
[1] "2024-01-12"
[1] "2024-01-11"
> for(i in d)print(`class<-`(i,"Date")) # convert back to date
[1] "2024-01-12"
[1] "2024-01-11"

# convert long to wide
> d=data.frame(row=paste0("row",sample(1e5)),col=paste0("col",sample(10,1e5,T)),val=sample(100,1e5,T))
> d=d[!duplicated(d[,-3]),]
> b=microbenchmark(times=10,
+ xtabs=xtabs(val~row+col,d),
+ tapply=tapply(d$val,d[,1:2],I),
+ reshape=reshape(d,idvar="row",timevar="col",direction="wide"),
+ dcast=data.table::dcast(data.table::data.table(d),row~col,value.var="val"),
+ matrix={r=unique(d$row);c=unique(d$col);m=matrix(,length(r),length(c),,list(r,c));m[cbind(d$row,d$col)]=d$val;m},
+ match={r=unique(d$row);c=unique(d$col);sapply(c,\(i)with(subset(d,col==i),val[match(r,row)]))|>"rownames<-"(r)},
+ pivot_wider=tidyr::pivot_wider(d,names_from=col,values_from=val),
+ spread=tidyr::spread(d,col,val))
> o=sort(tapply(b$time,b$expr,median));writeLines(sprintf("%.2f %s",o/min(o),names(o)))
1.00 matrix
1.01 pivot_wider
2.86 match
3.42 dcast
16.26 spread
18.50 reshape
23.67 tapply
24.96 xtabs

# get deciles
> quantile(rnorm(1e6),seq(0,1,.1))|>round(3)
    0%    10%    20%    30%    40%    50%    60%    70%    80%    90%   100%
-5.048 -1.284 -0.844 -0.525 -0.255  0.000  0.253  0.524  0.842  1.282  5.252

# generate a random boolean vector based on a vector of probabilities
> prob=c(.1,.1,.1,.5,.8,.9,.9,.9,.9);runif(length(prob))<prob
[1] FALSE FALSE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE

# select first column that exists
> mtcars[1:2,]|>dplyr::select(dplyr::any_of(c("horses","horsepower","hp")))
               hp
Mazda RX4     110
Mazda RX4 Wag 110
> firstof=\(x,...)for(a in as.character(match.call(expand.dots=F)$...))if(!is.null(x[[a]]))return(x[,a,drop=F])
> mtcars[1:2,]|>firstof(horses,horsepower,hp)
               hp
Mazda RX4     110
Mazda RX4 Wag 110

# get sums of 3D array by first and second dimensions
> a=array(1:8,c(2,2,2),list(2023:2024,month.name[1:2],c("male","female")))
> a
, , male
     January February
2023       1        3
2024       2        4

, , female
     January February
2023       5        7
2024       6        8
> rowSums(a,dims=2)
     January February
2023       6       10
2024       8       12

# convert RGB to HSV
> color=c("#00bb00","#888888")
> col2rgb(color)
      [,1] [,2]
red      0  136
green  187  136
blue     0  136
> rgb2hsv(col2rgb(color))
       [,1]      [,2]
h 0.3333333 0.0000000
s 1.0000000 0.0000000
v 0.7333333 0.5333333

# probability of rolling a six for 4 for 5 rolls of a die
> dbinom(4,5,1/6) # binomial distribution (the arguments are successes, draws, and probability of success)
[1] 0.003215021
> p=1/6;n=5;k=4;choose(n,k)*p^k*(1-p)^(n-k)
[1] 0.003215021

# probablility that 6 balls drawn from a box with 10 white balls and 20 black balls contain exactly 4 white balls
> dhyper(4,10,20,6) # hypergeometric distribution
[1] 0.06719717

# probablility that 6 balls drawn from a box with 10 white balls and 20 black balls contain 4 or more white balls
> sum(dhyper(4:6,10,20,6)) # hypergeometric distribution
[1] 0.0760389
> fisher.test(matrix(c(4,6-4,10-4,20-(6-4)),2),alternative="greater")$p # based on hypergeometric distribution
# top left cell is number of white balls drawn and bottom left cell is number of black balls drawn
# top right cell is number of white balls not drawn and bottom right cell is number of black balls not drawn
[1] 0.0760389

# convert integer to date
> v=c(12345,23456)
> `class<-`(v,"Date")
[1] "2003-10-20" "2034-03-22"
> as.Date(v,"1970-1-1") # base R requires specifying the origin
[1] "2003-10-20" "2034-03-22"
> library(zoo);as.Date(v) # `zoo` overrides `as.Date` with a version that doesn't require specifying the origin
[1] "2003-10-20" "2034-03-22"

# specify variable name for `with`
> with(list(x=3),x+1)
[1] 4

# convert dataframe to vector efficiently
> summary(microbenchmark(times=100,unlist(mtcars),unlist(mtcars,use.names=F)))[,c(1,5)]
                           expr     mean
1                unlist(mtcars) 59.31585
2 unlist(mtcars, use.names = F)  2.98307

# add totals grouped by each variable to data.table
> d=data.table(month=month.abb[c(1,1,2,2)],state=state.name[1:2],col1=1:4,col2=10:13)
> d=rbind(d,d[,lapply(.SD,sum),month,.SDcols=3:4][,state:="Total"])
> d=rbind(d,d[,lapply(.SD,sum),state,.SDcols=3:4][,month:="Total"])
> d
   month   state col1 col2
1:   Jan Alabama    1   10
2:   Jan  Alaska    2   11
3:   Feb Alabama    3   12
4:   Feb  Alaska    4   13
5:   Jan   Total    3   21
6:   Feb   Total    7   25
7: Total Alabama    4   22
8: Total  Alaska    6   24
9: Total   Total   10   46

# aggregate numeric columns by non-numeric columns
> sum0=\(...)sum(...,na.rm=T)
> aua=\(x,fun=sum0,...){n=sapply(x,is.numeric);aggregate(x[,n,drop=F],x[,!n,drop=F],fun,...)}
> aua(iris,mean) # auto-aggregate
     Species Sepal.Length Sepal.Width Petal.Length Petal.Width
1     setosa        5.006       3.428        1.462       0.246
2 versicolor        5.936       2.770        4.260       1.326
3  virginica        6.588       2.974        5.552       2.026

# do a linear interpolation of weekly data to daily
> d=data.frame(x=seq(as.Date("2024-1-1"),,7,3),y=c(53,87,63))
> d
           x  y
1 2024-01-01 53
2 2024-01-08 87
3 2024-01-15 63
> as.data.frame(approx(d$x,d$y,seq(min(d$x),max(d$x),1)))
            x        y
1  2024-01-01 53.00000
2  2024-01-02 57.85714
3  2024-01-03 62.71429
4  2024-01-04 67.57143
5  2024-01-05 72.42857
6  2024-01-06 77.28571
7  2024-01-07 82.14286
8  2024-01-08 87.00000
9  2024-01-09 83.57143
10 2024-01-10 80.14286
11 2024-01-11 76.71429
12 2024-01-12 73.28571
13 2024-01-13 69.85714
14 2024-01-14 66.42857
15 2024-01-15 63.00000

# delete a single column by name in data.table
> data.table(iris)[,Sepal.Width:=NULL][1:2]
   Sepal.Length Petal.Length Petal.Width Species
1:          5.1          1.4         0.2  setosa
2:          4.9          1.4         0.2  setosa

# show ratios between units
> grid::convertUnit(unit(1,"pt"),"lines") # 1 pt is about 0.07 lines
[1] 0.06918500069185lines
> grid::convertUnit(unit(1,"lines"),"in") # 1 line is 0.2 inches
[1] 0.2inches

# faster `table`
> v=sample(letters,1e6,T)
> tab=\(x)data.table(x)[,.N,x][,setNames(N,x)]
> summary(microbenchmark(times=10,table(v),tab(v)))[,c(1,5)]
      expr   median
1 table(v) 65.32598
2   tab(v) 27.96704

# faster `tapply`
> v=1:1e6;group=data.frame(g1=sample(letters[1:3],1e6,T);g2=sample(letters[24:26],1e6,T))
> tap=\(x,y,fun=sum,...){o=data.table(x)[,fun(x,...),data.frame(y)];tapply(o[[ncol(o)]],o[,-ncol(o),with=F],I)}
# `o[[ncol(o)]]` extracts the last column as a vector, and `o[,-ncol(o),with=F]` deletes the last column
> summary(microbenchmark(times=10,tapply(v,group,sum),tap(v,group))
                   expr    median
1 tapply(v, group, sum) 112.66548
2         tap(v, group)  30.62405

# `expand.grid` for data.table
> CJ(month.abb[1:2],state.name[1:3],sorted=F) # this was slightly faster than `expand.grid` but not much faster
# CJ stands for cross join
    V1      V2
1: Jan Alabama
2: Jan  Alaska
3: Jan Arizona
4: Feb Alabama
5: Feb  Alaska
6: Feb Arizona

# print a data.table in a pipeline that includes an in-place assignment at the end
> d=data.table(x=1:2)
> d[,x:=x*2]|>print() # no output and data.table is modified in place
> d[,x:=x*2][]|>print() # output is printed and data.table is not modified in place
   x
1: 4
2: 8

# replace vector values at specified positions with values from another vector
> v=rep(1,5)
> replace(v,3:4,8)
[1] 1 1 8 8 1
> replace(v,3:4,8:9)
[1] 1 1 8 9 1

# insert an item to a vector after the specified position
> append(month.abb[1:3],"x",2)
[1] "Jan" "Feb" "x"   "Mar"

# add new dataframe column after specified index
> d=data.frame(a=1,b=10,c=100)
> tibble::add_column(d,x=0,.after=2)
  a  b x   c
1 1 10 0 100
> cbind(d,x=0)[append(1:ncol(d),ncol(d)+1,2)]
  a  b x   c
1 1 10 0 100

# write gzipped CSV
> con=gzfile("output.csv.gz","w",compression=9);write.csv(mtcars,con);close(con)
> write.csv(mtcars,"output.csv")
> file.info(c("output.csv.gz","output.csv"))$size
[1]  880 1783

# fill in zeroes for missing combinations of non-numeric columns
> d=data.frame(month=month.abb[c(1,1,2)],sex=c("M","F","M"),dead=c(3,1,2))
> num=sapply(d,is.numeric);x=d[1,num,drop=F];x[]=0
> o=rbind(d,cbind(expand.grid(lapply(d[,!num,drop=F],unique)),`rownames<-`(x,NULL)))
> o[!duplicated(o[!num]),]
  month sex dead
1   Jan   M    3
2   Jan   F    1
3   Feb   M    2
7   Feb   F    0

# order dataframe based on a vector of column positions
> cols=c(2,8,9)
> mtcars[do.call(order,lapply(cols,\(i)mtcars[,i])),][1:2,]
               mpg cyl  disp hp drat   wt qsec vs am gear carb
Porsche 914-2 26.0   4 120.3 91 4.43 2.14 16.7  0  1    5    2
Merc 240D     24.4   4 146.7 62 3.69 3.19 20.0  1  0    4    2

# get mode value by group
> v=sample(1:10,1e5,T);g=sample(letters,1e5,T)
> modeby=\(v,g)data.table(v,g)[,.N,.(v,g)][order(-N)][rowid(g)==1]
> modeby2=\(v,g)data.table(v,g)[,.N,.(v,g)][,v[which.max(N)],g]
> fmode=\(v,g)data.table(v,g)[,collapse::fmode(v),g]
> fmodetapply=\(v,g)tapply(v,g,collapse::fmode)
> summary(microbenchmark(times=10,modeby(v,g),modeby2(v,g),fmode(v,g),fmodetapply(v,g)))[,c(1,5)]
               expr   median
1      modeby(v, g) 4.297637
2     modeby2(v, g) 4.258582
3       fmode(v, g) 3.094666
4 fmodetapply(v, g) 6.454565

# subtract 2% from start of range and add 2% to end
> extendrange(0:5,,.02)
[1] -0.1  5.1

# convert theme font size 8 to font size for `geom_text`
> grid::convertUnit(unit(8,"pt"),"mm")
[1] 2.81167842811678mm # theme font sizes are specified in points but `geom_text` sizes are in mm

# convert JSON to a dataframe
> jsonlite::fromJSON('{"data":[{"col1":1,"col2":10},{"col1":2,"col2":20}]}')$data
  col1 col2
1    1   10
2    2   20

# read URL into a string
> con=url("http://example.com");paste(readLines(con),collapse="\n")|>substr(1,80);close(con)
[1] "<!doctype html>\n<html>\n<head>\n    <title>Example Domain</title>\n\n    <meta chars"

# show difference between musical notes on an equal-tempered scale and in just intonation
> d=data.frame(row.names=LETTERS[c(3:7,1,8)],equal=2^(c(0,2,4,5,7,9,11)/12),just=1+c(0,1/8,1/4,1/3,1/2,2/3,7/8))
> round(d,3)
# in the regular equal-tempered musical scale, the frequencies of notes are approximations of the pure ratios shown here
# the just-tempered scale used here is the 5-limit diatonic major scale (Ptolemy's intense diatonic scale)
  equal  just
C 1.000 1.000
D 1.122 1.125
E 1.260 1.250
F 1.335 1.333
G 1.498 1.500
A 1.682 1.667
H 1.888 1.875
> round(transform(1200*log2(d),diff=equal-just),1) # show difference in cents
  equal   just diff
C     0    0.0  0.0
D   200  203.9 -3.9
E   400  386.3 13.7
F   500  498.0  2.0
G   700  702.0 -2.0
A   900  884.4 15.6
H  1100 1088.3 11.7

# 95% confidence interval for ratio of two numbers
> prop.test(123,456)$conf # exact binomial test using Wilson score interval
[1] 0.2300140 0.3134099
attr(,"conf.level")
[1] 0.95
> n=456;p=123/n;p+c(-1,1)*qnorm(.975)*sqrt(p*(1-p)/n) # normal approximation to binomial distribution
# this is less accurate for small numbers or when the ratio is close to 0 or 1
[1] 0.2290011 0.3104726