# use NA instead of zero for missing values in `xtabs`
> d=data.frame(month=month.name[c(1,1,2)],year=c(2022,2023,2023),val=c(5,2,9))
> d
month year val
1 January 2022 5
2 January 2023 2
3 February 2023 9
> xtabs(val~month+year,d)
year
month 2022 2023
February 0 9
January 5 2
> tapply(d$val,d[,1:2],c)
year
month 2022 2023
February NA 9
January 5 2
# convert 3d array to long form
> a=array(sample(99,8),c(2,2,2),list(2022:2023,month.abb[1:2],c("male","female")))
> cbind(expand.grid(dimnames(a)),x=c(a))
Var1 Var2 Var3 x
1 2022 Jan male 52
2 2023 Jan male 86
3 2022 Feb male 63
4 2023 Feb male 24
5 2022 Jan female 94
6 2023 Jan female 93
7 2022 Feb female 18
8 2023 Feb female 92
> as.data.frame.table(a)
Var1 Var2 Var3 x
1 2022 Jan male 52
2 2023 Jan male 86
3 2022 Feb male 63
4 2023 Feb male 24
5 2022 Jan female 94
6 2023 Jan female 93
7 2022 Feb female 18
8 2023 Feb female 92
# prevent custom function definitions in `.Rprofile` from being overridden by a variable assignment
> writeLines("j=paste0","~/.functions.R")
> source("~/.functions.R",attach(NULL,name="myfunctions"))
> j=9
> j(1,2)
[1] "12"
> detach("myfunctions");j(1,2)
Error in j(1, 2) : could not find function "j"
# get the fraction of a normal distribution which is 3 standard deviations or more above the mean
> 1-pnorm(3)
[1] 0.001349898
> integrate(\(x)exp(-x^2/2)/sqrt(2*pi),3,Inf)$value
[1] 0.001349899
# get z-score for point of normal distribution that is higher than 97.5% of values
> qnorm(.975)
[1] 1.95996
# divide ages into age groups
> ages=c(53,2,22,105,0,91)
> starts=c(0,18,50,80)
> cut(ages,c(starts,Inf),include.lowest=T,right=F)
[1] [50,80) [0,18) [18,50) [80,Inf] [0,18) [80,Inf]
Levels: [0,18) [18,50) [50,80) [80,Inf]
> agecut=\(x,starts)cut(pmax(x,0),c(starts,Inf),paste0(starts,c(paste0("-",starts[-1]-1),"+")),T,F)
> agecut(ages,starts)
[1] 50-79 0-17 18-49 80+ 0-17 80+
Levels: 0-17 18-49 50-79 80+
# convert a long character vector to dates faster
> v=as.character(Sys.Date()-sample(1e3,1e5,T))
> b=microbenchmark(times=10,as.Date(v),
+ {u=unique(v);as.Date(u)[match(v,u)]},
+ {f=as.factor(v);as.Date(levels(v))[v]},
+ fasttime::fastDate(v),
+ as.Date(lubridate::parse_date_time(v,orders="ymd")),
+ anytime::anydate(v,calcUnique=T),
+ anytime::anydate(v))
> o=sort(tapply(b$time,gsub(" +"," ",b$expr),median));writeLines(sprintf("%.2f %s",o/min(o),names(o)))
1.00 fasttime::fastDate(v)
3.27 anytime::anydate(v, calcUnique = T)
5.48 { u = unique(v) as.Date(u)[match(v, u)] }
9.42 { f = as.factor(v) as.Date(levels(v))[v] }
9.82 as.Date(lubridate::parse_date_time(v, orders = "ymd"))
101.72 anytime::anydate(v)
242.50 as.Date(v)
# save the intermediate result of pipeline to a variable
> sv=\(x,y=p){do.call("=",list(substitute(y),x),,.GlobalEnv);x}
> seq(5)|>sv()|>mean()|>c(p)
[1] 3 1 2 3 4 5
> seq(5)%>%{.->>p}%>%mean%>%c(p)
[1] 3 1 2 3 4 5
# count dates and return 0 for missing dates in between
> d=as.Date(c("2024-1-2","2024-1-2","2024-1-5"))
> table(factor(d,as.character(Reduce(\(...)seq(...,by=1),range(d)))))
2024-01-02 2024-01-03 2024-01-04 2024-01-05
2 0 0 1
> num=as.numeric(d);ran=Reduce(seq,range(num,na.rm=T));setNames(table(factor(num,ran)),as.Date(ran,"1970-1-1"))
2024-01-02 2024-01-03 2024-01-04 2024-01-05
2 0 0 1
> setNames(tabulate(as.numeric(d-min(d))+1),seq(min(d),max(d),1))
2024-01-02 2024-01-03 2024-01-04 2024-01-05
2 0 0 1
# make a matrix for shifted versions of a vector
> v=1:4;embed(c(NA,v,NA),3)
[,1] [,2] [,3]
[1,] 2 1 NA
[2,] 3 2 1
[3,] 4 3 2
[4,] NA 4 3
# calculate a centered moving average which uses a narrower window at vector ends instead of returning NA values
> v=c(3,5,7,3,-1)
> rowMeans(embed(c(NA,v,NA),3),na.rm=T)
[1] 4 5 5 3 1
> o=outer(1:length(v),-1:1,"+");rowMeans(matrix(v[ifelse(o<1,NA,o)],length(v)),na.rm=T)
[1] 4 5 5 3 1
> l=length(v);sapply(1:l,\(i)mean(v[max(1,i-1):min(l,i+1)],na.rm=T))
[1] 4 5 5 3 1
> zoo::rollapplyr(v,3,mean,partial=T,align="center")
[1] 4 5 5 3 1
> slider::slide_dbl(v,mean,.before=1,.after=1)
[1] 4 5 5 3 1
# convert a vector of distances to a `dist` object
> v=1:6;class(v)="dist";attr(v,"Size")=ceiling(sqrt(length(v)*2));v
1 2 3
2 1
3 2 4
4 3 5 6
> v=1:6;structure(v,class="dist",Size=Re(polyroot(c(-2*length(v),-1,1))[1]))
1 2 3
1 2 3
2 1
3 2 4
4 3 5 6
# ignore NA in cumulative sum
> v=c(1,2,3,NA,5);cumsum(v)
[1] 1 3 6 NA NA
> cumsum(ifelse(is.na(v),0,v))
[1] 1 3 6 6 11
# get length of each subitem of list
> lengths(list(1:3,4:9))
[1] 3 6
# split a vector into a list for each run of consecutive integers
> v=c(1,2,3,8,9);split(v,cumsum(c(T,diff(v)!=1)))
$`1`
[1] 1 2 3
$`2`
[1] 8 9
# disable alphabetic sorting for `tapply`
> v=1:4
> group=c("b","b","a","a")
> tapply(v,group,sum)
a b
7 3
> tapply(v,factor(group,unique(group)),sum)
b a
3 7
# convert datetime to float
> as.double(difftime(as.POSIXct("2023-01-02 01:23:45"),as.Date("1970-1-1")))
[1] 19358.97
# month name to integer
> match(c("January","July"),month.name)
[1] 1 7
# convert HSV to RGB
> 255*attr(colorspace::hex2RGB(colorspace::hex(HSV(seq(0,300,60),.5,1))),"coords")
R G B
[1,] 255 128 128
[2,] 255 255 128
[3,] 128 255 128
[4,] 128 255 255
[5,] 128 128 255
[6,] 255 128 255
# sum together each segment of 7 items in a vector
> v=1:21;colSums(matrix(v,7))
[1] 28 77 126
# turn a cumulative series to non-cumulative
> v=c(0,1,1,5,6)
> diff(c(0,v))
[1] 0 1 0 4 1
# print a dataframe without row names or column names
> d=data.frame(col1=1:2,col2=c("a","b"))
> d
col1 col2
1 1 a
2 2 b
> print.data.frame(d,row.names=F)
col1 col2
1 a
2 b
> apply(d,2,\(x)format(x,width=max(nchar(x))))|>apply(1,paste,collapse=" ")|>writeLines()
1 a
2 b
# shift each item of vector left or right by 2 positions
> v=1:5
> lshift\(x,y=1)c(tail(x,-y),rep(NA,y))
> rshift=\(x,y=1)c(rep(NA,y),head(x,-y))
> lshift(v,2)
[1] 3 4 5 NA NA
> rshift(v,2)
[1] NA NA 1 2 3
> dplyr::lag(v,2)
[1] NA NA 1 2 3
> Hmisc::Lag(v,-2)
[1] 3 4 5 NA NA
# prepend item to list
> l=list(10,20);append(5,l)
[[1]]
[1] 5
[[2]]
[1] 10
[[3]]
[1] 20
# predict values from linear or polynomial regression
> df=data.frame(year=2016:2019,deaths=c(53896,53627,54386,53997))
> predict(lm(deaths~year,df),year=c(2023,2024))
1 2
54560.6 54666.8
> predict(lm(deaths~poly(year,2),df),year=c(2023,2024))
1 2
53690.6 53436.8
# sum together list of named vectors joining by name
> l=list(setNames(c(1,2),c("a","b")),setNames(c(3,4),c("b","c")))
> tapply(unlist(l),names(unlist(l)),sum)
a b c
1 5 4
# add thousands separators
> formatC(1234567,digits=0,format="f",big.mark=",")
[1] "1,234,567"
# include zero counts in `table` output
> v=c(1,1,3,4,4,4)
> table(v)
v
1 3 4
> table(factor(v,min(v):max(v)))
1 2 3 4
2 0 1 3
> tabulate(v)
[1] 2 0 1 3
# difference between maximum and minimum value in vector
> diff(range(c(2,-3,5,9)))
12
# get RGB codes for shades of gray
> sapply(seq(0,255,,6)/255,\(i)rgb(i,i,i))
[1] "#FFFFFF" "#CCCCCC" "#999999" "#666666" "#333333" "#000000"
# get frequency table for combinations of dataframe columns
> d=data.frame(a=c(10,10,20,20,20,10,20,10),b=c(4,5,4,4,4,6,6,6))
> table(d)
a
b 4 5 6
10 1 1 2
20 3 0 1
> aggregate(rep(1,nrow(d)),d,sum)
a b x
1 10 4 1
2 20 4 3
3 10 5 1
4 10 6 2
5 20 6 1
> data.table(d)[,.N,by=.(a,b)]
a b N
1: 10 4 1
2: 10 5 1
3: 20 4 3
4: 10 6 2
5: 20 6 1
> d=data.frame(a=sample(1:1e5,1e4,T),b=sample(1:1e3,1e4,T))
> b=microbenchmark(times=10,table(d),
+ aggregate(rep(1,nrow(d)),d,sum),
+ data.table(d)[,.N,by=.(a,b)])
> o=sort(tapply(b$time,b$expr,median));writeLines(sprintf("%.2f %s",o/min(o),names(o)))
1.00 data.table(d)[, .N, by = .(a, b)]
20.87 table(d)
49.53 aggregate(rep(1, nrow(d)), d, sum)
# `seq` from minimum item of vector to maximum
> Reduce(seq,range(c(-2,3,4,1)))
[1] -2 -1 0 1 2 3 4
# pass additional arguments to `Reduce`
> Reduce(\(...)seq(...,by=2),range(c(-2,3,4,1)))
[1] -2 0 2 4
# get a boolean vector that shows which positions of a square matrix are on the diagonal
> m=matrix(1:9,3)
> c(row(m)==col(m))
[1] TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE
> !(lower.tri(m)|upper.tri(m))
[1] TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE
> c(diag(nrow(m))==1)
[1] TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE
> 1:length(m)==seq(1,length(m),nrow(m)+1)
[1] TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE
# calculate rowsums for a 3D array
> a=array(1:8,c(2,2,2))
> colSums(aperm(a,c(2,1,3)))
[,1] [,2]
[1,] 4 12
[2,] 6 14
# wide to long conversion in base R
> m=matrix(50:55,3,dimnames=list(letters[1:3],month.name[1:2]))
> m
January February
a 50 53
b 51 54
c 52 55
> cbind(expand.grid(dimnames(m)),x=c(m))
Var1 Var2 x
1 a January 50
2 b January 51
3 c January 52
4 a February 53
5 b February 54
6 c February 55
> data.frame(x=rownames(m),y=c(m),z=colnames(m)[col(m)])
x y z
1 a 50 January
2 b 51 January
3 c 52 January
4 a 53 February
5 b 54 February
6 c 55 February
> as.data.frame.table(m)
Var1 Var2 Freq
1 a January 50
2 b January 51
3 c January 52
4 a February 53
5 b February 54
6 c February 55
> stack(m)
DataFrame with 6 rows and 3 columns
row col value
<factor> <Rle> <integer>
1 a January 50
2 b January 51
3 c January 52
4 a February 53
5 b February 54
6 c February 55
> d=as.data.frame(m);cbind(row=rownames(d),stack(d))
row values ind
1 1 1 V1
2 2 2 V1
3 3 3 V1
4 1 4 V2
5 2 5 V2
6 3 6 V2
7 1 7 V3
8 2 8 V3
9 3 9 V3
# don't omit missing factor levels from `aggregate` output
> aggregate(1:3,list(factor(c(100,102,100),100:102)),sum)
Group.1 x
1 100 4
2 102 2
> aggregate(1:3,list(factor(c(100,102,100),100:102)),sum,drop=F)
Group.1 x
1 100 4
2 101 NA
3 102 2
# convert ISO 8601 week number to date
> year=2021;week=1;weekday=7
> as.Date(paste(year,week,weekday),"%Y %V %u")
[1] "2021-12-24"
> ISOweek::ISOweek2date(sprintf("%d-W%02d-%d",year,week,weekday))
[1] "2021-01-10"
> d=as.Date(paste0(year,"-1-4"));d-(as.integer(format(d,"%w"))+6)%%7-1+7*(week-1)+weekday
[1] "2021-01-10"
# remove other attributes of a vector but keep names
> v=na.omit(setNames(c(1,NA,2),month.abb[1:3]));v
Jan Mar
1 2
attr(,"na.action")
Feb
2
attr(,"class")
[1] "omit"
> c(v)
Jan Mar
1 2
# get year and month for a date
> d=as.Date("2023-12-31")
> substr(d,1,7)
[1] "2023-12"
> format(d,"%Y-%m")
[1] "2023-12"
# remove dataframe rows with one or more NA column
> d=data.frame(x=c(1,NA,3),y=letters[1:3])
> na.omit(d)
x y
1 1 a
3 3 c
# convert character matrix to numeric
> m=matrix(c("1","2","3","4"),2)
> class(m)="numeric"
> m
[,1] [,2]
[1,] 1 3
[2,] 2 4
# get the point that is halfway between two points on a log scale
> exp((log(100)+log(1000))/2)
[1] 316.2278
# `seq` on a log scale
> exp(seq(log(100),log(1e5),,5))
100.0000 316.2278 1000.0000 3162.2777 10000.0000
# geometric mean
> v=1:5
> prod(v)^(1/length(v))
[1] 2.605171
> exp(mean(log(v)))
[1] 2.605171
# list definitions of base functions
> Filter(is.function,mget(ls("package:base"),inherits=T))[6:8]
$`:::`
function (pkg, name) .Primitive(":::")
$`!`
function (x) .Primitive("!")
$`!.hexmode`
function (a)
as.hexmode(bitwNot(as.hexmode(a)))
<bytecode: 0x7ff0bf2b0c78>
<environment: namespace:base>
# find matrix rows where all columns have the same value
> m=matrix(c(8,8,8,9,7,7,5,5,5),3,byrow=T)
> m
[,1] [,2] [,3]
[1,] 8 8 8
[2,] 9 7 7
[3,] 5 5 5
> which(rowSums(m!=m[,1])==0)
[1] 1 3
# mode value
> v=c(5,3,8,5,8,8)
> collapse::fmode(v)
[1] 8
> data.table(v)[,.N,v][which.max(N),v]
[1] 8
> as.numeric(names(which.max(table(v))))
[1] 8
# mean-center matrix columns
> m=matrix(c(1,5,-16,-14),2)
> scale(m,scale=F)
[,1] [,2]
[1,] -2 -1
[2,] 2 1
attr(,"scaled:center")
[1] 3 -15
> apply(m,2,\(x)x-mean(x))
[,1] [,2]
[1,] -2 -1
[2,] 2 1
> t(t(m)-colMeans(m))
[,1] [,2]
[1,] -2 -1
[2,] 2 1
# round numeric columns to integers
> iris|>head(2)|>dplyr::mutate_if(is.double,round)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5 4 1 0 setosa
2 5 3 1 0 setosa
> x=head(iris,2);for(i in which(sapply(x,is.numeric)))x[i]=round(x[i]);x
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5 4 1 0 setosa
2 5 3 1 0 setosa
# remove rows with the same value in two or more different columns
> m=matrix(c(1,1,2,3,4,5,6,7,8,9,9,9),,3,byrow=T)
> m
[,1] [,2] [,3]
[1,] 1 1 2
[2,] 3 4 5
[3,] 6 7 8
[4,] 9 9 9
> m[colSums(apply(m,1,duplicated))==0,]
[,1] [,2] [,3]
[1,] 3 4 5
[2,] 6 7 8
# convert the rows of a matrix to a list of vectors
> m=matrix(1:4,2)
> asplit(m,1)
[[1]]
[1] 1 3
[[2]]
[1] 2 4
# order by column name or column number
> ob=\(x,y){a=match.call()$y;if(is.name(a))a=as.character(a);x[order(x[,a]),]}
> mtcars|>ob(wt)|>head(2)
mpg cyl disp hp drat wt qsec vs am gear carb
Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
# vectorized `seq`
> bit::vecseq(c(-3,8),c(-2,10))
[1] -3 -2 8 9 10
> pseq=Vectorize(seq.default,vectorize.args=c("from","to"))
> pseq(c(-3,8),c(-2,10))
[1] -3 -2 8 9 10
> v1=1:1e5;v2=v1+9
> system.time(bit::vecseq(v1,v2))
user system elapsed
0.001 0.001 0.002
> system.time(pseq(v1,v2))
user system elapsed
0.381 0.042 0.423
> system.time(unlist(mapply(seq,v1,v2)))
user system elapsed
0.506 0.002 0.509
# modify a variable outside a function in place
> plusinplace=\(x,y)eval.parent(substitute(x<-x+y))
> x=1;plusinplace(x,2);x
[1] 3
# swap variables
> swap=\(x,y){x2=x;env=parent.frame();do.call("=",list(substitute(x),y),,env);do.call("=",list(substitute(y),x2),,env)}
> x=1;y=2;swap(x,y);c(x,y)
[1] 2 1
# reorder a hierarchical clustering tree so that similar branches are grouped together
> hc=as.hclust(reorder(as.dendrogram(hclust(eurodist)),cmdscale(eurodist,1)))
> hc$label[hc$order]
[1] "Lisbon" "Madrid" "Gibraltar" "Barcelona" "Marseilles" "Lyons"
[7] "Geneva" "Milan" "Cherbourg" "Paris" "Calais" "Brussels"
[13] "Hook of Holland" "Cologne" "Munich" "Vienna" "Stockholm" "Hamburg"
[19] "Copenhagen" "Rome" "Athens"
# get angle between two vectors
> vlen=\(x)sum(x^2)^.5
> vang=\(x,y)acos(sum(x*y)/(vlen(x)*vlen(y)))
> vang(c(2,1),c(1,-2))*180/pi
[1] 90
# number of 2-combinations of 5 items
> n=5;k=2;choose(n,k)
[1] 10
> factorial(n)/(factorial(k)*factorial(n-k))
[1] 10
# get indexes of 2-combinations of 4 items
> combn(4,2)
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 1 1 1 2 2 3
[2,] 2 3 4 3 4 4
> n=4;rbind(rep(1:(n-1),(n-1):1),unlist(lapply(2:n,\(x)x:n)))
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 1 1 1 2 2 3
[2,] 2 3 4 3 4 4
> m=matrix(,4,4);rbind(col(m)[lower.tri(m)],row(m)[lower.tri(m)])
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 1 1 1 2 2 3
[2,] 2 3 4 3 4 4
> m=matrix(,4,4);t(which(lower.tri(m),arr.ind=T))
[,1] [,2] [,3] [,4] [,5] [,6]
row 2 3 4 3 4 4
col 1 1 1 2 2 3
# get indexes of the fifth 2-combination of 4 items
> combn(4,2)[,5]
[1] 2 4
> n=4;k=5;x=(2*n-1-sqrt(4*n^2-4*n+1-8*(k-1)))%/%2;y=k-x*n+(x^2+x)%/%2+x;c(x+1,y+1)
[1] 2 4
# get distance between Tokyo and London in km
> geosphere::distm(c(137.53,35.49),c(0.38,51.53),fun=geosphere::distHaversine)/1000
[,1]
[1,] 9487.566
# do linear interpolation of NA values
> v=c(1,NA,NA,7,NA,5)
> approx(v,n=length(v))$y
[1] 1 3 5 7 6 5
> zoo::na.approx(v)
[1] 1 3 5 7 6 5
> w=c(NA,1,NA,NA,7,NA,5,NA)
> approx(w,n=length(w))$y
[1] 1.000000 2.428571 3.857143 5.285714 6.714286 6.428571 5.714286 5.000000
> zoo::na.approx(v,na.rm=F)
[1] NA 1 3 5 7 6 5
> zoo::na.approx(v,rule=2)
[1] 1 1 3 5 7 6 5
# do spline extrapolation and interpolation of NA values in the ends and middle of a vector
> v=c(NA,3,8,NA,30)
> spline(v,n=length(v))$y
[1] 3.000 6.375 12.000 19.875 30.000
> zoo::na.spline(v)
[1] 2 3 8 17 30
# remove null items from a list
> l=list("a",NULL,c(1,2))
> Filter(Negate(is.null),l)
[[1]]
[1] "a"
[[2]]
[1] 1 2
> l[lengths(l)>0]
[[1]]
[1] "a"
[[2]]
[1] 1 2
> l|>purrr::discard(is.null)
[[1]]
[1] "a"
[[2]]
[1] 1 2
# get Euclidean distances between vectors on corresponding rows of two equal-sized matrices
> m1=matrix(1:12,,3);m2=matrix(8:-3,,3)
> sqrt(rowSums((m2-m1)^2))
[1] 11.44552 12.44990 14.24781 16.58312
# calculate a cross Euclidean distance of each row vector in matrix 1 to each row vector in matrix 2
> m1=matrix(10:15,,3);m2=matrix(1:9,,3)
> sqrt(pmax(0,outer(rowSums(m1^2),rowSums(m2^2),"+")-2*m1%*%t(m2)))
[,1] [,2] [,3]
[1,] 13.92839 12.20656 10.48809
[2,] 15.65248 13.92839 12.20656
> Rfast::dista(m1,m2)
[,1] [,2] [,3]
[1,] 13.92839 12.20656 10.48809
[2,] 15.65248 13.92839 12.20656
> outer(asplit(m1,1),asplit(m2,1),Vectorize(\(x,y)sqrt(sum((x-y)^2))))
[,1] [,2] [,3]
[1,] 13.92839 12.20656 10.48809
[2,] 15.65248 13.92839 12.20656
# find the closest neighbors of a human population in Global25 PCA
> t=read.csv("https://drive.google.com/uc?export=download&id=1wZr-UOve0KUKo_Qbgeo27m-CQncZWb8y",row.names=1)
> head(sort(as.matrix(dist(t))[,"Udmurt"]))
Udmurt Besermyan Chuvash Saami Tatar_Kazan Komi
0.00000000 0.02190332 0.04896754 0.05229796 0.06070359 0.06728795
# mean of corresponding cells on a list of matrices of equal size
> l=list(matrix(1:20,4),matrix(21:40,4),matrix(41:60,4))
> Reduce("+",l)/length(l)
[,1] [,2] [,3] [,4] [,5]
[1,] 21 25 29 33 37
[2,] 22 26 30 34 38
[3,] 23 27 31 35 39
[4,] 24 28 32 36 40
# get row and column index of maximum value in a matrix
> m=matrix(1:6,2)
> which(m==max(m),arr.ind=T)
row col
[1,] 2 3
> arrayInd(which.max(m),dim(m))
[,1] [,2]
[1,] 2 3
# make a matrix with 3 rows where each row contains vector `v`
> v=1:4;rep(1,3)%o%v
[,1] [,2] [,3] [,4]
[1,] 1 2 3 4
[2,] 1 2 3 4
[3,] 1 2 3 4
# get indexes of up to 2 first occurrences of each unique value in a vector
> v=c(4,4,4,2,3,3,3,3,4,4,3)
> unname(unlist(tapply(1:length(v),v,head,2)))
[1] 4 5 6 1 2
> unname(unlist(lapply(split(1:length(v),v),head,2)))
[1] 4 5 6 1 2
> which(data.table::rowid(v)<3)
[1] 1 2 4 5 6
# `sapply` with indexes
> sapply(month.name[1:3],\(x)paste(x,parent.frame()$i))
January February March
"January 1" "February 2" "March 3"
> v=month.name[1:3];mapply(\(x,i)paste(x,i),v,1:length(v))
January February March
"January 1" "February 2" "March 3"
# find the 5th biggest value in a vector
> set.seed(0);v=rnorm(1e6)
> kit::topn(v,5,index=F)[5]
[1] 4.342213
> sort(tail(sort(v,partial=length(v)-5+1),5))[1]
[1] 4.342213
# find the indexes of the 3 smallest items in a vector
> set.seed(0);v=rnorm(1e6)
> kit::topn(v,3,decreasing=F,hasna=F)
108893 251672 253645
> order(v)[1:3]
108893 251672 253645
> w=which(v<=Rfast::nth(v,3+1))[1:3];w[order(v[w])]
108893 251672 253645
> v2=v;o=c();sapply(1:3,\(i){ind=which.min(v2);v2[ind]<<-NA;ind})
108893 251672 253645
> w=which(v<=max(sort(v,partial=3)));w[order(v[w])][1:3]
108893 251672 253645
# round numbers to 3 significant digits
> x=123456
> signif(x,3)
[1] 123000
> round(x,3-ceiling(log10(abs(x))))
[1] 123000
# add one month to a date
> seq(as.Date("2023-12-25"),,"month",2)[-1]
[1] "2024-01-25"
> as.Date("2023-12-25")%m+%months(1)
[1] "2024-01-25"
> d=as.POSIXlt("2023-12-25");d$mon=d$mon+1;as.Date(d)
[1] "2024-01-25"
# get US state name from abbreviation
> state.name[match("CT",state.abb)]
[1] "Connecticut"
# calculate z-scores
> scale(mtcars[1:3,1:3])|>as.data.frame()
mpg cyl disp
Mazda RX4 -0.5773503 0.5773503 0.5773503
Mazda RX4 Wag -0.5773503 0.5773503 0.5773503
Datsun 710 1.1547005 -1.1547005 -1.1547005
> d=mtcars[1:3,1:3];t((t(d)-colMeans(d))/apply(d,2,sd))
mpg cyl disp
Mazda RX4 -0.5773503 0.5773503 0.5773503
Mazda RX4 Wag -0.5773503 0.5773503 0.5773503
Datsun 710 1.1547005 -1.1547005 -1.1547005
# divide items into clusters of similar items by doing hierarchical clustering of z-scores
> hclust(dist(scale(head(mtcars,8))))|>cutree(3)
Mazda RX4 Mazda RX4 Wag Datsun 710 Hornet 4 Drive
1 1 1 2
Hornet Sportabout Valiant Duster 360 Merc 240D
3 2 3 1
# convert quarterly data to monthly data
> quarterly=c(67190,66350,65510,65020)
> round(zoo::na.spline(c(rbind(NA,quarterly,NA))))
[1] 67410 67190 66932 66647 66350 66053 65768 65510 65290 65123 65020 64995
> library(tempdisagg);as.numeric(predict(td(ts(quarterly,frequency=4)~1,"mean",to="monthly")))
[1] 67268.94 67248.53 67052.53 66677.09 66339.56 66033.35 65752.47 65501.62 65275.91 65070.94
[11] 64981.93 65007.13
# convert quarterly data to daily
> d=data.frame(as.Date(c("2023-1-1","2023-4-1")),c(12345,23456))
> library(tempdisagg);predict(td(d~1,"mean","daily","fast"))|>head(2)
time value
1 2023-01-01 9584.940
2 2023-01-02 9586.904
# split matrix into groups of columns
> m=matrix(1:10,2);group=c(1,1,2,2,2)
> t(m)|>split.data.frame(group)|>lapply(t)
$`1`
[,1] [,2]
[1,] 1 3
[2,] 2 4
$`2`
[,1] [,2] [,3]
[1,] 5 7 9
[2,] 6 8 10
> split(1:ncol(m),group)|>lapply(\(x)m[,x])
$`1`
[,1] [,2]
[1,] 1 3
[2,] 2 4
$`2`
[,1] [,2] [,3]
[1,] 5 7 9
[2,] 6 8 10
# calculate rowsums by column group vector
> m=matrix(1:10,2);group=c(1,1,2,2,2)
> tapply(m,list(row(m),group[col(m)]),sum)
1 2
1 4 21
2 6 24
> t(rowsum(t(m),group))
1 2
[1,] 4 21
[2,] 6 24
# calculate excess deaths using a baseline from a linear regression
> d=data.frame(year=2015:2022,deaths=c(31712,31336,33581,33140,34170,32739,34913,38554))
> d$trend=predict(lm(deaths~year,subset(d,year<2020)),d)
> d$excess=d$deaths-d$trend
> round(d)
year deaths trend excess
1 2015 31712 31444 268
2 2016 31336 32116 -780
3 2017 33581 32788 793
4 2018 33140 33460 -320
5 2019 34170 34132 38
6 2020 32739 34804 -2065
7 2021 34913 35476 -563
8 2022 38554 36148 2406
# calculate excess age-standardized mortality rate using the 2013 European Standard Population
> nzpop=tail(read.csv("https://mongol-fi.github.io/f/nz_infoshare_population.csv",check.names=F),8)
> nzdeath=tail(read.csv("https://mongol-fi.github.io/f/nz_infoshare_deaths.csv",check.names=F),8)
> esp=c(10,40,55,55,55,60,60,65,70,70,70,70,65,60,55,50,40,25,15,8,2)*100
> espage=c(0,1,seq(5,95,5))
> espcut=\(x)cut(x,c(espage,Inf),include.lowest=T,right=F)
> pop=t(rowsum(t(nzpop[-1]),espcut(as.numeric(colnames(nzpop)[-1]))))
> death=t(rowsum(t(nzdeath[-1]),espcut(as.numeric(colnames(nzdeath)[-1]))))
> d=data.frame(year=nzpop[1],pop=rowSums(nzpop[-1]),death=rowSums(nzdeath[-1]))
> d$asmr=rowSums(t(t(death)*esp)/pop)
> d$trend=lm(asmr~year,subset(d,year<2020))|>predict(d)
> d$excesspct=(d$asmr/d$trend-1)*100
> print.data.frame(round(d),row.names=F)
year pop death asmr trend excesspct
2015 4612910 31608 936 924 1
2016 4716520 31176 893 918 -3
2017 4815070 33333 928 912 2
2018 4902090 33234 901 906 0
2019 4985560 34257 901 900 0
2020 5085970 32616 828 894 -7
2021 5111110 34911 859 888 -3
2022 5125220 38547 929 882 5
# convert date to floating point year
> d=as.Date("2020-12-31")
> y=as.numeric(format(d,"%Y"));y+(as.numeric(format(d,"%j"))-1)/"if"(y%%4==0&!(y%%100==0&y%%400!=0),366,365)
[1] 2020.997
> lubridate::decimal_date(d)
[1] 2020.997
# select the first row for each unique value of a column
> mtcars[!duplicated(mtcars$cyl),]
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160 110 3.90 2.62 16.46 0 1 4 4
Datsun 710 22.8 4 108 93 3.85 2.32 18.61 1 1 4 1
Hornet Sportabout 18.7 8 360 175 3.15 3.44 17.02 0 0 3 2
> dplyr::slice_head(mtcars,by=cyl)
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160 110 3.90 2.62 16.46 0 1 4 4
Datsun 710 22.8 4 108 93 3.85 2.32 18.61 1 1 4 1
Hornet Sportabout 18.7 8 360 175 3.15 3.44 17.02 0 0 3 2
> ub=\(x,y){a=match.call()$y;if(is.name(a))a=as.character(a);x[!duplicated(x[,a]),]}
> ub(mtcars,cyl)
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160 110 3.90 2.62 16.46 0 1 4 4
Datsun 710 22.8 4 108 93 3.85 2.32 18.61 1 1 4 1
Hornet Sportabout 18.7 8 360 175 3.15 3.44 17.02 0 0 3 2
> collapse::ffirst(mtcars,mtcars$cyl)
mpg cyl disp hp drat wt qsec vs am gear carb
4 22.8 4 108 93 3.85 2.32 18.61 1 1 4 1
6 21.0 6 160 110 3.90 2.62 16.46 0 1 4 4
8 18.7 8 360 175 3.15 3.44 17.02 0 0 3 2
# make an empty matrix by specifying dimension names
> arr=\(...){l=list(...);array(,lengths(l),l)}
> arr(state.name[1:2],month.abb[1:3])
Jan Feb Mar
Alabama NA NA NA
Alaska NA NA NA
# get year of date
> d=as.Date("2023-12-31")
> as.POSIXlt(d)$year+1900
[1] 2023
> as.numeric(format(d,"%Y"))
[1] 2023
# apply multiple functions
> mul=\(x,...)sapply(c(...),mapply,x)
> trees|>mul(min,max,mean)
[,1] [,2] [,3]
Girth 8.3 20.6 13.24839
Height 63.0 87.0 76.00000
Volume 10.2 77.0 30.17097
# run a benchmark and display the median time of a hundred runs relative to the fastest option
> bench=\(times,...){arg=match.call(expand.dots=F)$...;l=length(arg);out=double(times*l);rand=sample(rep(1:l,times))
+ n=1;for(x in arg[rand]){t1=Sys.time();eval.parent(x);out[n]=Sys.time()-t1;n=n+1}
+ setNames(out,sapply(arg[rand],\(i)gsub(" +"," ",paste(deparse(i),collapse=" "))))}
> b=bench(100,1:1e6,seq(1e6),sequence(1e6),seq_len(1e6),seq.int(1e6))
> o=sort(tapply(b,names(b),median));writeLines(sprintf("%.2f %s",o/min(o),names(o)))
1.00 1:1e+06
1.00 seq_len(1e+06)
1.00 seq.int(1e+06)
2.24 seq(1e+06)
345.06 sequence(1e+06)
# select columns
> k=\(x,...)x[,sapply(match.call(expand.dots=F)$...,\(i)"if"(is.name(i),match(as.character(i),colnames(x)),i))]
> mtcars[1:2,]|>k(cyl,5)
cyl drat
Mazda RX4 6 3.9
Mazda RX4 Wag 6 3.9
# merge dataframes by rownames
> merge(mtcars[1:3,1:2],mtcars[2:4,6:7],by=0)
Row.names mpg cyl wt qsec
1 Datsun 710 22.8 4 2.320 18.61
2 Mazda RX4 Wag 21.0 6 2.875 17.02
# weighted mean by group
> t=data.frame(age=c(80,80,70,70),pop=c(2850,1820,3053,5190),deaths=c(312,287,201,220))
> with(t,tapply(deaths*pop,age,sum)/tapply(pop,age,sum))
70 80
212.9629 302.2570
> sapply(split(t,t$age),\(i)weighted.mean(i$deaths,i$pop))
70 80
212.9629 302.2570
> data.table(t)[,weighted.mean(deaths,pop),age]
age V1
1: 80 302.2570
2: 70 212.9629
# calculate frequency table using specified factors for each dimension
> fat=\(x,y){o=table(mapply(factor,x,y,SIMPLIFY=F));names(dimnames(o))=NULL;o}
> dim=list(letters[1:3],month.abb[1:3],state.abb[1:2])
> fat(list(c("c","c","b"),c("Jan","Jan","Mar"),c("AL","AL","AK")),dim)
, , AL
Jan Feb Mar
a 0 0 0
b 0 0 0
c 2 0 0
, , AK
Jan Feb Mar
a 0 0 0
b 0 0 1
c 0 0 0
# replace NA values with the previous non-NA value
> v=c(NA,5,NA,NA,3,NA,4)
> data.table::nafill(v,type="locf")
[1] NA 5 5 5 3 3 4
> zoo::na.locf(v,na.rm=F)
[1] NA 5 5 5 3 3 4
# convert a long vector of dates with many repeated dates to character faster
> v=as.Date(sample(18000:19000,1e6,T),"1970-1-1")
> system.time(as.character(v))
user system elapsed
1.639 0.030 1.688
> system.time({u=unique(v);as.character(u)[match(v,u)]})
user system elapsed
0.036 0.002 0.038
> system.time(as.character(seq(min(v),max(v),1))[as.numeric(v-min(v)+1)])
user system elapsed
0.033 0.008 0.041
> system.time(data.table(x=v)[,y:=as.character(x),x]$y)
user system elapsed
0.103 0.003 0.105
# convert date to year and month
> v=Sys.Date()-sample(1e3,1e5,T)
> ua=\(x,fun,...){u=unique(x);fun(u,...)[match(x,u)]}
> b=microbenchmark(times=10,
+ with(as.POSIXlt(v),sprintf("%d-%02d",year+1900,mon+1)),
+ format(v,"%Y-%m"),
+ substr(v,1,7),
+ ua(v,format,"%Y-%m"))
> o=sort(tapply(b$time,gsub(" +"," ",b$expr),median));writeLines(sprintf("%.2f %s",o/min(o),names(o)))
1.00 ua(v, format, "%Y-%m")
12.75 with(as.POSIXlt(v), sprintf("%d-%02d", year + 1900, mon + 1))
14.40 format(v, "%Y-%m")
34.61 substr(v, 1, 7)
# count the number of leap days between the epoch and a date in the Julian calendar inclusive of the date
> d=as.numeric(as.Date(c("2000-2-28","2000-2-29","1968-2-29","1968-3-1")))
> (d-789-ifelse(d<0,1,0))%/%1461+1
[1] 7 8 -1 0
# get the difference between dates in floored years
> d1=as.Date("2021-01-23");d2=as.Date("2022-01-23")
> p1=as.POSIXlt(d1);p2=as.POSIXlt(d2);p2$year-p1$year-pmin(p2$mon<p1$mon,p2$mon==p1$mon&p2$mday<p1$mday)
[1] 1
> as.numeric(format(d2,"%Y"))-as.numeric(format(d1,"%Y"))-as.numeric(format(d2,"%m-%d")<format(d1,"%m-%d"))
[1] 1
> as.numeric(substr(d2,1,4))-as.numeric(substr(d1,1,4))-as.numeric(substr(d2,5,10)<substr(d1,5,10))
[1] 1
> lubridate::interval(d1,d2)%/%lubridate::years()
[1] 1
> d1%--%d2%/%years()
[1] 1
> floor(lubridate::time_length(difftime(d2,d1),"years"))
[1] 0
> floor(as.numeric(lubridate::interval(d1,d2),"years"))
[1] 0
> floor(as.numeric(d2-d1)/365.25)
[1] 0
> age=\(x,y){class(x)=class(y)=NULL;(d2-d1-(d2-789)%/%1461+(d1-789)%/%1461)%/%365}
> age(d1,d2)
[1] 1
# calculate a distance matrix using a mean instead of sum of squared differences to reduce bias caused by NA values
> m=matrix(1:9,3);m[2,3]=NA
> i=combn(nrow(m),2);d=rowMeans((m[i[1,],]-m[i[2,]])^2,na.rm=T);structure(d,class="dist","Size"=nrow(m))
1 2
2 10.0
3 7.0 2.5
# read CSV faster
> write.csv(matrix(sample(1e6),,10),"test.csv",row.names=F)
> system.time(read.csv("test.csv"))
user system elapsed
0.526 0.014 0.540
> system.time(as.data.frame(readr::read_csv("test.csv",show_col_types=F)))
user system elapsed
0.161 0.012 0.062
> system.time(as.data.frame(data.table::fread("test.csv",showProgress=F)))
user system elapsed
0.016 0.002 0.018
# replace NA values with an incrementing sequence starting from the previous non-NA value
> v=c(NA,NA,5,NA,NA,2,8,NA)
> Reduce(\(i,j)if(is.na(j))i+1 else j,v,accumulate=T)
[1] NA NA 5 6 7 2 8 9
> r=rle(is.na(v));w=which(r$values[-1]);s=cumsum(r$lengths)
> v2=v;v2[bit::vecseq(s[w]+1,s[w+1])]=bit::vecseq(v[s[w]]+1,v[s[w]]+r$lengths[w+1]);v2
[1] NA NA 5 6 7 2 8 9
> nn=!is.na(v);w=which(nn);l=length(v);d=diff(c(w,l+1));v2=v;v2[w[1]:l]=sequence(d,v[nn]);v2
[1] NA NA 5 6 7 2 8 9
> Rcpp::cppFunction('NumericVector loifcpp(NumericVector x){int n=x.size();LogicalVector ina=is_na(x);
+ for(int i=1;i<n;i++)if(ina[i])x[i]=x[i-1]+1;return x;}')
> loifcpp(v)
[1] NA NA 5 6 7 2 8 9
# remove NA values from a vector
> v=ifelse(runif(1e5)<.5,NA,sample(1e5))
> b=microbenchmark(na.omit(v),v[complete.cases(v)],v[!is.na(v)],collapse::na_rm(v))
> o=sort(tapply(b$time,gsub(" +"," ",b$expr),median));writeLines(sprintf("%.2f %s",o/min(o),names(o)))
1.00 collapse::na_rm(v)
2.73 v[!is.na(v)]
4.05 v[complete.cases(v)]
5.38 na.omit(v)
# prevent automatic conversion of date to integer in `for` loop
> d=Sys.Date()-0:1
> for(i in d)print(i)
[1] 19734
[1] 19733
> for(i in as.list(d))print(i)
[1] "2024-01-12"
[1] "2024-01-11"
> for(i in d)print(`class<-`(i,"Date"))
[1] "2024-01-12"
[1] "2024-01-11"
# convert long to wide
> d=data.frame(row=paste0("row",sample(1e5)),col=paste0("col",sample(10,1e5,T)),val=sample(100,1e5,T))
> d=d[!duplicated(d[,-3]),]
> b=microbenchmark(times=10,
+ xtabs=xtabs(val~row+col,d),
+ tapply=tapply(d$val,d[,1:2],I),
+ reshape=reshape(d,idvar="row",timevar="col",direction="wide"),
+ dcast=data.table::dcast(data.table::data.table(d),row~col,value.var="val"),
+ matrix={r=unique(d$row);c=unique(d$col);m=matrix(,length(r),length(c),,list(r,c));m[cbind(d$row,d$col)]=d$val;m},
+ match={r=unique(d$row);c=unique(d$col);sapply(c,\(i)with(subset(d,col==i),val[match(r,row)]))|>"rownames<-"(r)},
+ pivot_wider=tidyr::pivot_wider(d,names_from=col,values_from=val),
+ spread=tidyr::spread(d,col,val))
> o=sort(tapply(b$time,b$expr,median));writeLines(sprintf("%.2f %s",o/min(o),names(o)))
1.00 matrix
1.01 pivot_wider
2.86 match
3.42 dcast
16.26 spread
18.50 reshape
23.67 tapply
24.96 xtabs
# get deciles
> quantile(rnorm(1e6),seq(0,1,.1))|>round(3)
0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
-5.048 -1.284 -0.844 -0.525 -0.255 0.000 0.253 0.524 0.842 1.282 5.252
# generate a random boolean vector based on a vector of probabilities
> prob=c(.1,.1,.1,.5,.8,.9,.9,.9,.9);runif(length(prob))<prob
[1] FALSE FALSE TRUE FALSE FALSE TRUE TRUE TRUE TRUE
# select first column that exists
> mtcars[1:2,]|>dplyr::select(dplyr::any_of(c("horses","horsepower","hp")))
hp
Mazda RX4 110
Mazda RX4 Wag 110
> firstof=\(x,...)for(a in as.character(match.call(expand.dots=F)$...))if(!is.null(x[[a]]))return(x[,a,drop=F])
> mtcars[1:2,]|>firstof(horses,horsepower,hp)
hp
Mazda RX4 110
Mazda RX4 Wag 110
# get sums of 3D array by first and second dimensions
> a=array(1:8,c(2,2,2),list(2023:2024,month.name[1:2],c("male","female")))
> a
, , male
January February
2023 1 3
2024 2 4
, , female
January February
2023 5 7
2024 6 8
> rowSums(a,dims=2)
January February
2023 6 10
2024 8 12
# convert RGB to HSV
> color=c("#00bb00","#888888")
> col2rgb(color)
[,1] [,2]
red 0 136
green 187 136
blue 0 136
> rgb2hsv(col2rgb(color))
[,1] [,2]
h 0.3333333 0.0000000
s 1.0000000 0.0000000
v 0.7333333 0.5333333
# probability of rolling a six for 4 but not more rolls out of 5 rolls of a die
> dbinom(4,5,1/6)
[1] 0.003215021
> p=1/6;n=5;k=4;choose(n,k)*p^k*(1-p)^(n-k)
[1] 0.003215021
# probablility that 6 balls drawn from a box with 10 white balls and 20 black balls contain exactly 4 white balls
> dhyper(4,10,20,6)
[1] 0.06719717
# probablility that 6 balls drawn from a box with 10 white balls and 20 black balls contain 4 or more white balls
> sum(dhyper(4:6,10,20,6))
[1] 0.0760389
> fisher.test(matrix(c(4,6-4,10-4,20-(6-4)),2),alternative="greater")$p
[1] 0.0760389
# convert integer to date
> v=c(12345,23456)
> `class<-`(v,"Date")
[1] "2003-10-20" "2034-03-22"
> as.Date(v,"1970-1-1")
[1] "2003-10-20" "2034-03-22"
> library(zoo);as.Date(v)
[1] "2003-10-20" "2034-03-22"
# specify variable name for `with`
> with(list(x=3),x+1)
[1] 4
# convert dataframe to vector efficiently
> summary(microbenchmark(times=100,unlist(mtcars),unlist(mtcars,use.names=F)))[,c(1,5)]
expr mean
1 unlist(mtcars) 59.31585
2 unlist(mtcars, use.names = F) 2.98307
# add totals grouped by each variable to data.table
> d=data.table(month=month.abb[c(1,1,2,2)],state=state.name[1:2],col1=1:4,col2=10:13)
> d=rbind(d,d[,lapply(.SD,sum),month,.SDcols=3:4][,state:="Total"])
> d=rbind(d,d[,lapply(.SD,sum),state,.SDcols=3:4][,month:="Total"])
> d
month state col1 col2
1: Jan Alabama 1 10
2: Jan Alaska 2 11
3: Feb Alabama 3 12
4: Feb Alaska 4 13
5: Jan Total 3 21
6: Feb Total 7 25
7: Total Alabama 4 22
8: Total Alaska 6 24
9: Total Total 10 46
# aggregate numeric columns by non-numeric columns
> sum0=\(...)sum(...,na.rm=T)
> aua=\(x,fun=sum0,...){n=sapply(x,is.numeric);aggregate(x[,n,drop=F],x[,!n,drop=F],fun,...)}
> aua(iris,mean)
Species Sepal.Length Sepal.Width Petal.Length Petal.Width
1 setosa 5.006 3.428 1.462 0.246
2 versicolor 5.936 2.770 4.260 1.326
3 virginica 6.588 2.974 5.552 2.026
# do a linear interpolation of weekly data to daily
> d=data.frame(x=seq(as.Date("2024-1-1"),,7,3),y=c(53,87,63))
> d
x y
1 2024-01-01 53
2 2024-01-08 87
3 2024-01-15 63
> as.data.frame(approx(d$x,d$y,seq(min(d$x),max(d$x),1)))
x y
1 2024-01-01 53.00000
2 2024-01-02 57.85714
3 2024-01-03 62.71429
4 2024-01-04 67.57143
5 2024-01-05 72.42857
6 2024-01-06 77.28571
7 2024-01-07 82.14286
8 2024-01-08 87.00000
9 2024-01-09 83.57143
10 2024-01-10 80.14286
11 2024-01-11 76.71429
12 2024-01-12 73.28571
13 2024-01-13 69.85714
14 2024-01-14 66.42857
15 2024-01-15 63.00000
# delete a single column by name in data.table
> data.table(iris)[,Sepal.Width:=NULL][1:2]
Sepal.Length Petal.Length Petal.Width Species
1: 5.1 1.4 0.2 setosa
2: 4.9 1.4 0.2 setosa
# show ratios between units
> grid::convertUnit(unit(1,"pt"),"lines")
[1] 0.06918500069185lines
> grid::convertUnit(unit(1,"lines"),"in")
[1] 0.2inches
# faster `table`
> v=sample(letters,1e6,T)
> tab=\(x)data.table(x)[,.N,x][,setNames(N,x)]
> summary(microbenchmark(times=10,table(v),tab(v)))[,c(1,5)]
expr median
1 table(v) 65.32598
2 tab(v) 27.96704
# `expand.grid` for data.table
> CJ(month.abb[1:2],state.name[1:3],sorted=F)
V1 V2
1: Jan Alabama
2: Jan Alaska
3: Jan Arizona
4: Feb Alabama
5: Feb Alaska
6: Feb Arizona
# print a data.table in a pipeline that includes an in-place assignment at the end
> d=data.table(x=1:2)
> d[,x:=x*2]|>print()
> d[,x:=x*2][]|>print()
x
1: 4
2: 8
# replace vector values at specified positions with values from another vector
> v=rep(1,5)
> replace(v,3:4,8)
[1] 1 1 8 8 1
> replace(v,3:4,8:9)
[1] 1 1 8 9 1
# insert an item to a vector after the specified position
> append(month.abb[1:3],"x",2)
[1] "Jan" "Feb" "x" "Mar"
# add new dataframe column after specified index
> d=data.frame(a=1,b=10,c=100)
> tibble::add_column(d,x=0,.after=2)
a b x c
1 1 10 0 100
> cbind(d,x=0)[append(1:ncol(d),ncol(d)+1,2)]
a b x c
1 1 10 0 100
# write gzipped CSV
> con=gzfile("output.csv.gz","w",compression=9);write.csv(mtcars,con);close(con)
> write.csv(mtcars,"output.csv")
> file.info(c("output.csv.gz","output.csv"))$size
[1] 880 1783
# fill in zeroes for missing combinations of non-numeric columns
> d=data.frame(month=month.abb[c(1,1,2)],sex=c("M","F","M"),dead=c(3,1,2))
> num=sapply(d,is.numeric);x=d[1,num,drop=F];x[]=0
> o=rbind(d,cbind(expand.grid(lapply(d[,!num,drop=F],unique)),`rownames<-`(x,NULL)))
> o[!duplicated(o[!num]),]
month sex dead
1 Jan M 3
2 Jan F 1
3 Feb M 2
7 Feb F 0
# order dataframe based on a vector of column positions
> cols=c(2,8,9)
> mtcars[do.call(order,lapply(cols,\(i)mtcars[,i])),][1:2,]
mpg cyl disp hp drat wt qsec vs am gear carb
Porsche 914-2 26.0 4 120.3 91 4.43 2.14 16.7 0 1 5 2
Merc 240D 24.4 4 146.7 62 3.69 3.19 20.0 1 0 4 2
# get mode value by group
> v=sample(1:10,1e5,T);g=sample(letters,1e5,T)
> modeby=\(v,g)data.table(v,g)[,.N,.(v,g)][order(-N)][rowid(g)==1]
> modeby2=\(v,g)data.table(v,g)[,.N,.(v,g)][,v[which.max(N)],g]
> fmode=\(v,g)data.table(v,g)[,collapse::fmode(v),g]
> fmodetapply=\(v,g)tapply(v,g,collapse::fmode)
> summary(microbenchmark(times=10,modeby(v,g),modeby2(v,g),fmode(v,g),fmodetapply(v,g)))[,c(1,5)]
expr median
1 modeby(v, g) 4.297637
2 modeby2(v, g) 4.258582
3 fmode(v, g) 3.094666
4 fmodetapply(v, g) 6.454565
# subtract 2% from start of range and add 2% to end
> extendrange(0:5,,.02)
[1] -0.1 5.1
# convert theme font size 8 to font size for `geom_text`
> grid::convertUnit(unit(8,"pt"),"mm")
[1] 2.81167842811678mm # theme font sizes are specified in points but `geom_text` sizes are in mm
# convert JSON to a dataframe
> jsonlite::fromJSON('{"data":[{"col1":1,"col2":10},{"col1":2,"col2":20}]}')$data
col1 col2
1 1 10
2 2 20
# read URL into a string
> con=url("http://example.com");paste(readLines(con),collapse="\n")|>substr(1,80);close(con)
[1] "<!doctype html>\n<html>\n<head>\n <title>Example Domain</title>\n\n <meta chars"
# show difference between musical notes on an equal-tempered scale and in just intonation
> d=data.frame(row.names=LETTERS[c(3:7,1,8)],equal=2^(c(0,2,4,5,7,9,11)/12),just=1+c(0,1/8,1/4,1/3,1/2,2/3,7/8))
> round(d,3)
equal just
C 1.000 1.000
D 1.122 1.125
E 1.260 1.250
F 1.335 1.333
G 1.498 1.500
A 1.682 1.667
H 1.888 1.875
> round(transform(1200*log2(d),diff=equal-just),1)
equal just diff
C 0 0.0 0.0
D 200 203.9 -3.9
E 400 386.3 13.7
F 500 498.0 2.0
G 700 702.0 -2.0
A 900 884.4 15.6
H 1100 1088.3 11.7
# get 95% confidence interval for ratio of two numbers
> prop.test(123,456)$conf
[1] 0.2300140 0.3134099
attr(,"conf.level")
[1] 0.95
> n=456;p=123/n;p+c(-1,1)*qnorm(.975)*sqrt(p*(1-p)/n)
[1] 0.2290011 0.3104726
# generate tick breaks that cover the specified values
> v=c(15319,28652,38568)
> pretty(v)
[1] 15000 20000 25000 30000 35000 40000
> scales::breaks_extended(n=5,Q=c(1,2,5))(v)
[1] 15000 20000 25000 30000 35000 40000
# filter vector in pipeline
> v=1:5
> v%>%.[.<3]
[1] 1 2
> v|>Filter(f=\(x)x<3)
[1] 1 2
# rbind without having to specify column names
> d=data.frame(x=1:2,y=5:6);rbind(d,list(3,7))
x y
1 1 5
2 2 6
3 3 7
# add integer index for each unique combination of two or more columns
> d=data.table(year=rep(2020:2022,each=12),month=1:12)
> d=rbind(d,d)
> d[,totalmonth:=.GRP,.(year,month)]
> tail(d,2)
year month totalmonth
1: 2022 11 35
2: 2022 12 36
# get last 3 columns
> m=matrix(1:16,2);m[,ncol(m)-2:0]
[,1] [,2] [,3]
[1,] 11 13 15
[2,] 12 14 16
# fill in zeroes for missing combinations of variables
> df=data.frame(year=c(2023,2023,2024),sex=c("F","M","F"),dead=c(4,2,3))
> dt=data.table(df);tib=tibble(df)
> merge(do.call(CJ,lapply(dt[,-3],unique)),d,all.x=T)[is.na(dead),dead:=0][]
year sex dead
1: 2023 F 4
2: 2023 M 2
3: 2024 F 3
4: 2024 M 0
> df2=rbind(df,cbind(expand.grid(lapply(df[-3],unique)),dead=0));df2[!duplicated(df2[-3]),]
year sex dead
1 2023 F 4
2 2023 M 2
3 2024 F 3
7 2024 M 0
> expand_grid(year=unique(tib$year),sex=unique(tib$sex))|>left_join(tib)|>mutate(dead=replace_na(dead,0))
year sex dead
<dbl> <chr> <dbl>
1 2023 F 4
2 2023 M 2
3 2024 F 3
4 2024 M 0
> tib|>tidyr::complete(year,sex,fill=list(dead=0))
year sex dead
<dbl> <chr> <dbl>
1 2023 F 4
2 2023 M 2
3 2024 F 3
4 2024 M 0
# mean of two dates
> d1=as.Date("2020-1-1");d2=as.Date("2021-1-1")
> mean(d1,d2)
Error in mean.default(unclass(x), ...) :
'trim' must be numeric of length one
> pmean(d1,d2)
[1] "2020-07-02"
# add int and bool
> 28+T
[1] 29
# get the number of days in a month
> d=data.table(year=c(0,100,2020,2020),month=c(2,2,2,3))
> d[,ifelse(month==2,28+(year%%4==0&!(year%%100==0&year%%400!=0)),(31:30)[rep(1:2,,13)[-8]][month])]
[1] 29 28 29 31
> lubridate::days_in_month(d[,as.Date(paste(year,month,1,sep="-"))])
Feb Feb Feb Mar
29 28 29 31
# show ggplot default styles
> theme_gray()[1]
$line
List of 6
$ colour : chr "black"
$ linewidth : num 0.5
$ linetype : num 1
$ lineend : chr "butt"
$ arrow : logi FALSE
$ inherit.blank: logi TRUE
- attr(*, "class")= chr [1:2] "element_line" "element"
# increment date by one month
> d=as.Date("2024-12-1")
> seq(d,,"month",2)[2]
[1] "2025-01-01"
> d%m+%months(1)
[1] "2025-01-01"