# Estimation of sampling error for haplotypic and genotypic
# linkage disequilbrium (LD) coefficients under Hardy-Weinberg equilibrium (HWE)

# This R code supplements
# Takeuchi et al. (2009)
# A genome-wide association study confirms VKORC1, CYP2C9 and CYP4F2
# as principal genetic determinants of warfarin dose, PLoS Genetics

# We consider two SNPs in LD.
# First SNP has minor allele a, major allele A.
# Second SNP has minor allele b, major allele B.
# Let their frequency be fa <= fb <= fB <= fA

# For a (sampled) haploid,
# a random variable x1 is set to 1 for a, 0 for A, and
# x2 is assigned 1 for b, 0 for B.
# The correlation coefficient between x1 and x2 is r, the haplotypic LD coefficent.
# The square of r is r^2.
# For a (sampled) diploid,
# a random variable y1 is set to 2 for genotype aa, 1 for aA, 0 for AA, and
# y2 is assigned 2 for bb, 1 for bB, 0 for BB.
# The correlation coefficient between y1 and y2 is R, the genotypic LD coefficeint.
# The square of R is R^2.
# Under HWE, r=R.

# Simulate n diploids under predefined LD, and calculate LD value from the samples.
simulate = function(fa, fb,
	Dprime, # LD is predefined as D'
	n # Number of diploid samples
	) {
	fA = 1 - fa;
	fB = 1 - fb;
	D = if (Dprime >= 0) {
			fa * fB * Dprime
		} else {
			fa * fb * Dprime
		}
	r = D/sqrt(fa*fA*fb*fB);
	fab = fa * fb + D; #frequency of haplotype ab in population
	faB = fa * fB - D; #frequency of haplotype aB in population
	fAb = fA * fb - D; #frequency of haplotype Ab in population
	fAB = fA * fB + D; #frequency of haplotype AB in population
	#print(paste(fab, faB, fAb, fAB));
	paternalchromosomes = t(rmultinom(n, 1, c(fab, faB, fAb, fAB))); #sample paternal chromosomes
	maternalchromosomes = t(rmultinom(n, 1, c(fab, faB, fAb, fAB))); #sample maternal chromosomes
	allchromosomes = rbind(paternalchromosomes, maternalchromosomes);
	diploids = paternalchromosomes + maternalchromosomes;
	rsampled = cor(allchromosomes[,1]+allchromosomes[,2],
		allchromosomes[,1]+allchromosomes[,3]);
	Rsampled = cor(diploids[,1]+diploids[,2],
		diploids[,1]+diploids[,3]);
	c(r, #population value r=R
		rsampled, #sampled r
		Rsampled, #sampled R
		abs((rsampled-Rsampled)/r) #difference between sampled r and R, relative to true r
	)
}
simulate(0.1, 0.1, 0.5, 1000)

# Now, repeat the sampling to assess the confidence interval (CI) of sampled r and R.
simn = 200 # Number of simulations
simlb = 5 # lower bound of 95% CI (= simn * 0.025)
simub = 195 # upper bound of 95% CI (= simn * 0.975)

procedure = function (fa, fb) {
output = numeric()
for (Dprime in seq(-1,1,0.1)) {
	cat(".")
	simresults = numeric();
	for (i in 1:simn) {simresults = c(simresults, simulate(fa, fb, Dprime, 1000))}
	output = c(output,
		sprintf("%.3f", simresults[1]), #population value of r=R
		sprintf("%.3f", sort(simresults[4*c(1:simn) -2])[simlb]), #95% CI for sampled r
		sprintf("%.3f", sort(simresults[4*c(1:simn) -2])[simub]),
		sprintf("%.3f", sort(simresults[4*c(1:simn) -1])[simlb]), #95% CI for sampled R
		sprintf("%.3f", sort(simresults[4*c(1:simn) -1])[simub]),
		sprintf("%.3f", sort(simresults[4*c(1:simn)   ])[simlb]), #95% CI for relative
		sprintf("%.3f", sort(simresults[4*c(1:simn)   ])[simub]), #difference between sampled r and R
		sprintf("%.3f", simresults[1]^2), #population value of r^2=R^2
		sprintf("%.3f", sort(simresults[4*c(1:simn) -1]^2)[simlb]), #95% CI for R^2
		sprintf("%.3f", sort(simresults[4*c(1:simn) -1]^2)[simub]) #biased upwards
		)
	}
cat("\n")
output = data.frame(matrix(output, nrow=21, ncol=10, byrow=T))
names(output) = c("rpop=Rpop", "r_95%LB", "r_95%UB", "R_95%LB", "R_95%UB",
	"abs(r-R)/rpop_95%LB", "abs(r-R/rpop)_95%UB",
	"Rpop^2",
	"R^2_95%LB", "R^2_95%UB")
output
}

# When minor allele frequency of a SNP is as low as 1%, LD estimates are not accurate.
# Under Rpop=0.1, although relative difference between r and R is up to 50% (of the true value),
# R itself can range like (-0.02, 0.25), and deviate largely from the true value Rpop.
# Accordingly, r also deviates from rpop.
procedure(fa = 0.01, fb = 0.01)
procedure(fa = 0.01, fb = 0.1)
procedure(fa = 0.01, fb = 0.3)
procedure(fa = 0.01, fb = 0.5)
# When minor allele frequency of a SNP is 10% or more, LD estimates are accurate for 1000 diploids.
# Under Rpop=0.1, relative difference between r and R again can be up to 50% (of the true value),
# but R itself ranges like (0.03, 0.17) around Rpop,
# and r as (0.04, 0.15) is centered even better around rpop.
procedure(fa = 0.1, fb = 0.1)
procedure(fa = 0.1, fb = 0.3)
procedure(fa = 0.1, fb = 0.5)
procedure(fa = 0.3, fb = 0.3)
procedure(fa = 0.3, fb = 0.5)
procedure(fa = 0.5, fb = 0.5)