# Estimation of sampling error for haplotypic and genotypic # linkage disequilbrium (LD) coefficients under Hardy-Weinberg equilibrium (HWE) # This R code supplements # Takeuchi et al. (2009) # A genome-wide association study confirms VKORC1, CYP2C9 and CYP4F2 # as principal genetic determinants of warfarin dose, PLoS Genetics # We consider two SNPs in LD. # First SNP has minor allele a, major allele A. # Second SNP has minor allele b, major allele B. # Let their frequency be fa <= fb <= fB <= fA # For a (sampled) haploid, # a random variable x1 is set to 1 for a, 0 for A, and # x2 is assigned 1 for b, 0 for B. # The correlation coefficient between x1 and x2 is r, the haplotypic LD coefficent. # The square of r is r^2. # For a (sampled) diploid, # a random variable y1 is set to 2 for genotype aa, 1 for aA, 0 for AA, and # y2 is assigned 2 for bb, 1 for bB, 0 for BB. # The correlation coefficient between y1 and y2 is R, the genotypic LD coefficeint. # The square of R is R^2. # Under HWE, r=R. # Simulate n diploids under predefined LD, and calculate LD value from the samples. simulate = function(fa, fb, Dprime, # LD is predefined as D' n # Number of diploid samples ) { fA = 1 - fa; fB = 1 - fb; D = if (Dprime >= 0) { fa * fB * Dprime } else { fa * fb * Dprime } r = D/sqrt(fa*fA*fb*fB); fab = fa * fb + D; #frequency of haplotype ab in population faB = fa * fB - D; #frequency of haplotype aB in population fAb = fA * fb - D; #frequency of haplotype Ab in population fAB = fA * fB + D; #frequency of haplotype AB in population #print(paste(fab, faB, fAb, fAB)); paternalchromosomes = t(rmultinom(n, 1, c(fab, faB, fAb, fAB))); #sample paternal chromosomes maternalchromosomes = t(rmultinom(n, 1, c(fab, faB, fAb, fAB))); #sample maternal chromosomes allchromosomes = rbind(paternalchromosomes, maternalchromosomes); diploids = paternalchromosomes + maternalchromosomes; rsampled = cor(allchromosomes[,1]+allchromosomes[,2], allchromosomes[,1]+allchromosomes[,3]); Rsampled = cor(diploids[,1]+diploids[,2], diploids[,1]+diploids[,3]); c(r, #population value r=R rsampled, #sampled r Rsampled, #sampled R abs((rsampled-Rsampled)/r) #difference between sampled r and R, relative to true r ) } simulate(0.1, 0.1, 0.5, 1000) # Now, repeat the sampling to assess the confidence interval (CI) of sampled r and R. simn = 200 # Number of simulations simlb = 5 # lower bound of 95% CI (= simn * 0.025) simub = 195 # upper bound of 95% CI (= simn * 0.975) procedure = function (fa, fb) { output = numeric() for (Dprime in seq(-1,1,0.1)) { cat(".") simresults = numeric(); for (i in 1:simn) {simresults = c(simresults, simulate(fa, fb, Dprime, 1000))} output = c(output, sprintf("%.3f", simresults[1]), #population value of r=R sprintf("%.3f", sort(simresults[4*c(1:simn) -2])[simlb]), #95% CI for sampled r sprintf("%.3f", sort(simresults[4*c(1:simn) -2])[simub]), sprintf("%.3f", sort(simresults[4*c(1:simn) -1])[simlb]), #95% CI for sampled R sprintf("%.3f", sort(simresults[4*c(1:simn) -1])[simub]), sprintf("%.3f", sort(simresults[4*c(1:simn) ])[simlb]), #95% CI for relative sprintf("%.3f", sort(simresults[4*c(1:simn) ])[simub]), #difference between sampled r and R sprintf("%.3f", simresults[1]^2), #population value of r^2=R^2 sprintf("%.3f", sort(simresults[4*c(1:simn) -1]^2)[simlb]), #95% CI for R^2 sprintf("%.3f", sort(simresults[4*c(1:simn) -1]^2)[simub]) #biased upwards ) } cat("\n") output = data.frame(matrix(output, nrow=21, ncol=10, byrow=T)) names(output) = c("rpop=Rpop", "r_95%LB", "r_95%UB", "R_95%LB", "R_95%UB", "abs(r-R)/rpop_95%LB", "abs(r-R/rpop)_95%UB", "Rpop^2", "R^2_95%LB", "R^2_95%UB") output } # When minor allele frequency of a SNP is as low as 1%, LD estimates are not accurate. # Under Rpop=0.1, although relative difference between r and R is up to 50% (of the true value), # R itself can range like (-0.02, 0.25), and deviate largely from the true value Rpop. # Accordingly, r also deviates from rpop. procedure(fa = 0.01, fb = 0.01) procedure(fa = 0.01, fb = 0.1) procedure(fa = 0.01, fb = 0.3) procedure(fa = 0.01, fb = 0.5) # When minor allele frequency of a SNP is 10% or more, LD estimates are accurate for 1000 diploids. # Under Rpop=0.1, relative difference between r and R again can be up to 50% (of the true value), # but R itself ranges like (0.03, 0.17) around Rpop, # and r as (0.04, 0.15) is centered even better around rpop. procedure(fa = 0.1, fb = 0.1) procedure(fa = 0.1, fb = 0.3) procedure(fa = 0.1, fb = 0.5) procedure(fa = 0.3, fb = 0.3) procedure(fa = 0.3, fb = 0.5) procedure(fa = 0.5, fb = 0.5)