# ldgroup program written in R # linkage disequilibrium grouping of single nucleotide # polymorphisms (SNPs) reflecting haplotype phylogeny # for efficient selection of tag SNPs. # http://www.fumihiko.takeuchi.name/publications.html # 2007.11.16 Fumihiko Takeuchi ########## # edit the setting below ########## # load igraph library library(igraph) # set working directory setwd("/Users/fumi/Desktop/ldgroup/") # output file names are filenameradix.tagSNPs.txt # where is the r^2 threshold for defining LD groups filenameradix <- "test" # frequency of a neglectable rare haplotype frequencythreshold <- 0.05 # allowable total frequency of neglected haplotypes totfreqneglectedhaplotypesmax <- 0.05 # verbose output during calculation (TRUE or FALSE) verbose <- FALSE ########## Execute below when the input is phased by Beagle foo <- read.table("test.phased", header=FALSE) snp <- as.character(foo[[2]]) cat("#SNPs =", length(snp), "\n") haplotype <- data.frame(t(foo[ ,-(1:2)])) cat("#individuals =", dim(haplotype)[1]/2, "\n") rm(foo) ######### Excecute below when the input is phased by MACH snp <- read.table("test.dat", header=FALSE) snp <- as.character(snp[[2]]) cat("#SNPs =", length(snp), "\n") diplotype <- read.table("test.geno", header=FALSE) diplotype <- diplotype[ ,-(1:2)] if (length(snp) == dim(diplotype)[2]) { cat("#individuals =", dim(diplotype)[1], "\n") } else { cat("Wrong #SNPs in diplotype file,", dim(diplotype)[1], "\n") } separate <- function (x) { c(substr(x, 1, 1), substr(x, 3, 3))} separatev <- function (x) { unlist(lapply(x, separate)) } haplotype <- data.frame(lapply(diplotype, separatev)) rm(diplotype) ########## # main calcualtions ########## # x is n x 2 data.frame rsquare <- function(x) { if ((length(union(x[ ,1],list())) == 1) || (length(union(x[ ,2],list())) == 1)) { return(0) } else { t <- table(x) return( (t[1,1]*t[2,2] - t[1,2]*t[2,1])^2 / (t[1,1]+t[1,2]) / (t[2,1]+t[2,2]) / (t[1,1]+t[2,1]) / (t[1,2]+t[2,2]) ) } } sv <- seq(0, 1, 0.1) totfreqneglectedhaplotypesv <- numeric(length(sv)) names(totfreqneglectedhaplotypesv) <- sv # Step 2 ########## iteration for s starts here for (s in sv) { cat("\n##########\niteration for s =", s, "\n") # Step 3 if (verbose) { cat("calculating LD groups ... ") } g <- graph.empty(n = length(snp), directed = FALSE) # index for vertices starts from 0 for (i in 1:(length(snp) - 1)) { for (j in (i+1):(min(i+100,length(snp)))) { if (rsquare(haplotype[ ,c(i,j)]) > s) { g <- add.edges(g,c(i-1,j-1)) } } } cg <- clusters(g) LDgroups <- lapply(1:length(cg$csize) - 1, (function (x) { which(cg$membership == x) }) ) if (verbose) { cat("done.\n") } # Step 5 haplofreqsall <- function (snps) { table(apply(as.matrix(haplotype[ ,snps]), 1, (function (x) paste(x, collapse="")))) / dim(haplotype)[1] } completeLDsubgroups <- function (mh) { if (verbose) { cat("common haplotypes:\n") for (i in 1:length(mh)) { cat("", names(mh)[i], mh[i], "\n") } } df <- data.frame(t(data.frame(strsplit(names(mh), split = character(0))))) normalized <- unlist(lapply(df, (function (x) {paste(x == x[[1]], collapse=" ")}) )) fac <- factor(normalized) if (verbose) { cat("complete-LD subgroups:\n") tapply(1:length(normalized), fac, (function (x) { v <- rep(0, length(normalized)) v[x] <- 1 cat(" ", v, "\n", sep = "")})) } return(tapply(1:length(normalized), fac, c)) } tagSNPs <- list() totfreqneglectedhaplotypes <- numeric(0) for (i in 1:length(LDgroups)) { if (verbose) { cat("\nLD group", i, "comprising SNPs", LDgroups[[i]], "\n") } hfa <- haplofreqsall(LDgroups[[i]]) mh <- hfa[hfa > frequencythreshold] if (length(mh) == 0) { cat("No common haplotype!\n") tagSNPs[[i]] <- list(LDgroups[[i]]) totfreqneglectedhaplotypes[i] <- 0 } else { tagSNPs[[i]] <- lapply(completeLDsubgroups(mh), (function (x) { (LDgroups[[i]])[x] }) ) names(tagSNPs[[i]]) <- NULL # Step 6 totfreqneglectedhaplotypes[i] <- 1- sum(mh) } } outputfilename <- paste(filenameradix, ".tagSNPs", sub("\\.", "", as.character(s)), ".txt", sep="") output <- character(0) for (i in 1:length(tagSNPs)) { for(j in 1:length(tagSNPs[[i]])) { for (k in tagSNPs[[i]][[j]]) { output <- c(output, i, j, snp[k]) } } } write(output, ncolumns = 3, file = outputfilename, sep = "\t") # Step 6, continued totfreqneglectedhaplotypesv[as.character(s)] <- { t <- totfreqneglectedhaplotypes[ unlist(lapply(LDgroups, (function (x) { length(x)>1 }))) ]; if (length(t)>0) { mean(t) } else { 0 } } } ########## iteration for s ends here # Step 7 cat("Total frequency of neglected haplotypes according to various s,\nthe r^2 threshold for LD grouping, was:\n") print(totfreqneglectedhaplotypesv) i <- length(sv)-1 while (i >= 1) { if (totfreqneglectedhaplotypesv[i] > totfreqneglectedhaplotypesmax) { break } i <- i - 1 } cat("The recommended threshold for tag SNPs selection is s >=", sv[i+1], "\n")