library(fabia) library(permute) source("/seppdata/sepp/linkage/release/scripts/funct.R") load(file="/seppdata/sepp/linkage/release/annotation/individuals.Rda") minruns <- 1 ## min number or runs maxruns <- 100 ## max number or runs snps <- 10000 ## number of SNPs samplesN <- 1000 ## number of individuals ShunkLength <- 5000 ## length suffeled shunks of original data noImplanted <- 1 ## how many haplotypes implanted <- 10 ## in how many samples lengthI <- 20 ## IBD length in kb noOverwrite <- FALSE ## noOverwrite=TRUE ensures that a haplotype is not superimposed by another haplotype write(paste("maxruns: ",maxruns,sep=""),file="dataSimParameters.txt",ncolumns=10) write(paste("snps: ",snps,sep=""),file="dataSimParameters.txt",append=TRUE,ncolumns=10) write(paste("samplesN: ",samplesN,sep=""),file="dataSimParameters.txt",append=TRUE,ncolumns=10) write(paste("ShunkLength: ",ShunkLength,sep=""),file="dataSimParameters.txt",append=TRUE,ncolumns=10) write(paste("noImplanted: ",noImplanted,sep=""),file="dataSimParameters.txt",append=TRUE,ncolumns=10) write(paste("implanted: ",implanted,sep=""),file="dataSimParameters.txt",append=TRUE,ncolumns=10) write(paste("lengthI: ",lengthI,sep=""),file="dataSimParameters.txt",append=TRUE,ncolumns=10) snpsAll <- 3201157 samplesAll <- 1092 haploAll <- 2184 shift <- 5000 intervallAll <- 10000 over <- intervallAll%/%shift N1 <- snpsAll%/%shift endRunAll <- (N1-over+1) # 639 startRunAll <- 0 startRun <- 1 endRun <- 639 nucleotide <- c("A","T","C","G") haploN <- 2*samplesN dataA <- 1:samplesN g2 <- 2*(1:snps) g1 <- g2 -1 s2 <- 2*dataA s1 <- s2-1 id <- as.character(dataA) zeroid <- as.character(rep(0,samplesN)) famID <- id dim(famID) <- c(samplesN,1) ID <- id dim(ID) <- c(samplesN,1) patID <- zeroid dim(patID) <- c(samplesN,1) matID <- zeroid dim(matID) <- c(samplesN,1) sex <- zeroid dim(sex) <- c(samplesN,1) phen <- zeroid dim(phen) <- c(samplesN,1) snpNames <- as.character(1:snps) chr <- rep(1,snps) for (run in minruns:maxruns) { write(noImplanted,file=paste("dataSim",run,"Impl.txt",sep=""),ncolumns=10) posAll <- sample(startRun:endRun,1) start <- posAll*shift end <- start + intervallAll if (end > snpsAll) { end <- snpsAll } pRange <- paste("_",format(start,scientific=FALSE),"_",format(end,scientific=FALSE),sep="") write(pRange,file=paste("dataSimN",run,"Impl.txt",sep=""),ncolumns=10) annot <- read.table(paste("../shuffle",pRange,"_annot.txt",sep=""),header = FALSE, sep = " ", quote = "",as.is = TRUE,skip=2) for (i in 3:9) { annot[[i]] <- gsub(",",";",annot[[i]]) } for (i in 4:5) { annot[[i]] <- gsub("<","A",annot[[i]]) annot[[i]] <- gsub(">","A",annot[[i]]) } pos <- annot[,2] ref <- substr(annot[,4],1,1) alt <- substr(annot[,5],1,1) Dpos <- c(sample(50:150,1),diff(pos)) Zannot <- read.table(paste("/seppdata/sepp/linkage/release/split/ALL.chr1.merged_beagle_mach.20101123.snps_indels_svs.genotypes",pRange,"_annot.txt",sep=""),header = FALSE, sep = " ", quote = "",as.is = TRUE,skip=2) for (i in 3:9) { Zannot[[i]] <- gsub(",",";",Zannot[[i]]) } for (i in 4:5) { Zannot[[i]] <- gsub("<","A",Zannot[[i]]) Zannot[[i]] <- gsub(">","A",Zannot[[i]]) } Zpos <- Zannot[,2] Zref <- substr(Zannot[,4],1,1) Zalt <- substr(Zannot[,5],1,1) ZDpos <- c(sample(50:150,1),diff(Zpos)) alleleI <- matrix(0,haploN,snps) alleleN <- matrix("N",haploN,snps) for (i in 1:haploN) { alleleN[i,] <- ref } file <- paste("../shuffle",pRange,"_mat.txt",sep="") file1 <- paste("/seppdata/sepp/linkage/release/split/ALL.chr1.merged_beagle_mach.20101123.snps_indels_svs.genotypes",pRange,"_mat.txt",sep="") minors <- as.vector(rep(0,haploN)) selectI <- sample(samplesAll,samplesN) selectI <- sort(selectI) readin <- as.vector(unlist(rbind(2*selectI-1,2*selectI))) write(selectI,file=paste("dataSimN",run,"Impl.txt",sep=""),append=TRUE,ncolumns=1000) write(readin,file=paste("dataSimN",run,"Impl.txt",sep=""),append=TRUE,ncolumns=2000) for (i in 1:haploN) { i1 <- readin[i] minors[i] <- as.integer(scan(file,what="integer",nlines=1,skip=3*(i1-1)+2)) tmp <- as.integer(scan(file,what="integer",nlines=1,skip=3*(i1-1)+3)) alleleI[i,tmp] <- 1 alleleN[i,tmp] <- alt[tmp] } Ipos <- diffinv(Dpos)[2:(snps+1)] implantIL <- list() implantNL <- list() startN <- list() endN <- list() fromN <- list() lengthN <- list() for (noIm in 1:noImplanted) { notfoundIBD <- TRUE while (notfoundIBD) { startN[[noIm]] <- sample((snps-500),1) DZpos <- Zpos-Zpos[startN[[noIm]]] endN[[noIm]] <- max(which(DZpos0) if (length(a1)>10) { notfoundIBD <- FALSE } } } physPos <- diffinv(Dpos)[2:(snps+1)] snpMajor <- ref snpMinor <- alt DphysPos <- Dpos freq <- colSums(alleleI)/haploN pos <- physPos pos1 <- pos/1000000 plinkCols <- cbind(famID,ID,patID,matID,sex,phen) plinkmap <- cbind(chr,snpNames,pos1,pos) plinkmap[,1] <- as.integer(plinkmap[,1]) plinkmap[,2] <- as.integer(plinkmap[,2]) plinkmap[,3] <- format(as.double(plinkmap[,3]),nsmall=6) plinkmap[,4] <- as.integer(plinkmap[,4]) plinkLine <- c("FID","IID","PAT","MAT","SEX","PHENOTYPE",snpNames) mcmc <- rbind(pos1,freq) initM <- matrix(0,samplesN,2*snps) freq <- colSums(alleleI)/haploN alleleIimp <- alleleI alleleNimp <- alleleN allinter <- list() allimpl <- list() for (noIm in 1:noImplanted) { notfoundIBD <- TRUE while (notfoundIBD) { notfoundLong <- TRUE while (notfoundLong) { start <- sample((snps-lengthN[[noIm]]-1),1) end <- start+lengthN[[noIm]]-1 if (pos[end]-pos[start]>1.5*ShunkLength) { notfoundLong <- FALSE } } inter <- start:end implantI <- implantIL[[noIm]] allinter[[noIm]] <- inter implantN <- ref[inter] a1 <- which(implantI>0) intt <- inter[a1] implantN[a1] <- alt[intt] implantNL[[noIm]] <- implantN impl <- sample(haploN,implanted) allimpl[[noIm]] <- impl notfoundIBD <- FALSE a2 <- freq[intt] a3 <- which(a2<0.05) if (length(a3)<6) { notfoundIBD <- TRUE } if ((noIm>1)&&(noOverwrite)) { for (no2Im in 1:(noIm-1)) { if ((length(intersect(impl,allimpl[[no2Im]]))>0)&&(length(intersect(inter,allinter[[no2Im]]))>0)) { notfoundIBD <- TRUE } } } } impl=sort(impl) implG <- (impl+1)%/%2 write(impl,file=paste("dataSim",run,"Impl.txt",sep=""),append=TRUE,ncolumns=100) write(implG,file=paste("dataSim",run,"Impl.txt",sep=""),append=TRUE,ncolumns=100) write(inter,file=paste("dataSim",run,"Impl.txt",sep=""),append=TRUE,ncolumns=1000) write(implantI,file=paste("dataSim",run,"Impl.txt",sep=""),append=TRUE,ncolumns=1000) write(implantN,file=paste("dataSim",run,"Impl.txt",sep=""),append=TRUE,ncolumns=1000) intervalImp <- matrix(0,implanted,4) for (i in 1:implanted) { alleleIimp[impl[i],inter] <- implantI alleleNimp[impl[i],inter] <- implantN interImp <- c(start,end,1,lengthN[[noIm]]) intervalImp[i,] <- interImp } write(interImp,file=paste("dataSim",run,"Impl.txt",sep=""),append=TRUE,ncolumns=100) } alleleIimpGeno <- alleleIimp[s1,] + alleleIimp[s2,] dummyL <- 1:snps dummyC <- rep(0,snps) dummy <- as.character(dummyL) annot <- list() annot[[1]] <- dummyL annot[[2]] <- pos annot[[3]] <- snpNames annot[[4]] <- snpMajor annot[[5]] <- snpMinor annot[[6]] <- dummy annot[[7]] <- dummy annot[[8]] <- dummy annot[[9]] <- dummy annot[[10]] <- freq annot[[11]] <- dummyC save(snps,haploN,samplesN,dataA,pos,pos1,snpNames,snpMajor,snpMinor,freq,annot,file=paste("dataSim",run,"Anno.Rda",sep="")) save(impl,implG,inter,implantI,implantN,intervalImp,file=paste("dataSim",run,"Impl.Rda",sep="")) # BEAGLE ######## v1 <- rep(1,haploN) dim(v1) <- c(1,haploN) v0 <- as.numeric(rbind(dataA,dataA)) dim(v0) <- c(1,haploN) dataB1 <- rbind(v0,v1,t(alleleNimp)) col1 <- c("id","BC58",snpNames) dim(col1) <- c((snps+2),1) col2 <- rep("M",snps) col2 <- c("I","A",col2) dim(col2) <- c((snps+2),1) dataB <- cbind(col2,col1,dataB1) write.table(dataB,file=paste("dataSim",run,"beagle.txt",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE) # PLINK ####### dataP1 <- matrix("character",nrow=samplesN,ncol=2*snps) dataP1[,g1] <- alleleNimp[s1,] dataP1[,g2] <- alleleNimp[s2,] dataP <- cbind(plinkCols,dataP1) write.table(dataP,file=paste("dataSim",run,"plink.ped",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE) write.table(plinkmap,file=paste("dataSim",run,"plink.map",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE,sep="\t") write.table(plinkCols,file=paste("dataSim",run,"plink.fam",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE,sep="\t") #MCMC ##### write.table(alleleIimpGeno,file=paste("dataSim",run,"mcmc.genotype",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE,sep="\t") write.table(mcmc,file=paste("dataSim",run,"mcmc.posmaf",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE,sep="\t") write.table(initM,file=paste("dataSim",run,"mcmc.initz",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE,sep="\t") #RELATE ######## write.table((alleleIimpGeno+1),file=paste("dataSim",run,"relate.geno",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE) write.table(pos1,file=paste("dataSim",run,"relate.pos",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE) write.table(chr,file=paste("dataSim",run,"relate.chr",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE) # fastPHASE ####### write(as.integer(samplesN),file=paste("dataSim",run,"fastPHASE.txt",sep=""),ncolumns=10) write(as.integer(snps),file=paste("dataSim",run,"fastPHASE.txt",sep=""),append=TRUE,ncolumns=10) dataFP <- matrix("",nrow=3*samplesN,ncol=snps) f3 <- 3*dataA f2 <- f3-1 f1 <- f3-2 dataInd <- matrix("",nrow=samplesN,ncol=snps) dataInd[,1] <- paste("# id ",dataA) temp1 <- as.matrix(alleleIimpGeno) temp1[which(temp1>1,arr.ind=TRUE)] <- 1 temp2 <- as.matrix(alleleIimpGeno) temp2 <- temp2-1 temp2[which(temp2<0,arr.ind=TRUE)] <- 0 dataFP[f1,] <- dataInd dataFP[f2,] <- temp2 dataFP[f3,] <- temp1 dataFP[which(dataFP=="0",arr.ind=TRUE)] <- "0 " dataFP[which(dataFP=="1",arr.ind=TRUE)] <- "1 " write.table(dataFP,file=paste("dataSim",run,"fastPHASE.txt",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE,append=TRUE,sep="") #FABIA ###### write(as.integer(haploN),file=paste("dataSim",run,"fabia.txt",sep=""),ncolumns=10) write(as.integer(snps),file=paste("dataSim",run,"fabia.txt",sep=""),append=TRUE,ncolumns=10) for (i in 1:haploN) { a1 <- which(alleleIimp[i,]>0.01) al <- length(a1) b1 <- alleleIimp[i,a1] a1 <- a1 - 1 dim(a1) <- c(1,al) dim(b1) <- c(1,al) write(al,file=paste("dataSim",run,"fabia.txt",sep=""),append=TRUE,ncolumns=10) write(a1,file=paste("dataSim",run,"fabia.txt",sep=""),append=TRUE,ncolumns=10000) write(format(as.double(b1),nsmall=1),file=paste("dataSim",run,"fabia.txt",sep=""),append=TRUE,ncolumns=10000) } }