library(Matrix) library(fabia) source("/seppdata/sepp/linkage/release/scripts/funct.R") load("/seppdata/sepp/linkage/SFSsimulation/data/allData.Rda") ##obtained: l0,refA,altA,posA,t1,GalleleI minruns <- 1 ## min number or runs maxruns <- 100 ## max number or runs snps <- 10000 ## number of SNPs samplesN <- 1000 ## number of individuals ## avSNV in 1000 individuals: 418000 ## DNA length: 44094874 ## av. SNV dist in 1000 individuals: 44094874/41800 = 105.4901 ShunkLength <- 5000 noImplanted <- 1 ## how many IBD segments implanted <- 10 ## in how many samples lengthI <- 20 ## IBD length in kb noOverwrite <- FALSE ## noOverwrite=TRUE ensures that a haplotype is not superimposed by another haplotype write(paste("maxruns: ",maxruns,sep=""),file="dataSimParameters.txt",ncolumns=10) write(paste("snps: ",snps,sep=""),file="dataSimParameters.txt",append=TRUE,ncolumns=10) write(paste("samplesN: ",samplesN,sep=""),file="dataSimParameters.txt",append=TRUE,ncolumns=10) write(paste("ShunkLength: ",ShunkLength,sep=""),file="dataSimParameters.txt",append=TRUE,ncolumns=10) write(paste("noImplanted: ",noImplanted,sep=""),file="dataSimParameters.txt",append=TRUE,ncolumns=10) write(paste("implanted: ",implanted,sep=""),file="dataSimParameters.txt",append=TRUE,ncolumns=10) write(paste("lengthI: ",lengthI,sep=""),file="dataSimParameters.txt",append=TRUE,ncolumns=10) snpsAll <- 1148822 samplesAll <- 5000 haploAll <- 10000 nucleotide <- c("A","T","C","G") haploN <- 2*samplesN dataA <- 1:samplesN g2 <- 2*(1:snps) g1 <- g2 -1 s2 <- 2*dataA s1 <- s2-1 id <- as.character(dataA) zeroid <- as.character(rep(0,samplesN)) famID <- id dim(famID) <- c(samplesN,1) ID <- id dim(ID) <- c(samplesN,1) patID <- zeroid dim(patID) <- c(samplesN,1) matID <- zeroid dim(matID) <- c(samplesN,1) sex <- zeroid dim(sex) <- c(samplesN,1) phen <- zeroid dim(phen) <- c(samplesN,1) snpNames <- as.character(1:snps) chr <- rep(1,snps) for (run in minruns:maxruns) { write(noImplanted,file=paste("dataSim",run,"Impl.txt",sep=""),ncolumns=10) selectI <- sample(samplesAll,samplesN) selectI <- sort(selectI) readin <- as.vector(unlist(rbind(2*selectI-1,2*selectI))) coSu <- colSums(GalleleI[readin,]) coSuW <- which(coSu>0) Lsnps <- length(coSuW) RalleleI <- GalleleI[readin,coSuW] posB <- posA[coSuW] refB <- refA[coSuW] altB <- altA[coSuW] write(selectI,file=paste("dataSimN",run,"Impl.txt",sep=""),append=TRUE,ncolumns=1000) write(readin,file=paste("dataSimN",run,"Impl.txt",sep=""),append=TRUE,ncolumns=2000) startSNV <- sample((Lsnps-snps-10),1) endSNV <- startSNV+snps-1 rangeSNVs <- startSNV:endSNV pos <- posB[rangeSNVs] ref <- refB[rangeSNVs] alt <- altB[rangeSNVs] alleleI <- matrix(0,haploN,snps) alleleN <- matrix("N",haploN,snps) for (i in 1:haploN) { alleleN[i,] <- ref } minors <- as.vector(rep(0,haploN)) alleleI <- RalleleI[,rangeSNVs] for (i in 1:haploN) { pp <- which(alleleI[i,]>0) minors[i] <- length(pp) alleleN[i,pp] <- alt[pp] } implantIL <- list() implantNL <- list() startN <- list() endN <- list() fromN <- list() atN <- list() lengthN <- list() for (noIm in 1:noImplanted) { notfoundIBD <- TRUE while (notfoundIBD) { startN[[noIm]] <- sample((snps-500),1) DZpos <- pos-pos[startN[[noIm]]] endN[[noIm]] <- max(which(DZpos0) if (length(a1)>10) { notfoundIBD <- FALSE } } } physPos <- pos snpMajor <- ref snpMinor <- alt freq <- colSums(alleleI)/haploN pos1 <- pos/1000000 plinkCols <- cbind(famID,ID,patID,matID,sex,phen) plinkmap <- cbind(chr,snpNames,pos1,pos) plinkmap[,1] <- as.integer(plinkmap[,1]) plinkmap[,2] <- as.integer(plinkmap[,2]) plinkmap[,3] <- format(as.double(plinkmap[,3]),nsmall=6) plinkmap[,4] <- as.integer(plinkmap[,4]) plinkLine <- c("FID","IID","PAT","MAT","SEX","PHENOTYPE",snpNames) mcmc <- rbind(pos1,freq) initM <- matrix(0,samplesN,2*snps) freq <- colSums(alleleI)/haploN alleleIimp <- alleleI alleleNimp <- alleleN allinter <- list() allimpl <- list() for (noIm in 1:noImplanted) { notfoundIBD <- TRUE while (notfoundIBD) { notfoundLong <- TRUE while (notfoundLong) { start <- sample((snps-lengthN[[noIm]]-1),1) end <- start+lengthN[[noIm]]-1 if (pos[end]-pos[start]>1.5*ShunkLength) { notfoundLong <- FALSE } } inter <- start:end implantI <- implantIL[[noIm]] allinter[[noIm]] <- inter implantN <- ref[inter] a1 <- which(implantI>0) intt <- inter[a1] implantN[a1] <- alt[intt] implantNL[[noIm]] <- implantN impl <- sample(haploN,implanted) allimpl[[noIm]] <- impl notfoundIBD <- FALSE a2 <- freq[intt] a3 <- which(a2<0.03) if (length(a3)<9) { notfoundIBD <- TRUE } if ((noIm>1)&&(noOverwrite)) { for (no2Im in 1:(noIm-1)) { if ((length(intersect(impl,allimpl[[no2Im]]))>0)&&(length(intersect(inter,allinter[[no2Im]]))>0)) { notfoundIBD <- TRUE } } } } impl=sort(impl) implG <- (impl+1)%/%2 write(impl,file=paste("dataSim",run,"Impl.txt",sep=""),append=TRUE,ncolumns=100) write(implG,file=paste("dataSim",run,"Impl.txt",sep=""),append=TRUE,ncolumns=100) write(inter,file=paste("dataSim",run,"Impl.txt",sep=""),append=TRUE,ncolumns=1000) write(implantI,file=paste("dataSim",run,"Impl.txt",sep=""),append=TRUE,ncolumns=1000) write(implantN,file=paste("dataSim",run,"Impl.txt",sep=""),append=TRUE,ncolumns=1000) intervalImp <- matrix(0,implanted,4) for (i in 1:implanted) { alleleIimp[impl[i],inter] <- implantI alleleNimp[impl[i],inter] <- implantN interImp <- c(start,end,1,lengthN[[noIm]]) intervalImp[i,] <- interImp } write(interImp,file=paste("dataSim",run,"Impl.txt",sep=""),append=TRUE,ncolumns=100) } alleleIimpGeno <- alleleIimp[s1,] + alleleIimp[s2,] dummyL <- 1:snps dummyC <- rep(0,snps) dummy <- as.character(dummyL) annot <- list() annot[[1]] <- dummyL annot[[2]] <- pos annot[[3]] <- snpNames annot[[4]] <- snpMajor annot[[5]] <- snpMinor annot[[6]] <- dummy annot[[7]] <- dummy annot[[8]] <- dummy annot[[9]] <- dummy annot[[10]] <- freq annot[[11]] <- dummyC save(snps,haploN,samplesN,dataA,pos,pos1,snpNames,snpMajor,snpMinor,freq,annot,file=paste("dataSim",run,"Anno.Rda",sep="")) save(impl,implG,inter,implantI,implantN,intervalImp,file=paste("dataSim",run,"Impl.Rda",sep="")) # BEAGLE ######## v1 <- rep(1,haploN) dim(v1) <- c(1,haploN) v0 <- as.numeric(rbind(dataA,dataA)) dim(v0) <- c(1,haploN) dataB1 <- rbind(v0,v1,t(alleleNimp)) col1 <- c("id","BC58",snpNames) dim(col1) <- c((snps+2),1) col2 <- rep("M",snps) col2 <- c("I","A",col2) dim(col2) <- c((snps+2),1) dataB <- cbind(col2,col1,dataB1) write.table(dataB,file=paste("dataSim",run,"beagle.txt",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE) # PLINK ####### dataP1 <- matrix("character",nrow=samplesN,ncol=2*snps) dataP1[,g1] <- alleleNimp[s1,] dataP1[,g2] <- alleleNimp[s2,] dataP <- cbind(plinkCols,dataP1) write.table(dataP,file=paste("dataSim",run,"plink.ped",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE) write.table(plinkmap,file=paste("dataSim",run,"plink.map",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE,sep="\t") write.table(plinkCols,file=paste("dataSim",run,"plink.fam",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE,sep="\t") #MCMC ##### write.table(as.matrix(alleleIimpGeno),file=paste("dataSim",run,"mcmc.genotype",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE,sep="\t") write.table(mcmc,file=paste("dataSim",run,"mcmc.posmaf",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE,sep="\t") write.table(initM,file=paste("dataSim",run,"mcmc.initz",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE,sep="\t") #RELATE ######## write.table((as.matrix(alleleIimpGeno)+1),file=paste("dataSim",run,"relate.geno",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE) write.table(pos1,file=paste("dataSim",run,"relate.pos",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE) write.table(chr,file=paste("dataSim",run,"relate.chr",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE) # fastPHASE ####### write(as.integer(samplesN),file=paste("dataSim",run,"fastPHASE.txt",sep=""),ncolumns=10) write(as.integer(snps),file=paste("dataSim",run,"fastPHASE.txt",sep=""),append=TRUE,ncolumns=10) dataFP <- matrix("",nrow=3*samplesN,ncol=snps) f3 <- 3*dataA f2 <- f3-1 f1 <- f3-2 dataInd <- matrix("",nrow=samplesN,ncol=snps) dataInd[,1] <- paste("# id ",dataA) temp1 <- as.matrix(alleleIimpGeno) temp1[which(temp1>1,arr.ind=TRUE)] <- 1 temp2 <- as.matrix(alleleIimpGeno) temp2 <- temp2-1 temp2[which(temp2<0,arr.ind=TRUE)] <- 0 dataFP[f1,] <- dataInd dataFP[f2,] <- temp2 dataFP[f3,] <- temp1 dataFP[which(dataFP=="0",arr.ind=TRUE)] <- "0 " dataFP[which(dataFP=="1",arr.ind=TRUE)] <- "1 " write.table(dataFP,file=paste("dataSim",run,"fastPHASE.txt",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE,append=TRUE,sep="") #FABIA ###### write(as.integer(haploN),file=paste("dataSim",run,"fabia.txt",sep=""),ncolumns=10) write(as.integer(snps),file=paste("dataSim",run,"fabia.txt",sep=""),append=TRUE,ncolumns=10) for (i in 1:haploN) { a1 <- which(alleleIimp[i,]>0.01) al <- length(a1) b1 <- alleleIimp[i,a1] a1 <- a1 - 1 dim(a1) <- c(1,al) dim(b1) <- c(1,al) write(al,file=paste("dataSim",run,"fabia.txt",sep=""),append=TRUE,ncolumns=10) write(a1,file=paste("dataSim",run,"fabia.txt",sep=""),append=TRUE,ncolumns=10000) write(format(as.double(b1),nsmall=1),file=paste("dataSim",run,"fabia.txt",sep=""),append=TRUE,ncolumns=10000) } }