R script for computation of 2D-SFS from .arp files

R script for computation of 2D-SFS from .arp files

This script allows one to take multiple-population SNP genotype data in the .arp (Arlequin) format and compute 2D-SFS for each pair of populations. All scripts are provided in their current state of development with no warranty that they will work in all cases, and only for reference purposes. They can be used and modified by citing the Author (see script header). Example files can be obtained from the Author upon request.

#this script was written by Ivan Scotti, INRA, URFM
#Site Agroparc, Domaine Saint-Paul
#84914 AVIGNON Cedex 9
#you are free to use and modify it, provided you quote the author
#for questions, please contact Ivan Scotti at
#ivan.scotti[at]paca.inra.fr
#15 July 2015
#
fromArpToSFS<-function(infile,nChrom,sampleSize,nPop)
    {
  #starts from a .arp file with nPop population samples and outputs
  #all the pairwise 2D-SFS
  #
  #infile is a .arp file produced by fsc251 or fsc252
  #nChrom is the number of 'chromosomes' simulated
  #sampleSize is number of hamploid genotypes per population
  #(assuming all pops have same numer of samples)
  #values:
  #minAllFreq.df: data frame with frequencies and contig id
  #MAF2DSFS: data frame containing the 2D-SFS
  #example arguments
  # infile="waSNP10_maxLmodif_1_1.arp"
  # nChrom=104
  # sampleSize=40
  # nPop=4
  #routine for extracting numbers of SNPs per chrom starts here
  #readLines takes all lines including commented ones
  #starting point is a .arp file produced by fsc251 or fsc252
  test1<-readLines(infile)
  #get lines with SNP counts per chromosome
  test2<-test1[grep(pattern="polymorphic positions",test1)]
  #splitting strings to get the numbers
  test3<-strsplit(test2,split=" ")
  #getting the second item of each element of the list
  #notice FUN="[["
  snpPerSeq<-as.integer(unlist(lapply(test3, "[[", 2)))
  #create a vector of identifiers of "contigs"
  #notice the times= argument
  contigIdent<-rep(1:nChrom,times = snpPerSeq)
  #checking numbers
  #table(rep(1:2883,times=snpPerSeq))
 
  #routine to obtain SNP frequencies
  #using the underscore to identify lines with genotypes
  test4<-test1[grep(pattern="_",test1)]
  test5<-strsplit(test4,split="\t")
  test6<-unlist(lapply(test5, "[[", 3))
  test7<-strsplit(test6,split="")
  #635
  #identifying the N's
  monomorph<-which(test7[[1]]=="N")
  #197
  #counting nb of elements
  nLoci<-length(test7[[1]])
  #building a matrix, one col per SNP
  test8<-matrix(data=unlist(test7),nrow=nPop*sampleSize,ncol=nLoci,byrow=T)
  #removing the N's
  test9<-test8[,-monomorph]
  test9.df<-as.data.frame(test9)
  #need to add all levels (0:3) to all columns
  test10.df<-lapply(X=test9.df,FUN=factor,levels=0:3)
  #dim(test10.df)
  #converting to data frame - works well with a uniformly structured list
  test10.df<-as.data.frame(test10.df)
  #splitting into populations
  #this makes a list of four data frames with pop names
  test10.list<-list()
  for (i in 1:nPop)
  {
    test10.list[[i]]<-test10.df[((i-1)*sampleSize+1):((i-1)*sampleSize+sampleSize),]
  }
  names(test10.list)<-paste("test10Pop",1:nPop,sep="")
  #dim(test10.list[[1]])
  #computing frequency tables
  #frequencies in the global population
  globalCount<-lapply(X=test10.df,FUN=table)
  #frequencies in individual populations
  popCounts<-list()
  for (i in 1:nPop)
  {
   popCounts[[i]]<-lapply(X=test10.list[[i]],FUN=table)
  }
  names(popCounts)<-paste("pop",1:nPop,"count",sep="")
  #turning lists into tables (matrices) and then data frame
  #first for global count
  globalCount.mt<-matrix(nrow=length(globalCount),ncol=4,
                         data=unlist(globalCount),byrow=T)
  #replacing 0 values with NA in global count
  #(necessary to seek the global MAF, otherwise it will be zero for all loci
  globalCount.mt[globalCount.mt==0]<-NA
  #and then for single populations
  popCounts2<-list()
  for (i in 1:nPop)
  {
    popCounts2[[i]]<-matrix(nrow=length(popCounts[[i]]),ncol=4,
                           data=unlist(popCounts[[i]]),byrow=T)
  }
  #cbinding all populations and the global population
  #note the do.call() used to bind all the element of the list - do.call()
  #applies to LISTS of arguments, so it is made exactly to handle this type of case
  counts.df<-as.data.frame(cbind(do.call(cbind,popCounts2),globalCount.mt))
  #names for columns: pops 0...nPop, variants 0 through 3
  names(counts.df)<-paste(rep(c(paste("pop",1:nPop,"_",sep=""),"global_"),each=4),rep(0:3,(nPop+1)),sep="")
  #from here on, production of pairwise 2D-SFS's
  #the layout is generalised from the work done on waSNP
  #
  #starting the extraction of MAF
  #identifying the minor allele for each locus
  #i=2
  #y=counts.df
  #creating a function to use with apply:
  #y is the data frame containing single-pop
  #and global frequencies
  minAllFreqCalc<-function(y)
  {
    #identifying the minor allele in the global count
    #notice the index [1] that takes (arbitrarily)
    #the first allele in case of tie. This is needed
    #because for even allele frequencies, two values
    #will be returned by which()
    #also notice: as.integer() needed because otherwise
    #R will interpret erratically the min() command
    minAllPos<-which(as.integer(y[(4*nPop+1):(4*nPop+4)])==min(as.integer(y[(4*nPop+1):(4*nPop+4)]),na.rm=T))[1]
    #extracting the frequencies in each pop for that allele:
    #will take in each row the column corresponding
    #to the minor allele in the first pop, then move
    #four columns right and take the same allele in the
    #second pop
    as.integer(y[seq(minAllPos,4*nPop,4)])
  }
  #embedding into apply
  #notice the rather convolute way to obtain a data frame with characters
  #applying as.data.frame directly
  minAllFreq.df<-as.data.frame(t(apply(X=counts.df,FUN=minAllFreqCalc,MARGIN=1)))
  names(minAllFreq.df)<-paste("MAFpop",1:nPop,sep="")
  #importing contig identity
  minAllFreq.df<-cbind(contigIdent,minAllFreq.df)
  #before proceeding with the construction and modification of the 2D-SFS,
  #loci with even frequencies must be identified
  #and their counts will be "split" later
  #first, get the global count of Minor Alleles:
  minAllGlobalCounts<-apply(X=minAllFreq.df[,2:(nPop+1)],FUN=sum,MARGIN=1)
  #paste it to its df
  minAllFreq.df<<-cbind(minAllFreq.df,minAllGlobalCounts)
  #looking for loci with even freqs, i.e. MAF=nPop*sampleSize*0.5
  #length(which(minAllGlobalCounts==nPop*sampleSize*0.5))
  #there are N of those.
  #computing the "raw" 2D-SFS's
  #generating  a list to store the matrices
  multiSFS.list<-list()
  #resetting the list index
  k<-0
  for (i in 1:(nPop-1))
  {
    for (j in (i+1):nPop)
    {
      #checking indexes (not used in the final script)
      #print(paste(i,j))
      #generating pairwise SFS for populations i and j
      #remember: first column is contig identity
      MAF2DSFS<-as.matrix(table(minAllFreq.df[,c(i+1,j+1)]))
      #completing 2DSFS with missing "marginal" columns and rows
      #NB this should not be necessary for multi-SFS, as at least
      #one cell should be (statistically) filled for all rows and all columns
      while(dim(MAF2DSFS)[1]<(sampleSize+1)) MAF2DSFS<-rbind(MAF2DSFS,rep(0,dim(MAF2DSFS)[2]))
      while(dim(MAF2DSFS)[2]<(sampleSize+1)) MAF2DSFS<-cbind(MAF2DSFS,rep(0,dim(MAF2DSFS)[1]))
      colnames(MAF2DSFS)<-paste("d0_",0:sampleSize,sep="")
      row.names(MAF2DSFS)<-paste("d1_",0:sampleSize,sep="")
      #curious to see the "fixed difference" loci? here they are
#       subset(minAllFreq.df,MAFpop0==5)
#       subset(minAllFreq.df,MAFpop1==5)
      #splitting in half the counts for alleles with global freq = 0.5
      for (t in 1:(dim(MAF2DSFS)[1]-1))
      {
        splitSum<-(MAF2DSFS[(t+1),(dim(MAF2DSFS)[1]-t)]+MAF2DSFS[(dim(MAF2DSFS)[1]-t),(t+1)])

Date de modification : 22 juin 2023 | Date de création : 16 juillet 2015 | Rédaction : Ivan Scotti