Accueil

Nom du cookie	Finalité	Durée de conservation
Cookies de sessions CAS et PHP	Identifiants de connexion, sécurisation de session	Session
Tarteaucitron	Sauvegarde vos choix en matière de consentement des cookies	12 mois

Nom du cookie	Finalité	Durée de conservation
atid	Tracer le parcours du visiteur afin d’établir les statistiques de visites.	13 mois
atuserid	Stocker l'ID anonyme du visiteur qui se lance dès la première visite du site	13 mois
atidvisitor	Recenser les numsites (identifiants unique d'un site) vus par le visiteur et stockage des identifiants du visiteur.	13 mois

R script for computation of 2D-SFS from .arp files

This script allows one to take multiple-population SNP genotype data in the .arp (Arlequin) format and compute 2D-SFS for each pair of populations. All scripts are provided in their current state of development with no warranty that they will work in all cases, and only for reference purposes. They can be used and modified by citing the Author (see script header). Example files can be obtained from the Author upon request.

#this script was written by Ivan Scotti, INRA, URFM
#Site Agroparc, Domaine Saint-Paul
#84914 AVIGNON Cedex 9
#you are free to use and modify it, provided you quote the author
#for questions, please contact Ivan Scotti at
#ivan.scotti[at]paca.inra.fr
#15 July 2015
#
fromArpToSFS<-function(infile,nChrom,sampleSize,nPop)
    {
#starts from a .arp file with nPop population samples and outputs
#all the pairwise 2D-SFS
#
#infile is a .arp file produced by fsc251 or fsc252
#nChrom is the number of 'chromosomes' simulated
#sampleSize is number of hamploid genotypes per population
#(assuming all pops have same numer of samples)
#values:
#minAllFreq.df: data frame with frequencies and contig id
#MAF2DSFS: data frame containing the 2D-SFS
#example arguments
# infile="waSNP10_maxLmodif_1_1.arp"
# nChrom=104
# sampleSize=40
# nPop=4
#routine for extracting numbers of SNPs per chrom starts here
#readLines takes all lines including commented ones
#starting point is a .arp file produced by fsc251 or fsc252
test1<-readLines(infile)
#get lines with SNP counts per chromosome
test2<-test1[grep(pattern="polymorphic positions",test1)]
#splitting strings to get the numbers
test3<-strsplit(test2,split=" ")
#getting the second item of each element of the list
#notice FUN="[["
snpPerSeq<-as.integer(unlist(lapply(test3, "[[", 2)))
#create a vector of identifiers of "contigs"
#notice the times= argument
contigIdent<-rep(1:nChrom,times = snpPerSeq)
#checking numbers
#table(rep(1:2883,times=snpPerSeq))

#routine to obtain SNP frequencies
#using the underscore to identify lines with genotypes
test4<-test1[grep(pattern="_",test1)]
test5<-strsplit(test4,split="\t")
test6<-unlist(lapply(test5, "[[", 3))
test7<-strsplit(test6,split="")
#635
#identifying the N's
monomorph<-which(test7[[1]]=="N")
#197
#counting nb of elements
nLoci<-length(test7[[1]])
#building a matrix, one col per SNP
test8<-matrix(data=unlist(test7),nrow=nPop*sampleSize,ncol=nLoci,byrow=T)
#removing the N's
test9<-test8[,-monomorph]
test9.df<-as.data.frame(test9)
#need to add all levels (0:3) to all columns
test10.df<-lapply(X=test9.df,FUN=factor,levels=0:3)
#dim(test10.df)
#converting to data frame - works well with a uniformly structured list
test10.df<-as.data.frame(test10.df)
#splitting into populations
#this makes a list of four data frames with pop names
test10.list<-list()
for (i in 1:nPop)
{
    test10.list[[i]]<-test10.df[((i-1)*sampleSize+1):((i-1)*sampleSize+sampleSize),]
}
names(test10.list)<-paste("test10Pop",1:nPop,sep="")
#dim(test10.list[[1]])
#computing frequency tables
#frequencies in the global population
globalCount<-lapply(X=test10.df,FUN=table)
#frequencies in individual populations
popCounts<-list()
for (i in 1:nPop)
{
   popCounts[[i]]<-lapply(X=test10.list[[i]],FUN=table)
}
names(popCounts)<-paste("pop",1:nPop,"count",sep="")
#turning lists into tables (matrices) and then data frame
#first for global count
globalCount.mt<-matrix(nrow=length(globalCount),ncol=4,
                         data=unlist(globalCount),byrow=T)
#replacing 0 values with NA in global count
#(necessary to seek the global MAF, otherwise it will be zero for all loci
globalCount.mt[globalCount.mt==0]<-NA
#and then for single populations
popCounts2<-list()
for (i in 1:nPop)
{
    popCounts2[[i]]<-matrix(nrow=length(popCounts[[i]]),ncol=4,
                           data=unlist(popCounts[[i]]),byrow=T)
}
#cbinding all populations and the global population
#note the do.call() used to bind all the element of the list - do.call()
#applies to LISTS of arguments, so it is made exactly to handle this type of case
counts.df<-as.data.frame(cbind(do.call(cbind,popCounts2),globalCount.mt))
#names for columns: pops 0...nPop, variants 0 through 3
names(counts.df)<-paste(rep(c(paste("pop",1:nPop,"_",sep=""),"global_"),each=4),rep(0:3,(nPop+1)),sep="")
#from here on, production of pairwise 2D-SFS's
#the layout is generalised from the work done on waSNP
#
#starting the extraction of MAF
#identifying the minor allele for each locus
#i=2
#y=counts.df
#creating a function to use with apply:
#y is the data frame containing single-pop
#and global frequencies
minAllFreqCalc<-function(y)
{
    #identifying the minor allele in the global count
    #notice the index [1] that takes (arbitrarily)
    #the first allele in case of tie. This is needed
    #because for even allele frequencies, two values
    #will be returned by which()
    #also notice: as.integer() needed because otherwise
    #R will interpret erratically the min() command
    minAllPos<-which(as.integer(y[(4*nPop+1):(4*nPop+4)])==min(as.integer(y[(4*nPop+1):(4*nPop+4)]),na.rm=T))[1]
    #extracting the frequencies in each pop for that allele:
    #will take in each row the column corresponding
    #to the minor allele in the first pop, then move
    #four columns right and take the same allele in the
    #second pop
    as.integer(y[seq(minAllPos,4*nPop,4)])
}
#embedding into apply
#notice the rather convolute way to obtain a data frame with characters
#applying as.data.frame directly
minAllFreq.df<-as.data.frame(t(apply(X=counts.df,FUN=minAllFreqCalc,MARGIN=1)))
names(minAllFreq.df)<-paste("MAFpop",1:nPop,sep="")
#importing contig identity
minAllFreq.df<-cbind(contigIdent,minAllFreq.df)
#before proceeding with the construction and modification of the 2D-SFS,
#loci with even frequencies must be identified
#and their counts will be "split" later
#first, get the global count of Minor Alleles:
minAllGlobalCounts<-apply(X=minAllFreq.df[,2:(nPop+1)],FUN=sum,MARGIN=1)
#paste it to its df
minAllFreq.df<<-cbind(minAllFreq.df,minAllGlobalCounts)
#looking for loci with even freqs, i.e. MAF=nPop*sampleSize*0.5
#length(which(minAllGlobalCounts==nPop*sampleSize*0.5))
#there are N of those.
#computing the "raw" 2D-SFS's
#generating a list to store the matrices
multiSFS.list<-list()
#resetting the list index
k<-0
for (i in 1:(nPop-1))
{
    for (j in (i+1):nPop)
    {
      #checking indexes (not used in the final script)
      #print(paste(i,j))
      #generating pairwise SFS for populations i and j
      #remember: first column is contig identity
      MAF2DSFS<-as.matrix(table(minAllFreq.df[,c(i+1,j+1)]))
      #completing 2DSFS with missing "marginal" columns and rows
      #NB this should not be necessary for multi-SFS, as at least
      #one cell should be (statistically) filled for all rows and all columns
      while(dim(MAF2DSFS)[1]<(sampleSize+1)) MAF2DSFS<-rbind(MAF2DSFS,rep(0,dim(MAF2DSFS)[2]))
      while(dim(MAF2DSFS)[2]<(sampleSize+1)) MAF2DSFS<-cbind(MAF2DSFS,rep(0,dim(MAF2DSFS)[1]))
      colnames(MAF2DSFS)<-paste("d0_",0:sampleSize,sep="")
      row.names(MAF2DSFS)<-paste("d1_",0:sampleSize,sep="")
      #curious to see the "fixed difference" loci? here they are
#       subset(minAllFreq.df,MAFpop0==5)
#       subset(minAllFreq.df,MAFpop1==5)
      #splitting in half the counts for alleles with global freq = 0.5
      for (t in 1:(dim(MAF2DSFS)[1]-1))
      {
        splitSum<-(MAF2DSFS[(t+1),(dim(MAF2DSFS)[1]-t)]+MAF2DSFS[(dim(MAF2DSFS)[1]-t),(t+1)])

Date de modification : 22 juin 2023 | Date de création : 16 juillet 2015 | Rédaction : Ivan Scotti