#' ---
#' title: "Script to simulate Octomom copy number variation over time in a mixed population"
#' author: "Luis Teixeira - Host microorganism interactions lab - Instituto Gulbenkian de Ciencia"
#' ---

set.seed(33)

#days to test
day<-c(0,5,10,15,20,25,30,35,40)
dayl<-length(day)


#SURVIVAL DATA

library(survival)

survival=read.csv("S1_Dataset.csv", head=T, sep=",")

#get survival percentages of each Octomom copy number at each day from survival curves
surv2=survfit(Surv(Time,Status)~Wolbachia_type, data=subset(survival,survival$Wolbachia_type=="2"))
#plot(model2)
survprob2 <- summary(surv2,time=day)$surv
survprob2

surv12=survfit(Surv(Time,Status)~Wolbachia_type, data=subset(survival,survival$Wolbachia_type=="12"))
#plot(model12)
survprob12 <- summary(surv12,time=day)$surv
survprob12


#make matrix to calculate percentage of each Octomom copy number at each day
two<-rep(0,dayl)
twelve<-rep(0,dayl)
surv<-data.frame(day,two,twelve)

surv$two[1:length(survprob2)]<-survprob2
surv$twelve[1:length(survprob12)]<-survprob12

surv$perc_2<-surv$two/(surv$two+surv$twelve)


#survival curve data has initial higher mortality in 2 copies Octomom than 12 copies Octomom
#this is due to experimental noise
#to make simulation more stringet set initial survival of 2 copies to 100%
#results do not change
#surv[2:4,2]<-c(1,1,1)


#WOLBACHIA LEVELS DATA

levels=read.csv("S2_Dataset.csv", head=T, sep=",")
levels$octomom<-as.factor(levels$octomom)

#transform Time data
levels$time<-gsub(" .*$","",levels$time)
levels$time<-as.numeric(levels$time)

#subset to different Octomom levels
levels2<-subset(levels,octomom=="2")
levels12<-subset(levels,octomom=="12")

#model for 2 copies
model_2<-lm(log(levels)~time,data=levels2)
growth_2<-as.numeric(model_2$coefficients)


#model for 12 copies
model_12<-lm(log(levels)~time,data=levels12)
growth_12<-as.numeric(model_12$coefficients)


#PRODUCE table with estimates of wolbachia levels and (octomom cn*levels)

#table for 2 copies
octo<-rep(2,dayl)
mlevels_2<-data.frame(day,octo)
mlevels_2$id<-with(mlevels_2,paste(day,octo,sep="_"))
mlevels_2$level<-exp(growth_2[1]+(growth_2[2]*mlevels_2$day))

#table for 12 copies
octo<-rep(12,dayl)
mlevels_12<-data.frame(day,octo)
mlevels_12$id<-with(mlevels_12,paste(day,octo,sep="_"))
mlevels_12$level<-exp(growth_12[1]+(growth_12[2]*mlevels_12$day))

#make table with both
mlevels<-rbind(mlevels_2,mlevels_12)

#make levels and octomom number multiplication
mlevels$comb<-with(mlevels,octo*level)


#SAMPLE m samples of n individuals with 2 or 12 octomom copies from survival estimates

#number of individuals per sample
n=5
#number of samples per time point
m=30


#create data frame to place generated values
octomom<-rep(0,dayl*m*n)
sample<-rep(0,dayl*m*n)
time<-rep(0,dayl*m*n)
simsamples<-data.frame(time,sample,octomom)

for (i in 1:dayl) {
	perc_2<-subset(surv,day==day[i])$perc_2
	for (j in 1:m) {
		#define rows to modify in simsamples data frame
		samplerange<-c(((i-1)*m*n+(j-1)*n+1):((i-1)*m*n+(j-1)*n+n))
		
		#insert time information in simsamples
		simsamples$time[samplerange]<-day[i]

		#define and insert sample id in simsamples
		sampleid<-paste("sample",((i-1)*m+j),sep="")
		simsamples$sample[samplerange]<-sampleid

		#sample flies with 2 ou 12 Octomom copy number based on survival data of that day
		simsamples$octomom[samplerange] <- sample(c(2,12),
		size = n,
		prob = c(perc_2,(1-perc_2)),
		replace = TRUE
		)
		}
	}

#Populate simsamples table with predicted values

#create id for combination of time and octomom copy number to match mlevels table
simsamples$id<-with(simsamples,paste(time,octomom,sep="_"))

simsamples$level<-mlevels$level[match(simsamples$id,mlevels$id)]
simsamples$comb<-mlevels$comb[match(simsamples$id,mlevels$id)]

#get average Octomom copy number per sample
#formula is: sum of compounds (octomom cn * wolb level) divided by sum of wolb levels
library(plyr)

Mean_octomom<-ddply(simsamples, .(sample,time),summarize, moctomom = (sum(comb)/sum(level)))

ml<-mean(Mean_octomom$moctomom)



#PLOTTING
library(ggplot2)


#plot octomom levels of sample with smooth line and line of average
ggplot(data=Mean_octomom,aes(time,moctomom)) +
	geom_jitter(height=1,width=1) +
	geom_smooth() +
	geom_hline(yintercept = ml) +
	xlab("Time (days)") +
  	ylab("Pooled sample Octomom copy number")



