

setwd("/home/akilimo/projects/SoilNPK/lintul/Rwanda")
Max3Years_WLY_Rwanda <- readRDS("Max3Years_WLY_Rwanda.RDS")
str(Max3Years_WLY_Rwanda)
plot(Max3Years_WLY_Rwanda$long, Max3Years_WLY_Rwanda$lat)

setwd("/home/akilimo/projects/SoilNPK/Rwanda")
AC_CI_isda <- readRDS("AC_CIALCA_isda.RDS") ## made in NOT_AC_dataAnalysis.R
head(AC_CI_isda)
unique(AC_CI_isda$country)
## drop DRC
AC_CI_isda <- droplevels(AC_CI_isda[!AC_CI_isda$country == "DRC", ])



splitdata <- function(probs=c(0.7,0.3), soil_BLUPSData){
  set.seed(444)
  trianData <- NULL
  testDatA <- NULL
  for(country in unique(soil_BLUPSData$country)){
    Country_data <- droplevels(soil_BLUPSData[soil_BLUPSData$country == country,])
    indexCountry <- sample(2, nrow(Country_data), replace=TRUE, prob=probs)## where control yield is used as a covariate
    trainDataCountry <- Country_data[indexCountry == 1, ]
    testDataCountry <- Country_data[indexCountry == 2, ]
    trianData <- rbind(trianData, trainDataCountry) 
    testDatA <- rbind(testDatA, testDataCountry)
  }
  return(list(trianData, testDatA))
}


traintestData_AC_CI_isda <- splitdata(soil_BLUPSData = AC_CI_isda)
trainData_AC_CI_isda <- traintestData_AC_CI_isda[[1]]
testData_AC_CI_isda <- traintestData_AC_CI_isda[[2]]


trainData_AC_CI_isda$texture_class_0_20 <- factor(trainData_AC_CI_isda$texture_class_0_20, levels=levels(AC_CI_isda$texture_class_0_20))
trainData_AC_CI_isda$texture_class_20_50  <- factor(trainData_AC_CI_isda$texture_class_20_50 , levels=levels(AC_CI_isda$texture_class_20_50 ))
trainData_AC_CI_isda$fcc <- factor(trainData_AC_CI_isda$fcc, levels=levels(AC_CI_isda$fcc))
trainData_AC_CI_isda$land_cover_2015 <- factor(trainData_AC_CI_isda$land_cover_2015, levels=levels(AC_CI_isda$land_cover_2015))
trainData_AC_CI_isda$land_cover_2016 <- factor(trainData_AC_CI_isda$land_cover_2016, levels=levels(AC_CI_isda$land_cover_2016))
trainData_AC_CI_isda$land_cover_2017 <- factor(trainData_AC_CI_isda$land_cover_2017, levels=levels(AC_CI_isda$land_cover_2017))
trainData_AC_CI_isda$land_cover_2018 <- factor(trainData_AC_CI_isda$land_cover_2018, levels=levels(AC_CI_isda$land_cover_2018))
trainData_AC_CI_isda$land_cover_2019 <- factor(trainData_AC_CI_isda$land_cover_2019, levels=levels(AC_CI_isda$land_cover_2019))
trainData_AC_CI_isda$country <- factor(trainData_AC_CI_isda$country, levels=levels(AC_CI_isda$country))

testData_AC_CI_isda$texture_class_0_20 <- factor(testData_AC_CI_isda$texture_class_0_20, levels=levels(AC_CI_isda$texture_class_0_20))
testData_AC_CI_isda$texture_class_20_50  <- factor(testData_AC_CI_isda$texture_class_20_50 , levels=levels(AC_CI_isda$texture_class_20_50 ))
testData_AC_CI_isda$fcc <- factor(testData_AC_CI_isda$fcc, levels=levels(AC_CI_isda$fcc))
testData_AC_CI_isda$land_cover_2015 <- factor(testData_AC_CI_isda$land_cover_2015, levels=levels(AC_CI_isda$land_cover_2015))
testData_AC_CI_isda$land_cover_2016 <- factor(testData_AC_CI_isda$land_cover_2016, levels=levels(AC_CI_isda$land_cover_2016))
testData_AC_CI_isda$land_cover_2017 <- factor(testData_AC_CI_isda$land_cover_2017, levels=levels(AC_CI_isda$land_cover_2017))
testData_AC_CI_isda$land_cover_2018 <- factor(testData_AC_CI_isda$land_cover_2018, levels=levels(AC_CI_isda$land_cover_2018))
testData_AC_CI_isda$land_cover_2019 <- factor(testData_AC_CI_isda$land_cover_2019, levels=levels(AC_CI_isda$land_cover_2019))
testData_AC_CI_isda$country <- factor(testData_AC_CI_isda$country, levels=levels(AC_CI_isda$country))


Ndata_Train_ACCI_isda <- subset(trainData_AC_CI_isda, select=-c(soilP, soilK))
Pdata_Train_ACCI_isda <- subset(trainData_AC_CI_isda, select=-c(soilN, soilK))
Kdata_Train_ACCI_isda <- subset(trainData_AC_CI_isda, select=-c(soilN, soilP))

Ndata_Valid_ACCI_isda <- subset(testData_AC_CI_isda, select=-c(soilP, soilK))
Pdata_Valid_ACCI_isda <- subset(testData_AC_CI_isda, select=-c(soilN, soilK))
Kdata_Valid_ACCI_isda <- subset(testData_AC_CI_isda, select=-c(soilN, soilP))



##########################################################################
## Random Forest to get apparent soil NPK supply
##########################################################################
require(randomForest)
require(gtools)
## Coustome control parameter 
#custom <- trainControl(method="repeatedcv", number=10, repeats=5, verboseIter=TRUE)
custom <- trainControl(method="oob", number=10)

##soil N
Ndata_Train <- subset(Ndata_Train_ACCI_isda, select=-c(CON)) ## not country based isda rsq = 0.78
Ndata_Valid <- Ndata_Valid_ACCI_isda

set.seed(773)
RF_N1 <- randomForest(soilN ~ ., Ndata_Train , importance=TRUE, ntree=1000)
Ndata_Valid$predN <- predict(RF_N1, Ndata_Valid)
sst <- sum((Ndata_Valid$soilN - mean(Ndata_Valid$soilN))^2)
sse <- sum((Ndata_Valid$soilN - Ndata_Valid$predN)^2)
rsq <- 1 - sse / sst
rsq##0.77



###soil P
Pdata_Train <- subset(Pdata_Train_ACCI_isda, select=-c(CON)) ## rsq = 0.74
Pdata_Valid <- Pdata_Valid_ACCI_isda


set.seed(773)
RF_P1 <- randomForest(soilP ~ ., Pdata_Train , importance=TRUE, ntree=1000)
Pdata_Valid$predP <- predict(RF_P1, Pdata_Valid)
sst <- sum((Pdata_Valid$soilP - mean(Pdata_Valid$soilP))^2)
sse <- sum((Pdata_Valid$soilP - Pdata_Valid$predP)^2)
rsq <- 1 - sse / sst
rsq##0.74


## soil K

Kdata_Train <- subset(Kdata_Train_ACCI_isda , select=-c(CON))##0.55
Kdata_Valid <- Kdata_Valid_ACCI_isda

set.seed(773)
RF_K1 <- randomForest(soilK ~ ., Kdata_Train , importance=TRUE, ntree=1000)
Kdata_Valid$predK <- predict(RF_K1, Kdata_Valid)
sst <- sum((Kdata_Valid$soilK - mean(Kdata_Valid$soilK))^2)
sse <- sum((Kdata_Valid$soilK - Kdata_Valid$predK)^2)
rsq <- 1 - sse / sst
rsq ##0.55


##############################
## ge the soil data for the whole of Rwanda and Burundi and get soilNPK for it

setwd("/home/akilimo/projects/SoilNPK/Coordinates")
RW_isda_H2O2dapth <- readRDS("RW_isda_H2O2dapth.RDS") ## RW & BU  data 
BU_isda_H2O2dapth <- readRDS("BU_isda_H2O2dapth.RDS")
DRC_isda_H2O2dapth <- readRDS("DRC_isda_H2O2dapth.RDS")
ACAI_isda_H2O2dapth <- readRDS("ACAI_isda_H2O2dapth.RDS")
##TODO further teh modling









































### get all the NOT data for ACAI and all
fd <- read.csv("/home/akilimo/lintul/lintul/dataSources/fd2.csv")


getRFY <- function(HD, 
                   RDY, 
                   country = c("NG", "TZ"), fd){
  d <- HD
  # fd <- read.csv("/home/akilimo/lintul/dataSources/fd2.csv")
  DC <- merge(data.frame(dayNr=d), fd[fd$country==country,], sort=FALSE)$DMCont
  RFY <- RDY / DC * 100
  return(RFY)
  
}




#' RF model works only if the factr levels are exactly identical to the data used to develp the model, 
Rfmodel_Wrapper <- function(FCY, country, lat, lon){
  require(randomForest)
  require(caret)

  setwd("/home/akilimo/projects/akilimo_recommendation")
  GIS_soilINS_modData2 <- read.csv("NOT_GIS_CON_2020.csv")
  GIS_soilINS_modData2$Clay_5 <- as.numeric(GIS_soilINS_modData2$Clay_5)
  GIS_soilINS_modData2$Clay_15 <- as.numeric(GIS_soilINS_modData2$Clay_15)
  GIS_soilINS_modData2$Clay_30 <- as.numeric(GIS_soilINS_modData2$Clay_30)
  GIS_soilINS_modData2$silt_5 <- as.numeric(GIS_soilINS_modData2$silt_5)
  GIS_soilINS_modData2$silt_15 <- as.numeric(GIS_soilINS_modData2$silt_15)
  GIS_soilINS_modData2$silt_30 <- as.numeric(GIS_soilINS_modData2$silt_30)
  GIS_soilINS_modData2$BD_5 <- as.numeric(GIS_soilINS_modData2$BD_5)
  GIS_soilINS_modData2$BD_15 <- as.numeric(GIS_soilINS_modData2$BD_15)
  GIS_soilINS_modData2$BD_30 <- as.numeric(GIS_soilINS_modData2$BD_30)
  GIS_soilINS_modData2$CEC_5 <- as.numeric(GIS_soilINS_modData2$CEC_5)
  GIS_soilINS_modData2$CEC_15 <- as.numeric(GIS_soilINS_modData2$CEC_15)
  GIS_soilINS_modData2$CEC_30 <- as.numeric(GIS_soilINS_modData2$CEC_30)
  GIS_soilINS_modData2$TotalN <- as.numeric(GIS_soilINS_modData2$TotalN)
  GIS_soilINS_modData2$Mn <- as.numeric(GIS_soilINS_modData2$Mn)
  GIS_soilINS_modData2$B <- as.numeric(GIS_soilINS_modData2$B)
  GIS_soilINS_modData2$Ca <- as.numeric(GIS_soilINS_modData2$Ca)
  GIS_soilINS_modData2$Fe <- as.numeric(GIS_soilINS_modData2$Fe)
  GIS_soilINS_modData2$Cu <- as.numeric(GIS_soilINS_modData2$Cu)
  GIS_soilINS_modData2$Al <- as.numeric(GIS_soilINS_modData2$Al)
  GIS_soilINS_modData2$Mg <- as.numeric(GIS_soilINS_modData2$Mg)
  GIS_soilINS_modData2$Na <- as.numeric(GIS_soilINS_modData2$Na)
  GIS_soilINS_modData2$ncluster <- as.factor(GIS_soilINS_modData2$ncluster)
  GIS_soilINS_modData2$CON <- as.numeric(GIS_soilINS_modData2$CON)
  
  
  GIS_soilINS_modData2$CONclass <- ifelse(GIS_soilINS_modData2$CON < 7.5, "class1",
                                          ifelse(GIS_soilINS_modData2$CON >= 7.5 & GIS_soilINS_modData2$CON < 15, "class2",
                                                 ifelse(GIS_soilINS_modData2$CON >= 15 & GIS_soilINS_modData2$CON < 22.5, "class3",
                                                        ifelse(GIS_soilINS_modData2$CON >= 22.5 & GIS_soilINS_modData2$CON < 30, "class4", "class5"))))
  
  GIS_soilINS_modData2$CONclass <- as.factor(GIS_soilINS_modData2$CONclass)
  
  ### to avoid error from RF model "New factor levels not present in the training data" we should do  ...
  # setwd("D:/ACAI_Wrapper/cloud_compute/LINTUL/TZ_data")
  # SoilData_TZ <- readRDS("ISRICsoil.RDS")
  # setwd("D:/ACAI_Wrapper/cloud_compute/LINTUL/NG_data")
  # SoilData_NG <- readRDS("ISRICsoil.RDS")
  # SoilData_TZ$country <- as.factor("Tanzania")
  # SoilData_NG$country <- as.factor("Nigeria")
  # ISRIC_SoilData <- rbind(SoilData_TZ, SoilData_NG)
  # setwd("D:/ACAI_Wrapper/cloud_compute/LINTUL")
  # saveRDS(ISRIC_SoilData, "ISRIC_SoilData_2020.RDS")
  #lat = 4.775; lon = 8.425
  
  #setwd("D:/ACAI_Wrapper/cloud_compute/LINTUL")
  setwd("/home/akilimo/projects/akilimo_recommendation")
  ISRIC_SoilData <- readRDS("ISRIC_SoilData_2020.RDS")
  ISRIC_SoilData <- unique(ISRIC_SoilData[ISRIC_SoilData$lat == lat & ISRIC_SoilData$long == lon, ])
  
  ISRIC_SoilData$Clay_5 <- as.numeric(ISRIC_SoilData$Clay_5)
  ISRIC_SoilData$Clay_15 <- as.numeric(ISRIC_SoilData$Clay_15)
  ISRIC_SoilData$Clay_30 <- as.numeric(ISRIC_SoilData$Clay_30)
  ISRIC_SoilData$silt_5 <- as.numeric(ISRIC_SoilData$silt_5)
  ISRIC_SoilData$silt_15 <- as.numeric(ISRIC_SoilData$silt_15)
  ISRIC_SoilData$silt_30 <- as.numeric(ISRIC_SoilData$silt_30)
  ISRIC_SoilData$BD_5 <- as.numeric(ISRIC_SoilData$BD_5)
  ISRIC_SoilData$BD_15 <- as.numeric(ISRIC_SoilData$BD_15)
  ISRIC_SoilData$BD_30 <- as.numeric(ISRIC_SoilData$BD_30)
  ISRIC_SoilData$CEC_5 <- as.numeric(ISRIC_SoilData$CEC_5)
  ISRIC_SoilData$CEC_15 <- as.numeric(ISRIC_SoilData$CEC_15)
  ISRIC_SoilData$CEC_30 <- as.numeric(ISRIC_SoilData$CEC_30)
  ISRIC_SoilData$TotalN <- as.numeric(ISRIC_SoilData$TotalN)
  ISRIC_SoilData$Mn <- as.numeric(ISRIC_SoilData$Mn)
  ISRIC_SoilData$B <- as.numeric(ISRIC_SoilData$B)
  ISRIC_SoilData$Ca <- as.numeric(ISRIC_SoilData$Ca)
  ISRIC_SoilData$Fe <- as.numeric(ISRIC_SoilData$Fe)
  ISRIC_SoilData$Cu <- as.numeric(ISRIC_SoilData$Cu)
  ISRIC_SoilData$Al <- as.numeric(ISRIC_SoilData$Al)
  ISRIC_SoilData$Mg <- as.numeric(ISRIC_SoilData$Mg)
  ISRIC_SoilData$Na <- as.numeric(ISRIC_SoilData$Na)
  ISRIC_SoilData$ncluster <- as.factor(ISRIC_SoilData$ncluster)
  ISRIC_SoilData$CON <- FCY ## this value is only to standardaize the for the RF, other wise it gets teh value form the user input  
  
  
  ISRIC_SoilData$CONclass <- ifelse(ISRIC_SoilData$CON < 7.5, "class1",
                                    ifelse(ISRIC_SoilData$CON >= 7.5 & ISRIC_SoilData$CON < 15, "class2",
                                           ifelse(ISRIC_SoilData$CON >= 15 & ISRIC_SoilData$CON < 22.5, "class3",
                                                  ifelse(ISRIC_SoilData$CON >= 22.5 & ISRIC_SoilData$CON < 30, "class4", "class5"))))
  
  
  
  ISRIC_SoilData$CONclass <- as.factor(ISRIC_SoilData$CONclass)
  
  ISRIC_SoilData$soilN <- 0
  ISRIC_SoilData$soilP <- 0
  ISRIC_SoilData$soilK <- 0
  
  trianData <- droplevels(ISRIC_SoilData[, colnames(GIS_soilINS_modData2)])
  
  trianData$use <- "Valid"
  GIS_soilINS_modData2$use <- "train"
  factoring <- rbind(GIS_soilINS_modData2, trianData)
  
  GIS_soilINS_modData2 <- factoring[factoring$use == "train", ]
  GIS_soilINS_modData2 <- subset(GIS_soilINS_modData2, select=-c(use))
  
  ISRIC_SoilData <- factoring[factoring$use == "Valid", ]
  ISRIC_SoilData <- subset(ISRIC_SoilData, select=-c(use))
  
  ### Data partioning
  set.seed(444)
  ind <- sample(2, nrow(GIS_soilINS_modData2), replace=TRUE, prob=c(0.7, 0.3))## where conrtol yield is used as a covariate
  trainData <- GIS_soilINS_modData2[ind==1, ]
  testData <- GIS_soilINS_modData2[ind==2, ]
  
  Ndata_Train  <- subset(trainData, select=-c(soilP, soilK))
  Pdata_Train  <- subset(trainData, select=-c(soilN, soilK))
  Kdata_Train  <- subset(trainData, select=-c(soilN, soilP))
  
  Ndata_Valid  <- subset(testData, select=-c(soilP, soilK))
  Pdata_Valid  <- subset(testData, select=-c(soilN, soilK))
  Kdata_Valid  <- subset(testData, select=-c(soilN, soilP))
  
  ## Coustome control parameter 
  #custom <- trainControl(method="repeatedcv", number=10, repeats=5, verboseIter=TRUE)
  require(caret)
  custom <- trainControl(method="oob", number=10)
  ##########################################################################
  ## Random Forest soilN:
  ##########################################################################
  set.seed(444)
  RF_N1 <- randomForest(log(soilN) ~ ., subset(Ndata_Train, select = -c(CON)), importance=TRUE, ntree=1000)
  
  ##########################################################################
  ## Random Forest "soilP" 
  ##########################################################################
  set.seed(773)
  RF_P1 <- randomForest(log(soilP) ~ ., subset(Pdata_Train, select=-c(CON)),importance=TRUE, ntree=1000)
  
  ##########################################################################
  ## Random Forest soilK" R sq. 0.60 if control is used, 0.29 otherwise
  ##########################################################################
  set.seed(773)
  RF_K1 <- randomForest(log(soilK) ~ ., subset(Kdata_Train, select=-c(CON)), importance=TRUE, ntree=1000)
  
  ##########################################################################
  ## use the random forest model and get the soil NPK estimates for the whole area
  ##########################################################################
  ISRIC_SoilData$soilN <- 0
  ISRIC_SoilData$soilP <- 0
  ISRIC_SoilData$soilK <- 0
  
  
  ISRIC_SoilData <- ISRIC_SoilData[, c("soilN", "soilP", "soilK", "exchK", "olsenP", "Clay_5","Clay_15","Clay_30","percentSOM_5","percentSOM_15","percentSOM_30",
                                       "pH_5","pH_15","pH_30", "silt_5", "silt_15", "silt_30","BD_5", "BD_15", "BD_30", "CEC_5","CEC_15","CEC_30", "percentSOC_5",
                                       "percentSOC_15", "percentSOC_30", "FC_5", "FC_15", "FC_30", "wp_5", "wp_15", "wp_30", "sws_5", "sws_15", "sws_30",
                                       "TotalN", "Mn","B", "Ca","Fe", "Cu","Al", "Mg", "Na","ncluster","country", "CON", "CONclass")]
  ISRIC_SoilData <- subset(ISRIC_SoilData, select = -(CON))
  
  ISRIC_SoilData$country <- as.factor(ISRIC_SoilData$country)
  ISRIC_SoilData$ncluster <- as.factor(ISRIC_SoilData$ncluster)
  
  ISRIC_SoilData$soilN <-  exp(predict(RF_N1, ISRIC_SoilData))
  ISRIC_SoilData$soilP <-  exp(predict(RF_P1, ISRIC_SoilData))
  ISRIC_SoilData$soilK <-  exp(predict(RF_K1, ISRIC_SoilData))
  
  ISRIC_SoilData$rec_N <- 0.5
  ISRIC_SoilData$rec_P <- 0.15
  ISRIC_SoilData$rec_K <- 0.5
  ISRIC_SoilData$rel_N <- 1
  ISRIC_SoilData$rel_P <- ISRIC_SoilData$soilP / ISRIC_SoilData$soilN
  ISRIC_SoilData$rel_K <- ISRIC_SoilData$soilK / ISRIC_SoilData$soilN
  ISRIC_SoilData$lat <- lat
  ISRIC_SoilData$long <- lon
  ISRIC_SoilData$location <- paste(ISRIC_SoilData$lat, ISRIC_SoilData$long, sep="_")
  ISRIC_SoilData$Zone <- country
  ISRIC_SoilData <- ISRIC_SoilData[, c("location","lat", "long", "soilN","soilP","soilK", "Zone","rec_N", "rec_P", "rec_K", "rel_N","rel_P","rel_K")]
  return(ISRIC_SoilData)
  
}

