

# install.packages("lme4", repos=c("http://lme4.r-forge.r-project.org/repos", getOption("repos")[["CRAN"]]))
# install.packages("lmerTest", repos=c("http://lmerTest.r-forge.r-project.org/repos", getOption("repos")[["CRAN"]]))

require(lsmeans)
require(emmeans)
require(reshape2)
require(lme4)
require(lmerTest)
require(tidyr)
require(ggplot2)
require(party)
require(plyr)
require(gtools)
library(raster)
library(sf)
require(plyr)
require(MASS)
#require(glmnet)
require(caret)
require(mlbench)
require(psych)
library(rgdal)
require(randomForest)
require(rgeos)


## ACAI NOT
setwd("~/extra_storage/shiny_server/SoilNPK/Rwanda")

# setwd("/home/akilimo/projects/SoilNPK/Rwanda")
ACNOT <- readRDS("NOT_Blups_2019.RDS")



head(ACNOT)
ggplot(ACNOT, aes(NPK, PK, col=country))+
  geom_point()+
  facet_wrap(~country)
longDataac <- gather(ACNOT, treat, value, PK:CON)
head(longDataac)


longDataac <- longDataac[!longDataac$treat == "half_NPK", ]
longDataac$YieldDiff <- longDataac$NPK - longDataac$value
longDataac$NutrientMissing <- ifelse(longDataac$treat == "NP", "K", ifelse(longDataac$treat == "NK", "P", 
                                                                       ifelse(longDataac$treat == "PK", "N", "NPK")))

longDataac$TreatDiff <- ifelse(longDataac$treat == "NP", "NPK - NP (K Eff.)", 
                             ifelse(longDataac$treat == "NK", "NPK - NK (P Eff.)", 
                                    ifelse(longDataac$treat == "PK", "NPK - PK (N Eff.)", "NPK - CON (NPK Eff.)")))
longDataac <- droplevels(longDataac[longDataac$YieldDiff <= 30, ])

longDataac$NutrientMissing <- factor(longDataac$NutrientMissing, levels = c("N", "P", "K", "NPK"))
longDataac$TreatDiff <- factor(longDataac$TreatDiff, levels = c("NPK - CON (NPK Eff.)", "NPK - PK (N Eff.)", 
                                                            "NPK - NP (K Eff.)","NPK - NK (P Eff.)"))
gg1 <- ggplot(data=longDataac, aes(value, NPK,  col=country)) +
  facet_grid(country~treat, scales = 'free_y') +
  geom_point(size=1.2) +
  geom_abline(intercept=0,slope=1, col="darkgrey") +
  scale_color_manual(values=c("black", "orange")) +
  xlab("Control, NK, NP, PK, root yield")+
  theme_bw() +
  theme(axis.text  = element_text(size=13),legend.position="none",
        axis.title = element_text(size=15), strip.text = element_text(size=14), 
        legend.text =element_text(size=15), legend.title = element_blank())

ggsave("NOT_ACAI.png", gg1, width=12, height = 6)




##########################################################################################################
## RW data
##########################################################################################################
# setwd("/home/akilimo/projects/SoilNPK/Rwanda")
setwd("~/extra_storage/shiny_server/SoilNPK/Rwanda")
DS_corr <- read.csv("Root_yield3.csv")
ds_NOTY <- read.csv("RootYield2_Density.csv")

nrow(DS_corr)
nrow(ds_NOTY)

## RW data needs density correction: by modelling
DS_corr <- droplevels(DS_corr[!DS_corr$Country == "Rw", ])
ds_NOTY$trialID <- ds_NOTY$trial_ID
DS_corr$nrPlants_m2 <- 1

ds_NOTY <- ds_NOTY[, colnames(DS_corr)]

DS_corr2 <- rbind(DS_corr, ds_NOTY)


nrow(DS_corr)
nrow(DS_corr2)

DS_corr <- DS_corr2

require(tidyverse)
head(DS_corr)
names(DS_corr) <- gsub("X_geopoint_", "", names(DS_corr))
CIALCA_coordainte <- unique(DS_corr[, c("latitude", "longitude")])
str(CIALCA_coordainte)
saveRDS(CIALCA_coordainte, "CIALCA_coordainte.RDS")

unique(DS_corr$Treatment)
ds_NOT <- droplevels(DS_corr[DS_corr$Treatment %in% c("NK", "PK", "NP", "NPK_rep1", "NPK_rep2", "NoF"), ])
head(ds_NOT)
unique(ds_NOT$Country )
length(unique(ds_NOT$trialID))
ds_NOT$Country <- ifelse(ds_NOT$Country == "RDC", "DRC", as.character(ds_NOT$Country) )


### treat errors corrected
rbind(ds_NOT[ds_NOT$Treatment == "NPK_rep2" & ds_NOT$K_rate != 180, ],
                  ds_NOT[ds_NOT$Treatment == "NP" & ds_NOT$N_rate != 150, ],
                  ds_NOT[ds_NOT$Treatment == "NP" & ds_NOT$P_rate != 40, ],
                  ds_NOT[ds_NOT$Treatment == "NP" & ds_NOT$K_rate != 0, ])


unique(ds_NOT$N_rate)
unique(ds_NOT$P_rate)
unique(ds_NOT$K_rate)


ds_NOT <- ds_NOT[!is.na(ds_NOT$rootYield), ]
head(ds_NOT)

write.csv(ds_NOT, "/home/akilimo/projects/SoilNPK/Rwanda/RwBuDRC_Season1Data.csv", row.names = FALSE)

################################################################
## ISRIC data
###################################################################
setwd("/home/akilimo/lintul/lintul")
source("sourceISRIC.R")
RW_CA <- unique(ds_NOT[, c("longitude","latitude")])
RW_CA <- RW_CA[!is.na(RW_CA$longitude), ]
names(RW_CA) <- c("lon", "lat")
str(RW_CA)
ISRIC_RW_CS <- NULL
for(i in 1:nrow(RW_CA)){
  isd <- getISRICData(lat = RW_CA$lat[i], lon = RW_CA$lon[i])
  ISRIC_RW_CS <- rbind(ISRIC_RW_CS, isd)
}

setwd("/home/akilimo/projects/SoilNPK/Rwanda")
# saveRDS(ISRIC_RW_CS, "ISRIC_RW_Cassava.RDS")
ISRIC_RW_CS <- readRDS("ISRIC_RW_Cassava.RDS")
str(ISRIC_RW_CS)

ISRIC_RW_CS$location <- paste(ISRIC_RW_CS$long, ISRIC_RW_CS$lat, sep="_")
ds_NOT$location <- paste(ds_NOT$longitude, ds_NOT$latitude, sep="_")
ISRIC_RW_CS <- unique(merge(ISRIC_RW_CS, unique(ds_NOT[, c("trialID", "location")]), by="location" ))


######################################################################
## iSDA
######################################################################
setwd("/home/akilimo/projects/SoilNPK/Coordinates")
CIALCA_isda <- readRDS("CIALCA_isda.RDS")
CIALCA_landCropCover <- readRDS("CIALCA_isda_cover.RDS")

CIALCA_landCropCover$land_cover_2015 <- revalue(CIALCA_landCropCover$land_cover_2015, c("Closed forest, unknown" = "losedForest", "Cultivated and managed vegetation/agriculture (cropland)" = "Cropland",
                                "Open forest, unknown"= " OpenForest", "Urban / built up" = "Urban",  "Shrubs" = "Shrubs", "Herbaceous vegetation" = "herbaceousVegetation"))

CIALCA_landCropCover$land_cover_2016 <- revalue(CIALCA_landCropCover$land_cover_2016, c("Closed forest, unknown" = "losedForest", "Cultivated and managed vegetation/agriculture (cropland)" = "Cropland",
                                "Open forest, unknown"= " OpenForest", "Urban / built up" = "Urban",  "Shrubs" = "Shrubs", "Herbaceous vegetation" = "herbaceousVegetation"))

CIALCA_landCropCover$land_cover_2017 <- revalue(CIALCA_landCropCover$land_cover_2017, c("Closed forest, unknown" = "losedForest", "Cultivated and managed vegetation/agriculture (cropland)" = "Cropland",
                                "Open forest, unknown"= " OpenForest", "Urban / built up" = "Urban",  "Shrubs" = "Shrubs", "Herbaceous vegetation" = "herbaceousVegetation"))

CIALCA_landCropCover$land_cover_2018 <- revalue(CIALCA_landCropCover$land_cover_2018, c("Closed forest, unknown" = "losedForest", "Cultivated and managed vegetation/agriculture (cropland)" = "Cropland",
                                "Open forest, unknown"= " OpenForest", "Urban / built up" = "Urban",  "Shrubs" = "Shrubs", "Herbaceous vegetation" = "herbaceousVegetation"))

CIALCA_landCropCover$land_cover_2019 <- revalue(CIALCA_landCropCover$land_cover_2019, c("Closed forest, unknown" = "losedForest", "Cultivated and managed vegetation/agriculture (cropland)" = "Cropland",
                                "Open forest, unknown"= " OpenForest", "Urban / built up" = "Urban",  "Shrubs" = "Shrubs", "Herbaceous vegetation" = "herbaceousVegetation"))


CIALCA_landCropCover <- CIALCA_landCropCover[, c("lat","lon", "slope_angle", "fcc", "land_cover_2015", "land_cover_2016", "land_cover_2017", "land_cover_2018", "land_cover_2019")]
CIALCA_isda_main <- CIALCA_isda[!CIALCA_isda$Variable == "bedrock_depth", ]
CIALCA_bedRock_200cm <- CIALCA_isda[CIALCA_isda$Variable == "bedrock_depth", ]
CIALCA_bedRock_200cm$value <- as.numeric(as.character(CIALCA_bedRock_200cm$value))
bedrockdepth1 <- CIALCA_bedRock_200cm[seq(1,180, by=2), ]
bedrockdepth2 <- CIALCA_bedRock_200cm[seq(2,180, by=2), ]

summary(bedrockdepth1$value)
summary(bedrockdepth2$value)

CIALCA_isda <- rbind(CIALCA_isda_main, bedrockdepth1)
head(CIALCA_isda)
CIALCA_isda$Variable <- paste(CIALCA_isda$Variable, CIALCA_isda$depth, sep="_")
CIALCA_isda_wide <- spread(CIALCA_isda[, c("lat","lon","Variable", "value")], Variable, value, )
colnames(CIALCA_isda_wide) <- gsub("-", "_", colnames(CIALCA_isda_wide))
head(CIALCA_isda_wide)


CIALCA_isda_wide <- unique(merge(CIALCA_isda_wide, CIALCA_landCropCover, by=c("lat", "lon")))
head(CIALCA_isda_wide)
str(CIALCA_isda_wide)
## covert to numeric 
require(tidyverse)
CIALCA_isda_wide %>% str()
CIALCA_isda_wide$texture_class_0_20 <- as.factor(CIALCA_isda_wide$texture_class_0_20)
CIALCA_isda_wide$texture_class_20_50 <- as.factor(CIALCA_isda_wide$texture_class_20_50)
CIALCA_isda_wide <- CIALCA_isda_wide %>% mutate_if(is.character, as.numeric)
CIALCA_isda_wide %>% str()



### get ornaic matter
CIALCA_isda_wide$soilOM_0_20  <- CIALCA_isda_wide$carbon_organic_0_20  * 2
CIALCA_isda_wide$soilOM_20_50  <- CIALCA_isda_wide$carbon_organic_20_50  * 2

CIALCA_isda_wide$percentSOM_0_20 <- CIALCA_isda_wide$soilOM_0_20/10
CIALCA_isda_wide$percentSOM_20_50 <- CIALCA_isda_wide$soilOM_20_50/10



## use pedo-transfer equations and get Fc, WP and SC
##### WP ######
CIALCA_isda_wide$wp_0_20A <- (-0.024*CIALCA_isda_wide$sand_content_0_20/100) + 0.487*CIALCA_isda_wide$clay_content_0_20/100 + 0.006*CIALCA_isda_wide$percentSOM_0_20 + 
  0.005*(CIALCA_isda_wide$sand_content_0_20/100 *CIALCA_isda_wide$percentSOM_0_20) - 0.013*(CIALCA_isda_wide$clay_content_0_20/100 * CIALCA_isda_wide$percentSOM_0_20) + 
  0.068*(CIALCA_isda_wide$sand_content_0_20/100 * CIALCA_isda_wide$clay_content_0_20/100 ) + 0.031
CIALCA_isda_wide$wp_0_20 <- (CIALCA_isda_wide$wp_0_20A + (0.14 * CIALCA_isda_wide$wp_0_20A - 0.02))*100

CIALCA_isda_wide$wp_20_50A <- (-0.024*CIALCA_isda_wide$sand_content_20_50/100) + 0.487*CIALCA_isda_wide$clay_content_20_50/100 + 0.006*CIALCA_isda_wide$percentSOM_20_50 + 
  0.005*(CIALCA_isda_wide$sand_content_20_50/100 *CIALCA_isda_wide$percentSOM_20_50) - 0.013*(CIALCA_isda_wide$clay_content_20_50/100 * CIALCA_isda_wide$percentSOM_20_50) + 
  0.068*(CIALCA_isda_wide$sand_content_20_50/100 * CIALCA_isda_wide$clay_content_20_50/100 ) + 0.031
CIALCA_isda_wide$wp_20_50 <- (CIALCA_isda_wide$wp_20_50A + (0.14 * CIALCA_isda_wide$wp_20_50A - 0.02))*100

CIALCA_isda_wide <- subset(CIALCA_isda_wide, select=-c(wp_0_20A, wp_20_50A))



##### FC ######
CIALCA_isda_wide$FC_0_20_A <- -0.251*CIALCA_isda_wide$sand_content_0_20/100 + 0.195*CIALCA_isda_wide$clay_content_0_20/100 + 0.011*CIALCA_isda_wide$percentSOM_0_20 + 
  0.006*(CIALCA_isda_wide$sand_content_0_20/100 * CIALCA_isda_wide$percentSOM_0_20) - 0.027*(CIALCA_isda_wide$clay_content_0_20/100 * CIALCA_isda_wide$percentSOM_0_20) + 
  0.452*(CIALCA_isda_wide$sand_content_0_20/100 * CIALCA_isda_wide$clay_content_0_20/100) + 0.299
CIALCA_isda_wide$FC_0_20 <- (CIALCA_isda_wide$FC_0_20_A + (1.283 * CIALCA_isda_wide$FC_0_20_A^2 - 0.374 * CIALCA_isda_wide$FC_0_20_A - 0.015))*100


CIALCA_isda_wide$FC_20_50_A <- -0.251*CIALCA_isda_wide$sand_content_20_50/100 + 0.195*CIALCA_isda_wide$clay_content_20_50/100 + 0.011*CIALCA_isda_wide$percentSOM_20_50 + 
  0.006*(CIALCA_isda_wide$sand_content_20_50/100 * CIALCA_isda_wide$percentSOM_20_50) - 0.027*(CIALCA_isda_wide$clay_content_20_50/100 * CIALCA_isda_wide$percentSOM_20_50) + 
  0.452*(CIALCA_isda_wide$sand_content_20_50/100 * CIALCA_isda_wide$clay_content_20_50/100) + 0.299
CIALCA_isda_wide$FC_20_50 <- (CIALCA_isda_wide$FC_20_50_A + (1.283 * CIALCA_isda_wide$FC_20_50_A^2 - 0.374 * CIALCA_isda_wide$FC_20_50_A - 0.015))*100


CIALCA_isda_wide <- subset(CIALCA_isda_wide, select=-c(FC_0_20_A, FC_20_50_A))


##### soil water at ######
CIALCA_isda_wide$sws_0_20_A <- 0.278*(CIALCA_isda_wide$sand_content_0_20/100)+0.034*(CIALCA_isda_wide$clay_content_0_20/100)+0.022*CIALCA_isda_wide$percentSOM_0_20-
  0.018*(CIALCA_isda_wide$sand_content_0_20/100*CIALCA_isda_wide$percentSOM_0_20)-
  0.027*(CIALCA_isda_wide$clay_content_0_20/100*CIALCA_isda_wide$percentSOM_0_20)-0.584*(CIALCA_isda_wide$sand_content_0_20/100*CIALCA_isda_wide$clay_content_0_20/100)+0.078

CIALCA_isda_wide$sws_0_20_B <- (CIALCA_isda_wide$sws_0_20_A +(0.636*CIALCA_isda_wide$sws_0_20_A-0.107))*100
CIALCA_isda_wide$sws_0_20 <- (CIALCA_isda_wide$FC_0_20/100+CIALCA_isda_wide$sws_0_20_B/100-(0.097*CIALCA_isda_wide$sand_content_0_20/100)+0.043)*100


CIALCA_isda_wide$sws_20_50_A <- 0.278*(CIALCA_isda_wide$sand_content_20_50/100)+0.034*(CIALCA_isda_wide$clay_content_20_50/100)+0.022*CIALCA_isda_wide$percentSOM_20_50-
  0.018*(CIALCA_isda_wide$sand_content_20_50/100*CIALCA_isda_wide$percentSOM_20_50)-
  0.027*(CIALCA_isda_wide$clay_content_20_50/100*CIALCA_isda_wide$percentSOM_20_50)-0.584*(CIALCA_isda_wide$sand_content_20_50/100*CIALCA_isda_wide$clay_content_20_50/100)+0.078
CIALCA_isda_wide$sws_20_50_B <- (CIALCA_isda_wide$sws_20_50_A +(0.636*CIALCA_isda_wide$sws_20_50_A-0.107))*100
CIALCA_isda_wide$sws_20_50 <- (CIALCA_isda_wide$FC_20_50/100+CIALCA_isda_wide$sws_20_50_B/100-(0.097*CIALCA_isda_wide$sand_content_20_50/100)+0.043)*100


CIALCA_isda_wide <- subset(CIALCA_isda_wide, select=-c(sws_0_20_A, sws_0_20_B, sws_20_50_A, sws_20_50_B))
str(CIALCA_isda_wide)


CIALCA_isda_wide$location <- paste(CIALCA_isda_wide$lon, CIALCA_isda_wide$lat, sep="_")
ds_NOT$location <- paste(ds_NOT$longitude, ds_NOT$latitude, sep="_")
CIALCA_isda_wide <- unique(merge(CIALCA_isda_wide, unique(ds_NOT[, c("trialID", "location")]), by="location" ))
saveRDS(CIALCA_isda_wide, "CIALCA_isda_wide.RDS")

nrow(CIALCA_isda_wide)


###############################################
head(ds_NOT)
ds_NOT$Ctrial <- paste(ds_NOT$Country, ds_NOT$Trial, ds_NOT$Season, sep="_")
ds_NOT$Treatment <- ifelse(ds_NOT$Treatment == "NoF", "Control", as.character(ds_NOT$Treatment))
ds_NOT$Treatment2 <- ifelse(ds_NOT$Treatment == "NPK_rep1" | ds_NOT$Treatment == "NPK_rep2", "NPK", as.character(ds_NOT$Treatment))
ds_NOT$Treatment2 <- factor(ds_NOT$Treatment2, levels = c("Control", "NPK", "NP","NK","PK"))
ds_NOT <- ds_NOT[ds_NOT$rootYield  <= 120, ]

table(ds_NOT$Ctrial, ds_NOT$Treatment2)


gg <- ggplot(ds_NOT, aes(Treatment2, rootYield, fill=Treatment2)) +
  geom_boxplot() +
  # geom_point()+
  facet_wrap(~Ctrial) +
  theme_bw() +
  theme(axis.text.x = element_text(angle=90))

ggsave("Cassava_Exp.png", gg, width=8, height = 6)



Q1 <- unique(ds_NOT[, c("Country","Trial","Season","trialID","Treatment", "rootYield")])
Q1$Ctrial <- paste(Q1$Country, Q1$Trial, sep="_")
head(Q1)
Q1 <- spread(Q1, Treatment, rootYield)
unique(Q1$Treatment)
Q1$seasonctrial <- paste(Q1$Ctrial, Q1$Season, sep="_")
Q1 <- Q1[, c("seasonctrial","NPK_rep1", "NPK_rep2","PK","NK","NP", "Control")]
Q1 <- gather(Q1, treat, value, PK:Control )
table(Q1$seasonctrial, Q1$treat)
Q1A <- Q1[, c("seasonctrial", "NPK_rep1","treat", "value")]
Q1B <- Q1[, c("seasonctrial", "NPK_rep2","treat", "value")]
names(Q1A) <- c("seasonctrial", "NPK", "treat", "value")
names(Q1B) <- c("seasonctrial", "NPK", "treat", "value")

Q1 <- rbind(Q1A, Q1B)

head(Q1)
gg  <- ggplot(Q1, aes(value, NPK, col=seasonctrial))+
  geom_point()+
  facet_grid(seasonctrial~treat) +
  xlim(0, 100) + ylim(0,100)+
  xlab("Yield for nutrient missing treatments") +
  geom_abline(slope = 1, intercept = 0)+
  theme_bw() + theme(axis.text = element_text(size=12), strip.text.y = element_text(angle=0),
                     strip.text.x = element_text(size=12),
                     axis.title = element_text(size=14), legend.position = "none")


ggsave(gg, "ScatterOmissionNPK.png")

dsss <- unique(ds_NOT[, c("Country","Trial","Ctrial", "Season","trialID","nrPlants_m2", "Treatment", "rootYield")])
dsss$Ctrial <- paste(dsss$Country, dsss$Trial, sep="_")


dsssw <- spread(dsss, Treatment, rootYield)
head(dsssw)


summary(dsssw)



dss1 <- dsssw[, c( "Country", "Trial", "Season","trialID", "nrPlants_m2","NK", "NP", "NPK_rep1","PK","Ctrial")]
dss2 <- dsssw[, c( "Country", "Trial", "Season","trialID", "nrPlants_m2", "NK", "NP", "NPK_rep2","PK","Ctrial")]
dss1$rep <- "NPK_rep1"
dss2$rep <- "NPK_rep2"
colnames(dss1) <- colnames(dss2) <- c("Country", "Trial", "Season","trialID", "nrPlants_m2","NK", "NP", "NPK","PK","Ctrial", "rep")
dssw <- rbind(dss1, dss2)
dssw$seasonctrial <- paste(dssw$Ctrial, dssw$Season, sep="_")
head(dssw)

table(dsss$Country, dsss$nrPlants_m2)
ddply(dssw, .(seasonctrial), summarize, max(nrPlants_m2), min(nrPlants_m2))
summary(dssw[dssw$seasonctrial == "Rw_FM_A_2019",]$nrPlants_m2)
summary(dssw[dssw$seasonctrial == "Rw_RM_A_2019",]$nrPlants_m2)
summary(dssw[dssw$seasonctrial == "BI_NOT_HR_A_2019",]$nrPlants_m2)
summary(dssw[dssw$seasonctrial == "DRC_NOT_A_2019",]$nrPlants_m2)
summary(dssw[dssw$seasonctrial == "DRC_NOT_B_2019",]$nrPlants_m2)


nknpk <- ggplot(dssw, aes(NK, NPK, col=seasonctrial))+
  geom_point()+
  xlim(0, 100) + ylim(0,100)+
  geom_abline(slope = 1, intercept = 0)+
  theme_bw()+ theme(legend.position = "none",axis.text = element_text(size=12)
                    ,axis.title = element_text(size=14))
  

npnpk <- ggplot(dssw, aes(NP, NPK, col=seasonctrial))+
    geom_point()+
   xlim(0, 100) + ylim(0,100)+
    geom_abline(slope = 1, intercept = 0)+
    theme_bw()+ theme(legend.position = "none",axis.text = element_text(size=1),
                      ,axis.title = element_text(size=14))


pknpk <- ggplot(dssw, aes(PK, NPK, col=seasonctrial))+
  geom_point()+
  xlim(0, 100) + ylim(0,100)+
  geom_abline(slope = 1, intercept = 0)+
  theme_bw() + theme(axis.text = element_text(size=12) ,axis.title = element_text(size=14))

ggsave("NK_NPK.png",nknpk,  width=4, height = 6)
ggsave("NP_NPK.png",npnpk,  width=4, height = 6)
ggsave("PK_NPK.png",pknpk, width=6, height = 6)

head(dssw)
dssw[dssw$seasonctrial == "Rw_FM_A_2019", ]

##################################

## fitting models
ds_NOT <- dsss
ds_NOT$Treatment <- ifelse(ds_NOT$Treatment == "NPK_rep1" |  ds_NOT$Treatment == "NPK_rep2", "NPK", as.character(ds_NOT$Treatment))
ds_NOT$Treatment <- as.factor(ds_NOT$Treatment)
unique(ds_NOT$nrPlants_m2)
summary(ds_NOT$nrPlants_m2)
ds_NOT$nrPlants_m2 <- round(ds_NOT$nrPlants_m2, digits=1)
ds_NOT$nrPlants_m2 <- as.factor(ds_NOT$nrPlants_m2)
summary(ds_NOT$rootYield)
head(ds_NOT)

## convert the ton to kg to avoid close to zero values
ds_NOT$YieldKg <- ds_NOT$rootYield * 1000
unique(ds_NOT$Ctrial)
head(ds_NOT)

modk <- lmer(data=ds_NOT, sqrt(YieldKg) ~ Treatment:Ctrial + (0+Treatment|trialID))
modl <- lmer(data=ds_NOT, sqrt(YieldKg) ~ Treatment + Treatment:Ctrial + (0+Treatment|trialID))
modm <- lmer(data=ds_NOT, sqrt(YieldKg) ~ nrPlants_m2 + Treatment:Ctrial + (0+Treatment|trialID))
modn <- lmer(data=ds_NOT, sqrt(YieldKg) ~ Treatment + nrPlants_m2 + Treatment:Ctrial + (0+Treatment|trialID))
mods <- lmer(data=ds_NOT, sqrt(YieldKg) ~ Treatment*nrPlants_m2 + Treatment:Ctrial + (0+Treatment|trialID))


plot(modk)
plot(modl)
plot(modm)
plot(modn) ## the best model

anova(modk, modl) ## sig
anova(modk, modm) ## sig
anova(modm, modn) ## sig
anova(modn, mods) ## not sig

drop1(modn,scope=c("Treatment:Ctrial"),test="Chisq")
drop1(mods,scope=c("Treatment:nrPlants_m2", "Treatment:Ctrial"),test="Chisq") ## only "Treatment:Ctrial" is sig



ds_NOT$predicted <- (predict(modn)**2)   # Save the predicted values
ds_NOT$residuals <- (residuals(modn)**2) # Save the residual values
ggplot(ds_NOT, aes(predicted, residuals, col=Ctrial)) +
  geom_point() +
  geom_abline(slope=0, intercept = 0)+
  facet_grid(Treatment~Ctrial)+
  theme_bw()+theme(legend.position = "none")

summary(ds_NOT$predicted)
summary(ds_NOT$YieldKg)

ggplot(ds_NOT, aes(YieldKg, predicted, col=Ctrial)) +
  geom_point() +
  geom_abline(slope=1, intercept = 0)+
  facet_grid(Treatment~Ctrial, scale="free")+
  ylab("Predicted root yield")+ xlab("Observed root yield") + 
  theme_bw()+theme(legend.position = "none", strip.text = element_text(size=12),
                   axis.text = element_text(size=10), axis.title = element_text(size=12))
# ggsave("observed_predicted.png", ggh)


modn <- lmer(data=ds_NOT, sqrt(YieldKg) ~ Treatment + nrPlants_m2 + Treatment:Ctrial + (0+Treatment|trialID))
VarCorr(modn)
ref.grid(modn, emm_options(pbkrtest.limit = 4648))
lsm3 <- emmeans::emmeans(modn, ~ Treatment + Treatment:Ctrial)
lsmk2 <- lsmeans(modn, c("Treatment","Ctrial"))
lsmDF3 <- as.data.frame(lsm3)
head(lsmDF3)
lsmk2 <- as.data.frame(lsmk2)
head(lsmk2)


saveRDS(lsmDF3,"lsmDF3.RDS")
lsmDF3 <- lsmDF3[,c(1:3)]
names(lsmDF3)[3] <- "lsmean"
row.names(lsmDF3) <- NULL

ran3 <- ranef(modn)$trialID
ran3$trialID <- row.names(ran3)
head(ran3)
names(ran3) <- gsub("Treatment", "", names(ran3))
ran3 <- melt(ran3, 
             measure.vars=c("NPK", "PK", "NK", "NP", "Control"),
             variable.name="Treatment",
             id.vars="trialID",
             value.name="blup")
head(ran3)
ran3 <- unique(merge(ran3, unique(ds_NOT[, c("trialID", "Country", "Trial", "Ctrial")]), by="trialID"))

head(ran3)
head(lsmDF3)

ranA <- merge(ran3, lsmDF3, by=c("Treatment", "Ctrial"))
ranA$blup <- ranA$blup + ranA$lsmean
ranA <- subset(ranA, select=-c(lsmean))
head(ranA)
summary(ranA$blup)
ranA$blup <- (ranA$blup)^2
ranA <- unique(ranA[, c("Ctrial","trialID","Country", "Trial", "Treatment", "blup")])


ran.w <- spread(ranA, Treatment, blup)
head(ran.w)


saveRDS(ran.w, "CIALCA_Blups.RDS")
ran.w <- readRDS("CIALCA_Blups.RDS")
longData <- gather(ran.w, treat, value, PK:Control)
head(longData)


longData$YieldDiff <- longData$NPK - longData$value
unique(longData$treat )
longData$NutrientMissing <- ifelse(longData$treat == "NP", "K", ifelse(longData$treat == "NK", "P", 
                                                                       ifelse(longData$treat == "PK", "N", "NPK")))
unique(longData$NutrientMissing)
longData$TreatDiff <- ifelse(longData$treat == "NP", "K Eff", 
                             ifelse(longData$treat == "NK", "P Eff", 
                                    ifelse(longData$treat == "PK", "N Eff", "NPK Eff")))
# longData <- droplevels(longData[longData$YieldDiff <= 30, ])

longData$NutrientMissing <- factor(longData$NutrientMissing, levels = c("N", "P", "K", "NPK"))
longData$TreatDiff <- factor(longData$TreatDiff, levels = c("NPK Eff", "N Eff", "P Eff","K Eff"))
head(longData)
str(longData)
unique(longData$Ctrial)
unique(longData$Trial)
Q1 <- longData[longData$Ctrial == "BI_NOT_HR", ]#   ""     ""
Q2 <- longData[longData$Ctrial == "DRC_NOT", ]
Q3 <- longData[longData$Ctrial == "Rw_FM", ]
Q4 <- longData[longData$Ctrial == "Rw_RM", ]

longData$value <- longData$value / 1000
longData$YieldDiff <- longData$YieldDiff / 1000


ggplot(data=longData[!longData$Ctrial == "DRC_NOT", ], aes(value, y=YieldDiff, col = factor(Ctrial))) +
   facet_grid(Ctrial~TreatDiff) +
  stat_density2d(geom="path") +
  geom_point(size=1.2) +
  geom_abline(intercept=0,slope=0, col="darkgrey") +
  xlab("") +
  ylab("Yield difference [t/ha]") +
  theme_bw() +
  theme(axis.text  = element_text(size=12), legend.position = "none",
        axis.title = element_text(size=15), strip.text = element_text(size=12), 
        legend.text =element_text(size=15), legend.title = element_blank())


gg1 <-  ggplot(Q1, aes(x = value/1000, y = YieldDiff/1000)) +
  facet_grid(.~TreatDiff, scales="free") +
  geom_point(size=1.2) + ylim(-10, 15)+xlim(0,50) +
  geom_density_2d(col="chartreuse3")+
  geom_abline(intercept=0,slope=0, col="darkgrey") +
  xlab("") +
  ylab("Yield difference [t/ha]") +
  ggtitle("BI_NOT_HR") +
  theme_bw() +
  theme(axis.text  = element_text(size=10), legend.position = "none",
        axis.title = element_text(size=15), strip.text = element_text(size=12), 
        legend.text =element_text(size=15), legend.title = element_blank())

gg2 <- ggplot(Q2, aes(x = value, y = YieldDiff)) +
  facet_grid(.~TreatDiff, scales="free") +
  geom_point(size=1.2) + ylim(-10, 45)+ xlim(-10,100) +
  geom_density_2d(col="dodgerblue1")+
  geom_abline(intercept=0,slope=0, col="darkgrey") +
  xlab("") +
  ylab("") +
  ggtitle("DRC_NOT") +
  theme_bw() +
  theme(axis.text  = element_text(size=10), legend.position = "none",
        axis.title = element_text(size=15), strip.text = element_text(size=12), 
        legend.text =element_text(size=15), legend.title = element_blank())


gg3 <- ggplot(Q3, aes(x = value, y = YieldDiff)) +
  facet_grid(.~TreatDiff, scales="free") +
  geom_point(size=1.2) + ylim(-10, 10) + xlim(0,75) +
  geom_density_2d(col="orangered")+
  geom_abline(intercept=0,slope=0, col="darkgrey") +
  xlab("") +
  ylab("Yield difference [t/ha]") +
  ggtitle("Rwanda FM") +
  theme_bw() +
  theme(axis.text  = element_text(size=10), legend.position = "none",
        axis.title = element_text(size=15), strip.text = element_text(size=12), 
        legend.text =element_text(size=15), legend.title = element_blank())

gg4 <- ggplot(Q4, aes(x = value, y = YieldDiff)) +
  facet_grid(.~TreatDiff, scales="free") +
  geom_point(size=1) + ylim(-10, 15) + xlim(0,55) +
  geom_density_2d(col="seagreen3")+
  geom_abline(intercept=0,slope=0, col="darkgrey") +
  xlab("") +
  ylab("") +
  ggtitle("Rwanda RM") +
  theme_bw() +
  theme(axis.text  = element_text(size=10), legend.position = "none",
        axis.title = element_text(size=15), strip.text = element_text(size=12), 
        legend.text =element_text(size=15), legend.title = element_blank())




ggsave("CIALCA_BLUPS3.png", grid.arrange(gg1, gg2, gg3, gg4), width=12, height = 6)



################################################3
## reverse QUEFTS
setwd("/home/akilimo/projects/SoilNPK/Coordinates")
saveRDS(ran.w, "CIALCA_Blups.RDS")
ran.w <- readRDS("CIALCA_Blups.RDS")
CIALCA_isric_GPS <- unique(merge(ran.w, ISRIC_RW_CS, by="trialID"))
CIALCA_isda_GPS <- unique(merge(ran.w, CIALCA_isda_wide, by="trialID"))



### change fresh matter to dry matter
CIALCA_isric_GPS$DM_NPK <- CIALCA_isric_GPS$NPK * 0.3
CIALCA_isric_GPS$DM_PK <- CIALCA_isric_GPS$PK * 0.3
CIALCA_isric_GPS$DM_NK <- CIALCA_isric_GPS$NK * 0.3
CIALCA_isric_GPS$DM_NP <- CIALCA_isric_GPS$NP * 0.3
CIALCA_isric_GPS$DM_CON <- CIALCA_isric_GPS$Control * 0.3
head(CIALCA_isric_GPS)

CIALCA_isda_GPS$DM_NPK <- CIALCA_isda_GPS$NPK * 0.3
CIALCA_isda_GPS$DM_PK <- CIALCA_isda_GPS$PK * 0.3
CIALCA_isda_GPS$DM_NK <- CIALCA_isda_GPS$NK * 0.3
CIALCA_isda_GPS$DM_NP <- CIALCA_isda_GPS$NP * 0.3
CIALCA_isda_GPS$DM_CON <- CIALCA_isda_GPS$Control * 0.3
head(CIALCA_isda_GPS)


RecoveryFraction <- data.frame(rec_N=0.5, rec_P=0.21, rec_K=0.49)
source("QUEFTS_function.R")
# ds_GPS <- CIALCA_isric_GPS
rm(ds_GPS)
ds_GPS <- CIALCA_isda_GPS
head(ds_GPS)

# setwd("/home/akilimo/projects/AKILIMO_Modeling/QUEFTS")
# saveRDS(ds_GPS, "ds_GPS.RDS")

soilNPK <- NULL
validation_result <- NULL
for(i in 1:nrow(ds_GPS)){
  oneTrial <- ds_GPS[i,]	
  # WLY <- oneTrial$NPK/3*1000 # convert to kg.ha
  WLY <- oneTrial$DM_NPK # is in kg/ha dry root
  
  addedFertilizer <- as.data.frame(matrix(nrow=4,ncol=3))
  colnames(addedFertilizer) <- c("FN", "FP", "FK")
  addedFertilizer[,1] <- c(0, 0, 150, 150)
  addedFertilizer[,2] <- c(0, 40, 0, 40)
  addedFertilizer[,3] <- c(0, 180, 180, 0)	
  
  YieldMatrix <- as.data.frame(matrix(nrow=4,ncol=2))
  colnames(YieldMatrix) <- c("treatment", "NOT_yield")
  YieldMatrix[,1] <- c("Control", "PK", "NK", "NP")
  YieldMatrix[,2] <- c(oneTrial$DM_CON, oneTrial$DM_PK, oneTrial$DM_NK, oneTrial$DM_NP)
  
  ## optimizing by min TSS. 
  soil_NPK <- optim(par=c(0,0,0), optim_INS, lower=c(0.1, 0.1, 0.1), method = "L-BFGS-B", addedFertilizer=addedFertilizer, YieldMatrix=YieldMatrix, WLY=WLY)$par	
  
  if(!is.na(oneTrial$Control)){
    Yield_control <- Yield_S(SN = addedFertilizer[1,1] + soil_NPK[1] , SP = addedFertilizer[1,2] + soil_NPK[2], 
                             SK = addedFertilizer[1,3] + soil_NPK[3], WLY = WLY)
    oneTrial$Control_observed <- oneTrial$DM_CON
    oneTrial$Control_estimated <-  Yield_control
  }else{
    oneTrial$Control_observed <- NA
    oneTrial$Control_estimated <-  NA
  }
  
  
  validation_result <- rbind(validation_result, oneTrial)
  
  snpk <- oneTrial
  snpk$soilN <- soil_NPK[1]
  snpk$soilP <- soil_NPK[2]
  snpk$soilK <- soil_NPK[3]
  soilNPK <- rbind(soilNPK, snpk)
}

head(validation_result)

saveRDS(validation_result, "validation_result_isric.RDS")
saveRDS(soilNPK, "soilNPK_isrics.rds")

saveRDS(validation_result, "validation_result_isda.RDS")
saveRDS(soilNPK, "soilNPK_isda.rds")

validation_result <- readRDS("validation_result_isric.RDS")

ggcass <- ggplot(validation_result, aes(Control_estimated, Control_observed, col=Ctrial))+
  geom_point()+ 
  geom_abline(intercept=0, slope=1)+
  xlim(0,20000) + ylim(0,20000)+
  #geom_smooth(method=lm)+
  # stat_density2d()+ 
  xlab(" Control yield estimated through QUEFTS")+ ylab("Control yeild Observed (BLUPS)")+
 theme_bw() +
  theme(axis.text  = element_text(size=10), 
        axis.title = element_text(size=15), strip.text = element_text(size=12), 
        legend.text =element_text(size=15), legend.title = element_blank())



ggsave("Cassava_backQUEFTS_noDensity.png", ggcass, width = 8, height = 6)





######################################################################################
## there is no planting and harvest dates to run LINTUL and check for WLY vs NPK yield
## first Random forest and then check if NG and TZ models can be used for CIALCA data as well.
#######################################################################################
setwd("/home/akilimo/projects/SoilNPK/Rwanda")
soilNPK <- readRDS("soilNPK_isrics.rds")
head(soilNPK)
summary(soilNPK)
quantile(soilNPK$NPK, probs=seq(0,1,0.01))
par(mfrow=c(2,2))
hist(soilNPK$NPK)
hist(soilNPK$soilN)
hist(soilNPK$soilP)
hist(soilNPK$soilK)


GIS_soilINS_modData2 <- soilNPK[, c("soilN", "soilP", "soilK", "exchK", "P_Mehlich", "Clay_5","Clay_15","Clay_30","percentSOM_5","percentSOM_15","percentSOM_30",
                                       "pH_5","pH_15","pH_30", "silt_5", "silt_15", "silt_30","BD_5", "BD_15", "BD_30", "CEC_5","CEC_15","CEC_30", "percentSOC_5",
                                       "percentSOC_15", "percentSOC_30", "FC_5", "FC_15", "FC_30", "wp_5", "wp_15", "wp_30", "sws_5", "sws_15", "sws_30",
                                       "TotalN", "Mn","B", "Ca","Fe", "Cu","Al", "Mg", "Na","ncluster", "Country", "Control")]


soilNPK_isad <- readRDS("soilNPK_isda.rds")
str(soilNPK_isad)
GIS_soilINS_modDataisda <- soilNPK_isad[, c("soilN", "soilP", "soilK","aluminium_extractable_0_20", "aluminium_extractable_20_50",
                                            "bedrock_depth_0_200", "bulk_density_0_20","bulk_density_20_50", "calcium_extractable_0_20", "calcium_extractable_20_50",
                                            "carbon_organic_0_20", "carbon_organic_20_50", "carbon_total_0_20", "carbon_total_20_50", "cation_exchange_capacity_0_20",
                                            "cation_exchange_capacity_20_50","clay_content_0_20","clay_content_20_50","iron_extractable_0_20","iron_extractable_20_50",
                                            "magnesium_extractable_0_20", "magnesium_extractable_20_50", "nitrogen_total_0_20", "nitrogen_total_20_50",
                                            "ph_0_20", "ph_20_50", "phosphorous_extractable_0_20", "phosphorous_extractable_20_50","potassium_extractable_0_20",
                                            "potassium_extractable_20_50", "sand_content_0_20","sand_content_20_50", "silt_content_0_20", "silt_content_20_50",
                                            "stone_content_0_20", "stone_content_20_50", "sulphur_extractable_0_20", "sulphur_extractable_20_50", "texture_class_0_20",
                                            "texture_class_20_50", "zinc_extractable_0_20", "zinc_extractable_20_50", "slope_angle", "fcc", "land_cover_2015", 
                                            "land_cover_2016","land_cover_2017", "land_cover_2018","land_cover_2019", "soilOM_0_20", "soilOM_20_50",
                                            "percentSOM_0_20", "percentSOM_20_50", "wp_0_20", "wp_20_50", "FC_0_20", "FC_20_50", "sws_0_20", "sws_20_50",
                                            "Country", "Control")]
str(GIS_soilINS_modDataisda)


GIS_soilINS_modData2$Clay_5 <- as.numeric(GIS_soilINS_modData2$Clay_5)
GIS_soilINS_modData2$Clay_15 <- as.numeric(GIS_soilINS_modData2$Clay_15)
GIS_soilINS_modData2$Clay_30 <- as.numeric(GIS_soilINS_modData2$Clay_30)
GIS_soilINS_modData2$silt_5 <- as.numeric(GIS_soilINS_modData2$silt_5)
GIS_soilINS_modData2$silt_15 <- as.numeric(GIS_soilINS_modData2$silt_15)
GIS_soilINS_modData2$silt_30 <- as.numeric(GIS_soilINS_modData2$silt_30)
GIS_soilINS_modData2$BD_5 <- as.numeric(GIS_soilINS_modData2$BD_5)
GIS_soilINS_modData2$BD_15 <- as.numeric(GIS_soilINS_modData2$BD_15)
GIS_soilINS_modData2$BD_30 <- as.numeric(GIS_soilINS_modData2$BD_30)
GIS_soilINS_modData2$CEC_5 <- as.numeric(GIS_soilINS_modData2$CEC_5)
GIS_soilINS_modData2$CEC_15 <- as.numeric(GIS_soilINS_modData2$CEC_15)
GIS_soilINS_modData2$CEC_30 <- as.numeric(GIS_soilINS_modData2$CEC_30)
GIS_soilINS_modData2$TotalN <- as.numeric(GIS_soilINS_modData2$TotalN)
GIS_soilINS_modData2$Mn <- as.numeric(GIS_soilINS_modData2$Mn)
GIS_soilINS_modData2$B <- as.numeric(GIS_soilINS_modData2$B)
GIS_soilINS_modData2$Ca <- as.numeric(GIS_soilINS_modData2$Ca)
GIS_soilINS_modData2$Fe <- as.numeric(GIS_soilINS_modData2$Fe)
GIS_soilINS_modData2$Cu <- as.numeric(GIS_soilINS_modData2$Cu)
GIS_soilINS_modData2$Al <- as.numeric(GIS_soilINS_modData2$Al)
GIS_soilINS_modData2$Mg <- as.numeric(GIS_soilINS_modData2$Mg)
GIS_soilINS_modData2$Na <- as.numeric(GIS_soilINS_modData2$Na)
GIS_soilINS_modData2$ncluster <- as.factor(GIS_soilINS_modData2$ncluster)
GIS_soilINS_modData2$Control <- as.numeric(GIS_soilINS_modData2$Control)
GIS_soilINS_modData2$Country  <- as.factor(GIS_soilINS_modData2$Country )

### create classes of control
GIS_soilINS_modData2$Control <- GIS_soilINS_modData2$Control/1000
GIS_soilINS_modData2$CONclass <- ifelse(GIS_soilINS_modData2$Control < 7.5, "class1",
                                          ifelse(GIS_soilINS_modData2$Control >= 7.5 & GIS_soilINS_modData2$Control < 15, "class2",
                                                 ifelse(GIS_soilINS_modData2$Control >= 15 & GIS_soilINS_modData2$Control < 22.5, "class3",
                                                        ifelse(GIS_soilINS_modData2$Control >= 22.5 & GIS_soilINS_modData2$Control < 30, "class4", "class5"))))
GIS_soilINS_modData2$CONclass <- as.factor(GIS_soilINS_modData2$CONclass)

### create classes of control
GIS_soilINS_modDataisda$Control <- GIS_soilINS_modDataisda$Control/1000
GIS_soilINS_modDataisda$CONclass <- ifelse(GIS_soilINS_modDataisda$Control < 7.5, "class1",
                                        ifelse(GIS_soilINS_modDataisda$Control >= 7.5 & GIS_soilINS_modDataisda$Control < 15, "class2",
                                               ifelse(GIS_soilINS_modDataisda$Control >= 15 & GIS_soilINS_modDataisda$Control < 22.5, "class3",
                                                      ifelse(GIS_soilINS_modDataisda$Control >= 22.5 & GIS_soilINS_modDataisda$Control < 30, "class4", "class5"))))

GIS_soilINS_modDataisda$CONclass <- as.factor(GIS_soilINS_modDataisda$CONclass)
GIS_soilINS_modDataisda$Country <- as.factor(GIS_soilINS_modDataisda$Country)
GIS_soilINS_modDataisda <- GIS_soilINS_modDataisda[complete.cases(GIS_soilINS_modDataisda), ]


### Data partioning take 70:30 by country

### Data partioning take 70:30 by country
#'function to split the data by country to training and test datasets
#' @param probs a vector of probability weights for obtaining the elements of the vector being sampled.
#' @param soil_BLUPSData data frame with soil variables and the BLUPS for yield for the control treatment 
splitdata <- function(probs=c(0.7,0.3), soil_BLUPSData){
  set.seed(444)
  trianData <- NULL
  testDatA <- NULL
  for(country in unique(soil_BLUPSData$Country)){
    Country_data <- droplevels(soil_BLUPSData[soil_BLUPSData$Country == country,])
    indexCountry <- sample(2, nrow(Country_data), replace=TRUE, prob=probs)## where conrtol yield is used as a covariate
    trainDataCountry <- Country_data[indexCountry == 1, ]
    testDataCountry <- Country_data[indexCountry == 2, ]
    trianData <- rbind(trianData, trainDataCountry) 
    testDatA <- rbind(testDatA, testDataCountry)
  }
  return(list(trianData, testDatA))
}


traintestData <- splitdata(soil_BLUPSData = GIS_soilINS_modData2)
trainData <- traintestData[[1]]
testData <- traintestData[[2]]

traintestData_isda <- splitdata(soil_BLUPSData = GIS_soilINS_modDataisda)
trainDataisda <- traintestData_isda[[1]]
testDataisda <- traintestData_isda[[2]]



Ndata_Train  <- subset(trainData, select=-c(soilP, soilK))
Pdata_Train  <- subset(trainData, select=-c(soilN, soilK))
Kdata_Train  <- subset(trainData, select=-c(soilN, soilP))

Ndata_Valid  <- subset(testData, select=-c(soilP, soilK))
Pdata_Valid  <- subset(testData, select=-c(soilN, soilK))
Kdata_Valid  <- subset(testData, select=-c(soilN, soilP))


Ndata_Train_isda  <- subset(trainDataisda, select=-c(soilP, soilK))
Pdata_Train_isda  <- subset(trainDataisda, select=-c(soilN, soilK))
Kdata_Train_isda  <- subset(trainDataisda, select=-c(soilN, soilP))

Ndata_Valid_isda  <- subset(testDataisda, select=-c(soilP, soilK))
Pdata_Valid_isda  <- subset(testDataisda, select=-c(soilN, soilK))
Kdata_Valid_isda  <- subset(testDataisda, select=-c(soilN, soilP))



## Coustome control parameter 
#custom <- trainControl(method="repeatedcv", number=10, repeats=5, verboseIter=TRUE)
custom <- trainControl(method="oob", number=10)
##########################################################################
## Random Forest soilN: Rsq with control = 0.67 and with CONclass = 0.48
##########################################################################
## using isric
set.seed(773)
RF_N1 <- randomForest(soilN ~ ., subset(Ndata_Train, select=-c(Control)), importance=TRUE, ntree=1000)
# RF_N1 <- randomForest(sqrt(soilN) ~ ., subset(Ndata_Train, select=-c(CONclass)), importance=TRUE, ntree=1000)
Ndata_Valid$predN <- predict(RF_N1, Ndata_Valid)
sst <- sum((Ndata_Valid$soilN - mean(Ndata_Valid$soilN))^2)
sse <- sum((Ndata_Valid$soilN - Ndata_Valid$predN)^2)
rsq <- 1 - sse / sst
rsq
varImpPlot(RF_N1)
varImpPlot(RF_N1)
varimportance <- data.frame(RF_N1$importance)
varimportance$vars <- rownames(varimportance)
varimportance <- varimportance[order(varimportance$X.IncMSE, decreasing = TRUE), ]
rownames(varimportance) <- NULL
varimportance$Importance <- c(1:nrow(varimportance))
write.csv(varimportance[, c('vars', 'Importance')], "RFvars_soilN_isric.csv", row.names = FALSE)

ggn <- ggplot(Ndata_Valid, aes(soilN, predN, col=Country)) +
  geom_point()+
  geom_abline(slope = 1, intercept = 0) +
  xlim(0,250) + ylim(0,250)+
  xlab("soil N from QUEFTS optimization")+
  ylab("perdicted soil N from GIS data")+
  theme_bw()
ggsave("QUEFTS_GIS_RF_soilN_isric.pdf", ggn, width=5, height=5)

## using isda (R sq = 0.63 with contorlClass and 0.65 )
set.seed(773) 
RF_N1_isda <- randomForest(soilN ~ ., subset(Ndata_Train_isda, select=-c(Control)), importance=TRUE, ntree=1000)
# RF_N1_isda <- randomForest(soilN ~ ., subset(Ndata_Train_isda, select=-c(CONclass)), importance=TRUE, ntree=1000)
Ndata_Valid_isda$predN <- predict(RF_N1_isda, Ndata_Valid_isda)
sst <- sum((Ndata_Valid_isda$soilN - mean(Ndata_Valid_isda$soilN))^2)
sse <- sum((Ndata_Valid_isda$soilN - Ndata_Valid_isda$predN)^2)
rsq <- 1 - sse / sst
rsq
varImpPlot(RF_N1_isda)
varImpPlot(RF_N1_isda)
varimportance <- data.frame(RF_N1_isda$importance)
varimportance$vars <- rownames(varimportance)
varimportance <- varimportance[order(varimportance$X.IncMSE, decreasing = TRUE), ]
rownames(varimportance) <- NULL
varimportance$Importance <- c(1:nrow(varimportance))
write.csv(varimportance[, c('vars', 'Importance')], "RFvars_soilN_isda.csv", row.names = FALSE)

ggn <- ggplot(Ndata_Valid_isda, aes(soilN, predN, col=Country)) +
  geom_point()+
  geom_abline(slope = 1, intercept = 0) +
  xlim(0,250) + ylim(0,250)+
  xlab("soil N from QUEFTS optimization")+
  ylab("perdicted soil N from GIS data")+
  theme_bw()
ggsave("QUEFTS_GIS_RF_soilN_isda.pdf", ggn, width=5, height=5)



##########################################################################
## Random Forest soilP: 0.77 and without CONclass = 0.50
##########################################################################
set.seed(773)
RF_P1 <- randomForest(soilP ~ ., subset(Pdata_Train, select = -c(Control)), importance=TRUE, ntree=1000)
# RF_P1 <- randomForest(soilP ~ ., subset(Pdata_Train, select=-c(CONclass)), importance=TRUE, ntree=1000)
Pdata_Valid$predP <- predict(RF_P1, Pdata_Valid)
sst <- sum((Pdata_Valid$soilP - mean(Pdata_Valid$soilP))^2)
sse <- sum((Pdata_Valid$soilP - Pdata_Valid$predP)^2)
rsq <- 1 - sse / sst
rsq


varImpPlot(RF_P1)
varimportance <- data.frame(RF_P1$importance)
varimportance$vars <- rownames(varimportance)
varimportance <- varimportance[order(varimportance$X.IncMSE, decreasing = TRUE), ]
rownames(varimportance) <- NULL
varimportance$Importance <- c(1:nrow(varimportance))
write.csv(varimportance[, c('vars', 'Importance')], "RFvars_soilP_isric.csv", row.names = FALSE)


ggp <- ggplot(Pdata_Valid, aes(soilP, predP, col=Country)) +
  geom_point()+
  geom_abline(slope = 1, intercept = 0) +
  xlim(0,150) + ylim(0,150)+
  xlab("soil P from QUEFTS optimization")+
  ylab("perdicted soil P from GIS data")+
  theme_bw()
ggsave("QUEFTS_GIS_RF_soilP_isric.pdf", ggp, width=5, height=5)


set.seed(773)
RF_P1 <- randomForest(soilP ~ ., subset(Pdata_Train_isda, select = -c(Control)), importance=TRUE, ntree=1000)
# RF_P1 <- randomForest(soilP ~ ., subset(Pdata_Train_isda, select=-c(CONclass)), importance=TRUE, ntree=1000)
Pdata_Valid_isda$predP <- predict(RF_P1, Pdata_Valid_isda)
sst <- sum((Pdata_Valid_isda$soilP - mean(Pdata_Valid_isda$soilP))^2)
sse <- sum((Pdata_Valid_isda$soilP - Pdata_Valid_isda$predP)^2)
rsq <- 1 - sse / sst
rsq


varImpPlot(RF_P1)
varimportance <- data.frame(RF_P1$importance)
varimportance$vars <- rownames(varimportance)
varimportance <- varimportance[order(varimportance$X.IncMSE, decreasing = TRUE), ]
rownames(varimportance) <- NULL
varimportance$Importance <- c(1:nrow(varimportance))
write.csv(varimportance[, c('vars', 'Importance')], "RFvars_soilP_isda.csv", row.names = FALSE)


ggp <- ggplot(Pdata_Valid_isda, aes(soilP, predP, col=Country)) +
  geom_point()+
  geom_abline(slope = 1, intercept = 0) +
  xlim(0,150) + ylim(0,150)+
  xlab("soil P from QUEFTS optimization")+
  ylab("perdicted soil P from GIS data")+
  theme_bw()
ggsave("QUEFTS_GIS_RF_soilP_isda.pdf", ggp, width=5, height=5)


##########################################################################
## Random Forest soilK" R sq. 0.59 and without control = 0.19
##########################################################################
set.seed(773)
RF_K1 <- randomForest(soilK ~ ., subset(Kdata_Train, select = -c(Control)), importance=TRUE, ntree=1000)
# RF_K1 <- randomForest(soilK ~ ., subset(Kdata_Train, select=-c(CONclass)), importance=TRUE, ntree=1000)
Kdata_Valid$predK <- predict(RF_K1, Kdata_Valid)
sst <- sum((Kdata_Valid$soilK - mean(Kdata_Valid$soilK))^2)
sse <- sum((Kdata_Valid$soilK - Kdata_Valid$predK)^2)
rsq <- 1 - sse / sst
rsq


varImpPlot(RF_K1)
varimportance <- data.frame(RF_P1$importance)
varimportance$vars <- rownames(varimportance)
varimportance <- varimportance[order(varimportance$X.IncMSE, decreasing = TRUE), ]
rownames(varimportance) <- NULL
varimportance$Importance <- c(1:nrow(varimportance))
write.csv(varimportance[, c('vars', 'Importance')], "RFvars_soilK_isric.csv", row.names = FALSE)


ggk <- ggplot(Kdata_Valid, aes(soilK, predK, col=Country)) +
  geom_point()+
  geom_abline(slope = 1, intercept = 0) +
  xlim(0,350) + ylim(0,350)+
  xlab("soil K from QUEFTS optimization")+
  ylab("perdicted soil K from GIS data")+
  theme_bw()
ggsave("QUEFTS_GIS_RF_soilK_isric.pdf", ggk, width=5, height=5)



set.seed(773)
RF_K1 <- randomForest(soilK ~ ., subset(Kdata_Train_isda, select = -c(Control)), importance=TRUE, ntree=1000)
# RF_K1 <- randomForest(soilK ~ ., subset(Kdata_Train_isda, select=-c(CONclass)), importance=TRUE, ntree=1000)
Kdata_Valid_isda$predK <- predict(RF_K1, Kdata_Valid_isda)
sst <- sum((Kdata_Valid_isda$soilK - mean(Kdata_Valid_isda$soilK))^2)
sse <- sum((Kdata_Valid_isda$soilK - Kdata_Valid_isda$predK)^2)
rsq <- 1 - sse / sst
rsq


varImpPlot(RF_K1)
varimportance <- data.frame(RF_P1$importance)
varimportance$vars <- rownames(varimportance)
varimportance <- varimportance[order(varimportance$X.IncMSE, decreasing = TRUE), ]
rownames(varimportance) <- NULL
varimportance$Importance <- c(1:nrow(varimportance))
write.csv(varimportance[, c('vars', 'Importance')], "RFvars_soilK_isda.csv", row.names = FALSE)


ggk <- ggplot(Kdata_Valid_isda, aes(soilK, predK, col=Country)) +
  geom_point()+
  geom_abline(slope = 1, intercept = 0) +
  xlim(0,350) + ylim(0,350)+
  xlab("soil K from QUEFTS optimization")+
  ylab("perdicted soil K from GIS data")+
  theme_bw()
ggsave("QUEFTS_GIS_RF_soilK_isda.pdf", ggk, width=5, height=5)



##########################################################################
## use the random forest model and get the soil NPK estimates for the whole area
##########################################################################
## define the study area for the three countries, get ISRIC data
## the whole Rwanda and the whole Burundi, and then the Sud-Kivu province for DRC. Though large parts of Sud-Kivu are forest and uninhabitated…


## get GPS readings
setwd("/home/akilimo/lintul/dataSources/GIS_layers")



## prvides with the list of regions or states for user to select the AOI within shiny app and the selections will be provided to create coordinates for the area
## level should be Level1 or Level2.
## country can be:  Tanzania, Nigeria, Ethiopia, Kenya,Rwanda, Burundi, Ghana
define_regions <- function(country, level){
  setwd("/home/akilimo/lintul/dataSources/GIS_layers")
  if(country == "Tanzania" & level == "Level1"){
    C_Region <- readOGR(dsn=getwd(), layer="gadm36_TZA_1")
    levelsList <- unique(C_Region$NAME_1)
  }else if(country == "Tanzania" & level == "Level2"){
    C_Region <- readOGR(dsn=getwd(), layer="gadm36_TZA_2")
    levelsList <- unique(C_Region$NAME_2)
  }else if(country == "Nigeria" & level == "Level1"){
    C_Region <- readOGR(dsn=getwd(), layer="gadm36_NGA_1")
    levelsList <- unique(C_Region$NAME_1)
  }else if(country == "Nigeria" & level == "Level2"){
    C_Region <- readOGR(dsn=getwd(), layer="gadm36_NGA_2")
    levelsList <- unique(C_Region$NAME_2)
  }else if(country == "Ethiopia" & level == "Level1"){
    C_Region <- readOGR(dsn=getwd(), layer="gadm36_ETH_1")
    levelsList <- unique(C_Region$NAME_1)
  }else if(country == "Ethiopia" & level == "Level2"){
    C_Region <- readOGR(dsn=getwd(), layer="gadm36_ETH_2")
    levelsList <- unique(C_Region$NAME_2)
  }else if(country == "Burundi" & level == "Level1"){
    C_Region <- readOGR(dsn=getwd(), layer="gadm36_BDI_1")
    levelsList <- unique(C_Region$NAME_1)
  }else if(country == "Burundi" & level == "Level2"){
    C_Region <- readOGR(dsn=getwd(), layer="gadm36_BDI_2")
    levelsList <- unique(C_Region$NAME_2)
  }else if(country == "Kenay" & level == "Level1"){
    C_Region <- readOGR(dsn=getwd(), layer="gadm36_KEN_1")
    levelsList <- unique(C_Region$NAME_1)
  }else if(country == "Kenay" & level == "Level2"){
    C_Region <- readOGR(dsn=getwd(), layer="gadm36_KEN_2")
    levelsList <- unique(C_Region$NAME_2)
  }else if(country == "Ghana" & level == "Level1"){
    C_Region <- readOGR(dsn=getwd(), layer="gadm36_GHA_1")
    levelsList <- unique(C_Region$NAME_1)
  }else if(country == "Ghana" & level == "Level2"){
    C_Region <- readOGR(dsn=getwd(), layer="gadm36_GHA_2")
    levelsList <- unique(C_Region$NAME_2)
  }else if(country == "Rwanda" & level == "Level1"){
    C_Region <- readOGR(dsn=getwd(), layer="gadm36_RWA_1")
    levelsList <- unique(C_Region$NAME_1)
  }else if(country == "Rwanda" & level == "Level2"){
    C_Region <- readOGR(dsn=getwd(), layer="gadm36_RWA_2")
    levelsList <- unique(C_Region$NAME_2)
  }else if(country == "DRC" & level == "Level1"){
    C_Region <- readOGR(dsn=getwd(), layer="gadm36_COD_1")
    levelsList <- unique(C_Region$NAME_2)
  }else if(country == "DRC" & level == "Level2"){
    C_Region <- readOGR(dsn=getwd(), layer="gadm36_COD_2")
    levelsList <- unique(C_Region$NAME_2)
  }
  return(levelsList)
}



### with the regions/states selected withini shiny app, it will create the lat and lon, state or region names to defeine the pixels within AOI
getCoordinates <- function(listLevel1="All", country){
  ### Country shape files obtained from https://gadm.org/download_country_v3.html
  setwd("/home/akilimo/lintul/dataSources/GIS_layers")
  if(country == "Nigeria"){
    level1 <- readOGR(dsn=getwd(), layer="gadm36_NGA_1")
    level2 <- readOGR(dsn=getwd(), layer="gadm36_NGA_2")
  }else if (country == "Tanzania"){
    level1 <- readOGR(dsn=getwd(), layer="gadm36_TZA_1")
    level2 <- readOGR(dsn=getwd(), layer="gadm36_TZA_2")
    level2 <- droplevels(level2[!level2$NAME_2 == "Lake Victoria", ])
  }else if (country == "Ethiopia"){
    level1 <- readOGR(dsn=getwd(), layer="gadm36_ETH_1")
    level2 <- readOGR(dsn=getwd(), layer="gadm36_ETH_2")
  }else if (country == "Ghana"){
    level1 <- readOGR(dsn=getwd(), layer="gadm36_GHA_1")
    level2 <- readOGR(dsn=getwd(), layer="gadm36_GHA_2")
  }else if (country == "Kenya"){
    level1 <- readOGR(dsn=getwd(), layer="gadm36_KEN_1")
    level2 <- readOGR(dsn=getwd(), layer="gadm36_KEN_2")
  }else if (country == "Rwanda"){
    level1 <- readOGR(dsn=getwd(), layer="gadm36_RWA_1")
    level2 <- readOGR(dsn=getwd(), layer="gadm36_RWA_2")
  }else if (country == "Burundi"){
    level1 <- readOGR(dsn=getwd(), layer="gadm36_BDI_1")
    level2 <- readOGR(dsn=getwd(), layer="gadm36_BDI_2")
  }else if (country == "DRC"){
    level1 <- readOGR(dsn=getwd(), layer="gadm36_COD_1")
    level2 <- readOGR(dsn=getwd(), layer="gadm36_COD_2")
  }
  
  
  if(listLevel1 != "All"){
    level2 <- level2[level2$NAME_1 %in% listLevel1, ] ## AKILIMO area within the country
  }
  plot(level1)
  plot(level2, add=TRUE, col="red")
  xmin <- extent(level2)[1]
  xmax <- extent(level2)[2]
  ymin <- extent(level2)[3]
  ymax <- extent(level2)[4]
  
  ## define a rectangualr area that covers the whole study area
  lon_coors <- seq(xmin - 0.05, xmax + 0.05, by=0.05)
  
  if(ymin > 0 & ymax > 0){
    lat_coors <- seq(ymin - 0.05, ymax + 0.05, by=0.05)
  }else if (ymin > 0 & ymax < 0){
    lat_coors_pos <- seq(ymin - 0.05, 0, by=0.05)
    lat_coors_neg <- (seq(-0.025, ymax - 0.05, by=0.05))*-1
    lat_coors <- c(lat_coors_pos, lat_coors_neg)
  }else if (ymin < 0 & ymax < 0){
    lat_coors <- (seq(ymin - 0.05, ymax + 0.05, by=0.05))
  }
  
  rect_coord <- as.data.frame(expand.grid(Longitude = lon_coors, Latitude = lat_coors)) ## this is a rectangular region need to be cropped
  head(rect_coord)
  

  rect_coord$lat <- floor(rect_coord$Latitude*10)/10 + ifelse(rect_coord$Latitude-(floor(rect_coord$Latitude*10)/10) < 0.05, 0.025, 0.075)
  rect_coord$long <- floor(rect_coord$Longitude*10)/10 + ifelse(rect_coord$Longitude-(floor(rect_coord$Longitude*10)/10) < 0.05, 0.025, 0.075)
  rect_coord <- unique(rect_coord[,c("long", "lat")])

  Districtadded <- NULL
  for(k in 1:nrow(rect_coord)){
    print(k)
    p <- rect_coord[k,]
    Q <- as.data.frame(raster::extract(level2, p))
    if(!is.na(Q$NAME_1)){
      Districtadded <- rbind(Districtadded, cbind(p, Q[, c("NAME_1", "NAME_2")]))
    }
  }
  
  if(listLevel1 != "All"){
    Districtadded2 <- droplevels(Districtadded[Districtadded$NAME_1 %in% listLevel1, ])
  }
  Districtadded2$location <- paste(Districtadded2$lat, Districtadded2$long, sep="_")
  
  return(Districtadded2)
}


TZ_regions <- c( "Kagera","Mara","Mwanza","Simiyu", "Geita","Kigoma","Shinyanga","Tanga", "Zanzibar North","Pwani",
                 "Zanzibar West","Zanzibar South and Central", "Lindi", "Mtwara", "Pemba South", "Pemba North")

NG_states <- c("Akwa Ibom", "Cross River", "Abia","Delta", "Imo" ,"Anambra", "Ebonyi", "Edo", "Ondo", "Enugu", "Ogun", "Benue", 
                "Taraba", "Kogi", "Osun", "Oyo", "Ekiti",  "Kwara")


RW_GPS <- getCoordinates(listLevel1="All", country = "Rwanda")##828
BU_GPS <- getCoordinates(listLevel1="All", country = "Burundi")##869
DRC_GPS <- getCoordinates(listLevel1="Sud-Kivu", country = "DRC")##2094
TZ_GPS <- getCoordinates(listLevel1=TZ_regions, country = "Tanzania")##10136
NG_GPS <- getCoordinates(listLevel1=NG_states, country = "Nigeria")


setwd("/home/akilimo/projects/SoilNPK/Coordinates")
write.csv(RW_GPS, "Rwanda_Coordinates.csv", row.names = FALSE)
write.csv(BU_GPS, "Burundi_Coordinates.csv", row.names = FALSE)
write.csv(DRC_GPS, "DRC_Coordinates.csv", row.names = FALSE)
write.csv(TZ_GPS, "Tanzania_Coordinates.csv", row.names = FALSE)
write.csv(NG_GPS, "Nigeria_Coordinates.csv", row.names = FALSE)


NG_GPS <- read.csv("Nigeria_Coordinates.csv")
TZ_GPS <- read.csv("Tanzania_Coordinates.csv")
DRC_GPS <- read.csv("DRC_Coordinates.csv")
BU_GPS <- read.csv("Burundi_Coordinates.csv")
RW_GPS <- read.csv("Rwanda_Coordinates.csv")


head(RW_GPS)
head(BU_GPS)
head(DRC_GPS)
head(TZ_GPS)
head(NG_GPS)


#######################################################################################
## get the ISRIC data for the whole AOI by country
######################################################################################
setwd("/home/akilimo/lintul/lintul")
source("sourceISRIC.R")

get_ISIRIC_AOI <- function(AOI_GPS){
  AOI_ISRIC <- unique(AOI_GPS[, c("long","lat")])
  names(AOI_ISRIC) <- c("lon", "lat")
 
  ISRIC_AOI <- NULL
  for(i in 1:nrow(AOI_ISRIC)){
    onePoint <- getISRICData(lat = AOI_ISRIC$lat[i], lon = AOI_ISRIC$lon[i])
    ISRIC_AOI <- rbind(ISRIC_AOI, onePoint)
  }
  return(ISRIC_AOI)
}



## RWANDA 828
RW_AOI_ISRIC <- get_ISIRIC_AOI(AOI_GPS = RW_GPS)
## BURUNDI 828
BU_AOI_ISRIC <- get_ISIRIC_AOI(AOI_GPS = BU_GPS)
## BURUNDI 828
DRC_AOI_ISRIC <- get_ISIRIC_AOI(AOI_GPS = DRC_GPS)
## BURUNDI 828
TZ_AOI_ISRIC <- get_ISIRIC_AOI(AOI_GPS = TZ_GPS)
## BURUNDI 828
NG_AOI_ISRIC <- get_ISIRIC_AOI(AOI_GPS = NG_GPS)


setwd("/home/akilimo/projects/SoilNPK/dataSource/ISRICData")
saveRDS(RW_AOI_ISRIC, "RW_AOI_ISRIC.RDS")
saveRDS(BU_AOI_ISRIC, "BU_AOI_ISRIC.RDS")
saveRDS(DRC_AOI_ISRIC, "DRC_AOI_ISRIC.RDS")
saveRDS(TZ_AOI_ISRIC, "TZ_AOI_ISRIC.csv")
saveRDS(NG_AOI_ISRIC, "NG_AOI_ISRIC.csv")



################### using iSDA data get the soil NPK for the target area

setwd("/home/akilimo/projects/SoilNPK/Coordinates")
RW_isda_H2O2dapth <- readRDS("RW_isda_H2O2dapth.RDS")
BU_isda_H2O2dapth <- readRDS("BU_isda_H2O2dapth.RDS")
DRC_isda_H2O2dapth <- readRDS("DRC_isda_H2O2dapth.RDS")

Burundi_Coor <- read.csv("Burundi_Coordinates.csv")
DRC_Coor <- read.csv("DRC_Coordinates.csv")

Burundi_Coor$location <- paste(Burundi_Coor$long, Burundi_Coor$lat, sep="_")
DRC_Coor$location <- paste(DRC_Coor$long, DRC_Coor$lat, sep="_")

BU_isda_H2O2dapth <- unique(merge(BU_isda_H2O2dapth, Burundi_Coor, by.x=c("lon","lat","location"),by.y=c("long","lat","location")))
DRC_isda_H2O2dapth <- unique(merge(DRC_isda_H2O2dapth, DRC_Coor, by.x=c("lon","lat","location"),by.y=c("long","lat","location")))

BU_isda_H2O2dapth <- BU_isda_H2O2dapth[, colnames(RW_isda_H2O2dapth)]
DRC_isda_H2O2dapth <- DRC_isda_H2O2dapth[, colnames(RW_isda_H2O2dapth)]

RW_isda_H2O2dapth$Country <- "Rw"
BU_isda_H2O2dapth$Country <- "BI"
DRC_isda_H2O2dapth$Country <- "DRC"

RWBuRDC_iSDA <- rbind(RW_isda_H2O2dapth, BU_isda_H2O2dapth, DRC_isda_H2O2dapth)
length(unique(RWBuRDC_iSDA$location))##3557

length(unique(RW_isda_H2O2dapth$location))##828
length(unique(BU_isda_H2O2dapth$location))##805
length(unique(DRC_isda_H2O2dapth$location))##1924

setwd("/home/akilimo/projects/SoilNPK/Rwanda")
soilNPK <- readRDS("soilNPK_isda.rds")
head(soilNPK)

RWBuRDC_iSDA$soilN <- 0
RWBuRDC_iSDA$soilP <- 0
RWBuRDC_iSDA$soilK <- 0
str(RWBuRDC_iSDA)
RWBuRDC_iSDA$texture_class_0_20 <- as.factor(RWBuRDC_iSDA$texture_class_0_20)



GIS_soilINS_modDataisda <- soilNPK[, c("soilN", "soilP", "soilK","aluminium_extractable_0_20", "aluminium_extractable_20_50",
                                            "bedrock_depth_0_200", "bulk_density_0_20","bulk_density_20_50", "calcium_extractable_0_20", "calcium_extractable_20_50",
                                            "carbon_organic_0_20", "carbon_organic_20_50", "carbon_total_0_20", "carbon_total_20_50", "cation_exchange_capacity_0_20",
                                            "cation_exchange_capacity_20_50","clay_content_0_20","clay_content_20_50","iron_extractable_0_20","iron_extractable_20_50",
                                            "magnesium_extractable_0_20", "magnesium_extractable_20_50", "nitrogen_total_0_20", "nitrogen_total_20_50",
                                            "ph_0_20", "ph_20_50", "phosphorous_extractable_0_20", "phosphorous_extractable_20_50","potassium_extractable_0_20",
                                            "potassium_extractable_20_50", "sand_content_0_20","sand_content_20_50", "silt_content_0_20", "silt_content_20_50",
                                            "stone_content_0_20", "stone_content_20_50", "sulphur_extractable_0_20", "sulphur_extractable_20_50", "texture_class_0_20",
                                            "texture_class_20_50", "zinc_extractable_0_20", "zinc_extractable_20_50", "slope_angle", "fcc", "land_cover_2015", 
                                            "land_cover_2016","land_cover_2017", "land_cover_2018","land_cover_2019", "soilOM_0_20", "soilOM_20_50",
                                            "percentSOM_0_20", "percentSOM_20_50", "wp_0_20", "wp_20_50", "FC_0_20", "FC_20_50", "sws_0_20", "sws_20_50",
                                            "Country", "Control")]
str(GIS_soilINS_modDataisda)
GIS_soilINS_modDataisda$Country  <- as.factor(GIS_soilINS_modDataisda$Country )

### create classes of control
GIS_soilINS_modDataisda$Control <- GIS_soilINS_modDataisda$Control/1000
GIS_soilINS_modDataisda$CONclass <- ifelse(GIS_soilINS_modDataisda$Control < 7.5, "class1",
                                        ifelse(GIS_soilINS_modDataisda$Control >= 7.5 & GIS_soilINS_modDataisda$Control < 15, "class2",
                                               ifelse(GIS_soilINS_modDataisda$Control >= 15 & GIS_soilINS_modDataisda$Control < 22.5, "class3",
                                                      ifelse(GIS_soilINS_modDataisda$Control >= 22.5 & GIS_soilINS_modDataisda$Control < 30, "class4", "class5"))))
GIS_soilINS_modDataisda$CONclass <- as.factor(GIS_soilINS_modDataisda$CONclass)

### create classes of control
GIS_soilINS_modDataisda$Control <- GIS_soilINS_modDataisda$Control/1000
GIS_soilINS_modDataisda$CONclass <- ifelse(GIS_soilINS_modDataisda$Control < 7.5, "class1",
                                           ifelse(GIS_soilINS_modDataisda$Control >= 7.5 & GIS_soilINS_modDataisda$Control < 15, "class2",
                                                  ifelse(GIS_soilINS_modDataisda$Control >= 15 & GIS_soilINS_modDataisda$Control < 22.5, "class3",
                                                         ifelse(GIS_soilINS_modDataisda$Control >= 22.5 & GIS_soilINS_modDataisda$Control < 30, "class4", "class5"))))

GIS_soilINS_modDataisda$CONclass <- as.factor(GIS_soilINS_modDataisda$CONclass)
GIS_soilINS_modDataisda$Country <- as.factor(GIS_soilINS_modDataisda$Country)
GIS_soilINS_modDataisda <- GIS_soilINS_modDataisda[complete.cases(GIS_soilINS_modDataisda), ]


### Data partioning take 70:30 by country

### Data partioning take 70:30 by country
#'function to split the data by country to training and test datasets
#' @param probs a vector of probability weights for obtaining the elements of the vector being sampled.
#' @param soil_BLUPSData data frame with soil variables and the BLUPS for yield for the control treatment 
splitdata <- function(probs=c(0.7,0.3), soil_BLUPSData){
  set.seed(444)
  trianData <- NULL
  testDatA <- NULL
  for(country in unique(soil_BLUPSData$Country)){
    Country_data <- droplevels(soil_BLUPSData[soil_BLUPSData$Country == country,])
    indexCountry <- sample(2, nrow(Country_data), replace=TRUE, prob=probs)## where conrtol yield is used as a covariate
    trainDataCountry <- Country_data[indexCountry == 1, ]
    testDataCountry <- Country_data[indexCountry == 2, ]
    trianData <- rbind(trianData, trainDataCountry) 
    testDatA <- rbind(testDatA, testDataCountry)
  }
  return(list(trianData, testDatA))
}


traintestData <- splitdata(soil_BLUPSData = GIS_soilINS_modData2)
trainData <- traintestData[[1]]
testData <- traintestData[[2]]

traintestData_isda <- splitdata(soil_BLUPSData = GIS_soilINS_modDataisda)
trainDataisda <- traintestData_isda[[1]]
testDataisda <- traintestData_isda[[2]]



Ndata_Train  <- subset(trainData, select=-c(soilP, soilK))
Pdata_Train  <- subset(trainData, select=-c(soilN, soilK))
Kdata_Train  <- subset(trainData, select=-c(soilN, soilP))

Ndata_Valid  <- subset(testData, select=-c(soilP, soilK))
Pdata_Valid  <- subset(testData, select=-c(soilN, soilK))
Kdata_Valid  <- subset(testData, select=-c(soilN, soilP))


Ndata_Train_isda  <- subset(trainDataisda, select=-c(soilP, soilK))
Pdata_Train_isda  <- subset(trainDataisda, select=-c(soilN, soilK))
Kdata_Train_isda  <- subset(trainDataisda, select=-c(soilN, soilP))

Ndata_Valid_isda  <- subset(testDataisda, select=-c(soilP, soilK))
Pdata_Valid_isda  <- subset(testDataisda, select=-c(soilN, soilK))
Kdata_Valid_isda  <- subset(testDataisda, select=-c(soilN, soilP))



## Coustome control parameter 
#custom <- trainControl(method="repeatedcv", number=10, repeats=5, verboseIter=TRUE)
custom <- trainControl(method="oob", number=10)
##########################################################################
## Random Forest soilN: Rsq with control = 0.67 and with CONclass = 0.48
##########################################################################
## using isric
set.seed(773)
RF_N1 <- randomForest(soilN ~ ., subset(Ndata_Train, select=-c(Control)), importance=TRUE, ntree=1000)
# RF_N1 <- randomForest(sqrt(soilN) ~ ., subset(Ndata_Train, select=-c(CONclass)), importance=TRUE, ntree=1000)
Ndata_Valid$predN <- predict(RF_N1, Ndata_Valid)
sst <- sum((Ndata_Valid$soilN - mean(Ndata_Valid$soilN))^2)
sse <- sum((Ndata_Valid$soilN - Ndata_Valid$predN)^2)
rsq <- 1 - sse / sst
rsq
varImpPlot(RF_N1)
varImpPlot(RF_N1)
varimportance <- data.frame(RF_N1$importance)
varimportance$vars <- rownames(varimportance)
varimportance <- varimportance[order(varimportance$X.IncMSE, decreasing = TRUE), ]
rownames(varimportance) <- NULL
varimportance$Importance <- c(1:nrow(varimportance))
write.csv(varimportance[, c('vars', 'Importance')], "RFvars_soilN_isric.csv", row.names = FALSE)

ggn <- ggplot(Ndata_Valid, aes(soilN, predN, col=Country)) +
  geom_point()+
  geom_abline(slope = 1, intercept = 0) +
  xlim(0,250) + ylim(0,250)+
  xlab("soil N from QUEFTS optimization")+
  ylab("perdicted soil N from GIS data")+
  theme_bw()
ggsave("QUEFTS_GIS_RF_soilN_isric.pdf", ggn, width=5, height=5)

## using isda (R sq = 0.63 with contorlClass and 0.65 )
set.seed(773) 
RF_N1_isda <- randomForest(soilN ~ ., subset(Ndata_Train_isda, select=-c(Control)), importance=TRUE, ntree=1000)
# RF_N1_isda <- randomForest(soilN ~ ., subset(Ndata_Train_isda, select=-c(CONclass)), importance=TRUE, ntree=1000)
Ndata_Valid_isda$predN <- predict(RF_N1_isda, Ndata_Valid_isda)
sst <- sum((Ndata_Valid_isda$soilN - mean(Ndata_Valid_isda$soilN))^2)
sse <- sum((Ndata_Valid_isda$soilN - Ndata_Valid_isda$predN)^2)
rsq <- 1 - sse / sst
rsq
varImpPlot(RF_N1_isda)
varImpPlot(RF_N1_isda)
varimportance <- data.frame(RF_N1_isda$importance)
varimportance$vars <- rownames(varimportance)
varimportance <- varimportance[order(varimportance$X.IncMSE, decreasing = TRUE), ]
rownames(varimportance) <- NULL
varimportance$Importance <- c(1:nrow(varimportance))
write.csv(varimportance[, c('vars', 'Importance')], "RFvars_soilN_isda.csv", row.names = FALSE)

ggn <- ggplot(Ndata_Valid_isda, aes(soilN, predN, col=Country)) +
  geom_point()+
  geom_abline(slope = 1, intercept = 0) +
  xlim(0,250) + ylim(0,250)+
  xlab("soil N from QUEFTS optimization")+
  ylab("perdicted soil N from GIS data")+
  theme_bw()
ggsave("QUEFTS_GIS_RF_soilN_isda.pdf", ggn, width=5, height=5)



##########################################################################
## Random Forest soilP: 0.77 and without CONclass = 0.50
##########################################################################
set.seed(773)
RF_P1 <- randomForest(soilP ~ ., subset(Pdata_Train, select = -c(Control)), importance=TRUE, ntree=1000)
# RF_P1 <- randomForest(soilP ~ ., subset(Pdata_Train, select=-c(CONclass)), importance=TRUE, ntree=1000)
Pdata_Valid$predP <- predict(RF_P1, Pdata_Valid)
sst <- sum((Pdata_Valid$soilP - mean(Pdata_Valid$soilP))^2)
sse <- sum((Pdata_Valid$soilP - Pdata_Valid$predP)^2)
rsq <- 1 - sse / sst
rsq


varImpPlot(RF_P1)
varimportance <- data.frame(RF_P1$importance)
varimportance$vars <- rownames(varimportance)
varimportance <- varimportance[order(varimportance$X.IncMSE, decreasing = TRUE), ]
rownames(varimportance) <- NULL
varimportance$Importance <- c(1:nrow(varimportance))
write.csv(varimportance[, c('vars', 'Importance')], "RFvars_soilP_isric.csv", row.names = FALSE)


ggp <- ggplot(Pdata_Valid, aes(soilP, predP, col=Country)) +
  geom_point()+
  geom_abline(slope = 1, intercept = 0) +
  xlim(0,150) + ylim(0,150)+
  xlab("soil P from QUEFTS optimization")+
  ylab("perdicted soil P from GIS data")+
  theme_bw()
ggsave("QUEFTS_GIS_RF_soilP_isric.pdf", ggp, width=5, height=5)


set.seed(773)
RF_P1 <- randomForest(soilP ~ ., subset(Pdata_Train_isda, select = -c(Control)), importance=TRUE, ntree=1000)
# RF_P1 <- randomForest(soilP ~ ., subset(Pdata_Train_isda, select=-c(CONclass)), importance=TRUE, ntree=1000)
Pdata_Valid_isda$predP <- predict(RF_P1, Pdata_Valid_isda)
sst <- sum((Pdata_Valid_isda$soilP - mean(Pdata_Valid_isda$soilP))^2)
sse <- sum((Pdata_Valid_isda$soilP - Pdata_Valid_isda$predP)^2)
rsq <- 1 - sse / sst
rsq


varImpPlot(RF_P1)
varimportance <- data.frame(RF_P1$importance)
varimportance$vars <- rownames(varimportance)
varimportance <- varimportance[order(varimportance$X.IncMSE, decreasing = TRUE), ]
rownames(varimportance) <- NULL
varimportance$Importance <- c(1:nrow(varimportance))
write.csv(varimportance[, c('vars', 'Importance')], "RFvars_soilP_isda.csv", row.names = FALSE)


ggp <- ggplot(Pdata_Valid_isda, aes(soilP, predP, col=Country)) +
  geom_point()+
  geom_abline(slope = 1, intercept = 0) +
  xlim(0,150) + ylim(0,150)+
  xlab("soil P from QUEFTS optimization")+
  ylab("perdicted soil P from GIS data")+
  theme_bw()
ggsave("QUEFTS_GIS_RF_soilP_isda.pdf", ggp, width=5, height=5)


##########################################################################
## Random Forest soilK" R sq. 0.59 and without control = 0.19
##########################################################################
set.seed(773)
RF_K1 <- randomForest(soilK ~ ., subset(Kdata_Train, select = -c(Control)), importance=TRUE, ntree=1000)
# RF_K1 <- randomForest(soilK ~ ., subset(Kdata_Train, select=-c(CONclass)), importance=TRUE, ntree=1000)
Kdata_Valid$predK <- predict(RF_K1, Kdata_Valid)
sst <- sum((Kdata_Valid$soilK - mean(Kdata_Valid$soilK))^2)
sse <- sum((Kdata_Valid$soilK - Kdata_Valid$predK)^2)
rsq <- 1 - sse / sst
rsq


varImpPlot(RF_K1)
varimportance <- data.frame(RF_P1$importance)
varimportance$vars <- rownames(varimportance)
varimportance <- varimportance[order(varimportance$X.IncMSE, decreasing = TRUE), ]
rownames(varimportance) <- NULL
varimportance$Importance <- c(1:nrow(varimportance))
write.csv(varimportance[, c('vars', 'Importance')], "RFvars_soilK_isric.csv", row.names = FALSE)


ggk <- ggplot(Kdata_Valid, aes(soilK, predK, col=Country)) +
  geom_point()+
  geom_abline(slope = 1, intercept = 0) +
  xlim(0,350) + ylim(0,350)+
  xlab("soil K from QUEFTS optimization")+
  ylab("perdicted soil K from GIS data")+
  theme_bw()
ggsave("QUEFTS_GIS_RF_soilK_isric.pdf", ggk, width=5, height=5)



set.seed(773)
RF_K1 <- randomForest(soilK ~ ., subset(Kdata_Train_isda, select = -c(Control)), importance=TRUE, ntree=1000)
# RF_K1 <- randomForest(soilK ~ ., subset(Kdata_Train_isda, select=-c(CONclass)), importance=TRUE, ntree=1000)
Kdata_Valid_isda$predK <- predict(RF_K1, Kdata_Valid_isda)
sst <- sum((Kdata_Valid_isda$soilK - mean(Kdata_Valid_isda$soilK))^2)
sse <- sum((Kdata_Valid_isda$soilK - Kdata_Valid_isda$predK)^2)
rsq <- 1 - sse / sst
rsq



#####################################################
### get the soil NPK for the whole area
#####################################################
setwd("/home/akilimo/projects/SoilNPK")
RW_AOI_ISRIC <- readRDS("RW_AOI_ISRIC.RDS")
BU_AOI_ISRIC <- readRDS("BU_AOI_ISRIC.RDS")
DRC_AOI_ISRIC <- readRDS("DRC_AOI_ISRIC.RDS")


