########################################################
# FUNCTIONS TO IDENTIFY ISSUES WITH ASSIGN FORM OUTPUT #
########################################################

findDupl <- function(ds, entity=c("FD","TL","PO","PA")){
  #SHORT DEF:   Function to find duplicate field, trial, plot or plant IDs.
  #RETURNS:     Character vector of IDs that occur more than once in the dataset, or NULL if no ID
  #             occurs more than once. Also prints the number of duplicate IDs found.
  #DESCRIPTION: Duplicate IDs can either occur because the Assign ODK form was completed more than once, 
  #             or the same barcode was printed twice and the same barcode ID was assigned to different 
  #             fields, trials, plots or plants. Missing (NA) IDs are not counted.
  #INPUT:       ds: (merged) output from an Assign ODK form
  #             entity: either one of FD (Assign Field), TL (Assign Trial to Field), PO (Assign Plots to
  #             Trial) or PA (Assign Plants to Plot). Defaults to FD.
  
  # Reduce entity to a single validated value: comparing the full default vector inside if()
  # is an error in current R versions.
  entity <- match.arg(entity)
  idCol <- switch(entity,
                  FD = "fieldID",
                  TL = "trialID",
                  PO = "plotID",
                  PA = "plantID")
  
  IDs <- ds[[idCol]]
  if (is.null(IDs)) {
    # Without this check a wrong entity/dataset combination would silently report 0 duplicates.
    stop("Column '", idCol, "' not found in ds (entity = '", entity, "').", call. = FALSE)
  }
  
  # table() drops NA values and orders the IDs; its names are the IDs as character strings.
  counts <- table(IDs)
  dupl <- names(counts)[as.vector(counts) > 1]
  if (length(dupl) == 0) {
    dupl <- NULL
  }
  print(paste(length(dupl), "duplicate IDs were found.", sep=" "))
  return(dupl)
}

findKillKey <- function(ID, ds, entity = c("FD", "TL", "PO", "PA"), maxDist=150){
  #SHORT DEF:   Function to identify entries with duplicate IDs that are assigned to the same higher  
  #             level entity ID (or same field based on distance for field IDs).
  #RETURNS:     Vector of ODK form KEYs to remove (KEY for FD and TL, KEY2 for PO and PA). Empty if
  #             nothing can be removed.
  #DESCRIPTION: The ds can then be reduced by removing the KEYs identified by this function.
  #             TL/PO/PA: among the entries carrying ID, the first entry of every distinct combination
  #             of identifying columns is kept and all later repeats are returned.
  #             FD: if all submissions carrying ID lie within maxDist of each other, the one with the
  #             smallest geopoint.Accuracy value is kept (first one on ties) and the others are returned.
  #INPUT:       ID: a field, trial, plot or plant ID that occurs more than once in the dataset
  #             (identified by findDupl).
  #             ds: (merged) output from an Assign ODK form
  #             entity: either one of FD (Assign Field), TL (Assign Trial to Field), PO (Assign Plots to
  #             Trial) or PA (Assign Plants to Plot). Defaults to FD.
  #             maxDist: maximum distance (in m) between GPS coordinates for submissions with duplicate
  #             fieldIDs to be considered as duplicates
  
  # Reduce entity to a single validated value: comparing the full default vector inside if()
  # is an error in current R versions.
  entity <- match.arg(entity)
  
  if (entity == "FD") {
    if (!requireNamespace("geosphere", quietly = TRUE)) {
      stop("Package 'geosphere' is required for entity = 'FD'.", call. = FALSE)
    }
    # %in% (rather than ==) so that NA fieldIDs do not produce all-NA rows.
    ss <- ds[ds$fieldID %in% ID, , drop = FALSE]
    if (nrow(ss) < 2) {
      # Zero or one submission: nothing to compare, nothing to remove.
      return(as.vector(ss$KEY[0]))
    }
    # geosphere expects points as (longitude, latitude), in that order.
    coords <- as.matrix(ss[, c("geopoint.Longitude", "geopoint.Latitude")])
    distMat <- geosphere::distm(coords, coords, fun = geosphere::distHaversine)
    if (max(distMat) < maxDist) {
      # All submissions are at the same location: keep the most accurate one only.
      best <- which.min(ss$geopoint.Accuracy)   # ignores NA, returns the first minimum
      if (length(best) == 0) {
        best <- 1L                              # no usable accuracy values: keep the first submission
      }
      killKEY <- ss$KEY[!ss$KEY %in% ss$KEY[best]]
    } else {
      # Flags the submissions for which no other submission is farther away than maxDist.
      # NOTE(review): this keeps the original selection rule unchanged; confirm that it is the
      # intended behaviour when submissions are spread over more than one location.
      killKEY <- ss$KEY[rowSums(distMat > maxDist, na.rm = TRUE) == 0]
    }
  } else {
    # Per entity: the ID column to filter on, the columns that define "the same assignment",
    # and the column holding the submission key.
    cfg <- switch(entity,
                  TL = list(id = "trialID", cols = c("fieldID", "trialID", "fieldbookID"), key = "KEY"),
                  PO = list(id = "plotID",  cols = c("trialID", "plotID", "treatCode"),    key = "KEY2"),
                  PA = list(id = "plantID", cols = c("plotID", "plantID"),                 key = "KEY2"))
    ss <- ds[ds[[cfg$id]] %in% ID, , drop = FALSE]
    keys <- ss[[cfg$key]]
    # Keys of the first occurrence of every distinct combination; these are retained.
    keepKEY <- keys[!duplicated(ss[, cfg$cols, drop = FALSE])]
    # %in% instead of ==: several keys may be retained, and == would recycle them against the
    # rows and wrongly flag entries that belong to a different parent.
    killKEY <- keys[!keys %in% keepKEY]
  }
  return(as.vector(killKEY))
}

removeDupl <- function(ds, entity = c("FD", "TL", "PO", "PA"), maxDist=150, printprog=FALSE){
  #SHORT DEF:   Function to remove entries from ds with duplicate primary IDs identified by findKillKey.
  #RETURNS:     Dataframe with duplicate entries removed. Also prints how many duplicate IDs were
  #             checked and how many entries were removed.
  #DESCRIPTION: After applying this function to the output of a (merged) Assign form, only duplicate 
  #             entries remain that need further attention and are likely printing errors (same barcode 
  #             to different entities).
  #INPUT:       ds: (merged) output from an Assign ODK form
  #             entity: either one of FD (Assign Field), TL (Assign Trial to Field), PO (Assign Plots to
  #             Trial) or PA (Assign Plants to Plot). Defaults to FD.
  #             maxDist: maximum distance (in m) passed on to findKillKey (used for entity FD)
  #             printprog: if TRUE, print a progress line after every duplicate ID that was checked
  
  # Reduce entity to a single validated value: comparing the full default vector inside if()
  # is an error in current R versions.
  entity <- match.arg(entity)
  
  NI <- nrow(ds)
  dupl <- findDupl(ds=ds, entity=entity)
  n <- length(dupl)
  
  # Collect the keys per duplicate ID in a preallocated list instead of growing a vector.
  kills <- vector("list", n)
  for (j in seq_len(n)) {
    # maxDist must be an argument of findKillKey(); it used to be passed to c() by mistake,
    # which ignored the user's value and appended it to the keys.
    kills[[j]] <- findKillKey(ID=dupl[j], ds=ds, entity=entity, maxDist=maxDist)
    if (printprog) {print(paste("Entry ", j, "/", n, " checked.", sep=""))}
  }
  killKEY <- unlist(kills, use.names = FALSE)
  
  # Field and trial forms are keyed by KEY, plot and plant forms by KEY2.
  keyCol <- if (entity %in% c("FD", "TL")) "KEY" else "KEY2"
  ds <- ds[!ds[[keyCol]] %in% killKEY, , drop = FALSE]
  
  # Report the number of duplicate IDs that were checked (same meaning as the progress lines).
  if (!printprog) {print(paste(n, "entries were checked.", sep=" "))}
  NF <- nrow(ds)
  print(paste(NI-NF, "entries were removed.", sep=" "))
  return(ds)
}

findLoose <- function(ds2, ds1, entity=c("FD","TL","PO")){
  #SHORT DEF:   Function to find 'loose' IDs (parent IDs that are not assigned).
  #RETURNS:     Vector of loose IDs (unique IDs present in ds2 but absent from ds1), or NULL if there
  #             are none. Also prints the number of loose IDs found.
  #DESCRIPTION: If any loose IDs are found, then the enumerator must be alerted and (s)he must ensure 
  #             that the Assign ODK form is completed during a next visit.
  #INPUT:       ds2: (merged) output from an Assign ODK form (either Assign Trial to Field, Assign Plots
  #             to Trial, or Assign Plants to Plot)
  #             ds1: (merged) output from the parent Assign ODK form (either Assign Field, Assign Trial
  #             to Field or Assign Plots to Trial)
  #             entity (of the parent): either one of FD (Assign Field), TL (Assign Trial to Field), or
  #             PO (Assign Plots to Trial). PA (Assign Plants to Plot) cannot be used. Defaults to FD.
  
  # Reduce entity to a single validated value: comparing the full default vector inside if()
  # is an error in current R versions.
  entity <- match.arg(entity)
  idCol <- switch(entity,
                  FD = "fieldID",
                  TL = "trialID",
                  PO = "plotID")
  
  IDs1 <- ds1[[idCol]]
  IDs2 <- ds2[[idCol]]
  if (is.null(IDs1) || is.null(IDs2)) {
    # A missing column would otherwise make every ID (or no ID) look loose.
    stop("Column '", idCol, "' must be present in both ds1 and ds2 (entity = '", entity, "').",
         call. = FALSE)
  }
  
  looseIDs <- setdiff(IDs2, IDs1)
  if (length(looseIDs) == 0) {
    looseIDs <- NULL
  }
  print(paste(length(looseIDs), "loose IDs were found.", sep=" "))
  return(looseIDs)
}

findDuplTT <- function(dsPO){
  #SHORT DEF:   Function to find duplicate treatments within trials.
  #RETURNS:     Dataframe with the trialID and treatCode of every repeated trial/treatment
  #             combination (one row per repeat beyond the first occurrence). Also prints the
  #             number of such rows.
  #DESCRIPTION: Treatments should be unique within a trial, hence if a trial had duplicate treatments,
  #             these were wrongly assigned by the enumerator, who will need to be contacted to correct.
  #INPUT:       dsPO: merged output from Assign Plots to Trial ODK form
  
  tmp <- dsPO[, c("trialID", "treatCode"), drop = FALSE]
  dupl <- tmp[duplicated(tmp), , drop = FALSE]
  print(paste(nrow(dupl), "duplicate treatments within trials were found.", sep=" "))
  # Return the data frame itself: the result of as.vector() on a data frame depends on the R
  # version (recent versions appear to return a plain list), which contradicts the documented
  # return type.
  return(dupl)
}

mergeODKforms <- function(ds1, ds2, ds3=NULL, ds4=NULL){
  #SHORT DEF:   Function to merge ODK forms with repeat loops (up to 3 deep = 4 datasets).
  #RETURNS:     Merged dataframe in which the KEY columns are renamed per level (KEY1, KEY2, KEY3, KEY4).
  #DESCRIPTION: Each dataset's KEY becomes KEY<level> and its PARENT_KEY becomes KEY<level - 1>, after
  #             which consecutive levels are joined with merge() on the column names they share.
  #             Assumes that there are no consistency errors and requires at least two datasets.
  #             ds4 is only used when ds3 is supplied as well.
  #INPUT:       ds1: highest level (parent) dataset (required)
  #             ds2: second level (daughter) dataset (required)
  #             ds3: third level (grand daughter) dataset (optional)
  #             ds4: fourth level (grand grand daughter) dataset (optional)
  
  # Rename every column called 'from' to 'to' and hand back the modified dataset.
  relabel <- function(df, from, to) {
    names(df)[names(df) == from] <- to
    df
  }
  
  top    <- relabel(ds1, "KEY", "KEY1")
  second <- relabel(relabel(ds2, "KEY", "KEY2"), "PARENT_KEY", "KEY1")
  merged <- merge(top, second)
  
  if (is.null(ds3)) {
    return(merged)
  }
  third  <- relabel(relabel(ds3, "KEY", "KEY3"), "PARENT_KEY", "KEY2")
  merged <- merge(merged, third)
  
  if (is.null(ds4)) {
    return(merged)
  }
  fourth <- relabel(relabel(ds4, "KEY", "KEY4"), "PARENT_KEY", "KEY3")
  merge(merged, fourth)
}

sendReport <- function(mailFrom = readChar("mailFrom.txt", nchar=100),
                       mailPssw = readChar("mailPssw.txt", nchar=100),
                       to,...){
  #SHORT DEF:   Function to send error reports to data collectors.
  #RETURNS:     The result of the send.mail() call; the purpose is the side effect of sending an
  #             automated email requesting data corrections.
  #DESCRIPTION: Sends an HTML email through the Gmail SMTP server (smtp.gmail.com, port 465, SSL),
  #             authenticating with mailFrom and mailPssw. Subject and body are still placeholders.
  #INPUT:       mailFrom: sender address, also used as SMTP user name. By default the first (at most)
  #             100 characters of mailFrom.txt
  #             mailPssw: SMTP password. By default the first (at most) 100 characters of mailPssw.txt
  #             to: vector of email addresses to send email report to
  #             ...: currently not used
  
  library(mailR)
  
  reportSubject <- "blablabla" #TODO: replace by email subject of report to be sent
  reportBody    <- "blablabla" #TODO: replace by email body of report to be sent
  
  # Alternative SMTP settings for sending from a cgiar.org address:
  #   list(host.name = "smtp.office365.com", port = 587, user.name = mailFrom, passwd = mailPssw, tls = TRUE)
  send.mail(
    from         = mailFrom,
    to           = to,
    subject      = reportSubject,
    body         = reportBody,
    html         = TRUE,
    inline       = TRUE,
    smtp         = list(host.name = "smtp.gmail.com",   # settings for a gmail.com address
                        port      = 465,
                        user.name = mailFrom,
                        passwd    = mailPssw,
                        ssl       = TRUE),
    authenticate = TRUE,
    send         = TRUE
  )
}

disagIDs <- function(ds, entity=c("FD","TL","PO","PA","PS","SS")){
  #SHORT DEF:   Function to disaggregate field, trial, plot, plant or sample IDs and add as new columns
  #             to ds.
  #RETURNS:     ds, with four columns added: <ID>.project, <ID>.entity, <ID>.country and <ID>.seqnr,
  #             where <ID> is the name of the ID column (e.g. fieldID.project). The first three are
  #             character, <ID>.seqnr is numeric.
  #DESCRIPTION: IDs are composed as XXYYZZ123456, where XX=project, YY=entity, ZZ=country and
  #             123456=seqNr. This ID is split by character position (1-2, 3-4, 5-6 and 7-12) and the
  #             parts are added as new columns.
  #INPUT:       ds: (merged) output from an Assign ODK form
  #             entity: either one of FD (Field), TL (Trial), PO (Plot), PA (Plant), PS (Plant Sample) or
  #             SS (Soil Sample). Defaults to FD.
  
  # Reduce entity to a single validated value: comparing the full default vector inside if()
  # is an error in current R versions.
  entity <- match.arg(entity)
  idCol <- switch(entity,
                  FD = "fieldID",
                  TL = "trialID",
                  PO = "plotID",
                  PA = "plantID",
                  PS = "plantSampleID",
                  SS = "soilSampleID")
  
  ids <- ds[[idCol]]
  if (is.null(ids)) {
    stop("Column '", idCol, "' not found in ds (entity = '", entity, "').", call. = FALSE)
  }
  
  ds[[paste0(idCol, ".project")]] <- substr(ids, 1, 2)
  ds[[paste0(idCol, ".entity")]]  <- substr(ids, 3, 4)
  ds[[paste0(idCol, ".country")]] <- substr(ids, 5, 6)
  # as.numeric() yields NA (with a warning) for sequence parts that are not numeric.
  ds[[paste0(idCol, ".seqnr")]]   <- as.numeric(substr(ids, 7, 12))
  return(ds)
}


#SHORT DEF:   Function to call briefcase and download a set of forms.
#RETURNS:     Nothing. Forms are saved in the directory indicated.
#DESCRIPTION: Function built using the odkr package that calls briefcase. The briefcase jar file must be saved in the 
#             directory provided and named odkBriefcase_latest.jar.
#             The directory must also contain a text file holding the username and the password, in that order and
#             separated by whitespace: pws.txt when source is "ONA", pwp.txt when source is "ODK".
#             For every form, files in the directory whose name starts with the form ID are deleted first; the form
#             is then pulled from the server and exported to <form ID>.csv.
#             If the odkr package is not installed, it is installed from GitHub (validmeasures/odkr) using devtools.
#INPUT:       forms: Vector of form IDs for which data will be downloaded.
#             target: directory in which xml file and exported csv file will be saved.
#             source: server to download from, either "ONA" (https://ona.io/iita_nrm) or "ODK"
#             (https://acai-agg.appspot.com). Defaults to "ONA".
briefCaseDwnld <- function(forms, target, source=c("ONA","ODK")){
  
  # Reduce source to a single validated value: with the full default vector, the server URL and
  # the credential file path would each become a vector of length two.
  source <- match.arg(source)
  
  if (!requireNamespace("odkr", quietly = TRUE)) {
    if (!requireNamespace("devtools", quietly = TRUE)) {
      stop("Package 'odkr' is not installed and 'devtools' is needed to install it.", call. = FALSE)
    }
    devtools::install_github("validmeasures/odkr")
  }
  
  fromONA  <- source == "ONA"
  server   <- if (fromONA) "https://ona.io/iita_nrm" else "https://acai-agg.appspot.com"
  credFile <- file.path(target, if (fromONA) "pws.txt" else "pwp.txt")
  
  # Read the credentials once instead of twice per form.
  creds <- scan(credFile, what = "character", quiet = TRUE)
  if (length(creds) < 2) {
    stop("File '", credFile, "' must contain a username followed by a password.", call. = FALSE)
  }
  
  for (i in forms) {
    
    # Remove earlier downloads of this form.
    # NOTE(review): the form ID is used as a regular-expression prefix, so files of any other form
    # whose name starts with the same characters are removed as well - confirm this is intended.
    fls <- list.files(path = target, pattern = paste0("^", i))
    if (length(fls) > 0) file.remove(file.path(target, fls))
    
    odkr::pull_remote(id = i, 
                      target = target, 
                      to = target,
                      from = server, 
                      username = creds[1], 
                      password = creds[2])
    
    odkr::export_data(id = i, 
                      target = target, 
                      from = target, 
                      to = target, 
                      filename = paste0(i, ".csv"))
  }
}
