Added sid in output table

f1408ace · Richard Cornes · 66c6dc63 · f1408ace
Commit f1408ace authored 3 years ago by Richard Cornes
Hide whitespace changes
Inline Side-by-side

Showing with 0 additions and 327 deletions

rscripts/get_dups.R rscripts/get_dups.R +0 -327

No files found.
--- a/rscripts/get_dups.R
+++ b/rscripts/get_dups.R
-#!/usr/bin/env Rscript
-# code written by Elizabeth Kent, National Oceanography Centre
-# code written for the NERC HOSTACE project and adapted for
-# C3S_311a_Lot2
-
-# Description:
-##
-## IMPORTANT: all paths to all data are defined in config.yml
-## input_data:
-# 1. arguments (e.g. c(1990, 1999))
-# 2. output_data/MFILES_SHIP_PROC
-# 3. output_data/PAIRFILES
-
-## Outputs the data to a home dir:
-# output_data/PAIR_DUPFILES
-Sys.setenv(TZ='GMT')
-my_args = commandArgs(trailingOnly=TRUE)
-config<-config::get(file = "config.yml")
-
-options(warn=1)
-options(width=1000)
-data.table::setDTthreads(1) # this parellises, test it
-
-
-cmo<-c("01","02","03","04","05","06","07","08","09","10","11","12")
-dck.gts<-c(555,700,792,793,794,795,796,797,798,888,992,993,994,995,996,997)
-dck.dm<-c(926,927,928)
-
-if (!dir.exists(paste0(config$output_dir,"PAIR_DUPFILES"))) dir.create(paste0(config$output_dir,"PAIR_DUPFILES"))
-
-#for ( year in 1850:2018 ) {
-
-if ( length(my_args) == 0 ) {my_args<-my_args2}
-for ( year in my_args[1]:my_args[2] ) {
-
-#for ( year in seq(1989,1989,1) ) {
- for ( imo in 1:12 ) {
- #for ( imo in 1:1 ) {
-
-   cat("-----------------------------------------------","\n")
-   cat(year,imo,"\n")
-   #cat(year,"\n")
-   cat("-----------------------------------------------","\n")
-
-   vlist.match<-c("sst","slp","at","w","d","ww","n","wbt","dpt","vv","rh","wh","nh","w1","osv")
-   vlist.ecv<-c("sst","slp","at","w","d","wbt","dpt","rh")
-   vlist.pos<-c("yr","mo","dy","hr","id","dck","sid","lon","lat")
-
-   fn.out<-paste0(config$output_dir,"PAIR_DUPFILES/",year,"-",cmo[imo],".txt")
-   if (file.exists(fn.out)) file.remove(fn.out)
-
-   fn.in <- paste0(paste0(config$output_dir,"MFILES_SHIP_PROC/"),year,"_",imo,".Rda")
-   if ( !file.exists(fn.in) ) next
-   mdata<-readRDS(fn.in)
-
-   if ( file.exists(paste0(config$output_dir,"PAIRFILES/",year,"_",imo,".Rda"))) {
-    pp.dup<-readRDS(paste0(config$output_dir,"PAIRFILES/",year,"_",imo,".Rda"))
-    pp.dup<-pp.dup[pp.dup$id.match,]
-    pp.dup$i.id<-gsub("\\|","",pp.dup$i.id)
-   } else {
-    pp.dup <- NULL
-    #mdata$dflag<-0
-    #nodup<-subset(mdata,mo==imo)
-    #nodup<-nodup[,c("uid","dflag")]
-    #nodup$brace<-"{}"
-    #nodup$uid<-paste0("ICOADS-30-",nodup$uid)
-    #write.table(nodup,fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE,sep="|")
-    next
-   }
-   
-   if ( nrow(pp.dup) == 0 ) { 
-    mdata$dflag<-0
-    #nodup<-subset(mdata,mo==imo)
-    #nodup<-nodup[,c("uid","dflag")]
-    #nodup$brace="{}"
-    #nodup$uid<-paste0("ICOADS-30-",nodup$uid)
-    #write.table(nodup,fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE,sep="|")
-    next
-   }
-   
-   if ( FALSE ) {
-    # want to make sure that report in pair flagged as duplicate doesn't contain info missing in "best" report
-    pp.dup$minnum<-ifelse(pp.dup$numel >= pp.dup$i.numel,pp.dup$i.numel,pp.dup$numel)
-    pp.dup<-subset(pp.dup,nmatch>=minnum)  # want to look at this later, some dck (e.g. 721 have different vars in diff reps.)
-   }
-
-     pp.dup<-pp.dup[pp.dup$id.match,]
-     #pp.dup<-subset(pp.dup,dy<=5)
-
-     #------------------------------------------------------------------------------------------
-     # find uid groups
-       cat("finding uid groups .... ")
-   
-       pp<-pp.dup[,c("uid","i.uid")]
-   
-       foo<-lapply(1:nrow(pp),function(i) c(pp[i,1],pp[i,2]))
-   
-       edges <- do.call(rbind, lapply(foo, function(x) {
-         if (length(x) > 1) cbind(head(x, -1), tail(x, -1)) else NULL
-       }))
-       rm(foo)
-   
-       # from the igraph package
-       g <- igraph::graph.data.frame(edges, directed=FALSE)
-       # this is a list with each member being the uids to group
-       dup.uids<-split(igraph::V(g)$name, igraph::clusters(g)$membership)
-   
-       cat("got uid groups","\n")
-     #------------------------------------------------------------------------------------------
-   
-     #cat("about to merge dup group info"," ")
-     vlist.match<-vlist.match[which(vlist.match %in% names(mdata))]
-     mdata$numel<-rowSums(!is.na(mdata[,vlist.match]))
-     vlist.ecv<-vlist.ecv[which(vlist.ecv %in% names(mdata))]
-     mdata$numecv<-rowSums(!is.na(mdata[,vlist.ecv]))
-     cat("added element counts","\n")
-
-     # append group number to data frame mdata
-     names(dup.uids)<-paste0(names(dup.uids),".")
-     aa<-as.data.frame(unlist(dup.uids))
-     aa$dup.group<-rownames(aa)
-     rownames(aa)<-NULL
-     names(aa)<-c("uid","dup.group")
-     aa$dup.group<-substr(aa$dup.group, 1, regexpr("\\.", aa$dup.group)-1)
-     mdata2<-merge(mdata,aa)
-     if ( nrow(mdata2) == 0 ) {
-      cat("no potential duplicates","\n")
-      sp<-NULL
-      sp.good<-NULL
-      sp.check<-NULL
-      #nodup<-subset(mdata,mo==imo)
-      #nodup<-nodup[,c("uid","dflag")]
-      #nodup$brace="{}"
-      #nodup$uid<-paste0("ICOADS-30-")
-      #write.table(nodup,fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE,sep="|")
-      {next}
-     } 
-     cat("added group number to main data frame","\n")
-   
-     rm(aa)
-     if ( nrow(pp.dup) == 0 ) {
-       mdata$dflag<-0
-       #nodup<-subset(mdata,mo==imo)
-       #nodup<-nodup[,c("uid","dflag")]
-       #nodup$brace="{}"
-       #nodup$uid<-paste0("ICOADS-30-")
-       #write.table(nodup,fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE,sep="|")
-       next
-     }
-
-     vlist.add<-c("uid","dups","dup.group","dck.pri","id.class","is.gts","is.dm","c1m","immv","numel","numecv")
-     vp<-c(vlist.pos,vlist.match[which(vlist.match %in% names(mdata2))],names(mdata2)[grepl("prec$",names(mdata2))],vlist.add)
-     # sort them within dup groups so the best duplicate appears first
-     if ( !("immv" %in% names(mdata2) ) ) mdata2$immv<-NA
-     if ( !("c1m" %in% names(mdata2) ) ) mdata2$c1m<-NA
-     mdata2<-icoads.utils::add_dck_priority(mdata2)
-     #mdata2<-icoads.utils::add_ID_class(mdata2)
-     mdata2$is.gts<-ifelse(mdata2$dck %in% dck.gts, TRUE, FALSE)
-     mdata2$is.dm<-ifelse(mdata2$dck %in% dck.dm, TRUE, FALSE)
-     sub<-mdata2[order(as.numeric(mdata2$dup.group),mdata2$dck.pri,-mdata2$immv,-mdata2$numecv,-mdata2$numel,-is.na(mdata2$c1m),mdata2$dups),vp]
-     #cat("4 ")
-     sp<-split(sub,sub$dup.group)
-   
-     # find maximum difference across variables
-     ind<-sapply(sp,icoads.utils::var_diff)
-   
-     cat("got quality info","\n")
-   
-     # split into those which agree and those which don't
-     diff.lim<-0.01
-     sp.good<-sp[which(ind<diff.lim)] 
-     sp.check<-sp[which(ind>=diff.lim & ind < 0.2)] # inspection suggests this is a reasonable value
-     ind<-sapply(sp.check,icoads.utils::var_diff)
-   
-     cat("split by quality info","\n")
-     cat("number of dup groups",length(sp),"\n")
-     if ( length(sp.good) > 0 ) {
-      cat("number of OK dup groups",length(sp.good),"\n")
-      cat("number of identified dups",sum(sapply(sp.good, nrow))-length(sp.good),"\n")
-     } else {
-      cat("no OK dup groups","\n")
-     }
-     cat("number of icoads dups",sum(!is.na(mdata$dups[which(mdata$dups>2)])),"of",nrow(mdata),"\n")
-     if ( length(sp.check) > 0 ) {
-      cat("number of suspect dup groups",length(sp.check),"\n")
-      cat("% needing checking = ",round(length(sp.check)/length(sp)*100,1),"\n")
-      for ( i in 1:length(sp.check) ) {
-        print(ind[i])
-        print(sp.check[[i]])
-      }
-      #readline(prompt="Press [enter] to continue")
-     } else {
-      cat("no suspect dup groups","\n")
-     }
-   
-    ## Write data to file
-    if ( length(sp.good) > 0 | length(sp.check) > 0 ) {
-     if ( length(sp.good) > 0 ) {
-      sp.tmp <- lapply(sp.good, icoads.utils::write_fun2)
-      sp.tmp2 <- do.call("rbind",sp.tmp)
-      # subset to only uid in primary month
-      motmp<-subset(mdata,mo==imo)
-      #print(length(sp.tmp2))
-      sp.tmp2<-sp.tmp2[which(substr(sp.tmp2,11,16)%in% motmp$uid)]
-      #print(length(sp.tmp2))
-      motmp<-NULL
-      if ( length(sp.tmp2) > 0 ) write.table(sp.tmp2,fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE)
-     }
-     ## see if we can make a better grouping
-     group.fix<-rep(FALSE,times=length(sp.check))
-     if ( length(sp.check) > 0 ) {
-      for ( ii in 1:length(sp.check) ) {
-        tmp<-sp.check[[ii]]
-        print(tmp)
-        num.rep<-nrow(tmp)
-        test.names<-unique(tmp$id[!(tmp$id.class %in% c("SHIP","MASKSTID","generic"))])
-        # if only generic IDs then mark as fixed
-        if ( length(test.names) == 0 ) {
-          group.fix[ii]<-TRUE
-          next
-        }
-        best.pri<-min(tmp$dck.pri)
-        best.rep<-subset(tmp,id %in% test.names & dck.pri == best.pri)
-        cand.rep<-subset(tmp,!(uid %in% best.rep$uid))
-        for ( jj in 1:nrow(best.rep) ) {
-          gg<-best.rep[jj,]
-          gg<-rbind(gg,cand.rep[cand.rep$id==gg$id[1],])
-          cand.rep<-subset(cand.rep,!(uid %in% gg$uid))
-          if ( nrow(cand.rep) > 0 ) {
-          mat<-array(99,dim=c(nrow(gg),nrow(cand.rep)))
-          for ( kk in 1:nrow(gg) ) {
-           for ( ll in 1:nrow(cand.rep) ) {
-             if(cand.rep$id[ll] %in% c("SHIP","MASKSTID","generic")) {
-                mat[kk,ll]<-sum(abs(gg[kk,vlist.match]-cand.rep[ll,vlist.match]),na.rm=T)
-             }
-           }   
-          }
-          to.add<-if ( min(mat) < 0.1 ) unique(which(mat<0.1,arr.ind=T)[,2]) else 9999 
-          if(min(to.add) < 9999) {
-            gg<-rbind(gg,cand.rep[to.add,])
-            cand.rep<-subset(cand.rep,!(uid %in% gg$uid))
-          }
-        }
-        if ( nrow(gg) > 1 ) write.table(icoads.utils::write_fun2(gg),fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE)
-        #if ( nrow(gg) > 1 ) print(gg)
-        num.rep<-num.rep-nrow(gg)
-        }
-      # check if we only have generic reports that don't agree with any reports with IDs
-      # ***** look at this *****
-      if(num.rep>1 ) {
-        sp.check[[ii]] <- cand.rep
-      } else if (num.rep == 1) {
-        group.fix[ii]<-TRUE
-      }
-      if(num.rep==0)group.fix[ii]<-TRUE
-      }
-     }
-     ## these are unclear, mark as duplicates with no preference
-     sp.check<-sp.check[!group.fix]
-     if ( length(sp.check) > 0 ) {
-      sp.tmp <- lapply(sp.check, icoads.utils::write_fun3)
-      sp.tmp2 <- do.call("rbind",sp.tmp)
-      # subset to only uid in primary month
-      motmp<-subset(mdata,mo==imo)
-      #print(length(sp.tmp2))
-      sp.tmp2<-sp.tmp2[which(substr(sp.tmp2,11,16)%in% motmp$uid)]
-      #print(length(sp.tmp2))
-      motmp<-NULL
-      #print(sp.tmp2)
-      if ( length(sp.tmp2) > 0 ) write.table(sp.tmp2,fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE)
-     }
-     #stop()
-
-      ## Get dflag
-     sp.tmp<-NULL
-     sp.tmp2<-NULL
-     sp2.tmp<-NULL
-     sp2.tmp2<-NULL
-     if ( length(sp.good) > 0 ) {
-      sp.tmp <- lapply(sp.good, icoads.utils::get_dflag2)
-      sp.tmp2 <- do.call("rbind",sp.tmp)
-      sp.tmp2 <- as.data.frame(sp.tmp2)
-      sp.tmp2$dflag <- as.numeric(sp.tmp2$dflag)
-     }
-     if ( length(sp.check) > 0 ) {
-      sp2.tmp <- lapply(sp.check, icoads.utils::get_dflag3)
-      sp2.tmp2 <- do.call("rbind",sp2.tmp)
-      sp2.tmp2 <- as.data.frame(sp2.tmp2)
-      sp2.tmp2$dflag <- as.numeric(sp2.tmp2$dflag)
-     }
-     if (!is.null(sp.tmp2) & !is.null(sp2.tmp2) ) {
-      sp.tmp2<-rbind(sp.tmp2,sp2.tmp2)
-     } else if ( !is.null(sp.tmp2) ) {
-      sp.tmp2<-sp.tmp2
-     } else if ( !is.null(sp2.tmp2) ) {
-      sp.tmp2<-sp2.tmp2
-     }
-    } # end any potential duplicates
-
-    if ( length(sp.good) > 0 | length(sp.check) > 0 ) {
-      mdata<-merge(mdata,sp.tmp2,by="uid",all.x=T)
-      mdata$dflag<-ifelse(is.na(mdata$dflag),0,mdata$dflag)
-    } else {
-      mdata$dflag<-0
-    }
-    nodup<-subset(mdata,dflag==0 & mo == imo)
-    if ( nrow(nodup) > 0 ) {
-      #nodup<-nodup[,c("uid","dflag")]
-      #nodup$uid<-paste0("ICOADS-30-",nodup$uid)
-      #nodup$brace<-"{}"
-      #write.table(nodup,fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE,sep="|")
-    }
-    print(addmargins(table(mdata$dck,mdata$dflag)))
-    print(addmargins(table(mdata$dck,mdata$dups)))
-    print(addmargins(table(pp.dup$dck,pp.dup$i.dck)))
-
-    #mdata<-subset(mdata,dflag!=3)
-    #pdf(paste0("MERGE_PLOTS_NEW/",year,".pdf"),height=10,width=10)
-    #source('merge_ids.R')
-    #dev.off()
-    #if ( !got.to.end ) {file.remove(paste0("MERGE_PLOTS_NEW/",year,".pdf"))}
-    next
-   
-  } # end loop over months
- } # end loop over years
-cat("finishing new_get_dups.R",my_args[1],my_args[2],"\n")
-#cat('memory usage', pryr::mem_used(), "\n")