Commit f1408ace authored by Richard Cornes's avatar Richard Cornes
Browse files

Added sid in output table

parent 66c6dc63
#!/usr/bin/env Rscript
# code written by Elizabeth Kent, National Oceanography Centre
# code written for the NERC HOSTACE project and adapted for
# C3S_311a_Lot2
# Description:
##
## IMPORTANT: all paths to all data are defined in config.yml
## input_data:
# 1. arguments (e.g. c(1990, 1999))
# 2. output_data/MFILES_SHIP_PROC
# 3. output_data/PAIRFILES
## Outputs the data to a home dir:
# output_data/PAIR_DUPFILES
Sys.setenv(TZ='GMT')
my_args = commandArgs(trailingOnly=TRUE)
config<-config::get(file = "config.yml")
options(warn=1)
options(width=1000)
data.table::setDTthreads(1) # this parellises, test it
cmo<-c("01","02","03","04","05","06","07","08","09","10","11","12")
dck.gts<-c(555,700,792,793,794,795,796,797,798,888,992,993,994,995,996,997)
dck.dm<-c(926,927,928)
if (!dir.exists(paste0(config$output_dir,"PAIR_DUPFILES"))) dir.create(paste0(config$output_dir,"PAIR_DUPFILES"))
#for ( year in 1850:2018 ) {
if ( length(my_args) == 0 ) {my_args<-my_args2}
for ( year in my_args[1]:my_args[2] ) {
#for ( year in seq(1989,1989,1) ) {
for ( imo in 1:12 ) {
#for ( imo in 1:1 ) {
cat("-----------------------------------------------","\n")
cat(year,imo,"\n")
#cat(year,"\n")
cat("-----------------------------------------------","\n")
vlist.match<-c("sst","slp","at","w","d","ww","n","wbt","dpt","vv","rh","wh","nh","w1","osv")
vlist.ecv<-c("sst","slp","at","w","d","wbt","dpt","rh")
vlist.pos<-c("yr","mo","dy","hr","id","dck","sid","lon","lat")
fn.out<-paste0(config$output_dir,"PAIR_DUPFILES/",year,"-",cmo[imo],".txt")
if (file.exists(fn.out)) file.remove(fn.out)
fn.in <- paste0(paste0(config$output_dir,"MFILES_SHIP_PROC/"),year,"_",imo,".Rda")
if ( !file.exists(fn.in) ) next
mdata<-readRDS(fn.in)
if ( file.exists(paste0(config$output_dir,"PAIRFILES/",year,"_",imo,".Rda"))) {
pp.dup<-readRDS(paste0(config$output_dir,"PAIRFILES/",year,"_",imo,".Rda"))
pp.dup<-pp.dup[pp.dup$id.match,]
pp.dup$i.id<-gsub("\\|","",pp.dup$i.id)
} else {
pp.dup <- NULL
#mdata$dflag<-0
#nodup<-subset(mdata,mo==imo)
#nodup<-nodup[,c("uid","dflag")]
#nodup$brace<-"{}"
#nodup$uid<-paste0("ICOADS-30-",nodup$uid)
#write.table(nodup,fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE,sep="|")
next
}
if ( nrow(pp.dup) == 0 ) {
mdata$dflag<-0
#nodup<-subset(mdata,mo==imo)
#nodup<-nodup[,c("uid","dflag")]
#nodup$brace="{}"
#nodup$uid<-paste0("ICOADS-30-",nodup$uid)
#write.table(nodup,fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE,sep="|")
next
}
if ( FALSE ) {
# want to make sure that report in pair flagged as duplicate doesn't contain info missing in "best" report
pp.dup$minnum<-ifelse(pp.dup$numel >= pp.dup$i.numel,pp.dup$i.numel,pp.dup$numel)
pp.dup<-subset(pp.dup,nmatch>=minnum) # want to look at this later, some dck (e.g. 721 have different vars in diff reps.)
}
pp.dup<-pp.dup[pp.dup$id.match,]
#pp.dup<-subset(pp.dup,dy<=5)
#------------------------------------------------------------------------------------------
# find uid groups
cat("finding uid groups .... ")
pp<-pp.dup[,c("uid","i.uid")]
foo<-lapply(1:nrow(pp),function(i) c(pp[i,1],pp[i,2]))
edges <- do.call(rbind, lapply(foo, function(x) {
if (length(x) > 1) cbind(head(x, -1), tail(x, -1)) else NULL
}))
rm(foo)
# from the igraph package
g <- igraph::graph.data.frame(edges, directed=FALSE)
# this is a list with each member being the uids to group
dup.uids<-split(igraph::V(g)$name, igraph::clusters(g)$membership)
cat("got uid groups","\n")
#------------------------------------------------------------------------------------------
#cat("about to merge dup group info"," ")
vlist.match<-vlist.match[which(vlist.match %in% names(mdata))]
mdata$numel<-rowSums(!is.na(mdata[,vlist.match]))
vlist.ecv<-vlist.ecv[which(vlist.ecv %in% names(mdata))]
mdata$numecv<-rowSums(!is.na(mdata[,vlist.ecv]))
cat("added element counts","\n")
# append group number to data frame mdata
names(dup.uids)<-paste0(names(dup.uids),".")
aa<-as.data.frame(unlist(dup.uids))
aa$dup.group<-rownames(aa)
rownames(aa)<-NULL
names(aa)<-c("uid","dup.group")
aa$dup.group<-substr(aa$dup.group, 1, regexpr("\\.", aa$dup.group)-1)
mdata2<-merge(mdata,aa)
if ( nrow(mdata2) == 0 ) {
cat("no potential duplicates","\n")
sp<-NULL
sp.good<-NULL
sp.check<-NULL
#nodup<-subset(mdata,mo==imo)
#nodup<-nodup[,c("uid","dflag")]
#nodup$brace="{}"
#nodup$uid<-paste0("ICOADS-30-")
#write.table(nodup,fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE,sep="|")
{next}
}
cat("added group number to main data frame","\n")
rm(aa)
if ( nrow(pp.dup) == 0 ) {
mdata$dflag<-0
#nodup<-subset(mdata,mo==imo)
#nodup<-nodup[,c("uid","dflag")]
#nodup$brace="{}"
#nodup$uid<-paste0("ICOADS-30-")
#write.table(nodup,fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE,sep="|")
next
}
vlist.add<-c("uid","dups","dup.group","dck.pri","id.class","is.gts","is.dm","c1m","immv","numel","numecv")
vp<-c(vlist.pos,vlist.match[which(vlist.match %in% names(mdata2))],names(mdata2)[grepl("prec$",names(mdata2))],vlist.add)
# sort them within dup groups so the best duplicate appears first
if ( !("immv" %in% names(mdata2) ) ) mdata2$immv<-NA
if ( !("c1m" %in% names(mdata2) ) ) mdata2$c1m<-NA
mdata2<-icoads.utils::add_dck_priority(mdata2)
#mdata2<-icoads.utils::add_ID_class(mdata2)
mdata2$is.gts<-ifelse(mdata2$dck %in% dck.gts, TRUE, FALSE)
mdata2$is.dm<-ifelse(mdata2$dck %in% dck.dm, TRUE, FALSE)
sub<-mdata2[order(as.numeric(mdata2$dup.group),mdata2$dck.pri,-mdata2$immv,-mdata2$numecv,-mdata2$numel,-is.na(mdata2$c1m),mdata2$dups),vp]
#cat("4 ")
sp<-split(sub,sub$dup.group)
# find maximum difference across variables
ind<-sapply(sp,icoads.utils::var_diff)
cat("got quality info","\n")
# split into those which agree and those which don't
diff.lim<-0.01
sp.good<-sp[which(ind<diff.lim)]
sp.check<-sp[which(ind>=diff.lim & ind < 0.2)] # inspection suggests this is a reasonable value
ind<-sapply(sp.check,icoads.utils::var_diff)
cat("split by quality info","\n")
cat("number of dup groups",length(sp),"\n")
if ( length(sp.good) > 0 ) {
cat("number of OK dup groups",length(sp.good),"\n")
cat("number of identified dups",sum(sapply(sp.good, nrow))-length(sp.good),"\n")
} else {
cat("no OK dup groups","\n")
}
cat("number of icoads dups",sum(!is.na(mdata$dups[which(mdata$dups>2)])),"of",nrow(mdata),"\n")
if ( length(sp.check) > 0 ) {
cat("number of suspect dup groups",length(sp.check),"\n")
cat("% needing checking = ",round(length(sp.check)/length(sp)*100,1),"\n")
for ( i in 1:length(sp.check) ) {
print(ind[i])
print(sp.check[[i]])
}
#readline(prompt="Press [enter] to continue")
} else {
cat("no suspect dup groups","\n")
}
## Write data to file
if ( length(sp.good) > 0 | length(sp.check) > 0 ) {
if ( length(sp.good) > 0 ) {
sp.tmp <- lapply(sp.good, icoads.utils::write_fun2)
sp.tmp2 <- do.call("rbind",sp.tmp)
# subset to only uid in primary month
motmp<-subset(mdata,mo==imo)
#print(length(sp.tmp2))
sp.tmp2<-sp.tmp2[which(substr(sp.tmp2,11,16)%in% motmp$uid)]
#print(length(sp.tmp2))
motmp<-NULL
if ( length(sp.tmp2) > 0 ) write.table(sp.tmp2,fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE)
}
## see if we can make a better grouping
group.fix<-rep(FALSE,times=length(sp.check))
if ( length(sp.check) > 0 ) {
for ( ii in 1:length(sp.check) ) {
tmp<-sp.check[[ii]]
print(tmp)
num.rep<-nrow(tmp)
test.names<-unique(tmp$id[!(tmp$id.class %in% c("SHIP","MASKSTID","generic"))])
# if only generic IDs then mark as fixed
if ( length(test.names) == 0 ) {
group.fix[ii]<-TRUE
next
}
best.pri<-min(tmp$dck.pri)
best.rep<-subset(tmp,id %in% test.names & dck.pri == best.pri)
cand.rep<-subset(tmp,!(uid %in% best.rep$uid))
for ( jj in 1:nrow(best.rep) ) {
gg<-best.rep[jj,]
gg<-rbind(gg,cand.rep[cand.rep$id==gg$id[1],])
cand.rep<-subset(cand.rep,!(uid %in% gg$uid))
if ( nrow(cand.rep) > 0 ) {
mat<-array(99,dim=c(nrow(gg),nrow(cand.rep)))
for ( kk in 1:nrow(gg) ) {
for ( ll in 1:nrow(cand.rep) ) {
if(cand.rep$id[ll] %in% c("SHIP","MASKSTID","generic")) {
mat[kk,ll]<-sum(abs(gg[kk,vlist.match]-cand.rep[ll,vlist.match]),na.rm=T)
}
}
}
to.add<-if ( min(mat) < 0.1 ) unique(which(mat<0.1,arr.ind=T)[,2]) else 9999
if(min(to.add) < 9999) {
gg<-rbind(gg,cand.rep[to.add,])
cand.rep<-subset(cand.rep,!(uid %in% gg$uid))
}
}
if ( nrow(gg) > 1 ) write.table(icoads.utils::write_fun2(gg),fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE)
#if ( nrow(gg) > 1 ) print(gg)
num.rep<-num.rep-nrow(gg)
}
# check if we only have generic reports that don't agree with any reports with IDs
# ***** look at this *****
if(num.rep>1 ) {
sp.check[[ii]] <- cand.rep
} else if (num.rep == 1) {
group.fix[ii]<-TRUE
}
if(num.rep==0)group.fix[ii]<-TRUE
}
}
## these are unclear, mark as duplicates with no preference
sp.check<-sp.check[!group.fix]
if ( length(sp.check) > 0 ) {
sp.tmp <- lapply(sp.check, icoads.utils::write_fun3)
sp.tmp2 <- do.call("rbind",sp.tmp)
# subset to only uid in primary month
motmp<-subset(mdata,mo==imo)
#print(length(sp.tmp2))
sp.tmp2<-sp.tmp2[which(substr(sp.tmp2,11,16)%in% motmp$uid)]
#print(length(sp.tmp2))
motmp<-NULL
#print(sp.tmp2)
if ( length(sp.tmp2) > 0 ) write.table(sp.tmp2,fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE)
}
#stop()
## Get dflag
sp.tmp<-NULL
sp.tmp2<-NULL
sp2.tmp<-NULL
sp2.tmp2<-NULL
if ( length(sp.good) > 0 ) {
sp.tmp <- lapply(sp.good, icoads.utils::get_dflag2)
sp.tmp2 <- do.call("rbind",sp.tmp)
sp.tmp2 <- as.data.frame(sp.tmp2)
sp.tmp2$dflag <- as.numeric(sp.tmp2$dflag)
}
if ( length(sp.check) > 0 ) {
sp2.tmp <- lapply(sp.check, icoads.utils::get_dflag3)
sp2.tmp2 <- do.call("rbind",sp2.tmp)
sp2.tmp2 <- as.data.frame(sp2.tmp2)
sp2.tmp2$dflag <- as.numeric(sp2.tmp2$dflag)
}
if (!is.null(sp.tmp2) & !is.null(sp2.tmp2) ) {
sp.tmp2<-rbind(sp.tmp2,sp2.tmp2)
} else if ( !is.null(sp.tmp2) ) {
sp.tmp2<-sp.tmp2
} else if ( !is.null(sp2.tmp2) ) {
sp.tmp2<-sp2.tmp2
}
} # end any potential duplicates
if ( length(sp.good) > 0 | length(sp.check) > 0 ) {
mdata<-merge(mdata,sp.tmp2,by="uid",all.x=T)
mdata$dflag<-ifelse(is.na(mdata$dflag),0,mdata$dflag)
} else {
mdata$dflag<-0
}
nodup<-subset(mdata,dflag==0 & mo == imo)
if ( nrow(nodup) > 0 ) {
#nodup<-nodup[,c("uid","dflag")]
#nodup$uid<-paste0("ICOADS-30-",nodup$uid)
#nodup$brace<-"{}"
#write.table(nodup,fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE,sep="|")
}
print(addmargins(table(mdata$dck,mdata$dflag)))
print(addmargins(table(mdata$dck,mdata$dups)))
print(addmargins(table(pp.dup$dck,pp.dup$i.dck)))
#mdata<-subset(mdata,dflag!=3)
#pdf(paste0("MERGE_PLOTS_NEW/",year,".pdf"),height=10,width=10)
#source('merge_ids.R')
#dev.off()
#if ( !got.to.end ) {file.remove(paste0("MERGE_PLOTS_NEW/",year,".pdf"))}
next
} # end loop over months
} # end loop over years
cat("finishing new_get_dups.R",my_args[1],my_args[2],"\n")
#cat('memory usage', pryr::mem_used(), "\n")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment