Commit 1e987b37 authored by Richard Cornes's avatar Richard Cornes
Browse files

Vairous bugs fixed that stopped this code running

parent 6fb7720f
#!/usr/bin/env Rscript
# code written by Elizabeth Kent, National Oceanography Centre
# code written for the NERC HOSTACE project and adapted for
# C3S_311a_Lot2
# Description:
##
## IMPORTANT: all paths to all data are defined in config.yml
## input_data:
# 1. arguments (e.g. c(1990, 1999))
# 2. output_data/MFILES_SHIP_PROC
# 3. output_data/PAIRFILES
## Outputs the data to a home dir:
# output_data/PAIR_DUPFILES
Sys.setenv(TZ='GMT')
my_args = commandArgs(trailingOnly=TRUE)
config<-config::get(file = "config.yml")
options(warn=1)
options(width=1000)
data.table::setDTthreads(1) # this parellises, test it
cmo<-c("01","02","03","04","05","06","07","08","09","10","11","12")
dck.gts<-c(555,700,792,793,794,795,796,797,798,888,992,993,994,995,996,997)
dck.dm<-c(926,927,928)
if (!dir.exists(paste0(config$output_dir,"PAIR_DUPFILES"))) dir.create(paste0(config$output_dir,"PAIR_DUPFILES"))
#for ( year in 1850:2018 ) {
if ( length(my_args) == 0 ) {my_args<-my_args2}
for ( year in my_args[1]:my_args[2] ) {
#for ( year in seq(1989,1989,1) ) {
for ( imo in 1:12 ) {
#for ( imo in 1:1 ) {
cat("-----------------------------------------------","\n")
cat(year,imo,"\n")
#cat(year,"\n")
cat("-----------------------------------------------","\n")
vlist.match<-c("sst","slp","at","w","d","ww","n","wbt","dpt","vv","rh","wh","nh","w1","osv")
vlist.ecv<-c("sst","slp","at","w","d","wbt","dpt","rh")
vlist.pos<-c("yr","mo","dy","hr","id","dck","sid","lon","lat")
fn.out<-paste0(config$output_dir,"PAIR_DUPFILES/",year,"-",cmo[imo],".txt")
if (file.exists(fn.out)) file.remove(fn.out)
fn.in <- paste0(paste0(config$output_dir,"MFILES_SHIP_PROC/"),year,"_",imo,".Rda")
if ( !file.exists(fn.in) ) next
mdata<-readRDS(fn.in)
if ( file.exists(paste0(config$output_dir,"PAIRFILES/",year,"_",imo,".Rda"))) {
pp.dup<-readRDS(paste0(config$output_dir,"PAIRFILES/",year,"_",imo,".Rda"))
pp.dup<-pp.dup[pp.dup$id.match,]
pp.dup$i.id<-gsub("\\|","",pp.dup$i.id)
} else {
pp.dup <- NULL
#mdata$dflag<-0
#nodup<-subset(mdata,mo==imo)
#nodup<-nodup[,c("uid","dflag")]
#nodup$brace<-"{}"
#nodup$uid<-paste0("ICOADS-30-",nodup$uid)
#write.table(nodup,fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE,sep="|")
next
}
if ( nrow(pp.dup) == 0 ) {
mdata$dflag<-0
#nodup<-subset(mdata,mo==imo)
#nodup<-nodup[,c("uid","dflag")]
#nodup$brace="{}"
#nodup$uid<-paste0("ICOADS-30-",nodup$uid)
#write.table(nodup,fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE,sep="|")
next
}
if ( FALSE ) {
# want to make sure that report in pair flagged as duplicate doesn't contain info missing in "best" report
pp.dup$minnum<-ifelse(pp.dup$numel >= pp.dup$i.numel,pp.dup$i.numel,pp.dup$numel)
pp.dup<-subset(pp.dup,nmatch>=minnum) # want to look at this later, some dck (e.g. 721 have different vars in diff reps.)
}
pp.dup<-pp.dup[pp.dup$id.match,]
#pp.dup<-subset(pp.dup,dy<=5)
#------------------------------------------------------------------------------------------
# find uid groups
cat("finding uid groups .... ")
pp<-pp.dup[,c("uid","i.uid")]
foo<-lapply(1:nrow(pp),function(i) c(pp[i,1],pp[i,2]))
edges <- do.call(rbind, lapply(foo, function(x) {
if (length(x) > 1) cbind(head(x, -1), tail(x, -1)) else NULL
}))
rm(foo)
# from the igraph package
g <- igraph::graph.data.frame(edges, directed=FALSE)
# this is a list with each member being the uids to group
dup.uids<-split(igraph::V(g)$name, igraph::clusters(g)$membership)
cat("got uid groups","\n")
#------------------------------------------------------------------------------------------
#cat("about to merge dup group info"," ")
vlist.match<-vlist.match[which(vlist.match %in% names(mdata))]
mdata$numel<-rowSums(!is.na(mdata[,vlist.match]))
vlist.ecv<-vlist.ecv[which(vlist.ecv %in% names(mdata))]
mdata$numecv<-rowSums(!is.na(mdata[,vlist.ecv]))
cat("added element counts","\n")
# append group number to data frame mdata
names(dup.uids)<-paste0(names(dup.uids),".")
aa<-as.data.frame(unlist(dup.uids))
aa$dup.group<-rownames(aa)
rownames(aa)<-NULL
names(aa)<-c("uid","dup.group")
aa$dup.group<-substr(aa$dup.group, 1, regexpr("\\.", aa$dup.group)-1)
mdata2<-merge(mdata,aa)
if ( nrow(mdata2) == 0 ) {
cat("no potential duplicates","\n")
sp<-NULL
sp.good<-NULL
sp.check<-NULL
#nodup<-subset(mdata,mo==imo)
#nodup<-nodup[,c("uid","dflag")]
#nodup$brace="{}"
#nodup$uid<-paste0("ICOADS-30-")
#write.table(nodup,fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE,sep="|")
{next}
}
cat("added group number to main data frame","\n")
rm(aa)
if ( nrow(pp.dup) == 0 ) {
mdata$dflag<-0
#nodup<-subset(mdata,mo==imo)
#nodup<-nodup[,c("uid","dflag")]
#nodup$brace="{}"
#nodup$uid<-paste0("ICOADS-30-")
#write.table(nodup,fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE,sep="|")
next
}
vlist.add<-c("uid","dups","dup.group","dck.pri","id.class","is.gts","is.dm","c1m","immv","numel","numecv")
vp<-c(vlist.pos,vlist.match[which(vlist.match %in% names(mdata2))],names(mdata2)[grepl("prec$",names(mdata2))],vlist.add)
# sort them within dup groups so the best duplicate appears first
if ( !("immv" %in% names(mdata2) ) ) mdata2$immv<-NA
if ( !("c1m" %in% names(mdata2) ) ) mdata2$c1m<-NA
mdata2<-icoads.utils::add_dck_priority(mdata2)
#mdata2<-icoads.utils::add_ID_class(mdata2)
mdata2$is.gts<-ifelse(mdata2$dck %in% dck.gts, TRUE, FALSE)
mdata2$is.dm<-ifelse(mdata2$dck %in% dck.dm, TRUE, FALSE)
sub<-mdata2[order(as.numeric(mdata2$dup.group),mdata2$dck.pri,-mdata2$immv,-mdata2$numecv,-mdata2$numel,-is.na(mdata2$c1m),mdata2$dups),vp]
#cat("4 ")
sp<-split(sub,sub$dup.group)
# find maximum difference across variables
ind<-sapply(sp,icoads.utils::var_diff)
cat("got quality info","\n")
# split into those which agree and those which don't
diff.lim<-0.01
sp.good<-sp[which(ind<diff.lim)]
sp.check<-sp[which(ind>=diff.lim & ind < 0.2)] # inspection suggests this is a reasonable value
ind<-sapply(sp.check,icoads.utils::var_diff)
cat("split by quality info","\n")
cat("number of dup groups",length(sp),"\n")
if ( length(sp.good) > 0 ) {
cat("number of OK dup groups",length(sp.good),"\n")
cat("number of identified dups",sum(sapply(sp.good, nrow))-length(sp.good),"\n")
} else {
cat("no OK dup groups","\n")
}
cat("number of icoads dups",sum(!is.na(mdata$dups[which(mdata$dups>2)])),"of",nrow(mdata),"\n")
if ( length(sp.check) > 0 ) {
cat("number of suspect dup groups",length(sp.check),"\n")
cat("% needing checking = ",round(length(sp.check)/length(sp)*100,1),"\n")
for ( i in 1:length(sp.check) ) {
print(ind[i])
print(sp.check[[i]])
}
#readline(prompt="Press [enter] to continue")
} else {
cat("no suspect dup groups","\n")
}
## Write data to file
if ( length(sp.good) > 0 | length(sp.check) > 0 ) {
if ( length(sp.good) > 0 ) {
sp.tmp <- lapply(sp.good, icoads.utils::write_fun2)
sp.tmp2 <- do.call("rbind",sp.tmp)
# subset to only uid in primary month
motmp<-subset(mdata,mo==imo)
#print(length(sp.tmp2))
sp.tmp2<-sp.tmp2[which(substr(sp.tmp2,11,16)%in% motmp$uid)]
#print(length(sp.tmp2))
motmp<-NULL
if ( length(sp.tmp2) > 0 ) write.table(sp.tmp2,fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE)
}
## see if we can make a better grouping
group.fix<-rep(FALSE,times=length(sp.check))
if ( length(sp.check) > 0 ) {
for ( ii in 1:length(sp.check) ) {
tmp<-sp.check[[ii]]
print(tmp)
num.rep<-nrow(tmp)
test.names<-unique(tmp$id[!(tmp$id.class %in% c("SHIP","MASKSTID","generic"))])
# if only generic IDs then mark as fixed
if ( length(test.names) == 0 ) {
group.fix[ii]<-TRUE
next
}
best.pri<-min(tmp$dck.pri)
best.rep<-subset(tmp,id %in% test.names & dck.pri == best.pri)
cand.rep<-subset(tmp,!(uid %in% best.rep$uid))
for ( jj in 1:nrow(best.rep) ) {
gg<-best.rep[jj,]
gg<-rbind(gg,cand.rep[cand.rep$id==gg$id[1],])
cand.rep<-subset(cand.rep,!(uid %in% gg$uid))
if ( nrow(cand.rep) > 0 ) {
mat<-array(99,dim=c(nrow(gg),nrow(cand.rep)))
for ( kk in 1:nrow(gg) ) {
for ( ll in 1:nrow(cand.rep) ) {
if(cand.rep$id[ll] %in% c("SHIP","MASKSTID","generic")) {
mat[kk,ll]<-sum(abs(gg[kk,vlist.match]-cand.rep[ll,vlist.match]),na.rm=T)
}
}
}
to.add<-if ( min(mat) < 0.1 ) unique(which(mat<0.1,arr.ind=T)[,2]) else 9999
if(min(to.add) < 9999) {
gg<-rbind(gg,cand.rep[to.add,])
cand.rep<-subset(cand.rep,!(uid %in% gg$uid))
}
}
if ( nrow(gg) > 1 ) write.table(icoads.utils::write_fun2(gg),fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE)
#if ( nrow(gg) > 1 ) print(gg)
num.rep<-num.rep-nrow(gg)
}
# check if we only have generic reports that don't agree with any reports with IDs
# ***** look at this *****
if(num.rep>1 ) {
sp.check[[ii]] <- cand.rep
} else if (num.rep == 1) {
group.fix[ii]<-TRUE
}
if(num.rep==0)group.fix[ii]<-TRUE
}
}
## these are unclear, mark as duplicates with no preference
sp.check<-sp.check[!group.fix]
if ( length(sp.check) > 0 ) {
sp.tmp <- lapply(sp.check, icoads.utils::write_fun3)
sp.tmp2 <- do.call("rbind",sp.tmp)
# subset to only uid in primary month
motmp<-subset(mdata,mo==imo)
#print(length(sp.tmp2))
sp.tmp2<-sp.tmp2[which(substr(sp.tmp2,11,16)%in% motmp$uid)]
#print(length(sp.tmp2))
motmp<-NULL
#print(sp.tmp2)
if ( length(sp.tmp2) > 0 ) write.table(sp.tmp2,fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE)
}
#stop()
## Get dflag
sp.tmp<-NULL
sp.tmp2<-NULL
sp2.tmp<-NULL
sp2.tmp2<-NULL
if ( length(sp.good) > 0 ) {
sp.tmp <- lapply(sp.good, icoads.utils::get_dflag2)
sp.tmp2 <- do.call("rbind",sp.tmp)
sp.tmp2 <- as.data.frame(sp.tmp2)
sp.tmp2$dflag <- as.numeric(sp.tmp2$dflag)
}
if ( length(sp.check) > 0 ) {
sp2.tmp <- lapply(sp.check, icoads.utils::get_dflag3)
sp2.tmp2 <- do.call("rbind",sp2.tmp)
sp2.tmp2 <- as.data.frame(sp2.tmp2)
sp2.tmp2$dflag <- as.numeric(sp2.tmp2$dflag)
}
if (!is.null(sp.tmp2) & !is.null(sp2.tmp2) ) {
sp.tmp2<-rbind(sp.tmp2,sp2.tmp2)
} else if ( !is.null(sp.tmp2) ) {
sp.tmp2<-sp.tmp2
} else if ( !is.null(sp2.tmp2) ) {
sp.tmp2<-sp2.tmp2
}
} # end any potential duplicates
if ( length(sp.good) > 0 | length(sp.check) > 0 ) {
mdata<-merge(mdata,sp.tmp2,by="uid",all.x=T)
mdata$dflag<-ifelse(is.na(mdata$dflag),0,mdata$dflag)
} else {
mdata$dflag<-0
}
nodup<-subset(mdata,dflag==0 & mo == imo)
if ( nrow(nodup) > 0 ) {
#nodup<-nodup[,c("uid","dflag")]
#nodup$uid<-paste0("ICOADS-30-",nodup$uid)
#nodup$brace<-"{}"
#write.table(nodup,fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE,sep="|")
}
print(addmargins(table(mdata$dck,mdata$dflag)))
print(addmargins(table(mdata$dck,mdata$dups)))
print(addmargins(table(pp.dup$dck,pp.dup$i.dck)))
#mdata<-subset(mdata,dflag!=3)
#pdf(paste0("MERGE_PLOTS_NEW/",year,".pdf"),height=10,width=10)
#source('merge_ids.R')
#dev.off()
#if ( !got.to.end ) {file.remove(paste0("MERGE_PLOTS_NEW/",year,".pdf"))}
next
} # end loop over months
} # end loop over years
cat("finishing new_get_dups.R",my_args[1],my_args[2],"\n")
#cat('memory usage', pryr::mem_used(), "\n")
......@@ -19,7 +19,7 @@ my_args <- commandArgs(trailingOnly=TRUE)
##pub47_dir="/gws/nopw/j04/glosat/development/data/interim/HOSTACE_PROC/HOSTACE_PROC_ANC_INFO/PUB47byMONTH"
##out_grid_dir="/noc/mpoc/surface_data/ICOADS_R3_PROC/RUN_ZERO/output_data/GRID_HTS/"
config <- config::get(file = my_args[2])
config <- config::get(file = my_args[3])
data_dir <- config$data_dir
pub47_dir <- config$pub_47_by_month_path
out_dir <- config$ens_hts_dir
......
......@@ -15,7 +15,7 @@ options(warn=1)
my_args <- commandArgs(trailingOnly=TRUE)
##--------------------------------------------------------------------------------
config <- config::get(file = my_args[2])
config <- config::get(file = my_args[3])
##data_dir="/noc/mpoc/surface_data/ICOADS_R3_PROC/RUN_ZERO/output_data/FINAL_PROC"
##pub47_dir="/noc/mpoc/surface_data/ICOADS_R3_PROC/HOSTACE_PROC_ANC_INFO/PUB47byMONTH.4Nov2020"
##out_dir="/noc/mpoc/surface_data/ICOADS_R3_PROC/RUN_ZERO/output_data/ENS_HTS/"
......@@ -246,7 +246,7 @@ for ( year in my_args[1]:my_args[2] ) {
}
cat("about to assign country","\n")
source("assign_country.R")
source("../../rscripts/assign_country.R")
cat("assigned country","\n")
##stop()
......
......@@ -249,8 +249,8 @@ if ( print.comm ) cat("cabbage","\n")
mdata$homog.id <- ifelse ( mdata$dck == 720 & mdata$sid == 135 & nchar(mdata$id) == 8, paste0(substr(mdata$id,1,4),"-SEQ") , mdata$homog.id )
mdata$homog.id.flag <- ifelse ( mdata$dck == 720 & mdata$sid == 135 & nchar(mdata$id) == 8, 3 , mdata$homog.id.flag )
mdata<-icoads.utils::add_ID_class(mdata)
if ( print.comm ) cat("added ID class","\n")
mdata$orig.id <- mdata$id
mdata$id <- mdata$homog.id
......
default:
ICOADS_data_path: "/noc/mpoc/surface_data/ICOADS_R3_PROC/INPUT_Rda/r3.0.0/ICOADS_R3.0.0T_"
ICOADS_data_path: "/noc/mpoc/surface_data/ICOADS_R3_TOTAL_RDA/ICOADS_R3.0.0T_"
#ICOADS_data_path: "/noc/mpoc/surface_data/ICOADS_R3_PROC/INPUT_Rda/r3.0.0/ICOADS_R3.0.0T_"
ICOADS_RELEASE: "r3.0.0"
ICOADS_NRT_RELEASE: "r3.0.1"
......@@ -35,19 +36,19 @@ default:
# setup for GIULIA's tracking
track_code_dir: "/noc/users/metman/GIULIA_TRACK_CODE/TRACK_CODE_FROM_JASMIN/"
track_root_athena: "/pgdata/metman/TRACK"
track_input: "/noc/mpoc/surface_data/ICOADS_v0.0.1/RUN_ZERO/output_data/TRACK_INPUT/"
track_output: "/noc/mpoc/surface_data/ICOADS_v0.0.1/RUN_ZERO/output_data/GTRACK_OUT/"
track_input: "/noc/mpoc/surface_data/ICOADS_v0.0.1/output_data/TRACK_INPUT/"
track_output: "/noc/mpoc/surface_data/ICOADS_v0.0.1/output_data/GTRACK_OUT/"
# Final Processing Dirs
data_dir: "/noc/mpoc/surface_data/ICOADS_v0.0.1/RUN_ZERO/output_data/FINAL_PROC"
data_dir: "/noc/mpoc/surface_data/ICOADS_v0.0.1/output_data/FINAL_PROC"
# Height Files/Dirs
ens_hts_dir: "/noc/mpoc/surface_data/ICOADS_v0.0.1/RUN_ZERO/output_data/ENS_HTS/"
ens_grid_hts_dir: "/noc/mpoc/surface_data/ICOADS_v0.0.1/RUN_ZERO/output_data/GRID_HTS/"
def_hts_file: "/noc/mpoc/surface_data/ICOADS_v0.0.1/RUN_ZERO/output_data/default_height_ensemble.Rda"
ens_hts_dir: "/noc/mpoc/surface_data/ICOADS_v0.0.1/output_data/ENS_HTS/"
ens_grid_hts_dir: "/noc/mpoc/surface_data/ICOADS_v0.0.1/output_data/GRID_HTS/"
def_hts_file: "/noc/mpoc/surface_data/ICOADS_v0.0.1/output_data/default_height_ensemble.Rda"
# Diurnal dirs
dir_flags_dir: "/noc/mpoc/surface_data/ICOADS_v0.0.1/RUN_ZERO/output_data/DIURNAL_FAIL_IDS/"
dir_flags_dir: "/noc/mpoc/surface_data/ICOADS_v0.0.1/output_data/DIURNAL_FAIL_IDS/"
# Use TOTAL data or FINAL data
use_total: TRUE
......
......@@ -6,7 +6,7 @@
# start year = $1
# end year = $2
source /noc/users/metman/SETUPS/setup_icproc
source setup_icproc
if ( ! -d ${basedir}OPFILES ) then
#echo Exists
......@@ -23,6 +23,9 @@ touch ${log_output_dir}/dop.$1.$2
touch ${log_output_dir}/dop.$1.$2
echo "logfile" ${log_output_dir}/dop.$1.$2
echo 'HOOVERING' >> ${log_output_dir}/dop.$1.$2
Rscript $codedir/hoover_data.R $1 $2 >> ${log_output_dir}/dop.$1.$2
echo 'running disjoint tracking' >> ${log_output_dir}/dop.$1.$2
echo 'start year' $1 >> ${log_output_dir}/dop.$1.$2
echo 'end year' $2 >> ${log_output_dir}/dop.$1.$2
......
......@@ -29,7 +29,7 @@ echo 'start_at_mid' >> ${log_output_dir}/op.$1.$2
echo "---------------" >> ${log_output_dir}/op.$1.$2
echo "XXX processing shipdata inc. ids" $1 $2 >> ${log_output_dir}/op.$1.$2
Rscript $codedir/process_ships.R $1 $2 >> ${log_output_dir}/op.$1.$2
#Rscript $codedir/process_ships.R $1 $2 >> ${log_output_dir}/op.$1.$2
set y = $1
while ( $y <= $2 )
......
......@@ -10,5 +10,5 @@ module load intel_compiler
module load gcc/7.2
set basedir="/noc/mpoc/surface_data/ICOADS_v0.0.1/"
set codedir="/noc/users/metman/icoads-r-hostace/rscripts/"
set codedir="/noc/users/ricorne/icoads-r-hostace/rscripts/"
set configfile="config.yml"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment