Commit a739b285 authored by bearecinos's avatar bearecinos
Browse files

v0.1 - lizz scripts unchanged

parent d00109fd
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)
print.comm <- TRUE
# args[1] is start year
# args[2] is end year
if(length(args)< 2) {args<-args2}
if(length(args)!= 2) {stop("wrong number of input args or args2 not set")}
source("~/Rscripts/read_rdsfiles.R")
options(warnings=1)
for ( iyear in seq(args[1],args[2]) ) {
cat("------------------------------","\n")
cat(iyear,"\n")
cat("------------------------------","\n")
root.dir<-"/gws/nopw/j04/glosat/development/data/interim/HOSTACE_PROC/"
anc.root<-"/gws/nopw/j04/glosat/development/data/interim/HOSTACE_PROC/HOSTACE_PROC_ANC_INFO/"
dir.in <- paste0(root.dir,"SHIP_CLEAN")
files <- list.files(path = dir.in, pattern = as.character(iyear), full.names = TRUE)
if(length(files)==0) {next}
data <- rbind.files(files)
if ( print.comm ) cat('got input data, num obs =',nrow(data),"\n")
cat("\n")
cat("removing new dups",sum(data$dflag>1),"\n")
print(table(data$dck,data$dflag))
cat("removing new track fails",sum(data$trk==1),"\n")
print(table(data$dck,data$trk))
data<-subset(data,dflag<=1 & trk==0)
if ( "lz" %in% names(data) ) {
cat("removing landlocked flag",sum(!is.na(data$lz)),"\n")
data <- subset(data,is.na(lz)) # lz missing is data over ocean
}
if ( print.comm ) cat('about to format tracking files',"\n")
#============================================
# format tracking files here
#============================================
names.in <- c("obsid","uid","yr","mo","dy","hr","lon","lat","vs",
"ds","sst","slp","at","dpt","w","d","n","vv","ww","w1",
"c1","c1m","recordnumber","pt","dck","qcpt","idflag",
"dupflag","dckpriority","ii","sid","si","sim","id",
"track.id","qcid","date","id.type")
if ( print.comm ) {
cat("iso","\n")
print(table(data$iso))
cat("trk","\n")
print(table(data$trk))
cat("dflag","\n")
print(table(data$dflag))
}
datain<-subset(data,iso==0&trk==0&dflag<=1)
datain$obsid<-datain$uid
datain$recordnumber<-datain$uid
if ( !("sim" %in% names(datain) ) ) datain$sim<-rep(NA,times=nrow(datain))
if ( !("n" %in% names(datain) ) ) datain$n<-rep(NA,times=nrow(datain))
if ( !("w1" %in% names(datain) ) ) datain$w1<-rep(NA,times=nrow(datain))
datain$qcpt<-ifelse(!is.na(datain$pt),datain$pt,5)
datain$dupflag<-datain$dflag
datain$track.id<-datain$qcid
datain$dckpriority<-datain$dck.pri
datain$idflag<-NA # need to check is is OK
datain<-datain[,names.in]
datain<-datain[!is.na(datain$date),]
datain<-subset(datain,id.type %in% c("missing","generic"))
print(table(datain$dck,datain$id.type))
cat('number of obs =',nrow(datain),"\n")
cat('number of ids =',nrow(table(datain$qcid)),"\n")
if (!dir.exists(paste0(root.dir,"NEW_TRACK_INPUT"))) dir.create(paste0(root.dir,"NEW_TRACK_INPUT"))
fn.old<-list.files("NEW_TRACK_INPUT",pattern=as.character(iyear),full.name=T)
for ( ff in fn.old ) { if ( file.exists(ff)) file.remove(ff) }
if ( nrow(datain) == 0 ) next
YRMO <- split(datain, data.frame(datain$yr, datain$mo), drop=TRUE)
filenames <- paste("NEW_TRACK_INPUT/",names(YRMO), ".Rda",sep="")
jj<-mapply(saveRDS, YRMO, file = filenames )
cat('finished clean2track.R',"\n")
} # end loop over year
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)
print.comm <- TRUE
# args[1] is start year
# args[2] is end year
#require(imma) # for track flags
source("~/Rscripts/id_group_func.R")
source("~/Rscripts/read_rdsfiles.R")
dck.nodup<-c(701)
if(length(args)< 2) {args<-args2}
if(length(args)!= 2) {stop("wrong number of input args or args2 not set")}
options(warnings=1)
root.dir<-"/gws/nopw/j04/glosat/development/data/interim/HOSTACE_PROC/"
anc.root<-"/gws/nopw/j04/glosat/development/data/interim/HOSTACE_PROC/HOSTACE_PROC_ANC_INFO/"
ids.gen<-read.table(paste0(anc.root,"gen_id.txt"))
ids.gen<-c(ids.gen$V1,"")
coded.id<-"^[AMO|BAR|BAT|EUC|IDD|MIN|SCA|TBW|MPD]{3}[AA|UK|FR|DE|EU|HK]{2}[0-9]{2}$"
coded.id.g<-"^[AMO|BAR|BAT|EUC|IDD|MIN|SCA|TBW|MPD]{3}[AA|UK|FR|DE|EU|HK]{2}[0-9]{2}_g[0-9]$"
cod.inv1<-"^[AMO|BAR|BAT|EUC|IDD|MIN|SCA|TBW|MPD]{2}[AA|UK|FR|DE|EU|HK]{2}[0-9]{2}$"
cod.inv2<-"^[AMO|BAR|BAT|EUC|IDD|MIN|SCA|TBW|MPD]{3}[AA|UK|FR|DE|EU|HK]{1}[0-9]{2}$"
cod.inv3<-"^[AMO|BAR|BAT|EUC|IDD|MIN|SCA|TBW|MPD]{3}[AA|UK|FR|DE|EU|HK]{2}[0-9]{1}$"
for ( iyear in seq(args[1],args[2]) ) {
cat("------------------------------","\n")
cat(iyear,"\n")
cat("------------------------------","\n")
dir.in <- paste0(root.dir,"MFILES_SHIP_FINAL")
files <- list.files(path = dir.in, pattern = as.character(iyear), full.names = TRUE)
if(length(files)==0) {next}
data <- rbind.files(files)
if ( print.comm ) cat('got input data, num obs =',nrow(data),"\n")
data$dflag<-ifelse(data$dck%in%dck.nodup,0,data$dflag)
data <- subset(data, dflag <= 1)
if ( print.comm ) cat('subset input data, num obs =',nrow(data),"\n")
data$id<-ifelse(data$id == "" | data$id == " ", NA, data$id)
data$new.id<-ifelse(data$new.id == "" | data$new.id == " ", NA, data$new.id)
data$new.id<-ifelse(grepl(coded.id.g,data$new.id),substr(data$new.id,1,7),data$new.id)
data$id.ok<-ifelse(grepl(coded.id,data$new.id),"coded",data$id.ok)
data$id.ok<-ifelse(grepl(cod.inv1,data$new.id),"invalid",data$id.ok)
data$id.ok<-ifelse(grepl(cod.inv2,data$new.id),"invalid",data$id.ok)
data$id.ok<-ifelse(grepl(cod.inv3,data$new.id),"invalid",data$id.ok)
data$id.ok<-ifelse(grepl("BBXX",data$new.id),"invalid",data$id.ok)
data$id.ok<-ifelse(data$new.id %in% ids.gen, "generic", data$id.ok)
for ( ii in names(table(data$new.id[data$id.ok=="few"])) ) {
if (sum(data$new.id==ii,na.rm=T) >= 8 ) {
xx<-unique(c("valid",names(table(data$id.ok[data$new.id==ii]))))
data$id.ok[data$new.id==ii]<-xx[xx!="few"][1]
}
}
data <- id_group_func(data)
data$qcid <- data$new.id
print(table(data$dck,data$id.ok))
# qcid is input for tracking, add count
data$id.type<-data$id.ok
data$qcid<-ifelse(data$id.type=="generic", NA, data$qcid)
tmp <- data.frame(table(data$qcid))
names(tmp) <- c("qcid","qcidFreq")
data <- merge(data, tmp, all.x=T)
data$id.type<-ifelse(data$qcidFreq < 8 & !is.na(data$qcidFreq), "few", data$id.type)
data$qcid<-ifelse(data$qcidFreq < 8, NA, data$qcid)
tmp <- data.frame(table(data$qcid))
names(tmp) <- c("qcid","qcidFreq")
data$qcidFreq<-NULL
data <- merge(data, tmp, all.x=T)
nn<-names(table(data$qcid))
nn<-nn[nn!=""]
data$id.type<-ifelse(data$qcid %in% nn, "valid",data$id.type)
nn<-names(table(data$qcid))
data$lon180<-ifelse(data$lon>=180,data$lon-360,data$lon)
data$lon360<-data$lon
data$iso<-0
data$trk<-0
data$spd<-NA
data$cse<-NA
if ( print.comm ) pb <- txtProgressBar(min = 0, max = length(nn), style = 3)
icount<-0
for ( ship in nn ) {
icount<-icount+1
if ( print.comm ) setTxtProgressBar(pb, icount)
sub<-subset(data,qcid==ship)
source("~/Rscripts/get_isolated.R")
data$iso<-ifelse(data$uid %in% uid.iso,1,data$iso)
sub<-subset(sub,!(sub$uid %in% uid.iso))
sub<-subset(sub,!(dck %in% dck.nodup))
if(is.null(sub) ) next
if(nrow(sub) < 8 ) {
#cat(ship,"now too small",nrow(sub),"\n")
next
}
sub<-sub[order(sub$date),]
if(min(table(sub$date)) == 0 ) {
cat(nn,"duplicate date","\n")
}
tr<-imma::track_check(uid=sub$uid,yr=sub$yr,mo=sub$mo,dy=sub$dy,
hr=sub$hr,lon=sub$lon180,
lat=sub$lat,id=sub$qcid,pt=sub$pt,dck=sub$dck,vsi=sub$vsi,dsi=sub$dsi,
max_direction_change = 60, max_speed_change = 10, max_absolute_speed = 40,
max_midpoint_discrepancy = 150, add_vars = TRUE)
tr<-data.frame(tr)
fail<-subset(tr,tr$trk==1)
if(length(fail$tr)>0 ) {
data$trk<-ifelse(data$uid %in% fail$uid, 1, data$trk)
}
notfail<-subset(tr,tr$trk==0)
sub<-subset(sub,sub$uid%in%notfail$uid)
if(min(diff(sub$date),na.rm=T)==0) {
dup.date<-sub$date[which(duplicated(sub$date))]
uid.dup<-c()
for ( dd in dup.date ) {
tmp<-subset(sub,date==dd)
tmp[order(tmp$dck.pri,-tmp$numecv,-tmp$numel),]
tmp<-tmp[2:nrow(tmp),]
uid.dup<-c(uid.dup,tmp$uid)
}
data$dflag<-ifelse(data$uid %in% uid.dup,3,data$dflag)
}
} # end loop over valid IDs with >= 8 obs
# flagged data, write out clean file
if (!dir.exists(paste0(root.dir,"SHIP_CLEAN"))) dir.create(paste0(root.dir,"SHIP_CLEAN"))
YRMO <- split(data, data.frame(data$yr, data$mo), drop=TRUE)
filenames <- paste("SHIP_CLEAN/",names(YRMO), ".Rda",sep="")
jj<-mapply(saveRDS, YRMO, file = filenames )
cat('finished clean_data.R',"\n")
} # end loop over year
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)
# code written by Elizabeth Kent, National Oceanography Centre
# code written for the NERC HOSTACE project and adapted for
# C3S_311a_Lot2
#--------------------------------------------------------------------------------
Sys.setenv(TZ='GMT')
#--------------------------------------------------------------------------------
basedir<-"/gws/nopw/j04/glosat/development/data/interim/HOSTACE_PROC/"
source("~/Rscripts/assess_match.R")
source("~/Rscripts/write_dup_func.R")
source("~/Rscripts/add_dck_priority.R")
#source("add_ID_class.R")
#require(imma)
#require(data.table)
#require(igraph)
options(warn=1)
options(width=1000)
data.table::setDTthreads(1) # this parellises, test it
cmo<-c("01","02","03","04","05","06","07","08","09","10","11","12")
dck.gts<-c(555,700,792,793,794,795,796,797,888,992,993,994,995,996,997)
dck.dm<-c(926,927,928)
if (!dir.exists(paste0(basedir,"NEW_DUP_FILES"))) dir.create(paste0(basedir,"NEW_DUP_FILES"))
#for ( year in 1850:2018 ) {
if ( length(args) == 0 ) {args<-args2}
for ( year in args[1]:args[2] ) {
#for ( year in seq(1989,1989,1) ) {
for ( imo in 1:12 ) {
#for ( imo in 1:1 ) {
cat("-----------------------------------------------","\n")
cat(year,imo,"\n")
#cat(year,"\n")
cat("-----------------------------------------------","\n")
vlist.match<-c("sst","slp","at","w","d","ww","n","wbt","dpt","vv","rh","wh","nh","w1","osv")
vlist.ecv<-c("sst","slp","at","w","d","wbt","dpt","rh")
vlist.pos<-c("yr","mo","dy","hr","id","dck","sid","lon","lat")
fn.out<-paste0("NEW_DUP_FILES/",year,"-",cmo[imo],".txt")
if (file.exists(fn.out)) file.remove(fn.out)
fn.in <- paste0(basedir,"MFILES_SHIP_PROC/",year,"_",imo,".Rda")
if ( !file.exists(fn.in) ) next
mdata<-readRDS(fn.in)
if ( file.exists(paste0("NEW_PAIRFILES/",year,"_",imo,".Rda"))) {
pp.dup<-readRDS(paste0("NEW_PAIRFILES/",year,"_",imo,".Rda"))
pp.dup<-pp.dup[pp.dup$id.match,]
pp.dup$i.id<-gsub("\\|","",pp.dup$i.id)
} else {
pp.dup <- NULL
#mdata$dflag<-0
#nodup<-subset(mdata,mo==imo)
#nodup<-nodup[,c("uid","dflag")]
#nodup$brace<-"{}"
#nodup$uid<-paste0("ICOADS-30-",nodup$uid)
#write.table(nodup,fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE,sep="|")
next
}
if ( nrow(pp.dup) == 0 ) {
mdata$dflag<-0
#nodup<-subset(mdata,mo==imo)
#nodup<-nodup[,c("uid","dflag")]
#nodup$brace="{}"
#nodup$uid<-paste0("ICOADS-30-",nodup$uid)
#write.table(nodup,fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE,sep="|")
next
}
if ( FALSE ) {
# want to make sure that report in pair flagged as duplicate doesn't contain info missing in "best" report
pp.dup$minnum<-ifelse(pp.dup$numel >= pp.dup$i.numel,pp.dup$i.numel,pp.dup$numel)
pp.dup<-subset(pp.dup,nmatch>=minnum) # want to look at this later, some dck (e.g. 721 have different vars in diff reps.)
}
pp.dup<-pp.dup[pp.dup$id.match,]
#------------------------------------------------------------------------------------------
# find uid groups
cat("finding uid groups .... ")
pp<-pp.dup[,c("uid","i.uid")]
foo<-lapply(1:nrow(pp),function(i) c(pp[i,1],pp[i,2]))
edges <- do.call(rbind, lapply(foo, function(x) {
if (length(x) > 1) cbind(head(x, -1), tail(x, -1)) else NULL
}))
rm(foo)
# from the igraph package
g <- igraph::graph.data.frame(edges, directed=FALSE)
# this is a list with each member being the uids to group
dup.uids<-split(igraph::V(g)$name, igraph::clusters(g)$membership)
cat("got uid groups","\n")
#------------------------------------------------------------------------------------------
#cat("about to merge dup group info"," ")
vlist.match<-vlist.match[which(vlist.match %in% names(mdata))]
mdata$numel<-rowSums(!is.na(mdata[,vlist.match]))
vlist.ecv<-vlist.ecv[which(vlist.ecv %in% names(mdata))]
mdata$numecv<-rowSums(!is.na(mdata[,vlist.ecv]))
# append group number to data frame mdata
names(dup.uids)<-paste0(names(dup.uids),".")
aa<-as.data.frame(unlist(dup.uids))
aa$dup.group<-rownames(aa)
rownames(aa)<-NULL
names(aa)<-c("uid","dup.group")
aa$dup.group<-substr(aa$dup.group, 1, regexpr("\\.", aa$dup.group)-1)
mdata2<-merge(mdata,aa)
if ( nrow(mdata2) == 0 ) {
cat("no potential duplicates","\n")
sp<-NULL
sp.good<-NULL
sp.check<-NULL
#nodup<-subset(mdata,mo==imo)
#nodup<-nodup[,c("uid","dflag")]
#nodup$brace="{}"
#nodup$uid<-paste0("ICOADS-30-")
#write.table(nodup,fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE,sep="|")
{next}
}
rm(aa)
if ( nrow(pp.dup) == 0 ) {
mdata$dflag<-0
#nodup<-subset(mdata,mo==imo)
#nodup<-nodup[,c("uid","dflag")]
#nodup$brace="{}"
#nodup$uid<-paste0("ICOADS-30-")
#write.table(nodup,fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE,sep="|")
next
}
vlist.add<-c("uid","dups","dup.group","dck.pri","id.class","is.gts","is.dm","c1m","immv","numel","numecv")
vp<-c(vlist.pos,vlist.match[which(vlist.match %in% names(mdata2))],names(mdata2)[grepl("prec$",names(mdata2))],vlist.add)
# sort them within dup groups so the best duplicate appears first
if ( !("immv" %in% names(mdata2) ) ) mdata2$immv<-NA
if ( !("c1m" %in% names(mdata2) ) ) mdata2$c1m<-NA
mdata2<-add_dck_priority(mdata2)
#mdata2<-add_ID_class(mdata2)
mdata2$is.gts<-ifelse(mdata2$dck %in% dck.gts, TRUE, FALSE)
mdata2$is.dm<-ifelse(mdata2$dck %in% dck.dm, TRUE, FALSE)
sub<-mdata2[order(as.numeric(mdata2$dup.group),mdata2$dck.pri,-mdata2$immv,-mdata2$numecv,-mdata2$numel,-is.na(mdata2$c1m),mdata2$dups),vp]
#cat("4 ")
sp<-split(sub,sub$dup.group)
# find maximum difference across variables
ind<-sapply(sp,var.diff)
cat("got quality info","\n")
# split into those which agree and those which don't
diff.lim<-0.01
sp.good<-sp[which(ind<diff.lim)]
sp.check<-sp[which(ind>=diff.lim & ind < 0.2)] # inspection suggests this is a reasonable value
ind<-sapply(sp.check,var.diff)
cat("split by quality info","\n")
cat("number of dup groups",length(sp),"\n")
if ( length(sp.good) > 0 ) {
cat("number of OK dup groups",length(sp.good),"\n")
cat("number of identified dups",sum(sapply(sp.good, nrow))-length(sp.good),"\n")
} else {
cat("no OK dup groups","\n")
}
cat("number of icoads dups",sum(!is.na(mdata$dups[which(mdata$dups>2)])),"of",nrow(mdata),"\n")
if ( length(sp.check) > 0 ) {
cat("number of suspect dup groups",length(sp.check),"\n")
cat("% needing checking = ",round(length(sp.check)/length(sp)*100,1),"\n")
for ( i in 1:length(sp.check) ) {
print(ind[i])
print(sp.check[[i]])
}
#readline(prompt="Press [enter] to continue")
} else {
cat("no suspect dup groups","\n")
}
## Write data to file
if ( length(sp.good) > 0 | length(sp.check) > 0 ) {
if ( length(sp.good) > 0 ) {
sp.tmp <- lapply(sp.good,write.fun2)
sp.tmp2 <- do.call("rbind",sp.tmp)
# subset to only uid in primary month
motmp<-subset(mdata,mo==imo)
#print(length(sp.tmp2))
sp.tmp2<-sp.tmp2[which(substr(sp.tmp2,11,16)%in% motmp$uid)]
#print(length(sp.tmp2))
motmp<-NULL
if ( length(sp.tmp2) > 0 ) write.table(sp.tmp2,fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE)
}
## these are unclear, mark as duplicates with no preference
if ( length(sp.check) > 0 ) {
sp.tmp <- lapply(sp.check,write.fun3)
sp.tmp2 <- do.call("rbind",sp.tmp)
# subset to only uid in primary month
motmp<-subset(mdata,mo==imo)
#print(length(sp.tmp2))
sp.tmp2<-sp.tmp2[which(substr(sp.tmp2,11,16)%in% motmp$uid)]
#print(length(sp.tmp2))
motmp<-NULL
#print(sp.tmp2)
if ( length(sp.tmp2) > 0 ) write.table(sp.tmp2,fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE)
}
#stop()
## Get dflag
sp.tmp<-NULL
sp.tmp2<-NULL
sp2.tmp<-NULL
sp2.tmp2<-NULL
if ( length(sp.good) > 0 ) {
sp.tmp <- lapply(sp.good,get.dflag2)
sp.tmp2 <- do.call("rbind",sp.tmp)
sp.tmp2 <- as.data.frame(sp.tmp2)
sp.tmp2$dflag <- as.numeric(sp.tmp2$dflag)
}
if ( length(sp.check) > 0 ) {
sp2.tmp <- lapply(sp.check,get.dflag3)
sp2.tmp2 <- do.call("rbind",sp2.tmp)
sp2.tmp2 <- as.data.frame(sp2.tmp2)
sp2.tmp2$dflag <- as.numeric(sp2.tmp2$dflag)
}
if (!is.null(sp.tmp2) & !is.null(sp2.tmp2) ) {
sp.tmp2<-rbind(sp.tmp2,sp2.tmp2)
} else if ( !is.null(sp.tmp2) ) {
sp.tmp2<-sp.tmp2
} else if ( !is.null(sp2.tmp2) ) {
sp.tmp2<-sp2.tmp2
}
} # end any potential duplicates
if ( length(sp.good) > 0 | length(sp.check) > 0 ) {
mdata<-merge(mdata,sp.tmp2,by="uid",all.x=T)
mdata$dflag<-ifelse(is.na(mdata$dflag),0,mdata$dflag)
} else {
mdata$dflag<-0
}
nodup<-subset(mdata,dflag==0 & mo == imo)
if ( nrow(nodup) > 0 ) {
#nodup<-nodup[,c("uid","dflag")]
#nodup$uid<-paste0("ICOADS-30-",nodup$uid)
#nodup$brace<-"{}"
#write.table(nodup,fn.out,row.names=FALSE,col.names=FALSE,quote=FALSE,append=TRUE,sep="|")
}
print(addmargins(table(mdata$dck,mdata$dflag)))
print(addmargins(table(mdata$dck,mdata$dups)))
print(addmargins(table(pp.dup$dck,pp.dup$i.dck)))
#mdata<-subset(mdata,dflag!=3)
#pdf(paste0("MERGE_PLOTS_NEW/",year,".pdf"),height=10,width=10)
#source('merge_ids.R')
#dev.off()
#if ( !got.to.end ) {file.remove(paste0("MERGE_PLOTS_NEW/",year,".pdf"))}
next
} # end loop over months
} # end loop over years
cat("finishing new_get_dups.R",args[1],args[2],"\n")
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)
# code written by Elizabeth Kent, National Oceanography Centre
# code written for the NERC HOSTACE project and adapted for
# C3S_311a_Lot2
#--------------------------------------------------------------------------------
Sys.setenv(TZ='GMT')
#--------------------------------------------------------------------------------
anc.root <- "/gws/nopw/j04/glosat/development/data/interim/HOSTACE_PROC/HOSTACE_PROC_ANC_INFO/"
source("~/Rscripts/get_mismatch.R")
source("~/Rscripts/new_add_match_id.R")
source("~/Rscripts/assess_match.R")
source("~/Rscripts/add_dck_priority.R")
#source("add_ID_class.R")
source("~/Rscripts/get_prec.R")
#source("~/Rscripts/add_date2.R")
#require(imma)
#require(lubridate)
source("~/Rscripts/read_rdsfiles.R") # rbind.all.columns
#source("/noc/mpoc/surface_data/TRACKING/FinalTrack/RSCRIPTS/add_date.R")
#require(igraph)
options(warn=1)
options(width=200)
basedir<-"/gws/nopw/j04/glosat/development/data/interim/HOSTACE_PROC/"
if (!dir.exists(paste0(basedir,"NEW_PAIRFILES"))) dir.create(paste0(basedir,"NEW_PAIRFILES"))
cmo<-c("01","02","03","04","05","06","07","08","09","10","11","12")
dck.gts<-c(555,700,792,793,794,795,796,797,888,992,993,994,995,996,997)
dck.dm<-c(926,927,928)
ids.gen<-read.table(paste0(anc.root,"gen_id.txt"))
ids.gen<-c(ids.gen$V1,"")
to.print<-TRUE
if ( length(args) == 0 ) {args<-args2}
for ( year in args[1]:args[2] ) {
for ( imo in 1:12 ) {
cat("-----------------------------------------------","\n")
cat(year,imo,"\n")
cat("-----------------------------------------------","\n")
fn.in <- paste0(basedir,"MFILES_SHIP_PROC/",year,"_",imo,".Rda")
if ( !file.exists(fn.in) ) next
mdata<-readRDS(fn.in)
# get next month
imo2<-ifelse(imo<12,imo+1,1)
year2<-ifelse(imo2==1,year+1,year)
if ( year2 != 2019 ) {
fn.in2<-paste0(basedir,"MFILES_SHIP_PROC/",year2,"_",imo2,".Rda")
if ( file.exists(fn.in2) ) {
d2<-readRDS(fn.in2)
mdata<-rbind.all.columns(mdata,d2)
rm(d2)
}
}
mdata$lon2<-ifelse(mdata$lon >= 180, mdata$lon-360, mdata$lon)
mdata<-add_dck_priority(mdata)
#mdata<-add_ID_class(mdata)
head(mdata$itu_co)
mdata$is.gts<-ifelse(mdata$dck %in% dck.gts, TRUE, FALSE)
mdata$is.dm<-ifelse(mdata$dck %in% dck.dm, TRUE, FALSE)
tmp<-data.frame(table(mdata$id))
names(tmp)<-c("id","id.num")
mdata<-merge(mdata,tmp,all.x=T)
mdata$id.num<-ifelse(mdata$id %in% ids.gen, 0, mdata$id.num)
# get dck/sid precision variables from table
mdata<-get_prec(mdata,paste0(anc.root,"prec_input.txt"))
#-----------------------------------------------------------
# full duplicates, similar contents and ID
#-----------------------------------------------------------
#-----------------------------------------------------------
# default tolerances for candidate matches
# if not passed, others set to zero
# tol.id=2,tol.pos=1,tol.sst=1,tol.slp=1,tol.at=1,
# tol.dpt=1,tol.w=1,tol.d=1,tol.ww=0,tol.w1=0,tol.vv=0,tol.n=0
#-----------------------------------------------------------
# to allow mismatch in variables used as key, need to
# define a new variable, e.g. rounded
#-----------------------------------------------------------
# new "key" variables
# use yr/mo/dy from new.date variable
#mdata$yr<-as.numeric(format(mdata$new.date,"%Y"))
#mdata$mo<-as.numeric(format(mdata$new.date,"%m"))
#mdata$dy<-as.numeric(format(mdata$new.date,"%d"))
mdata$jday<-as.numeric(format(mdata$date,"%j"))
mdata$date<-NULL # mucks up data.table
mdata$old.date<-NULL # mucks up data.table
#mdata$new.date<-NULL # mucks up data.table
mdata$mlstdate<-NULL # mucks up data.table
mdata$k.yr<-mdata$yr
mdata$k.mo<-mdata$mo
mdata$k.dy<-mdata$dy
mdata$k.jday<-mdata$jday
mdata$k.hr<-round(mdata$hr)
mdata$rlat<-round(mdata$lat)
mdata$rlon<-round(mdata$lon)
mdata$rsst<-round(mdata$sst)
mdata$alat<-abs(mdata$lat)
mdata$alon<-abs(mdata$lon)
#key.list<-c("k.yr","k.mo","k.dy","k.hr","rlat","rlon")
pp.dup<-NULL
pp.dup.full<-NULL
pp.dup.1day<-NULL
pp.dup.1hr<-NULL
pp.dup.mo<-NULL
if ( nrow(mdata) > 500000 ) {
ndays<-5
} else if ( nrow(mdata) > 50000 ) {
ndays<-10
} else {
ndays<-400
}
if ( mdata$yr[1] >= 2015 ) {ndays<-1}
mdata.2mo<-mdata
mdata<-subset(mdata,mo==imo)
# make sure we have at least 2 days to compare at the end
date1<-paste0(year,"-",imo,"-01")
date2<-paste0(year2,"-",imo2,"-01")
dy.in.mo<-as.numeric(difftime(date2, date1 ))
for ( iday in seq(1,max(1,dy.in.mo-ndays+1),ndays) ) {
start.day<-iday
end.day<-iday+ndays-1
if ( end.day >= dy.in.mo-2 ) end.day <- 31
#cat(iday,start.day,end.day,"\n")
sub<-subset(mdata,dy>=start.day & dy<=end.day)
sub.2dy<-subset(mdata,dy>=start.day & dy<=end.day+1)
sub.2mo<-subset(mdata.2mo,dy>=start.day & dy<=end.day)
if ( ndays < 400 ) cat("running for days",min(sub$dy),max(sub$dy),"\n")
# these key values used to index table, linking reports that match
key.list<-c("k.yr","k.mo","k.dy","k.hr","rlat","rlon")
#key.list<-c("yr","mo","dy","rlat","rlon")
# these variables are used to refine matching, values from both reports kept
#close.list<-c("yr","mo","dy","hr","id","sst","slp","at","w","d","vv","ww","dpt","n","lat","lon")
close.list<-c("id","sst","slp","at","w","d","vv","ww","dpt","n","lat","lon","jday")
cat('about to get matches',"\n")
pp.dup.full<-get_mis(df.in=sub,key.list=key.list,close.list=close.list,
tol.pos=0.51,tol.id=9,
tol.w=2,tol.d=13,
tol.sst=1.01,tol.at=1.01,tol.dpt=1.01,tol.slp=1.01,
tol.ww=99,tol.w1=99,tol.vv=99)
cat('got matches, full date/time & pos',nrow(pp.dup.full),"\n")
# mismatch of 1 day, at midnight
key.list<-c("k.yr","k.mo","k.hr","rlat","rlon")
#close.list<-c("id","sst","slp","at","w","d","vv","ww","dpt","n","lat","lon","jday")
pp.dup.1day<-get_mis(df.in=sub.2dy,key.list=key.list,close.list=close.list,
tol.pos=0.51,tol.id=9,tol.hr=0,tol.dy=999,
tol.w=2,tol.d=13,
tol.sst=1.01,tol.at=1.01,tol.dpt=1.01,tol.slp=1.01,
tol.ww=99,tol.w1=99,tol.vv=99)
# allow mismatch of 1 day
pp.dup.1day<-subset(pp.dup.1day,abs(pp.dup.1day$jday-pp.dup.1day$i.jday) == 1 )
# as day is mismatched, require more elements to match
pp.dup.1day<-pp.dup.1day[pp.dup.1day$numel>=4&pp.dup.1day$i.numel>=4,]
#pp.dup.1day<-pp.dup.1day[abs(pp.dup.1day$dy-pp.dup.1day$i.dy)==1,]
# and only midnight obs
#pp.dup.1day<-pp.dup.1day[(pp.dup.1day$hr==0),]
cat('got matches, full pos, 1 day different',nrow(pp.dup.1day),"\n")
# mismatch of 1 hour
key.list<-c("k.yr","k.mo","k.dy","rlat","rlon")
#close.list<-c("id","sst","slp","at","w","d","vv","ww","dpt","n","lat","lon","jday")
pp.dup.1hr<-get_mis(df.in=sub.2dy,key.list=key.list,close.list=close.list,
tol.pos=0.51,tol.id=9,tol.hr=99,tol.dy=0,
tol.w=2,tol.d=13,
tol.sst=1.01,tol.at=1.01,tol.dpt=1.01,tol.slp=1.01,
tol.ww=99,tol.w1=99,tol.vv=99)
# allow mismatch of 1 hour
pp.dup.1hr<-subset(pp.dup.1hr,abs(hr-i.hr) == 1 )
# these are due to calculation of GMT from local time differing between decks - select only different deck
pp.dup.1hr<-subset(pp.dup.1hr,dck!=i.dck )
# and exclude hourly data
hour.dck<-c(117,708,992,792)
pp.dup.1hr<-subset(pp.dup.1hr,!(dck %in% hour.dck))
hour.sid<-c(109)
pp.dup.1hr<-subset(pp.dup.1hr,!(sid %in% hour.sid))
# as hr is mismatched, require more elements to match
pp.dup.1hr<-pp.dup.1hr[pp.dup.1hr$numel>=4&pp.dup.1hr$i.numel>=4,]
cat('got matches, full pos, 1 hr different',nrow(pp.dup.1hr),"\n")
# mismatch of 1 month
key.list<-c("k.yr","k.jday","k.hr","rlat","rlon")
#close.list<-c("id","sst","slp","at","w","d","vv","ww","dpt","n","lat","lon","jday")
pp.dup.mo<-get_mis(df.in=sub.2mo,key.list=key.list,close.list=close.list,
tol.pos=0.51,tol.id=9,tol.hr=0,tol.dy=0,tol.mo==99,
tol.w=2,tol.d=13,
tol.sst=1.01,tol.at=1.01,tol.dpt=1.01,tol.slp=1.01,
tol.ww=99,tol.w1=99,tol.vv=99)
# allow mismatch of 1 month or more
pp.dup.mo<-subset(pp.dup.mo,abs(pp.dup.mo$mo-pp.dup.mo$i.mo) > 1 )
# as month is mismatched, require more elements to match
pp.dup.mo<-pp.dup.mo[pp.dup.mo$numel>=4&pp.dup.mo$i.numel>=4,]
cat('got matches, full pos, month different',nrow(pp.dup.mo),"\n")
if ( (is.null(pp.dup)) & (nrow(pp.dup.full) > 0 ) ) {
pp.dup<-pp.dup.full
} else if ( (is.null(pp.dup)) & nrow(pp.dup.1day)>0 ) {
pp.dup<-pp.dup.1day
} else if ( (is.null(pp.dup)) & nrow(pp.dup.1hr)>0 ) {
pp.dup<-pp.dup.1hr
} else if ( (is.null(pp.dup)) & nrow(pp.dup.mo)>0 ) {
pp.dup<-pp.dup.mo
} else if (is.null(pp.dup)) {
} else { # got pp.up, append data if any
if ( nrow(pp.dup.full)>0 ) pp.dup<-rbind.all.columns(pp.dup,pp.dup.full)
if ( nrow(pp.dup.1hr)>0 ) pp.dup<-rbind.all.columns(pp.dup,pp.dup.1hr)
if ( nrow(pp.dup.1day)>0 ) pp.dup<-rbind.all.columns(pp.dup,pp.dup.1day)
if ( nrow(pp.dup.mo)>0 ) pp.dup<-rbind.all.columns(pp.dup,pp.dup.mo)
}
} # end loop over days
# remove irrelevant row labels
rownames(pp.dup)<-NULL
if ( is.null(pp.dup) ) {next}
if ( nrow(pp.dup) > 0 ) {
# add difference variable to pairs data frame d.var
# and flag to say if it is within precision m.var
for ( myvar in close.list ) {
if ( class(pp.dup[,myvar]) == "character" ) {next}
pp.dup[,paste0("d.",myvar)]<-round(pp.dup[,myvar]-pp.dup[,paste0("i.",myvar)],2)
if ( myvar %in% c("yr","mo","dy","hr","jday")) {next}
if ( !(myvar %in% names(mdata)) ) {next}
# is variable within precision for 1st in pair
pp.dup[,paste0("m.",myvar)]<-ifelse(abs(pp.dup[,paste0("d.",myvar)])<pp.dup[,paste0(myvar,".prec")],TRUE,FALSE)
# is variable within precision for 2nd in pair
pp.dup[,paste0("m.",myvar)]<-ifelse(abs(pp.dup[,paste0("d.",myvar)])<pp.dup[,paste0("i.",myvar,".prec")],TRUE,pp.dup[,paste0("m.",myvar)])
# is either missing
pp.dup[,paste0("m.",myvar)]<-ifelse(is.na(pp.dup[,paste0("m.",myvar)]),NA,pp.dup[,paste0("m.",myvar)])
# if allowing mismatch at prec = 1, make sure at least one is rounded
pp.dup[,paste0("m.",myvar)]<-ifelse((pp.dup[,paste0("i.",myvar,".prec")] == 1 | pp.dup[,paste0(myvar,".prec")]) & abs(pp.dup[,paste0("d.",myvar)]) > 0.1 & (pp.dup[,myvar]%%1 != 0 | pp.dup[,paste0("i.",myvar)]%%1 != 0 ) | (pp.dup[,myvar]%%1 != 0.5 & pp.dup$dck == 555 | pp.dup[,paste0("i.",myvar)]%%1 != 0.5 & pp.dup$i.dck == 555 ),FALSE,pp.dup[,paste0("m.",myvar)])
}
# see if IDs match
pp.dup<-new_add_match_id(pp.dup)
saveRDS(pp.dup,paste0("NEW_PAIRFILES/",year,"_",imo,".Rda"))
if ( ndays < 400 ) {
cat("overall",nrow(pp.dup),"pairs","\n")
cat("overall",nrow(pp.dup[which(pp.dup$hr!=pp.dup$i.hr),]),"pairs with hour mismatch","\n")
cat("overall",nrow(pp.dup[which(pp.dup$dy!=pp.dup$i.dy),]),"pairs with day mismatch","\n")
cat("overall",nrow(pp.dup[which(pp.dup$mo!=pp.dup$i.mo),]),"pairs with month mismatch","\n")
}
if ( "id.match" %in% names(pp.dup) & to.print ) source("~/Rscripts/print_id_match_info.R")
pp.dup<-pp.dup[pp.dup$id.match,]
} else {
next
}
} # end loop over months
} # end loop over years
cat('finishing get pairs',args[1],args[2],"\n")
new_homog_ids <- function(df) {
############################################################
# these are standard linkages between IDs
############################################################
# code written by Elizabeth Kent, National Oceanography Centre
# for C3S_311a_Lot2, adapted for HOSTACE
#require(stringdist)
anc.root<-"/gws/nopw/j04/glosat/development/data/interim/HOSTACE_PROC/HOSTACE_PROC_ANC_INFO/"
linked.ids<-NULL
df$homog.id <- toupper(df$id)
df$homog.id.flag <- 0
#if ( !("homog.id" %in% names(df) ) ) {df$homog.id<-NA}
#if ( !("homog.id.flag" %in% names(df) ) ) {df$homog.id.flag<-0}
# set all to old id
#df$homog.id <- ifelse( df$homog.id.flag == 0, df$id, df$homog.id)
# 0 is unchanged
# 1 is changed by known match
# 2 is changed by match to pub47
# 3 is changed by sequential join
# get known matches which might not have duplicates
nnn<-names(table(df$id[which(df$dck%in% c(194,201,202,203,227))]))
if ( length(nnn) > 0 ) {
nnn<-nnn[!is.na(nnn)]
nnn<-nnn[nnn!=""]
nnn<-nnn[nchar(nnn) > 4]
linked.ids<-NULL
if ( !is.null(nnn) ) {
if ( length(nnn) > 0 ) {
l.nnn<-length(nnn)
if ( l.nnn > 1 ) {
for ( i in seq(1,max(1,l.nnn-1),1)) {
for ( j in seq(i+1,l.nnn,1 )) {
#print(paste(i,j,l.nnn))
#print(paste(nnn[i],nnn[j]))
if ( i > length(nnn) ) {next}
if ( j > length(nnn) ) {next}
#if ( is.na(nnn[i]) | is.na(nnn[j])) {next}
#if ( nnn[i] == "" | nnn[j] == "" ) {next}
if (nnn[i] == nnn[j] ) {next}
#if ( nchar(nnn[i]) <= 1 | nchar(nnn[j]) <= 1 ) {next}
if ( stringdist::stringdist(nnn[i],nnn[j]) == abs(nchar(nnn[i])-nchar(nnn[j]))) {
#cat(i,j,nnn[i],nnn[j],"\n")
linked.ids<-rbind(linked.ids,data.frame(id=nnn[i],i.id=nnn[j]))
}
if ( gsub("7 ","0",nnn[i]) == nnn[j] | gsub("7 ","0",nnn[j]) == nnn[i]) {
#cat(i,j,nnn[i],nnn[j],"\n")
linked.ids<-rbind(linked.ids,data.frame(id=nnn[i],i.id=nnn[j]))
}
} # end loop over j
} # end loop over i
}
}
}
} # end length nnn = 0
if ( !is.null(linked.ids) ) {
#print(linked.ids)
link2<-linked.ids
link2$id<-ifelse(nchar(linked.ids$id) == 5, linked.ids$id, linked.ids$i.id)
link2$i.id<-ifelse(nchar(linked.ids$id) == 5, linked.ids$i.id, linked.ids$id)
print(link2)
for ( i in 1:nrow(link2) ) {
#cat(i,link2$id[i],link2$i.id[i],"\n")
df$homog.id<-ifelse(df$id == link2$i.id[i], link2$id[i], df$homog.id)
df$homog.id.flag<-ifelse(df$id == link2$i.id[i], 1, df$homog.id.flag)
#cat(sum(df$id!=df$homog.id),"\n")
}
}
df$homog.id.flag<-ifelse(df$dck == 194 & nchar(df$id) == 6 & df$id == df$homog.id, 1, df$homog.id.flag)
df$homog.id<-ifelse(df$dck == 194 & nchar(df$id) == 6 & df$id == df$homog.id, substr(df$id,2,6), df$homog.id)
df$homog.id <- ifelse( df$dck %in% c(192,215,720) & nchar(df$id) == 8 & grepl("[{]",df$id), gsub("[{]","0",df$id), df$homog.id)
df$homog.id <- ifelse( df$dck %in% c(192,215,720) & nchar(df$id) == 8 & grepl("J",df$id), gsub("J","0",df$id), df$homog.id)
df$homog.id <- ifelse( df$dck %in% c(192,215,720) & nchar(df$id) == 8 & grepl("[AK]",df$id), gsub("[AK]","1",df$id), df$homog.id)
df$homog.id <- ifelse( df$dck %in% c(192,215,720) & nchar(df$id) == 8 & grepl("[BL]",df$id), gsub("[BL]","2",df$id), df$homog.id)
df$homog.id <- ifelse( df$dck %in% c(192,215,720) & nchar(df$id) == 8 & grepl("[CM]",df$id), gsub("[CM]","3",df$id), df$homog.id)
df$homog.id <- ifelse( df$dck %in% c(192,215,720) & nchar(df$id) == 8 & grepl("[DN]",df$id), gsub("[DN]","4",df$id), df$homog.id)
df$homog.id <- ifelse( df$dck %in% c(192,215,720) & nchar(df$id) == 8 & grepl("[EO]",df$id), gsub("[EO]","5",df$id), df$homog.id)
df$homog.id <- ifelse( df$dck %in% c(192,215,720) & nchar(df$id) == 8 & grepl("[FP]",df$id), gsub("[FP]","6",df$id), df$homog.id)
df$homog.id <- ifelse( df$dck %in% c(192,215,720) & nchar(df$id) == 8 & grepl("[GQ]",df$id), gsub("[GQ]","7",df$id), df$homog.id)
df$homog.id <- ifelse( df$dck %in% c(192,215,720) & nchar(df$id) == 8 & grepl("[HR]",df$id), gsub("[HR]","8",df$id), df$homog.id)
df$homog.id <- ifelse( df$dck %in% c(192,215,720) & nchar(df$id) == 8 & grepl("[IS]",df$id), gsub("[IS]","9",df$id), df$homog.id)
df$homog.id.flag <- ifelse( df$dck %in% c(192,215,720) & df$id != df$homog.id, 1, df$homog.id.flag )
########################################################################
# replace callsigns that have a pub47 call as substring with pub47 call
########################################################################
#p47id<-read.table("all_ids_pub47.txt",header=T)
#pub47.calls<-subset(p47id,nchar(id)>=4)
p47tmp<-read_trackmonth(paste0(anc.root,"PUB47byMONTH"),syr=min(df$yr)-1,eyr=max(df$yr)+1)
pub47.calls<-p47tmp$callsign
call.dcks <-c(128,233,254,255,555,700,708,709,732,735,749,781,792,849,874,875,888,889,892,926,927,992,993,995,999)
pub47.calls<-pub47.calls[nchar(pub47.calls) >= 4]
pub47.calls<-pub47.calls[grepl("[A-Z]",pub47.calls)]
pub47.calls<-unique(pub47.calls)
if(length(pub47.calls) == 0 ) { return(df) }
########################################################################
# got full list of pub47 ids
########################################################################
id.keep<-c("KYOKUYO2","KYOKUYO3","EDWARD","GRIFFO","SEACAT","INNOVATOR","PRINCESS","PRETORIA","ALBATROS","DAMEROMA","BATUK00","MANE","MANITOWOC","TAMOURE")
if ( sum(df$dck %in% call.dcks) == 0 & min(df$yr) < 1966 ) return(df)
m.tmp<-subset(df,!(id %in% c("","SHIP","AAAA","MASKSTID")) & nchar(id) >= 4 & grepl("[A-Z]",toupper(id)))
pub47.match<-c()
for ( call in pub47.calls ) {
#cat(call," ")
tmp<-subset(m.tmp,stringdist::stringdist(call,m.tmp$id) == abs(nchar(call)-nchar(m.tmp$id)) & call!=m.tmp$id )
#if ( length(tmp$id) > 0 ) print(paste(call,paste(names(table(tmp$id)),collapse=","),sep=","))
#print(table(tmp$id))
if ( length(tmp$id) > 0 ) pub47.match <- c(pub47.match,paste(call,paste(names(table(tmp$id)),collapse=","),sep=","))
}
#print(class(pub47.match))
if ( is.null(pub47.match)) return(df)
if ( length(pub47.match) == 0 ) return(df)
#print('bison')
#print(pub47.match)
# find and remove multiple matches
a<-paste(pub47.match,collapse=",")
a<-gsub(",,",",",a)
b<-unlist(strsplit(a, ","))
pdup<-b[duplicated(b)]
#pdup<-pdup[pdup!=""]
#print(pdup)
for ( idup in pdup ) {
#print(idup)
patt <- paste0('\\b', idup, '\\b')
pub47.match<-gsub(patt,"XXXXXX",pub47.match)
}
pub47.match<-pub47.match[!grepl("XXXXXX",pub47.match)]
print(length(pub47.match))
for ( i in 1:length(pub47.match) ) {
tmp<-unlist(strsplit(pub47.match[i], ","))
df$homog.id<-ifelse(df$id %in% tmp[2:length(tmp)],tmp[1],df$homog.id)
df$homog.id.flag<-ifelse(df$id %in% tmp[2:length(tmp)],2,df$homog.id.flag)
}
# keep some IDS that don't seem to be callsigns
df$homog.id.flag <- ifelse ( df$id %in% id.keep, 0, df$homog.id.flag )
df$homog.id <- ifelse( df$homog.id.flag == 0, df$id, df$homog.id)
print(tail(sort(table(df$uid))))
return(df)
}
This diff is collapsed.
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)
source("~/Rscripts/read_rdsfiles.R")
source("~/Rscripts/add_shipnames.R")
source("~/Rscripts/new_homog_ids.R")
source("~/Rscripts/add_ID_class.R")
#source(~/Rscripts/"liz_merge.R")
require(stringdist)
print.comm <- FALSE
basedir<-"."
call.dcks <-c(128,233,254,255,555,700,708,709,732,735,749,781,792,849,874,875,888,889,892,926,927,992,993,995,999)
if (!dir.exists("MFILES_SHIP_IDPROC")) dir.create("MFILES_SHIP_IDPROC")
options(warn=1)
if ( length(args) < 2 ) {args<-args2}
#mydf<-data.frame(c())
for ( year in args[1]:args[2] ) {
cat(year," ")
for ( imo in 1:12 ) {
cat(imo," ")
fn<-paste0("MFILES_SHIP/",year,"_",imo,".Rda")
if(!file.exists(fn))next
mdata<-readRDS(paste0("MFILES_SHIP/",year,"_",imo,".Rda"))
fn<-paste0("MFILES_NOTSHIP/",year,"_",imo,".Rda")
if(file.exists(fn)) {
mdata2<-readRDS(paste0("MFILES_NOTSHIP/",year,"_",imo,".Rda"))
mdata<-subset(mdata,!(uid %in% mdata2$uid))
}
if ( print.comm ) cat("got mdata","\n")
#cat('about to homog',"\n")
mdata<-new_homog_ids(mdata)
if ( length(mdata$uid[duplicated(mdata$uid)]) > 0 ) stop("after homog")
if ( print.comm ) cat("homogenised IDs","\n")
#cat('about to add shipnames',"\n")
mdata<-add_shipnames(mdata)
if ( length(mdata$uid[duplicated(mdata$uid)]) > 0 ) stop("after shipname")
if ( print.comm ) cat("added shipnames","\n")
#cat('added shipnames',"\n")
fn<-paste0(basedir,"SEQ_IDS/dck192_",year,".",imo,".Rda")
if ( file.exists(fn) ) {
cat("\n","merging",fn,"\n")
seq.ids<-readRDS(fn)
seq.ids$nid2<-seq.ids$nid
mdata<-merge(mdata,seq.ids[,c("uid","nid2")],by="uid",all.x=TRUE)
if ( length(mdata$uid[duplicated(mdata$uid)]) > 0 ) stop("after seqid")
#mdata<-liz_merge(mdata,seq.ids[,c("uid","nid2")],by.var="uid")
mdata$homog.id<-ifelse(!is.na(mdata$nid2),mdata$nid2,mdata$homog.id)
mdata$homog.id.flag<-ifelse(!is.na(mdata$nid2),99,mdata$homog.id.flag)
mdata$nid2<-NULL
}
if ( print.comm ) cat("carrot","\n")
if ( length(mdata$uid[duplicated(mdata$uid)]) > 0 ) stop("after 192")
fn<-paste0(basedir,"SEQ_IDS/dck216_",year,".",imo,".Rda")
if ( file.exists(fn) ) {
cat("\n","merging",fn,"\n")
seq.ids<-readRDS(fn)
seq.ids$nid2<-seq.ids$nid
mdata<-merge(mdata,seq.ids[,c("uid","nid2")],by="uid",all.x=TRUE)
if ( length(mdata$uid[duplicated(mdata$uid)]) > 0 ) stop("after seqid2")
#mdata<-liz_merge(mdata,seq.ids[,c("uid","nid2")],by.var="uid")
mdata$homog.id<-ifelse(!is.na(mdata$nid2),mdata$nid2,mdata$homog.id)
mdata$homog.id.flag<-ifelse(!is.na(mdata$nid2),99,mdata$homog.id.flag)
mdata$nid2<-NULL
}
if ( print.comm ) cat("cabbage","\n")
if ( length(mdata$uid[duplicated(mdata$uid)]) > 0 ) stop("after 216")
mdata$homog.id <- ifelse ( mdata$dck == 720 & mdata$sid == 135 & nchar(mdata$id) == 8, paste0(substr(mdata$id,1,4),"-SEQ") , mdata$homog.id )
mdata$homog.id.flag <- ifelse ( mdata$dck == 720 & mdata$sid == 135 & nchar(mdata$id) == 8, 3 , mdata$homog.id.flag )
mdata<-add_ID_class(mdata)
if ( print.comm ) cat("added ID class","\n")
mdata$orig.id <- mdata$id
mdata$id <- mdata$homog.id
mdata$id<-ifelse(!is.na(mdata$shipname) & mdata$id.class != "callsign" & mdata$id.class != "ESURFMAR", mdata$shipname, mdata$homog.id)
fn.out<-paste0("MFILES_SHIP_IDPROC/",year,"_",imo,".Rda")
if ( length(mdata$uid[duplicated(mdata$uid)]) > 0 ) stop("at end")
saveRDS(mdata,fn.out)
#if(!("c99" %in% names(mdata))) mdata$c99<-NA
#if ( nrow(mydf) == 0 ) {
# mydf<-mdata[,c("date","lat","lon","shipname","id","dck","sid","c99")]
#} else {
# mydf<-rbind(mydf,mdata[,c("date","lat","lon","shipname","id","dck","sid","c99")])
#}
#mydf<-subset(mydf,id!=""|id!="SHIP")
#mydf<-subset(mydf,dck %in% c(705,706,707))
#print(table(mdata$shipname))
} # end loop over imo
cat("\n")
#fn.out<-paste0("TEST_NAMES/",year,".Rda")
#sub<-mydf
#saveRDS(mydf,fn.out)
#mydf<-data.frame(c())
} # end loop over year
cat("finished process_ids.R",args[1],args[2],"\n")
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)
# code written by Elizabeth Kent, National Oceanography Centre
# for C3S_311a_Lot2
# information on time corrections comes partly from NERC HOSTACE project
source("~/Rscripts/add_date2.R")
source("~/Rscripts/read_rdsfiles.R") # for rbind.all.columns
basedir<-"/gws/nopw/j04/glosat/development/data/interim/HOSTACE_PROC/"
print.comm<-FALSE
drop<-c("wmi","is","es","rs","ir","rrr","qci","mds","osz","oov","ooz","opv","opz","osiv","osiz","onv","onz","ophv","ophz","ocv","ocz","cce","wwe","ne","nhe","he","cle","cme","che","am","ah","uh","sbi","sa","ri","um","oav","oaz","wx","sx","eoh","hop","smf","sme","smv")
if (!dir.exists("MFILES_SHIP_PROC")) dir.create("MFILES_SHIP_PROC")
#--------------------------------------------------------------------------------
Sys.setenv(TZ='GMT')
#--------------------------------------------------------------------------------
#require(lubridate)
options(warn=1)
cmo<-c("01","02","03","04","05","06","07","08","09","10","11","12")
if ( length(args) == 0 ) {args<-args2}
# remove files
for ( year in args[1]:args[2] ) {
for ( imo in 1:12 ) {
fn.out <- paste0("MFILES_SHIP_PROC/",year,"_",imo, ".Rda")
#print(fn)
if (file.exists(fn.out)) file.remove(fn.out)
}
}
rm(fn.out)
for ( year in args[1]:args[2] ) {
cat("-----------------------------------------------","\n")
cat(year," ")
for ( imo in 1:12 ) {
cat(imo," ")
#fn.out<-paste0("MFILES_SHIP_PROC/",year,"-",imo,".Rda")
fn.in <- paste0(basedir,"MFILES_SHIP_IDPROC/",year,"_",imo,".Rda")
if ( !file.exists(fn.in) ) next
mdata<-readRDS(fn.in)
mdata$date.ch.flag<-0
# 0 is unchanged
# 1 is hr missing, 12 local inserted
# 2 is shift of midnight local - 1 day
# 3 is shift of midnight GMT + 1 day
# 4 is dck 781 adjusted assuming hrs are local
var.want<-names(mdata)
var.want<-var.want[!(var.want %in% drop)]
if ( print.comm ) cat("carrot","\n")
# put missing hour to midday local
#time_string <- sprintf("%4d-%02d-%02d 12:00", mdata$yr, mdata$mo, mdata$dy )
#my_date_time <- as.POSIXct(time_string) + (mdata$lon / 15)*3600
#mdata$date.flag<-ifelse(is.na(mdata$hr),1,mdata$date.flag)
#mdata$hr <- ifelse(is.na(mdata$hr),round(as.numeric(format.POSIXct( my_date_time ,"%H")) + as.numeric(format.POSIXct( my_date_time ,"%M"))/60,0),mdata$hr)
#rm(time_string)
#rm(my_date_time)
# correct MDB times where needed
# first calculate local time - appropriate for MO
mdata<-add_date2(mdata)
#mdata<-subset(mdata,!is.na(date)) # shouldn't be needed after running again
#mdata$date<-as.POSIXct(mdata$date)
mdata$old.date<-mdata$date
#if ( sum(is.na(mdata$date) ) > 0 ) {
tmplon <- ifelse (mdata$lon>=360,mdata$lon-360,mdata$lon)
mdata$offset <- round(tmplon/360*24*3600)
mdata$mlst <- mdata$date + mdata$offset
mdata$mlstdate <- strptime(mdata$mlst, format="%Y-%m-%d")
tmp1 <- strftime(mdata$mlst, format="%H")
tmp2 <- strftime(mdata$mlst, format="%M")
mdata$mlsttime <- as.numeric(tmp1) + as.numeric(tmp2)/60
rm(tmp1)
rm(tmp2)
mdata$mlsttime <- round(ifelse(mdata$mlsttime > 23.5,mdata$mlsttime-24,mdata$mlsttime))
mdata$mlsttime <- ifelse(mdata$mlsttime > 23.9,mdata$mlsttime-24,mdata$mlsttime)
#} else {
# mdata$mlst<- NA
# mdata$mlstdate <- NA
# mdata$mlsttime <- NA
#}
if ( print.comm ) cat("cabbage","\n")
# dck 781 adjust time assuming reported time is local
if ( 781 %in% names(table(mdata$dck))) {
if(print.comm)cat('potato',"\n")
mdata$yr<-ifelse(mdata$dck == 781, as.numeric(format(mdata$mlst,"%Y")), mdata$yr)
mdata$mo<-ifelse(mdata$dck == 781, as.numeric(format(mdata$mlst,"%m")), mdata$mo)
mdata$dy<-ifelse(mdata$dck == 781, as.numeric(format(mdata$mlst,"%d")), mdata$dy)
mdata$hr<-ifelse(mdata$dck == 781, mdata$mlsttime, mdata$hr)
tmp<-floor(mdata$hr)
tmp2<-round((mdata$hr-tmp)*60)
mdata$date<-ifelse(mdata$dck == 781, as.POSIXct(paste0(mdata$yr,"-",mdata$mo,"-",mdata$dy," ",tmp,":",tmp2,":00")),mdata$date)
mdata$date.ch.flag<-ifelse(mdata$dck == 781, 4, mdata$date.ch.flag)
if ( class(mdata$date)[1] == "numeric") mdata$date<-as.POSIXct(mdata$date,origin="1970-01-01")
}
if(print.comm)cat('swede',"\n")
# got local time, now lon2 is -179.99 -> 180; lon is 0 -> 360
#mdata$lon2<-ifelse(mdata$lon > 180, mdata$lon-360, mdata$lon)
# these are the mdb dcks
#d.mdb<-c(201, 202, 203, 204, 205, 206, 207, 209, 210, 211, 213, 214, 215, 216, 218, 221, 223, 224, 226, 227, 229, 230, 233, 234, 239, 245, 246, 247, 249, 254, 255)
# make MDB and other time corrections
#dtime<-as.difftime(1, unit="days")
#mdata$date2<-mdata$date
# 201 0 GMT is a day early; up & inc 1899
mdata$date<-ifelse(mdata$dck%in%c(201) & mdata$hr == 0 & mdata$yr<1900, mdata$date+lubridate::days(1),mdata$date)
if ( class(mdata$date)[1] == "numeric") mdata$date<-as.POSIXct(mdata$date,origin="1970-01-01")
# 207 0 GMT a day early; 1946-1949
mdata$date<-ifelse(mdata$dck==207 & mdata$hr == 0 & mdata$yr>=1946 & mdata$yr<=1949, mdata$date+lubridate::days(1), mdata$date)
if ( class(mdata$date)[1] == "numeric") mdata$date<-as.POSIXct(mdata$date,origin="1970-01-01")
# 189 0 GMT a day early; 1948-1949
mdata$date<-ifelse(mdata$dck==189 & mdata$hr == 0 & mdata$yr>=1948 & mdata$yr<=1949, mdata$date+lubridate::days(1), mdata$date)
if ( class(mdata$date)[1] == "numeric") mdata$date<-as.POSIXct(mdata$date,origin="1970-01-01")
# 216 0 GMT a day early; 1936-1939
mdata$date<-ifelse(mdata$dck==216 & mdata$hr == 0 & mdata$yr>=1936 & mdata$yr<=1939, mdata$date+lubridate::days(1), mdata$date)
if ( class(mdata$date)[1] == "numeric") mdata$date<-as.POSIXct(mdata$date,origin="1970-01-01")
# 707 0 GMT a day early; 1913-1939
mdata$date<-ifelse(mdata$dck==707 & mdata$hr == 0 & mdata$yr>=1913 & mdata$yr<=1939, mdata$date+lubridate::days(1), mdata$date)
if ( class(mdata$date)[1] == "numeric") mdata$date<-as.POSIXct(mdata$date,origin="1970-01-01")
# 209 0 GMT is a day early, not 1956 or earlier
#mdata$date<-ifelse(mdata$dck%in%c(209) & mdata$hr == 0 & mdata$yr > 1956, mdata$date+days(1),mdata$date)
# mdata$date.ch.flag<-ifelse(mdata$old.date!=mdata$date & mdata$date.ch.flag==0, 2 ,mdata$date.ch.flag)
# 215 0 local time is a day late
mdata$date<-ifelse(mdata$dck%in%c(215) & mdata$mlsttime == 0, mdata$date-lubridate::days(1),mdata$date)
if ( class(mdata$date)[1] == "numeric") mdata$date<-as.POSIXct(mdata$date,origin="1970-01-01")
mdata$date.ch.flag<-ifelse(mdata$old.date!=mdata$date & mdata$date.ch.flag==0, 2, mdata$date.ch.flag)
if ( class(mdata$date)[1] == "numeric") mdata$date<-as.POSIXct(mdata$date,origin="1970-01-01")
if ( sum(mdata$old.date!=mdata$date,na.rm=T) > 0 ) {
pig<-subset(mdata,old.date!=date)
cat("dates changed =",sum(mdata$old.date!=mdata$date),"dcks =",paste(names(table(pig$dck)),collapse=" / "),"\n")
rm(pig)
}
if ( print.comm ) cat("turnip","\n")
mdata$old.lat<-mdata$lat
mdata$old.lon<-mdata$lon
if ( class(mdata$date)[1] == "numeric") mdata$date<-as.POSIXct(mdata$date,origin="1970-01-01")
if ( class(mdata$old.date)[1] == "numeric") mdata$old.date<-as.POSIXct(mdata$old.date,origin="1970-01-01")
mdata$yr<-as.numeric(format(mdata$date,"%Y"))
mdata$mo<-as.numeric(format(mdata$date,"%m"))
mdata$dy<-as.numeric(format(mdata$date,"%d"))
mdata$hr<-as.numeric(format(mdata$date,"%H"))
#print(table(mdata$yr,mdata$mo))
if ( print.comm ) cat("parsnip","\n")
YRMO <- split(mdata, data.frame(mdata$yr, mdata$mo), drop=TRUE)
names(YRMO)<-gsub("\\.","_",names(YRMO))
fn.out <- paste("MFILES_SHIP_PROC/",names(YRMO), ".Rda",sep="")
if ( length(fn.out) > 1 ) cat(paste(fn.out,collapse="; "),"\n")
for ( ii in 1:length(fn.out) ) {
if ( file.exists(fn.out[ii]) ) {
toadd<-readRDS(fn.out[ii])
cat('adding to file',fn.out[ii],nrow(toadd),"reports","\n")
op<-rbind.all.columns(YRMO[[ii]],toadd)
rm(toadd)
} else {
op<-YRMO[[ii]]
#cat('not adding to file',fn[ii],"\n")
}
saveRDS(op,fn.out[ii])
}
#odf<-data.frame(sub$uid,sub$date2,sub$flag.date,round(sub$lat,2),sub$flag.lat,round(sub$lon2,2),sub$flag.lon)
} # end loop over months
cat("\n","-----------------------------------------------","\n")
} # end loop over years
cat('finishing processing ship data inc date changes',args[1],args[2],"\n")
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)
#require(stringdist)
#require(geosphere)
source("~/Rscripts/read_rdsfiles.R")
#--------------------------------------------------------------------------------
Sys.setenv(TZ='GMT')
#--------------------------------------------------------------------------------
options(warn=1)
if ( length(args) < 2 ) {args<-args2}
#source("/noc/mpoc/surface_data/TRACKING/FinalTrack/RSCRIPTS/get_speed.R")
anc.root<-"/gws/nopw/j04/glosat/development/data/interim/HOSTACE_PROC/HOSTACE_PROC_ANC_INFO/"
fixed.ids<-read.table(paste0(anc.root,"fixed_ids_pub47.txt"),header=T,sep=",")
stat.ids<-as.vector(t(fixed.ids))
stat.ids<-c(stat.ids,"BOUY","BUOY","PLAT","RIGG")
ship.ids<-read.table(paste0(anc.root,"not_fix_ids_pub47.txt"),header=T,sep=",")
ship.ids<-as.vector(t(ship.ids))
same.ids<-c()
nn.want<-c()
lvar<-c("id","i.id","pt","i.pt","dck","i.dck","lat","i.lat","lon","i.lon","slp","i.slp","sst","i.sst","at","i.at","dpt","i.dpt","w","i.w","d","i.d")
for ( iyear in args[1]:args[2] ) {
cat('-----------------------------------------------',"\n")
cat(iyear,"\n")
cat('-----------------------------------------------',"\n")
# delete old output files
f.out<-paste0("MFILES_NOTSHIP/",iyear,"_",seq(1,12),".Rda")
for ( ff in f.out ) {
if (file.exists(ff)) file.remove(ff)
}
uid.want<-c()
g.pos<-NULL
stat.ids<-unique(stat.ids)
bdata<-read_trackmonth("CROSS_MOORED",syr=iyear)
pdata<-read_trackmonth("CROSS_PLAT",syr=iyear)
ddata<-read_trackmonth("CROSS_DRIFT",syr=iyear)
cdata<-read_trackmonth("CROSS_COAST",syr=iyear)
smdata<-read_trackmonth("CROSS_SUBMARINE",syr=iyear)
data<-bdata # should always have moored buoy matches
# except where there is submarine data (1936-1955)
if ( is.null(data) & !is.null(smdata) ) {
data<-smdata
} else {
if (!is.null(smdata)) data<-rbind.all.columns(data,smdata)
}
if (!is.null(pdata)) data<-rbind.all.columns(data,pdata)
if (!is.null(ddata)) data<-rbind.all.columns(data,ddata)
if (!is.null(cdata)) data<-rbind.all.columns(data,cdata)
#if (!is.null(smdata)) data<-rbind.all.columns(data,smdata)
if (!is.null(pdata)) rm(pdata)
if (!is.null(ddata)) rm(ddata)
if (!is.null(cdata)) rm(cdata)
if (!is.null(smdata)) rm(smdata)
cat('got cross data',"\n")
# now get IDs from other data types
bdata<-read_trackmonth("MFILES_MOORED",syr=iyear)
if (!is.null(bdata)) {
ids.moored<-names(table(bdata$id))
rm(bdata)
} else {
ids.moored<-NULL
}
pdata<-read_trackmonth("MFILES_PLAT",syr=iyear)
if (!is.null(pdata)) {
ids.plat<-names(table(pdata$id))
rm(pdata)
} else {
ids.plat<-NULL
}
ddata<-read_trackmonth("MFILES_DRIFT",syr=iyear)
if (!is.null(ddata)) {
ids.drift<-names(table(ddata$id))
rm(ddata)
} else {
ids.drift<-NULL
}
cdata<-read_trackmonth("MFILES_COAST",syr=iyear)
if (!is.null(cdata)) {
ids.coast<-names(table(cdata$id))
rm(cdata)
} else {
ids.coast<-NULL
}
smdata<-read_trackmonth("MFILES_SUBMARINE",syr=iyear)
if (!is.null(smdata)) {
ids.submarine<-names(table(smdata$id))
rm(smdata)
} else {
ids.submarine<-NULL
}
cat('got other data',"\n")
nn<-c()
if(!is.null(data)) {
#dck.not.plat<-c(667,926)
#data<-subset(data,!(data$i.dck %in% dck.not.plat))
nt<-table(data$i.id) # these are IDs with potential match to non-ship data
#nt<-nt[nt>=10]
nn<-names(nt)
nn<-nn[nn!=""]
nn<-nn[nn!="SHIP"]
nn<-nn[nn!="MASKSTID"]
nn2<-nn[nn %in% ship.ids]
nn<-nn[!(nn %in% ship.ids)]
if(length(nn2)>0){
if ( length(nn2) <= 10 ) {
cat("excluding Pub 47 IDS:",paste(nn2[!(nn2 %in% nn)],collapse=" / "),"\n")
} else {
cat("excluding",length(nn2),"Pub 47 IDS","\n")
}
}
}
# nn are IDs from matching
sdata<-read_trackmonth("MFILES_SHIP",syr=iyear)
if(is.null(sdata)) { next }
cat('got ship data',"\n")
sdata$nonship.flag<-99
# 0 = same ID as non-ship in same same period
# 1 = station ID
# 2 = from cross-match, moored
# 3 = from cross-match, drift
# 4 = from cross-match, plat
# 5 = from cross-match, coast
# 6 = missing/generic ID by location
# 7 = match to substring of identified non-ship ID
#stop()
if(!is.null(data) | length(nn) > 0 ) {
same.ids<-unique(data$i.id[data$i.id %in% data$id])
same.ids<-same.ids[nchar(same.ids)>=4]
sdata$nonship.flag<-ifelse(sdata$id %in% unique(c(ids.moored,ids.plat,ids.drift,ids.coast,ids.submarine)), 0, sdata$nonship.flag)
sdata$nonship.flag<-ifelse(sdata$id %in% stat.ids, 1, sdata$nonship.flag)
nn.want<-c(nn,stat.ids,same.ids)
nn.want<-nn.want[!(nn.want %in% ship.ids)]
nn<-unique(nn.want)
for ( ids in nn ) {
sub<-subset(sdata,id==ids)
if(nrow(sub) < 8 ) {next}
sub<-add_date2(sub)
sub<-sub[order(sub$date),]
sub$lon2<-ifelse(sub$lon>=180,sub$lon-360,sub$lon)
dist<-geospheres::distHaversine(data.frame(lon=sub$lon2,lat=sub$lat))/1000
tdiff<-diff(sub$date)
units(tdiff)<-"hours"
spd<-dist/as.numeric(tdiff)
pstat<-round(sum(spd==0,na.rm=T)/length(spd)*100,1)
lim<-min(0.95,1-(2/length(spd)))
pspeed<-quantile(spd[is.finite(spd)],lim,na.rm=T)
if ( is.na(pspeed) ) pspeed<-max(dist,na.rm=T)
#if ( iyear >= 1987 & iyear <= 1991 ) {
#}
if(pstat<95 & pspeed > 1) {
nn.want<-nn.want[nn.want!=ids]
next
}
d1<-as.character(as.Date(min(sub$date)))
d2<-as.character(as.Date(max(sub$date)))
cat("ID=",ids,"date range:",d1,d2,"nobs=",nrow(sub),'maxspd=',pspeed,'pstat=',pstat,"pt=",paste(names(table(sub$pt)),collapse="/"),"dck=",paste(names(table(sub$dck)),collapse="/"),"\n")
} # end of loop over ids in cross-files
} # end of check for ids in cross-files
#stat.ids<-unique(c(stat.ids,nn.want))
#stat.ids<-subset(stat.ids,!(stat.ids %in% ship.ids) )
stat.ids<-nn.want # non-stationary IDs have been removed from selection
# can't easily identify which type the match is for, sort this later if poss
#sdata$nonship.flag<-ifelse(sdata$id %in% nn.want & sdata$nonship.flag == 99 & sdata$data.type=="drift", 3, sdata$nonship.flag)
#sdata$nonship.flag<-ifelse(sdata$id %in% nn.want & sdata$nonship.flag == 99 & sdata$data.type=="moored", 2, sdata$nonship.flag)
#sdata$nonship.flag<-ifelse(sdata$id %in% nn.want & sdata$nonship.flag == 99 & sdata$data.type=="plat", 4, sdata$nonship.flag)
#sdata$nonship.flag<-ifelse(sdata$id %in% nn.want & sdata$nonship.flag == 99 & sdata$data.type=="coast", 5, sdata$nonship.flag)
# remainder are stat.ids
sdata$nonship.flag<-ifelse(sdata$id %in% nn.want & sdata$nonship.flag == 99, 1, sdata$nonship.flag)
cat("finished looking at cross-matches","\n")
# get positions to match to generic IDs (data frame gpos)
#gen.dck<-c(888)
if ( !is.null(data) ) {
if ( nrow(data) > 0 ) {
gen.sub<-subset(data,i.id=="SHIP" | i.id== "" | i.id == "MASKSTID")
gen.sub$pos.diff<-abs(gen.sub$lat-gen.sub$i.lat)+abs(gen.sub$lon-gen.sub$i.lon)
gen.sub$sstm<-abs(gen.sub$sst-gen.sub$i.sst)
gen.sub$atm<-abs(gen.sub$at-gen.sub$i.at)
gen.sub$slpm<-abs(gen.sub$slp-gen.sub$i.slp)
gen.sub$wm<-abs(gen.sub$w-gen.sub$i.w)
gen.sub$dm<-abs(gen.sub$d-gen.sub$i.d)
gen.sub<-subset(gen.sub,pos.diff<=0.2)
gen.sub<-subset(gen.sub,pmin(gen.sub$sstm,gen.sub$atm,gen.sub$slpm,gen.sub$wm,gen.sub$dm,na.rm=T)==0)
uid.want<-gen.sub$i.uid
if ( FALSE ) {
g.pos<-data.frame(lat=gen.sub$i.lat,lon=gen.sub$i.lon,id=gen.sub$i.id,dck=gen.sub$i.dck,yr=gen.sub$i.yr,uid=gen.sub$i.uid)
g.pos<-g.pos[which(!duplicated(g.pos$uid)),]
g.pos$uid<-NULL
g.pos<-aggregate(list(rep=rep(1,nrow(g.pos))), g.pos, length)
g.pos<-subset(g.pos,rep>9)
} else {
g.pos<-data.frame(tmp=c())
}
}
}
if ( is.null(g.pos) ) g.pos<-data.frame(tmp=c())
#cat("same.ids","\n")
#cat(same.ids,"\n")
#cat("stat.ids","\n")
#cat(stat.ids,"\n")
#if(nrow(g.pos) > 0 ) print(g.pos)
#want.ids<-unique(c(same.ids,stat.ids))
want.ids<-unique(c(stat.ids))
nonship<-subset(sdata,id%in%want.ids)
cat("non-ship subset",nrow(nonship),"reports","\n")
#print(dim(nonship))
#if ( nrow(g.pos) != 0 ) {
# for ( ii in seq(1,nrow(g.pos),1) ) {
# lat.want<-g.pos$lat[ii]
# lon.want<-g.pos$lon[ii]
# id.want<-g.pos$id[ii]
# cat(ii,lat.want,lon.want,id.want,"\n")
# nonship2<-subset(sdata,lat==lat.want&lon==lon.want&id==id.want)
# if ( nrow(nonship2) > 0 ) nonship<-rbind(nonship,nonship2)
# }
#}
# now see if there are any general matches to ID substrings
sdata$nonship.flag<-ifelse(sdata$nonship.flag == 99 & sdata$uid %in% uid.want, 6, sdata$nonship.flag)
sdata2<-subset(sdata,!(sdata$uid %in% nonship$uid))
# platform ids from the input non-ship data files
pid<-names(table(data$id))
pid<-unique(c(pid,want.ids))
pid<-pid[which(nchar(pid)>3)]
pid<-pid[grepl("[A-Z]",pid)]
pid<-pid[pid!="BBXX"]
#print("bison")
#print(pid)
for ( pp in pid ) {
substr.add<-c()
if ( pp %in% c("PLAT","RIGG","BUOY","BOUY")) next
#if ( pp %in% man.exl ) {next}
# this checks for substrings
pig<-subset(sdata2,stringdist::stringdist(pp,sdata2$id)==abs(nchar(pp)-nchar(sdata2$id))&nchar(sdata2$id)>=4)
#cat(pp,nrow(pig),"\n")
if(nrow(pig) == 0) next
pig<-subset(sdata,stringdist::stringdist(pp,sdata$id)==abs(nchar(pp)-nchar(sdata$id))&nchar(sdata$id)>=4)
# check for movement
for ( pp2 in names(table(pig$id) ) ) {
if (pp2 == pp ) next
cat("substring",pp,pp2,"\n")
tmp<-subset(pig,id==pp2|id==pp)
if(nrow(tmp) < 5 ) next
tmp<-add_date2(tmp)
tmp<-tmp[order(tmp$date),]
tmp$lon2<-ifelse(tmp$lon>=180,tmp$lon-360,tmp$lon)
dist<-distHaversine(data.frame(lon=tmp$lon2,lat=tmp$lat))/1000
tdiff<-diff(tmp$date)
units(tdiff)<-"hours"
spd<-dist/as.numeric(tdiff)
#spd<-calc_speed(lon=tmp$lon,lat=tmp$lat,date=tmp$date)
#dist<-calc_dist(lon=tmp$lon,lat=tmp$lat)
lim<-min(0.95,1-(2/length(spd)))
pspeed<-quantile(spd[is.finite(spd)],lim,na.rm=T)
pstat<-round(sum(spd==0,na.rm=T)/length(spd)*100,1)
if ( is.na(pspeed)) pspeed<-max(dist,na.rm=T)
#if(pstat<95 | max(dd2,na.rm=T) > 1000) {
if(pstat<95 & pspeed > 1) {
cat('removing substring match',pp,pp2,"pstat=",pstat,"pspeed=",pspeed,"\n")
#print(table(tmp$id))
#nn.want<-nn.want[nn.want!=pp2]
next
} else {
substr.add<-c(substr.add,pp2)
#nn.want<-c(nn.want,pp2)
}
}
pig<-subset(pig,id %in% substr.add)
sdata$nonship.flag<-ifelse(sdata$nonship.flag == 99 & sdata$id %in% substr.add, 7, sdata$nonship.flag)
if ( nrow(pig) == 0 ) next
cat("plat id",pp,"# substr match =",nrow(pig),"\n")
cat("IDs =",paste(names(table(pig$id)),collapse=" / "),"\n")
cat("DCKs =",paste(names(table(pig$dck)),collapse=" / "),"\n")
nonship<-rbind(nonship,pig) # add to data frame of non-ship data
}
#readline(prompt="Press [enter] to continue")
if(nrow(nonship) == 0 ) next
# manual exclusions
man.ex<-c("ISW1","3FNX","C7M","C7C")
dck.excl<-c(667)
nonship<-subset(nonship,!(id %in% man.ex))
nonship<-subset(nonship,!(grepl("^[0-9]{4,5}$",nonship$id) & dck == 927) )
nonship<-subset(nonship,!(dck %in% dck.excl))
if ( length(uid.want) > 0 ) {
toadd<-subset(sdata,uid %in% uid.want)
cat('adding',nrow(toadd),'reports with generic ID',"\n")
if ( nrow(toadd) > 0 ) {
print(table(toadd$id,toadd$dck,useNA='ifany'))
}
nonship<-rbind(nonship,toadd)
}
# data from dck 732
toadd<-subset(sdata,dck==732 & grepl ("^[4-6][0-9]{4}$",sdata$id))
if(nrow(toadd)>0 & nrow(nonship)>0) {
rbind(nonship,toadd)
} else if ( nrow(toadd)>0 ) {
nonship<-toadd
}
toadd<-subset(sdata,dck==732 & grepl("^[0-9]{5}$",sdata$id) & sdata$yr==1950) # seem to be tao
if(nrow(toadd)>0 & nrow(nonship)>0) {
rbind(nonship,toadd)
} else if ( nrow(toadd)>0 ) {
nonship<-toadd
}
toadd<-subset(sdata,dck==992 & substr(sdata$id,1,1) == "6" & grepl("^[0-9]+$",sdata$id) & sdata$yr>=2010 & nchar(sdata$id)>=6 ) # look like moored buoys
if(nrow(toadd)>0 & nrow(nonship)>0) {
rbind(nonship,toadd)
} else if ( nrow(toadd)>0 ) {
nonship<-toadd
}
nonship<-unique(nonship)
# add in substring matches to PLAT, RIGG, BUOY, BOUY
a<-subset(sdata,grepl("BOUY",sdata$id)|grepl("BUOY",sdata$id)|grepl("RIGG",sdata$id)|grepl("PLAT",sdata$id))
if(nrow(a) > 0 ) {
a$nonship.flag<-7
cat("matches",paste(names(table(a$id)),collapse=" / "))
nonship<-rbind(nonship,a)
}
print(addmargins(table(nonship$id,nonship$dck)))
print(addmargins(table(nonship$nonship.flag,nonship$dck)))
cat("final non-ship subset",nrow(nonship),"reports","\n")
if (!dir.exists("MFILES_NOTSHIP")) dir.create("MFILES_NOTSHIP")
YRMO <- split(nonship, data.frame(nonship$yr, nonship$mo), drop=TRUE)
filenames <- paste("MFILES_NOTSHIP/",names(YRMO), ".Rda",sep="")
filenames <- gsub("\\.","_",filenames)
filenames <- gsub("_R",".R",filenames)
jj<-mapply(saveRDS, YRMO, file = filenames )
} # end of loop over years
#source("does_it_move_notship.R")
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)
# code written by Elizabeth Kent, National Oceanography Centre
# code written for the NERC HOSTACE project and adapted for
# C3S_311a_Lot2
#--------------------------------------------------------------------------------
Sys.setenv(TZ='GMT')
#--------------------------------------------------------------------------------
source("~/Rscripts/get_mismatch.R")
source("~/Rscripts/read_rdsfiles.R")
#source("add_match_id.R")
#source("assess_match.R")
#require(imma)
options(warn=1)
basedir<-"/gws/nopw/j04/glosat/development/data/interim/"
cmo<-c("01","02","03","04","05","06","07","08","09","10","11","12")
to.print<-FALSE
if ( length(args) == 0 ) {args<-args2}
type.want<-args[3]
pp.dup<-NULL
pp.dup.mo<-NULL
for ( year in args[1]:args[2] ) {
for ( imo in 1:12 ) {
cat("-----------------------------------------------","\n")
cat(year,imo,type.want,"\n")
cat("-----------------------------------------------","\n")
fn.want<-paste0(basedir,"MFILES_",type.want,"/",year,"_",imo,".Rda")
if (!file.exists(fn.want)) { next }
mdata<-readRDS(paste0(basedir,"MFILES_SHIP/",year,"_",imo,".Rda"))
mdata2<-readRDS(fn.want)
mdata<-rbind(mdata,mdata2)
rm(mdata2)
#-----------------------------------------------------------
# full duplicates, similar contents and ID
#-----------------------------------------------------------
#-----------------------------------------------------------
# default tolerances for candidate matches
# if not passed, others set to zero
# tol.id=2,tol.pos=1,tol.sst=1,tol.slp=1,tol.at=1,
# tol.dpt=1,tol.w=1,tol.d=1,tol.ww=0,tol.w1=0,tol.vv=0,tol.n=0
#-----------------------------------------------------------
# to allow mismatch in variables used as key, need to
# define a new variable, e.g. rounded
#-----------------------------------------------------------
# new "key" variables
# use yr/mo/dy from new.date variable
mdata$k.yr<-mdata$yr
mdata$k.mo<-mdata$mo
mdata$k.dy<-mdata$dy
mdata$k.jday<-mdata$jday
mdata$k.hr<-round(mdata$hr)
mdata$rlat<-round(mdata$lat)
mdata$rlon<-round(mdata$lon)
mdata$rsst<-round(mdata$sst)
mdata$alat<-abs(mdata$lat)
mdata$alon<-abs(mdata$lon)
#key.list<-c("k.yr","k.mo","k.dy","k.hr","rlat","rlon")
not.dck<-c(667)
mdata<-subset(mdata,!(dck %in% not.dck) )
if ( nrow(mdata) > 2000000 ) {
ndays<-1
} else if ( nrow(mdata) > 200000 ) {
ndays<-5
} else if ( nrow(mdata) > 50000 ) {
ndays<-10
} else {
ndays<-400
}
date1<-paste0(year,"-",imo,"-01")
if ( imo < 12 ) {
date2<-paste0(year,"-",imo+1,"-01")
} else {
date2<-paste0(year,"-",1,"-01")
}
dy.in.mo<-as.numeric(difftime(date2, date1 ))
for ( iday in seq(1,max(1,dy.in.mo-ndays+1),ndays) ) {
start.day<-iday
end.day<-iday+ndays-1
if ( end.day >= dy.in.mo-2 & ndays != 1 ) end.day <- 31
#cat(iday,start.day,end.day,"\n")
sub<-subset(mdata,dy>=start.day & dy<=end.day)
if ( ndays < 400 ) cat("running for days",min(sub$dy),max(sub$dy),"\n")
# these key values used to index table, linking reports that match
key.list<-c("k.yr","k.mo","k.dy","k.hr","rlat","rlon")
#key.list<-c("yr","mo","dy","rlat","rlon")
# these variables are used to refine matching, values from both reports kept
#close.list<-c("yr","mo","dy","hr","id","sst","slp","at","w","d","vv","ww","dpt","n","lat","lon")
close.list<-c("id","sst","slp","at","w","d","vv","ww","dpt","n","lat","lon","jday")
cat('about to get matches',"\n")
pp.dup.mo<-get_mis(df.in=sub,key.list=key.list,close.list=close.list,
tol.pos=0.51,tol.id=9,
tol.w=2,tol.d=13,
tol.sst=1.01,tol.at=1.01,tol.dpt=1.01,tol.slp=1.01,
tol.ww=99,tol.w1=99,tol.vv=99)
cat('got matches, full date/time & pos',nrow(pp.dup.mo),"\n")
if(is.null(pp.dup.mo) ) {next}
if(nrow(pp.dup.mo) == 0 ) {next}
pp.dup.mo<-subset(pp.dup.mo,data.type<i.data.type)
cat('got matches, full date/time & pos, cross-type',nrow(pp.dup.mo),"\n")
if(nrow(pp.dup.mo) == 0 ) {next}
print(nrow(pp.dup))
print(nrow(pp.dup.mo))
if ( (is.null(pp.dup)) & (nrow(pp.dup.mo) > 0 ) ) {
pp.dup<-pp.dup.mo
} else { # got pp.up, append data if any
if ( nrow(pp.dup.mo)>0 ) pp.dup<-rbind.all.columns(pp.dup,pp.dup.mo)
}
} # end loop over days
# remove irrelevant row labels
rownames(pp.dup)<-NULL
if ( is.null(pp.dup) ) {next}
print(table(pp.dup$dck,pp.dup$i.dck))
} # end loop over months
if(is.null(pp.dup) ) next
if(nrow(pp.dup) == 0 ) next
pp.dup<-subset(pp.dup,data.type<i.data.type)
pp.dup<-pp.dup[order(pp.dup$id),]
if (!dir.exists(paste0("CROSS_", type.want))) dir.create(paste0("CROSS_", type.want))
fn.out<-paste0("CROSS_",type.want,"/crossplatdups_",year,".Rda")
saveRDS(pp.dup,fn.out)
pp.dup<-NULL
} # end loop over years
#!/bin/R
args = commandArgs(trailingOnly=TRUE)
# code written by Elizabeth Kent, National Oceanography Centre
# code written for the NERC HOSTACE project and adapted for
# C3S_311a_Lot2
source("~/Rscripts/add_date2.R")
#--------------------------------------------------------------------------------
Sys.setenv(TZ='GMT')
#--------------------------------------------------------------------------------
options(warn=1)
#require(stringdist)
cmo<-c("01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12")
if ( length(args) < 2 ) {args<-args2}
for ( year in args[1]:args[2] ) {
for ( imo in 1:12 ) {
#for ( imo in 1:1 ) {
cat("-----------------------------------------------", "\n")
cat(year, imo, "\n")
cat("-----------------------------------------------", "\n")
fn<-paste0("/gws/nopw/j04/glosat/development/data/interim/ICOADSR3_TOTAL_Rda/icoadsR3total_", year, "_", cmo[imo],".Rda")
if ( !file.exists(fn) ) next
mdata<-readRDS(fn)
if ( is.null(mdata) ) {next}
if ( nrow(mdata) == 0 ) {next}
mdata$id[mdata$id==NA]<-""
mdata$id[is.na(mdata$id)]<-""
mdata$ptx<-0
# 0 = pt
# 1 = dck
# 2 = missing id by dck
# 3 = by ID
# 4 = corrected
# 6 = irf reject
# 7 = daymiss reject
# 8 = manual reject
#-----------------------------------------------------------
# assign data types
#-----------------------------------------------------------
# source exclusions
mdata$data.type<-"unknown"
# assign missing PT
ship.dck<-c(128, 150, 151, 152, 155, 156, 192, 201, 246, 255, 701, 721, 875, 897, 899)
mdata$pt<-ifelse(is.na(mdata$pt) & mdata$dck %in% ship.dck, 0, mdata$pt)
mdata$ptx<-ifelse(is.na(mdata$pt) & mdata$dck %in% ship.dck, 2, mdata$ptx)
# dck 192 has pt = 6 in 1912 & in 1930s
mdata$pt<-ifelse(mdata$dck == 192 & mdata$pt == 6, 5, mdata$pt)
mdata$pt<-ifelse(is.na(mdata$pt) & mdata$dck %in% c(793, 794, 993, 994), 6, mdata$pt)
mdata$ptx<-ifelse(is.na(mdata$pt) & mdata$dck %in% c(793, 794, 993, 994), 2, mdata$ptx)
mdata$pt<-ifelse(is.na(mdata$pt) & mdata$dck %in% c(714), 7, mdata$pt)
mdata$ptx<-ifelse(is.na(mdata$pt) & mdata$dck %in% c(714), 2, mdata$ptx)
mdata$pt<-ifelse(is.na(mdata$pt) & mdata$dck %in% c(797), 13, mdata$pt)
mdata$ptx<-ifelse(is.na(mdata$pt) & mdata$dck %in% c(797), 2, mdata$ptx)
mdata$pt<-ifelse(is.na(mdata$pt) & mdata$dck %in% c(896) & grepl("PLAT", mdata$id), 15, mdata$pt)
mdata$ptx<-ifelse(is.na(mdata$pt) & mdata$dck %in% c(896) & grepl("SHIP", mdata$id), 3, mdata$ptx)
mdata$pt<-ifelse(is.na(mdata$pt) & mdata$dck %in% c(896) & grepl("SHIP", mdata$id), 5, mdata$pt)
mdata$pt<-ifelse(is.na(mdata$pt) & mdata$dck %in% c(896, 883) & grepl("^[0-9]", mdata$id), 6, mdata$pt)
mdata$ptx<-ifelse(is.na(mdata$pt) & mdata$dck %in% c(896, 883) & grepl("^[0-9]", mdata$id), 3, mdata$ptx)
mdata$pt<-ifelse(is.na(mdata$pt) & mdata$dck %in% c(896) & grepl("RIG", mdata$id), 15, mdata$pt)
mdata$ptx<-ifelse(is.na(mdata$pt) & mdata$dck %in% c(896) & grepl("RIG", mdata$id), 3, mdata$ptx)
mdata$pt<-ifelse(is.na(mdata$pt) & mdata$dck %in% c(896) & grepl("4Y", mdata$id), 2, mdata$pt)
mdata$ptx<-ifelse(is.na(mdata$pt) & mdata$dck %in% c(896) & grepl("4Y", mdata$id), 3, mdata$ptx)
mdata$pt<-ifelse(is.na(mdata$pt) & mdata$dck %in% c(896) & grepl("C7", mdata$id), 2, mdata$pt)
mdata$ptx<-ifelse(is.na(mdata$pt) & mdata$dck %in% c(896) & grepl("C7", mdata$id), 3, mdata$ptx)
mdata$pt<-ifelse(is.na(mdata$pt), 99, mdata$pt)
mdata$data.type<-ifelse(mdata$pt <= 5, "ship", mdata$data.type)
mdata$data.type<-ifelse(mdata$pt == 8, "moored", mdata$data.type) # include ice buoys as moored
mdata$data.type<-ifelse(mdata$pt == 9, "ship", mdata$data.type) # include ice stations as ships
mdata$data.type<-ifelse(mdata$pt == 6, "moored", mdata$data.type)
mdata$data.type<-ifelse(mdata$pt == 7, "drift", mdata$data.type)
mdata$data.type<-ifelse(mdata$pt == 15, "plat", mdata$data.type)
mdata$data.type<-ifelse(mdata$pt %in% c(13, 14, 16), "coast", mdata$data.type)
mdata$data.type<-ifelse(mdata$pt %in% c(17, 18, 19, 20, 21), "prof", mdata$data.type)
mdata$data.type<-ifelse(mdata$dck %in% c(795, 995), "coast", mdata$data.type)
mdata$ptx<-ifelse(mdata$dck %in% c(795, 995), 1, mdata$ptx)
mdata$data.type<-ifelse(mdata$dck %in% c(735, 740, 780, 782), "research", mdata$data.type)
mdata$ptx<-ifelse(mdata$dck %in% c(735, 740, 780, 782), 1, mdata$ptx)
#mdata$data.type<-ifelse(mdata$dck %in% c(793, 794, 993, 994), "moored", mdata$data.type)
mdata$data.type<-ifelse(mdata$id %in% c("PLAT","RIGG"), "plat", mdata$data.type)
mdata$ptx<-ifelse(mdata$id %in% c("PLAT","RIGG"), 3, mdata$ptx)
mdata$data.type<-ifelse(mdata$id %in% c("BUOY","BOUY"), "moored", mdata$data.type)
mdata$ptx<-ifelse(mdata$id %in% c("BUOY","BOUY"), 3, mdata$ptx)
mdata$data.type<-ifelse(is.na(mdata$id) & mdata$dck == 700, "drift", mdata$data.type)
mdata$ptx<-ifelse(is.na(mdata$id) & mdata$dck == 700, 2, mdata$ptx)
mdata$data.type<-ifelse(nchar(mdata$id) == 5 & is.numeric(mdata$id) & mdata$dck==700 & mdata$sid == 147 & mdata$pt == 5, "moored", mdata$data.type)
mdata$data.type<-ifelse(nchar(mdata$id) == 5 & is.numeric(mdata$id) & mdata$dck==892 & mdata$sid == 29 & mdata$pt == 5, "moored", mdata$data.type)
# IRF reject
mdata$data.type<-ifelse(mdata$irf==2 & mdata$dck !=732, "reject", mdata$data.type)
mdata$ptx<-ifelse(mdata$irf==2 & mdata$dck !=732, 6, mdata$ptx)
# want date-time present
mdata$data.type<-ifelse(is.na(mdata$dy),"reject", mdata$data.type)
# & valid
mdata<-add_date2(mdata)
mdata$data.type<-ifelse(is.na(mdata$date),"reject", mdata$data.type)
mdata$ptx<-ifelse(is.na(mdata$dy),7, mdata$ptx)
mdata$ptx<-ifelse(is.na(mdata$date),7, mdata$ptx)
mdata$date<-NULL
mdata$data.type<-ifelse(is.na(mdata$hr) & !(mdata$dck %in% c(246, 701, 721, 780)),"reject", mdata$data.type)
mdata$ptx<-ifelse(is.na(mdata$hr) & !(mdata$dck %in% c(246, 701, 721, 780)),7, mdata$ptx)
# exclude IDs containg TEST, but not CONTEST
mdata$data.type<-ifelse(grepl("TEST",mdata$id) & mdata$id != "CONTEST","reject", mdata$data.type)
mdata$ptx<-ifelse(grepl("TEST",mdata$id) & mdata$id != "CONTEST",8, mdata$ptx)
# sort out drifters in dck 892
mdata$data.type<-ifelse(mdata$dck==892 & mdata$yr == 1981 & grepl("^[1-3][0-9]{3}$",mdata$id), "drift", mdata$data.type)
mdata$ptx<-ifelse(mdata$dck==892 & mdata$yr == 1981 & grepl("^[1-3][0-9]{3}$",mdata$id), 4, mdata$ptx)
mdata$data.type<-ifelse(mdata$dck==892 & mdata$yr == 1982 & grepl("^[4-7][0-9]{4}$",mdata$id), "drift", mdata$data.type)
mdata$ptx<-ifelse(mdata$dck==892 & mdata$yr == 1982 & grepl("^[4-7][0-9]{4}$",mdata$id), 4, mdata$ptx)
mdata$data.type<-ifelse(mdata$dck==892 & mdata$yr == 1983 & grepl("^[4-7][0-9]{4}$",mdata$id), "drift", mdata$data.type)
mdata$ptx<-ifelse(mdata$dck==892 & mdata$yr == 1983 & grepl("^[4-7][0-9]{4}$",mdata$id), 4, mdata$ptx)
# these seem to be tao data in the wrong year
mdata$data.type<-ifelse(mdata$dck==732 & mdata$yr == 1950 & !is.na(mdata$id) & mdata$id != "", "reject", mdata$data.type)
mdata$ptx<-ifelse(mdata$dck==732 & mdata$yr == 1950 & !is.na(mdata$id), 8, mdata$ptx)
# submarines in dck 245
#if ( year >= 1939 & year <= 1945 ) {
if ( year >= 1936 & year <= 1955 ) {
# from https://github.com/oldweather/RN-WW2/blob/master/analyses/submarines/sub_names
subnames<-trimws(readLines("/gws/nopw/j04/glosat/development/data/interim/HOSTACE_PROC/HOSTACE_PROC_ANC_INFO/ww2_submarine_names.txt"))
#inames<-names(table(mdata$id[which(substr(mdata$dck,1,1)==2)]))
inames<-names(table(mdata$id[which(mdata$dck==245)]))
for ( nn in subnames ) {
stdist<-stringdist::stringdist(nn,gsub("_"," ",inames))
cdiff<-abs(nchar(nn) - nchar(inames))
candidates<-inames[which(stdist==cdiff)]
cdiff<-abs(nchar(nn) - nchar(candidates))
if ( length(cdiff) == 0 ) {
#cat("no match",nn,"\n")
next
}
candidates<-candidates[which(cdiff==min(cdiff))]
if ( length(candidates) == 1 ) {
#print(nn)
#cat("match to submarine:",nn,candidates,"\n")
mdata$data.type<-ifelse(substr(mdata$dck,1,1)==2 & mdata$id == candidates, "submarine", mdata$data.type)
mdata$data.type<-ifelse(substr(mdata$dck,1,1)==2 & mdata$id == gsub(" ","_",candidates), "submarine", mdata$data.type)
} else if (length(candidates) == 1 ) {
#cat("no match",nn,"\n")
} else {
cat("choice",nn,paste(candidates,collapse="/"),"\n")
stopping("got choice of submarine ids")
}
}
mdata$ptx<-ifelse(mdata$data.type=="submarine",3,mdata$ptx)
} # end of WW2 year selection
# some data for ship SEAMEN is reflected in latitude and duplicated with SEAMAN in dck 701
mdata$data.type <- ifelse(mdata$yr == 1852 & mdata$id == "SEAMEN" & mdata$dck == 701 & mdata$lon < 260, "reject", mdata$data.type)
mdata$ptx <- ifelse(mdata$yr == 1852 & mdata$id == "SEAMEN" & mdata$dck == 701 & mdata$lon < 260, 8, mdata$ptx)
# remove mispositioned data from dck 151
mdata$data.type <- ifelse(mdata$dck == 151 & mdata$lon >= 260 & mdata$lon <= 270 & mdata$yr >= 1890 & mdata$yr <= 1919, "reject", mdata$data.type )
#print(table(mdata$data.type))
if ( max(mdata$pt) == 99 ) {
cat("PT problem")
print(table(mdata$pt,mdata$dck))
print(table(mdata$pt,mdata$sid))
stop()
}
sp <- split(mdata, mdata$data.type)
for ( ty in names(sp) ) {
op <- sp[[ty]]
if ( nrow(op) == 0 ) {next}
fn.out <- paste0("MFILES_", toupper(ty), "/", year, "_", imo, ".Rda")
if (!dir.exists(paste0("MFILES_", toupper(ty)))) dir.create(paste0("MFILES_", toupper(ty)))
cat(fn.out, "# obs =", nrow(op), "\n")
saveRDS(op, fn.out)
}
} # end loop over months
} # end loop over years
cat("finished split_by_type.R",args[1],args[2],"\n")
add_ID_class <- function(mdata) {
source("~/Rscripts/get_id_class.R")
# code written by Elizabeth Kent, National Oceanography Centre
# for C3S_311a_Lot2
# and ID classifications
app2merge<-NULL
cat("dcks: ")
for ( mydck in names(table(mdata$dck)) ) {
#print(table(mdata$dck))
cat(mydck," ")
sub<-subset(mdata,dck==mydck)
#print("bison")
sub$id.class<-get_id_class(mydck,sub$id)
#print("camel")
if ( is.null(app2merge) ) {
app2merge<-sub[,c("uid","id.class")]
} else {
app2merge<-rbind(app2merge,sub[,c("uid","id.class")])
}
#print(dim(app2merge))
}
cat("\n")
#print(tail(sort(table(mdata$uid))))
if("id.class" %in% names(mdata)) mdata$id.class<-NULL
mdata<-merge(mdata,app2merge,by="uid",all.x=T)
# some SHIP[CC] and SHIP[C] may be OK
mdata$id.class[which(grepl("SHIP",mdata$id))] <- "invalid"
mdata$id.class[which(mdata$id.class=="SHIP")] <- "SHIP"
mdata$id.class[which(mdata$id.class=="MASKSTID")] <- "MASKSTID"
mdata$id.class[which(mdata$id=="")] <- "invalid"
# remove itu for ESURFMAR IDs
coded.id<-"^[AMO|BAR|BAT|EUC|IDD|MIN|SCA|TBW|MPD]{3}[AA|UK|FR|DE|EU|HK]{2}[0-9]{2}$"
mdata$id.class<-ifelse(mdata$id.class=="ESURFMAR" & !grepl(coded.id,mdata$id),"invalid",mdata$id.class)
mdata$id.class<-ifelse(grepl(coded.id,mdata$id),"ESURFMAR",mdata$id.class)
if(!("itu_co" %in% names(mdata) ) ) mdata$itu_co<-"invalid"
mdata$itu_co<-ifelse(mdata$id.class=="ESURFMAR","invalid",mdata$itu_co)
# add number of occurrences of each country
tmp<-data.frame(table(mdata$itu_co))
names(tmp)<-c("itu_co","itu.num")
mdata<-merge(mdata,tmp,all.x=T)
mdata$itu.ok<-ifelse(mdata$itu.num>=20 & mdata$itu_co != "invalid", TRUE,FALSE)
mdata$itu.num<-NULL
return(mdata)
}
#--------------------------------------------------------------------------------
Sys.setenv(TZ='GMT')
#--------------------------------------------------------------------------------
is.leapyear=function(year){
#http://en.wikipedia.org/wiki/Leap_year
return(((year %% 4 == 0) & (year %% 100 != 0)) | (year %% 400 == 0))
}
add_date2 <- function(df){
# adds date variable to file based on yr, mo, dy, hr, invalid set to missing
# also generates date variable with missing hour set to local noon
# adds date.flag
# 0 = valid date & time
# 1 = invalid date or time (time not missing)
# 2 = valid date, hr missing, 12 local added
#------------------------------------------------------------------------------------------------------
# add date variable
#------------------------------------------------------------------------------------------------------
#print('about to add date')
df$date.flag <- 0
# need to catch and remove any data with invalid dates (e.g. 31 June)
df$dy[which(df$mo %in% c(4,6,9,11) & df$dy==31)] <- NA
df$dy[which(df$mo==2 & df$dy>29)] <- NA
df$dy[!is.leapyear(df$yr) & df$mo==2 & df$dy==29] <- NA
df$date.flag<-ifelse(is.na(df$dy), 1, df$date.flag)
#------------------------------------------------------------------------------------------------------
# put missing hour to midday local
#------------------------------------------------------------------------------------------------------
time_string <- ifelse(df$date.flag!=1,sprintf("%4d-%02d-%02d 12:00", df$yr, df$mo, df$dy ),NA)
my_date_time <- as.POSIXct(time_string) + (df$lon / 15)*3600
local_noon <- round(as.numeric(format.POSIXct( my_date_time ,"%H")) + as.numeric(format.POSIXct( my_date_time ,"%M"))/60,0)
local_noon <- ifelse(local_noon > 23, local_noon-24, local_noon)
df$date.flag<-ifelse(is.na(df$hr) & !is.na(df$dy), 2, df$date.flag)
tmphr <- ifelse(is.na(df$hr) & !is.na(df$dy), local_noon, df$hr)
tmp<- strftime(as.POSIXct(tmphr * 60 * 60, origin = "0001-01-01", tz = "GMT"), format = "%H:%M:%S")
#------------------------------------------------------------------------------------------------------
# add date
#------------------------------------------------------------------------------------------------------
dd<-paste0(df$yr,"-",df$mo,"-",df$dy," ",tmp)
dd[grepl("NA",dd)] <- NA
df$date <- as.POSIXct(dd)
#df$date <- ifelse ( df$date.flag %in% c(0,2), as.POSIXct(paste0(df$yr,"-",df$mo,"-",df$dy," ",tmp)), NA)
#df$date <- ifelse ( df$date.flag %in% c(0,2), as.POSIXct(dd), NA)
return(df)
}
add_dck_priority <- function (df) {
df$dck.pri<-9
# best data
dck.pri1 <- c(143,144,145,146,201,202,203,204,205,206,207,209,210,211,213,214,216,218,226,227,246,247,248,249,667,701,702,704,705,706,707,710,711,715,721,730,731,734,735,736,750,781,761,762,780,876,877,878,879,880,881,882,883,891,927)
df$dck.pri <- ifelse ( df$dck %in% dck.pri1, 1, df$dck.pri)
dck.pri2 <- c(110,116,118,119,128,143,184,185,187,188,189,192,193,194,195,196,197,245,666,703,709,714,720,733,740,874,875,897,898,899,900,902,926)
df$dck.pri <- ifelse ( df$dck %in% dck.pri2, 2, df$dck.pri)
dck.pri3 <- c(117,186,233,234,254,792,793,294,700,708,782,792,793,794,795,795,797,892,893,896,901,928,992,993,994,995,996,997,999)
df$dck.pri <- ifelse ( df$dck %in% dck.pri3, 3, df$dck.pri)
dck.pri4 <- c(150,151,152,155,156,221,223,224,229,230,239,849,850,889)
df$dck.pri <- ifelse ( df$dck %in% dck.pri4, 4, df$dck.pri)
dck.pri5 <- c(281,255,555,749,888)
df$dck.pri <- ifelse ( df$dck %in% dck.pri5, 5, df$dck.pri)
dck.pri6 <- c(215,732,999)
df$dck.pri <- ifelse ( df$dck %in% dck.pri6, 6, df$dck.pri)
# change priority of 733 to be lower than 186 (match of polar stations)
df$dck.pri <- ifelse ( df$dck == 733, 4, df$dck.pri)
if ( max(df$dck.pri) > 6 ) {
stop("dck priority not set")
}
return(df)
}
add_shipnames <- function(df) {
# code written by Elizabeth Kent, National Oceanography Centre
# code written for the NERC HOSTACE project and adapted for
# C3S_311a_Lot2
source("~/Rscripts/fix_usmm_705to707cors_func.R")
anc.root<-"/gws/nopw/j04/glosat/development/data/interim/HOSTACE_PROC/HOSTACE_PROC_ANC_INFO/"
idt<-df$id
idt<-gsub("[A-Z]","C",idt)
idt<-gsub("[a-z]","c",idt)
idt<-gsub("[0-9]","N",idt)
name_dcks <- c(702,704,246,730,701,731,721,710,247,711,734,249,248,736,245,249,761,750,146,897,705,706,707)
idt <- ifelse ( df$dck %in% name_dcks, "name", idt )
if ( !("shipname" %in% names(df) ) ) df$shipname<-""
df$shipname<-ifelse(df$dck %in% name_dcks, df$id, NA)
df$shipname.flag<-99
# 0 is orignal ship name
# 1 is ship name from external info
# 2 is manually edited ship name
# 3 is split ship name
# 4 is ship name from sup
# 5 is ship name from sup, probable editing
# 6 is ship name from Pub47
min_yr <- min(df$yr)
max_yr <- max(df$yr)
# dck 702
df$shipname <- ifelse ( df$dck == 702 & df$shipname == "PH ENIX", "PHOENIX", df$shipname)
df$shipname <- ifelse ( df$dck == 702 & df$shipname == "PH NIX", "PHOENIX", df$shipname)
df$shipname <- ifelse ( df$dck == 702 & df$shipname == "ELLID", "ELLIDA", df$shipname)
df$shipname.flag <- ifelse ( df$dck == 702 & df$shipname != df$id, 2, df$shipname.flag)
# dck 711
df$shipname <- ifelse ( df$dck == 711 & df$shipname == "Ringaroon", "Ringaroom", df$shipname)
df$shipname <- ifelse ( df$dck == 711 & df$shipname == "Rotokino", "Rotohino", df$shipname)
df$shipname <- ifelse ( df$dck == 711 & df$shipname == "Lavinni", "Laviuni", df$shipname)
df$shipname.flag <- ifelse ( df$dck == 711 & df$shipname != df$id, 2, df$shipname.flag)
# dck 704, US MMJ, check these are still needed as this should be fixed separately
dck_start <- 1878
dck_end <- 1894
#if ( min_yr >= dck_start & max_yr <= dck_end ) {
if ( max_yr >= dck_start & min_yr <= dck_end ) {
df$shipname <- ifelse ( df$dck == 704 & (is.na(df$shipname) | df$shipname == "") , "Unk_dck704", df$shipname )
df$shipname <- ifelse ( df$dck == 704 & df$shipname == "Aboukir" , "Aboukir_B", df$shipname )
df$shipname <- ifelse ( df$dck == 704 & df$shipname == "Aboukir B" , "Aboukir_B", df$shipname )
df$shipname <- ifelse ( df$dck == 704 & df$shipname == "Achille F" , "Achille.F", df$shipname )
df$shipname <- ifelse ( df$dck == 704 & df$shipname == "Ben .F.Pa" , "Ben F Pa", df$shipname )
df$shipname <- ifelse ( df$dck == 704 & df$shipname == "Benj.F.Pa" , "Ben F Pa", df$shipname )
df$shipname <- ifelse ( df$dck == 704 & df$shipname == "Banafides" , "Bonafides", df$shipname )
df$shipname <- ifelse ( df$dck == 704 & df$shipname == "EarlDerbg" , "Earl Derb", df$shipname )
df$shipname <- ifelse ( df$dck == 704 & df$shipname == "Hcrcules" , "Hercules", df$shipname )
df$shipname <- ifelse ( df$dck == 704 & df$shipname == "Hencules" , "Hercules", df$shipname )
df$shipname <- ifelse ( df$dck == 704 & df$shipname == "John_D-BR" , "John_D-Br", df$shipname )
df$shipname <- ifelse ( df$dck == 704 & df$shipname == "Leopold_v" , "Leopold_V", df$shipname )
df$shipname <- ifelse ( df$dck == 704 & df$shipname == "Martha Da" , "Martha.Da", df$shipname )
df$shipname <- ifelse ( df$dck == 704 & df$shipname == "Mozart_of" , "S-S-Mozar", df$shipname )
df$shipname <- ifelse ( df$dck == 704 & df$shipname == "Patterdal" , "Pamerdale", df$shipname )
df$shipname <- ifelse ( df$dck == 704 & df$shipname == "Ethiopia" , "S-S-Ethopia", df$shipname )
df$shipname <- ifelse ( df$dck == 704 & df$shipname == "S-S-Ethio" , "S-S-Ethopia", df$shipname )
df$shipname <- ifelse ( df$dck == 704 & df$shipname == "William_H" , "Wm-H-Smit", df$shipname )
df$shipname <- ifelse ( df$dck == 704 & df$shipname == "William.J" , "William J", df$shipname )
df$shipname <- ifelse ( df$dck == 704 & df$shipname == "John L.Be" , "John T.Be", df$shipname )
df$shipname <- ifelse ( df$dck == 704 & df$shipname == "Papay" , "Panay", df$shipname )
df$shipname <- ifelse ( df$dck == 704 & df$shipname == "Penbscot" , "Penobscot", df$shipname )
df$shipname <- ifelse ( df$dck == 704 & df$shipname == "Raphasl" , "Raphael", df$shipname )
df$shipname <- ifelse ( df$dck == 704 & df$shipname == "W. G.Irwi" , "Wm.G.Irwi", df$shipname )
df$shipname.flag <- ifelse ( df$dck == 704 & df$shipname != df$id, 2, df$shipname.flag)
}
# dck 730, CLIWOC, convert logbook ID to ship names using info from CLIWOC website
dck_start <- 1663
dck_end <- 1860
if ( max_yr >= dck_start & min_yr <= dck_end ) {
cliwoc_links<-readLines(paste0(anc.root,"cliwoc_shipLogbookid2_1.txt"))
cliwoc_links<-iconv(cliwoc_links, to = "ASCII//TRANSLIT")
cliwoc_logs<-substr(cliwoc_links,start=1,stop=4)
cliwoc_names<-substr(cliwoc_links,start=5,stop=34)
cliwoc_dup<-substr(cliwoc_links,start=65,stop=65)
cliwoc_names<-trimws(cliwoc_names)
cliwoc_names<-gsub(" ","XXXX",cliwoc_names)
cliwoc_names<-gsub("[[:punct:]]","-",cliwoc_names)
cliwoc_name<-iconv(cliwoc_names, to = "ASCII//TRANSLIT")
cliwoc_names<-gsub("XXXX","_",cliwoc_names)
cliwoc<-cbind(as.numeric(cliwoc_logs),cliwoc_names,cliwoc_dup)
colnames(cliwoc)<-c("logs","names","cli_dups")
df<-merge(df,cliwoc,by.x="id",by.y="logs",all.x=TRUE)
df$shipname<-ifelse(df$dck==730,df$names,df$shipname)
#df<-subset(df,cli_dups==0 | dck!=730)
df$names<-NULL
df$cli_dups<-NULL
df$shipname.flag <- ifelse ( df$dck == 730 & df$shipname != df$id, 1, df$shipname.flag)
}
# dck 701
# US Maury, names from list on ICOADS website: http://icoads.noaa.gov/software/transpec/maury/mauri_out
dck_start <- 1663
dck_end <- 1863
#if ( min_yr >= dck_start & max_yr <= dck_end ) {
if ( max_yr >= dck_start & min_yr <= dck_end ) {
usmau_links<-readLines(paste0(anc.root,"usmaury_names.txt"))
usmau_links<-iconv(usmau_links, to = "ASCII//TRANSLIT")
usmau<-trimws(cbind(substr(usmau_links,7,14),substr(usmau_links,34,66)))
colnames(usmau)<-c("short_names","names")
# these names have 2 entries, choose one
usmau <- usmau[which(usmau[,2] != "D. FERNANDO"),]
usmau <- usmau[which(usmau[,2] != "CORAL OF NEW BEDFORD S"),]
usmau <- usmau[which(usmau[,2] != "GENERAL JONES"),]
usmau <- usmau[which(usmau[,2] != "MINERVA SMYTH"),]
usmau <- usmau[which(usmau[,2] != "SAMUEL ROBERTSON"),]
usmau <- usmau[which(usmau[,2] != "THOMAS B. WALES"),]
df<-merge(df,usmau,by.x="id",by.y="short_names",all.x=TRUE)
df$shipname<-ifelse(df$dck==701 & !is.na(df$names),df$names,df$shipname)
df$names<-NULL
df$shipname<-ifelse(df$dck==701 & df$shipname=="" & df$yr == 1850,"UNKNOWN-D701-1",df$shipname)
# corrections as suggested by Wilkinson/Wheeler - see webpage above
df$shipname <- ifelse ( df$dck == 701 & df$shipname == "ABOUKIN" , "ABOUKIR", df$shipname )
df$shipname <- ifelse ( df$dck == 701 & df$shipname == "ACKBAR" , "AKBAR", df$shipname )
df$shipname <- ifelse ( df$dck == 701 & df$shipname == "HEROKEE" , "CHEROKEE", df$shipname )
df$shipname <- ifelse ( df$dck == 701 & df$shipname == "MALUBAR" , "MALABAR", df$shipname )
df$shipname <- ifelse ( df$dck == 701 & df$shipname == "MUTIN" , "MUTINE", df$shipname )
df$shipname <- ifelse ( df$dck == 701 & df$shipname == "NORTHUMBERIAND" , "NORTHUMBERLAND", df$shipname )
df$shipname <- ifelse ( df$dck == 701 & df$shipname == "PHEBE" , "PHOEBE", df$shipname )
df$shipname <- ifelse ( df$dck == 701 & df$shipname == "TENOBIA" , "ZENOBIA", df$shipname )
df$shipname <- ifelse ( df$dck == 701 & df$shipname == "MARGARETEVANS" , "MARGARET EVANS", df$shipname )
df$shipname.flag <- ifelse ( df$dck == 701 & df$shipname != df$id, 1, df$shipname.flag)
# and this seemed wrong
df$shipname <- ifelse ( df$dck == 701 & df$shipname == "HENRY CALY" , "HENRY CLAY", df$shipname )
df$shipname <- ifelse ( df$dck == 701 & df$id == "ZOE" , "ZOE", df$shipname )
# SEAMAN/SEAMEN are the same ship, but some of the data for SEAMEN is reflected in 1853
df$shipname <- ifelse ( df$dck == 701 & df$yr < 1855 & df$shipname == "SEAMEN", "SEAMAN", df$shipname)
df$shipname.flag <- ifelse ( df$dck == 701 & df$shipname != df$id & df$shipname.flag==99, 2, df$shipname.flag)
}
# dck 701, US Maury with missing id
dck_start <- 1663
dck_end <- 1863
#if ( min_yr >= dck_start & max_yr <= dck_end ) {
if ( max_yr >= dck_start & min_yr <= dck_end ) {
df$shipname <- ifelse ( df$dck == 701 & df$id == "E-Z-", "E-Z-", df$shipname )
df$shipname <- ifelse ( df$dck == 701 & (is.na(df$id) | df$id == "") & df$yr == 1850, "Unknown_701_1", df$shipname )
df$shipname <- ifelse ( df$dck == 701 & (is.na(df$id) | df$id == "") & df$yr == 1851 & df$mo <= 2, "Unknown_701_1", df$shipname )
df$shipname <- ifelse ( df$dck == 701 & (is.na(df$id) | df$id == "") & df$yr == 1851 & df$mo == 4, "Unknown_701_2", df$shipname )
df$shipname <- ifelse ( df$dck == 701 & (is.na(df$id) | df$id == "") & df$yr == 1851 & df$mo == 12, "Unknown_701_3", df$shipname )
df$shipname.flag <- ifelse ( df$dck == 701 & (is.na(df$id) | df$id == ""), 2, df$shipname.flag)
}
# dck 721 German Maury, make corrections based on extended length ids from US Maury, assign missing
dck_start <- 1851
dck_end <- 1868
#cat(min_yr,dck_start,max_yr,dck_end,'\n')
#if ( min_yr >= dck_start & max_yr <= dck_end ) {
if ( max_yr >= dck_start & min_yr <= dck_end ) {
#cat('here','\n')
df$shipname <- ifelse ( df$dck == 721 & df$shipname == "WILD RANG" , "WILD RANGER", df$shipname )
df$shipname <- ifelse ( df$dck == 721 & df$shipname == "HIUGUENOT" , "HUGUENOT", df$shipname )
df$shipname <- ifelse ( df$dck == 721 & df$shipname == "HIPPOGRIF" , "HIPPOGRIFFE", df$shipname )
df$shipname <- ifelse ( df$dck == 721 & df$shipname == "NOR WESTE" , "NOR WESTER", df$shipname )
df$shipname <- ifelse ( df$dck == 721 & df$shipname == "ROMANCE O" , "ROMANCE OF THE SEA", df$shipname )
df$shipname <- ifelse ( df$dck == 721 & df$shipname == "GREAT REP" , "GREAT REPUBLIC", df$shipname )
df$shipname <- ifelse ( df$dck == 721 & df$shipname == "DREADNOUG" , "DREADNOUGHT", df$shipname )
df$shipname <- ifelse ( df$dck == 721 & df$shipname == "BLACK HAW" , "BLACK HAWK", df$shipname )
df$shipname <- ifelse ( df$dck == 721 & df$shipname == "ILLUSTRIO" , "ILLUSTRIOUS", df$shipname )
df$shipname <- ifelse ( df$dck == 721 & df$shipname == "RINGLEADE" , "RINGLEADER", df$shipname )
df$shipname <- ifelse ( df$dck == 721 & df$shipname == "SEA SERPE" , "SEA SERPENT", df$shipname )
df$shipname <- ifelse ( df$dck == 721 & df$shipname == "BALD EAGL" , "BALD EAGLE", df$shipname )
df$shipname <- ifelse ( df$dck == 721 & df$shipname == "GREY EAGL" , "GREY EAGLE", df$shipname )
df$shipname <- ifelse ( df$dck == 721 & df$shipname == "HENRY CLA" , "HENRY CLAY", df$shipname )
df$shipname <- ifelse ( df$dck == 721 & df$shipname == "F. DEMING" , "F.DEMING", df$shipname )
df$shipname <- ifelse ( df$dck == 721 & df$shipname == "CALLIA", "GALLIA", df$shipname)
df$shipname <- ifelse ( df$dck == 721 & df$shipname == "CASPAR", "GASPAR", df$shipname)
df$shipname <- ifelse ( df$dck == 721 & df$shipname == "MILO", "HILO", df$shipname)
df$shipname <- ifelse ( df$dck == 721 & df$shipname == "HOLSBURGH", "HORSBURGH", df$shipname)
df$shipname <- ifelse ( df$dck == 721 & df$shipname == "HORSBURG", "HORSBURGH", df$shipname)
df$shipname <- ifelse ( df$dck == 721 & df$shipname == "INDIANMA", "INDIAMAN", df$shipname)
df$shipname <- ifelse ( df$dck == 721 & df$shipname == "INDIANMAN", "INDIAMAN", df$shipname)
df$shipname <- ifelse ( df$dck == 721 & df$shipname == "LEPANTO", "LEVANTO", df$shipname)
df$shipname <- ifelse ( df$dck == 721 & df$shipname == "HIGLANDER", "HIGHLANDER", df$shipname)
df$shipname <- ifelse ( df$dck == 721 & df$shipname == "MARGARETEVANS", "MARGARET EVANS", df$shipname)
df$shipname <- ifelse ( df$dck == 721 & df$shipname == "M. HOWES", "M.HOWES", df$shipname)
df$shipname <- ifelse ( df$dck == 721 & df$shipname == "PORTSMOUT", "PORTSMOUTH", df$shipname)
df$shipname <- ifelse ( df$dck == 721 & df$shipname == "ROMANO_D", "ROMANO-D", df$shipname)
df$shipname <- ifelse ( df$dck == 721 & df$shipname == "ROMANO D", "ROMANO-D", df$shipname)
df$shipname <- ifelse ( df$dck == 721 & df$shipname == "WILLIAMET", "WILLIAM T", df$shipname)
df$shipname <- ifelse ( df$dck == 721 & (is.na(df$id) | df$id == "") & df$yr < 1856, "Unknown 721 1", df$shipname )
df$shipname <- ifelse ( df$dck == 721 & (is.na(df$id) | df$id == "") & df$yr == 1856 & df$mo <= 3, "Unknown 721 1", df$shipname )
df$shipname <- ifelse ( df$dck == 721 & (is.na(df$id) | df$id == "") & df$yr == 1856 & df$mo >= 8, "Unknown 721 2", df$shipname )
df$shipname <- ifelse ( df$dck == 721 & (is.na(df$id) | df$id == "") & df$yr == 1857, "Unknown 721 3", df$shipname )
df$shipname <- ifelse ( df$dck == 721 & (is.na(df$id) | df$id == "") & df$yr == 1858, "Unknown 721 4", df$shipname )
df$shipname <- ifelse ( df$dck == 721 & (is.na(df$id) | df$id == "") & df$yr >= 1863, "Unknown 721 5", df$shipname )
df$shipname.flag <- ifelse ( df$dck == 701 & df$shipname != df$id, 1, df$shipname.flag)
}
# separate some common ID's by deck
idtmp<- ifelse( is.na(df$shipname) | df$id == "","xx",df$shipname)
df$shipname <- ifelse ( df$dck %in% c(701,721) & idtmp == "AUSTRALIA", paste0(idtmp,"_d",df$dck),df$shipname)
df$shipname <- ifelse ( df$dck %in% c(701,721) & idtmp == "JAMESTOWN", paste0(idtmp,"_d",df$dck),df$shipname)
#df$shipname <- ifelse ( df$dck %in% c(701,721) & idtmp == "SWORDFISH", paste0(idtmp,"_d",df$dck),df$shipname)
df$shipname <- ifelse ( df$dck %in% c(701,721) & (idtmp == "ANN MARIA" | idtmp == "ANN MARI" | idtmp == "ANN MARIA"), paste0(idtmp,"_d",df$dck),df$shipname)
df$shipname <- ifelse ( df$dck %in% c(701,721) & idtmp == "ASHBURTON", paste0(idtmp,"_d",df$dck),df$shipname)
df$shipname <- ifelse ( df$dck %in% c(701,721,730) & idtmp == "EUROPA", paste0(idtmp,"_d",df$dck),df$shipname)
df$shipname.flag <- ifelse ( df$dck %in% c(701,721,730) & df$shipname != df$id & substr(df$shipname,nchar(df$shipname)-4,nchar(df$shipname)-3)=="_d", 3, df$shipname.flag)
# process 705-707
if (min_yr >= 1909 & max_yr <= 1946 ) {
#cat('bison',min_yr,max_yr,"\n")
#print(head(table(df$shipname)))
if ( file.exists(paste0("TEST_NAMES3/",min(df$yr),"_",min(df$mo),'.Rda')) ) {
#cat('camel',min_yr,max_yr,"\n")
tmp<-readRDS(paste0("TEST_NAMES3/",min(df$yr),"_",min(df$mo),'.Rda'))
tmp$shipname.tmp<-tmp$shipname
tmp<-unique(tmp[,c("id","shipname.tmp")])
df<-merge(df,tmp[,c("id","shipname.tmp")],all.x=T)
#print(head(table(df$shipname.tmp)))
df$shipname<-ifelse(!is.na(df$shipname.tmp),df$shipname.tmp,df$shipname)
df$shipname.flag<-ifelse(!is.na(df$shipname.tmp),5,df$shipname.flag)
df$shipname.tmp<-NULL
rm(tmp)
#print(head(table(df$shipname)))
}
}
#cat('about to do pub 47',"\n")
#print(names(df))
if ( min(df$yr) >= 1956 ) { # Pub 47
if ( min(df$mo)<10 ) {
fn47<-paste0(anc.root,"PUB47byMONTH/",min(df$yr),"_0",min(df$mo),'.Rda')
} else {
fn47<-paste0(anc.root,"PUB47byMONTH/",min(df$yr),"_",min(df$mo),'.Rda')
}
if ( df$yr[1] != 2019 ) {
pub47<-readRDS(fn47)
pub47$pname<-pub47$name
} else {
#pub47<-read_trackmonth("/noc/mpoc/surface/eck/PUB47_COP/PUB47byMONTH",syr=2018)
pub47<-readRDS(paste0(anc.root,"PUB47byMONTH/2018_12.Rda"))
pub47$pname<-pub47$name
}
pub47<-unique(pub47[,c("callsign","pname","recruiting_country")])
df<-merge(df,pub47,by.x="homog.id",by.y="callsign",all.x=T)
#print(names(df))
df$shipname<-ifelse(!is.na(df$pname),df$pname,df$shipname)
#print(names(df))
df$shipname.flag<-ifelse(!is.na(df$pname),6,df$shipname.flag)
df$pname<-NULL
}
df$shipname<-toupper(df$shipname)
df$shipname<-gsub("[[:punct:]]","-",df$shipname)
df$shipname<-gsub(" ","_",df$shipname)
df$shipname<-gsub("__","_",df$shipname)
df$shipname<-gsub("--","-",df$shipname)
df$shipname<-gsub("-_-","-",df$shipname)
df$shipname<-gsub("_-_","-",df$shipname)
df$shipname<-gsub("_-","-",df$shipname)
df$shipname<-gsub("-_","-",df$shipname)
df$shipname<-gsub("-","_",df$shipname)
df$shipname<-gsub("__","_",df$shipname)
df$shipname<-gsub("__","_",df$shipname)
df$shipname<-gsub("_$","",df$shipname)
return(df)
}
###################################################################################################
## code for checking matches
###################################################################################################
# code written by Elizabeth Kent, National Oceanography Centre
# code written for the NERC HOSTACE project and adapted for
# C3S_311a_Lot2
# this useful for checking diffs
xx<-function(X) round(apply(X[,v.want[which(v.want %in% names(X))]],2,var,na.rm=T),2)
# find maximum difference across variables of interest
var.diff<-function(X) {
val<-99
diff.lim<-0.01
#v.want<-c("lat","lon","sst","slp","at","dpt","wbt","rh","w","d","ww","n","vv","wh","nh","w1")
v.want<-c("sst","slp","at","dpt","wbt","rh","w","d","ww","n","vv","wh","nh","w1")
# select delayed mode data if a mixture of GTS and DM
if ( length(table(X$is.dm)) > 1 ) X<-X[X$is.dm,]
if (nrow(X) == 1 ) {
#print("only 1 delayed mode record")
val <- 0
return(val)
}
# where there is delayed mode data and multiple IMMT versions, pick most recent
if ( length(table(X$immv,useNA='ifany')) > 1 ) {
max.immv<-max(X$immv,na.rm=T)
X<-X[X$immv==max.immv & !is.na(X$immv),]
}
if (nrow(X) == 1 ) {
#print("only 1 recent IMMT version")
val <- 0
return(val)
}
#cat(nrow(X[X$is.dm,]),nrow(X[X$is.gts,]),nrow(X),"\n")
# get rid of 700 if we have other GTS decks
if ( "700" %in% names(table(X$dck)) & length(names(table(X$dck))) > 1 ) {
X<-subset(X,dck!=700)
}
if (nrow(X) == 1 ) {
#print("only 1 after 700 removed")
val <- 0
return(val)
}
# get rid of 792 if we have other GTS decks
if ( "792" %in% names(table(X$dck)) & length(names(table(X$dck))) > 1 ) {
X<-subset(X,dck!=792)
}
if (nrow(X) == 1 ) {
#print("only 1 after 792 removed")
val <- 0
return(val)
}
# get rid of 117 if we have 116 or 128
if ( "117" %in% names(table(X$dck)) & ("128" %in% names(table(X$dck))|"116" %in% names(table(X$dck))) & length(names(table(X$dck))) > 1 ) {
X<-subset(X,dck!=117)
}
if (nrow(X) == 1 ) {
#print("only 1 after 117 removed")
val <- 0
return(val)
}
# get select only sid 142 for dck 117
if ( 117 %in% names(table(X$dck)) & max(X$dck) == min(X$dck) & 142 %in% names(table(X$sid)) & length(names(table(X$sid))) > 1 ) {
X<-subset(X,sid==142)
}
if (nrow(X) == 1 ) {
#print("only 1 after sid 142 for dck 117 selected")
val <- 0
return(val)
}
# get rid of 733 if we have match to 186 (polar stations)
if ( "733" %in% names(table(X$dck)) & "186" %in% names(table(X$dck)) & length(names(table(X$dck))) > 1 ) {
X<-subset(X,dck!=733)
}
if (nrow(X) == 1 ) {
#print("only 1 after 733 removed")
val <- 0
return(val)
}
tmp<-X[,v.want[which(v.want %in% names(X))]]
# check to see if there are no common variables
if ( suppressWarnings(!is.finite(max(colSums(tmp),na.rm=T)))) {
val <- -99
return(val)
}
val<-suppressWarnings(max(apply(tmp,2,mad,na.rm=T),na.rm=T))
if ( val > diff.lim ) {
# mismatches of vv and ww in dck 194 & 201
is.deck201<-ifelse(201 %in% X$dck,TRUE,FALSE)
#X$vv<-ifelse(X$dck==194 & is.deck201, X$vv+1, X$vv)
X$vv<-ifelse(X$dck==194 & is.deck201, NA, X$vv)
X$ww<-ifelse(X$dck==194 & is.deck201 & X$ww==53, 50, X$ww)
X$ww<-ifelse(X$dck==194 & is.deck201 & X$ww==81, 52, X$ww)
X$ww<-ifelse(X$dck==194 & is.deck201 & X$ww==80, 52, X$ww)
X$ww<-ifelse(X$dck==194 & is.deck201 & X$ww==90, 89, X$ww)
X$ww<-ifelse(X$dck==194 & is.deck201 & X$ww==63, 60, X$ww)
X$ww<-ifelse(X$dck==194 & is.deck201, NA, X$ww)
X$w<-ifelse(X$dck==194 & is.deck201 & X$w==9.3, 9.8, X$w)
is.deck128<-ifelse(128 %in% X$dck,TRUE,FALSE)
is.deck254<-ifelse(254 %in% X$dck,TRUE,FALSE)
X$w<-ifelse(is.deck128&is.deck254,9.3,9.8)
tmp<-X[,v.want[which(v.want %in% names(X))]]
val<-suppressWarnings(max(apply(tmp,2,mad,na.rm=T),na.rm=T))
if ( val < diff.lim ) return(val)
# check - one of the dcks has loads of w=1
#X$w<-ifelse(X$dck==721 & X$w==1,NA,X$w)
X$w<-ifelse(X$dck==720 & X$w==1,NA,X$w)
is.deck720<-ifelse(720 %in% X$dck,TRUE,FALSE)
# dck 720 slp rounded down
X$slp<-ifelse(is.deck720,floor(X$slp),X$slp)
#dck.hipos<-c(116,186)
#dck.lopos<-c(117,733)
#X$lat<-ifelse(X$dck%in%dck.hipos & X$dck%in%dck.lopos,round(X$lat),X$lat)
#X$lon<-ifelse(X$dck%in%dck.hipos & X$dck%in%dck.lopos,round(X$lon),X$lon)
#dck.roundpos<-c(192,720)
#X$lat<-ifelse(X$dck%in%dck.roundpos, round(X$lat),X$lat)
#X$lon<-ifelse(X$dck%in%dck.roundpos, round(X$lon),X$lon)
# dck 116 direction is to 10 deg, 117 in 16 points (integer 22.5 deg), plus some other values
dvals<-seq(0,365,10)
X$d<-ifelse(X$dck == 116|X$dck == 117,dvals[findInterval(X$d,dvals)],X$d)
X$ww<-ifelse((X$dck == 116|X$dck == 117) & X$ww %in% c(50,60),X$ww+1,X$ww)
# dcks where only wbt to be checked
dck4wb<-c(116,117,227,194,720)
# dck 116/117 wbt agree but other humidity variables don't
if ( "dpt" %in% names(X) )X$dpt<-ifelse(X$dck%in%dck4wb,NA,X$dpt)
if ( "rh" %in% names(X) )X$rh<-ifelse(X$dck%in%dck4wb,NA,X$rh)
#X$rh<-ifelse((X$dck == 116|X$dck == 117),NA,X$rh)
# dck 116 has fractional lat/lon
X$lat<-ifelse(X$dck==116,round(X$lat,0),X$lat)
X$lon<-ifelse(X$dck==116,round(X$lon,0),X$lon)
# mismatches of visibility in 116/117
#X$vv<-ifelse(X$sid == 140, X$vv-1, X$vv)
# differences in sid 140
if ("nh" %in% names(X) ) X$nh<-ifelse(X$sid == 140, NA, X$nh)
if ("wh" %in% names(X) ) X$wh<-ifelse(X$sid == 140, NA, X$wh)
if ("vv" %in% names(X) ) X$vv<-ifelse(X$sid == 140, NA, X$vv)
if ("nh" %in% names(X) ) X$nh<-ifelse(X$dck == 116, NA, X$nh)
if ("wh" %in% names(X) ) X$wh<-ifelse(X$dck == 116, NA, X$wh)
if ("vv" %in% names(X) ) X$vv<-ifelse(X$dck == 116, NA, X$vv)
for ( myvar in names(X)[grepl("prec$",names(X))] ) {
mv<-gsub(".prec","",myvar)
if ( sum(!is.na(X[,mv])) <= 1 ) {next}
if ( range(X[,mv],na.rm=T)[2]-range(X[,mv],na.rm=T)[1] < max(X[,myvar],na.rm=T) ) {
X[,mv]<-max(X[,mv],na.rm=T)
}
}
tmp<-X[,v.want[which(v.want %in% names(X))]]
val.full<-apply(tmp,2,mad,na.rm=T)
n.diff<-sum(val.full>0,na.rm=T)
n.agree<-sum(val.full==0,na.rm=T)
val<-suppressWarnings(max(apply(tmp,2,mad,na.rm=T),na.rm=T))
ndck<-names(table(X$dck))
nids<-names(table(X$id))
# disable day mismatch for certain decks
d.nody<-c(192,194,197,201,702,704,720,721,762)
if ( sum(ndck %in% d.nody) >= 1 & min(X$dy) != max(X$dy) ) {
val <- 1
return(val)
}
# disable hr mismatch for certain decks
d.nohr<-c(117)
if ( sum(ndck %in% d.nohr) >= 1 & min(X$hr) != max(X$hr) ) {
val <- 1
return(val)
}
if (n.diff <= 1 & n.agree >= 4 & sum(X$id.class!="invalid") > 0 ) {
val <- 0
return(val)
} else if ( n.diff <= 0 & n.agree >= 4 ) {
val <- 0
return(val)
} else if ( n.diff <= 1 & n.agree >= 3 & 192 %in% ndck & 254 %in% ndck ) {
val <- 0
return(val)
} else if ( n.diff <= 1 & n.agree >= 3 & 192 %in% ndck & 193 %in% ndck ) {
val <- 0
return(val)
} else if ( n.diff > 0 & sum(nchar(X$id)) == 0 ) {
# don't allow differing reports with all missing ID to match
val <- 1
return(val)
} else if ( n.diff > 1 & ("" %in% nids | "invalid" %in% X$id.class)) {
# don't allow differing reports with any missing ID to match
val <- 1
return(val)
}
if ( val < diff.lim ) return(val)
}
return(val)
}
###################################################################################################
## end code for checking matches
###################################################################################################
find_gap_func <- function(data,fill.data,ship="test") {
suppressMessages(require(stringdist))
source("~/Rscripts/get_speed.R")
source("~/Rscripts/get_gap_pos.R")
options(warnings=1)
output<-c()
data <- data[order(data$date),]
idck<-paste(names(table(data$dck)),collapse="/")
if ( "group" %in% names(data) ) {
igroup<-paste(names(table(data$group)),collapse="/")
remove<-paste0("_g",igroup)
}
#print(paste(ship,length(data$lon)))
data$date <- as.POSIXct(data$date)
ttmp <- diff(data$date)
units(ttmp)<-"hours"
ttmp <- as.numeric(ttmp)
data$dt <- c(NA,ttmp)
data$dlon <- c(NA,diff(data$lon))
data$dlat <- c(NA,diff(data$lat))
interval <- as.numeric(names(sort(table(data$dt),decreasing=T)[1])) # hours
if ( interval < 0.25 | interval > 25 ) {
if ( length(data$lon) > 10 ) {
cat("\n",'suspect interval for',ship,interval,'hours, no. obs=',nrow(data),"\n")
}
{ return() }
} else {
#cat('OK interval for',ship,interval,'hours',"\n")
}
for.gaps<-data[,c("date","lon","lat")]
interval<-interval*60*60
miss.app.int <- get_gap_pos(interval,for.gaps)
miss.app.start <- get_gap_pos_start(interval,for.gaps)
miss.app.end <- get_gap_pos_end(interval,for.gaps)
miss.app<-na.omit(rbind(miss.app.int,miss.app.start))
miss.app$id <- rep(ship,times=length(miss.app$lon))
if ( nrow(miss.app) == 0 ) {
#cat("no gaps, returning","\n")
return()
}
#cat(nrow(miss.app),"\n")
for (imiss in 1:nrow(miss.app) ) {
sub<-subset(fill.data,date==miss.app$date[imiss] & abs(miss.app$lat[imiss] - lat) < 1.1 & abs(miss.app$lon[imiss] - lon) < 1.1 )
if ( dim(sub)[1] == 0 ) {next}
if ( dim(sub)[1] > 0) {
if ( dim(sub)[1] > 1 ) {
dist<-gcd.slc(miss.app$lon[imiss],miss.app$lat[imiss],sub$lon,sub$lat)
#print(dist)
dist<-ifelse(is.na(dist),0,dist)
sub <- sub[which(dist<112),]
dist<-gcd.slc(miss.app$lon[imiss],miss.app$lat[imiss],sub$lon,sub$lat)
dist<-ifelse(is.na(dist),0,dist)
if ( dim(sub)[1] == 0 ) {next}
#print(sub[,c("id","id.ok")])
#print(miss.app[imiss,])
id1 <- ifelse(!is.na(sub$new.id),sub$new.id,"")
#id1 <- as.character(data.frame(base::strsplit(id1,"g"))[1,])
#id1 <- gsub("_$","",id1)
#id1[which(is.na(id1))] <- ""
if ( "group" %in% names(sub) ) id1<-gsub(remove,"",id1)
#print(id1)
id2 <- ifelse(!is.na(miss.app$id[imiss]),miss.app$id[imiss],"")
#id2 <- as.character(data.frame(base::strsplit(id2,"g"))[1,])
#id2 <- gsub("_$","",id2)
#id2[which(is.na(id2))] <- ""
if ( "group" %in% names(sub) ) id1<-gsub(remove,"",id2)
#print(id2)
#sdi <- stringdist(sub$new.id,miss.app$id[imiss])
sdi <- stringdist(id1,id2)
blanks1 <- nchar(id1)-nchar(gsub(" ","",id1))
blanks2 <- nchar(id2)-nchar(gsub(" ","",id2))
sdi <- sdi - abs(blanks1-blanks2)
#cat(id1,id2,sdi,blanks1,blanks2,"\n")
#print(sdi)
sdi <- ifelse(sdi > abs(nchar(id1)-nchar(id2)), abs(nchar(id1)-nchar(id2)), sdi)
#print(sdi)
sdi[which(is.na(sdi))]<-abs(nchar(id1)-nchar(id2))
best.id <- which.min(sdi)
best.pos <- which.min(dist)
#print("carrot")
#print(sdi)
#print(best.pos)
#print(best.id)
#cat(paste(best.id,collapse="/"),paste(best.pos,collapse="/"),"\n")
if ( is.na(best.id) | is.na(best.pos) ) {
# don't do anything
} else if ( best.id == best.pos ) {
sub <- sub[best.id,]
} else if (max(sdi) == min(sdi)) {
sub <- sub[best.pos,]
} else if (min(sdi) > 1) {
next
} else if (best.id != best.pos & min(sdi) <= 1) {
sub <- sub[best.id,]
} else {
print("help")
print(miss.app[imiss,])
print(sub[,c("date","lon","lat","new.id")])
next # can't decide
}
}
id1 <- ifelse(!is.na(sub$new.id),sub$new.id,"")
#id1 <- as.character(data.frame(base::strsplit(id1,"g"))[1,])
#id1 <- gsub("_$","",id1)
#id1[which(is.na(id1))] <- ""
if ( "group" %in% names(sub) ) id1<-gsub(remove,"",id1)
#print(id1)
id2 <- ifelse(!is.na(miss.app$id[imiss]),miss.app$id[imiss],"")
#id2 <- as.character(data.frame(base::strsplit(id2,"g"))[1,])
#id2 <- gsub("_$","",id2)
#id2[which(is.na(id2))] <- ""
if ( "group" %in% names(sub) ) id2<-gsub(remove,"",id2)
#print(id2)
sdi <- stringdist(id1,id2)
blanks1 <- nchar(id1)-nchar(gsub(" ","",id1))
blanks2 <- nchar(id2)-nchar(gsub(" ","",id2))
sdi <- sdi - abs(blanks1-blanks2)
#id2 <- miss.app$id[imiss]
if ( id1 == id2 | is.na(id1) | is.na(id2) ) {next} # id match across dcks
#sdi <- stringdist(sub$new.id,miss.app$id[imiss])
if ( sdi <= 1 | sdi == abs(nchar(id1)-nchar(id2))) {
dist<-gcd.slc(miss.app$lon[imiss],miss.app$lat[imiss],sub$lon,sub$lat)
dist<-ifelse(is.na(dist),0,dist)
#cat("comp ids",id1,id2,sdi,dist,"\n")
if ( dist < 112 | sdi == 1 ) {
#cat("match",sub$new.id,miss.app$id[imiss],dist,sdi,"\n")
if ( "group" %in% names(sub) ) {
#print("antelope")
output<-rbind(output,data.frame(uid=sub$uid,id=sub$new.id,rep.id=miss.app$id[imiss],
dck=sub$dck,rep.dck=idck,group=sub$group,rep.group=igroup,sdi=sdi,dist=round(dist)))
} else {
#print("bison")
output<-rbind(output,data.frame(uid=sub$uid,id=sub$new.id,rep.id=miss.app$id[imiss],
dck=sub$dck,rep.dck=idck,sdi=sdi,dist=round(dist)))
}
#print(output[nrow(output),])
}
}
}
}
return(output)
}
This diff is collapsed.
flag_id_dups <- function (sub) {
source("~/Rscripts/liz_merge.R")
print.time<-TRUE
#print.comm<-FALSE
if ( nrow(sub) < 5 ) { return }
sub<-sub[base::order(sub$date),]
if ( min(diff(sub$date)) > 0 ) { return }
tr<-imma::track_check(uid=sub$uid,yr=sub$yr,mo=sub$mo,dy=sub$dy,hr=sub$hr,lon=sub$lon2,
lat=sub$lat,id=sub$new.id,pt=sub$pt,dck=sub$dck,vsi=sub$vsi,dsi=sub$dsi,
max_direction_change = 60, max_speed_change = 10, max_absolute_speed = 40,
max_midpoint_discrepancy = 150, add_vars = TRUE)
tr$speed[1]<-tr$speed[2]
tr$course[1]<-tr$course[2]
sub<-liz_merge(sub,data.frame(uid=tr$uid,trk=tr$trk),by.var="uid")
my.data<-data.table::data.table(sub[,c("uid","yr","mo","dy","hr","lat","lon","dck","sid","id","new.id","sst","at","slp","dck.pri","trk","group","c1m","dflag","numel","numecv")],key=c("yr","mo","dy","hr"))
ddd<-my.data[my.data,allow.cartesian=TRUE]
ddd<-subset(ddd,uid!=i.uid) # remove self-matches
ddd$tmp<-ifelse(ddd$uid>ddd$i.uid,paste0(ddd$uid,ddd$i.uid),paste0(ddd$i.uid,ddd$uid))
ddd<-subset(ddd,!duplicated(ddd$tmp)) # and reverse matches
ddd$tmp<-NULL
# choose a preferred report from every pair
ddd$rej.uid<-ifelse(ddd$dck.pri>ddd$i.dck.pri,ddd$uid,ddd$i.uid)
ddd$rej.uid<-ifelse(ddd$dck == 720 & ddd$sid == 160,ddd$i.uid,ddd$rej.uid)
ddd$rej.uid<-ifelse(ddd$i.dck == 720 & ddd$i.sid == 160,ddd$uid,ddd$rej.uid)
# and if the same dck priority choose one with most parameters
ddd$rej.uid<-ifelse(ddd$numecv>ddd$i.numecv,ddd$i.uid,ddd$rej.uid)
ddd$rej.uid<-ifelse(ddd$numecv<ddd$i.numecv,ddd$uid,ddd$rej.uid)
# finally if one passes the track check and the other fails ...
ddd$rej.uid<-ifelse(ddd$trk==1&ddd$i.trk==0,ddd$uid,ddd$rej.uid)
ddd$rej.uid<-ifelse(ddd$trk==0&ddd$i.trk==1,ddd$i.uid,ddd$rej.uid)
ddd$want.uid<-ifelse(ddd$uid==ddd$rej.uid,ddd$i.uid,ddd$uid)
ddd.app<-NULL
if(nrow(ddd) > 0 ) {
if ( nrow(ddd) < 0.5*sum(mdata$new.id==ship) ) {
#cat(ship,'num rejected as ID dups',nrow(ddd),'of',sum(mdata$new.id==ship),"\n")
mdata$dflag<-ifelse(mdata$uid %in% ddd$rej.uid, 3, mdata$dflag)
mdata$dflag<-ifelse(mdata$uid %in% ddd$want.uid, 1, mdata$dflag)
ddd.app<-rbind(ddd.app,ddd[,c("new.id","want.uid","rej.uid")])
} else { # lots of dups, check lat/lon diffs
blat<-boxplot(sub$lat~sub$date,plot=F)
blat<-blat$stats[4,]-blat$stats[2,]
blon<-boxplot(sub$lon~sub$date,plot=F)
blon<-blon$stats[4,]-blon$stats[2,]
if ( max(quantile(blat,c(0.05,0.95)),quantile(blon,c(0.05,0.95))) > 0.1 ) {
cat("\n",ship,paste(names(table(sub$id)),collapse="/"),'not rejecting ID dups',nrow(ddd),'of',sum(mdata$new.id==ship),"\n")
cat(ship,quantile(blat,c(0.05,0.95)),quantile(blon,c(0.05,0.95)),"\n")
#print(table(sub$dck,sub$sid))
} else {
ddd.app<-rbind(ddd.app,ddd[,c("new.id","want.uid","rej.uid")])
}
}
}
return(ddd.app)
}
get_gap_pos <- function(interval,gdf) {
# gdf, date, lon, lat
# interval is in seconds
miss.app.int <- data.frame(array(NA,c(length(gdf$lon)*10,3)))
names(miss.app.int) <- c("date","lon","lat")
miss.app.int$date <- as.POSIXct(miss.app.int$date)
gdf$date <- as.POSIXct(gdf$date)
ttmp <- diff(as.numeric(gdf$date))
gdf$dt <- c(NA,ttmp)
gdf$dlon <- c(NA,diff(gdf$lon))
gdf$dlat <- c(NA,diff(gdf$lat))
no.intervals <- round(48*60*60/interval)
irow <- 1
index <- which(gdf$dt > interval*2-1 & gdf$dt < no.intervals*interval)
index2 <- which(gdf$dt > 48*60*60)
if ( length(index) > 0 ) {
#print(paste(gdf$date[index-1],gdf$date[index]))
aa <- gdf$date[index-1]
bb <- gdf$date[index]
aa.lon <- gdf$lon[index-1]
bb.lon <- gdf$lon[index]
aa.lat <- gdf$lat[index-1]
bb.lat <- gdf$lat[index]
l1 <- length(aa)
if ( l1 > 0 ) {
for ( j in 1:l1 ) {
#print(paste(j,interval,aa[j] , bb[j]))
if ( bb[j]-1 <= aa[j] + interval) {next}
miss <- seq(aa[j] + interval, bb[j]-1,interval)
#print("-------")
##print(j)
#print(miss)
#print(paste(aa[j],bb[j]))
#print(paste(aa.lon[j],bb.lon[j]))
#print(paste(aa.lat[j],bb.lat[j]))
lon.miss <- aa.lon[j]+(bb.lon[j] - aa.lon[j])/(as.numeric(bb[j])-as.numeric(aa[j]))*(as.numeric(miss)-as.numeric(aa[j]))
if ( (aa.lon[j] < -160 & bb.lon[j] > 160) | (bb.lon[j] < -160 & bb.lon[j] > 160) |
(aa.lon[j] < 20 & bb.lon[j] > 340) | (bb.lon[j] < 20 & aa.lon[j] > 340) ) {
del <- bb.lon[j] - aa.lon[j]
del <- ifelse(del < (-340), del+360, del)
del <- ifelse(del > (340), del-360, del)
lon.miss <- aa.lon[j]+(del)/(as.numeric(bb[j])-as.numeric(aa[j]))*(as.numeric(miss)-as.numeric(aa[j]))
lon.miss[which(lon.miss >= 360)] <- lon.miss[which(lon.miss >= 360)]-360
lon.miss[which(lon.miss <= -180)] <- lon.miss[which(lon.miss <= -180)]+360
}
lon.miss <- round(lon.miss,2)
#print(lon.miss)
#print((bb.lon[j] - aa.lon[j]))
#print(as.numeric(bb[j]-aa[j]))
#print(as.numeric(miss-aa[j]))
#print("")
lat.miss <- aa.lat[j]+(bb.lat[j] - aa.lat[j])/(as.numeric(bb[j])-as.numeric(aa[j]))*(as.numeric(miss)-as.numeric(aa[j]))
lat.miss <- round(lat.miss,2)
#print(lat.miss)
#id.miss <- rep(ship,times=length(miss))
#xx.int<-data.frame(id=id.miss,date=miss,lon=lon.miss,lat=lat.miss)
xx.int<-data.frame(date=miss,lon=lon.miss,lat=lat.miss)
lenx <- length(xx.int$lon)
ie <- irow+lenx-1
miss.app.int[irow:ie,]<-xx.int[1:lenx,]
irow <- irow + lenx
#if ( j == 3 ) stop()
}
}
}
miss.app.int <- na.omit(miss.app.int)
#print(miss.app.int)
return(miss.app.int)
}
###########################
get_gap_pos_start <- function(interval,gdf) {
miss.app.start <- data.frame(array(NA,c(length(gdf$lon)*10,3)))
names(miss.app.start) <- c("date","lon","lat")
miss.app.start$date <- as.POSIXct(miss.app.start$date)
irow <- 1
no.intervals <- round(24*60*60/interval)
# gdf, date, lon, lat
# interval is in seconds
gdf$date <- as.POSIXct(gdf$date)
ttmp <- diff(as.numeric(gdf$date))
gdf$dt <- c(NA,ttmp)
gdf$dlon <- c(NA,diff(gdf$lon))
gdf$dlat <- c(NA,diff(gdf$lat))
index2 <- which(gdf$dt > 48*60*60)
index2 <- index2[which(index2-2 >= 1)]
if ( length(index2) > 0 ) {
#-2,-1 gives points leading up to gap, 0, 1 gives points after gap
start.date <- gdf$date[index2-1]
start.lon <- gdf$lon[index2-1]
start.lat <- gdf$lat[index2-1]
delta.lon <- gdf$lon[index2-2]-gdf$lon[index2-1]
delta.lat <- gdf$lat[index2-2]-gdf$lat[index2-1]
l1 <- length(start.date)
if ( l1 > 0 ) {
for ( j in 1:l1 ) {
miss <- seq(start.date[j]+interval,start.date[j]+round(no.intervals-0.1)*interval,interval)
lon.miss <- start.lon[j]-delta.lon[j]*seq(1,round(no.intervals-0.1),1)
lon.miss <- round(lon.miss,2)
lat.miss <- start.lat[j]-delta.lat[j]*seq(1,round(no.intervals-0.1),1)
lat.miss <- round(lat.miss,2)
#print(paste(ship,miss,lon.miss,lat.miss))
#id.miss <- rep(ship,times=length(miss))
#xx.start<-data.frame(id=id.miss,date=miss,lon=lon.miss,lat=lat.miss)
xx.start<-data.frame(date=miss,lon=lon.miss,lat=lat.miss)
lenx <- length(xx.start$lon)
ie <- irow+lenx-1
miss.app.start[irow:ie,]<-xx.start[1:lenx,]
irow <- irow + lenx
}
}
}
miss.app.start<-na.omit(miss.app.start)
return(miss.app.start)
}
###########################
get_gap_pos_end <- function(interval,gdf) {
miss.app.end <- data.frame(array(NA,c(length(gdf$lon)*10,3)))
names(miss.app.end) <- c("date","lon","lat")
miss.app.end$date <- as.POSIXct(miss.app.end$date)
no.intervals <- round(24*60*60/interval)
irow <- 1
# gdf, date, lon, lat
# interval is in seconds
gdf$date <- as.POSIXct(gdf$date)
ttmp <- diff(as.numeric(gdf$date))
gdf$dt <- c(NA,ttmp)
gdf$dlon <- c(NA,diff(gdf$lon))
gdf$dlat <- c(NA,diff(gdf$lat))
index2 <- which(gdf$dt > 48*60*60)
index2 <- index2[which(index2 <= length(gdf$lon))]
if ( length(index2) > 0 ) {
#-2,-1 gives points leading up to gap, 0, 1 gives points after gap
end.date <- gdf$date[index2]
end.lon <- gdf$lon[index2]
end.lat <- gdf$lat[index2]
delta.lon <- gdf$lon[index2+1]-gdf$lon[index2]
delta.lat <- gdf$lat[index2+1]-gdf$lat[index2]
delta.time <- as.numeric(gdf$date[index2+1])-as.numeric(gdf$date[index2])
delta.lon <- delta.lon/delta.time*interval
delta.lat <- delta.lat/delta.time*interval
l1 <- length(end.date)
if ( l1 > 0 ) {
for ( j in 1:l1 ) {
miss <- seq(end.date[j]-interval,end.date[j]-round(no.intervals-0.1)*interval,interval*(-1))
#print(end.date[j])
#print(miss)
lon.miss <- end.lon[j]-delta.lon[j]*seq(1,round(no.intervals-0.1),1)
lon.miss <- round(lon.miss,2)
lat.miss <- end.lat[j]-delta.lat[j]*seq(1,round(no.intervals-0.1),1)
lat.miss <- round(lat.miss,2)
#print(paste(ship,miss,lon.miss,lat.miss))
#id.miss <- rep(ship,times=length(miss))
#xx.end<-data.frame(id=id.miss,date=miss,lon=lon.miss,lat=lat.miss)
xx.end<-data.frame(date=miss,lon=lon.miss,lat=lat.miss)
lenx <- length(xx.end$lon)
ie <- irow+lenx-1
miss.app.end[irow:ie,]<-xx.end[1:lenx,]
irow <- irow + lenx
}
}
}
miss.app.end<-na.omit(miss.app.end)
return(miss.app.end)
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment