Commit e122b788 authored by Richard Cornes

Added disjoint tracking scripts and data

parent f7e99fda
#!/usr/bin/env Rscript
# code written by Elizabeth Kent, National Oceanography Centre
# code written for the NERC GloSAT project
# Description:
## IMPORTANT: all paths to all data are defined in config.yml
## input_data:
# 1. arguments (e.g. 1990)
# 2. output_data/FINAL_PROC
## Outputs: output_data/DIURNAL_FAIL_IDS
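## config.yml is not included in this commit; an illustrative (assumed) layout,
## based only on the keys these scripts read, would be:
##   default:
##     output_dir: "/path/to/RUN_ZERO/output_data/"
##     pub_47_by_month_path: "/path/to/PUB47byMONTH"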
Sys.setenv(TZ='GMT')
options(warn=1)
exit <- function() { invokeRestart("abort") }
my_args = commandArgs(trailingOnly=TRUE)
if(length(my_args)== 0) {my_args<-my_args2}
if(length(my_args)!= 1) {stop("wrong number of input my_args or my_args2 not set")}
require(icoads.utils)
config<-config::get(file = "config.yml")
print.comm <- TRUE
# my_args[1] is start year
iyr<-my_args[1]
#########################################################################################
cat('--------------------------------------------- \n')
cat('running diurnal checks for',iyr,'\n')
cat('--------------------------------------------- \n')
dir.out <- paste0(config$output_dir,"DIURNAL_FAIL_IDS/")
if (!dir.exists(dir.out) ) dir.create(dir.out)
dir.in <- paste0(config$output_dir,"FINAL_PROC/")
df<-icoads.utils::read_trackmonth(dir.in,syr=iyr)
if(!is.null(df)) {
YRMO <- split(df, data.frame(df$yr, df$mo), drop=TRUE)
if(length(YRMO) > 0 ) {
names(YRMO) <- gsub("\\.","_", names(YRMO))
# empty list for output
sp.op<-vector(mode = "list", length = length(YRMO))
names(sp.op)<-names(YRMO)
for ( imo in 1:length(YRMO) ) {
#cat(imo,length(sp.op),"\n")
data<-YRMO[[imo]]
sp<-split(data,data$final.id)
sp<-sp[sapply(sp,nrow)>=10]
source("~eck/bad_daynightcheck.R")
op<-lapply(sp,function(X) bad_daynight_check(X$date,X$yr,X$lon,X$lat,X$dck,X$at))
op<-op[sapply(op,function(X) X==1)]
if(is.null(op)) op<-""
#print(names(unlist(op)))
sp.op[[imo]]<-names(unlist(op))
}
filenames<-paste0(dir.out,names(sp.op),".Rda")
jj<-mapply(saveRDS, sp.op, file = filenames)
nn<-names(table(unlist(sp.op)))
df$baddn<-ifelse(df$final.id %in% nn,1,0)
cat('--------------------------------------------- \n')
cat("Diurnal check output:",iyr,"1=fail","\n")
cat('--------------------------------------------- \n')
print(table(df$dck,df$baddn))
cat('--------------------------------------------- \n')
}
} # switch for no input data (df = NULL)
assign_country <- function (data) {
#source("add_ituco.R")
print.comm <- TRUE
icoads_cc <- c( "Netherlands", "Norway", "US", "UK", "France", "Denmark", "Italy", "India", "HongKong", "NewZealand", "Ireland", "Philippines", "Egypt", "Canada", "Belgium", "SouthAfrica", "Australia", "Japan", "Pakistan", "Argentina", "Sweden", "FRG", "Iceland", "Israel", "Malaysia", "USSR", "Finland", "RepKorea", "NewCaledonia", "Portugal", "Spain", "Thailand", "Yugoslavia", "Poland", "Brazil", "Singapore", "Kenya", "Tanzania", "Uganda", "Mexico", "GDR")
icoads_cc <- data.frame(as.character(seq(0,40)),icoads_cc)
names(icoads_cc) <- c("cc.num","country")
icoads_cc$cc.num[which(nchar(icoads_cc$cc.num)==1)] <- paste0("0",icoads_cc$cc.num[which(nchar(icoads_cc$cc.num)==1)])
d705t7_cc <- c( "AS", "OS","AU", "BX", "CH", "CI", "CN", "CU", "CZ", "DD","DL",
"DN", "DR", "FI", "FR", "GR", "HK", "HO", "IY", "JP",
"MX", "NK", "NL", "NO", "PH", "PL", "PM", "PO", "RA","RS",
"SN", "SP", "SW", "TH", "TU", "UK", "US", "YG",
"BR", "BZ", "NU", "BE", "NZ")
d705t7_co <- c( "Austria", "Austria", "Australia", "Belgium", "Chile", "China", "Canada", "Cuba", "Czechoslovakia", "Germany", "Germany", "Denmark", "DominicanRepublic", "Finland", "France", "Greece", "HongKong", "Honduras", "Italy", "Japan", "Mexico", "Nicaragua", "Netherlands", "Norway", "Philippines", "Poland", "Panama", "Portugal", "USSR", "USSR", "Sweden", "Spain", "Switzerland", "Thailand", "Turkey", "UK", "US", "Yugoslavia","UK","Brazil", "NU", "Bermuda", "NewZealand")
d705t7_cc<-data.frame(d705t7_cc,d705t7_co)
year<-data$yr[1]
if ( year >= 1956 ) {
itudata <- get_ituco(data)
data <- merge(data,itudata,all.x=T)
} else {
data$ituco<-NA
}
tmp<-data
iyear<-year
if ( print.comm ) cat("got ituco \n")
#stop()
tmp$best_co <- NA
if(!("recruiting_country" %in% names(tmp))) tmp$recruiting_country<-NA
#if(!("country" %in% names(tmp))) tmp$country<-NA
if(!("ituco" %in% names(tmp))) tmp$ituco<-NA
tmp$best_co <- ifelse(!is.na(tmp$c1),tmp$c1,tmp$recruiting_country)
tmp$best_co <- ifelse(is.na(tmp$best_co),tmp$recruiting_country,tmp$best_co)
coded.id<-"^[AMO|BAR|BAT|EUC|IDD|MIN|SCA|TBW|MPD]{3}[AA|UK|FR|DE|EU|HK]{2}[0-9]{2}$"
tmp$id.ok <- ifelse(grepl(coded.id,tmp$track.id),"coded",tmp$id.ok)
tmp$best_co <- ifelse(is.na(tmp$best_co) & tmp$id.ok=="coded",substr(tmp$track.id,4,5),tmp$best_co)
if ( print.comm ) cat("about to merge 1 \n")
print(names(tmp))
print(names(icoads_cc))
tmp<-merge(tmp,icoads_cc,by.x=c("c1"),by.y=c("cc.num"),all.x=T)
if ( print.comm ) cat("merged 1 \n")
tmp$best_co<-ifelse(!is.na(tmp$country),tmp$country,tmp$best_co)
tmp$country<-NULL
tmp$best_co[which(tmp$best_co=="99")] <- NA
tmp$best_co<-ifelse(is.na(tmp$best_co) & !is.na(tmp$ituco),tmp$ituco,tmp$best_co)
#tmp$best_co<-ifelse(tmp$dck== 780 | tmp$dck==782, tmp$wod_country, tmp$best_co)
tmp$best_co<-ifelse(tmp$dck %in% c(249,204,205,211,216,229,239,245) & is.na(tmp$best_co), "UK", tmp$best_co)
tmp$best_co<-ifelse(tmp$dck %in% c(110,116,195,281,667,703,708,710,874) & is.na(tmp$best_co), "US", tmp$best_co)
tmp$best_co<-ifelse((tmp$dck %in% c(186) ) & is.na(tmp$best_co), "USSR", tmp$best_co)
tmp$best_co<-ifelse((tmp$dck %in% c(731,733,735) ) & is.na(tmp$best_co), "Russia", tmp$best_co)
tmp$best_co<-ifelse((tmp$dck %in% c(188,702) ) & is.na(tmp$best_co), "Norway", tmp$best_co)
tmp$best_co<-ifelse((tmp$dck %in% c(750,900) ) & is.na(tmp$best_co), "Australia", tmp$best_co)
tmp$best_co<-ifelse((tmp$dck %in% c(899) ) & is.na(tmp$best_co), "SouthAfrica", tmp$best_co)
tmp$best_co<-ifelse((tmp$dck %in% c(118,119,761,762) ) & is.na(tmp$best_co), "Japan", tmp$best_co)
tmp$best_co[which(tmp$best_co == "Unknown")] <- NA
tmp$best_co[which(tmp$best_co=="DE")] <- "Germany"
tmp$best_co[which(tmp$best_co=="SE")] <- "Sweden"
tmp$best_co[which(tmp$best_co=="RU")] <- "Russia"
tmp$best_co[which(tmp$best_co=="CA")] <- "Canada"
tmp$best_co[which(tmp$best_co=="DK")] <- "Denmark"
tmp$best_co[which(tmp$best_co=="GB")] <- "UK"
tmp$best_co[which(tmp$best_co=="IE")] <- "Ireland"
tmp$best_co[which(tmp$best_co=="IN")] <- "India"
tmp$best_co[which(tmp$best_co=="ES")] <- "Spain"
tmp$best_co[which(tmp$best_co=="IT")] <- "Italy"
tmp$best_co[which(tmp$best_co=="LT")] <- "Lithuania"
tmp$best_co[which(tmp$best_co=="PT")] <- "Portugal"
tmp$best_co[which(tmp$best_co=="KR")] <- "RepKorea"
tmp$best_co[which(tmp$best_co=="NI")] <- "Nicaragua"
tmp$best_co[which(tmp$best_co=="ZA")] <- "SouthAfrica"
tmp$best_co[which(tmp$best_co=="YU")] <- "Yugoslavia"
tmp$best_co[which(tmp$best_co=="CD")] <- "DRCongo"
tmp$best_co[which(tmp$best_co=="AR")] <- "Argentina"
tmp$best_co[which(tmp$best_co=="IL")] <- "Israel"
tmp$best_co[which(tmp$best_co=="IS")] <- "Iceland"
tmp$best_co[which(tmp$best_co=="VE")] <- "Venezuela"
tmp$best_co[which(tmp$best_co=="MC")] <- "Monoco"
tmp$best_co[which(tmp$best_co=="EC")] <- "Ecuador"
tmp$best_co[which(tmp$best_co=="MG")] <- "Madagascar"
tmp$best_co[which(tmp$best_co=="RO")] <- "Romania"
tmp$best_co[which(tmp$best_co=="PE")] <- "Peru"
tmp$best_co[which(tmp$best_co=="CR")] <- "CostaRica"
tmp$best_co[which(tmp$best_co=="PA")] <- "Panama"
tmp$best_co[which(tmp$best_co=="AT")] <- "Austria"
tmp$best_co[which(tmp$best_co=="BN")] <- "Brunei"
tmp$best_co[which(tmp$best_co=="TR")] <- "Turkey"
tmp$best_co[which(tmp$best_co=="UA")] <- "Ukraine"
tmp$best_co[which(tmp$best_co=="GH")] <- "Ghana"
tmp$best_co[which(tmp$best_co=="TW")] <- "Taiwan"
tmp$best_co[which(tmp$best_co=="ID")] <- "Indonesia"
tmp$best_co[which(tmp$best_co=="NC")] <- "NewCaledonia"
tmp$best_co[which(tmp$best_co=="CY")] <- "Cyprus"
tmp$best_co[which(tmp$best_co=="LV")] <- "Latvia"
tmp$best_co[which(tmp$best_co=="EE")] <- "Estonia"
tmp$best_co[which(tmp$best_co=="SG")] <- "Singapore"
tmp$best_co[which(tmp$best_co=="LB")] <- "Lebanon"
tmp$best_co[which(tmp$best_co=="LR")] <- "Liberia"
tmp$best_co[which(tmp$best_co=="AG")] <- "AntiguaBarbuda"
tmp$best_co[which(tmp$best_co=="BS")] <- "Bahamas"
tmp$best_co[which(tmp$best_co=="MY")] <- "Malaysia"
tmp$best_co[which(tmp$best_co=="HR")] <- "Croatia"
tmp$best_co[which(tmp$best_co=="Argentine Republic")] <- "Argentina"
tmp$best_co[which(tmp$best_co=="Russian Federation")] <- "Russia"
tmp$best_co<-trimws(tmp$best_co)
tmp$best_co<-gsub(" ","",tmp$best_co)
data <- tmp
data$best_co[which(data$best_co == "NZ")] <- "NewZealand"
data$best_co[which(data$best_co == "GR")] <- "Greece"
data$best_co[which(data$best_co == "AU")] <- "Australia"
data$best_co[which(data$best_co == "NL")] <- "Netherlands"
data$best_co[which(data$best_co == "HK")] <- "HongKong"
data$best_co[which(data$best_co == "JP")] <- "Japan"
data$best_co[which(data$best_co == "NO")] <- "Norway"
data$best_co[which(data$best_co == "CL")] <- "Chile"
data$best_co[which(data$best_co == "FR")] <- "France"
data$best_co[which(data$best_co == "BR")] <- "Brazil"
data$best_co[which(data$best_co == "PL")] <- "Poland"
data$best_co[which(data$best_co == "BE")] <- "Belgium"
data$best_co[which(data$best_co == "CN")] <- "China"
data$best_co[which(data$best_co == "FI")] <- "Finland"
data$best_co[which(data$best_co == "PH")] <- "Philippines"
data$best_co[which(data$best_co == "CH")] <- "Switzerland"
data$best_co[which(data$best_co == "CU")] <- "Cuba"
data$best_co[which(data$best_co == "TZ")] <- "Tanzania"
data$best_co[which(data$best_co == "GDR")] <- "EastGermany"
data$best_co[which(data$best_co == "FRG")] <- "WestGermany"
data$best_co[which(data$best_co == "TH")] <- "Thailand"
data$best_co[which(data$best_co == "LK")] <- "SriLanka"
data$best_co[which(data$best_co == "RussianFederation")] <- "Russia"
data$best_co[which(data$best_co == "CS")] <- NA
data$best_co[which(data$best_co == "SA")] <- "SaudiArabia"
data$best_co[which(data$best_co == "KE")] <- "Kenya"
data$best_co[which(data$best_co == "DD")] <- "EastGermany"
data$best_co[which(data$best_co == "PM")] <- "StPandM"
data$best_co[which(data$best_co == "PK")] <- "Pakistan"
data$best_co[which(data$best_co == "BD")] <- "Bangladesh"
data$best_co[which(data$best_co == "BG")] <- "Bulgaria"
data$best_co[which(data$best_co == "XX")] <- NA
data$best_co[which(data$best_co == "OT")] <- NA
data$best_co[which(data$best_co == "NU")] <- NA
data$best_co[which(data$best_co == "EA")] <- NA
data$best_co[which(data$best_co == "ZY")] <- NA
data$best_co[which(data$best_co == "ZZ")] <- NA
data$best_co[which(data$best_co == "SovietUnion")] <- "USSR"
data$best_co[which(data$best_co == "Russia" & iyear < 1991 & iyear > 1922)] <- "USSR"
data$best_co[which(data$best_co == "Ukraine" & iyear < 1991 & iyear > 1922)] <- "USSR"
data$best_co[which(data$best_co == "Latvia" & iyear < 1991 & iyear > 1922)] <- "USSR"
data$best_co[which(data$best_co == "Belarus" & iyear < 1991 & iyear > 1922)] <- "USSR"
data$best_co[which(data$best_co == "Estonia" & iyear < 1991 & iyear > 1922)] <- "USSR"
data$best_co[which(data$best_co == "Kazakhstan" & iyear < 1991 & iyear > 1922)] <- "USSR"
data$best_co[which(data$best_co == "Moldova" & iyear < 1991 & iyear > 1922)] <- "USSR"
data$best_co[which(data$best_co == "Uzbekistan" & iyear < 1991 & iyear > 1922)] <- "USSR"
data$best_co[which(data$best_co == "Lithuania" & iyear < 1991 & iyear > 1922)] <- "USSR"
data$best_co[which(data$best_co == "USSR" & iyear < 1922)] <- "Russia"
data$best_co[which(data$best_co == "Ukraine" & iyear < 1922)] <- "Russia"
data$best_co[which(data$best_co == "WestGermany" & iyear < 1945 )] <- "Germany"
data$best_co[which(data$best_co == "EastGermany" & iyear < 1945 )] <- "Germany"
data$best_co[which(data$best_co == "RepKorea" & iyear < 1945 )] <- "Korea"
data$best_co[which(data$best_co == "DPRK" & iyear < 1945 )] <- "Korea"
data$best_co[which(data$best_co == "WestGermany" & iyear > 1990 )] <- "Germany"
data$best_co[which(data$best_co == "EastGermany" & iyear > 1990 )] <- "Germany"
data$best_co[which(data$best_co == "Namibia" & data$dck == 780 )] <- NA
data$best_co[which(data$best_co == "Namibia" & data$dck == 782 )] <- NA
data$best_co[which(data$best_co == "" )] <- NA
# }
if ( FALSE ) {
#print(table(data$best_co,data$dck,useNA='ifany'))
cat("-----------------------------------------------","\n")
#print(addmargins(table(data$dck,data$best_co,useNA='ifany')))
if ( sum(!is.na(data$best_co)) > 0 ) {
#print(addmargins(table(data$best_co)))
#print(table(!is.na(data$best_co)))
cat(iyear,"percent with cc: ",round(sum(!is.na(data$best_co))/dim(data)[1],2)*100,'\n')
cat("-----------------------------------------------","\n")
cat("best_co","\n")
cat("-----------------------------------------------","\n")
print(table(data$best_co,useNA='ifany'))
cat("-----------------------------------------------","\n")
cat("c1","\n")
cat("-----------------------------------------------","\n")
print(table(data$c1,useNA='ifany'))
cat("-----------------------------------------------","\n")
cat("recruiting_country","\n")
cat("-----------------------------------------------","\n")
print(table(data$recruiting_country,useNA='ifany'))
cat("-----------------------------------------------","\n")
cat("dck","\n")
cat("-----------------------------------------------","\n")
print(table(data$dck,useNA='ifany'))
}
cat("-----------------------------------------------","\n")
cat("-----------------------------------------------","\n")
}
data$cflag <- NA
data$cflag <- ifelse ( !is.na(data$c1) & !(data$c1 %in% c("XX","OT","NU","EA","ZY","ZZ","99")),1,data$cflag)
data$cflag <- ifelse ( is.na(data$cflag) & !is.na(data$recruiting_country),2,data$cflag)
data$cflag <- ifelse ( is.na(data$cflag) & data$dck == 780 ,3,data$cflag)
data$cflag <- ifelse ( is.na(data$cflag) & data$dck == 782 ,3,data$cflag)
data$cflag <- ifelse ( is.na(data$cflag) & data$dck == 705 ,3,data$cflag)
data$cflag <- ifelse ( is.na(data$cflag) & data$dck == 706 ,3,data$cflag)
data$cflag <- ifelse ( is.na(data$cflag) & data$dck == 707 ,3,data$cflag)
data$cflag <- ifelse ( is.na(data$cflag) & data$dck %in% c(249,204,205,211,216,229,239,245) ,4,data$cflag)
data$cflag <- ifelse ( is.na(data$cflag) & data$dck %in% c(110,116,195,667,703,708,710,874) ,4,data$cflag)
data$cflag <- ifelse ( is.na(data$cflag) & data$dck %in% c(186) ,4,data$cflag)
data$cflag <- ifelse ( is.na(data$cflag) & data$dck %in% c(731,733,735) ,4,data$cflag)
data$cflag <- ifelse ( is.na(data$cflag) & data$dck %in% c(188,702) ,4,data$cflag)
data$cflag <- ifelse ( is.na(data$cflag) & data$dck %in% c(750,900) ,4,data$cflag)
data$cflag <- ifelse ( is.na(data$cflag) & data$dck %in% c(899) ,4,data$cflag)
data$cflag <- ifelse ( is.na(data$cflag) & data$dck %in% c(118,119,761,762) ,4,data$cflag)
data$cflag <- ifelse ( is.na(data$cflag) & !is.na(data$ituco),9,data$cflag)
# remove country designation if # rep < 30 per month & only from callsign
few.co <- names(table(data$best_co))[which(table(data$best_co) < 30)]
few <- subset(data,best_co %in% few.co)
data$best_co <- ifelse(data$best_co %in% few.co & data$cflag == 9, NA, data$best_co)
# remove country designation if # rep < 10 per month
few.co <- names(table(data$best_co))[which(table(data$best_co) < 10)]
few <- subset(data,best_co %in% few.co)
data$best_co <- ifelse(data$best_co %in% few.co, NA, data$best_co)
data$best_co <- ifelse(data$best_co == "GF", "FrenchGuiana", data$best_co)
data$best_co <- ifelse(is.na(data$best_co) & data$id.ok=="callsign" & substr(data$track.id,1,2) == "9V", "Malta", data$best_co)
data$cflag <- ifelse ( is.na(data$best_co),NA,data$cflag)
#print(table(data$track.id[data$id.ok=="callsign" & is.na(data$best_co)]))
#fn.out <- paste0("/noc/mpoc/surface_data/TRACKING/FinalTrack/COUNTRY/",iyear,".",imonth,".Rda")
#for.out <- data[,c("yr","mo","dck","sid","uid","c1","c1m","recruiting_country","id","track.id","best_co","cflag")]
#for.out <- unique(for.out)
#saveRDS(for.out,fn.out)
return(data[,c("uid","best_co","cflag")])
}
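# Assumed usage (illustrative; monthly_df is a hypothetical data frame of reports):
# cc<-assign_country(monthly_df)
# monthly_df<-merge(monthly_df,cc,by="uid",all.x=TRUE)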
check_squares <- function(dfg, dfb, nmin=1, do.plots=FALSE) {
vnames<-c("sst","at","slp","dpt")
v2<- c("f","f2","nm")
nn<-c(paste0(rep(vnames,each=length(v2)),rep(v2,times=length(vnames))))
op<-data.frame(t(rep(0,times=length(nn))))
names(op)<-nn
maskb<-dfb$dck==732 & is.na(dfb$os)
if(sum(maskb) == 0 ) return(op)
if(sum(!maskb) == 0 ) return(op)
maskb2<-dfb$dck==732 & !is.na(dfb$os)
maskg<-dfg$dck==732
if(do.plots)par(mfrow=c(2,2))
for ( vname in vnames ) {
op[paste0(vname,"nm")]<-min(sum(!is.na(dfb[maskb,vname])),sum(!is.na(dfb[!maskb,vname])))
# doesn't work if both distributions have zero range
if ( sum(!is.na(dfb[maskb,vname])) > 0 & sum(!is.na(dfb[!maskb,vname])) > 0 ) {
rr1<-range(dfb[maskb,vname],na.rm=T)[2]-range(dfb[maskb,vname],na.rm=T)[1]
rr2<-range(dfb[!maskb,vname],na.rm=T)[2]-range(dfb[!maskb,vname],na.rm=T)[1]
} else {
rr1<-0
rr2<-0
}
nn1<-sum(!is.na(dfb[maskb,vname]))
nn2<-sum(!is.na(dfb[!maskb,vname]))
if ( rr1 > 0 | rr2 > 0 ) {
if ( min(nn1,nn2) > 1 ) {
test.t<-t.test(dfb[maskb,vname],dfb[!maskb,vname])
fail.t<-if ( test.t$conf.int[2] < 0 | test.t$conf.int[1] > 0 ) 1 else 0
} else {
fail.t <- (-9)
}
test.wil<-wilcox.test(dfb[maskb,vname],dfb[!maskb,vname],exact=F,conf.int=T)
fail.wil<- if ( test.wil$conf.int[2] < 0 | test.wil$conf.int[1] > 0 ) 1 else 0
if ( do.plots ) {
if ( min(nn1,nn2) > 1 ) {
boxplot(dfb[vname]~maskb,varwidth=T,main=paste(vname,fail.t,fail.wil,sum(!is.na(dfb$sst[maskb]))))
} else {
plot.new()
}
}
} else {
fail.t <- (-9)
fail.wil <- (-9)
if ( do.plots ) plot.new()
}
op[paste0(vname,"f")]<-fail.t
op[paste0(vname,"f2")]<-fail.wil
}
return(op)
}
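# A minimal usage sketch with synthetic data (illustrative values only; wrapped in
# if ( FALSE ) so it never runs as part of the processing):
if ( FALSE ) {
set.seed(1)
dfb_demo<-data.frame(dck=rep(c(732,555),each=20),os=NA,
sst=c(rnorm(20,20),rnorm(20,15)),at=rnorm(40,18),
slp=rnorm(40,1013),dpt=rnorm(40,12))
dfg_demo<-data.frame(dck=rep(732,20),sst=rnorm(20,20),at=rnorm(20,18),
slp=rnorm(20,1013),dpt=rnorm(20,12))
# sstf should come back as 1 (means differ by ~5 degC); the other flags should be 0
print(check_squares(dfg_demo,dfb_demo))
}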
y5<-seq(-90,90,5)
y5.2<-seq(-87.5,87.5,5)
x5<-seq(0,360,5)
x5.2<-seq(2.5,358,5)
#mboxes<-readRDS("squares_mormet.Rda")
#mboxes$source<-"M"
iboxes<-readRDS("squares_icoads.Rda")
iboxes$source<-"I"
sure<-subset(iboxes,dck!="m732")
sure$lon.grad<-ifelse(sure$lon.grad %in% c(-1,1),sure$lon.grad,NA)
sure$lat.grad<-ifelse(sure$lat.grad %in% c(-1,1),sure$lat.grad,NA)
amb<-subset(iboxes,dck=="m732")
source("check_squares.R")
if ( TRUE ) {
#df.full<-icoads.utils::read_trackmonth("/noc/mpoc/surface_data/D732pairs/NEW3",syr=1950,eyr=2020)
ff<-list.files("/noc/mpoc/surface_data/D732pairs/NEW3",full.names=T)
sp<-lapply(ff,readRDS)
sp<-sp[sapply(sp,nrow)>0]
sp<-sp[sapply(sp,ncol)==55]
df.full<-do.call(rbind,sp)
y5<-seq(-90,90,5)
y5.2<-seq(-87.5,87.5,5)
x5<-seq(0,360,5)
x5.2<-seq(2.5,358,5)
#df.full$ppos<-paste(df.full$lat5,df.full$i.lat5,df.full$lon5,df.full$i.lon5,sep="_")
pp<-subset(df.full,lat5!=i.lat5 | lon5!=i.lon5)
}
if ( FALSE ) {
# this to check whether we identify any new boxes using
# the original mormet data - seems not
mor.amb<-mboxes
for ( ib in 1:nrow(sure) ) {
a<-which((mboxes$lat5==sure$lat5[ib] & mboxes$lon5==sure$lon5[ib]) | (mboxes$lat5==sure$i.lat5[ib] & mboxes$lon5==sure$i.lon5[ib]))
b<-which((mboxes$i.lat5==sure$lat5[ib] & mboxes$i.lon5==sure$lon5[ib]) | (mboxes$i.lat5==sure$i.lat5[ib] & mboxes$i.lon5==sure$i.lon5[ib]))
irow<-a[which(b%in%a)]
cat(ib,irow,"\n")
mor.amb<-mor.amb[-irow,]
}
}
for ( iyear in min(iboxes$syr):max(iboxes$eyr) ) {
#for ( iyear in 1974 ) {
#for ( iyear in 1967:1974 ) {
#for ( iyear in 1962:max(iboxes$eyr) ) {
#for ( iyear in 1975:max(iboxes$eyr)) {
cat(iyear," ")
for ( imo in 1:12) {
cat(imo," ")
cmo<-if(imo<10)paste0("0",imo) else imo
#cat(imo,"\n")
#for ( imo in 1) {
fn<-paste0("MormetWithUid/",iyear,"_",cmo,".Rda")
df<-readRDS(fn)
df$lat5<-y5.2[findInterval(df$lat,y5,all.inside=T)]
df$lon5<-x5.2[findInterval(df$lon,x5,all.inside=T)]
df$dlat5<-df$lat-df$lat5
df$dlon5<-df$lon-df$lon5
df$uid<-ifelse(is.na(df$uid),df$mor.uid,df$uid)
fn2<-paste0("/noc/mpoc/surface_data/ICOADS_R3_PROC/INPUT_Rda/r3.0.0/ICOADS_R3.0.0T_",iyear,"-",cmo,".Rda")
ico<-readRDS(fn2)
ico<-subset(ico,!(dck %in% c(731,732)))
df<-icoads.utils::rbind_all_columns(df,ico)
num<-sum(df$dck==732 & is.na(df$os))
fill<-rep(-9,times=num)
if ( length(df$uid[df$dck==732 & is.na(df$os)]) == 0 ) next
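# fill = -9 marks checks that could not be evaluated (too few reports or zero-range
# distributions); the reject logic further down treats -9 differently from 0/1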
flags<-data.frame(uid=df$uid[df$dck==732 & is.na(df$os)],badBox=fill,susBox=fill,badDate=fill,match=fill,regExl=fill,diff.sst=fill,diff.at=fill,diff.slp=fill,diff.dpt=fill,reject=fill,new.lat=NA,new.lon=NA)
sub<-subset(df,dck==732 & is.na(os))
nmin<-5
# for these we know which box is the correct one
for ( ib in 1:nrow(sure) ) {
flags$badBox<-ifelse(sub$lat5==sure$i.lat5[ib] & sub$lon5==sure$i.lon5[ib],1,flags$badBox)
flags$badDate<-ifelse(sub$lat5==sure$i.lat5[ib] & sub$lon5==sure$i.lon5[ib] & sub$yr >= sure$syr[ib] & sub$yr <= sure$eyr[ib],1,flags$badBox)
test.bad<-subset(df,lat5==sure$i.lat5[ib]&lon5==sure$i.lon5[ib])
mm<-test.bad$dck==732 & is.na(test.bad$os)
if ( max(sum(mm),sum(!mm)) < nmin ) next
test.bad$new.lat<-sure$lat5[ib]+test.bad$dlat*sure$lat.grad[ib]
test.bad$new.lon<-sure$lon5[ib]+test.bad$dlon*sure$lon.grad[ib]
test.move<-subset(df,lat5==sure$lat5[ib]&lon5==sure$lon5[ib])
#cat(ib,"sure",sure$i.lat5[ib],sure$i.lon5[ib],"\n")
#print(table(test.bad$dck,test.bad$os,useNA='ifany'))
#if(nrow(test.move) > 20 & nrow(test.bad) > 20 ) stop()
dum<-check_squares(test.move,test.bad,nmin=nmin,do.plots=FALSE)
#if ( max(dum) > 0 ) print(dum)
if(dum$sstf%in% c(0,1)) flags$diff.sst <- ifelse(flags$uid %in% test.bad$uid, dum$sstf, flags$diff.sst)
if(dum$atf%in% c(0,1)) flags$diff.at <- ifelse(flags$uid %in% test.bad$uid, dum$atf, flags$diff.at)
if(dum$slpf%in% c(0,1)) flags$diff.slp <- ifelse(flags$uid %in% test.bad$uid, dum$slpf, flags$diff.slp)
if(dum$dptf%in% c(0,1)) flags$diff.dpt <- ifelse(flags$uid %in% test.bad$uid, dum$dptf, flags$diff.dpt)
#if(max(dum)==0)next
#flags<-merge(flags,test.bad[,c("uid","new.lat","new.lon")],by="uid",all.x=T)
for ( iuid in test.bad$uid[test.bad$dck==732&is.na(test.bad$os)] ) {
mmf<-which(flags$uid==iuid)
mmb<-which(test.bad$uid==iuid)
flags$new.lat[mmf]<-test.bad$new.lat[mmb]
flags$new.lon[mmf]<-test.bad$new.lon[mmb]
}
}
# problem region in Indian Ocean
flags$regExl<-ifelse(sub$lon >= 70 & sub$lon <= 80 & sub$yr >= 1975 & sub$yr <= 1981, 1, flags$regExl)
# flag if match in pp file
flags$match<-ifelse(flags$uid %in% pp$i.uid,1,flags$match)
flags$match<-ifelse(flags$uid %in% pp$uid,1,flags$match)
#flags$reject<-ifelse(flags$badBox==1|flags$susBox==1|flags$badDate==1|flags$match==1|flags$regExl==1|flags$susDate==1,1,0)
#flags$reject<-ifelse(flags$badDate==1|flags$match==1|flags$regExl==1|flags$susDate==1,1,0)
flags$reject<-ifelse(flags$match==1|flags$regExl==1,1,0)
flags$reject<-ifelse(flags$diff.sst==1|flags$diff.at==1|flags$diff.slp==1|flags$diff.dpt==1,1,flags$reject)
flags$reject<-ifelse((flags$diff.sst==-9|flags$diff.at==-9|flags$diff.slp==-9|flags$diff.dpt==-9)&(flags$badBox==1|flags$susBox==1|flags$badDate==1),1,flags$reject)
saveRDS(flags,paste0("testD732flags/",iyear,"_",imo,".Rda"))
} # end loop over month
cat("\n")
} # end loop over year
y5<-seq(-90,90,5)
y5.2<-seq(-87.5,87.5,5)
x5<-seq(0,360,5)
x5.2<-seq(2.5,358,5)
#for ( iyear in 1888:1995 ) {
#for ( iyear in 1888:1995 ) {
for ( iyear in 1940:1995 ) {
cat(iyear,": ")
for ( imo in 1:12 ) {
cat(imo," ")
#fn<-paste0("/noc/mpoc/surface_data/ICOADS_R3_PROC/RUN_ZERO/output_data/HOOVER_RUNZERO/",iyear,".",imo,".Rda")
#if(!file.exists(fn))next
#pig<-readRDS(fn)
cmo<-if(imo<10)paste0("0",imo) else imo
fn.mor<-paste0("/noc/users/eck/TESTING/D732/MormetWithUid/",iyear,"_",cmo,".Rda")
if ( !file.exists(fn.mor)) next
mor<-readRDS(fn.mor)
mor$dck<-paste0("m",mor$dck)
fn2<-paste0("/noc/mpoc/surface_data/ICOADS_R3_PROC/INPUT_Rda/r3.0.0/ICOADS_R3.0.0T_",iyear,"-",cmo,".Rda")
pig<-readRDS(fn2)
#if(!("os" %in% names(tmp)))tmp$os<-NA
#if(!("op" %in% names(tmp)))tmp$op<-NA
#pig<-merge(pig,tmp[,c("uid","os","op")])
pig<-subset(pig,!(dck %in% c(731,732)))
pig$dck<-paste0("d",pig$dck)
pig2<-icoads.utils::rbind_all_columns(pig,mor)
pig2<-icoads.utils::add_date2(pig2)
#function (df.in, key.list, close.list, tol.id = 2, tol.pos = 1,
# tol.yr = 0, tol.mo = 0, tol.dy = 0, tol.hr = 0, tol.sst = 1,
# tol.slp = 1, tol.at = 1, tol.dpt = 1, tol.w = 1, tol.d = 1,
# tol.ww = 0, tol.w1 = 0, tol.vv = 0, tol.n = 0)
#pig2<-pig[,c("date","uid","yr","mo","dy","hr","lat","lon","dck","orig.id","track.id","sst","slp","at","w","os","op")]
pig2<-pig2[,c("date","uid","yr","mo","dy","hr","lat","lon","dck","id","sst","slp","at","w","d","os","op","tape","mor.uid")]
#v.want<-c("date","uid","yr","mo","dy","hr","lat","lat5","lon","lon5","dck","id","sst","slp","at","w","lon.pos5","lat.pos5","os","op")
pig2$lat5<-y5.2[findInterval(pig2$lat,y5,all.inside=T)]
pig2$lon5<-x5.2[findInterval(pig2$lon,x5,all.inside=T)]
pig2$lat.pos5<-pig2$lat-pig2$lat5
pig2$lon.pos5<-pig2$lon-pig2$lon5
pig2$alat.pos5<-abs(pig2$lat.pos5)
pig2$alon.pos5<-abs(pig2$lon.pos5)
sp<-split(pig2,format(pig2$date,"%j"))
#op<-lapply(sp,icoads.utils::get_mis,key.list=c("yr","mo","dy","hr"),close.list=c("sst","slp","at","w"),tol.sst=0.6,tol.at=0.6,tol.slp=0.6,tol.w=3.0)
op<-lapply(sp,icoads.utils::get_mis,key.list=c("yr","mo","dy","hr","alat.pos5","alon.pos5"),close.list=c("sst","slp","at","w"),tol.sst=0.6,tol.at=0.6,tol.slp=0.6,tol.w=3.0)
fn.out<-paste0("/noc/mpoc/surface_data/D732pairs/NEW3/",iyear,"_",imo,".Rda")
op<-op[sapply(op,nrow)>0]
if ( length(op) == 0 ) {
if (file.exists(fn.out)) file.remove(fn.out)
next
}
pp<-do.call(rbind,op)
if (nrow(pp) == 0 ) {
if (file.exists(fn.out)) file.remove(fn.out)
next
}
pp2<-subset(pp,dck=="m732"|i.dck=="m732")
if (nrow(pp2) == 0 ) {
if (file.exists(fn.out)) file.remove(fn.out)
next
}
rownames(pp2)<-NULL
cat("saving",fn.out,"\n")
saveRDS(pp2,fn.out)
#stop()
}
cat("\n")
}
require(ncdf4)
#nc<-nc_open("/gws/nopw/j04/glosat/development/data/results/NMAT/ECK/version_2/OCEAN_BASIN/data.nc")
nc<-nc_open("/noc/mpoc/surface_data/ICOADS_R3_PROC/OCEAN_BASIN/data.nc")
Xlon<-ncvar_get(nc,"X")
Ylat<-ncvar_get(nc,"Y")
Z<-ncvar_get(nc,"Z")
basin<-ncvar_get(nc,"basin")
bb<-ncatt_get(nc,"basin","CLIST")$value
nc_close(nc)
bb<-gsub("\\[","",bb)
bb<-gsub("\\]","",bb)
bb<-gsub("\\) \\(",",",bb)
bb<-gsub(" ","",bb)
bb<-gsub("\\(","",bb)
bb<-gsub("\\)","",bb)
bb<-read.table(text=bb,sep=",",header=F)
names(bb)<-bb[1,]
bb<-names(bb)
tmp<-basin[,,1]
basin<-basin[,,1]
basin[1:180,]<-tmp[181:360,]
basin[181:360,]<-tmp[1:180,]
basin_name<-basin
basin_name<-ifelse(!is.na(basin),bb[basin],NA)
#!/usr/bin/env Rscript
my_args = commandArgs(trailingOnly=TRUE)
#code_dir="/gws/nopw/j04/glosat/NMAT/ECK/HEIGHT_ENS/"
code_dir="/noc/mpoc/surface_data/ICOADS_R3_PROC/"
#data_dir="/gws/nopw/j04/glosat/development/data/interim/HOSTACE_PROC/FINAL_PROC"
data_dir="/noc/mpoc/surface_data/ICOADS_R3_PROC/RUN_ZERO/output_data/FINAL_PROC"
#pub47_dir="/gws/nopw/j04/glosat/development/data/interim/HOSTACE_PROC/HOSTACE_PROC_ANC_INFO/PUB47byMONTH"
#out_dir="./ENS_HTS/"
out_dir="."
out_grid_dir="/noc/mpoc/surface_data/ICOADS_R3_PROC/RUN_ZERO/output_data/GRID_HTS/"
#source("/gws/nopw/j04/glosat/NMAT/ECK/version_2/get_basin.R")
source(paste0(code_dir,"get_basin.R"))
#--------------------------------------------------------------------------------
Sys.setenv(TZ='GMT')
#--------------------------------------------------------------------------------
options(warn=1)
if ( length(my_args) < 2 ) {my_args<-my_args2}
for ( year in my_args[1]:my_args[2] ) {
cat("--------------------------------------------------","\n")
cat("Generating default grids for",year,"\n")
cat("--------------------------------------------------","\n")
flist<-list.files(data_dir,pattern=as.character(year),full.names=T)
if ( length(flist) == 0 ) {
data_dir2<-gsub("FINAL_PROC","HOOVER_RUNZERO",data_dir)
flist<-list.files(data_dir2,pattern=as.character(year),full.names=T)
data$final.id<-data$track.id
}
sp<-lapply(flist,readRDS)
data<-do.call(rbind,sp)
#data<-subset(data,mo==12)
fmiss<-round(sum(is.na(data$final.id))/nrow(data)*100,1)
cat("n.good=",nrow(data),"miss id=",fmiss,"%\t","\n")
data$lon180<-ifelse(data$lon>=180,data$lon-360,data$lon)
data$ix10<-floor((data$lon180+180)*0.1)+1
data$iy10<-floor((data$lat+90)*0.1)+1
data$ix5<-floor((data$lon180+180)*0.2)+1
data$iy5<-floor((data$lat+90)*0.2)+1
data$ix2<-floor((data$lon180+180)*0.5)+1
data$iy2<-floor((data$lat+90)*0.5)+1
data$ix1<-floor((data$lon180+180)*1.0)+1
data$iy1<-floor((data$lat+90)*1.0)+1
ind<-array(NA,dim=c(nrow(data),2))
ind[,1]<-ifelse(data$ix1<=360,data$ix1,360)
ind[,2]<-ifelse(data$iy1<=180,data$iy1,180)
data$basin<-basin_name[ind]
data$basin<-ifelse(is.na(data$basin),"coastal",data$basin)
basin5<-array(NA,c(72,36))
basin5n<-array(NA,c(72,36))
for ( ix in seq(1,359,5) ) {
for ( iy in seq(1,179,5) ) {
#sub<-basin_name[ix:(ix+4),iy:(iy+4)]
sub<-basin[ix:(ix+4),iy:(iy+4)]
subn<-basin_name[ix:(ix+4),iy:(iy+4)]
if(sum(!is.na(sub)) == 0 ) next
if(length(table(sub))>1) {
tt<-sort(table(sub))
if(tt[1]/sum(!is.na(sub))<0.25 & length(tt) == 2 ) {
#print(tt)
basin5[ceiling(ix/5),ceiling(iy/5)]<-as.numeric(names(tt)[2])
basin5n[ceiling(ix/5),ceiling(iy/5)]<-names(table(subn))[2]
} else {
#print(table(sub))
next
}
} else {
basin5[ceiling(ix/5),ceiling(iy/5)]<-as.numeric(names(table(sub)))
basin5n[ceiling(ix/5),ceiling(iy/5)]<-names(table(subn))
}
}
}
cat("got basins","\n")
#hts.by.id <- read.table(paste0("ENS_HTS/htbyid_",year,".psv.gz"),header=F,sep="|")
if ( file.exists(paste0("/noc/mpoc/surface_data/ICOADS_R3_PROC/RUN_ZERO/output_data/ENS_HTS/htbyid_",year,".psv")) ) {
hts.by.id <- read.table(paste0("/noc/mpoc/surface_data/ICOADS_R3_PROC/RUN_ZERO/output_data/ENS_HTS/htbyid_",year,".psv"),header=F,sep="|",comment.char="",quote=FALSE)
cat("got ID hts","\n")
} else {
cat('no height file, skipping',year,"\n")
next
}
#hts.by.uid <- read.table(paste0("ENS_HTS/htbyuid_",year,".psv.gz"),header=F,sep="|")
hts.by.uid <- read.table(paste0("/noc/mpoc/surface_data/ICOADS_R3_PROC/RUN_ZERO/output_data/ENS_HTS/htbyuid_",year,".psv"),header=F,sep="|")
cat("got UID hts","\n")
h.id<-data.frame(final.id=hts.by.id$V1,ht=apply(hts.by.id[,2:ncol(hts.by.id)],c(1),mean),ht.sd=apply(hts.by.id[,2:ncol(hts.by.id)],c(1),sd))
h.uid<-data.frame(uid=unique(hts.by.uid$V1),ht=NA,ht.sd=NA)
sp<-split(hts.by.uid,hts.by.uid$V1)
h.uid$ht<-sapply(sp,function(X) mean(X$V4))
h.uid$ht.sd<-sapply(sp,function(X) sd(X$V4))
data<-merge(data,h.id,all.x=T)
data.noid<-subset(data,is.na(ht))
data.noid$ht<-NULL
data.noid$ht.sd<-NULL
data.id<-subset(data,!is.na(ht))
data.noid<-merge(data.noid,h.uid,all.x=T)
data<-rbind(data.id,data.noid)
# hts by 5 deg box
data$tmp<-paste0(data$ix5,"_",data$iy5)
sp<-split(data,data$tmp)
ht<-data.frame(ht=sapply(sp,function(X) round(mean(X$ht,na.rm=T),2)))
ht$tmp<-rownames(ht)
#ht.sd<-data.frame(sd=sapply(sp,function(X) round(sd(X$at.ht,na.rm=T),2)))
ht.sd<-data.frame(sd=sapply(sp,function(X) round((quantile(X$ht,5/6,na.rm=T)-quantile(X$ht,1/6,na.rm=T))*0.5,2)))
ht.num<-data.frame(num=sapply(sp,function(X) sum(!is.na(X$ht))))
stri<-".ix5.iy5"
ht.app<-data.frame(tmp=ht$tmp,ht=ht$ht,sd=ht.sd$sd,num=ht.num$num)
ht.app<-subset(ht.app,!is.na(ht.app$ht))
ht.app<-subset(ht.app,ht.app$num>50)
names(ht.app)[2:4]<-paste0(names(ht.app)[2:4],stri)
ht.5x5<-ht.app
tmp<-strsplit(ht.5x5$tmp,"_")
want<-array(NA,c(nrow(ht.5x5),2))
want[,1]<-sapply(tmp,function(X) as.numeric(X[1]))
want[,2]<-sapply(tmp,function(X) as.numeric(X[2]))
# start off with 5x5 values
op.ht<-array(NA,c(72,36))
op.ht[want]<-ht.5x5$ht.ix5.iy5
op.sd<-array(NA,c(72,36))
op.sd[want]<-ht.5x5$sd.ix5.iy5
op.num<-array(NA,c(72,36))
op.num[want]<-ht.5x5$num.ix5.iy5
#op.ht<-ifelse(is.na(op.ht),bas10.ht,op.ht)
#op.ht<-ifelse(is.na(op.ht),bas.ht,op.ht)
#op.ht<-ifelse(is.na(op.ht),lat10.ht,op.ht)
#op.sd<-ifelse(is.na(op.sd),bas10.sd,op.sd)
#op.sd<-ifelse(is.na(op.sd),bas.sd,op.sd)
#op.sd<-ifelse(is.na(op.sd),lat10.sd,op.sd)
#
# infill southern ocean
for ( i in 35:1 ) {
if(sum(!is.na(op.ht[,i]))==0 ) {
#cat(i,sum(!is.na(op.ht[,i])),"\n")
op.ht[,i]<-op.ht[,i+1]
op.sd[,i]<-min(op.sd[,i+1],2)
}
}
saveRDS(op.ht,paste0(out_grid_dir,"deg5_ht_",year,".Rda"))
saveRDS(op.sd,paste0(out_grid_dir,"deg5_sd_",year,".Rda"))
saveRDS(op.num,paste0(out_grid_dir,"deg5_num_",year,".Rda"))
} # end of loop over years
#!/usr/bin/env Rscript
# code written by Elizabeth Kent, National Oceanography Centre
# code written for the NERC GloSAT project
# Description:
## IMPORTANT: all paths to all data are defined in config.yml
## input_data:
# 1. arguments (e.g. c(1990, 1990, "pre", "none"))
# 2. output_data/HOOVER_RUNZERO/
# my_args[1] is start year
# my_args[2] is end year
# my_args[3] is "pre" or "post" tracking
# my_args[4] dcks to select, either "none" or e.g. "123,124"
## Outputs:
# output_data/DISJTR_NEW (tr_out and stat_out files)
# and, optionally, duplicate text files to
# output_data/PAIR_DUPFILES
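## Example invocation (illustrative; matches the csh driver at the end of this
## commit): Rscript run_disj_track2.R 1950 1950 "post" "none"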
my_args = commandArgs(trailingOnly=TRUE)
source("~eck/keeping_going2.R")
if(length(my_args)<2)my_args<-my_args2
n.args<-length(my_args)
if(n.args< 2) {my_args<-my_args2}
n.args<-length(my_args)
if(n.args < 2 | n.args> 4) {stop("wrong number of input my_args or my_args2 not set")}
if(n.args == 2 & my_args[1] >= 1982 ) my_args[3] <- "pre"
if(n.args == 2 ) my_args[3] <- "post"
if(n.args < 4) my_args[4] <- "none"
config<-config::get(file = "config.yml")
exit <- function() { invokeRestart("abort") }
source("/noc/users/eck/Rscripts/read_rdsfiles.R")
if (!dir.exists(paste0(config$output_dir,"DISJTR_NEW"))) dir.create(paste0(config$output_dir,"DISJTR_NEW"))
print.comm <- FALSE
writeOutText = FALSE
read.in<-TRUE
keep.going<-TRUE
save.data<-TRUE
gen.plots<-FALSE
tot.rep<-12
f.add<-1
vv<-c("date","lon","lat","dck","sid","d","w","at","sst","slp","n")
vv2<-c(vv,"uid","orig.id","track.id","testid","id.ok")
min.len1 <- 12
min.len2<-ceiling(0.5*min.len1)
min.len3<-ceiling(0.3*min.len1)
min.len.use<-min.len1
stop.loop<-FALSE
syr<-as.numeric(my_args[1])
eyr<-as.numeric(my_args[2])
cat('---------------------------------------------------------',"\n")
cat("running grouping for",my_args[1],"to",my_args[2],my_args[3],"tracking \n")
cat('---------------------------------------------------------',"\n")
data<-icoads.utils::read_trackmonth(paste0(config$output_dir,"HOOVER_RUNZERO/"),syr=syr,eyr=eyr)
if ( is.null(data) ) {
cat('no data for',my_args[1],my_args[2],'skipping \n')
if ( my_args[1] != my_args[2] ) exit()
}
if ( nrow(data) == 0 ) {
cat('no data for',my_args[1],my_args[2],'skipping \n')
if ( my_args[1] != my_args[2] ) exit()
}
# needed if "pre"
if(!("orig.id") %in% names(data)) data$orig.id<-data$id
# fix some dates ...
for ( iv in names(data) ) {
if ( "POSIXlt" %in% class(data[,iv])) data[,iv]<-as.POSIXct(data[,iv])
#cat(iv,class(data[,iv]),"\n")
}
if ( my_args[4] != "none" ) {
dck2sel<-read.table(text=my_args[4],sep=",")
data<-subset(data,dck %in% dck2sel)
cat("using data dcks", my_args[4], "\n")
} else {
cat("using data from all dcks \n")
}
if(!("id.ok" %in% names(data) )) data$id.ok<-data$id.type
if(!("track.id" %in% names(data) )) data$track.id<-data$new.id
data$lon360<-data$lon
# disjoint groups needs lon +/- 180
data$lon<-ifelse(data$lon>=180,data$lon-360,data$lon)
data<-icoads.utils::calc_local(data)
if ( my_args[3] == "pre" ) {
data2<-subset(data,!(id.ok %in% c("callsign","name","coded","valid","assigned","fill","track","trackfill")))
} else {
data2<-subset(data,!(id.ok %in% c("callsign","name","coded","valid","assigned","fill")))
}
if ( max(data$yr) > 2000 ) {
data2<-subset(data,hr%%1 == 0)
data2<-subset(data2,format(data2$date,"%M")=="00")
data2<-subset(data2,format(data2$date,"%S")=="00")
} else {
data2<-data
}
n.tot<-nrow(data2)
#if(my_args[1] == "2020") data2<-subset(data2,mo==2 & dy < 15)
#data2<-subset(data2,mo==2)
cat('---------------------------------------------------------',"\n")
cat(my_args[1],"sampling type","\n")
print(addmargins(table(data2$dck,data2$samp.flag)))
cat('---------------------------------------------------------',"\n")
togroup<-subset(data2,!(id.ok %in% c("callsign","name","coded","valid","assigned","fill")))
if ( max(data2$yr) < 1900 ) togroup<-subset(togroup,!(dck %in% c(701,721,730) & id.ok=="few"))
if ( nrow(togroup) <= 4 ) {
cat('no data to track for',my_args[1],my_args[2],'skipping \n')
if ( my_args[1] != my_args[2] ) exit()
#next
}
if ( (syr < 1970 & length(my_args) == 2) | my_args[3] == "post" ) {
togroup<-subset(togroup,!(id.ok %in% c("track","trackfill")))
cat("removing data already tracked","\n")
if ( nrow(togroup) == 0 ) {
cat("no untracked data \n")
exit()
}
print(table(togroup$dck,togroup$id.ok))
}
stat<-icoads.utils::find_stations(togroup)
if ( !is.null(stat) ) {
cat("got",nrow(stat),"reports in",length(unique(stat$stat_loc)),"station positions \n ")
} else {
cat("no stations \n")
}
if(!is.null(stat)) togroup<-subset(togroup,!(uid %in% stat$uid))
#togroup<-subset(togroup,mo<=3)
cat("grouping",nrow(togroup),"reports \n")
togroup.old<-togroup
# subset to whole hour reports
togroup<-subset(togroup,as.numeric(format(togroup$date,"%H"))%%1==0)
# and remove hours with too much data (some days have huge # obs)
tt<-table(togroup$date)
lim<-quantile(tt,c(0.99))*2
tt<-tt[tt<=lim]
if ( length(tt) > 0 ) togroup<-subset(togroup,date %in% as.POSIXct(names(tt)))
togroup.early<-subset(togroup,yr<1855)
togroup.gmt<-subset(togroup,samp.flag!="L" & yr >= 1855)
togroup.loc<-subset(togroup,samp.flag=="L" & yr >= 1855)
# subset local data to 4 hours
togroup.loc<-subset(togroup.loc,as.numeric(format(togroup.loc$local,"%H"))%%4==0)
# now get midday d192 reports
togroup.mid<-subset(togroup.loc,dck %in% c(192) & as.numeric(format(togroup.loc$local,"%H"))==12)
togroup.loc<-subset(togroup.loc,!(uid %in% togroup.mid$uid))
if ( nrow(togroup.loc) > 0 ) togroup.loc<-togroup.loc[order(togroup.loc$loc),]
#stop()
poss.fill<-togroup
if(exists("tracked.check"))rm(tracked.check)
len.old<-0
num.grouped<-0
###############################################
## togroup is the dataframe of reports to group
###############################################
togroup.keep<-togroup
samp.type <- c("gmt","loc","mid","early")
#samp.type <- c("gmt","loc")
grouped.data<-data.frame(uid="",testid="")
months<-0
if ( syr >1950 & syr <= 1989 ) months <- seq(1,12)
#months<-2
for ( imo in months ) {
if ( imo != 0 ) {
cat('==================',"running for month",imo,"\n")
togroup<-subset(togroup.keep,mo==imo)
togroup.early<-subset(togroup,yr<1855)
togroup.gmt<-subset(togroup,samp.flag!="L" & yr >= 1855)
togroup.loc<-subset(togroup,samp.flag=="L" & yr >= 1855)
# subset local data to 4 hours
togroup.loc<-subset(togroup.loc,as.numeric(format(togroup.loc$local,"%H"))%%4==0)
# now get midday d192 reports
togroup.mid<-subset(togroup.loc,dck %in% c(192) & as.numeric(format(togroup.loc$local,"%H"))==12)
togroup.loc<-subset(togroup.loc,!(uid %in% togroup.mid$uid))
if ( nrow(togroup.loc) > 0 ) togroup.loc<-togroup.loc[order(togroup.loc$loc),]
poss.fill<-togroup
} else {
cat('------------------',"running for full year","\n")
}
for ( isamp in 1:length(samp.type) ) {
sampt<-samp.type[isamp]
if ( sampt == "gmt" & nrow(togroup.gmt) == 0 ) next
if ( sampt == "gmt" ) cat("got GMT reports",nrow(togroup.gmt),"\n")
if ( sampt == "loc" & nrow(togroup.loc) == 0 ) next
if ( sampt == "loc" ) cat("got LOCAL reports",nrow(togroup.loc),"\n")
if ( sampt == "mid" & nrow(togroup.mid) == 0 ) next
if ( sampt == "mid" ) cat("got Midday reports",nrow(togroup.mid),"\n")
if(syr >= 1990 & sampt == "loc" ) { if ( nrow(togroup.loc) < 30) next }
if(syr >= 1990 & sampt == "mid" ) { if ( nrow(togroup.mid) < 30) next }
if ( sampt == "early" & nrow(togroup.early) != 0 ) cat("got early reports",nrow(togroup.early),"\n")
if ( sampt == "early" & nrow(togroup.early) == 0 ) next
for ( irep in 1:tot.rep ) {
if(irep == 1) {
min.len.use<-min.len1
keep.going<-TRUE
} else if (f.add<0.02 ) { # only added 2% of missing data
if ( min.len.use == min.len1 ) {
min.len.use <- min.len2
} else if ( min.len.use == min.len2 ) {
min.len.use <- min.len3
} else if ( min.len.use == min.len3 ) {
keep.going<-FALSE
}
} else if ( irep == tot.rep -1 & keep.going == TRUE ) {
min.len.use <- min(min.len.use,min.len2)
} else if ( irep == tot.rep & keep.going == TRUE ) {
min.len.use <- min(min.len.use,min.len3)
}
#if(!keep.going) next
#for ( irep in 1:1 ) {
if(!keep.going) {
cat("iteration",irep,"no new info","\n")
next
}
kg.op<-NULL
cat('---------------------------------------------------------',"\n")
if ( imo != 0 ) {
cat(my_args[1],"month",imo,'Sampling type',sampt,"iteration",irep,'minimum reports',min.len.use,"\n")
} else {
cat(my_args[1],'Sampling type',sampt,"iteration",irep,'minimum reports',min.len.use,"\n")
}
if ( irep > 1 ) cat("number already grouped =",nrow(grouped.data),"\n")
cat('---------------------------------------------------------',"\n")
if ( sampt == "gmt" ) {
gmt.loop<-1
# track 6 hourly and fill in remainder
test<-subset(togroup.gmt,!(uid %in% grouped.data$uid))
if(nrow(test) < min.len.use ) {
#kg.op<-icoads.utils::keeping_going(min.len.use,min.len1,min.len2,min.len3,gmt.loop,sampt,keep.going)
kg.op<-keeping_going(min.len.use,min.len1,min.len2,min.len3,keep.going)
min.len.use<-kg.op[[1]]
#gmt.loop<-kg.op[[2]]
#keep.going<-kg.op[[3]]
#keep.going<-kg.op[[2]]
next
}
prfix<-paste0("gmt",irep,"_")
use.int<-6
mg<-6
op<-icoads.utils::loop_disj_func(test,"gmt",mg,use.int,min.len.use)
} else if ( sampt == "mid" ) {
# dck 192 has daily reports at midday
test<-subset(togroup.mid, !(uid %in% grouped.data$uid))
if(nrow(test) < min.len.use ) {
#kg.op<-icoads.utils::keeping_going(min.len.use,min.len1,min.len2,min.len3,gmt.loop,sampt,keep.going)
kg.op<-keeping_going(min.len.use,min.len1,min.len2,min.len3,keep.going)
min.len.use<-kg.op[[1]]
#gmt.loop<-kg.op[[2]]
#keep.going<-kg.op[[3]]
#keep.going<-kg.op[[2]]
next
}
prfix<-paste0("d192_",irep,"_")
use.int<-24
mg<-3
op<-icoads.utils::loop_disj_func(test,"loc",mg,use.int,min.len.use)
} else if ( sampt == "early" ) {
# pre 1855 reports
test<-togroup.early
prfix<-paste0(sampt,irep,"_")
use.int<-1
mg<-50
min.len.use<-min.len3
op<-icoads.utils::loop_disj_func(test,"loc",mg,use.int,min.len.use)
keep.going<-FALSE
} else if ( sampt == "loc" ) {
# local time reports
test<-subset(togroup.loc, !(uid %in% grouped.data$uid))
if(nrow(test) < min.len.use ) {
#kg.op<-icoads.utils::keeping_going(min.len.use,min.len1,min.len2,min.len3,gmt.loop,sampt,keep.going)
kg.op<-keeping_going(min.len.use,min.len1,min.len2,min.len3,keep.going)
min.len.use<-kg.op[[1]]
#gmt.loop<-kg.op[[2]]
#keep.going<-kg.op[[3]]
#keep.going<-kg.op[[2]]
next
}
prfix<-paste0("loc",irep,"_")
if(irep == 1) min.len.use<-min.len1
use.int<-4
mg<-9
op<-icoads.utils::loop_disj_func(test,"loc",mg,use.int,min.len.use)
}
if ( !is.null(op) ) {
if ( class(op) == "character" ) {
if ( op == "noData" | op == "datesNotOK" ) {
#kg.op<-icoads.utils::keeping_going(min.len.use,min.len1,min.len2,min.len3,gmt.loop,sampt,keep.going)
kg.op<-keeping_going(min.len.use,min.len1,min.len2,min.len3,keep.going)
min.len.use<-kg.op[[1]]
#gmt.loop<-kg.op[[2]]
#keep.going<-kg.op[[3]]
#keep.going<-kg.op[[2]]
next
}
}
#print(op)
if ( nrow(op) > 0 ) {
cat("about to evaluate tracks ...","\n")
#e.op<-evaluate_tracks(test, op, poss.fill, sampt, prfix, min.len.use)
e.op<-icoads.utils::evaluate_tracks(test, op, poss.fill, sampt, prfix, 2)
if ( !is.null(e.op) ) {
# keep only candidate tracks with at least min.len.use reports
e.op<-subset(e.op,nobs>=min.len.use)
#print(table(e.op$nobs))
e.op$nobs<-NULL
}
cat("done \n")
if (!is.null(e.op) ) {
grouped.data<-rbind(grouped.data,e.op)
#source("plot_tr_diag.R")
cat('# candidates',nrow(op),'# added',nrow(e.op),"# total",nrow(grouped.data),'\n')
# record fraction added to see whether to continue
f.add<-nrow(e.op)/nrow(subset(test,!(uid %in% e.op$uid)))
cat("fraction added",round(f.add,3),"\n")
if(nrow(e.op) < min.len.use) {
#kg.op<-icoads.utils::keeping_going(min.len.use,min.len1,min.len2,min.len3,gmt.loop,sampt,keep.going)
kg.op<-keeping_going(min.len.use,min.len1,min.len2,min.len3,keep.going)
min.len.use<-kg.op[[1]]
#gmt.loop<-kg.op[[2]]
#keep.going<-kg.op[[3]]
#keep.going<-kg.op[[2]]
}
} else {
cat('no data to add this time around \n')
#kg.op<-icoads.utils::keeping_going(min.len.use,min.len1,min.len2,min.len3,gmt.loop,sampt,keep.going)
kg.op<-keeping_going(min.len.use,min.len1,min.len2,min.len3,keep.going)
min.len.use<-kg.op[[1]]
#gmt.loop<-kg.op[[2]]
#keep.going<-kg.op[[3]]
#keep.going<-kg.op[[2]]
}
} else {
#kg.op<-icoads.utils::keeping_going(min.len.use,min.len1,min.len2,min.len3,gmt.loop,sampt,keep.going)
kg.op<-keeping_going(min.len.use,min.len1,min.len2,min.len3,keep.going)
min.len.use<-kg.op[[1]]
#gmt.loop<-kg.op[[2]]
#keep.going<-kg.op[[3]]
#keep.going<-kg.op[[2]]
}
} else {
#kg.op<-icoads.utils::keeping_going(min.len.use,min.len1,min.len2,min.len3,gmt.loop,sampt,keep.going)
kg.op<-keeping_going(min.len.use,min.len1,min.len2,min.len3,keep.going)
if(is.null(kg.op)) {
cat('here \n')
#keep.going<-FALSE
next
}
min.len.use<-kg.op[[1]]
#gmt.loop<-kg.op[[2]]
#keep.going<-kg.op[[3]]
#keep.going<-kg.op[[2]]
}
} # end loop over irep
cat('---------------------------------------------------------',"\n")
num.grouped<-len.old
} # repeat over type (gmt, local, mid)
} # end loop over months
cat("--------------------------------------------------","\n")
cat("ids before spline","\n")
tracked.check<-merge(togroup.keep,grouped.data)
tracked.check.old<-tracked.check
#print(table(grouped.data$testid))
print(table(table(grouped.data$testid)))
cat("--------------------------------------------------","\n")
#fn.out<-paste0("tr_outprespline_",my_args[1],"_",my_args[2],"_",my_args[3],".Rda")
#saveRDS(tracked.check[,vv2],fn.out)
#stop()
source("/noc/users/eck/nocgit/icoads-r-hostace/rscripts/spline_qc.R")
cat("--------------------------------------------------","\n")
cat("ids after spline","\n")
print(table(grouped.data$testid))
#print(table(table(tracked.check$testid)))
cat("--------------------------------------------------","\n")
if ( gen.plots) source("plot_overall.R")
if ( save.data ) {
fn.out<-paste0(config$output_dir,"DISJTR_NEW/tr_out_",my_args[1],"_",my_args[2],"_",my_args[3],".Rda")
saveRDS(tracked.check[,vv2],fn.out)
if ( !is.null(stat) ) {
if ( nrow(stat) > 0 ) {
fn.out<-paste0(config$output_dir,"DISJTR_NEW/stat_out_",my_args[1],"_",my_args[2],"_",my_args[3],".Rda")
saveRDS(stat,fn.out)
}
}
}
not.tracked<-subset(togroup.old,!(uid %in% tracked.check$uid))
if(!is.null(stat)) not.tracked<-subset(not.tracked,!(uid %in% stat$uid))
n.tot<-nrow(data)
n.ok<-sum(data2$id.ok %in% c("callsign","name","coded","valid","assigned","fill"))
n.oldtrack<-sum(data2$id.ok %in% c("track","trackfill"))
n.group<-nrow(tracked.check)
n.stat<-if(!is.null(stat)) nrow(stat) else 0
n.newtrack<-n.stat+n.group
cat("-------------------------------------------------------","\n")
cat("total reports ",nrow(data),'\n')
cat("OK reports ",n.ok,'\n')
cat("oldtr reports ",n.oldtrack,'\n')
cat('grouped reports ',n.group,"\n")
cat('not tracked old ',n.tot-n.ok-n.oldtrack,"\n")
if(my_args[3] == "pre" ) {
cat('not using old tracking',"\n")
cat('not tracked ',n.tot-n.ok-n.group-n.stat,"\n")
cat('% tracked ',round((n.ok+n.group+n.stat)/n.tot*100,2),"\n")
} else {
cat('using old tracking',"\n")
cat('not tracked ',n.tot-n.ok-n.oldtrack-n.group-n.stat,"\n")
cat('% tracked ',round((n.ok+n.oldtrack+n.group+n.stat)/n.tot*100,2),"\n")
}
# cat("-------------------------------------------------------","\n")
#!/usr/bin/env Rscript
# code written by Elizabeth Kent, National Oceanography Centre
# code written for the NERC GloSAT project
# Description:
## IMPORTANT: all paths to all data are defined in config.yml
## input_data:
# 1. arguments (e.g. 1990)
# 2. output_data/HOOVER_RUNZERO
# 3. output_data/DISJTR_NEW (tr_out and stat_out files)
# 4. PUB47byMONTH (config$pub_47_by_month_path)
## Outputs: output_data/FINAL_PROC
Sys.setenv(TZ='GMT')
options(warn=1)
exit <- function() { invokeRestart("abort") }
my_args = commandArgs(trailingOnly=TRUE)
if(length(my_args)== 0) {my_args<-my_args2}
if(length(my_args)!= 1) {stop("wrong number of input my_args or my_args2 not set")}
require(icoads.utils)
config<-config::get(file = "config.yml")
print.comm <- TRUE
dck.nodup<-c(701)
p47vars<-c("ship_callsign","ship_name","vessel_length","vessel_breadth","vessel_freeboard","vessel_draught","recruiting_country","visual_observing_height","thermometer1_height_above_summer_load_line","anemometer1_height_above_summer_load_line","barometer1_height_above_summer_load_line","sea_thermometer1_height_above_summer_load_line","vessel_type2","p47.uid")
#source("/noc/users/eck/nocgit/icoads.utils/R/check_track_func.R")
#source("/noc/users/eck/nocgit/icoads.utils/R/read_rdsfiles.R")
#source("/noc/users/eck/nocgit/icoads.utils/R/get_breaks_func.R")
#source("/noc/users/eck/nocgit/icoads.utils/R/identify_dups.R")
# my_args[1] is start year
iyr<-my_args[1]
#########################################################################################
cat('--------------------------------------------- \n')
cat('running final joining prog for',iyr,'\n')
cat('--------------------------------------------- \n')
dir.out <- paste0(config$output_dir,"FINAL_PROC/")
if (!dir.exists(dir.out) ) dir.create(dir.out)
dir.in <- paste0(config$output_dir,"HOOVER_RUNZERO/")
df<-icoads.utils::read_trackmonth(dir.in,syr=iyr)
pub47dir <- config$pub_47_by_month_path
#pub_47_by_month_path: "/noc/mpoc/surface_data/ICOADS_R3_PROC/HOSTACE_PROC_ANC_INFO/PUB47byMONTH"
if(!is.null(df)) {
disj.in <- paste0(config$output_dir,"DISJTR_NEW/")
if ( iyr < 1982 ) {
fn.op<-paste0(disj.in,"tr_out_",iyr,"_",iyr,"_post.Rda")
} else {
fn.op<-paste0(disj.in,"tr_out_",iyr,"_",iyr,"_pre.Rda")
}
cat(fn.op,"\n")
if(file.exists(fn.op)) {
op<-readRDS(fn.op)
cat("got group output",nrow(op),"\n")
} else {
if ( iyr >= 1855 ) {
cat("no disj data, stopping")
stop()
} else {
op<-NULL
}
cat("no disj track data","\n")
}
if ( !is.null(op ) ) {if(nrow(op) == 0) op<-NULL}
if ( !is.null(op) ) {
# some uids end up in more than one group: keep each uid in its largest (most frequent) group
tt<-as.data.frame(table(op$testid))
names(tt)[1]<-"testid"
op<-merge(op,tt)
dup<-op$uid[duplicated(op$uid)]
sub<-subset(op,uid %in% dup)
op<-subset(op,!(uid %in% dup))
sp<-split(sub,sub$uid)
ff<-function(X) {
X<-X[order(-X$Freq),]
X<-X[1,]
}
sp<-lapply(sp,ff)
sub<-do.call(rbind,sp)
op<-rbind(op,sub)
# end dup fix
}
st<-NULL
if ( iyr < 1982 ) {
fn.st<-paste0(disj.in,"stat_out_",iyr,"_",iyr,"_post.Rda")
} else {
fn.st<-paste0(disj.in,"stat_out_",iyr,"_",iyr,"_pre.Rda")
}
if(file.exists(fn.st) ) {
st<-readRDS(fn.st)
cat("got station data","\n")
} else {
cat("no station data","\n")
st<-NULL
}
#cat('read in grouped and station data \n')
#########################################################################################
if ( !is.null(op) ) {
tojoin<-data.frame(ship1=NA,ship2=NA)
tojoin<-na.omit(tojoin)
#pdf(paste0('PDF_JOIN/check_joins_',iyr,'.pdf'),height=9,width=9)
#if(!is.null(st)) {
# tmp<-subset(df,uid %in% op$uid | uid %in% st$uid)
#} else {
# tmp<-subset(df,uid %in% op$uid )
#}
tmp<-op
tab.all<-table(tmp$testid,as.numeric(format(tmp$date,"%j")))
jjs<-unique(as.numeric(format(tmp$date,"%j")))
if(length(jjs)<366) {
cols<-setdiff(seq(1,366),jjs)
extra<-array(0,dim=c(nrow(tab.all),length(cols)))
colnames(extra)<-cols
tab.all<-cbind(tab.all,extra)
tab.all<-tab.all[,order(as.numeric(colnames(tab.all)))]
}
for ( ship in rownames(tab.all) ) {
#a<-sp.gr[[igr]]
a<-subset(tmp,testid==ship)
if(nrow(a) < 6 ) next
a<-a[order(a$date),]
j1<-unique(as.numeric(format(a$date,"%j")))
mmin<-max(min(j1-2),1)
mmax<-min(max(j1+2),ncol(tab.all))
pp<-tab.all[,mmin:mmax]
pp<-pp[rowSums(pp)>0,]
cat(ship," ")
cat('no months',length(unique(a$mo)),'no days',length(unique(format(a$date,"%j")))," ")
cat('no candidate tracks',nrow(pp),"\n")
if(length(unique(a$mo)) >= 6 ) {
cat('more than 6 months, skipping','\n')
next # long track, takes ages
}
if(length(unique(format(a$date,"%j"))) >= 100 ) {
cat('more than 100 days, skipping','\n')
next # long track, takes ages
}
#for ( name.track in rownames(pp) ) {
# b<-subset(sub.tr,newid==name.track)
for ( name.add in rownames(pp) ) {
b<-subset(tmp,testid==name.add)
if(name.add==ship)next
if(nrow(b) < 6 ) next
b<-b[order(b$date),]
dd<-c(a$date,b$date)
max.gap<-max(abs(diff(dd)))
units(max.gap)<-"days"
if(max.gap>2) next
toch<-rbind(a,b)
toch<-toch[order(toch$date),]
ch<-icoads.utils::check_track_func(toch)
if(is.null(ch$t.diff))next
if(max(ch$distance)>300) next
if(length(ch$uid.speed) > 0.2*nrow(toch) ) next
if(max(ch$t.diff) > 24*3)next
seg<-icoads.utils::get_breaks_func(toch)
if(is.null(seg))next
if(max(seg$seg)>0)next
cat('---------------------------------------------------------',"\n")
print(table(toch$dck,toch$testid))
cat('---------------------------------------------------------',"\n")
if ( FALSE ) {
par(mfrow=c(2,2))
nameg<-a$testid[1]
namet<-b$testid[1]
plot(toch$lon,toch$lat,col=as.factor(toch$testid),main=paste(nameg,namet))
maps::map(add=T,col="orange",interior=F)
plot(toch$lon,toch$lat,col=as.factor(toch$testid),xlim=c(-180,180),ylim=c(-90,90),main=iyr)
maps::map(add=T,col="orange",interior=F)
plot(toch$date,toch$lat,col=as.factor(toch$testid),main=paste(names(table(toch$dck)),collapse="/ "))
plot(toch$date,toch$lon,col=as.factor(toch$testid),main=paste(nrow(a),nrow(b),nrow(toch)))
tojoin<-rbind(tojoin,data.frame(ship1=nameg,ship2=namet))
#readline(prompt="Press [enter] to continue")
#stop()
}
}
}
# combine ids
if ( nrow(tojoin) > 0 ) {
for ( tj in 1:nrow(tojoin) ) {
tmp$testid<-ifelse(tmp$testid == tojoin$ship1[tj] | tmp$testid == tojoin$ship2[tj],paste0(tojoin$ship1[tj],"_",tojoin$ship1[tj]),tmp$testid)
}
}
tt<-table(tmp$testid)
tt<-as.data.frame(tt)
names(tt)[1]<-"testid"
tmp<-merge(tmp,tt)
tmp<-subset(tmp,Freq>=1)
} # end of is.null(op)
df$lon<-ifelse(df$lon>180,df$lon-360,df$lon)
cat("got data",nrow(df),"\n")
if ( !is.null(op) ) {
df<-merge(df,tmp[,c("uid","testid")],all.x=T)
cat('merged grouped data',"\n")
} else {
df$testid<-NA
}
if ( !is.null(st) ) {
df<-merge(df,st[,c("uid","stat_loc")],all.x=T)
cat("merged station data","\n")
} else {
df$stat_loc<-NA
}
df$id.ok<-ifelse(!is.na(df$testid),"assigned",df$id.ok)
df$final.id<-ifelse(!is.na(df$testid),df$testid,df$track.id)
df$id.ok<-ifelse(!is.na(df$stat_loc),"assigned",df$id.ok)
df$final.id<-ifelse(is.na(df$final.id),df$stat_loc,df$final.id)
df$id.ok<-ifelse(is.na(df$final.id),"missing",df$id.ok)
sp.good<-split(df,df$final.id)
nn<-sapply(sp.good,nrow)
sp.good<-sp.good[nn>4]
df.bad<-subset(df,!(final.id %in% names(sp.good)) | is.na(df$final.id))
list.dup<-lapply(sp.good,icoads.utils::identify_dups)
dup.uids<-as.vector(unlist(list.dup))
sp.good<-lapply(sp.good,function(X) X<-subset(X,!(uid %in% dup.uids)))
list.ch<-lapply(sp.good,icoads.utils::check_track_func)
spd.uids<-lapply(list.ch,function(X) X<-X$uid.speed)
spd.uids<-as.vector(unlist(spd.uids))
list.seg<-lapply(sp.good,icoads.utils::get_breaks_func)
f.good<-function(X) {
X<-data.frame(uid=X$uid,t.diff=X$t.diff,ship.speed=X$speed,ship.course=X$course,speed.flag=ifelse(X$uid %in% X$uid.speed,1,0),max.speed=rep(X$max.speed,times=X$num),Freq=rep(X$num,times=X$num))
}
f.bad<-function(X) {
nna<-rep(NA,times=X$num)
X<-data.frame(uid=X$uid,t.diff=nna,ship.speed=nna,ship.course=nna,speed.flag=nna,max.speed=nna,Freq=rep(X$num,times=X$num))
}
good.bad<-sapply(list.ch,function(X) !is.null(X$max.speed))
list.ch.good<-list.ch[good.bad]
list.ch.bad<-list.ch[!good.bad]
list.ch.df.good<-lapply(list.ch.good, f.good)
list.ch.df.bad<-lapply(list.ch.bad, f.bad)
df.app<-do.call(rbind,c(list.ch.df.good,list.ch.df.bad))
df.seg<-do.call(rbind,list.seg)
#df$Freq.old<-df$Freq
#df$Freq<-NULL
df<-merge(df,df.app,all.x=T)
df<-merge(df,df.seg,all.x=T)
df$proc.flag<-""
df$proc.flag<-ifelse(df$uid %in% dup.uids,"dup",df$proc.flag)
df$proc.flag<-ifelse(df$uid %in% spd.uids,"spd",df$proc.flag)
if ( FALSE ) {
for ( ib in 1:length(list.seg) ) {
ship<-names(list.seg)[ib]
if(max(list.seg[[ib]]$seg) <= -1 ) {
} else {
sub<-subset(df,final.id==ship)
par(mfrow=c(2,2))
plot(sub$date,sub$lat,col=as.factor(sub$seg),main=ship)
plot(sub$date,sub$lon,col=as.factor(sub$seg))
plot(sub$lon,sub$lat,col=as.factor(sub$seg))
maps::map(add=T,col="orange",interior=F)
}
}
}
YRMO <- split(df, data.frame(df$yr, df$mo), drop=TRUE)
if(length(YRMO) > 0 ) {
names(YRMO) <- gsub("\\.","_", names(YRMO))
# OK to index in this way as there are only missing months pre-Pub47 and at the end
for ( imo in 1:length(YRMO) ) {
fn47<-paste0(pub47dir,"/pub47_",names(YRMO)[imo],".Rda")
if ( file.exists(fn47) ) {
tmp<-YRMO[[imo]]
p47<-icoads.utils::get_pub47(files=FALSE,iyr,imo)
#p47<-subset(p47,preferred==1) # where duplicate entries for a month
#p47$p47.uid<-p47$id
#p47$id<-NULL
to.drop<-names(tmp)[names(tmp) %in% names(p47)]
tmp[,to.drop]<-NULL
# these things need to be fixed; they should have been removed earlier
tmp<-subset(tmp,dflag<3)
to.drop<-c("odv","odz","opcv","opcz","rrr","tr","qc26","qc21","c1m","eoh","sim","hop","osv","osz","osiv","osiz","onv","onz","ophv","ophz","ocv","ocz","oav","oaz","oov","ooz","opv","opz","lov","hob")
tmp[,to.drop]<-NULL
YRMO[[imo]]<-merge(tmp,p47[,p47vars],by.x="final.id",by.y="ship_callsign",all.x=T)
} else {
YRMO[[imo]][,p47vars]<-NA
YRMO[[imo]]$ship_callsign<-NULL
}
}
filenames<-paste0(dir.out,names(YRMO),".Rda")
jj<-mapply(saveRDS, YRMO, file = filenames)
}
} # switch for no input data
#saveRDS(tmp[,c("uid","newid")],paste0(disj.in,"djt_joins_",iyr,".Rda"))
#dev.off()
#!/bin/csh
# Immediately exit a script when it encounters an error
#set -e
# start year = $1
# end year = $2
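# Usage (assumed): csh <this script> <start year> <end year>, e.g. 1950 1960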
#conda activate ~/miniconda3/envs/r_env
source /noc/users/metman/SETUPS/setup_icproc
#basedir="/gws/nopw/j04/glosat/development/data/interim/HOSTACE_PROC/TEST_CONFIG/"
set basedir="/noc/mpoc/surface_data/ICOADS_R3_PROC/RUN_ZERO/"
set codedir="/noc/users/eck/nocgit/icoads-r-hostace/"
if ( ! -d ${basedir}OPFILES ) then
#echo Exists
#else
mkdir ${basedir}OPFILES
mkdir ${basedir}output_data
endif
#Adjust your log_output dir as needed
set log_output_dir="${basedir}OPFILES"
touch ${log_output_dir}/dop.$1.$2
/bin/rm ${log_output_dir}/dop.$1.$2
touch ${log_output_dir}/dop.$1.$2
echo "logfile" ${log_output_dir}/dop.$1.$2
echo 'running disjoint tracking' >> ${log_output_dir}/dop.$1.$2
echo 'start year' $1 >> ${log_output_dir}/dop.$1.$2
echo 'end year' $2 >> ${log_output_dir}/dop.$1.$2
echo 'start_at_disj' >> ${log_output_dir}/dop.$1.$2
echo "---------------" >> ${log_output_dir}/dop.$1.$2
set y = $1
while ( $y <= $2 )
echo "XXX getting disjoint" $y $y >> ${log_output_dir}/dop.$1.$2
if ( $y <= 1981 ) then
#Rscript $codedir/rscripts/run_disj_track.R $y $y "post" "none" >> ${log_output_dir}/dop.$1.$2
Rscript /noc/users/eck/run_disj_track2.R $y $y "post" "none" >> ${log_output_dir}/dop.$1.$2
else
#Rscript $codedir/rscripts/run_disj_track.R $y $y "pre" "none" >> ${log_output_dir}/dop.$1.$2
Rscript /noc/users/eck/run_disj_track2.R $y $y "pre" "none" >> ${log_output_dir}/dop.$1.$2
endif
echo 'XXX running joining' >> ${log_output_dir}/dop.$1.$2
#Rscript $codedir/rscripts/run_joins.R $y >> ${log_output_dir}/dop.$1.$2
# this just looks in DISJTR_NEW rather than DISJTR
Rscript /noc/users/eck/run_joins2.R $y >> ${log_output_dir}/dop.$1.$2
echo 'XXX getting height ensemble' >> ${log_output_dir}/dop.$1.$2
Rscript get_height_ens.R $y $y >> ${log_output_dir}/dop.$1.$2
echo 'XXX generating gridded heights' >> ${log_output_dir}/dop.$1.$2
Rscript get_gridded_vals.R $y $y >> ${log_output_dir}/dop.$1.$2
echo 'XXX getting ships with wrong diurnal cycle' >> ${log_output_dir}/dop.$1.$2
Rscript /noc/users/eck/add_diurnal_fails.R $y >> ${log_output_dir}/dop.$1.$2
@ y = $y + 1
end
echo "XXX finished disjoint, joins, heights and diurnal flags" $1 $2 >> ${log_output_dir}/dop.$1.$2