Commit 25291389 authored by Beatriz Recinos's avatar Beatriz Recinos
Browse files

changes to time series avg

parent 6842de0c
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -5,6 +5,8 @@ import time
import sys
from configobj import ConfigObj
import pandas as pd
import numpy as np
import xarray as xr
import warnings
warnings.filterwarnings('ignore')
......@@ -13,12 +15,16 @@ sys.path.append(config['repo_path'])
from sst_tools import utils_sst as utils
from sst_tools import workflow_sst as workflow
from sst_tools.workflow_sst import non_zero_data
from sst_tools.workflow_sst import sum_buoy_density
# Time
start = time.time()
process = psutil.Process(os.getpid())
res = 2
# Input specify resolution in space and time
res = 1
alias = '1M'
if res == 1:
path_coarse = config['satellite_sst_1x1']
......@@ -36,6 +42,18 @@ else:
sst_coarse = utils.open_data_and_crop(path_coarse, lat_s=-90, lat_n=-25)
sst_at_buoy = utils.open_data_and_crop(path_buoy, lat_s=-90, lat_n=-25)
# Adding sst standard error to the sst_at_buoy data set
sst_at_buoy['analysed_sst_se'] = sst_at_buoy.analysed_sst_std / np.sqrt(sst_at_buoy.buoy_count)
# Count how many grid-cells we have with data on the sst_at_buoy data set
grid_boxes_with_data = xr.apply_ufunc(non_zero_data,
sst_at_buoy.analysed_sst,
input_core_dims=[["lat", "lon"]],
dask='allowed',
vectorize=True
)
grid_boxes_with_data.name = 'No_of_grid_cells_with_data'
m, s = divmod(time.time() - start, 60)
h, m = divmod(m, 60)
print("Done reading everything: %02d:%02d:%02d" % (h, m, s))
......@@ -45,29 +63,47 @@ print(process.memory_info().rss)
# time resolution depending on the alias given
# Monthly means: each time stamp corresponds to the last day of the month
sst_coarse_m = sst_coarse.resample(time='1M').mean()
sst_at_buoy_m = sst_at_buoy.resample(time='1M').mean()
sst_at_buoy_m_buoy_density = sst_at_buoy.buoy_count.resample(time='1M').sum()
sst_coarse_re = sst_coarse.resample(time=alias).mean()
# For sst_at_buoy we need to calculate weighted time moving averages
# We weight each variable by the number of buoy points (buoy_density) on each grid cell
sst_at_buoy_sst = workflow.calculate_time_moving_weighted_average(sst_at_buoy.analysed_sst,
sst_at_buoy.buoy_count,
time_stamp=alias,
var_name='analysed_sst')
sst_at_buoy_std = workflow.calculate_time_moving_weighted_average(sst_at_buoy.analysed_sst_std,
sst_at_buoy.buoy_count,
time_stamp=alias,
var_name='analysed_sst_std')
sst_at_buoy_median = workflow.calculate_time_moving_weighted_average(sst_at_buoy.analysed_sst_median,
sst_at_buoy.buoy_count,
time_stamp=alias,
var_name='analysed_sst_median')
# Yearly means: each time stamp corresponds to the last month of the year
sst_coarse_y = sst_coarse.resample(time='1Y').mean()
sst_at_buoy_y = sst_at_buoy.resample(time='1Y').mean()
sst_at_buoy_y_buoy_density = sst_at_buoy.buoy_count.resample(time='1Y').sum()
sst_at_buoy_se = workflow.calculate_time_moving_weighted_average(sst_at_buoy.analysed_sst_se,
sst_at_buoy.buoy_count,
time_stamp=alias,
var_name='analysed_sst_se')
# Seasonal means: 3-month quarters, with the first quarter anchored at JAN.
sst_coarse_s = sst_coarse.resample(time='QS-JAN').mean()
sst_at_buoy_s = sst_at_buoy.resample(time='QS-JAN').mean()
sst_at_buoy_s_buoy_density = sst_at_buoy.buoy_count.resample(time='QS-JAN').sum()
# The buoy count is not weighted here: weighting a variable by itself makes no sense!
sst_at_buoy_buoy_count_mean = sst_at_buoy.buoy_count.resample(time=alias).mean()
# We also calculate the total sum of buoy points for each month
sst_at_buoy_buoy_count_sum = sst_at_buoy.buoy_count.resample(time=alias).sum()
# The number of gridboxes with data for every month.
grid_boxes_with_data_re = grid_boxes_with_data.resample(time=alias).sum()
m, s = divmod(time.time() - start, 60)
h, m = divmod(m, 60)
print("Done resampling: %02d:%02d:%02d" % (h, m, s))
print(process.memory_info().rss)
# Calculate cell by cell differences of analysed_sst and time series statistics
diff_m = workflow.calculate_time_series_diff(sst_coarse_m, sst_at_buoy_m)
diff_y = workflow.calculate_time_series_diff(sst_coarse_y, sst_at_buoy_y)
diff_s = workflow.calculate_time_series_diff(sst_coarse_s, sst_at_buoy_s)
diff_mean = workflow.calculate_time_series_diff(sst_coarse_re, sst_at_buoy_sst, grid_boxes_with_data_re)
m, s = divmod(time.time() - start, 60)
h, m = divmod(m, 60)
......@@ -75,9 +111,12 @@ print("Done calculating time series diff: %02d:%02d:%02d" % (h, m, s))
print(process.memory_info().rss)
# Calculate S.O means of bin statistics
stats_m = workflow.calculate_spatial_avg_of_stats(sst_at_buoy_m)
stats_y = workflow.calculate_spatial_avg_of_stats(sst_at_buoy_y)
stats_s = workflow.calculate_spatial_avg_of_stats(sst_at_buoy_s)
bin_std_mean = workflow.calculate_spatial_avg_of_stats(sst_at_buoy_std, var_name='analysed_sst_std')
bin_median_mean = workflow.calculate_spatial_avg_of_stats(sst_at_buoy_median, var_name='analysed_sst_median')
bin_se_mean = workflow.calculate_spatial_avg_of_stats(sst_at_buoy_se, var_name='analysed_sst_se')
bin_buoy_count_mean = workflow.calculate_spatial_avg_of_stats(sst_at_buoy_buoy_count_mean, var_name='buoy_count')
stats_mean = pd.concat([bin_std_mean, bin_median_mean, bin_se_mean, bin_buoy_count_mean], axis=1)
m, s = divmod(time.time() - start, 60)
h, m = divmod(m, 60)
......@@ -85,22 +124,15 @@ print("Done stats: %02d:%02d:%02d" % (h, m, s))
print(process.memory_info().rss)
# Estimate total buoy count
count_m = workflow.calculate_total_buoy_density(sst_at_buoy_m_buoy_density)
count_y = workflow.calculate_total_buoy_density(sst_at_buoy_y_buoy_density)
count_s = workflow.calculate_total_buoy_density(sst_at_buoy_s_buoy_density)
count_sum = workflow.calculate_total_buoy_density(sst_at_buoy_buoy_count_sum)
m, s = divmod(time.time() - start, 60)
h, m = divmod(m, 60)
print("Done count: %02d:%02d:%02d" % (h, m, s))
print(process.memory_info().rss)
result_m = pd.concat([diff_m, stats_m, count_m], axis=1)
result_y = pd.concat([diff_y, stats_y, count_y], axis=1)
result_s = pd.concat([diff_s, stats_s, count_s], axis=1)
result_m.to_csv(os.path.join(config['time_series'], 'monthly_' + str(res)+'x'+str(res) + '.csv'))
result_y.to_csv(os.path.join(config['time_series'], 'yearly_' + str(res)+'x'+str(res) + '.csv'))
result_s.to_csv(os.path.join(config['time_series'], 'seasonal_' + str(res)+'x'+str(res) + '.csv'))
result = pd.concat([diff_mean, stats_mean, count_sum], axis=1)
result.to_csv(os.path.join(config['time_series'], alias + '_' + str(res)+'x'+str(res) + '.csv'))
# Log
m, s = divmod(time.time() - start, 60)
......
......@@ -292,25 +292,31 @@ def grid_buoy_data(x, y, z, resolution, data):
return buoy
def calculate_time_series_diff(sst_coarse, sst_at_buoy):
def calculate_time_series_diff(sst_coarse, sst_at_buoy, grid_boxes_with_data):
"""
Calculates spatial means of SST differences between two re-gridded
data sets of ESA-cci-sst at a given resolution.
Parameters
----------
sst_coarse: coarse cci-sst data (1 or 2 degree)
sst_coarse: coarse cci-sst data (1 or 2 degree) resampled to a time stamp
sst_at_buoy: interpolated cci-sst data to buoy coordinates
and re-gridded into 1 or 2 degree grid cells
and re-gridded (into 1 or 2 degree) resampled to a time stamp
grid_boxes_with_data: Number of grid boxes with data resampled to a time stamp
Returns
--------
dt: data frame with time series differences of sst between the two data sets
and some general statistics of the differences in sst
dt: data frame with time series of mean sst differences between the two data sets (buoy - coarse)
across the southern ocean and some general statistics.
dt variables:
- mean of sst diff (weighted by the grid cell area and unweighted)
- std of the sst diff
- median value of sst diff
- standard error of the sst diff: std / sqrt(Total No. of grid boxes with data for each time stamp)
"""
# Calculate sst differences between data sets cell by cell
diff_ts = sst_at_buoy.analysed_sst - sst_coarse.analysed_sst
diff_ts = sst_at_buoy - sst_coarse.analysed_sst
# Calculate a spatial average of the differences for the entire data set
# also weighted by latitude grid cell area
......@@ -332,61 +338,45 @@ def calculate_time_series_diff(sst_coarse, sst_at_buoy):
dt = diff_ts.mean(('lon', 'lat')).load()
dt = dt.to_dataframe()
se = diff_ts.std(dim=('lon', 'lat')).load() / np.sqrt(grid_boxes_with_data.load())
se.name = 'analysed_sst_diff_se'
se = se.to_dataframe()
dt['weighted_analysed_sst_diff_mean'] = dt_w['weighted_analysed_sst_diff_mean'].copy()
dt['analysed_sst_diff_std'] = std['analysed_sst_diff_std'].copy()
dt['analysed_sst_diff_median'] = median['analysed_sst_diff_median'].copy()
dt['analysed_sst_diff_se'] = se['analysed_sst_diff_se'].copy()
return dt
# TODO: this function is very ugly!! needs re-thinking
def calculate_spatial_avg_of_stats(sst_at_buoy):
def calculate_spatial_avg_of_stats(sst_at_buoy_var, var_name):
"""
Calculates spatial means of bins statistics these are statistics from the interpolated data set:
std and median of sst inside a grid cell and buoy count averages.
Parameters
----------
sst_at_buoy: interpolated cci-sst data to buoy coordinates and re-gridded
into 1 or 2 degree grid cells, this contains bin statistics (std, median of sst values inside
each grid cell and buoy count)
sst_at_buoy_var: sst_at_buoy bin statistics resampled in a time stamp
(std, median and se of binned sst values inside each grid cell)
var_name: variable name
Returns
--------
out: data frame with time series of mean values for each bin statistics variable (std, median, buoy_count)
out: data frame with time series of mean values for each bin statistics variable (std, median, se, buoy_count)
"""
# weighted by latitude grid cell area
weights = np.cos(np.deg2rad(sst_at_buoy.lat))
weights = np.cos(np.deg2rad(sst_at_buoy_var.lat))
weights.name = 'weights'
std_w = sst_at_buoy.analysed_sst_std.weighted(weights).mean(('lon', 'lat')).load()
std_w.name = 'weighted_mean_analysed_sst_std'
out = std_w.to_dataframe()
median_w = sst_at_buoy.analysed_sst_median.weighted(weights).mean(('lon', 'lat')).load()
median_w.name = 'weighted_mean_analysed_sst_median'
median_w = median_w.to_dataframe()
out['weighted_mean_analysed_sst_median'] = median_w['weighted_mean_analysed_sst_median'].copy()
count_w = sst_at_buoy.buoy_count.weighted(weights).mean(('lon', 'lat')).load()
count_w.name = 'weighted_mean_buoy_count'
count_w = count_w.to_dataframe()
out['weighted_mean_buoy_count'] = count_w['weighted_mean_buoy_count'].copy()
std = sst_at_buoy.analysed_sst_std.mean(('lon', 'lat')).load()
std.name = 'mean_analysed_sst_std'
std = std.to_dataframe()
out['mean_analysed_sst_std'] = std['mean_analysed_sst_std'].copy()
median = sst_at_buoy.analysed_sst_median.mean(('lon', 'lat')).load()
median.name = 'mean_analysed_sst_median'
median = median.to_dataframe()
out['mean_analysed_sst_median'] = median['mean_analysed_sst_median'].copy()
var_w = sst_at_buoy_var.weighted(weights).mean(('lon', 'lat')).load()
var_w.name = var_name + '_weighted'
out = var_w.to_dataframe()
count = sst_at_buoy.buoy_count.mean(('lon', 'lat')).load()
count.name = 'mean_buoy_count'
count = count.to_dataframe()
out['mean_buoy_count'] = count['mean_buoy_count'].copy()
var_m = sst_at_buoy_var.mean(('lon', 'lat')).load()
var_m.name = var_name + '_unweighted'
var_m = var_m.to_dataframe()
out[var_name + '_unweighted'] = var_m[var_name + '_unweighted'].copy()
return out
......@@ -418,3 +408,51 @@ def calculate_total_buoy_density(buoy_count):
out['sum_buoy_count'] = count['sum_buoy_count'].copy()
return out
def non_zero_data(array):
    """Return the number of non-NaN entries in *array*.

    Note: zeros are counted too — only NaN cells are excluded.
    """
    finite_mask = ~np.isnan(array)
    return np.count_nonzero(finite_mask)
def sum_buoy_density(array):
    """Return the total buoy density: the sum of every value in *array*."""
    return array.sum()
def calculate_time_moving_weighted_average(var_to_avg, weights, time_stamp, var_name):
    """
    Calculates time moving weighted average for an xarray variable.

    For each resampled period the result is sum(var * weight) / sum(weight).

    Parameters
    __________
    var_to_avg: xarray.Variable to average
    weights: xarray.Variable for weights
    time_stamp: alias to resample the data: e.g. 1M for monthly average
    var_name: named variable after calculating the weighted avg

    Returns
    _______
    weighted_avg: xarray.Variable averaged

    NOTE(review): a period whose weights sum to zero divides by zero and
    presumably yields NaN/inf — confirm this is acceptable for callers.
    """
    # Weight every sample, then accumulate the weighted values per period
    weighted_sum = (var_to_avg * weights).resample(time=time_stamp).sum()
    # Accumulate the weights themselves over the same periods
    total_weight = weights.resample(time=time_stamp).sum()
    # Normalise to obtain the weighted mean and label the output
    weighted_avg = weighted_sum / total_weight
    weighted_avg.name = var_name
    return weighted_avg
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment