Commit 25291389 authored by Beatriz Recinos's avatar Beatriz Recinos
Browse files

changes to time series avg

parent 6842de0c
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -5,6 +5,8 @@ import time
import sys
from configobj import ConfigObj
import pandas as pd
import numpy as np
import xarray as xr
import warnings
warnings.filterwarnings('ignore')
......@@ -13,12 +15,16 @@ sys.path.append(config['repo_path'])
from sst_tools import utils_sst as utils
from sst_tools import workflow_sst as workflow
from sst_tools.workflow_sst import non_zero_data
from sst_tools.workflow_sst import sum_buoy_density
# Time
start = time.time()
process = psutil.Process(os.getpid())
res = 2
# Input specify resolution in space and time
res = 1
alias = '1M'
if res == 1:
path_coarse = config['satellite_sst_1x1']
......@@ -36,6 +42,18 @@ else:
sst_coarse = utils.open_data_and_crop(path_coarse, lat_s=-90, lat_n=-25)
sst_at_buoy = utils.open_data_and_crop(path_buoy, lat_s=-90, lat_n=-25)
# Adding sst standard error to the sst_at_buoy data set
sst_at_buoy['analysed_sst_se'] = sst_at_buoy.analysed_sst_std / np.sqrt(sst_at_buoy.buoy_count)
# Count how many grid-cells we have with data on the sst_at_buoy data set
grid_boxes_with_data = xr.apply_ufunc(non_zero_data,
sst_at_buoy.analysed_sst,
input_core_dims=[["lat", "lon"]],
dask='allowed',
vectorize=True
)
grid_boxes_with_data.name = 'No_of_grid_cells_with_data'
m, s = divmod(time.time() - start, 60)
h, m = divmod(m, 60)
print("Done reading everything: %02d:%02d:%02d" % (h, m, s))
......@@ -45,29 +63,47 @@ print(process.memory_info().rss)
# time resolution depending on the alias given
# Monthly means: each time stamp corresponds to the last day of the month
sst_coarse_m = sst_coarse.resample(time='1M').mean()
sst_at_buoy_m = sst_at_buoy.resample(time='1M').mean()
sst_at_buoy_m_buoy_density = sst_at_buoy.buoy_count.resample(time='1M').sum()
sst_coarse_re = sst_coarse.resample(time=alias).mean()
# For sst_at_buoy we need to calculate weighted time moving averages
# We weight each variable by the number of buoy points (buoy_density) on each grid cell
sst_at_buoy_sst = workflow.calculate_time_moving_weighted_average(sst_at_buoy.analysed_sst,
sst_at_buoy.buoy_count,
time_stamp=alias,
var_name='analysed_sst')
sst_at_buoy_std = workflow.calculate_time_moving_weighted_average(sst_at_buoy.analysed_sst_std,
sst_at_buoy.buoy_count,
time_stamp=alias,
var_name='analysed_sst_std')
sst_at_buoy_median = workflow.calculate_time_moving_weighted_average(sst_at_buoy.analysed_sst_median,
sst_at_buoy.buoy_count,
time_stamp=alias,
var_name='analysed_sst_median')
# Yearly means: each time stamp corresponds to the last month of the year
sst_coarse_y = sst_coarse.resample(time='1Y').mean()
sst_at_buoy_y = sst_at_buoy.resample(time='1Y').mean()
sst_at_buoy_y_buoy_density = sst_at_buoy.buoy_count.resample(time='1Y').sum()
sst_at_buoy_se = workflow.calculate_time_moving_weighted_average(sst_at_buoy.analysed_sst_se,
sst_at_buoy.buoy_count,
time_stamp=alias,
var_name='analysed_sst_se')
# Seasonal means: 3-month quarters, with the first quarter anchored at JAN.
sst_coarse_s = sst_coarse.resample(time='QS-JAN').mean()
sst_at_buoy_s = sst_at_buoy.resample(time='QS-JAN').mean()
sst_at_buoy_s_buoy_density = sst_at_buoy.buoy_count.resample(time='QS-JAN').sum()
# The buoy count is not weighted here: weighting a variable by itself makes no sense!
sst_at_buoy_buoy_count_mean = sst_at_buoy.buoy_count.resample(time=alias).mean()
# We also calculate the total sum of buoy points for each month
sst_at_buoy_buoy_count_sum = sst_at_buoy.buoy_count.resample(time=alias).sum()
# The number of gridboxes with data for every month.
grid_boxes_with_data_re = grid_boxes_with_data.resample(time=alias).sum()
m, s = divmod(time.time() - start, 60)
h, m = divmod(m, 60)
print("Done resampling: %02d:%02d:%02d" % (h, m, s))
print(process.memory_info().rss)
# Calculate cell by cell differences of analysed_sst and time series statistics
diff_m = workflow.calculate_time_series_diff(sst_coarse_m, sst_at_buoy_m)
diff_y = workflow.calculate_time_series_diff(sst_coarse_y, sst_at_buoy_y)
diff_s = workflow.calculate_time_series_diff(sst_coarse_s, sst_at_buoy_s)
diff_mean = workflow.calculate_time_series_diff(sst_coarse_re, sst_at_buoy_sst, grid_boxes_with_data_re)
m, s = divmod(time.time() - start, 60)
h, m = divmod(m, 60)
......@@ -75,9 +111,12 @@ print("Done calculating time series diff: %02d:%02d:%02d" % (h, m, s))
print(process.memory_info().rss)
# Calculate S.O means of bin statistics
stats_m = workflow.calculate_spatial_avg_of_stats(sst_at_buoy_m)
stats_y = workflow.calculate_spatial_avg_of_stats(sst_at_buoy_y)
stats_s = workflow.calculate_spatial_avg_of_stats(sst_at_buoy_s)
bin_std_mean = workflow.calculate_spatial_avg_of_stats(sst_at_buoy_std, var_name='analysed_sst_std')
bin_median_mean = workflow.calculate_spatial_avg_of_stats(sst_at_buoy_median, var_name='analysed_sst_median')
bin_se_mean = workflow.calculate_spatial_avg_of_stats(sst_at_buoy_se, var_name='analysed_sst_se')
bin_buoy_count_mean = workflow.calculate_spatial_avg_of_stats(sst_at_buoy_buoy_count_mean, var_name='buoy_count')
stats_mean = pd.concat([bin_std_mean, bin_median_mean, bin_se_mean, bin_buoy_count_mean], axis=1)
m, s = divmod(time.time() - start, 60)
h, m = divmod(m, 60)
......@@ -85,22 +124,15 @@ print("Done stats: %02d:%02d:%02d" % (h, m, s))
print(process.memory_info().rss)
# Estimate total buoy count
count_m = workflow.calculate_total_buoy_density(sst_at_buoy_m_buoy_density)
count_y = workflow.calculate_total_buoy_density(sst_at_buoy_y_buoy_density)
count_s = workflow.calculate_total_buoy_density(sst_at_buoy_s_buoy_density)
count_sum = workflow.calculate_total_buoy_density(sst_at_buoy_buoy_count_sum)
m, s = divmod(time.time() - start, 60)
h, m = divmod(m, 60)
print("Done count: %02d:%02d:%02d" % (h, m, s))
print(process.memory_info().rss)
result_m = pd.concat([diff_m, stats_m, count_m], axis=1)
result_y = pd.concat([diff_y, stats_y, count_y], axis=1)
result_s = pd.concat([diff_s, stats_s, count_s], axis=1)
result_m.to_csv(os.path.join(config['time_series'], 'monthly_' + str(res)+'x'+str(res) + '.csv'))
result_y.to_csv(os.path.join(config['time_series'], 'yearly_' + str(res)+'x'+str(res) + '.csv'))
result_s.to_csv(os.path.join(config['time_series'], 'seasonal_' + str(res)+'x'+str(res) + '.csv'))
result = pd.concat([diff_mean, stats_mean, count_sum], axis=1)
result.to_csv(os.path.join(config['time_series'], alias + '_' + str(res)+'x'+str(res) + '.csv'))
# Log
m, s = divmod(time.time() - start, 60)
......
......@@ -292,25 +292,31 @@ def grid_buoy_data(x, y, z, resolution, data):
return buoy
def calculate_time_series_diff(sst_coarse, sst_at_buoy):
def calculate_time_series_diff(sst_coarse, sst_at_buoy, grid_boxes_with_data):
"""
Calculates spatial means of SST differences between two re-gridded
data sets of ESA-cci-sst at a given resolution.
Parameters
----------
sst_coarse: coarse cci-sst data (1 or 2 degree)
sst_coarse: coarse cci-sst data (1 or 2 degree) resampled to a time stamp
sst_at_buoy: interpolated cci-sst data to buoy coordinates
and re-gridded into 1 or 2 degree grid cells
and re-gridded (into 1 or 2 degree) resampled to a time stamp
grid_boxes_with_data: Number of grid boxes with data resampled to a time stamp
Returns
--------
dt: data frame with time series differences of sst between the two data sets
and some general statistics of the differences in sst
dt: data frame with time series of mean sst differences between the two data sets (buoy - coarse)
across the southern ocean and some general statistics.
dt variables:
- mean of sst diff (weighted by the grid cell area and unweighted)
- std of the sst diff
- median value of sst diff
- standard error of the sst diff: std / sqrt(Total No. of grid boxes with data for each time stamp)
"""
# Calculate sst differences between data sets cell by cell
diff_ts = sst_at_buoy.analysed_sst - sst_coarse.analysed_sst
diff_ts = sst_at_buoy - sst_coarse.analysed_sst
# Calculate a spatial average of the differences for the entire data set
# also weighted by latitude grid cell area
......@@ -332,61 +338,45 @@ def calculate_time_series_diff(sst_coarse, sst_at_buoy):
dt = diff_ts.mean(('lon', 'lat')).load()
dt = dt.to_dataframe()
se = diff_ts.std(dim=('lon', 'lat')).load() / np.sqrt(grid_boxes_with_data.load())
se.name = 'analysed_sst_diff_se'
se = se.to_dataframe()
dt['weighted_analysed_sst_diff_mean'] = dt_w['weighted_analysed_sst_diff_mean'].copy()
dt['analysed_sst_diff_std'] = std['analysed_sst_diff_std'].copy()
dt['analysed_sst_diff_median'] = median['analysed_sst_diff_median'].copy()
dt['analysed_sst_diff_se'] = se['analysed_sst_diff_se'].copy()
return dt
# TODO: this function is very ugly!! needs re-thinking
def calculate_spatial_avg_of_stats(sst_at_buoy):
def calculate_spatial_avg_of_stats(sst_at_buoy_var, var_name):
"""
Calculates spatial means of bins statistics these are statistics from the interpolated data set:
std and median of sst inside a grid cell and buoy count averages.
Parameters
----------
sst_at_buoy: interpolated cci-sst data to buoy coordinates and re-gridded
into 1 or 2 degree grid cells, this contains bin statistics (std, median of sst values inside
each grid cell and buoy count)
sst_at_buoy_var: sst_at_buoy bin statistics resampled in a time stamp
(std, median and se of binned sst values inside each grid cell)
var_name: variable name
Returns
--------
out: data frame with time series of mean values for each bin statistics variable (std, median, buoy_count)
out: data frame with time series of mean values for each bin statistics variable (std, median, se, buoy_count)
"""
# weighted by latitude grid cell area
weights = np.cos(np.deg2rad(sst_at_buoy.lat))
weights = np.cos(np.deg2rad(sst_at_buoy_var.lat))
weights.name = 'weights'
std_w = sst_at_buoy.analysed_sst_std.weighted(weights).mean(('lon', 'lat')).load()
std_w.name = 'weighted_mean_analysed_sst_std'
out = std_w.to_dataframe()
median_w = sst_at_buoy.analysed_sst_median.weighted(weights).mean(('lon', 'lat')).load()
median_w.name = 'weighted_mean_analysed_sst_median'
median_w = median_w.to_dataframe()
out['weighted_mean_analysed_sst_median'] = median_w['weighted_mean_analysed_sst_median'].copy()
count_w = sst_at_buoy.buoy_count.weighted(weights).mean(('lon', 'lat')).load()
count_w.name = 'weighted_mean_buoy_count'
count_w = count_w.to_dataframe()
out['weighted_mean_buoy_count'] = count_w['weighted_mean_buoy_count'].copy()
std = sst_at_buoy.analysed_sst_std.mean(('lon', 'lat')).load()
std.name = 'mean_analysed_sst_std'
std = std.to_dataframe()
out['mean_analysed_sst_std'] = std['mean_analysed_sst_std'].copy()
median = sst_at_buoy.analysed_sst_median.mean(('lon', 'lat')).load()
median.name = 'mean_analysed_sst_median'
median = median.to_dataframe()
out['mean_analysed_sst_median'] = median['mean_analysed_sst_median'].copy()
var_w = sst_at_buoy_var.weighted(weights).mean(('lon', 'lat')).load()
var_w.name = var_name + '_weighted'
out = var_w.to_dataframe()
count = sst_at_buoy.buoy_count.mean(('lon', 'lat')).load()
count.name = 'mean_buoy_count'
count = count.to_dataframe()
out['mean_buoy_count'] = count['mean_buoy_count'].copy()
var_m = sst_at_buoy_var.mean(('lon', 'lat')).load()
var_m.name = var_name + '_unweighted'
var_m = var_m.to_dataframe()
out[var_name + '_unweighted'] = var_m[var_name + '_unweighted'].copy()
return out
......@@ -418,3 +408,51 @@ def calculate_total_buoy_density(buoy_count):
out['sum_buoy_count'] = count['sum_buoy_count'].copy()
return out
def non_zero_data(array):
    """Return the number of non-NaN entries in *array*.

    Note: zeros are counted too — only NaN cells are excluded.
    """
    finite_mask = ~np.isnan(array)
    return np.count_nonzero(finite_mask)
def sum_buoy_density(array):
    """Return the total buoy density: the sum of every value in *array*."""
    return array.sum()
def calculate_time_moving_weighted_average(var_to_avg, weights, time_stamp, var_name):
    """
    Calculates time moving weighted average for an xarray variable.

    For each resampled period the result is sum(var * weight) / sum(weight).

    Parameters
    __________
    var_to_avg: xarray.Variable to average
    weights: xarray.Variable for weights
    time_stamp: alias to resample the data: e.g. 1M for monthly average
    var_name: named variable after calculating the weighted avg

    Returns
    _______
    weighted_avg: xarray.Variable averaged

    NOTE(review): a period whose weights sum to zero divides by zero and
    presumably yields NaN/inf — confirm this is acceptable for callers.
    """
    # Weight every sample, then accumulate the weighted values per period
    weighted_sum = (var_to_avg * weights).resample(time=time_stamp).sum()
    # Accumulate the weights themselves over the same periods
    total_weight = weights.resample(time=time_stamp).sum()
    # Normalise to obtain the weighted mean and label the output
    weighted_avg = weighted_sum / total_weight
    weighted_avg.name = var_name
    return weighted_avg
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment