Project

General

Profile

Statistics
| Branch: | Tag: | Revision:

pycama / src / pycama / AlongTrackPlot.py @ 834:d989df597b80

History | View | Annotate | Download (22.1 KB)

1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3

    
4
# Copyright 2016-2017 Maarten Sneep, KNMI
5
#
6
# Redistribution and use in source and binary forms, with or without
7
# modification, are permitted provided that the following conditions are met:
8
#
9
# 1. Redistributions of source code must retain the above copyright notice,
10
#    this list of conditions and the following disclaimer.
11
#
12
# 2. Redistributions in binary form must reproduce the above copyright notice,
13
#    this list of conditions and the following disclaimer in the documentation
14
#    and/or other materials provided with the distribution.
15
#
16
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
20
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
23
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26

    
27
## @file AlongTrackPlot.py
28
#
29
# This file defines a subclass of the pycama.AnalysisAndPlot.AnalysisAndPlot class.
30
# This subclass handles row-dependent statistics.
31
# @author Maarten Sneep
32

    
33
import datetime
import logging
import math
import os
import warnings

# Silence FutureWarnings before importing the scientific stack (older h5py
# emits a FutureWarning at import time).
warnings.filterwarnings("ignore", category=FutureWarning)

import numpy as np
import netCDF4
import h5py
42

    
43
from .AnalysisAndPlot import AnalysisAndPlot
44

    
45
from .utilities import *
46

    
47
## This class handles row-dependent statistics.
48
#
49
# This is a subclass of the pycama.AnalysisAndPlot.AnalysisAndPlot class.
50
class AlongTrackPlot(AnalysisAndPlot):
51
    ## Add a variable to the list 'to be processed'
    #
    # This creates entries in the pycama.AnalysisAndPlot.AnalysisAndPlot.variables instance variable (a dictionary).
    #
    # @param var Input variable; `var.data` is a per-row sequence, so its length
    #            gives the across-track dimension.
    def add_variable(self, var):
        n_xtrack = len(var.data)
        aggregate = {}
        for key in ('mean', 'min', 'max', 'standard_deviation',
                    'q01', 'q05', 'q10', 'q25', 'q75', 'q90', 'q95', 'q99',
                    'inter_quartile_range', 'median', 'mode', 'count'):
            # One masked value per across-track position, for each statistic.
            aggregate[key] = np.ma.zeros((n_xtrack,), dtype=np.float32)
            aggregate[key].mask = np.zeros((n_xtrack,), dtype=np.bool_)
        self.variables[var.name] = aggregate

        # Zero-argument super() instead of super(self.__class__, self): the
        # latter resolves against the runtime class and recurses forever when
        # this class is subclassed.
        super().add_variable(var)
65

    
66
    ## Class specific setup
    #
    #  Along-track statistics require no extraction-specific configuration,
    #  so this hook deliberately does nothing; any keyword arguments are
    #  accepted and ignored.
    def setup(self, **kwargs):
        pass
71

    
72
    ## Extract the required information from the input data.
    #
    #  This method loops over all variables and rows, and extracts mean, minimum, maximum,
    #  standard deviation, median, mode, and percentiles at 1, 5, 10, 25, 75, 90, 95, and 99 percent.
    #  From these values the interquartile range is extracted.
    #
    #  Rows without data are reported after the loop.
    def process(self):
        no_data = {}
        n_variables = len([v for v in self.input_variables.variables.values() if v.show and (not v.noscanline)])
        i_variable = 0
        for v in self.input_variables.variables.values():
            if (not v.show) or v.noscanline:
                # Hidden variables and those without a scanline dimension are
                # only marked, never processed.
                if v.name not in self.variables:
                    self.variables[v.name] = {}
                self.variables[v.name]['noscanline'] = True
                continue

            i_variable += 1
            self.progress(100*i_variable/n_variables)

            # Histogram bin edges for the mode estimate, matching the
            # configured range and scale of the variable.
            if v.log_range:
                b = np.logspace(math.log10(v.data_range[0]), math.log10(v.data_range[1]),
                                num=v.number_bins+1, endpoint=True, base=10.0, dtype=np.float32)
            else:
                b = np.linspace(v.data_range[0], v.data_range[1],
                                num=v.number_bins+1, endpoint=True, dtype=np.float32)

            for i in range(len(v.data)):
                if len(v.data[i]) == 0:
                    self._mark_empty_row(v.name, i, no_data)
                    continue # next row

                self.variables[v.name]['noscanline'] = False
                # Valid samples: unmasked, and for floating point also finite.
                if v.data[i].dtype in (np.float32, np.float64):
                    cidx = np.logical_and(np.logical_not(v.data[i].mask), np.isfinite(v.data[i]))
                else:
                    cidx = np.logical_not(v.data[i].mask)

                if not np.any(cidx):
                    self._mark_empty_row(v.name, i, no_data)
                    continue # next row

                data = v.data[i][cidx]
                self.variables[v.name]["count"][i] = np.sum(cidx)
                self.variables[v.name]['mean'][i] = np.mean(data)
                self.variables[v.name]['min'][i] = np.min(data)
                self.variables[v.name]['max'][i] = np.max(data)
                self.variables[v.name]['standard_deviation'][i] = np.std(data, ddof=1)
                q_level = [1, 5, 10, 25, 50, 75, 90, 95, 99]
                q_val = np.percentile(data, q_level)
                for q, qv in zip(q_level, q_val):
                    if q != 50:
                        self.variables[v.name]['q{0:02d}'.format(q)][i] = float(qv)
                self.variables[v.name]['inter_quartile_range'][i] = q_val[q_level.index(75)] - q_val[q_level.index(25)]
                self.variables[v.name]['median'][i] = q_val[q_level.index(50)]
                h, x_edges = np.histogram(data, bins=b)
                self.variables[v.name]['mode'][i] = self.mode(h, (x_edges[0:-1] + x_edges[1:])/2)

        # handle delayed warnings
        for key, rows in no_data.items():
            unique_rows = sorted(set(rows))
            if len(unique_rows) == 1:
                self.logger.info("%s: no valid data in variable '%s' for row %d.", self.__class__.__name__, key, unique_rows[0])
            else:
                self.logger.info("%s: no valid data in variable '%s' for rows %s.", self.__class__.__name__, key, row_list_simplification(unique_rows))

    ## Mark row @a i of variable @a name as empty.
    #
    #  Sets the count to zero, writes fill values and masks for all statistics,
    #  and records the row once in @a no_data for the delayed log message.
    #  (The original code appended the row once per statistic key.)
    def _mark_empty_row(self, name, i, no_data):
        no_data.setdefault(name, []).append(i)
        self.variables[name]["count"][i] = 0
        for key in ('mean', 'min', 'max', 'standard_deviation',
                    'q01', 'q05', 'q10', 'q25', 'q75', 'q90', 'q95', 'q99',
                    'inter_quartile_range', 'median', 'mode'):
            self.variables[name][key][i] = fill_value(np.float32)
            self.variables[name][key].mask[i] = True
151

    
152
    ## Write processed data to output netcdf file.
    #
    #  @param fname File to write to
    #  @param mode  Writing mode, defaults to append.
    #
    #  Write data (including extraction specific dimensions) to the group with
    #  the name given in the `storage_group_name` property ("`alongtrackplot_data`" for this class).
    #
    #  For each variable in the configuration a series of netCDF variables is
    #  created inside the `alongtrackplot_data` group.
    #  The variables have the same dimensions: `[time, ground_pixel]`. These dimensions are already created in the root group.
    #  The names of the netCDF variables are formed by concatenation: the configured variable name with an underscore and one of the keys: `mean`,
    #  `min`, `max`, `standard_deviation`, `q01`, `q05`, `q10`, `q25`, `q75`, `q90`, `q95`, `q99`, `inter_quartile_range`, `median`,
    #  and `mode`.
    #
    #  @raise CAMAException when a slice cannot be written (shape mismatch).
    def dump(self, fname, mode='a'):
        compress = {'compression': 'gzip', 'compression_opts': 3, 'shuffle': True, 'fletcher32': True}
        with h5py.File(fname, mode=mode) as ref:
            time_step = self.time_index_in_output # set in __init__.
            ground_pixel = ref['ground_pixel'].shape[0]
            try:
                grp = ref.create_group(self.storage_group_name)
                grp.attrs['comment'] = 'Along track statistics'
            except ValueError:
                # create_group raises ValueError when the group already exists.
                grp = ref[self.storage_group_name]

            meanings = {'mean':'mean of the observations',
                        'min':'minimum value',
                        'max':'maximum value',
                        'standard_deviation':'sample standard deviation',
                        'q01':"1% of all values are smaller than this value",
                        'q05':"5% of all values are smaller than this value",
                        'q10':"10% of all values are smaller than this value",
                        'q25':"25% of all values are smaller than this value",
                        'q75':"75% of all values are smaller than this value",
                        'q90':"90% of all values are smaller than this value",
                        'q95':"95% of all values are smaller than this value",
                        'q99':"99% of all values are smaller than this value",
                        'inter_quartile_range':"q75 - q25",
                        'median':"median value",
                        'mode': "Mode (most common) value",
                        'count': "Number of data points"}

            for key in self.variable_names:
                if ('noscanline' not in self.variables[key]) or self.variables[key]['noscanline']:
                    self.logger.debug("Skipping '{0}' while writing.".format(key))
                    continue
                for value in meanings.keys():
                    variable_name = "{0}_{1}".format(key, value)
                    try:
                        # Dataset exists: grow the unlimited time axis.
                        var = grp[variable_name]
                        var.resize(self.time_index_in_output+1, axis=0)
                    except KeyError:
                        # Dataset does not exist yet: create it, but only when
                        # writing the first time step. For later steps a missing
                        # dataset means the variable was not written before.
                        if self.time_index_in_output > 0:
                            continue
                        var = grp.create_dataset(variable_name,
                                                 (1, ground_pixel),
                                                 dtype=np.float32,
                                                 maxshape=(None, ground_pixel),
                                                 **compress)
                        # NOTE(review): dims.create_scale was deprecated and
                        # removed in h5py 3.0 (use Dataset.make_scale instead);
                        # confirm the pinned h5py version before changing this.
                        var.dims.create_scale(ref['time'], 'time')
                        var.dims[0].attach_scale(ref['time'])
                        var.dims.create_scale(ref['ground_pixel'], 'ground_pixel')
                        var.dims[1].attach_scale(ref['ground_pixel'])

                        unit = self.variables_meta[key]['units']
                        var.attrs['units'] = unit if unit is not None else "1"
                        var.attrs['long_name'] = "{0}, {1}.".format(self.variables_meta[key]['title'], meanings[value])
                        var.attrs['log_range'] = str(self.variables_meta[key]['log_range'])

                    try:
                        var[time_step, :] = self.variables[key][value]
                    except ValueError as err:
                        # Chain the original error so the HDF5 cause survives.
                        raise CAMAException("While writing {0}[{1}] in {2} an exception occurred: {3}".format(key, value, self.storage_group_name, str(err))) from err
225

    
226
    ## Read processed data from the input file, for specified time index.
    #
    #  @param fname      The pycama data file (netCDF4).
    #  @param time_step  The time-index to read.
    #  @param exclude    Optional collection of variable names to skip.
    #
    #  This method reads the data from file to restore itself for plotting.
    #  The expected format is as written by the pycama.AlongTrackPlot.AlongTrackPlot.dump() method.
    #
    #  @return True on success, False when the storage group is missing.
    def ingest(self, fname, time_step, exclude=None):
        self.time_index_in_output = time_step
        with h5py.File(fname, mode='r') as ref:
            try:
                grp = ref[self.storage_group_name]
            except KeyError:
                self.logger.error("along track data not in '%s'.", os.path.basename(fname))
                return False
            self.variables = {k.decode('utf-8'): {} for k in ref['variable_names'][:]}

            for key in self.variable_names:
                if exclude is not None and key in exclude:
                    continue
                for value in ('mean', 'min', 'max', 'standard_deviation',
                              'q01', 'q05', 'q10', 'q25', 'q75', 'q90', 'q95', 'q99',
                              'inter_quartile_range', 'median', 'mode', 'count'):
                    variable_name = "{0}_{1}".format(key, value)
                    if variable_name not in grp.keys():
                        self.logger.debug("Variable '%s' in group '%s' not found, skipping", variable_name, self.storage_group_name)
                        continue
                    self.logger.debug("Reading variable '%s' in group '%s'", variable_name, self.storage_group_name)

                    if key not in self.variables_meta:
                        self.variables_meta[key] = {}
                    if key not in self.variables:
                        self.variables[key] = {}
                    self.variables[key]['noscanline'] = False
                    var = grp[variable_name]

                    # HDF5 attributes may come back as bytes or str depending
                    # on the h5py version; normalize to str.
                    try:
                        self.variables_meta[key]['units'] = var.attrs['units']
                        if isinstance(self.variables_meta[key]['units'], bytes):
                            self.variables_meta[key]['units'] = str(self.variables_meta[key]['units'], 'utf-8')
                    except (AttributeError, KeyError):
                        self.variables_meta[key]['units'] = "1"

                    try:
                        # long_name is "<title>, <statistic meaning>." as written
                        # by dump(); drop the trailing meaning to recover the title.
                        self.variables_meta[key]['title'] = ", ".join(var.attrs['long_name'].split(', ')[0:-1])
                        if isinstance(self.variables_meta[key]['title'], bytes):
                            self.variables_meta[key]['title'] = str(self.variables_meta[key]['title'], 'utf-8')
                    except (AttributeError, KeyError):
                        self.variables_meta[key]['title'] = "{0} ({1})".format(key.replace('_', ' '), value)

                    try:
                        self.variables_meta[key]['log_range'] = (var.attrs['log_range'].lower() == b'true'
                                                                 if isinstance(var.attrs['log_range'], bytes) else
                                                                 var.attrs['log_range'].lower() == 'true')
                    except (AttributeError, KeyError):
                        self.variables_meta[key]['log_range'] = False

                    # Mask fill values so that rows without data are skipped
                    # when plotting.
                    self.variables[key][value] = np.ma.asarray(var[time_step, :])
                    self.variables[key][value].mask = (self.variables[key][value] == fill_value(var.dtype))
        return True
286

    
287
    ## Merge data into a combined dataset.
    #
    #  @param other The object to be added to self,
    #               also an instance of the pycama.AlongTrackPlot.AlongTrackPlot class.
    #  @return self (required for `+=`; the original returned None, which made
    #          `a += b` rebind `a` to None).
    #
    # While the mean, standard deviation, minimum and maximum can be aggregated exactly,
    # the median and percentiles can't (you need the full dataset for those). As a surrogate
    # we use the mean of the values.
    def __iadd__(self, other):
        for key in self.variables.keys():
            if 'count' not in self.variables[key]:
                continue
            self.logger.debug("{0}.__iadd__(): processing '{1}'".format(self.__class__.__name__, key))
            count = self.variables[key]["count"]
            ocount = other.variables[key]["count"]

            for v in [k for k in self.variables[key].keys() if k != "count"]:
                var = self.variables[key][v]
                ovar = other.variables[key][v]
                # 'mean', 'min', 'max', 'standard_deviation',
                # 'q01', 'q05', 'q10', 'q25', 'q75', 'q90', 'q95', 'q99',
                # 'inter_quartile_range', 'median', 'mode', 'count'
                if v in ["mean", "median", "mode",
                         'q01', 'q05', 'q10', 'q25', 'q75', 'q90', 'q95', 'q99',
                         'inter_quartile_range']:
                    # Count-weighted mean, assigned in place at the rows that
                    # have data (the original assigned the subset array to the
                    # full entry, shrinking it whenever a row had no data).
                    idx = (count+ocount) > 0
                    var[idx] = (var[idx]*count[idx] + ovar[idx]*ocount[idx])/(count[idx]+ocount[idx])
                elif v in ["standard_deviation"]:
                    # assume covariance is 0 (different datasets, same underlying distribution)
                    idx = (count+ocount) > 0
                    var[idx] = np.sqrt((var[idx]**2*count[idx] + ovar[idx]**2*ocount[idx])/(count[idx]+ocount[idx]))
                elif v in ['min']:
                    self.variables[key][v] = np.nanmin(np.asarray([var, ovar]), axis=0)
                elif v in ["max"]:
                    self.variables[key][v] = np.nanmax(np.asarray([var, ovar]), axis=0)

            self.variables[key]["count"] = count + ocount
        return self
324

    
325

    
326

    
327
    ## plot the specified variable.
    #
    #  @param varname The name of the variable to plot.
    #  @param figure  The matplotlib.figure.Figure instance to plot to.
    #  @param ax      The matplotlib.axes.Axes object to use. Default is None, in that case `ax = matplotlib.pyplot.gca()` shall be used.
    #  @param add_minmax Keyword parameter, when true the minimum and maximum for each row are plotted as well.
    #  @param add_mean   Keyword parameter, when true the mean value for each row is plotted as well.
    #  @param add_stddev Keyword parameter, when true the standard deviation traces
    #         (\f$\bar x\pm\sigma\f$) are plotted as well.
    #         Requires add_mean to be present and true as well.
    #  @param add_mode   Keyword parameter, when true the mode value for each row.
    #  @param add_legend Keyword parameter, when true a legend is added to the plot.
    #
    #  @return True on success, False when the variable is missing or has no valid rows.
    #
    #  Other assumptions are as described in the super class.
    def plot(self, varname, figure, ax=None, **kwargs):
        import matplotlib
        matplotlib.use('svg')
        matplotlib.rc('font', family='DejaVu Sans')
        import matplotlib.pyplot as plt

        if varname not in self.variables:
            self.logger.debug("Variable %s missing for along track statistics", varname)
            return False

        self.logger.info("Plotting along track statistics of %s", varname)
        if ax is not None:
            plt.clf()

        ax = figure.add_axes([0.14, 0.1, 0.84, 0.88])

        add_minmax = kwargs.get('add_minmax', False)
        add_mean = kwargs.get('add_mean', False)
        add_stddev = kwargs.get('add_stddev', False)
        add_mode = kwargs.get('add_mode', False)

        x = np.arange(self.variables[varname]['median'].shape[0], dtype=np.float32)
        try:
            finite_x = x[np.logical_not(self.variables[varname]['median'].mask)]
        except AttributeError:
            # No mask attribute: every row has valid data.
            finite_x = x

        try:
            ax.set_xlim([finite_x[0], finite_x[-1]], auto=False)
        except IndexError:
            # No valid rows at all: nothing to plot.
            return False

        # Percentile bands, darkest nearest the median. All drawing goes
        # through ax.plot: the original used plt.plot(..., axes=ax, ...),
        # which draws on pyplot's *current* axes (not necessarily on the
        # figure passed in) and whose `axes` keyword was removed in
        # Matplotlib 3.
        for q1, q2, alpha in zip(('q01', 'q05', 'q10', 'q25'), ('q99', 'q95', 'q90', 'q75'), (0.2, 0.3, 0.4, 0.5)):
            ax.fill_between(x,
                            self.variables[varname][q1],
                            self.variables[varname][q2],
                            facecolor='b', alpha=alpha)
            labelstr = "{0}-{1}%".format(int(q1[1:]), int(q2[1:]))
            ax.plot(x, self.variables[varname][q1], label=labelstr, linewidth=2*alpha, color='#{0:x}{0:x}ff'.format(int(255*(1-1.2*alpha))), linestyle=(0, (5, 1)))
            ax.plot(x, self.variables[varname][q2], linewidth=2*alpha, color='#0000{0:x}'.format(int(255*(1-1.2*alpha))), linestyle=(0, (5, 1)))

        ncolumns_legend = 5

        ax.plot(x, self.variables[varname]['median'], label='Median', linewidth=1.5, color='k', linestyle='solid')

        if add_mode:
            ax.plot(x, self.variables[varname]['mode'], label='Mode', linewidth=1.0, color='b', linestyle=(0, (3, 1, 1, 1)))
            ncolumns_legend += 1

        if add_mean:
            ax.plot(x, self.variables[varname]['mean'], label='Mean', linewidth=1.5, color='r', linestyle='solid')
            ncolumns_legend += 1

        if add_mean and add_stddev:
            ax.fill_between(x,
                            self.variables[varname]['mean'] - self.variables[varname]['standard_deviation'],
                            self.variables[varname]['mean'] + self.variables[varname]['standard_deviation'],
                            facecolor='r', alpha=0.4)

            ax.plot(x, self.variables[varname]['mean'] - self.variables[varname]['standard_deviation'], label=r'$\pm\sigma$', linewidth=0.5, color='r', linestyle=(0, (1, 1)))
            ax.plot(x, self.variables[varname]['mean'] + self.variables[varname]['standard_deviation'], linewidth=0.5, color='r', linestyle=(0, (1, 1)))
            ncolumns_legend += 1

        if add_minmax:
            ncolumns_legend += 2
            ax.plot(x, self.variables[varname]['min'], label='Minimum', linewidth=1.0, color='g', linestyle='solid')
            ax.plot(x, self.variables[varname]['max'], label='Maximum', linewidth=1.0, color='g', linestyle='solid')

        if kwargs.get('add_legend', False):
            # Shrink the axes slightly to make room for the legend on top.
            box = ax.get_position()
            ax.set_position([box.x0, box.y0, box.width, box.height * 0.97])
            ax.legend(ncol=ncolumns_legend, fontsize='x-small', bbox_to_anchor=(0.0, 1.0, 1.0, 0.03),
                      loc=3, mode="expand", borderaxespad=0.)

        if self.variables_meta[varname]['log_range']:
            ax.set_yscale('log')

        ax.set_xlabel("Binned row index")
        if self.variables_meta[varname]['units'] is not None and self.variables_meta[varname]['units'] != "1":
            ax.set_ylabel("{0} [{1}]".format(self.variables_meta[varname]['title'], self.variables_meta[varname]['units']))
        else:
            ax.set_ylabel(self.variables_meta[varname]['title'])
        return True
        return True