Project

General

Profile

Statistics
| Branch: | Tag: | Revision:

pycama / src / pycama / HistogramPlot.py @ 831:663001078b4f

History | View | Annotate | Download (45.1 KB)

1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3

    
4
# Copyright 2016-2017 Maarten Sneep, KNMI
5
#
6
# Redistribution and use in source and binary forms, with or without
7
# modification, are permitted provided that the following conditions are met:
8
#
9
# 1. Redistributions of source code must retain the above copyright notice,
10
#    this list of conditions and the following disclaimer.
11
#
12
# 2. Redistributions in binary form must reproduce the above copyright notice,
13
#    this list of conditions and the following disclaimer in the documentation
14
#    and/or other materials provided with the distribution.
15
#
16
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
20
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
23
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26

    
27
## @file HistogramPlot.py
28
#
29
# This file defines a subclass of the pycama.AnalysisAndPlot.AnalysisAndPlot class.
30
# This subclass extracts the histograms from the input data.
31
#
32
# @author Maarten Sneep
33

    
34
import math
35
import logging
36
from argparse import Namespace
37

    
38
import numpy as np
39
import netCDF4
40

    
41
from .AnalysisAndPlot import AnalysisAndPlot
42

    
43
from .utilities import *
44

    
45
## Inter-quartile range (q75 - q25) of `x`, ignoring NaN values.
def interquartile(x):
    lower, upper = np.nanpercentile(x, [25, 75])
    return upper - lower
48

    
49
## First quartile (25th percentile) of `x`, ignoring NaN values.
def lower_quartile(x):
    return np.nanpercentile(x, 25)
52

    
53
## Third quartile (75th percentile) of `x`, ignoring NaN values.
def upper_quartile(x):
    return np.nanpercentile(x, 75)
56

    
57
## Make histograms (and gather other statistics).
58
#
59
#  Also includes netCDF write & read routines. Plotting should be possible from
60
#  a restored object.
61
#
62
class HistogramPlot(AnalysisAndPlot):
63
    ## Define a few variables to collect the number of bins in the histograms
    #
    #  The default bin count comes from the `number_bins` keyword argument
    #  (100 when that keyword is absent); each variable may override it.
    #  All distinct histogram sizes are gathered in a set so that the
    #  matching dimensions can be created in the output file later on.
    def setup(self, **kwargs):
        self.number_bins = kwargs.get('number_bins', 100)

        self.number_bins_collection = set()
        # The statistic names below are index-aligned with the callables in
        # self.operands: partial_keys[i] is computed by operands[i].
        self.partial_keys = ('mean', 'min', 'max', 'standard_deviation', 'inter_quartile_range', 'median', 'q25', 'q75')
        self.operands = (np.nanmean, np.nanmin, np.nanmax, lambda x: np.nanstd(x, ddof=1), interquartile, np.nanmedian, lower_quartile, upper_quartile)
77

    
78
    ## Add a variable to the list 'to be processed'
    #
    #  @param var A pycama.Variable.Variable instance.
    #
    #  This method reserves memory and creates the bins for the histograms.
    def add_variable(self, var):
        # Fall back to the observed data extremes when no range was configured.
        if var.data_range is None:
            var.data_range = [np.min(var.aggregate_data), np.max(var.aggregate_data)]
        if var.number_bins is None:
            var.number_bins = self.number_bins
        self.number_bins_collection.add(var.number_bins)

        # Bin edges: number_bins+1 edges for number_bins bins, spaced
        # logarithmically or linearly over the configured data range.
        if var.log_range:
            b = np.logspace(math.log10(var.data_range[0]), math.log10(var.data_range[1]), num=var.number_bins+1, endpoint=True, base=10.0, dtype=np.float32)
        else:
            b = np.linspace(var.data_range[0], var.data_range[1], num=var.number_bins+1, endpoint=True, dtype=np.float32)

        # Pre-allocate the plain-count and latitude-weighted histograms.
        self.variables[var.name+'.histogram'] = np.zeros((var.number_bins,), dtype=np.int32)
        self.variables[var.name+'.histogram_eq'] = np.zeros((var.number_bins,), dtype=np.float32)
        self.variables[var.name+'.bins'] = b

        # Zero-argument super() instead of super(self.__class__, self): the
        # latter recurses infinitely when this method is inherited and called
        # on a subclass instance.
        super().add_variable(var)
100

    
101
    ## Get a list of variable names.
    #
    #  A variable is known by the presence of its '<name>.bins' entry in
    #  the self.variables dictionary; the suffix is stripped off here.
    @property
    def variable_names(self):
        names = []
        for key in list(self.variables.keys()):
            if key.endswith('.bins'):
                names.append(key.replace('.bins', ''))
        return names
105

    
106
    ## Extract the required information from the input data.
    #
    #  This method filters out the variables that should not be collected, and generates histograms for the rest.
    #  In addition the global statistics are collected as well, i.e. `mean`, `min`, `max`, `standard_deviation`,
    #  `q01`, `q05`, `q10`, `q16`, `q25`, `q50`, `q75`, `q84`, `q90`, `q95`, `q99`, `inter_quartile_range`, `median`,
    #  and `mode` (these are the same statistics collected by the pycama.AlongTrackPlot.AlongTrackPlot class,
    #  but there they are collected per row, not for the whole dataset). The additional q16 and q84 (15.8655% and 84.1345%)
    #  correspond to \f$\pm1\,\sigma\f$ in a normally distributed variable.
    #
    #  The histograms are collected in two variants (using the same bins):
    #  one simply counts the number of observations in each bin, the other
    #  uses \f$\cos(\delta_{\mbox{geo}})\f$ as weights. The latter should
    #  correct for the overrepresentation of the polar regions in our
    #  sun-synchronous polar orbit.
    #
    #  Part of the results are stored in the self.variables dictionary, another
    #  part is stored in the self.variables_meta dictionary.
    def process(self):
        # Surface classification is optional; without it the land/sea split
        # below is skipped (land_cidx/sea_cidx stay None).
        try:
            surface_flags = np.asarray(self.input_variables.variables['surface_classification'].aggregate_data, dtype=np.uint8)
        except KeyError:
            surface_flags = None

        # cos(latitude) weights for the area-equalized histogram.
        lat = self.input_variables.variables['latitude'].aggregate_data
        w = np.cos((np.pi/180.0)*lat)
        n_variables = len(self.input_variables.variables)  # NOTE(review): unused
        i = 0
        var_count = 0

        # First pass: count the variables that will actually be processed,
        # so progress reporting in the second pass uses the right total.
        for v in list(self.input_variables.variables.values()):
            if not v.show or v.noscanline or v.aggregate_data is None or not hasattr(v.aggregate_data, "mask") or len(v.aggregate_data) == 0:
                continue
            var_count += 1

        # Second pass: compute histograms and statistics per variable.
        for v in list(self.input_variables.variables.values()):
            # Skip hidden variables, scanline-less data, and empty/unmasked arrays
            # (aggregate_data is assumed to be a masked array — hence the mask check).
            if not v.show or v.noscanline or v.aggregate_data is None or not hasattr(v.aggregate_data, "mask") or len(v.aggregate_data) == 0:
                continue
            if i > 0:
                self.progress(100*i/var_count)
            i += 1
            # Valid points: finite and not masked.
            cidx = np.logical_and(np.isfinite(v.aggregate_data), np.logical_not(v.aggregate_data.mask))
            if not np.any(cidx):
                # No valid data at all: record NaN for every statistic and move on.
                self.logger.warning("%s: No valid data in '%s'", self.__class__.__name__, v.name)
                for k in ('mean', 'min', 'max', 'standard_deviation',
                          'q01', 'q05', 'q10', 'q16', 'q25', 'q50', 'q75', 'q84', 'q90', 'q95', 'q99',
                          'inter_quartile_range', 'median', 'mode'):
                    self.variables_meta[v.name][k] = np.nan
                self.variables_meta[v.name]['count'] = v.aggregate_data.shape[0]
                continue

            # Plain count histogram and cos(latitude)-weighted, normalized histogram,
            # both over the bin edges prepared in add_variable().
            self.variables[v.name+'.histogram'][:] = np.histogram(v.aggregate_data[cidx], bins=self.variables[v.name+'.bins'])[0]
            self.variables[v.name+'.histogram_eq'][:] = np.histogram(v.aggregate_data[cidx], bins=self.variables[v.name+'.bins'],
                                                                     weights=w[cidx], density=True)[0]
            self.variables_meta[v.name]['mean'] = np.nanmean(v.aggregate_data[cidx])
            self.variables_meta[v.name]['min'] = np.nanmin(v.aggregate_data[cidx])
            self.variables_meta[v.name]['max'] = np.nanmax(v.aggregate_data[cidx])
            self.variables_meta[v.name]['standard_deviation'] = np.nanstd(v.aggregate_data[cidx], ddof=1)
            # 15.8655% / 84.1345% are the one-sigma percentiles of a normal
            # distribution; they are stored rounded as 'q16' and 'q84'.
            q_level = [1, 5, 10, 15.86552539315, 25, 50, 75, 84.13447460685, 90, 95, 99]
            q_val = np.nanpercentile(v.aggregate_data[cidx], q_level)
            for q, qv in zip(q_level, q_val):
                self.variables_meta[v.name]['q{0:02d}'.format(int(math.floor(q+0.5)))] = float(qv)
            self.variables_meta[v.name]['inter_quartile_range'] = self.variables_meta[v.name]['q75'] - self.variables_meta[v.name]['q25']
            self.variables_meta[v.name]['median'] = self.variables_meta[v.name]['q50']
            self.variables_meta[v.name]['count'] = v.aggregate_data.shape[0]
            # Mode from the histogram: bin centers vs. counts (self.mode is
            # presumably provided by the AnalysisAndPlot base class — confirm).
            self.variables_meta[v.name]['mode'] = self.mode((self.variables[v.name+'.bins'][0:-1] + self.variables[v.name+'.bins'][1:])/2, self.variables[v.name+'.histogram'])

            # Partial selections: southern/northern hemisphere and, when the
            # surface classification is available, land/sea (low 2 bits of the
            # flag: 0 == land, 1 == sea — assumption based on this code; verify).
            scidx = np.logical_and(np.logical_and(np.isfinite(v.aggregate_data), np.logical_not(v.aggregate_data.mask)), lat < 0)
            ncidx = np.logical_and(np.logical_and(np.isfinite(v.aggregate_data), np.logical_not(v.aggregate_data.mask)), lat > 0)
            if surface_flags is not None:
                land_cidx = np.logical_and(np.logical_and(np.isfinite(v.aggregate_data), np.logical_not(v.aggregate_data.mask)), (surface_flags & 3) == 0)
                sea_cidx = np.logical_and(np.logical_and(np.isfinite(v.aggregate_data), np.logical_not(v.aggregate_data.mask)), (surface_flags & 3) == 1)
            else:
                land_cidx = None
                sea_cidx = None

            for selection, suffix in zip((scidx, ncidx, land_cidx, sea_cidx), ('south', 'north', 'land', 'sea')):
                if selection is None:
                    continue
                if not np.any(selection):
                    # Empty selection: NaN statistics. CH4 over sea is expected
                    # to be empty, so no warning is issued for that combination.
                    if not (self.product == 'CH4___' and suffix == 'sea'):
                        self.logger.warning("%s: No valid data for '%s' in '%s'", self.__class__.__name__, suffix, v.name)
                    for k in self.partial_keys:
                        self.variables_meta[v.name][k+'_'+suffix] = np.nan
                    self.variables_meta[v.name]['count_'+suffix] = np.sum(selection)
                else:
                    # partial_keys and operands are index-aligned (see setup()).
                    for k, operand in zip(self.partial_keys, self.operands):
                        self.variables_meta[v.name][k+'_'+suffix] = operand(v.aggregate_data[selection])
                    self.variables_meta[v.name]['count_'+suffix] = np.sum(selection)


            # Automated range warnings: not useful

            # range_delta = self.variables_meta[v.name]['data_range'][1] - self.variables_meta[v.name]['data_range'][0]
            # range_offset = self.variables_meta[v.name]['data_range'][0]
            # range_warning = ((self.variables_meta[v.name]['min'] - range_offset + 0.25*range_delta < 0.0)
            #                  or (self.variables_meta[v.name]['max'] - range_offset + 0.75*range_delta > range_delta)
            #                  or (self.variables_meta[v.name]['q10'] - range_offset - 0.25*range_delta > 0.0)
            #                  or (self.variables_meta[v.name]['q90'] - range_offset + 1.75*range_delta > range_delta))
            # if range_warning:
            #     self.logger.info("Range for variable '{0}' does not match observed data range.".format(v.name))
            #     self.logger.info("Configured range [{0[0]}, {0[1]}]; observed range [{1[0]}, {1[1]}] (min, max); [{2[0]}. {2[1]}] (q10, q90)".format(self.variables_meta[v.name]['data_range'],
            #                      [self.variables_meta[v.name]['min'], self.variables_meta[v.name]['max']], [self.variables_meta[v.name]['q10'], self.variables_meta[v.name]['q90']]))
208

    
209
    ## Merge data into a combined dataset.
    #
    #  @param other The object to be added to self,
    #               also an instance of the pycama.HistogramPlot.HistogramPlot class.
    #  @return self, as required by the in-place addition protocol.
    #
    # While the mean, standard deviation, minimum and maximum can be aggregated exactly,
    # the median and percentiles can't (you need the full dataset for those). As a surrogate
    # we use the count-weighted mean of the values.
    def __iadd__(self, other):
        for name in [n.split('.')[0] for n in self.variables.keys() if n.endswith('.histogram')]:
            self.variables[name+'.histogram'] += other.variables[name+'.histogram']
            count = self.variables_meta[name]['count']
            ocount = other.variables_meta[name]['count']
            if ocount > 0:
                # The normalized histogram and the order statistics are merged
                # as count-weighted means (an approximation, see above).
                self.variables[name+'.histogram_eq'] = (self.variables[name+'.histogram_eq']*count + other.variables[name+'.histogram_eq']*ocount) / (count+ocount)

                mean_combine = lambda a, b: (a*count + b*ocount) / (count+ocount)
                for meaning in ["mean", "median", "mode", "q01", "q05", "q16", "q25", "q75", "q84", "q90", "q95", "q99", "inter_quartile_range"]:
                    self.variables_meta[name][meaning] = mean_combine(self.variables_meta[name][meaning], other.variables_meta[name][meaning])
                self.variables_meta[name]['min'] = min(self.variables_meta[name]['min'], other.variables_meta[name]['min'])
                self.variables_meta[name]['max'] = max(self.variables_meta[name]['max'], other.variables_meta[name]['max'])
                # Combine variances (weighted mean of squares), then take the root.
                self.variables_meta[name]['standard_deviation'] = math.sqrt(mean_combine(self.variables_meta[name]['standard_deviation']**2,
                                                                                         other.variables_meta[name]['standard_deviation']**2))
            self.variables_meta[name]['count'] += other.variables_meta[name]['count']

            # The per-region statistics may be absent in older files; in that
            # case a KeyError aborts the partial merge with a warning only.
            try:
                for suffix in ('south', 'north', 'land', 'sea'):
                    count = self.variables_meta[name]['count_'+suffix]
                    ocount = other.variables_meta[name]['count_'+suffix]
                    if ocount > 0:
                        mean_combine = lambda a, b: (a*count + b*ocount) / (count+ocount)
                        for k in [s for s in self.partial_keys if s not in ('min', 'max', 'standard_deviation')]:
                            self.variables_meta[name][k+'_'+suffix] = mean_combine(self.variables_meta[name][k+'_'+suffix], other.variables_meta[name][k+'_'+suffix])
                        self.variables_meta[name]['min_'+suffix] = min(self.variables_meta[name]['min_'+suffix], other.variables_meta[name]['min_'+suffix])
                        self.variables_meta[name]['max_'+suffix] = max(self.variables_meta[name]['max_'+suffix], other.variables_meta[name]['max_'+suffix])
                        self.variables_meta[name]['standard_deviation_'+suffix] = math.sqrt(mean_combine(self.variables_meta[name]['standard_deviation_'+suffix]**2, other.variables_meta[name]['standard_deviation_'+suffix]**2))
                        self.variables_meta[name]['count_'+suffix] += other.variables_meta[name]['count_'+suffix]
            except KeyError:
                self.logger.warning("%s.__iadd__: Partial statistics not available.", self.__class__.__name__)

        # __iadd__ must return the updated object: without this the implicit
        # None return value would rebind the caller's name after `a += b`.
        return self
248

    
249

    
250

    
251
    ## Read processed data from the input file, for specified time index.
    #
    #  @param fname NetCDF file with input data.
    #  @param time_index Time slice to read.
    #  @return True on success; the error path re-raises instead of returning.
    #
    #  NOTE(review): h5py and os are not imported at the top of this file;
    #  presumably they arrive via `from .utilities import *` — confirm.
    def ingest(self, fname, time_index):
        self.time_index_in_output = time_index
        with h5py.File(fname, 'r') as ref:
            try:
                grp = ref[self.storage_group_name]
            except:
                self.logger.error("histogram data not in '%s'.", os.path.basename(fname))
                raise
                return False  # NOTE(review): unreachable — the raise above always fires.

            # The global 'variable_names' dataset lists the variables to restore.
            vlist = [k.decode("ASCII") for k in ref['variable_names'][:]]
            self.input_variables = Namespace(variables={})

            for k in vlist:
                # Only variables that actually have a histogram in this group.
                if k+'_histogram' not in list(grp.keys()):
                    continue
                var = grp[k+'_histogram']

                self.variables_meta[k] = {}
                self.variables_meta[k]['noscanline'] = False
                try:
                    self.variables[k+'.histogram'] = var[time_index, :]
                except ValueError:
                    self.logger.error("Error in key '{0}'".format(k))
                    raise
                self.number_bins_collection.add(self.variables[k+'.histogram'].shape[0])
                # Minimal stand-in for the Variable object used during processing.
                self.input_variables.variables[k] = Namespace(noscanline=False, number_bins=var.shape[1])
                try:
                    self.variables_meta[k]['title'] = var.attrs['long_name']
                except KeyError:
                    self.logger.error("No long_name on '{0}'".format(k))
                    raise
                # h5py may return attributes as bytes; normalize to str.
                if isinstance(self.variables_meta[k]['title'], bytes):
                    self.variables_meta[k]['title'] = str(self.variables_meta[k]['title'], 'utf-8')

                self.variables_meta[k]['data_range'] = var.attrs['range']
                self.variables_meta[k]['color_scale'] = var.attrs['color_scale']
                if isinstance(self.variables_meta[k]['color_scale'], bytes):
                    self.variables_meta[k]['color_scale'] = str(self.variables_meta[k]['color_scale'], 'utf-8')
                # 'log_range' was written as str(bool); parse both bytes and str forms.
                self.variables_meta[k]['log_range'] = (var.attrs['log_range'].lower() == b'true'
                                                       if isinstance(var.attrs['log_range'], bytes) else
                                                       var.attrs['log_range'].lower() == 'true')

                var = grp[k+'_histogram_bins']
                self.variables_meta[k]['units'] = var.attrs['units']
                if isinstance(self.variables_meta[k]['units'], bytes):
                    self.variables_meta[k]['units'] = str(self.variables_meta[k]['units'], 'utf-8')

                var = grp[k+'_histogram_eq']
                self.variables[k+'.histogram_eq'] = var[time_index, :]

                # Reconstruct the bin edges from the (n, 2) bounds array:
                # all lower bounds plus the final upper bound.
                var = grp[k+'_bounds']
                self.variables[k+'.bins'] = np.concatenate((var[:, 0], [var[-1, 1]]))

                for atkey in ('mean', 'min', 'max', 'standard_deviation',
                              'q01', 'q05', 'q10', 'q16', 'q25', 'q75', 'q84', 'q90', 'q95', 'q99',
                              'inter_quartile_range', 'median', 'count', 'mode'):
                    var = grp[k+'_'+atkey]
                    self.variables_meta[k][atkey] = var[time_index]

                partial_keys = list(self.partial_keys)
                partial_keys.append('count')

                # Per-region statistics are optional; missing ones are only logged.
                for suffix in ('south', 'north', 'land', 'sea'):
                    for kk in partial_keys:
                        try:
                            var = grp[k+'_'+kk+'_'+suffix]
                            self.variables_meta[k][kk+'_'+suffix] = var[time_index]
                        except:
                            self.logger.debug("{0}.ingest(): variable '{1}' not available".format(self.__class__.__name__, k+'_'+kk+'_'+suffix))
        return True
326

    
327
    ## Write processed data to output netcdf file.
    #
    #  @param fname File to write to
    #  @param mode  Writing mode, defaults to append.
    #               NOTE(review): the parameter is not used; the file is always
    #               opened with h5py mode 'a' — confirm whether this is intended.
    #
    #  Write data (including extraction specific dimensions) to the group with
    #  the name given in the `storage_group_name` property ("`histogramplot_data`" for this class).
    #
    #  NOTE(review): the `var.dims.create_scale(...)` calls use an h5py API that
    #  was deprecated in favor of `Dataset.make_scale` and removed in h5py 3.0;
    #  this code presumably targets an older h5py — verify before upgrading.
    def dump(self, fname, mode='a'):
        compress={'compression':'gzip', 'compression_opts':3, 'shuffle':True, 'fletcher32':True}
        with h5py.File(fname, 'a') as ref:
            time_step = self.time_index_in_output
            # Create-or-open pattern: create_group raises when the group exists.
            try:
                grp = ref.create_group(self.storage_group_name)
                grp.attrs['comment'] = 'Histograms of data'
            except:
                grp = ref[self.storage_group_name]

            ##  For each unique number of bins a dimension is created.
            number_bins_collection = sorted(list(self.number_bins_collection))

            for number_bins in number_bins_collection:
                try:
                    dim_name = 'histogram_bins_{0}'.format(number_bins)
                    var = grp.create_dataset(dim_name, (number_bins,), dtype=np.int32, **compress)
                    var[:] = np.arange(number_bins, dtype=np.int32)
                    var.attrs['long_name'] = "histogram indices for length {0}".format(number_bins)
                except:
                    # Dimension already exists: reuse it and refresh the indices.
                    self.logger.debug("{0}.dump(): skipping dimension '{1}'".format(self.__class__.__name__, dim_name))
                    var = grp[dim_name]
                    var[:] = np.arange(number_bins, dtype=np.int32)

            ## Loop over the variables and store the histogram for each
            #
            #  Create dataset, attach dimensions (as scales), set attributes.
            #  If variable already exists: expand in time-dimension.
            for k in self.variable_names:
                if self.input_variables.variables[k].noscanline:
                    continue

                histogram_bins_dim = 'histogram_bins_{0}'.format(self.input_variables.variables[k].number_bins)
                # Open-and-resize when the dataset exists; otherwise create it,
                # but only for the first time slice (later slices of a dataset
                # that was never created are silently skipped).
                try:
                    self.logger.debug("{0}.dump(): Opening & resizing variable '{1}'".format(self.__class__.__name__, k+'_histogram'))
                    var = grp[k+'_histogram']
                    var.resize(self.time_index_in_output+1, axis=0)
                except KeyError:
                    if self.time_index_in_output > 0:
                        continue
                    self.logger.debug("{0}.dump(): creating variable '{1}'".format(self.__class__.__name__, k+'_histogram'))
                    var = grp.create_dataset(k+'_histogram',
                                             (1, self.input_variables.variables[k].number_bins),
                                             dtype=np.int32,
                                             maxshape=(None,self.input_variables.variables[k].number_bins),
                                             **compress)
                    self.logger.debug("{0}.dump(): attaching dimension 'time' to '{1}'".format(self.__class__.__name__, k+'_histogram'))
                    var.dims.create_scale(ref['time'], 'time')
                    var.dims[0].attach_scale(ref['time'])
                    self.logger.debug("{0}.dump(): attaching dimension '{2}' to '{1}'".format(self.__class__.__name__, k+'_histogram', histogram_bins_dim))
                    var.dims.create_scale(grp[histogram_bins_dim], histogram_bins_dim)
                    var.dims[1].attach_scale(grp[histogram_bins_dim])
                    self.logger.debug("{0}.dump(): Adding attributes to variable '{1}'".format(self.__class__.__name__, k+'_histogram'))
                    var.attrs['long_name'] = self.variables_meta[k]['title']
                    var.attrs['range'] = [float(v) for v in self.variables_meta[k]['data_range']]
                    var.attrs['color_scale'] = self.variables_meta[k]['color_scale']
                    var.attrs['log_range'] = str(self.variables_meta[k]['log_range'])
                    var.attrs['bins'] = k+'_histogram_bins'

                try:
                    var[time_step, :] = self.variables[k+'.histogram']
                except TypeError as err:
                    self.logger.warning(f"Error in variable '{k}'")
                    self.logger.warning(str(err))
                    raise

                ## Create bins variable for histogram axis, including bounds.
                try:
                    var = grp.create_dataset(k+'_histogram_bins',
                                             (self.input_variables.variables[k].number_bins,),
                                             dtype=np.float32,
                                             **compress)
                    var.dims.create_scale(grp[histogram_bins_dim], histogram_bins_dim)
                    var.dims[0].attach_scale(grp[histogram_bins_dim])
                except:
                    var = grp[k+'_histogram_bins']
                var.attrs['bounds'] = k+'_bounds'
                if 'units' in self.variables_meta[k] and self.variables_meta[k]['units'] is not None:
                    var.attrs['units'] = self.variables_meta[k]['units']
                else:
                    var.attrs['units'] = "None"
                # Bin centers: midpoints of consecutive bin edges.
                var[:] = (self.variables[k+'.bins'][0:-1] + self.variables[k+'.bins'][1:])/2.

                # CF-style bounds array: column 0 lower edges, column 1 upper edges.
                try:
                    var = grp.create_dataset(k+'_bounds',
                                             (self.input_variables.variables[k].number_bins, 2),
                                             dtype=np.float32,
                                             **compress)
                    var.dims.create_scale(grp[histogram_bins_dim], histogram_bins_dim)
                    var.dims[0].attach_scale(grp[histogram_bins_dim])
                    var.dims.create_scale(ref['nv'], 'nv')
                    var.dims[1].attach_scale(ref['nv'])
                except:
                    var = grp[k+'_bounds']
                var[:, 0] = self.variables[k+'.bins'][0:-1]
                var[:, 1] = self.variables[k+'.bins'][1:]

                ## Add the weighted histogram.
                #
                #  Create dataset, attach dimensions (as scales), set attributes.
                #  If variable already exists: expand in time-dimension.
                try:
                    var = grp[k+'_histogram_eq']
                    var.resize(self.time_index_in_output+1, axis=0)
                except:
                    if self.time_index_in_output > 0:
                        continue
                    var = grp.create_dataset(k+'_histogram_eq', (1, self.input_variables.variables[k].number_bins),
                                             dtype=np.float32,
                                             maxshape=(None,self.input_variables.variables[k].number_bins),
                                             **compress)
                    var.dims.create_scale(ref['time'], 'time')
                    var.dims[0].attach_scale(ref['time'])
                    var.dims.create_scale(grp[histogram_bins_dim], histogram_bins_dim)
                    var.dims[1].attach_scale(grp[histogram_bins_dim])
                    var.attrs['range'] = self.variables_meta[k]['data_range']
                    var.attrs['color_scale'] = self.variables_meta[k]['color_scale']
                    var.attrs['log_range'] = str(self.variables_meta[k]['log_range'])
                    var.attrs['bins'] = k+'_histogram_bins'
                    var.attrs['long_name'] = "normed histogram of {0}, with cos(latitude) as weight".format(k)
                var[time_step, :] = self.variables[k+'.histogram_eq']

                ## For each of the statistical results, create a variable with dimensions `time`.
                #
                #  The long name is given in this dictionary.
                meanings = {'mean':'mean of the observations',
                            'min':'minimum value',
                            'max':'maximum value',
                            'standard_deviation':'sample standard deviation',
                            'q01':"1% of all values are smaller than this value",
                            'q05':"5% of all values are smaller than this value",
                            'q10':"10% of all values are smaller than this value",
                            'q16':"15.9% of all values are smaller than this value (1 standard deviation)",
                            'q25':"25% of all values are smaller than this value",
                            'q75':"75% of all values are smaller than this value",
                            'q84':"84.1% of all values are smaller than this value (1 standard deviation)",
                            'q90':"90% of all values are smaller than this value",
                            'q95':"95% of all values are smaller than this value",
                            'q99':"99% of all values are smaller than this value",
                            'inter_quartile_range':"q75 - q25",
                            'median':"median value",
                            'count':'number of datapoints',
                            'mode': "Mode (most common) value"}
                for atkey in meanings.keys():
                    try:
                        var = grp[k+'_'+atkey]
                        var.resize(self.time_index_in_output+1, axis=0)
                    except:
                        if self.time_index_in_output > 0:
                            continue
                        var = grp.create_dataset(k+'_'+atkey, (1,),
                            dtype=(np.float64 if (atkey != 'count') else np.int32),
                            maxshape=(None,), **compress)
                        var.dims.create_scale(ref['time'], 'time')
                        var.dims[0].attach_scale(ref['time'])
                        var.attrs['long_name'] = meanings[atkey]
                    # Missing or non-finite statistics are stored as fill values
                    # (fill_value is presumably defined in .utilities — confirm).
                    try:
                        if atkey not in self.variables_meta[k] or not np.isfinite(self.variables_meta[k][atkey]):
                            var[time_step] = fill_value(np.float64 if (atkey != 'count') else np.int32)
                        else:
                            var[time_step] = self.variables_meta[k][atkey]
                    except:
                        self.logger.info("variables: %s", ", ".join(list(self.variables_meta.keys())))
                        self.logger.info("keys: %s", ", ".join(list(self.variables_meta[k].keys())))
                        self.logger.info("value: %f", self.variables_meta[k][atkey])
                        raise CAMAException("Error in HistogramPlot.dump() for variable %s and key %s" % (k, atkey))

                partial_keys = list(self.partial_keys)
                partial_keys.append('count')

                # Per-region (hemisphere / land / sea) statistics, same
                # create-or-resize pattern as the global statistics above.
                for suffix, label in zip(('south', 'north', 'land', 'sea'),
                                         ('southern hemisphere', 'northern hemisphere',
                                          'pixels over land', 'pixels over sea')):
                    for kk in partial_keys:
                        try:
                            var = grp[k+'_'+kk+'_'+suffix]
                            var.resize(self.time_index_in_output+1, axis=0)
                        except:
                            if self.time_index_in_output > 0:
                                continue
                            var = grp.create_dataset(k+'_'+kk+'_'+suffix, (1,),
                                dtype=(np.float64 if kk != 'count' else np.int32),
                                maxshape=(None,), **compress)
                            var.dims.create_scale(ref['time'], 'time')
                            var.dims[0].attach_scale(ref['time'])
                            var.attrs['long_name'] = meanings[kk] + ', for ' + label

                        if kk+'_'+suffix not in self.variables_meta[k] or not np.isfinite(self.variables_meta[k][kk+'_'+suffix]):
                            var[time_step] = fill_value(np.float64 if kk != 'count' else np.int32)
                        else:
                            var[time_step] = self.variables_meta[k][kk+'_'+suffix]
525

    
526
    ## Make a plot of a specified variable
    #
    #  @param varname The name of the variable to plot.
    #  @param figure  The matplotlib.figure.Figure instance to plot to.
    #  @param ax      The matplotlib.axes.Axes object to use. Default is None, in that case `ax = matplotlib.pyplot.gca()` shall be used.
    #  @param weighted Keyword parameter, boolean. When `True` the weighted histogram gets plotted.
    #  @param ylog     Keyword parameter, boolean. When `True` a logarithmic y-axis will be used.
    #  @param legend   Keyword parameter, boolean. When `True` a legend is added to the plot.
    #  @param clip     Keyword parameter, boolean. When `True` the horizontal axis is clipped to the range of the histogram.
    #                  When set to `False` the x-range is determined by the mean and standard deviation or the median and inter quartile range.
    #
    #  @return `False` if no plot could be created (a logarithmic y-axis with a
    #          maximum of 0, or a degenerate bin range), `True` otherwise.
    def plot(self, varname, figure, ax=None, **kwargs):
        import matplotlib
        matplotlib.use('svg')
        matplotlib.rc('font', family='DejaVu Sans')
        import matplotlib.pyplot as plt
        import matplotlib.cm as cm
        from matplotlib.colors import Normalize, LogNorm

        if varname not in self.variables_meta:
            self.logger.debug("Histogram data for '%s' not found.", varname)
            return False
        self.logger.info("Plotting histogram of %s", varname)
        if ax is None:
            ax = plt.gca()

        # Left bin edges; the last edge only closes the final bin.
        left = self.variables[varname+'.bins'][0:-1]
        box = ax.get_position()

        add_xlabel = bool(kwargs.get('add_xlabel', True))
        weighted = bool(kwargs.get('weighted'))
        log = bool(kwargs.get('ylog'))
        show_legend = bool(kwargs.get('legend'))
        clip_x = bool(kwargs.get('clip'))

        meta = self.variables_meta[varname]
        log_x_range = meta['log_range']

        if weighted:
            height = self.variables[varname+'.histogram_eq']
        else:
            height = self.variables[varname+'.histogram']

        # A logarithmic y-axis cannot display an all-zero histogram.
        if log and height.max() <= 0:
            return False

        width = self.variables[varname+'.bins'][1:] - self.variables[varname+'.bins'][0:-1]

        # Degenerate bin range (all edges equal): nothing to draw.
        if self.variables[varname+'.bins'][0] == self.variables[varname+'.bins'][-1]:
            return False

        # Draw on the requested axes. The original used plt.bar()/plt.plot()
        # etc., which target pyplot's *current* axes and silently ignore an
        # explicitly passed, non-current `ax`.
        ax.bar(left, height, width=width, color='b' if weighted else 'g',
               edgecolor='k', linewidth=0.5, align='edge', log=log)

        if 'time' in kwargs and not weighted:
            if isinstance(kwargs['time'], (list, tuple)):
                tt = kwargs['time']
            else:
                tt = [kwargs['time']]
            # NOTE(review): multiple dates are concatenated without a
            # separator; kept as-is to preserve existing titles.
            timestr = "".join([t.strftime("%Y-%m-%d") for t in tt])
            ax.set_title(timestr)

        if log_x_range:
            try:
                ax.set_xscale('log', basex=10, subsx=[2, 3, 4, 5, 6, 7, 8, 9], nonposx='mask')
            except (TypeError, ValueError):
                # `basex`/`subsx`/`nonposx` were removed in matplotlib 3.5
                # (raising TypeError there); fall back to a linear axis.
                self.logger.debug("Tried to use a logarithmic histogram axis, but failed for %s.", varname)
                ax.set_xscale('linear')
        else:
            ax.set_xscale('linear')

        if clip_x:
            ax.set_xlim(meta['data_range'])

        if add_xlabel:
            if meta['units'] == "1":
                ax.set_xlabel(meta['title'])
            else:
                ax.set_xlabel("{0} [{1}]".format(meta['title'], meta['units']))
        ax.set_ylabel("Number of observations" if not weighted else "Observation density")

        # Two marker heights just above the x-axis (geometrically spaced on a
        # log axis, linearly otherwise) for the mean/median annotations.
        ymin, ymax = ax.get_ylim()
        if log:
            decades = math.log10(ymax/ymin)
            ymid = [ymin*10**(decades/6), ymin*10**(decades/6)]
            ylow = [ymin*10**(decades/9), ymin*10**(decades/9)]
        else:
            y = ymin+(ymax-ymin)/25.
            ymid = [ymin+2*(y-ymin), ymin+2*(y-ymin)]
            ylow = [ymin+1*(y-ymin), ymin+1*(y-ymin)]

        if not weighted:
            tbl = []
            tbl.append(r"mean: ${0}$".format(self.number_pretty_printer_pair(meta['mean'], meta['standard_deviation'])))
            tbl.append("minimum: {min}, maximum: {max}".format(min=self.number_pretty_printer(meta['min']), max=self.number_pretty_printer(meta['max'])))
            fmt = r"percentiles: 1\%: {q[0]}, 5\%: {q[1]}, 10\%: {q[2]}, 15.9\%: {q[3]}, 25\%: {q[4]}, 75\%: {q[5]}, 84.1\%: {q[6]}, 90\%: {q[7]}, 95\%: {q[8]},  99\%: {q[9]}"
            tbl.append(fmt.format(q=[self.number_pretty_printer(meta[q]) for q in ('q01', 'q05', 'q10', 'q16', 'q25', 'q75', 'q84', 'q90', 'q95', 'q99')]))
            tbl.append("median: {median}, interquartile range: {iqr}".format(median=self.number_pretty_printer(meta['median']), iqr=self.number_pretty_printer(meta['inter_quartile_range'])))

            # Invisible (alpha=0) text anchor. The original passed the invalid
            # kwarg `axes=ax` to plt.text() and implicitly used data
            # coordinates; the 0.05/0.95 values clearly mean axes fractions.
            ax.text(0.05, 0.95, tbl[0], alpha=0.0, transform=ax.transAxes,
                    size='xx-small', verticalalignment='top', horizontalalignment='left')

        # Point markers for the mean and the median.
        for atkey, c, yy in zip(('mean', 'median'), ('mv', 'r^'), (ylow[0], ymid[0])):
            v = self.variables_meta[varname][atkey]
            ax.plot([v], [yy], c, label=atkey.capitalize())

        # Horizontal spread bars: mean +/- sigma, and the q25..q75 range.
        for atkey, c, lim, yy in zip(('standard_deviation', 'inter_quartile_range'), ('md:', 'rd--'), (('mean',), ('q25', 'q75')), (ylow, ymid)):
            v = self.variables_meta[varname][atkey]
            if len(lim) == 1:
                x = [self.variables_meta[varname][lim[0]] - v, self.variables_meta[varname][lim[0]] + v]
            else:
                x = [self.variables_meta[varname][lim[0]], self.variables_meta[varname][lim[1]]]
            ax.plot(x, yy, c, label=atkey.replace('_', ' ').capitalize())

        if show_legend:
            # Shrink the axes slightly so the legend fits above the plot.
            ax.set_position([box.x0, box.y0, box.width, box.height * 0.97], 'both')
            ax.legend(ncol=4, fontsize='x-small', bbox_to_anchor=(0.0, 1.0, 1.0, 0.03),
                      loc=3, mode="expand", borderaxespad=0.)

        return True
652

    
653
    ## Generate a LaTeX table with statistics for inclusion in the report.
654
    #  @param kwargs No keywords are actually used.
655
    #  @return String with table.
656
    #
657
    #  The table uses a LaTeX boxes called `\parameterbox` and `\percentilebox`
658
    #  (these boxes are already created in the latex template for the report),
659
    #  and determines (within LaTeX) which orientation of the table gives the best result.
660
    #
661
    def report(self, **kwargs):
662
        rval = []
663

    
664
        caption_text = "Parameterlist and basic statistics for the analysis"
665
        label = 'table:parameters'
666

    
667
        rval.append(r'\savebox{\parameterbox}{')
668
        rval.append(r'\begin{tabular}{l|ccccccc}')
669
        rval.append(r"Variable & mean\,$\pm\sigma$ & Count & Mode & IQR & Median & Minimum & Maximum \\")
670
        fmt = "{name} [{unit}] & ${mean}$ & ${count}$ & ${mode}$ & ${iqr}$ & ${median}$ & ${min}$ & ${max}$ \\\\"
671
        for name in self.variable_names:
672
            meta = self.variables_meta[name]
673
            rval.append(fmt.format(name=name.replace('_', ' '),
674
                                   unit=meta['units'],
675
                                   mean=self.number_pretty_printer_pair(meta['mean'], meta['standard_deviation']),
676
                                   mode=self.number_pretty_printer(meta['mode']),
677
                                   iqr=self.number_pretty_printer(meta['inter_quartile_range']),
678
                                   median=self.number_pretty_printer(meta['median']),
679
                                   min=self.number_pretty_printer(meta['min']),
680
                                   max=self.number_pretty_printer(meta['max']),
681
                                   count=int(meta['count'])))
682

    
683
        rval.append(r'\end{tabular}}')
684
        tabletemplate = r"""
685

686
\setlength{\naturaltablewidth}{\widthof{\usebox{\parameterbox}}}
687
\setlength{\naturaltableheight}{\heightof{\usebox{\parameterbox}}}
688
\addtolength{\naturaltableheight}{\depthof{\usebox{\parameterbox}}}
689
\setlength{\naturaltextratio}{1pt*\ratio{0.92\textheight}{\textwidth}}
690
\setlength{\naturaltableratio}{1pt*\ratio{\naturaltableheight}{\naturaltablewidth}}
691
\setlength{\realtextheight}{\textheight}
692
\setlength{\realtextwidth}{\textwidth}
693
%%
694
\ifthenelse{\lengthtest{0.92\textheight>\naturaltableheight}%%
695
            \AND \lengthtest{\textwidth>\naturaltablewidth}}%%
696
{%%(then)
697
  \begin{table}[htbp]%%
698
  \caption{%s}\label{%s}%%
699
  \centering\usebox{\parameterbox}%%
700
  \end{table}%%
701
}%%
702
{%%
703
  \begin{landscape}%%
704
  \begin{table}%%
705
    \caption{%s}\label{%s}%%
706
    \ifthenelse{\lengthtest{0.92\realtextheight>\naturaltableheight}%%
707
                \AND \lengthtest{\realtextwidth>\naturaltablewidth}}%%
708
      {\centering\usebox{\parameterbox}}%%
709
      {%%
710
        \ifthenelse{\lengthtest{\naturaltableratio<\naturaltextratio}}%%
711
          {\resizebox*{\realtextheight}{!}{\usebox{\parameterbox}}}%%
712
          {\resizebox*{!}{0.92\realtextwidth}{\usebox{\parameterbox}}}%%
713
      }
714
  \end{table}%%
715
  \end{landscape}%%
716
}
717

718
"""
719
        rval.append(tabletemplate % (caption_text, label, caption_text, label))
720

    
721
        caption_text = "Percentile ranges"
722
        label = 'table:percentiles'
723

    
724
        rval.append(r'\savebox{\percentilebox}{')
725
        rval.append(r'\begin{tabular}{l|ccccccccccc}')
726
        rval.append(r"Variable & 1\,\% & 5\,\% & 10\,\% & 15.9\,\% & 25\,\% & 75\,\% & 84.1\,\% & 90\,\% & 95\,\% & 99\,\% \\")
727
        fmt = "{name} [{unit}]  & ${q[0]}$ & ${q[1]}$ & ${q[2]}$ & ${q[3]}$ & ${q[4]}$ & ${q[5]}$ & ${q[6]}$ & ${q[7]}$ & ${q[8]}$ & ${q[9]}$ \\\\"
728
        for name in self.variable_names:
729
            meta = self.variables_meta[name]
730
            rval.append(fmt.format(unit=meta['units'],
731
                                   name=name.replace('_', ' '),
732
                                   q=[self.number_pretty_printer(meta[q]) for q in ('q01', 'q05', 'q10', 'q16', 'q25', 'q75', 'q84', 'q90', 'q95', 'q99')]))
733
        rval.append(r'''\end{tabular}}''')
734

    
735
        rval.append(tabletemplate.replace('parameterbox', 'percentilebox') % (caption_text, label, caption_text, label))
736

    
737
        partial_keys = list(self.partial_keys)
738
        partial_keys.append('count')
739

    
740
        if partial_keys[0]+'_south' not in self.variables_meta[self.variable_names[0]]:
741
            return "\n".join(rval)
742

    
743
        for suffix, label in zip(('north', 'south', 'sea', 'land'), ('in the northern hemisphere', 'in the southern hemisphere', 'over water', 'over land')):
744
            caption_text = "Parameterlist and basic statistics for the analysis for observations {0}".format(label)
745
            label = 'table:parameters_{0}'.format(suffix)
746

    
747
            rval.append(r'\savebox{{\parameterbox{0}}}{{'.format(suffix))
748
            rval.append(r'\begin{tabular}{l|cccccccc}')
749
            rval.append(r"Variable & mean\,$\pm\sigma$ & Count & IQR & Median & Minimum & Maximum & 25\,\% percentile & 75\,\% percentile\\")
750
            fmt = "{name} [{unit}] & ${mean}$ & ${count}$ & ${iqr}$ & ${median}$ & ${min}$ & ${max}$ & ${q25}$ & ${q75}$ \\\\"
751
            for name in self.variable_names:
752
                meta = self.variables_meta[name]
753
                rval.append(fmt.format(name=name.replace('_', ' '),
754
                                       unit=meta['units'],
755
                                       mean=self.number_pretty_printer_pair(meta['mean_'+suffix], meta['standard_deviation_'+suffix]),
756
                                       iqr=self.number_pretty_printer(meta['inter_quartile_range_'+suffix]),
757
                                       median=self.number_pretty_printer(meta['median_'+suffix]),
758
                                       min=self.number_pretty_printer(meta['min_'+suffix]),
759
                                       max=self.number_pretty_printer(meta['max_'+suffix]),
760
                                       count=int(meta['count_'+suffix]),
761
                                       q25=self.number_pretty_printer(meta['q25_'+suffix]),
762
                                       q75=self.number_pretty_printer(meta['q75_'+suffix])))
763

    
764
            rval.append(r'\end{tabular}}')
765
            rval.append(tabletemplate.replace('parameterbox', 'parameterbox'+suffix) % (caption_text, label, caption_text, label))
766

    
767
        return "\n".join(rval)
768

    
769
    ## Add a table to the report using matplotlib alone. The preferred output uses LaTeX.
    #
    #  @param ax The axis to use, defaults to `ax = matplotlib.pyplot.gca()`
    #  @param figure The figure instance to use, defaults to `figure = matplotlib.pyplot.gcf()`.
    #  @param variant Which of the tables should be printed. Keyword argument with values
    #  `base` or `percentile`. Indicates which table is written.
    #
    #  @return Single-element list containing the matplotlib table artist.
    def table(self, ax=None, figure=None, **kwargs):
        # Unused imports (matplotlib.cm, Normalize, LogNorm) removed.
        import matplotlib.pyplot as plt

        variant = kwargs.get('variant', 'base')

        self.logger.info("Adding table for %s parameters", variant)

        if ax is None:
            ax = plt.gca()
        if figure is None:
            figure = plt.gcf()

        # Both variants share the row-building loop; only the column labels,
        # the statistic keys, and which keys are integers differ.
        if variant == 'base':
            colLabels = ['Name', 'Mean', 'StdDev', 'Count', 'Mode', 'IQR', 'Median', 'Min', 'Max']
            keys = ('mean', 'standard_deviation', 'count', 'mode', 'inter_quartile_range', 'median', 'min', 'max')
            int_keys = ('count',)
        else:
            colLabels = ['Name', '1%', '5%', '10%', '15.9%', '25%', '75%', '84.1%', '90%', '95%', '99%']
            keys = ('q01', 'q05', 'q10', 'q16', 'q25', 'q75', 'q84', 'q90', 'q95', 'q99')
            int_keys = ()

        contents = []
        for name in self.variable_names:
            if name not in self.variables_meta:
                continue  # configured variable without collected statistics
            meta = self.variables_meta[name]
            line = [name]
            for q in keys:
                try:
                    if q in int_keys:
                        line.append("{0:d}".format(int(meta[q])))
                    else:
                        line.append("{0:.4g}".format(meta[q]))
                except TypeError:
                    # Missing/None statistic: show a placeholder, don't fail.
                    line.append('--')
            contents.append(line)

        # Size the figure to the table dimensions.
        # NOTE(review): nrows counts all configured variables, including any
        # skipped above — preserved from the original implementation.
        nrows, ncols = len(self.variable_names)+1, len(colLabels)
        hcell, wcell = 0.3, 1.
        hpad, wpad = 0, 0
        figure.set_size_inches(ncols*wcell+wpad, nrows*hcell+hpad)

        ax.axis('off')
        # Do the table.
        the_table = ax.table(cellText=contents, colLabels=colLabels, loc='center')
        return [the_table]