Project

General

Profile

Statistics
| Branch: | Tag: | Revision:

pycama / src / pycama / HistogramPlot.py @ 834:d989df597b80

History | View | Annotate | Download (45.3 KB)

1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3

    
4
# Copyright 2016-2017 Maarten Sneep, KNMI
5
#
6
# Redistribution and use in source and binary forms, with or without
7
# modification, are permitted provided that the following conditions are met:
8
#
9
# 1. Redistributions of source code must retain the above copyright notice,
10
#    this list of conditions and the following disclaimer.
11
#
12
# 2. Redistributions in binary form must reproduce the above copyright notice,
13
#    this list of conditions and the following disclaimer in the documentation
14
#    and/or other materials provided with the distribution.
15
#
16
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
20
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
23
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26

    
27
## @file HistogramPlot.py
28
#
29
# This file defines a subclass of the pycama.AnalysisAndPlot.AnalysisAndPlot class.
30
# This subclass extracts the histograms from the input data.
31
#
32
# @author Maarten Sneep
33

    
34
import math
35
import logging
36
from argparse import Namespace
37

    
38
import numpy as np
39
import netCDF4
40

    
41
from .AnalysisAndPlot import AnalysisAndPlot
42

    
43
from .utilities import *
44

    
45
def interquartile(x):
    """Return the inter-quartile range (q75 - q25) of *x*, ignoring NaN values."""
    q25, q75 = np.nanpercentile(x, [25, 75])
    return q75 - q25
48

    
49
def lower_quartile(x):
    """Return the lower quartile (25th percentile) of *x*, ignoring NaN values."""
    # A scalar percentile argument makes nanpercentile return a scalar directly,
    # so the original one-element-array indexing is unnecessary.
    return np.nanpercentile(x, 25)
52

    
53
def upper_quartile(x):
    """Return the upper quartile (75th percentile) of *x*, ignoring NaN values."""
    # A scalar percentile argument makes nanpercentile return a scalar directly,
    # so the original one-element-array indexing is unnecessary.
    return np.nanpercentile(x, 75)
56

    
57
## Make histograms (and gather other statistics).
58
#
59
#  Also includes netCDF write & read routines. Plotting should be possible from
60
#  a restored object.
61
#
62
class HistogramPlot(AnalysisAndPlot):
63
    ## Define a few variables to collect the number of bins in the histograms
64
    #
65
    #  There is one default (via the `number_bins` keyword argument, 100 if that value isn't given)
66
    #  but each variable can define its own number of histogram bins.
67
    #  To generate the required dimensions in the output, all histogram sizes are collected in a set.
68
    def setup(self, **kwargs):
        """Initialize histogram bookkeeping.

        Keyword arguments:
          number_bins: default number of histogram bins (100 when absent);
                       individual variables may override this in add_variable().
        """
        # dict.get() replaces the original explicit 'in kwargs' membership test.
        self.number_bins = kwargs.get('number_bins', 100)

        # All distinct bin counts are collected here so that dump() can create
        # one netCDF/HDF5 dimension per histogram size.
        self.number_bins_collection = set()
        # Names of the per-region statistics and the callables that compute them.
        # The two tuples are parallel and are zipped together in process().
        self.partial_keys = ('mean', 'min', 'max', 'standard_deviation', 'inter_quartile_range', 'median', 'q25', 'q75')
        self.operands = (np.nanmean, np.nanmin, np.nanmax, lambda x: np.nanstd(x, ddof=1), interquartile, np.nanmedian, lower_quartile, upper_quartile)
77

    
78
    ## Add a variable to the list 'to be processed'
79
    #
80
    #  @param var A pycama.Variable.Variable instance.
81
    #
82
    #  This method reserves memory and created the bins for the histograms.
83
    def add_variable(self, var):
        """Add a variable to the list 'to be processed'.

        @param var A pycama.Variable.Variable instance.

        Reserves the histogram count arrays and creates the bin boundaries.
        """
        # Fall back to the observed data range when no range was configured.
        if var.data_range is None:
            var.data_range = [np.min(var.aggregate_data), np.max(var.aggregate_data)]
        if var.number_bins is None:
            var.number_bins = self.number_bins
        # Record the bin count so dump() can create a matching dimension.
        self.number_bins_collection.add(var.number_bins)

        # number_bins+1 edges delimit number_bins bins; logarithmic spacing on request.
        if var.log_range:
            b = np.logspace(math.log10(var.data_range[0]), math.log10(var.data_range[1]), num=var.number_bins+1, endpoint=True, base=10.0, dtype=np.float32)
        else:
            b = np.linspace(var.data_range[0], var.data_range[1], num=var.number_bins+1, endpoint=True, dtype=np.float32)

        self.variables[var.name+'.histogram'] = np.zeros((var.number_bins,), dtype=np.int32)
        self.variables[var.name+'.histogram_eq'] = np.zeros((var.number_bins,), dtype=np.float32)
        self.variables[var.name+'.bins'] = b

        # Fix: super(self.__class__, self) recurses infinitely if this class is
        # ever subclassed; the zero-argument form is the correct Python 3 idiom.
        super().add_variable(var)
100

    
101
    ## Get a list of variable names.
102
    @property
    def variable_names(self):
        """Names of all registered variables, derived from the '<name>.bins' keys."""
        names = []
        for key in list(self.variables.keys()):
            if not key.endswith('.bins'):
                continue
            names.append(key.replace('.bins', ''))
        return names
105

    
106
    ## Extract the required information from the input data.
107
    #
108
    #  This method filters out the variables that should not be collected, and generates histograms for the rest.
109
    #  In addition the global statistics are collected as well, i.e. `mean`, `min`, `max`, `standard_deviation`,
110
    #  `q01`, `q05`, `q10`, `q16`, `q25`, `q50`, `q75`, `q84`, `q90`, `q95`, `q99`, `inter_quartile_range`, `median`,
111
    #  and `mode` (these are the same statistics collected by the pycama.AlongTrackPlot.AlongTrackPlot class,
112
    #  but there they are collected per row, not for the whole dataset). The additional q16 and q84 (15.8655% and 84.1345%)
113
    #  correspond to \f$\pm1\,\sigma\f$ in a normally distributed variable.
114
    #
115
    #  The histograms are collected in two variants (using hte same bins):
116
    #  one simply counts the number of observations in each bin, the other
117
    #  uses \f$\cos(\delta_{\mbox{geo}})\f$ as weights. The latter should
118
    #  correct for the overrepresentation of the polar regions in our
119
    #  sun-synchronous polar orbit.
120
    #
121
    #  Part of the results are stored in the self.variables dictionary, another
122
    #  part is stored in the self.variables_meta dictionary.
123
    def process(self):
        """Build histograms and whole-dataset statistics for each selected variable.

        Histogram arrays go into self.variables (keys '<name>.histogram',
        '<name>.histogram_eq'); scalar statistics (mean, min, max, quantiles,
        mode, counts, and the per-region partials) go into
        self.variables_meta[<name>].
        """
        # Surface classification input is optional; without it the land/sea
        # partial statistics further down are skipped (selections stay None).
        try:
            surface_flags = np.asarray(self.input_variables.variables['surface_classification'].aggregate_data, dtype=np.uint8)
        except KeyError:
            surface_flags = None

        lat = self.input_variables.variables['latitude'].aggregate_data
        # cos(latitude) weights for the area-corrected ("eq") histogram, to
        # compensate the polar oversampling of the sun-synchronous orbit.
        w = np.cos((np.pi/180.0)*lat)
        n_variables = len(self.input_variables.variables)  # NOTE(review): unused local
        i = 0
        var_count = 0

        # First pass only counts the variables passing the filter, so that the
        # progress callback in the second pass can report a percentage.
        for v in list(self.input_variables.variables.values()):
            if not v.show or v.noscanline or v.aggregate_data is None or not hasattr(v.aggregate_data, "mask") or len(v.aggregate_data) == 0:
                continue
            var_count += 1

        for v in list(self.input_variables.variables.values()):
            # Same filter as the counting pass above.
            if not v.show or v.noscanline or v.aggregate_data is None or not hasattr(v.aggregate_data, "mask") or len(v.aggregate_data) == 0:
                continue
            if i > 0:
                self.progress(100*i/var_count)
            i += 1
            # Valid observations: finite and not masked (aggregate_data is a masked array).
            cidx = np.logical_and(np.isfinite(v.aggregate_data), np.logical_not(v.aggregate_data.mask))
            if not np.any(cidx):
                # No valid data at all: record NaN for every statistic, keep the
                # raw element count, and skip the histograms.
                self.logger.warning("%s: No valid data in '%s'", self.__class__.__name__, v.name)
                for k in ('mean', 'min', 'max', 'standard_deviation',
                          'q01', 'q05', 'q10', 'q16', 'q25', 'q50', 'q75', 'q84', 'q90', 'q95', 'q99',
                          'inter_quartile_range', 'median', 'mode'):
                    self.variables_meta[v.name][k] = np.nan
                self.variables_meta[v.name]['count'] = v.aggregate_data.shape[0]
                continue

            # Two histograms over the same bin edges: plain counts, and a
            # cos(latitude)-weighted, density-normalized variant.
            self.variables[v.name+'.histogram'][:] = np.histogram(v.aggregate_data[cidx], bins=self.variables[v.name+'.bins'])[0]
            self.variables[v.name+'.histogram_eq'][:] = np.histogram(v.aggregate_data[cidx], bins=self.variables[v.name+'.bins'],
                                                                     weights=w[cidx], density=True)[0]
            self.variables_meta[v.name]['mean'] = np.nanmean(v.aggregate_data[cidx])
            self.variables_meta[v.name]['min'] = np.nanmin(v.aggregate_data[cidx])
            self.variables_meta[v.name]['max'] = np.nanmax(v.aggregate_data[cidx])
            self.variables_meta[v.name]['standard_deviation'] = np.nanstd(v.aggregate_data[cidx], ddof=1)
            # 15.8655% / 84.1345% are the +/- 1 sigma points of a normal
            # distribution; the key names round these to 'q16' and 'q84'.
            q_level = [1, 5, 10, 15.86552539315, 25, 50, 75, 84.13447460685, 90, 95, 99]
            q_val = np.nanpercentile(v.aggregate_data[cidx], q_level)
            for q, qv in zip(q_level, q_val):
                self.variables_meta[v.name]['q{0:02d}'.format(int(math.floor(q+0.5)))] = float(qv)
            self.variables_meta[v.name]['inter_quartile_range'] = self.variables_meta[v.name]['q75'] - self.variables_meta[v.name]['q25']
            self.variables_meta[v.name]['median'] = self.variables_meta[v.name]['q50']
            self.variables_meta[v.name]['count'] = v.aggregate_data.shape[0]
            # Mode is estimated from the histogram: bin centres vs. bin counts.
            self.variables_meta[v.name]['mode'] = self.mode((self.variables[v.name+'.bins'][0:-1] + self.variables[v.name+'.bins'][1:])/2, self.variables[v.name+'.histogram'])

            # Partial selections: hemispheres by latitude sign; land/sea by the
            # lower two bits of the surface classification flags (0: land, 1: sea).
            scidx = np.logical_and(np.logical_and(np.isfinite(v.aggregate_data), np.logical_not(v.aggregate_data.mask)), lat < 0)
            ncidx = np.logical_and(np.logical_and(np.isfinite(v.aggregate_data), np.logical_not(v.aggregate_data.mask)), lat > 0)
            if surface_flags is not None:
                land_cidx = np.logical_and(np.logical_and(np.isfinite(v.aggregate_data), np.logical_not(v.aggregate_data.mask)), (surface_flags & 3) == 0)
                sea_cidx = np.logical_and(np.logical_and(np.isfinite(v.aggregate_data), np.logical_not(v.aggregate_data.mask)), (surface_flags & 3) == 1)
            else:
                land_cidx = None
                sea_cidx = None

            for selection, suffix in zip((scidx, ncidx, land_cidx, sea_cidx), ('south', 'north', 'land', 'sea')):
                if selection is None:
                    continue
                if not np.any(selection):
                    # Empty selection yields NaN statistics; CH4 has no retrievals
                    # over sea, so that specific case is not worth a warning.
                    if not (self.product == 'CH4___' and suffix == 'sea'):
                        self.logger.warning("%s: No valid data for '%s' in '%s'", self.__class__.__name__, suffix, v.name)
                    for k in self.partial_keys:
                        self.variables_meta[v.name][k+'_'+suffix] = np.nan
                    self.variables_meta[v.name]['count_'+suffix] = np.sum(selection)
                else:
                    # partial_keys and operands are parallel tuples (see setup()).
                    for k, operand in zip(self.partial_keys, self.operands):
                        self.variables_meta[v.name][k+'_'+suffix] = operand(v.aggregate_data[selection])
                    self.variables_meta[v.name]['count_'+suffix] = np.sum(selection)


            # Automated range warnings: not useful

            # range_delta = self.variables_meta[v.name]['data_range'][1] - self.variables_meta[v.name]['data_range'][0]
            # range_offset = self.variables_meta[v.name]['data_range'][0]
            # range_warning = ((self.variables_meta[v.name]['min'] - range_offset + 0.25*range_delta < 0.0)
            #                  or (self.variables_meta[v.name]['max'] - range_offset + 0.75*range_delta > range_delta)
            #                  or (self.variables_meta[v.name]['q10'] - range_offset - 0.25*range_delta > 0.0)
            #                  or (self.variables_meta[v.name]['q90'] - range_offset + 1.75*range_delta > range_delta))
            # if range_warning:
            #     self.logger.info("Range for variable '{0}' does not match observed data range.".format(v.name))
            #     self.logger.info("Configured range [{0[0]}, {0[1]}]; observed range [{1[0]}, {1[1]}] (min, max); [{2[0]}. {2[1]}] (q10, q90)".format(self.variables_meta[v.name]['data_range'],
            #                      [self.variables_meta[v.name]['min'], self.variables_meta[v.name]['max']], [self.variables_meta[v.name]['q10'], self.variables_meta[v.name]['q90']]))
208

    
209
    ## Merge data into a combined dataset.
210
    #
211
    #  @param other The object to be added to self,
212
    #               also an instance of the pycama.HistogramPlot.HistogramPlot class.
213
    #
214
    # While the mean, standard deviation, minimum and maximum can be aggregated exactly,
215
    # the median and percentiles can't (you need the full dataset for those). As a surrogate
216
    # we use the mean of the values.
217
    def __iadd__(self, other):
        """Merge another HistogramPlot instance into this one (in-place add).

        @param other The object to be added to self, also an instance of the
                     pycama.HistogramPlot.HistogramPlot class.

        Counts, raw histograms, min, max and (via the variances) the standard
        deviation combine exactly; mean-like quantities (mean, median, mode,
        percentiles, IQR) cannot be aggregated without the full dataset and are
        approximated by a count-weighted mean.
        """
        for name in [n.split('.')[0] for n in self.variables.keys() if n.endswith('.histogram')]:
            self.variables[name+'.histogram'] += other.variables[name+'.histogram']
            count = self.variables_meta[name]['count']
            ocount = other.variables_meta[name]['count']
            if ocount > 0:
                # Count-weighted mean of the two normalized histograms.
                self.variables[name+'.histogram_eq'] = (self.variables[name+'.histogram_eq']*count + other.variables[name+'.histogram_eq']*ocount) / (count+ocount)

                mean_combine = lambda a, b: (a*count + b*ocount) / (count+ocount)
                # NOTE(review): 'q10' and 'q50' are absent from this list — confirm intent.
                for meaning in ["mean", "median", "mode", "q01", "q05", "q16", "q25", "q75", "q84", "q90", "q95", "q99", "inter_quartile_range"]:
                    self.variables_meta[name][meaning] = mean_combine(self.variables_meta[name][meaning], other.variables_meta[name][meaning])
                self.variables_meta[name]['min'] = min(self.variables_meta[name]['min'], other.variables_meta[name]['min'])
                self.variables_meta[name]['max'] = max(self.variables_meta[name]['max'], other.variables_meta[name]['max'])
                # Combine variances with the count-weighted mean, then take the root.
                self.variables_meta[name]['standard_deviation'] = math.sqrt(mean_combine(self.variables_meta[name]['standard_deviation']**2,
                                                                                         other.variables_meta[name]['standard_deviation']**2))
            self.variables_meta[name]['count'] += other.variables_meta[name]['count']

            # Same merge for the per-region (hemisphere/surface) statistics,
            # which may be missing from data produced by older versions.
            try:
                for suffix in ('south', 'north', 'land', 'sea'):
                    count = self.variables_meta[name]['count_'+suffix]
                    ocount = other.variables_meta[name]['count_'+suffix]
                    if ocount > 0:
                        mean_combine = lambda a, b: (a*count + b*ocount) / (count+ocount)
                        for k in [s for s in self.partial_keys if s not in ('min', 'max', 'standard_deviation')]:
                            self.variables_meta[name][k+'_'+suffix] = mean_combine(self.variables_meta[name][k+'_'+suffix], other.variables_meta[name][k+'_'+suffix])
                        self.variables_meta[name]['min_'+suffix] = min(self.variables_meta[name]['min_'+suffix], other.variables_meta[name]['min_'+suffix])
                        self.variables_meta[name]['max_'+suffix] = max(self.variables_meta[name]['max_'+suffix], other.variables_meta[name]['max_'+suffix])
                        self.variables_meta[name]['standard_deviation_'+suffix] = math.sqrt(mean_combine(self.variables_meta[name]['standard_deviation_'+suffix]**2, other.variables_meta[name]['standard_deviation_'+suffix]**2))
                        self.variables_meta[name]['count_'+suffix] += other.variables_meta[name]['count_'+suffix]
            except KeyError:
                self.logger.warning("%s.__iadd__: Partial statistics not available.", self.__class__.__name__)

        # Fix: __iadd__ must return self; without it `a += b` rebinds a to None.
        return self
248

    
249

    
250

    
251
    ## Read processed data from the input file, for specified time index.
252
    #
253
    #  @param fname NetCDF file with input data.
254
    #  @param time_index Time slice to read.
255
    def ingest(self, fname, time_index, exclude=None):
        """Read processed data back from `fname` for one time slice.

        @param fname NetCDF/HDF5 file with input data.
        @param time_index Time slice to read.
        @param exclude Optional collection of variable names to skip.

        @return True on success; raises when the storage group or mandatory
                attributes are missing.
        """
        self.time_index_in_output = time_index
        with h5py.File(fname, 'r') as ref:
            # h5py raises KeyError for a missing group; log, then re-raise.
            # (Fix: the original had an unreachable `return False` after `raise`.)
            try:
                grp = ref[self.storage_group_name]
            except KeyError:
                self.logger.error("histogram data not in '%s'.", os.path.basename(fname))
                raise

            vlist = [k.decode("ASCII") for k in ref['variable_names'][:]]
            if exclude is not None:
                vlist = [v for v in vlist if v not in exclude]

            self.input_variables = Namespace(variables={})

            for k in vlist:
                # Only variables that actually have a stored histogram.
                if k+'_histogram' not in list(grp.keys()):
                    continue
                var = grp[k+'_histogram']

                self.variables_meta[k] = {}
                self.variables_meta[k]['noscanline'] = False
                try:
                    self.variables[k+'.histogram'] = var[time_index, :]
                except ValueError:
                    self.logger.error("Error in key '{0}'".format(k))
                    raise
                self.number_bins_collection.add(self.variables[k+'.histogram'].shape[0])
                self.input_variables.variables[k] = Namespace(noscanline=False, number_bins=var.shape[1])
                # 'long_name' is mandatory; its absence is an error.
                try:
                    self.variables_meta[k]['title'] = var.attrs['long_name']
                except KeyError:
                    self.logger.error("No long_name on '{0}'".format(k))
                    raise
                # HDF5 string attributes may come back as bytes; normalize to str.
                if isinstance(self.variables_meta[k]['title'], bytes):
                    self.variables_meta[k]['title'] = str(self.variables_meta[k]['title'], 'utf-8')

                self.variables_meta[k]['data_range'] = var.attrs['range']
                self.variables_meta[k]['color_scale'] = var.attrs['color_scale']
                if isinstance(self.variables_meta[k]['color_scale'], bytes):
                    self.variables_meta[k]['color_scale'] = str(self.variables_meta[k]['color_scale'], 'utf-8')
                # 'log_range' was stored via str(bool); compare case-insensitively
                # against 'true' in either bytes or str form.
                self.variables_meta[k]['log_range'] = (var.attrs['log_range'].lower() == b'true'
                                                       if isinstance(var.attrs['log_range'], bytes) else
                                                       var.attrs['log_range'].lower() == 'true')

                var = grp[k+'_histogram_bins']
                self.variables_meta[k]['units'] = var.attrs['units']
                if isinstance(self.variables_meta[k]['units'], bytes):
                    self.variables_meta[k]['units'] = str(self.variables_meta[k]['units'], 'utf-8')

                var = grp[k+'_histogram_eq']
                self.variables[k+'.histogram_eq'] = var[time_index, :]

                # Reconstruct the bin edges from the bounds variable:
                # all lower edges plus the last upper edge.
                var = grp[k+'_bounds']
                self.variables[k+'.bins'] = np.concatenate((var[:, 0], [var[-1, 1]]))

                for atkey in ('mean', 'min', 'max', 'standard_deviation',
                              'q01', 'q05', 'q10', 'q16', 'q25', 'q75', 'q84', 'q90', 'q95', 'q99',
                              'inter_quartile_range', 'median', 'count', 'mode'):
                    var = grp[k+'_'+atkey]
                    self.variables_meta[k][atkey] = var[time_index]

                partial_keys = list(self.partial_keys)
                partial_keys.append('count')

                # Per-region statistics are optional (absent in older files);
                # missing ones are only logged at debug level.
                for suffix in ('south', 'north', 'land', 'sea'):
                    for kk in partial_keys:
                        try:
                            var = grp[k+'_'+kk+'_'+suffix]
                            self.variables_meta[k][kk+'_'+suffix] = var[time_index]
                        except:
                            self.logger.debug("{0}.ingest(): variable '{1}' not available".format(self.__class__.__name__, k+'_'+kk+'_'+suffix))
        return True
329

    
330
    ## Write processed data to output netcdf file.
331
    #
332
    #  @param fname File to write to
333
    #  @param mode  Writing mode, defaults to append.
334
    #
335
    #  Write data (including extraction specific dimensions) to the group with
336
    #  the name given in the `storage_group_name` property ("`histogramplot_data`" for this class).
337
    def dump(self, fname, mode='a'):
        """Write histograms and statistics for the current time slice to `fname`.

        @param fname File to write to.
        @param mode  Writing mode, defaults to append.
                     NOTE(review): the file is opened with a literal 'a' below,
                     so this parameter is currently unused — confirm intent.

        Data (including the histogram-size dimensions) goes into the group named
        by `storage_group_name`; datasets grow along the time axis on later calls.
        """
        compress={'compression':'gzip', 'compression_opts':3, 'shuffle':True, 'fletcher32':True}
        with h5py.File(fname, 'a') as ref:
            time_step = self.time_index_in_output
            # Create the output group, or reuse it when it already exists.
            try:
                grp = ref.create_group(self.storage_group_name)
                grp.attrs['comment'] = 'Histograms of data'
            except:
                grp = ref[self.storage_group_name]

            ##  For each unique number of bins a dimension is created.
            number_bins_collection = sorted(list(self.number_bins_collection))

            for number_bins in number_bins_collection:
                # create_dataset raises when the dimension dataset already exists;
                # in that case reuse it and just refresh the index values.
                try:
                    dim_name = 'histogram_bins_{0}'.format(number_bins)
                    var = grp.create_dataset(dim_name, (number_bins,), dtype=np.int32, **compress)
                    var[:] = np.arange(number_bins, dtype=np.int32)
                    var.attrs['long_name'] = "histogram indices for length {0}".format(number_bins)
                except:
                    self.logger.debug("{0}.dump(): skipping dimension '{1}'".format(self.__class__.__name__, dim_name))
                    var = grp[dim_name]
                    var[:] = np.arange(number_bins, dtype=np.int32)

            ## Loop over the variables and store the histogram for each
            #
            #  Create dataset, attach dimensions (as scales), set attributes.
            #  If variable already exists: expand in time-dimension.
            #  NOTE(review): dims.create_scale is deprecated in newer h5py
            #  (replaced by make_scale) — presumably pinned to an older h5py; verify.
            for k in self.variable_names:
                if self.input_variables.variables[k].noscanline:
                    continue

                histogram_bins_dim = 'histogram_bins_{0}'.format(self.input_variables.variables[k].number_bins)
                # Reuse-and-resize when the dataset exists; otherwise create it,
                # but only at the first time step (a variable appearing later
                # would be misaligned on the time axis and is skipped).
                try:
                    self.logger.debug("{0}.dump(): Opening & resizing variable '{1}'".format(self.__class__.__name__, k+'_histogram'))
                    var = grp[k+'_histogram']
                    var.resize(self.time_index_in_output+1, axis=0)
                except KeyError:
                    if self.time_index_in_output > 0:
                        continue
                    self.logger.debug("{0}.dump(): creating variable '{1}'".format(self.__class__.__name__, k+'_histogram'))
                    var = grp.create_dataset(k+'_histogram',
                                             (1, self.input_variables.variables[k].number_bins),
                                             dtype=np.int32,
                                             maxshape=(None,self.input_variables.variables[k].number_bins),
                                             **compress)
                    self.logger.debug("{0}.dump(): attaching dimension 'time' to '{1}'".format(self.__class__.__name__, k+'_histogram'))
                    var.dims.create_scale(ref['time'], 'time')
                    var.dims[0].attach_scale(ref['time'])
                    self.logger.debug("{0}.dump(): attaching dimension '{2}' to '{1}'".format(self.__class__.__name__, k+'_histogram', histogram_bins_dim))
                    var.dims.create_scale(grp[histogram_bins_dim], histogram_bins_dim)
                    var.dims[1].attach_scale(grp[histogram_bins_dim])
                    self.logger.debug("{0}.dump(): Adding attributes to variable '{1}'".format(self.__class__.__name__, k+'_histogram'))
                    var.attrs['long_name'] = self.variables_meta[k]['title']
                    var.attrs['range'] = [float(v) for v in self.variables_meta[k]['data_range']]
                    var.attrs['color_scale'] = self.variables_meta[k]['color_scale']
                    var.attrs['log_range'] = str(self.variables_meta[k]['log_range'])
                    var.attrs['bins'] = k+'_histogram_bins'

                try:
                    var[time_step, :] = self.variables[k+'.histogram']
                except TypeError as err:
                    self.logger.warning(f"Error in variable '{k}'")
                    self.logger.warning(str(err))
                    raise

                ## Create bins variable for histogram axis, including bounds.
                try:
                    var = grp.create_dataset(k+'_histogram_bins',
                                             (self.input_variables.variables[k].number_bins,),
                                             dtype=np.float32,
                                             **compress)
                    var.dims.create_scale(grp[histogram_bins_dim], histogram_bins_dim)
                    var.dims[0].attach_scale(grp[histogram_bins_dim])
                except:
                    var = grp[k+'_histogram_bins']
                var.attrs['bounds'] = k+'_bounds'
                if 'units' in self.variables_meta[k] and self.variables_meta[k]['units'] is not None:
                    var.attrs['units'] = self.variables_meta[k]['units']
                else:
                    var.attrs['units'] = "None"
                # The axis variable holds the bin centres.
                var[:] = (self.variables[k+'.bins'][0:-1] + self.variables[k+'.bins'][1:])/2.

                # Bin bounds: column 0 holds lower edges, column 1 upper edges.
                try:
                    var = grp.create_dataset(k+'_bounds',
                                             (self.input_variables.variables[k].number_bins, 2),
                                             dtype=np.float32,
                                             **compress)
                    var.dims.create_scale(grp[histogram_bins_dim], histogram_bins_dim)
                    var.dims[0].attach_scale(grp[histogram_bins_dim])
                    var.dims.create_scale(ref['nv'], 'nv')
                    var.dims[1].attach_scale(ref['nv'])
                except:
                    var = grp[k+'_bounds']
                var[:, 0] = self.variables[k+'.bins'][0:-1]
                var[:, 1] = self.variables[k+'.bins'][1:]

                ## Add the weighted histogram.
                #
                #  Create dataset, attach dimensions (as scales), set attributes.
                #  If variable already exists: expand in time-dimension.
                try:
                    var = grp[k+'_histogram_eq']
                    var.resize(self.time_index_in_output+1, axis=0)
                except:
                    if self.time_index_in_output > 0:
                        continue
                    var = grp.create_dataset(k+'_histogram_eq', (1, self.input_variables.variables[k].number_bins),
                                             dtype=np.float32,
                                             maxshape=(None,self.input_variables.variables[k].number_bins),
                                             **compress)
                    var.dims.create_scale(ref['time'], 'time')
                    var.dims[0].attach_scale(ref['time'])
                    var.dims.create_scale(grp[histogram_bins_dim], histogram_bins_dim)
                    var.dims[1].attach_scale(grp[histogram_bins_dim])
                    var.attrs['range'] = self.variables_meta[k]['data_range']
                    var.attrs['color_scale'] = self.variables_meta[k]['color_scale']
                    var.attrs['log_range'] = str(self.variables_meta[k]['log_range'])
                    var.attrs['bins'] = k+'_histogram_bins'
                    var.attrs['long_name'] = "normed histogram of {0}, with cos(latitude) as weight".format(k)
                var[time_step, :] = self.variables[k+'.histogram_eq']

                ## For each of the statistical results, create a variable with dimensions `time`.
                #
                #  The long name is given in this dictionary.
                meanings = {'mean':'mean of the observations',
                            'min':'minimum value',
                            'max':'maximum value',
                            'standard_deviation':'sample standard deviation',
                            'q01':"1% of all values are smaller than this value",
                            'q05':"5% of all values are smaller than this value",
                            'q10':"10% of all values are smaller than this value",
                            'q16':"15.9% of all values are smaller than this value (1 standard deviation)",
                            'q25':"25% of all values are smaller than this value",
                            'q75':"75% of all values are smaller than this value",
                            'q84':"84.1% of all values are smaller than this value (1 standard deviation)",
                            'q90':"90% of all values are smaller than this value",
                            'q95':"95% of all values are smaller than this value",
                            'q99':"99% of all values are smaller than this value",
                            'inter_quartile_range':"q75 - q25",
                            'median':"median value",
                            'count':'number of datapoints',
                            'mode': "Mode (most common) value"}
                for atkey in meanings.keys():
                    # Resize-or-create, same pattern as above; count is integer,
                    # everything else is stored as float64.
                    try:
                        var = grp[k+'_'+atkey]
                        var.resize(self.time_index_in_output+1, axis=0)
                    except:
                        if self.time_index_in_output > 0:
                            continue
                        var = grp.create_dataset(k+'_'+atkey, (1,),
                            dtype=(np.float64 if (atkey != 'count') else np.int32),
                            maxshape=(None,), **compress)
                        var.dims.create_scale(ref['time'], 'time')
                        var.dims[0].attach_scale(ref['time'])
                        var.attrs['long_name'] = meanings[atkey]
                    try:
                        # Missing or non-finite statistics are written as fill values.
                        if atkey not in self.variables_meta[k] or not np.isfinite(self.variables_meta[k][atkey]):
                            var[time_step] = fill_value(np.float64 if (atkey != 'count') else np.int32)
                        else:
                            var[time_step] = self.variables_meta[k][atkey]
                    except:
                        self.logger.info("variables: %s", ", ".join(list(self.variables_meta.keys())))
                        self.logger.info("keys: %s", ", ".join(list(self.variables_meta[k].keys())))
                        self.logger.info("value: %f", self.variables_meta[k][atkey])
                        raise CAMAException("Error in HistogramPlot.dump() for variable %s and key %s" % (k, atkey))

                partial_keys = list(self.partial_keys)
                partial_keys.append('count')

                # Per-region statistics: one scalar time series per (statistic, region).
                for suffix, label in zip(('south', 'north', 'land', 'sea'),
                                         ('southern hemisphere', 'northern hemisphere',
                                          'pixels over land', 'pixels over sea')):
                    for kk in partial_keys:
                        try:
                            var = grp[k+'_'+kk+'_'+suffix]
                            var.resize(self.time_index_in_output+1, axis=0)
                        except:
                            if self.time_index_in_output > 0:
                                continue
                            var = grp.create_dataset(k+'_'+kk+'_'+suffix, (1,),
                                dtype=(np.float64 if kk != 'count' else np.int32),
                                maxshape=(None,), **compress)
                            var.dims.create_scale(ref['time'], 'time')
                            var.dims[0].attach_scale(ref['time'])
                            var.attrs['long_name'] = meanings[kk] + ', for ' + label

                        if kk+'_'+suffix not in self.variables_meta[k] or not np.isfinite(self.variables_meta[k][kk+'_'+suffix]):
                            var[time_step] = fill_value(np.float64 if kk != 'count' else np.int32)
                        else:
                            var[time_step] = self.variables_meta[k][kk+'_'+suffix]
528

    
529
    ## Make a plot of a specified variable
    #
    #  @param varname The name of the variable to plot.
    #  @param figure  The matplotlib.figure.Figure instance to plot to.
    #  @param ax      The matplotlib.axes.Axes object to use. Default is None, in that case `ax = matplotlib.pyplot.gca()` shall be used.
    #  @param weighted Keyword parameter, boolean. When `True` the weighted histogram gets plotted.
    #  @param ylog     Keyword parameter, boolean. When `True` a logarithmic y-axis will be used.
    #  @param legend   Keyword parameter, boolean. When `True` a legend is added to the plot.
    #  @param clip     Keyword parameter, boolean. When `True` the horizontal axis is clipped to the range of the histogram.
    #                  When set to `False` the x-range is determined by the mean and standard deviation or the median and inter quartile range.
    #  @param add_xlabel Keyword parameter, boolean. When `False` the x-axis label is suppressed (used for stacked panels).
    #  @param time    Keyword parameter, datetime or list of datetime. Used as plot title for unweighted histograms.
    #
    #  @return `False` if no plot could be created (a degenerate bin range, or a logarithmic y-axis with a maximum of 0), `True` otherwise.
    def plot(self, varname, figure, ax=None, **kwargs):
        import matplotlib
        matplotlib.use('svg')
        matplotlib.rc('font', family='DejaVu Sans')
        import matplotlib.pyplot as plt
        import matplotlib.cm as cm
        from matplotlib.colors import Normalize, LogNorm

        if varname not in self.variables_meta:
            self.logger.debug("Histogram data for '%s' not found.", varname)
            return False
        self.logger.info("Plotting histogram of %s", varname)
        if ax is None:
            ax = plt.gca()

        # Left edges of the histogram bars; the '.bins' array holds bin boundaries.
        left = self.variables[varname+'.bins'][0:-1]
        box = ax.get_position()

        add_xlabel = kwargs.get('add_xlabel', True)

        weighted = bool(kwargs.get('weighted', False))
        log = bool(kwargs.get('ylog', False))
        show_legend = bool(kwargs.get('legend', False))
        clip_x = bool(kwargs.get('clip', False))

        meta = self.variables_meta[varname]

        log_x_range = meta['log_range']

        if weighted:
            height = self.variables[varname+'.histogram_eq']
        else:
            height = self.variables[varname+'.histogram']

        # A logarithmic y-axis cannot display a histogram with no positive counts.
        if log and height.max() <= 0:
            return False

        width = self.variables[varname+'.bins'][1:] - self.variables[varname+'.bins'][0:-1]

        # Degenerate bin range (first boundary equals last) cannot be plotted.
        if self.variables[varname+'.bins'][0] == self.variables[varname+'.bins'][-1]:
            return False

        # Bug fix: draw on the axes that was passed in, not on pyplot's "current"
        # axes (plt.bar & friends silently target gca(), which may differ from ax).
        ax.bar(left, height, width=width, color='b' if weighted else 'g',
               edgecolor='k', linewidth=0.5, align='edge', log=log)

        if 'time' in kwargs and not weighted:
            if isinstance(kwargs['time'], (list, tuple)):
                tt = kwargs['time']
            else:
                tt = [kwargs['time']]
            # NOTE(review): dates are concatenated without a separator when more
            # than one time is given — confirm whether a delimiter was intended.
            timestr = "".join([t.strftime("%Y-%m-%d") for t in tt])
            ax.set_title(timestr)

        if log_x_range:
            try:
                ax.set_xscale('log', basex=10, subsx=[2, 3, 4, 5, 6, 7, 8, 9], nonposx='mask')
            except ValueError:
                self.logger.debug("Tried to use a logarithmic histogram axis, but failed for %s.", varname)
                ax.set_xscale('linear')
        else:
            ax.set_xscale('linear')

        if clip_x:
            ax.set_xlim(meta['data_range'])

        if add_xlabel:
            if meta['units'] == "1":
                ax.set_xlabel(meta['title'])
            else:
                ax.set_xlabel("{0} [{1}]".format(meta['title'], meta['units']))
        ax.set_ylabel("Number of observations" if not weighted else "Observation density")

        # Vertical positions for the mean/median markers (ylow) and the
        # standard-deviation/IQR bars (ymid), placed just above the x-axis.
        ymin, ymax = ax.get_ylim()
        if log:
            decades = math.log10(ymax/ymin)
            y = ymin*10**(decades/5)
            ymid = [ymin*10**(decades/6), ymin*10**(decades/6)]
            ylow = [ymin*10**(decades/9), ymin*10**(decades/9)]
        else:
            y = ymin+(ymax-ymin)/25.
            ymid = [ymin+2*(y-ymin), ymin+2*(y-ymin)]
            ylow = [ymin+1*(y-ymin), ymin+1*(y-ymin)]

        if not weighted:
            tbl = []
            tbl.append(r"mean: ${0}$".format(self.number_pretty_printer_pair(meta['mean'], meta['standard_deviation'])))
            tbl.append("minimum: {min}, maximum: {max}".format(min=self.number_pretty_printer(meta['min']), max=self.number_pretty_printer(meta['max'])))
            fmt = r"percentiles: 1\%: {q[0]}, 5\%: {q[1]}, 10\%: {q[2]}, 15.9\%: {q[3]}, 25\%: {q[4]}, 75\%: {q[5]}, 84.1\%: {q[6]}, 90\%: {q[7]}, 95\%: {q[8]},  99\%: {q[9]}"
            tbl.append(fmt.format(q=[self.number_pretty_printer(meta[q]) for q in ('q01', 'q05', 'q10', 'q16', 'q25', 'q75', 'q84', 'q90', 'q95', 'q99')]))
            tbl.append("median: {median}, interquartile range: {iqr}".format(median=self.number_pretty_printer(meta['median']), iqr=self.number_pretty_printer(meta['inter_quartile_range'])))

            # NOTE(review): only tbl[0] is drawn, and with alpha=0.0 (invisible);
            # this looks like a leftover placeholder — confirm before removing.
            ax.text(0.05, 0.95, tbl[0], alpha=0.0, size='xx-small',
                    verticalalignment='top', horizontalalignment='left')

        # Point markers for the mean and median values.
        for atkey, c, yy in zip(('mean', 'median'), ('mv', 'r^'), (ylow[0], ymid[0])):
            v = meta[atkey]
            ax.plot([v], [yy], c, label=atkey.capitalize())

        # Horizontal bars for mean +/- standard deviation and for the
        # inter-quartile range (drawn from q25 to q75).
        for atkey, c, lim, yy in zip(('standard_deviation', 'inter_quartile_range'), ('md:', 'rd--'), (('mean',), ('q25', 'q75')), (ylow, ymid)):
            v = meta[atkey]
            if len(lim) == 1:
                x = [meta[lim[0]] - v, meta[lim[0]] + v]
            else:
                x = [meta[lim[0]], meta[lim[1]]]
            ax.plot(x, yy, c, label=atkey.replace('_', ' ').capitalize())

        if show_legend:
            # Shrink the axes slightly to make room for the legend strip on top.
            ax.set_position([box.x0, box.y0, box.width, box.height * 0.97], 'both')
            ax.legend(ncol=4, fontsize='x-small', bbox_to_anchor=(0.0, 1.0, 1.0, 0.03),
                      loc=3, mode="expand", borderaxespad=0.)

        return True
655

    
656
    ## Generate a LaTeX table with statistics for inclusion in the report.
    #  @param kwargs No keywords are actually used.
    #  @return String with table.
    #
    #  The table uses a LaTeX boxes called `\parameterbox` and `\percentilebox`
    #  (these boxes are already created in the latex template for the report),
    #  and determines (within LaTeX) which orientation of the table gives the best result.
    #
    def report(self, **kwargs):
        """Return LaTeX source for the statistics tables of this plot type.

        Emits up to six tables: overall parameters, percentile ranges, and —
        when hemispheric/surface splits are present in the metadata — one
        parameter table each for north, south, sea and land subsets.
        """
        # Accumulates lines of LaTeX; joined with newlines on return.
        rval = []

        caption_text = "Parameterlist and basic statistics for the analysis"
        label = 'table:parameters'

        # --- Table 1: per-variable basic statistics, stored in \parameterbox ---
        rval.append(r'\savebox{\parameterbox}{')
        rval.append(r'\begin{tabular}{l|ccccccc}')
        rval.append(r"Variable & mean\,$\pm\sigma$ & Count & Mode & IQR & Median & Minimum & Maximum \\")
        fmt = "{name} [{unit}] & ${mean}$ & ${count}$ & ${mode}$ & ${iqr}$ & ${median}$ & ${min}$ & ${max}$ \\\\"
        for name in self.variable_names:
            meta = self.variables_meta[name]
            rval.append(fmt.format(name=name.replace('_', ' '),
                                   unit=meta['units'],
                                   mean=self.number_pretty_printer_pair(meta['mean'], meta['standard_deviation']),
                                   mode=self.number_pretty_printer(meta['mode']),
                                   iqr=self.number_pretty_printer(meta['inter_quartile_range']),
                                   median=self.number_pretty_printer(meta['median']),
                                   min=self.number_pretty_printer(meta['min']),
                                   max=self.number_pretty_printer(meta['max']),
                                   count=int(meta['count'])))

        rval.append(r'\end{tabular}}')
        # LaTeX snippet that measures the saved box and decides at typesetting
        # time between portrait, landscape, or scaled placement. The four %s
        # slots take (caption, label) twice; %% yields a literal % after the
        # Python %-formatting pass.
        tabletemplate = r"""

\setlength{\naturaltablewidth}{\widthof{\usebox{\parameterbox}}}
\setlength{\naturaltableheight}{\heightof{\usebox{\parameterbox}}}
\addtolength{\naturaltableheight}{\depthof{\usebox{\parameterbox}}}
\setlength{\naturaltextratio}{1pt*\ratio{0.92\textheight}{\textwidth}}
\setlength{\naturaltableratio}{1pt*\ratio{\naturaltableheight}{\naturaltablewidth}}
\setlength{\realtextheight}{\textheight}
\setlength{\realtextwidth}{\textwidth}
%%
\ifthenelse{\lengthtest{0.92\textheight>\naturaltableheight}%%
            \AND \lengthtest{\textwidth>\naturaltablewidth}}%%
{%%(then)
  \begin{table}[htbp]%%
  \caption{%s}\label{%s}%%
  \centering\usebox{\parameterbox}%%
  \end{table}%%
}%%
{%%
  \begin{landscape}%%
  \begin{table}%%
    \caption{%s}\label{%s}%%
    \ifthenelse{\lengthtest{0.92\realtextheight>\naturaltableheight}%%
                \AND \lengthtest{\realtextwidth>\naturaltablewidth}}%%
      {\centering\usebox{\parameterbox}}%%
      {%%
        \ifthenelse{\lengthtest{\naturaltableratio<\naturaltextratio}}%%
          {\resizebox*{\realtextheight}{!}{\usebox{\parameterbox}}}%%
          {\resizebox*{!}{0.92\realtextwidth}{\usebox{\parameterbox}}}%%
      }
  \end{table}%%
  \end{landscape}%%
}

"""
        rval.append(tabletemplate % (caption_text, label, caption_text, label))

        caption_text = "Percentile ranges"
        label = 'table:percentiles'

        # --- Table 2: percentile values, stored in \percentilebox ---
        rval.append(r'\savebox{\percentilebox}{')
        rval.append(r'\begin{tabular}{l|ccccccccccc}')
        rval.append(r"Variable & 1\,\% & 5\,\% & 10\,\% & 15.9\,\% & 25\,\% & 75\,\% & 84.1\,\% & 90\,\% & 95\,\% & 99\,\% \\")
        fmt = "{name} [{unit}]  & ${q[0]}$ & ${q[1]}$ & ${q[2]}$ & ${q[3]}$ & ${q[4]}$ & ${q[5]}$ & ${q[6]}$ & ${q[7]}$ & ${q[8]}$ & ${q[9]}$ \\\\"
        for name in self.variable_names:
            meta = self.variables_meta[name]
            rval.append(fmt.format(unit=meta['units'],
                                   name=name.replace('_', ' '),
                                   q=[self.number_pretty_printer(meta[q]) for q in ('q01', 'q05', 'q10', 'q16', 'q25', 'q75', 'q84', 'q90', 'q95', 'q99')]))
        rval.append(r'''\end{tabular}}''')

        # Re-use the same placement logic, but targeting \percentilebox.
        rval.append(tabletemplate.replace('parameterbox', 'percentilebox') % (caption_text, label, caption_text, label))

        partial_keys = list(self.partial_keys)
        partial_keys.append('count')

        # If the per-subset statistics (e.g. 'mean_south') were never computed,
        # stop after the two global tables.
        if partial_keys[0]+'_south' not in self.variables_meta[self.variable_names[0]]:
            return "\n".join(rval)

        # --- Tables 3-6: statistics per hemisphere and per surface type ---
        # 'label' is first used as prose in the caption, then rebound to the
        # LaTeX cross-reference label for the same table.
        for suffix, label in zip(('north', 'south', 'sea', 'land'), ('in the northern hemisphere', 'in the southern hemisphere', 'over water', 'over land')):
            caption_text = "Parameterlist and basic statistics for the analysis for observations {0}".format(label)
            label = 'table:parameters_{0}'.format(suffix)

            rval.append(r'\savebox{{\parameterbox{0}}}{{'.format(suffix))
            rval.append(r'\begin{tabular}{l|cccccccc}')
            rval.append(r"Variable & mean\,$\pm\sigma$ & Count & IQR & Median & Minimum & Maximum & 25\,\% percentile & 75\,\% percentile\\")
            fmt = "{name} [{unit}] & ${mean}$ & ${count}$ & ${iqr}$ & ${median}$ & ${min}$ & ${max}$ & ${q25}$ & ${q75}$ \\\\"
            for name in self.variable_names:
                meta = self.variables_meta[name]
                rval.append(fmt.format(name=name.replace('_', ' '),
                                       unit=meta['units'],
                                       mean=self.number_pretty_printer_pair(meta['mean_'+suffix], meta['standard_deviation_'+suffix]),
                                       iqr=self.number_pretty_printer(meta['inter_quartile_range_'+suffix]),
                                       median=self.number_pretty_printer(meta['median_'+suffix]),
                                       min=self.number_pretty_printer(meta['min_'+suffix]),
                                       max=self.number_pretty_printer(meta['max_'+suffix]),
                                       count=int(meta['count_'+suffix]),
                                       q25=self.number_pretty_printer(meta['q25_'+suffix]),
                                       q75=self.number_pretty_printer(meta['q75_'+suffix])))

            rval.append(r'\end{tabular}}')
            rval.append(tabletemplate.replace('parameterbox', 'parameterbox'+suffix) % (caption_text, label, caption_text, label))

        return "\n".join(rval)
771

    
772
    ## Add a table to the report using matplotlib alone. The preferred output uses LaTeX.
    #
    #  @param ax The axis to use, defaults to `ax = matplotlib.pyplot.gca()`
    #  @param figure The figure instance to use, defaults to `figure = matplotlib.pyplot.gcf()`.
    #  @param variant Which of the tables should be printed. keyword argument with values
    #  `base` or `percentile`. Indicates which table is written.
    def table(self, ax=None, figure=None, **kwargs):
        """Draw a statistics table onto a matplotlib axes.

        The 'base' variant shows per-variable summary statistics; any other
        variant shows the percentile values. Returns the table artist in a list.
        """
        import matplotlib.pyplot as plt
        import matplotlib.cm as cm
        from matplotlib.colors import Normalize, LogNorm

        variant = kwargs.get('variant', 'base')

        self.logger.info("Adding table for %s parameters", variant)

        if ax is None:
            ax = plt.gca()
        if figure is None:
            figure = plt.gcf()

        # Select the column headers, the metadata keys feeding each column,
        # and which of those keys are rendered as integers.
        if variant == 'base':
            colLabels = ['Name', 'Mean', 'StdDev', 'Count', 'Mode', 'IQR', 'Median', 'Min', 'Max']
            stat_keys = ('mean', 'standard_deviation', 'count', 'mode',
                         'inter_quartile_range', 'median', 'min', 'max')
            integer_keys = ('count',)
        else:
            colLabels = ['Name', '1%', '5%', '10%', '15.9%', '25%', '75%', '84.1%', '90%', '95%', '99%']
            stat_keys = ('q01', 'q05', 'q10', 'q16', 'q25', 'q75', 'q84', 'q90', 'q95', 'q99')
            integer_keys = ()

        contents = []
        for name in self.variable_names:
            # Variables without collected statistics are silently omitted.
            if name not in self.variables_meta:
                continue
            meta = self.variables_meta[name]
            row = [name]
            for key in stat_keys:
                try:
                    if key in integer_keys:
                        cell = "{0:d}".format(int(meta[key]))
                    else:
                        cell = "{0:.4g}".format(meta[key])
                except TypeError:
                    # Missing/None statistic: show a placeholder instead.
                    cell = '--'
                row.append(cell)
            contents.append(row)

        # Size the figure to the table: one header row plus one row per variable.
        n_rows = len(self.variable_names) + 1
        n_cols = len(colLabels)
        cell_height, cell_width = 0.3, 1.
        height_pad, width_pad = 0, 0
        figure.set_size_inches(n_cols*cell_width + width_pad, n_rows*cell_height + height_pad)

        # Hide the axes frame; only the table itself should be visible.
        ax.axis('off')
        the_table = ax.table(cellText=contents, colLabels=colLabels, loc='center')
        return [the_table]