Project

General

Profile

Statistics
| Branch: | Tag: | Revision:

pycama / src / pycama / utilities.py @ 750:ccb55f4d7a99

History | View | Annotate | Download (21.3 KB)

1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3

    
4
# Copyright 2016-2017 Maarten Sneep, KNMI
5
#
6
# Redistribution and use in source and binary forms, with or without
7
# modification, are permitted provided that the following conditions are met:
8
#
9
# 1. Redistributions of source code must retain the above copyright notice,
10
#    this list of conditions and the following disclaimer.
11
#
12
# 2. Redistributions in binary form must reproduce the above copyright notice,
13
#    this list of conditions and the following disclaimer in the documentation
14
#    and/or other materials provided with the distribution.
15
#
16
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
20
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
23
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26

    
27
## \file utilities.py
28
#  Support utilities for PyCAMA, including the logging system.
29
# @author Maarten Sneep
30

    
31
import sys
32
import os
33
import datetime, time
34
import warnings
35
import logging
36
import logging.config
37
import socket
38
import collections
39
import getpass
40
import warnings
41
warnings.filterwarnings("ignore", category=FutureWarning)
42

    
43
import h5py
44
import netCDF4
45
import numpy as np
46

    
47

    
48
## Global counter to keep track of the number of warnings in a run.
49
warning_counter = 0
50

    
51
## Global counter to keep track of the number of errors in a run.
52
error_counter = 0
53

    
54
## Global counter to determing when the number of warnings is so large that this is actually an error.
55
warning_counter_limit = None
56

    
57
## Extra logging level for progress messages.
58
#
59
#  Injected at the same level as expected by the thin layer interface.
60
PROGRESS_LEVEL_NUM = 35
61

    
62
## Create a new logging level with the name "PROGRESS"
63
#
64
# This code is executed at load time.
65
# \code
66
# logging.addLevelName(PROGRESS_LEVEL_NUM, "PROGRESS")
67
# \endcode
68
#
69
# The progress_method() is the actual logger, similar to the \c warning and \c debug methods of the logger.
70
# The function is injected as the method \c progress into the <tt>logging.Logger</tt> class.
71
#
72
# \code
73
# logging.Logger.progress = progress_method
74
# \endcode
75
#
76
# we're not calling the function here, just inserting the method into the class object.
77
# Gotta love python for its monkey-patching.
78
logging.addLevelName(PROGRESS_LEVEL_NUM, "PROGRESS")
79

    
80
def progress_method(self, message, *args, **kws):
81
    # Yes, logger takes its '*args' as 'args'.
82
    if self.isEnabledFor(PROGRESS_LEVEL_NUM):
83
        self._log(PROGRESS_LEVEL_NUM, message, args, **kws)
84

    
85
logging.Logger.progress = progress_method
86

    
87
## Generate and capture errors specific to PyCAMA
88
class CAMAException(Exception):
89
    ##
90
    # \param message The reason of the error.
91
    # \param return_value The exit value of PyCAMA, defaults to 255 (hard error).
92
    #
93
    # Raising a CAMAException is considered a fatal error.
94
    def __init__(self, message, return_value=255):
95
        self.message = message
96
        self.return_value = return_value
97

    
98
## Generate and capture errors while reading the job order file.
99
#
100
# This is handled separately, as the logging system may not have been set up.
101
# The configuration for the logging system is coming from the job order file,
102
# and if there are errors in the job order we don't have a logging system.
103
# In this case the reporting must be handled slightly differently.
104
#
105
# This is always a fatal error.
106
class JobOrderError(Exception):
107
    pass
108

    
109
## Order preserving unique values filter
110
#
111
# Loop over the sequence \c seq and return a list with the unique items
112
# while preserving the order in the original list.
113
#
114
# \param[in] seq A sequence.
115
# \param[in] idfun A callable which returns a marker for an item in the list.
116
# This can be used to transform a string, for instance by making it lower
117
# case to ignore case in this function. The default is to apply no transformation.
118
# \return The order preserved uniq list.
119
#
120
def uniq(seq, idfun=None):
121
    if idfun is None:
122
        def idfun(x): return x
123
    seen = {}
124
    result = []
125
    for item in seq:
126
        marker = idfun(item)
127
        if marker in seen: continue
128
        seen[marker] = 1
129
        result.append(item)
130
    return result
131

    
132
## Set the maximum number of warnings before an error is generated.
133
# \param[in] lim The new maximum number of warnings before an error is generated. Should be an integer.
134
def set_warnings_limit(lim):
135
    global warning_counter_limit
136
    warning_counter_limit = lim
137

    
138
## Is this object a sequence?
139
#
140
# Try a few operations to check that the argument is indeed a sequence.
141
# \param[in] obj The object to test.
142
# \return \c True if the object is enough of a sequence to operate on, \c False otherwise.
143
def is_sequence(obj):
144
    try:
145
        len(obj)
146
        obj[0:0]
147
        return True
148
    except TypeError:
149
        return False
150

    
151
## A formatter class specifically for the production logger.
152
#
153
#  The main aim for this subclass of `logging.Formatter` is to format the timestamp
154
#  accoding to the thin layer interface, i.e. including microseconds. This is not
155
#  supported out of the box by the `logging` framework of Python 3.
156
#
157
#  The base class is documented
158
#  [on the Python 3 documentation site](https://docs.python.org/3/library/logging.html#formatter-objects).
159
#
160
class MyFormatter(logging.Formatter):
161
    ## Convert the creation date for easier formatting
162
    #
163
    #  @param[in] r Creation timestamp in unix epoch-seconds.
164
    #  @return  `datetime` object in the `utc` timezone.
165
    def converter(self, r):
166
        return datetime.datetime.fromtimestamp(r, tz=datetime.timezone.utc)
167

    
168
    ## Format the time into an ISO 8601 string
169
    #
170
    # Override the [Python standard library `logging.Formatter.formatTime` method to include microseconds
171
    # @param[in] record a `logging.LogRecord` object. Only the `record.created` attribute is used.
172
    # @param[in] datefmt A specific format to use for the date formatting
173
    # @return a formatted string.
174
    def formatTime(self, record, datefmt=None):
175
        ct = self.converter(record.created)
176
        if datefmt:
177
            s = ct.strftime(datefmt)
178
        else:
179
            s = ct.strftime("%Y-%m-%dT%H:%M:%S.%f")
180
        return s
181

    
182
## A filter object for the logging facility
183
#
184
#  [See the python documentation for a full specification](https://docs.python.org/3/library/logging.html#filter-objects).
185
class ErrorLogFilter(object):
186
    ## Initialization
187
    #
188
    #  set up the filter
189
    #  @param[in] outlevel Level for standard output logging
190
    #  @param[in] errlevel Level for standard error logging
191
    def __init__(self, outlevel, errlevel):
192
        self.outlevel = outlevel if outlevel >= logging.DEBUG else logging.DEBUG
193
        self.errlevel = errlevel if errlevel <= logging.ERROR else logging.ERROR
194
    ## Is the specified record to be logged?
195
    #
196
    #  @param[in] record The record to operate on
197
    #  @returns 0 for no, non-zero for yes.
198
    #
199
    #  As all logging records pass through this method, we also count the number of errors and warnings here.
200
    def filter(self, record):
201
        global error_counter
202
        global warning_counter
203
        global warning_counter_limit
204
        if record.levelno == logging.ERROR:
205
            error_counter += 1
206
        elif record.levelno == logging.WARNING:
207
            warning_counter += 1
208

    
209
        return self.outlevel <= record.levelno < self.errlevel
210

    
211
## Check the number of warnings, and raise an error if required
212
#
213
#  Uses the gloabl variables `warning_counter` and `warning_counter_limit` to check and raise an error.
214
def check_warning_counter():
215
    global warning_counter
216
    global warning_counter_limit
217
    if warning_counter_limit is not None and warning_counter_limit > 0 and warning_counter > warning_counter_limit:
218
        raise CAMAException("The maximum number of warnings has been reached.")
219

    
220
## The current number of errors
221
#
222
#  Access function to read the number of errors in the (module) global variable `error_counter`
223
def total_errors():
224
    global error_counter
225
    return error_counter
226

    
227
## The current number of warnings
228
#
229
#  Access function to read the number of warnings in the (module) global variable `warning_counter`
230
def total_warnings():
231
    global warning_counter
232
    return warning_counter
233

    
234
## Compare version numbers of PyCAMA and as presented in the job order file
235
#
236
#  Compare version numbers of the form `Major.minor.patch`, checking numerical values rather than comparing strings.
237
#
238
#  @param[in] version_joborder The number as found in the job order file
239
#  @param[in] level the level to which to compare, using _1.2.3_ for the levels. Defaults to 2.
240
#  @returns one of the following values:
241
#  * 0 if the version numbers are equal to the level specified.
242
#  * 1 if the job order version is _newer_ than the pycama version.
243
#  * -1 if the job order version is _older_ than the pycama version.
244
#  * -2 if the parts of the job order version could not be converted to integers.
245
#  * -3 if the parts of the pycama version could not be converted to integers (this is a packaging error).
246
#
247
#  Uses the `__version__` variable from the main pycama package.
248
def compare_versions(version_joborder, level=2):
249
    from . import __version__
250

    
251
    try:
252
        jo_versions = [int(k) for k in version_joborder.split('.')]
253
    except:
254
        return -2
255

    
256
    try:
257
        my_versions = [int(k) for k in __version__.split('.')]
258
    except:
259
        return -3
260

    
261
    for i in range(min(level, len(my_versions), len(jo_versions))):
262
        if jo_versions[i] < my_versions[i]:
263
            return -1
264
        elif jo_versions[i] > my_versions[i]:
265
            return 1
266

    
267
    return 0
268

    
269
## Prepare logging for PyCAMA
270
#
271
#  The logging system takes care of formatting and routing of all messages generated during processing.
272
#
273
#  There are five levels of log messages:
274
#  1. **Debug**: low level messages intended to figure out what is going wrong.
275
#     Not intended for operational use.
276
#  2. **Info**: Display what PyCAMA is currently doing. This is the lowest level intended for operational use.
277
#  3. **Warning**: Show what is going on that is unexpected. PyCAMA can still continue.
278
#  4. **Progress**: Show how far along we are. This is done action by by action,
279
#     as an overall progress indicator is too configuration dependent.
280
#  5. **Error**: Show if a fatal error has occurred.
281
#
282
#  Setting the level to 'info' will also enable all messages that come later in the list.
283
#
284
#  @param[in] production_logger Boolean to indicate if the logging system should be set
285
#         up according to the thin layer interface (i.e. 'production') or that a simplified
286
#         logging system can be used. Defaults to \c True, using a production logger.
287
#  @param[in] loglevel The level of logging output that is sent to stdout. Valid values are taken from the list above.
288
#  @param[in] errlevel The level of logging output that is sent to stderr.
289
#         This level should be higher (i.e. more towards 'error') than loglevel.
290
#  @return A `logging.Logger` object.
291
#
292
def setup_logging(production_logger=True, loglevel="warning", errlevel="error"):
293
    from . import __version__
294

    
295
    warnings.simplefilter("ignore")
296

    
297
    try:
298
        num_levels = [int(getattr(logging, loglevel.upper(), None)), int(getattr(logging, errlevel.upper(), None))]
299
    except TypeError:
300
        raise ValueError("Invalid log level(s) in configuration: {0}, {1}.\nValid levels are 'DEBUG', 'INFO', 'WARNING', 'PROGRESS' or 'ERROR'".format(loglevel.upper(), errlevel.upper()))
301

    
302
    for i in range(2):
303
        if num_levels[i] > logging.ERROR:
304
            num_levels[i] = logging.ERROR
305

    
306
    if num_levels[0] == num_levels[1]:
307
        if num_levels[0] == logging.ERROR:
308
            num_levels = (logging.WARNING, logging.ERROR)
309
        else:
310
            num_levels = (num_levels[0], logging.ERROR)
311

    
312
    # create logger
313
    logger = logging.getLogger('PyCAMA')
314

    
315
    if len(logger.handlers) == 0:
316
        # create formatter
317
        # 2016-06-14T09:44:15.482164 bhltrdl2 TROPNLL2DP 0.10.1 [0000006984]: [I] Starting TROPNLL2DP version 0.10.1
318
        # 2016-06-14T12:13:52.396889 bhw465.knmi.nl PyCAMA 0.1.0 [0000020929]: [I] Opening file 'G2A_TEST_L2__AER_AI_20100319T112929_20100319T131047_17713_02_001000_20160502T150326.nc'
319
        if production_logger:
320
            fmt='%(asctime)s {machine} %(name)s {version} [%(process)010d]: [%(levelname)-.1s] %(message)s'
321
            formatter = MyFormatter(fmt=fmt.format(version=__version__, machine=socket.gethostname()),
322
                                    datefmt='%Y-%m-%dT%H:%M:%S.%f')
323
        else:
324
            formatter = logging.Formatter(fmt='[%(levelname)8s] %(filename)s [%(lineno)4d]: %(message)s',
325
                                          datefmt='%Y-%m-%dT%H:%M:%S')
326

    
327
        logger.setLevel(min(num_levels))
328
        for output_channel in (sys.stdout, sys.stderr):
329
            # create console handler and set level to
330
            ch = logging.StreamHandler(stream=output_channel)
331

    
332
            if output_channel is sys.stdout:
333
                ch.setLevel(logging.DEBUG)
334
                # add filter to ignore error messages
335
                ch.addFilter(ErrorLogFilter(num_levels[0], num_levels[1]))
336
            else:
337
                ch.setLevel(num_levels[1])
338

    
339
            # add formatter to ch
340
            ch.setFormatter(formatter)
341

    
342
            # add ch to logger
343
            logger.addHandler(ch)
344

    
345
    return logger
346

    
347
## Count the number of time steps that are currently present in a PyCAMA output file.
348
#
349
# @param[in] fname The hdf5/netCDF4 file to check.
350
# @returns the number of steps in the file, or -1 in case of an error.
351
def count_time_steps(fname):
352
    if not h5py.is_hdf5(fname):
353
        return -1
354

    
355
    with h5py.File(fname, 'r') as ref:
356
        try:
357
            return len(ref['time'])
358
        except AttributeError:
359
            return -1
360

    
361
## Extract time information for a given time-index in a PyCAMA output file.
362
#
363
#  @param[in] fname The hdf5/netCDF4 file to extract from.
364
#  @param[in] index The index to check
365
#  @returns A three-element tuple with the time of the interval, the start and the end, all datetime objects.
366
#  @throws CAMAException in case the file is not an netCDF4/HDF5 file or if the index is out of range.
367
def extract_time(fname, index):
368
    if not h5py.is_hdf5(fname):
369
        raise CAMAException("The file '{0}' is not a netCDF4/HDF5 file".format(fname))
370

    
371
    with h5py.File(fname, 'r') as ref:
372
        tvar = ref['time']
373
        tbvar = ref['time_bounds']
374
        try:
375
            t, tstart, tend = netCDF4.num2date([tvar[index], tbvar[index, 0], tbvar[index, 1]], tvar.attrs['units'], 
376
                only_use_cftime_datetimes=False, only_use_python_datetimes=True)
377
        except IndexError:
378
            raise CAMAException("Index {0} is out of range".format(index))
379

    
380
    return t, tstart, tend
381

    
382
## Extract input granule count information for a given time-index in a PyCAMA output file.
383
#
384
#  @param[in] fname The hdf5/netCDF4 file to extract from.
385
#  @param[in] index The index to check
386
#  @returns an integer with the number of input granules.
387
#  @throws CAMAException in case the file is not an netCDF4/HDF5 file or if the index is out of range.
388
def extract_input_count(fname, index):
389
    if not h5py.is_hdf5(fname):
390
        raise CAMAException("The file '{0}' is not a netCDF4/HDF5 file".format(fname))
391

    
392
    with h5py.File(fname, 'r') as ref:
393
        cvar = ref['input_granule_count']
394
        try:
395
            if index is None:
396
                count = np.sum(cvar[:])
397
            else:
398
                count = cvar[index]
399
        except IndexError:
400
            raise CAMAException("Index {0} is out of range".format(index))
401

    
402
    return count
403

    
404
## Extract number of time slices in a PyCAMA output file.
405
#
406
#  @param[in] fname The hdf5/netCDF4 file to extract from.
407
#  @returns an integer with the number of time steps.
408
#  @throws CAMAException in case the file is not an netCDF4/HDF5 file.
409
def extract_time_step_count(fname):
410
    if not h5py.is_hdf5(fname):
411
        raise CAMAException("The file '{0}' is not a netCDF4/HDF5 file".format(fname))
412

    
413
    with h5py.File(fname, 'r') as ref:
414
        cvar = ref['input_granule_count']
415
        try:
416
            count = len(cvar)
417
        except:
418
            raise CAMAException("Unknown error determining number of time steps.")
419

    
420
    return count
421

    
422
## Extract number of granules in a PyCAMA output file.
423
#
424
#  @param[in] fname The hdf5/netCDF4 file to extract from.
425
#  @param[in] slice Time slice to return. When None the total of all slices in the file is returned.
426
#  @returns an integer with the number of time steps.
427
#  @throws CAMAException in case the file is not an netCDF4/HDF5 file.
428
def extract_granule_count(fname, slice=None):
429
    if not h5py.is_hdf5(fname):
430
        raise CAMAException("The file '{0}' is not a netCDF4/HDF5 file".format(fname))
431

    
432
    with h5py.File(fname, 'r') as ref:
433
        cvar = ref['input_granule_count']
434
        try:
435
            if slice is not None:
436
                count = cvar[slice]
437
            else:
438
                count = np.sum(cvar[:])
439
        except:
440
            raise CAMAException("Unknown error determining number of time steps.")
441

    
442
    return count
443

    
444
## Get a list of variables to be plotted
445
#
446
#
447
#  @param[in] fname The hdf5/netCDF4 file to extract from.
448
#  @returns A list of variables in the file, excluding the base variables that are never plotted anyway.
449
#  @throws CAMAException in case the file is not an netCDF4/HDF5 file.
450
def get_variable_list(fname, exclude=None):
451
    basenames = ("latitude", "longitude", 'processing_quality_flags', 'geolocation_flags',
452
                 'surface_classification', "solar_zenith_angle", "viewing_zenith_angle",
453
                 "solar_azimuth_angle", "viewing_azimuth_angle")
454

    
455
    if not h5py.is_hdf5(fname):
456
        raise CAMAException("The file '{0}' is not a netCDF4/HDF5 file".format(fname))
457

    
458
    with h5py.File(fname, 'r') as ref:
459
        try:
460
            var = ref['variable_names']
461
        except KeyError:
462
            raise CAMAException("The variable 'variable_names' wasn't found in '{0}'".format(fname))
463
        vnames = [n.decode('utf-8') for n in var[:] if n.decode('utf-8') not in basenames]
464

    
465
    if exclude is not None:
466
        vnames = [v for v in vnames if v not in exclude]
467
    return vnames
468

    
469
## Get the version of PyCAMA with which a netCDF file was written
470
#
471
# Used to check how to read the input file. Introduced in version 0.3.0 of PyCAMA
472
def read_version_number_from_file(fname):
473
    if not h5py.is_hdf5(fname):
474
        raise CAMAException("The file '{0}' is not a netCDF4/HDF5 file".format(fname))
475

    
476
    logger = logging.getLogger('PyCAMA')
477
    with h5py.File(fname, 'r') as ref:
478
        try:
479
            attr = ref.attrs['PyCAMA_version']
480
        except KeyError:
481
            logger.warning("No PyCAMA version was found in '{0}'".format(os.path.basename(fname)))
482
            attr = "0.0.0"
483
    return attr
484

    
485
## simplify a list of indices for reporting purposes.
486
#
487
# Reduce a list of indices to a sequence of continuous ranges, making long ranges more human readable.
488
# @param r The list to simplify.
489
# @returns A string for printing.
490
#
491
# Example:
492
# \code
493
# r = [0, 1, 2, 3, 4, 5, 7, 8, 9, 11, 13, 14, 16]
494
# print(row_list_simplification(r))
495
# \endcode
496
# prints `0-5, 7-9, 11, 13-14, 16`
497
def row_list_simplification(r):
498
    rows = sorted(r)
499
    groups = []
500
    i = 1
501
    while i < len(rows):
502
        group = [rows[i-1]]
503
        while i < len(rows) and rows[i] <= rows[i-1]+1:
504
            i+=1
505
        group.append(rows[i-1])
506
        groups.append(group)
507
        i+=1
508
    if groups[-1][-1] != rows[-1]:
509
        groups.append([rows[-1], rows[-1]])
510

    
511
    s = []
512
    for group in groups:
513
        if group[0] != group[1]:
514
            s.append("{0[0]:d}-{0[1]:d}".format(group))
515
        else:
516
            s.append("{0[0]:d}".format(group))
517
    return ", ".join(s)
518

    
519
## Clean up a name so it becomes usable as a netCDF group name
520
#
521
# @param name A string with the input name
522
# @returns The clean name for use in the output netCDF4 file.
523
def name_cleanup(name):
524
    result = []
525
    name_parts = [n.strip().replace('.nc', '') for n in name.split(',')]
526
    for name_part in name_parts:
527
        plist = name_part.split('_')
528
        try:
529
            result.append("_".join([plist[2], plist[3], plist[4], plist[5], plist[7], plist[10]]))
530
        except IndexError:
531
            result.append(name_part)
532
    return '_'.join(result)
533

    
534
## Return netCDF4 default fill values for a given numpy dtype.
535
#
536
# @param dtype A numpy dtype compatible type description.
537
# @returns The fill value of the appropriate type
538
# @throws RuntimeError for an unrecognized type.
539
def fill_value(dtype):
540
    if str(np.dtype(dtype)) in ('float32', 'float64', ):
541
        return float.fromhex('0x1.ep+122')
542
    elif str(np.dtype(dtype)) == 'int32':
543
        return -2147483647
544
    elif str(np.dtype(dtype)) == 'uint32':
545
        return 4294967295
546
    elif str(np.dtype(dtype)) == 'int64':
547
        return -9223372036854775806
548
    elif str(np.dtype(dtype)) == 'uint64':
549
        return 18446744073709551614
550
    elif str(np.dtype(dtype)) == 'int16':
551
        return -32767
552
    elif str(np.dtype(dtype)) == 'uint16':
553
        return 65535
554
    elif str(np.dtype(dtype)) == 'int8':
555
        return -127
556
    elif str(np.dtype(dtype)) == 'uint8':
557
        return 255
558
    else:
559
        raise RuntimeError("No fill value for data type {0}".format(str(np.dtype(dtype))))