'''
Utilities for the data analysis that produces data for charts and reports.
'''

import calendar
import math
import time
from datetime import datetime
from decimal import Decimal

import numpy as np
import pandas as pd
import pytz
from dateutil import parser
from django.conf import settings
from pandas.tseries.frequencies import to_offset


# Default timezone used whenever a datetime value needs to be created.  It is
# read from the Django TIME_ZONE setting, falling back to 'US/Alaska' when that
# setting is absent.
default_tz = pytz.timezone(getattr(settings, 'TIME_ZONE', 'US/Alaska'))

def ts_to_datetime(unix_ts=None, tz=default_tz):
    '''
    Converts a UNIX timestamp (seconds) to a Python datetime object in a
    particular timezone.  The timezone info is stripped from the returned
    datetime to make it naive, which works better with the Pandas library.

    'unix_ts' is the timestamp to convert; when omitted (or None) the current
    time is used.  'tz' is the timezone the returned wall-clock time is
    expressed in.
    '''
    # The current time must be looked up on every call.  A default of
    # 'time.time()' in the signature is evaluated only once, at import time,
    # which would freeze "now" at the moment the module was loaded.
    if unix_ts is None:
        unix_ts = time.time()
    return datetime.fromtimestamp(unix_ts, tz).replace(tzinfo=None)

def datestr_to_ts(datestr, tz=default_tz):
    '''
    Parses the date/time string 'datestr', treats the result as wall-clock
    time in the timezone 'tz', and returns the corresponding Unix timestamp
    in whole seconds.
    '''
    naive_dt = parser.parse(datestr)
    # Attach the timezone, then express the moment as a UTC time tuple so that
    # calendar.timegm() can turn it into seconds since the epoch.
    utc_fields = tz.localize(naive_dt).utctimetuple()
    return calendar.timegm(utc_fields)

def round4(val):
    '''
    Rounds a number to 4 significant digits, unless it is an integer.

    Integer-valued inputs are returned unchanged (same object, same type).
    Other numbers are returned as a Python float rounded to 4 significant
    digits.  None is returned for anything that cannot be converted to a
    number, including NaN and infinity.
    '''
    try:
        if val != int(val):
            return float('%.4g' % val)
        return val
    except (TypeError, ValueError, OverflowError):
        # TypeError: None and other non-numeric objects; ValueError: NaN and
        # unparsable strings; OverflowError: +/- infinity.  Only these
        # conversion failures are swallowed, so KeyboardInterrupt and
        # SystemExit still propagate.
        return None
def decimals_needed(vals, sig_figures):
    '''Returns the number of digits past the decimal needed to ensure
    that 'sig_figures' significant figures are displayed for the largest
    value (in absolute value terms) in the array of values 'vals'.

    Returns 0 when 'vals' is empty or when every value is zero.
    '''
    if len(vals) == 0:
        # No values in the array, just return 0.
        return 0

    max_val = max(abs(min(vals)), abs(max(vals)))
    if max_val == 0:
        return 0

    # floor(log10(x)) is the position of the leading digit.  floor() is needed
    # rather than int(): int() truncates toward zero, which for values below 1
    # (negative logarithm) reports a leading digit one place too high and so
    # yields one decimal too few.
    leading_digit_pos = int(math.floor(math.log10(max_val)))
    return max(0, sig_figures - leading_digit_pos - 1)

def formatCurVal(val):
    """
    Helper function for formatting current values to 4 significant digits, but
    avoiding the use of scientific notation for display.  Also, integers are
    shown at full precision.

    Returns '' for None.  Integer-valued numbers, and any value whose
    magnitude is 1000 or more, are shown with thousands separators.  NaN and
    infinite values are shown as 'nan' / 'inf' / '-inf'.
    """
    if val is None:
        return ''

    try:
        whole = int(val)
    except (ValueError, OverflowError):
        # int() rejects NaN (ValueError) and infinity (OverflowError).  Show
        # those as text; anything else that failed is a genuine bad input, so
        # let the original error propagate.
        if math.isfinite(float(val)):
            raise
        return '%.4g' % val

    if val == whole:
        return '{:,}'.format(whole)

    rounded = '%.4g' % val
    if abs(val) >= 1000.0:
        return '{:,}'.format(int(float(rounded)))
    if 'e' in rounded:
        # '%g' switches to exponent form for magnitudes below 1e-4.  Decimal
        # re-renders exactly the same digits in plain fixed-point notation.
        return format(Decimal(rounded), 'f')
    return rounded
def histogram_from_series(pandas_series):
    '''
    Returns a list of histogram bins ( (bin center point, percent of readings) )
    for the Pandas Time Series 'pandas_series'.  The values of the series
    (index not involved) are used to create the histogram.  The histogram has
    20 bins.
    '''

    cts, bins = np.histogram(pandas_series.values, 20)   # 20 bin histogram
    avg_bins = (bins[:-1] + bins[1:]) / 2.0       # calculate midpoint of bins

    # round these values to 4 significant figures for better display in charts
    avg_bins = [round4(x) for x in avg_bins]

    # Convert count bins into % of total reading count.
    # weirdly, some integer are "not JSON serializable".  Had to
    # convert counts to float to avoid the error.
    reading_ct = float(sum(cts))
    cts = cts.astype('float64') / reading_ct * 100.0
    cts = [round4(x) for x in cts]

    return list(zip(avg_bins, cts))
def resample_timeseries(pandas_dataframe, averaging_hours, use_rolling_averaging=False, drop_na=True):
    '''
    Returns a new pandas dataframe that is resampled at the specified "averaging_hours"
    interval.  If the 'averaging_hours' parameter is fractional, the averaging time
    period is truncated to the lesser minute.

    If 'use_rolling_averaging' is False, the data is averaged into fixed bins and
    each bin's timestamp is shifted from the left edge of the bin by the 'loffset'
    listed for that interval.  If it is True, the data is first resampled (median)
    to the 5th-percentile spacing of its own index, then smoothed with a centered
    rolling mean spanning 'averaging_hours', and finally averaged down to roughly
    1000 rows if more than 1000 rows remain.

    If 'drop_na' is True, rows with any NaN values are dropped.

    For some reason the pandas resampling sometimes fails if the datetime index is timezone aware...
    '''

    # NOTE(review): the weekly label offset is 108H (4.5 days), not half a week
    # (84H).  Kept as-is; confirm this is intended.
    interval_lookup = {
        0.5: {'rule':'30min', 'loffset': '15min'},
        1: {'rule': '1H', 'loffset': '30min'},
        2: {'rule': '2H', 'loffset': '1H'},
        4: {'rule': '4H', 'loffset': '2H'},
        8: {'rule': '8H', 'loffset': '4H'},
        24: {'rule': '1D', 'loffset': '12H'},
        168: {'rule': '1W', 'loffset': '108H'},
        720: {'rule': '1M', 'loffset': '16D'},
        8760: {'rule': 'AS', 'loffset': '6M'}
        }
    # Intervals not in the table are expressed in whole minutes, with the label
    # shifted by half of the interval.
    params = interval_lookup.get(averaging_hours, {'rule':str(int(averaging_hours * 60)) + 'min', 'loffset':str(int(averaging_hours * 30)) + 'min'})

    if not use_rolling_averaging:
        new_df = pandas_dataframe.resample(rule=params['rule'], label='left').mean()
        # Shift the bin labels explicitly.  The 'loffset' keyword of resample()
        # was deprecated in pandas 1.1 and removed in 2.0; adding the offset to
        # the resulting index is the documented replacement.  As with 'loffset',
        # the shift is applied only to a non-empty DatetimeIndex.
        if isinstance(new_df.index, pd.DatetimeIndex) and len(new_df.index) > 0:
            new_df.index = new_df.index + to_offset(params['loffset'])
    else:
        # resample to consistent interval
        original_interval = pandas_dataframe.index.to_series().diff().quantile(.05)
        new_df = pandas_dataframe.resample(rule=original_interval).median()

        # apply the rolling averaging
        new_df = new_df.rolling(int(pd.Timedelta(hours=averaging_hours) / original_interval),center=True,min_periods=1).mean()

        # downsample the result if there are more than 1000 values
        if len(new_df) > 1000:
            new_df = new_df.resample(rule=(pandas_dataframe.index[-1] - pandas_dataframe.index[0]) / 1000).mean()

    if drop_na:
        new_df = new_df.dropna()

    return new_df