-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathget_data_multi_note.py
More file actions
98 lines (91 loc) · 3.39 KB
/
get_data_multi_note.py
File metadata and controls
98 lines (91 loc) · 3.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
def read_and_preprocess_data(
    should_smooth=False,
    smoothing_window=100,
    sequence_length=120,
    cut_off_min=5,
    cut_off_max=45,
    should_scale=True,
    data_path="/home/paperspace/development/datasets/data.txt",
    batch_size=32,
    motes_train=(1, 2, 3, 4, 6, 7, 9, 10, 32, 34, 35),
    motes_test=(36,),
):
    """
    Load the temperature sensor data of the "Intel Berkeley Research Lab" dataset, clean it and scale it down.
    :parameters:
        should_smooth(boolean) -- apply a rolling-mean filter to the series before windowing
        smoothing_window(number) -- window size of the rolling mean
        sequence_length(number) -- length of each output subsequence
        cut_off_min(number) -- threshold to discard all temperatures below that point
        cut_off_max(number) -- threshold to discard all temperatures above that point
        should_scale(boolean) -- min-max-scale data into [0, 1]; statistics are fitted on the
                                 training motes only and reused for the test motes
        data_path(string) -- path to the file containing all data
        batch_size(number) -- sample counts are trimmed down to a whole multiple of this
        motes_train(iterable) -- mote ids concatenated into the training set
        motes_test(iterable) -- mote ids concatenated into the test set
    :returns:
        x_train -- numpy array of shape (n_train_samples, sequence_length, 1)
        x_test -- numpy array of shape (n_test_samples, sequence_length, 1)
    """
    # Load the whitespace-separated sensor log.
    df = pd.read_csv(
        data_path,
        sep=" ",
        lineterminator="\n",
        names=[
            "date",
            "time",
            "epoch",
            "moteid",
            "temperature",
            "humidity",
            "light",
            "voltage",
        ],
    )
    # Drop incomplete readings.
    df.dropna(inplace=True)
    # Discard physically implausible temperatures (hard thresholds).
    df.drop(
        df[(df["temperature"] < cut_off_min) | (df["temperature"] > cut_off_max)].index,
        inplace=True,
    )
    # Discard statistical outliers beyond three standard deviations of the mean.
    temp_mean = df["temperature"].mean()
    temp_std = df["temperature"].std()
    lower_bound = temp_mean - 3 * temp_std
    upper_bound = temp_mean + 3 * temp_std
    df.drop(
        df[(df["temperature"] < lower_bound) | (df["temperature"] > upper_bound)].index,
        inplace=True,
    )

    def concat_motes(mote_ids):
        # Concatenate the temperature series of all requested motes into one series.
        tmp_frames = [
            df.loc[df["moteid"] == mote_id]["temperature"].reset_index(drop=True)
            for mote_id in mote_ids
        ]
        return pd.concat(tmp_frames, axis=0)

    x_train = concat_motes(motes_train)
    x_test = concat_motes(motes_test)
    if should_smooth:
        # Rolling mean; the leading `smoothing_window` entries (the NaN warm-up) are dropped.
        x_train = x_train.rolling(window=smoothing_window).mean()[smoothing_window:]
        x_test = x_test.rolling(window=smoothing_window).mean()[smoothing_window:]
    # Always continue on (n, 1) numpy arrays: the unscaled path previously kept a
    # pandas Series, which crashed on the numpy-style indexing in reshape_inputs.
    x_train = x_train.values.reshape(-1, 1)
    x_test = x_test.values.reshape(-1, 1)
    if should_scale:
        # Min-max scale into [0, 1] using the TRAINING statistics for both sets
        # (equivalent to MinMaxScaler().fit(x_train) followed by transform on each).
        # Fitting a second scaler on the test set -- the previous behaviour --
        # leaks test statistics and puts the two sets on incompatible scales.
        train_min = x_train.min()
        train_span = x_train.max() - train_min
        if train_span == 0:
            # Constant training series: map everything to 0 instead of dividing by zero.
            train_span = 1.0
        x_train = (x_train - train_min) / train_span
        x_test = (x_test - train_min) / train_span
    ###
    # Prepare the data
    ###
    def reshape_inputs(data, seq_len):
        # Cut the series into non-overlapping windows of `seq_len` steps, then
        # trim the window count down to a whole number of batches.
        assert seq_len <= data.shape[0]
        usable = data.shape[0] - data.shape[0] % seq_len
        windows = data[:usable, :].reshape(-1, seq_len, data.shape[1])
        full_batches = windows.shape[0] - windows.shape[0] % batch_size
        return windows[:full_batches]

    x_train = reshape_inputs(x_train, sequence_length)
    x_test = reshape_inputs(x_test, sequence_length)
    return x_train, x_test