From 99e6a2888ba7e9b2e2658094e64635d6a2528850 Mon Sep 17 00:00:00 2001 From: initze Date: Tue, 31 Jan 2023 11:56:11 +0100 Subject: [PATCH] moved main functionality to function --- scripts/download_data.py | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/scripts/download_data.py b/scripts/download_data.py index e478f89..81b6fa3 100644 --- a/scripts/download_data.py +++ b/scripts/download_data.py @@ -46,58 +46,53 @@ def main(): args = parser.parse_args() - # remove file if exists - if os.path.exists(args.output_file): - os.remove(args.output_file) + download_function(args.output_file, args.start_date, args.end_date, args.datatypes, args.loc_name, args.station_id, args.token) - # Make query string - dtypes_string = '&'.join([f'datatypeid={dt}' for dt in args.datatypes]) +def download_function(output_file, start_date, end_date, datatypes, loc_name, station_id, noaa_api_token): + # remove file if exists + if os.path.exists(output_file): + os.remove(output_file) + # Make query string + dtypes_string = '&'.join([f'datatypeid={dt}' for dt in datatypes]) # convert datestring to dt - dt_start = datetime.strptime(args.start_date, '%Y-%m-%d') - dt_end = datetime.strptime(args.end_date, '%Y-%m-%d') + dt_start = datetime.strptime(start_date, '%Y-%m-%d') + dt_end = datetime.strptime(end_date, '%Y-%m-%d') # calculate number of days n_days = (dt_end - dt_start).days # calculate number of splits to fit into 1000 lines/rows - split_size = np.floor(1000 / len(args.datatypes)) + split_size = np.floor(1000 / len(datatypes)) # calculate splits split_range = np.arange(0, n_days, split_size) - # Data Loading print('Downloading data through NOAA API') datasets_list = Parallel(n_jobs=4)( - delayed(dl_noaa_api)(i, dtypes_string, args.station_id, args.token, args.start_date, args.end_date, split_size) + delayed(dl_noaa_api)(i, dtypes_string, station_id, noaa_api_token, start_date, end_date, split_size) for i in tqdm.tqdm(split_range[:]) ) - # Merge subsets and create DataFrame df = pd.concat(datasets_list) #### Pivot table to correct form df_pivot = df.pivot(index='date', columns='datatype', values='value') #### adapt factor df_pivot.loc[:, :] /= 10 - df_pivot = df_pivot.reset_index(drop=False) df_pivot['DATE'] = df_pivot.apply(lambda x: datetime.fromisoformat(x['date']).strftime('%Y-%m-%d'), axis=1) - - dr = pd.DataFrame(pd.date_range(start=args.start_date, end=args.end_date), columns=['DATE']) + dr = pd.DataFrame(pd.date_range(start=start_date, end=end_date), columns=['DATE']) dr['DATE'] = dr['DATE'].astype(str) df_merged = pd.concat([df_pivot.set_index('DATE'), dr.set_index('DATE')], join='outer', axis=1, sort=True) df_merged['DATE'] = df_merged.index - df_merged['STATION'] = args.station_id - df_merged['NAME'] = args.loc_name - + df_merged['STATION'] = station_id + df_merged['NAME'] = loc_name df_merged['TAVG'] = None df_merged['SNWD'] = None - final_cols = ["STATION", "NAME", "DATE", "PRCP", "SNWD", "TAVG", "TMAX", "TMIN"] - df_final = df_merged[final_cols] df_final = df_final.replace({np.nan: None}) - - print(f'Saving data to {args.output_file}') - df_final.to_csv(args.output_file, index=False, quoting=csv.QUOTE_ALL) + print(f'Saving data to {output_file}') + df_final.to_csv(output_file, index=False, quoting=csv.QUOTE_ALL) + return 0 if __name__ == "__main__":