diff --git a/README.md b/README.md index ff2b576..9c31f97 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,38 @@ -# dstcrusher - -A simple approach for meter readings in time series with the time recorded with Daylight Savings Time (DST) and naively compress a week of (cumulative) values into the first day of winter time. For this case, this script takes a dataset with DST applied and removes it be subtracting an hour from records in summer time. +# dstcrusher - Summer Time (DST) Uncrushing Tool for Cumulative Meter Readings +A simple approach for meter readings in time series: +- Are the times recorded with Daylight Savings Time (DST) and naively compress a week of (cumulative) values into the first day of winter time? +- There will be a jump in the cumulative value in the first step of winter time in this case. -It will leave the gap created in the old transition from winter to summer time as this gap is addressed through imputation to fill the missing values with the jump in the cumulative value in the first step of winter time. +For this case, it removes the timezone signature (if any) and subtracts an hour from records in summer time to convert back to standard time. -Useful for some applications that take meter readings from dashboards that incorrectly store timestamps/signatures. \ No newline at end of file +It will leave the gap created in the old transition from winter to summer time as this gap should be addressed through imputation afterwards to fill the missing values using the jump in cumulative values. + +Useful for some applications that take meter readings from dashboards that incorrectly store timestamps/signatures through application of summer time and crushing the last week into the first value of winter time. + +## Functions + +### 1. `create_test_data()` +- Creates a test dataset in UTC with continuous 15-minute intervals. +- Converts the dataset to Europe/Amsterdam timezone. +- Returns a prepared dataframe for testing. + +### 2. `get_exact_dst_transitions(year, timezone)` +- Calculates the exact DST transitions for a given year and timezone. +- Returns the start and end times of DST (summer time). + +### 3. `correct_summer_time_with_timezone_check(df, datetime_col, timezone)` +- Corrects the dataset by: + - Checking if the dataset has timezone-aware data, logging and removing the timezone signature if needed. + - Dynamically calculating the DST transitions for each year and applying a 1-hour subtraction to return times to standard time. + +## Example Usage + +Run the script with the following steps: + +1. Create test data using `create_test_data()`. +2. Correct summer time and remove timezone using `correct_summer_time_with_timezone_check()`. +3. Print the results for the DST transition periods (March and October). + +# disclaimer + +This was completely thought up by Nicolas Dickinson but written with the help of chatgpt 4o. FYI, it strugged greatly with timezone aware and naive datasets and I had to help substantially to get to the right solution and right tests. \ No newline at end of file diff --git a/summer_time_correction.py b/summer_time_correction.py new file mode 100644 index 0000000..d9d4b68 --- /dev/null +++ b/summer_time_correction.py @@ -0,0 +1,95 @@ + +import pandas as pd +from datetime import timedelta, datetime +from zoneinfo import ZoneInfo + +# Step 1: Create the test data in UTC and convert to Europe/Amsterdam timezone +def create_test_data(): + # Create the dataset in UTC with continuous 15-minute intervals + df_test_utc = pd.DataFrame({ + 'datetime': pd.date_range('2023-03-25', '2023-10-30', freq='15min', tz='UTC'), + 'cumulative_col1': range(10000, 10000 + len(pd.date_range('2023-03-25', '2023-10-30', freq='15min', tz='UTC'))), + 'cumulative_col2': range(20000, 20000 + len(pd.date_range('2023-03-25', '2023-10-30', freq='15min', tz='UTC'))) + }) + + # Convert the UTC datetime column to Europe/Amsterdam timezone + df_test_dutch = df_test_utc.copy() + df_test_dutch['datetime'] = df_test_dutch['datetime'].dt.tz_convert('Europe/Amsterdam') + + # Return the prepared dataframe + return df_test_dutch + +# Function to get exact DST transitions for a given year +def get_exact_dst_transitions(year, timezone='Europe/Amsterdam'): + tz = ZoneInfo(timezone) + start_of_year = datetime(year, 1, 1, tzinfo=tz) + end_of_year = datetime(year + 1, 1, 1, tzinfo=tz) + + current_time = start_of_year + transitions = [] + + # Check hour by hour for the exact transition + while current_time < end_of_year: + next_time = current_time + timedelta(hours=1) + if current_time.utcoffset() != next_time.utcoffset(): + transition_type = "Start of DST" if next_time.utcoffset() > current_time.utcoffset() else "End of DST" + transitions.append({ + 'transition': transition_type, + 'exact_time': next_time.astimezone(tz), + 'previous_offset': current_time.utcoffset(), + 'new_offset': next_time.utcoffset() + }) + + current_time = next_time + + return transitions + +# Step 2: Function to correct summer time, log timezone changes, and handle DST dynamically +def correct_summer_time_with_timezone_check(df, datetime_col, timezone='Europe/Amsterdam'): + # Check if the datetime column is timezone-aware + + if isinstance(df[datetime_col].dtype, pd.DatetimeTZDtype): + detected_timezone = df[datetime_col].dt.tz.zone + + # Ensure the timezone matches the one passed to the function + if detected_timezone != timezone: + raise ValueError(f"Detected timezone '{detected_timezone}' does not match the expected timezone '{timezone}'") + + print(f"Timezone '{detected_timezone}' detected. Removing timezone information.") + df[datetime_col] = df[datetime_col].dt.tz_localize(None) + + # Get the unique years in the dataset + years = pd.to_datetime(df[datetime_col]).dt.year.unique() + + # Adjust for each year's DST transition period + for year in years: + transitions = get_exact_dst_transitions(year, timezone) + start_of_dst = transitions[0]['exact_time'].replace(tzinfo=None) + end_of_dst = transitions[1]['exact_time'].replace(tzinfo=None) + + # Subtract 1 hour during the DST period + dst_mask = (df[datetime_col] >= start_of_dst) & (df[datetime_col] < end_of_dst) + df.loc[dst_mask, datetime_col] = df.loc[dst_mask, datetime_col] - timedelta(hours=1) + + return df + +# Example of usage +if __name__ == "__main__": + # Create test data + df_test = create_test_data() + + # Correct summer time and remove timezone information + df_corrected = correct_summer_time_with_timezone_check(df_test, 'datetime') + + # Display the dataset for the DST transition periods + dst_transition_corrected = df_corrected[ + (df_corrected['datetime'] >= '2023-03-26 01:00:00') & + (df_corrected['datetime'] <= '2023-03-26 04:00:00') + ] + print(dst_transition_corrected) + + dst_transition_winter = df_corrected[ + (df_corrected['datetime'] >= '2023-10-29 01:00:00') & + (df_corrected['datetime'] <= '2023-10-29 04:00:00') + ] + print(dst_transition_winter)