Script committed

This commit is contained in:
Nicolas Dickinson 2024-09-12 12:39:39 +02:00
parent e0779a216b
commit f48d0dd756
2 changed files with 131 additions and 5 deletions

View File

@ -1,7 +1,38 @@
# dstcrusher
# dstcrusher - Summer Time (DST) Uncrushing Tool for Cumulative Meter Readings
A simple approach for meter readings in time series:
- Are the times recorded with Daylight Savings Time (DST) and naively compress a week of (cumulative) values into the first day of winter time?
- There will be a jump in the cumulative value in the first step of winter time in this case.
A simple approach for meter readings in time series where the time was recorded with Daylight Savings Time (DST) and a week of (cumulative) values was naively compressed into the first day of winter time. For this case, this script takes a dataset with DST applied and removes it by subtracting an hour from records in summer time.
For this case, it removes the timezone signature (if any) and subtracts an hour from records in summer time to convert back to standard time.
It will leave the gap created in the old transition from winter to summer time as this gap is addressed through imputation to fill the missing values with the jump in the cumulative value in the first step of winter time.
It will leave the gap created in the old transition from winter to summer time as this gap should be addressed through imputation afterwards to fill the missing values using the jump in cumulative values.
Useful for some applications that take meter readings from dashboards that incorrectly store timestamps/signatures.
Useful for some applications that take meter readings from dashboards that incorrectly store timestamps/signatures through application of summer time and crushing the last week into the first value of winter time.
## Functions
### 1. `create_test_data()`
- Creates a test dataset in UTC with continuous 15-minute intervals.
- Converts the dataset to Europe/Amsterdam timezone.
- Returns a prepared dataframe for testing.
### 2. `get_exact_dst_transitions(year, timezone)`
- Calculates the exact DST transitions for a given year and timezone.
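- Returns a list of the transitions found (start and end of DST, i.e. summer time), each with its exact time and the UTC offsets before and after the change.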
### 3. `correct_summer_time_with_timezone_check(df, datetime_col, timezone)`
- Corrects the dataset by:
- Checking if the dataset has timezone-aware data, logging and removing the timezone signature if needed.
- Dynamically calculating the DST transitions for each year and applying a 1-hour subtraction to return times to standard time.
## Example Usage
Run the script with the following steps:
1. Create test data using `create_test_data()`.
2. Correct summer time and remove timezone using `correct_summer_time_with_timezone_check()`.
3. Print the results for the DST transition periods (March and October).
# disclaimer
This was completely thought up by Nicolas Dickinson but written with the help of ChatGPT 4o. FYI, it struggled greatly with timezone-aware and naive datasets, and I had to help substantially to get to the right solution and the right tests.

95
summer_time_correction.py Normal file
View File

@ -0,0 +1,95 @@
import pandas as pd
from datetime import timedelta, datetime
from zoneinfo import ZoneInfo
# Step 1: Create the test data in UTC and convert to Europe/Amsterdam timezone
def create_test_data():
    """Build a sample frame of cumulative readings in Europe/Amsterdam time.

    The timestamps are generated as a continuous 15-minute UTC series from
    2023-03-25 to 2023-10-30 (inclusive) and then expressed in the
    Europe/Amsterdam timezone. Two integer columns count upwards from 10000
    and 20000, one step per row.

    Returns:
        A DataFrame with the columns ``datetime`` (timezone-aware),
        ``cumulative_col1`` and ``cumulative_col2``.
    """
    # Generate the UTC timeline once and reuse it for every column length
    utc_timeline = pd.date_range('2023-03-25', '2023-10-30', freq='15min', tz='UTC')
    row_count = len(utc_timeline)

    # Same instants, shown on the Dutch wall clock
    return pd.DataFrame({
        'datetime': utc_timeline.tz_convert('Europe/Amsterdam'),
        'cumulative_col1': range(10000, 10000 + row_count),
        'cumulative_col2': range(20000, 20000 + row_count),
    })
# Function to get exact DST transitions for a given year
def get_exact_dst_transitions(year, timezone='Europe/Amsterdam'):
    """Find the UTC-offset changes that occur in ``timezone`` during ``year``.

    The year is scanned in one-hour steps, starting at 1 January 00:00 in
    the given zone. Whenever the UTC offset of a step differs from the
    offset of the following step, a transition record is emitted.

    Args:
        year: Calendar year to scan.
        timezone: IANA timezone name understood by ``ZoneInfo``.

    Returns:
        A list of dicts in chronological order, each with the keys
        ``transition`` ("Start of DST" when the offset increases, otherwise
        "End of DST"), ``exact_time`` (timezone-aware datetime of the first
        step carrying the new offset), ``previous_offset`` and
        ``new_offset`` (both ``timedelta``). The list is empty when the
        offset never changes during the year.
    """
    zone = ZoneInfo(timezone)
    one_hour = timedelta(hours=1)
    scan_end = datetime(year + 1, 1, 1, tzinfo=zone)

    found = []
    cursor = datetime(year, 1, 1, tzinfo=zone)
    while cursor < scan_end:
        upcoming = cursor + one_hour
        offset_before = cursor.utcoffset()
        offset_after = upcoming.utcoffset()
        if offset_before != offset_after:
            # A larger offset means the clocks moved forward (summer time begins)
            if offset_after > offset_before:
                label = "Start of DST"
            else:
                label = "End of DST"
            found.append({
                'transition': label,
                'exact_time': upcoming.astimezone(zone),
                'previous_offset': offset_before,
                'new_offset': offset_after,
            })
        cursor = upcoming
    return found
# Step 2: Function to correct summer time, log timezone changes, and handle DST dynamically
def correct_summer_time_with_timezone_check(df, datetime_col, timezone='Europe/Amsterdam'):
    """Shift summer-time timestamps back by one hour to standard time.

    If ``datetime_col`` is timezone-aware, its timezone must equal
    ``timezone``; the timezone information is then removed so that only the
    local wall-clock readings remain. For every year present in the column
    the DST periods are looked up with ``get_exact_dst_transitions`` and one
    hour is subtracted from every timestamp that falls inside such a period.

    NOTE: ``df`` is modified in place and also returned.

    NOTE: once the timestamps are naive, the repeated wall-clock hour at the
    end of DST cannot be told apart; every reading before the end-of-DST
    time is treated as summer time and shifted.

    Args:
        df: DataFrame holding the readings.
        datetime_col: Name of the datetime column to correct.
        timezone: IANA timezone name whose DST rules apply.

    Returns:
        The same DataFrame object, with ``datetime_col`` naive and expressed
        in standard time.

    Raises:
        ValueError: If the column is timezone-aware with a timezone other
            than ``timezone``.
    """
    # Strip the timezone signature (after verifying it) so that all further
    # comparisons operate on naive wall-clock values.
    if isinstance(df[datetime_col].dtype, pd.DatetimeTZDtype):
        column_tz = df[datetime_col].dt.tz
        # pytz exposes the name as ``.zone``, zoneinfo as ``.key``; fall back
        # to ``str()`` for other tzinfo implementations.
        detected_timezone = (
            getattr(column_tz, 'zone', None)
            or getattr(column_tz, 'key', None)
            or str(column_tz)
        )
        if detected_timezone != timezone:
            raise ValueError(f"Detected timezone '{detected_timezone}' does not match the expected timezone '{timezone}'")
        print(f"Timezone '{detected_timezone}' detected. Removing timezone information.")
        df[datetime_col] = df[datetime_col].dt.tz_localize(None)

    timestamps = df[datetime_col]
    # Missing timestamps (NaT) have no year and must not reach the lookup
    years = pd.to_datetime(timestamps).dt.year.dropna().unique()

    # Collect all DST periods first and shift once afterwards, so a shifted
    # timestamp can never be matched a second time.
    dst_mask = pd.Series(False, index=df.index, dtype=bool)
    for year in years:
        year = int(year)
        transitions = get_exact_dst_transitions(year, timezone)

        # A year whose first transition ends DST began in summer time
        # (southern-hemisphere zones); that period opens on 1 January.
        period_start = None
        if transitions and transitions[0]['transition'] == "End of DST":
            period_start = datetime(year, 1, 1)

        for transition in transitions:
            wall_time = transition['exact_time'].replace(tzinfo=None)
            if transition['transition'] == "Start of DST":
                period_start = wall_time
            elif period_start is not None:
                dst_mask |= (timestamps >= period_start) & (timestamps < wall_time)
                period_start = None

        # DST still running at the end of the year: close at the year boundary
        if period_start is not None:
            dst_mask |= (timestamps >= period_start) & (timestamps < datetime(year + 1, 1, 1))

    # Subtract 1 hour during the DST periods
    if dst_mask.any():
        df.loc[dst_mask, datetime_col] = df.loc[dst_mask, datetime_col] - timedelta(hours=1)
    return df
# Example of usage
if __name__ == "__main__":
    # Build the sample dataset, strip its timezone and undo summer time
    corrected = correct_summer_time_with_timezone_check(create_test_data(), 'datetime')

    # Print the rows around the spring and autumn DST transitions
    # (both window bounds are inclusive)
    transition_windows = (
        ('2023-03-26 01:00:00', '2023-03-26 04:00:00'),
        ('2023-10-29 01:00:00', '2023-10-29 04:00:00'),
    )
    for window_start, window_end in transition_windows:
        in_window = corrected['datetime'].between(window_start, window_end)
        print(corrected[in_window])