diff --git a/.gitignore b/.gitignore
index 5d381cc..f7b964f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -160,3 +160,5 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 
+LOST_July_2024_Rate_Table.xlsx
+~*.xlsx
diff --git a/README.md b/README.md
index 52bb800..e4dad4b 100644
--- a/README.md
+++ b/README.md
@@ -31,7 +31,7 @@ Later on, Avalara could be used if thresholds set in the us_sales_tax_compliance
 
 ## Example
 
-So far the only code that works is the example-filltable.py which scrapes data from the web and populates the tables with data for VAT rates and reverse charge mechanisms but does not yet include U.S. sales tax compliance. Open to suggestions on how to improve this to include U.S. sales tax compliance.
+So far the only code that works is the example-filltable.py which scrapes data from the web and populates the tables with data for VAT rates and reverse charge mechanisms. Open to suggestions on how to improve this.
 
 ## Setup
 The SQL script provided in this repository will create the following tables:
diff --git a/example-filltable.py b/example-filltable.py
index a2784d3..275f014 100644
--- a/example-filltable.py
+++ b/example-filltable.py
@@ -40,6 +40,17 @@ CREATE TABLE IF NOT EXISTS vat_rules (
     vat_action INTEGER DEFAULT NULL,
     PRIMARY KEY (country_code, is_registered)
 );
+
+CREATE TABLE IF NOT EXISTS us_sales_tax_compliance (
+    state_code TEXT PRIMARY KEY,
+    sales_threshold REAL,
+    transaction_threshold INTEGER,
+    sales_tax_rate REAL,
+    threshold_reached BOOLEAN,
+    sales_block BOOLEAN,
+    total_sales REAL,
+    total_transactions INTEGER
+);
 ''')
 
 logging.info("Tables created successfully")
@@ -173,6 +184,148 @@ conn.commit()
 conn.close()
 logging.info("Database connection closed")
 
+# Let's download the U.S. sales tax compliance data from the web
+# From: https://taxfoundation.org/wp-content/uploads/2024/07/LOST_July_2024_Rate_Table.xlsx
+
+# Define file paths
+lost_file_path = "LOST_July_2024_Rate_Table.xlsx"
+
+# This is data manually gathered from the sources indicated in the file to crosscheck facts
+thresholds_file_path = "state_sales_tax_thresholds.xlsx"
+
+# Check if files exist, download if they don't
+if not os.path.exists(lost_file_path):
+    logging.info("Downloading U.S. sales tax compliance data")
+    url = "https://taxfoundation.org/wp-content/uploads/2024/07/LOST_July_2024_Rate_Table.xlsx"
+    # A timeout keeps the script from hanging forever on a stalled connection.
+    response = requests.get(url, timeout=30)
+    if response.status_code != 200:
+        logging.error(f"Failed to download U.S. sales tax data. Status code: {response.status_code}")
+        raise RuntimeError("Failed to download U.S. sales tax data")
+    with open(lost_file_path, "wb") as file:
+        file.write(response.content)
+    logging.info("U.S. sales tax data downloaded successfully")
+else:
+    logging.info("U.S. sales tax data file already exists, using existing file")
+
+if not os.path.exists(thresholds_file_path):
+    logging.error("State sales tax thresholds file not found")
+    raise FileNotFoundError("state_sales_tax_thresholds.xlsx is missing")
+logging.info("Using existing state sales tax thresholds file")
+
+# Read the Excel files
+logging.info("Reading U.S. sales tax and threshold data from Excel files")
+lost_df = pd.read_excel(lost_file_path, sheet_name="Sheet1")
+thresholds_df = pd.read_excel(thresholds_file_path, sheet_name=0)  # Explicitly read the first sheet
+
+
+def clean_state_name(state_name):
+    """Return the state name with everything from the first "(" onwards removed.
+
+    Missing values (as judged by pd.isna) are returned as pd.NA.
+    """
+    if pd.isna(state_name):
+        return pd.NA
+    return str(state_name).split('(')[0].strip()
+
+
+def parse_sales_threshold(value):
+    """Convert a threshold cell such as "$100,000" to a float; missing cells become None."""
+    if pd.isna(value):
+        return None
+    return float(str(value).replace('$', '').replace(',', ''))
+
+
+def to_sqlite_value(value):
+    """Return None for missing values and a native Python scalar for NumPy scalars."""
+    if pd.isna(value):
+        return None
+    # sqlite3 only knows native Python types; .item() unwraps NumPy scalars.
+    return value.item() if hasattr(value, "item") else value
+
+
+# Clean state names in both dataframes
+lost_df['State'] = lost_df['State'].apply(clean_state_name)
+thresholds_df['State Name'] = thresholds_df['State Name'].apply(clean_state_name)
+
+# Look up each state's rate once instead of filtering lost_df for every row.
+# The first entry wins when a state name appears more than once.
+# NOTE(review): the value stored as sales_tax_rate is the 'State Tax Rate' column,
+# not a combined state + local rate - confirm that this is the intended column.
+state_rates = {}
+for lost_state, lost_rate in zip(lost_df['State'], lost_df['State Tax Rate']):
+    if not pd.isna(lost_state):
+        state_rates.setdefault(lost_state, lost_rate)
+
+# Process and insert data into us_sales_tax_compliance table
+logging.info("Processing and inserting U.S. sales tax and threshold data")
+conn = sqlite3.connect(db_path)
+try:
+    cursor = conn.cursor()
+
+    for _, threshold_row in thresholds_df.iterrows():
+        state_name = threshold_row['State Name']
+        state_code = threshold_row['State Code']
+
+        # Skip rows where state_name is NA or null
+        if pd.isna(state_name):
+            logging.info("Skipping row with NA state name in thresholds data")
+            continue
+
+        # state_code is the primary key, so a row without one cannot be stored sensibly.
+        if pd.isna(state_code):
+            logging.warning(f"Skipping {state_name}: no state code in thresholds data")
+            continue
+
+        logging.info(f"Processing state: {state_name}")
+
+        # Find corresponding LOST data
+        if state_name in state_rates:
+            state_tax_rate = to_sqlite_value(state_rates[state_name])
+            logging.info(f"LOST data found - State tax rate: {state_tax_rate}")
+        else:
+            state_tax_rate = None
+            logging.warning(f"No LOST data found for {state_name}")
+
+        # Missing cells become None (SQL NULL); '$' and ',' are stripped from the threshold
+        sales_threshold = parse_sales_threshold(threshold_row['Threshold'])
+        transaction_threshold = to_sqlite_value(threshold_row['Transactions'])
+
+        logging.info(f"Inserting data for {state_name}: State Code: {state_code}, Sales threshold: {sales_threshold}, Transaction threshold: {transaction_threshold}, State tax rate: {state_tax_rate}")
+        # NOTE(review): INSERT OR REPLACE rewrites the whole row, so the counters and flags
+        # below are reset to 0/False every time this script runs.
+        cursor.execute('''
+            INSERT OR REPLACE INTO us_sales_tax_compliance
+            (state_code, sales_threshold, transaction_threshold, sales_tax_rate, threshold_reached, sales_block, total_sales, total_transactions)
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+        ''', (state_code, sales_threshold, transaction_threshold, state_tax_rate, False, False, 0, 0))
+        logging.info(f"Data inserted for {state_name}")
+
+    conn.commit()
+    logging.info("U.S. sales tax compliance data inserted successfully")
+
+    # Check for any states in LOST data that weren't in thresholds data
+    lost_states = set(lost_df['State'].dropna())
+    threshold_states = set(thresholds_df['State Name'].dropna())
+    missing_states = lost_states - threshold_states
+    if missing_states:
+        logging.warning(f"States in LOST data but not in thresholds data: {missing_states}")
+
+    # Print table
+    logging.info("Fetching all U.S. sales tax compliance data from database")
+    cursor.execute('''
+        SELECT * FROM us_sales_tax_compliance
+    ''')
+    us_tax_data = cursor.fetchall()
+    logging.info(f"Total U.S. sales tax entries: {len(us_tax_data)}")
+    logging.info(f"First few U.S. sales tax entries:\n{us_tax_data[:5]}")
+finally:
+    # Close the connection even if reading, parsing or inserting a row fails.
+    conn.close()
+    logging.info("Database connection closed")
+
+logging.info(f"Excel files kept at: {lost_file_path} and {thresholds_file_path}")
+
 # Provide the path to the created SQLite database
 logging.info(f"SQLite database created at: {db_path}")
 db_path
diff --git a/state_sales_tax_thresholds.xlsx b/state_sales_tax_thresholds.xlsx
new file mode 100644
index 0000000..31f8040
Binary files /dev/null and b/state_sales_tax_thresholds.xlsx differ
diff --git a/ttaxt.db b/ttaxt.db
index 28ed1fb..1147131 100644
Binary files a/ttaxt.db and b/ttaxt.db differ