Adding LOST and thresholds and updating database; update git ignore

This commit is contained in:
Nicolas Dickinson 2024-12-02 17:32:59 +01:00
parent 8333b885c5
commit 9d80f57ebc
5 changed files with 126 additions and 1 deletions

2
.gitignore vendored
View File

@ -160,3 +160,5 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
LOST_July_2024_Rate_Table.xlsx
~*.xlsx

View File

@ -31,7 +31,7 @@ Later on, Avalara could be used if thresholds set in the us_sales_tax_compliance
## Example
So far the only code that works is the example-filltable.py which scrapes data from the web and populates the tables with data for VAT rates and reverse charge mechanisms but does not yet include U.S. sales tax compliance. Open to suggestions on how to improve this to include U.S. sales tax compliance.
So far the only code that works is the example-filltable.py which scrapes data from the web and populates the tables with data for VAT rates and reverse charge mechanisms. Open to suggestions on how to improve this.
## Setup
The SQL script provided in this repository will create the following tables:

View File

@ -40,6 +40,17 @@ CREATE TABLE IF NOT EXISTS vat_rules (
vat_action INTEGER DEFAULT NULL,
PRIMARY KEY (country_code, is_registered)
);
CREATE TABLE IF NOT EXISTS us_sales_tax_compliance (
state_code TEXT PRIMARY KEY,
sales_threshold REAL,
transaction_threshold INTEGER,
sales_tax_rate REAL,
threshold_reached BOOLEAN,
sales_block BOOLEAN,
total_sales REAL,
total_transactions INTEGER
);
''')
logging.info("Tables created successfully")
@ -173,6 +184,118 @@ conn.commit()
conn.close()
logging.info("Database connection closed")
# Make sure both Excel inputs are available locally, then load them.
# The rate table is fetched from the web when it is not already on disk:
# From: https://taxfoundation.org/wp-content/uploads/2024/07/LOST_July_2024_Rate_Table.xlsx

# Define file paths
lost_file_path = "LOST_July_2024_Rate_Table.xlsx"
# This is data manually gathered from the sources indicated in the file to crosscheck facts
thresholds_file_path = "state_sales_tax_thresholds.xlsx"

# Check if files exist, download if they don't
if not os.path.exists(lost_file_path):
    logging.info("Downloading U.S. sales tax compliance data")
    url = "https://taxfoundation.org/wp-content/uploads/2024/07/LOST_July_2024_Rate_Table.xlsx"
    # Without a timeout, requests waits forever on an unresponsive server;
    # bound the wait so the script fails instead of hanging.
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
        with open(lost_file_path, "wb") as file:
            file.write(response.content)
        logging.info("U.S. sales tax data downloaded successfully")
    else:
        logging.error(f"Failed to download U.S. sales tax data. Status code: {response.status_code}")
        raise Exception("Failed to download U.S. sales tax data")
else:
    logging.info("U.S. sales tax data file already exists, using existing file")

# The thresholds workbook is maintained by hand and cannot be downloaded,
# so its absence is a hard error.
if not os.path.exists(thresholds_file_path):
    logging.error("State sales tax thresholds file not found")
    raise FileNotFoundError("state_sales_tax_thresholds.xlsx is missing")
else:
    logging.info("Using existing state sales tax thresholds file")

# Read the Excel files
logging.info("Reading U.S. sales tax and threshold data from Excel files")
lost_df = pd.read_excel(lost_file_path, sheet_name="Sheet1")
thresholds_df = pd.read_excel(thresholds_file_path, sheet_name=0)  # Explicitly read the first sheet
def clean_state_name(state_name):
    """Return the state name without any parenthesised suffix.

    Everything from the first '(' onward is dropped and surrounding
    whitespace is stripped. Missing values (anything ``pd.isna`` reports
    as NA) are returned as ``pd.NA``. Non-string inputs are converted
    with ``str`` before cleaning.
    """
    if not pd.isna(state_name):
        # partition keeps only the text before the first '(' (or the
        # whole string when there is no parenthesis).
        base_name, _, _ = str(state_name).partition('(')
        return base_name.strip()
    return pd.NA
# Clean state names in both dataframes so they can be matched on name
lost_df['State'] = lost_df['State'].apply(clean_state_name)
thresholds_df['State Name'] = thresholds_df['State Name'].apply(clean_state_name)

# Process and insert data into us_sales_tax_compliance table
logging.info("Processing and inserting U.S. sales tax and threshold data")
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()

    for _, threshold_row in thresholds_df.iterrows():
        state_name = threshold_row['State Name']
        state_code = threshold_row['State Code']
        sales_threshold = threshold_row['Threshold']
        transaction_threshold = threshold_row['Transactions']

        # Skip rows where state_name is NA or null
        if pd.isna(state_name):
            logging.info("Skipping row with NA state name in thresholds data")
            continue

        # state_code is the table's primary key. A missing code cannot
        # identify a row, so INSERT OR REPLACE could never replace it on a
        # later run; skip such rows instead of storing them.
        if pd.isna(state_code):
            logging.warning(f"Skipping {state_name}: no state code in thresholds data")
            continue

        logging.info(f"Processing state: {state_name}")

        # Find corresponding LOST data (first match wins if a name repeats)
        lost_row = lost_df[lost_df['State'] == state_name]

        # NOTE(review): the value stored in sales_tax_rate is read from the
        # 'State Tax Rate' column. Earlier log messages called it the
        # "combined rate" -- confirm the state-level rate is what is wanted.
        if not lost_row.empty:
            state_tax_rate = lost_row['State Tax Rate'].values[0]
            logging.info(f"LOST data found - State tax rate: {state_tax_rate}")
        else:
            state_tax_rate = None
            logging.warning(f"No LOST data found for {state_name}")

        # Convert missing values (NaN / NA) to None so SQLite stores NULL
        sales_threshold = None if pd.isna(sales_threshold) else sales_threshold
        transaction_threshold = None if pd.isna(transaction_threshold) else transaction_threshold
        state_tax_rate = None if pd.isna(state_tax_rate) else state_tax_rate

        # Remove '$' and ',' from sales_threshold if it's not None
        if sales_threshold is not None:
            sales_threshold = float(str(sales_threshold).replace('$', '').replace(',', ''))

        logging.info(f"Inserting data for {state_name}: State Code: {state_code}, Sales threshold: {sales_threshold}, Transaction threshold: {transaction_threshold}, State tax rate: {state_tax_rate}")

        # Running totals and flags start at zero / False for every state.
        cursor.execute('''
        INSERT OR REPLACE INTO us_sales_tax_compliance
        (state_code, sales_threshold, transaction_threshold, sales_tax_rate, threshold_reached, sales_block, total_sales, total_transactions)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        ''', (state_code, sales_threshold, transaction_threshold, state_tax_rate, False, False, 0, 0))

        logging.info(f"Data inserted for {state_name}")

    conn.commit()
    logging.info("U.S. sales tax compliance data inserted successfully")

    # Check for any states in LOST data that weren't in thresholds data
    lost_states = set(lost_df['State'].dropna())
    threshold_states = set(thresholds_df['State Name'].dropna())
    missing_states = lost_states - threshold_states
    if missing_states:
        logging.warning(f"States in LOST data but not in thresholds data: {missing_states}")

    # Print table
    logging.info("Fetching all U.S. sales tax compliance data from database")
    cursor.execute('''
    SELECT * FROM us_sales_tax_compliance
    ''')
    us_tax_data = cursor.fetchall()
    logging.info(f"Total U.S. sales tax entries: {len(us_tax_data)}")
    logging.info(f"First few U.S. sales tax entries:\n{us_tax_data[:5]}")
finally:
    # Always release the database handle, even when a row fails to parse
    # or insert part-way through the loop.
    conn.close()
    logging.info("Database connection closed")

logging.info(f"Excel files kept at: {lost_file_path} and {thresholds_file_path}")

# Provide the path to the created SQLite database
logging.info(f"SQLite database created at: {db_path}")
# Bare expression: has no effect when run as a script; presumably kept so a
# notebook cell displays the path -- TODO confirm.
db_path

Binary file not shown.

BIN
ttaxt.db

Binary file not shown.