Adding LOST and thresholds and updating database; update git ignore
This commit is contained in:
parent
8333b885c5
commit
9d80f57ebc
2
.gitignore
vendored
2
.gitignore
vendored
@ -160,3 +160,5 @@ cython_debug/
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
|
||||
LOST_July_2024_Rate_Table.xlsx
|
||||
~*.xlsx
|
||||
|
@ -31,7 +31,7 @@ Later on, Avalara could be used if thresholds set in the us_sales_tax_compliance
|
||||
|
||||
## Example
|
||||
|
||||
So far the only code that works is the example-filltable.py which scrapes data from the web and populates the tables with data for VAT rates and reverse charge mechanisms but does not yet include U.S. sales tax compliance. Open to suggestions on how to improve this to include U.S. sales tax compliance.
|
||||
So far, the only working code is `example-filltable.py`, which scrapes data from the web and populates the tables with VAT rates and reverse charge mechanisms. Suggestions on how to improve it are welcome.
|
||||
|
||||
## Setup
|
||||
The SQL script provided in this repository will create the following tables:
|
||||
|
@ -40,6 +40,17 @@ CREATE TABLE IF NOT EXISTS vat_rules (
|
||||
vat_action INTEGER DEFAULT NULL,
|
||||
PRIMARY KEY (country_code, is_registered)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS us_sales_tax_compliance (
|
||||
state_code TEXT PRIMARY KEY,
|
||||
sales_threshold REAL,
|
||||
transaction_threshold INTEGER,
|
||||
sales_tax_rate REAL,
|
||||
threshold_reached BOOLEAN,
|
||||
sales_block BOOLEAN,
|
||||
total_sales REAL,
|
||||
total_transactions INTEGER
|
||||
);
|
||||
''')
|
||||
|
||||
logging.info("Tables created successfully")
|
||||
@ -173,6 +184,118 @@ conn.commit()
|
||||
conn.close()
|
||||
logging.info("Database connection closed")
|
||||
|
||||
# Fetch the LOST rate table from taxfoundation.org unless a local copy
# already exists, then load it and the hand-maintained thresholds workbook.
# From: https://taxfoundation.org/wp-content/uploads/2024/07/LOST_July_2024_Rate_Table.xlsx

# Define file paths
lost_file_path = "LOST_July_2024_Rate_Table.xlsx"

# This is data manually gathered from the sources indicated in the file to crosscheck facts
thresholds_file_path = "state_sales_tax_thresholds.xlsx"

# Check if files exist, download if they don't
if not os.path.exists(lost_file_path):
    logging.info("Downloading U.S. sales tax compliance data")
    url = "https://taxfoundation.org/wp-content/uploads/2024/07/LOST_July_2024_Rate_Table.xlsx"
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
        # Only write the file once we know the download succeeded, so a
        # failed request never leaves a bogus spreadsheet on disk.
        with open(lost_file_path, "wb") as file:
            file.write(response.content)
        logging.info("U.S. sales tax data downloaded successfully")
    else:
        logging.error(f"Failed to download U.S. sales tax data. Status code: {response.status_code}")
        raise Exception("Failed to download U.S. sales tax data")
else:
    logging.info("U.S. sales tax data file already exists, using existing file")

# The thresholds workbook is never downloaded; its absence is fatal.
if not os.path.exists(thresholds_file_path):
    logging.error("State sales tax thresholds file not found")
    raise FileNotFoundError("state_sales_tax_thresholds.xlsx is missing")
else:
    logging.info("Using existing state sales tax thresholds file")

# Read the Excel files
logging.info("Reading U.S. sales tax and threshold data from Excel files")
lost_df = pd.read_excel(lost_file_path, sheet_name="Sheet1")
thresholds_df = pd.read_excel(thresholds_file_path, sheet_name=0)  # Explicitly read the first sheet
|
||||
|
||||
def clean_state_name(state_name):
    """Normalize a state-name cell read from one of the spreadsheets.

    Everything from the first "(" onward is dropped (for example a trailing
    marker such as "(a)") and surrounding whitespace is stripped.

    Args:
        state_name: Raw cell value; may be missing (None / NaN / pd.NA).

    Returns:
        The cleaned name as a ``str``, or ``pd.NA`` when the cell is missing
        or nothing is left after cleaning.
    """
    if pd.isna(state_name):
        return pd.NA
    cleaned = str(state_name).split('(')[0].strip()
    # A cell holding only whitespace or only a parenthesised note would
    # otherwise become "" and slip past the callers' isna()/dropna() checks.
    return cleaned if cleaned else pd.NA
|
||||
|
||||
# Clean state names in both dataframes so they can be matched on equality
lost_df['State'] = lost_df['State'].apply(clean_state_name)
thresholds_df['State Name'] = thresholds_df['State Name'].apply(clean_state_name)

# Process and insert data into us_sales_tax_compliance table
logging.info("Processing and inserting U.S. sales tax and threshold data")
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()

    # The thresholds sheet drives the loop: one table row per threshold row.
    for _, threshold_row in thresholds_df.iterrows():
        state_name = threshold_row['State Name']
        state_code = threshold_row['State Code']
        sales_threshold = threshold_row['Threshold']
        transaction_threshold = threshold_row['Transactions']

        # Skip rows where state_name is NA or null
        if pd.isna(state_name):
            logging.info("Skipping row with NA state name in thresholds data")
            continue

        # state_code is the table's primary key. A missing value would be
        # stored as NULL, which INSERT OR REPLACE can never match, so the
        # row would be duplicated on every run.
        if pd.isna(state_code):
            logging.warning(f"Skipping {state_name}: no state code in thresholds data")
            continue

        logging.info(f"Processing state: {state_name}")

        # Find corresponding LOST data
        lost_row = lost_df[lost_df['State'] == state_name]
        if not lost_row.empty:
            # NOTE(review): the value is read from 'State Tax Rate' although
            # the variable and log messages call it a combined rate. Confirm
            # which LOST column is intended.
            combined_rate = lost_row['State Tax Rate'].values[0]
            logging.info(f"LOST data found - Combined rate: {combined_rate}")
        else:
            combined_rate = None
            logging.warning(f"No LOST data found for {state_name}")

        # Map pandas missing values (NaN / NA) to None so SQLite stores NULL
        sales_threshold = None if pd.isna(sales_threshold) else sales_threshold
        transaction_threshold = None if pd.isna(transaction_threshold) else transaction_threshold
        combined_rate = None if pd.isna(combined_rate) else combined_rate

        # Remove '$' and ',' from sales_threshold if it's not None
        if sales_threshold is not None:
            sales_threshold = float(str(sales_threshold).replace('$', '').replace(',', ''))

        logging.info(f"Inserting data for {state_name}: State Code: {state_code}, Sales threshold: {sales_threshold}, Transaction threshold: {transaction_threshold}, Combined rate: {combined_rate}")
        # Counters and flags are (re)initialised to their zero state here;
        # INSERT OR REPLACE overwrites any existing row for the same code.
        cursor.execute('''
        INSERT OR REPLACE INTO us_sales_tax_compliance
        (state_code, sales_threshold, transaction_threshold, sales_tax_rate, threshold_reached, sales_block, total_sales, total_transactions)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        ''', (state_code, sales_threshold, transaction_threshold, combined_rate, False, False, 0, 0))
        logging.info(f"Data inserted for {state_name}")

    conn.commit()
    logging.info("U.S. sales tax compliance data inserted successfully")

    # Check for any states in LOST data that weren't in thresholds data
    lost_states = set(lost_df['State'].dropna())
    threshold_states = set(thresholds_df['State Name'].dropna())
    missing_states = lost_states - threshold_states
    if missing_states:
        logging.warning(f"States in LOST data but not in thresholds data: {missing_states}")

    # Print table
    logging.info("Fetching all U.S. sales tax compliance data from database")
    cursor.execute('''
    SELECT * FROM us_sales_tax_compliance
    ''')
    us_tax_data = cursor.fetchall()
    logging.info(f"Total U.S. sales tax entries: {len(us_tax_data)}")
    logging.info(f"First few U.S. sales tax entries:\n{us_tax_data[:5]}")
finally:
    # Close the connection even if a row fails to parse or insert, so the
    # database file is never left held open by a crashed run.
    conn.close()
    logging.info("Database connection closed")

logging.info(f"Excel files kept at: {lost_file_path} and {thresholds_file_path}")

# Provide the path to the created SQLite database
logging.info(f"SQLite database created at: {db_path}")
# Bare expression: shows the path when run as the last line of a notebook
# cell; it has no effect when run as a plain script.
db_path
|
||||
|
BIN
state_sales_tax_thresholds.xlsx
Normal file
BIN
state_sales_tax_thresholds.xlsx
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user