# Stack Overflow
'''
Goal: pull back all tables in a database where the column 'Audit_Report' == 'Complete'
Why is this a script: there are over 1000 tables in the database with the same structure;
this script automates checking which reports have 'Complete'
in the 'Audit_Report' column
This script will:
1) pull down all of the tables in the database
2) pull down all of the column names per table
3) create a dataframe of table names and their column names
4) filter to all tables that contain a column named 'Audit_Report'
5) run a dynamic sql query against each table in the filtered df
6) display the matching rows, each tagged with its table name
'''
# import modules
import pandas as pd
import pyodbc
import time
# open the database connection used by every later step
# (Trusted_Connection=yes: no username/password is carried in the string)
conn = pyodbc.connect(
    ''.join((
        'Driver={SQL Server};',
        'Server=111.11.111.11;',  # servername
        'Database=databasename;',  # database name
        'Trusted_Connection=yes;',
    ))
)
# Step 1
# cursor.tables() yields one row per table; keep only the table names
cursor = conn.cursor()
listoftables = [entry.table_name for entry in cursor.tables()]

# Step 2
# for each table name, collect the names of its columns
# the result lines up positionally with 'listoftables':
# columnnames[i] holds the columns of listoftables[i]
columnnames = [
    [column.column_name for column in cursor.columns(table=name)]
    for name in listoftables
]
# Step 3
# make a dataframe of the two lists: one row per table, holding the table
# name and the list of that table's column names
df = pd.DataFrame(
    {
        'TableName': listoftables,
        'col_names': columnnames,
    }
)

# Step 4
# flag tables that have 'Audit_Report' as a column name (1 = yes, 0 = no)
# a plain list comprehension is used instead of a row-wise df.apply: it avoids
# building a Series per row, and it still works when the database has no
# tables (apply on an empty frame does not hand back a Series to assign)
df['status'] = [
    1 if 'Audit_Report' in names else 0
    for names in df['col_names']
]

# auditreports is the subset of df for tables that have an 'Audit_Report' column
auditreports = df[df['status'] == 1]
# Step 5
# query every table that has an 'Audit_Report' column and keep the rows
# whose Audit_Report value is 'Complete'

# start time to get an idea for how long the queries take in total
start_time = time.time()

# matching rows are collected per table and concatenated once after the loop;
# calling pd.concat inside the loop re-copies every earlier row on each pass
completed_frames = []

for table in auditreports['TableName']:
    # a table name cannot be bound as a query parameter, so it is
    # bracket-quoted by hand; a literal ']' in the name is doubled so it
    # cannot close the identifier early
    quoted_table = table.replace(']', ']]')

    # a dedicated name is used for the query result so the table catalog
    # held in 'df' is not overwritten
    table_rows = pd.read_sql_query('''
        select
            *
        from
            [dbo].[{}]
        where
            Audit_Report = 'Complete'
        '''.format(quoted_table), conn)

    # only keep tables that actually returned rows, tagged with their source
    if not table_rows.empty:
        table_rows['TableName'] = table
        completed_frames.append(table_rows)

    # waiting a tenth of a second to keep from overloading server or computer
    time.sleep(.1)

# 'merged' holds every matching row from every table; it is an empty
# dataframe when nothing matched (pd.concat refuses an empty list)
if completed_frames:
    merged = pd.concat(completed_frames, sort=False)
else:
    merged = pd.DataFrame()

# how long the queries took to run
minutes = (time.time() - start_time) / 60
print('query took: {} minutes'.format(minutes))

# Step 6
# show all reports that have been audited
# NOTE(review): a bare expression is only echoed in an interactive session or
# notebook; use print(merged) if this is run as a plain script
merged
# Concerns: are my comments understandable?
# Potential TODO: this is currently a script. How can I make it more flexible,
# e.g. turn it into a method or class that accepts some kwargs, so that it is
# more program-like? Which part of the script would be the easiest to turn
# into a method?