I have a Python script that, given some configuration, generates a CSV file of random data that can be used for testing purposes.
I want to know whether it adheres to Python and general coding best practices. It is fast enough for my requirements in cases where I need 10K or more rows, so I am not too worried about performance, although input on performance is also appreciated.
Input:
- Schema: a dict describing each column: its name, its data type, and a constraint on its values (fixed length, within a range, or drawn from a given list)
- Number of rows
- Name of the output CSV file
Script:
import csv
import random as rnd
from abc import ABC, abstractmethod
from pathlib import Path
# CSV creator: writes a CSV file of generated values described by a given config.
# Number of decimal places passed to round() when float values are generated.
roundPrecision = 3
class BoundType(ABC):
    """Abstract base for column value generators.

    Stores the data-type label and the constraint parameters; concrete
    subclasses decide how to turn them into a value in ``generate``.
    """

    def __init__(self, dtype, params):
        """Remember the data-type label (``dtype``) and the ``params`` object."""
        self.dType, self.params = dtype, params

    @abstractmethod
    def generate(self):
        """Produce one value; must be overridden by every concrete subclass."""
class FixedLength(BoundType):
    """Generate values that have a fixed number of digits or characters.

    ``params`` may contain ``"len"`` (default 1). For ``"int"`` and ``"float"``
    it is the number of digits of the integer part; for ``"string"`` it is the
    number of upper-case letters. Any other data-type label yields ``None``.
    """

    _ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

    def generate(self):
        """Return one random value of the configured type and length.

        Raises:
            ValueError: if the type is ``"int"`` or ``"float"`` and the
                requested length is smaller than 1.
        """
        length = self.params.get("len", 1)
        if self.dType == "int":
            return self._random_int(length)
        if self.dType == "float":
            whole = self._random_int(length)
            # Draw the fraction as a whole number of 10**-roundPrecision steps
            # so it is always strictly below 1; rounding a raw random() could
            # yield 1.0 and push the value to one digit more than requested.
            scale = 10 ** roundPrecision
            fraction = rnd.randrange(scale) / scale
            # Round the sum so float addition noise cannot leak extra decimals
            # into the CSV.
            return round(whole + fraction, roundPrecision)
        if self.dType == "string":
            return "".join(rnd.choice(self._ALPHABET) for _ in range(length))
        return None

    @staticmethod
    def _random_int(length):
        """Return a random integer with exactly ``length`` decimal digits."""
        if length < 1:
            raise ValueError(f"'len' must be at least 1 for numeric types, got {length!r}")
        return rnd.randint(10 ** (length - 1), 10 ** length - 1)
class FixedRange(BoundType):
    """Generate numeric values between two bounds.

    ``params["lohi"]`` is unpacked into a (low, high) pair. ``"int"`` values
    come from ``rnd.randint(low, high)``; ``"float"`` values come from
    ``rnd.uniform(low, high)`` rounded to ``roundPrecision`` decimals. Any
    other data-type label yields ``None``.
    """

    def generate(self):
        """Return one random value inside the configured range."""
        # Unpacking happens first, so a missing "lohi" entry fails here
        # regardless of the data type.
        lower, upper = self.params.get("lohi")
        kind = self.dType
        if kind == "float":
            return round(rnd.uniform(lower, upper), roundPrecision)
        if kind == "int":
            return rnd.randint(lower, upper)
        return None
class FromPossibleValues(BoundType):
    """Pick values at random from a given collection.

    ``params["set"]`` holds the candidate values. The data-type label is not
    consulted; the returned value is whatever element was picked.
    """

    def generate(self):
        """Return one randomly chosen element of ``params["set"]``.

        Raises:
            IndexError: if there are no candidate values (raised by
                ``rnd.choice`` on an empty sequence).
        """
        candidates = self.params.get("set", ())
        # rnd.choice indexes into its argument, which sets do not support, so
        # materialise a real set into a tuple before choosing.
        if isinstance(candidates, (set, frozenset)):
            candidates = tuple(candidates)
        return rnd.choice(candidates)
def createcsv(rows, filename, schema, output_dir="./output"):
    """Write a header plus ``rows`` generated rows to ``<output_dir>/<filename>.csv``.

    Args:
        rows: Number of data rows written after the header row.
        filename: Name of the output file without the ``.csv`` extension.
        schema: Mapping of column name to an object with a ``generate()``
            method; the mapping's iteration order is the column order.
        output_dir: Directory that receives the file. Defaults to
            ``./output`` and is created when it does not exist yet.
    """
    target_dir = Path(output_dir)
    # Without this, open() fails with FileNotFoundError on a fresh checkout.
    target_dir.mkdir(parents=True, exist_ok=True)
    columns = list(schema.values())
    with open(target_dir / f"{filename}.csv", "w", encoding="UTF8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(schema.keys())
        writer.writerows(
            [column.generate() for column in columns] for _ in range(rows)
        )
Test:
from csvGen.csvGenerator import FixedLength, FixedRange, FromPossibleValues, createcsv

# Column name -> generator; the dict order is the column order of the CSV.
schema = {
    "col1": FixedLength("int", {"len": 5}),
    "col2": FixedLength("float", {"len": 5}),
    "col3": FixedLength("string", {"len": 5}),
    "col4": FixedRange("int", {"lohi": (10, 15)}),
    "col5": FixedRange("float", {"lohi": (5.5, 6.7)}),
    "col6": FromPossibleValues("int", {"set": [1, 2, 3, 4, 5]}),
    # The type labels below now match the listed values (they were "int").
    "col7": FromPossibleValues("float", {"set": [1.1, 2.2, 3.3]}),
    "col8": FromPossibleValues("string", {"set": ["A", "AB"]}),
}
rows = 10
fileName = "eightVals"
# Writes ten generated rows plus a header to ./output/eightVals.csv.
createcsv(rows, fileName, schema)
This is what the output looks like for the given test:
col1 | col2 | col3 | col4 | col5 | col6 | col7 | col8 |
---|---|---|---|---|---|---|---|
51685 | 71830.471 | PAXBK | 12 | 6.192 | 1 | 2.2 | AB |
60384 | 42341.991 | RHNUK | 11 | 6.037 | 1 | 1.1 | AB |
73505 | 30997.171 | DVOGT | 10 | 6.69 | 5 | 2.2 | A |
60528 | 85072.731 | FWWXW | 10 | 5.761 | 1 | 2.2 | A |
23048 | 65401.245 | EVPUX | 13 | 6.474 | 4 | 1.1 | AB |
74748 | 66969.774 | PEULP | 15 | 6.546 | 3 | 2.2 | AB |
88763 | 34749.184 | VOAUO | 10 | 6.402 | 4 | 2.2 | AB |
77351 | 44566.163 | JOBQF | 13 | 5.683 | 1 | 2.2 | AB |
50820 | 73002.154 | EACZT | 15 | 5.711 | 1 | 1.1 | AB |
53037 | 89225.572 | YTLBI | 13 | 6.328 | 1 | 2.2 | AB |