import random
import string
import sys
import typing as t
from dataclasses import dataclass
from datetime import date
import pandas as pd
@dataclass
class PropertyInt:
"""
Integer property definition with range constraints.
Used to define integer columns in entity schemas, providing
valid value ranges for query generation.
Attributes:
min: Minimum allowed value (inclusive)
max: Maximum allowed value (inclusive)
type: Always "int", used for schema parsing
"""
min: int
max: int
type: str = 'int'
@dataclass
class PropertyFloat:
"""
Floating point property definition with range constraints.
Used to define decimal/float columns in entity schemas, providing
valid value ranges for query generation.
Attributes:
min: Minimum allowed value (inclusive)
max: Maximum allowed value (inclusive)
type: Always "float", used for schema parsing
"""
min: float
max: float
type: str = 'float'
@dataclass
class PropertyEnum:
"""
Enumeration property definition with allowed values.
Used to define columns with a fixed set of possible values in entity schemas,
such as status fields or categories.
Attributes:
values: List of valid string values for this property
type: Always "enum", used for schema parsing
Example:
status = PropertyEnum(values=["active", "inactive", "pending"])
"""
values: t.List[str]
type: str = 'enum'
@dataclass
class PropertyString:
"""
String property definition with character constraints.
Used to define text columns in entity schemas, with optional constraints
on what characters strings can start with. This allows generating realistic
looking text data.
Attributes:
starting_character: List of valid first characters for generated strings
type: Always "string", used for schema parsing
Example:
# Names starting with uppercase letters
name = PropertyString(starting_character=list(string.ascii_uppercase))
"""
starting_character: t.List[str]
type: str = 'string'
@dataclass
class PropertyDate:
"""
Date property definition with range constraints.
Used to define date columns in entity schemas, providing valid
date ranges for query generation. Generates dates between the
min and max values inclusive.
Attributes:
min: Earliest allowed date
max: Latest allowed date
type: Always "date", used for schema parsing
Example:
# Dates in the year 2023
created_at = PropertyDate(
min=date(2023, 1, 1),
max=date(2023, 12, 31)
)
"""
min: date
max: date
type: str = 'date'
# Any one of the property definitions an entity schema can declare.
Property = t.Union[
    PropertyInt,
    PropertyFloat,
    PropertyEnum,
    PropertyString,
    PropertyDate,
]
@dataclass(frozen=True)
class Entity:
"""
Represents entity information parsed from schema files.
This class is used to generate well-formed and meaningful queries based on
entity information and a high-level structure describing how queries should
generally look (see `QueryStructure`).
Attributes:
name (str): The name of the entity.
primary_key (str | t.List[str] | None): The primary key(s) of the entity.
properties (t.Dict[str, Property]): A dictionary of property names to their definitions.
foreign_keys (t.Dict[str, t.List[str]]): A dictionary of foreign key relationships.
"""
name: str
primary_key: str | t.List[str] | None
properties: t.Dict[str, Property]
foreign_keys: t.Dict[str, t.List[str]]
[docs]
def __hash__(self) -> int:
"""
Generate a hash based on the entity's name.
Since entity names must be unique within a schema, using the name
as the hash basis ensures proper hash table behavior.
Returns:
int: Hash value for the entity.
"""
return hash(self.name)
[docs]
def __eq__(self, other: object) -> bool:
"""
Compare this entity with another for equality.
Entities are considered equal if they have the same name,
as names must be unique within a schema.
Args:
other: The object to compare with.
Returns:
bool: True if the objects are equal, False otherwise.
"""
if not isinstance(other, Entity):
return NotImplemented
return self.name == other.name
[docs]
@staticmethod
def from_configuration(name: str, config: t.Dict) -> 'Entity':
"""
Create an Entity instance from a configuration dictionary.
Args:
config (t.Dict): A dictionary containing entity configuration.
Returns:
Entity: An instance of the Entity class.
Raises:
ValueError: If an unknown property type is encountered.
"""
properties = {}
for prop_name, data in config.get('properties', {}).items():
prop_type = data['type']
if prop_type == 'int':
properties[prop_name] = PropertyInt(
min=data.get('min', -sys.maxsize), max=data.get('max', sys.maxsize)
)
elif prop_type == 'float':
properties[prop_name] = PropertyFloat(
min=data.get('min', -1e308), max=data.get('max', 1e308)
)
elif prop_type == 'enum':
if 'values' not in data:
raise ValueError(f'Enum property {prop_name} must specify values')
properties[prop_name] = PropertyEnum(values=data['values'])
elif prop_type == 'string':
properties[prop_name] = PropertyString(
starting_character=data.get('starting_character', list(string.ascii_letters))
)
elif prop_type == 'date':
properties[prop_name] = PropertyDate(
min=date.fromisoformat(data.get('min', '1970-01-01')),
max=date.fromisoformat(data.get('max', '2038-01-19')),
)
else:
raise ValueError(f'Unknown property type: {prop_type}')
return Entity(
name=name,
primary_key=config.get('primary_key', None),
properties=properties,
foreign_keys=config.get('foreign_keys', {}),
)
@property
def has_unique_primary_key(self) -> bool:
"""Check if the entity has a single, unique primary key."""
return isinstance(self.primary_key, str)
@property
def data_ranges(self) -> t.Dict[str, t.Tuple[int, int] | t.List[str]]:
"""
Get the data ranges for all properties of the entity.
Returns:
A dictionary mapping property names to their respective ranges or possible values.
"""
ranges = {}
for name, property in self.properties.items():
match property:
case PropertyInt(min, max) | PropertyFloat(min, max):
ranges[name] = (min, max)
case PropertyString(starting_character):
ranges[name] = (starting_character,)
case PropertyEnum(values):
ranges[name] = values
case PropertyDate(min, max):
ranges[name] = (min.isoformat(), max.isoformat())
return ranges
[docs]
def generate_dataframe(self, num_rows=1000) -> pd.DataFrame:
"""
Generate a Pandas dataframe using this entity's information.
Args:
num_rows (int): The number of rows to generate. Default is 1000.
Returns:
pd.DataFrame:
A dataframe populated with randomly generated data based on the entity's properties.
Note:
If the entity has a unique primary key of type int, the number of rows may be limited
to the range of possible values for that key.
"""
rows = []
if self.has_unique_primary_key:
assert isinstance(self.primary_key, str)
primary_key_property = self.properties[self.primary_key]
if isinstance(primary_key_property, PropertyInt):
constraint = primary_key_property.max - primary_key_property.min + 1
num_rows = min(constraint, num_rows)
for i in range(num_rows):
row = {}
for name, property in self.properties.items():
match property:
case PropertyInt(minimum, maximum):
if (
self.has_unique_primary_key
and name == self.primary_key
and num_rows == (maximum - minimum + 1)
):
row[name] = i + minimum
else:
if maximum - minimum > 1e6:
row[name] = random.randint(-1000000, 1000000)
else:
row[name] = random.randint(minimum, maximum)
case PropertyFloat(minimum, maximum):
if maximum - minimum > 1e6:
row[name] = round(random.uniform(-1000000, 1000000), 2)
else:
row[name] = round(random.uniform(minimum, maximum), 2)
case PropertyString(starting_character):
starting_char = random.choice(starting_character)
random_string = ''.join(random.choices(string.ascii_letters, k=9))
row[name] = starting_char + random_string
case PropertyEnum(values):
row[name] = random.choice(values)
case PropertyDate(minimum, maximum):
row[name] = pd.to_datetime(
random.choice(pd.date_range(pd.to_datetime(minimum), pd.to_datetime(maximum)))
).strftime('%Y-%m-%d')
rows.append(row)
return pd.DataFrame(rows)