Source code for vivarium.framework.population.population_view

"""
===================
The Population View
===================

The :class:`PopulationView` is a user-facing abstraction that manages read and write access
to the underlying simulation :term:`State Table`. It has two primary responsibilities:

    1. To provide user access to subsets of the simulation state table
       when it is safe to do so.
    2. To allow the user to update the simulation state in a controlled way.

"""
from typing import TYPE_CHECKING, List, Tuple, Union

import pandas as pd

from vivarium.framework.population.exceptions import PopulationError

if TYPE_CHECKING:
    # Cyclic import
    from vivarium.framework.population.manager import PopulationManager


class PopulationView:
    """A read/write manager for the simulation state table.

    It can be used to both read and update the state of the population. A
    PopulationView can only read and write columns for which it is
    configured. Attempts to update columns that do not yet exist in the state
    table raise a :class:`PopulationError`, except during initial population
    creation when new columns are allowed to be created.

    Parameters
    ----------
    manager
        The population manager for the simulation.
    view_id
        An identifier for this view. It is used to build the view's
        :attr:`name`.
    columns
        The set of columns this view should have access to. If empty, this
        view will have access to the entire state table.
    query
        A :mod:`pandas`-style filter that will be applied any time this
        view is read from.

    Notes
    -----
    By default, this view will filter out ``untracked`` simulants unless the
    ``tracked`` column is specified in the initialization arguments.
    NOTE(review): that filtering is not implemented in this class; presumably
    the population manager folds it into ``query`` when it creates the view.
    Confirm against the manager.

    """

    def __init__(
        self,
        manager: "PopulationManager",
        view_id: int,
        columns: Union[List[str], Tuple[str]] = (),
        query: Union[str, None] = None,
    ):
        self._manager = manager
        self._id = view_id
        # Always store a private list so later changes to the caller's
        # sequence cannot alter what this view manages.
        self._columns = list(columns)
        self._query = query

    @property
    def name(self) -> str:
        """The name of this view, built from the id it was created with."""
        return f"population_view_{self._id}"

    @property
    def columns(self) -> List[str]:
        """The columns that the view can read and update.

        If the view was created with an empty ``columns`` argument, then the
        view will have access to the full table by default. That case should
        only be used in situations where the full state table is actually
        needed, like for some metrics collection applications.

        A new list is returned on every access, so mutating the result does
        not change the view.

        """
        if not self._columns:
            return list(self._manager.get_population(True).columns)
        return list(self._columns)

    @property
    def query(self) -> Union[str, None]:
        """A :mod:`pandas` style query to filter the rows of this view.

        This query will be applied any time the view is read. This query may
        reference columns not in the view's columns.

        """
        return self._query

    def subview(self, columns: Union[List[str], Tuple[str]]) -> "PopulationView":
        """Retrieves a new view with a subset of this view's columns.

        Parameters
        ----------
        columns
            The set of columns to provide access to in the subview. Must be
            a non-empty subset of this view's columns (requesting all of this
            view's columns is allowed).

        Returns
        -------
        PopulationView
            A new view with access to the requested columns. It carries the
            same query as this view.

        Raises
        ------
        PopulationError
            If the requested columns are not a subset of this view's columns
            or no columns are requested.

        Notes
        -----
        Subviews are useful during population initialization. The original
        view may contain both columns that a component needs to create and
        update as well as columns that the component needs to read. By
        requesting a subview, a component can read the sections it needs
        without running the risk of trying to access uncreated columns
        because the component itself has not created them.

        """
        # Resolve the property once: for a full-table view every access
        # goes back to the manager for the population.
        available_columns = self.columns
        if not columns or set(columns) - set(available_columns):
            raise PopulationError(
                f"Invalid subview requested. Requested columns must be a non-empty "
                f"subset of this view's columns. Requested columns: {columns}, "
                f"Available columns: {available_columns}"
            )
        # Skip constraints for requesting subviews.
        return self._manager._get_view(columns, self.query)

    def get(self, index: pd.Index, query: str = "") -> pd.DataFrame:
        """Select the rows represented by the given index from this view.

        For the rows in ``index`` get the columns from the simulation's
        state table to which this view has access. The resulting rows may be
        further filtered by the view's query and only return a subset
        of the population represented by the index.

        Parameters
        ----------
        index
            Index of the population to get.
        query
            Additional conditions used to filter the index. These conditions
            are applied on top of the default query of this view, so a row
            must satisfy both to be returned. The query provided may use
            columns that this view does not have access to.

        Returns
        -------
        pandas.DataFrame
            A table with the subset of the population requested.

        Raises
        ------
        PopulationError
            If this view has access to columns that have not yet been created
            and this method is called. If you see this error, you should
            request a subview with the columns you need read access to.

        See Also
        --------
        :meth:`subview <PopulationView.subview>`

        """
        pop = self._manager.get_population(True).loc[index]

        # Both filters are skipped entirely when no rows were requested.
        if not index.empty:
            if self._query:
                pop = pop.query(self._query)
            if query:
                pop = pop.query(query)

        view_columns = self.columns
        non_existent_columns = set(view_columns) - set(pop.columns)
        if non_existent_columns:
            raise PopulationError(
                f"Requested column(s) {non_existent_columns} not in population table. "
                "This is likely due to a failure to require some columns, randomness "
                "streams, or pipelines when registering a simulant initializer, a value "
                "producer, or a value modifier. NOTE: It is possible for a run to "
                "succeed even if resource requirements were not properly specified in "
                "the simulant initializers or pipeline creation/modification calls. This "
                "success depends on component initialization order which may change in "
                "different run settings."
            )
        return pop.loc[:, view_columns]

    def update(self, population_update: Union[pd.DataFrame, pd.Series]) -> None:
        """Updates the state table with the provided data.

        Parameters
        ----------
        population_update
            The data which should be copied into the simulation's state. If
            the update is a :class:`pandas.DataFrame`, it can contain a subset
            of the view's columns but no extra columns. If the update is a
            :class:`pandas.Series` it must have a name that matches one of
            this view's columns unless the view only has one column in which
            case an unnamed Series will be assumed to refer to that column.

        Raises
        ------
        PopulationError
            If the provided data name or columns do not match columns that
            this view manages or if the view is being updated with a data
            type inconsistent with the original population data.

        """
        state_table = self._manager.get_population(True)
        population_update = self._format_update_and_check_preconditions(
            population_update,
            state_table,
            self.columns,
            self._manager.creating_initial_population,
            self._manager.adding_simulants,
        )
        if self._manager.creating_initial_population:
            # Initial creation only ever adds brand new columns. Columns that
            # already exist were verified to carry identical data by the
            # precondition checks, so they are not rewritten.
            new_columns = list(set(population_update).difference(state_table))
            self._manager._population[new_columns] = population_update[new_columns]
        elif not population_update.empty:
            update_columns = list(set(population_update).intersection(state_table))
            for column in update_columns:
                column_update = self._update_column_and_ensure_dtype(
                    population_update[column],
                    state_table[column],
                    self._manager.adding_simulants,
                )
                self._manager._population.loc[:, column] = column_update

    def __repr__(self):
        return (
            f"PopulationView(_id={self._id}, _columns={self.columns}, _query={self._query})"
        )

    ##################
    # Helper methods #
    ##################

    @staticmethod
    def _format_update_and_check_preconditions(
        population_update: Union[pd.Series, pd.DataFrame],
        state_table: pd.DataFrame,
        view_columns: List[str],
        creating_initial_population: bool,
        adding_simulants: bool,
    ) -> pd.DataFrame:
        """Standardizes the population update format and checks preconditions.

        Managing how values get written to the underlying population state
        table is critical to rule out several categories of error in client
        simulation code. The state table is modified at three different times.
        In the first, the initial population table is being created and new
        columns are being added to the state table with their initial values.
        In the second, the population manager has added new rows with
        appropriate null values to the state table in response to population
        creation dictated by client code, and population updates are being
        provided to fill in initial values for those new rows. In the final
        case, state table values for existing simulants are being overridden
        as part of a time step.

        All of these modification scenarios require that certain preconditions
        are met. For all scenarios, we require

            1. The update is a DataFrame or a Series.
            2. If it is a series, it is nameless and this view manages a
               single column or it is named and its name matches a column
               in this PopulationView.
            3. The update matches at least one column in this PopulationView.
            4. The update columns are a subset of the columns managed by this
               PopulationView.
            5. The update index is a subset of the existing state table index.
               PopulationViews don't make rows, they just fill them in.

        For initial population creation additional preconditions are
        documented in :meth:`PopulationView._ensure_coherent_initialization`.
        Outside population initialization, we require that all columns in the
        update to be present in the existing state table. When new simulants
        are added in the middle of the simulation, we require that only one
        component provide updates to a column.

        Parameters
        ----------
        population_update
            The update to the simulation state table.
        state_table
            The existing simulation state table.
        view_columns
            The columns managed by this PopulationView.
        creating_initial_population
            Whether the initial population is being created.
        adding_simulants
            Whether new simulants are currently being initialized.

        Returns
        -------
        pandas.DataFrame
            The input data formatted as a DataFrame.

        Raises
        ------
        TypeError
            If the population update is not a :class:`pandas.Series` or a
            :class:`pandas.DataFrame`.
        PopulationError
            If the update violates any preconditions relevant to the context
            in which the update is provided (initial population creation,
            population creation on time steps, or population state changes
            on time steps).

        """
        # Internal invariant: initial population creation is itself a phase
        # in which simulants are being added.
        assert not creating_initial_population or adding_simulants

        population_update = PopulationView._coerce_to_dataframe(
            population_update,
            view_columns,
        )

        unknown_simulants = len(population_update.index.difference(state_table.index))
        if unknown_simulants:
            raise PopulationError(
                "Population updates must have an index that is a subset of the current "
                f"population state table. {unknown_simulants} simulants were provided "
                f"in an update with no matching index in the existing table."
            )

        if creating_initial_population:
            PopulationView._ensure_coherent_initialization(population_update, state_table)
        else:
            new_columns = list(set(population_update).difference(state_table))
            if new_columns:
                raise PopulationError(
                    f"Attempting to add new columns {new_columns} to the state table "
                    f"outside the initial population creation phase."
                )

            if adding_simulants:
                state_table_new_simulants = state_table.loc[population_update.index, :]
                # A column conflicts when the rows being filled already hold
                # some non-null value and the update is not an exact repeat
                # of what is stored there.
                conflicting_columns = [
                    column
                    for column in population_update
                    if state_table_new_simulants[column].notnull().any()
                    and not population_update[column].equals(
                        state_table_new_simulants[column]
                    )
                ]
                if conflicting_columns:
                    raise PopulationError(
                        "Two components are providing conflicting initialization data "
                        f"for the state table columns: {conflicting_columns}."
                    )

        return population_update

    @staticmethod
    def _coerce_to_dataframe(
        population_update: Union[pd.Series, pd.DataFrame],
        view_columns: List[str],
    ) -> pd.DataFrame:
        """Coerce all population updates to a :class:`pandas.DataFrame` format.

        The caller's object is never modified: an unnamed Series is given its
        column name on the resulting frame only, not on the Series passed in.

        Parameters
        ----------
        population_update
            The update to the simulation state table.
        view_columns
            The columns managed by this PopulationView.

        Returns
        -------
        pandas.DataFrame
            The input data formatted as a DataFrame. A DataFrame input is
            returned as is.

        Raises
        ------
        TypeError
            If the population update is not a :class:`pandas.Series` or a
            :class:`pandas.DataFrame`.
        PopulationError
            If the input data is an unnamed :class:`pandas.Series` and this
            :class:`PopulationView` manages multiple columns, if the
            population update contains columns not managed by this view, or
            if the update has no columns at all.

        """
        if not isinstance(population_update, (pd.Series, pd.DataFrame)):
            raise TypeError(
                "The population update must be a pandas Series or DataFrame. "
                f"A {type(population_update)} was provided."
            )

        if isinstance(population_update, pd.Series):
            column_name = population_update.name
            if column_name is None:
                if len(view_columns) != 1:
                    raise PopulationError(
                        "Cannot update with an unnamed pandas series unless there "
                        "is only a single column in the view."
                    )
                # An unnamed series can only refer to the view's one column.
                column_name = view_columns[0]
            # Name the column on the new frame instead of assigning to
            # ``population_update.name``, which would leak a side effect
            # into the caller's Series.
            population_update = population_update.to_frame(name=column_name)

        extra_columns = set(population_update.columns).difference(view_columns)
        if extra_columns:
            raise PopulationError(
                f"Cannot update with a DataFrame or Series that contains columns "
                f"the view does not. Dataframe contains the following extra columns: "
                f"{extra_columns}."
            )

        if not list(population_update):
            raise PopulationError(
                "The update method of population view is being called "
                "on a DataFrame with no columns."
            )

        return population_update

    @staticmethod
    def _ensure_coherent_initialization(
        population_update: pd.DataFrame, state_table: pd.DataFrame
    ) -> None:
        """Ensure that overlapping population updates have the same information.

        During population initialization, each state table column should be
        updated by exactly one component and each component with an
        initializer should create at least one column. Sometimes components
        are a little sloppy and provide duplicate column information, which
        we should continue to allow. We want to ensure that a column is only
        getting one set of unique values though.

        Parameters
        ----------
        population_update
            The update to the simulation state table.
        state_table
            The existing simulation state table. When this method is called,
            the table should be in a partially complete state. That is the
            provided population update should carry some new attributes we
            need to assign.

        Raises
        ------
        PopulationError
            If the update does not cover every simulant in the state table,
            if it contains no new information, or if it contains information
            in conflict with the existing state table.

        """
        missing_pops = len(state_table.index.difference(population_update.index))
        if missing_pops:
            raise PopulationError(
                f"Components should initialize the same population at the simulation start. "
                f"A component is missing updates for {missing_pops} simulants."
            )

        new_columns = set(population_update).difference(state_table)
        overlapping_columns = set(population_update).intersection(state_table)
        if not new_columns:
            raise PopulationError(
                f"A component is providing a population update for {list(population_update)} "
                "but all provided columns are initialized by other components."
            )

        # Duplicated columns are tolerated only when they repeat exactly
        # what is already stored.
        for column in overlapping_columns:
            if not population_update[column].equals(state_table[column]):
                raise PopulationError(
                    "Two components are providing conflicting initialization data for the "
                    f"{column} state table column."
                )

    @staticmethod
    def _update_column_and_ensure_dtype(
        update: pd.Series,
        existing: pd.Series,
        adding_simulants: bool,
    ) -> pd.Series:
        """Build the updated state table column with an appropriate dtype.

        Neither input is modified; the result is assembled from copies of
        their values.

        Parameters
        ----------
        update
            The new column values for a subset of the existing index.
        existing
            The existing column values for all simulants in the state table.
        adding_simulants
            Whether new simulants are currently being initialized.

        Returns
        -------
        pd.Series
            The column with the provided update applied, indexed and named
            like ``existing`` and cast to the dtype of ``update``.

        Raises
        ------
        PopulationError
            If the dtypes of ``existing`` and ``update`` differ while no
            simulants are being added.

        """
        # FIXME: This code does not work as described. I'm leaving it here because writing
        #  real dtype checking code is a pain and we never seem to hit the actual edge cases.
        #  I've also seen this error, though I don't have a reproducible and useful example.
        #  I'm reasonably sure what's really being accounted for here is non-nullable columns
        #  that temporarily have null values introduced in the space between rows being
        #  added to the state table and initializers filling them with their first values.
        #  That means the space of dtype casting issues is actually quite small. What should
        #  actually happen in the long term is to separate the population creation entirely
        #  from the mutation of existing state. I.e. there's not an actual reason we need
        #  to do all these sequential operations on a single underlying dataframe during
        #  the creation of new simulants besides the fact that it's the existing
        #  implementation.
        update_values = update.values.copy()
        new_state_table_values = existing.values.copy()
        # Translate the update's index labels into positions within the
        # existing column. Labels missing from ``existing`` come back as -1,
        # which would address the last row, so the update index is expected
        # to have been validated as a subset of the state table beforehand.
        update_index_positional = existing.index.get_indexer(update.index)

        new_state_table_values[update_index_positional] = update_values

        unmatched_dtypes = new_state_table_values.dtype != update_values.dtype
        if unmatched_dtypes and not adding_simulants:
            # This happens when the population is being grown because extending
            # the index forces columns that don't have a natural null type
            # to become 'object'
            raise PopulationError(
                "A component is corrupting the population table by modifying the dtype of "
                f"the {update.name} column from {existing.dtype} to {update.dtype}."
            )
        new_state_table_values = new_state_table_values.astype(update_values.dtype)

        return pd.Series(new_state_table_values, index=existing.index, name=existing.name)