"""
==================
Randomness Streams
==================
This module provides a wrapper around numpy's randomness system with the intent of coupling
it to vivarium's tools for Common Random Number genereration.
Attributes
----------
RESIDUAL_CHOICE : object
A probability placeholder to be used in an un-normalized array of weights
to absorb leftover weight so that the array sums to unity.
For example::
[0.2, 0.2, RESIDUAL_CHOICE] => [0.2, 0.2, 0.6]
Note
----
Currently this object is only used in the `choice` function of this
module.
"""
import hashlib
from typing import Any, Callable, List, Tuple, Union
import numpy as np
import pandas as pd
from scipy import stats
from vivarium.framework.randomness.exceptions import RandomnessError
from vivarium.framework.randomness.index_map import IndexMap
from vivarium.framework.utilities import rate_to_probability
RESIDUAL_CHOICE = object()
[docs]
def get_hash(key: str) -> int:
"""Gets a hash of the provided key.
Parameters
----------
key :
A string used to create a seed for the random number generator.
Returns
-------
int
A hash of the provided key.
"""
max_allowable_numpy_seed = 4294967295 # 2**32 - 1
return int(hashlib.sha1(key.encode("utf8")).hexdigest(), 16) % max_allowable_numpy_seed
[docs]
class RandomnessStream:
"""A stream for producing common random numbers.
`RandomnessStream` objects provide an interface to Vivarium's
common random number generation. They provide a number of methods
for doing common simulation tasks that require random numbers like
making decisions among a number of choices.
Attributes
----------
key
The name of the randomness stream.
clock
A way to get the current simulation time.
seed
An extra number used to seed the random number generation.
Notes
-----
Should not be constructed by client code.
Simulation components get `RandomnessStream` objects by requesting
them from the builder provided to them during the setup phase.
I.E.::
class VivariumComponent:
def setup(self, builder):
self.randomness_stream = builder.randomness.get_stream('stream_name')
"""
def __init__(
self,
key: str,
clock: Callable,
seed: Any,
index_map: IndexMap,
initializes_crn_attributes: bool = False,
):
self.key = key
self.clock = clock
self.seed = seed
self.index_map = index_map
self.initializes_crn_attributes = initializes_crn_attributes
@property
def name(self):
return f"randomness_stream_{self.key}"
def _key(self, additional_key: Any = None) -> str:
"""Construct a hashable key from this object's state.
Parameters
----------
additional_key
Any additional information used to seed random number generation.
Returns
-------
str
A key to seed random number generation.
"""
return "_".join([self.key, str(self.clock()), str(additional_key), str(self.seed)])
[docs]
def get_draw(self, index: pd.Index, additional_key: Any = None) -> pd.Series:
"""Get an indexed set of numbers uniformly drawn from the unit interval.
Parameters
----------
index
An index whose length is the number of random draws made
and which indexes the returned `pandas.Series`.
additional_key
Any additional information used to seed random number generation.
Returns
-------
pandas.Series
A series of random numbers indexed by the provided `pandas.Index`.
Note
----
This is the core of the CRN implementation, allowing for consistent use of random
numbers across simulations with multiple scenarios.
See Also
--------
https://en.wikipedia.org/wiki/Variance_reduction and
"Untangling Uncertainty with Common Random Numbers:
A Simulation Study; A.Flaxman, et. al., Summersim 2017"
"""
# Return a structured null value if an empty index is passed
if index.empty:
return pd.Series(index=index, dtype=float)
# Initialize a random_state with a seed based on the simulation clock, the
# decision_point this stream represents, and any additional user-supplied information.
# This is one pre-condition to reproducibility.
seed = get_hash(self._key(additional_key))
random_state = np.random.RandomState(seed=seed)
# Second, we need to sample a very large chunk of random numbers. The size of the
# index map is set at the simulation start and is at least 10x the size of the
# initial population. Which means this is a consistently sampled block of uniformly
# distributed random numbers, irrespective of the size of the simulation population,
# which is important if there are scenarios that result in different population sizes
# through time.
sample_size = len(self.index_map)
raw_draws = random_state.random_sample(sample_size)
if self.initializes_crn_attributes:
# If we're initializing CRN attributes (i.e. attributes used to identify a
# simulant across multiple simulations), we can't use the index map yet since
# these people aren't registered in the index map yet. Instead, we use the draws
# from our random sample in order. This couples the initialization of CRN
# attributes to the time step on which they are initialized, meaning interventions
# that alter the entrance time of a simulant will break the CRN guarantees.
draws = pd.Series(raw_draws[: len(index)], index=index)
else:
# If we're not initializing CRN attributes, we can use the index map to get the
# correct draws for each simulant. This allows us to use the same CRN attributes
# across multiple simulations, even if the population size changes.
draw_index = self.index_map[index]
draws = pd.Series(raw_draws[draw_index], index=index)
return draws
[docs]
def filter_for_rate(
self,
population: Union[pd.DataFrame, pd.Series, pd.Index],
rate: Union[float, List, Tuple, np.ndarray, pd.Series],
additional_key: Any = None,
) -> Union[pd.DataFrame, pd.Series, pd.Index]:
"""Decide an event outcome for each individual from rates.
Given a population or its index and an array of associated rates for
some event to happen, we create and return the subpopulation for whom
the event occurred.
Parameters
----------
population
A view on the simulants for which we are determining the
outcome of an event.
rate
A scalar float value or a 1d list of rates of the event under
consideration occurring which corresponds (i.e.
`len(population) == len(probability))` to the population view passed
in. The rates must be scaled to the simulation time-step size either
manually or as a post-processing step in a rate pipeline. If a
scalar is provided, it is applied to every row in the population.
additional_key
Any additional information used to create the seed.
Returns
-------
pandas.core.generic.PandasObject
The subpopulation of the simulants for whom the event occurred.
The return type will be the same as type(population)
"""
return self.filter_for_probability(
population, rate_to_probability(rate), additional_key
)
[docs]
def filter_for_probability(
self,
population: Union[pd.DataFrame, pd.Series, pd.Index],
probability: Union[float, List, Tuple, np.ndarray, pd.Series],
additional_key: Any = None,
) -> Union[pd.DataFrame, pd.Series, pd.Index]:
"""Decide an outcome for each individual from probabilities.
Given a population or its index and an array of associated probabilities
for some event to happen, we create and return the subpopulation for
whom the event occurred.
Parameters
----------
population
A view on the simulants for which we are determining the
outcome of an event.
probability
A scalar float value or a 1d list of probabilities of the event
under consideration occurring which corresponds (i.e.
`len(population) == len(probability)` to the population view
passed in. If a scalar is provided, it is applied to every row in
the population.
additional_key
Any additional information used to create the seed.
Returns
-------
pandas.core.generic.PandasObject
The subpopulation of the simulants for whom the event occurred.
The return type will be the same as type(population)
"""
if population.empty:
return population
index = population if isinstance(population, pd.Index) else population.index
draws = self.get_draw(index, additional_key)
mask = np.array(draws < probability)
return population[mask]
[docs]
def choice(
self,
index: pd.Index,
choices: Union[List, Tuple, np.ndarray, pd.Series],
p: Union[List, Tuple, np.ndarray, pd.Series] = None,
additional_key: Any = None,
) -> pd.Series:
"""Decides between a weighted or unweighted set of choices.
Given a set of choices with or without corresponding weights,
returns an indexed set of decisions from those choices. This is
simply a vectorized way to make decisions with some book-keeping.
Parameters
----------
index
An index whose length is the number of random draws made
and which indexes the returned `pandas.Series`.
choices
A set of options to choose from.
p
The relative weights of the choices. Can be either a 1-d array of
the same length as `choices` or a 2-d array with `len(index)` rows
and `len(choices)` columns. In the 1-d case, the same set of
weights are used to decide among the choices for every item in
the `index`. In the 2-d case, each row in `p` contains a separate
set of weights for every item in the `index`.
additional_key
Any additional information used to seed random number generation.
Returns
-------
pandas.Series
An indexed set of decisions from among the available `choices`.
Raises
------
RandomnessError
If any row in `p` contains `RESIDUAL_CHOICE` and the remaining
weights in the row are not normalized or any row of `p contains
more than one reference to `RESIDUAL_CHOICE`.
"""
draws = self.get_draw(index, additional_key)
return _choice(draws, choices, p)
[docs]
def sample_from_distribution(
self,
index: pd.Index,
distribution: stats.rv_continuous = None,
ppf: Callable[[pd.Series, ...], pd.Series] = None,
additional_key: Any = None,
**distribution_kwargs: Any,
) -> pd.Series:
"""
Given a distribution, returns an indexed set of samples from that
distribution.
Parameters
----------
index
An index whose length is the number of random draws made
and which indexes the returned `pandas.Series`.
distribution
A scipy.stats distribution object.
ppf
A function that takes a series of draws and returns a series of samples.
additional_key
Any additional information used to seed random number generation.
distribution_kwargs
Additional keyword arguments to pass to the distribution's ppf function.
"""
if ppf is None and distribution is None:
raise ValueError("Either distribution or ppf must be provided")
if ppf is not None and distribution is not None:
raise ValueError("Only one of distribution or ppf can be provided")
if distribution is not None:
ppf = distribution.ppf
draws = self.get_draw(index, additional_key)
samples = pd.Series(ppf(draws, **distribution_kwargs), index=index)
return samples
def __repr__(self) -> str:
return "RandomnessStream(key={!r}, clock={!r}, seed={!r})".format(
self.key, self.clock(), self.seed
)
def _choice(
draws: pd.Series,
choices: Union[List, Tuple, np.ndarray, pd.Series],
p: Union[List, Tuple, np.ndarray, pd.Series] = None,
) -> pd.Series:
"""Decides between a weighted or unweighted set of choices.
Given a set of choices with or without corresponding weights,
returns an indexed set of decisions from those choices. This is
simply a vectorized way to make decisions with some book-keeping.
Parameters
----------
draws
A uniformly distributed random number for every simulant to make
a choice for.
choices
A set of options to choose from. Choices must be the same for every
simulant.
p
The relative weights of the choices. Can be either a 1-d array of
the same length as `choices` or a 2-d array with `len(draws)` rows
and `len(choices)` columns. In the 1-d case, the same set of weights
are used to decide among the choices for every item in the `index`.
In the 2-d case, each row in `p` contains a separate set of weights
for every item in the `index`.
Returns
-------
pandas.Series
An indexed set of decisions from among the available `choices`.
Raises
------
RandomnessError
If any row in `p` contains `RESIDUAL_CHOICE` and the remaining
weights in the row are not normalized or any row of `p` contains
more than one reference to `RESIDUAL_CHOICE`.
"""
# Convert p to normalized probabilities broadcasted over index.
p = (
_set_residual_probability(_normalize_shape(p, draws.index))
if p is not None
else np.ones((len(draws.index), len(choices)))
)
p = p / p.sum(axis=1, keepdims=True)
p_bins = np.cumsum(p, axis=1)
# Use the random draw to make a choice for every row in index.
choice_index = (draws.values[np.newaxis].T > p_bins).sum(axis=1)
return pd.Series(np.array(choices)[choice_index], index=draws.index)
def _normalize_shape(
p: Union[List, Tuple, np.ndarray, pd.Series],
index: Union[pd.Index, pd.MultiIndex],
) -> np.ndarray:
p = np.array(p)
# We got a 1-d array => same weights for every index.
if len(p.shape) == 1:
# Turn 1-d array into 2-d array with same weights in every row.
p = np.array(np.broadcast_to(p, (len(index), p.shape[0])))
return p
def _set_residual_probability(p: np.ndarray) -> np.ndarray:
"""Turns any use of `RESIDUAL_CHOICE` into a residual probability.
Parameters
----------
p
Array where each row is a set of probability weights and potentially
a `RESIDUAL_CHOICE` placeholder.
Returns
-------
numpy.ndarray
Array where each row is a set of normalized probability weights.
"""
residual_mask = p == RESIDUAL_CHOICE
if residual_mask.any(): # I.E. if we have any placeholders.
if np.any(np.sum(residual_mask, axis=1) - 1):
msg = (
"More than one residual choice supplied for a single "
f"set of weights. Weights: {p}."
)
raise RandomnessError(msg)
p[residual_mask] = 0
residual_p = 1 - np.sum(p, axis=1) # Probabilities sum to 1.
if np.any(residual_p < 0): # We got un-normalized probability weights.
msg = (
"Residual choice supplied with weights that summed to more than 1. "
f"Weights: {p}."
)
raise RandomnessError(msg)
p[residual_mask] = residual_p
return p