Source code for sampling

import os
import pandas as pd
import numpy as np
import warnings
import copy
from itertools import chain

from fdsim.incidentfitting import (
    get_prio_probabilities_per_type,
    get_vehicle_requirements_probabilities,
    get_spatial_distribution_per_type,
    get_building_function_probabilities,
    get_overall_building_dist,
    get_big_incident_data,
    get_big_incident_arrival_dist,
    get_big_incident_type_dist,
    prepare_incidents_for_spatial_analysis
)

from fdsim.predictors import ProphetIncidentPredictor, BasicLambdaForecaster

from fdsim.responsetimefitting import (
    prepare_data_for_response_time_analysis,
    get_osrm_distance_and_duration,
    add_osrm_distance_and_duration,
    get_coordinates_locations_stations,
    model_travel_time_per_vehicle,
    fit_dispatch_times,
    fit_turnout_times,
    fit_onscene_times,
    fit_big_incident_duration
)

from fdsim.objects import DemandLocation, IncidentType
from fdsim.helpers import progress


class ResponseTimeSampler():
    """ Class that samples response times for deployed vehicles.

    Parameters
    ----------
    load_data: boolean
        Whether to load data from disk (True) or pre-process from scratch (False).
    data_dir: str
        The path to the directory where data is stored. Used if load_data==True
        or when data is saved after preparation.
    verbose: boolean
        Whether to print progress updates.
    """

    # Number of samples drawn per call to '.rvs()' by the sample generators.
    _SAMPLE_BATCH_SIZE = 10000

    def __init__(self, load_data=True, data_dir="/data",
                 verbose=True):
        """ Initialize variables and, if requested, load prepared data from disk.

        A missing file is not fatal: a warning is issued and the instance is
        created without data, so that 'fit()' has to prepare it.
        """
        self.fitted = False
        self.data = None
        self.file_name = "response_data.csv"
        self.data_dir = data_dir
        self.verbose = verbose

        if load_data:
            try:
                self.data = pd.read_csv(os.path.join(self.data_dir, self.file_name),
                                        dtype={"hub_vak_bk": int}, low_memory=False)
                # location IDs are used as string keys throughout this class
                self.data["hub_vak_bk"] = self.data["hub_vak_bk"].astype(str)
            except FileNotFoundError:
                warnings.warn("No prepared data found, check if 'data_dir' specifies"
                              " the right directory. If you didn't mean to "
                              "load data from disk, initialize with 'load_data=False'."
                              " Instance intialized anyway, without data. 'fit()' method"
                              " now needs OSRM API access to prepare the data. "
                              "Given directory: {}.".format(self.data_dir))

    def fit(self, incidents=None, deployments=None, stations=None, loc_coords=None,
            vehicle_types=["TS", "RV", "HV", "WO"], osrm_host="http://192.168.56.101:5000",
            save_prepared_data=False, location_col="hub_vak_bk",
            volunteer_stations=["DRIEMOND", "DUIVENDRECHT", "AMSTELVEEN VRIJWILLIG"]):
        """ Fit random variables related to response time.

        Parameters
        ----------
        incidents: pd.DataFrame
            The incident data. Only required when no prepared data is loaded.
        deployments: pd.DataFrame (optional)
            The deployment data. Only required when no prepared data is loaded.
        stations: pd.DataFrame (optional)
            The station information including coordinates and station names.
            Only required when no prepared data is loaded.
        loc_coords: dict (optional)
            Coordinates of the demand locations, stored as given in
            self.location_coords. If None, the location coordinates are
            extracted from the data together with the station coordinates.
        vehicle_types: array-like of strings
            The types of vehicles to use. Defaults to ["TS", "RV", "HV", "WO"].
            The default list is never modified by this method.
        osrm_host: str
            The url to the OSRM API, required when object is initialized with
            load_data=False or when no prepared data was found.
        save_prepared_data: boolean
            Whether to write the preprocessed data to a csv file so that it can
            be loaded the next time. Defaults to False.
        location_col: str
            The name of the column that specifies the demand locations, defaults
            to "hub_vak_bk".
        volunteer_stations: array-like of str, optional
            The names of the stations that are run by volunteers. Turn-out times
            are fitted separately for these stations, since volunteers have to travel
            to the station first. Defaults to
            ["DRIEMOND", "DUIVENDRECHT", "AMSTELVEEN VRIJWILLIG"].

        Raises
        ------
        ValueError
            If no prepared data is loaded and not all of incidents, deployments
            and stations are provided.

        Notes
        -----
        Performs the following steps:
            - Prepares data (merges and adds OSRM distance and duration per
              deployment)
            - Fits lognormal random variables to dispatch times per incident type.
            - Fits Gamma random variables to turnout time per station and type.
            - Models the travel time as :math:`\\alpha + \\beta * \\gamma (\\theta, k) * \\hat{t}`,
              per vehicle type. Here :math:`\\hat{t}` represents the OSRM estimate of the
              travel time and :math:`\\gamma` is a random noise factor.
            - Saves the station and demand location coordinates in dictionaries.
        """
        self.location_col = location_col

        if self.data is None:
            if incidents is None or deployments is None or stations is None:
                raise ValueError("No prepared data loaded and not all data fed to 'fit()'.")
            progress("No data loaded, preprocess with OSRM.", verbose=self.verbose)
            self._prep_data_for_fitting(incidents=incidents, deployments=deployments,
                                        stations=stations, vehicles=vehicle_types,
                                        osrm_host=osrm_host, save=save_prepared_data)

        # Use the provided location coordinates when available; only fall back
        # to extracting them from the data when none were passed.
        if loc_coords is not None:
            progress("Location coordinates provided. Extracting station coordinates",
                     verbose=self.verbose)
            self.location_coords = loc_coords
            _, self.station_coords = get_coordinates_locations_stations(
                self.data, location_col=location_col)
        else:
            progress("Extracting station and location coordinates.", verbose=self.verbose)
            self.location_coords, self.station_coords = \
                get_coordinates_locations_stations(self.data, location_col=location_col)

        progress('Fitting random variables on response time...', verbose=self.verbose)
        # dispatch and travel times are fitted on first-arriving, priority 1 deployments
        is_high_prio = self.data["dim_prioriteit_prio"] == 1
        is_first_on_scene = self.data["inzet_terplaatse_volgnummer"] == 1
        self.high_prio_data = self.data[is_high_prio & is_first_on_scene].copy()

        self.dispatch_rv_dict = fit_dispatch_times(self.high_prio_data)
        self.turnout_time_rv_dict = fit_turnout_times(self.data, vehicle_types=vehicle_types,
                                                      volunteer_stations=volunteer_stations)
        self.travel_time_dict = model_travel_time_per_vehicle(self.high_prio_data)
        self.onscene_time_rv_dict = fit_onscene_times(self.data)

        progress("Creating response time generators.", verbose=self.verbose)
        self._create_response_time_generators()

        progress("Response time variables fitted.", verbose=self.verbose)
        self.fitted = True

    def _prep_data_for_fitting(self, incidents, deployments, stations,
                               vehicles, osrm_host, save):
        """Perform basic preprocessing and calculate OSRM estimates for travel time.

        Prepared data is stored under self.data. Nothing is returned.

        Parameters
        ----------
        incidents: pd.DataFrame
            The incident data.
        deployments: pd.DataFrame
            The deployment data.
        stations: pd.DataFrame
            The station information including coordinates and station names.
        vehicles: array-like of strings
            The types of vehicles to use.
        osrm_host: str
            The url to the OSRM API.
        save: boolean
            Whether to save the data to a csv file after preparing it. The file is
            written to 'self.data_dir' under the name 'self.file_name'.
        """
        progress("Preprocessing and merging datasets.", verbose=self.verbose)
        data = prepare_data_for_response_time_analysis(incidents, deployments,
                                                       stations, vehicles)

        progress("Adding OSRM distance and duration.", verbose=self.verbose)
        self.data = add_osrm_distance_and_duration(data, osrm_host=osrm_host)

        if save:
            progress("Saving file.", verbose=self.verbose)
            self.data.to_csv(os.path.join(self.data_dir, self.file_name), index=False)

        progress("Data prepared for fitting.", verbose=self.verbose)

    def _resolve_coordinates(self, location, arg_name):
        """Translate a location argument into coordinates.

        Parameters
        ----------
        location: str or tuple
            A tuple is returned as is; a string is looked up in
            self.location_coords.
        arg_name: str
            Name of the caller's argument, used in the error message.

        Raises
        ------
        ValueError
            If location is neither a tuple nor a string.
        KeyError
            If location is a string that is not a known demand location.
        """
        if isinstance(location, tuple):
            return location
        if isinstance(location, str):
            return self.location_coords[location]
        raise ValueError("{} cannot be interpreted. Pass either a tuple of "
                         "decimal longitude and latitude or a string representing "
                         "a demand location.".format(arg_name))

    def set_custom_stations(self, station_locations, station_names,
                            location_col="hub_vak_bk"):
        """Change the locations of stations to custom demand locations.

        All existing stations are replaced by the given ones.

        Parameters
        ----------
        station_locations: array-like of strings
            Location IDs of the custom stations, must be keys of
            self.location_coords.
        station_names: array-like of strings
            Names of the custom stations, in the same order as station_locations.
        location_col: str, optional
            Currently unused; kept for backward compatibility.
            Defaults to "hub_vak_bk".
        """
        assert self.fitted, "You first have to 'fit()' before setting custom stations."
        assert len(station_locations) == len(station_names), \
            ("Lengths of station_locations and station_names do not match")

        # Build the complete mapping first, so that an unknown location ID does
        # not leave the sampler with a partially filled set of stations.
        self.station_coords = {name: self.location_coords[loc]
                               for name, loc in zip(station_names, station_locations)}

    def move_station(self, station_name, new_location, new_name):
        """Move the location of a single station.

        Parameters
        ----------
        station_name: str
            The name of the station to move.
        new_location: str or tuple(float, float)
            The new location of the station. If a string is passed, it is interpreted
            as the identifier of the demand location to move the station to. If a tuple of
            floats is passed, it is interpreted as the new coordinates in decimal (long, lat).
        new_name: str
            The new name of the station.

        Raises
        ------
        ValueError
            If new_location is neither a tuple nor a string.
        KeyError
            If the station is renamed and station_name does not exist, or if
            new_location is an unknown demand location.
        """
        coords = self._resolve_coordinates(new_location, "new_location")

        renamed = new_name != station_name
        # Check before mutating, so a failed move leaves the stations unchanged.
        if renamed and station_name not in self.station_coords:
            raise KeyError(station_name)

        self.station_coords[new_name] = coords
        if renamed:
            del self.station_coords[station_name]

    def add_station(self, station_name, location):
        """Add a new station, or overwrite the location of an existing one.

        Parameters
        ----------
        station_name: str
            The name of the new station.
        location: str or tuple(float, float)
            The location of the new station. If a string is passed, it is interpreted
            as the identifier of the demand location to place the station at. If a tuple of
            floats is passed, it is interpreted as the coordinates in decimal (long, lat).

        Raises
        ------
        ValueError
            If location is neither a tuple nor a string.
        """
        self.station_coords[station_name] = self._resolve_coordinates(location, "location")

    def reset_stations(self):
        """ Reset the station and location coordinates to those obtained from the data.

        Uses the location column that was passed to the last call of 'fit()'.
        """
        self.location_coords, self.station_coords = \
            get_coordinates_locations_stations(self.data, location_col=self.location_col)

    def _create_response_time_generators(self):
        """ Create generator objects for every element of response time.

        Sampling efficiency suffers from a high number of calls to '.rvs()' of a
        scipy.stats frozen distribution object. Hence, we avoid calling it many times
        by using generators that call the function once every 10,000 samples.

        This function creates dictionaries of the same architecture as the dictionaries
        holding the random variables, but instead of random variables, it stores generators
        as lowest level elements.

        Example
        -------
        >>> sampler = ResponseTimeSampler(args)
        >>> # sample next dispatch time for a 'Binnenbrand'
        >>> next(sampler.dispatch_generators["Binnenbrand"])

        Notes
        -----
        It is not useful to call this function manually, it is called at the end
        of 'fit()'.
        """
        batch_size = self._SAMPLE_BATCH_SIZE

        def time_generator(rv):
            """ Endless generator of samples from rv, drawn one batch at a time. """
            while True:
                yield from rv.rvs(batch_size)

        self.dispatch_generators = {
            incident_type: time_generator(rv)
            for incident_type, rv in self.dispatch_rv_dict.items()
        }

        # nested as appointment -> priority -> vehicle type
        self.turnout_generators = {
            appointment: {
                prio: {vtype: time_generator(rv) for vtype, rv in rvs_per_vehicle.items()}
                for prio, rvs_per_vehicle in rvs_per_prio.items()
            }
            for appointment, rvs_per_prio in self.turnout_time_rv_dict.items()
        }

        self.travel_time_noise_generators = {
            vehicle_type: time_generator(v_dict["noise_rv"])
            for vehicle_type, v_dict in self.travel_time_dict.items()
        }

        # nested as incident type -> vehicle type
        self.onscene_generators = {
            incident_type: {vtype: time_generator(rv)
                            for vtype, rv in rvs_per_vehicle.items()}
            for incident_type, rvs_per_vehicle in self.onscene_time_rv_dict.items()
        }

    def sample_dispatch_time(self, incident_type):
        """ Sample a random dispatch time, given the incident type.

        Parameters
        ----------
        incident_type: str,
            The type of incident to sample dispatch times for.

        Returns
        -------
        The next sample from the dispatch time distribution fitted for this
        incident type (presumably in seconds, like the other time components).
        """
        return next(self.dispatch_generators[incident_type])

    def sample_travel_time(self, estimated_time, vehicle,
                           osrm_host="http://192.168.56.101:5000"):
        """ Sample a random travel time.

        The travel time is computed as a + b * noise * estimated_time, using the
        coefficients and noise distribution fitted for the vehicle type. Vehicle
        types without their own model use the "overall" model, for both the
        coefficients and the noise.

        Parameters
        ----------
        estimated_time: float
            The travel time in seconds according to OSRM.
        vehicle: str
            The vehicle type (code) to sample travel time for.
        osrm_host: str, optional
            Unused; kept for backward compatibility.

        Returns
        -------
        A float representing the random travel time in seconds.
        """
        # Coefficients and noise generator share the same keys, so resolve the
        # fallback once and use it for both lookups.
        key = vehicle if vehicle in self.travel_time_dict else "overall"
        d = self.travel_time_dict[key]
        noise = next(self.travel_time_noise_generators[key])
        return d["a"] + d["b"] * noise * estimated_time

    def sample_response_time(self, incident_type, location_id, station_name, vehicle_type,
                             appointment, prio, estimated_time=None,
                             osrm_host="http://192.168.56.101:5000"):
        """ Sample a random response time based on deployment characteristics.

        Parameters
        ----------
        incident_type: str
            The type of the incident to sample on-scene time for.
        location_id: str
            The ID of the demand location where the incident takes place.
        station_name: str
            The name of the station that the deployment is executed from.
        vehicle_type: str
            The vehicle type (code) to sample turn-out, travel and on-scene time for.
        appointment: hashable
            First-level key of the fitted turn-out time distributions.
        prio: hashable
            Second-level key (priority) of the fitted turn-out time distributions.
        estimated_time: float, int, optional
            The estimated travel time according to OSRM. Optional, defaults
            to None. If None, estimation is collected from OSRM at time of
            calling, which is far less efficient.
        osrm_host: str, optional
            The URL to the OSRM API. Required when no 'estimated_time' is provided.

        Returns
        -------
        Tuple of (turn-out time, travel time, on-scene time). Note that the dispatch
        time is sampled separately, since that is only done once per incident, while
        this function is called per deployment.
        """
        if estimated_time is None:
            orig = self.station_coords[station_name]
            dest = self.location_coords[location_id]
            _, estimated_time = get_osrm_distance_and_duration(orig, dest, osrm_host=osrm_host)

        turnout = next(self.turnout_generators[appointment][prio][vehicle_type])
        travel = self.sample_travel_time(estimated_time, vehicle_type)
        onscene = next(self.onscene_generators[incident_type][vehicle_type])
        return turnout, travel, onscene


class IncidentSampler():
    """ Samples timing and details of incidents from distributions in the data.

    Parameters
    ----------
    incidents: pd.DataFrame
        The incident data to obtain distributions from. Only incidents whose
        "hub_vak_bk" value is in location_ids are used.
    deployments: pd.DataFrame
        The deployments to obtain vehicle requirement distributions from.
    vehicle_types: array-like of strings
        The vehicles types to include in the sampling.
    location_ids: array-like of strings
        The identifiers of the demand locations to include in the sampling.
    start_time: datetime or str (convertible to datetime) or None
        The start of the period that should be simulated. Passed on to the
        predictor when the sampling dictionary is created. Defaults to None.
    end_time: datetime or str (convertible to datetime) or None
        The end of the period that should be simulated. Passed on to the
        predictor when the sampling dictionary is created. Defaults to None.
    predictor: str, one of ['prophet', 'basic']
        What incident rate forecast method to use: "prophet" uses the
        ProphetIncidentPredictor, "basic" the BasicLambdaForecaster.
        Defaults to "basic".
    fc_dir: str
        The directory passed to the predictor as 'fc_dir'. Defaults to "/data".
    verbose: boolean
        Whether to print progress.

    Example
    -------
    >>> sampler = IncidentSampler(df_incidents, df_deployments, vehicle_types, location_ids)
    >>> # sample 10 incidents and print details
    >>> for _ in range(10):
    >>>     t, time, type_, loc, prio, vehicles, func = sampler.sample_next_incident()
    >>>     print("time: {}, type: {}, loc: {}, prio: {}, vehicles: {}, object function: {}."
    >>>           .format(t, type_, loc, prio, vehicles, func))
    """

    predictors = ["prophet", "basic"]

    def __init__(self, incidents, deployments, vehicle_types, location_ids, start_time=None,
                 end_time=None, predictor="basic", fc_dir="/data", verbose=True):
        """ Initialize all properties by extracting probabilities from the data.

        Raises
        ------
        ValueError
            If location_ids is empty or its first element is not a string, or if
            predictor is not one of the supported predictors.
        """
        # validate before the IDs are used to filter the incidents
        if len(location_ids) == 0 or not isinstance(location_ids[0], str):
            raise ValueError('locations must be an iterable of strings')

        # location IDs in the data may be numeric and missing; compare as strings
        incident_locations = incidents["hub_vak_bk"].fillna(0).astype(int).astype(str)
        self.incidents = incidents[np.isin(incident_locations, location_ids)]
        self.deployments = deployments
        self.vehicle_types = vehicle_types
        self.location_ids = location_ids
        self.verbose = verbose

        self.types = self._infer_incident_types()

        self._assign_predictor(predictor, fc_dir)
        self._set_sampling_dict(start_time, end_time, incident_types=self.types)
        self._create_incident_types()
        self._create_demand_locations()

        self.reset_time()

        progress("IncidentSampler ready for simulation.", verbose=self.verbose)

    def _infer_incident_types(self):
        """ Create list of incident types based on provided data.

        Only types of incidents with at least one deployment of one of the
        considered vehicle types are included; missing values and "NVT" are
        dropped.
        """
        merged = pd.merge(self.deployments[["hub_incident_id", "voertuig_groep"]],
                          self.incidents[["dim_incident_id", "dim_incident_incident_type"]],
                          left_on="hub_incident_id", right_on="dim_incident_id", how="left")
        merged = merged[np.isin(merged["voertuig_groep"], self.vehicle_types)]
        return [v for v in merged["dim_incident_incident_type"].unique()
                if str(v) not in ["nan", "NVT"]]

    def _set_sampling_dict(self, start_time, end_time, incident_types=None):
        """ Get the dictionary required for sampling from the predictor.

        Gets the sampling dictionary of a Predictor and stores it to the
        IncidentSampler object so that it can be used in sampling. Also
        stores the length of the dictionary as self.T. This is used during
        simulation to loop over the sampling dictionary while avoiding
        IndexErrors.

        Parameters
        ----------
        start_time: timestamp, str convertible to timestamp, or None
            The start date and time of the period to simulate. If None,
            use all available timestamps in the forecast.
        end_time: timestamp, str convertible to timestamp, or None
            The end date and time of the period to simulate. If None,
            use all available timestamps in the forecast.
        incident_types: array-like of strings, optional (default: None)
            The incident types to incorporate in the simulation. if None,
            uses inferred incident types.
        """
        if incident_types is None:
            incident_types = self.types

        self.sampling_dict = self.predictor.create_sampling_dict(start_time, end_time,
                                                                 incident_types)
        self.T = len(self.sampling_dict)
        # the sampling dict is keyed by period index 0, ..., T-1
        self.start_time = self.sampling_dict[0]["time"]
        self.end_time = self.sampling_dict[self.T - 1]["time"]
        self.lambdas = np.array([d['lambda'] for d in self.sampling_dict.values()])

    def _assign_predictor(self, predictor, fc_dir):
        """ Initialize incident rate predictor and assign to property.

        Parameters
        ----------
        predictor: str, one of ['prophet', 'basic']
            The predictor to use to forecast the incident rates.
        fc_dir: str
            The directory passed to the predictor as 'fc_dir'.

        Raises
        ------
        ValueError
            If predictor is not one of the supported predictors.
        """
        if predictor == "prophet":
            progress("Initializing ProphetIncidentPredictor...", verbose=self.verbose)
            predictor_cls = ProphetIncidentPredictor

        elif predictor == "basic":
            progress("Initializing BasicLambdaForecaster...", verbose=self.verbose)
            predictor_cls = BasicLambdaForecaster

        else:
            # 'predictors' is a class attribute, so it must be accessed via self
            raise ValueError("'predictor' must be one of {}.".format(self.predictors))

        self.predictor = predictor_cls(load_forecast=True, fc_dir=fc_dir, verbose=self.verbose)

    def _create_incident_types(self):
        """ Initialize incident types with their characteristics.

        Creates a dictionary of IncidentType objects. Every such object holds
        type-specific distributions about priority, required vehicles,
        and demand locations.
        """
        progress("Getting priority probabilities.", verbose=self.verbose)
        prio_probs = get_prio_probabilities_per_type(self.incidents)

        progress("Getting vehicle requirement probabilities.", verbose=self.verbose)
        vehicle_probs = get_vehicle_requirements_probabilities(self.incidents,
                                                               self.deployments,
                                                               self.vehicle_types)

        progress("Getting spatial distributions.", verbose=self.verbose)
        location_probs = get_spatial_distribution_per_type(self.incidents,
                                                           locations=self.location_ids)

        progress("Initializing incident types.", verbose=self.verbose)
        self.incident_types = {t: IncidentType(prio_probs[t], vehicle_probs[t],
                               location_probs[t]) for t in self.types}

    def _create_demand_locations(self):
        """ Initialize demand locations and their building function distributions.

        Creates a dictionary of DemandLocation objects. Each such object has its
        own distribution over building functions that is used during sampling.
        """
        progress("Getting building function probabilities.", verbose=self.verbose)
        building_probs = get_building_function_probabilities(self.incidents,
                                                             locations=self.location_ids)

        progress("Initializing demand locations", verbose=self.verbose)
        self.locations = {l: DemandLocation(l, building_probs[l])
                          for l in building_probs.keys()}

    def _incident_time_generator(self, period_length=60, start_period=0, num_periods=None):
        """ Returns a generator object for incident times.

        Every pass over the sampling dictionary draws a Poisson number of
        arrivals per period from self.lambdas and spreads them uniformly within
        their period. Times are yielded in increasing order.

        Parameters
        ----------
        period_length: int, optional
            The length of one period, in the unit of the yielded times. Defaults
            to 60.
        start_period: int, optional
            Index of the period in self.lambdas to start each pass at. Periods
            before it are moved to the end of the pass.
        num_periods: int or None, optional
            If given, only the first num_periods periods (counted from
            start_period) of each pass produce incidents.
        """
        n_lambdas = len(self.lambdas)
        counter = 0

        while True:
            # every pass is offset by the full length of the sampling dict
            past_time = counter * n_lambdas * period_length

            # process periods in sampling dict in one go
            n_arrivals = np.random.poisson(self.lambdas, size=n_lambdas)

            # rotate so that the pass starts at start_period
            n_arrivals = np.concatenate((n_arrivals[start_period:],
                                         n_arrivals[:start_period]))
            if num_periods is not None:
                n_arrivals = n_arrivals[:num_periods]

            # period index of every arrival: index i repeated n_arrivals[i] times
            periods = np.repeat(np.arange(len(n_arrivals)), n_arrivals)

            minutes = np.random.uniform(0, period_length, size=np.sum(n_arrivals))
            times = np.sort(periods * period_length + minutes)

            # yield times one by one
            for time in times:
                yield time + past_time

            counter += 1

    def reset_time(self):
        """ Reset the incident time generator to start from t=0. """
        self.gen_start_period = 0
        self.incident_time_generator = self._incident_time_generator()

    def set_time(self, time, num_periods=None):
        """Set time so that incidents are sampled from this point forward.

        After setting time, `sample_next_incident` will sample the next incident in the
        hour(s) after the set time rather than starting at the start of the forecast /
        sampling dict or resuming from its current position.

        Parameters
        ----------
        time: pd.Timestamp or datetime64
            The time from which to sample the next incident.
        num_periods: int or None, default=None
            The number of periods to simulate from the set time. Incident times will start
            from time again after num_periods are simulated. This can be used when only short
            periods need to be considered to speed up calculations, e.g., when simulating
            major incidents and investigating only simultaneous incidents. If None, all
            periods in the sampling dictionary are simulated.
        """
        # set the hours = periods since the start of the sampling dictionary
        self.gen_start_period = int((time - self.start_time).total_seconds() // 3600)
        # reset the incident time generator starting at the given period
        self.incident_time_generator = self._incident_time_generator(
                                        start_period=self.gen_start_period,
                                        num_periods=num_periods)

    def _sample_incident_details(self, incident_type):
        """ Draw random sample of incident characteristics.

        Parameters
        ----------
        incident_type: str
            The type of incident as occuring in the data.

        Returns
        -------
        Tuple of (location id, priority, Tuple(required vehicle types), building function)
        """
        ty = self.incident_types[incident_type]
        vehicles = ty.sample_vehicles()
        priority = ty.sample_priority()
        location = ty.sample_location()
        building_function = self.locations[location].sample_building_function(incident_type)

        return location, priority, vehicles, building_function

    def sample_next_incident(self):
        """ Sample a random time and type for the next incident.

        Returns
        -------
        Tuple of (t, time, incident_type, location, priority, vehicles, building
        function), where t is the next value of the incident time generator and
        time is the "time" entry of the sampling dictionary period in which the
        incident falls.
        """
        t = next(self.incident_time_generator)
        # map the generated time to its period, wrapping around the sampling dict
        d = self.sampling_dict[int((t//60 + self.gen_start_period) % self.T)]
        incident_type = np.random.choice(a=self.types, p=d["type_distribution"])
        loc, prio, veh, func = self._sample_incident_details(incident_type)

        return t, d["time"], incident_type, loc, prio, veh, func

    def set_custom_forecast(self, forecast, start_time=None, end_time=None):
        """Manually provide a forecast.

        The sampling dictionary is rebuilt from the new forecast and the incident
        time generator is reset to start from t=0.

        Parameters
        ----------
        forecast: pd.DataFrame
            Must have the same shape and columns as the output of
            `self.predictor.get_forecast()`. No assertions are made on this input.
        start_time, end_time: datetime object or str or None, optional, default: None
            The start and end time of the new sampling dictionary that will be created
            from the provided forecast. If None, uses the entire forecast.
        """
        self.predictor.set_custom_forecast(forecast)
        self._set_sampling_dict(start_time, end_time, incident_types=self.types)
        # Also clears the start period of an earlier 'set_time()', which would
        # otherwise shift the lookups in the new sampling dictionary.
        self.reset_time()

    def set_location_probs(self, loc, equal_to=None, value_dict=None, types=None):
        """Set the probability that incident of certain type occurs in a given location.

        The probabilities are first set to the given values, then they are normalized again
        to form a proper probability distribution. Note that this function should normally
        not be called directly. Use fdsim.simulation.Simulator.set_location_incident_rates
        instead.

        Parameters
        ----------
        loc: str
            The location to set the probabilities for.
        equal_to: str or list(str)
            The location(s) from which to copy the probabilities; the probabilities of
            multiple locations are summed. Not used for the probabilities if value_dict
            is provided. If given, the building function distribution of loc is also
            re-estimated from the incidents in these locations.
        value_dict: dict
            Dictionary specifying the incident types to change and the specific probability to
            set them to like {'type' -> prob}.
        types: list(str)
            The incident types to change the probabilities of. If None, uses all. Ignored if a
            value_dict is provided.

        Returns
        -------
        sum_probs: dict
            The sum of the probabilities for each changed incident type, before
            normalization. Can be used to adjust the overall incident rates per type so
            that the incident rate in other locations is not changed.

        Raises
        ------
        ValueError
            If neither equal_to nor value_dict is provided.
        """
        if isinstance(equal_to, str):
            equal_to = [equal_to]

        if value_dict is None:
            # create value_dict by summing the probabilities of the 'equal_to' locations
            if equal_to is None:
                raise ValueError("Either equal_to or value_dict must be provided")
            if types is None:
                types = list(self.incident_types.keys())
            elif isinstance(types, str):
                types = [types]

            value_dict = {
                typ: sum((self.incident_types[typ].location_probs[l] for l in equal_to), 0.0)
                for typ in types
            }
        else:
            # the changed types are exactly those in value_dict
            types = list(value_dict.keys())

        # use value_dict to update values
        for typ, value in value_dict.items():
            self.incident_types[typ].location_probs[loc] = value

        # obtain sums of probabilities
        sum_probs = {typ: np.sum(list(self.incident_types[typ].location_probs.values()))
                     for typ in types}

        # normalize probabilities
        for typ, factor in sum_probs.items():
            current = self.incident_types[typ].location_probs
            self.incident_types[typ].location_probs = \
                {l: v / factor for l, v in current.items()}

        # also update the building function probabilities; without reference
        # locations there is nothing to estimate them from
        if equal_to is not None:
            incidents = prepare_incidents_for_spatial_analysis(self.incidents)
            select_incidents = incidents[np.isin(incidents['hub_vak_bk'], equal_to)].copy()
            self.locations[loc].building_probs = get_overall_building_dist(select_incidents)

        # return the factors with which total incident rates per type have changed
        return sum_probs


class BigIncidentSampler():
    """Class that simulates big incidents at random times. Mostly useful as a starting
    point for simulating more (regular) incidents and evaluating response times in
    extreme cases.

    Parameters
    ----------
    incidents: pd.DataFrame
        The incident data. The frame passed in is not modified; dates are parsed on a copy.
    deployments: pd.DataFrame
        The deployment data
    start_time, end_time: str or datetime-like
        Bounds of the period from which incident dates are sampled. Both are passed to
        pd.date_range as `start` and `end`.
    min_ts: int, default=3
        The minimum number of TS deployments for an incident to be considered 'big'. Only
        such incidents will be sampled.
    vehicles: str or list of strings
        The vehicle types to take into account.
    types: list of strings
        The incident types to use. If None, use all in the data.
    """

    # number of (month, day, hour, type, duration) samples drawn per batch
    _BATCH_SIZE = 10000
    # maximum number of draws tried to find a (month, weekday) pair that has at least one
    # date in the sampling period, before giving up with an error
    _MAX_DRAWS = 10000

    def __init__(self, incidents, deployments, start_time, end_time, min_ts=3, vehicles=["TS"],
                 types=["Binnenbrand", "Buitenbrand", "Hulpverlening algemeen"]):

        # parse the date columns on a copy so that the caller's DataFrame is left untouched
        incidents = incidents.copy()
        for column in ("dim_incident_start_datumtijd", "dim_incident_eind_datumtijd"):
            incidents[column] = pd.to_datetime(incidents[column], dayfirst=True)

        # filter to only big incidents of relevant type and vehicles
        big_incidents, big_deployments = get_big_incident_data(incidents, deployments,
                                                               min_ts=min_ts, types=types,
                                                               vehicles=vehicles)

        # get distributions over time
        self.time_distributions = get_big_incident_arrival_dist(big_incidents)

        # get distribution over incident types
        self.type_distribution = get_big_incident_type_dist(big_incidents)

        # get duration random variable
        self.duration_rv = fit_big_incident_duration(big_incidents)

        # get big incident types (with location and vehicle distributions)
        self._create_big_incident_types(
            big_incidents, big_deployments, types=types, vehicles=vehicles
        )

        # create generator object
        self._create_big_incident_generator()

        # find combinations of month and day for sampling
        self.start_time = start_time
        self.end_time = end_time
        self._set_month_day_combinations(start_time, end_time)

    def _create_big_incident_types(self, big_incidents, big_deployments,
                                   types=["Binnenbrand", "Buitenbrand"], vehicles=["TS"]):
        """Create IncidentType instances for big incidents specifically.

        Parameters
        ----------
        big_incidents, big_deployments: pd.DataFrame
            The incidents and deployments after filtering on big incidents.
        types: list of strings
            The incident types to create. If None, the types in `self.type_distribution`
            are used, since those are the types the incident generator can produce.
        vehicles: list of strings
            The vehicle types passed on to the vehicle requirement estimation.
        """
        # get vehicle requirement probabilities
        vehicle_probs = get_vehicle_requirements_probabilities(big_incidents, big_deployments,
                                                               vehicles)

        # location distribution
        location_probs = get_spatial_distribution_per_type(big_incidents)

        if types is None:
            # NOTE(review): assumes the fitting helpers accept types=None as 'all types';
            # confirm against fdsim.incidentfitting.get_big_incident_data
            types = list(self.type_distribution["types"])

        # NOTE(review): [1., 0., 0.] is presumably the priority distribution (all mass on
        # the first priority) -- confirm against fdsim.objects.IncidentType
        self.big_incident_types = {
            t: IncidentType([1., 0., 0.], vehicle_probs[t], location_probs[t]) for t in types
        }

    def _create_big_incident_generator(self):
        """Create a generator object for fast simulation of big incidents.

        The generator is stored in `self.incident_generator` and yields tuples of
        (month, weekday, hour, incident type, duration), where month is 1-indexed and
        weekday and hour are 0-indexed. Values are drawn in batches and a new batch is
        drawn whenever the previous one is exhausted, so the generator never ends.
        """
        batch_size = self._BATCH_SIZE

        def big_incident_generator(month_probs, day_probs, hour_probs, duration_rv, type_probs):
            while True:
                months = np.random.choice(a=12, size=batch_size, p=month_probs)
                days = np.random.choice(a=7, size=batch_size, p=day_probs)
                hours = np.random.choice(a=24, size=batch_size, p=hour_probs)
                types = np.random.choice(a=type_probs["types"], size=batch_size,
                                         p=type_probs["probabilities"])
                durations = duration_rv.rvs(batch_size)

                # months are drawn 0-indexed; shift them to calendar months 1..12
                for sample in zip(months + 1, days, hours, types, durations):
                    yield sample

        self.incident_generator = big_incident_generator(
            self.time_distributions["month"],
            self.time_distributions["day"],
            self.time_distributions["hour"],
            self.duration_rv,
            self.type_distribution
        )

    def _set_month_day_combinations(self, start_time, end_time):
        """Find dates that satisfy combinations of a given month and day of the week in
        a given range.

        The result is stored in `self.month_day_combos` as {month -> {weekday -> [dates]}},
        with months 1-12 and weekdays 0-6. Combinations without any date in the range map
        to an empty list.
        """
        combos = {month: {day: [] for day in range(7)} for month in range(1, 13)}

        # single pass over the range; dates stay in chronological order per combination
        for date in pd.date_range(start=start_time, end=end_time):
            combos[date.month][date.weekday()].append(date)

        self.month_day_combos = combos

    def _sample_timestamp(self, month, day, hour):
        """Sample a random timestamp that satisfies a given month, weekday, and hour.

        Parameters
        ----------
        month, day, hour: int
            Number of the month, weekday, and hour of day respectively. Note that month
            is 1-indexed, while weekday and hour are 0-indexed.

        Returns
        -------
        timestamp: pd.Timestamp object
            The random timestamp.

        Raises
        ------
        ValueError
            If the sampling period contains no date with the given month and weekday.
        """
        candidates = self.month_day_combos[month][day]
        if not candidates:
            raise ValueError(
                "No date with month {} and weekday {} in the sampling period."
                .format(month, day)
            )
        return np.random.choice(candidates) + pd.Timedelta(value=hour, unit="h")

    def sample_big_incident(self):
        """Sample a big incident at some random time and place.

        Draws from the incident generator are repeated until the drawn month and weekday
        occur in the sampling period, so periods shorter than a full year are supported.

        Returns
        -------
        tuple
            (time, incident_type, location, 1, vehicles, duration). The fourth element is
            the constant 1 (presumably the incident priority -- confirm against callers).

        Raises
        ------
        ValueError
            If no feasible month and weekday combination was drawn within the maximum
            number of attempts, e.g., because the sampling period is empty.
        """
        for _ in range(self._MAX_DRAWS):
            month, day, hour, incident_type, duration = next(self.incident_generator)
            if self.month_day_combos[month][day]:
                break
        else:
            raise ValueError(
                "Could not sample a month and weekday that occur in the sampling period."
            )

        time = self._sample_timestamp(month, day, hour)
        vehicles = self.big_incident_types[incident_type].sample_vehicles()
        location = self.big_incident_types[incident_type].sample_location()
        return time, incident_type, location, 1, vehicles, duration