Parameterizing a Model with Hidden Parameters & Known Multi-modality

I have a dataset with two issues:

  1. The data depend on hidden (latent) parameters.
  2. There is a known multi-modality in the model.

Hidden Parameter Problem

First, let’s consider the hidden parameters. Here is a simple demonstration of how the data are generated. The parameters R and azimuth are hidden, but they can be inferred from the observed v and the known angle values.

import pymc as pm
import pytensor.tensor as pt
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

# known constants
R0 = 1.0
A = 1.0

def calc_v(R, angle, R0=R0, A=A):
    return R * pt.sin(angle) * (A / R - A / R0)

# simulate data
num_obs = 1000
R_true = stats.maxwell().rvs(num_obs)
azimuth_true = stats.uniform().rvs(num_obs) * 2.0 * np.pi
Y_true = R_true * np.cos(azimuth_true)
X_true = R_true * np.sin(azimuth_true)

d_true = np.sqrt(R_true**2.0 + R0**2.0 - 2.0 * R_true * R0 * np.cos(azimuth_true))
angle = (-np.arctan2(X_true, Y_true - R0) + np.pi) % (2.0 * np.pi)

v_sigma_true = 0.1  # intrinsic scatter
v_true = calc_v(R_true, angle, R0=R0, A=A).eval()
v_true += stats.norm().rvs(num_obs) * v_sigma_true

Here then is “reality”:

# plot "reality"
fig, ax = plt.subplots(layout="constrained")
cax = ax.scatter(X_true, Y_true, c=v_true)
fig.colorbar(cax, label="v (observed)")
ax.set_xlabel("x (unobserved)")
ax.set_ylabel("y (unobserved)")
fig.show()
plt.close(fig)

And here is “reality” as observed: angle is known and v is observed, while R is hidden. The color scale shows the relationship between v and R.

# plot "reality" as observed
fig, ax = plt.subplots()
cax = ax.scatter(angle, v_true, c=R_true)
fig.colorbar(cax, label="R (unobserved)")
ax.set_xlabel("angle")
ax.set_ylabel("v")
fig.tight_layout()
fig.show()
plt.close(fig)

Here is a model that attempts to infer v_sigma from noiseless observations of v.

# define model
with pm.Model(coords={"datum": range(num_obs)}) as model:
    # data
    angle_observed = pm.Data("angle_observed", angle, dims="datum")
    v_observed = pm.Data("v_observed", v_true, dims="datum")

    # priors
    R = pm.HalfNormal("R", dims="datum")
    v_sigma = pm.Exponential("v_sigma", lam=1.0)

    # v likelihood
    v_mu = calc_v(R, angle_observed, R0=R0, A=A)
    _ = pm.Normal(
        "v",
        mu=v_mu,
        sigma=v_sigma,
        observed=v_observed,
        dims="datum",
    )

with model:
    trace = pm.sample()

pm.summary(trace, var_names=["v_sigma"])

which outputs

          mean     sd  hdi_3%  hdi_97%  mcse_mean  mcse_sd  ess_bulk  ess_tail  r_hat
v_sigma  0.255  0.025   0.208    0.299      0.002    0.001     121.0     270.0   1.05

As we can see, NUTS struggles with this model (low ESS, r_hat = 1.05), and v_sigma is overestimated relative to the true value of 0.1.
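
For completeness, this is how I check the sampler beyond the summary table; a minimal ArviZ sketch run on the trace above (nothing here is specific to my model):

import arviz as az

# count divergent transitions and inspect the v_sigma chains
print("divergences:", int(trace.sample_stats["diverging"].sum()))
az.plot_trace(trace, var_names=["v_sigma"])
plt.show()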

Any suggestions about how I could parameterize this model differently?

Known Multi-modality

The other issue is that I have another observed parameter, f, that depends on a transformation of R and v. Specifically, f is derived from d, and d is related to R and v like so:

angle_test = np.deg2rad(np.array([30.0, -120.0]))
d_test = np.linspace(0.0, 2.0, 1000)
R_test = np.sqrt(
    R0**2.0 + d_test[:, None] ** 2.0 - 2.0 * R0 * d_test[:, None] * np.cos(angle_test)
)
v_test = calc_v(R_test, angle_test, R0=R0, A=A).eval()

fig, ax = plt.subplots(layout="constrained")
ax.plot(d_test, v_test[:, 0], "k-", label="angle = 30 deg")
ax.plot(d_test, v_test[:, 1], "r-", label="angle = -120 deg")
ax.legend(loc="best")
ax.set_xlabel("d (unobserved)")
ax.set_ylabel("v (observed)")
fig.show()

Depending on the known angle and observed v, there could be zero, one, or two values of d.
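
To make that concrete, here is a small standalone sketch (plain NumPy, not part of the model; the d_roots helper and the R values are purely illustrative) that inverts the relation R**2 = R0**2 + d**2 - 2*R0*d*cos(angle) for d and keeps only the positive roots:

def d_roots(R, angle, R0=R0):
    # solve the quadratic d**2 - 2*R0*cos(angle)*d + (R0**2 - R**2) = 0
    disc = R**2.0 - (R0 * np.sin(angle)) ** 2.0
    if disc < 0.0:
        return np.array([])  # no real solution
    d = R0 * np.cos(angle) + np.array([-1.0, 1.0]) * np.sqrt(disc)
    return d[d > 0.0]  # only positive distances are physical

print(d_roots(0.9, np.deg2rad(30.0)))    # two positive roots
print(d_roots(1.5, np.deg2rad(-120.0)))  # one positive root
print(d_roots(0.9, np.deg2rad(-120.0)))  # none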

Any suggestions about how I could parameterize this model to handle the multi-modality?

Regarding hidden parameters

I am able to make progress on the first problem by including additional data that constrain R and by increasing the constant A to a value that makes v more discriminating. Consider the following:

import pymc as pm
import pytensor.tensor as pt
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

# known constants
R0 = 1.0
A = 10.0


def calc_v(R, angle, R0=R0, A=A):
    return R * pt.sin(angle) * (A / R - A / R0)


# simulate data
num_obs = 1000
R_true = stats.maxwell().rvs(num_obs)
azimuth_true = stats.uniform().rvs(num_obs) * 2.0 * np.pi
Y_true = R_true * np.cos(azimuth_true)
X_true = R_true * np.sin(azimuth_true)

d_true = np.sqrt(R_true**2.0 + R0**2.0 - 2.0 * R_true * R0 * np.cos(azimuth_true))
angle = (-np.arctan2(X_true, Y_true - R0) + np.pi) % (2.0 * np.pi)

t_slope_true = 1.0
t_offset_true = 0.0
t_true = t_offset_true + R_true * t_slope_true

v_sigma_true = 0.1  # intrinsic scatter
v_true = calc_v(R_true, angle, R0=R0, A=A).eval()
v_true += stats.norm().rvs(num_obs) * v_sigma_true

# simulate observations
v_err = 0.01
v_obs = v_true + stats.norm().rvs(num_obs) * v_err
t_err = 0.1
t_obs = t_true + stats.norm().rvs(num_obs) * t_err

# plot observations
fig, ax = plt.subplots()
cax = ax.scatter(angle, v_obs, c=t_obs)
fig.colorbar(cax, label="t (observed)")
ax.set_xlabel("angle (known)")
ax.set_ylabel("v (observed)")
fig.tight_layout()
fig.show()
plt.close(fig)

Now the value of t helps identify R, and the larger A increases the overall magnitude of v (i.e., it increases <v_obs>/v_sigma). This samples fine with the following model:

# define model
with pm.Model(coords={"datum": range(num_obs)}) as model:
    # data
    angle_observed = pm.Data("angle_observed", angle, dims="datum")
    v_observed = pm.Data("v_observed", v_true, dims="datum")
    t_observed = pm.Data("t_observed", t_obs, dims="datum")
    t_observed_error = pm.Data(
        "t_observed_error", t_err * np.ones_like(t_obs), dims="datum"
    )

    # priors
    R = pm.HalfNormal("R", dims="datum")
    v_sigma = pm.Exponential("v_sigma", lam=1.0)
    t_slope = pm.HalfNormal("t_slope")
    t_offset = pm.Normal("t_offset")

    # v likelihood
    v_mu = calc_v(R, angle_observed, R0=R0, A=A)
    _ = pm.Normal(
        "v",
        mu=v_mu,
        sigma=v_sigma,
        observed=v_observed,
        dims="datum",
    )

    # t likelihood
    t_mu = t_offset + t_slope * R
    _ = pm.Normal(
        "t", mu=t_mu, sigma=t_observed_error, observed=t_observed, dims="datum"
    )

with model:
    trace = pm.sample()

pm.summary(trace, var_names=["t_slope", "t_offset", "v_sigma"])

which outputs

           mean     sd  hdi_3%  hdi_97%  mcse_mean  mcse_sd  ess_bulk  ess_tail  r_hat
t_slope   1.002  0.005   0.992    1.011      0.000      0.0    2086.0    2509.0   1.00
t_offset -0.000  0.009  -0.017    0.016      0.000      0.0    2070.0    2207.0   1.00
v_sigma   0.087  0.008   0.071    0.102      0.001      0.0     153.0     364.0   1.02

It appears that v_sigma is still struggling to converge (ess_bulk = 153, r_hat = 1.02).

Regarding multi-modality

Here is a demonstration of the multi-modality problem.

F_true = stats.lognorm(1.0).rvs(num_obs)
f_true = F_true / d_true**2.0

# simulate observations
f_error = np.min(np.stack([np.ones_like(f_true) * 0.1, 0.1 * f_true]), axis=0)
f_obs = f_true + stats.norm().rvs(num_obs) * f_error

# plot observations
fig, ax = plt.subplots()
cax = ax.scatter(angle, v_obs, c=np.log10(f_obs))
fig.colorbar(cax, label="log10(f) (observed)")
ax.set_xlabel("angle (known)")
ax.set_ylabel("v (observed)")
fig.tight_layout()
fig.show()
plt.close(fig)

We sample these data with the following model:

# define model
with pm.Model(coords={"datum": range(num_obs), "ambiguity": [0, 1]}) as model:
    # data
    angle_observed = pm.Data("angle_observed", angle, dims="datum")
    v_observed = pm.Data("v_observed", v_true, dims="datum")
    t_observed = pm.Data("t_observed", t_obs, dims="datum")
    t_observed_error = pm.Data(
        "t_observed_error", t_err * np.ones_like(t_obs), dims="datum"
    )
    f_observed = pm.Data("f_observed", f_obs, dims="datum")
    f_observed_error = pm.Data("f_observed_error", f_error, dims="datum")

    # priors
    R_min = R0 * np.abs(np.sin(angle))
    R_min[np.cos(angle) < 0.0] = R0
    R_off = pm.HalfNormal("R_off", dims="datum")
    R = pm.Deterministic("R", R_min + R_off)
    v_sigma = pm.Exponential("v_sigma", lam=1.0)
    t_slope = pm.HalfNormal("t_slope")
    t_offset = pm.Normal("t_offset")

    w = pm.Dirichlet("w", a=np.ones(2), dims="ambiguity")
    F = pm.LogNormal("F", dims=("datum", "ambiguity"))

    # v likelihood
    v_mu = calc_v(R, angle_observed, R0=R0, A=A)
    _ = pm.Normal(
        "v",
        mu=v_mu,
        sigma=v_sigma,
        observed=v_observed,
        dims="datum",
    )

    # t likelihood
    t_mu = t_offset + t_slope * R
    _ = pm.Normal(
        "t", mu=t_mu, sigma=t_observed_error, observed=t_observed, dims="datum"
    )

    # f likelihood
    d_min = R0 * pt.abs(pt.cos(angle_observed))
    d_off = pt.sqrt(R**2.0 - R_min**2.0)
    d_0 = d_min - d_off
    d_1 = d_min + d_off
    d = pm.Deterministic("d", pt.stack([d_0, d_1]).T, dims=("datum", "ambiguity"))
    f_mu = F / d**2.0
    _ = pm.NormalMixture(
        "f",
        w=w,
        mu=f_mu,
        sigma=f_observed_error[:, None],
        observed=f_observed,
        dims="datum",
    )

Notice that we can derive an invalid value for d here: d is required to be positive, yet we allow d_0 < 0 (i.e., when cos(angle) < 0, like the red line at angle = -120 in the first post). This is fine for this model, since f depends only on d**2, but in general it will cause problems.
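
If d itself entered a likelihood directly (rather than only through d**2), one way to guard against the unphysical branch would be to clip it. A rough sketch of how those lines inside the model block could look (the 1e-6 floor is an arbitrary choice of mine, and clipping flattens the gradient there, so this is not a complete fix):

    # f likelihood (with the non-physical branch of d clipped to stay positive)
    d_min = R0 * pt.abs(pt.cos(angle_observed))
    d_off = pt.sqrt(R**2.0 - R_min**2.0)
    d_0 = pt.clip(d_min - d_off, 1e-6, np.inf)
    d_1 = d_min + d_off
    d = pm.Deterministic("d", pt.stack([d_0, d_1]).T, dims=("datum", "ambiguity"))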

The mixture model above samples slowly and fails to converge.


Regarding the multi-modality problem

Presumably, the issue is that for some data there is no ambiguity, so one component of the mixture likelihood becomes degenerate, the posterior flattens out in that direction, and NUTS gets stuck.
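
For concreteness, the subset of data with a genuine ambiguity can be identified from the geometry alone: both roots of the quadratic for d are positive only when cos(angle) > 0 and R < R0. A quick check (using the true R here purely to illustrate the point; in the real problem R is of course unknown):

# a datum admits two valid distances only toward cos(angle) > 0 and inside the R0 circle
ambiguous = (np.cos(angle) > 0.0) & (R_true < R0)
print(f"{ambiguous.sum()} of {num_obs} data have two valid d solutions")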

An attempt at allowing data-dependent mixture weights similarly fails, i.e.:

w = pm.Dirichlet("w", a=np.ones(2), dims=("datum", "ambiguity"))

I feel like there should be a simple solution, considering the following: instead of “observing” f, let’s just derive it in the model.

# define model
with pm.Model(coords={"datum": range(num_obs), "ambiguity": [0, 1]}) as model:
    # data
    angle_observed = pm.Data("angle_observed", angle, dims="datum")
    v_observed = pm.Data("v_observed", v_true, dims="datum")
    t_observed = pm.Data("t_observed", t_obs, dims="datum")
    t_observed_error = pm.Data(
        "t_observed_error", t_err * np.ones_like(t_obs), dims="datum"
    )
    f_observed = pm.Data("f_observed", f_obs, dims="datum")
    f_observed_error = pm.Data("f_observed_error", f_error, dims="datum")

    # priors
    R_min = R0 * np.abs(np.sin(angle))
    R_min[np.cos(angle) < 0.0] = R0
    R_off = pm.HalfNormal("R_off", dims="datum")
    R = pm.Deterministic("R", R_min + R_off)
    v_sigma = pm.Exponential("v_sigma", lam=1.0)
    t_slope = pm.HalfNormal("t_slope")
    t_offset = pm.Normal("t_offset")

    # w = pm.Dirichlet("w", a=np.ones(2), dims=("datum", "ambiguity"))
    F = pm.LogNormal("F", dims=("datum", "ambiguity"))

    # v likelihood
    v_mu = calc_v(R, angle_observed, R0=R0, A=A)
    _ = pm.Normal(
        "v",
        mu=v_mu,
        sigma=v_sigma,
        observed=v_observed,
        dims="datum",
    )

    # t likelihood
    t_mu = t_offset + t_slope * R
    _ = pm.Normal(
        "t", mu=t_mu, sigma=t_observed_error, observed=t_observed, dims="datum"
    )

    # f likelihood
    d_min = R0 * pt.abs(pt.cos(angle_observed))
    d_off = pt.sqrt(R**2.0 - R_min**2.0)
    d_0 = d_min - d_off
    d_1 = d_min + d_off
    d = pm.Deterministic("d", pt.stack([d_0, d_1]).T, dims=("datum", "ambiguity"))
    f_mu = pm.Deterministic("f_mu", F / d**2.0, dims=("datum", "ambiguity"))
    # _ = pm.NormalMixture(
    #     "f",
    #     w=w,
    #     mu=f_mu,
    #     sigma=f_observed_error[:, None],
    #     observed=f_observed,
    #     dims="datum",
    # )

Now we can compare the posterior samples of f_mu to the observed values and approximate the mixture weights a priori like so:

with model:
    trace = pm.sample()

idx = 3
bins = np.linspace(0.0, 10.0 * f_obs[idx], 100)
fig, ax = plt.subplots(layout="constrained")
ax.hist(
    trace.posterior["f_mu"].sel(datum=idx, ambiguity=0).data.flatten(),
    bins=bins,
    color="k",
    alpha=0.5,
    label="0",
    density=True,
)
ax.hist(
    trace.posterior["f_mu"].sel(datum=idx, ambiguity=1).data.flatten(),
    bins=bins,
    color="r",
    alpha=0.5,
    label="1",
    density=True,
)
ax.axvline(f_obs[idx], color="b", label="observed")
ax.legend(loc="best")
ax.set_xlabel("f_mu")
ax.set_ylabel("Density")
fig.show()
plt.close(fig)

One could imagine integrating the product of the likelihood and these posterior densities in order to derive the appropriate mixture weights a priori. I don’t want to do this, however, because, in reality, I want to infer the hyper-parameters of the distribution of F. It would be great if I could find a way to parameterize this model and infer everything all at once!
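
Just to make that idea concrete, here is a rough Monte Carlo sketch of that integration, using the f_mu posterior samples and the observed uncertainties from above (w_approx is my own throwaway name; this is exactly the separate step I would rather avoid):

# average the f likelihood over the posterior samples of f_mu for each branch,
# then normalize across the two branches to get approximate per-datum weights
f_mu_samples = trace.posterior["f_mu"].stack(sample=("chain", "draw"))
like = stats.norm.pdf(
    f_obs[:, None, None], loc=f_mu_samples.data, scale=f_error[:, None, None]
).mean(axis=-1)
w_approx = like / like.sum(axis=1, keepdims=True)
print(w_approx[:5])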