Generative process#

Definitions#

We can generate participants’ biomarker data according to EBM (Event-Based Model).

\(S \sim {\rm UniformPermutation}(\cdot)\)

\(S\) follows a distribution of uniform permutation. That means the ordering of biomarkers is random.

\(k_j \sim {\rm DiscreteUniform}(N)\)

\(k_j\) follows a discrete uniform distribution, which means a participant is equally likely to fall in a progression stage (e.g., from \(0\) to \(4\), where \(0\) indicate this participant is healthy.)

\[X_{S(n)j} | S, k_j \sim I(z_j == 1) \left[ I(S(n) \leq k_j ) p(X_{S(n)j} \mid \theta_{S(n)} ) +I(S(n) \gt k_j) p(X_{S(n)j} \mid \phi_{S(n)}) \right] + \left(1-I(z_j==1) \right) p(X_{S(n)j} \mid \phi_{S(n)})\]

Note that \(\sim\) in statistics indicates that this variable follows a certain distribution. Here, it means that the biomarker values is drawn from that distribution.

Parameters#

\(z_j\): \(1\) if the participant is diseased; otherwise \(0\).

\(I(True) = 1\), \(I(False) = 0\)

\(S\) denotes the ordering of a sequence of biomarkers.

\(N\): number of observed biomarkers.

\(n\): a specific biomarker; e.g., biomarker \(b\).

\(J\): number of participants.

\(j\) denotes a participant.

\(X\) is observed values of biomarkers; it is a matrix of dimension of \(N \times J\) or \(J \times N\).

\(k\): a scalar whose value is the participant’s stage of the disease

\(K\): number of disease stages

\(S(n)\) means the disease stage that a specific biomarker \(n\) indicates.

\(k_j\): disease stage that a participant is at.

\(X_{nj}\) means the observed value of the biomarker \(n\) in participant \(j\).

\(\theta_n\) is the parameters for the probability density function (PDF) of observed value of biomarker \(n\) when this biomarker has been affected by the disease. Let’s assume this distribution is a Gaussian distribution.

\(\phi_n\) is the parameters for the probability density function (PDF) of observed value of biomarker \(n\) when this biomarker has NOT been affected by the disease.

Simulation#

We are going to generate biomarker values for each participant by randomly drawing from distributions defined by \(\theta\) or \(\phi\). We will base our generation on Chen’s paper (Figure 1).

import numpy as np
import scipy.stats as stats
import altair as alt
import pandas as pd
import matplotlib.pyplot as plt
# S_ordering = [5, 6, 3, 4, 1, 7, 8, 2, 9, 10]
def get_zscore(arr):
    mu = np.mean(arr)
    std = np.std(arr)
    return (arr - mu)/std

def min_max_scale(arr):
    arr_min = np.min(arr)
    arr_max = np.max(arr)
    scaled_arr = (arr - arr_min) / (arr_max - arr_min)
    return scaled_arr
biomarker_names = np.array([
    'MMSE', 'ADAS', 'AB', 'P-Tau', 'HIP-FCI', 
    'HIP-GMI', 'AVLT-Sum', 'PCC-FCI', 'FUS-GMI', 'FUS-FCI'])
biomarker_index_name_dict = dict(zip(range(10), biomarker_names))
S_ordering = np.array([
    'HIP-FCI', 'PCC-FCI', 'AB', 'P-Tau', 'MMSE', 'ADAS', 
    'HIP-GMI', 'AVLT-Sum', 'FUS-GMI', 'FUS-FCI'
    ])
# biomarker_names
# cyan, normal
theta_means = [28, -6, 250, -25, 5, 0.4, 40, 12, 0.6, -10]
# black, abnormal
phi_means = [22, -20, 150, -50, -5, 0.3, 20, 5, 0.5, -20]
# cyan, normal
theta_std_times_three = [2, 4, 150, 50, 5, 0.7, 45, 12, 0.2, 10]
theta_stds = [std_dev/3 for std_dev in theta_std_times_three]
# black, abnormal
phi_std_times_three = [8, 12, 50, 100, 20, 1, 20, 10, 0.2, 18]
phi_stds = [std_dev/3 for std_dev in phi_std_times_three]

# """apply z score transformation"""
# theta_means = get_zscore(theta_means)
# phi_mean = get_zscore(phi_means)
# theta_stds = get_zscore(theta_stds)
# phi_stds = get_zscore(phi_stds)

# # Apply min-max scaling
# theta_means_scaled = min_max_scale(theta_means)
# phi_means_scaled = min_max_scale(phi_means)
# theta_stds_scaled = min_max_scale(theta_stds)
# phi_stds_scaled = min_max_scale(phi_stds)
biomarker_index_name_dict
{0: 'MMSE',
 1: 'ADAS',
 2: 'AB',
 3: 'P-Tau',
 4: 'HIP-FCI',
 5: 'HIP-GMI',
 6: 'AVLT-Sum',
 7: 'PCC-FCI',
 8: 'FUS-GMI',
 9: 'FUS-FCI'}
# theta_means = [1, 3, 5, 6, 8, 0, 4, 2, 7, 9]
# theta_stds = [0.3, 0.5, 0.2, 1.3, 3.3, 2.2, 0.8, 0.9, 0.7, 0.6]
# phi_means = [32, 31, 34, 36, 38, 39, 30, 33, 35, 37]
# phi_stds = [6.3, 7.4, 9.4, 4.9, 2.5, 5.9, 6.4, 7.7, 8.0, 3.0]
def plot_distribution_pair(ax, mu1, sigma1, mu2, sigma2, title):
    xmin = min(mu1 - 4*sigma1, mu2-4*sigma2)
    xmax = max(mu1 + 4*sigma1, mu2 + 4*sigma2)
    x = np.linspace(xmin, xmax, 1000)
    y1 = stats.norm.pdf(x, loc = mu1, scale = sigma1)
    y2 = stats.norm.pdf(x, loc = mu2, scale = sigma2)
    ax.plot(x, y1, label = "Normal", color = "cyan")
    ax.plot(x, y2, label = "Abnormal", color = "black")
    ax.fill_between(x, y1, alpha = 0.3, color = "cyan")
    ax.fill_between(x, y2, alpha = 0.3, color = "black")
    ax.set_title(title)
    ax.legend()

fig, axes = plt.subplots(2, 5, figsize=(20, 10))
for i, biomarker_name in enumerate(biomarker_names):
    ax = axes.flatten()[i] 
    mu1 = theta_means[i]
    mu2 = phi_means[i]
    sigma1 = theta_stds[i]
    sigma2 = phi_stds[i]
    plot_distribution_pair(
        ax, mu1, sigma1, mu2, sigma2, title = biomarker_name)
_images/4c59b86ae514f5d0e46cad17f3594b72970d762b2a75e5b7c9eb46d561c63e12.png
for n, biomarker in enumerate(biomarker_names):
    print(n, biomarker)
0 MMSE
1 ADAS
2 AB
3 P-Tau
4 HIP-FCI
5 HIP-GMI
6 AVLT-Sum
7 PCC-FCI
8 FUS-GMI
9 FUS-FCI
def simulate_ebm(
        biomarker_names, S_ordering, J, theta_means, theta_std_dev, phi_means, phi_std_dev):
    """
    Simulate an Event-Based Model (EBM) for disease progression.
    
    Args:
    biomarker_names: Biomarker names
    S_ordering: Biomarker names ordered according to the order in which each of them get affected 
    by the disease
    J (int): Number of participants.
    
    Returns:
    tuple: A tuple containing:
        - kjs (numpy.ndarray): The disease stages for participant.
        - X (numpy.ndarray): The biomarker matrix with participant data.
          - Each cell in X is tuple containing participantId, biomarker name, biomarker value, 
            disease stage of this participant, disease stage current biomarker indicates
            and healthy status
    """
    # N (int): Number of biomarkers.
    N = len(biomarker_names)
    
    # Generate a random stage for each participant
    # The stage should be between 0 and N, inclusive
    # stage 0 indicate the participant is healthy (normal)
    kjs = np.random.randint(0, N+1, size=J)
    
    # Initiate biomarker measurement matrix (J participants x N biomarkers), 
    # with entries as None
    X = np.full((J, N), None, dtype=object)
    
    # biomarker : normal distribution
    theta_dist = {biomarker: stats.norm(
        theta_means[n], theta_std_dev[n]) for n, biomarker in enumerate(biomarker_names)}
    phi_dist = {biomarker: stats.norm(
        phi_means[n], phi_std_dev[n]) for n, biomarker in enumerate(biomarker_names)}

    # Iterate through participants
    for j in range(J):
        # Iterate through biomarkers
        for n, biomarker in enumerate(biomarker_names):
            # Disease stage of the current participant
            k_j = kjs[j]
            # Disease stage indicated by the current biomarker
            # Note that biomarkers always indicate that the participant is diseased
            # Thus, S_n >= 1
            S_n = np.where(S_ordering == biomarker)[0][0] + 1
            
            # Assign biomarker values based on the participant's disease stage 
            if k_j >= 1:
                if k_j >= S_n:
                    X[j, n] = (j, biomarker, theta_dist[biomarker].rvs(), k_j, S_n, 'affected') 
                else:
                    X[j, n] = (j, biomarker, phi_dist[biomarker].rvs(), k_j, S_n, 'not_affected')  
            # if the participant is healthy
            else:
                X[j, n] = (j, biomarker, phi_dist[biomarker].rvs(), k_j, S_n, 'not_affected')        
    return kjs, X
J = 100
kjs, X = simulate_ebm(biomarker_names, S_ordering, J, theta_means, theta_stds, phi_means, phi_stds)
theta_means, theta_stds, phi_means, phi_stds
([28, -6, 250, -25, 5, 0.4, 40, 12, 0.6, -10],
 [0.6666666666666666,
  1.3333333333333333,
  50.0,
  16.666666666666668,
  1.6666666666666667,
  0.2333333333333333,
  15.0,
  4.0,
  0.06666666666666667,
  3.3333333333333335],
 [22, -20, 150, -50, -5, 0.3, 20, 5, 0.5, -20],
 [2.6666666666666665,
  4.0,
  16.666666666666668,
  33.333333333333336,
  6.666666666666667,
  0.3333333333333333,
  6.666666666666667,
  3.3333333333333335,
  0.06666666666666667,
  6.0])
df_means_stds = pd.DataFrame([theta_means, theta_stds, phi_means, phi_stds]).transpose()
df_means_stds.columns = ['theta_mean', 'theta_std', 'phi_mean', 'phi_std']
df_means_stds = df_means_stds.rename_axis("biomarker", axis=0).reset_index()
df_means_stds['biomarker'] = df_means_stds.apply(lambda row: biomarker_index_name_dict[row.biomarker], axis = 1)
df_means_stds
biomarker theta_mean theta_std phi_mean phi_std
0 MMSE 28.0 0.666667 22.0 2.666667
1 ADAS -6.0 1.333333 -20.0 4.000000
2 AB 250.0 50.000000 150.0 16.666667
3 P-Tau -25.0 16.666667 -50.0 33.333333
4 HIP-FCI 5.0 1.666667 -5.0 6.666667
5 HIP-GMI 0.4 0.233333 0.3 0.333333
6 AVLT-Sum 40.0 15.000000 20.0 6.666667
7 PCC-FCI 12.0 4.000000 5.0 3.333333
8 FUS-GMI 0.6 0.066667 0.5 0.066667
9 FUS-FCI -10.0 3.333333 -20.0 6.000000
df_means_stds.to_csv('data/means_stds.csv', index=False)
X[10]
array([(10, 'MMSE', 26.616605393495316, 8, 5, 'affected'),
       (10, 'ADAS', -7.3556255818332055, 8, 6, 'affected'),
       (10, 'AB', 275.844520945162, 8, 3, 'affected'),
       (10, 'P-Tau', -56.84913883593285, 8, 4, 'affected'),
       (10, 'HIP-FCI', 7.611674349923921, 8, 1, 'affected'),
       (10, 'HIP-GMI', 0.5216743457543183, 8, 7, 'affected'),
       (10, 'AVLT-Sum', 41.68089096959266, 8, 8, 'affected'),
       (10, 'PCC-FCI', 15.028459059046519, 8, 2, 'affected'),
       (10, 'FUS-GMI', 0.487738269498379, 8, 9, 'not_affected'),
       (10, 'FUS-FCI', -20.445112160069744, 8, 10, 'not_affected')],
      dtype=object)

Please note that “not_affected” and “affected” are refering to a specific biomarker, rather than this participant’s healthy status.

df = pd.DataFrame(X, columns=biomarker_names)
df
# make this dataframe wide to long 
df_long = df.melt(var_name = "Biomarker", value_name="Value")
df_long
values_df = df_long['Value'].apply(pd.Series)
values_df.columns = ['participant', "biomarker", 'measurement', 'k_j', 'S_n', 'affected_or_not']
values_df
participant biomarker measurement k_j S_n affected_or_not
0 0 MMSE 28.812457 5 5 affected
1 1 MMSE 28.483480 8 5 affected
2 2 MMSE 27.483507 5 5 affected
3 3 MMSE 27.886646 9 5 affected
4 4 MMSE 27.412189 8 5 affected
... ... ... ... ... ... ...
995 95 FUS-FCI -21.983887 1 10 not_affected
996 96 FUS-FCI -16.649357 1 10 not_affected
997 97 FUS-FCI -19.670933 5 10 not_affected
998 98 FUS-FCI -18.132486 0 10 not_affected
999 99 FUS-FCI -18.226765 1 10 not_affected

1000 rows × 6 columns

Visualizing simulated results#

With the above data structure, we can visualize the following data:

  • Distribution of all biomarker values by biomarker

  • Distribution of all biomarker values when the participant is at a certain disease stage

  • Comparing a certain biomarker data

  • A certain participant’s data

Distribution of all biomarker values by biomarker#

df = pd.DataFrame(X, columns=biomarker_names)
# make this dataframe wide to long 
df_long = df.melt(var_name = "Biomarker", value_name="Value")
values_df = df_long['Value'].apply(pd.Series)
values_df.columns = ['participant', "biomarker", 'measurement', 'k_j', 'S_n', 'affected_or_not']
df = values_df

alt.Chart(df).transform_density(
    'measurement',
    as_=['measurement', 'Density'],
    groupby=['biomarker']
).mark_area().encode(
    x="measurement:Q",
    y="Density:Q",
    facet = alt.Facet(
        "biomarker:N",
        columns = 5
    ),
    color=alt.Color(
        'biomarker:N'
    )
).properties(
    width= 140,
    height = 200,
).properties(
    title='Biomarker data for all participants across all stages'
)
df.to_csv("data/participant_data.csv", index=False)
df.head()
participant biomarker measurement k_j S_n affected_or_not
0 0 MMSE 28.812457 5 5 affected
1 1 MMSE 28.483480 8 5 affected
2 2 MMSE 27.483507 5 5 affected
3 3 MMSE 27.886646 9 5 affected
4 4 MMSE 27.412189 8 5 affected
# df[df.k_j == 0]

Distribution of all biomarker values when the participant is at a certain disease stage#

# biomarker data when the participant is at stage 6
df_kj_6 = df[df.k_j == 6]
df_kj_6

alt.Chart(df_kj_6).transform_density(
    'measurement',
    as_=['measurement', 'Density'],
    groupby=['biomarker']
).mark_area().encode(
    x="measurement:Q",
    y="Density:Q",
    facet = alt.Facet(
        "biomarker:N",
        columns = 5
    ),
    color=alt.Color(
        'biomarker:N'
    )
).properties(
    width= 140,
    height = 200,
).properties(
    title='Biomarker data when the participant is at stage six'
)

Comparing a certain biomarker data#

# select only biomarker 2 
bio_2_data = df[df.biomarker=='MMSE'].drop(['k_j', 'S_n', 'biomarker'], axis = 1)
# biomarker2 data, comparing from diseased and healthy groups
alt.Chart(bio_2_data).transform_density(
    'measurement',
    as_=['measurement', 'Density'],
    groupby=['affected_or_not']
).mark_area().encode(
    x="measurement:Q",
    y="Density:Q",
    facet = alt.Facet(
        "affected_or_not:N",
    ),
    color=alt.Color(
        'affected_or_not:N'
    )
).properties(
    width= 240,
    height = 200,
).properties(
    title='MMSE data, compring healthy group and diseased group'
)

A certain participant’s data#

# participant 10
participant10_data = df[df.participant == 10]
alt.Chart(participant10_data).mark_bar().encode(
    x='biomarker',
    y='measurement',
    color=alt.Color(
        'affected_or_not:N'
    ),
    tooltip=['biomarker', 'affected_or_not', 'measurement']
).interactive().properties(
    title=f'Biomarker data for participant 10 (k_j = {participant10_data.k_j.to_list()[0]})'
)