Generative process

Generative process#

Definitions#

We can generate participants’ biomarker data according to EBM (Event-Based Model).

\(S \sim {\rm UniformPermutation}(\cdot)\)

\(S\) follows a distribution of uniform permutation. That means the ordering of biomarkers is random.

\(k_j \sim {\rm DiscreteUniform}(N)\)

\(k_j\) follows a discrete uniform distribution, which means a participant is equally likely to fall in a progression stage (e.g., from \(0\) to \(4\), where \(0\) indicate this participant is healthy.)

\[X_{S(n)j} | S, k_j \sim I(z_j == 1) \left[ I(S(n) \leq k_j ) p(X_{S(n)j} \mid \theta_{S(n)} ) +I(S(n) \gt k_j) p(X_{S(n)j} \mid \phi_{S(n)}) \right] + \left(1-I(z_j==1) \right) p(X_{S(n)j} \mid \phi_{S(n)})\]

Note that \(\sim\) in statistics indicates that this variable follows a certain distribution. Here, it means that the biomarker values is drawn from that distribution.

Parameters#

\(z_j\): \(1\) if the participant is diseased; otherwise \(0\).

\(I(True) = 1\), \(I(False) = 0\)

\(S\) denotes the ordering of a sequence of biomarkers.

\(N\): number of observed biomarkers.

\(n\): a specific biomarker; e.g., biomarker \(b\).

\(J\): number of participants.

\(j\) denotes a participant.

\(X\) is observed values of biomarkers; it is a matrix of dimension of \(N \times J\) or \(J \times N\).

\(k\): a scalar whose value is the participant’s stage of the disease

\(K\): number of disease stages

\(S(n)\) means the disease stage that a specific biomarker \(n\) indicates.

\(k_j\): disease stage that a participant is at.

\(X_{nj}\) means the observed value of the biomarker \(n\) in participant \(j\).

\(\theta_n\) is the parameters for the probability density function (PDF) of observed value of biomarker \(n\) when this biomarker has been affected by the disease. Let’s assume this distribution is a Gaussian distribution.

\(\phi_n\) is the parameters for the probability density function (PDF) of observed value of biomarker \(n\) when this biomarker has NOT been affected by the disease.

Simulation#

We are going to generate biomarker values for each participant by randomly drawing from distributions defined by \(\theta\) or \(\phi\). We will base our generation on Chen’s paper (Figure 1).

import numpy as np
import scipy.stats as stats
import altair as alt
import pandas as pd
import matplotlib.pyplot as plt

# S_ordering = [5, 6, 3, 4, 1, 7, 8, 2, 9, 10]

def get_zscore(arr):
    mu = np.mean(arr)
    std = np.std(arr)
    return (arr - mu)/std

def min_max_scale(arr):
    arr_min = np.min(arr)
    arr_max = np.max(arr)
    scaled_arr = (arr - arr_min) / (arr_max - arr_min)
    return scaled_arr

biomarker_names = np.array([
    'MMSE', 'ADAS', 'AB', 'P-Tau', 'HIP-FCI', 
    'HIP-GMI', 'AVLT-Sum', 'PCC-FCI', 'FUS-GMI', 'FUS-FCI'])
biomarker_index_name_dict = dict(zip(range(10), biomarker_names))
S_ordering = np.array([
    'HIP-FCI', 'PCC-FCI', 'AB', 'P-Tau', 'MMSE', 'ADAS', 
    'HIP-GMI', 'AVLT-Sum', 'FUS-GMI', 'FUS-FCI'
    ])
# biomarker_names
# cyan, normal
theta_means = [28, -6, 250, -25, 5, 0.4, 40, 12, 0.6, -10]
# black, abnormal
phi_means = [22, -20, 150, -50, -5, 0.3, 20, 5, 0.5, -20]
# cyan, normal
theta_std_times_three = [2, 4, 150, 50, 5, 0.7, 45, 12, 0.2, 10]
theta_stds = [std_dev/3 for std_dev in theta_std_times_three]
# black, abnormal
phi_std_times_three = [8, 12, 50, 100, 20, 1, 20, 10, 0.2, 18]
phi_stds = [std_dev/3 for std_dev in phi_std_times_three]

# """apply z score transformation"""
# theta_means = get_zscore(theta_means)
# phi_mean = get_zscore(phi_means)
# theta_stds = get_zscore(theta_stds)
# phi_stds = get_zscore(phi_stds)

# # Apply min-max scaling
# theta_means_scaled = min_max_scale(theta_means)
# phi_means_scaled = min_max_scale(phi_means)
# theta_stds_scaled = min_max_scale(theta_stds)
# phi_stds_scaled = min_max_scale(phi_stds)

biomarker_index_name_dict

{0: 'MMSE',
'ADAS',
'AB',
'P-Tau',
'HIP-FCI',
'HIP-GMI',
'AVLT-Sum',
'PCC-FCI',
'FUS-GMI',
'FUS-FCI'}

# theta_means = [1, 3, 5, 6, 8, 0, 4, 2, 7, 9]
# theta_stds = [0.3, 0.5, 0.2, 1.3, 3.3, 2.2, 0.8, 0.9, 0.7, 0.6]
# phi_means = [32, 31, 34, 36, 38, 39, 30, 33, 35, 37]
# phi_stds = [6.3, 7.4, 9.4, 4.9, 2.5, 5.9, 6.4, 7.7, 8.0, 3.0]

def plot_distribution_pair(ax, mu1, sigma1, mu2, sigma2, title):
    xmin = min(mu1 - 4*sigma1, mu2-4*sigma2)
    xmax = max(mu1 + 4*sigma1, mu2 + 4*sigma2)
    x = np.linspace(xmin, xmax, 1000)
    y1 = stats.norm.pdf(x, loc = mu1, scale = sigma1)
    y2 = stats.norm.pdf(x, loc = mu2, scale = sigma2)
    ax.plot(x, y1, label = "Normal", color = "cyan")
    ax.plot(x, y2, label = "Abnormal", color = "black")
    ax.fill_between(x, y1, alpha = 0.3, color = "cyan")
    ax.fill_between(x, y2, alpha = 0.3, color = "black")
    ax.set_title(title)
    ax.legend()

fig, axes = plt.subplots(2, 5, figsize=(20, 10))
for i, biomarker_name in enumerate(biomarker_names):
    ax = axes.flatten()[i] 
    mu1 = theta_means[i]
    mu2 = phi_means[i]
    sigma1 = theta_stds[i]
    sigma2 = phi_stds[i]
    plot_distribution_pair(
        ax, mu1, sigma1, mu2, sigma2, title = biomarker_name)

_images/4c59b86ae514f5d0e46cad17f3594b72970d762b2a75e5b7c9eb46d561c63e12.png

for n, biomarker in enumerate(biomarker_names):
    print(n, biomarker)

MMSE
ADAS
AB
P-Tau
HIP-FCI
HIP-GMI
AVLT-Sum
PCC-FCI
FUS-GMI
FUS-FCI

def simulate_ebm(
        biomarker_names, S_ordering, J, theta_means, theta_std_dev, phi_means, phi_std_dev):
    """
    Simulate an Event-Based Model (EBM) for disease progression.
    
    Args:
    biomarker_names: Biomarker names
    S_ordering: Biomarker names ordered according to the order in which each of them get affected 
    by the disease
    J (int): Number of participants.
    
    Returns:
    tuple: A tuple containing:
        - kjs (numpy.ndarray): The disease stages for participant.
        - X (numpy.ndarray): The biomarker matrix with participant data.
          - Each cell in X is tuple containing participantId, biomarker name, biomarker value, 
            disease stage of this participant, disease stage current biomarker indicates
            and healthy status
    """
    # N (int): Number of biomarkers.
    N = len(biomarker_names)
    
    # Generate a random stage for each participant
    # The stage should be between 0 and N, inclusive
    # stage 0 indicate the participant is healthy (normal)
    kjs = np.random.randint(0, N+1, size=J)
    
    # Initiate biomarker measurement matrix (J participants x N biomarkers), 
    # with entries as None
    X = np.full((J, N), None, dtype=object)
    
    # biomarker : normal distribution
    theta_dist = {biomarker: stats.norm(
        theta_means[n], theta_std_dev[n]) for n, biomarker in enumerate(biomarker_names)}
    phi_dist = {biomarker: stats.norm(
        phi_means[n], phi_std_dev[n]) for n, biomarker in enumerate(biomarker_names)}

    # Iterate through participants
    for j in range(J):
        # Iterate through biomarkers
        for n, biomarker in enumerate(biomarker_names):
            # Disease stage of the current participant
            k_j = kjs[j]
            # Disease stage indicated by the current biomarker
            # Note that biomarkers always indicate that the participant is diseased
            # Thus, S_n >= 1
            S_n = np.where(S_ordering == biomarker)[0][0] + 1
            
            # Assign biomarker values based on the participant's disease stage 
            if k_j >= 1:
                if k_j >= S_n:
                    X[j, n] = (j, biomarker, theta_dist[biomarker].rvs(), k_j, S_n, 'affected') 
                else:
                    X[j, n] = (j, biomarker, phi_dist[biomarker].rvs(), k_j, S_n, 'not_affected')  
            # if the participant is healthy
            else:
                X[j, n] = (j, biomarker, phi_dist[biomarker].rvs(), k_j, S_n, 'not_affected')        
    return kjs, X

J = 100
kjs, X = simulate_ebm(biomarker_names, S_ordering, J, theta_means, theta_stds, phi_means, phi_stds)

theta_means, theta_stds, phi_means, phi_stds

([28, -6, 250, -25, 5, 0.4, 40, 12, 0.6, -10],
 [0.6666666666666666,
3333333333333333,
0,
666666666666668,
6666666666666667,
2333333333333333,
0,
0,
06666666666666667,
3333333333333335],
 [22, -20, 150, -50, -5, 0.3, 20, 5, 0.5, -20],
 [2.6666666666666665,
0,
666666666666668,
333333333333336,
666666666666667,
3333333333333333,
666666666666667,
3333333333333335,
06666666666666667,
0])

df_means_stds = pd.DataFrame([theta_means, theta_stds, phi_means, phi_stds]).transpose()
df_means_stds.columns = ['theta_mean', 'theta_std', 'phi_mean', 'phi_std']
df_means_stds = df_means_stds.rename_axis("biomarker", axis=0).reset_index()
df_means_stds['biomarker'] = df_means_stds.apply(lambda row: biomarker_index_name_dict[row.biomarker], axis = 1)
df_means_stds

	biomarker	theta_mean	theta_std	phi_mean	phi_std
0	MMSE	28.0	0.666667	22.0	2.666667
1	ADAS	-6.0	1.333333	-20.0	4.000000
2	AB	250.0	50.000000	150.0	16.666667
3	P-Tau	-25.0	16.666667	-50.0	33.333333
4	HIP-FCI	5.0	1.666667	-5.0	6.666667
5	HIP-GMI	0.4	0.233333	0.3	0.333333
6	AVLT-Sum	40.0	15.000000	20.0	6.666667
7	PCC-FCI	12.0	4.000000	5.0	3.333333
8	FUS-GMI	0.6	0.066667	0.5	0.066667
9	FUS-FCI	-10.0	3.333333	-20.0	6.000000

df_means_stds.to_csv('data/means_stds.csv', index=False)

X[10]

array([(10, 'MMSE', 26.616605393495316, 8, 5, 'affected'),
       (10, 'ADAS', -7.3556255818332055, 8, 6, 'affected'),
       (10, 'AB', 275.844520945162, 8, 3, 'affected'),
       (10, 'P-Tau', -56.84913883593285, 8, 4, 'affected'),
       (10, 'HIP-FCI', 7.611674349923921, 8, 1, 'affected'),
       (10, 'HIP-GMI', 0.5216743457543183, 8, 7, 'affected'),
       (10, 'AVLT-Sum', 41.68089096959266, 8, 8, 'affected'),
       (10, 'PCC-FCI', 15.028459059046519, 8, 2, 'affected'),
       (10, 'FUS-GMI', 0.487738269498379, 8, 9, 'not_affected'),
       (10, 'FUS-FCI', -20.445112160069744, 8, 10, 'not_affected')],
      dtype=object)

Please note that “not_affected” and “affected” are refering to a specific biomarker, rather than this participant’s healthy status.

df = pd.DataFrame(X, columns=biomarker_names)
df
# make this dataframe wide to long 
df_long = df.melt(var_name = "Biomarker", value_name="Value")
df_long
values_df = df_long['Value'].apply(pd.Series)
values_df.columns = ['participant', "biomarker", 'measurement', 'k_j', 'S_n', 'affected_or_not']
values_df

	participant	biomarker	measurement	k_j	S_n	affected_or_not
0	0	MMSE	28.812457	5	5	affected
1	1	MMSE	28.483480	8	5	affected
2	2	MMSE	27.483507	5	5	affected
3	3	MMSE	27.886646	9	5	affected
4	4	MMSE	27.412189	8	5	affected
...	...	...	...	...	...	...
995	95	FUS-FCI	-21.983887	1	10	not_affected
996	96	FUS-FCI	-16.649357	1	10	not_affected
997	97	FUS-FCI	-19.670933	5	10	not_affected
998	98	FUS-FCI	-18.132486	0	10	not_affected
999	99	FUS-FCI	-18.226765	1	10	not_affected

1000 rows × 6 columns

Visualizing simulated results#

With the above data structure, we can visualize the following data:

Distribution of all biomarker values by biomarker
Distribution of all biomarker values when the participant is at a certain disease stage
Comparing a certain biomarker data
A certain participant’s data

Distribution of all biomarker values by biomarker#

df = pd.DataFrame(X, columns=biomarker_names)
# make this dataframe wide to long 
df_long = df.melt(var_name = "Biomarker", value_name="Value")
values_df = df_long['Value'].apply(pd.Series)
values_df.columns = ['participant', "biomarker", 'measurement', 'k_j', 'S_n', 'affected_or_not']
df = values_df

alt.Chart(df).transform_density(
    'measurement',
    as_=['measurement', 'Density'],
    groupby=['biomarker']
).mark_area().encode(
    x="measurement:Q",
    y="Density:Q",
    facet = alt.Facet(
        "biomarker:N",
        columns = 5
    ),
    color=alt.Color(
        'biomarker:N'
    )
).properties(
    width= 140,
    height = 200,
).properties(
    title='Biomarker data for all participants across all stages'
)

df.to_csv("data/participant_data.csv", index=False)
df.head()

	participant	biomarker	measurement	k_j	S_n	affected_or_not
0	0	MMSE	28.812457	5	5	affected
1	1	MMSE	28.483480	8	5	affected
2	2	MMSE	27.483507	5	5	affected
3	3	MMSE	27.886646	9	5	affected
4	4	MMSE	27.412189	8	5	affected

# df[df.k_j == 0]

Distribution of all biomarker values when the participant is at a certain disease stage#

# biomarker data when the participant is at stage 6
df_kj_6 = df[df.k_j == 6]
df_kj_6

alt.Chart(df_kj_6).transform_density(
    'measurement',
    as_=['measurement', 'Density'],
    groupby=['biomarker']
).mark_area().encode(
    x="measurement:Q",
    y="Density:Q",
    facet = alt.Facet(
        "biomarker:N",
        columns = 5
    ),
    color=alt.Color(
        'biomarker:N'
    )
).properties(
    width= 140,
    height = 200,
).properties(
    title='Biomarker data when the participant is at stage six'
)

Comparing a certain biomarker data#

# select only biomarker 2 
bio_2_data = df[df.biomarker=='MMSE'].drop(['k_j', 'S_n', 'biomarker'], axis = 1)
# biomarker2 data, comparing from diseased and healthy groups
alt.Chart(bio_2_data).transform_density(
    'measurement',
    as_=['measurement', 'Density'],
    groupby=['affected_or_not']
).mark_area().encode(
    x="measurement:Q",
    y="Density:Q",
    facet = alt.Facet(
        "affected_or_not:N",
    ),
    color=alt.Color(
        'affected_or_not:N'
    )
).properties(
    width= 240,
    height = 200,
).properties(
    title='MMSE data, compring healthy group and diseased group'
)

A certain participant’s data#

# participant 10
participant10_data = df[df.participant == 10]
alt.Chart(participant10_data).mark_bar().encode(
    x='biomarker',
    y='measurement',
    color=alt.Color(
        'affected_or_not:N'
    ),
    tooltip=['biomarker', 'affected_or_not', 'measurement']
).interactive().properties(
    title=f'Biomarker data for participant 10 (k_j = {participant10_data.k_j.to_list()[0]})'
)