Skip to content
Snippets Groups Projects
Commit 4ce81636 authored by Markus Viljanen's avatar Markus Viljanen
Browse files

PyMCF:

-Compute MCF with standard errors
-Compare MCFs pointwise over time
-Compute MCF whole sample test
-Two examples: bladder cancer and field repair
parents
Branches master
No related tags found
No related merge requests found
T45.csv 0 → 100644
Treatment group;Patient number;Event;Time;Censored
1;1;1;0;1
1;2;1;1;1
1;3;1;4;1
1;4;1;7;1
1;5;1;10;1
1;6;1;6;0
1;6;2;10;1
1;7;1;14;1
1;8;1;18;1
1;9;1;5;0
1;9;2;18;1
1;10;1;12;0
1;10;2;16;0
1;10;3;18;1
1;11;1;23;1
1;12;1;10;0
1;12;2;15;0
1;12;3;23;1
1;13;1;3;0
1;13;2;16;0
1;13;3;23;0
1;13;4;23;1
1;14;1;3;0
1;14;2;9;0
1;14;3;21;0
1;14;4;23;1
1;15;1;7;0
1;15;2;10;0
1;15;3;16;0
1;15;4;24;0
1;15;5;24;1
1;16;1;3;0
1;16;2;15;0
1;16;3;25;0
1;16;4;25;1
1;17;1;26;1
1;18;1;1;0
1;18;2;26;1
1;19;1;2;0
1;19;2;26;0
1;19;3;26;1
1;20;1;25;0
1;20;2;28;1
1;21;1;29;1
1;22;1;29;1
1;23;1;29;1
1;24;1;28;0
1;24;2;30;0
1;24;3;30;1
1;25;1;2;0
1;25;2;17;0
1;25;3;22;0
1;25;4;30;1
1;26;1;3;0
1;26;2;6;0
1;26;3;8;0
1;26;4;12;0
1;26;5;26;0
1;26;6;30;1
1;27;1;12;0
1;27;2;15;0
1;27;3;24;0
1;27;4;31;1
1;28;1;32;1
1;29;1;34;1
1;30;1;36;1
1;31;1;29;0
1;31;2;36;1
1;32;1;37;1
1;33;1;9;0
1;33;2;17;0
1;33;3;22;0
1;33;4;24;0
1;33;5;40;1
1;34;1;16;0
1;34;2;19;0
1;34;3;23;0
1;34;4;29;0
1;34;5;34;0
1;34;6;40;0
1;34;7;40;1
1;35;1;41;1
1;36;1;3;0
1;36;2;43;1
1;37;1;6;0
1;37;2;43;1
1;38;1;3;0
1;38;2;6;0
1;38;3;9;0
1;38;4;44;1
1;39;1;9;0
1;39;2;11;0
1;39;3;20;0
1;39;4;26;0
1;39;5;30;0
1;39;6;45;1
1;40;1;18;0
1;40;2;48;1
1;41;1;49;1
1;42;1;35;0
1;42;2;51;1
1;43;1;17;0
1;43;2;53;1
1;44;1;3;0
1;44;2;15;0
1;44;3;46;0
1;44;4;51;0
1;44;5;53;0
1;44;6;53;1
1;45;1;59;1
1;46;1;2;0
1;46;2;15;0
1;46;3;24;0
1;46;4;30;0
1;46;5;34;0
1;46;6;39;0
1;46;7;43;0
1;46;8;49;0
1;46;9;52;0
1;46;10;61;1
1;47;1;5;0
1;47;2;14;0
1;47;3;19;0
1;47;4;27;0
1;47;5;41;0
1;47;6;64;1
1;48;1;2;0
1;48;2;8;0
1;48;3;12;0
1;48;4;13;0
1;48;5;17;0
1;48;6;21;0
1;48;7;33;0
1;48;8;49;0
1;48;9;64;1
2;49;1;0;1
2;50;1;2;1
2;51;1;3;0
2;51;2;4;0
2;51;3;4;1
2;52;1;4;1
2;53;1;2;0
2;53;2;3;0
2;53;3;5;1
2;54;1;7;1
2;55;1;8;1
2;56;1;4;0
2;56;2;8;1
2;57;1;3;0
2;57;2;11;1
2;58;1;14;1
2;59;1;26;1
2;60;1;29;1
2;61;1;5;0
2;61;2;30;1
2;62;1;32;1
2;63;1;33;1
2;64;1;3;0
2;64;2;10;0
2;64;3;22;0
2;64;4;26;0
2;64;5;34;0
2;64;6;34;1
2;65;1;3;0
2;65;2;9;0
2;65;3;15;0
2;65;4;19;0
2;65;5;25;0
2;65;6;37;1
2;66;1;38;1
2;67;1;3;0
2;67;2;7;0
2;67;3;12;0
2;67;4;16;0
2;67;5;19;0
2;67;6;28;0
2;67;7;34;0
2;67;8;36;0
2;67;9;39;0
2;67;10;39;1
2;68;1;40;1
2;69;1;40;1
2;70;1;2;0
2;70;2;6;0
2;70;3;10;0
2;70;4;16;0
2;70;5;23;0
2;70;6;27;0
2;70;7;36;0
2;70;8;39;0
2;70;9;42;0
2;70;10;42;1
2;71;1;45;1
2;72;1;10;0
2;72;2;45;1
2;73;1;6;0
2;73;2;20;0
2;73;3;46;1
2;74;1;8;0
2;74;2;15;0
2;74;3;18;0
2;74;4;20;0
2;74;5;22;0
2;74;6;25;0
2;74;7;38;0
2;74;8;40;0
2;74;9;46;1
2;75;1;42;0
2;75;2;48;1
2;76;1;54;1
2;77;1;44;0
2;77;2;47;0
2;77;3;54;1
2;78;1;8;0
2;78;2;14;0
2;78;3;20;0
2;78;4;25;0
2;78;5;29;0
2;78;6;33;0
2;78;7;48;0
2;78;8;49;0
2;78;9;55;1
2;79;1;57;1
2;80;1;60;1
3;81;1;1;1
3;82;1;1;1
3;83;1;5;0
3;83;2;5;1
3;84;1;9;1
3;85;1;10;1
3;86;1;13;1
3;87;1;3;0
3;87;2;14;1
3;88;1;1;0
3;88;2;3;0
3;88;3;5;0
3;88;4;7;0
3;88;5;10;0
3;88;6;17;1
3;89;1;18;1
3;90;1;17;0
3;90;2;18;1
3;91;1;2;0
3;91;2;19;1
3;92;1;17;0
3;92;2;19;0
3;92;3;21;1
3;93;1;22;1
3;94;1;25;1
3;95;1;25;1
3;96;1;25;1
3;97;1;6;0
3;97;2;12;0
3;97;3;13;0
3;97;4;26;1
3;98;1;6;0
3;98;2;27;1
3;99;1;2;0
3;99;2;29;1
3;100;1;26;0
3;100;2;35;0
3;100;3;36;1
3;101;1;38;1
3;102;1;22;0
3;102;2;23;0
3;102;3;27;0
3;102;4;32;0
3;102;5;39;1
3;103;1;4;0
3;103;2;16;0
3;103;3;23;0
3;103;4;27;0
3;103;5;33;0
3;103;6;36;0
3;103;7;37;0
3;103;8;39;1
3;104;1;24;0
3;104;2;26;0
3;104;3;29;0
3;104;4;40;0
3;104;5;40;1
3;105;1;41;1
3;106;1;41;1
3;107;1;1;0
3;107;2;27;0
3;107;3;43;1
3;108;1;44;1
3;109;1;2;0
3;109;2;20;0
3;109;3;23;0
3;109;4;27;0
3;109;5;38;0
3;109;6;44;1
3;110;1;45;1
3;111;1;2;0
3;111;2;46;1
3;112;1;46;1
3;113;1;49;1
3;114;1;50;1
3;115;1;4;0
3;115;2;24;0
3;115;3;47;0
3;115;4;50;1
3;116;1;54;1
3;117;1;38;0
3;117;2;54;1
3;118;1;59;1
# Byar, D., Blackard, C., & Veterans Administration Cooperative Urological Research Group. (1977). Comparisons of placebo, pyridoxine, and topical thiotepa in preventing recurrence of stage I bladder cancer. Urology, 10(6), 556-561.
# https://doi.org/10.1016/0090-4295(77)90101-7
# Download T45.man
# Andrews, D.F. and Herzberg, A.M. (1985). A Collection of Problems from Many Fields for the Student and Research Worker
# http://lib.stat.cmu.edu/datasets/Andrews/ (Table 45.1)
# Process T45.man to
# list1: rows of summary table
# list2: rows of recurrent events at inspection table
import re
# Parse T45.man: even rows hold a patient's summary, odd rows the
# recurrence inspections for that patient (fixed-width prefix of 16 chars).
list1 = []  # rows of the summary table
list2 = []  # rows of the recurrent-events-at-inspection table
with open('T45.man') as f:
    for i, s in enumerate(f.readlines()):
        # NOTE: removed leftover Python 2 debug statement `print s[9:12]`
        if i % 2 == 0:
            # summary row: split by whitespace
            list1.append(s[16:].split())
        else:
            # inspection row: split 1. by two or more spaces 2. by whitespace
            events = re.split(r'\s{2,}', s[16:])
            events = [event.split() for event in events if event]
            list2.append(events)
# Table 45.1 patient statistics
import numpy as np
import pandas as pd
df1 = pd.DataFrame(list1, columns=['Patient number', 'Treatment group', 'Follow-up time, months', 'Survival status', 'No. of recurrences', 'Initial number', 'Initial size'])
# Table 45.1 recurrent events at inspection: month (M), number (#), size (S)
n, m = len(list2), max(len(l) for l in list2)
cols = [col for i in range(1, m+1) for col in ['M%d' % i, '#%d' % i, 'S%d' % i]]
df2 = pd.DataFrame(np.zeros((n, 3*m))*np.nan, columns=cols)
for i in range(n):
    # BUG FIX: the original unpacked into `m, n, s`, clobbering the grid
    # dimensions n and m (and the line buffer s) from the enclosing scope.
    for j, (month, number, size) in enumerate(list2[i]):
        df2.iloc[i, j*3+0] = month
        df2.iloc[i, j*3+1] = number
        df2.iloc[i, j*3+2] = size
df2 = pd.concat((df1['Patient number'], df2), axis=1)
# Recurrent events at inspection: patient x month (M) in wide and long format
events_wide = df2.loc[:, df2.columns.str.startswith('M') | (df2.columns == 'Patient number')]
events_long = pd.wide_to_long(events_wide, 'M', i='Patient number', j='Event').dropna().reset_index()
# Recurrent events at inspection: long format Treatment group, Patient number, Event, Time, Censored
events_rec = events_long.rename(columns={'M': 'Time'})
events_rec['Censored'] = 0
# .copy() so the in-place rename/assignment below acts on a real frame,
# not on a view of df1 (SettingWithCopy in the original)
events_end = df1[['Patient number', 'Follow-up time, months']].copy()
events_end.rename(columns={'Follow-up time, months': 'Time'}, inplace=True)
events_end['Censored'] = 1
# The censoring record gets the next event number after the last recurrence
events_next = events_long.groupby('Patient number')['Event'].agg(lambda s: str(s.astype(int).max()+1))
events_end['Event'] = events_end['Patient number'].map(events_next).fillna('1')
events = pd.concat((events_rec, events_end))
events = events.merge(df1[['Patient number', 'Treatment group']], how='left', on='Patient number')
# Final processing and saving
events['Treatment group'] = events['Treatment group'].astype(int)
events['Patient number'] = events['Patient number'].astype(int)
events['Event'] = events['Event'].astype(int)
events['Time'] = events['Time'].astype(int)
events.sort_values(['Patient number', 'Event'], inplace=True)
events = events[['Treatment group', 'Patient number', 'Event', 'Time', 'Censored']]
events.to_csv('T45.csv', index=False, sep=';')
\ No newline at end of file
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm, chi2
from numpy.linalg import inv
from itertools import cycle
from warnings import warn
# Marker shapes cycled through when plotting multiple cohorts on one axis
markers = ['o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'D']
#group_colors = iter(plt.cm.rainbow(np.linspace(0, 1, 2)))
#c1, c2 = next(group_colors), next(group_colors)
def parse_data(fr, integer=True):
    """Parse a 'Time.str' column ('12', '12-' for censored, '12+') into a
    numeric 'Time' and a 0/1 'Event' indicator column.

    integer: parse times as int (True) or float (False).
    A trailing '-' yields Event=0 (censored); anything else Event=1.
    Fills missing 'Cost' values with 0 when present, and returns the frame
    without the original 'Time.str' column. Mutates fr's columns in place.
    """
    # BUG FIX: raw string for the regex, and explicit regex=True -- pandas
    # >= 2.0 treats str.replace patterns literally by default, which would
    # leave the '-'/'+' suffixes in place and break the astype conversion.
    fr['Time'] = fr['Time.str'].str.replace(r'(\-|\+)', '', regex=True).astype(int if integer else float)
    fr['Event'] = np.where(fr['Time.str'].str[-1] == '-', 0, 1)
    if 'Cost' in fr:
        fr['Cost'] = fr['Cost'].fillna(0.00)
    fr = fr.drop('Time.str', axis=1)
    return fr
def transform_data(fr, format='recurrent'):
    """Convert raw event data (Sample, Time, Event[, Cost]) into counting
    process format with dY (at-risk change) and dN (event) increments.

    format='recurrent': Event 0 marks end of follow-up, Event 1 an event.
    format='single': any terminal record (Event 0 or 1) ends follow-up.
    A synthetic entry record (Time 0, Event 2, dY=+1) is added per sample;
    invalid samples are dropped with a warning.
    """
    #TODO: add custom left-truncation
    def idx_exit(fr):
        # Boolean mask of each sample's exit (end-of-follow-up) record
        if format == 'recurrent':
            return (fr['Event'] == 0)
        elif format == 'single':
            return (fr['Event'] == 0) | (fr['Event'] == 1)
    # One entry record (Time 0, Event 2) per sample, cloned from its exit
    fr_entry = fr[idx_exit(fr)].copy()
    fr_entry['Time'] = 0.0
    fr_entry['Event'] = 2
    fr = pd.concat((fr_entry, fr))
    fr.sort_values(['Sample', 'Time'] if 'Sample' in fr else 'Time', inplace=True)
    fr.index = np.arange(len(fr))
    # dY: +1 on entry, -1 on exit; dN: 1 per observed event
    fr['dY'] = 0
    fr['dN'] = 0
    fr.loc[idx_exit(fr), 'dY'] = -1
    fr.loc[fr['Event'] == 2, 'dY'] = 1
    fr.loc[fr['Event'] == 1, 'dN'] = 1
    if 'Cost' in fr:
        fr['dC'] = fr['Cost'].fillna(0)
    # Validity check: sample has entry & exit, entry < exit, all events in (entry, exit]
    def valid(sfr):
        stimes = sfr.loc[sfr['dY'] == 1, 'Time']
        etimes = sfr.loc[sfr['dY'] == -1, 'Time']
        events = sfr.loc[sfr['dN'] == 1, 'Time']
        enter, exit = stimes.iloc[0], etimes.iloc[0]
        return (len(stimes) == 1) & (len(etimes) == 1) & (enter < exit) & \
               (events > enter).all() & (events <= exit).all()
    #TODO: add population checks
    if 'Sample' in fr:
        # Ignore invalid samples and warn the user
        fr_valid = fr.groupby('Sample').filter(valid)
        if len(fr_valid) < len(fr):
            # BUG FIX: the original used `~valid(sfr)` -- bitwise-not of a
            # Python bool is -2/-1 (both truthy), so the warning listed
            # every sample instead of only the invalid ones.
            fr_invalid = fr.groupby('Sample').filter(lambda sfr: not valid(sfr))
            warn("Ignoring invalid samples:\n" + str(fr_invalid))
        fr = fr_valid
    return fr
def transform_population(fr):
    """Convert population-level counts (Time, Events, Censored[, Truncated,
    Costs]) into counting-process increments dN/dY[/dC].

    When more samples leave than enter, the shortfall is credited at time
    zero (either added to an existing Time 0 row or as a new initial row),
    i.e. those samples are assumed at risk from the start.
    """
    fr = fr.copy()
    fr.rename(columns={'Events': 'dN'}, inplace=True)
    # Entries minus exits at each recorded time
    entered = fr['Truncated'] if 'Truncated' in fr else 0
    departed = fr['Censored']
    fr['dY'] = entered - departed
    balance = fr['dY'].sum()
    if balance < 0:
        if (fr['Time'] == 0).any():
            fr.loc[fr['Time'] == 0, 'dY'] += -balance
        else:
            first_row = pd.DataFrame({'Time': 0, 'dN': 0, 'dY': -balance}, index=[0])
            fr = pd.concat((first_row, fr))
            fr.index = np.arange(len(fr))
    if 'Costs' in fr:
        fr['Costs'] = fr['Costs'].fillna(0)
        fr.rename(columns={'Costs': 'dC'}, inplace=True)
    # Raw bookkeeping columns are no longer needed
    fr.drop('Censored', axis=1, inplace=True)
    if 'Truncated' in fr:
        fr.drop('Truncated', axis=1, inplace=True)
    return fr
def population_reverse(fr):
    """Simulate per-sample event histories consistent with population-level
    counts (dN, dY per time) by assigning events to random at-risk samples.

    Returns a frame with Sample, Time, dY (+1 entry / -1 exit), dN.
    NOTE(review): uses np.random without seeding, so output is stochastic;
    np.random.choice samples with replacement, so fewer than dN distinct
    samples may receive an event at a given time -- confirm intent.
    """
    # Aggregate to one row per time point
    fr = fr.groupby('Time')[['dN','dY']].agg('sum').sort_index().reset_index()
    # Y: at-risk count entering each time; Y_next: after that time's dY
    fr['Y'] = fr['dY'].cumsum().shift(1).fillna(0).astype(int)
    fr['Y_next'] = fr['dY'].cumsum().astype(int)
    ids = pd.Index(np.arange(1, max(fr['Y']) + 1), name='Sample')
    # Sample x Time status matrix: NaN = not at risk, 0 = at risk, 1 = event
    fr_time = pd.DataFrame(np.zeros((len(ids), len(fr))) * np.nan, index=ids, columns=fr['Time'])
    for idx, (time, dN, dY, Y, Y_next) in fr.iterrows():
        status = np.zeros(len(ids)) * np.nan
        status[:int(max(Y, Y_next))] = 0
        if Y > 0:
            # distribute the dN events over the first Y (at-risk) samples
            status[np.random.choice(np.arange(Y, dtype=int), dN)] = 1
        fr_time.loc[:, time] = status
    # Long format: one row per (Sample, Time) while at risk
    fr_time = fr_time.stack()
    fr_time.name = 'dN'
    fr_time = fr_time.reset_index()
    # Entry (dY=+1) at first at-risk time, exit (dY=-1) at last
    fr_enter = fr_time.groupby('Sample')['Time'].agg('min').reset_index()
    fr_enter['dY'] = 1
    fr_enter['dN'] = 0
    fr_exit = fr_time.groupby('Sample')['Time'].agg('max').reset_index()
    fr_exit['dY'] = -1
    fr_exit['dN'] = 0
    fr_event = fr_time[fr_time['dN'] > 0].reset_index()
    fr_event['dY'] = 0
    fr_sample = pd.concat((fr_enter, fr_event, fr_exit)).sort_values(['Sample', 'Time'])
    fr_sample.index = np.arange(len(fr_sample))
    fr_sample = fr_sample[['Sample', 'Time', 'dY', 'dN']]
    return fr_sample
def population_reverse_costs(fr):
    """Like population_reverse, but also spreads the aggregate cost dC at
    each time equally (dC/dN) over the randomly chosen event samples.

    Returns a frame with Sample, Time, dY, dN, dC.
    NOTE(review): uses np.random without seeding, so output is stochastic.
    """
    fr = fr.groupby('Time')[['dC', 'dN', 'dY']].agg('sum').sort_index().reset_index()
    # Y: at-risk count entering each time; Y_next: after that time's dY
    fr['Y'] = fr['dY'].cumsum().shift(1).fillna(0).astype(int)
    fr['Y_next'] = fr['dY'].cumsum().astype(int)
    ids = pd.Index(np.arange(1, max(fr['Y']) + 1), name='Sample')
    # Two column blocks per time: costs (dC) and event indicators (dN)
    times = pd.MultiIndex.from_product([['dC', 'dN'], fr['Time']], names=['Type', 'Time'])
    fr_time = pd.DataFrame(np.zeros((len(ids), 2*len(fr))) * np.nan, index=ids, columns=times)
    for idx, (time, dC, dN, dY, Y, Y_next) in fr.iterrows():
        status = np.zeros(len(ids)) * np.nan
        status[:int(max(Y, Y_next))] = 0
        costs = status.copy()
        if Y > 0:
            # assign events (and equal cost shares) to random at-risk samples
            random_ids = np.random.choice(np.arange(Y, dtype=int), int(dN))
            status[random_ids] = 1
            costs[random_ids] = dC/dN if dN > 0 else 0
        fr_time['dN', time] = status
        fr_time['dC', time] = costs
    # Long format: one row per (Sample, Time) while at risk
    fr_time = fr_time.stack()
    fr_time = fr_time.reset_index()
    # Entry (dY=+1) at first at-risk time, exit (dY=-1) at last
    fr_enter = fr_time.groupby('Sample')['Time'].agg('min').reset_index()
    fr_enter['dY'] = 1
    fr_enter['dN'] = 0
    fr_enter['dC'] = 0
    fr_exit = fr_time.groupby('Sample')['Time'].agg('max').reset_index()
    fr_exit['dY'] = -1
    fr_exit['dN'] = 0
    fr_exit['dC'] = 0
    fr_event = fr_time[fr_time['dN'] > 0].reset_index()
    fr_event['dY'] = 0
    fr_sample = pd.concat((fr_enter, fr_event, fr_exit)).sort_values(['Sample', 'Time'])
    fr_sample.index = np.arange(len(fr_sample))
    fr_sample = fr_sample[['Sample', 'Time', 'dY', 'dN', 'dC']]
    return fr_sample
def print_data(fr, fmt='{:.2f}'):
    """Tabulate each sample's event times (censoring marked with a trailing
    '+'), one row per sample and one column per event number; when a Cost
    column is present, interleave a 'Value' row per sample as well.
    """
    fr = fr.copy()
    # Sequential event number within each sample (1, 2, ...)
    fr['Number'] = fr.groupby(['Sample'])['Sample'].transform(lambda s: range(1, len(s) + 1))
    fr['Time'] = fr['Time'].map(fmt.format) + fr['Event'].map({1: '', 0: '+'})
    if 'Cost' not in fr:
        return fr.pivot(index='Sample', columns='Number', values='Time').fillna('')
    fr['Cost'] = fr['Cost'].map(fmt.format, na_action='ignore')
    times = fr.pivot(index='Sample', columns='Number', values='Time').fillna('')
    values = fr.pivot(index='Sample', columns='Number', values='Cost').fillna('')
    table = pd.concat((times, values), axis=0, keys=['Time', 'Value'], names=['Type', 'Sample'])
    # Group the Time/Value pair of rows under each sample
    return table.swaplevel().sort_index()
def plot_data(fr, ax=None, label='', offset=0, color='black', edge_color='black', marker='o', alpha=1.0, plot_costs=True):
    """Event-history (timeline) plot: one horizontal line per sample from
    entry to exit, with a marker at each event time.

    fr: counting-process frame with Sample, Time, dY, dN (optionally dC).
    offset: vertical offset so several cohorts can be stacked on one axis.
    plot_costs: annotate events with their dC value when present.
    """
    fr = fr.copy()
    # If no axis was given, create a new figure
    if ax is None:
        plt.figure()
        ax = plt.gca()
    # If axis were given, expand to fit previous plot
    ax_xmax, ax_ymax = ax.get_xlim()[1], ax.get_ylim()[1]
    # Order samples by censoring time, top to bottom
    sample_id = fr.loc[fr['dY'] < 0].sort_values('Time')['Sample']
    sample_idx = np.arange(offset + len(sample_id), offset, -1)
    map_to_nth = dict(zip(sample_id, sample_idx))
    fr['Sample.y'] = fr['Sample'].map(map_to_nth)
    # Plot marker at each event time and line from observable start to end
    fr_events = fr[fr['dN'] > 0]
    fr_limits = fr[(fr['dY'] == 1) | (fr['dY'] == -1)]
    fr_limits = fr_limits.pivot(index='Sample.y', columns='dY', values='Time').reset_index()
    fr_limits = fr_limits.rename(columns={1: 'Time.start', -1: 'Time.end'})
    # Plot events as dots
    ax.scatter(fr_events['Time'], fr_events['Sample.y'], zorder=10, marker=marker, color=color, edgecolors='black', label=label)
    # Plot vertical lines where at risk starts and ends with horizontal line in between
    ax.hlines(fr_limits['Sample.y'], fr_limits['Time.start'], fr_limits['Time.end'], color=edge_color, label='', alpha=alpha)
    ax.scatter(fr_limits['Time.start'], fr_limits['Sample.y'], marker='|', color=edge_color, label='', alpha=alpha)
    ax.scatter(fr_limits['Time.end'], fr_limits['Sample.y'], marker='|', color=edge_color, label='', alpha=alpha)
    # Annotate events with cost
    if 'dC' in fr and plot_costs:
        for i, cost in enumerate(fr_events['dC']):
            ax.annotate("%.2f" % cost, (fr_events['Time'].iloc[i]+.02, fr_events['Sample.y'].iloc[i]+.10), fontsize=8)
    ax.set_ylim(0, max(ax_ymax, fr['Sample.y'].max()+1))
    ax.set_xlim(0, max(ax_xmax, fr['Time'].max()*1.05))
    ax.set_title('Event History Plot')
    ax.set_ylabel('Sample')
    ax.set_xlabel('Time')
    if label:
        ax.legend()
def plot_datas(group_fr, ax=None, alpha=1.0):
    """Plot event-history timelines for several cohorts stacked on one axis.

    group_fr: iterable of (cohort label, cohort frame) pairs, e.g. from a
    DataFrame.groupby. Returns the axis.
    """
    # If no axis was given, create a new figure
    if ax is None:
        plt.figure()
        ax = plt.gca()
    # Plot timelines grouped by cohorts, offset so they stack vertically
    idx, marker = 0, cycle(markers)
    for cohort, cohort_fr in group_fr:
        # BUG FIX: iterator.next() was removed in Python 3; the next()
        # builtin works on both Python 2.6+ and 3.
        plot_data(cohort_fr, ax, label=str(cohort), marker=next(marker), offset=idx, alpha=alpha)
        idx += len(cohort_fr['Sample'].unique())
    return ax
def mcf(fr_sample, confidence=0.95, robust=False, positive=False, interval=False, exact=False, Y_start=0):
    """Nelson-Aalen-type mean cumulative function estimate E[N](t).

    fr_sample: counting-process frame with Time, dN (event counts) and dY
    (at-risk changes); a Sample column is required when robust=True.
    confidence: two-sided level of the confidence limits.
    robust: per-sample sandwich variance instead of the Poisson-type one.
    positive: log-transformed limits that cannot go below zero.
    interval: mid-interval at-risk count (for interval-inspection data).
    exact: exact Poisson limits from chi-square quantiles (overrides the
    normal-approximation limits computed above).
    Y_start: number already at risk before the first recorded time.
    Returns one row per time with Y, dE[N], E[N], E[N].Var, ucl and lcl.
    """
    Z = norm.ppf((1.00+confidence)/2.0)
    fr = fr_sample.groupby('Time')[['dN', 'dY']].agg('sum').sort_index().reset_index()
    fr['N'] = fr['dN'].cumsum()
    if not interval:
        # At-risk count just before each time point (left-continuous)
        fr['Y'] = Y_start + fr['dY'].cumsum().shift(1).fillna(0)
    if interval:
        # Average of the at-risk counts entering and leaving each interval
        enter = Y_start + fr['dY'].cumsum().shift(1).fillna(0)
        exit = Y_start + fr['dY'].cumsum()
        fr['Y'] = (enter + exit) / 2.0
    fr['dE[N]'] = (fr['dN']/fr['Y']).fillna(0)
    fr['E[N]'] = fr['dE[N]'].cumsum()
    if robust:
        # Sandwich variance: sum over samples of the squared cumulative
        # weighted residual of each sample's events vs. the pooled increment
        dYi = pd.pivot_table(fr_sample, values='dY', index='Sample', columns='Time', aggfunc='sum', fill_value=0)
        dNi = pd.pivot_table(fr_sample, values='dN', index='Sample', columns='Time', aggfunc='sum', fill_value=0)
        Yi = dYi.cumsum(axis=1).shift(axis=1).fillna(0)
        Y = Yi.sum(axis=0)
        dN = dNi.sum(axis=0)
        dH = (dN/Y).fillna(0)
        wi = Yi.div(Y, axis=1).fillna(0)
        di = dNi.subtract(dH, axis=1)
        Ti = wi.multiply(di, axis=1).cumsum(axis=1)**2
        Hvar = Ti.sum(axis=0)
        fr['E[N].Var'] = Hvar.values
    else:
        # Poisson-type variance: cumulative sum of dN / Y^2
        dvar = (fr['dE[N]']/fr['Y']).fillna(0)
        fr['E[N].Var'] = dvar.cumsum()
    if not positive:
        fr['E[N].ucl'] = fr['E[N]'] + Z*np.sqrt(fr['E[N].Var'])
        fr['E[N].lcl'] = fr['E[N]'] - Z*np.sqrt(fr['E[N].Var'])
    else:
        # Log-transformed limits guaranteed to stay positive
        fr['E[N].ucl'] = np.exp(np.log(fr['E[N]']) + Z*np.sqrt(fr['E[N].Var'])/fr['E[N]']).fillna(0)
        fr['E[N].lcl'] = np.exp(np.log(fr['E[N]']) - Z*np.sqrt(fr['E[N].Var'])/fr['E[N]']).fillna(0)
    if exact:
        # Exact Poisson limits; NaN/inf from N=0 or Y=0 are zeroed below
        fr['E[N].ucl'] = 0.5*chi2.ppf((1.00+confidence)/2.0, 2*fr['N'] + 2)/fr['Y']
        fr['E[N].lcl'] = 0.5*chi2.ppf((1.00-confidence)/2.0, 2*fr['N'])/fr['Y']
    fr['E[N].ucl'].fillna(0, inplace=True)
    fr['E[N].lcl'].fillna(0, inplace=True)
    fr['E[N].ucl'] = fr['E[N].ucl'].where(fr['E[N].ucl'] != np.inf, 0)
    return fr
def mcfcost(fr_sample, mcf_compound=None, confidence=0.95, robust=False, positive=False, interval=False, Y_start=0):
    """Mean cumulative cost function estimate E[C](t) with confidence limits.

    fr_sample: counting-process frame with Time, dC (costs), dN and dY.
    mcf_compound: an mcf() result; when given, a compound estimator is used
    (mean cost per event times the event MCF) with a delta-method variance.
    Remaining parameters mirror mcf().
    """
    Z = norm.ppf((1.00+confidence)/2.0)
    fr = fr_sample.groupby('Time')[['dC', 'dN', 'dY']].agg('sum').sort_index().reset_index()
    if not interval:
        # At-risk count just before each time point (left-continuous)
        fr['Y'] = Y_start + fr['dY'].cumsum().shift(1).fillna(0)
    if interval:
        # Mid-interval at-risk count for interval-inspection data
        enter = Y_start + fr['dY'].cumsum().shift(1).fillna(0)
        exit = Y_start + fr['dY'].cumsum()
        fr['Y'] = (enter + exit) / 2.0
    fr['dE[C]'] = (fr['dC']/fr['Y']).fillna(0)
    fr['E[C]'] = fr['dE[C]'].cumsum()
    if mcf_compound is None:
        if 'Sample' in fr_sample:
            # Per-sample variance of the cost MCF (sandwich when robust)
            dYi = pd.pivot_table(fr_sample, values='dY', index='Sample', columns='Time', aggfunc='sum', fill_value=0)
            dCi = pd.pivot_table(fr_sample, values='dC', index='Sample', columns='Time', aggfunc='sum', fill_value=0)
            Yi = dYi.cumsum(axis=1).shift(axis=1).fillna(0)
            Y = Yi.sum(axis=0)
            dC = dCi.sum(axis=0)
            dEC = (dC/Y).fillna(0)
            wi = Yi.div(Y, axis=1).fillna(0)
            di = dCi.subtract(dEC, axis=1)
            Ti = wi.multiply(di, axis=1).cumsum(axis=1)**2 if robust else (wi.multiply(di, axis=1)**2).cumsum(axis=1)
            Cvar = Ti.sum(axis=0)
            fr['E[C].Var'] = Cvar.values
        else:
            # Without sample identifiers no variance can be estimated
            fr['E[C].Var'] = np.nan
    else:
        # Compound estimator: E[C](t) = E[cost per event] * E[N](t)
        cost_fr = fr_sample.loc[fr_sample['dN'] > 0, 'dC']
        N = len(cost_fr)
        E_cost = cost_fr.mean()
        Var_cost = cost_fr.var()
        fr['dE[C]'] = E_cost * mcf_compound['dE[N]'].fillna(0)
        fr['E[C]'] = fr['dE[C]'].cumsum()
        # Delta-method variance combining event-MCF and cost variability
        Var_EC = E_cost**2 * mcf_compound['E[N].Var'] + Var_cost/N * mcf_compound['E[N]']**2 #+ Varc**2/N * fr_compound['Ht.Var']
        fr['E[C].Var'] = Var_EC
    if not positive:
        fr['E[C].ucl'] = fr['E[C]'] + Z*np.sqrt(fr['E[C].Var'])
        fr['E[C].lcl'] = fr['E[C]'] - Z*np.sqrt(fr['E[C].Var'])
    else:
        # Log-transformed limits guaranteed to stay positive
        fr['E[C].ucl'] = np.exp(np.log(fr['E[C]']) + Z*np.sqrt(fr['E[C].Var'])/fr['E[C]']).fillna(0)
        fr['E[C].lcl'] = np.exp(np.log(fr['E[C]']) - Z*np.sqrt(fr['E[C].Var'])/fr['E[C]']).fillna(0)
    return fr
def plot_mcf(fr, ax=None, cost=False, label='', interval=False, color='black', edge_color='black', marker='o', CI=True,
             title='Mean Cumulative Function Plot'):
    """Step plot of an MCF estimate (events, or costs with cost=True) with
    optional dashed confidence limits.

    fr: output of mcf()/mcfcost(); interval draws a linearly interpolated
    curve instead of a right-continuous step function.
    """
    # If no axis was given, create a new figure
    if ax is None:
        plt.figure()
        ax = plt.gca()
    # If axis were given, expand to fit previous plot
    ax_xmax, ax_ymax = ax.get_xlim()[1], ax.get_ylim()[1]
    dNA = fr[fr['dN'] > 0]
    dY = fr[fr['dY'] == -1]
    # Draw MCF as a solid step-plot with dots at events and '+' at exits
    key = 'E[N]' if not cost else 'E[C]'
    drawstyle = 'steps-post' if not interval else 'default'
    if not interval:
        ax.plot(dNA['Time'], dNA[key], marker=marker, linestyle='', color=color, label=label)
        ax.plot(dY['Time'], dY[key], marker='+', linestyle='', color=edge_color, label='')
    else:
        ax.plot(fr['Time'], fr[key], marker=marker, linestyle='', color=edge_color, label=label)
    ax.plot(fr['Time'], fr[key], marker='', linestyle='-', drawstyle=drawstyle, color=edge_color, label='')
    # Draw confidence intervals as dashed step-plots
    if CI:
        ax.plot(fr['Time'], fr[key+'.ucl'], marker='', linestyle='--', drawstyle=drawstyle, color=edge_color, label='')
        ax.plot(fr['Time'], fr[key+'.lcl'], marker='', linestyle='--', drawstyle=drawstyle, color=edge_color, label='')
    ax.set_xlim(0, max(ax_xmax, fr['Time'].max()*1.05))
    ax.set_ylim(0, max(ax_ymax, fr[key+'.ucl'].max()*1.1))
    ax.set_title(title)
    ax.set_ylabel('Mean Number of Events' if not cost else 'Mean Cost')
    ax.set_xlabel('Time')
    if label:
        ax.legend()
    ax.grid(True)
def plot_mcfs(group_fr, ax=None, cost=False, robust=False, positive=False, interval=False, CI=True):
    """Plot MCF estimates for several cohorts on one axis.

    group_fr: iterable of (cohort label, mcf()/mcfcost() result) pairs.
    Returns the axis. (robust/positive are accepted for interface
    compatibility but not used here.)
    """
    # If no axis was given, create a new figure
    if ax is None:
        plt.figure()
        ax = plt.gca()
    # Plot timelines grouped by cohorts
    idx, marker = 0, cycle(markers)
    for cohort, cohort_fr in group_fr:
        # BUG FIX: iterator.next() was removed in Python 3; the next()
        # builtin works on both Python 2.6+ and 3.
        plot_mcf(cohort_fr, ax, cost=cost, label=str(cohort) if cohort else '', marker=next(marker), interval=interval, CI=CI)
    return ax
def mcfdiff(mcf1, mcf2, confidence=0.95, bonferroni_correction=2):
    """Pointwise difference of two MCF estimates with confidence limits.

    mcf1, mcf2: outputs of mcf() for the two cohorts.
    bonferroni_correction: when comparing more than two cohorts pairwise,
    pass the number of cohorts so the limits are widened accordingly.
    Returns a frame over the union of the two time grids with E[N].diff,
    E[N].diff.Var and upper/lower confidence limits.
    """
    if bonferroni_correction > 2:
        n_comparisons = bonferroni_correction*(bonferroni_correction-1)/2
        confidence = 1.00 - (1.00 - confidence)/float(n_comparisons)
    Z = norm.ppf((1.00+confidence)/2.0)
    left, right = mcf1.set_index('Time'), mcf2.set_index('Time')
    grid = left.index.union(right.index)
    # Flag times where either cohort had an event / an at-risk change
    had_event = (left['dN'].reindex(grid, fill_value=0) != 0) | (right['dN'].reindex(grid, fill_value=0) != 0)
    had_exit = (left['dY'].reindex(grid, fill_value=0) != 0) | (right['dY'].reindex(grid, fill_value=0) != 0)
    # Step functions are carried forward onto the common grid
    diff = left['E[N]'].reindex(grid, method='ffill') - right['E[N]'].reindex(grid, method='ffill')
    diff_var = left['E[N].Var'].reindex(grid, method='ffill') + right['E[N].Var'].reindex(grid, method='ffill')
    out = pd.DataFrame({'dN': had_event, 'dY': had_exit, 'E[N].diff': diff, 'E[N].diff.Var': diff_var})
    half_width = Z*np.sqrt(out['E[N].diff.Var'])
    out['E[N].diff.ucl'] = out['E[N].diff'] + half_width
    out['E[N].diff.lcl'] = out['E[N].diff'] - half_width
    return out.reset_index()
def plot_mcfdiff(fr, ax=None, label='', interval=False, CI=True, xlabel=True, ylabel=True, title=True,
                 color='black', edge_color='black', marker='o'):
    """Plot an MCF difference (mcfdiff output) with confidence limits.

    In mcfdiff output dN/dY are boolean flags for event / at-risk-change
    times pooled over both cohorts; they control dot and '+' markers here.
    """
    # If no axis was given, create a new figure
    if ax is None:
        plt.figure()
        ax = plt.gca()
    # If axis were given, expand to fit previous plot
    ax_xmax, ax_ymin, ax_ymax = ax.get_xlim()[1], ax.get_ylim()[0], ax.get_ylim()[1]
    dNA = fr[fr['dN'] == True]
    dY = fr[fr['dY'] == True]
    drawstyle = 'steps-post' if not interval else 'default'
    ax.plot(dNA['Time'], dNA['E[N].diff'], marker='o', linestyle='', color=color, label=label)
    ax.plot(dY['Time'], dY['E[N].diff'], marker='+', linestyle='', color=edge_color, label='')
    ax.plot(fr['Time'], fr['E[N].diff'], marker='', linestyle='-', drawstyle=drawstyle, color=edge_color, label='')
    if CI:
        ax.plot(fr['Time'], fr['E[N].diff.ucl'], marker='', linestyle='--', drawstyle=drawstyle, color=edge_color, label='')
        ax.plot(fr['Time'], fr['E[N].diff.lcl'], marker='', linestyle='--', drawstyle=drawstyle, color=edge_color, label='')
    # Expand axis limits with a margin; differences can be negative
    lim_ucl = fr['E[N].diff.ucl'].max()
    lim_lcl = fr['E[N].diff.lcl'].min()
    ymin, ymax = lim_lcl*1.1 if lim_lcl < 0 else lim_lcl*0.9, lim_ucl*1.1 if lim_ucl > 0 else lim_ucl*0.9
    xmax = fr['Time'].max()*1.05
    ax.set_ylim(min(ymin, ax_ymin), max(ymax, ax_ymax))
    ax.set_xlim(0, max(xmax, ax_xmax))
    if title: ax.set_title('Mean Cumulative Difference Plot')
    if ylabel: ax.set_ylabel('Mean Difference of Events')
    if xlabel: ax.set_xlabel('Time')
    if label:
        ax.legend()
    ax.grid()
def plot_mcfdiffs(group_fr, axs=None, robust=False, positive=False, interval=False):
    """Plot a grid of pairwise MCF-difference panels.

    group_fr: list of rows, each a list of (cohort_1, cohort_2, diff frame
    or None) triples; None entries leave the panel empty. Returns axs.
    """
    # If no axis was given, create a new figure
    nrows, ncols = len(group_fr), len(group_fr[0])
    if axs is None:
        fig, axs = plt.subplots(nrows, ncols, sharex=True, sharey=True)
        fig.subplots_adjust(wspace=0, hspace=0)
    # Plot timelines grouped by cohorts
    i, j = 0, 0
    idx, marker = 0, cycle(markers)
    for row in group_fr:
        for cohort_1, cohort_2, cohort_fr in row:
            # BUG FIX: `if cohort_fr:` raises ValueError on a DataFrame
            # (ambiguous truth value); test for None explicitly instead.
            if cohort_fr is not None:
                # BUG FIX: iterator.next() was removed in Python 3
                plot_mcfdiff(cohort_fr, axs[i][j], label=str(cohort_1) + ' vs. ' + str(cohort_2),
                             marker=next(marker), CI=(i != j), title=(i == 0), xlabel=(i == nrows-1), ylabel=(j == 0),
                             interval=interval)
            j += 1
        j = 0
        i += 1
    return axs
def var(fr, ws=None):
    """Poisson-type variance of the MCF/Nelson-Aalen estimate.

    fr: frame (or mapping) with 'dN' (events) and 'dY' (at-risk changes).
    ws: optional weight series; when given, both series are aligned to its
    index (gaps treated as zero) and each increment is weighted by ws**2.
    Returns the cumulative variance series.
    """
    events, risk_delta = fr['dN'], fr['dY']
    if ws is not None:
        # Align to the weight index so weighted terms line up
        risk_delta = risk_delta.reindex(ws.index, fill_value=0)
        events = events.reindex(ws.index, fill_value=0)
    # Number at risk just before each time point
    at_risk = risk_delta.cumsum().shift(1).fillna(0)
    increment = (events/at_risk).fillna(0)
    term = (increment/at_risk).fillna(0)
    if ws is not None:
        term = term.multiply(ws**2)
    return term.cumsum()
def robust_var(fr_sample, ws=None):
    """Robust (sandwich-type) variance of the MCF estimate: sum over
    samples of each sample's squared cumulative weighted residual.

    fr_sample: per-sample counting-process frame (Sample, Time, dN, dY).
    ws: optional weight series indexed by time (e.g. from mcfequal).
    """
    #TODO: possible to combine?
    # Sample x Time matrices of at-risk changes and event counts
    dYi = pd.pivot_table(fr_sample, values='dY', index='Sample', columns='Time', aggfunc='sum', fill_value=0)
    dNi = pd.pivot_table(fr_sample, values='dN', index='Sample', columns='Time', aggfunc='sum', fill_value=0)
    if not ws is None:
        # Align columns to the weight index so weighted terms line up
        dYi = dYi.reindex(columns=ws.index, fill_value=0)
        dNi = dNi.reindex(columns=ws.index, fill_value=0)
    # Per-sample at-risk indicator just before each time point
    Yi = dYi.cumsum(axis=1).shift(axis=1).fillna(0)
    dN = dNi.sum(axis=0)
    Y = Yi.sum(axis=0)
    dH = (dN/Y).fillna(0)
    # Residual of each sample's events against the pooled increment
    di = dNi.subtract(dH, axis=1)
    wi = Yi.div(Y, axis=1).fillna(0)
    if not ws is None:
        wi = wi.multiply(ws, axis=1)
    Ti = wi.multiply(di, axis=1).cumsum(axis=1)**2
    Hvar = Ti.sum(axis=0)
    return Hvar
'''
def var(mcf, ws=None):
dN = mcf.set_index('Time')['dN']
dY = mcf.set_index('Time')['dY']
if ws is not None:
dY = dY.reindex(ws.index, fill_value=0)
dN = dN.reindex(ws.index, fill_value=0)
Y = dY.cumsum().shift(1).fillna(0)
dH = (dN/Y).fillna(0)
T = (dH/Y).fillna(0)
if not ws is None:
T = T.multiply(ws**2)
Hvar = T.cumsum()
return Hvar
def robust_var(fr_sample, ws=None):
fr = fr_sample.groupby('Time')[['dN', 'dY']].agg('sum').sort_index().reset_index()
fr['Y'] = fr['dY'].cumsum().shift(1).fillna(0)
fr['ht'] = (fr['dN']/fr['Y']).fillna(0)
def sample_var(gfr, Y, dH):
dYi = gfr['dY'].reindex(ws.index).fillna(0)
dNi = gfr['dN'].reindex(ws.index).fillna(0)
Yi = dYi.cumsum().shift().fillna(0)
wi = Yi / Y
wi = wi * ws
di = dNi - dH
T = (wi * di).cumsum()**2
return T
df_sample = fr_sample.set_index('Time')
Y = fr.set_index('Time')['Y']
dH = fr.set_index('Time')['ht']
T_var = pd.Series(np.zeros_like(Y), index=Y.index)
i = 0
for sample, gfr in df_sample.groupby('Sample'):
if i % 100 == 0: print i
T_var += sample_var(gfr, Y, dH)
i += 1
print
'''
def fr_agg(fr_sample):
    """Aggregate a counting-process frame to one row per time point with
    cumulative events N, at-risk count Y, and MCF increment dE[N]."""
    agg = fr_sample.groupby('Time')[['dN', 'dY']].agg('sum').sort_index().reset_index()
    agg['N'] = agg['dN'].cumsum()
    # Y is the at-risk count just before each time (left-continuous)
    agg['Y'] = agg['dY'].cumsum().shift(1).fillna(0)
    agg['dE[N]'] = (agg['dN']/agg['Y']).fillna(0)
    return agg
def mcfequal(fr_sample1, fr_sample2, confidence=0.95, robust=False):
    """Two-sample test of equal MCFs over the whole follow-up.

    Weighted score U = sum w*(dE[N]_1 - dE[N]_2) with w = Y1*Y2/(Y1+Y2);
    U^2 standardized by its variance is referred to chi-square(1).
    robust: sandwich variance of U instead of the Poisson-type variance.
    Returns the p-value.
    """
    #TODO: drop Y = 0, compare multiple
    Z = norm.ppf((1.00+confidence)/2.0)  # NOTE(review): computed but unused
    fr1 = fr_agg(fr_sample1)
    fr2 = fr_agg(fr_sample2)
    fr1 = fr1.set_index('Time')
    fr2 = fr2.set_index('Time')
    idx = fr1.index.union(fr2.index)
    # At-risk counts and MCF increments of both cohorts on the common grid
    Y1 = fr1['dY'].reindex(idx, fill_value=0).cumsum().shift(1).fillna(0)
    Y2 = fr2['dY'].reindex(idx, fill_value=0).cumsum().shift(1).fillna(0)
    h1 = fr1['dE[N]'].reindex(idx, fill_value=0)
    h2 = fr2['dE[N]'].reindex(idx, fill_value=0)
    w = (Y1 * Y2/(Y1 + Y2)).fillna(0)
    U_score = (w * (h1 - h2)).sum()
    if not robust:
        U_var = var(fr1, w).iloc[-1] + var(fr2, w).iloc[-1] #(w**2 * (h1/Y1 + h2/Y2)).sum()
    else:
        U_var = robust_var(fr_sample1, w).iloc[-1] + robust_var(fr_sample2, w).iloc[-1]
    p_value = chi2.sf(U_score**2 / U_var, 1)
    return p_value
def mcfequal_k(*fr_samples):
    """K-sample test of equal MCFs with a robust per-sample covariance.

    Each positional argument is one cohort's counting-process frame
    (Sample, Time, dN, dY). The score vector sums to zero, so the last
    cohort is dropped and the quadratic form is referred to chi-square(k-1).
    Returns the p-value.
    """
    confidence = 0.95
    Z = norm.ppf((1.00+confidence)/2.0)
    k = len(fr_samples)
    cohorts = range(k)
    fr_aggs = [fr_agg(fr_cohort).set_index('Time') for fr_cohort in fr_samples]
    fr = {cohort: fr_cohort.set_index('Time') for cohort, fr_cohort in zip(cohorts, fr_samples)}
    # Cohort-level event / at-risk processes on the common time grid
    dY = pd.DataFrame({idx: fr_agg['dY'] for idx, fr_agg in zip(cohorts, fr_aggs)}).fillna(0)
    dN = pd.DataFrame({idx: fr_agg['dN'] for idx, fr_agg in zip(cohorts, fr_aggs)}).fillna(0)
    dN_ = dN.sum(axis=1)
    Y = dY.cumsum(axis=0).shift(1, axis=0).fillna(0)
    Y_ = Y.sum(axis=1)
    # Score per cohort: observed events minus expectation under a common MCF
    # (NOTE: rebinds the normal quantile Z above, which was unused)
    Z = np.array([(Y[cohort] * (dN[cohort] / Y[cohort] - dN_ / Y_)).sum() for cohort in cohorts])
    V = np.zeros((k, k))
    for cohort_i in cohorts:
        for cohort_j in cohorts:
            var = 0.0
            for cohort in cohorts:
                # Contrast coefficients for cohorts i and j at each time
                c_i = int(cohort == cohort_i) - Y[cohort_i] / Y_
                c_j = int(cohort == cohort_j) - Y[cohort_j] / Y_
                # Accumulate each sample's contribution to cov(Z_i, Z_j)
                for sample, fr_sample in fr[cohort].groupby('Sample'):
                    fr_sample = fr_sample.groupby(level=0).agg('sum')
                    Y_sample = fr_sample['dY'].reindex(Y.index, fill_value=0).cumsum().shift(1).fillna(0)
                    dN_sample = fr_sample['dN'].reindex(Y.index, fill_value=0)
                    var_sample = (Y_sample * c_i * (dN_sample - dN[cohort] / Y[cohort])).sum()*\
                                 (Y_sample * c_j * (dN_sample - dN[cohort] / Y[cohort])).sum()
                    var += var_sample
            V[cohort_i, cohort_j] = var
    # Drop the last cohort: the scores sum to zero so V would be singular
    Z = Z[:-1]
    V = V[:-1,:-1]
    score = np.dot(np.dot(Z, inv(V)), Z)
    p_value = chi2.sf(score, k - 1)
    return p_value
def logrank(fr_sample1, fr_sample2):
    """Two-sample log-rank-type test for recurrent-event data.

    Weighted score U = sum w1*(dE[N]_1 - dE[N]_2) with w1 = Y1*Y2/(Y1+Y2)
    and a variance built from the pooled event counts; U^2/Var(U) is
    referred to chi-square(1). Returns the p-value.
    """
    confidence = 0.95
    Z = norm.ppf((1.00+confidence)/2.0)  # NOTE(review): computed but unused
    fr1 = fr_agg(fr_sample1)
    fr2 = fr_agg(fr_sample2)
    fr1 = fr1.set_index('Time')
    fr2 = fr2.set_index('Time')
    idx = fr1.index.union(fr2.index)
    # At-risk counts, increments and event counts on the common time grid
    Y1 = fr1['dY'].reindex(idx, fill_value=0).cumsum().shift(1).fillna(0)
    Y2 = fr2['dY'].reindex(idx, fill_value=0).cumsum().shift(1).fillna(0)
    h1 = fr1['dE[N]'].reindex(idx, fill_value=0)
    h2 = fr2['dE[N]'].reindex(idx, fill_value=0)
    dN1 = fr1['dN'].reindex(idx, fill_value=0)
    dN2 = fr2['dN'].reindex(idx, fill_value=0)
    w1 = (Y1 * Y2/(Y1 + Y2)).fillna(0)
    w2 = ((Y1 * Y2)/(Y1 + Y2)**2).fillna(0)
    U_score = (w1 * (h1 - h2)).sum()
    U_var = (w2 * (dN1 + dN2)).sum()
    score = U_score**2 / U_var
    p_value = chi2.sf(score, 1)
    return p_value
def logrank_k(*fr_samples):
    """K-sample log-rank-type test for recurrent-event data.

    Only times where every cohort has samples at risk contribute (K
    indicator). The score vector sums to zero, so the last cohort is
    dropped and the quadratic form is referred to chi-square(k-1).
    Returns the p-value.
    """
    confidence = 0.95
    Z = norm.ppf((1.00+confidence)/2.0)
    k = len(fr_samples)
    cohorts = range(k)
    fr_aggs = [fr_agg(fr_cohort).set_index('Time') for fr_cohort in fr_samples]
    # Cohort-level event / at-risk processes on the common time grid
    dY = pd.DataFrame({idx: fr_agg['dY'] for idx, fr_agg in zip(cohorts, fr_aggs)}).fillna(0)
    dN = pd.DataFrame({idx: fr_agg['dN'] for idx, fr_agg in zip(cohorts, fr_aggs)}).fillna(0)
    dN_ = dN.sum(axis=1)
    Y = dY.cumsum(axis=0).shift(1, axis=0).fillna(0)
    Y_ = Y.sum(axis=1)
    # Indicator of times where all cohorts are simultaneously at risk
    K = Y.all(axis=1).astype(int)
    # Observed minus expected events per cohort (rebinds the unused Z above)
    Z = np.array([(K * (dN[cohort] - Y[cohort] * (dN_ / Y_))).sum() for cohort in cohorts])
    # Hypergeometric-style covariance matrix of the score vector
    V = np.array([[(K * Y[cohort_i] / Y_ * (int(cohort_i == cohort_j) - Y[cohort_j] / Y_) * dN_).sum()
                   for cohort_j in cohorts]
                  for cohort_i in cohorts])
    # Drop the last cohort: the scores sum to zero so V would be singular
    Z = Z[:-1]
    V = V[:-1,:-1]
    score = np.dot(np.dot(Z, inv(V)), Z)
    p_value = chi2.sf(score, k - 1)
    return p_value
# List of [(cohort tuple, cohort dataframe), ...] as defined by covariates
# group_fr = fr.groupby(covariates) if covariates else [('', fr)]
if __name__ == '__main__':
    # Library module: nothing to run when executed directly
    pass
\ No newline at end of file
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from data_operations import transform_data, mcf, mcfdiff, mcfequal, logrank, plot_data, plot_datas, plot_mcf, \
plot_mcfs, plot_mcfdiff, print_data
if __name__ == '__main__':
    # Bladder Example: Byar (1977) stage I bladder cancer recurrences
    raw_data = pd.read_csv('T45.csv', sep=';')
    raw_data.rename(columns={'Patient number': 'Sample', 'Event': 'Number', 'Censored': 'Event',
                             'Treatment group': 'Drug'}, inplace=True)
    raw_data['Drug'] = raw_data['Drug'].map({1: 'Placebo', 2: 'Pyridoxine', 3: 'Thiotepa'})
    # In the CSV 'Censored' == 1 marks end of follow-up; flip so Event == 1
    # marks a recurrence and Event == 0 the censoring record
    raw_data['Event'] = 1 - raw_data['Event']
    # Print data set (BUG FIX: Python 2 print statement -> print() call)
    print(print_data(raw_data, fmt='{:.0f}'))
    # Counting process format data set and whole sample MCF
    df = transform_data(raw_data)
    lt = mcf(df, robust=True, positive=False)
    # Separate into Placebo & Thiotepa data sets and MCFs
    covariate = 'Drug'
    cohort1, cohort2 = 'Placebo', 'Thiotepa'
    group_df = df.set_index(covariate)
    group_lt = df.groupby(covariate).apply(lambda sfr: mcf(sfr, robust=True, positive=False))
    # BUG FIX: .ix was deprecated and removed from pandas; .loc is the
    # label-based replacement
    df1, df2 = group_df.loc[cohort1], group_df.loc[cohort2]
    lt1, lt2 = group_lt.loc[cohort1], group_lt.loc[cohort2]
    lt_diff = mcfdiff(lt1, lt2)
    # Plot, no stratification
    fig, (ax1, ax2) = plt.subplots(2, 1)
    plot_data(df, ax=ax1, alpha=0.5)
    plot_mcf(lt, ax=ax2)
    plt.tight_layout()
    # Plot, stratified
    fig, (ax1, ax2) = plt.subplots(2, 1)
    plot_datas([(cohort1, df1), (cohort2, df2)], ax=ax1, alpha=0.5)
    plot_mcfs([(cohort1, lt1), (cohort2, lt2)], ax=ax2)
    # Comparison plot
    fig, ax1 = plt.subplots(1, 1)
    plot_mcfdiff(lt_diff, ax=ax1, label='%s vs. %s' % (cohort1, cohort2))
    plt.tight_layout()
    # Compute p-values
    #p_value0 = logrank(df1, df2)
    p_value1 = mcfequal(df1, df2)
    p_value2 = mcfequal(df1, df2, robust=True)
    print("p-values: %.3f (robust %.3f)" % (p_value1, p_value2))
    plt.show()
\ No newline at end of file
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from data_operations import transform_data, print_data, mcf, mcfcost, plot_data, plot_mcf
from scipy.stats import gamma, uniform, expon, norm
# Cook (2008, pg. 367) D.5 Artificial Field Repair Data
# In Example 8.1 some results were presented on the analysis of an artificial dataset on field repairs. The data were
# generated as follows. The time-homogeneous event rate for subject i was gamma distributed with mean 2 and variance 0.5
# and this was used to generate events over (0, ti], where ti ~ Unif(1,3). At the jth event time for subject i, the cost
# was generated independently as Cij ~ N(10, 2.52).
def generate_data(n=134, mean_gamma=2.0, var_gamma=0.5, min_unif=1, max_unif=3, mean_norm=10, std_norm=2.52):
    """Simulate the artificial field-repair data of Cook (2008, App. D.5).

    For each of n subjects: draw a gamma-distributed event rate (with the
    requested mean and variance), a uniform follow-up time on
    (min_unif, max_unif), exponential inter-event times at that rate, and
    a normal cost per event. Returns a DataFrame with columns Sample,
    Time, Event, Cost; the last row per subject is the censoring record
    (Event=0, Cost=NaN).
    """
    records = []
    for subject in range(1, n+1):
        # Gamma(shape, scale) parameterized to match the given mean/variance
        event_rate = gamma.rvs(mean_gamma**2/var_gamma, scale=var_gamma/mean_gamma)
        follow_up = uniform.rvs()*(max_unif-min_unif) + min_unif
        event_time = expon.rvs(scale=1./event_rate)
        while event_time <= follow_up:
            event_cost = norm.rvs(loc=mean_norm, scale=std_norm)
            records.append((subject, event_time, 1, event_cost))
            event_time += expon.rvs(scale=1./event_rate)
        # Censoring record at the end of the subject's follow-up
        records.append((subject, follow_up, 0, np.nan))
    return pd.DataFrame(records, columns=['Sample', 'Time', 'Event', 'Cost'])
if __name__ == '__main__':
    # Cook (2008, pg. 299) Example 8.1: Field Repair data
    # This dataset (see Appendix D) gives simulated data on unscheduled repairs
    # for a fleet of m = 134 large utility vehicles operated by a city. The data were
    # collected over a three-year period on new vehicles which were purchased and
    # placed in service over the first two years of the study. Time is measured in
    # years from the start of the study, and costs are in hundreds of dollars.
    raw_data = generate_data(n=25, mean_gamma=1.0, var_gamma=1.0)
    # Print data (BUG FIX: Python 2 print statement -> print() call)
    print(print_data(raw_data))
    # Transform to counting process format
    df = transform_data(raw_data)
    # MCF for events and MCF for costs
    mcf1 = mcf(df, robust=True)
    mcf2 = mcf(df, robust=False)
    mcfc = mcfcost(df, robust=True)
    # Plot data
    plot_data(df)
    # Plot MCFs
    fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True)
    plot_mcf(mcf2, ax=ax1, label='Events (Naive S.E.)', title='', color='grey', edge_color='grey')
    plot_mcf(mcf1, ax=ax1, label='Events (Robust S.E.)', title='')
    plot_mcf(mcfc, ax=ax2, label='Cost', cost=True, title='')
    fig.suptitle('Mean Cumulative Function Plots')
    plt.show()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment