PyMCF:

- Compute MCF with standard errors
- Compare MCFs pointwise over time
- Compute MCF whole sample test
- Two examples: bladder cancer and field repair

PyMCF:
4ce81636 · Markus Viljanen · 4ce81636 · 4ce81636 · 4ce81636 · 4ce81636
Commit 4ce81636 authored 7 years ago by Markus Viljanen
--- a/T45.csv
+++ b/T45.csv
+Treatment group;Patient number;Event;Time;Censored
+1;1;1;0;1
+1;2;1;1;1
+1;3;1;4;1
+1;4;1;7;1
+1;5;1;10;1
+1;6;1;6;0
+1;6;2;10;1
+1;7;1;14;1
+1;8;1;18;1
+1;9;1;5;0
+1;9;2;18;1
+1;10;1;12;0
+1;10;2;16;0
+1;10;3;18;1
+1;11;1;23;1
+1;12;1;10;0
+1;12;2;15;0
+1;12;3;23;1
+1;13;1;3;0
+1;13;2;16;0
+1;13;3;23;0
+1;13;4;23;1
+1;14;1;3;0
+1;14;2;9;0
+1;14;3;21;0
+1;14;4;23;1
+1;15;1;7;0
+1;15;2;10;0
+1;15;3;16;0
+1;15;4;24;0
+1;15;5;24;1
+1;16;1;3;0
+1;16;2;15;0
+1;16;3;25;0
+1;16;4;25;1
+1;17;1;26;1
+1;18;1;1;0
+1;18;2;26;1
+1;19;1;2;0
+1;19;2;26;0
+1;19;3;26;1
+1;20;1;25;0
+1;20;2;28;1
+1;21;1;29;1
+1;22;1;29;1
+1;23;1;29;1
+1;24;1;28;0
+1;24;2;30;0
+1;24;3;30;1
+1;25;1;2;0
+1;25;2;17;0
+1;25;3;22;0
+1;25;4;30;1
+1;26;1;3;0
+1;26;2;6;0
+1;26;3;8;0
+1;26;4;12;0
+1;26;5;26;0
+1;26;6;30;1
+1;27;1;12;0
+1;27;2;15;0
+1;27;3;24;0
+1;27;4;31;1
+1;28;1;32;1
+1;29;1;34;1
+1;30;1;36;1
+1;31;1;29;0
+1;31;2;36;1
+1;32;1;37;1
+1;33;1;9;0
+1;33;2;17;0
+1;33;3;22;0
+1;33;4;24;0
+1;33;5;40;1
+1;34;1;16;0
+1;34;2;19;0
+1;34;3;23;0
+1;34;4;29;0
+1;34;5;34;0
+1;34;6;40;0
+1;34;7;40;1
+1;35;1;41;1
+1;36;1;3;0
+1;36;2;43;1
+1;37;1;6;0
+1;37;2;43;1
+1;38;1;3;0
+1;38;2;6;0
+1;38;3;9;0
+1;38;4;44;1
+1;39;1;9;0
+1;39;2;11;0
+1;39;3;20;0
+1;39;4;26;0
+1;39;5;30;0
+1;39;6;45;1
+1;40;1;18;0
+1;40;2;48;1
+1;41;1;49;1
+1;42;1;35;0
+1;42;2;51;1
+1;43;1;17;0
+1;43;2;53;1
+1;44;1;3;0
+1;44;2;15;0
+1;44;3;46;0
+1;44;4;51;0
+1;44;5;53;0
+1;44;6;53;1
+1;45;1;59;1
+1;46;1;2;0
+1;46;2;15;0
+1;46;3;24;0
+1;46;4;30;0
+1;46;5;34;0
+1;46;6;39;0
+1;46;7;43;0
+1;46;8;49;0
+1;46;9;52;0
+1;46;10;61;1
+1;47;1;5;0
+1;47;2;14;0
+1;47;3;19;0
+1;47;4;27;0
+1;47;5;41;0
+1;47;6;64;1
+1;48;1;2;0
+1;48;2;8;0
+1;48;3;12;0
+1;48;4;13;0
+1;48;5;17;0
+1;48;6;21;0
+1;48;7;33;0
+1;48;8;49;0
+1;48;9;64;1
+2;49;1;0;1
+2;50;1;2;1
+2;51;1;3;0
+2;51;2;4;0
+2;51;3;4;1
+2;52;1;4;1
+2;53;1;2;0
+2;53;2;3;0
+2;53;3;5;1
+2;54;1;7;1
+2;55;1;8;1
+2;56;1;4;0
+2;56;2;8;1
+2;57;1;3;0
+2;57;2;11;1
+2;58;1;14;1
+2;59;1;26;1
+2;60;1;29;1
+2;61;1;5;0
+2;61;2;30;1
+2;62;1;32;1
+2;63;1;33;1
+2;64;1;3;0
+2;64;2;10;0
+2;64;3;22;0
+2;64;4;26;0
+2;64;5;34;0
+2;64;6;34;1
+2;65;1;3;0
+2;65;2;9;0
+2;65;3;15;0
+2;65;4;19;0
+2;65;5;25;0
+2;65;6;37;1
+2;66;1;38;1
+2;67;1;3;0
+2;67;2;7;0
+2;67;3;12;0
+2;67;4;16;0
+2;67;5;19;0
+2;67;6;28;0
+2;67;7;34;0
+2;67;8;36;0
+2;67;9;39;0
+2;67;10;39;1
+2;68;1;40;1
+2;69;1;40;1
+2;70;1;2;0
+2;70;2;6;0
+2;70;3;10;0
+2;70;4;16;0
+2;70;5;23;0
+2;70;6;27;0
+2;70;7;36;0
+2;70;8;39;0
+2;70;9;42;0
+2;70;10;42;1
+2;71;1;45;1
+2;72;1;10;0
+2;72;2;45;1
+2;73;1;6;0
+2;73;2;20;0
+2;73;3;46;1
+2;74;1;8;0
+2;74;2;15;0
+2;74;3;18;0
+2;74;4;20;0
+2;74;5;22;0
+2;74;6;25;0
+2;74;7;38;0
+2;74;8;40;0
+2;74;9;46;1
+2;75;1;42;0
+2;75;2;48;1
+2;76;1;54;1
+2;77;1;44;0
+2;77;2;47;0
+2;77;3;54;1
+2;78;1;8;0
+2;78;2;14;0
+2;78;3;20;0
+2;78;4;25;0
+2;78;5;29;0
+2;78;6;33;0
+2;78;7;48;0
+2;78;8;49;0
+2;78;9;55;1
+2;79;1;57;1
+2;80;1;60;1
+3;81;1;1;1
+3;82;1;1;1
+3;83;1;5;0
+3;83;2;5;1
+3;84;1;9;1
+3;85;1;10;1
+3;86;1;13;1
+3;87;1;3;0
+3;87;2;14;1
+3;88;1;1;0
+3;88;2;3;0
+3;88;3;5;0
+3;88;4;7;0
+3;88;5;10;0
+3;88;6;17;1
+3;89;1;18;1
+3;90;1;17;0
+3;90;2;18;1
+3;91;1;2;0
+3;91;2;19;1
+3;92;1;17;0
+3;92;2;19;0
+3;92;3;21;1
+3;93;1;22;1
+3;94;1;25;1
+3;95;1;25;1
+3;96;1;25;1
+3;97;1;6;0
+3;97;2;12;0
+3;97;3;13;0
+3;97;4;26;1
+3;98;1;6;0
+3;98;2;27;1
+3;99;1;2;0
+3;99;2;29;1
+3;100;1;26;0
+3;100;2;35;0
+3;100;3;36;1
+3;101;1;38;1
+3;102;1;22;0
+3;102;2;23;0
+3;102;3;27;0
+3;102;4;32;0
+3;102;5;39;1
+3;103;1;4;0
+3;103;2;16;0
+3;103;3;23;0
+3;103;4;27;0
+3;103;5;33;0
+3;103;6;36;0
+3;103;7;37;0
+3;103;8;39;1
+3;104;1;24;0
+3;104;2;26;0
+3;104;3;29;0
+3;104;4;40;0
+3;104;5;40;1
+3;105;1;41;1
+3;106;1;41;1
+3;107;1;1;0
+3;107;2;27;0
+3;107;3;43;1
+3;108;1;44;1
+3;109;1;2;0
+3;109;2;20;0
+3;109;3;23;0
+3;109;4;27;0
+3;109;5;38;0
+3;109;6;44;1
+3;110;1;45;1
+3;111;1;2;0
+3;111;2;46;1
+3;112;1;46;1
+3;113;1;49;1
+3;114;1;50;1
+3;115;1;4;0
+3;115;2;24;0
+3;115;3;47;0
+3;115;4;50;1
+3;116;1;54;1
+3;117;1;38;0
+3;117;2;54;1
+3;118;1;59;1
--- a/T45process.py
+++ b/T45process.py
+# Byar, D., Blackard, C., & Veterans Administration Cooperative Urological Research Group. (1977). Comparisons of placebo, pyridoxine, and topical thiotepa in preventing recurrence of stage I bladder cancer. Urology, 10(6), 556-561.
+# https://doi.org/10.1016/0090-4295(77)90101-7
+# Download T45.man
+#	Andrews, D.F. and Herzberg, A.M. (1985). A Collection of Problems from Many Fields for the Student and Research Worker
+#	http://lib.stat.cmu.edu/datasets/Andrews/ (Table 45.1)
+# Process T45.man to
+#	list1: rows of summary table
+#	list2: rows of recurrent events at inspection table
+import re
+import numpy as np
+import pandas as pd
+# Parse T45.man. Lines alternate: even-indexed lines hold one patient's summary
+# fields, odd-indexed lines hold that patient's inspection records. The first
+# 16 characters of every line are skipped.
+list1 = []
+list2 = []
+with open('T45.man') as f:
+    for i, s in enumerate(f):
+        # Progress/debug output; the call form prints the same text on Python 2 and 3
+        print(s[9:12])
+        if i % 2 == 0:
+            # split by whitespace
+            list1.append(s[16:].split())
+        else:
+            # split 1. by two or more spaces 2. by whitespace
+            events = re.split(r'\s{2,}', s[16:])
+            events = [event.split() for event in events if event]
+            list2.append(events)
+# Table 45.1 patient statistics
+df1 = pd.DataFrame(list1, columns=['Patient number', 'Treatment group', 'Follow-up time, months', 'Survival status', 'No. of recurrences', 'Initial number', 'Initial size'])
+# Table 45.1 recurrent events at inspection: month (M), number (#), size (S)
+n_rows = len(list2)
+max_events = max(len(row) for row in list2)
+cols = [col for k in range(1, max_events+1) for col in ['M%d'%k, '#%d'%k, 'S%d'%k]]
+# Object dtype on purpose: the parsed fields are strings, and writing strings into
+# float columns relies on silent upcasting, which pandas has deprecated.
+df2 = pd.DataFrame(np.full((n_rows, 3*max_events), np.nan, dtype=object), columns=cols)
+# Distinct loop names so the table dimensions above are not overwritten
+for i, row in enumerate(list2):
+    for j, (month, number, size) in enumerate(row):
+        df2.iloc[i, j*3+0] = month
+        df2.iloc[i, j*3+1] = number
+        df2.iloc[i, j*3+2] = size
+df2 = pd.concat((df1['Patient number'], df2), axis=1)
+# Recurrent events at inspection: patient x month (M) in wide and long format
+events_wide = df2.loc[:,df2.columns.str.startswith('M') | (df2.columns == 'Patient number')]
+events_long = pd.wide_to_long(events_wide, 'M', i='Patient number', j='Event').dropna().reset_index()
+# Recurrent events at inspection: long format Treatment group, Patient number, Event, Time, Censored
+events_rec = events_long.rename(columns={'M':'Time'})
+events_rec['Censored'] = 0
+# One closing row per patient at the follow-up time, flagged Censored = 1.
+# rename() without inplace returns a new frame, so the assignments below do not
+# write into a slice of df1 (avoids SettingWithCopyWarning).
+events_end = df1[['Patient number', 'Follow-up time, months']].rename(columns={'Follow-up time, months':'Time'})
+events_end['Censored'] = 1
+# The closing row is numbered one past the patient's last recurrence, or 1 when
+# the patient has no recurrence rows at all
+events_next = events_long.groupby('Patient number')['Event'].agg(lambda ev: str(ev.astype(int).max()+1))
+events_end['Event'] = events_end['Patient number'].map(events_next).fillna('1')
+events = pd.concat((events_rec, events_end))
+events = events.merge(df1[['Patient number', 'Treatment group']], how='left', on='Patient number')
+# Final processing and saving
+events['Treatment group'] = events['Treatment group'].astype(int)
+events['Patient number'] = events['Patient number'].astype(int)
+events['Event'] = events['Event'].astype(int)
+events['Time'] = events['Time'].astype(int)
+events.sort_values(['Patient number', 'Event'], inplace=True)
+events = events[['Treatment group', 'Patient number', 'Event', 'Time', 'Censored']]
+events.to_csv('T45.csv', index=False, sep=';')
\ No newline at end of file
--- a/data_operations.py
+++ b/data_operations.py
--- a/example_bladder.py
+++ b/example_bladder.py
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from data_operations import transform_data, mcf, mcfdiff, mcfequal, logrank, plot_data, plot_datas, plot_mcf, \
+    plot_mcfs, plot_mcfdiff, print_data
+if __name__ == '__main__':
+    # Bladder Example
+    raw_data = pd.read_csv('T45.csv', sep=';')
+    # Map the CSV column names onto the names used in the rest of this script
+    raw_data.rename(columns={'Patient number': 'Sample', 'Event': 'Number', 'Censored': 'Event',
+                             'Treatment group': 'Drug'}, inplace=True)
+    raw_data['Drug'] = raw_data['Drug'].map({1: 'Placebo', 2: 'Pyridoxine', 3: 'Thiotepa'})
+    # The CSV column is a censoring flag; flip it so 'Event' is 1 for a recurrence
+    # row and 0 for the closing (censored) row
+    raw_data['Event'] = 1 - raw_data['Event']
+    # Print data set (call form of print works on both Python 2 and 3)
+    print(print_data(raw_data, fmt='{:.0f}'))
+    # Counting process format data set and whole sample MCF
+    df = transform_data(raw_data)
+    lt = mcf(df, robust=True, positive=False)
+    # Separate into Placebo & Thiotepa data sets and MCFs
+    covariate = 'Drug'
+    cohort1, cohort2 = 'Placebo', 'Thiotepa'
+    group_df = df.set_index(covariate)
+    group_lt = df.groupby(covariate).apply(lambda sfr: mcf(sfr, robust=True, positive=False))
+    # Label-based selection with .loc (.ix was removed in pandas 1.0)
+    df1, df2 = group_df.loc[cohort1], group_df.loc[cohort2]
+    lt1, lt2 = group_lt.loc[cohort1], group_lt.loc[cohort2]
+    lt_diff = mcfdiff(lt1, lt2)
+    # Plot, no stratification
+    fig, (ax1, ax2) = plt.subplots(2, 1)
+    plot_data(df, ax=ax1, alpha=0.5)
+    plot_mcf(lt, ax=ax2)
+    plt.tight_layout()
+    # Plot, stratified
+    fig, (ax1, ax2) = plt.subplots(2, 1)
+    plot_datas([(cohort1, df1), (cohort2, df2)], ax=ax1, alpha=0.5)
+    plot_mcfs([(cohort1, lt1), (cohort2, lt2)], ax=ax2)
+    # Comparison plot
+    fig, ax1 = plt.subplots(1, 1)
+    plot_mcfdiff(lt_diff, ax=ax1, label='%s vs. %s' % (cohort1, cohort2))
+    plt.tight_layout()
+    # Compute p-values
+    #p_value0 = logrank(df1, df2)
+    p_value1 = mcfequal(df1, df2)
+    p_value2 = mcfequal(df1, df2, robust=True)
+    print("p-values: %.3f (robust %.3f)" % (p_value1, p_value2))
+    plt.show()
\ No newline at end of file
--- a/example_fieldrepair.py
+++ b/example_fieldrepair.py
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from data_operations import transform_data, print_data, mcf, mcfcost, plot_data, plot_mcf
+from scipy.stats import gamma, uniform, expon, norm
+# Cook (2008, pg. 367) D.5 Artificial Field Repair Data
+# In Example 8.1 some results were presented on the analysis of an artificial dataset on field repairs. The data were
+# generated as follows. The time-homogeneous event rate for subject i was gamma distributed with mean 2 and variance 0.5
+# and this was used to generate events over (0, ti], where ti ~ Unif(1,3). At the jth event time for subject i, the cost
+# was generated independently as Cij ~ N(10, 2.52).
+def generate_data(n=134, mean_gamma=2.0, var_gamma=0.5, min_unif=1, max_unif=3, mean_norm=10, std_norm=2.52,
+                  random_state=None):
+    """Simulate recurrent events with per-event costs for ``n`` subjects.
+
+    For each subject i = 1..n: an event rate is drawn from a gamma distribution
+    with mean ``mean_gamma`` and variance ``var_gamma``, a follow-up end ``t_i``
+    is drawn uniformly on [min_unif, max_unif], and event times are built by
+    accumulating exponential gaps with that rate until ``t_i`` is exceeded.
+    Each event gets a cost drawn from a normal distribution with mean
+    ``mean_norm`` and standard deviation ``std_norm``.
+
+    NOTE(review): the comment above this function quotes the cost distribution
+    as "N(10, 2.52)"; that may be a flattened "N(10, 2.5^2)". The default
+    ``std_norm`` is left unchanged here -- confirm against the book.
+
+    :param n: number of subjects to simulate
+    :param mean_gamma: mean of the gamma-distributed event rate
+    :param var_gamma: variance of the gamma-distributed event rate
+    :param min_unif: lower bound of the follow-up end time
+    :param max_unif: upper bound of the follow-up end time
+    :param mean_norm: mean of the event cost
+    :param std_norm: standard deviation of the event cost
+    :param random_state: None (default; draws use scipy's default random state,
+        as before), an int seed, or a numpy random generator instance
+    :return: DataFrame with columns Sample, Time, Event, Cost. Event rows have
+        Event = 1 and a cost; each subject ends with one row at ``t_i`` with
+        Event = 0 and Cost = NaN
+    """
+    # An int seed must become ONE generator up front: handing the same int to
+    # every rvs() call would re-seed each draw and repeat the same numbers.
+    if isinstance(random_state, (int, np.integer)):
+        random_state = np.random.RandomState(random_state)
+    # Gamma parameters giving the requested mean and variance
+    shape = mean_gamma**2/var_gamma
+    scale = var_gamma/mean_gamma
+    rows = []
+    for i in range(1, n+1):
+        rate = gamma.rvs(shape, scale=scale, random_state=random_state)
+        t_i = uniform.rvs(random_state=random_state)*(max_unif-min_unif) + min_unif
+        t_ij = expon.rvs(scale=1./rate, random_state=random_state)
+        while t_ij <= t_i:
+            cost = norm.rvs(loc=mean_norm, scale=std_norm, random_state=random_state)
+            rows.append((i, t_ij, 1, cost))
+            t_ij += expon.rvs(scale=1./rate, random_state=random_state)
+        # Closing row: end of follow-up, no event and no cost
+        rows.append((i, t_i, 0, np.nan))
+    return pd.DataFrame(rows, columns=['Sample', 'Time', 'Event', 'Cost'])
+if __name__ == '__main__':
+    # Cook (2008, pg. 299) Example 8.1: Field Repair data
+    #   This dataset (see Appendix D) gives simulated data on unscheduled repairs
+    #   for a fleet of m = 134 large utility vehicles operated by a city. The data were
+    #   collected over a three-year period on new vehicles which were purchased and
+    #   placed in service over the first two years of the study. Time is measured in
+    #   years from the start of the study, and costs are in hundreds of dollars.
+    # Note: this run simulates 25 subjects with rate mean 1.0 and variance 1.0,
+    # not the 134-vehicle setting quoted above.
+    raw_data = generate_data(n=25, mean_gamma=1.0, var_gamma=1.0)
+    # Print data (call form of print works on both Python 2 and 3)
+    print(print_data(raw_data))
+    # Transform to counting process format
+    df = transform_data(raw_data)
+    # MCF for events (robust and naive standard errors) and MCF for costs
+    mcf1 = mcf(df, robust=True)
+    mcf2 = mcf(df, robust=False)
+    mcfc = mcfcost(df, robust=True)
+    # Plot data
+    plot_data(df)
+    # Plot MCFs: both event MCFs share the left axis, cost goes on the right
+    fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True)
+    plot_mcf(mcf2, ax=ax1, label='Events (Naive S.E.)', title='', color='grey', edge_color='grey')
+    plot_mcf(mcf1, ax=ax1, label='Events (Robust S.E.)', title='')
+    plot_mcf(mcfc, ax=ax2, label='Cost', cost=True, title='')
+    fig.suptitle('Mean Cumulative Function Plots')
+    plt.show()