Code
import numpy as np
import pandas as pd
import plotly.express as px
from itertools import product
from scipy import integrate
import scipy
import matplotlib.pyplot as pltIn the third lecture block for “Finanz und Versicherungstechnik” (Finance & Insurance), we learnt about survival analysis. To solidify our understanding about it, the Professor provided us a lifetable consisting 2 columns of age and qx (probability of mortality for each specific age). The dataset can be found here on page 38.
The goal is to answer the following questions:
import numpy as np
import pandas as pd
import plotly.express as px
from itertools import product
from scipy import integrate
import scipy
import matplotlib.pyplot as pltThe dataset has the following shape:
raw_data = pd.read_csv("DAV2008T.csv", sep=";")
raw_data.shape(122, 16)
Get all column names:
raw_data.columnsIndex(['Alter', 'A', 'E', 'C', 'B', 'F', 'D', 'G', 'M', 'J', 'H', 'N', 'K',
'I', 'O', 'L'],
dtype='object')
Based on this document on page 38, we need only the following data from columns:
data = raw_data.loc[:,['Alter', 'A']]
data.head()| Alter | A | |
|---|---|---|
| 0 | 0 | 0.006113 |
| 1 | 1 | 0.000423 |
| 2 | 2 | 0.000343 |
| 3 | 3 | 0.000275 |
| 4 | 4 | 0.000220 |
Let’s give the columns more appropriate names:
data.rename(columns={'Alter': "age", 'A': "qx"}, inplace=True)
data.head()| age | qx | |
|---|---|---|
| 0 | 0 | 0.006113 |
| 1 | 1 | 0.000423 |
| 2 | 2 | 0.000343 |
| 3 | 3 | 0.000275 |
| 4 | 4 | 0.000220 |
Visualize f curve (density function for mortality):
fig = px.line(data, y="qx", x="age", hover_data=['age', 'qx'], labels=dict(age="Age", qx="f"))
fig.show()Get survival rate (S):
S = 1
data.loc[:, "S"] = 0
for i in range(len(data)):
S = S*(1-data.loc[i, 'qx'])
data.loc[i, 'S'] = SVisualize survival rate:
fig = px.line(data, y="S", x="age", hover_data=['age'], labels=dict(age="Age", S="Survival Rate"))
fig.add_hline(y=0.5, line_dash="solid", row=1, col=1, line_color="#000000", line_width=1)
fig.show()Based on the graph above, the life expectancy is:
data.iloc[(data['S']-0.5).abs().argsort()[:1]]| age | qx | S | |
|---|---|---|---|
| 76 | 76 | 0.067433 | 0.504283 |
We define life expectancy as the age after which the survival rate < 0.5. More explanation can be found here.
Get F or failure rate:
data.loc[:, "F"] = 1 - data.loc[:, "S"]Visualize F:
fig = px.line(data, y="F", x="age", hover_data=['age'], labels=dict(age="Age", F="Failure Rate"))
fig.add_hline(y=0.5, line_dash="solid", row=1, col=1, line_color="#000000", line_width=1)
fig.show()Get f or failure density:
data.loc[:, "f"] = data.loc[:, "F"].diff()Visualize f:
fig = px.line(data, y="f", x="age", hover_data=['age'], labels=dict(age="Age", f="Failure Density"))
fig.show()I am born in 1988 and by the time of writting, it is 2023. My age is 35 years old. For further calculation, we interested in the failure density starting from 35 years old.
Let’s see the failure density graph after 35 years old:
fig = px.line(data.loc[data['age'] >= 35], y="f", x="age", hover_data=['age'], labels=dict(age="Age", f="Failure Density"))
fig.show()Find my life expectancy by integrating the area below the graph until it approaches 0.5:
x = list(data.loc[data['age'] >= 35].age)
y = list(data.loc[data['age'] >= 35].f)
for i in range(len(x)):
current_x = x[: i+1]
current_y = y[: i+1]
area = integrate.simpson(current_y, current_x)
if area >= 0.5:
break
print("My life expectancy is {}".format(current_x[-1]))My life expectancy is 78
Let’s visualize my survival rate:
x = list(data.loc[data['age'] >= 35].age)
y = list(data.loc[data['age'] >= 35].f)
my_failure_rate = []
my_life_expectancy = 0
executed = False
for i in range(len(x)):
current_x = x[: i+1]
current_y = y[: i+1]
area = integrate.simpson(current_y, current_x)
my_failure_rate.append(area)
if area >= 0.5 and not executed:
my_life_expectancy = current_x[-1]
executed = True
my_survival = pd.DataFrame({"age": x, "failure_rate": my_failure_rate})
my_survival.loc[:, "survival_rate"] = 1 - my_survival.loc[:, "failure_rate"]
fig = px.line(my_survival, y="survival_rate", x="age", hover_data=['age'], labels=dict(age="Age", survival_rate="My Survival Rate"))
fig.add_hline(y=0.5, line_dash="solid", row=1, col=1, line_color="#000000", line_width=1)
fig.show()In the following section, we want to find the survival function for people born in year 2000 and still alive in 2023.
To do that we need to fit the relevant data for this group to the following distributions:
weibull_min function is used instead of weibull_max, because the x value (age) are positive.View the data:
x = list(data.loc[data['age'] >= 23].age)
y = list(data.loc[data['age'] >= 23].f)
failure_rate = []
life_expectancy = 0
executed = False
for i in range(len(x)):
current_x = x[: i+1]
current_y = y[: i+1]
area = integrate.simpson(current_y, current_x)
failure_rate.append(area)
if area >= 0.5 and not executed:
my_life_expectancy = current_x[-1]
executed = True
survival_2023 = pd.DataFrame({"age": x, "failure_rate": failure_rate})
survival_2023.loc[:, "survival_rate"] = 1 - survival_2023.loc[:, "failure_rate"]
fig = px.line(survival_2023, y="survival_rate", x="age", hover_data=['age'], labels=dict(age="Age", survival_rate="Survival Rate"), title="Survival rate for 23 years old man in year 2023")
fig.add_hline(y=0.5, line_dash="solid", row=1, col=1, line_color="#000000", line_width=1)
fig.show()Prepare survival function for exponential, weibull and lognorm distributions:
# sf: survival function
def sf_exponential(x, loc=0, scale=1):
return scipy.stats.expon.sf(x, loc, scale)
def sf_weibull(x, c, loc=0, scale=1):
return scipy.stats.weibull_min.sf(x, c, loc, scale)
def sf_lognorm(x, s, loc=0, scale=1):
return scipy.stats.lognorm.sf(x, s, loc, scale)exponential distributionpopt, pcov = scipy.optimize.curve_fit(sf_exponential, survival_2023.age, survival_2023.survival_rate)
plt.scatter(survival_2023.age, survival_2023.survival_rate, label='Real Data', c='r')
pred = sf_exponential(survival_2023.age, *popt)
plt.plot(survival_2023.age, pred, 'b-', label="Prediction")
plt.xlabel('Age')
plt.ylabel('Survival Rate')
plt.legend()
plt.show()weibull distributionpopt_weibull, pcov = scipy.optimize.curve_fit(sf_weibull, survival_2023.age, survival_2023.survival_rate)
plt.scatter(survival_2023.age, survival_2023.survival_rate, label='Real Data', c='r')
pred = sf_weibull(survival_2023.age, *popt_weibull)
plt.plot(survival_2023.age, pred, 'b-', label="Prediction")
plt.xlabel('Age')
plt.ylabel('Survival Rate')
plt.legend()
plt.show()lognorm distributionpopt, pcov = scipy.optimize.curve_fit(sf_lognorm, survival_2023.age, survival_2023.survival_rate)
plt.scatter(survival_2023.age, survival_2023.survival_rate, label='Real Data', c='r')
pred = sf_lognorm(survival_2023.age, *popt)
plt.plot(survival_2023.age, pred, 'b-', label="Prediction")
plt.xlabel('Age')
plt.ylabel('Survival Rate')
plt.legend()
plt.show()weibbull distribution is obviously the best distribution to fit our dataset.\[\Huge S(age) = e^{-(\frac{age-loc}{scale})^c} \]
where:
print("c, loc and scale = {:.2f}, {:.2f}, {:.2f}".format(popt_weibull[0], popt_weibull[1], popt_weibull[2]))c, loc and scale = 37.84, -275.27, 355.85
Proof: The following graph shows the output of preceeding function, which is identical to the Section 7.3.
c, loc, scale = [*popt_weibull]
x = survival_2023.age
pred = np.exp(-((x-loc)/scale)**c)
plt.plot(x, pred, 'b-', label="Prediction")
plt.legend()
plt.show()