# Number of histogram bins; the histograms span ages 0..BINS, i.e. one bin per year of age.
BINS = 120
# Years of death covered by the plots: 1972 through 2024, both included.
YEARS = tuple(range(1972, 2024 + 1))
# Birth years highlighted on the plots, mapped to the label drawn next to the
# marker line. 1918 has an empty label: only its line is meaningful.
ANNOTATIONS = {
    1914: '1914-1918: World War I',
    1918: '',
    1946: '1946: Babyboom',
}

from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile

def download_unzip(zipurl, destination):
    """Fetch the zip archive at ``zipurl`` and unpack every member into ``destination``.

    The whole archive is read into memory before extraction.
    """
    with urlopen(zipurl) as response:
        payload = BytesIO(response.read())
    with ZipFile(payload) as archive:
        archive.extractall(destination)

# One archive per decade, from 1970-1979 up to 2010-2019.
for decade in range(1970, 2020, 10):
    url = f"https://www.insee.fr/fr/statistiques/fichier/4769950/deces-{decade}-{decade + 9}-csv.zip"
    print("Downloading and extracting", url)
    download_unzip(url, 'data')

Downloading and extracting https://www.insee.fr/fr/statistiques/fichier/4769950/deces-1970-1979-csv.zip
Downloading and extracting https://www.insee.fr/fr/statistiques/fichier/4769950/deces-1980-1989-csv.zip
Downloading and extracting https://www.insee.fr/fr/statistiques/fichier/4769950/deces-1990-1999-csv.zip
Downloading and extracting https://www.insee.fr/fr/statistiques/fichier/4769950/deces-2000-2009-csv.zip
Downloading and extracting https://www.insee.fr/fr/statistiques/fichier/4769950/deces-2010-2019-csv.zip

# From 2020 onwards there is one archive per year.
for year in range(2020, 2025):
    url = f"https://www.insee.fr/fr/statistiques/fichier/4190491/Deces_{year}.zip"
    print("Downloading and extracting", url)
    download_unzip(url, 'data')

Downloading and extracting https://www.insee.fr/fr/statistiques/fichier/4190491/Deces_2020.zip
Downloading and extracting https://www.insee.fr/fr/statistiques/fichier/4190491/Deces_2021.zip
Downloading and extracting https://www.insee.fr/fr/statistiques/fichier/4190491/Deces_2022.zip
Downloading and extracting https://www.insee.fr/fr/statistiques/fichier/4190491/Deces_2023.zip
Downloading and extracting https://www.insee.fr/fr/statistiques/fichier/4190491/Deces_2024.zip

from pathlib import Path

# Show what the archives extracted, as one space-separated line of sorted names.
data_dir = Path("data")
extracted_names = sorted(entry.name for entry in data_dir.iterdir())
print(" ".join(extracted_names))

Deces_2010.csv Deces_2011.csv Deces_2012.csv Deces_2013.csv Deces_2014.csv Deces_2015.csv Deces_2016.csv Deces_2017.csv Deces_2018.csv Deces_2019.csv Deces_2021.csv Deces_2022.csv Deces_2023.csv Deces_2024.csv deces-1970.csv deces-1971.csv deces-1972.csv deces-1973.csv deces-1974.csv deces-1975.csv deces-1976.csv deces-1977.csv deces-1978.csv deces-1979.csv deces-1980.csv deces-1981.csv deces-1982.csv deces-1983.csv deces-1984.csv deces-1985.csv deces-1986.csv deces-1987.csv deces-1988.csv deces-1989.csv deces-1990.csv deces-1991.csv deces-1992.csv deces-1993.csv deces-1994.csv deces-1995.csv deces-1996.csv deces-1997.csv deces-1998.csv deces-1999.csv deces-2000.csv deces-2001.csv deces-2002.csv deces-2003.csv deces-2004.csv deces-2005.csv deces-2006.csv deces-2007.csv deces-2008.csv deces-2009.csv deces-2010.csv deces-2011.csv deces-2012.csv deces-2013.csv deces-2014.csv deces-2015.csv deces-2016.csv deces-2017.csv deces-2018.csv deces-2019.csv deces-2020.csv deces-2021.csv deces-2022.csv deces-2023.csv deces-2024.csv deces_2020.csv

# Normalise every CSV name to lower case with '-' instead of '_'.
# The listing is materialised first so that renaming entries cannot disturb
# the directory iteration that produces them.
for csv_file in sorted(data_dir.glob('*.csv')):
    new_name = csv_file.name.lower().replace('_', '-')
    if new_name == csv_file.name:
        continue  # already normalised, nothing to do
    # A file with the normalised name may already exist; Path.replace
    # overwrites it on every platform, whereas Path.rename raises
    # FileExistsError on Windows.
    csv_file.replace(csv_file.parent / new_name)

print(' '.join(sorted(f.name for f in data_dir.iterdir())))

deces-1970.csv deces-1971.csv deces-1972.csv deces-1973.csv deces-1974.csv deces-1975.csv deces-1976.csv deces-1977.csv deces-1978.csv deces-1979.csv deces-1980.csv deces-1981.csv deces-1982.csv deces-1983.csv deces-1984.csv deces-1985.csv deces-1986.csv deces-1987.csv deces-1988.csv deces-1989.csv deces-1990.csv deces-1991.csv deces-1992.csv deces-1993.csv deces-1994.csv deces-1995.csv deces-1996.csv deces-1997.csv deces-1998.csv deces-1999.csv deces-2000.csv deces-2001.csv deces-2002.csv deces-2003.csv deces-2004.csv deces-2005.csv deces-2006.csv deces-2007.csv deces-2008.csv deces-2009.csv deces-2010.csv deces-2011.csv deces-2012.csv deces-2013.csv deces-2014.csv deces-2015.csv deces-2016.csv deces-2017.csv deces-2018.csv deces-2019.csv deces-2020.csv deces-2021.csv deces-2022.csv deces-2023.csv deces-2024.csv

import pandas as pd
import numpy as np
from pathlib import Path

data_dir = Path('data')
csv_files = sorted(data_dir.glob('*.csv'))
n_files = len(csv_files)

# Both date columns are read as raw strings and parsed afterwards, so that
# malformed dates become NaT instead of aborting the load.
date_columns = ('datenaiss', 'datedeces')

df_years = []
for i, csv_file in enumerate(csv_files):
    # Progress indicator rewritten in place on a single line.
    print(f"Loading {csv_file} ({i + 1}/{n_files})", end='\r', flush=True)
    df_year = pd.read_csv(
        csv_file,
        sep=';',
        usecols=[1, 2, 6],
        dtype=dict.fromkeys(date_columns, 'string'),
        na_filter=False,
    )

    # Convert to datetimes; values that do not match YYYYMMDD are coerced to NaT.
    for column in date_columns:
        df_year[column] = pd.to_datetime(df_year[column],
                                         format='%Y%m%d',
                                         errors='coerce')

    df_years.append(df_year)

# Stack the yearly frames into one frame with a fresh 0..n-1 index.
df = pd.concat(df_years, ignore_index=True)
del df_years  # free memory
print()

Loading data/deces-2024.csv (55/55)

# Bare expression: shows the concatenated dataframe when evaluated as a notebook cell.
df

# Remove entries containing missing values (dates that failed to parse are NaT).
df.dropna(axis='index', inplace=True)
# Age at death in years, from the day difference between death and birth.
df['age'] = (df['datedeces'] - df['datenaiss']).dt.days / 365.25
# Year of death, computed once: it drives the filtering below and the
# group-bys later on.
df['death_year'] = df['datedeces'].dt.year
# Keep rows with a non-negative age whose year of death lies in the studied
# period. A single combined mask avoids building several intermediate
# filtered copies and avoids adding a column to an already-filtered frame.
valid_rows = (
    (df['age'] >= 0)
    & (df['death_year'] >= min(YEARS))
    & (df['death_year'] <= max(YEARS))
)
df = df[valid_rows]
women = df[df.sexe == 2]  # a subset containing women
men = df[df.sexe == 1]  # a subset containing men

# Two-column summary: frame name (left-aligned, 10 wide) and row count with
# thousands separators.
print(f"{'Dataframe':<10} Number of entries")
template_line = "{:<10} {:,}"
for label, frame in (('df', df), ('women', women), ('men', men)):
    print(template_line.format(label, len(frame)))

Dataframe  Number of entries
df         27,951,563
women      13,481,712
men        14,469,851

%matplotlib inline
%config InlineBackend.figure_format='retina'
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Plot defaults shared by every figure created afterwards.
plt.rcParams['figure.figsize'] = [12, 8]  # Set default figure size
# Switch matplotlib to seaborn's default theme.
sns.set_theme()

# Number of deaths per year of death, for everybody and for each sex.
by_year, by_year_women, by_year_men = (
    frame.groupby('death_year').size() for frame in (df, women, men)
)

# The total sets up the axes; the per-sex curves are drawn on top of it with
# distinct line styles.
ax = by_year.plot(label="Total")
for counts, line_style, label in ((by_year_women, '--', "Women"),
                                  (by_year_men, '-.', "Men")):
    counts.plot(style=line_style, ax=ax, label=label)

ax.set_ybound(lower=0)  # start the y axis at zero
ax.set_xlabel("Year")
ax.set_title("Number of deaths")
ax.legend();

def get_year(df: pd.DataFrame, year: int) -> pd.DataFrame:
    """Select the rows of ``df`` whose ``datedeces`` falls in ``year``."""
    died_that_year = df['datedeces'].dt.year == year
    return df[died_that_year]

def plot_year(df: pd.DataFrame, year: int, annotate=False):
    """Plot the age-of-death histogram for one year of death.

    Args:
        df: dataframe with ``datedeces`` and ``age`` columns.
        year: year of death to plot.
        annotate: when true, draw a red vertical line and a text label for
            every birth year listed in ``ANNOTATIONS``.

    Returns:
        ``(fig, ax, (bar_container, annotations))`` where ``annotations``
        maps each annotated birth year to its ``(line, text)`` artists; the
        mapping is empty when ``annotate`` is false.
    """
    # Initialize figure
    fig, ax = plt.subplots()
    ax.set_xlabel('Age of death (years)')
    # BINS bins over [0, BINS]: one bar per year of age.
    _, _, bar_container = ax.hist(get_year(df, year)['age'],
                                  BINS,
                                  range=[0, BINS])
    fig.suptitle("Distribution of the age of death in France")
    ax.set_title(f"Year of death: {year}")
    ax.set_xlim([0, BINS])
    # Fixed y range so that bars stay comparable from one year to another.
    ax.set_ylim([0, 27500])
    ax.set_ylabel("Number of deaths")
    annotations = {}
    if annotate:
        for birthyear, label in ANNOTATIONS.items():
            # Age reached in `year` by people born in `birthyear`.
            age = year - birthyear
            # axvline takes a scalar x position, not a 1-tuple.
            vl = ax.axvline(age - 1, color='r', linewidth=0.75)
            text = ax.text(age - 10, 23000, label, color='r',
                           bbox=dict(facecolor='white', alpha=0.75))
            annotations[birthyear] = (vl, text)
    return fig, ax, (bar_container, annotations)

# Histogram of the ages of death for 1983, without annotations.
plot_year(df, year=1983);

# Same year, with the birth-year markers drawn.
plot_year(df, year=1983, annotate=True);

import matplotlib.animation as animation
from IPython.display import HTML


def update(bar_container, annotations):
    """Build the per-frame callback used to animate a ``plot_year`` figure.

    Args:
        bar_container: histogram bars (as returned by ``ax.hist``) whose
            heights are rewritten on every frame.
        annotations: mapping of birth year to ``(line, text)`` artists, as
            returned by ``plot_year``; may be empty.

    Returns:
        A function ``animate(year)`` that redraws the histogram for that year
        of death and returns the artists it modified. It reads the
        module-level ``df``.
    """
    # Take the axes from the bars themselves instead of relying on a
    # module-level ``ax`` that may point to another figure.
    ax = bar_container.patches[0].axes
    annotation_artists = [artist
                          for pair in annotations.values()
                          for artist in pair]

    def animate(year):
        age = get_year(df, year)['age']
        print(f"{year}: {len(age)}", end='\r')
        n, _ = np.histogram(age, BINS, range=[0, BINS])
        ax.set_title(f"Year of death: {year}")
        for birthyear, (vl, text) in annotations.items():
            # Age reached in `year` by people born in `birthyear`.
            marker_age = year - birthyear
            vl.set_xdata([marker_age - 1, marker_age - 1])
            text.set_x(marker_age - 10)
        for count, rect in zip(n, bar_container.patches):
            rect.set_height(count)
        # With blitting only the returned artists are redrawn, so the moved
        # annotation artists are returned along with the bars.
        return [*bar_container.patches, *annotation_artists]

    return animate


# First plot: its artists are the ones the animation updates afterwards.
fig, ax, artists = plot_year(df, YEARS[0], annotate=True)
# Build animation: one frame per year of death.
anim = animation.FuncAnimation(
    fig=fig,
    func=update(*artists),
    frames=YEARS,
    blit=True,
    repeat=False,
)
# Record animation as a JavaScript/HTML player that loops by default.
video = anim.to_jshtml(default_mode='loop')
plt.close()  # To prevent from displaying a fixed figure
HTML(video)  # Display video

2024: 636069

import dataclasses

@dataclasses.dataclass
class Dataset:
    """Class for storing a named dataset and its visual elements."""
    # Title displayed above the dataset's axes.
    name: str
    # Rows belonging to this dataset.
    data: pd.DataFrame
    # Axes the dataset is drawn on. The matplotlib annotations are written as
    # strings so that defining the class does not depend on which matplotlib
    # submodules are loaded, nor on legacy aliases such as ``SubplotBase``.
    ax: "matplotlib.axes.Axes"
    # Histogram bars; also reachable through the ``bar_container`` property.
    barcontainer: "matplotlib.container.BarContainer | None" = None
    # Falsy: no annotation; 'hlines': marker lines only; any other truthy
    # value: marker lines and text labels.
    annotate: "str | None" = None
    # Birth year -> (line, text or None), filled in by ``set_ax_params``.
    annotations: dict = dataclasses.field(default_factory=dict)

    @property
    def bar_container(self):
        """Alias of ``barcontainer`` under the name used by the plotting code."""
        return self.barcontainer

    @bar_container.setter
    def bar_container(self, value):
        self.barcontainer = value

    def set_ax_params(self, bins: int, xmax: int, xlabel: str, year: int):
        """Configure the axes and, if requested, draw the birth-year markers.

        Args:
            bins: upper limit of the y axis.
            xmax: upper limit of the x axis.
            xlabel: label of the x axis.
            year: year of death being plotted; positions the markers.
        """
        self.ax.set_title(self.name)
        self.ax.set_ylim([0, bins])
        self.ax.set_xlabel(xlabel)
        self.ax.set_xlim([0, xmax])
        if not self.annotate:
            return
        for birthyear, label in ANNOTATIONS.items():
            # Age reached in `year` by people born in `birthyear`.
            age = year - birthyear
            # axhline takes a scalar y position, not a 1-tuple.
            vl = self.ax.axhline(age - 1, color='r', linewidth=0.75)
            if self.annotate != 'hlines':
                text = self.ax.text(14000, age + 1, label, color='r')
            else:
                text = None
            self.annotations[birthyear] = (vl, text)

def plot_year_wm(women, men, year: int):
    """Draw side-by-side horizontal histograms of the age of death.

    Women are plotted on the left axes and men on the right one, for the given
    year of death. Returns the figure and the ``(women, men)`` pair of Dataset
    objects holding the axes, bars and annotations.
    """
    # Two columns sharing the age axis.
    fig, (left_ax, right_ax) = plt.subplots(ncols=2, sharey=True)
    fig.suptitle(f"Distribution of the age of death in France in {year}")

    w = Dataset(name="Women", data=women, ax=left_ax, annotate='both')
    m = Dataset(name="Men", data=men, ax=right_ax, annotate='hlines')

    for dataset in (w, m):
        ages = get_year(dataset.data, year)['age']
        hist_result = dataset.ax.hist(ages,
                                      BINS,
                                      range=[0, BINS],
                                      orientation='horizontal')
        # hist returns (counts, bin edges, bars); only the bars are kept.
        dataset.bar_container = hist_result[2]
        dataset.set_ax_params(BINS, 17000, "Number of deaths", year)

    # Left plot only: mirror the x axis and label the shared age axis.
    w.ax.invert_xaxis()
    w.ax.set_ylabel("Age of death (years)")

    fig.tight_layout()
    return fig, (w, m)

# Women/men comparison for 1983.
fig, (w, m) = plot_year_wm(women, men, year=1983);

def update_wm(wm):
    """Build the per-frame callback used to animate a ``plot_year_wm`` figure.

    Args:
        wm: the ``(women, men)`` pair of Dataset objects returned by
            ``plot_year_wm``.

    Returns:
        A function ``animate(year)`` that redraws both histograms for that
        year of death and returns every artist it modified.
    """
    # Use the figure owning the datasets' axes instead of relying on a
    # module-level ``fig`` that may point to another figure.
    figure = wm[0].ax.figure

    def animate(year):
        figure.suptitle(
            f"Distribution of the age of death in France in {year}")
        modified = []
        for s in wm:
            age = get_year(s.data, year)['age']
            # Row count for this year, read back by the progress print below.
            s.len = len(age)
            n, _ = np.histogram(age, BINS, range=[0, BINS])
            # Horizontal histogram: the counts are the bar widths.
            for count, rect in zip(n, s.bar_container.patches):
                rect.set_width(count)
            modified.extend(s.bar_container.patches)
            for birthyear, (vl, text) in s.annotations.items():
                # Age reached in `year` by people born in `birthyear`.
                marker_age = year - birthyear
                vl.set_ydata([marker_age - 1, marker_age - 1])
                modified.append(vl)
                if text is not None:
                    text.set_y(marker_age + 1)
                    modified.append(text)
        print(f"{year}: {wm[0].len} women, {wm[1].len} men", end='\r',
                flush=True)
        # With blitting only the returned artists are redrawn, so the bars of
        # BOTH datasets and the moved annotations are returned, not just the
        # bars of the last dataset.
        return modified

    return animate

# First plot: its datasets hold the artists the animation updates afterwards.
fig, wm = plot_year_wm(women, men, YEARS[0])
# One frame per year of death.
anim_wm = animation.FuncAnimation(
    fig=fig,
    func=update_wm(wm),
    frames=YEARS,
    blit=True,
    repeat=False,
)
# Render as a JavaScript/HTML player that loops by default.
video_wm = anim_wm.to_jshtml(default_mode='loop')
plt.close()  # avoid also displaying the static figure
HTML(video_wm)

2024: 318371 women, 317698 men

# Export the women/men animation as a GIF at 12 frames per second.
anim_wm.save(filename="deaths_wm.gif", fps=12)
plt.close()

2024: 318371 women, 317698 men

	sexe	datenaiss	datedeces
0	2	1922-01-09	1970-12-10
1	1	1969-03-29	1970-04-25
2	1	1970-02-01	1970-02-03
3	2	1970-04-06	1970-04-06
4	2	1970-07-08	1970-07-08
...	...	...	...
28257726	2	2003-04-08	2023-10-07
28257727	1	1943-11-12	2024-10-13
28257728	1	1953-04-03	2024-11-12
28257729	1	1939-01-06	2024-10-14
28257730	2	1934-11-09	2024-10-04

Animating the French deaths

Introduction

Some parameters

Processing INSEE data

Data downloading

Data loading

Plotting the data

Deaths over the years

Animating the total deaths

Comparing women and men

Conclusion