Source code for covidtracker.calculate_stat_summary

import pandas as pd

def _truncate_stat(value):
    """Truncate a statistic to ``int``, passing missing values through as NaN."""
    # ``int(nan)`` raises, and the sample std of a single observation is NaN,
    # so undefined statistics are kept as NaN instead of crashing.
    return float("nan") if pd.isna(value) else int(value)


def calculate_stat_summary(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Create summary information about the covid cases in each province.

    For every province that has a record on the most recent date found in
    the data, the descriptive statistics of ``column`` over *all* of that
    province's records are attached to its most recent record.

    Parameters
    ----------
    df : pandas.DataFrame
        Covid data to summarize. It must contain a ``province`` column and
        a column whose name starts with ``date`` (the first such column is
        used) holding ``%d-%m-%Y`` strings or datetimes. The frame is not
        modified.
    column : str
        Name of the column to summarize. It must have an integer or
        floating-point dtype.

    Returns
    -------
    pandas.DataFrame
        One row per record on the most recent date, sorted by province, with
        the columns: the date column (as datetimes), ``province``,
        ``column``, ``min``, ``max``, ``mean``, ``25%``, ``50%``, ``75%``,
        ``std``, ``count`` and ``start_date`` (the earliest date in the
        data). Statistics are truncated to integers; a statistic that is
        undefined (e.g. ``std`` of a single record) is NaN.

    Raises
    ------
    TypeError
        If ``df`` is not a DataFrame, ``column`` is not a string, or the
        selected column is not numeric.
    ValueError
        If ``df`` is empty, lacks a ``province`` or date column, does not
        contain ``column``, or its dates do not match ``%d-%m-%Y``.

    Examples
    --------
    >>> calculate_stat_summary(covid_df, 'cases')
    >>> calculate_stat_summary(covid_df, 'cumulative_deaths')
    """
    # Check input dataframe validity
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Invalid argument type: df must be a pandas DataFrame")
    if len(df) == 0:
        raise ValueError(
            "Argument value error: df must contain at least one row of data"
        )
    if "province" not in df.columns:
        raise ValueError("Argument value error: df must contain province columns")

    # Check column name validity
    if not isinstance(column, str):
        raise TypeError("Invalid argument type: column must be a string")
    if column not in df.columns:
        raise ValueError(
            "The column name does not exist in the dataframe, "
            "Choose a valid column name"
        )

    # Check the data type of the column (any integer/float width, not bool)
    is_numeric = pd.api.types.is_integer_dtype(
        df[column]
    ) or pd.api.types.is_float_dtype(df[column])
    if not is_numeric:
        raise TypeError("Invalid argument type: this column must be numeric")

    # Locate the date column. This is checked last so that the validation
    # errors above keep the precedence they have always had.
    date_col = next(
        (
            name
            for name in df.columns
            if isinstance(name, str) and name.startswith("date")
        ),
        None,
    )
    if date_col is None:
        raise ValueError(
            "Argument value error: df must contain a column whose name "
            "starts with 'date'"
        )

    # Work on a copy of the three relevant columns so that the caller's
    # DataFrame is left untouched by the datetime conversion.
    working = df[[date_col, "province", column]].copy()
    working[date_col] = pd.to_datetime(working[date_col], format="%d-%m-%Y")

    # min()/max() are independent of the index labels, unlike feeding the
    # positional result of argmin()/argmax() to the label-based .loc.
    start_date = working[date_col].min()
    end_date = working[date_col].max()

    # Select the up to date information of each province
    summary = working[working[date_col] == end_date].sort_values("province")

    # Describe every province once, then line the statistics up with the
    # (sorted) summary rows.
    stats = (
        working.groupby("province")[column]
        .describe()
        .reindex(summary["province"])
    )

    stat_columns = ("min", "max", "mean", "25%", "50%", "75%", "std", "count")
    for label in stat_columns:
        # A plain list is assigned positionally, so the summary's own index
        # labels do not matter here.
        summary[label] = [_truncate_stat(value) for value in stats[label]]

    summary["start_date"] = start_date
    return summary