# Packages imports
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.stats.api as sms
import seaborn as sns
from math import ceil
# Load the raw A/B-test event data (one row per user per event date).
df = pd.read_csv('/Users/azadvoryanskiy/Documents/data_analyst_calendar_test.csv')
# Preview the first rows to sanity-check columns and values.
df.head()
user_id | event_date | test_join_date | platform | level | ab_group | iap_revenue | matches_played | hard_currency_spent | hard_currency_earned_matches | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 544cabcc | 2021-08-10 | 2021-09-03 | android | 467 | BP_7_4_new_calendars_CONTROL | 0.00 | 3.0 | 260.0 | 60.0 |
1 | 544cabcc | 2021-09-02 | 2021-09-03 | android | 467 | BP_7_4_new_calendars_CONTROL | 19.99 | 21.0 | 5451.0 | 2215.0 |
2 | f59502a5 | 2021-10-24 | 2021-10-24 | android | 0 | BP_7_4_new_calendars_CONTROL | 0.00 | 2.0 | 46.0 | 72.0 |
3 | b6a8e86e | 2021-09-17 | 2021-09-08 | iphone | 1 | BP_7_4_new_calendars_ON | 0.00 | 6.0 | 176.0 | 148.0 |
4 | e022ecbd | 2021-08-19 | 2021-09-02 | iphone | 881 | BP_7_4_new_calendars_ON | 0.00 | 2.0 | 552.0 | 127.0 |
# Sanity checks on the experiment population: how many distinct users we have
# and how they are split between the control and treatment arms.
print(f"Number of unique users: {df['user_id'].nunique()}")
# Count distinct USERS per group, not rows: users appear on multiple event
# dates, so a row-based share would be skewed by per-user activity levels.
control_users = df.loc[df['ab_group'] == 'BP_7_4_new_calendars_CONTROL', 'user_id'].nunique()
print(f"Percentage of users in control: {round(control_users * 100 / df['user_id'].nunique())}%")
Number of unique users: 80166 Percentage of users in control: 50%
DATA PROCESSING
First, we need to clean the data — fix manual input errors, remove duplicates, and make sure that users do not appear multiple times.
df['platform'].value_counts()
android 424755 iphone 146378 andriod 30 Name: platform, dtype: int64
# Fix the "andriod" typo. Assign the result back instead of calling
# .replace(..., inplace=True) on the column: that chained-assignment form is
# deprecated in modern pandas and is a silent no-op under copy-on-write.
df['platform'] = df['platform'].replace({'andriod': 'android'})
df = df.drop_duplicates(keep='first')  # drop 10 duplicate rows
# Confirm dtypes and that no column contains missing values after cleanup.
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 571153 entries, 0 to 571162 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 user_id 571153 non-null object 1 event_date 571153 non-null object 2 test_join_date 571153 non-null object 3 platform 571153 non-null object 4 level 571153 non-null int64 5 ab_group 571153 non-null object 6 iap_revenue 571153 non-null float64 7 matches_played 571153 non-null float64 8 hard_currency_spent 571153 non-null float64 9 hard_currency_earned_matches 571153 non-null float64 dtypes: float64(4), int64(1), object(5) memory usage: 47.9+ MB
We want to separate the data into sessions logged before the test start date and sessions logged on or after it.
# Split sessions into those logged on/after the user's test join date and
# those logged strictly before it (dates are ISO-formatted strings, so
# plain string comparison orders them chronologically).
after_test = df[df.event_date >= df.test_join_date]
before_test = df[df.event_date < df.test_join_date]

# How many users show up on more than one event date?
session_counts = df['user_id'].value_counts()
multi_users = (session_counts > 1).sum()
print(f'There are {multi_users} users that appear multiple times in the dataset')
There are 45915 users that appear multiple times in the dataset
Since we want to measure users' conversion and don't want to randomly select the same users multiple times, we need to make the "user_id" column unique. To do that we have 2 methods:
We also need to create a new column "converted" with binary variables where "1" means conversion if total amount spent is greater than 0 and "0" otherwise.
First method: calculate LTV for each user
# First method: aggregate each user's total (LTV) revenue before the test.
# Select the revenue column BEFORE summing: summing the whole frame would
# also try to aggregate the string/date columns, which modern pandas rejects.
before = before_test.groupby('user_id')[['iap_revenue']].sum()
before = before.reset_index()
# Binary conversion flag: 1 if the user spent anything, else 0.
before['converted'] = np.where(before['iap_revenue'] > 0, 1, 0)
converted_before = before['converted'].sum() / before['converted'].count()
conversion_before = round(converted_before*100,2)
print(f'Historical user purchase conversion is {conversion_before}% before the test start date with the old calendar design')
Historical user purchase conversion is 15.98% before the test start date with the old calendar design
# Aggregate per-user totals after the test start, keeping group and platform
# keys for later slicing. Select the numeric columns before summing so pandas
# does not attempt to aggregate the date/string columns.
after = after_test.groupby(['user_id', 'ab_group', 'platform'])[['iap_revenue', 'matches_played', 'hard_currency_spent', 'hard_currency_earned_matches']].sum()
after = after.reset_index()
# Binary conversion flag: 1 if the user spent anything, else 0.
after['converted'] = np.where(after['iap_revenue'] > 0, 1, 0)
converted_after = after['converted'].sum() / after['converted'].count()
conversion_after = round(converted_after*100,2)
print(f'After the test start date we see an overall purchase conversion of {conversion_after}%, much lower than before')
After the test start date we see an overall purchase conversion of 6.1%, much lower than before
A/B TEST
First, we need to decide on a sample size, which depends on a few factors:
Power of the test (1 — β) — This represents the probability of finding a statistical difference between the groups in our test when a difference is actually present. This is usually set at 0.8 by convention.
Alpha value (α) — The critical value of 0.05
Minimum Detectable Effect — The difference we expect to see between the conversion rates. We assume it's 2 percentage points (16% vs. 18%).
# Effect size from the expected conversion rates: ~16% baseline with a
# 2-percentage-point minimum detectable effect.
effect_size = sms.proportion_effectsize(0.16, 0.18)

# Solve for the per-group sample size at 80% power and alpha = 0.05 with
# equally sized groups.
power_analysis = sms.NormalIndPower()
required_n = power_analysis.solve_power(effect_size, power=0.8, alpha=0.05, ratio=1)
required_n = ceil(required_n)  # round up to the next whole user
print(f'The sample size required for control and treatment groups is {required_n}')
The sample size required for control and treatment groups is 5534
# Draw equally sized random samples from each experiment arm. A fixed
# random_state keeps the draw reproducible across notebook runs.
is_control = after['ab_group'] == 'BP_7_4_new_calendars_CONTROL'
is_treatment = after['ab_group'] == 'BP_7_4_new_calendars_ON'
control_sample = after[is_control].sample(n=required_n, random_state=10)
treatment_sample = after[is_treatment].sample(n=required_n, random_state=10)

# Stack the two samples and renumber the rows from zero.
ab_test = pd.concat([control_sample, treatment_sample], ignore_index=True)
ab_test
user_id | ab_group | platform | iap_revenue | matches_played | hard_currency_spent | hard_currency_earned_matches | converted | |
---|---|---|---|---|---|---|---|---|
0 | 3937f1da | BP_7_4_new_calendars_CONTROL | android | 0.000000 | 103.0 | 11946.0 | 6545.0 | 0 |
1 | 863b9ffb | BP_7_4_new_calendars_CONTROL | android | 0.000000 | 24.0 | 13648.0 | 9296.0 | 0 |
2 | 3d32ed4b | BP_7_4_new_calendars_CONTROL | android | 0.000000 | 1.0 | 44.0 | 48.0 | 0 |
3 | f75fafb9 | BP_7_4_new_calendars_CONTROL | android | 0.000000 | 140.0 | 2140.0 | 759.0 | 0 |
4 | a950a78d | BP_7_4_new_calendars_CONTROL | android | 0.000000 | 158.0 | 8532.0 | 3953.0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
11063 | 94823437 | BP_7_4_new_calendars_ON | android | 0.000000 | 60.0 | 1430.0 | 1202.0 | 0 |
11064 | 4c3e4683 | BP_7_4_new_calendars_ON | android | 0.000000 | 16.0 | 4962.0 | 2522.0 | 0 |
11065 | d39e5d4b | BP_7_4_new_calendars_ON | android | 0.000000 | 13.0 | 2943.0 | 1236.0 | 0 |
11066 | eaafff8a | BP_7_4_new_calendars_ON | android | 53.949999 | 246.0 | 60502.0 | 32335.0 | 1 |
11067 | 85d256a1 | BP_7_4_new_calendars_ON | android | 0.000000 | 11.0 | 698.0 | 1005.0 | 0 |
11068 rows × 8 columns
ab_test.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 11068 entries, 0 to 11067 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 user_id 11068 non-null object 1 ab_group 11068 non-null object 2 platform 11068 non-null object 3 iap_revenue 11068 non-null float64 4 matches_played 11068 non-null float64 5 hard_currency_spent 11068 non-null float64 6 hard_currency_earned_matches 11068 non-null float64 7 converted 11068 non-null int64 dtypes: float64(4), int64(1), object(3) memory usage: 691.9+ KB
ab_test['ab_group'].value_counts() # total number of users is 11068 as expected (5534 in each group)
BP_7_4_new_calendars_CONTROL 5534 BP_7_4_new_calendars_ON 5534 Name: ab_group, dtype: int64
# Per-group conversion rate with its std. deviation and std. error.
conversion_rates = ab_test.groupby('ab_group')['converted']
std_p = lambda x: np.std(x, ddof=0)    # Std. deviation of the proportion
se_p = lambda x: stats.sem(x, ddof=0)  # Std. error of the proportion (std / sqrt(n))
# Use the string alias 'mean' rather than np.mean: passing numpy callables
# to .agg is deprecated in pandas >= 2.0.
conversion_rates = conversion_rates.agg(['mean', std_p, se_p])
conversion_rates.columns = ['conversion_rate', 'std_deviation', 'std_error']
conversion_rates.style.format('{:.3f}')
conversion_rate | std_deviation | std_error | |
---|---|---|---|
ab_group | |||
BP_7_4_new_calendars_CONTROL | 0.064 | 0.245 | 0.003 |
BP_7_4_new_calendars_ON | 0.063 | 0.242 | 0.003 |
In the table above, it does look like our two designs performed very similarly. The old design performed slightly better, 6.4% vs. 6.3% conversion rate.
from statsmodels.stats.proportion import proportions_ztest, proportion_confint

# Pull the binary conversion outcomes for each experiment arm.
control_results = ab_test.loc[ab_test['ab_group'] == 'BP_7_4_new_calendars_CONTROL', 'converted']
treatment_results = ab_test.loc[ab_test['ab_group'] == 'BP_7_4_new_calendars_ON', 'converted']

n_con = control_results.count()
n_treat = treatment_results.count()
successes = [control_results.sum(), treatment_results.sum()]
nobs = [n_con, n_treat]

# Two-proportion z-test plus 95% confidence intervals for each rate.
z_stat, pval = proportions_ztest(successes, nobs=nobs)
(lower_con, lower_treat), (upper_con, upper_treat) = proportion_confint(successes, nobs=nobs, alpha=0.05)

print(f'z statistic: {z_stat:.2f}')
print(f'p-value: {pval:.3f}')
print(f'ci 95% for control group: [{lower_con:.3f}, {upper_con:.3f}]')
print(f'ci 95% for treatment group: [{lower_treat:.3f}, {upper_treat:.3f}]')
z statistic: 0.35 p-value: 0.725 ci 95% for control group: [0.058, 0.071] ci 95% for treatment group: [0.056, 0.069]
Since we have a large sample, we can use the normal approximation for calculating the p-value with a z-test. The p-value=0.725 is much higher than α=0.05 threshold, meaning that the new design did not perform significantly different than the old one.
def _avg(sample, column):
    """Mean of *column* in *sample*, rounded to 2 decimal places."""
    return round(sample[column].mean(), 2)

# Secondary metrics: compare per-user averages between the two arms.
avg_matches_control = _avg(control_sample, 'matches_played')
avg_matches_treatment = _avg(treatment_sample, 'matches_played')
print(f'On average, each user in the control group played {avg_matches_control} matches')
print(f'On average, each user in the treatment group played {avg_matches_treatment} matches')

avg_amount_control = _avg(control_sample, 'iap_revenue')
avg_amount_treatment = _avg(treatment_sample, 'iap_revenue')
print(f'The control group spent ${avg_amount_control} on average per user')
print(f'The treatment group spent ${avg_amount_treatment} on average per user')

avg_hard_currency_spent_control = _avg(control_sample, 'hard_currency_spent')
avg_hard_currency_spent_treatment = _avg(treatment_sample, 'hard_currency_spent')
print(f'On average, each user in the control group spent {avg_hard_currency_spent_control} of hard currency')
print(f'On average, each user in the treatment group spent {avg_hard_currency_spent_treatment} of hard currency')

avg_hard_currency_earned_matches_control = _avg(control_sample, 'hard_currency_earned_matches')
avg_hard_currency_earned_matches_treatment = _avg(treatment_sample, 'hard_currency_earned_matches')
print(f'On average, each user in the control group earned {avg_hard_currency_earned_matches_control} of hard currency')
print(f'On average, each user in the treatment group earned {avg_hard_currency_earned_matches_treatment} of hard currency')
On average, each user in the control group played 54.44 matches On average, each user in the treatment group played 55.02 matches The control group spent $3.29 on average per user The treatment group spent $4.54 on average per user On average, each user in the control group spent 7625.51 of hard currency On average, each user in the treatment group spent 6936.14 of hard currency On average, each user in the control group earned 3863.66 of hard currency On average, each user in the treatment group earned 3357.38 of hard currency
As we saw, there's no significant difference in conversion rates between the 2 groups. However, each user in the treatment group spends on average ~38% more than users in the control group, while playing a similar number of matches.
control_sample.groupby('platform').mean()
iap_revenue | matches_played | hard_currency_spent | hard_currency_earned_matches | converted | |
---|---|---|---|---|---|
platform | |||||
android | 2.793120 | 54.406495 | 6619.665398 | 3242.595297 | 0.060470 |
iphone | 5.348793 | 54.599626 | 11826.697848 | 6457.726848 | 0.079514 |
treatment_sample.groupby('platform').mean()
iap_revenue | matches_played | hard_currency_spent | hard_currency_earned_matches | converted | |
---|---|---|---|---|---|
platform | |||||
android | 4.076555 | 55.063612 | 6229.516681 | 3043.955294 | 0.058941 |
iphone | 6.531067 | 54.848170 | 9996.833333 | 4714.963006 | 0.078035 |
Second method: First exposure of each user
# Second method: keep only each user's FIRST logged session. Sorting by
# event_date first makes drop_duplicates (keep='first' by default) retain
# the earliest row per user.
after_second = (
    after_test.sort_values('event_date')
    .drop_duplicates(subset=['user_id'])
    .reset_index(drop=True)
)
before_second = (
    before_test.sort_values('event_date')
    .drop_duplicates(subset=['user_id'])
    .reset_index(drop=True)
)

# Conversion on first exposure before the test started.
before_second['converted'] = (before_second['iap_revenue'] > 0).astype(int)
converted_before_second = round(before_second['converted'].sum()*100 / before_second['converted'].count(),2)
converted_before_second
print(f'User purchase conversion is {converted_before_second}% on first exposure before the test start date with the old calendar design')
User purchase conversion is 4.83% on first exposure before the test start date with the old calendar design
# Conversion on first exposure after the test started (new design).
after_second['converted'] = (after_second['iap_revenue'] > 0).astype(int)
converted_after_second = round(after_second['converted'].sum()*100 / after_second['converted'].count(),2)
converted_after_second
print(f'User purchase conversion is {converted_after_second}% on first exposure after the test start date with the new calendar design')
User purchase conversion is 2.4% on first exposure after the test start date with the new calendar design
A/B TEST
First, we need to decide on a sample size, which depends on a few factors:
Power of the test (1 — β) — This represents the probability of finding a statistical difference between the groups in our test when a difference is actually present. This is usually set at 0.8 by convention.
Alpha value (α) — The critical value of 0.05
Minimum Detectable Effect — The difference we expect to see between the conversion rates. We assume it's 2 percentage points (4.83% vs. 6.83%).
# Effect size from the first-exposure baseline of 4.83% with a
# 2-percentage-point minimum detectable effect.
effect_size_second = sms.proportion_effectsize(0.0483, 0.0683)

# Per-group sample size at 80% power, alpha = 0.05, equally sized groups.
power_analysis_second = sms.NormalIndPower()
required_n_second = power_analysis_second.solve_power(effect_size_second, power=0.8, alpha=0.05, ratio=1)
required_n_second = ceil(required_n_second)  # round up to the next whole user
print(f'The sample size required for control and treatment groups is {required_n_second}')
The sample size required for control and treatment groups is 2140
# Reproducible, equally sized samples from each arm of the first-exposure frame.
in_control = after_second['ab_group'] == 'BP_7_4_new_calendars_CONTROL'
in_treatment = after_second['ab_group'] == 'BP_7_4_new_calendars_ON'
control_sample_second = after_second[in_control].sample(n=required_n_second, random_state=10)
treatment_sample_second = after_second[in_treatment].sample(n=required_n_second, random_state=10)

# Stack the two samples and renumber the rows from zero.
ab_test_second = pd.concat([control_sample_second, treatment_sample_second], ignore_index=True)
ab_test_second
user_id | event_date | test_join_date | platform | level | ab_group | iap_revenue | matches_played | hard_currency_spent | hard_currency_earned_matches | converted | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | eb8ce6cd | 2021-09-05 | 2021-09-05 | iphone | 342 | BP_7_4_new_calendars_CONTROL | 0.00 | 28.0 | 5415.0 | 1855.0 | 0 |
1 | 1cd7d622 | 2021-09-24 | 2021-09-24 | android | 1 | BP_7_4_new_calendars_CONTROL | 0.00 | 3.0 | 60.0 | 106.0 | 0 |
2 | 6f9d84a8 | 2021-09-06 | 2021-09-06 | android | 1 | BP_7_4_new_calendars_CONTROL | 0.00 | 2.0 | 196.0 | 119.0 | 0 |
3 | 921b4f99 | 2021-10-29 | 2021-10-29 | android | 0 | BP_7_4_new_calendars_CONTROL | 0.00 | 19.0 | 19106.0 | 9041.0 | 0 |
4 | ca4228cd | 2021-10-06 | 2021-10-06 | android | 0 | BP_7_4_new_calendars_CONTROL | 0.00 | 11.0 | 490.0 | 420.0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4275 | 758f1966 | 2021-09-07 | 2021-09-07 | iphone | 35 | BP_7_4_new_calendars_ON | 0.00 | 4.0 | 692.0 | 78.0 | 0 |
4276 | 5a6522ab | 2021-09-03 | 2021-09-03 | android | 0 | BP_7_4_new_calendars_ON | 0.00 | 2.0 | 44.0 | 44.0 | 0 |
4277 | 3df319cb | 2021-10-31 | 2021-10-31 | android | 1 | BP_7_4_new_calendars_ON | 0.00 | 37.0 | 4158.0 | 2383.0 | 0 |
4278 | f8458521 | 2021-09-25 | 2021-09-25 | android | 1 | BP_7_4_new_calendars_ON | 0.99 | 12.0 | 200.0 | 50.0 | 1 |
4279 | ce0336ba | 2021-10-16 | 2021-10-14 | android | 0 | BP_7_4_new_calendars_ON | 0.00 | 1.0 | 16.0 | 0.0 | 0 |
4280 rows × 11 columns
# Per-group conversion rate with std. deviation and std. error (method 2).
conversion_rates_second = ab_test_second.groupby('ab_group')['converted']
std_p_second = lambda x: np.std(x, ddof=0)    # Std. deviation of the proportion
se_p_second = lambda x: stats.sem(x, ddof=0)  # Std. error of the proportion (std / sqrt(n))
# 'mean' string alias: passing np.mean to .agg is deprecated in pandas >= 2.0.
conversion_rates_second = conversion_rates_second.agg(['mean', std_p_second, se_p_second])
conversion_rates_second.columns = ['conversion_rate', 'std_deviation', 'std_error']
conversion_rates_second.style.format('{:.3f}')
conversion_rate | std_deviation | std_error | |
---|---|---|---|
ab_group | |||
BP_7_4_new_calendars_CONTROL | 0.022 | 0.148 | 0.003 |
BP_7_4_new_calendars_ON | 0.025 | 0.155 | 0.003 |
In the table above, it does look like both of the designs performed very similarly. The new design performed slightly better, 2.5% vs. 2.2% conversion rate.
# Two-proportion z-test on the first-exposure samples (proportions_ztest and
# proportion_confint were imported for the first method).
control_results_second = ab_test_second.loc[ab_test_second['ab_group'] == 'BP_7_4_new_calendars_CONTROL', 'converted']
treatment_results_second = ab_test_second.loc[ab_test_second['ab_group'] == 'BP_7_4_new_calendars_ON', 'converted']

n_con_second = control_results_second.count()
n_treat_second = treatment_results_second.count()
successes_second = [control_results_second.sum(), treatment_results_second.sum()]
nobs_second = [n_con_second, n_treat_second]

z_stat_second, pval_second = proportions_ztest(successes_second, nobs=nobs_second)
(lower_con_second, lower_treat_second), (upper_con_second, upper_treat_second) = proportion_confint(successes_second, nobs=nobs_second, alpha=0.05)

print(f'z statistic: {z_stat_second:.2f}')
print(f'p-value: {pval_second:.3f}')
print(f'ci 95% for control group: [{lower_con_second:.3f}, {upper_con_second:.3f}]')
print(f'ci 95% for treatment group: [{lower_treat_second:.3f}, {upper_treat_second:.3f}]')
z statistic: -0.50 p-value: 0.615 ci 95% for control group: [0.016, 0.029] ci 95% for treatment group: [0.018, 0.031]
Since we have a large sample, we can use the normal approximation for calculating the p-value with a z-test. The p-value=0.615 is much higher than α=0.05 threshold, meaning that the new design did not perform significantly different than the old one, which corresponds to the results we got using the first method.
def _rounded_mean(frame, column):
    """Mean of *column* in *frame*, rounded to 2 decimal places."""
    return round(frame[column].mean(), 2)

# Secondary metrics for the first-exposure samples.
avg_matches_control_second = _rounded_mean(control_sample_second, 'matches_played')
avg_matches_treatment_second = _rounded_mean(treatment_sample_second, 'matches_played')
print(f'On average, each user in the control group played {avg_matches_control_second} matches')
print(f'On average, each user in the treatment group played {avg_matches_treatment_second} matches')

avg_amount_control_second = _rounded_mean(control_sample_second, 'iap_revenue')
avg_amount_treatment_second = _rounded_mean(treatment_sample_second, 'iap_revenue')
print(f'The control group spent ${avg_amount_control_second} on average per user')
print(f'The treatment group spent ${avg_amount_treatment_second} on average per user')

avg_hard_currency_spent_control_second = _rounded_mean(control_sample_second, 'hard_currency_spent')
avg_hard_currency_spent_treatment_second = _rounded_mean(treatment_sample_second, 'hard_currency_spent')
print(f'On average, each user in the control group spent {avg_hard_currency_spent_control_second} of hard currency')
print(f'On average, each user in the treatment group spent {avg_hard_currency_spent_treatment_second} of hard currency')

avg_hard_currency_earned_matches_control_second = _rounded_mean(control_sample_second, 'hard_currency_earned_matches')
avg_hard_currency_earned_matches_treatment_second = _rounded_mean(treatment_sample_second, 'hard_currency_earned_matches')
print(f'On average, each user in the control group earned {avg_hard_currency_earned_matches_control_second} of hard currency')
print(f'On average, each user in the treatment group earned {avg_hard_currency_earned_matches_treatment_second} of hard currency')
On average, each user in the control group played 14.46 matches On average, each user in the treatment group played 15.35 matches The control group spent $0.48 on average per user The treatment group spent $0.45 on average per user On average, each user in the control group spent 1835.79 of hard currency On average, each user in the treatment group spent 2128.51 of hard currency On average, each user in the control group earned 1007.84 of hard currency On average, each user in the treatment group earned 1168.97 of hard currency
As we saw, there's no significant difference in conversion rates between the 2 groups. Furthermore, unlike with the previous method, there's not much difference in matches played or in the average amount spent per user. However, the treatment group earns and spends more hard currency than the control group, possibly because of the cherries offered as a reward on the first day in the new design.
control_sample_second.groupby('platform').mean()
level | iap_revenue | matches_played | hard_currency_spent | hard_currency_earned_matches | converted | |
---|---|---|---|---|---|---|
platform | ||||||
android | 38.700760 | 0.359082 | 14.708358 | 1693.943308 | 925.934541 | 0.018703 |
iphone | 100.002331 | 0.966713 | 13.463869 | 2401.543124 | 1334.482517 | 0.037296 |
treatment_sample_second.groupby('platform').mean()
level | iap_revenue | matches_played | hard_currency_spent | hard_currency_earned_matches | converted | |
---|---|---|---|---|---|---|
platform | ||||||
android | 39.623515 | 0.362470 | 15.359857 | 2024.438836 | 1118.416865 | 0.023159 |
iphone | 71.116228 | 0.764794 | 15.306140 | 2512.820175 | 1355.658772 | 0.030702 |