# Package imports
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.stats.api as sms
import seaborn as sns
from math import ceil
import matplotlib.pyplot as plt
#read the data tables
payouts = pd.read_csv('/Users/azadvoryanskiy/Documents/payouts.csv')
countries = pd.read_csv('/Users/azadvoryanskiy/Documents/countries.csv')
industries = pd.read_csv('/Users/azadvoryanskiy/Documents/industries.csv')
payouts.head()
| | date | platform_id | recipient_id | count | amount |
---|---|---|---|---|---|
0 | 2018-05-16 00:00:00+00:00 | id_5dded1fc8ff3f8c0d96019076394d2a7 | id_23d90ec275370c686dedd7dc1c5e93b3 | 1 | 9786.0 |
1 | 2018-11-23 00:00:00+00:00 | id_424316eb0d974bebd1736fc4bd2eac3d | id_72f05535ba5e6e5b141db6b5c1f1b13b | 1 | 3750.0 |
2 | 2018-08-07 00:00:00+00:00 | id_3702c1be346117616974e4127c952ab7 | id_d70e8046fe5583e1154b2e077133e27c | 1 | 2258.0 |
3 | 2018-07-10 00:00:00+00:00 | id_a1eafb7ca12772d4e8770b9b3566e608 | id_1a336bd21b4bfd9810e0510e1a5f7ec2 | 1 | 20369.0 |
4 | 2018-01-07 00:00:00+00:00 | id_a1eafb7ca12772d4e8770b9b3566e608 | id_c577aeddec71607ec0be1f94bef025da | 1 | 34922.0 |
DATA PROCESSING
First, we need to change the date format, convert the payout amounts from cents to thousands of dollars, and introduce a new weekday variable.
# change date format, convert cents to thousands of dollars, add weekday column (Monday = 0, Tuesday = 1, etc.)
payouts['date'] = pd.to_datetime(payouts['date'])
payouts['amount_in_thousand_$'] = payouts['amount'].div(100000)  # 100,000 cents = $1,000
payouts['weekday'] = payouts['date'].dt.dayofweek
payouts['date'] = payouts['date'].dt.date
# create another df without the columns we no longer need
payouts_2 = payouts.drop(['date', 'platform_id', 'count', 'amount'], axis=1)
payouts_2.head()
| | recipient_id | amount_in_thousand_$ | weekday |
---|---|---|---|
0 | id_23d90ec275370c686dedd7dc1c5e93b3 | 0.09786 | 2 |
1 | id_72f05535ba5e6e5b141db6b5c1f1b13b | 0.03750 | 4 |
2 | id_d70e8046fe5583e1154b2e077133e27c | 0.02258 | 1 |
3 | id_1a336bd21b4bfd9810e0510e1a5f7ec2 | 0.20369 | 1 |
4 | id_c577aeddec71607ec0be1f94bef025da | 0.34922 | 6 |
Next, we merge the payouts table with the countries table to get each merchant's country. Since only about 1% of payouts have no country associated with them, we can safely drop those rows.
#merge payouts with countries
merged_payouts_countries = pd.merge(payouts_2, countries, how='left', left_on='recipient_id', right_on='merchant_id')
# only 1% of payouts don't have country so we can drop them
merged_payouts_countries.info()
# remove unnecessary columns
merged_payouts_countries = merged_payouts_countries.drop(['recipient_id','merchant_id'], axis = 1)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1622642 entries, 0 to 1622641
Data columns (total 5 columns):
 #   Column                Non-Null Count    Dtype
---  ------                --------------    -----
 0   recipient_id          1622642 non-null  object
 1   amount_in_thousand_$  1622642 non-null  float64
 2   weekday               1622642 non-null  int64
 3   merchant_id           1609476 non-null  object
 4   country               1609476 non-null  object
dtypes: float64(1), int64(1), object(3)
memory usage: 74.3+ MB
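Strictly speaking, the drop happens implicitly below, because groupby excludes rows whose country key is NaN. To make that step explicit, one option (a small optional sketch using pandas' dropna) is:

# explicitly drop the ~1% of payouts with no matching country
merged_payouts_countries = merged_payouts_countries.dropna(subset=['country'])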
#group amounts by weekday and country
merged_payouts_countries = merged_payouts_countries.groupby(['weekday', 'country'], as_index=False).sum()
merged_payouts_countries
| | weekday | country | amount_in_thousand_$ |
---|---|---|---|
0 | 0 | AT | 59.99371 |
1 | 0 | AU | 1373.56356 |
2 | 0 | BE | 308.90536 |
3 | 0 | BR | 117.93376 |
4 | 0 | CA | 2943.51326 |
... | ... | ... | ... |
167 | 6 | NZ | 228.99310 |
168 | 6 | PT | 20.02176 |
169 | 6 | SE | 134.24244 |
170 | 6 | SG | 94.15636 |
171 | 6 | US | 141726.23336 |
172 rows × 3 columns
We can clearly see below that Tuesday (weekday = 1) has by far the highest total payout of any day. Since we're asked to estimate the payouts on Jan. 1, 2019, and that date falls on a Tuesday, we want our prediction to account for this.
For simplicity, we assume every Tuesday is similar in terms of payouts, so we can take the total payouts on all Tuesdays of 2018 by country and divide by 52 (the number of Tuesdays in 2018) to get the average payout on a typical Tuesday.
merged_payouts_countries_2 = merged_payouts_countries.groupby(['weekday'], as_index=False).sum()
merged_payouts_countries_2
| | weekday | amount_in_thousand_$ |
---|---|---|
0 | 0 | 196516.00269 |
1 | 1 | 293800.04021 |
2 | 2 | 242971.75017 |
3 | 3 | 200414.60491 |
4 | 4 | 149249.98639 |
5 | 5 | 104333.67793 |
6 | 6 | 153383.16287 |
# total payouts on all Tuesdays in 2018 by country
# (.copy() avoids a SettingWithCopyWarning when we add a column later)
merged_payouts_countries = merged_payouts_countries[merged_payouts_countries['weekday'] == 1].copy()
merged_payouts_countries
| | weekday | country | amount_in_thousand_$ |
---|---|---|---|
25 | 1 | AT | 175.48883 |
26 | 1 | AU | 2725.86024 |
27 | 1 | BE | 231.48953 |
28 | 1 | BR | 148.90838 |
29 | 1 | CA | 3680.16026 |
30 | 1 | CH | 139.54734 |
31 | 1 | DE | 1435.43346 |
32 | 1 | DK | 94.77140 |
33 | 1 | ES | 1088.30410 |
34 | 1 | FI | 105.38809 |
35 | 1 | FR | 3204.91741 |
36 | 1 | GB | 7847.86131 |
37 | 1 | HK | 458.94807 |
38 | 1 | IE | 375.22717 |
39 | 1 | IT | 360.38189 |
40 | 1 | JP | 584.47990 |
41 | 1 | LU | 3.49237 |
42 | 1 | MX | 3.38985 |
43 | 1 | NL | 253.49345 |
44 | 1 | NO | 154.53824 |
45 | 1 | NZ | 299.22449 |
46 | 1 | PT | 38.03378 |
47 | 1 | SE | 204.93012 |
48 | 1 | SG | 41.29643 |
49 | 1 | US | 270144.47410 |
On average, ~$5.2 million is paid out each Tuesday to the US, about 92% of all Tuesday payouts across all countries. The expected payout on Jan. 1, 2019 for each country appears in the last column below.
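As a quick sanity check on those two figures (a hypothetical verification step using the Tuesday totals from the tables above, not part of the original analysis):

us_tuesday = 270144.47410   # total US Tuesday payouts in 2018, thousand $
all_tuesday = 293800.04021  # total Tuesday payouts across all countries, thousand $
print(f"US share of Tuesday payouts: {us_tuesday / all_tuesday:.1%}")  # ~91.9%
print(f"Average US Tuesday payout: ${us_tuesday / 52 * 1000:,.0f}")    # ~$5,195,086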
# Since 2019-01-01 is a Tuesday, we divide the total amount paid out on Tuesdays by 52,
# the number of Tuesdays in 2018. Here's the breakdown:
merged_payouts_countries['expected_amount_Jan_1_2019_in_thousand_$'] = merged_payouts_countries['amount_in_thousand_$'].div(52)
merged_payouts_countries
| | weekday | country | amount_in_thousand_$ | expected_amount_Jan_1_2019_in_thousand_$ |
---|---|---|---|---|
25 | 1 | AT | 175.48883 | 3.374785 |
26 | 1 | AU | 2725.86024 | 52.420389 |
27 | 1 | BE | 231.48953 | 4.451722 |
28 | 1 | BR | 148.90838 | 2.863623 |
29 | 1 | CA | 3680.16026 | 70.772313 |
30 | 1 | CH | 139.54734 | 2.683603 |
31 | 1 | DE | 1435.43346 | 27.604490 |
32 | 1 | DK | 94.77140 | 1.822527 |
33 | 1 | ES | 1088.30410 | 20.928925 |
34 | 1 | FI | 105.38809 | 2.026694 |
35 | 1 | FR | 3204.91741 | 61.633027 |
36 | 1 | GB | 7847.86131 | 150.920410 |
37 | 1 | HK | 458.94807 | 8.825924 |
38 | 1 | IE | 375.22717 | 7.215907 |
39 | 1 | IT | 360.38189 | 6.930421 |
40 | 1 | JP | 584.47990 | 11.239998 |
41 | 1 | LU | 3.49237 | 0.067161 |
42 | 1 | MX | 3.38985 | 0.065189 |
43 | 1 | NL | 253.49345 | 4.874874 |
44 | 1 | NO | 154.53824 | 2.971889 |
45 | 1 | NZ | 299.22449 | 5.754317 |
46 | 1 | PT | 38.03378 | 0.731419 |
47 | 1 | SE | 204.93012 | 3.940964 |
48 | 1 | SG | 41.29643 | 0.794162 |
49 | 1 | US | 270144.47410 | 5195.086040 |
To better forecast the payouts made to each country on Jan. 1, 2019, we could build a time-series model (e.g., SARIMA) that accounts for trend and seasonality, but that requires more time than was given for this project (6 hours).
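For reference, here is a minimal sketch of what that could look like with statsmodels' SARIMAX, assuming a daily total-payout series built from the payouts table; the order/seasonal_order values are placeholders, not tuned:

from statsmodels.tsa.statespace.sarimax import SARIMAX

# hypothetical: daily total payouts as a date-indexed series
series = payouts.groupby('date')['amount_in_thousand_$'].sum()
series.index = pd.to_datetime(series.index)
series = series.asfreq('D')

# placeholder orders; in practice they would be chosen via AIC / residual diagnostics
model = SARIMAX(series, order=(1, 1, 1), seasonal_order=(1, 1, 1, 7))
fit = model.fit(disp=False)
print(fit.forecast(steps=1))  # one-step-ahead forecast: 2019-01-01

A per-country forecast would repeat this on each country's own series.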
daily = payouts.drop(['platform_id', 'recipient_id', 'amount', 'weekday'], axis=1)
daily = daily.groupby(['date'], as_index=False).sum()
daily_count = daily.drop(['amount_in_thousand_$'], axis=1)
daily_amount = daily.drop(['count'], axis=1)
daily_avg_payout = daily.copy()
daily_avg_payout['avg_payout_in_thousand_$'] = daily_avg_payout['amount_in_thousand_$'] / daily_avg_payout['count']
daily_avg_payout = daily_avg_payout.drop(['count', 'amount_in_thousand_$'], axis=1)
daily_avg_payout
| | date | avg_payout_in_thousand_$ |
---|---|---|
0 | 2018-01-01 | 1.033032 |
1 | 2018-01-02 | 0.846291 |
2 | 2018-01-03 | 1.047811 |
3 | 2018-01-04 | 1.091858 |
4 | 2018-01-05 | 0.873750 |
... | ... | ... |
360 | 2018-12-27 | 0.654588 |
361 | 2018-12-28 | 0.938263 |
362 | 2018-12-29 | 1.064002 |
363 | 2018-12-30 | 0.996324 |
364 | 2018-12-31 | 0.807593 |
365 rows × 2 columns
Below we can see a positive trend in the number of transactions on the platforms, which is a good indicator. Total weekly payout amounts are also trending upward, with a peak in July that may indicate seasonality; however, we would need data from other years to confirm that.
Looking at the weekly average payout amount per transaction, we see a slight negative trend, meaning that customers, on average, are spending less per transaction over time.
# weekly count of transactions over time
daily_count.date = pd.to_datetime(daily_count.date)
daily_count.set_index('date', inplace=True)
daily_count = daily_count.sort_index()
daily_count = daily_count[:-1]  # drop 2018-12-31 (a Monday), which would form a one-day partial week
weekly_count = daily_count.resample('W').sum()
weekly_count.plot()
[Figure: weekly transaction count over 2018]
# weekly payout amount over time
daily_amount.date = pd.to_datetime(daily_amount.date)
daily_amount.set_index('date', inplace=True)
daily_amount = daily_amount.sort_index()
daily_amount = daily_amount[:-1]  # again drop the one-day partial week
weekly_amount = daily_amount.resample('W').sum()
weekly_amount.plot()
[Figure: total weekly payout amount over 2018]
# weekly average payout amount per transaction over time
daily_avg_payout.date = pd.to_datetime(daily_avg_payout.date)
daily_avg_payout.set_index('date', inplace=True)
daily_avg_payout = daily_avg_payout.sort_index()
daily_avg_payout = daily_avg_payout[:-1]  # drop the one-day partial week
weekly_avg_payout = daily_avg_payout.resample('W').mean()  # mean, not sum: summing daily averages would inflate the weekly figure
weekly_avg_payout.plot()
[Figure: weekly average payout per transaction over 2018]
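To put a rough number on that negative trend, one option (a hypothetical check fitting a straight line to the weekly averages with numpy's polyfit, not part of the original analysis) is:

y = weekly_avg_payout['avg_payout_in_thousand_$'].to_numpy()
x = np.arange(len(y))
slope, intercept = np.polyfit(x, y, 1)  # least-squares linear fit
print(f"Linear trend: {slope:+.6f} thousand $ per week")  # a negative slope confirms the declining average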
QUESTION 2
We can see that 4 merchants were assigned to industries that appear only once in the data, likely due to manual-entry error. We map Financial Services and Real Estate to Professional Services (non-medical), internet software & services to Other Software & Content, and hotels, restaurants & leisure to Travel & Hospitality.
industries['industry'].value_counts()
Other Software & Content                             26037
Travel & Hospitality                                 18370
Other Services                                       15902
Professional Services (non-medical)                  14527
Food & Beverage                                      12221
Non-profit                                            9269
Tickets: concerts,sports,movies,conventions           8081
Content                                               4730
Education                                             1010
Retail Fashion                                         994
Other Retail                                           603
Healthcare                                             389
Direct Services: classes,memberships,appointments      171
Financial Services                                       1
internet software & services                             1
Real Estate                                              1
hotels, restaurants & leisure                            1
Name: industry, dtype: int64
industries['industry'] = industries['industry'].replace({
    'hotels, restaurants & leisure': 'Travel & Hospitality',
    'Real Estate': 'Professional Services (non-medical)',
    'internet software & services': 'Other Software & Content',
    'Financial Services': 'Professional Services (non-medical)',
})
industries['industry'].value_counts()
Other Software & Content                             26038
Travel & Hospitality                                 18371
Other Services                                       15902
Professional Services (non-medical)                  14529
Food & Beverage                                      12221
Non-profit                                            9269
Tickets: concerts,sports,movies,conventions           8081
Content                                               4730
Education                                             1010
Retail Fashion                                         994
Other Retail                                           603
Healthcare                                             389
Direct Services: classes,memberships,appointments      171
Name: industry, dtype: int64
# create another df without the columns we no longer need
payouts_4 = payouts.drop(['date', 'recipient_id', 'count'], axis=1)
merged_payouts_industries = pd.merge(payouts_4, industries, how='left', left_on='platform_id', right_on='merchant_id')
#there are 463 unique platforms
merged_payouts_industries['platform_id'].nunique()
463
breakdown_platforms_industries = merged_payouts_industries.groupby('industry')['platform_id'].nunique()
breakdown_platforms_industries
industry
Content                                                6
Direct Services: classes,memberships,appointments      7
Education                                              5
Food & Beverage                                       15
Healthcare                                             5
Non-profit                                            16
Other Retail                                          10
Other Services                                       105
Other Software & Content                             179
Professional Services (non-medical)                   26
Retail Fashion                                        19
Tickets: concerts,sports,movies,conventions           10
Travel & Hospitality                                  13
Name: platform_id, dtype: int64
# only ~0.01% of payouts (210 rows) lack an industry, so we can drop them
merged_payouts_industries.info()
# remove unnecessary columns
merged_payouts_industries = merged_payouts_industries.drop(['platform_id','merchant_id'], axis = 1)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1622642 entries, 0 to 1622641
Data columns (total 6 columns):
 #   Column                Non-Null Count    Dtype
---  ------                --------------    -----
 0   platform_id           1622642 non-null  object
 1   amount                1622642 non-null  float64
 2   amount_in_thousand_$  1622642 non-null  float64
 3   weekday               1622642 non-null  int64
 4   merchant_id           1622437 non-null  object
 5   industry              1622432 non-null  object
dtypes: float64(2), int64(1), object(3)
memory usage: 86.7+ MB
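As with countries, the groupby below silently excludes rows whose industry key is NaN; to make the drop explicit (same optional pattern as before):

# explicitly drop the ~0.01% of payouts with no matching industry
merged_payouts_industries = merged_payouts_industries.dropna(subset=['industry'])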
merged_payouts_industries_grouped = merged_payouts_industries.groupby(['industry','weekday'], as_index=False).sum()
relevant_industries = ['Education','Food & Beverage', 'Travel & Hospitality']
df = merged_payouts_industries_grouped[merged_payouts_industries_grouped.industry.isin(relevant_industries)]
Below is a daily breakdown of payouts made by Education, Food & Beverage, and Travel & Hospitality platforms. In 2018, Education and Travel & Hospitality platforms had their highest total payouts on Tuesdays, while Food & Beverage platforms had theirs on Wednesdays.
df
| | industry | weekday | amount | amount_in_thousand_$ |
---|---|---|---|---|
14 | Education | 0 | 6.628098e+08 | 6628.09847 |
15 | Education | 1 | 9.448335e+08 | 9448.33509 |
16 | Education | 2 | 5.089684e+08 | 5089.68407 |
17 | Education | 3 | 4.424264e+08 | 4424.26442 |
18 | Education | 4 | 3.612140e+06 | 36.12140 |
19 | Education | 5 | 2.577729e+06 | 25.77729 |
20 | Education | 6 | 3.850287e+08 | 3850.28749 |
21 | Food & Beverage | 0 | 5.313783e+08 | 5313.78348 |
22 | Food & Beverage | 1 | 4.496764e+08 | 4496.76394 |
23 | Food & Beverage | 2 | 2.402195e+09 | 24021.95057 |
24 | Food & Beverage | 3 | 4.034605e+08 | 4034.60499 |
25 | Food & Beverage | 4 | 1.833658e+08 | 1833.65797 |
26 | Food & Beverage | 5 | 7.422723e+07 | 742.27226 |
27 | Food & Beverage | 6 | 1.736782e+08 | 1736.78160 |
84 | Travel & Hospitality | 0 | 5.892464e+08 | 5892.46359 |
85 | Travel & Hospitality | 1 | 8.636227e+08 | 8636.22718 |
86 | Travel & Hospitality | 2 | 6.437264e+08 | 6437.26413 |
87 | Travel & Hospitality | 3 | 4.986030e+08 | 4986.03013 |
88 | Travel & Hospitality | 4 | 2.447141e+08 | 2447.14092 |
89 | Travel & Hospitality | 5 | 9.416848e+07 | 941.68479 |
90 | Travel & Hospitality | 6 | 5.307999e+08 | 5307.99933 |
breakdown_platforms_industries
industry
Content                                                6
Direct Services: classes,memberships,appointments      7
Education                                              5
Food & Beverage                                       15
Healthcare                                             5
Non-profit                                            16
Other Retail                                          10
Other Services                                       105
Other Software & Content                             179
Professional Services (non-medical)                   26
Retail Fashion                                        19
Tickets: concerts,sports,movies,conventions           10
Travel & Hospitality                                  13
Name: platform_id, dtype: int64
The breakdown above shows that we currently have 15 platforms in the Food & Beverage industry, 5 in Education, and 13 in Travel & Hospitality.
Since we expect the number of platforms to grow in Education (from 5 to 15, a factor of 3) and in Food & Beverage (from 15 to 40, a factor of 40/15), and to shrink in Travel & Hospitality (from 13 to 5, a factor of 5/13), we multiply the total payouts by the corresponding factor.
# .copy() avoids SettingWithCopyWarning when adding columns to the subsets
education = df[df.industry == 'Education'].copy()
food_beverage = df[df.industry == 'Food & Beverage'].copy()
travel_hospitality = df[df.industry == 'Travel & Hospitality'].copy()
# multiply payouts by the expected growth in the number of platforms per industry
travel_hospitality['amount_expected_in_thousands'] = travel_hospitality['amount_in_thousand_$'] * (5/13)
travel_hospitality['amount_expected_in_thousands_per_day'] = travel_hospitality['amount_expected_in_thousands'] / 52
food_beverage['amount_expected_in_thousands'] = food_beverage['amount_in_thousand_$'] * (40/15)
food_beverage['amount_expected_in_thousands_per_day'] = food_beverage['amount_expected_in_thousands'] / 52
education['amount_expected_in_thousands'] = education['amount_in_thousand_$'] * 3
education['amount_expected_in_thousands_per_day'] = education['amount_expected_in_thousands'] / 52
travel_hospitality
| | industry | weekday | amount | amount_in_thousand_$ | amount_expected_in_thousands | amount_expected_in_thousands_per_day |
---|---|---|---|---|---|---|
84 | Travel & Hospitality | 0 | 589246359.0 | 5892.46359 | 2266.332150 | 43.583311 |
85 | Travel & Hospitality | 1 | 863622718.0 | 8636.22718 | 3321.625838 | 63.877420 |
86 | Travel & Hospitality | 2 | 643726413.0 | 6437.26413 | 2475.870819 | 47.612900 |
87 | Travel & Hospitality | 3 | 498603013.0 | 4986.03013 | 1917.703896 | 36.878921 |
88 | Travel & Hospitality | 4 | 244714092.0 | 2447.14092 | 941.208046 | 18.100155 |
89 | Travel & Hospitality | 5 | 94168479.0 | 941.68479 | 362.186458 | 6.965124 |
90 | Travel & Hospitality | 6 | 530799933.0 | 5307.99933 | 2041.538204 | 39.260350 |
food_beverage
| | industry | weekday | amount | amount_in_thousand_$ | amount_expected_in_thousands | amount_expected_in_thousands_per_day |
---|---|---|---|---|---|---|
21 | Food & Beverage | 0 | 5.313783e+08 | 5313.78348 | 14170.089280 | 272.501717 |
22 | Food & Beverage | 1 | 4.496764e+08 | 4496.76394 | 11991.370507 | 230.603279 |
23 | Food & Beverage | 2 | 2.402195e+09 | 24021.95057 | 64058.534853 | 1231.894901 |
24 | Food & Beverage | 3 | 4.034605e+08 | 4034.60499 | 10758.946640 | 206.902820 |
25 | Food & Beverage | 4 | 1.833658e+08 | 1833.65797 | 4889.754587 | 94.033742 |
26 | Food & Beverage | 5 | 7.422723e+07 | 742.27226 | 1979.392693 | 38.065244 |
27 | Food & Beverage | 6 | 1.736782e+08 | 1736.78160 | 4631.417600 | 89.065723 |
education
| | industry | weekday | amount | amount_in_thousand_$ | amount_expected_in_thousands | amount_expected_in_thousands_per_day |
---|---|---|---|---|---|---|
14 | Education | 0 | 662809847.0 | 6628.09847 | 19884.29541 | 382.390296 |
15 | Education | 1 | 944833509.0 | 9448.33509 | 28345.00527 | 545.096255 |
16 | Education | 2 | 508968407.0 | 5089.68407 | 15269.05221 | 293.635619 |
17 | Education | 3 | 442426442.0 | 4424.26442 | 13272.79326 | 255.246024 |
18 | Education | 4 | 3612140.0 | 36.12140 | 108.36420 | 2.083927 |
19 | Education | 5 | 2577729.0 | 25.77729 | 77.33187 | 1.487151 |
20 | Education | 6 | 385028749.0 | 3850.28749 | 11550.86247 | 222.131971 |
After getting the expected average payouts for each typical day of the week, we can calculate the total expected payout for each relevant industry in 2019.
# let's calculate the total payout
df1 = df.drop(['weekday', 'amount'], axis=1)
df1 = df1.groupby(['industry'], as_index=False).sum()
# .copy() again avoids SettingWithCopyWarning when adding columns below
education_total = df1[df1.industry == 'Education'].copy()
food_beverage_total = df1[df1.industry == 'Food & Beverage'].copy()
travel_hospitality_total = df1[df1.industry == 'Travel & Hospitality'].copy()
# multiply payouts by the expected growth in the number of platforms per industry
travel_hospitality_total['amount_per_day_current'] = travel_hospitality_total['amount_in_thousand_$'] / 365
travel_hospitality_total['amount_expected_in_thousand'] = travel_hospitality_total['amount_in_thousand_$'] * (5/13)
travel_hospitality_total['amount_expected_in_thousand_per_day'] = travel_hospitality_total['amount_expected_in_thousand'] / 365
food_beverage_total['amount_per_day_current'] = food_beverage_total['amount_in_thousand_$'] / 365
food_beverage_total['amount_expected_in_thousand'] = food_beverage_total['amount_in_thousand_$'] * (40/15)
food_beverage_total['amount_expected_in_thousand_per_day'] = food_beverage_total['amount_expected_in_thousand'] / 365
education_total['amount_per_day_current'] = education_total['amount_in_thousand_$'] / 365
education_total['amount_expected_in_thousand'] = education_total['amount_in_thousand_$'] * 3
education_total['amount_expected_in_thousand_per_day'] = education_total['amount_expected_in_thousand'] / 365
We can see that Food & Beverage platforms are expected to have the highest total payouts in 2019 (~$112.5 million), while Travel & Hospitality platforms are expected to have the lowest (~$13.3 million).
travel_hospitality_total
| | industry | amount_in_thousand_$ | amount_per_day_current | amount_expected_in_thousand | amount_expected_in_thousand_per_day |
---|---|---|---|---|---|
2 | Travel & Hospitality | 34648.81007 | 94.928247 | 13326.465412 | 36.510864 |
food_beverage_total
| | industry | amount_in_thousand_$ | amount_per_day_current | amount_expected_in_thousand | amount_expected_in_thousand_per_day |
---|---|---|---|---|---|
1 | Food & Beverage | 42179.81481 | 115.561136 | 112479.50616 | 308.163031 |
education_total
| | industry | amount_in_thousand_$ | amount_per_day_current | amount_expected_in_thousand | amount_expected_in_thousand_per_day |
---|---|---|---|---|---|
0 | Education | 29502.56823 | 80.828954 | 88507.70469 | 242.486862 |
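For a single side-by-side view of the three industries, one small convenience step (not in the original notebook) is to stack the three totals:

# stack the per-industry totals and sort by expected 2019 payout
summary = pd.concat([education_total, food_beverage_total, travel_hospitality_total])
summary = summary.sort_values('amount_expected_in_thousand', ascending=False)
summary[['industry', 'amount_in_thousand_$', 'amount_expected_in_thousand']]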