Scope of The Analysis
To help Olist, a brazilian e-commerce platform, to increase its market share and revenue.
Final Result
Here is the Github repository with all the code used.
And this is the final result (on Tableau):
If you prefer to watch it from the official Tableau website, here is the link.
The Analysis
Below there’s the analysis I’ve done using Python and its libraries. For a better view click this link.
In [ ]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
from scipy.stats import f_oneway
In [ ]:
engine_txt = 'postgresql://postgres:password@localhost:5432/olist'
engine = create_engine(engine_txt)
Total products bought by state¶
In [ ]:
query = """
WITH orders_by_state AS(
SELECT
customers.customer_state as state,
SUM(payments.payment_installments) AS tot_order
FROM order_payments AS payments
JOIN orders ON orders.order_id = payments.order_id
JOIN customers ON customers.customer_id = orders.customer_id
GROUP BY customers.customer_state
ORDER BY SUM(payments.payment_installments) DESC
)
SELECT
*,
SUM(tot_order) OVER (ORDER BY tot_order DESC) / (SELECT SUM(tot_order)
FROM orders_by_state) AS cum_share
FROM orders_by_state
"""
orders_by_state = pd.read_sql(query, engine)
orders_by_state.head()
Out[ ]:
state | tot_order | cum_share | |
---|---|---|---|
0 | SP | 114377 | 0.385857 |
1 | RJ | 40110 | 0.521171 |
2 | MG | 36091 | 0.642926 |
3 | RS | 16847 | 0.699760 |
4 | PR | 15016 | 0.750417 |
In [ ]:
data = {
'states': [
'São Paulo', 'Rio de Janeiro', 'Minas Gerais', 'Paraná', 'Rio Grande do Sul',
'Santa Catarina', 'Bahia', 'Distrito Federal', 'Goiás', 'Pará', 'Pernambuco',
'Mato Grosso', 'Ceará', 'Espírito Santo', 'Mato Grosso do Sul', 'Amazonas',
'Maranhão', 'Rio Grande do Norte', 'Paraíba', 'Alagoas', 'Piauí', 'Rondônia',
'Sergipe', 'Tocantins', 'Amapá', 'Acre', 'Roraima'
],
'gdp': [
2377.64, 759.82, 682.79, 487.93, 470.94, 349.28, 305.32, 265.85, 224.13, 215.94,
193.31, 178.65, 166.92, 138.45, 122.63, 116.02, 106.92, 71.58, 70.29, 63.2, 56.39,
51.6, 45.41, 43.65, 18.47, 16.48, 16.02
],
'iso_code': [
'SP', 'RJ', 'MG', 'PR', 'RS', 'SC', 'BA', 'DF', 'GO', 'PA', 'PE', 'MT', 'CE', 'ES',
'MS', 'AM', 'MA', 'RN', 'PB', 'AL', 'PI', 'RO', 'SE', 'TO', 'AP', 'AC', 'RR'
]
}
info_states = pd.DataFrame(data)
In [ ]:
orders_by_state = pd.merge(orders_by_state, info_states, left_on='state', right_on='iso_code').drop(columns='iso_code')
orders_by_state.to_csv(r'C:\Users\matte\OneDrive\Desktop\GitHub\data\ecom_brazil\tableau_good\tot_prod_bought_by_state.csv',
index=False)
In [ ]:
orders_by_state[['tot_order', 'gdp']].corr()
Out[ ]:
tot_order | gdp | |
---|---|---|
tot_order | 1.000000 | 0.992012 |
gdp | 0.992012 | 1.000000 |
In [ ]:
top_8 = orders_by_state[orders_by_state['cum_share']<0.85]
top_8
Out[ ]:
state | tot_order | cum_share | states | gdp | |
---|---|---|---|---|---|
0 | SP | 114377 | 0.385857 | São Paulo | 2377.64 |
1 | RJ | 40110 | 0.521171 | Rio de Janeiro | 759.82 |
2 | MG | 36091 | 0.642926 | Minas Gerais | 682.79 |
3 | RS | 16847 | 0.699760 | Rio Grande do Sul | 470.94 |
4 | PR | 15016 | 0.750417 | Paraná | 487.93 |
5 | BA | 11509 | 0.789244 | Bahia | 305.32 |
6 | SC | 10760 | 0.825543 | Santa Catarina | 349.28 |
7 | ES | 6301 | 0.846800 | Espírito Santo | 138.45 |
State with the highest order value¶
In [ ]:
#ANOVA test
query = """
SELECT
SUM(payment_value::NUMERIC) AS order_value,
customers.customer_state AS state
FROM orders
JOIN customers ON customers.customer_id = orders.customer_id
JOIN order_payments AS payments ON payments.order_id = orders.order_id
GROUP BY customers.customer_state, orders.order_id
"""
df = pd.read_sql(query, engine)
df.head()
Out[ ]:
order_value | state | |
---|---|---|
0 | 20.24 | SP |
1 | 21.06 | PR |
2 | 9.66 | SP |
3 | 21.22 | SP |
4 | 9.55 | PR |
In [ ]:
states = []
for state in df['state'].unique():
states.append(df.loc[df['state']==state, 'order_value'].values)
f_oneway(*states)
# The order value is influenced by the state
Out[ ]:
F_onewayResult(statistic=34.62863411435716, pvalue=3.149889058834275e-172)
In [ ]:
query = """
WITH order_values AS(
SELECT
SUM(payment_value::NUMERIC) AS order_value,
customers.customer_state AS state
FROM orders
JOIN customers ON customers.customer_id = orders.customer_id
JOIN order_payments AS payments ON payments.order_id = orders.order_id
GROUP BY customers.customer_state, orders.order_id
)
SELECT
state,
AVG(order_value) as mean_order_value,
STDDEV(order_value) as std_order_value
FROM order_values
GROUP BY state
ORDER BY AVG(order_value) DESC
"""
order_value_by_states = pd.read_sql(query, engine)
order_value_by_states.head()
Out[ ]:
state | mean_order_value | std_order_value | |
---|---|---|---|
0 | PB | 49.355168 | 73.505447 |
1 | AC | 45.410370 | 43.188524 |
2 | RO | 44.962806 | 57.541384 |
3 | AP | 44.697353 | 51.895554 |
4 | AL | 43.878232 | 51.838240 |
Is there a relation between avg_order_value and tot_orders by state?¶
In [ ]:
df_states = pd.merge(orders_by_state, order_value_by_states, on='state').rename(columns={'states':'state_name'})
df_states.head()
Out[ ]:
state | tot_order | cum_share | state_name | gdp | mean_order_value | std_order_value | |
---|---|---|---|---|---|---|---|
0 | SP | 114377 | 0.385857 | São Paulo | 2377.64 | 26.852245 | 36.135222 |
1 | RJ | 40110 | 0.521171 | Rio de Janeiro | 759.82 | 31.182029 | 45.748465 |
2 | MG | 36091 | 0.642926 | Minas Gerais | 682.79 | 30.072543 | 39.704736 |
3 | RS | 16847 | 0.699760 | Rio Grande do Sul | 470.94 | 30.460115 | 37.665875 |
4 | PR | 15016 | 0.750417 | Paraná | 487.93 | 30.047903 | 40.224941 |
In [ ]:
df_states.to_csv(r'C:\Users\matte\OneDrive\Desktop\GitHub\data\ecom_brazil\tableau_good\info_all_statea.csv',
index=False)
In [ ]:
df_states[['gdp', 'mean_order_value']].corr()
Out[ ]:
gdp | mean_order_value | |
---|---|---|
gdp | 1.000000 | -0.610035 |
mean_order_value | -0.610035 | 1.000000 |
Confidece intervals avg order value¶
In [ ]:
def mean_conf_int(df, category, to_test, value, alpha=0.05, do_return=True):
'''Calculate the confidence interval for the difference between 2 means.
The Central Limit Theorem applies.'''
mean1 = df.loc[df[category]==to_test[0], value].mean()
mean2 = df.loc[df[category]==to_test[1], value].mean()
std1 = df.loc[df[category]==to_test[0], value].std()
std2 = df.loc[df[category]==to_test[1], value].std()
n1 = df[df[category]==to_test[0]].shape[0]
n2 = df[df[category]==to_test[1]].shape[0]
std_error = np.sqrt((std1**2 / n1) + (std2**2 / n2))
z_critical = stats.norm.ppf(1 - alpha / 2)
margin_of_error = z_critical * std_error
lower_bound = (mean1 - mean2) - margin_of_error
upper_bound = (mean1 - mean2) + margin_of_error
if do_return:
return lower_bound, upper_bound
else:
print(f'{to_test[0]} vs. {to_test[1]}')
print(f'Upper bound: {upper_bound} —— Lower bound: {lower_bound}')
In [ ]:
conf_intervals = {'PB_vs':[], 'up_bound':[], 'lo_bound':[]}
prev = 'PB'
for state in order_value_by_states['state'].values:
if state != prev:
conf_intervals['PB_vs'].append(state)
curr = state
lo, hi = mean_conf_int(df, 'state', (prev, curr), 'order_value')
conf_intervals['lo_bound'].append(lo)
conf_intervals['up_bound'].append(hi)
ci_order_value = pd.DataFrame(conf_intervals)
In [ ]:
ci_order_value = ci_order_value.sort_values('lo_bound', ascending=False).reset_index(drop=True)
ci_order_value.head()
Out[ ]:
PB_vs | up_bound | lo_bound | |
---|---|---|---|
0 | SP | 28.735361 | 16.270485 |
1 | MG | 25.547098 | 13.018152 |
2 | PR | 25.628275 | 12.986255 |
3 | ES | 25.996509 | 12.784727 |
4 | RS | 25.197448 | 12.592657 |
In [ ]:
ci_order_value.head(9)
Out[ ]:
PB_vs | up_bound | lo_bound | |
---|---|---|---|
0 | SP | 28.735361 | 16.270485 |
1 | MG | 25.547098 | 13.018152 |
2 | PR | 25.628275 | 12.986255 |
3 | ES | 25.996509 | 12.784727 |
4 | RS | 25.197448 | 12.592657 |
5 | RJ | 24.445993 | 11.900284 |
6 | DF | 24.848277 | 11.833432 |
7 | SC | 23.739358 | 10.936884 |
8 | GO | 23.484587 | 10.446046 |
In [ ]:
top_8
Out[ ]:
state | tot_order | cum_share | states | gdp | |
---|---|---|---|---|---|
0 | SP | 114377 | 0.385857 | São Paulo | 2377.64 |
1 | RJ | 40110 | 0.521171 | Rio de Janeiro | 759.82 |
2 | MG | 36091 | 0.642926 | Minas Gerais | 682.79 |
3 | RS | 16847 | 0.699760 | Rio Grande do Sul | 470.94 |
4 | PR | 15016 | 0.750417 | Paraná | 487.93 |
5 | BA | 11509 | 0.789244 | Bahia | 305.32 |
6 | SC | 10760 | 0.825543 | Santa Catarina | 349.28 |
7 | ES | 6301 | 0.846800 | Espírito Santo | 138.45 |
In [ ]:
plt.barh(ci_order_value['PB_vs'], ci_order_value['up_bound']-ci_order_value['lo_bound'], left=ci_order_value['lo_bound'])
plt.title('Conf. Int. Avg Order Value by State')
plt.show()
In [ ]:
ci_order_value.to_csv(r'C:\Users\matte\OneDrive\Desktop\GitHub\data\ecom_brazil\tableau_good\order_value_vs_state.csv',
index=False)
State with the most recurring customers¶
In [ ]:
# ANOVA test
query = '''
SELECT
customers.customer_state AS state,
CASE
WHEN COUNT(*)>1 THEN 1
ELSE 0
END AS is_recurring
FROM orders
JOIN customers ON customers.customer_id = orders.customer_id
GROUP BY customers.customer_state, customers.customer_unique_id
'''
anova_t_recur = pd.read_sql(query, engine)
anova_t_recur.head()
Out[ ]:
state | is_recurring | |
---|---|---|
0 | MA | 0 |
1 | CE | 0 |
2 | ES | 0 |
3 | MG | 0 |
4 | MG | 0 |
In [ ]:
states = []
for state in anova_t_recur['state'].unique():
states.append(anova_t_recur.loc[df['state']==state, 'is_recurring'].values)
f_oneway(*states)
Out[ ]:
F_onewayResult(statistic=1.0598090762110413, pvalue=0.380720876490711)
Confidence intervals recurring customers proportion¶
In [ ]:
query = '''
WITH recurring_tab AS(
SELECT
customers.customer_state AS state,
COUNT(*) AS num_orders,
CASE
WHEN COUNT(*)>1 THEN 1
ELSE 0
END AS is_recurring
FROM orders
JOIN customers ON customers.customer_id = orders.customer_id
GROUP BY customers.customer_state, customers.customer_unique_id
),
pi_table AS(
SELECT
state,
COUNT(*) AS size,
SUM(is_recurring) AS recurring_size,
ROUND(SUM(is_recurring)::NUMERIC/COUNT(*)::NUMERIC, 4) AS pi
FROM recurring_tab
GROUP BY state
)
SELECT *
FROM pi_table
WHERE size*pi*(1-pi) > 15
ORDER BY pi DESC
'''
recurring_by_state = pd.read_sql(query, engine)
recurring_by_state.head()
Out[ ]:
state | size | recurring_size | pi | |
---|---|---|---|---|
0 | RJ | 12384 | 421 | 0.0340 |
1 | MT | 876 | 29 | 0.0331 |
2 | GO | 1952 | 64 | 0.0328 |
3 | SP | 40302 | 1296 | 0.0322 |
4 | RS | 5277 | 167 | 0.0316 |
In [ ]:
def proportion_conf_int(df, category, to_test, pi='pi', size='recurring_size', alpha=0.05, do_return=True):
'''Calculate the confidence interval for the difference between 2 proportions.
The Central Limit Theorem applies.'''
pi1 = df.loc[df[category]==to_test[0], pi].values[0]
pi2 = df.loc[df[category]==to_test[1], pi].values[0]
n1 = df.loc[df[category]==to_test[0], size].values[0]
n2 = df.loc[df[category]==to_test[1], size].values[0]
std_error = np.sqrt((pi1*(1-pi1) / n1) + (pi2*(1-pi2) / n2))
z_critical = stats.norm.ppf(1 - alpha / 2)
margin_of_error = z_critical * std_error
lower_bound = (pi1 - pi2) - margin_of_error
upper_bound = (pi1 - pi2) + margin_of_error
if do_return:
return lower_bound, upper_bound
else:
print(f'{to_test[0]} vs. {to_test[1]}')
print(f'Upper bound: {upper_bound} —— Lower bound: {lower_bound}')
In [ ]:
conf_intervals = {'RJ_vs':[], 'up_bound':[], 'lo_bound':[]}
prev = 'RJ'
for state in recurring_by_state['state'].values:
if state != prev:
conf_intervals['RJ_vs'].append(state)
curr = state
lo, hi = proportion_conf_int(recurring_by_state, 'state', (prev, curr))
conf_intervals['lo_bound'].append(lo)
conf_intervals['up_bound'].append(hi)
ci_proportion_value = pd.DataFrame(conf_intervals)
In [ ]:
plt.barh(ci_proportion_value['RJ_vs'], ci_proportion_value['up_bound']-ci_proportion_value['lo_bound'], left=ci_proportion_value['lo_bound'])
plt.title('Conf. Int. % Recurring Customers by State')
plt.show()
In [ ]:
ci_proportion_value.to_csv(r'C:\Users\matte\OneDrive\Desktop\GitHub\data\ecom_brazil\tableau_good\recurring_pi_by_state.csv',
index=False)
Payment method with the highest average order value¶
In [ ]:
query = '''
SELECT
AVG(payment_value::NUMERIC) AS avg_order_value,
COUNT(*) AS num_transactions,
STDDEV(payment_value::NUMERIC) AS std,
payment_type
FROM order_payments
WHERE payment_value > '0€'
AND payment_sequential = 1
GROUP BY payment_type
'''
pay_types = pd.read_sql(query, engine)
pay_types
Out[ ]:
avg_order_value | num_transactions | std | payment_type | |
---|---|---|---|---|
0 | 27.104754 | 19783 | 39.922639 | boleto |
1 | 30.536987 | 76469 | 41.541676 | credit_card |
2 | 26.681889 | 1477 | 46.499899 | debit_card |
3 | 18.133068 | 1620 | 32.299097 | voucher |
In [ ]:
def mean_conf_int(df, category, to_test, mean, std, size, alpha=0.05, do_return=True):
'''Calculate the confidence interval for the difference between 2 means.
The Central Limit Theorem applies.'''
mean1 = df.loc[df[category]==to_test[0], mean].values[0]
mean2 = df.loc[df[category]==to_test[1], mean].values[0]
std1 = df.loc[df[category]==to_test[0], std].values[0]
std2 = df.loc[df[category]==to_test[1], std].values[0]
n1 = df.loc[df[category]==to_test[0], size].values[0]
n2 = df.loc[df[category]==to_test[1], size].values[0]
std_error = np.sqrt((std1**2 / n1) + (std2**2 / n2))
z_critical = stats.norm.ppf(1 - alpha / 2)
margin_of_error = z_critical * std_error
lower_bound = (mean1 - mean2) - margin_of_error
upper_bound = (mean1 - mean2) + margin_of_error
if do_return:
return lower_bound, upper_bound
else:
print(f'{to_test[0]} vs. {to_test[1]}')
print(f'Upper bound: {upper_bound} —— Lower bound: {lower_bound}')
In [ ]:
conf_intervals = {'CC_vs':[], 'up_bound':[], 'lo_bound':[]}
prev = 'credit_card'
for type in pay_types['payment_type'].values:
if type != prev:
lo, hi = mean_conf_int(pay_types, 'payment_type', (prev, type),
'avg_order_value', 'std', 'num_transactions')
conf_intervals['CC_vs'].append(type)
conf_intervals['lo_bound'].append(lo)
conf_intervals['up_bound'].append(hi)
ci_paym_type_value = pd.DataFrame(conf_intervals)
In [ ]:
plt.barh(ci_paym_type_value['CC_vs'], ci_paym_type_value['up_bound']-ci_paym_type_value['lo_bound'], left=ci_paym_type_value['lo_bound'])
plt.title('Conf. Int. Order Value by Payment Type')
plt.xlim(-1,15)
plt.show()
In [ ]:
ci_paym_type_value.to_csv((r'C:\Users\matte\OneDrive\Desktop\GitHub\data\ecom_brazil\tableau_good\order_value_vs_payment_types.csv'), index=False)
Findings:¶
Customers that use credit card spend around 3€ more than customers that use boleto... and 11€ more than customers who use voucher.
Satisfaction vs delivery delay¶
In [ ]:
query='''
WITH review_vs_delivery AS(
SELECT
EXTRACT(DAY FROM orders.order_estimated_delivery_date - orders.order_delivered_customer_date) AS days_in_advance,
reviews.review_score
FROM orders
JOIN order_reviews AS reviews ON reviews.order_id = orders.order_id
WHERE (orders.order_estimated_delivery_date - orders.order_delivered_customer_date) IS NOT NULL
)
SELECT
CASE
WHEN days_in_advance > 0 THEN 'early'
WHEN days_in_advance = 0 THEN 'on_time'
ELSE 'delay'
END AS delivery_status,
STDDEV(review_score) as std,
COUNT(*) AS frequency,
ROUND(AVG(review_score),2) AS average_review_score
FROM
review_vs_delivery
GROUP BY
CASE
WHEN days_in_advance > 0 THEN 'early'
WHEN days_in_advance = 0 THEN 'on_time'
ELSE 'delay'
END
'''
delivery_satisf = pd.read_sql(query, engine)
delivery_satisf
Out[ ]:
delivery_status | std | frequency | average_review_score | |
---|---|---|---|---|
0 | on_time | 1.222938 | 2718 | 4.11 |
1 | early | 1.144371 | 86527 | 4.30 |
2 | delay | 1.570843 | 6362 | 2.27 |
In [ ]:
delivery_satisf.to_csv(r'C:\Users\matte\OneDrive\Desktop\GitHub\data\ecom_brazil\tableau_good\bar_satisfac_vs_delay.csv',
index=False)
In [ ]:
conf_intervals = {'on_time_vs':[], 'up_bound':[], 'lo_bound':[]}
prev = 'on_time'
for type in delivery_satisf['delivery_status'].values:
if type != prev:
lo, hi = mean_conf_int(delivery_satisf, 'delivery_status', (prev, type),
'average_review_score', 'std', 'frequency')
conf_intervals['on_time_vs'].append(type)
conf_intervals['lo_bound'].append(lo)
conf_intervals['up_bound'].append(hi)
ci_delivery_satisf = pd.DataFrame(conf_intervals)
In [ ]:
plt.barh(ci_delivery_satisf['on_time_vs'], ci_delivery_satisf['up_bound']-ci_delivery_satisf['lo_bound'], left=ci_delivery_satisf['lo_bound'])
plt.title('Conf. Int. Delivery vs Satisfaction')
plt.xlim(-1,2.5)
plt.show()
In [ ]:
ci_delivery_satisf.to_csv(r'C:\Users\matte\OneDrive\Desktop\GitHub\data\ecom_brazil\tableau_good\satisfac_vs_delay.csv',
index=False)
Satisfaction Trend vs. Delivery Time¶
In [ ]:
query = '''
WITH review_vs_delivery AS(
SELECT
EXTRACT(DAY FROM orders.order_estimated_delivery_date - orders.order_delivered_customer_date) AS days_in_advance,
reviews.review_score
FROM orders
JOIN order_reviews AS reviews ON reviews.order_id = orders.order_id
WHERE (orders.order_estimated_delivery_date - orders.order_delivered_customer_date) IS NOT NULL
),
reviews_delivery_bin AS(
SELECT
*,
width_bucket(days_in_advance, -20, 20, 20) AS bin
FROM review_vs_delivery
)
SELECT
bin,
MAX(days_in_advance) AS high_bound,
MIN(days_in_advance) AS low_bound,
AVG(review_score) AS avg_stars
FROM reviews_delivery_bin
GROUP BY bin
ORDER BY bin
'''
satisf_delivery_trend = pd.read_sql(query, engine)
satisf_delivery_trend.head()
Out[ ]:
bin | high_bound | low_bound | avg_stars | |
---|---|---|---|---|
0 | 0 | -21.0 | -188.0 | 1.768725 |
1 | 1 | -19.0 | -20.0 | 1.587500 |
2 | 2 | -17.0 | -18.0 | 1.672840 |
3 | 3 | -15.0 | -16.0 | 1.696000 |
4 | 4 | -13.0 | -14.0 | 1.654494 |
In [ ]:
satisf_delivery_trend.loc[0, 'low_bound'] = -22
satisf_delivery_trend.loc[21, 'high_bound'] = 21
In [ ]:
plt.plot(satisf_delivery_trend['low_bound'], satisf_delivery_trend['avg_stars'])
plt.title('Satisfaction Trend vs. Delivery Time')
plt.show()
In [ ]:
df.to_csv(r'C:\Users\matte\OneDrive\Desktop\GitHub\data\ecom_brazil\tableau_good\review_score_vs_delivery.csv',
index=False)
Brazil and the tendency to buy locally¶
In [ ]:
query = '''
SELECT
CASE
WHEN sellers.seller_city = customers.customer_city THEN 'km 0'
WHEN sellers.seller_state = customers.customer_state THEN 'local'
ELSE 'not local'
END,
COUNT(*) AS frequency
FROM orders
LEFT JOIN order_items AS items ON items.order_id = orders.order_id
JOIN customers ON customers.customer_id = orders.customer_id
JOIN sellers ON sellers.seller_id = items.seller_id
GROUP BY CASE
WHEN sellers.seller_city = customers.customer_city THEN 'km 0'
WHEN sellers.seller_state = customers.customer_state THEN 'local'
ELSE 'not local'
END;
'''
pd.read_sql(query, engine)
Out[ ]:
case | frequency | |
---|---|---|
0 | km 0 | 5863 |
1 | local | 34908 |
2 | not local | 71879 |
In [ ]:
query = '''
WITH distances AS(
SELECT
ACOS(
GREATEST(
LEAST(
SIN(RADIANS(geo_sellers.geolocation_lat)) * SIN(RADIANS(geo_cust.geolocation_lat)) +
COS(RADIANS(geo_sellers.geolocation_lat)) * COS(RADIANS(geo_cust.geolocation_lat)) *
COS(RADIANS(geo_cust.geolocation_lng) - RADIANS(geo_sellers.geolocation_lng)),
1
),
-1
)
) * 6371 AS distance
FROM orders
LEFT JOIN order_items AS items ON items.order_id = orders.order_id
JOIN customers ON customers.customer_id = orders.customer_id
JOIN sellers ON sellers.seller_id = items.seller_id
JOIN geolocation as geo_sellers ON geo_sellers.geolocation_zip_code_prefix = sellers.seller_zip_code_prefix
JOIN geolocation as geo_cust ON geo_cust.geolocation_zip_code_prefix = customers.customer_zip_code_prefix
)
SELECT
AVG(distance) AS avg_distance,
PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY distance) AS median_distance
FROM distances;
'''
pd.read_sql(query, engine)
Out[ ]:
avg_distance | median_distance | |
---|---|---|
0 | 597.187869 | 431.753988 |
Most sold categories¶
In [ ]:
query = '''
SELECT
translat.product_category_name_english AS category,
COUNT(*) AS frequency
FROM products
JOIN product_category_name_translation AS translat ON translat.product_category_name = products.product_category_name
JOIN order_items AS items ON items.product_id = products.product_id
GROUP BY translat.product_category_name_english
ORDER BY COUNT(*) DESC;
'''
pd.read_sql(query, engine).head()
Out[ ]:
category | frequency | |
---|---|---|
0 | bed_bath_table | 11115 |
1 | health_beauty | 9670 |
2 | sports_leisure | 8641 |
3 | furniture_decor | 8334 |
4 | computers_accessories | 7827 |
Orders with just one category¶
In [ ]:
query = '''
WITH sold_togheter AS(
SELECT
translat.product_category_name_english AS category,
items.product_id AS product_id,
items.order_id AS order_id,
MAX(items.order_item_id) OVER (PARTITION BY items.order_id) AS num_items
FROM products
JOIN product_category_name_translation AS translat ON translat.product_category_name = products.product_category_name
JOIN order_items AS items ON items.product_id = products.product_id
ORDER BY items.order_item_id DESC
),
categories_sold_togheter AS(
SELECT
order_id,
category,
num_items,
COUNT(*) AS num_category
FROM sold_togheter
WHERE num_items > 1
GROUP BY order_id, category, num_items
ORDER BY num_items DESC
)
SELECT
SUM(CASE WHEN num_items=num_category THEN 1 ELSE 0 END) AS one_cat_orders,
COUNT(DISTINCT order_id) as tot_orders
FROM categories_sold_togheter;
'''
pd.read_sql(query, engine)
Out[ ]:
one_cat_orders | tot_orders | |
---|---|---|
0 | 8914 | 9673 |
Findings: The vast majority of orders (with more than 1 product) in Brazil are within the same product category.
Finding Insights
Well… I can’t spill all the beans here. This part is exclusively for people who work with me.
Interested? Let’s work together!