In [ ]:

import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
from scipy.stats import f_oneway

In [ ]:

engine_txt = 'postgresql://postgres:password@localhost:5432/olist'
engine = create_engine(engine_txt)

Total products bought by state¶

In [ ]:

query = """
WITH orders_by_state AS(
    SELECT
        customers.customer_state as state,
        SUM(payments.payment_installments) AS tot_order
    FROM order_payments AS payments
    JOIN orders ON orders.order_id = payments.order_id
    JOIN customers ON customers.customer_id = orders.customer_id
    GROUP BY customers.customer_state
    ORDER BY SUM(payments.payment_installments) DESC
)

SELECT
    *,
    SUM(tot_order) OVER (ORDER BY tot_order DESC) / (SELECT SUM(tot_order)
                                                    FROM orders_by_state) AS cum_share
FROM orders_by_state
"""

orders_by_state = pd.read_sql(query, engine)
orders_by_state.head()

Out[ ]:

	state	tot_order	cum_share
0	SP	114377	0.385857
1	RJ	40110	0.521171
2	MG	36091	0.642926
3	RS	16847	0.699760
4	PR	15016	0.750417

In [ ]:

data = {
    'states': [
        'São Paulo', 'Rio de Janeiro', 'Minas Gerais', 'Paraná', 'Rio Grande do Sul',
        'Santa Catarina', 'Bahia', 'Distrito Federal', 'Goiás', 'Pará', 'Pernambuco',
        'Mato Grosso', 'Ceará', 'Espírito Santo', 'Mato Grosso do Sul', 'Amazonas',
        'Maranhão', 'Rio Grande do Norte', 'Paraíba', 'Alagoas', 'Piauí', 'Rondônia',
        'Sergipe', 'Tocantins', 'Amapá', 'Acre', 'Roraima'
    ],
    'gdp': [
        2377.64, 759.82, 682.79, 487.93, 470.94, 349.28, 305.32, 265.85, 224.13, 215.94,
        193.31, 178.65, 166.92, 138.45, 122.63, 116.02, 106.92, 71.58, 70.29, 63.2, 56.39,
        51.6, 45.41, 43.65, 18.47, 16.48, 16.02
    ],
    'iso_code': [
        'SP', 'RJ', 'MG', 'PR', 'RS', 'SC', 'BA', 'DF', 'GO', 'PA', 'PE', 'MT', 'CE', 'ES',
        'MS', 'AM', 'MA', 'RN', 'PB', 'AL', 'PI', 'RO', 'SE', 'TO', 'AP', 'AC', 'RR'
    ]
}

info_states = pd.DataFrame(data)

In [ ]:

orders_by_state = pd.merge(orders_by_state, info_states, left_on='state', right_on='iso_code').drop(columns='iso_code')

orders_by_state.to_csv(r'C:\Users\matte\OneDrive\Desktop\GitHub\data\ecom_brazil\tableau_good\tot_prod_bought_by_state.csv',
                       index=False)

In [ ]:

orders_by_state[['tot_order', 'gdp']].corr()

Out[ ]:

	tot_order	gdp
tot_order	1.000000	0.992012
gdp	0.992012	1.000000

In [ ]:

top_8 = orders_by_state[orders_by_state['cum_share']<0.85]
top_8

Out[ ]:

	state	tot_order	cum_share	states	gdp
0	SP	114377	0.385857	São Paulo	2377.64
1	RJ	40110	0.521171	Rio de Janeiro	759.82
2	MG	36091	0.642926	Minas Gerais	682.79
3	RS	16847	0.699760	Rio Grande do Sul	470.94
4	PR	15016	0.750417	Paraná	487.93
5	BA	11509	0.789244	Bahia	305.32
6	SC	10760	0.825543	Santa Catarina	349.28
7	ES	6301	0.846800	Espírito Santo	138.45

State with the highest order value¶

In [ ]:

#ANOVA test

query = """
SELECT
    SUM(payment_value::NUMERIC) AS order_value,
    customers.customer_state AS state
FROM orders
JOIN customers ON customers.customer_id = orders.customer_id
JOIN order_payments AS payments ON payments.order_id = orders.order_id
GROUP BY customers.customer_state, orders.order_id
"""

df = pd.read_sql(query, engine)
df.head()

Out[ ]:

	order_value	state
0	20.24	SP
1	21.06	PR
2	9.66	SP
3	21.22	SP
4	9.55	PR

In [ ]:

states = []
for state in df['state'].unique():
    states.append(df.loc[df['state']==state, 'order_value'].values)

f_oneway(*states)

# The order value is influenced by the state

Out[ ]:

F_onewayResult(statistic=34.62863411435716, pvalue=3.149889058834275e-172)

In [ ]:

query = """
WITH order_values AS(
    SELECT
        SUM(payment_value::NUMERIC) AS order_value,
        customers.customer_state AS state
    FROM orders
    JOIN customers ON customers.customer_id = orders.customer_id
    JOIN order_payments AS payments ON payments.order_id = orders.order_id
    GROUP BY customers.customer_state, orders.order_id
)

SELECT
    state,
    AVG(order_value) as mean_order_value,
    STDDEV(order_value) as std_order_value
FROM order_values
GROUP BY state
ORDER BY AVG(order_value) DESC
"""

order_value_by_states = pd.read_sql(query, engine)
order_value_by_states.head()

Out[ ]:

	state	mean_order_value	std_order_value
0	PB	49.355168	73.505447
1	AC	45.410370	43.188524
2	RO	44.962806	57.541384
3	AP	44.697353	51.895554
4	AL	43.878232	51.838240

Is there a relation between avg_order_value and tot_orders by state?¶

In [ ]:

df_states = pd.merge(orders_by_state, order_value_by_states, on='state').rename(columns={'states':'state_name'})
df_states.head()

Out[ ]:

	state	tot_order	cum_share	state_name	gdp	mean_order_value	std_order_value
0	SP	114377	0.385857	São Paulo	2377.64	26.852245	36.135222
1	RJ	40110	0.521171	Rio de Janeiro	759.82	31.182029	45.748465
2	MG	36091	0.642926	Minas Gerais	682.79	30.072543	39.704736
3	RS	16847	0.699760	Rio Grande do Sul	470.94	30.460115	37.665875
4	PR	15016	0.750417	Paraná	487.93	30.047903	40.224941

In [ ]:

df_states.to_csv(r'C:\Users\matte\OneDrive\Desktop\GitHub\data\ecom_brazil\tableau_good\info_all_statea.csv',
                 index=False)

In [ ]:

df_states[['gdp', 'mean_order_value']].corr()

Out[ ]:

	gdp	mean_order_value
gdp	1.000000	-0.610035
mean_order_value	-0.610035	1.000000

Confidece intervals avg order value¶

In [ ]:

def mean_conf_int(df, category, to_test, value, alpha=0.05, do_return=True):
    '''Calculate the confidence interval for the difference between 2 means.
    The Central Limit Theorem applies.'''

    mean1 = df.loc[df[category]==to_test[0], value].mean()
    mean2 = df.loc[df[category]==to_test[1], value].mean()
    std1 = df.loc[df[category]==to_test[0], value].std()
    std2 = df.loc[df[category]==to_test[1], value].std()
    n1 = df[df[category]==to_test[0]].shape[0]
    n2 = df[df[category]==to_test[1]].shape[0]

    std_error = np.sqrt((std1**2 / n1) + (std2**2 / n2))
    z_critical = stats.norm.ppf(1 - alpha / 2)
    margin_of_error = z_critical * std_error

    lower_bound = (mean1 - mean2) - margin_of_error
    upper_bound = (mean1 - mean2) + margin_of_error

    if do_return:
        return lower_bound, upper_bound
    else:
        print(f'{to_test[0]} vs. {to_test[1]}')
        print(f'Upper bound: {upper_bound} —— Lower bound: {lower_bound}')

In [ ]:

conf_intervals = {'PB_vs':[], 'up_bound':[], 'lo_bound':[]}
prev = 'PB'

for state in order_value_by_states['state'].values:
    if state != prev:
        conf_intervals['PB_vs'].append(state)
        curr = state
        lo, hi = mean_conf_int(df, 'state', (prev, curr), 'order_value')
        conf_intervals['lo_bound'].append(lo)
        conf_intervals['up_bound'].append(hi)

ci_order_value = pd.DataFrame(conf_intervals)

In [ ]:

ci_order_value = ci_order_value.sort_values('lo_bound', ascending=False).reset_index(drop=True)
ci_order_value.head()

Out[ ]:

	PB_vs	up_bound	lo_bound
0	SP	28.735361	16.270485
1	MG	25.547098	13.018152
2	PR	25.628275	12.986255
3	ES	25.996509	12.784727
4	RS	25.197448	12.592657

In [ ]:

ci_order_value.head(9)

Out[ ]:

	PB_vs	up_bound	lo_bound
0	SP	28.735361	16.270485
1	MG	25.547098	13.018152
2	PR	25.628275	12.986255
3	ES	25.996509	12.784727
4	RS	25.197448	12.592657
5	RJ	24.445993	11.900284
6	DF	24.848277	11.833432
7	SC	23.739358	10.936884
8	GO	23.484587	10.446046

In [ ]:

top_8

Out[ ]:

	state	tot_order	cum_share	states	gdp
0	SP	114377	0.385857	São Paulo	2377.64
1	RJ	40110	0.521171	Rio de Janeiro	759.82
2	MG	36091	0.642926	Minas Gerais	682.79
3	RS	16847	0.699760	Rio Grande do Sul	470.94
4	PR	15016	0.750417	Paraná	487.93
5	BA	11509	0.789244	Bahia	305.32
6	SC	10760	0.825543	Santa Catarina	349.28
7	ES	6301	0.846800	Espírito Santo	138.45

In [ ]:

plt.barh(ci_order_value['PB_vs'], ci_order_value['up_bound']-ci_order_value['lo_bound'], left=ci_order_value['lo_bound'])
plt.title('Conf. Int. Avg Order Value by State')
plt.show()

No description has been provided for this image

In [ ]:

ci_order_value.to_csv(r'C:\Users\matte\OneDrive\Desktop\GitHub\data\ecom_brazil\tableau_good\order_value_vs_state.csv',
                 index=False)

State with the most recurring customers¶

In [ ]:

# ANOVA test

query = '''
SELECT
	customers.customer_state AS state,
	CASE
		WHEN COUNT(*)>1 THEN 1
		ELSE 0
	END AS is_recurring
FROM orders
JOIN customers ON customers.customer_id = orders.customer_id
GROUP BY customers.customer_state, customers.customer_unique_id
'''

anova_t_recur = pd.read_sql(query, engine)
anova_t_recur.head()

Out[ ]:

	state	is_recurring
0	MA	0
1	CE	0
2	ES	0
3	MG	0
4	MG	0

In [ ]:

states = []
for state in anova_t_recur['state'].unique():
    states.append(anova_t_recur.loc[df['state']==state, 'is_recurring'].values)

f_oneway(*states)

Out[ ]:

F_onewayResult(statistic=1.0598090762110413, pvalue=0.380720876490711)

Confidence intervals recurring customers proportion¶

In [ ]:

query = '''
WITH recurring_tab AS(
	SELECT
		customers.customer_state AS state,
		COUNT(*) AS num_orders,
		CASE
			WHEN COUNT(*)>1 THEN 1
			ELSE 0
		END AS is_recurring
	FROM orders
	JOIN customers ON customers.customer_id = orders.customer_id
	GROUP BY customers.customer_state, customers.customer_unique_id
),

pi_table AS(
    SELECT
        state,
        COUNT(*) AS size,
        SUM(is_recurring) AS recurring_size,
        ROUND(SUM(is_recurring)::NUMERIC/COUNT(*)::NUMERIC, 4) AS pi
    FROM recurring_tab
    GROUP BY state
)

SELECT *
FROM pi_table
WHERE size*pi*(1-pi) > 15
ORDER BY pi DESC
'''

recurring_by_state = pd.read_sql(query, engine)
recurring_by_state.head()

Out[ ]:

	state	size	recurring_size	pi
0	RJ	12384	421	0.0340
1	MT	876	29	0.0331
2	GO	1952	64	0.0328
3	SP	40302	1296	0.0322
4	RS	5277	167	0.0316

In [ ]:

def proportion_conf_int(df, category, to_test, pi='pi', size='recurring_size', alpha=0.05, do_return=True):
    '''Calculate the confidence interval for the difference between 2 proportions.
    The Central Limit Theorem applies.'''

    pi1 = df.loc[df[category]==to_test[0], pi].values[0]
    pi2 = df.loc[df[category]==to_test[1], pi].values[0]
    n1 = df.loc[df[category]==to_test[0], size].values[0]
    n2 = df.loc[df[category]==to_test[1], size].values[0]

    std_error = np.sqrt((pi1*(1-pi1) / n1) + (pi2*(1-pi2) / n2))
    z_critical = stats.norm.ppf(1 - alpha / 2)
    margin_of_error = z_critical * std_error

    lower_bound = (pi1 - pi2) - margin_of_error
    upper_bound = (pi1 - pi2) + margin_of_error

    if do_return:
        return lower_bound, upper_bound
    else:
        print(f'{to_test[0]} vs. {to_test[1]}')
        print(f'Upper bound: {upper_bound} —— Lower bound: {lower_bound}')

In [ ]:

conf_intervals = {'RJ_vs':[], 'up_bound':[], 'lo_bound':[]}
prev = 'RJ'

for state in recurring_by_state['state'].values:
    if state != prev:
        conf_intervals['RJ_vs'].append(state)
        curr = state
        lo, hi = proportion_conf_int(recurring_by_state, 'state', (prev, curr))
        conf_intervals['lo_bound'].append(lo)
        conf_intervals['up_bound'].append(hi)

ci_proportion_value = pd.DataFrame(conf_intervals)

In [ ]:

plt.barh(ci_proportion_value['RJ_vs'], ci_proportion_value['up_bound']-ci_proportion_value['lo_bound'], left=ci_proportion_value['lo_bound'])
plt.title('Conf. Int. % Recurring Customers by State')
plt.show()

In [ ]:

ci_proportion_value.to_csv(r'C:\Users\matte\OneDrive\Desktop\GitHub\data\ecom_brazil\tableau_good\recurring_pi_by_state.csv',
                index=False)

Payment method with the highest average order value¶

In [ ]:

query = '''
SELECT
    AVG(payment_value::NUMERIC) AS avg_order_value,
	COUNT(*) AS num_transactions,
	STDDEV(payment_value::NUMERIC) AS std,
	payment_type
FROM order_payments
WHERE payment_value > '0€'
    AND payment_sequential = 1
GROUP BY payment_type
'''

pay_types = pd.read_sql(query, engine)
pay_types

Out[ ]:

	avg_order_value	num_transactions	std	payment_type
0	27.104754	19783	39.922639	boleto
1	30.536987	76469	41.541676	credit_card
2	26.681889	1477	46.499899	debit_card
3	18.133068	1620	32.299097	voucher

In [ ]:

def mean_conf_int(df, category, to_test, mean, std, size, alpha=0.05, do_return=True):
    '''Calculate the confidence interval for the difference between 2 means.
    The Central Limit Theorem applies.'''

    mean1 = df.loc[df[category]==to_test[0], mean].values[0]
    mean2 = df.loc[df[category]==to_test[1], mean].values[0]
    std1 = df.loc[df[category]==to_test[0], std].values[0]
    std2 = df.loc[df[category]==to_test[1], std].values[0]
    n1 = df.loc[df[category]==to_test[0], size].values[0]
    n2 = df.loc[df[category]==to_test[1], size].values[0]

    std_error = np.sqrt((std1**2 / n1) + (std2**2 / n2))
    z_critical = stats.norm.ppf(1 - alpha / 2)
    margin_of_error = z_critical * std_error

    lower_bound = (mean1 - mean2) - margin_of_error
    upper_bound = (mean1 - mean2) + margin_of_error

    if do_return:
        return lower_bound, upper_bound
    else:
        print(f'{to_test[0]} vs. {to_test[1]}')
        print(f'Upper bound: {upper_bound} —— Lower bound: {lower_bound}')

In [ ]:

conf_intervals = {'CC_vs':[], 'up_bound':[], 'lo_bound':[]}
prev = 'credit_card'

for type in pay_types['payment_type'].values:
    if type != prev:
        lo, hi = mean_conf_int(pay_types, 'payment_type', (prev, type),
                      'avg_order_value', 'std', 'num_transactions')
        conf_intervals['CC_vs'].append(type)
        conf_intervals['lo_bound'].append(lo)
        conf_intervals['up_bound'].append(hi)

ci_paym_type_value = pd.DataFrame(conf_intervals)

In [ ]:

plt.barh(ci_paym_type_value['CC_vs'], ci_paym_type_value['up_bound']-ci_paym_type_value['lo_bound'], left=ci_paym_type_value['lo_bound'])
plt.title('Conf. Int. Order Value by Payment Type')
plt.xlim(-1,15)
plt.show()

In [ ]:

ci_paym_type_value.to_csv((r'C:\Users\matte\OneDrive\Desktop\GitHub\data\ecom_brazil\tableau_good\order_value_vs_payment_types.csv'), index=False)

Findings:¶

Customers that use credit card spend around 3€ more than customers that use boleto... and 11€ more than customers who use voucher.

Satisfaction vs delivery delay¶

In [ ]:

query='''
WITH review_vs_delivery AS(
	SELECT
		EXTRACT(DAY FROM orders.order_estimated_delivery_date - orders.order_delivered_customer_date) AS days_in_advance,
		reviews.review_score
	FROM orders
	JOIN order_reviews AS reviews ON reviews.order_id = orders.order_id
	WHERE (orders.order_estimated_delivery_date - orders.order_delivered_customer_date) IS NOT NULL
)

SELECT
    CASE
        WHEN days_in_advance > 0 THEN 'early'
        WHEN days_in_advance = 0 THEN 'on_time'
        ELSE 'delay'
    END AS delivery_status,
	STDDEV(review_score) as std,
	COUNT(*) AS frequency,
    ROUND(AVG(review_score),2) AS average_review_score
FROM 
    review_vs_delivery
GROUP BY 
    CASE
        WHEN days_in_advance > 0 THEN 'early'
        WHEN days_in_advance = 0 THEN 'on_time'
        ELSE 'delay'
    END
'''

delivery_satisf = pd.read_sql(query, engine)
delivery_satisf

Out[ ]:

	delivery_status	std	frequency	average_review_score
0	on_time	1.222938	2718	4.11
1	early	1.144371	86527	4.30
2	delay	1.570843	6362	2.27

In [ ]:

delivery_satisf.to_csv(r'C:\Users\matte\OneDrive\Desktop\GitHub\data\ecom_brazil\tableau_good\bar_satisfac_vs_delay.csv',
                 index=False)

In [ ]:

conf_intervals = {'on_time_vs':[], 'up_bound':[], 'lo_bound':[]}
prev = 'on_time'

for type in delivery_satisf['delivery_status'].values:
    if type != prev:
        lo, hi = mean_conf_int(delivery_satisf, 'delivery_status', (prev, type),
                      'average_review_score', 'std', 'frequency')
        conf_intervals['on_time_vs'].append(type)
        conf_intervals['lo_bound'].append(lo)
        conf_intervals['up_bound'].append(hi)

ci_delivery_satisf = pd.DataFrame(conf_intervals)

In [ ]:

plt.barh(ci_delivery_satisf['on_time_vs'], ci_delivery_satisf['up_bound']-ci_delivery_satisf['lo_bound'], left=ci_delivery_satisf['lo_bound'])
plt.title('Conf. Int. Delivery vs Satisfaction')
plt.xlim(-1,2.5)
plt.show()

In [ ]:

ci_delivery_satisf.to_csv(r'C:\Users\matte\OneDrive\Desktop\GitHub\data\ecom_brazil\tableau_good\satisfac_vs_delay.csv',
                 index=False)

Satisfaction Trend vs. Delivery Time¶

In [ ]:

query = '''
WITH review_vs_delivery AS(
	SELECT
		EXTRACT(DAY FROM orders.order_estimated_delivery_date - orders.order_delivered_customer_date) AS days_in_advance,
		reviews.review_score
	FROM orders
	JOIN order_reviews AS reviews ON reviews.order_id = orders.order_id
	WHERE (orders.order_estimated_delivery_date - orders.order_delivered_customer_date) IS NOT NULL
),

reviews_delivery_bin AS(
	SELECT
		*,
		width_bucket(days_in_advance, -20, 20, 20) AS bin
	FROM review_vs_delivery
)

SELECT
	bin,
	MAX(days_in_advance) AS high_bound,
	MIN(days_in_advance) AS low_bound,
	AVG(review_score) AS avg_stars
FROM reviews_delivery_bin
GROUP BY bin
ORDER BY bin
'''

satisf_delivery_trend = pd.read_sql(query, engine)
satisf_delivery_trend.head()

Out[ ]:

	bin	high_bound	low_bound	avg_stars
0	0	-21.0	-188.0	1.768725
1	1	-19.0	-20.0	1.587500
2	2	-17.0	-18.0	1.672840
3	3	-15.0	-16.0	1.696000
4	4	-13.0	-14.0	1.654494

In [ ]:

satisf_delivery_trend.loc[0, 'low_bound'] = -22
satisf_delivery_trend.loc[21, 'high_bound'] = 21

In [ ]:

plt.plot(satisf_delivery_trend['low_bound'], satisf_delivery_trend['avg_stars'])
plt.title('Satisfaction Trend vs. Delivery Time')
plt.show()

In [ ]:

df.to_csv(r'C:\Users\matte\OneDrive\Desktop\GitHub\data\ecom_brazil\tableau_good\review_score_vs_delivery.csv',
          index=False)

Brazil and the tendency to buy locally¶

In [ ]:

query = '''
SELECT
	CASE
		 WHEN sellers.seller_city = customers.customer_city THEN 'km 0'
		 WHEN sellers.seller_state = customers.customer_state THEN 'local'
		 ELSE 'not local'
	END,
	COUNT(*) AS frequency
FROM orders
LEFT JOIN order_items AS items ON items.order_id = orders.order_id
JOIN customers ON customers.customer_id = orders.customer_id
JOIN sellers ON sellers.seller_id = items.seller_id
GROUP BY CASE
		 	WHEN sellers.seller_city = customers.customer_city THEN 'km 0'
			WHEN sellers.seller_state = customers.customer_state THEN 'local'
		 	ELSE 'not local'
		 END;
'''

pd.read_sql(query, engine)

Out[ ]:

	case	frequency
0	km 0	5863
1	local	34908
2	not local	71879

In [ ]:

query = '''
WITH distances AS(
    SELECT
        ACOS(
            GREATEST(
                LEAST(
                    SIN(RADIANS(geo_sellers.geolocation_lat)) * SIN(RADIANS(geo_cust.geolocation_lat)) +
                    COS(RADIANS(geo_sellers.geolocation_lat)) * COS(RADIANS(geo_cust.geolocation_lat)) *
                    COS(RADIANS(geo_cust.geolocation_lng) - RADIANS(geo_sellers.geolocation_lng)),
                    1
                ),
                -1
            )
        ) * 6371 AS distance
    FROM orders
    LEFT JOIN order_items AS items ON items.order_id = orders.order_id
    JOIN customers ON customers.customer_id = orders.customer_id
    JOIN sellers ON sellers.seller_id = items.seller_id
    JOIN geolocation as geo_sellers ON geo_sellers.geolocation_zip_code_prefix = sellers.seller_zip_code_prefix
    JOIN geolocation as geo_cust ON geo_cust.geolocation_zip_code_prefix = customers.customer_zip_code_prefix
)

SELECT
    AVG(distance) AS avg_distance,
    PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY distance) AS median_distance
FROM distances;
'''

pd.read_sql(query, engine)

Out[ ]:

	avg_distance	median_distance
0	597.187869	431.753988

Most sold categories¶

In [ ]:

query = '''
SELECT
	translat.product_category_name_english AS category,
	COUNT(*) AS frequency
FROM products
JOIN product_category_name_translation AS translat ON translat.product_category_name = products.product_category_name
JOIN order_items AS items ON items.product_id = products.product_id
GROUP BY translat.product_category_name_english
ORDER BY COUNT(*) DESC;
'''

pd.read_sql(query, engine).head()

Out[ ]:

	category	frequency
0	bed_bath_table	11115
1	health_beauty	9670
2	sports_leisure	8641
3	furniture_decor	8334
4	computers_accessories	7827

Orders with just one category¶

In [ ]:

query = '''
WITH sold_togheter AS(
	SELECT
		translat.product_category_name_english AS category,
		items.product_id AS product_id,
		items.order_id AS order_id,
		MAX(items.order_item_id) OVER (PARTITION BY items.order_id) AS num_items
	FROM products
	JOIN product_category_name_translation AS translat ON translat.product_category_name = products.product_category_name
	JOIN order_items AS items ON items.product_id = products.product_id
	ORDER BY items.order_item_id DESC
),

categories_sold_togheter AS(
	SELECT
		order_id,
		category,
		num_items,
		COUNT(*) AS num_category
	FROM sold_togheter
	WHERE num_items > 1
	GROUP BY order_id, category, num_items
	ORDER BY num_items DESC
)

SELECT
	SUM(CASE WHEN num_items=num_category THEN 1 ELSE 0 END) AS one_cat_orders,
	COUNT(DISTINCT order_id) as tot_orders
FROM categories_sold_togheter;
'''

pd.read_sql(query, engine)

Out[ ]:

	one_cat_orders	tot_orders
0	8914	9673

Findings: The vast majority of orders (with more than 1 product) in Brazil are within the same product category.

Olist: Penetrating The Brazilian Market

Scope of The Analysis

Final Result

The Analysis

Total products bought by state¶

State with the highest order value¶

Is there a relation between avg_order_value and tot_orders by state?¶

Confidece intervals avg order value¶

State with the most recurring customers¶

Confidence intervals recurring customers proportion¶

Payment method with the highest average order value¶

Findings:¶

Satisfaction vs delivery delay¶

Satisfaction Trend vs. Delivery Time¶

Brazil and the tendency to buy locally¶

Most sold categories¶

Orders with just one category¶

Finding Insights

Let's Work Together