Chi-Square Goodness-Of-Fit Test¶

In our study of t-tests, we introduced the one-sample t-test to check whether a sample mean differs from the expected (population) mean. The chi-square goodness-of-fit test is an analog of the one-sample t-test for categorical variables: it tests whether the distribution of sample categorical data matches an expected distribution.

For example, you could use a chi-square goodness-of-fit test to check whether the race demographics of members at your college or company match those of the entire U.S. population, or whether the web browser preferences of your friends match those of internet users as a whole.

When working with categorical data, the values themselves aren't of much use for statistical testing because categories like "male", "female", and "other" have no mathematical meaning. Tests dealing with categorical variables are therefore based on counts of the values rather than on the values themselves.
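
For instance, pandas can tabulate the counts of a categorical variable directly. A minimal sketch, using made-up browser data:

import pandas as pd

# Hypothetical browser preference data
browsers = pd.Series(["chrome", "firefox", "chrome", "safari", "chrome"])
print(browsers.value_counts())   # counts per category: chrome 3, firefox 1, safari 1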

In [1]:
import numpy as np
import pandas as pd
In [2]:
population = pd.DataFrame(
    ["white"] * 100000 +
    ["hispanic"] * 60000 +
    ["black"] * 50000 +
    ["asian"] * 15000 +
    ["other"] * 35000
)

sample = pd.DataFrame(
    ["white"] * 600 +
    ["hispanic"] * 300 +
    ["black"] * 250 +
    ["asian"] * 75 +
    ["other"] * 150
)

print("Population:")
display(population)

print("\nSample:")
display(sample)
Population:
0
0 white
1 white
2 white
3 white
4 white
... ...
259995 other
259996 other
259997 other
259998 other
259999 other

260000 rows × 1 columns

Sample:
0
0 white
1 white
2 white
3 white
4 white
... ...
1370 other
1371 other
1372 other
1373 other
1374 other

1375 rows × 1 columns

In [3]:
population_table =  pd.crosstab(index=population[0], columns='count')
sample_table = pd.crosstab(index=sample[0], columns='count')

print("Population:")
display(population_table)
print("\nSample:")
display(sample_table)
Population:
col_0 count
0
asian 15000
black 50000
hispanic 60000
other 35000
white 100000
Sample:
col_0 count
0
asian 75
black 250
hispanic 300
other 150
white 600

Chi-square tests are based on the chi-square statistic, which you can calculate using the following formula:

$$\sum\frac{(\text{observed} - \text{expected})^2}{\text{expected}}$$

In the formula, observed is the actual observed count for each category and expected is the expected count based on the distribution of the population for the corresponding category. Let's calculate the chi-square statistic for our data to illustrate:

In [4]:
observed = sample_table

population_ratios = population_table / len(population)      # Get population ratios

print("Population ratios:")
display(population_ratios)

expected = population_ratios * len(sample)                  # Get expected counts

print("\nExpected:")
display(expected)

chi_squared_stat = (((observed - expected) ** 2) / expected).sum()
print(f"\nChi-square stat: {chi_squared_stat}")
Population ratios:
col_0 count
0
asian 0.057692
black 0.192308
hispanic 0.230769
other 0.134615
white 0.384615
Expected:
col_0 count
0
asian 79.326923
black 264.423077
hispanic 317.307692
other 185.096154
white 528.846154
Chi-square stat: col_0
count    18.194805
dtype: float64

Note: The chi-square test assumes none of the expected counts are less than 5.
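
We can verify this assumption directly; a minimal sketch, using the expected table computed above:

# All expected counts should be at least 5 for the test to be reliable
print((expected >= 5).all().all())   # True for this data (smallest expected count is ~79)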

Just as we compared the t-test statistic to a critical value based on the t-distribution to determine whether the result was significant, in the chi-square test we compare the chi-square statistic to a critical value based on the chi-square distribution. The scipy library shorthand for the chi-square distribution is chi2.

In [5]:
from scipy.stats import chi2
crit = chi2.ppf(
    q=0.95,         # The critical value for 95% confidence level (5% alpha)
    df=4            # Degrees of Freedom = Number of categorical values - 1
)

print(f"Critical value: {crit}")

p_value = 1 - chi2.cdf(x=chi_squared_stat, df=4)

print(f"p-value: {p_value}")
Critical value: 9.487729036781154
p-value: [0.00113047]

Important Note: We are only interested in the right tail of the chi-square distribution, because the statistic is a sum of squared deviations: only large values indicate a departure from the expected distribution.
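
Equivalently, scipy's survival function returns this right-tail probability directly, avoiding the 1 - cdf subtraction. A small sketch using the statistic computed above:

from scipy.stats import chi2

# sf(x, df) is the survival function: 1 - cdf(x, df), i.e. the right-tail area
p_value = chi2.sf(x=chi_squared_stat, df=4)
print(p_value)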

Since our chi-square statistic exceeds the critical value, we'd reject the null hypothesis that the two distributions are the same.

We can also carry out a chi-square goodness-of-fit test automatically using the scipy function scipy.stats.chisquare():

In [6]:
from scipy.stats import chisquare

chisquare(f_obs=observed, f_exp=expected)
Out[6]:
Power_divergenceResult(statistic=array([18.19480519]), pvalue=array([0.00113047]))

Chi-Square Test of Independence¶

Independence is a key concept in probability that describes a situation where knowing the value of one variable tells you nothing about the value of another. For instance, the month you were born probably doesn't tell you anything about which web browser you use, so we'd expect birth month and browser preference to be independent. On the other hand, your month of birth might be related to whether you excelled at sports in school, so month of birth and sports performance might not be independent.

The chi-square test of independence tests whether two categorical variables are independent. The test of independence is commonly used to determine whether variables like education, political views and other preferences vary based on demographic factors like gender, race, and religion.

In [7]:
np.random.seed(10)

# Sample data generated randomly at fixed probabilities
voter_race = np.random.choice(
    a=['asian', 'black', 'hispanic', 'other', 'white'],
    p=[0.05, 0.15, 0.25, 0.05, 0.5],
    size=1000
)

voter_party = np.random.choice(
    a=['democrat', 'independent', 'republican'],
    p=[0.4, 0.2, 0.4],
    size=1000
)

voters = pd.DataFrame({
    'race': voter_race,
    'party': voter_party
})

print("Voters:")
display(voters)

voters_table = pd.crosstab(voters.race, voters.party, margins=True)
voters_table.columns = ['democrat', 'independent', 'republican', 'row_totals']
voters_table.index = ['asian', 'black', 'hispanic', 'other', 'white', 'col_totals']

print("\n\nVoters Table:")
display(voters_table)
Voters:
race party
0 white democrat
1 asian republican
2 white independent
3 white republican
4 other democrat
... ... ...
995 white republican
996 hispanic independent
997 black independent
998 white republican
999 black democrat

1000 rows × 2 columns


Voters Table:
democrat independent republican row_totals
asian 21 7 32 60
black 65 25 64 154
hispanic 107 50 94 251
other 15 8 15 38
white 189 96 212 497
col_totals 397 186 417 1000
In [8]:
observed = voters_table.iloc[0:5, 0:3]     # Get table without totals for later use
display(observed)
democrat independent republican
asian 21 7 32
black 65 25 64
hispanic 107 50 94
other 15 8 15
white 189 96 212

Note that we did not use the race data to inform our generation of the party data, so the variables are independent.

For a test of independence, we use the same chi-square formula that we used for the goodness-of-fit test. The main difference is that we have to calculate the expected counts for each cell of a 2-dimensional table instead of a 1-dimensional table. To get the expected count for a cell, multiply the row total for that cell by the column total for that cell and then divide by the total number of observations. We can quickly get the expected counts for all cells in the table by taking the row totals and column totals, performing an outer product on them with the np.outer() function, and dividing by the number of observations.

In [9]:
expected = np.outer(
    voters_table['row_totals'][0:5],
    voters_table.loc['col_totals'][0:3]
) / 1000

expected = pd.DataFrame(expected)

expected.columns = ['democrat', 'independent', 'republican']
expected.index = ['asian', 'black', 'hispanic', 'other', 'white']

display(expected)
democrat independent republican
asian 23.820 11.160 25.020
black 61.138 28.644 64.218
hispanic 99.647 46.686 104.667
other 15.086 7.068 15.846
white 197.309 92.442 207.249

Now we can follow the same steps we took before to calculate the chi-square statistic, the critical value, and the p-value.

In [10]:
chi_squared_stat = (((observed-expected)**2)/expected).sum().sum()

print(f"\nChi-square stat: {chi_squared_stat}")
Chi-square stat: 7.169321280162059

Note: we call sum() twice: once to get the column sums and a second time to add the column sums together, returning the sum of the entire 2D table.
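
Equivalently, you could flatten the table to a NumPy array first and call sum() once; a minor stylistic alternative:

# Same result: convert the table to a NumPy array and sum every cell in one call
chi_squared_stat = (((observed - expected) ** 2) / expected).to_numpy().sum()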

In [11]:
from scipy.stats import chi2

crit = chi2.ppf(
    q=0.95,     # Find the critical value for 95% confidence level
    df=8
)

print(f"Critical value: {crit}")

p_value = 1 - chi2.cdf(x=chi_squared_stat, df=8)

print(f"p-value: {p_value}")
Critical value: 15.50731305586545
p-value: 0.518479392948842

Note: The degrees of freedom for a test of independence equals the product of (number of rows - 1) and (number of columns - 1) in the frequency table. In this case we have a 5×3 table, so df = 4×2 = 8.
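
To avoid hard-coding the degrees of freedom, you can compute it from the shape of the observed table; a small sketch:

# df = (rows - 1) * (columns - 1) for a test of independence
n_rows, n_cols = observed.shape
df = (n_rows - 1) * (n_cols - 1)
print(df)   # 8 for our 5x3 table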

As with the goodness-of-fit test, we can use scipy to conduct a test of independence quickly. Use the scipy.stats.chi2_contingency() function to conduct a test of independence automatically, given a frequency table of observed counts.

In [12]:
from scipy.stats import chi2_contingency

result = chi2_contingency(observed=observed)

print(result)

print(f'\nStatistic: {result.statistic}')
print(f'\np-value: {result.pvalue}')
Chi2ContingencyResult(statistic=7.169321280162059, pvalue=0.518479392948842, dof=8, expected_freq=array([[ 23.82 ,  11.16 ,  25.02 ],
       [ 61.138,  28.644,  64.218],
       [ 99.647,  46.686, 104.667],
       [ 15.086,   7.068,  15.846],
       [197.309,  92.442, 207.249]]))

Statistic: 7.169321280162059

p-value: 0.518479392948842

The output shows the chi-square statistic, the p-value and the degrees of freedom followed by the expected counts. As expected, given the high p-value, the test result does not detect a significant relationship between the variables.


ANOVA¶

One-Way ANOVA¶

The one-way ANOVA tests whether the mean of some numeric variable differs across the levels of one categorical variable. It essentially answers the question: do any of the group means differ from one another? We won't get into the details of carrying out an ANOVA by hand, as it involves more calculations than the t-test, but the process is similar: you go through several calculations to arrive at a test statistic and then compare the test statistic to a critical value based on a probability distribution. In the case of the ANOVA, you use the f-distribution.
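
For reference, you can look up the critical value from scipy's f-distribution just as we did with chi2. A hedged sketch, assuming k = 5 groups and N = 1000 observations (so dfn = k - 1 = 4 and dfd = N - k = 995):

from scipy.stats import f

# Critical f value for a 95% confidence level (5% alpha)
crit = f.ppf(q=0.95, dfn=4, dfd=995)
print(crit)   # roughly 2.38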

The scipy library has a function for carrying out one-way ANOVA tests called scipy.stats.f_oneway().

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [14]:
from scipy.stats import poisson
from pprint import pprint
np.random.seed(12)

races = ['asian', 'black', 'hispanic', 'other', 'white']

# Generate random data
voter_race = np.random.choice(
    a=races,
    p=[0.05, 0.15, 0.25, 0.05, 0.5],
    size=1000
)

voter_age = poisson.rvs(loc=18, mu=30, size=1000)

# Group age data by race
voters = pd.DataFrame({'race': voter_race, 'age': voter_age})

groups = voters.groupby('race').groups

display(groups)
{'asian': [4, 7, 14, 21, 49, 53, 59, 78, 95, 98, ...], 'black': [0, 9, 19, 22, 23, 42, 50, 56, 62, 76, ...], 'hispanic': [2, 10, 24, 28, 31, 32, 38, 40, 44, 45, ...], 'other': [17, 26, 39, 46, 48, 65, 67, 72, 146, 237, ...], 'white': [1, 3, 5, 6, 8, 11, 12, 13, 15, 16, ...]}
In [15]:
# Extracting individual groups

asian = voter_age[groups['asian']]
black = voter_age[groups['black']]
hispanic = voter_age[groups['hispanic']]
other = voter_age[groups['other']]
white = voter_age[groups['white']]

display(asian)
array([56, 52, 37, 50, 53, 47, 56, 43, 46, 54, 45, 54, 42, 44, 55, 50, 45,
       49, 51, 57, 56, 46, 43, 53, 48, 54, 54, 44, 40, 46, 51, 52, 44, 54,
       43, 44, 53, 42, 54, 44, 59, 47, 54, 40])
In [16]:
# Perform the ANOVA
from scipy.stats import f_oneway

f_oneway(asian, black, hispanic, other, white)
Out[16]:
F_onewayResult(statistic=1.7744689357329695, pvalue=0.13173183201930463)

The test output yields an f-statistic of 1.774 and a p-value of 0.1317, indicating that there is no significant difference between the means of the groups (since the p-value > 0.05, we fail to reject the null hypothesis).

Now let's make new age data where the group means do differ and run a second ANOVA:

In [17]:
np.random.seed(12)

# Generate random data
voter_race = np.random.choice(
    a=races,
    p=[0.05, 0.15, 0.25, 0.05, 0.5],
    size=1000
)

# Use a different distribution for ages of white people
white_ages = poisson.rvs(loc=18, mu=32, size=1000)

voter_age = poisson.rvs(loc=18, mu=30, size=1000)

voter_age = np.where(voter_race=='white', white_ages, voter_age)

# Group age data by race
voters = pd.DataFrame({'race': voter_race, 'age': voter_age})

groups = voters.groupby('race').groups

# Extract individual groups
asian = voter_age[groups["asian"]]
black = voter_age[groups["black"]]
hispanic = voter_age[groups["hispanic"]]
other = voter_age[groups["other"]]
white = voter_age[groups["white"]]

# Perform the ANOVA
f_oneway(asian, black, hispanic, other, white)
Out[17]:
F_onewayResult(statistic=10.164699828386366, pvalue=4.5613242113994585e-08)

The test results suggest the groups don't all have the same mean in this case, since the p-value is significant even at the 99% confidence level (i.e. p-value < 0.01). We know that it is the white voters who differ because we set it up that way in the code, but when testing real data you may not know which group(s) caused the test to return a positive result. To check which groups differ after getting a positive ANOVA result, you can perform a follow-up or "post-hoc" test.

One post-hoc test is to perform a separate t-test for each pair of groups. You can perform a t-test between all pairs by running each pair through the scipy.stats.ttest_ind() function we covered previously.

In [18]:
# Get all race pairs
race_pairs = list()

for race1 in range(4):
    for race2 in range(race1+1, 5):
        race_pairs.append((races[race1], races[race2]))

print(f"Race pairs: {race_pairs}")
Race pairs: [('asian', 'black'), ('asian', 'hispanic'), ('asian', 'other'), ('asian', 'white'), ('black', 'hispanic'), ('black', 'other'), ('black', 'white'), ('hispanic', 'other'), ('hispanic', 'white'), ('other', 'white')]
In [19]:
# Conduct t-test on each pair
from scipy.stats import ttest_ind

for race1, race2 in race_pairs:
    print(f"\n{race1} - {race2} t-test")
    print(ttest_ind(voter_age[groups[race1]], voter_age[groups[race2]]))
asian - black t-test
TtestResult(statistic=0.8386446909747979, pvalue=0.4027281369339345, df=189.0)

asian - hispanic t-test
TtestResult(statistic=-0.42594691924932293, pvalue=0.6704669004240726, df=286.0)

asian - other t-test
TtestResult(statistic=0.9795284739636, pvalue=0.3298877500095151, df=92.0)

asian - white t-test
TtestResult(statistic=-2.318108811252288, pvalue=0.020804701566400217, df=557.0)

black - hispanic t-test
TtestResult(statistic=-1.9527839210712925, pvalue=0.05156197171952594, df=389.0)

black - other t-test
TtestResult(statistic=0.28025754367057176, pvalue=0.7795770111117659, df=195.0)

black - white t-test
TtestResult(statistic=-5.379303881281834, pvalue=1.0394212166624012e-07, df=660.0)

hispanic - other t-test
TtestResult(statistic=1.5853626170340225, pvalue=0.11396630528484335, df=292.0)

hispanic - white t-test
TtestResult(statistic=-3.5160312714115376, pvalue=0.0004641298649066684, df=757.0)

other - white t-test
TtestResult(statistic=-3.763809322077872, pvalue=0.00018490576317593065, df=563.0)

The p-values for each pairwise t-test suggest the mean of white voters is likely different from that of the other groups, since the p-values for each t-test involving the white group are below 0.05. Using unadjusted pairwise t-tests can overestimate significance, however, because the more comparisons you make, the more likely you are to come across an unlikely result due to chance. We can adjust for this multiple comparisons problem by dividing the statistical significance level by the number of comparisons made. In this case, if we were looking for a significance level of 5%, we'd be looking for p-values of 0.05/10 = 0.005 or less. This simple adjustment for multiple comparisons is known as the Bonferroni correction.
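
As a sketch, we can rerun the pairwise tests against the Bonferroni-corrected threshold, reusing the race_pairs and groups objects defined above:

# Bonferroni correction: divide the significance level by the number of comparisons
alpha = 0.05
bonferroni_alpha = alpha / len(race_pairs)   # 0.05 / 10 = 0.005

for race1, race2 in race_pairs:
    result = ttest_ind(voter_age[groups[race1]], voter_age[groups[race2]])
    reject = result.pvalue < bonferroni_alpha
    print(f"{race1} - {race2}: p-value={result.pvalue:.5f}, reject null: {reject}")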

The Bonferroni correction is a conservative approach to account for the multiple comparisons problem that may end up rejecting results that are actually significant. Another common post-hoc test is Tukey's test. You can carry out Tukey's test using the pairwise_tukeyhsd() function in the statsmodels.stats.multicomp module:

In [20]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

tukey = pairwise_tukeyhsd(
    endog=voter_age,    # Data
    groups=voter_race,  # Groups
    alpha=0.05          # Significance level
)

# Plot group confidence intervals
tukey.plot_simultaneous()
plt.vlines(x=49.57, ymin=-0.5, ymax=4.5, color="red")   # Vertical reference line for comparison

tukey.summary()
Out[20]:
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj lower upper reject
asian black -0.8032 0.9208 -3.4423 1.836 False
asian hispanic 0.4143 0.9915 -2.1011 2.9297 False
asian other -1.0645 0.8906 -4.2391 2.11 False
asian white 1.9547 0.1751 -0.4575 4.3668 False
black hispanic 1.2175 0.2318 -0.386 2.821 False
black other -0.2614 0.9986 -2.7757 2.253 False
black white 2.7579 0.0 1.3217 4.194 True
hispanic other -1.4789 0.4374 -3.863 0.9053 False
hispanic white 1.5404 0.004 0.3468 2.734 True
other white 3.0192 0.0028 0.7443 5.2941 True

The output of the Tukey test shows the mean difference, a confidence interval, and whether you should reject the null hypothesis for each pair of groups at the given significance level. In this case, the test suggests we reject the null hypothesis for 3 pairs, each of which includes the "white" category. This suggests the white group is likely different from the others. The 95% confidence interval plot reinforces the results visually: only one other group's confidence interval overlaps the white group's confidence interval.
