Skip to content

Commit 002d3e0

Browse files
travisjneumanclaude
and committed
feat: add tests for expansion modules 07-12
Adds pedagogically annotated test files for all 26 projects across data analysis, advanced testing, Docker, Django, package publishing, and cloud deployment modules. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 8d74f64 commit 002d3e0

31 files changed

Lines changed: 4121 additions & 0 deletions

File tree

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
"""
2+
Tests for Project 01 — Pandas Basics
3+
4+
These tests verify the data exploration functions from project.py using
5+
small inline DataFrames instead of loading CSV files. This makes the tests
6+
self-contained and fast — they do not depend on external data files.
7+
8+
Why inline DataFrames?
9+
In real data analysis, your data lives in files or databases. But in tests,
10+
you want full control over the input so you can predict the output. Creating
11+
small DataFrames directly in Python gives you that control.
12+
13+
Run with: pytest tests/test_project.py -v
14+
"""
15+
16+
import pandas as pd
17+
import pytest
18+
19+
from project import (
20+
load_data,
21+
explore_shape,
22+
select_columns,
23+
sort_by_grade,
24+
)
25+
26+
27+
# ── Helper: create a small test DataFrame ──────────────────────────────
28+
# This fixture provides a consistent DataFrame for all tests in this file.
29+
# Using a fixture avoids repeating the same setup code in every test.
30+
31+
@pytest.fixture
def sample_df():
    """Build the miniature stand-in for the students.csv structure.

    Every test in this file receives this same four-student DataFrame,
    so expected results can be predicted by hand.
    """
    records = {
        "name": ["Alice", "Bob", "Charlie", "Diana"],
        "subject": ["Math", "Science", "Math", "English"],
        "grade": [92, 78, 85, 95],
        "age": [16, 17, 16, 18],
    }
    return pd.DataFrame(records)
40+
41+
42+
# ── Test: load_data reads a CSV correctly ──────────────────────────────
43+
# We test load_data by writing a temporary CSV and loading it.
44+
# tmp_path is a built-in pytest fixture that gives us a temporary directory.
45+
46+
def test_load_data_returns_dataframe(tmp_path):
    """load_data should return a DataFrame with the correct number of rows.

    WHY: This verifies that pd.read_csv is called correctly and the function
    returns a proper DataFrame. tmp_path (a built-in pytest fixture) gives
    us a throwaway directory, so the test never depends on
    data/students.csv existing.
    """
    # Write a tiny two-row CSV into the temporary directory.
    contents = "name,subject,grade,age\nAlice,Math,92,16\nBob,Science,78,17\n"
    csv_file = tmp_path / "test_students.csv"
    csv_file.write_text(contents)

    loaded = load_data(str(csv_file))

    # The result must be a real DataFrame with exactly the rows we wrote.
    assert isinstance(loaded, pd.DataFrame), "load_data should return a DataFrame"
    assert len(loaded) == 2, "Should have loaded 2 rows from the CSV"
    assert list(loaded.columns) == ["name", "subject", "grade", "age"]
63+
64+
65+
# ── Test: explore_shape reports correct dimensions ─────────────────────
66+
67+
def test_explore_shape_does_not_modify_data(sample_df):
    """explore_shape should not alter the DataFrame.

    WHY: Exploration functions should be read-only. If they accidentally
    modify the data, downstream analysis could produce wrong results.
    """
    shape_before = sample_df.shape

    # We ignore the printed output here; only side effects matter.
    explore_shape(sample_df)

    assert sample_df.shape == shape_before, "explore_shape should not change the DataFrame"
79+
80+
81+
# ── Test: select_columns picks the right columns ──────────────────────
82+
83+
def test_select_columns_returns_name_and_grade(sample_df, capsys):
    """select_columns should display only the name and grade columns.

    WHY: Column selection is one of the most common pandas operations.
    This test verifies that the function selects the correct subset.
    capsys captures printed output so we can verify it contains expected text.
    """
    select_columns(sample_df)
    printed = capsys.readouterr().out

    # The function prints its selection, so assert against the captured text.
    assert "name" in printed, "Output should mention the 'name' column"
    assert "grade" in printed, "Output should mention the 'grade' column"
96+
97+
98+
# ── Test: sort_by_grade orders highest first ───────────────────────────
99+
100+
def test_sort_by_grade_highest_first(sample_df, capsys):
    """sort_by_grade should display grades in descending order.

    WHY: Sorting is fundamental in data analysis. Verifying the sort order
    ensures the ascending=False parameter is applied correctly.

    FIX: The previous assertion only checked that Diana (or her grade 95)
    appeared *somewhere* in the output — any print of the unsorted data
    would also satisfy that, so the test could never fail on a wrong sort
    order. We now additionally verify that Diana (95, highest) is printed
    before Bob (78, lowest) whenever both names appear in the output, which
    actually exercises the descending sort.
    """
    sort_by_grade(sample_df)
    out = capsys.readouterr().out

    # Diana has the highest grade (95), so she should show up in the output.
    assert "Diana" in out or "95" in out, (
        "The highest-scoring student should appear in the output"
    )

    # Only check relative order when both names are printed — the function
    # may legitimately display grades alone or a truncated head().
    if "Diana" in out and "Bob" in out:
        assert out.index("Diana") < out.index("Bob"), (
            "Diana (95) should be printed before Bob (78) in descending order"
        )
114+
115+
116+
# ── Test: DataFrame column types ──────────────────────────────────────
117+
118+
def test_dataframe_has_expected_dtypes(sample_df):
119+
"""The DataFrame should have numeric types for grade and age.
120+
121+
WHY: If pandas reads grade or age as strings (object type), mathematical
122+
operations like mean() and sort_values() would fail or give wrong results.
123+
This test catches type-detection problems early.
124+
"""
125+
assert sample_df["grade"].dtype in ("int64", "int32", "float64"), (
126+
"grade column should be numeric"
127+
)
128+
assert sample_df["age"].dtype in ("int64", "int32", "float64"), (
129+
"age column should be numeric"
130+
)
Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
"""
2+
Tests for Project 02 — Filtering & Grouping
3+
4+
These tests verify boolean filtering, .loc[] selection, value_counts(),
5+
groupby(), and multi-aggregation using small inline DataFrames.
6+
7+
Why test data analysis functions?
8+
Even though pandas does the heavy lifting, your functions add logic on
9+
top (thresholds, column choices, aggregation strategies). Tests verify
10+
that YOUR logic is correct, not that pandas works.
11+
12+
Run with: pytest tests/test_project.py -v
13+
"""
14+
15+
import pandas as pd
16+
import pytest
17+
18+
from project import (
19+
filter_high_grades,
20+
filter_with_loc,
21+
count_subjects,
22+
group_by_subject_mean,
23+
group_by_subject_multi_agg,
24+
top_students_per_subject,
25+
)
26+
27+
28+
@pytest.fixture
def sample_df():
    """Build a small DataFrame mimicking students.csv.

    This dataset is carefully designed so we know the expected results:
    - 2 students above grade 80 (Alice=92, Diana=95)
    - 2 Math students, 1 Science, 1 English
    - Top per subject: Alice (Math), Bob (Science), Diana (English)
    """
    data = {
        "name": ["Alice", "Bob", "Charlie", "Diana"],
        "subject": ["Math", "Science", "Math", "English"],
        "grade": [92, 78, 75, 95],
        "age": [16, 17, 16, 18],
    }
    return pd.DataFrame(data)
43+
44+
45+
# ── Test: filter_high_grades returns correct rows ──────────────────────
46+
47+
def test_filter_high_grades_default_threshold(sample_df):
    """filter_high_grades(df, 80) should return only students above 80.

    WHY: Boolean indexing is the primary way to filter data in pandas.
    Getting the threshold comparison wrong (>= vs >, wrong column) would
    silently return wrong results. This test catches those mistakes.
    """
    high_performers = filter_high_grades(sample_df, threshold=80)

    assert len(high_performers) == 2, "Should find exactly 2 students above grade 80"
    names = set(high_performers["name"])
    assert names == {"Alice", "Diana"}, (
        "Alice (92) and Diana (95) should be the high performers"
    )
60+
61+
62+
def test_filter_high_grades_custom_threshold(sample_df):
    """filter_high_grades with a high threshold should return fewer rows.

    WHY: The threshold parameter should actually be used — this test verifies
    that changing the threshold changes the output. A hardcoded filter would
    fail this test.
    """
    filtered = filter_high_grades(sample_df, threshold=93)

    assert len(filtered) == 1, "Only Diana (95) is above 93"
    only_row = filtered.iloc[0]
    assert only_row["name"] == "Diana"
73+
74+
75+
# ── Test: filter_with_loc selects Science students ─────────────────────
76+
77+
def test_filter_with_loc_returns_science_only(sample_df):
    """filter_with_loc should return only Science students with name and grade.

    WHY: .loc[] is more explicit than bracket indexing. This test verifies
    both the row filter (subject == 'Science') and the column selection
    (only 'name' and 'grade' columns).
    """
    science_rows = filter_with_loc(sample_df)

    # Row filter: exactly one Science student exists in the fixture.
    assert len(science_rows) == 1, "Only Bob studies Science"
    # Column filter: the projection must keep just these two columns.
    assert list(science_rows.columns) == ["name", "grade"], (
        "Should return only name and grade columns"
    )
    assert science_rows.iloc[0]["name"] == "Bob"
91+
92+
93+
# ── Test: count_subjects tallies correctly ─────────────────────────────
94+
95+
def test_count_subjects_returns_correct_counts(sample_df):
    """count_subjects should count students per subject.

    WHY: value_counts() is the quickest way to see the distribution of a
    categorical variable. This test verifies the counts match our known data.
    """
    tallies = count_subjects(sample_df)

    assert tallies["Math"] == 2, "Math has 2 students (Alice, Charlie)"
    assert tallies["Science"] == 1, "Science has 1 student (Bob)"
    assert tallies["English"] == 1, "English has 1 student (Diana)"
106+
107+
108+
# ── Test: group_by_subject_mean computes correct averages ──────────────
109+
110+
def test_group_by_subject_mean_values(sample_df):
    """group_by_subject_mean should return the average grade per subject.

    WHY: groupby + mean is the pandas equivalent of SQL's GROUP BY + AVG.
    An incorrect grouping column or aggregation function would give wrong
    numbers. We verify against hand-calculated averages.
    """
    averages = group_by_subject_mean(sample_df)

    # Hand-calculated: Math (92 + 75) / 2 = 83.5; Science and English each
    # have a single student, so their means equal that student's grade.
    assert averages["Math"] == pytest.approx(83.5), "Math mean should be 83.5"
    assert averages["Science"] == pytest.approx(78.0), "Science mean should be 78.0"
    assert averages["English"] == pytest.approx(95.0), "English mean should be 95.0"
125+
126+
127+
# ── Test: group_by_subject_multi_agg returns mean, max, min ────────────
128+
129+
def test_group_by_subject_multi_agg_columns(sample_df):
    """group_by_subject_multi_agg should return a table with mean, max, min.

    WHY: agg() with multiple functions is a powerful pattern. This test
    verifies that all three aggregation columns are present and that max/min
    are correct for the Math group (which has 2 students).
    """
    summary = group_by_subject_multi_agg(sample_df)

    assert "mean" in summary.columns, "Result should have a 'mean' column"
    assert "max" in summary.columns, "Result should have a 'max' column"
    assert "min" in summary.columns, "Result should have a 'min' column"

    # Math is the only multi-student group: Alice=92 and Charlie=75.
    math_row = summary.loc["Math"]
    assert math_row["max"] == 92
    assert math_row["min"] == 75
145+
146+
147+
# ── Test: top_students_per_subject finds the best in each ─────────────
148+
149+
def test_top_students_per_subject(sample_df):
    """top_students_per_subject should find the highest-scoring student per subject.

    WHY: Combining groupby with idxmax is a common pattern for finding
    "the best X in each category." This test verifies the combination works
    end-to-end.
    """
    best = top_students_per_subject(sample_df)

    # Build a subject -> name mapping so each expectation is one lookup.
    winners = {subj: who for subj, who in zip(best["subject"], best["name"])}

    assert winners["Math"] == "Alice", "Alice has the highest Math grade (92)"
    assert winners["Science"] == "Bob", "Bob is the only Science student"
    assert winners["English"] == "Diana", "Diana is the only English student"

0 commit comments

Comments
 (0)