travisjneuman
diff --git a/projects/modules/07-data-analysis/01-pandas-basics/tests/test_project.py b/projects/modules/07-data-analysis/01-pandas-basics/tests/test_project.py
Lines changed: 130 additions & 0 deletions
diff --git a/projects/modules/07-data-analysis/02-filtering-grouping/tests/test_project.py b/projects/modules/07-data-analysis/02-filtering-grouping/tests/test_project.py
Lines changed: 163 additions & 0 deletions
@@ -0,0 +1,130 @@
+"""
+Tests for Project 01 — Pandas Basics
+
+These tests verify the data exploration functions from project.py using
+small inline DataFrames instead of loading CSV files. This makes the tests
+self-contained and fast — they do not depend on external data files.
+
+Why inline DataFrames?
+    In real data analysis, your data lives in files or databases. But in tests,
+    you want full control over the input so you can predict the output. Creating
+    small DataFrames directly in Python gives you that control.
+
+Run with: pytest tests/test_project.py -v
+"""
+
+import pandas as pd
+import pytest
+
+from project import (
+    load_data,
+    explore_shape,
+    select_columns,
+    sort_by_grade,
+)
+
+
+# ── Helper: create a small test DataFrame ──────────────────────────────
+# This fixture provides a consistent DataFrame for all tests in this file.
+# Using a fixture avoids repeating the same setup code in every test.
+
+@pytest.fixture
+def sample_df():
+    """Return a four-row student table shaped like students.csv."""
+    columns = ["name", "subject", "grade", "age"]
+    rows = [
+        ("Alice", "Math", 92, 16), ("Bob", "Science", 78, 17),
+        ("Charlie", "Math", 85, 16), ("Diana", "English", 95, 18),
+    ]
+    return pd.DataFrame(rows, columns=columns)
+
+
+# ── Test: load_data reads a CSV correctly ──────────────────────────────
+# We test load_data by writing a temporary CSV and loading it.
+# tmp_path is a built-in pytest fixture that gives us a temporary directory.
+
+def test_load_data_returns_dataframe(tmp_path):
+    """Round-trip a two-row CSV through load_data and check what comes back.
+
+    WHY: Writing the file under tmp_path keeps the test independent of
+    data/students.csv existing on disk.
+    """
+    csv_text = (
+        "name,subject,grade,age\n"
+        "Alice,Math,92,16\n"
+        "Bob,Science,78,17\n"
+    )
+    source_file = tmp_path / "test_students.csv"
+    source_file.write_text(csv_text)
+    loaded = load_data(str(source_file))
+    assert isinstance(loaded, pd.DataFrame), "load_data should return a DataFrame"
+    assert len(loaded) == 2, "Should have loaded 2 rows from the CSV"
+    assert list(loaded.columns) == ["name", "subject", "grade", "age"]
+
+
+# ── Test: explore_shape leaves the DataFrame unchanged ─────────────────
+
+def test_explore_shape_does_not_modify_data(sample_df):
+    """explore_shape must leave the DataFrame it is given untouched.
+
+    WHY: A shape check alone would miss in-place edits that keep the row and
+    column counts, so the frame is compared against a deep copy taken first.
+    """
+    snapshot = sample_df.copy(deep=True)
+
+    explore_shape(sample_df)  # any printed output is irrelevant here
+
+    assert sample_df.shape == snapshot.shape, "explore_shape should not change the DataFrame"
+    pd.testing.assert_frame_equal(sample_df, snapshot)
+
+
+# ── Test: select_columns picks the right columns ──────────────────────
+
+def test_select_columns_returns_name_and_grade(sample_df, capsys):
+    """The text printed by select_columns should mention name and grade.
+
+    WHY: Column selection is a core pandas operation. The result is checked
+    through captured stdout, which the capsys fixture makes available after
+    the call.
+    """
+    select_columns(sample_df)
+    printed = capsys.readouterr().out
+
+    # Only presence is checked; the exact layout of the output is not pinned.
+    assert "name" in printed, "Output should mention the 'name' column"
+    assert "grade" in printed, "Output should mention the 'grade' column"
+
+
+# ── Test: sort_by_grade orders highest first ───────────────────────────
+
+def test_sort_by_grade_highest_first(sample_df, capsys):
+    """sort_by_grade should print students from highest grade to lowest.
+
+    WHY: Finding "Diana" or "95" anywhere in the output also passes for an
+    ascending sort, so the order of whichever names/grades appear is checked.
+    """
+    sort_by_grade(sample_df)
+    out = capsys.readouterr().out
+    assert "Diana" in out or "95" in out, (
+        "The highest-scoring student should appear in the output"
+    )
+    for ranked in (("Diana", "Alice", "Charlie", "Bob"), ("95", "92", "85", "78")):
+        shown = [out.find(token) for token in ranked if token in out]
+        assert shown == sorted(shown), "Rows should be listed highest grade first"
+
+
+# ── Test: DataFrame column types ──────────────────────────────────────
+
+def test_dataframe_has_expected_dtypes(sample_df):
+    """The grade and age columns of the test DataFrame should be numeric.
+
+    WHY: If grade or age ended up as strings (object type), mathematical
+    operations like mean() and sort_values() would fail or give wrong results.
+    is_numeric_dtype accepts any numeric dtype, so the check does not depend
+    on the platform's default integer width.
+    """
+    # This inspects the fixture itself; no project function is called.
+    for column in ("grade", "age"):
+        assert pd.api.types.is_numeric_dtype(sample_df[column]), (
+            f"{column} column should be numeric"
+        )
@@ -0,0 +1,163 @@
+"""
+Tests for Project 02 — Filtering & Grouping
+
+These tests verify boolean filtering, .loc[] selection, value_counts(),
+groupby(), and multi-aggregation using small inline DataFrames.
+
+Why test data analysis functions?
+    Even though pandas does the heavy lifting, your functions add logic on
+    top (thresholds, column choices, aggregation strategies). Tests verify
+    that YOUR logic is correct, not that pandas works.
+
+Run with: pytest tests/test_project.py -v
+"""
+
+import pandas as pd
+import pytest
+
+from project import (
+    filter_high_grades,
+    filter_with_loc,
+    count_subjects,
+    group_by_subject_mean,
+    group_by_subject_multi_agg,
+    top_students_per_subject,
+)
+
+
+@pytest.fixture
+def sample_df():
+    """Return a four-student table with hand-checkable aggregates.
+
+    Facts that follow directly from the rows below:
+    - grades above 80: Alice (92) and Diana (95)
+    - subject counts: Math 2, Science 1, English 1
+    - best per subject: Alice (Math), Bob (Science), Diana (English)
+    """
+    columns = ["name", "subject", "grade", "age"]
+    rows = [
+        ("Alice", "Math", 92, 16), ("Bob", "Science", 78, 17),
+        ("Charlie", "Math", 75, 16), ("Diana", "English", 95, 18),
+    ]
+    return pd.DataFrame(rows, columns=columns)
+
+
+# ── Test: filter_high_grades returns correct rows ──────────────────────
+
+def test_filter_high_grades_default_threshold(sample_df):
+    """With threshold=80, only the students scoring above 80 should remain.
+
+    WHY: Boolean indexing fails silently when the wrong column or comparison
+    is used. The fixture has no grade of exactly 80, so this test does not
+    distinguish > from >=.
+    """
+    high_scorers = filter_high_grades(sample_df, threshold=80)
+
+    assert len(high_scorers) == 2, "Should find exactly 2 students above grade 80"
+    assert set(high_scorers["name"]) == {"Alice", "Diana"}, (
+        "Alice (92) and Diana (95) should be the high performers"
+    )
+
+
+def test_filter_high_grades_custom_threshold(sample_df):
+    """Raising the threshold to 93 should narrow the result to one student.
+
+    WHY: This proves the threshold argument is honoured. A filter with a
+    hardcoded cutoff of 80 would still return two rows and fail here.
+    """
+    top_only = filter_high_grades(sample_df, threshold=93)
+
+    # Diana's 95 is the only grade in the fixture that exceeds 93.
+    assert len(top_only) == 1, "Only Diana (95) is above 93"
+    assert top_only["name"].iloc[0] == "Diana"
+
+
+# ── Test: filter_with_loc selects Science students ─────────────────────
+
+def test_filter_with_loc_returns_science_only(sample_df):
+    """filter_with_loc should yield Science rows restricted to name and grade.
+
+    WHY: Two things can go wrong with .loc[]: the row mask and the column
+    list. Bob is the fixture's only Science student, so both are easy to
+    check on the single row that should come back.
+    """
+    science_rows = filter_with_loc(sample_df)
+
+    assert len(science_rows) == 1, "Only Bob studies Science"
+    assert list(science_rows.columns) == ["name", "grade"], (
+        "Should return only name and grade columns"
+    )
+    assert science_rows["name"].iloc[0] == "Bob"
+
+
+# ── Test: count_subjects tallies correctly ─────────────────────────────
+
+def test_count_subjects_returns_correct_counts(sample_df):
+    """count_subjects should report how many students take each subject.
+
+    WHY: The fixture's subject distribution is known in advance, so the
+    tally returned by the function can be compared against it directly.
+    """
+    tally = count_subjects(sample_df)
+
+    assert tally["Math"] == 2, "Math has 2 students (Alice, Charlie)"
+    assert tally["Science"] == 1, "Science has 1 student (Bob)"
+    assert tally["English"] == 1, "English has 1 student (Diana)"
+
+
+# ── Test: group_by_subject_mean computes correct averages ──────────────
+
+def test_group_by_subject_mean_values(sample_df):
+    """group_by_subject_mean should give each subject's average grade.
+
+    WHY: Grouping on the wrong column or using the wrong aggregation would
+    still return numbers, just the wrong ones, so the result is compared
+    with averages worked out by hand from the fixture.
+    """
+    averages = group_by_subject_mean(sample_df)
+
+    # Math has two grades: (92 + 75) / 2 = 83.5
+    assert averages["Math"] == pytest.approx(83.5), "Math mean should be 83.5"
+    # Science and English have a single student each, so the mean is simply
+    # that student's grade.
+    assert averages["Science"] == pytest.approx(78.0), "Science mean should be 78.0"
+    assert averages["English"] == pytest.approx(95.0), "English mean should be 95.0"
+
+
+# ── Test: group_by_subject_multi_agg returns mean, max, min ────────────
+
+def test_group_by_subject_multi_agg_columns(sample_df):
+    """group_by_subject_multi_agg should expose mean, max and min per subject.
+
+    WHY: Math is the only subject in the fixture with two students, so it is
+    the one group where max (92) and min (75) differ and a mix-up between
+    them would be visible.
+    """
+    summary = group_by_subject_multi_agg(sample_df)
+
+    assert "mean" in summary.columns, "Result should have a 'mean' column"
+    assert "max" in summary.columns, "Result should have a 'max' column"
+    assert "min" in summary.columns, "Result should have a 'min' column"
+
+    math_stats = summary.loc["Math"]
+    assert math_stats["max"] == 92
+    assert math_stats["min"] == 75
+
+
+# ── Test: top_students_per_subject finds the best in each ─────────────
+
+def test_top_students_per_subject(sample_df):
+    """top_students_per_subject should name the best scorer in every subject.
+
+    WHY: "Best X per category" questions are easy to get subtly wrong. Math
+    is the discriminating case here: Alice (92) must beat Charlie (75).
+    """
+    winners = top_students_per_subject(sample_df)
+
+    # Map each subject to its winner so the assertions read naturally.
+    best_in = {
+        subject: name for subject, name in zip(winners["subject"], winners["name"])
+    }
+    assert best_in["Math"] == "Alice", "Alice has the highest Math grade (92)"
+    assert best_in["Science"] == "Bob", "Bob is the only Science student"
+    assert best_in["English"] == "Diana", "Diana is the only English student"