From ee21c35c77a3ab0e9e632da8ceb1e971936db5fd Mon Sep 17 00:00:00 2001 From: Zo Bot Date: Mon, 22 Jun 2026 21:27:41 +0000 Subject: [PATCH] skip blank rows in load_csv so trailing blank lines no longer trigger KeyError Closes #29. A file ending with one or more blank lines (an extra \n after the final record's own line terminator) used to produce an empty dict entry that was missing every column, which then crashed with KeyError when the caller tried to access the key column. csv.reader returns an empty list for a fully blank row, so filtering those out before building dicts is enough - blank rows in the middle of the file are skipped the same way. Whitespace-only or comma-only lines still parse as data, which preserves the existing behaviour for inputs where those carry meaning. --- csv_diff/__init__.py | 7 +++++- tests/test_csv_diff.py | 54 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/csv_diff/__init__.py b/csv_diff/__init__.py index 59a2eaf..223b306 100644 --- a/csv_diff/__init__.py +++ b/csv_diff/__init__.py @@ -16,7 +16,12 @@ def load_csv(fp, key=None, dialect=None): pass fp = csv.reader(fp, dialect=(dialect or "excel")) headings = next(fp) - rows = [dict(zip(headings, line)) for line in fp] + # Skip blank rows so trailing blank lines (or stray blank lines inside the + # file) don't get treated as data rows that are missing every column - + # that surfaced as an unhelpful KeyError on the key column. A row is + # "blank" if csv.reader returned no fields for it, which is what happens + # when the final record is just "\n" or a file ends with extra blank lines. + rows = [dict(zip(headings, line)) for line in fp if line] if key: keyfn = lambda r: r[key] else: diff --git a/tests/test_csv_diff.py b/tests/test_csv_diff.py index 0e3670f..fc51655 100644 --- a/tests/test_csv_diff.py +++ b/tests/test_csv_diff.py @@ -51,6 +51,22 @@ 1,Cleo,5 2,Pancakes,3""" +# Trailing blank line - the most common case (POSIX text files end in \n, +# many editors add an extra one). 
Only the extra empty line raised KeyError; this fixture has just the final \n. +SEVEN_TRAILING = """id,name,age +1,Cleo,4 +2,Pancakes,2 +""" + +# Multiple trailing blank lines, plus a blank row in the middle of the file. +EIGHT_BLANK = """id,name,age + +1,Cleo,4 +2,Pancakes,2 + + +""" + def test_row_changed(): diff = compare( @@ -115,3 +131,41 @@ def test_tsv(): "columns_added": [], "columns_removed": [], } == diff + + +def test_trailing_blank_line_is_skipped(): + # https://github.com/simonw/csv-diff/issues/29 - a file with empty trailing + # lines used to produce a dict entry missing every column, which then + # KeyError'd on the key column. This fixture ends with a single "\n", so it + # pins the ordinary case: the final line terminator contributes no row. + loaded = load_csv(io.StringIO(SEVEN_TRAILING), key="id") + assert loaded == { + "1": {"id": "1", "name": "Cleo", "age": "4"}, + "2": {"id": "2", "name": "Pancakes", "age": "2"}, + } + + +def test_multiple_blank_lines_and_interior_blank_skipped(): + # A blank row in the middle and several trailing blank lines should all + # be dropped - the loader treats csv.reader's empty `line` as the marker + # of a row that contributed no fields. + loaded = load_csv(io.StringIO(EIGHT_BLANK), key="id") + assert loaded == { + "1": {"id": "1", "name": "Cleo", "age": "4"}, + "2": {"id": "2", "name": "Pancakes", "age": "2"}, + } + + +def test_compare_with_trailing_blank_lines(): + # End-to-end: comparing two identical files that both end with a single + # newline should report no changes (baseline beside the issue #29 tests). + a = "id,name,age\n1,Cleo,4\n" + b = "id,name,age\n1,Cleo,4\n" + diff = compare(load_csv(io.StringIO(a), key="id"), load_csv(io.StringIO(b), key="id")) + assert diff == { + "added": [], + "removed": [], + "changed": [], + "columns_added": [], + "columns_removed": [], + }