Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion csv_diff/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,12 @@ def load_csv(fp, key=None, dialect=None):
pass
fp = csv.reader(fp, dialect=(dialect or "excel"))
headings = next(fp)
rows = [dict(zip(headings, line)) for line in fp]
# Skip blank rows so trailing newlines (or stray blank lines inside the
# file) don't get treated as data rows that are missing every column -
# that surfaced as an unhelpful KeyError on the key column. A row is
# "blank" if csv.reader returned no fields for it, which is what happens
# when the final record is just "\n" or a file ends with extra blank lines.
rows = [dict(zip(headings, line)) for line in fp if line]
if key:
keyfn = lambda r: r[key]
else:
Expand Down
54 changes: 54 additions & 0 deletions tests/test_csv_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,22 @@
1,Cleo,5
2,Pancakes,3"""

# Trailing blank line - the most common case (POSIX text files end in \n,
# many editors add an extra one). Used to raise KeyError on the key column.
#
# Written one record per literal because the final "\n" on its own is the
# whole point of this fixture: a lone newline terminating the last record is
# NOT a blank line (csv.reader yields no extra row for it), and an empty line
# inside a triple-quoted string is easy to lose to whitespace-trimming tools.
SEVEN_TRAILING = (
    "id,name,age\n"
    "1,Cleo,4\n"
    "2,Pancakes,2\n"
    "\n"
)

# Blank-row fixture: one empty line directly after the header row and two
# empty lines after the last record. Each physical line is its own literal
# so the empty ones are visible at a glance.
EIGHT_BLANK = (
    "id,name,age\n"
    "\n"
    "1,Cleo,4\n"
    "2,Pancakes,2\n"
    "\n"
    "\n"
)


def test_row_changed():
diff = compare(
Expand Down Expand Up @@ -115,3 +131,41 @@ def test_tsv():
"columns_added": [],
"columns_removed": [],
} == diff


def test_trailing_blank_line_is_skipped():
    # Regression test for https://github.com/simonw/csv-diff/issues/29.
    # Loading the SEVEN_TRAILING fixture keyed on "id" must produce exactly
    # the two data rows: an empty row must never be turned into a dict that
    # lacks the key column.
    expected = {
        "1": {"id": "1", "name": "Cleo", "age": "4"},
        "2": {"id": "2", "name": "Pancakes", "age": "2"},
    }
    source = io.StringIO(SEVEN_TRAILING)
    assert load_csv(source, key="id") == expected


def test_multiple_blank_lines_and_interior_blank_skipped():
    # EIGHT_BLANK has empty lines both before the data rows and after them.
    # Every one of those must be dropped, leaving only the two real records
    # keyed by their "id" value.
    expected = {
        "1": {"id": "1", "name": "Cleo", "age": "4"},
        "2": {"id": "2", "name": "Pancakes", "age": "2"},
    }
    source = io.StringIO(EIGHT_BLANK)
    assert load_csv(source, key="id") == expected


def test_compare_with_trailing_blank_lines():
    # End-to-end regression check for issue #29: two identical files that
    # both end with a blank line must load cleanly and diff as "no changes".
    #
    # The second "\n" after the last record is what makes this a real
    # regression test. A single terminating newline produces no empty row
    # from csv.reader, so with it alone the test would pass whether or not
    # blank rows are skipped.
    text = "id,name,age\n1,Cleo,4\n\n"
    # Separate StringIO objects: each load consumes its stream.
    previous = load_csv(io.StringIO(text), key="id")
    current = load_csv(io.StringIO(text), key="id")
    assert compare(previous, current) == {
        "added": [],
        "removed": [],
        "changed": [],
        "columns_added": [],
        "columns_removed": [],
    }