From 76bba1585c11c0a375008864f859a215c8ba4ce6 Mon Sep 17 00:00:00 2001 From: Zo Bot Date: Mon, 22 Jun 2026 21:30:59 +0000 Subject: [PATCH] raise csv field size limit at module import Closes #41. csv's default field size limit is 131072 bytes (128 KiB), which is too small for real-world inputs like nucleotide sequences, embedded JSON strings, or long log lines. Without this, load_csv crashes on otherwise-valid input with 'csv.Error: field larger than field limit (131072)'. Bump the limit to the platform max at import time so callers don't have to know about csv's quirky global. The pattern uses ctypes.c_ulong(-1).value // 2 (the value Simon Willison suggested in the issue itself, lifted from https://stackoverflow.com/a/54517228/7483211) to land on the largest value that fits in a signed C long (LONG_MAX), which is the most csv.field_size_limit will accept. Tests: tests/test_csv_diff.py::test_long_field_does_not_exceed_csv_field_size_limit asserts that the limit raised at import is still in effect and that load_csv parses a 2^19-char field (4x csv's default limit) without raising. --- csv_diff/__init__.py | 9 +++++++++ tests/test_csv_diff.py | 19 +++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/csv_diff/__init__.py b/csv_diff/__init__.py index 59a2eaf..18116aa 100644 --- a/csv_diff/__init__.py +++ b/csv_diff/__init__.py @@ -1,8 +1,17 @@ import csv +import ctypes from dictdiffer import diff import json import hashlib +# csv's default field size limit is 131072 bytes, which is too small for +# real-world use cases like nucleotide sequences, long log lines, or JSON +# strings embedded in a CSV cell. Raise it to the platform max at import +# time so load_csv / load_json don't crash with +# `csv.Error: field larger than field limit` on otherwise-valid input. 
+# See https://github.com/simonw/csv-diff/issues/41 +csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2)) + def load_csv(fp, key=None, dialect=None): if dialect is None and fp.seekable(): diff --git a/tests/test_csv_diff.py b/tests/test_csv_diff.py index 0e3670f..ae5f936 100644 --- a/tests/test_csv_diff.py +++ b/tests/test_csv_diff.py @@ -115,3 +115,22 @@ def test_tsv(): "columns_added": [], "columns_removed": [], } == diff + + +def test_long_field_does_not_exceed_csv_field_size_limit(): + # https://github.com/simonw/csv-diff/issues/41 - the csv module's + # default field size limit is 131072 bytes, which is too small for + # real-world fields like nucleotide sequences. csv-diff raises the + # limit at import time so load_csv can handle long fields without + # crashing with `csv.Error: field larger than field limit`. + import csv as _csv + import csv_diff # noqa: F401 - import side effect: raises the limit + + big_value = "A" * (2 ** 19) # 512 KiB, 4x the default limit + assert _csv.field_size_limit() > len(big_value) # import-time raise must still be in effect; do not lower it here + + csv_data = "id,name\n1,{}\n".format(big_value) + rows = load_csv(io.StringIO(csv_data), key="id") + assert len(rows) == 1 + assert rows["1"]["name"] == big_value + assert len(rows["1"]["name"]) == len(big_value)