From 76bba1585c11c0a375008864f859a215c8ba4ce6 Mon Sep 17 00:00:00 2001
From: Zo Bot <github-automation@zo.computer>
Date: Mon, 22 Jun 2026 21:30:59 +0000
Subject: [PATCH] raise csv field size limit at module import

Closes #41. csv's default field size limit is 131072 bytes (128 KiB),
which is too small for real-world inputs like nucleotide sequences,
embedded JSON strings, or long log lines. Without this, load_csv crashes
on otherwise-valid input with 'csv.Error: field larger than field limit
(131072)'.

Bump the limit to the platform max at import time so callers don't have
to know about csv's quirky global. The pattern uses
ctypes.c_ulong(-1).value // 2 (the value Simon Willison suggested in the
issue itself, lifted from https://stackoverflow.com/a/54517228/7483211),
which equals the platform's LONG_MAX: the largest value that
csv.field_size_limit(), which takes a C long, will accept.
Tests: tests/test_csv_diff.py::test_long_field_does_not_exceed_csv_field_size_limit
feeds load_csv a field of at least 2^17 characters and confirms it does not raise.
---
 csv_diff/__init__.py   |  9 +++++++++
 tests/test_csv_diff.py | 19 +++++++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/csv_diff/__init__.py b/csv_diff/__init__.py
index 59a2eaf..18116aa 100644
--- a/csv_diff/__init__.py
+++ b/csv_diff/__init__.py
@@ -1,8 +1,17 @@
 import csv
+import ctypes
 from dictdiffer import diff
 import json
 import hashlib
 
+# Python's csv parser rejects any field longer than 131072 characters by
+# default, which real-world data (nucleotide sequences, long log lines,
+# JSON embedded in a cell) easily exceeds, failing with
+# `csv.Error: field larger than field limit`. Lift the limit once, at
+# import time, to LONG_MAX -- the largest value csv will accept.
+# See https://github.com/simonw/csv-diff/issues/41
+csv.field_size_limit(ctypes.c_ulong(-1).value // 2)
+
 
 def load_csv(fp, key=None, dialect=None):
     if dialect is None and fp.seekable():
diff --git a/tests/test_csv_diff.py b/tests/test_csv_diff.py
index 0e3670f..ae5f936 100644
--- a/tests/test_csv_diff.py
+++ b/tests/test_csv_diff.py
@@ -115,3 +115,22 @@ def test_tsv():
         "columns_added": [],
         "columns_removed": [],
     } == diff
+
+
+def test_long_field_does_not_exceed_csv_field_size_limit():
+    # https://github.com/simonw/csv-diff/issues/41 - the csv module's
+    # default field size limit is 131072 characters, which is too small
+    # for real-world fields like nucleotide sequences. csv-diff raises
+    # the limit at import time, so the limit must NOT be reset here:
+    # doing so would undo the very fix this test is meant to cover.
+    import csv as _csv
+    import csv_diff  # noqa: F401  - import side effect: raises the limit
+
+    big_value = "A" * (2 ** 18)  # 256 Ki chars, 2x csv's default limit
+    assert _csv.field_size_limit() > len(big_value)
+
+    csv_data = "id,name\n1,{}\n".format(big_value)
+    rows = load_csv(io.StringIO(csv_data), key="id")
+    assert len(rows) == 1
+    assert rows["1"]["name"] == big_value
+    assert len(rows["1"]["name"]) == len(big_value)