From f9be85aa0f3a5153f7ee51ad7d5c61673559f6ed Mon Sep 17 00:00:00 2001
From: Hiroshi SHIBATA
Date: Fri, 12 Jun 2026 10:24:26 +0900
Subject: [PATCH 1/4] Strip a leading byte order mark before parsing

libyaml only discounts the BOM when it detects the stream encoding by
itself. Psych passes the encoding explicitly whenever it is known, and
on that path libyaml counts the BOM as a first-line character, shifting
every token on the first line one column right and silently terminating
a block mapping at the second line.

https://github.com/ruby/psych/issues/331

Co-Authored-By: Claude Fable 5
---
 lib/psych/parser.rb       | 36 +++++++++++++++++++++++++++++++++++-
 test/psych/test_parser.rb | 35 +++++++++++++++++++++++++++++++++++
 test/psych/test_psych.rb  | 10 ++++++++++
 3 files changed, 80 insertions(+), 1 deletion(-)

diff --git a/lib/psych/parser.rb b/lib/psych/parser.rb
index 2181c730..43bf9938 100644
--- a/lib/psych/parser.rb
+++ b/lib/psych/parser.rb
@@ -59,7 +59,41 @@ def initialize handler = Handler.new
     # See Psych::Parser and Psych::Parser#handler
 
     def parse yaml, path = yaml.respond_to?(:path) ? yaml.path : "<unknown>"
-      _native_parse @handler, yaml, path
+      _native_parse @handler, strip_bom(yaml), path
+    end
+
+    private
+
+    BOM = {
+      Encoding::UTF_8 => "\u{FEFF}".freeze,
+      Encoding::UTF_16LE => "\u{FEFF}".encode(Encoding::UTF_16LE).freeze,
+      Encoding::UTF_16BE => "\u{FEFF}".encode(Encoding::UTF_16BE).freeze,
+    }.freeze
+    private_constant :BOM
+
+    # libyaml only skips a leading byte order mark when it detects the stream
+    # encoding by itself. Psych passes the encoding explicitly whenever it is
+    # known, and on that path libyaml counts the BOM as a first-line character,
+    # which shifts the column of every token on the first line and silently
+    # terminates a block mapping at the second line [Bug #13615].
+    def strip_bom yaml
+      if String === yaml
+        bom = BOM[yaml.encoding]
+        return yaml[1..-1] if bom && yaml.start_with?(bom)
+      elsif yaml.respond_to?(:read) && yaml.respond_to?(:external_encoding) &&
+          yaml.respond_to?(:pos) && yaml.respond_to?(:seek)
+        bom = BOM[yaml.external_encoding]
+        skip_io_bom yaml, bom.b if bom
+      end
+      yaml
+    end
+
+    def skip_io_bom io, bom
+      pos = io.pos
+      head = io.read(bom.bytesize)
+      io.seek(pos, IO::SEEK_SET) if head && head.b != bom
+    rescue SystemCallError, IOError
+      # Not seekable; pos raises before anything is consumed.
     end
   end
 end
diff --git a/test/psych/test_parser.rb b/test/psych/test_parser.rb
index 4ca4d63d..de8c4fa0 100644
--- a/test/psych/test_parser.rb
+++ b/test/psych/test_parser.rb
@@ -173,6 +173,37 @@ def test_bom
       assert_equal tadpole, @parser.handler.calls.find { |method, args| method == :scalar }[1].first
     end
 
+    # BOM + multi-line mapping used to lose every line after the first one
+    # https://github.com/ruby/psych/issues/331
+    def test_bom_multiline_utf8
+      @parser.parse "\uFEFFa: b\nc: d\n"
+      assert_equal %w[a b c d], scalars(@parser.handler)
+    end
+
+    def test_bom_multiline_utf16
+      %w[UTF-16LE UTF-16BE].each do |enc|
+        handler = EventCatcher.new
+        Psych::Parser.new(handler).parse "\uFEFFa: b\nc: d\n".encode(enc)
+        assert_equal %w[a b c d], scalars(handler), enc
+      end
+    end
+
+    def test_bom_multiline_io
+      @parser.parse StringIO.new("\uFEFFa: b\nc: d\n")
+      assert_equal %w[a b c d], scalars(@parser.handler)
+    end
+
+    def test_bom_only
+      @parser.parse "\uFEFF"
+      assert_equal [], scalars(@parser.handler)
+    end
+
+    def test_io_without_bom_is_not_modified
+      io = StringIO.new "a: b\nc: d\n".freeze
+      @parser.parse io
+      assert_equal %w[a b c d], scalars(@parser.handler)
+    end
+
     def test_external_encoding
       tadpole = 'おたまじゃくし'
 
@@ -445,6 +476,10 @@ def test_code_point_limit
       end
     end
 
+    def scalars handler
+      handler.calls.select { |method, _| method == :scalar }.map { |_, args| args.first }
+    end
+
     def assert_called call, with = nil, parser = @parser
       if with
         call = parser.handler.calls.find { |x|
diff --git a/test/psych/test_psych.rb b/test/psych/test_psych.rb
index 4455c471..8e5ec941 100644
--- a/test/psych/test_psych.rb
+++ b/test/psych/test_psych.rb
@@ -158,6 +158,16 @@ def test_parse_stream
     assert_equal(%w[foo bar], docs.children.map(&:transform))
   end
 
+  # https://github.com/ruby/psych/issues/331
+  def test_load_with_leading_bom
+    assert_equal({ "a" => "b", "c" => "d" }, Psych.load("\uFEFFa: b\nc: d"))
+  end
+
+  def test_parse_stream_with_leading_bom
+    docs = Psych.parse_stream("\uFEFFa: b\nc: d")
+    assert_equal [{ "a" => "b", "c" => "d" }], docs.children.map(&:to_ruby)
+  end
+
   def test_parse_stream_with_block
     docs = []
     Psych.parse_stream("--- foo\n...\n--- bar\n...") do |node|

From b0dd20d3be9bcb56695b7441efcd9005981522fc Mon Sep 17 00:00:00 2001
From: Hiroshi SHIBATA
Date: Fri, 12 Jun 2026 11:01:49 +0900
Subject: [PATCH 2/4] Restrict the BOM skip rescue to the seekability probe

If pos succeeded but the later seek failed, the rescue silently
discarded the bytes read to check for a BOM. Only the initial pos call
is expected to fail, for non-seekable IOs, before anything is consumed.

Co-Authored-By: Claude Fable 5
---
 lib/psych/parser.rb | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/lib/psych/parser.rb b/lib/psych/parser.rb
index 43bf9938..5a90296e 100644
--- a/lib/psych/parser.rb
+++ b/lib/psych/parser.rb
@@ -89,11 +89,13 @@ def strip_bom yaml
     end
 
     def skip_io_bom io, bom
-      pos = io.pos
+      begin
+        pos = io.pos
+      rescue SystemCallError, IOError
+        return # Not seekable; nothing has been consumed yet.
+      end
       head = io.read(bom.bytesize)
       io.seek(pos, IO::SEEK_SET) if head && head.b != bom
-    rescue SystemCallError, IOError
-      # Not seekable; pos raises before anything is consumed.
     end
   end
 end

From 72eee65e6fbf22faee99fceb2aa381de4fcca1d9 Mon Sep 17 00:00:00 2001
From: Hiroshi SHIBATA
Date: Fri, 12 Jun 2026 11:02:42 +0900
Subject: [PATCH 3/4] Strip the BOM from UTF-32 strings too

The C extension transcodes UTF-32 strings to UTF-8 with the BOM
preserved, so they were truncated the same way as UTF-8 input.

Co-Authored-By: Claude Fable 5
---
 lib/psych/parser.rb       | 2 ++
 test/psych/test_parser.rb | 8 ++++++++
 2 files changed, 10 insertions(+)

diff --git a/lib/psych/parser.rb b/lib/psych/parser.rb
index 5a90296e..5d4d063a 100644
--- a/lib/psych/parser.rb
+++ b/lib/psych/parser.rb
@@ -68,6 +68,8 @@ def parse yaml, path = yaml.respond_to?(:path) ? yaml.path : "<unknown>"
       Encoding::UTF_8 => "\u{FEFF}".freeze,
       Encoding::UTF_16LE => "\u{FEFF}".encode(Encoding::UTF_16LE).freeze,
       Encoding::UTF_16BE => "\u{FEFF}".encode(Encoding::UTF_16BE).freeze,
+      Encoding::UTF_32LE => "\u{FEFF}".encode(Encoding::UTF_32LE).freeze,
+      Encoding::UTF_32BE => "\u{FEFF}".encode(Encoding::UTF_32BE).freeze,
     }.freeze
     private_constant :BOM
 
diff --git a/test/psych/test_parser.rb b/test/psych/test_parser.rb
index de8c4fa0..c175b8a1 100644
--- a/test/psych/test_parser.rb
+++ b/test/psych/test_parser.rb
@@ -188,6 +188,14 @@ def test_bom_multiline_utf16
       end
     end
 
+    def test_bom_multiline_utf32
+      %w[UTF-32LE UTF-32BE].each do |enc|
+        handler = EventCatcher.new
+        Psych::Parser.new(handler).parse "\uFEFFa: b\nc: d\n".encode(enc)
+        assert_equal %w[a b c d], scalars(handler), enc
+      end
+    end
+
     def test_bom_multiline_io
       @parser.parse StringIO.new("\uFEFFa: b\nc: d\n")
       assert_equal %w[a b c d], scalars(@parser.handler)

From 681fb06b46547d45849bb2d7a0682973e4eafb70 Mon Sep 17 00:00:00 2001
From: Hiroshi SHIBATA
Date: Fri, 12 Jun 2026 11:03:06 +0900
Subject: [PATCH 4/4] Use String#delete_prefix to strip the BOM

Co-Authored-By: Claude Fable 5
---
 lib/psych/parser.rb | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/psych/parser.rb b/lib/psych/parser.rb
index 5d4d063a..df14713d 100644
--- a/lib/psych/parser.rb
+++ b/lib/psych/parser.rb
@@ -81,7 +81,8 @@ def parse yaml, path = yaml.respond_to?(:path) ? yaml.path : "<unknown>"
     def strip_bom yaml
       if String === yaml
         bom = BOM[yaml.encoding]
-        return yaml[1..-1] if bom && yaml.start_with?(bom)
+        # delete_prefix copies even when there is no prefix, so keep the guard.
+        return yaml.delete_prefix(bom) if bom && yaml.start_with?(bom)
       elsif yaml.respond_to?(:read) && yaml.respond_to?(:external_encoding) &&
           yaml.respond_to?(:pos) && yaml.respond_to?(:seek)
         bom = BOM[yaml.external_encoding]