diff --git a/lib/psych/parser.rb b/lib/psych/parser.rb index 2181c730..df14713d 100644 --- a/lib/psych/parser.rb +++ b/lib/psych/parser.rb @@ -59,7 +59,46 @@ def initialize handler = Handler.new # See Psych::Parser and Psych::Parser#handler def parse yaml, path = yaml.respond_to?(:path) ? yaml.path : "" - _native_parse @handler, yaml, path + _native_parse @handler, strip_bom(yaml), path + end + + private + + BOM = { + Encoding::UTF_8 => "\u{FEFF}".freeze, + Encoding::UTF_16LE => "\u{FEFF}".encode(Encoding::UTF_16LE).freeze, + Encoding::UTF_16BE => "\u{FEFF}".encode(Encoding::UTF_16BE).freeze, + Encoding::UTF_32LE => "\u{FEFF}".encode(Encoding::UTF_32LE).freeze, + Encoding::UTF_32BE => "\u{FEFF}".encode(Encoding::UTF_32BE).freeze, + }.freeze + private_constant :BOM + + # libyaml only skips a leading byte order mark when it detects the stream + # encoding by itself. Psych passes the encoding explicitly whenever it is + # known, and on that path libyaml counts the BOM as a first-line character, + # which shifts the column of every token on the first line and silently + # terminates a block mapping at the second line [Bug #13615]. + def strip_bom yaml + if String === yaml + bom = BOM[yaml.encoding] + # delete_prefix copies even when there is no prefix, so keep the guard. + return yaml.delete_prefix(bom) if bom && yaml.start_with?(bom) + elsif yaml.respond_to?(:read) && yaml.respond_to?(:external_encoding) && + yaml.respond_to?(:pos) && yaml.respond_to?(:seek) + bom = BOM[yaml.external_encoding] + skip_io_bom yaml, bom.b if bom + end + yaml + end + + def skip_io_bom io, bom + begin + pos = io.pos + rescue SystemCallError, IOError + return # Not seekable; nothing has been consumed yet. + end + head = io.read(bom.bytesize) + io.seek(pos, IO::SEEK_SET) if head && head.b != bom end end end diff --git a/test/psych/test_parser.rb b/test/psych/test_parser.rb index 4ca4d63d..c175b8a1 100644 --- a/test/psych/test_parser.rb +++ b/test/psych/test_parser.rb @@ -173,6 +173,45 @@ def test_bom assert_equal tadpole, @parser.handler.calls.find { |method, args| method == :scalar }[1].first end + # BOM + multi-line mapping used to lose every line after the first one + # https://github.com/ruby/psych/issues/331 + def test_bom_multiline_utf8 + @parser.parse "\uFEFFa: b\nc: d\n" + assert_equal %w[a b c d], scalars(@parser.handler) + end + + def test_bom_multiline_utf16 + %w[UTF-16LE UTF-16BE].each do |enc| + handler = EventCatcher.new + Psych::Parser.new(handler).parse "\uFEFFa: b\nc: d\n".encode(enc) + assert_equal %w[a b c d], scalars(handler), enc + end + end + + def test_bom_multiline_utf32 + %w[UTF-32LE UTF-32BE].each do |enc| + handler = EventCatcher.new + Psych::Parser.new(handler).parse "\uFEFFa: b\nc: d\n".encode(enc) + assert_equal %w[a b c d], scalars(handler), enc + end + end + + def test_bom_multiline_io + @parser.parse StringIO.new("\uFEFFa: b\nc: d\n") + assert_equal %w[a b c d], scalars(@parser.handler) + end + + def test_bom_only + @parser.parse "\uFEFF" + assert_equal [], scalars(@parser.handler) + end + + def test_io_without_bom_is_not_modified + io = StringIO.new "a: b\nc: d\n".freeze + @parser.parse io + assert_equal %w[a b c d], scalars(@parser.handler) + end + def test_external_encoding tadpole = 'おたまじゃくし' @@ -445,6 +484,10 @@ def test_code_point_limit end end + def scalars handler + handler.calls.select { |method, _| method == :scalar }.map { |_, args| args.first } + end + def assert_called call, with = nil, parser = @parser if with call = parser.handler.calls.find { |x| diff --git a/test/psych/test_psych.rb b/test/psych/test_psych.rb index 4455c471..8e5ec941 100644 --- a/test/psych/test_psych.rb +++ b/test/psych/test_psych.rb @@ -158,6 +158,16 @@ def test_parse_stream assert_equal(%w[foo bar], docs.children.map(&:transform)) end + # https://github.com/ruby/psych/issues/331 + def test_load_with_leading_bom + assert_equal({ "a" => "b", "c" => "d" }, Psych.load("\uFEFFa: b\nc: d")) + end + + def test_parse_stream_with_leading_bom + docs = Psych.parse_stream("\uFEFFa: b\nc: d") + assert_equal [{ "a" => "b", "c" => "d" }], docs.children.map(&:to_ruby) + end + def test_parse_stream_with_block docs = [] Psych.parse_stream("--- foo\n...\n--- bar\n...") do |node|