Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 40 additions & 1 deletion lib/psych/parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,46 @@ def initialize handler = Handler.new
# See Psych::Parser and Psych::Parser#handler

# Parse +yaml+ and send the resulting events to @handler.
#
# +yaml+ is first passed through strip_bom so that a leading byte order
# mark never reaches the native parser. +path+ defaults to +yaml.path+ when
# +yaml+ responds to +path+, and to "<unknown>" otherwise.
#
# The input is handed to the native parser exactly once: a second call would
# re-parse a String with its BOM intact and would find an IO already consumed.
def parse yaml, path = yaml.respond_to?(:path) ? yaml.path : "<unknown>"
  _native_parse @handler, strip_bom(yaml), path
end

private

# U+FEFF (byte order mark) encoded once per supported Unicode encoding and
# keyed by the Encoding object, so a lookup by a string's or stream's
# encoding yields the exact byte sequence to look for.
BOM = [
  Encoding::UTF_8,
  Encoding::UTF_16LE,
  Encoding::UTF_16BE,
  Encoding::UTF_32LE,
  Encoding::UTF_32BE,
].each_with_object({}) { |encoding, table|
  table[encoding] = "\u{FEFF}".encode(encoding).freeze
}.freeze
# Keep the BOM lookup table out of the public constant namespace; within the
# code shown it is only read by strip_bom.
private_constant :BOM

# Returns +yaml+ with a leading byte order mark taken out of the way.
#
# libyaml only skips a leading byte order mark when it detects the stream
# encoding by itself. Psych passes the encoding explicitly whenever it is
# known, and on that path libyaml counts the BOM as a first-line character,
# which shifts the column of every token on the first line and silently
# terminates a block mapping at the second line [Bug #13615].
#
# * String: a copy without the BOM when the string starts with the BOM of
#   its own encoding, otherwise the very same object.
# * Object responding to read, external_encoding, pos and seek: the BOM is
#   consumed in place through skip_io_bom and the same object is returned.
# * Anything else is returned untouched.
def strip_bom yaml
  case yaml
  when String
    mark = BOM[yaml.encoding]
    # delete_prefix copies even when there is no prefix, so only call it
    # once the BOM is known to be present.
    (mark && yaml.start_with?(mark)) ? yaml.delete_prefix(mark) : yaml
  else
    io_like = [:read, :external_encoding, :pos, :seek].all? { |msg| yaml.respond_to?(msg) }
    mark = BOM[yaml.external_encoding] if io_like
    # Streams are compared byte-wise, hence the binary copy of the mark.
    skip_io_bom(yaml, mark.b) if mark
    yaml
  end
end

# Consumes +bom+ (a binary string) from +io+ when the next bytes of the
# stream are exactly that mark; otherwise the stream is put back where it
# was. Streams whose position cannot be queried are left alone.
def skip_io_bom io, bom
  origin =
    begin
      io.pos
    rescue SystemCallError, IOError
      # Not seekable; nothing has been consumed yet.
      return
    end

  prefix = io.read(bom.bytesize)
  # Nothing to undo at end of stream (nil) or when the mark was really there.
  return if !prefix || prefix.b == bom

  io.seek(origin, IO::SEEK_SET)
end
end
end
43 changes: 43 additions & 0 deletions test/psych/test_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,45 @@ def test_bom
assert_equal tadpole, @parser.handler.calls.find { |method, args| method == :scalar }[1].first
end

# Regression test: a BOM in front of a multi-line mapping used to drop every
# line after the first one.
# https://github.com/ruby/psych/issues/331
def test_bom_multiline_utf8
  source = "\uFEFFa: b\nc: d\n"
  @parser.parse(source)

  assert_equal ["a", "b", "c", "d"], scalars(@parser.handler)
end

# Same multi-line mapping with a leading BOM, in both UTF-16 byte orders.
# A fresh parser and handler are used per encoding so events do not mix.
def test_bom_multiline_utf16
  ["UTF-16LE", "UTF-16BE"].each do |encoding_name|
    catcher = EventCatcher.new
    parser = Psych::Parser.new(catcher)
    parser.parse("\uFEFFa: b\nc: d\n".encode(encoding_name))

    assert_equal ["a", "b", "c", "d"], scalars(catcher), encoding_name
  end
end

# Same multi-line mapping with a leading BOM, in both UTF-32 byte orders.
# A fresh parser and handler are used per encoding so events do not mix.
def test_bom_multiline_utf32
  ["UTF-32LE", "UTF-32BE"].each do |encoding_name|
    catcher = EventCatcher.new
    parser = Psych::Parser.new(catcher)
    parser.parse("\uFEFFa: b\nc: d\n".encode(encoding_name))

    assert_equal ["a", "b", "c", "d"], scalars(catcher), encoding_name
  end
end

# The BOM must also be skipped when the document arrives through an IO-like
# object instead of a String.
def test_bom_multiline_io
  stream = StringIO.new("\uFEFFa: b\nc: d\n")
  @parser.parse(stream)

  assert_equal ["a", "b", "c", "d"], scalars(@parser.handler)
end

# A document consisting of nothing but a BOM must produce no scalar events.
def test_bom_only
  lone_bom = "\uFEFF"
  @parser.parse(lone_bom)

  assert_equal [], scalars(@parser.handler)
end

# A stream without a BOM must still yield every scalar. The backing string
# is frozen, so the StringIO cannot be written to while it is parsed.
def test_io_without_bom_is_not_modified
  frozen_source = "a: b\nc: d\n".freeze
  @parser.parse(StringIO.new(frozen_source))

  assert_equal ["a", "b", "c", "d"], scalars(@parser.handler)
end

def test_external_encoding
tadpole = 'おたまじゃくし'

Expand Down Expand Up @@ -445,6 +484,10 @@ def test_code_point_limit
end
end

# Collects, in call order, the first argument of every :scalar event that
# +handler+ recorded in its +calls+ list of [event, args] pairs.
def scalars handler
  handler.calls.each_with_object([]) do |(event, args), values|
    values << args.first if event == :scalar
  end
end

def assert_called call, with = nil, parser = @parser
if with
call = parser.handler.calls.find { |x|
Expand Down
10 changes: 10 additions & 0 deletions test/psych/test_psych.rb
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,16 @@ def test_parse_stream
assert_equal(%w[foo bar], docs.children.map(&:transform))
end

# Psych.load must return both keys of a mapping that starts with a BOM.
# https://github.com/ruby/psych/issues/331
def test_load_with_leading_bom
  loaded = Psych.load("\uFEFFa: b\nc: d")

  assert_equal({ "a" => "b", "c" => "d" }, loaded)
end

# Psych.parse_stream must produce one document holding the whole mapping
# when the input starts with a BOM.
def test_parse_stream_with_leading_bom
  stream = Psych.parse_stream("\uFEFFa: b\nc: d")
  documents = stream.children.map { |document| document.to_ruby }

  assert_equal [{ "a" => "b", "c" => "d" }], documents
end

def test_parse_stream_with_block
docs = []
Psych.parse_stream("--- foo\n...\n--- bar\n...") do |node|
Expand Down
Loading