From 2bbb245edaf80179f4673bf4bf75978a10654c64 Mon Sep 17 00:00:00 2001 From: "Thomas E. Enebo" Date: Tue, 5 Sep 2017 16:30:31 -0500 Subject: Parse.load with an IO/File whose encoding is not one YAML allows will just set the encoding to UTF-8 and hope for the best. This appears to be how libyaml works. This issue was noticed in yaml/store because it extends pstore, which creates an IO in read-only mode as: ```ruby RD_ACCESS = {mode: IO::RDONLY | IO::BINARY, encoding: Encoding::ASCII_8BIT} ``` The data in the test case I was debugging was in fact UTF-8 data, and MRI was happy to take this 8-bit IO and pretend it is UTF-8. Form-fitting ftw. --- ext/java/PsychParser.java | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'ext') diff --git a/ext/java/PsychParser.java b/ext/java/PsychParser.java index b3e747e..f5b6faf 100644 --- a/ext/java/PsychParser.java +++ b/ext/java/PsychParser.java @@ -33,6 +33,8 @@ import java.nio.charset.Charset; import java.util.Map; import org.jcodings.Encoding; +import org.jcodings.specific.UTF16BEEncoding; +import org.jcodings.specific.UTF16LEEncoding; import org.jcodings.specific.UTF8Encoding; import org.jcodings.unicode.UnicodeEncoding; import org.jruby.Ruby; @@ -162,6 +164,11 @@ public class PsychParser extends RubyObject { if (yaml instanceof RubyIO) { Encoding enc = ((RubyIO) yaml).getReadEncoding(); charset = enc.getCharset(); + + // libyaml treats non-utf encodings as utf-8 and hopes for the best. + if (!(enc instanceof UTF8Encoding) && !(enc instanceof UTF16LEEncoding) && !(enc instanceof UTF16BEEncoding)) { + charset = UTF8Encoding.INSTANCE.getCharset(); + } } if (charset == null) { // If we can't get it from the IO or it doesn't have a charset, fall back on UTF-8 -- cgit v1.2.3