Ruby 2.4参考手册
编码 | Encoding
Encoding::Converter
Parent:Data
常量
AFTER_OUTPUT
AFTER_OUTPUT
在某些输出完成但在所有输入消耗完之前停止转换。有关示例,请参阅#primitive_convert。
CRLF_NEWLINE_DECORATOR
CRLF_NEWLINE_DECORATOR
用于将LF转换为CRLF的装饰器
CR_NEWLINE_DECORATOR
CR_NEWLINE_DECORATOR
用于将LF转换为CR的装饰器
INVALID_MASK
INVALID_MASK
用于无效字节序列的掩码
INVALID_REPLACE
INVALID_REPLACE
替换无效的字节序列
PARTIAL_INPUT
PARTIAL_INPUT
指示:源可能是较大字符串的一部分。有关示例,请参阅#primitive_convert。
UNDEF_HEX_CHARREF
UNDEF_HEX_CHARREF
将目标编码中未定义的字节序列替换为XML十六进制字符引用。这对XML转换有效。
UNDEF_MASK
UNDEF_MASK
用于"字符在源编码中有效、但在目标编码中没有对应字符"这一情形的掩码。
UNDEF_REPLACE
UNDEF_REPLACE
替换目标编码中未定义的字节序列。
UNIVERSAL_NEWLINE_DECORATOR
UNIVERSAL_NEWLINE_DECORATOR
用于将CRLF和CR转换为LF的装饰器
XML_ATTR_CONTENT_DECORATOR
XML_ATTR_CONTENT_DECORATOR
转义为XML AttValue
XML_ATTR_QUOTE_DECORATOR
XML_ATTR_QUOTE_DECORATOR
转义为XML AttValue
XML_TEXT_DECORATOR
XML_TEXT_DECORATOR
转义为XML CharData
公共类方法
Encoding::Converter.asciicompat_encoding(string) → encoding or nil Show source
Encoding::Converter.asciicompat_encoding(encoding) → encoding or nil
返回相应的ASCII兼容编码。
如果参数本身就是ASCII兼容编码,则返回nil。
“对应的ASCII兼容编码”是ASCII兼容编码,其可以表示与给定的ASCII不兼容编码完全相同的字符。所以,在两种编码之间转换时不会发生未定义的转换错误。
Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP>
Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8>
Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
/*
 * Encoding::Converter.asciicompat_encoding(arg) -> encoding or nil
 *
 * Resolves arg through enc_arg(), asks rb_econv_asciicompat_encoding() for
 * the name of the corresponding ASCII-compatible encoding, and returns that
 * encoding as an Encoding object.  Returns nil when no name is reported.
 */
static VALUE
econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
{
    const char *src_name;
    const char *compat_name;
    rb_encoding *src_enc; /* filled in by enc_arg(); not read afterwards */

    enc_arg(&arg, &src_name, &src_enc);

    compat_name = rb_econv_asciicompat_encoding(src_name);
    if (!compat_name) {
        return Qnil;
    }
    return rb_enc_from_encoding(make_encoding(compat_name));
}
Encoding::Converter.new(source_encoding, destination_encoding) Show source
Encoding::Converter.new(source_encoding, destination_encoding, opt)
Encoding::Converter.new(convpath)
可能的选项元素:
hash form:
:invalid => nil # raise error on invalid byte sequence (default)
:invalid => :replace # replace invalid byte sequence
:undef => nil # raise error on undefined conversion (default)
:undef => :replace # replace undefined conversion
:replace => string # replacement string ("?" or "\uFFFD" if not specified)
:newline => :universal # decorator for converting CRLF and CR to LF
:newline => :crlf # decorator for converting LF to CRLF
:newline => :cr # decorator for converting LF to CR
:universal_newline => true # decorator for converting CRLF and CR to LF
:crlf_newline => true # decorator for converting LF to CRLF
:cr_newline => true # decorator for converting LF to CR
:xml => :text # escape as XML CharData.
:xml => :attr # escape as XML AttValue
integer form:
Encoding::Converter::INVALID_REPLACE
Encoding::Converter::UNDEF_REPLACE
Encoding::Converter::UNDEF_HEX_CHARREF
Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR
Encoding::Converter::CRLF_NEWLINE_DECORATOR
Encoding::Converter::CR_NEWLINE_DECORATOR
Encoding::Converter::XML_TEXT_DECORATOR
Encoding::Converter::XML_ATTR_CONTENT_DECORATOR
Encoding::Converter::XML_ATTR_QUOTE_DECORATOR
::new creates an instance of Encoding::Converter.
source_encoding和destination_encoding应该是字符串或Encoding对象。
opt应该是nil、散列或整数。
convpath应该是一个数组。它可能包含:
- 包含编码或编码名称的两元素数组,或是:
- 表示装饰器名称的字符串。
::new可以带一个可选的选项参数,该参数应为散列或整数。散列形式的选项可以包含:invalid => nil等;整数形式的选项应为Encoding::Converter::INVALID_REPLACE等常量的按位或(逻辑或)。
:invalid => nil
在无效字节序列上引发错误,这会是一个默认行为。
:invalid => :replace
用替换字符串替换无效字节序列。
:undef => nil
如果#source_encoding中的字符未在destination_encoding中定义,则引发错误。这是一个默认行为。
:undef => :replace
用替换字符串替换#destination_encoding中的未定义字符。
:replace => string
指定替换字符串。如果未指定,则对Unicode编码使用"\uFFFD",对其他编码使用"?"。
:universal_newline => true
将CRLF和CR转换为LF。
:crlf_newline => true
将LF转换为CRLF。
:cr_newline => true
将LF转换为CR。
:xml => :text
作为XML CharData转义。此表单可以用作HTML 4.0 #PCDATA。
- '&' -> '&amp;'
- '<' -> '&lt;'
- '>' -> '&gt;'
- #destination_encoding中的未定义字符 -> 十六进制CharRef,例如&#xHH;
:xml => :attr
转义为XML AttValue。转换后的结果会用双引号括起来,形如"..."。此形式可以用作HTML 4.0属性值。
- '&' -> '&amp;'
- '<' -> '&lt;'
- '>' -> '&gt;'
- '"' -> '&quot;'
- #destination_encoding中的未定义字符 -> 十六进制CharRef,例如&#xHH;
例如:
# UTF-16BE to UTF-8
ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
# Usually, decorators such as newline conversion are inserted last.
ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true)
p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>],
# "universal_newline"]
# But, if the last encoding is ASCII incompatible,
# decorators are inserted before the last conversion.
ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true)
p ec.convpath #=> ["crlf_newline",
# [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
# Conversion path can be specified directly.
ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]])
p ec.convpath #=> ["universal_newline",
# [#<Encoding:EUC-JP>, #<Encoding:UTF-8>],
# [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
Encoding::Converter.search_convpath(source_encoding, destination_encoding) → ary
Encoding::Converter.search_convpath(source_encoding, destination_encoding, opt) → ary
返回转换路径。
p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP")
#=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
# [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]]
p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true)
or
p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal)
#=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
# [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
# "universal_newline"]
p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true)
or
p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal)
#=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
# "universal_newline",
# [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]]
公共实例方法
ec == other → true or false
/*
 * ec == other -> true or false (nil when other is not a converter)
 *
 * Two converters compare equal when their source and destination encoding
 * names, flags, replacement encoding name, replacement bytes and the
 * transcoder used at every step all match.
 *
 * When other is not an Encoding::Converter the function returns nil rather
 * than false; that is kept as-is so existing callers see no change.
 */
static VALUE
econv_equal(VALUE self, VALUE other)
{
    rb_econv_t *ec1 = check_econv(self);
    rb_econv_t *ec2;
    int i;

    if (!rb_typeddata_is_kind_of(other, &econv_data_type)) {
        return Qnil;
    }
    ec2 = DATA_PTR(other);
    if (!ec2) return Qfalse;

    /* Pointer equality is a shortcut; fall back to comparing the names. */
    if (ec1->source_encoding_name != ec2->source_encoding_name &&
        strcmp(ec1->source_encoding_name, ec2->source_encoding_name) != 0)
        return Qfalse;
    if (ec1->destination_encoding_name != ec2->destination_encoding_name &&
        strcmp(ec1->destination_encoding_name, ec2->destination_encoding_name) != 0)
        return Qfalse;

    if (ec1->flags != ec2->flags) return Qfalse;

    /*
     * NOTE(review): the replacement fields look like they are filled in
     * lazily (econv_get_replacement calls make_replacement() before reading
     * them), so one side may still be NULL -- confirm.  strcmp()/memcmp()
     * on a NULL pointer is undefined behaviour, so a NULL on exactly one
     * side is treated as "not equal" instead of being dereferenced.
     */
    if (ec1->replacement_enc != ec2->replacement_enc) {
        if (!ec1->replacement_enc || !ec2->replacement_enc)
            return Qfalse;
        if (strcmp(ec1->replacement_enc, ec2->replacement_enc) != 0)
            return Qfalse;
    }
    if (ec1->replacement_len != ec2->replacement_len) return Qfalse;
    /* Lengths are equal here; with zero bytes there is nothing to compare. */
    if (ec1->replacement_str != ec2->replacement_str &&
        ec1->replacement_len > 0) {
        if (!ec1->replacement_str || !ec2->replacement_str)
            return Qfalse;
        if (memcmp(ec1->replacement_str, ec2->replacement_str, ec2->replacement_len) != 0)
            return Qfalse;
    }

    if (ec1->num_trans != ec2->num_trans) return Qfalse;
    for (i = 0; i < ec1->num_trans; i++) {
        if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder)
            return Qfalse;
    }
    return Qtrue;
}
convert(source_string) → destination_string
转换source_string并返回destination_string。
source_string被假定为源的一部分。也就是说,内部会指定:partial_input => true。最后应调用finish方法。
ec = Encoding::Converter.new("utf-8", "euc-jp")
puts ec.convert("\u3042").dump #=> "\xA4\xA2"
puts ec.finish.dump #=> ""
ec = Encoding::Converter.new("euc-jp", "utf-8")
puts ec.convert("\xA4").dump #=> ""
puts ec.convert("\xA2").dump #=> "\xE3\x81\x82"
puts ec.finish.dump #=> ""
ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP")
puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP")
puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP")
puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP")
如果发生转换错误,则会引发Encoding :: UndefinedConversionError或Encoding :: InvalidByteSequenceError。#convert不提供从这些异常中恢复或重新启动的方法。当您想要处理这些转换错误时,请使用#primitive_convert。
convpath → ary
返回ec的转换路径。
结果是一系列转换。
ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true)
p ec.convpath
#=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
# [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
# "crlf_newline"]
数组中的每个元素都是一对编码或一个字符串。一对意味着一个编码转换。字符串表示装饰器。
在上例中,[#<Encoding:ISO-8859-1>,#<Encoding:UTF-8>]表示从ISO-8859-1到UTF-8的转换器。“crlf_newline”表示从LF到CRLF的换行符转换器。
/*
 * ec.convpath -> ary
 *
 * Builds an array with one entry per transcoding step of the converter:
 * a decorator step (as decided by DECORATOR_P) becomes a string holding the
 * step's dst_encoding, every other step becomes a two-element array of
 * encoding objects [source, destination].
 */
static VALUE
econv_convpath(VALUE self)
{
    rb_econv_t *ec = check_econv(self);
    VALUE path = rb_ary_new();
    int step;

    for (step = 0; step < ec->num_trans; step++) {
        const rb_transcoder *xcoder = ec->elems[step].tc->transcoder;
        VALUE entry;

        if (DECORATOR_P(xcoder->src_encoding, xcoder->dst_encoding)) {
            entry = rb_str_new_cstr(xcoder->dst_encoding);
        }
        else {
            entry = rb_assoc_new(make_encobj(xcoder->src_encoding), make_encobj(xcoder->dst_encoding));
        }
        rb_ary_push(path, entry);
    }

    return path;
}
destination_encoding → encoding
以编码对象的形式返回目标编码。
/*
 * ec.destination_encoding -> encoding
 *
 * Returns the converter's destination encoding as an Encoding object, or
 * nil when the converter has no destination encoding recorded.
 */
static VALUE
econv_destination_encoding(VALUE self)
{
    rb_econv_t *ec = check_econv(self);

    return ec->destination_encoding
        ? rb_enc_from_encoding(ec->destination_encoding)
        : Qnil;
}
finish → string
完成转换器。它返回转换字符串的最后一部分。
ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
p ec.convert("\u3042") #=> "\e$B$\""
p ec.finish #=> "\e(B"
/*
 * ec.finish -> string
 *
 * Runs one final primitive_convert with no input (nil source buffer, nil
 * offset and size, flags 0) into a fresh empty string and returns that
 * string.  A conversion error result is turned into the matching exception;
 * any result other than :finished is reported as an interpreter bug.
 */
static VALUE
econv_finish(VALUE self)
{
    rb_econv_t *ec = check_econv(self);
    VALUE args[5];
    VALUE out, status;

    out = rb_str_new(NULL, 0);

    args[0] = Qnil;         /* source buffer */
    args[1] = out;          /* destination buffer */
    args[2] = Qnil;         /* destination byte offset */
    args[3] = Qnil;         /* destination byte size */
    args[4] = INT2FIX(0);   /* flags */

    status = econv_primitive_convert(5, args, self);

    if (status != sym_finished) {
        if (status == sym_invalid_byte_sequence ||
            status == sym_undefined_conversion ||
            status == sym_incomplete_input) {
            rb_exc_raise(make_econv_exception(ec));
        }
        rb_bug("unexpected result of econv_primitive_convert");
    }

    return out;
}
insert_output(string) → nil
将字符串插入编码转换器。该字符串将转换为目标编码并在稍后的转换中输出。
如果目标编码是有状态的,则根据状态转换字符串并更新状态。
只有在发生转换错误时才应使用此方法。
ec = Encoding::Converter.new("utf-8", "iso-8859-1")
src = "HIRAGANA LETTER A is \u{3042}."
dst = ""
p ec.primitive_convert(src, dst) #=> :undefined_conversion
puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."]
ec.insert_output("<err>")
p ec.primitive_convert(src, dst) #=> :finished
puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is <err>.", ""]
ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp
dst = ""
p ec.primitive_convert(src, dst) #=> :undefined_conversion
puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"]
ec.insert_output "?" # state change required to output "?".
p ec.primitive_convert(src, dst) #=> :finished
puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""]
/*
 * ec.insert_output(string) -> nil
 *
 * Re-encodes string into the encoding reported by
 * rb_econv_encoding_to_insert_output() and hands the resulting bytes to
 * rb_econv_insert_output().  Raises ArgumentError when that call
 * returns -1.
 */
static VALUE
econv_insert_output(VALUE self, VALUE string)
{
    rb_econv_t *ec = check_econv(self);
    const char *target_name;
    VALUE target_enc;
    int status;

    StringValue(string);

    target_name = rb_econv_encoding_to_insert_output(ec);
    target_enc = rb_enc_from_encoding(rb_enc_find(target_name));
    string = rb_str_encode(string, target_enc, 0, Qnil);

    status = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), target_name);
    if (status == -1) {
        rb_raise(rb_eArgError, "too big string");
    }

    return Qnil;
}
inspect → string
返回ec的可打印版本
ec = Encoding::Converter.new("iso-8859-1", "utf-8")
puts ec.inspect #=> #<Encoding::Converter: ISO-8859-1 to UTF-8>
/*
 * ec.inspect -> string
 *
 * Returns "#<ClassName: DESCRIPTION>", where DESCRIPTION is appended by
 * econv_description() from the source name, destination name and flags.
 * When the object carries no converter data the result is
 * "#<ClassName: uninitialized>".
 */
static VALUE
econv_inspect(VALUE self)
{
    const char *cname = rb_obj_classname(self);
    rb_econv_t *ec;
    VALUE text;

    TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
    if (!ec) {
        return rb_sprintf("#<%s: uninitialized>", cname);
    }

    text = rb_sprintf("#<%s: ", cname);
    econv_description(ec->source_encoding_name, ec->destination_encoding_name, ec->flags, text);
    rb_str_cat2(text, ">");
    return text;
}
last_error → exception or nil
返回上次转换的异常对象。如果最后一次转换没有产生错误,则返回nil。
这里的"错误",对#convert而言是指Encoding::InvalidByteSequenceError和Encoding::UndefinedConversionError;对#primitive_convert而言是指:invalid_byte_sequence、:incomplete_input和:undefined_conversion。
ec = Encoding::Converter.new("utf-8", "iso-8859-1")
p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence
p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8>
p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full
p ec.last_error #=> nil
/*
 * ec.last_error -> exception or nil
 *
 * Returns whatever make_econv_exception() builds for the converter: an
 * exception object, or nil when it produces none.
 */
static VALUE
econv_last_error(VALUE self)
{
    rb_econv_t *ec = check_econv(self);

    /* A nil result is passed through unchanged. */
    return make_econv_exception(ec);
}
primitive_convert(source_buffer, destination_buffer) → symbol
primitive_convert(source_buffer, destination_buffer, destination_byteoffset) → symbol
primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) → symbol
primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize, opt) → symbol
可能的选择元素:
hash form:
:partial_input => true # source buffer may be part of larger source
:after_output => true # stop conversion after output before input
integer form:
Encoding::Converter::PARTIAL_INPUT
Encoding::Converter::AFTER_OUTPUT
可能的结果:
:invalid_byte_sequence
:incomplete_input
:undefined_conversion
:after_output
:destination_buffer_full
:source_buffer_empty
:finished
#primitive_convert 将source_buffer转换为destination_buffer。
source_buffer应该是一个字符串或nil。nil表示一个空字符串。
destination_buffer应该是一个字符串。
destination_byteoffset应该是一个整数或nil。nil表示destination_buffer的末尾。如果省略,则假定为nil。
destination_bytesize应该是一个整数或nil。nil表示不限制大小。如果省略,则假定为nil。
opt应该是nil、散列或整数。nil表示没有标志。如果省略,则假定为nil。
#primitive_convert 将source_buffer的内容从开始转换并存储到destination_buffer中。
destination_byteoffset和destination_bytesize指定转换结果存储的区域。destination_byteoffset以字节为单位指定destination_buffer中的起始位置。如果destination_byteoffset为nil,则使用destination_buffer.bytesize,即把结果追加到末尾。destination_bytesize指定最大字节数。如果destination_bytesize为nil,则目标大小不受限制。转换后,destination_buffer的大小被调整为destination_byteoffset + 实际产生的字节数。另外destination_buffer的编码被设置为destination_encoding。
#primitive_convert 删除source_buffer的转换部分。丢弃的部分在destination_buffer中转换或在Encoding :: Converter对象中缓冲。
满足下列条件之一时,#primitive_convert停止转换。
- 在源缓冲区中发现无效的字节序列(:invalid_byte_sequence)。primitive_errinfo和last_error方法会返回错误的详细信息。
- 源缓冲区意外结束(:incomplete_input)。只有在未指定:partial_input时才会发生。primitive_errinfo和last_error方法会返回错误的详细信息。
- 字符无法在输出编码中表示(:undefined_conversion)。primitive_errinfo和last_error方法会返回错误的详细信息。
- 已生成部分输出、但输入尚未处理完(:after_output)。只有在指定:after_output时才会发生。
- 目标缓冲区已满(:destination_buffer_full)。只有在destination_bytesize不为nil时才会发生。
- 源缓冲区为空(:source_buffer_empty)。仅当指定:partial_input时才会发生。
- 转换完成(:finished)。
例:
ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
ret = ec.primitive_convert(src="pi", dst="", nil, 100)
p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"]
ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
ret = ec.primitive_convert(src="pi", dst="", nil, 1)
p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"]
ret = ec.primitive_convert(src, dst="", nil, 1)
p [ret, src, dst] #=> [:destination_buffer_full, "", "p"]
ret = ec.primitive_convert(src, dst="", nil, 1)
p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"]
ret = ec.primitive_convert(src, dst="", nil, 1)
p [ret, src, dst] #=> [:finished, "", "i"]
/*
 * ec.primitive_convert(source_buffer, destination_buffer
 *                      [, destination_byteoffset [, destination_bytesize [, opt]]]) -> symbol
 *
 * Converts bytes from source_buffer into destination_buffer with
 * rb_econv_convert() and returns the conversion result as a symbol.
 * Consumed bytes are dropped from source_buffer; destination_buffer is
 * truncated to the bytes actually produced and, when the converter has a
 * destination encoding, tagged with it.
 *
 * Raises ArgumentError for a negative offset or size, an offset past the end
 * of the destination, an offset+size that overflows, or a result that would
 * grow beyond LONG_MAX / 2 while the size is unlimited.
 */
static VALUE
econv_primitive_convert(int argc, VALUE *argv, VALUE self)
{
VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
rb_econv_t *ec = check_econv(self);
rb_econv_result_t res;
const unsigned char *ip, *is;
unsigned char *op, *os;
long output_byteoffset, output_bytesize;
unsigned long output_byteend;
int flags;
/* "23:" = 2 required arguments, up to 3 optional ones, plus an optional trailing option hash. */
argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);
/* A nil offset/size gets a placeholder here; the real value is chosen further down. */
if (NIL_P(output_byteoffset_v))
output_byteoffset = 0; /* dummy */
else
output_byteoffset = NUM2LONG(output_byteoffset_v);
if (NIL_P(output_bytesize_v))
output_bytesize = 0; /* dummy */
else
output_bytesize = NUM2LONG(output_bytesize_v);
/* Flags come either as an integer (5th positional argument) or as an option hash, never both. */
if (!NIL_P(flags_v)) {
if (!NIL_P(opt)) {
rb_error_arity(argc + 1, 2, 5);
}
flags = NUM2INT(rb_to_int(flags_v));
}
else if (!NIL_P(opt)) {
VALUE v;
flags = 0;
v = rb_hash_aref(opt, sym_partial_input);
if (RTEST(v))
flags |= ECONV_PARTIAL_INPUT;
v = rb_hash_aref(opt, sym_after_output);
if (RTEST(v))
flags |= ECONV_AFTER_OUTPUT;
}
else {
flags = 0;
}
StringValue(output);
if (!NIL_P(input))
StringValue(input);
rb_str_modify(output);
/* Unlimited size: start from RSTRING_EMBED_LEN_MAX, or the input length if that is larger. */
if (NIL_P(output_bytesize_v)) {
output_bytesize = RSTRING_EMBED_LEN_MAX;
if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
output_bytesize = RSTRING_LEN(input);
}
/* Re-entered with a doubled output_bytesize when an unlimited destination fills up. */
retry:
/* A nil offset means "append": write at the current end of the destination. */
if (NIL_P(output_byteoffset_v))
output_byteoffset = RSTRING_LEN(output);
if (output_byteoffset < 0)
rb_raise(rb_eArgError, "negative output_byteoffset");
if (RSTRING_LEN(output) < output_byteoffset)
rb_raise(rb_eArgError, "output_byteoffset too big");
if (output_bytesize < 0)
rb_raise(rb_eArgError, "negative output_bytesize");
/* Add in unsigned arithmetic so wrap-around and values above LONG_MAX can be detected. */
output_byteend = (unsigned long)output_byteoffset +
(unsigned long)output_bytesize;
if (output_byteend < (unsigned long)output_byteoffset ||
LONG_MAX < output_byteend)
rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");
/* Grow the destination when its capacity is below the end of the write window. */
if (rb_str_capacity(output) < output_byteend)
rb_str_resize(output, output_byteend);
/* ip/is delimit the input bytes (both NULL for nil input); op/os delimit the output window. */
if (NIL_P(input)) {
ip = is = NULL;
}
else {
ip = (const unsigned char *)RSTRING_PTR(input);
is = ip + RSTRING_LEN(input);
}
op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
os = op + output_bytesize;
res = rb_econv_convert(ec, &ip, is, &op, os, flags);
/* op now points past the last byte written; shrink the destination to that length. */
rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
if (!NIL_P(input)) {
OBJ_INFECT_RAW(output, input);
/* Remove the bytes the converter consumed from the front of the input. */
rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
}
/* Unlimited size and the window filled up: double it and continue, appending at the new end. */
if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
if (LONG_MAX / 2 < output_bytesize)
rb_raise(rb_eArgError, "too long conversion result");
output_bytesize *= 2;
output_byteoffset_v = Qnil;
goto retry;
}
if (ec->destination_encoding) {
rb_enc_associate(output, ec->destination_encoding);
}
return econv_result_to_symbol(res);
}
primitive_errinfo → array Show source
#primitive_errinfo将有关上次错误的重要信息作为5元素数组返回:
[result, enc1, enc2, error_bytes, readagain_bytes]
结果是primitive_convert的最后结果。
其他元素仅在结果为:invalid_byte_sequence、:incomplete_input或:undefined_conversion时才有意义。
enc1和enc2将转换步骤表示为一对字符串。例如,从EUC-JP到ISO-8859-1的转换器按如下方式转换字符串:EUC-JP -> UTF-8 -> ISO-8859-1。因此[enc1, enc2]是["EUC-JP", "UTF-8"]或["UTF-8", "ISO-8859-1"]。
error_bytes和readagain_bytes指示导致错误的字节序列。error_bytes是丢弃的部分。readagain_bytes是缓冲部分,在下次转换时会再次读取。
Example:
# \xff is invalid as EUC-JP.
ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
ec.primitive_convert(src="\xff", dst="", nil, 10)
p ec.primitive_errinfo
#=> [:invalid_byte_sequence, "EUC-JP", "UTF-8", "\xFF", ""]
# HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1.
# Since this error is occur in UTF-8 to ISO-8859-1 conversion,
# error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82).
ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
p ec.primitive_errinfo
#=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""]
# partial character is invalid
ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
ec.primitive_convert(src="\xa4", dst="", nil, 10)
p ec.primitive_errinfo
#=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""]
# Encoding::Converter::PARTIAL_INPUT prevents invalid errors by
# partial characters.
ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
p ec.primitive_errinfo
#=> [:source_buffer_empty, nil, nil, nil, nil]
# \xd8\x00\x00@ is invalid as UTF-16BE because
# no low surrogate after high surrogate (\xd8\x00).
# It is detected by 3rd byte (\00) which is part of next character.
# So the high surrogate (\xd8\x00) is discarded and
# the 3rd byte is read again later.
# Since the byte is buffered in ec, it is dropped from src.
ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
p ec.primitive_errinfo
#=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"]
p src
#=> "@"
# Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE.
# The problem is detected by 4th byte.
ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
p ec.primitive_errinfo
#=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"]
p src
#=> ""
/*
 * ec.primitive_errinfo -> array
 *
 * Returns the five-element array
 *   [result, enc1, enc2, error_bytes, readagain_bytes]
 * built from the converter's last_error record.  Slots whose source field
 * is unset are left as nil.
 */
static VALUE
econv_primitive_errinfo(VALUE self)
{
    rb_econv_t *ec = check_econv(self);
    VALUE info = rb_ary_new2(5);

    rb_ary_store(info, 0, econv_result_to_symbol(ec->last_error.result));
    /* Store the last slot up front so the array always has five elements. */
    rb_ary_store(info, 4, Qnil);

    if (ec->last_error.source_encoding) {
        rb_ary_store(info, 1, rb_str_new2(ec->last_error.source_encoding));
    }

    if (ec->last_error.destination_encoding) {
        rb_ary_store(info, 2, rb_str_new2(ec->last_error.destination_encoding));
    }

    if (ec->last_error.error_bytes_start) {
        /* The read-again bytes follow the error bytes in the same buffer. */
        const char *bytes = (const char *)ec->last_error.error_bytes_start;

        rb_ary_store(info, 3, rb_str_new(bytes, ec->last_error.error_bytes_len));
        rb_ary_store(info, 4, rb_str_new(bytes + ec->last_error.error_bytes_len, ec->last_error.readagain_len));
    }

    return info;
}
putback(p1 = v1) Show source
call-seq
ec.putback -> string
ec.putback(max_numbytes) -> string
放回将被转换的字节。
这些字节是由invalid_byte_sequence错误引起的。当invalid_byte_sequence错误时,一些字节被丢弃,一些字节被缓冲以后转换。后面的字节可以放回去。可以通过Encoding :: InvalidByteSequenceError#readagain_bytes和#primitive_errinfo来观察。
ec = Encoding::Converter.new("utf-16le", "iso-8859-1")
src = "\x00\xd8\x61\x00"
dst = ""
p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence
p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"]
p ec.putback #=> "a\x00"
p ec.putback #=> "" # no more bytes to put back
/*
 * ec.putback                -> string
 * ec.putback(max_numbytes)  -> string
 *
 * Takes back bytes from the converter via rb_econv_putback() and returns
 * them as a new string.  Without an argument, the count reported by
 * rb_econv_putbackable() is used; with one, the count is the smaller of
 * max_numbytes and that value.  The result is tagged with the converter's
 * source encoding when one is set.
 */
static VALUE
econv_putback(int argc, VALUE *argv, VALUE self)
{
    rb_econv_t *ec = check_econv(self);
    VALUE limit, bytes;
    int count;

    rb_scan_args(argc, argv, "01", &limit);

    if (!NIL_P(limit)) {
        int available;

        count = NUM2INT(limit);
        available = rb_econv_putbackable(ec);
        if (available < count) {
            count = available;
        }
    }
    else {
        count = rb_econv_putbackable(ec);
    }

    bytes = rb_str_new(NULL, count);
    rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(bytes), count);

    if (ec->source_encoding) {
        rb_enc_associate(bytes, ec->source_encoding);
    }

    return bytes;
}
replacement → string Show source
返回替换字符串。
ec = Encoding::Converter.new("euc-jp", "us-ascii")
p ec.replacement #=> "?"
ec = Encoding::Converter.new("euc-jp", "utf-8")
p ec.replacement #=> "\uFFFD"
/*
 * ec.replacement -> string
 *
 * Calls make_replacement() on the converter, then returns its replacement
 * bytes as a new string in the replacement encoding.  Raises
 * Encoding::UndefinedConversionError when make_replacement() returns -1.
 */
static VALUE
econv_get_replacement(VALUE self)
{
    rb_econv_t *ec = check_econv(self);
    rb_encoding *repl_enc;

    if (make_replacement(ec) == -1) {
        rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
    }

    repl_enc = rb_enc_find(ec->replacement_enc);
    return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, repl_enc);
}
replacement = string显示源文件
设置替换字符串。
ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace)
ec.replacement = "<undef>"
p ec.convert("a \u3042 b") #=> "a <undef> b"
/*
 * ec.replacement = string
 *
 * Passes the bytes of string, together with the name of its encoding, to
 * rb_econv_set_replacement().  Raises Encoding::UndefinedConversionError
 * when that call returns -1.  Returns the argument exactly as it was
 * passed in.
 */
static VALUE
econv_set_replacement(VALUE self, VALUE arg)
{
    rb_econv_t *ec = check_econv(self);
    VALUE repl = arg;
    rb_encoding *repl_enc;
    int status;

    StringValue(repl);
    repl_enc = rb_enc_get(repl);

    status = rb_econv_set_replacement(ec,
                                      (const unsigned char *)RSTRING_PTR(repl),
                                      RSTRING_LEN(repl),
                                      rb_enc_name(repl_enc));
    if (status == -1) {
        /* xxx: rb_eInvalidByteSequenceError? */
        rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
    }

    return arg;
}
source_encoding→encoding 显示源文件
以编码对象的形式返回源编码。
/*
 * ec.source_encoding -> encoding
 *
 * Returns the converter's source encoding as an Encoding object, or nil
 * when the converter has no source encoding recorded.
 */
static VALUE
econv_source_encoding(VALUE self)
{
    rb_econv_t *ec = check_econv(self);

    return ec->source_encoding
        ? rb_enc_from_encoding(ec->source_encoding)
        : Qnil;
}
编码 | Encoding相关
Ruby 是一种面向对象、命令式、函数式、动态的通用编程语言,是世界上最优美而巧妙的语言。
主页 | https://www.ruby-lang.org/ |
源码 | https://github.com/ruby/ruby |
版本 | 2.4 |
发布版本 | 2.4.1 |