[ruby-dev:47241] [ruby-trunk - Feature #6752] Replacing ill-formed subsequencce
From:
"naruse (Yui NARUSE)" <naruse@...>
Date:
2013-04-09 09:24:45 UTC
List:
ruby-dev #47241
Issue #6752 has been updated by naruse (Yui NARUSE).
I wrote a updated patch which include String#scrub and String#encode with extension.
String#scrub allows replacement as both argument or block.
diff --git a/string.c b/string.c
index 8b85739..7131ac5 100644
--- a/string.c
+++ b/string.c
@@ -7741,6 +7741,272 @@ rb_str_ellipsize(VALUE str, long len)
return ret;
}
+static VALUE
+str_compat_and_valid(VALUE str, rb_encoding *enc)
+{
+ int cr;
+ str = StringValue(str);
+ cr = rb_enc_str_coderange(str);
+ if (cr == ENC_CODERANGE_BROKEN) {
+ rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
+ }
+ else if (cr == ENC_CODERANGE_7BIT) {
+ rb_encoding *e = STR_ENC_GET(str);
+ if (!rb_enc_asciicompat(enc)) {
+ rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
+ rb_enc_name(enc), rb_enc_name(e));
+ }
+ }
+ else { /* ENC_CODERANGE_VALID */
+ rb_encoding *e = STR_ENC_GET(str);
+ if (enc != e) {
+ rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
+ rb_enc_name(enc), rb_enc_name(e));
+ }
+ }
+ return str;
+}
+
+/*
+ * call-seq:
+ * str.scrub -> new_str
+ * str.scrub(repl) -> new_str
+ * str.scrub{|bytes|} -> new_str
+ *
+ * If the string is invalid byte sequence then replace invalid bytes with given replacement
+ * character, else returns self.
+ */
+VALUE
+rb_str_scrub(int argc, VALUE *argv, VALUE str)
+{
+ int cr = ENC_CODERANGE(str);
+ rb_encoding *enc;
+ VALUE repl;
+
+ if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID)
+ return rb_str_dup(str);
+
+ enc = STR_ENC_GET(str);
+ rb_scan_args(argc, argv, "01", &repl);
+ if (argc != 0) {
+ repl = str_compat_and_valid(repl, enc);
+ }
+
+ if (rb_enc_dummy_p(enc)) {
+ return rb_str_dup(str);
+ }
+
+ if (rb_enc_asciicompat(enc)) {
+ const char *p = RSTRING_PTR(str);
+ const char *e = RSTRING_END(str);
+ const char *p1 = p;
+ const char *rep;
+ long replen;
+ int rep7bit_p;
+ VALUE buf = rb_str_buf_new(RSTRING_LEN(str));
+ if (rb_block_given_p()) {
+ rep = NULL;
+ }
+ else if (!NIL_P(repl)) {
+ rep = RSTRING_PTR(repl);
+ replen = RSTRING_LEN(repl);
+ rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
+ }
+ else if (enc == rb_utf8_encoding()) {
+ rep = "\xEF\xBF\xBD";
+ replen = strlen(rep);
+ rep7bit_p = FALSE;
+ }
+ else {
+ rep = "?";
+ replen = strlen(rep);
+ rep7bit_p = TRUE;
+ }
+ cr = ENC_CODERANGE_7BIT;
+
+ p = search_nonascii(p, e);
+ if (!p) {
+ p = e;
+ }
+ while (p < e) {
+ int ret = rb_enc_precise_mbclen(p, e, enc);
+ if (MBCLEN_NEEDMORE_P(ret)) {
+ break;
+ }
+ else if (MBCLEN_CHARFOUND_P(ret)) {
+ cr = ENC_CODERANGE_VALID;
+ p += MBCLEN_CHARFOUND_LEN(ret);
+ }
+ else if (MBCLEN_INVALID_P(ret)) {
+ /*
+ * p1~p: valid ascii/multibyte chars
+ * p ~e: invalid bytes + unknown bytes
+ */
+ long clen = rb_enc_mbmaxlen(enc);
+ if (p > p1) {
+ rb_str_buf_cat(buf, p1, p - p1);
+ }
+
+ if (e - p < clen) clen = e - p;
+ if (clen <= 2) {
+ clen = 1;
+ }
+ else {
+ const char *q = p;
+ clen--;
+ for (; clen > 1; clen--) {
+ ret = rb_enc_precise_mbclen(q, q + clen, enc);
+ if (MBCLEN_NEEDMORE_P(ret)) break;
+ else if (MBCLEN_INVALID_P(ret)) continue;
+ else UNREACHABLE;
+ }
+ }
+ if (rep) {
+ rb_str_buf_cat(buf, rep, replen);
+ if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
+ }
+ else {
+ repl = rb_yield(rb_enc_str_new(p1, clen, enc));
+ repl = str_compat_and_valid(repl, enc);
+ rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
+ if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
+ cr = ENC_CODERANGE_VALID;
+ }
+ p += clen;
+ p1 = p;
+ p = search_nonascii(p, e);
+ if (!p) {
+ p = e;
+ break;
+ }
+ }
+ else {
+ UNREACHABLE;
+ }
+ }
+ if (p1 < p) {
+ rb_str_buf_cat(buf, p1, p - p1);
+ }
+ if (p < e) {
+ if (rep) {
+ rb_str_buf_cat(buf, rep, replen);
+ if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
+ }
+ else {
+ repl = rb_yield(rb_enc_str_new(p, e-p, enc));
+ repl = str_compat_and_valid(repl, enc);
+ rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
+ if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
+ cr = ENC_CODERANGE_VALID;
+ }
+ }
+ ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
+ return buf;
+ }
+ else {
+ /* ASCII incompatible */
+ const char *p = RSTRING_PTR(str);
+ const char *e = RSTRING_END(str);
+ const char *p1 = p;
+ VALUE buf = rb_str_buf_new(RSTRING_LEN(str));
+ const char *rep;
+ long replen;
+ long mbminlen = rb_enc_mbminlen(enc);
+ static rb_encoding *utf16be;
+ static rb_encoding *utf16le;
+ static rb_encoding *utf32be;
+ static rb_encoding *utf32le;
+ if (!utf16be) {
+ utf16be = rb_enc_find("UTF-16BE");
+ utf16le = rb_enc_find("UTF-16LE");
+ utf32be = rb_enc_find("UTF-32BE");
+ utf32le = rb_enc_find("UTF-32LE");
+ }
+ if (!NIL_P(repl)) {
+ rep = RSTRING_PTR(repl);
+ replen = RSTRING_LEN(repl);
+ }
+ else if (enc == utf16be) {
+ rep = "\xFF\xFD";
+ replen = strlen(rep);
+ }
+ else if (enc == utf16le) {
+ rep = "\xFD\xFF";
+ replen = strlen(rep);
+ }
+ else if (enc == utf32be) {
+ rep = "\x00\x00\xFF\xFD";
+ replen = strlen(rep);
+ }
+ else if (enc == utf32le) {
+ rep = "\xFD\xFF\x00\x00";
+ replen = strlen(rep);
+ }
+ else {
+ rep = "?";
+ replen = strlen(rep);
+ }
+
+ while (p < e) {
+ int ret = rb_enc_precise_mbclen(p, e, enc);
+ if (MBCLEN_NEEDMORE_P(ret)) {
+ break;
+ }
+ else if (MBCLEN_CHARFOUND_P(ret)) {
+ p += MBCLEN_CHARFOUND_LEN(ret);
+ }
+ else if (MBCLEN_INVALID_P(ret)) {
+ const char *q = p;
+ long clen = rb_enc_mbmaxlen(enc);
+ if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
+
+ if (e - p < clen) clen = e - p;
+ if (clen <= mbminlen * 2) {
+ clen = mbminlen;
+ }
+ else {
+ clen -= mbminlen;
+ for (; clen > mbminlen; clen-=mbminlen) {
+ ret = rb_enc_precise_mbclen(q, q + clen, enc);
+ if (MBCLEN_NEEDMORE_P(ret)) break;
+ else if (MBCLEN_INVALID_P(ret)) continue;
+ else UNREACHABLE;
+ }
+ rb_str_set_len(buf, len);
+ }
+ if (rep) {
+ rb_str_buf_cat(buf, rep, replen);
+ }
+ else {
+ repl = rb_yield(rb_enc_str_new(p, e-p, enc));
+ repl = str_compat_and_valid(repl, enc);
+ rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
+ }
+ p += clen;
+ p1 = p;
+ }
+ else {
+ UNREACHABLE;
+ }
+ }
+ if (p1 < p) {
+ rb_str_buf_cat(buf, p1, p - p1);
+ }
+ if (p < e) {
+ if (rep) {
+ rb_str_buf_cat(buf, rep, replen);
+ }
+ else {
+ repl = rb_yield(rb_enc_str_new(p, e-p, enc));
+ repl = str_compat_and_valid(repl, enc);
+ rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
+ }
+ }
+ ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), ENC_CODERANGE_VALID);
+ return buf;
+ }
+}
+
/**********************************************************************
* Document-class: Symbol
*
@@ -8222,6 +8488,7 @@ Init_String(void)
rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
+ rb_define_method(rb_cString, "scrub", rb_str_scrub, -1);
rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
diff --git a/test/ruby/test_m17n.rb b/test/ruby/test_m17n.rb
index a8d56a4..60834bb 100644
--- a/test/ruby/test_m17n.rb
+++ b/test/ruby/test_m17n.rb
@@ -1489,4 +1489,38 @@ class TestM17N < Test::Unit::TestCase
s.untrust
assert_equal(true, s.b.untrusted?)
end
+
+ def test_scrub
+ assert_equal("\uFFFD\uFFFD\uFFFD", u("\x80\x80\x80").scrub)
+ assert_equal("\uFFFDA", u("\xF4\x80\x80A").scrub)
+
+ # exapmles in Unicode 6.1.0 D93b
+ assert_equal("\x41\uFFFD\uFFFD\x41\uFFFD\x41",
+ u("\x41\xC0\xAF\x41\xF4\x80\x80\x41").scrub)
+ assert_equal("\x41\uFFFD\uFFFD\uFFFD\x41",
+ u("\x41\xE0\x9F\x80\x41").scrub)
+ assert_equal("\u0061\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u0063\uFFFD\uFFFD\u0064",
+ u("\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64").scrub)
+ assert_equal("abcdefghijklmnopqrstuvwxyz\u0061\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u0063\uFFFD\uFFFD\u0064",
+ u("abcdefghijklmnopqrstuvwxyz\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64").scrub)
+
+ assert_equal("\u3042\u3013", u("\xE3\x81\x82\xE3\x81").scrub("\u3013"))
+ assert_raise(Encoding::CompatibilityError){ u("\xE3\x81\x82\xE3\x81").scrub(e("\xA4\xA2")) }
+ assert_raise(TypeError){ u("\xE3\x81\x82\xE3\x81").scrub(1) }
+ assert_raise(ArgumentError){ u("\xE3\x81\x82\xE3\x81\x82\xE3\x81").scrub(u("\x81")) }
+ assert_equal(e("\xA4\xA2\xA2\xAE"), e("\xA4\xA2\xA4").scrub(e("\xA2\xAE")))
+
+ assert_equal("\u3042<e381>", u("\xE3\x81\x82\xE3\x81").scrub{|x|'<'+x.unpack('H*')[0]+'>'})
+ assert_raise(Encoding::CompatibilityError){ u("\xE3\x81\x82\xE3\x81").scrub{e("\xA4\xA2")} }
+ assert_raise(TypeError){ u("\xE3\x81\x82\xE3\x81").scrub{1} }
+ assert_raise(ArgumentError){ u("\xE3\x81\x82\xE3\x81\x82\xE3\x81").scrub{u("\x81")} }
+ assert_equal(e("\xA4\xA2\xA2\xAE"), e("\xA4\xA2\xA4").scrub{e("\xA2\xAE")})
+
+ assert_equal("\uFFFD\u3042".encode("UTF-16BE"),
+ "\xD8\x00\x30\x42".force_encoding(Encoding::UTF_16BE).
+ scrub)
+ assert_equal("\uFFFD\u3042".encode("UTF-16LE"),
+ "\x00\xD8\x42\x30".force_encoding(Encoding::UTF_16LE).
+ scrub)
+ end
end
diff --git a/transcode.c b/transcode.c
index de12c04..9c940ed 100644
--- a/transcode.c
+++ b/transcode.c
@@ -2652,6 +2652,8 @@ str_transcode_enc_args(VALUE str, volatile VALUE *arg1, volatile VALUE *arg2,
return dencidx;
}
+VALUE rb_str_scrub(int argc, VALUE *argv, VALUE str);
+
static int
str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
{
@@ -2686,6 +2688,17 @@ str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
ECONV_XML_ATTR_CONTENT_DECORATOR|
ECONV_XML_ATTR_QUOTE_DECORATOR)) == 0) {
if (senc && senc == denc) {
+ if (ecflags & ECONV_INVALID_MASK) {
+ if (!NIL_P(ecopts)) {
+ VALUE rep = rb_hash_aref(ecopts, sym_replace);
+ dest = rb_str_scrub(1, &rep, str);
+ }
+ else {
+ dest = rb_str_scrub(0, NULL, str);
+ }
+ *self = dest;
+ return dencidx;
+ }
return NIL_P(arg2) ? -1 : dencidx;
}
if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) {
----------------------------------------
Feature #6752: Replacing ill-formed subsequencce
https://bugs.ruby-lang.org/issues/6752#change-38389
Author: naruse (Yui NARUSE)
Status: Assigned
Priority: Normal
Assignee: matz (Yukihiro Matsumoto)
Category: core
Target version: next minor
=begin
== 概要
Stringになんらかの理由で不正なバイト列が含まれている時に、それを置換文字で置き換えたい。
== ユースケース
実際に確認されているユースケースは以下の通りです。
* twitterのtitle
* IRCのログ
* ニコニコ動画の API
* Webクローリング
これらの不正なバイト列の生成過程は、おそらく、バイト単位で文字列を切り詰めた時に末尾が切れて、
末尾がおかしい不正な文字列が作られます。(前二者)
これをコンテナに入れたり結合することによって、途中にも混ざった文字列が作られます。(後二者)
* https://twitter.com/takahashim/status/18974040397
* https://twitter.com/n0kada/status/215674740705210368
* https://twitter.com/n0kada/status/215686490070585346
* https://twitter.com/hajimehoshi/status/215671146769682432
* http://po-ru.com/diary/fixing-invalid-utf-8-in-ruby-revisited/
* http://stackoverflow.com/questions/2982677/ruby-1-9-invalid-byte-sequence-in-utf-8
== 必要な引数: 置換文字
省略可能、String。
デフォルトは、Unicode系ならU+FFFD、それ以外では「?」。
デフォルトが空文字でない理由は、削除してしまうことで、従来は存在しなかったトークンを作れてしまい、
上位のレイヤーの脆弱性に繋がるからです。
http://unicode.org/reports/tr36/#UTF-8_Exploit
== API
--- str.encode(str.encoding, invalid: replace, [replace: "〓"])
* CSI的じゃなくて気持ち悪い
* iconv でできるのは glibc iconv か GNU libiconv に //IGNORE つけた時で他はできない
* 実装上のメリットは後述の通り、直感に反してあまりない(と思う)
== 別メソッド
* 新しいメソッドである
* fix/repair invalid/illegal bytes/sequence あたりの名前か
== 実装
=== 鬼車ベース
int ret = rb_enc_precise_mbclen(p, e, enc); して、
MBCLEN_INVALID_P(ret) が真な時、何バイト目が不正なのかわからないのが微妙。
ONIGENC_CONSTRUCT_MBCLEN_INVALID() がバイト数を取らないのが原因なので、
鬼車のエンコーディングモジュール全てに影響してしまうため、修正困難。
不正なバイトはほとんど存在しないと仮定して、効率を犠牲にすれば回避は可能。
=== transcodeベース
UCS正規化なglibc iconv, GNU libiconv, Perl Encodeなどと違って、
CSIなtranscodeでは、自分自身に変換する場合、
エンコーディングごとに「何もしない」変換モジュールを用意しないといけない。
とりあえず鬼車ベースのコンセプト実装とテストを添付しておきます。
diff --git a/string.c b/string.c
index d038835..4808f15 100644
--- a/string.c
+++ b/string.c
@@ -7426,6 +7426,199 @@ rb_str_ellipsize(VALUE str, long len)
return ret;
}
+/*
+ * call-seq:
+ * str.fix_invalid -> new_str
+ *
+ * If the string is well-formed, it returns self.
+ * If the string has invalid byte sequence, repair it with given replacement
+ * character.
+ */
+VALUE
+rb_str_fix_invalid(VALUE str)
+{
+ int cr = ENC_CODERANGE(str);
+ rb_encoding *enc;
+ if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID)
+ return rb_str_dup(str);
+
+ enc = STR_ENC_GET(str);
+ if (rb_enc_asciicompat(enc)) {
+ const char *p = RSTRING_PTR(str);
+ const char *e = RSTRING_END(str);
+ const char *p1 = p;
+ /* 10 should be enough for the usual use case,
+ * fixing a wrongly chopped character at the end of the string
+ */
+ long room = 10;
+ VALUE buf = rb_str_buf_new(RSTRING_LEN(str) + room);
+ const char *rep;
+ if (enc == rb_utf8_encoding())
+ rep = "\xEF\xBF\xBD";
+ else
+ rep = "?";
+ cr = ENC_CODERANGE_7BIT;
+
+ p = search_nonascii(p, e);
+ if (!p) {
+ p = e;
+ }
+ while (p < e) {
+ int ret = rb_enc_precise_mbclen(p, e, enc);
+ if (MBCLEN_CHARFOUND_P(ret)) {
+ if ((unsigned char)*p > 127) cr = ENC_CODERANGE_VALID;
+ p += MBCLEN_CHARFOUND_LEN(ret);
+ }
+ else if (MBCLEN_INVALID_P(ret)) {
+ const char *q;
+ long clen = rb_enc_mbmaxlen(enc);
+ if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
+ q = RSTRING_END(buf);
+
+ if (e - p < clen) clen = e - p;
+ if (clen < 3) {
+ clen = 1;
+ }
+ else {
+ long len = RSTRING_LEN(buf);
+ clen--;
+ rb_str_buf_cat(buf, p, clen);
+ for (; clen > 1; clen--) {
+ ret = rb_enc_precise_mbclen(q, q + clen, enc);
+ if (MBCLEN_NEEDMORE_P(ret)) {
+ break;
+ }
+ else if (MBCLEN_INVALID_P(ret)) {
+ continue;
+ }
+ else {
+ rb_bug("shouldn't reach here '%s'", q);
+ }
+ }
+ rb_str_set_len(buf, len);
+ }
+ p += clen;
+ p1 = p;
+ rb_str_buf_cat2(buf, rep);
+ p = search_nonascii(p, e);
+ if (!p) {
+ p = e;
+ break;
+ }
+ }
+ else if (MBCLEN_NEEDMORE_P(ret)) {
+ break;
+ }
+ else {
+ rb_bug("shouldn't reach here");
+ }
+ }
+ if (p1 < p) {
+ rb_str_buf_cat(buf, p1, p - p1);
+ }
+ if (p < e) {
+ rb_str_buf_cat2(buf, rep);
+ cr = ENC_CODERANGE_VALID;
+ }
+ ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
+ return buf;
+ }
+ else if (rb_enc_dummy_p(enc)) {
+ return rb_str_dup(str);
+ }
+ else {
+ /* ASCII incompatible */
+ const char *p = RSTRING_PTR(str);
+ const char *e = RSTRING_END(str);
+ const char *p1 = p;
+ /* 10 should be enough for the usual use case,
+ * fixing a wrongly chopped character at the end of the string
+ */
+ long room = 10;
+ VALUE buf = rb_str_buf_new(RSTRING_LEN(str) + room);
+ const char *rep;
+ long mbminlen = rb_enc_mbminlen(enc);
+ static rb_encoding *utf16be;
+ static rb_encoding *utf16le;
+ static rb_encoding *utf32be;
+ static rb_encoding *utf32le;
+ if (!utf16be) {
+ utf16be = rb_enc_find("UTF-16BE");
+ utf16le = rb_enc_find("UTF-16LE");
+ utf32be = rb_enc_find("UTF-32BE");
+ utf32le = rb_enc_find("UTF-32LE");
+ }
+ if (enc == utf16be) {
+ rep = "\xFF\xFD";
+ }
+ else if (enc == utf16le) {
+ rep = "\xFD\xFF";
+ }
+ else if (enc == utf32be) {
+ rep = "\x00\x00\xFF\xFD";
+ }
+ else if (enc == utf32le) {
+ rep = "\xFD\xFF\x00\x00";
+ }
+ else {
+ rep = "?";
+ }
+
+ while (p < e) {
+ int ret = rb_enc_precise_mbclen(p, e, enc);
+ if (MBCLEN_CHARFOUND_P(ret)) {
+ p += MBCLEN_CHARFOUND_LEN(ret);
+ }
+ else if (MBCLEN_INVALID_P(ret)) {
+ const char *q;
+ long clen = rb_enc_mbmaxlen(enc);
+ if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
+ q = RSTRING_END(buf);
+
+ if (e - p < clen) clen = e - p;
+ if (clen < mbminlen * 3) {
+ clen = mbminlen;
+ }
+ else {
+ long len = RSTRING_LEN(buf);
+ clen -= mbminlen;
+ rb_str_buf_cat(buf, p, clen);
+ for (; clen > mbminlen; clen-=mbminlen) {
+ ret = rb_enc_precise_mbclen(q, q + clen, enc);
+ if (MBCLEN_NEEDMORE_P(ret)) {
+ break;
+ }
+ else if (MBCLEN_INVALID_P(ret)) {
+ continue;
+ }
+ else {
+ rb_bug("shouldn't reach here '%s'", q);
+ }
+ }
+ rb_str_set_len(buf, len);
+ }
+ p += clen;
+ p1 = p;
+ rb_str_buf_cat2(buf, rep);
+ }
+ else if (MBCLEN_NEEDMORE_P(ret)) {
+ break;
+ }
+ else {
+ rb_bug("shouldn't reach here");
+ }
+ }
+ if (p1 < p) {
+ rb_str_buf_cat(buf, p1, p - p1);
+ }
+ if (p < e) {
+ rb_str_buf_cat2(buf, rep);
+ }
+ ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), ENC_CODERANGE_VALID);
+ return buf;
+ }
+}
+
/**********************************************************************
* Document-class: Symbol
*
@@ -7882,6 +8075,7 @@ Init_String(void)
rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
+ rb_define_method(rb_cString, "fix_invalid", rb_str_fix_invalid, 0);
rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
diff --git a/test/ruby/test_string.rb b/test/ruby/test_string.rb
index 47f349c..2b0cfeb 100644
--- a/test/ruby/test_string.rb
+++ b/test/ruby/test_string.rb
@@ -2031,6 +2031,29 @@ class TestString < Test::Unit::TestCase
assert_equal(u("\x82")+("\u3042"*9), ("\u3042"*10).byteslice(2, 28))
end
+
+ def test_fix_invalid
+ assert_equal("\uFFFD\uFFFD\uFFFD", "\x80\x80\x80".fix_invalid)
+ assert_equal("\uFFFDA", "\xF4\x80\x80A".fix_invalid)
+
+ # exapmles in Unicode 6.1.0 D93b
+ assert_equal("\x41\uFFFD\uFFFD\x41\uFFFD\x41",
+ "\x41\xC0\xAF\x41\xF4\x80\x80\x41".fix_invalid)
+ assert_equal("\x41\uFFFD\uFFFD\uFFFD\x41",
+ "\x41\xE0\x9F\x80\x41".fix_invalid)
+ assert_equal("\u0061\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u0063\uFFFD\uFFFD\u0064",
+ "\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64".fix_invalid)
+
+ assert_equal("abcdefghijklmnopqrstuvwxyz\u0061\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u0063\uFFFD\uFFFD\u0064",
+ "abcdefghijklmnopqrstuvwxyz\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64".fix_invalid)
+
+ assert_equal("\uFFFD\u3042".encode("UTF-16BE"),
+ "\xD8\x00\x30\x42".force_encoding(Encoding::UTF_16BE).
+ fix_invalid)
+ assert_equal("\uFFFD\u3042".encode("UTF-16LE"),
+ "\x00\xD8\x42\x30".force_encoding(Encoding::UTF_16LE).
+ fix_invalid)
+ end
end
class TestString2 < TestString
=end
--
http://bugs.ruby-lang.org/