Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
- Rune4 = (1<<(Bit4+3*Bitx))-1,
- /* 0001 1111 1111 1111 1111 1111 */
+ Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */
Maskx = (1<<Bitx)-1, /* 0011 1111 */
Testx = Maskx ^ 0xFF, /* 1100 0000 */
+ SurrogateMin = 0xD800,
+ SurrogateMax = 0xDFFF,
+
Bad = Runeerror,
};
l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
if(l <= Rune2)
goto bad;
+ if (SurrogateMin <= l && l <= SurrogateMax)
+ goto bad;
*rune = l;
return 3;
}
goto bad;
if (c < T5) {
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
- if (l <= Rune3)
+ if (l <= Rune3 || l > Runemax)
goto bad;
*rune = l;
return 4;
l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
if(l <= Rune2)
goto bad;
+ if (SurrogateMin <= l && l <= SurrogateMax)
+ goto bad;
*rune = l;
return 3;
}
goto bad;
if (c < T5) {
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
- if (l <= Rune3)
+ if (l <= Rune3 || l > Runemax)
goto bad;
*rune = l;
return 4;
}
/*
- * If the Rune is out of range, convert it to the error rune.
+ * If the Rune is out of range or a surrogate half, convert it to the error rune.
* Do this test here because the error rune encodes to three bytes.
* Doing it earlier would duplicate work, since an out of range
* Rune wouldn't have fit in one or two bytes.
*/
if (c > Runemax)
c = Runeerror;
+ if (SurrogateMin <= c && c <= SurrogateMax)
+ c = Runeerror;
/*
* three character sequence
// surrogateMap pairs the lowest (0xD800) and highest (0xDFFF) surrogate code
// points with the three-byte sequences that spell those values out in UTF-8
// bit layout. Per the per-entry notes, decoding either sequence is expected to
// yield (RuneError, 1) rather than the surrogate value itself.
var surrogateMap = []Utf8Map{
{0xd800, "\xed\xa0\x80"}, // surrogate min decodes to (RuneError, 1)
- {0xdfff, "\xed bf bf"}, // surrogate max decodes to (RuneError, 1)
+ {0xdfff, "\xed\xbf\xbf"}, // surrogate max decodes to (RuneError, 1)
}
var testStrings = []string{
{string([]byte{66, 250}), false},
{string([]byte{66, 250, 67}), false},
{"a\uFFFDb", true},
- {string("\xF7\xBF\xBF\xBF"), true}, // U+1FFFFF
+ {string("\xF4\x8F\xBF\xBF"), true}, // U+10FFFF
+ {string("\xF4\x90\x80\x80"), false}, // U+10FFFF+1; out of range
+ {string("\xF7\xBF\xBF\xBF"), false}, // 0x1FFFFF; out of range
{string("\xFB\xBF\xBF\xBF\xBF"), false}, // 0x3FFFFFF; out of range
{string("\xc0\x80"), false}, // U+0000 encoded in two bytes: incorrect
{string("\xed\xa0\x80"), false}, // U+D800 high surrogate (sic)
"backslashes 2 (backquote)")
assert("\\x\\u\\U\\", `\x\u\U\`, "backslash 3 (backquote)")
- // test large runes. perhaps not the most logical place for this test.
+ // test large and surrogate-half runes. perhaps not the most logical place for these tests.
var r int32
r = 0x10ffff // largest rune value
s = string(r)
r = 0x10ffff + 1
s = string(r)
assert(s, "\xef\xbf\xbd", "too-large rune")
+ r = 0xD800
+ s = string(r)
+ assert(s, "\xef\xbf\xbd", "surrogate rune min")
+ r = 0xDFFF
+ s = string(r)
+ assert(s, "\xef\xbf\xbd", "surrogate rune max")
+ r = -1
+ s = string(r)
+ assert(s, "\xef\xbf\xbd", "negative rune")
+
+ // the large rune tests again, this time using constants instead of a variable.
+ // these conversions will be done at compile time.
+ s = string(0x10ffff) // largest rune value
+ assert(s, "\xf4\x8f\xbf\xbf", "largest rune constant")
+ s = string(0x10ffff + 1)
+ assert(s, "\xef\xbf\xbd", "too-large rune constant")
+ s = string(0xD800)
+ assert(s, "\xef\xbf\xbd", "surrogate rune min constant")
+ s = string(0xDFFF)
+ assert(s, "\xef\xbf\xbd", "surrogate rune max constant")
+ s = string(-1)
+ assert(s, "\xef\xbf\xbd", "negative rune")
assert(string(gr1), gx1, "global ->[]rune")
assert(string(gr2), gx2fix, "global invalid ->[]rune")