From 427a0adb39a883be182f222a40a22f00ad4531cb Mon Sep 17 00:00:00 2001 From: Rob Pike Date: Sun, 30 Aug 2009 18:17:52 -0700 Subject: [PATCH] further simplification of the case fold calculation. hard to beat at this point, i think. R=rsc DELTA=38 (8 added, 21 deleted, 9 changed) OCL=34092 CL=34096 --- src/pkg/unicode/letter.go | 45 +++++++++++++---------------------- src/pkg/unicode/maketables.go | 2 +- 2 files changed, 17 insertions(+), 30 deletions(-) diff --git a/src/pkg/unicode/letter.go b/src/pkg/unicode/letter.go index 615c839d62..c68ec47e36 100644 --- a/src/pkg/unicode/letter.go +++ b/src/pkg/unicode/letter.go @@ -20,13 +20,9 @@ type Range struct { // different case for that character. They may be negative. If zero, it // means the character is in the corresponding case. There is a special // case representing sequences of alternating corresponding Upper and Lower -// pairs. It appears with the usual Lo and Hi values and a Delta of -// {0, UpperLower, 0} -// The constant UpperLower has (meaningful) value 1. The lower case -// letters in such sequences are assumed; were they present they would -// have a Delta of -// {LowerUpper, 0, LowerUpper} -// where LowerUpper has value -1. +// pairs. It appears with a fixed Delta of +// {UpperLower, UpperLower, UpperLower} +// The constant UpperLower has an otherwise impossible delta value. type CaseRange struct { Lo int; Hi int; @@ -46,9 +42,8 @@ type d [MaxCase]int32 // to make the CaseRanges text shorter // this CaseRange represents a sequence of the form (say) // Upper Lower Upper Lower. const ( - MaxChar = 0x10FFFF; - UpperLower = MaxChar + 2; // cannot be a valid delta - LowerUpper = MaxChar + 3; + MaxChar = 0x10FFFF; // Maximum valid Unicode character value. + UpperLower = MaxChar + 1; // (Cannot be a valid delta.) ) // Is tests whether rune is in the specified table of ranges. @@ -118,22 +113,6 @@ func IsLetter(rune int) bool { return Is(Letter, rune); } -// In an Upper-Lower sequence, which always starts with an UpperCase letter, -// the real deltas always look like: -// 0 1 0 -// -1 0 -1 -// This is a single-dimensioned array addressed by the case shifted up one bit -// (the column of this table) or'ed with the low bit of the position in -// the sequence (the row of the table). -var ulDelta = [8]int{ - (UpperCase<<1) | 0: 0, - (UpperCase<<1) | 1: -1, - (LowerCase<<1) | 0: 1, - (LowerCase<<1) | 1: 0, - (TitleCase<<1) | 0: 0, - (TitleCase<<1) | 1: -1, -} - // To maps the rune to the specified case, UpperCase, LowerCase, or TitleCase func To(_case int, rune int) int { if _case < 0 || MaxCase <= _case { @@ -148,9 +127,17 @@ func To(_case int, rune int) int { if r.Lo <= rune && rune <= r.Hi { delta := int(r.Delta[_case]); if delta > MaxChar { - // Somewhere inside an UpperLower sequence. Use - // the precomputed delta table to get our offset. - delta = ulDelta[((_case<<1) | ((rune-r.Lo)&1))]; + // In an Upper-Lower sequence, which always starts with + // an UpperCase letter, the real deltas always look like: + // {0, 1, 0} UpperCase (Lower is next) + // {-1, 0, -1} LowerCase (Upper, Title are previous) + // The characters at even offsets from the beginning of the + // sequence are upper case; the ones at odd offsets are lower. + // The correct mapping can be done by clearing or setting the low + // bit in the sequence offset. + // The constants UpperCase and TitleCase are even while LowerCase + // is odd so we take the low bit from _case. + return r.Lo + ((rune - r.Lo)&^1 | _case&1); } return rune + delta; } diff --git a/src/pkg/unicode/maketables.go b/src/pkg/unicode/maketables.go index f47fda9103..142c8b6d02 100644 --- a/src/pkg/unicode/maketables.go +++ b/src/pkg/unicode/maketables.go @@ -798,7 +798,7 @@ func printCaseRange(lo, hi *caseState) { fmt.Printf("\tCaseRange{0x%04X, 0x%04X, d{UpperLower, UpperLower, UpperLower}},\n", lo.point, hi.point) case hi.point > lo.point && lo.isLowerUpper(): - die.Log("LowerUpper sequence: should not happen: U+%04X\n", lo.point); + die.Log("LowerUpper sequence: should not happen: U+%04X. If it's real, need to fix To()", lo.point); fmt.Printf("\tCaseRange{0x%04X, 0x%04X, d{LowerUpper, LowerUpper, LowerUpper}},\n", lo.point, hi.point) default: -- 2.48.1