Files
tidb/pkg/util/collate/ucadata/generator/main.go

422 lines
9.6 KiB
Go

// Copyright 2023 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
import (
"bytes"
_ "embed"
"go/format"
"math"
"os"
"text/template"
)
//go:embed allkeys-4.0.0.txt
var allkeys0400 string
//go:embed allkeys-9.0.0.txt
var allkeys0900 string
type cetEntry struct {
// a cetEntry can actually contain several characters, which means map many to one or many
// but MySQL's collation doesn't handle these contractions, so we only care the rule with a
// single char.
char rune
// weights is the first level of each collation element
// as we only implement 'ai_ci' collation, so other levels are ignored directly
weights []uint16
}
func parseCETHex(input string) (bool, uint32, string) {
hasValidHex := false
currentRune := uint32(0)
end := 0
for idx, c := range input {
if reverseHexTable[c] <= 0xf {
hasValidHex = true
currentRune <<= 4
currentRune += uint32(reverseHexTable[c])
continue
}
// invalid character
end = idx
break
}
return hasValidHex, currentRune, input[end:]
}
func parseCETWeights(input string) (bool, []uint16, string) {
var left string
ok := true
weights := make([]uint16, 0, 1)
outer:
for {
left = input
var (
weight uint32
)
if input[0] != '[' {
break
}
// ignore the dot or star
if input[1] != '.' && input[1] != '*' {
ok = false
break
}
input = input[2:]
ok, weight, input = parseCETHex(input)
if !ok {
ok = false
break
}
if weight > math.MaxUint16 {
ok = false
break
}
// then ignore all weight with higher level
for {
if input[0] == '.' {
ok, _, input = parseCETHex(input[1:])
if !ok {
ok = false
break outer
}
} else if input[0] == ']' {
input = input[1:]
break
} else {
ok = false
break outer
}
}
weights = append(weights, uint16(weight))
}
return ok, weights, left
}
func parseCETEntry(input string) (bool, *cetEntry, string) {
var (
ok bool
char uint32
weights []uint16
left string
)
left = input
ok, char, input = parseCETHex(input)
if !ok {
return false, nil, left
}
// then ignore the space and ';'
outer:
for {
switch input[0] {
case ' ':
input = input[1:]
case ';':
input = input[1:]
default:
break outer
}
}
// then parse the weights
ok, weights, input = parseCETWeights(input)
if !ok {
return false, nil, left
}
return true, &cetEntry{
char: rune(char),
weights: weights,
}, input
}
type unicodeVersion int
const (
unicode0400 unicodeVersion = 400
unicode0900 unicodeVersion = 900
)
type cet struct {
Name string
Length rune
MapTable4 []uint64
LongRuneMap map[rune][2]uint64
URL string
// explicitRune indicates whether this character is set in the `MapTable/LongRuneMap`.
explicitRune map[rune]bool
version unicodeVersion
}
func (c *cet) insertWeights(char rune, weights []uint16) {
if char == 0xFDFA {
// this is a special case, MySQL doesn't handle this character in unicode 4.0.0
switch c.version {
case unicode0400:
return
case unicode0900:
weights = weights[:8]
}
}
c.explicitRune[char] = true
nonZeroWeights := make([]uint16, 0, len(weights))
for _, w := range weights {
if w != 0 {
nonZeroWeights = append(nonZeroWeights, w)
}
}
if len(nonZeroWeights) <= 4 {
idx := 0
for _, w := range nonZeroWeights {
if w != 0 {
c.MapTable4[char] += uint64(w) << (idx * 16)
idx++
}
}
} else if len(nonZeroWeights) <= 8 {
c.MapTable4[char] = LongRune8
idx := 0
weight0 := uint64(0)
for _, w := range nonZeroWeights[:4] {
if w != 0 {
weight0 += uint64(w) << (idx * 16)
idx++
}
}
idx = 0
weight1 := uint64(0)
for _, w := range nonZeroWeights[4:] {
if w != 0 {
weight1 += uint64(w) << (idx * 16)
idx++
}
}
c.LongRuneMap[char] = [2]uint64{weight0, weight1}
} else {
panic("unreachable")
}
// a special value. The `MapTable4` is set to 0xFFFD automatically, but the implementation will get value from `LongRuneMap`
// and gets 0.
if c.version == unicode0900 && char == 0xFFFD {
c.LongRuneMap[char] = [2]uint64{0xFFFD, 0}
}
}
// parseAllKeys dumps the `MapTable0900` and `LongRuneTable0900` from allkeys0900
// this function actually has the potential to become a generic parser for both allkeys0900
// and the data in `unicode_ci`. TODO: migrate the `unicode_ci_data` to use this parser
func parseAllKeys(input string, length rune, version unicodeVersion) cet {
cet := cet{
Length: length,
MapTable4: make([]uint64, length),
LongRuneMap: make(map[rune][2]uint64),
explicitRune: make(map[rune]bool),
version: version,
}
for {
var (
entry *cetEntry
)
_, entry, input = parseCETEntry(input)
if len(input) == 0 {
break
}
if entry != nil && entry.char < length {
cet.insertWeights(entry.char, entry.weights)
}
// just go to the next line
for {
if input[0] != '\n' {
input = input[1:]
continue
}
input = input[1:]
break
}
}
return cet
}
func (c *cet) calcImplicitWeight() {
for i := rune(1); i < c.Length; i++ {
if c.explicitRune[i] {
continue
}
var first, second uint64
if c.version == unicode0400 {
first, second = c.getImplicitWeight0400(i)
} else {
first, second = c.getImplicitWeight0900(i)
}
if second == 0 {
c.MapTable4[i] = first
} else {
c.MapTable4[i] = LongRune8
c.LongRuneMap[i] = [2]uint64{first, second}
}
}
}
func (*cet) getImplicitWeight0400(r rune) (first uint64, second uint64) {
// Han and other unsigned cases
first = uint64(r >> 15)
if r >= 0x3400 && r <= 0x4DB5 {
first += 0xFB80
} else if (r >= 0x4E00 && r <= 0x9FA5) || (r >= 0xFA0E && r <= 0xFA0F) {
first += 0xFB40
} else {
first += 0xFBC0
}
return first + (uint64((r&0x7FFF)|0x8000) << 16), 0
}
func (c *cet) getImplicitWeight0900(r rune) (first uint64, second uint64) {
// invalid characters, they are surrogate pair in utf-16, so removed in unicode
if (r >= 0xD800 && r <= 0xDFFF) || r == 0xFFFD {
return 0xFFFD, 0
}
// handle hangul syllable
if r >= 0xAC00 && r <= 0xD7AF {
jamo := decomposeHangulSyllable(r)
// the length of jamo is 2 or 3, so it will only use a single uint64
first = uint64(0)
for idx, j := range jamo {
// `ucadata.DUCET0900Table.MapTable[j]` should have only one weight
// test has ensured the jamo has only one weight
first += (c.MapTable4[j] & 0xFFFF) << (idx * 16)
}
return first, 0
}
// The implicit weight is always [.AAAA.0020.0002][.BBBB.0000.0000]
// The calculation process of AAAA and BBBB is according to the UCA
if r >= 0x17000 && r <= 0x18AFF {
// Tangut characters
return 0xFB00 + (uint64((r-0x17000)|0x8000) << 16), 0
}
// Nushu and Khitan Small Script were added into unicode in 10.0 and 13.0, so they don't need to be handled
// specially.
// Han and other unsigned cases
first = uint64(r >> 15)
if (r >= 0x3400 && r <= 0x4DB5) || (r >= 0x20000 && r <= 0x2A6D6) ||
(r >= 0x2A700 && r <= 0x2B734) || (r >= 0x2B740 && r <= 0x2B81D) ||
(r >= 0x2B820 && r <= 0x2CEA1) {
first += 0xFB80
} else if (r >= 0x4E00 && r <= 0x9FD5) || (r >= 0xFA0E && r <= 0xFA29) {
first += 0xFB40
} else {
first += 0xFBC0
}
return first + (uint64((r&0x7FFF)|0x8000) << 16), 0
}
func decomposeHangulSyllable(r rune) []rune {
const (
syllableBase rune = 0xAC00
leadingJamoBase rune = 0x1100
vowelJamoBase rune = 0x1161
trailingJamoBase rune = 0x11A7
vowelJamoCnt rune = 21
trailingJamoCnt rune = 28
)
syllableIndex := r - syllableBase
vtCombination := vowelJamoCnt * trailingJamoCnt
leadingJamoIndex := syllableIndex / vtCombination
vowelJamoIndex := (syllableIndex % vtCombination) / trailingJamoCnt
trailingJamoIndex := syllableIndex % trailingJamoCnt
result := []rune{leadingJamoBase + leadingJamoIndex, vowelJamoBase + vowelJamoIndex}
if trailingJamoIndex > 0 {
result = append(result, trailingJamoBase+trailingJamoIndex)
}
return result
}
//go:embed data.go.tpl
var unicodeDataTemplate string
func generateFile(filename string, d *cet) {
tpl, err := template.New("unicode_template").
Funcs(template.FuncMap{"mod": func(i, j int) bool { return i%j == 0 }}).
Parse(unicodeDataTemplate)
if err != nil {
panic(err)
}
output := bytes.Buffer{}
err = tpl.Execute(&output, d)
if err != nil {
panic(err)
}
formattedSource, err := format.Source(output.Bytes())
if err != nil {
panic(err)
}
err = os.WriteFile(filename, formattedSource, 0666)
if err != nil {
panic(err)
}
}
func main() {
switch os.Args[len(os.Args)-1] {
case "unicode_0900_ai_ci_data_generated.go":
ducet0900Table := parseAllKeys(allkeys0900, 0x2CEA1, unicode0900)
ducet0900Table.Name = "DUCET0900Table"
ducet0900Table.URL = "https://www.unicode.org/Public/UCA/9.0.0/allkeys.txt"
ducet0900Table.calcImplicitWeight()
generateFile("unicode_0900_ai_ci_data_generated.go", &ducet0900Table)
case "unicode_ci_data_generated.go":
// in 4.0.0, only cares the character between 0 and 0xFFFF
ducet0400Table := parseAllKeys(allkeys0400, 0x10000, unicode0400)
ducet0400Table.Name = "DUCET0400Table"
ducet0400Table.URL = "https://www.unicode.org/Public/UCA/4.0.0/allkeys-4.0.0.txt"
ducet0400Table.calcImplicitWeight()
generateFile("unicode_ci_data_generated.go", &ducet0400Table)
default:
panic("unreachable")
}
}