//===- llvm/Support/UnicodeNameToCodepoint.cpp - Unicode character properties //-*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements functions to map the name or alias of a unicode // character to its codepoint. // //===----------------------------------------------------------------------===// #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Unicode.h" namespace llvm { namespace sys { namespace unicode { extern const char *UnicodeNameToCodepointDict; extern const uint8_t *UnicodeNameToCodepointIndex; extern const std::size_t UnicodeNameToCodepointIndexSize; extern const std::size_t UnicodeNameToCodepointLargestNameSize; using BufferType = SmallString<64>; struct Node { bool IsRoot = false; char32_t Value = 0xFFFFFFFF; uint32_t ChildrenOffset = 0; bool HasSibling = false; uint32_t Size = 0; StringRef Name; const Node *Parent = nullptr; constexpr bool isValid() const { return !Name.empty() || Value == 0xFFFFFFFF; } constexpr bool hasChildren() const { return ChildrenOffset != 0 || IsRoot; } std::string fullName() const { std::string S; // Reserve enough space for most unicode code points. // The chosen value represent the 99th percentile of name size as of // Unicode 14. S.reserve(46); const Node *N = this; while (N) { std::reverse_copy(N->Name.begin(), N->Name.end(), std::back_inserter(S)); N = N->Parent; } std::reverse(S.begin(), S.end()); return S; } }; static Node createRoot() { Node N; N.IsRoot = true; N.ChildrenOffset = 1; N.Size = 1; return N; } static Node readNode(uint32_t Offset, const Node *Parent = nullptr) { if (Offset == 0) return createRoot(); uint32_t Origin = Offset; Node N; N.Parent = Parent; uint8_t NameInfo = UnicodeNameToCodepointIndex[Offset++]; if (Offset + 6 >= UnicodeNameToCodepointIndexSize) return N; bool LongName = NameInfo & 0x40; bool HasValue = NameInfo & 0x80; std::size_t Size = NameInfo & ~0xC0; if (LongName) { uint32_t NameOffset = (UnicodeNameToCodepointIndex[Offset++] << 8); NameOffset |= UnicodeNameToCodepointIndex[Offset++]; N.Name = StringRef(UnicodeNameToCodepointDict + NameOffset, Size); } else { N.Name = StringRef(UnicodeNameToCodepointDict + Size, 1); } if (HasValue) { uint8_t H = UnicodeNameToCodepointIndex[Offset++]; uint8_t M = UnicodeNameToCodepointIndex[Offset++]; uint8_t L = UnicodeNameToCodepointIndex[Offset++]; N.Value = ((H << 16) | (M << 8) | L) >> 3; bool HasChildren = L & 0x02; N.HasSibling = L & 0x01; if (HasChildren) { N.ChildrenOffset = UnicodeNameToCodepointIndex[Offset++] << 16; N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++] << 8; N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++]; } } else { uint8_t H = UnicodeNameToCodepointIndex[Offset++]; N.HasSibling = H & 0x80; bool HasChildren = H & 0x40; H &= ~0xC0; if (HasChildren) { N.ChildrenOffset = (H << 16); N.ChildrenOffset |= (uint32_t(UnicodeNameToCodepointIndex[Offset++]) << 8); N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++]; } } N.Size = Offset - Origin; return N; } static bool startsWith(StringRef Name, StringRef Needle, bool Strict, std::size_t &Consummed, char &PreviousCharInName, char &PreviousCharInNeedle, bool IsPrefix = false) { Consummed = 0; if (Strict) { if (!Name.startswith(Needle)) return false; Consummed = Needle.size(); return true; } if (Needle.empty()) return true; auto NamePos = Name.begin(); auto NeedlePos = Needle.begin(); char PreviousCharInNameOrigin = PreviousCharInName; char PreviousCharInNeedleOrigin = PreviousCharInNeedle; auto IgnoreSpaces = [](auto It, auto End, char &PreviousChar, bool IgnoreEnd = false) { while (It != End) { const auto Next = std::next(It); // Ignore spaces, underscore, medial hyphens // https://unicode.org/reports/tr44/#UAX44-LM2. bool Ignore = *It == ' ' || *It == '_' || (*It == '-' && isAlnum(PreviousChar) && ((Next != End && isAlnum(*Next)) || (Next == End && IgnoreEnd))); PreviousChar = *It; if (!Ignore) break; ++It; } return It; }; while (true) { NamePos = IgnoreSpaces(NamePos, Name.end(), PreviousCharInName); NeedlePos = IgnoreSpaces(NeedlePos, Needle.end(), PreviousCharInNeedle, IsPrefix); if (NeedlePos == Needle.end()) break; if (NamePos == Name.end()) break; if (toUpper(*NeedlePos) != toUpper(*NamePos)) break; NeedlePos++; NamePos++; } Consummed = std::distance(Name.begin(), NamePos); if (NeedlePos != Needle.end()) { PreviousCharInName = PreviousCharInNameOrigin; PreviousCharInNeedle = PreviousCharInNeedleOrigin; } return NeedlePos == Needle.end(); } static std::tuple compareNode(uint32_t Offset, StringRef Name, bool Strict, char PreviousCharInName, char PreviousCharInNeedle, BufferType &Buffer, const Node *Parent = nullptr) { Node N = readNode(Offset, Parent); std::size_t Consummed = 0; bool DoesStartWith = N.IsRoot || startsWith(Name, N.Name, Strict, Consummed, PreviousCharInName, PreviousCharInNeedle); if (!DoesStartWith) return std::make_tuple(N, false, 0); if (Name.size() - Consummed == 0 && N.Value != 0xFFFFFFFF) return std::make_tuple(N, true, N.Value); if (N.hasChildren()) { uint32_t ChildOffset = N.ChildrenOffset; for (;;) { Node C; bool Matches; uint32_t Value; std::tie(C, Matches, Value) = compareNode(ChildOffset, Name.substr(Consummed), Strict, PreviousCharInName, PreviousCharInNeedle, Buffer, &N); if (Matches) { std::reverse_copy(C.Name.begin(), C.Name.end(), std::back_inserter(Buffer)); return std::make_tuple(N, true, Value); } ChildOffset += C.Size; if (!C.HasSibling) break; } } return std::make_tuple(N, false, 0); } static std::tuple compareNode(uint32_t Offset, StringRef Name, bool Strict, BufferType &Buffer) { return compareNode(Offset, Name, Strict, 0, 0, Buffer); } // clang-format off constexpr const char *const HangulSyllables[][3] = { { "G", "A", "" }, { "GG", "AE", "G" }, { "N", "YA", "GG" }, { "D", "YAE", "GS" }, { "DD", "EO", "N", }, { "R", "E", "NJ" }, { "M", "YEO", "NH" }, { "B", "YE", "D" }, { "BB", "O", "L" }, { "S", "WA", "LG" }, { "SS", "WAE", "LM" }, { "", "OE", "LB" }, { "J", "YO", "LS" }, { "JJ", "U", "LT" }, { "C", "WEO", "LP" }, { "K", "WE", "LH" }, { "T", "WI", "M" }, { "P", "YU", "B" }, { "H", "EU", "BS" }, { 0, "YI", "S" }, { 0, "I", "SS" }, { 0, 0, "NG" }, { 0, 0, "J" }, { 0, 0, "C" }, { 0, 0, "K" }, { 0, 0, "T" }, { 0, 0, "P" }, { 0, 0, "H" } }; // clang-format on // Unicode 14.0 // 3.12 Conjoining Jamo Behavior Common constants constexpr const char32_t SBase = 0xAC00; constexpr const uint32_t LCount = 19; constexpr const uint32_t VCount = 21; constexpr const uint32_t TCount = 28; static std::size_t findSyllable(StringRef Name, bool Strict, char &PreviousInName, int &Pos, int Column) { assert(Column == 0 || Column == 1 || Column == 2); static std::size_t CountPerColumn[] = {LCount, VCount, TCount}; char NeedleStart = 0; int Len = -1; int Prev = PreviousInName; for (std::size_t I = 0; I < CountPerColumn[Column]; I++) { StringRef Syllable(HangulSyllables[I][Column]); if (int(Syllable.size()) <= Len) continue; std::size_t Consummed = 0; char PreviousInNameCopy = PreviousInName; bool DoesStartWith = startsWith(Name, Syllable, Strict, Consummed, PreviousInNameCopy, NeedleStart); if (!DoesStartWith) continue; Len = Consummed; Pos = I; Prev = PreviousInNameCopy; } if (Len == -1) return 0; PreviousInName = Prev; return size_t(Len); } static llvm::Optional nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer) { Buffer.clear(); // Hangul Syllable Decomposition std::size_t Consummed = 0; char NameStart = 0, NeedleStart = 0; bool DoesStartWith = startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed, NameStart, NeedleStart); if (!DoesStartWith) return None; Name = Name.substr(Consummed); int L = -1, V = -1, T = -1; Name = Name.substr(findSyllable(Name, Strict, NameStart, L, 0)); Name = Name.substr(findSyllable(Name, Strict, NameStart, V, 1)); Name = Name.substr(findSyllable(Name, Strict, NameStart, T, 2)); if (L != -1 && V != -1 && T != -1 && Name.empty()) { if (!Strict) { Buffer.append("HANGUL SYLLABLE "); if (L != -1) Buffer.append(HangulSyllables[L][0]); if (V != -1) Buffer.append(HangulSyllables[V][1]); if (T != -1) Buffer.append(HangulSyllables[T][2]); } return SBase + (std::uint32_t(L) * VCount + std::uint32_t(V)) * TCount + std::uint32_t(T); } // Otherwise, it's an illegal syllable name. return None; } struct GeneratedNamesData { StringRef Prefix; uint32_t Start; uint32_t End; }; // Unicode 14.0 Table 4-8. Name Derivation Rule Prefix Strings // This needs to be kept in sync with // llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp static const GeneratedNamesData GeneratedNamesDataTable[] = { {"CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4DBF}, {"CJK UNIFIED IDEOGRAPH-", 0x4E00, 0x9FFC}, {"CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2A6DD}, {"CJK UNIFIED IDEOGRAPH-", 0x2A700, 0x2B734}, {"CJK UNIFIED IDEOGRAPH-", 0x2B740, 0x2B81D}, {"CJK UNIFIED IDEOGRAPH-", 0x2B820, 0x2CEA1}, {"CJK UNIFIED IDEOGRAPH-", 0x2CEB0, 0x2EBE0}, {"CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134A}, {"TANGUT IDEOGRAPH-", 0x17000, 0x187F7}, {"TANGUT IDEOGRAPH-", 0x18D00, 0x18D08}, {"KHITAN SMALL SCRIPT CHARACTER-", 0x18B00, 0x18CD5}, {"NUSHU CHARACTER-", 0x1B170, 0x1B2FB}, {"CJK COMPATIBILITY IDEOGRAPH-", 0xF900, 0xFA6D}, {"CJK COMPATIBILITY IDEOGRAPH-", 0xFA70, 0xFAD9}, {"CJK COMPATIBILITY IDEOGRAPH-", 0x2F800, 0x2FA1D}, }; static llvm::Optional nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer) { for (auto &&Item : GeneratedNamesDataTable) { Buffer.clear(); std::size_t Consummed = 0; char NameStart = 0, NeedleStart = 0; bool DoesStartWith = startsWith(Name, Item.Prefix, Strict, Consummed, NameStart, NeedleStart, /*isPrefix*/ true); if (!DoesStartWith) continue; auto Number = Name.substr(Consummed); unsigned long long V = 0; // Be consistent about mandating upper casing. if (Strict && llvm::any_of(Number, [](char C) { return C >= 'a' && C <= 'f'; })) return {}; if (getAsUnsignedInteger(Number, 16, V) || V < Item.Start || V > Item.End) continue; if (!Strict) { Buffer.append(Item.Prefix); Buffer.append(utohexstr(V, true)); } return V; } return None; } static llvm::Optional nameToCodepoint(StringRef Name, bool Strict, BufferType &Buffer) { if (Name.empty()) return None; llvm::Optional Res = nameToHangulCodePoint(Name, Strict, Buffer); if (!Res) Res = nameToGeneratedCodePoint(Name, Strict, Buffer); if (Res) return *Res; Buffer.clear(); Node Node; bool Matches; uint32_t Value; std::tie(Node, Matches, Value) = compareNode(0, Name, Strict, Buffer); if (Matches) { std::reverse(Buffer.begin(), Buffer.end()); // UAX44-LM2. Ignore case, whitespace, underscore ('_'), and all medial // hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E. if (!Strict && Value == 0x116c && Name.find_insensitive("O-E") != StringRef::npos) { Buffer = "HANGUL JUNGSEONG O-E"; Value = 0x1180; } return Value; } return None; } llvm::Optional nameToCodepointStrict(StringRef Name) { BufferType Buffer; auto Opt = nameToCodepoint(Name, true, Buffer); return Opt; } llvm::Optional nameToCodepointLooseMatching(StringRef Name) { BufferType Buffer; auto Opt = nameToCodepoint(Name, false, Buffer); if (!Opt) return None; return LooseMatchingResult{*Opt, Buffer}; } // Find the unicode character whose editing distance to Pattern // is shortest, using the Wagner–Fischer algorithm. llvm::SmallVector nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount) { // We maintain a fixed size vector of matches, // sorted by distance // The worst match (with the biggest distance) are discarded when new elements // are added. std::size_t LargestEditDistance = 0; llvm::SmallVector Matches; Matches.reserve(MaxMatchesCount + 1); auto Insert = [&](const Node &Node, uint32_t Distance, char32_t Value) -> bool { if (Distance > LargestEditDistance) { if (Matches.size() == MaxMatchesCount) return false; LargestEditDistance = Distance; } // To avoid allocations, the creation of the name is delayed // as much as possible. std::string Name; auto GetName = [&] { if (Name.empty()) Name = Node.fullName(); return Name; }; auto It = std::lower_bound( Matches.begin(), Matches.end(), Distance, [&](const MatchForCodepointName &a, std::size_t Distance) { if (Distance == a.Distance) return a.Name < GetName(); return a.Distance < Distance; }); if (It == Matches.end() && Matches.size() == MaxMatchesCount) return false; MatchForCodepointName M{GetName(), Distance, Value}; Matches.insert(It, std::move(M)); if (Matches.size() > MaxMatchesCount) Matches.pop_back(); return true; }; // We ignore case, space, hyphens, etc, // in both the search pattern and the prospective names. auto Normalize = [](StringRef Name) { std::string Out; Out.reserve(Name.size()); for (char C : Name) { if (isAlnum(C)) Out.push_back(toUpper(C)); } return Out; }; std::string NormalizedName = Normalize(Pattern); // Allocate a matrix big enough for longest names. const std::size_t Columns = std::min(NormalizedName.size(), UnicodeNameToCodepointLargestNameSize) + 1; LLVM_ATTRIBUTE_UNUSED static std::size_t Rows = UnicodeNameToCodepointLargestNameSize + 1; std::vector Distances( Columns * (UnicodeNameToCodepointLargestNameSize + 1), 0); auto Get = [&Distances, Columns](size_t Column, std::size_t Row) -> char & { assert(Column < Columns); assert(Row < Rows); return Distances[Row * Columns + Column]; }; for (std::size_t I = 0; I < Columns; I++) Get(I, 0) = I; // Visit the childrens, // Filling (and overriding) the matrix for the name fragment of each node // iteratively. CompleteName is used to collect the actual name of potential // match, respecting case and spacing. auto VisitNode = [&](const Node &N, std::size_t Row, auto &VisitNode) -> void { std::size_t J = 0; for (; J < N.Name.size(); J++) { if (!isAlnum(N.Name[J])) continue; Get(0, Row) = Row; for (std::size_t I = 1; I < Columns; I++) { const int Delete = Get(I - 1, Row) + 1; const int Insert = Get(I, Row - 1) + 1; const int Replace = Get(I - 1, Row - 1) + (NormalizedName[I - 1] != N.Name[J] ? 1 : 0); Get(I, Row) = std::min(Insert, std::min(Delete, Replace)); } Row++; } unsigned Cost = Get(Columns - 1, Row - 1); if (N.Value != 0xFFFFFFFF) { Insert(N, Cost, N.Value); } if (N.hasChildren()) { auto ChildOffset = N.ChildrenOffset; for (;;) { Node C = readNode(ChildOffset, &N); ChildOffset += C.Size; if (!C.isValid()) break; VisitNode(C, Row, VisitNode); if (!C.HasSibling) break; } } }; Node Root = createRoot(); VisitNode(Root, 1, VisitNode); return Matches; } } // namespace unicode } // namespace sys } // namespace llvm