Refactor convert_case() to prepare for optimizations.
author Jeff Davis <[email protected]>
Thu, 13 Mar 2025 04:51:52 +0000 (21:51 -0700)
committer Jeff Davis <[email protected]>
Thu, 13 Mar 2025 04:51:52 +0000 (21:51 -0700)
Upcoming optimizations will add complexity to convert_case(). This
patch reorganizes slightly so that the complexity can be contained
within the logic to convert the case of a single character, rather
than mixing it in with logic to iterate through the string.

Reviewed-by: Alexander Borisov <[email protected]>
Discussion: https://postgr.es/m/44005c3d-88f4-4a26-981f-fd82dfa8e313@gmail.com

src/common/unicode_case.c

index 7afff1b172b3ea2d05d1627c306f4c75bc5823d1..ccc485bf98fe243883f2474e0a246e2c841e9808 100644 (file)
 #include "common/unicode_category.h"
 #include "mb/pg_wchar.h"
 
+enum CaseMapResult
+{
+   CASEMAP_SELF,
+   CASEMAP_SIMPLE,
+   CASEMAP_SPECIAL,
+};
+
 static const pg_case_map *find_case_map(pg_wchar ucs);
 static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
                           CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
                           void *wbstate);
-static bool check_special_conditions(int conditions, const char *str,
-                                    size_t len, size_t offset);
+static enum CaseMapResult casemap(pg_wchar u1, CaseKind casekind, bool full,
+                                 const char *src, size_t srclen, size_t srcoff,
+                                 pg_wchar *u2, const pg_wchar **special);
 
 pg_wchar
 unicode_lowercase_simple(pg_wchar code)
@@ -214,8 +222,9 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
    {
        pg_wchar    u1 = utf8_to_unicode((unsigned char *) src + srcoff);
        int         u1len = unicode_utf8len(u1);
-       const pg_case_map *casemap = find_case_map(u1);
-       const pg_special_case *special = NULL;
+       pg_wchar    simple = 0;
+       const pg_wchar *special = NULL;
+       enum CaseMapResult casemap_result;
 
        if (str_casekind == CaseTitle)
        {
@@ -228,56 +237,47 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
                chr_casekind = CaseLower;
        }
 
-       /*
-        * Find special case that matches the conditions, if any.
-        *
-        * Note: only a single special mapping per codepoint is currently
-        * supported, though Unicode allows for multiple special mappings for
-        * a single codepoint.
-        */
-       if (full && casemap && casemap->special_case)
-       {
-           int16       conditions = casemap->special_case->conditions;
-
-           Assert(casemap->special_case->codepoint == u1);
-           if (check_special_conditions(conditions, src, srclen, srcoff))
-               special = casemap->special_case;
-       }
+       casemap_result = casemap(u1, chr_casekind, full, src, srclen, srcoff,
+                                &simple, &special);
 
-       /* perform mapping, update result_len, and write to dst */
-       if (special)
+       switch (casemap_result)
        {
-           for (int i = 0; i < MAX_CASE_EXPANSION; i++)
-           {
-               pg_wchar    u2 = special->map[chr_casekind][i];
-               size_t      u2len = unicode_utf8len(u2);
-
-               if (u2 == '\0')
-                   break;
-
-               if (result_len + u2len <= dstsize)
-                   unicode_to_utf8(u2, (unsigned char *) dst + result_len);
-
-               result_len += u2len;
-           }
-       }
-       else if (casemap)
-       {
-           pg_wchar    u2 = casemap->simplemap[chr_casekind];
-           pg_wchar    u2len = unicode_utf8len(u2);
-
-           if (result_len + u2len <= dstsize)
-               unicode_to_utf8(u2, (unsigned char *) dst + result_len);
-
-           result_len += u2len;
-       }
-       else
-       {
-           /* no mapping; copy bytes from src */
-           if (result_len + u1len <= dstsize)
-               memcpy(dst + result_len, src + srcoff, u1len);
-
-           result_len += u1len;
+           case CASEMAP_SELF:
+               /* no mapping; copy bytes from src */
+               Assert(simple == 0);
+               Assert(special == NULL);
+               if (result_len + u1len <= dstsize)
+                   memcpy(dst + result_len, src + srcoff, u1len);
+
+               result_len += u1len;
+               break;
+           case CASEMAP_SIMPLE:
+               {
+                   /* replace with single character */
+                   pg_wchar    u2 = simple;
+                   pg_wchar    u2len = unicode_utf8len(u2);
+
+                   Assert(special == NULL);
+                   if (result_len + u2len <= dstsize)
+                       unicode_to_utf8(u2, (unsigned char *) dst + result_len);
+
+                   result_len += u2len;
+               }
+               break;
+           case CASEMAP_SPECIAL:
+               /* replace with up to MAX_CASE_EXPANSION characters */
+               Assert(simple == 0);
+               for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++)
+               {
+                   pg_wchar    u2 = special[i];
+                   size_t      u2len = unicode_utf8len(u2);
+
+                   if (result_len + u2len <= dstsize)
+                       unicode_to_utf8(u2, (unsigned char *) dst + result_len);
+
+                   result_len += u2len;
+               }
+               break;
        }
 
        srcoff += u1len;
@@ -351,6 +351,10 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
    return true;
 }
 
+/*
+ * Unicode allows for special casing to be applied only under certain
+ * circumstances. The only currently-supported condition is Final_Sigma.
+ */
 static bool
 check_special_conditions(int conditions, const char *str, size_t len,
                         size_t offset)
@@ -365,6 +369,51 @@ check_special_conditions(int conditions, const char *str, size_t len,
    return false;
 }
 
+/*
+ * Map the given character to the requested case.
+ *
+ * If full is true, and a special case mapping is found and the conditions are
+ * met, 'special' is set to the mapping result (which is an array of up to
+ * MAX_CASE_EXPANSION characters) and CASEMAP_SPECIAL is returned.
+ *
+ * Otherwise, search for a simple mapping, and if found, set 'simple' to the
+ * result and return CASEMAP_SIMPLE.
+ *
+ * If no mapping is found, return CASEMAP_SELF, and the caller should copy the
+ * character without modification.
+ */
+static enum CaseMapResult
+casemap(pg_wchar u1, CaseKind casekind, bool full,
+       const char *src, size_t srclen, size_t srcoff,
+       pg_wchar *simple, const pg_wchar **special)
+{
+   const pg_case_map *map;
+
+   if (u1 < 0x80)
+   {
+       *simple = case_map[u1].simplemap[casekind];
+
+       return CASEMAP_SIMPLE;
+   }
+
+   map = find_case_map(u1);
+
+   if (map == NULL)
+       return CASEMAP_SELF;
+
+   if (full && map->special_case != NULL &&
+       check_special_conditions(map->special_case->conditions,
+                                src, srclen, srcoff))
+   {
+       *special = map->special_case->map[casekind];
+       return CASEMAP_SPECIAL;
+   }
+
+   *simple = map->simplemap[casekind];
+
+   return CASEMAP_SIMPLE;
+}
+
 /* find entry in simple case map, if any */
 static const pg_case_map *
 find_case_map(pg_wchar ucs)